261 lines
7.8 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
namespace bw\sensitivewords;
/**
 * Sensitive-word filter.
 *
 * Builds a word list from the vocabulary text files, keeps it in the
 * application cache, and detects or masks listed words in arbitrary text.
 *
 * Detection in check() is regex based. The Aho-Corasick helpers
 * (build_ac_automaton() / ac_search()) are provided as a standalone
 * alternative and are not called by check().
 */
class Sensitivewords
{
    /** Cache key under which the serialized word list is stored. */
    const CACHE_KEY = 'sensitivewords';

    /** Shortest word (in characters) that is kept in the cached list. */
    const MIN_WORD_LENGTH = 2;

    /** Longest word (in characters) that is kept in the cached list. */
    const MAX_WORD_LENGTH = 30;

    /** Number of words combined into a single regular expression. */
    const CHUNK_SIZE = 1000;

    /**
     * Rebuilds the word list from the vocabulary directory.
     *
     * Every *.txt file below extend/bw/sensitivewords/Vocabulary is read as
     * "one word per line". All non-empty lines are written to
     * sensitivewords.json; the de-duplicated words whose length is within
     * MIN_WORD_LENGTH..MAX_WORD_LENGTH are serialized into the cache.
     *
     * @return string Full path of the generated JSON file.
     */
    public function generateJson()
    {
        $base_path = ROOT_PATH . 'extend' . DS . 'bw' . DS . 'sensitivewords' . DS;
        $sensitivewords_path = $base_path . 'Vocabulary' . DS;

        // Collect every non-empty, trimmed line of every vocabulary file.
        $lines = [];
        foreach ($this->get_files($sensitivewords_path) as $file) {
            $content = file_get_contents($file);
            if ($content === false) {
                // Unreadable file: skip it instead of feeding `false` to the parser.
                continue;
            }
            foreach ($this->split_lines($content) as $line) {
                $lines[] = $line;
            }
        }

        // Persist the merged list next to the vocabulary directory.
        $save_path = $base_path . 'sensitivewords.json';
        $json = json_encode($lines);
        if ($json !== false) {
            // json_encode() fails on invalid UTF-8; keep the previous file
            // rather than overwriting it with an empty one.
            file_put_contents($save_path, $json);
        }

        // Keep only unique words of a sensible length for matching.
        $words = [];
        foreach (array_unique($lines) as $word) {
            $length = mb_strlen($word, 'UTF-8');
            if ($length >= self::MIN_WORD_LENGTH && $length <= self::MAX_WORD_LENGTH) {
                $words[] = $word;
            }
        }

        cache(self::CACHE_KEY, serialize($words));

        return $save_path;
    }

    /**
     * Returns the cached word list, generating it first when the cache is empty.
     *
     * @param mixed $text Unused; kept for backward compatibility with callers.
     * @return array List of sensitive words (empty when none are available).
     */
    public function readContent($text)
    {
        $words = $this->load_cached_words();
        if (empty($words)) {
            $this->generateJson();
            $words = $this->load_cached_words();
        }

        return $words;
    }

    /**
     * Detects sensitive words in a text.
     *
     * Matching is case-insensitive and UTF-8 aware. Longer words are tried
     * before shorter ones so that a short word never hides a longer word
     * starting with the same characters.
     *
     * @param string $text     Text to inspect.
     * @param bool   $excption When true, throw instead of returning the matches.
     * @return array Distinct matched substrings exactly as they appear in $text;
     *               empty when nothing matched (in both modes).
     * @throws \Exception When $excption is true and at least one word matched.
     */
    public function check($text, $excption = false)
    {
        $words = $this->normalize_words($this->readContent($text));
        if (empty($words)) {
            return [];
        }

        $allMatches = [];
        foreach (array_chunk($words, self::CHUNK_SIZE) as $chunk) {
            foreach ($this->match_chunk($text, $chunk) as $hit) {
                $allMatches[] = $hit;
            }
        }

        $check = array_values(array_unique($allMatches));
        if ($excption && $check) {
            throw new \Exception("包含敏感词:" . implode(",", $check));
        }

        return $check;
    }

    /**
     * Replaces every detected sensitive word in a text.
     *
     * Each occurrence is replaced by $replace as a whole (not per character).
     *
     * @param string $text    Text to clean.
     * @param string $replace Replacement for each matched word.
     * @return string Text with all detected words replaced.
     */
    public function check_and_replace($text, $replace = "*")
    {
        $check = $this->check($text, false);
        if (empty($check)) {
            return $text;
        }

        // Replace longer matches first; otherwise replacing "bad" would turn
        // "badword" into "*word" and the longer word would no longer be found.
        usort($check, function ($a, $b) {
            return strlen($b) - strlen($a);
        });

        return str_replace($check, $replace, $text);
    }

    /**
     * Recursively collects files with a given extension below a directory.
     *
     * @param string $dir       Directory to scan; a trailing separator is allowed.
     * @param string $extension Extension to keep (without dot), 'txt' by default.
     * @return array Paths of the matching files; empty when $dir is not a
     *               readable directory.
     */
    public function get_files($dir, $extension = 'txt')
    {
        $files = [];
        if (!is_dir($dir)) {
            return $files;
        }

        $items = scandir($dir);
        if ($items === false) {
            // Directory exists but cannot be listed.
            return $files;
        }

        // Avoid "dir//file" when the caller passes a trailing separator.
        $base = rtrim($dir, '/' . DIRECTORY_SEPARATOR);

        foreach ($items as $item) {
            if ($item === '.' || $item === '..') {
                continue;
            }
            $path = $base . DIRECTORY_SEPARATOR . $item;
            if (is_dir($path)) {
                // Descend into sub-directories.
                $files = array_merge($files, $this->get_files($path, $extension));
            } elseif (pathinfo($path, PATHINFO_EXTENSION) === $extension) {
                $files[] = $path;
            }
        }

        return $files;
    }

    /**
     * Builds an Aho-Corasick automaton (trie plus failure links) from words.
     *
     * The trie is keyed by UTF-8 characters so that it agrees with
     * ac_search(), which walks the text character by character.
     *
     * NOTE(review): AhoNode is not defined in this file; it is assumed to be
     * an autoloadable class in this namespace exposing `children` (array),
     * `fail`, `is_end` and `word` - confirm.
     *
     * @param array $words Words to index; empty entries are ignored.
     * @return AhoNode Root node of the automaton.
     */
    public function build_ac_automaton($words)
    {
        $root = new AhoNode();

        // Build the trie.
        foreach ($words as $word) {
            $word = trim((string) $word);
            if ($word === '') {
                // An empty word would mark the root itself as a match.
                continue;
            }
            $node = $root;
            foreach ($this->split_chars($word) as $char) {
                if (!isset($node->children[$char])) {
                    $node->children[$char] = new AhoNode();
                }
                $node = $node->children[$char];
            }
            $node->is_end = true;
            $node->word = $word;
        }

        // Breadth-first pass to compute the failure links. A read cursor is
        // used instead of array_shift(), which re-indexes the array each call.
        $root->fail = $root;
        $queue = [$root];
        for ($head = 0; $head < count($queue); $head++) {
            $current_node = $queue[$head];
            foreach ($current_node->children as $char => $child) {
                if ($current_node === $root) {
                    $child->fail = $root;
                } else {
                    $p = $current_node->fail;
                    while ($p !== $root && !isset($p->children[$char])) {
                        $p = $p->fail;
                    }
                    $child->fail = isset($p->children[$char]) ? $p->children[$char] : $root;
                }
                $queue[] = $child;
            }
        }

        return $root;
    }

    /**
     * Finds all indexed words in a text using an automaton from
     * build_ac_automaton().
     *
     * @param string $text Text to scan.
     * @param object $root Root node of the automaton.
     * @return array List of ['word' => string, 'start' => int, 'end' => int];
     *               start/end are inclusive character (not byte) offsets.
     */
    public function ac_search($text, $root)
    {
        $result = [];
        $current = $root;

        foreach ($this->split_chars($text) as $i => $char) {
            // Follow failure links until a node accepts this character or the root is reached.
            while ($current !== $root && !isset($current->children[$char])) {
                $current = $current->fail;
            }
            if (!isset($current->children[$char])) {
                continue;
            }
            $current = $current->children[$char];

            // Report the word ending here and every shorter word ending here
            // that is reachable through the failure chain.
            for ($temp = $current; $temp !== $root; $temp = $temp->fail) {
                if ($temp->is_end) {
                    $result[] = [
                        'word' => $temp->word,
                        // $i counts characters, so the length must too.
                        'start' => $i - mb_strlen($temp->word, 'UTF-8') + 1,
                        'end' => $i
                    ];
                }
            }
        }

        return $result;
    }

    /**
     * Reads the word list from the cache.
     *
     * @return array Cached words, or an empty array when the cache entry is
     *               missing or does not hold a serialized array.
     */
    protected function load_cached_words()
    {
        $payload = cache(self::CACHE_KEY);
        if (!is_string($payload) || $payload === '') {
            return [];
        }
        $words = unserialize($payload);

        return is_array($words) ? $words : [];
    }

    /**
     * Splits file content into trimmed, non-empty lines.
     *
     * Handles LF, CRLF and CR line endings and strips a leading UTF-8 BOM.
     *
     * @param string $content Raw file content.
     * @return array Lines without surrounding whitespace.
     */
    protected function split_lines($content)
    {
        if (strncmp($content, "\xEF\xBB\xBF", 3) === 0) {
            $content = (string) substr($content, 3);
        }

        // Explicit alternatives instead of \R: without the /u modifier \R also
        // matches byte 0x85, which may be part of a multi-byte character.
        $parts = preg_split('/\r\n|\r|\n/', $content);
        if ($parts === false) {
            return [];
        }

        $lines = [];
        foreach ($parts as $part) {
            $part = trim($part);
            if ($part !== '') {
                $lines[] = $part;
            }
        }

        return $lines;
    }

    /**
     * Prepares a word list for matching: trims entries, drops empty and
     * non-string ones, and orders longer words first.
     *
     * @param mixed $words Raw word list.
     * @return array Cleaned words, longest (in bytes) first.
     */
    protected function normalize_words($words)
    {
        if (!is_array($words)) {
            return [];
        }

        $clean = [];
        $lengths = [];
        foreach ($words as $word) {
            if (!is_string($word)) {
                continue;
            }
            $word = trim($word);
            if ($word === '') {
                // An empty alternative would match at every position.
                continue;
            }
            $clean[] = $word;
            $lengths[] = strlen($word);
        }
        if (empty($clean)) {
            return [];
        }

        // Byte length is sufficient here: a word that extends another word is
        // always longer in bytes as well.
        array_multisort($lengths, SORT_DESC, SORT_NUMERIC, $clean, SORT_ASC, SORT_STRING);

        return $clean;
    }

    /**
     * Matches one chunk of words against the text.
     *
     * When the combined pattern cannot be evaluated, each word is retried on
     * its own so that a single problematic word does not hide the others.
     *
     * @param string $text  Text to inspect.
     * @param array  $chunk Non-empty words.
     * @return array Matched substrings (may contain duplicates).
     */
    protected function match_chunk($text, array $chunk)
    {
        $hits = $this->run_pattern($text, $chunk);
        if ($hits !== false) {
            return $hits;
        }

        $error = preg_last_error();
        if ($error === PREG_BAD_UTF8_ERROR
            || $error === PREG_BAD_UTF8_OFFSET_ERROR
            || count($chunk) < 2
        ) {
            // Invalid UTF-8 input (or nothing left to split): retrying per word cannot help.
            return [];
        }

        $found = [];
        foreach ($chunk as $word) {
            $hits = $this->run_pattern($text, [$word]);
            if ($hits !== false) {
                foreach ($hits as $hit) {
                    $found[] = $hit;
                }
            }
        }

        return $found;
    }

    /**
     * Runs a case-insensitive UTF-8 alternation of the given words.
     *
     * @param string $text  Text to inspect.
     * @param array  $words Words to combine into one pattern.
     * @return array|false Matched substrings, or false when the match failed.
     */
    protected function run_pattern($text, array $words)
    {
        $quoted = [];
        foreach ($words as $word) {
            $quoted[] = preg_quote($word, '/');
        }

        $matches = [];
        // "@" keeps this best-effort: a pattern that fails to compile raises a
        // warning, which must not abort the request; the false return is
        // handled by the caller.
        $count = @preg_match_all('/' . implode('|', $quoted) . '/iu', $text, $matches);
        if ($count === false) {
            return false;
        }

        return $count > 0 ? $matches[0] : [];
    }

    /**
     * Splits a string into UTF-8 characters.
     *
     * @param string $string Input string.
     * @return array List of single-character strings.
     */
    protected function split_chars($string)
    {
        $string = (string) $string;
        if ($string === '') {
            return [];
        }

        $chars = preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars !== false) {
            return $chars;
        }

        // preg_split() rejects invalid UTF-8; fall back to mbstring.
        $chars = [];
        $length = mb_strlen($string, 'UTF-8');
        for ($i = 0; $i < $length; $i++) {
            $chars[] = mb_substr($string, $i, 1, 'UTF-8');
        }

        return $chars;
    }
}