get_files($sensitivewords_path);
        foreach ($sensitivewords_files as $file) {
            $sensitivewords_txt[] = file_get_contents($file);
        }
        $sensitivewords_json_array = [];
        foreach ($sensitivewords_txt as $key => $value) {
            /**
             * Format of each word-list file (one entry per line), e.g.:
             * 何祚庥
             * 刘刚
             * 不要沉默
             * 后勤集团
             * 食堂涨价
             * 发国难财
             * 浪漫邂逅
             */
            // Split the txt content on line breaks (one word per line) and append it to the word array.
            $sensitivewords_json_array = array_merge($sensitivewords_json_array, explode("\n", $value));
        }
        // Merge all txt content into one big JSON array and save it as sensitivewords.json.
        $save_path = ROOT_PATH . 'extend' . DS . 'bw' . DS .'sensitivewords' . DS . 'sensitivewords.json';
        file_put_contents($save_path, json_encode($sensitivewords_json_array));
        // De-duplicate and keep only entries whose trimmed length is between 2 and 30 characters.
        $words = $sensitivewords_json_array;
        $words = array_filter(array_unique($words), function ($word) {
            return mb_strlen(trim($word), 'UTF-8') >= 2 && mb_strlen(trim($word), 'UTF-8') <= 30;
        });
        // var_dump(count($words));die;
        // $words = $this->build_ac_automaton($words);
        // Store the filtered list in the cache, serialized.
        cache('sensitivewords', serialize($words));
        // Return the full path of the generated file.
        return $save_path;
    }

    /**
     * Load the filtered sensitive-word list from the cache, regenerating it on a miss.
     *
     * @param mixed $text Not used by this method; kept so existing callers keep working.
     * @return array The cached word list, or an empty array when none is available.
     */
    public function readContent($text)
    {
        $words = $this->loadCachedWords();
        if (empty($words)) {
            // Cache miss: rebuild the list (generateJson() is expected to repopulate the
            // 'sensitivewords' cache entry), then read the cache again.
            $this->generateJson();
            $words = $this->loadCachedWords();
        }
        return $words;
    }

    /**
     * Read and unserialize the 'sensitivewords' cache entry.
     *
     * @return array The stored list, or an empty array when the entry is missing,
     *               not a string, or does not unserialize to an array.
     */
    private function loadCachedWords()
    {
        $payload = cache('sensitivewords');
        // Only a non-empty string can be a serialized payload; anything else is treated as a miss.
        if (!is_string($payload) || $payload === '') {
            return [];
        }
        $words = unserialize($payload);
        return is_array($words) ? $words : [];
    }

    /**
     * Detect which sensitive words occur in a text (case-insensitive, UTF-8 aware).
     *
     * @param string $text     Text to inspect.
     * @param bool   $excption When true, throw instead of returning if any word is found.
     * @return array Distinct matched substrings exactly as they appear in $text; empty when nothing matched.
     * @throws \Exception When $excption is true and at least one sensitive word was found.
     */
    public function check($text, $excption = false)
    {
        $words = $this->readContent($text);
        if (empty($words)) {
            return [];
        }

        // Trim once and drop blanks: an empty alternative in the regex would match at every position.
        $needles = [];
        foreach ($words as $word) {
            $word = trim($word);
            if ($word !== '') {
                $needles[] = $word;
            }
        }

        // Match in groups of 1000 words so a single pattern does not grow without bound.
        $chunkSize = 1000;
        $allMatches = [];
        foreach (array_chunk($needles, $chunkSize) as $chunk) {
            $allMatches = array_merge($allMatches, $this->matchWordChunk($chunk, $text));
        }

        $check = array_values(array_unique($allMatches));
        if ($excption && $check) {
            throw new \Exception("包含敏感词:" . implode(",", $check));
        }
        // Always return the list, so the no-match case is [] in both modes.
        return $check;
    }

    /**
     * Match one group of already-trimmed words against the text.
     *
     * If the combined pattern cannot be compiled or executed, the group is halved and
     * retried, so one problematic word does not silently disable the whole group.
     *
     * @param array  $chunk Non-empty, trimmed words.
     * @param string $text  Text to search.
     * @return array Matched substrings (may contain duplicates).
     */
    private function matchWordChunk($chunk, $text)
    {
        if (empty($chunk)) {
            return [];
        }

        $alternatives = array_map(function ($word) {
            return preg_quote($word, '/');
        }, $chunk);
        $pattern = '/' . implode('|', $alternatives) . '/iu';

        // "@" only hides the PHP warning; the false return value is handled explicitly below.
        $found = @preg_match_all($pattern, $text, $matches);
        if ($found !== false) {
            return $found > 0 ? $matches[0] : [];
        }

        $error = preg_last_error();
        if ($error === PREG_BAD_UTF8_ERROR || $error === PREG_BAD_UTF8_OFFSET_ERROR) {
            // The subject itself is not valid UTF-8; a smaller pattern cannot help.
            return [];
        }
        if (count($chunk) < 2) {
            // A single word that cannot be matched is skipped (best effort).
            return [];
        }

        $half = (int) ceil(count($chunk) / 2);
        return array_merge(
            $this->matchWordChunk(array_slice($chunk, 0, $half), $text),
            $this->matchWordChunk(array_slice($chunk, $half), $text)
        );
    }

    /**
     * Detect sensitive words in a text and replace every occurrence.
     *
     * Each matched substring is replaced by $replace as a whole (not per character).
     *
     * @param string $text    Text to sanitise.
     * @param string $replace Replacement for each matched word.
     * @return string The text with all detected words replaced.
     */
    public function check_and_replace($text, $replace = "*")
    {
        // List of offending substrings exactly as they occur in $text.
        $check = $this->check($text, false);
        if (empty($check)) {
            return $text;
        }
        // The array form applies the replacements one after another, in list order.
        return str_replace($check, $replace, $text);
    }

    /**
     * Recursively collect file paths under a directory, filtered by extension.
     *
     * @param string $dir       Directory path.
     * @param string $extension Extension to keep (without dot, compared case-sensitively); defaults to 'txt'.
     * @return array Paths of the matching files; empty when $dir is not a readable directory.
     */
    public function get_files($dir, $extension = 'txt')
    {
        $files = [];
        if (!is_dir($dir)) {
            return $files;
        }

        $items = scandir($dir);
        if ($items === false) {
            // Directory exists but could not be listed.
            return $files;
        }

        foreach ($items as $item) {
            if ($item === '.' || $item === '..') {
                continue;
            }
            $path = $dir . DIRECTORY_SEPARATOR . $item;
            if (is_dir($path)) {
                // Sub-directory: descend into it.
                $files = array_merge($files, $this->get_files($path, $extension));
            } elseif (pathinfo($path, PATHINFO_EXTENSION) === $extension) {
                // File with the requested extension: add it to the result.
                $files[] = $path;
            }
        }
        return $files;
    }

    /**
     * Split a string into its UTF-8 characters.
     *
     * @param string $string Input string.
     * @return array List of single-character strings, indexed from 0.
     */
    private function splitUtf8Chars($string)
    {
        $chars = preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars !== false) {
            return $chars;
        }

        // preg_split() rejected the input (presumably malformed UTF-8): fall back to
        // stepping through it with mb_substr(), as the original code did.
        $chars = [];
        $length = mb_strlen($string, 'UTF-8');
        for ($i = 0; $i < $length; $i++) {
            $chars[] = mb_substr($string, $i, 1, 'UTF-8');
        }
        return $chars;
    }

    /**
     * Build an Aho-Corasick automaton (trie plus failure links) from a word list.
     *
     * The trie is keyed by UTF-8 character, matching how ac_search() walks the text.
     * Words are trimmed first (as check() does) and blank entries are skipped.
     * NOTE(review): AhoNode is declared outside this view; this code assumes it exposes
     * `children` (array), `fail`, `is_end` and `word` - confirm against its definition.
     *
     * @param array $words Words to index.
     * @return AhoNode Root node of the automaton.
     */
    public function build_ac_automaton($words)
    {
        $root = new AhoNode();

        // Build the trie, one edge per character.
        foreach ($words as $word) {
            $word = trim($word);
            $chars = $this->splitUtf8Chars($word);
            if (empty($chars)) {
                continue;
            }
            $node = $root;
            foreach ($chars as $char) {
                if (!isset($node->children[$char])) {
                    $node->children[$char] = new AhoNode();
                }
                $node = $node->children[$char];
            }
            $node->is_end = true;
            $node->word = $word;
        }

        // Breadth-first pass to set the failure links. An index cursor stands in for a
        // queue so that dequeuing does not re-index the array on every step.
        $root->fail = $root;
        $queue = [$root];
        for ($head = 0; $head < count($queue); $head++) {
            $current_node = $queue[$head];
            foreach ($current_node->children as $char => $child) {
                if ($current_node === $root) {
                    $child->fail = $root;
                } else {
                    $p = $current_node->fail;
                    while ($p !== $root && !isset($p->children[$char])) {
                        $p = $p->fail;
                    }
                    $child->fail = isset($p->children[$char]) ? $p->children[$char] : $root;
                }
                $queue[] = $child;
            }
        }

        return $root;
    }

    /**
     * Find every indexed word occurring in a text using a prebuilt automaton.
     *
     * @param string  $text Text to scan.
     * @param AhoNode $root Root returned by build_ac_automaton().
     * @return array List of ['word' => string, 'start' => int, 'end' => int]; offsets are
     *               0-based character (not byte) positions, both inclusive.
     */
    public function ac_search($text, $root)
    {
        $result = [];
        $current = $root;

        foreach ($this->splitUtf8Chars($text) as $i => $char) {
            // Follow failure links until a node with a matching child is found or the root is reached.
            while ($current !== $root && !isset($current->children[$char])) {
                $current = $current->fail;
            }
            if (!isset($current->children[$char])) {
                continue;
            }
            $current = $current->children[$char];

            // Report the current node and every word-ending node along its failure chain.
            for ($temp = $current; $temp !== $root; $temp = $temp->fail) {
                if ($temp->is_end) {
                    $result[] = [
                        'word' => $temp->word,
                        // Character length keeps 'start' in the same unit as $i.
                        'start' => $i - mb_strlen($temp->word, 'UTF-8') + 1,
                        'end' => $i,
                    ];
                }
            }
        }

        return $result;
    }

    // Usage example:
    // $words = ["he", "she", "his", "hers"];
    // $root = build_ac_automaton($words);
    // $text = "ushers";
    // $results = ac_search($text, $root);
    // print_r($results);
}