261 lines
7.8 KiB
PHP
261 lines
7.8 KiB
PHP
<?php
|
||
|
||
namespace bw\sensitivewords;
|
||
|
||
/**
 * Sensitive-word filter.
 *
 * Builds a word list from the vocabulary *.txt files (one word per line),
 * stores it serialized in the application cache under the key
 * "sensitivewords", and checks or masks text against that list.
 */
class Sensitivewords
{
    /**
     * Rebuild the word list from the vocabulary directory.
     *
     * Every *.txt file below <ROOT_PATH>/extend/bw/sensitivewords/Vocabulary/
     * is read; each non-blank line is one word. All words are written to
     * sensitivewords.json next to the vocabulary directory, and the
     * de-duplicated words whose length is between 2 and 30 characters are
     * serialized into the cache.
     *
     * @return string Full path of the generated JSON file.
     */
    public function generateJson()
    {
        $base_path = ROOT_PATH . 'extend' . DS . 'bw' . DS . 'sensitivewords' . DS;
        $vocabulary_path = $base_path . 'Vocabulary' . DS;

        $all_words = [];
        foreach ($this->get_files($vocabulary_path) as $file) {
            $content = file_get_contents($file);
            if ($content === false) {
                // Unreadable file: skip it instead of splitting `false`.
                continue;
            }

            // A UTF-8 byte-order mark would otherwise be glued to the first word.
            if (strncmp($content, "\xEF\xBB\xBF", 3) === 0) {
                $content = substr($content, 3);
            }

            // Accept LF, CRLF and CR line endings so no word keeps a stray "\r".
            foreach (preg_split('/\r\n|\r|\n/', $content) as $line) {
                $word = trim($line);
                if ($word !== '') {
                    $all_words[] = $word;
                }
            }
        }

        // Persist the merged list as one JSON array.
        $save_path = $base_path . 'sensitivewords.json';
        file_put_contents($save_path, json_encode($all_words));

        // The cached list is de-duplicated and limited to 2..30 characters.
        $words = array_values(array_filter(array_unique($all_words), function ($word) {
            $length = mb_strlen($word, 'UTF-8');
            return $length >= 2 && $length <= 30;
        }));

        cache('sensitivewords', serialize($words));

        return $save_path;
    }

    /**
     * Return the cached word list, generating it first when the cache is empty.
     *
     * @param mixed $text Unused; kept so existing callers keep working.
     * @return array List of sensitive words (empty when none are available).
     */
    public function readContent($text)
    {
        $words = $this->loadCachedWords();
        if (empty($words)) {
            $this->generateJson();
            $words = $this->loadCachedWords();
        }

        return $words;
    }

    /**
     * Detect which sensitive words occur in a text.
     *
     * @param string $text     Text to inspect.
     * @param bool   $excption When true, throw instead of returning the hits.
     * @return array Distinct matched fragments exactly as they appear in $text;
     *               an empty array when nothing matched.
     * @throws \Exception When $excption is true and at least one word matched.
     */
    public function check($text, $excption = false)
    {
        if ($text === null || $text === '') {
            return [];
        }

        $words = $this->readContent($text);
        if (empty($words)) {
            return [];
        }

        // An empty alternative would make the pattern match at every position.
        $candidates = [];
        foreach ($words as $word) {
            $word = trim($word);
            if ($word !== '') {
                $candidates[] = $word;
            }
        }

        // Match in groups of 1000 words to keep each compiled pattern small.
        $allMatches = [];
        foreach (array_chunk($candidates, 1000) as $chunk) {
            foreach ($this->matchChunk($chunk, $text) as $hit) {
                $allMatches[] = $hit;
            }
        }
        $check = array_values(array_unique($allMatches));

        if ($excption && $check) {
            throw new \Exception("包含敏感词:" . implode(",", $check));
        }

        // Always return an array, also when $excption is true and nothing matched.
        return $check;
    }

    /**
     * Replace every sensitive word found in a text.
     *
     * @param string $text    Text to clean.
     * @param string $replace Replacement inserted for each matched word.
     * @return string The text with all matched fragments replaced.
     */
    public function check_and_replace($text, $replace = "*")
    {
        $check = $this->check($text, false);
        if (!$check) {
            return $text;
        }

        // Replace longer fragments first so a short word cannot break up a
        // longer overlapping one before it is replaced.
        usort($check, function ($a, $b) {
            return strlen($b) - strlen($a);
        });

        return str_replace($check, $replace, $text);
    }

    /**
     * Recursively collect the files below a directory that have a given extension.
     *
     * @param string $dir       Directory to scan.
     * @param string $extension Extension to keep (without the dot), 'txt' by default.
     * @return array Paths of the matching files; empty when $dir is not a
     *               readable directory.
     */
    public function get_files($dir, $extension = 'txt')
    {
        $files = [];

        if (!is_dir($dir)) {
            return $files;
        }

        $items = scandir($dir);
        if ($items === false) {
            return $files;
        }

        // Drop trailing separators so joined paths never contain a doubled one.
        $base = rtrim($dir, '/\\');

        foreach ($items as $item) {
            if ($item === '.' || $item === '..') {
                continue;
            }

            $path = $base . DIRECTORY_SEPARATOR . $item;

            if (is_dir($path)) {
                foreach ($this->get_files($path, $extension) as $nested) {
                    $files[] = $nested;
                }
            } elseif (pathinfo($path, PATHINFO_EXTENSION) === $extension) {
                $files[] = $path;
            }
        }

        return $files;
    }

    /**
     * Build an Aho-Corasick automaton (trie plus failure links) for a word list.
     *
     * Words are split into UTF-8 characters, the same unit ac_search() walks,
     * so multi-byte words can be matched.
     *
     * @param array $words Words to index; blank entries are ignored.
     * @return AhoNode Root node of the automaton.
     */
    public function build_ac_automaton($words)
    {
        $root = new AhoNode();

        // Phase 1: insert every word into the trie, one character per edge.
        foreach ($words as $word) {
            $chars = $this->splitChars(trim($word));
            if (!$chars) {
                // An empty word would mark the root itself as a match.
                continue;
            }

            $node = $root;
            foreach ($chars as $char) {
                if (!isset($node->children[$char])) {
                    $node->children[$char] = new AhoNode();
                }
                $node = $node->children[$char];
            }
            $node->is_end = true;
            $node->word = implode('', $chars);
        }

        // Phase 2: breadth-first walk assigning failure links. The queue is
        // consumed with a moving head index instead of array_shift().
        $root->fail = $root;
        $queue = [$root];
        for ($head = 0; $head < count($queue); $head++) {
            $current_node = $queue[$head];

            foreach ($current_node->children as $char => $child) {
                if ($current_node === $root) {
                    $child->fail = $root;
                } else {
                    $p = $current_node->fail;
                    while ($p !== $root && !isset($p->children[$char])) {
                        $p = $p->fail;
                    }
                    $child->fail = isset($p->children[$char]) ? $p->children[$char] : $root;
                }
                $queue[] = $child;
            }
        }

        return $root;
    }

    /**
     * Find every indexed word occurring in a text.
     *
     * Usage example:
     *   $root = $this->build_ac_automaton(["he", "she", "his", "hers"]);
     *   $results = $this->ac_search("ushers", $root);
     *
     * @param string  $text Text to scan.
     * @param AhoNode $root Automaton returned by build_ac_automaton().
     * @return array One entry per occurrence: ['word' => string, 'start' => int,
     *               'end' => int], with both offsets counted in characters.
     */
    public function ac_search($text, $root)
    {
        $result = [];
        $lengths = [];
        $current = $root;

        foreach ($this->splitChars($text) as $i => $char) {
            // Follow failure links until a node can consume this character.
            while ($current !== $root && !isset($current->children[$char])) {
                $current = $current->fail;
            }

            if (!isset($current->children[$char])) {
                continue;
            }
            $current = $current->children[$char];

            // Report the word ending here and every shorter word that is a
            // suffix of it, reachable through the failure chain.
            for ($temp = $current; $temp !== $root; $temp = $temp->fail) {
                if (!$temp->is_end) {
                    continue;
                }

                if (!isset($lengths[$temp->word])) {
                    // Length in characters, the same unit as $i.
                    $lengths[$temp->word] = count($this->splitChars($temp->word));
                }

                $result[] = [
                    'word' => $temp->word,
                    'start' => $i - $lengths[$temp->word] + 1,
                    'end' => $i
                ];
            }
        }

        return $result;
    }

    /**
     * Read the cached word list.
     *
     * @return array The cached words, or an empty array when the cache entry
     *               is missing or cannot be decoded.
     */
    private function loadCachedWords()
    {
        $cached = cache('sensitivewords');
        if (!is_string($cached) || $cached === '') {
            return [];
        }

        // A damaged entry is treated like a cache miss so it can be rebuilt.
        $words = @unserialize($cached);

        return is_array($words) ? $words : [];
    }

    /**
     * Return the fragments of $text matched by any word of one chunk.
     *
     * @param array  $chunk Non-empty words.
     * @param string $text  Text to inspect.
     * @return array Matched fragments as they appear in $text (may repeat).
     */
    private function matchChunk(array $chunk, $text)
    {
        $alternatives = [];
        foreach ($chunk as $word) {
            $alternatives[] = preg_quote($word, '/');
        }
        $pattern = '/' . implode('|', $alternatives) . '/iu';

        // The warning of a failed pattern compilation is suppressed; failure is
        // detected through the false return value instead.
        $count = @preg_match_all($pattern, $text, $matches);
        if ($count !== false) {
            return $count > 0 ? $matches[0] : [];
        }

        // The combined pattern could not be used (for example invalid UTF-8 or
        // an oversized pattern). Fall back to a byte-wise search per word so
        // the chunk is not silently skipped.
        $hits = [];
        foreach ($chunk as $word) {
            $length = strlen($word);
            $offset = 0;
            while (($position = stripos($text, $word, $offset)) !== false) {
                $hits[] = substr($text, $position, $length);
                $offset = $position + $length;
            }
        }

        return $hits;
    }

    /**
     * Split a string into UTF-8 characters, or into bytes when it is not
     * valid UTF-8.
     *
     * @param string $string Input string.
     * @return array Zero-indexed list of characters.
     */
    private function splitChars($string)
    {
        $string = (string) $string;
        if ($string === '') {
            return [];
        }

        $chars = preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            $chars = str_split($string);
        }

        return $chars;
    }
}