261 lines
		
	
	
		
			7.8 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
		
		
			
		
	
	
			261 lines
		
	
	
		
			7.8 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| 
								 | 
							
								<?php
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								namespace bw\sensitivewords;
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								/**
							 | 
						|||
| 
								 | 
							
								 * Sensitive-word filtering: builds the word list from vocabulary files and checks/masks text against it.
							 | 
						|||
| 
								 | 
							
								 */
							 | 
						|||
| 
								 | 
							
								class Sensitivewords
							 | 
						|||
| 
								 | 
							
								{
							 | 
						|||
| 
								 | 
							
								    /**
								     * Rebuild the sensitive-word list from the vocabulary .txt files.
								     *
								     * Recursively collects every .txt file under the Vocabulary directory,
								     * splits each file into lines (one word per line, e.g. "何祚庥", "刘刚"),
								     * writes the merged line list to sensitivewords.json and stores a cleaned,
								     * serialized copy under the 'sensitivewords' cache key.
								     *
								     * @return string Full path of the generated sensitivewords.json file.
								     */
								    public function generateJson()
								    {
								        $base_path = ROOT_PATH . 'extend' . DS . 'bw' . DS . 'sensitivewords' . DS;
								        // Directory that holds the vocabulary .txt files.
								        $sensitivewords_path = $base_path . 'Vocabulary' . DS;

								        $lines_per_file = [];
								        foreach ($this->get_files($sensitivewords_path) as $file) {
								            $content = file_get_contents($file);
								            if ($content === false) {
								                // Unreadable file: skip it instead of feeding `false` to the splitter.
								                continue;
								            }
								            // Accept \r\n, \r and \n so CRLF files do not leave a trailing "\r" on every word.
								            $lines_per_file[] = preg_split('/\r\n|\r|\n/', $content);
								        }

								        // Merge once after the loop instead of re-copying the growing array for every file.
								        $sensitivewords_json_array = empty($lines_per_file)
								            ? []
								            : call_user_func_array('array_merge', $lines_per_file);

								        // Persist the merged list next to the Vocabulary directory.
								        $save_path = $base_path . 'sensitivewords.json';
								        file_put_contents($save_path, json_encode($sensitivewords_json_array));

								        // Trim before de-duplicating so "word" and "word " collapse into one entry,
								        // then keep only words that are 2 to 30 characters long.
								        $words = array_filter(
								            array_unique(array_map('trim', $sensitivewords_json_array)),
								            function ($word) {
								                $length = mb_strlen($word, 'UTF-8');
								                return $length >= 2 && $length <= 30;
								            }
								        );

								        // array_filter() preserves keys; re-index so the cached value is a plain list.
								        cache('sensitivewords', serialize(array_values($words)));

								        return $save_path;
								    }
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    /**
								     * Load the sensitive-word list from the cache, generating it on a miss.
								     *
								     * The cache is read first; if it holds no usable list, generateJson() is
								     * run once and the cache is read again.
								     *
								     * @param mixed $text Not used by this method; kept so existing callers keep working.
								     * @return array Sensitive words; an empty array when none could be loaded.
								     */
								    public function readContent($text)
								    {
								        for ($attempt = 0; $attempt < 2; $attempt++) {
								            $cached = cache('sensitivewords');
								            // Only a string can be a serialized payload; anything else is treated as a miss.
								            $words = is_string($cached) ? unserialize($cached) : false;
								            if (is_array($words) && !empty($words)) {
								                return $words;
								            }
								            if ($attempt === 0) {
								                // First read found nothing usable: rebuild the list, then read once more.
								                $this->generateJson();
								            }
								        }

								        // Always hand back an array so callers can iterate or chunk it safely.
								        return [];
								    }
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    /**
								     * Detect sensitive words contained in a text.
								     *
								     * Matching is case-insensitive and done with regular-expression alternations
								     * built from the word list, 1000 words per pattern.
								     *
								     * @param string $text     Text to inspect.
								     * @param bool   $excption When true, throw on a hit instead of returning the hits
								     *                         (the parameter's spelling is kept for compatibility).
								     * @return array Distinct matched substrings as they appear in $text; empty when nothing matched.
								     * @throws \Exception When $excption is true and at least one word matched.
								     */
								    public function check($text,$excption = false)
								    {
								        $words = $this->readContent($text);
								        if (empty($words)) {
								            return [];
								        }

								        // Quote each word once. Blank entries are dropped because an empty
								        // alternative would match at every position of the text.
								        $quoted = [];
								        foreach ($words as $word) {
								            $word = trim($word);
								            if ($word !== '') {
								                $quoted[] = preg_quote($word, '/');
								            }
								        }

								        $allMatches = [];

								        // 1000 words per pattern.
								        foreach (array_chunk($quoted, 1000) as $chunk) {
								            $alternation = implode('|', $chunk);

								            $found = @preg_match_all('/' . $alternation . '/iu', $text, $matches);
								            if ($found === false) {
								                // preg_match_all() returns false on failure, e.g. when $text is not valid
								                // UTF-8 under the /u modifier. Retry byte-wise so such input is still
								                // scanned instead of silently passing the filter.
								                $found = @preg_match_all('/' . $alternation . '/i', $text, $matches);
								            }

								            if ($found) {
								                $allMatches = array_merge($allMatches, $matches[0]);
								            }
								        }

								        $check = array_values(array_unique($allMatches));

								        if ($excption && $check) {
								            throw new \Exception("包含敏感词:".implode(",",$check));
								        }

								        // Return the hit list in both modes, so exception mode yields [] (not null) on clean text.
								        return $check;
								    }
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    /**
								     * Replace every sensitive word found in a text.
								     *
								     * @param string $text    Text to sanitise.
								     * @param string $replace Replacement written in place of each matched word
								     *                        (one $replace per word, not per character).
								     * @return string The text with all matched words replaced; unchanged when nothing matched.
								     */
								    public function check_and_replace($text,$replace = "*"){
								        // Map each matched word to the replacement. An empty key is skipped
								        // because strtr() rejects it on older PHP versions.
								        $pairs = [];
								        foreach ($this->check($text,false) as $hit) {
								            if ($hit !== '') {
								                $pairs[$hit] = $replace;
								            }
								        }

								        if (empty($pairs)) {
								            return $text;
								        }

								        // strtr() tries the longest keys first and never rescans text it has already
								        // replaced, so a short hit cannot break up a longer overlapping one and the
								        // replacement itself is never re-matched.
								        return strtr($text, $pairs);
								    }
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    /**
								     * Recursively collect the files with a given extension under a directory.
								     *
								     * @param string $dir       Directory to scan; a trailing separator is tolerated.
								     * @param string $extension Extension to keep, without the dot (compared case-sensitively). Defaults to 'txt'.
								     * @return array Paths of the matching files; empty when $dir is not a readable directory.
								     */
								    public function get_files($dir, $extension = 'txt')
								    {
								        if (!is_dir($dir)) {
								            return [];
								        }

								        $items = scandir($dir);
								        if ($items === false) {
								            // scandir() failed (e.g. the directory is not readable): nothing to iterate.
								            return [];
								        }

								        // Drop trailing separators so the joined paths never contain a doubled separator.
								        $base = rtrim($dir, '/' . DIRECTORY_SEPARATOR);

								        $files = [];
								        foreach ($items as $item) {
								            if ($item === '.' || $item === '..') {
								                continue;
								            }

								            $path = $base . DIRECTORY_SEPARATOR . $item;

								            if (is_dir($path)) {
								                // Sub-directory: descend and append whatever it contains.
								                $files = array_merge($files, $this->get_files($path, $extension));
								            } elseif (pathinfo($path, PATHINFO_EXTENSION) === $extension) {
								                $files[] = $path;
								            }
								        }

								        return $files;
								    }
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    /**
								     * Build an Aho-Corasick automaton (trie plus failure links) for a word list.
								     *
								     * The trie is keyed by UTF-8 character, using the same mb_* segmentation as
								     * ac_search(), so multibyte words (e.g. Chinese) can actually be matched.
								     *
								     * NOTE(review): AhoNode is not defined in this file. It is assumed to expose
								     * the properties used here: children (array), fail, is_end and word.
								     *
								     * @param array $words Words to index; each is trimmed and blank entries are skipped.
								     * @return AhoNode Root node of the automaton.
								     */
								    function build_ac_automaton($words) {
								        $root = new AhoNode();

								        // Build the trie, one node per character.
								        foreach ($words as $word) {
								            $word = trim($word);
								            $len = mb_strlen($word, 'UTF-8');
								            if ($len === 0) {
								                // An empty word would only mark the root as terminal.
								                continue;
								            }

								            $node = $root;
								            for ($i = 0; $i < $len; $i++) {
								                $char = mb_substr($word, $i, 1, 'UTF-8');
								                if (!isset($node->children[$char])) {
								                    $node->children[$char] = new AhoNode();
								                }
								                $node = $node->children[$char];
								            }
								            $node->is_end = true;
								            $node->word = $word;
								        }

								        // Breadth-first pass that assigns the failure links. The queue is read
								        // through a moving head index rather than array_shift(), which re-indexes
								        // the whole array on every call.
								        $queue = [$root];
								        $root->fail = $root;

								        for ($head = 0; $head < count($queue); $head++) {
								            $current_node = $queue[$head];

								            foreach ($current_node->children as $char => $child) {
								                if ($current_node === $root) {
								                    // Depth-1 nodes always fall back to the root.
								                    $child->fail = $root;
								                } else {
								                    // Follow the parent's failure chain until a node with a
								                    // $char transition is found, or the root is reached.
								                    $p = $current_node->fail;
								                    while ($p !== $root && !isset($p->children[$char])) {
								                        $p = $p->fail;
								                    }
								                    $child->fail = isset($p->children[$char]) ? $p->children[$char] : $root;
								                }
								                $queue[] = $child;
								            }
								        }

								        return $root;
								    }
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								    /**
								     * Scan a text with an automaton produced by build_ac_automaton().
								     *
								     * The text is walked one UTF-8 character at a time; all reported positions
								     * are character offsets, not byte offsets.
								     *
								     * @param string $text Text to scan.
								     * @param object $root Root node of the automaton. Nodes are read through their
								     *                     children, fail, is_end and word properties.
								     * @return array One entry per occurrence: ['word' => string, 'start' => int, 'end' => int],
								     *               where start and end are inclusive character offsets into $text.
								     */
								    function ac_search($text, $root) {
								        $result = [];
								        $current = $root;
								        $len = mb_strlen($text, 'UTF-8');

								        for ($i = 0; $i < $len; $i++) {
								            $char = mb_substr($text, $i, 1, 'UTF-8');

								            // Follow failure links until a node with a $char transition is found,
								            // or the root is reached.
								            while ($current !== $root && !isset($current->children[$char])) {
								                $current = $current->fail;
								            }

								            if (!isset($current->children[$char])) {
								                // No transition even from the root: this character starts no word.
								                continue;
								            }

								            $current = $current->children[$char];

								            // Report the word ending at this node, plus every shorter word that
								            // ends here and is reachable through the failure chain.
								            for ($temp = $current; $temp !== $root; $temp = $temp->fail) {
								                if ($temp->is_end) {
								                    $result[] = [
								                        'word' => $temp->word,
								                        // $i counts characters, so the word length must be measured
								                        // in characters too; strlen() would count bytes.
								                        'start' => $i - mb_strlen($temp->word, 'UTF-8') + 1,
								                        'end' => $i
								                    ];
								                }
								            }
								        }

								        return $result;
								    }
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								// Usage example:
							 | 
						|||
| 
								 | 
							
								// $words = ["he", "she", "his", "hers"];
							 | 
						|||
| 
								 | 
							
								// $root = build_ac_automaton($words);
							 | 
						|||
| 
								 | 
							
								// $text = "ushers";
							 | 
						|||
| 
								 | 
							
								// $results = ac_search($text, $root);
							 | 
						|||
| 
								 | 
							
								// print_r($results);
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								
							 | 
						|||
| 
								 | 
							
								}
							 |