291 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
		
		
			
		
	
	
			291 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| 
								 | 
							
								<?php
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/**
							 | 
						||
| 
								 | 
							
								 * Word segmentation.
							 | 
						||
| 
								 | 
							
								 */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								namespace Lizhichao\Word;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/**
								 * Dictionary-driven word segmenter working on UTF-8 byte strings.
								 *
								 * The dictionary is a nested array walked one UTF-8 character at a time
								 * (a trie): each level is keyed by the next character of a word. A node
								 * that completes a word carries the end-marker key ($end) and stores the
								 * word's part of speech under the $x key.
								 *
								 * Every segment returned by the public methods is a 4-element list
								 * [text, position, partOfSpeech, found], where found is 1 for a dictionary
								 * word and 0 for text that matched nothing (partOfSpeech is then null).
								 * The position is the byte index handed to addResult() by the scanners.
								 */
								class VicWord
								{
								    /** @var array dictionary trie loaded by the constructor */
								    private $dict = [];

								    /** @var string trie key marking "a word ends at this node" */
								    private $end = '\\';

								    /** @var bool true while getAutoWord() is running: unmatched text is re-segmented */
								    private $auto = false;

								    /** @var int number of reGet() passes performed so far */
								    private $count = 0;

								    /**
								     * @var string trie key under which a word's part of speech is stored
								     */
								    private $x = '\\x';

								    /**
								     * Load the dictionary.
								     *
								     * @param string $dictPath path to a ".json" or ".igb" dictionary; an empty
								     *                         string selects "<parent dir>/Data/dict.json"
								     *
								     * @throws \Exception when the file is missing, has an unsupported
								     *                    extension, needs the igbinary extension that is not
								     *                    loaded, or does not decode to an array
								     */
								    public function __construct($dictPath = '')
								    {
								        if ($dictPath === '') {
								            $dictPath = dirname(__DIR__) . '/Data/dict.json';
								        }

								        if (!\file_exists($dictPath)) {
								            throw new \Exception("Invalid dict file: {$dictPath}");
								        }

								        // PATHINFO_EXTENSION returns '' for a path without extension instead
								        // of triggering an undefined-index error on the pathinfo() array.
								        $type = \pathinfo($dictPath, \PATHINFO_EXTENSION);

								        // check dict type
								        switch ($type) {
								            case 'igb':
								                if (!\function_exists('igbinary_unserialize')) {
								                    throw new \Exception('Requires igbinary PHP extension.');
								                }

								                $dict = \igbinary_unserialize(\file_get_contents($dictPath));
								                break;
								            case 'json':
								                $dict = \json_decode(\file_get_contents($dictPath), true);
								                break;
								            default:
								                throw new \Exception('Invalid dict type.');
								        }

								        // A failed read/decode yields false or null; refuse it rather than
								        // silently segmenting everything as "unknown".
								        if (!\is_array($dict)) {
								            throw new \Exception("Invalid dict file: {$dictPath}");
								        }

								        $this->dict = $dict;
								    }

								    /**
								     * Segment text, extending each dictionary match as far as the trie allows.
								     *
								     * @param string $str
								     *
								     * @return array list of [text, position, partOfSpeech, found]
								     *
								     * @throws \Exception when a byte is not a valid UTF-8 lead byte
								     */
								    public function getWord($str)
								    {
								        $this->auto = false;

								        return $this->find($this->filter($str));
								    }

								    /**
								     * Segment text, emitting a word as soon as a dictionary entry is complete.
								     *
								     * @param string $str
								     *
								     * @return array list of [text, position, partOfSpeech, found]
								     *
								     * @throws \Exception when a byte is not a valid UTF-8 lead byte
								     */
								    public function getShortWord($str)
								    {
								        $this->auto = false;

								        return $this->shortFind($this->filter($str));
								    }

								    /**
								     * Segment text with the long scanner and let reGet() retry every
								     * unmatched run with the other scanner, keeping the better result.
								     *
								     * @param string $str
								     *
								     * @return array list of [text, position, partOfSpeech, found]
								     *
								     * @throws \Exception when a byte is not a valid UTF-8 lead byte
								     */
								    public function getAutoWord($str)
								    {
								        $this->auto = true;

								        return $this->autoFind($this->filter($str), ['long' => 1]);
								    }

								    /**
								     * Normalise input before scanning (lower-cases it with strtolower()).
								     *
								     * @param string $str
								     *
								     * @return string
								     */
								    private function filter($str)
								    {
								        return \strtolower($str);
								    }

								    /**
								     * Read the UTF-8 character that starts at byte $i.
								     *
								     * @param string $str subject (taken by reference, never modified)
								     * @param int    $i   byte index of the character's first byte
								     *
								     * @return array [character bytes, byte index of the character's last byte]
								     *
								     * @throws \Exception when $str[$i] is not a valid UTF-8 lead byte
								     */
								    private function getD(&$str, $i)
								    {
								        $o = \ord($str[$i]);
								        if ($o < 128) {
								            return [$str[$i], $i];
								        }

								        // The high nibble of the lead byte gives the sequence length.
								        // Two-byte leads span 0xC0-0xDF, i.e. nibbles 12 AND 13 (13 covers
								        // e.g. Cyrillic); handling only 12 rejected valid text.
								        $lead = $o >> 4;
								        if (12 === $lead || 13 === $lead) {
								            $size = 2;
								        } elseif (14 === $lead) {
								            $size = 3;
								        } elseif (15 === $lead) {
								            $size = 4;
								        } else {
								            throw new \Exception('Error: unknow charset.');
								        }

								        // substr() returns whatever bytes are left for a truncated sequence
								        // instead of raising "Uninitialized string offset".
								        return [\substr($str, $i, $size), $i + $size - 1];
								    }

								    /**
								     * Dispatch to the long or the short scanner according to $autoInfo['long'].
								     *
								     * @param string $str
								     * @param array  $autoInfo state threaded through reGet() ('long', 'c', 'pl')
								     *
								     * @return array list of [text, position, partOfSpeech, found]
								     */
								    private function autoFind($str, $autoInfo = [])
								    {
								        if (!empty($autoInfo['long'])) {
								            return $this->find($str, $autoInfo);
								        }

								        return $this->shortFind($str, $autoInfo);
								    }

								    /**
								     * Retry the tail of the result list with the other scanner.
								     *
								     * Pops entries from the end of $r up to and including the nearest
								     * dictionary word, re-segments their concatenated text with the opposite
								     * strategy and appends whichever version leaves fewer unmatched bytes.
								     *
								     * @param array $r        result list, modified in place
								     * @param array $autoInfo state of the current pass ('long', 'c', 'pl')
								     *
								     * @return false|null false when the popped text is as long as the text of
								     *                    the parent pass ('pl'); null otherwise
								     */
								    private function reGet(&$r, $autoInfo)
								    {
								        // NOTE(review): the post-increment result is overwritten by the
								        // assignment, so 'c' stays at 1 and the "c < 3" guard below never
								        // stops anything. Kept as-is because changing it alters output.
								        $autoInfo['c'] = isset($autoInfo['c']) ? $autoInfo['c']++ : 1;
								        $l             = \count($r) - 1;
								        $p             = [];
								        $str           = '';
								        for ($i = $l; $i >= 0; --$i) {
								            $str = $r[$i][0] . $str;
								            $f   = $r[$i][3];
								            \array_unshift($p, $r[$i]);
								            unset($r[$i]);
								            if (1 === (int) $f) {
								                break;
								            }
								        }
								        ++$this->count;
								        $l = \strlen($str);

								        // Position of the entry preceding the popped run; added to the
								        // positions of the re-segmented entries below.
								        if (isset($r[$i - 1])) {
								            $w = $r[$i - 1][1];
								        } else {
								            $w = 0;
								        }

								        // Same length as the parent pass's text: hand back the popped
								        // entries untouched and stop.
								        if (isset($autoInfo['pl']) && $l === (int) $autoInfo['pl']) {
								            $r = $p;

								            return false;
								        }

								        // Compare against '' explicitly: the text "0" is falsy in PHP and
								        // would otherwise be dropped from the result after being popped.
								        if ('' !== $str && $autoInfo['c'] < 3) {
								            $autoInfo['pl']   = $l;
								            $autoInfo['long'] = !$autoInfo['long'];
								            $sr               = $this->autoFind($str, $autoInfo);
								            $sr               = \array_map(function ($v) use ($w) {
								                $v[1] += $w;

								                return $v;
								            }, $sr);
								            $r = \array_merge($r, $this->getGoodWord($p, $sr));
								        }
								    }

								    /**
								     * Choose between two segmentations of the same text.
								     *
								     * @param array $old
								     * @param array $new
								     *
								     * @return array $new only when it is non-empty and has strictly fewer
								     *               unmatched bytes than $old; otherwise $old
								     */
								    private function getGoodWord($old, $new)
								    {
								        if (!$new) {
								            return $old;
								        }
								        if ($this->getUnknowCount($old) > $this->getUnknowCount($new)) {
								            return $new;
								        }

								        return $old;
								    }

								    /**
								     * Count the bytes of all entries flagged as not found (found === 0).
								     *
								     * @param array $ar list of [text, position, partOfSpeech, found]
								     *
								     * @return int
								     */
								    private function getUnknowCount($ar)
								    {
								        $total = 0;
								        foreach ($ar as $v) {
								            if (0 === (int) $v[3]) {
								                $total += \strlen($v[0]);
								            }
								        }

								        return $total;
								    }

								    /**
								     * Long scanner: follow the trie as far as possible and emit a word only
								     * when the next character no longer continues the current match.
								     *
								     * @param string $str
								     * @param array  $autoInfo forwarded to addNotFind()/reGet()
								     *
								     * @return array list of [text, position, partOfSpeech, found]
								     */
								    private function find($str, $autoInfo = [])
								    {
								        $len = \strlen($str);
								        $s   = ''; // characters of the match currently being followed
								        $n   = ''; // everything read since the last emitted word
								        $j   = 0;  // last byte index of the previous character
								        $r   = [];
								        $wr  = []; // current trie node

								        for ($i = 0; $i < $len; ++$i) {
								            list($d, $i) = $this->getD($str, $i);

								            if (isset($wr[$d])) {
								                $s .= $d;
								                $wr = $wr[$d];
								            } else {
								                // The match cannot be extended; emit it if it is a word.
								                if (isset($wr[$this->end])) {
								                    $this->addNotFind($r, $n, $s, $j, $autoInfo);
								                    $this->addResult($r, $s, $j, $wr[$this->x]);
								                    $n = '';
								                }
								                // Restart from the trie root with the current character.
								                $wr = $this->dict;
								                if (isset($wr[$d])) {
								                    $s  = $d;
								                    $wr = $wr[$d];
								                } else {
								                    $s = '';
								                }
								            }
								            $n .= $d;
								            $j = $i;
								        }

								        // Flush whatever is pending at end of input.
								        if (isset($wr[$this->end])) {
								            $this->addNotFind($r, $n, $s, $i, $autoInfo);
								            $this->addResult($r, $s, $i, $wr[$this->x]);
								        } else {
								            $this->addNotFind($r, $n, '', $i, $autoInfo);
								        }

								        return $r;
								    }

								    /**
								     * Append the unmatched text that precedes the pending match $s.
								     *
								     * $n holds everything read since the last emitted word and ends with $s.
								     * Nothing is added when $n is exactly $s. In auto mode the new entry is
								     * handed to reGet() straight away.
								     *
								     * @param array  $r        result list, modified in place
								     * @param string $n        text read since the last emitted word
								     * @param string $s        pending match at the end of $n ('' for none)
								     * @param int    $i        position of the end of $n
								     * @param array  $autoInfo forwarded to reGet()
								     */
								    private function addNotFind(&$r, $n, $s, $i, $autoInfo = [])
								    {
								        if ($n === $s) {
								            return;
								        }

								        $sLen = \strlen($s);
								        if ($sLen > 0) {
								            if (\substr($n, -$sLen) === $s) {
								                // Remove only the trailing match. str_replace() would also
								                // delete earlier occurrences of $s inside the unmatched text
								                // and lose those characters from the output.
								                $n = \substr($n, 0, -$sLen);
								            } else {
								                $n = \str_replace($s, '', $n);
								            }
								        }

								        $this->addResult($r, $n, $i - $sLen, null, 0);
								        if ($this->auto) {
								            $this->reGet($r, $autoInfo);
								        }
								    }

								    /**
								     * Short scanner: emit a word the moment the trie reports a complete
								     * entry, then restart from the trie root.
								     *
								     * @param string $str
								     * @param array  $autoInfo forwarded to addNotFind()/reGet()
								     *
								     * @return array list of [text, position, partOfSpeech, found]
								     */
								    private function shortFind($str, $autoInfo = [])
								    {
								        $len = \strlen($str);
								        $s   = ''; // characters of the match currently being followed
								        $n   = ''; // everything read since the last emitted word
								        $r   = [];
								        $wr  = []; // current trie node

								        for ($i = 0; $i < $len; ++$i) {
								            $j           = $i;
								            list($d, $i) = $this->getD($str, $i);

								            if (isset($wr[$d])) {
								                $s .= $d;
								                $wr = $wr[$d];
								            } else {
								                if (isset($wr[$this->end])) {
								                    $this->addNotFind($r, $n, $s, $j, $autoInfo);
								                    $this->addResult($r, $s, $j, $wr[$this->x]);
								                    $n = '';
								                }
								                $wr = $this->dict;
								                if (isset($wr[$d])) {
								                    $s  = $d;
								                    $wr = $wr[$d];
								                } else {
								                    $s = '';
								                }
								            }

								            $n .= $d;

								            // Shortest match wins: emit as soon as a word is complete.
								            if (isset($wr[$this->end])) {
								                $this->addNotFind($r, $n, $s, $i, $autoInfo);
								                $this->addResult($r, $s, $i, $wr[$this->x]);
								                $wr = $this->dict;
								                $s  = '';
								                $n  = '';
								            }
								        }

								        // Flush whatever is pending at end of input.
								        if (isset($wr[$this->end])) {
								            $this->addNotFind($r, $n, $s, $i, $autoInfo);
								            $this->addResult($r, $s, $i, $wr[$this->x]);
								        } else {
								            $this->addNotFind($r, $n, '', $i, $autoInfo);
								        }

								        return $r;
								    }

								    /**
								     * Append one segment to the result list.
								     *
								     * @param array       $r    result list, modified in place
								     * @param string      $k    segment text
								     * @param int         $i    position recorded for the segment
								     * @param string|null $x    part of speech, null for unmatched text
								     * @param int         $find 1 for a dictionary word, 0 for unmatched text
								     */
								    private function addResult(&$r, $k, $i, $x, $find = 1)
								    {
								        $r[] = [$k, $i, $x, $find];
								    }
								}
							 |