初始上传
This commit is contained in:
5
vendor/lizhichao/word/.gitignore
vendored
Executable file
5
vendor/lizhichao/word/.gitignore
vendored
Executable file
@@ -0,0 +1,5 @@
|
||||
.env
|
||||
.idea
|
||||
.DS_Store
|
||||
composer.lock
|
||||
/vendor/
|
||||
BIN
vendor/lizhichao/word/Data/dict.igb
vendored
Executable file
BIN
vendor/lizhichao/word/Data/dict.igb
vendored
Executable file
Binary file not shown.
1
vendor/lizhichao/word/Data/dict.json
vendored
Executable file
1
vendor/lizhichao/word/Data/dict.json
vendored
Executable file
File diff suppressed because one or more lines are too long
201
vendor/lizhichao/word/LICENSE
vendored
Executable file
201
vendor/lizhichao/word/LICENSE
vendored
Executable file
@@ -0,0 +1,201 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
136
vendor/lizhichao/word/Lib/VicDict.php
vendored
Executable file
136
vendor/lizhichao/word/Lib/VicDict.php
vendored
Executable file
@@ -0,0 +1,136 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Add word to dict.
|
||||
*/
|
||||
|
||||
namespace Lizhichao\Word;
|
||||
|
||||
/**
 * Builds and persists the dictionary used by the word segmenter.
 *
 * The dictionary is a nested array (a character trie): each key is one
 * UTF-8 character, and a node that terminates a word additionally carries
 * the reserved keys "\" (end marker, value 1) and "\x" (part of speech).
 */
class VicDict
{
    /** @var array the character trie loaded from / written to the dictionary file */
    private $word = [];

    /** @var string encoding used when splitting a word into characters */
    private $code = 'utf-8';

    /** @var array end-of-word payload merged into the trie by the current add() call */
    private $end = ['\\' => 1];

    /** @var array end-of-word payload without the part-of-speech entry */
    private $default_end = ['\\' => 1];

    /** @var string reserved key marking "a word ends at this node" */
    private $end_key = '\\';

    /** @var string dictionary format, taken from the file extension ("igb" or "json") */
    private $type = '';

    /** @var string path of the dictionary file */
    private $dictPath = '';

    /**
     * VicDict constructor.
     *
     * @param string $path dictionary file path; '' selects the bundled Data/dict.json
     *
     * @throws \Exception when the file does not exist, the extension is neither
     *                    "igb" nor "json", or "igb" is requested without igbinary
     */
    public function __construct($path = '')
    {
        if ($path === '') {
            $this->dictPath = dirname(__DIR__) . '/Data/dict.json';
        } else {
            $this->dictPath = $path;
        }

        // PATHINFO_EXTENSION yields '' for a path without extension, whereas
        // indexing the pathinfo() array raised an "undefined index" error.
        $this->type = \pathinfo($this->dictPath, \PATHINFO_EXTENSION);

        if ( ! \file_exists($this->dictPath)) {
            throw new \Exception("Invalid dict file: {$this->dictPath}");
        }

        // check dict type
        switch ($this->type) {
            case 'igb':
                if ( ! \function_exists('\\igbinary_unserialize')) {
                    throw new \Exception('Requires igbinary PHP extension.');
                }

                $word = \igbinary_unserialize(\file_get_contents($this->dictPath));
                break;
            case 'json':
                $word = \json_decode(\file_get_contents($this->dictPath), true);
                break;
            default:
                throw new \Exception('Invalid dict type.');
        }

        // An empty or undecodable file has always behaved like an empty
        // dictionary; make that explicit so $this->word is always an array.
        $this->word = \is_array($word) ? $word : [];
    }

    /**
     * Adds one word to the in-memory dictionary (call save() to persist it).
     *
     * @param string      $word word to add; "\n", "\t" and "\r" are stripped first
     * @param null|string $x    part of speech stored with the word
     *
     * @return bool true when the word was merged, false when it is empty
     */
    public function add($word, $x = null)
    {
        $this->end = ['\\x' => $x] + $this->default_end;
        $word      = $this->filter($word);

        // Compare against '' explicitly: the one-character word "0" is falsy
        // in PHP and was silently rejected by a plain truthiness test.
        if ('' !== $word) {
            return $this->merge($word);
        }

        return false;
    }

    /**
     * Serialises the dictionary back to the file it was loaded from.
     *
     * @return false|int bytes written, or false when encoding or writing failed
     */
    public function save()
    {
        if ('igb' === $this->type) {
            $str = \igbinary_serialize($this->word);
        } else {
            $str = \json_encode($this->word);
        }

        // Never overwrite the dictionary with an empty payload when encoding
        // failed (json_encode returns false, e.g. on invalid UTF-8).
        if ( ! \is_string($str)) {
            return false;
        }

        return \file_put_contents($this->dictPath, $str);
    }

    /**
     * Walks the trie along the characters of $word and creates whatever is missing.
     *
     * An end marker that already exists is left untouched, so re-adding a
     * known word does not change its stored part of speech.
     *
     * @param string $word
     *
     * @return bool always true
     */
    private function merge($word)
    {
        $ar = $this->toArr($word);
        $br = $ar;
        $wr = &$this->word;
        foreach ($ar as $v) {
            // $br holds the characters that follow the current one
            \array_shift($br);
            if ( ! isset($wr[$v])) {
                // first missing character: attach the whole remaining branch at once
                $wr[$v] = $this->dict($br, $this->end);

                return true;
            }
            $wr = &$wr[$v];
        }

        // every character already existed: the word is a prefix of a longer
        // entry, so only the end-of-word payload has to be added
        if ( ! isset($wr[$this->end_key])) {
            foreach ($this->end as $k => $v) {
                $wr[$k] = $v;
            }
        }

        return true;
    }

    /**
     * Removes line breaks and tabs from a word.
     *
     * @param string $word
     *
     * @return string
     */
    private function filter($word)
    {
        return \str_replace(["\n", "\t", "\r"], '', $word);
    }

    /**
     * Builds the nested branch [$arr[$i] => [$arr[$i + 1] => ... $v]].
     *
     * @param array $arr characters of the branch
     * @param array $v   payload stored at the innermost level
     * @param int   $i   current index into $arr
     *
     * @return array
     */
    private function dict($arr, $v, $i = 0)
    {
        if (isset($arr[$i])) {
            return [$arr[$i] => $this->dict($arr, $v, $i + 1)];
        }

        return $v;
    }

    /**
     * Splits a string into a list of single characters.
     *
     * @param string $str
     *
     * @return array
     */
    private function toArr($str)
    {
        $l = \mb_strlen($str, $this->code);
        $r = [];
        for ($i = 0; $i < $l; ++$i) {
            $r[] = \mb_substr($str, $i, 1, $this->code);
        }

        return $r;
    }
}
|
||||
290
vendor/lizhichao/word/Lib/VicWord.php
vendored
Executable file
290
vendor/lizhichao/word/Lib/VicWord.php
vendored
Executable file
@@ -0,0 +1,290 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* 使用分词
|
||||
*/
|
||||
|
||||
namespace Lizhichao\Word;
|
||||
|
||||
/**
 * Dictionary based word segmenter.
 *
 * Every public method returns a list of [word, position, part of speech, found]
 * entries, where "found" is 1 for dictionary words and 0 for unknown text.
 * Positions are byte offsets into the (lower-cased) input.
 */
class VicWord
{
    /** @var array character trie loaded from the dictionary file */
    private $dict = [];

    /** @var string reserved trie key marking "a word ends at this node" */
    private $end = '\\';

    /** @var bool true while getAutoWord() is running; enables re-segmentation of unknown runs */
    private $auto = false;

    /** @var int number of reGet() calls made so far */
    private $count = 0;

    /**
     * @var string reserved trie key holding the part of speech
     */
    private $x = '\\x';

    /**
     * @param string $dictPath dictionary file path; '' selects the bundled Data/dict.json
     *
     * @throws \Exception when the file does not exist, the extension is neither
     *                    "igb" nor "json", or "igb" is requested without igbinary
     */
    public function __construct($dictPath = '')
    {
        if ($dictPath === '') {
            $dictPath = dirname(__DIR__) . '/Data/dict.json';
        }

        // PATHINFO_EXTENSION yields '' for a path without extension, whereas
        // indexing the pathinfo() array raised an "undefined index" error.
        $type = \pathinfo($dictPath, \PATHINFO_EXTENSION);

        if ( ! \file_exists($dictPath)) {
            throw new \Exception("Invalid dict file: {$dictPath}");
        }

        // check dict type
        switch ($type) {
            case 'igb':
                if ( ! \function_exists('\\igbinary_unserialize')) {
                    throw new \Exception('Requires igbinary PHP extension.');
                }

                $dict = \igbinary_unserialize(\file_get_contents($dictPath));
                break;
            case 'json':
                $dict = \json_decode(\file_get_contents($dictPath), true);
                break;
            default:
                throw new \Exception('Invalid dict type.');
        }

        // An undecodable file has always behaved like an empty dictionary;
        // make that explicit so $this->dict is always an array.
        $this->dict = \is_array($dict) ? $dict : [];
    }

    /**
     * Longest-match segmentation.
     *
     * @param string $str
     *
     * @return array list of [word, position, part of speech, found]
     */
    public function getWord($str)
    {
        $this->auto = false;
        $str        = $this->filter($str);

        return $this->find($str);
    }

    /**
     * Fine-grained segmentation: a word is emitted as soon as one is complete.
     *
     * @param string $str
     *
     * @return array list of [word, position, part of speech, found]
     */
    public function getShortWord($str)
    {
        $this->auto = false;
        $str        = $this->filter($str);

        return $this->shortFind($str);
    }

    /**
     * Longest-match segmentation that re-segments around unknown text,
     * alternating with the fine-grained strategy.
     *
     * @param string $str
     *
     * @return array list of [word, position, part of speech, found]
     */
    public function getAutoWord($str)
    {
        $this->auto = true;
        $str        = $this->filter($str);

        return $this->autoFind($str, ['long' => 1]);
    }

    /**
     * Normalises the input before lookup.
     *
     * strtolower() works on bytes, so only single-byte letters are folded.
     *
     * @param string $str
     *
     * @return string
     */
    private function filter($str)
    {
        return \strtolower($str);
    }

    /**
     * Reads the UTF-8 character that starts at byte offset $i.
     *
     * @param string $str
     * @param int    $i   byte offset of the lead byte
     *
     * @throws \Exception when the byte at $i is not a valid UTF-8 lead byte
     *
     * @return array [character, byte offset of the character's last byte]
     */
    private function getD(&$str, $i)
    {
        $o = \ord($str[$i]);
        if ($o < 128) {
            return [$str[$i], $i];
        }

        // The high nibble of the lead byte determines the sequence length.
        $o = $o >> 4;
        if (12 === $o || 13 === $o) {
            // 110xxxxx covers 0xC0-0xDF, i.e. nibbles 12 AND 13; nibble 13
            // (Cyrillic, Hebrew, Arabic, ...) used to be rejected.
            $width = 2;
        } elseif (14 === $o) {
            $width = 3;
        } elseif (15 === $o) {
            $width = 4;
        } else {
            throw new \Exception('Error: unknow charset.');
        }

        // substr() clamps at the end of the string, so a truncated trailing
        // sequence no longer reads uninitialised string offsets.
        return [\substr($str, $i, $width), $i + $width - 1];
    }

    /**
     * Dispatches to find() or shortFind() depending on $autoInfo['long'].
     *
     * @param string $str
     * @param array  $autoInfo
     *
     * @return array
     */
    private function autoFind($str, $autoInfo = [])
    {
        if ($autoInfo['long']) {
            return $this->find($str, $autoInfo);
        }

        return $this->shortFind($str, $autoInfo);
    }

    /**
     * Re-segments the tail of $r with the opposite strategy.
     *
     * The tail consists of the unknown entry just appended plus everything
     * before it up to and including the nearest dictionary word. It is kept
     * in its re-segmented form only when that leaves fewer unknown bytes.
     *
     * @param array $r        result list, modified in place
     * @param array $autoInfo state of the auto mode: 'long' (current strategy),
     *                        'pl' (byte length of the text re-segmented one level up),
     *                        'c' (depth counter, see note below)
     *
     * @return null|false false when the tail equals the text already being re-segmented
     */
    private function reGet(&$r, $autoInfo)
    {
        // NOTE(review): this used to read `$c = isset($c) ? $c++ : 1`. Assigning
        // a post-increment back to the same variable stores the old value, so
        // the counter has always stayed at 1 and the `< 3` guard below never
        // stops the recursion (the 'pl' check does). The value is kept as is:
        // if the guard ever fired, the entries removed from $r below would
        // not be put back.
        if ( ! isset($autoInfo['c'])) {
            $autoInfo['c'] = 1;
        }

        $l   = \count($r) - 1;
        $p   = [];
        $str = '';
        for ($i = $l; $i >= 0; --$i) {
            $str = $r[$i][0] . $str;
            $f   = $r[$i][3];
            \array_unshift($p, $r[$i]);
            unset($r[$i]);
            if (1 === (int) $f) {
                break;
            }
        }
        ++$this->count;
        $l = \strlen($str);

        // offset added to the positions reported by the nested run
        if (isset($r[$i - 1])) {
            $w = $r[$i - 1][1];
        } else {
            $w = 0;
        }

        if (isset($autoInfo['pl']) && $l === (int) $autoInfo['pl']) {
            // same text as one level up: restore the tail and stop recursing
            $r = $p;

            return false;
        }

        // Compare against '' explicitly: the tail "0" is falsy in PHP and was
        // dropped from the result instead of being re-segmented.
        if ('' !== $str && $autoInfo['c'] < 3) {
            $autoInfo['pl']   = $l;
            $autoInfo['long'] = ! $autoInfo['long'];
            $sr               = $this->autoFind($str, $autoInfo);
            $sr               = \array_map(function ($v) use ($w) {
                $v[1] += $w;

                return $v;
            }, $sr);
            $r = \array_merge($r, $this->getGoodWord($p, $sr));
        }
    }

    /**
     * Picks the segmentation with fewer unknown bytes; ties keep $old.
     *
     * @param array $old
     * @param array $new
     *
     * @return array
     */
    private function getGoodWord($old, $new)
    {
        if ( ! $new) {
            return $old;
        }
        if ($this->getUnknowCount($old) > $this->getUnknowCount($new)) {
            return $new;
        }

        return $old;
    }

    /**
     * Counts the bytes of all entries that are not dictionary words.
     *
     * @param array $ar
     *
     * @return int
     */
    private function getUnknowCount($ar)
    {
        $i = 0;
        foreach ($ar as $v) {
            if (0 === (int) $v[3]) {
                $i += \strlen($v[0]);
            }
        }

        return $i;
    }

    /**
     * Longest-match scan: follows the trie as far as the input allows and
     * emits a word only when the next character no longer continues it.
     *
     * @param string $str
     * @param array  $autoInfo auto-mode state, passed through to addNotFind()
     *
     * @return array
     */
    private function find($str, $autoInfo = [])
    {
        $len = \strlen($str);
        $s   = '';   // characters matched along the current trie path
        $n   = '';   // all characters seen since the last emitted word
        $j   = 0;    // offset of the last byte of the previous character
        $r   = [];
        $wr  = [];   // current trie node

        for ($i = 0; $i < $len; ++$i) {
            // getD() also moves $i to the last byte of a multi-byte character
            list($d, $i) = $this->getD($str, $i);

            if (isset($wr[$d])) {
                $s .= $d;
                $wr = $wr[$d];
            } else {
                if (isset($wr[$this->end])) {
                    $this->addNotFind($r, $n, $s, $j, $autoInfo);
                    $this->addResult($r, $s, $j, $wr[$this->x]);
                    $n = '';
                }
                // restart from the trie root with the current character
                $wr = $this->dict;
                if (isset($wr[$d])) {
                    $s  = $d;
                    $wr = $wr[$d];
                } else {
                    $s = '';
                }
            }
            $n .= $d;
            $j = $i;
        }
        if (isset($wr[$this->end])) {
            $this->addNotFind($r, $n, $s, $i, $autoInfo);
            $this->addResult($r, $s, $i, $wr[$this->x]);
        } else {
            $this->addNotFind($r, $n, '', $i, $autoInfo);
        }

        return $r;
    }

    /**
     * Appends the unknown text that precedes the matched word $s, if any.
     *
     * @param array  $r        result list, modified in place
     * @param string $n        characters seen since the last emitted word (ends with $s)
     * @param string $s        the matched word, or '' when nothing matched
     * @param int    $i        position reported for $s
     * @param array  $autoInfo auto-mode state, passed through to reGet()
     */
    private function addNotFind(&$r, $n, $s, $i, $autoInfo = [])
    {
        if ($n !== $s) {
            $sLen = \strlen($s);
            if ($sLen > 0) {
                if (\substr($n, -$sLen) === $s) {
                    // Strip only the trailing match. str_replace() removed every
                    // occurrence of $s, which deleted text whenever the unknown
                    // run happened to contain the matched word as well.
                    $n = \substr($n, 0, \strlen($n) - $sLen);
                } else {
                    $n = \str_replace($s, '', $n);
                }
            }
            $this->addResult($r, $n, $i - $sLen, null, 0);
            if ($this->auto) {
                $this->reGet($r, $autoInfo);
            }
        }
    }

    /**
     * Fine-grained scan: like find(), but a word is emitted as soon as the
     * trie reports an end marker instead of waiting for a longer match.
     *
     * @param string $str
     * @param array  $autoInfo auto-mode state, passed through to addNotFind()
     *
     * @return array
     */
    private function shortFind($str, $autoInfo = [])
    {
        $len = \strlen($str);
        $s   = '';   // characters matched along the current trie path
        $n   = '';   // all characters seen since the last emitted word
        $r   = [];
        $wr  = [];   // current trie node

        for ($i = 0; $i < $len; ++$i) {
            $j           = $i;
            list($d, $i) = $this->getD($str, $i);

            if (isset($wr[$d])) {
                $s .= $d;
                $wr = $wr[$d];
            } else {
                if (isset($wr[$this->end])) {
                    $this->addNotFind($r, $n, $s, $j, $autoInfo);
                    $this->addResult($r, $s, $j, $wr[$this->x]);
                    $n = '';
                }
                // restart from the trie root with the current character
                $wr = $this->dict;
                if (isset($wr[$d])) {
                    $s  = $d;
                    $wr = $wr[$d];
                } else {
                    $s = '';
                }
            }

            $n .= $d;

            // shortest match: emit immediately and start over from the root
            if (isset($wr[$this->end])) {
                $this->addNotFind($r, $n, $s, $i, $autoInfo);
                $this->addResult($r, $s, $i, $wr[$this->x]);
                $wr = $this->dict;
                $s  = '';
                $n  = '';
            }
        }
        if (isset($wr[$this->end])) {
            $this->addNotFind($r, $n, $s, $i, $autoInfo);
            $this->addResult($r, $s, $i, $wr[$this->x]);
        } else {
            $this->addNotFind($r, $n, '', $i, $autoInfo);
        }

        return $r;
    }

    /**
     * Appends one [word, position, part of speech, found] entry to $r.
     *
     * @param array       $r    result list, modified in place
     * @param string      $k    the word or unknown text
     * @param int         $i    position
     * @param null|string $x    part of speech
     * @param int         $find 1 for a dictionary word, 0 for unknown text
     */
    private function addResult(&$r, $k, $i, $x, $find = 1)
    {
        $r[] = [$k, $i, $x, $find];
    }
}
|
||||
77
vendor/lizhichao/word/README.md
vendored
Executable file
77
vendor/lizhichao/word/README.md
vendored
Executable file
@@ -0,0 +1,77 @@
|
||||
# VicWord 一个纯php的分词
|
||||
|
||||
<a href="https://github.com/996icu/996.ICU/blob/master/LICENSE"><img src="https://img.shields.io/badge/support-996.icu-red.svg"></a>
|
||||
|
||||
QQ交流群: 731475644
|
||||
|
||||
## 安装
|
||||
|
||||
```shell
|
||||
composer require lizhichao/word
|
||||
```
|
||||
|
||||
|
||||
|
||||
## 分词说明
|
||||
- 含有3种切分方法
|
||||
- `getWord` 长度优先切分 。最快
|
||||
- `getShortWord` 细粒度切分。比最快慢一点点
|
||||
- `getAutoWord` 自动切分 。效果最好
|
||||
- 可自定义词典,自己添加词语到词库,词库支持文本格式`json`和二进制格式`igb`
|
||||
二进制格式词典小,加载快
|
||||
- `dict.igb`含有175662个词,欢迎大家补充词语到 `dict.txt` ,格式(词语 \t idf \t 词性)
|
||||
- idf 获取方法 百度搜索这个词语 `Math.log(100000001/结果数量)`,如果你有更好的方法欢迎补充。
|
||||
- 词性 [标点符号,名词,动词,形容词,区别词,代词,数词,量词,副词,介词,连词,助词,语气词,拟声词,叹词] 取index ;标点符号取0
|
||||
- 三种分词结果对比
|
||||
```php
|
||||
$fc = new VicWord();
|
||||
$arr = $fc->getWord('北京大学生喝进口红酒,在北京大学生活区喝进口红酒');
|
||||
//北京大学|生喝|进口|红酒|,|在|北京大学|生活区|喝|进口|红酒
|
||||
//$arr 是一个数组 每个单元的结构[词语,词语位置,词性,这个词语是否包含在词典中] 这里只列出了词语
|
||||
|
||||
$arr = $fc->getShortWord('北京大学生喝进口红酒,在北京大学生活区喝进口红酒');
|
||||
//北京|大学|生喝|进口|红酒|,|在|北京|大学|生活|区喝|进口|红酒
|
||||
|
||||
$arr = $fc->getAutoWord('北京大学生喝进口红酒,在北京大学生活区喝进口红酒');
|
||||
//北京|大学生|喝|进口|红酒|,|在|北京大学|生活区|喝|进口|红酒
|
||||
|
||||
//对比
|
||||
//qq的分词 http://nlp.qq.com/semantic.cgi#page2
|
||||
//百度的分词 http://ai.baidu.com/tech/nlp/lexical
|
||||
|
||||
```
|
||||
## 分词速度
|
||||
机器阿里云 `Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz`
|
||||
`getWord` 每秒140w字
|
||||
`getShortWord` 每秒138w字
|
||||
`getAutoWord` 每秒40w字
|
||||
测试文本在百度百科拷贝的一段5000字的文本
|
||||
|
||||
## 制作词库
|
||||
- 词库支持utf-8的任意字符
|
||||
- 词典大小不影响 分词速度
|
||||
|
||||
只有一个方法 VicDict->add(词语,词性 = null)
|
||||
```php
|
||||
require __DIR__.'/Lib/VicDict.php';
use Lizhichao\Word\VicDict;
|
||||
|
||||
//目前可支持 igb 和 json 两种词典库格式;igb需要安装igbinary扩展,igb文件小,加载快
|
||||
$path = ''; //词典地址
|
||||
$dict = new VicDict($path);
|
||||
|
||||
//添加词语词库 add(词语,词性) 不分语言,可以是utf-8编码的任何字符
|
||||
$dict->add('中国','n');
|
||||
|
||||
//保存词库
|
||||
$dict->save();
|
||||
```
|
||||
|
||||
## demo
|
||||
[demo](http://blogs.vicsdf.com/my/fc)
|
||||
|
||||
## 该作者的其他软件
|
||||
[一个极简高性能php框架,支持[swoole | php-fpm ]环境](https://github.com/lizhichao/one)
|
||||
|
||||
|
||||
|
||||
|
||||
23
vendor/lizhichao/word/addDict.php
vendored
Executable file
23
vendor/lizhichao/word/addDict.php
vendored
Executable file
@@ -0,0 +1,23 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: tanszhe
|
||||
* Date: 2017/12/25
|
||||
* Time: 下午7:47
|
||||
* 添加分词库
|
||||
*/
|
||||
// Path of the dictionary file that this script updates.
define('_VIC_WORD_DICT_PATH_', __DIR__ . '/Data/dict.igb');

require __DIR__ . '/vendor/autoload.php';

use Lizhichao\Word\VicDict;

// Two dictionary formats are supported: igb and json. igb needs the igbinary
// extension; its files are smaller and load faster.
// VicDict takes the dictionary *path* and derives the format from its
// extension. Passing the bare string 'igb' made the constructor look for a
// file literally named "igb", so the constant defined above is passed instead.
$dict = new VicDict(_VIC_WORD_DICT_PATH_);

// add(word, part of speech): any UTF-8 characters are allowed except the
// reserved ones (\ , \x , \i).
$dict->add('中国', 'n');

// Write the dictionary back to disk.
$dict->save();
|
||||
21
vendor/lizhichao/word/composer.json
vendored
Executable file
21
vendor/lizhichao/word/composer.json
vendored
Executable file
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"name": "lizhichao/word",
|
||||
"description": "This is a participle library",
|
||||
"type": "library",
|
||||
"license": "Apache-2.0",
|
||||
"authors": [
|
||||
{
|
||||
"name": "tanszhe",
|
||||
"email": "1018595261@qq.com"
|
||||
}
|
||||
],
|
||||
"require": {
|
||||
"php" : ">=5.6.0"
|
||||
},
|
||||
"minimum-stability" : "stable",
|
||||
"autoload" : {
|
||||
"psr-4" : {
|
||||
"Lizhichao\\Word\\" : "Lib"
|
||||
}
|
||||
}
|
||||
}
|
||||
81
vendor/lizhichao/word/demo.php
vendored
Executable file
81
vendor/lizhichao/word/demo.php
vendored
Executable file
@@ -0,0 +1,81 @@
|
||||
<?php
|
||||
/**
|
||||
* Created by PhpStorm.
|
||||
* User: tanszhe
|
||||
* Date: 2017/12/25
|
||||
* Time: 下午7:46
|
||||
*/
|
||||
// Load the segmenter class directly (no autoloader needed for the demo).
require __DIR__ . '/Lib/VicWord.php';

use Lizhichao\Word\VicWord;

// Without an argument the bundled Data/dict.json dictionary is used.
$segmenter = new VicWord();
$sample    = '聚知台是一个及时沟通工具';

// Longest-match segmentation.
$segmenter->getWord($sample);

// Fine-grained segmentation.
$segmenter->getShortWord($sample);

// Automatic segmentation; the slowest of the three. Only this result is printed.
$words = $segmenter->getAutoWord($sample);
print_r($words);
/*

Array
(
    [0] => Array
        (
            [0] => 聚知台 // the word
            [1] => 8 // position of the word (UTF-8 encoding)
            [2] => // part of speech; tip: the dictionary has no part-of-speech data yet, contributions welcome
            [3] => 1 // 1: the dictionary contains the word, 0: it does not
        )

    [1] => Array
        (
            [0] => 是
            [1] => 10
            [2] =>
            [3] => 1
        )

    [2] => Array
        (
            [0] => 一个
            [1] => 16
            [2] =>
            [3] => 1
        )

    [3] => Array
        (
            [0] => 及时
            [1] => 23
            [2] =>
            [3] => 1
        )

    [4] => Array
        (
            [0] => 沟通
            [1] => 29
            [2] =>
            [3] => 1
        )

    [5] => Array
        (
            [0] => 工具
            [1] => 36
            [2] =>
            [3] => 1
        )

)

*/
|
||||
Reference in New Issue
Block a user