Initial commit
This commit is contained in:
21
themes/keepit/node_modules/segmentit/LICENSE
generated
vendored
Normal file
21
themes/keepit/node_modules/segmentit/LICENSE
generated
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2017 lin onetwo
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
194
themes/keepit/node_modules/segmentit/README.md
generated
vendored
Normal file
194
themes/keepit/node_modules/segmentit/README.md
generated
vendored
Normal file
@@ -0,0 +1,194 @@
|
||||
<p>
|
||||
<a href='https://badge.fury.io/js/segmentit' style='margin: 0 0.5rem;'>
|
||||
<img src='https://badge.fury.io/js/segmentit.svg' alt='npm version' height='18'/>
|
||||
</a>
|
||||
|
||||
<a href='https://coveralls.io/github/linonetwo/segmentit?branch=master' style='margin: 0 0.5rem;'>
|
||||
<img src='https://coveralls.io/repos/github/linonetwo/segmentit/badge.svg?branch=master' alt='Coverage Status' height='18'/>
|
||||
</a>
|
||||
|
||||
<a href='https://travis-ci.org/linonetwo/segmentit#' style='margin: 0 0.5rem;'>
|
||||
<img src='https://api.travis-ci.org/linonetwo/segmentit.svg?branch=master' alt='CI Status' height='18'/>
|
||||
</a>
|
||||
|
||||
<a href='https://img.shields.io/bundlephobia/minzip/segmentit.svg' style='margin: 0 0.5rem;'>
|
||||
<img src='https://img.shields.io/bundlephobia/minzip/segmentit.svg' alt='Min Zip Size' height='18'/>
|
||||
</a>
|
||||
</p>
|
||||
|
||||
# 中文分词模块
|
||||
|
||||
本模块基于 [node-segment](https://github.com/leizongmin/node-segment) 魔改,增加了 electron、浏览器支持,并准备针对 electron 多线程运行环境进行优化。
|
||||
|
||||
之所以要花时间魔改,是因为 `segment` 和 `nodejieba` 虽然在 node 环境下很好用,但根本无法在浏览器和 electron 环境下运行。我把代码重构为 ES2015,并用 babel 插件内联了字典文件,全部载入的话大小是 3.8M,但如果有些字典你并不需要,字典和模块是支持 tree shaking 的(请使用 ESM 模块)。
|
||||
|
||||
## Usage
|
||||
|
||||
```javascript
|
||||
import { Segment, useDefault } from 'segmentit';
|
||||
|
||||
const segmentit = useDefault(new Segment());
|
||||
const result = segmentit.doSegment('工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作。');
|
||||
console.log(result);
|
||||
```
|
||||
|
||||
对于 runkit 环境:
|
||||
|
||||
```javascript
|
||||
const { Segment, useDefault } = require('segmentit');
|
||||
const segmentit = useDefault(new Segment());
|
||||
const result = segmentit.doSegment('工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作。');
|
||||
console.log(result);
|
||||
```
|
||||
|
||||
[在 Runkit 上免费试用](https://npm.runkit.com/segmentit)
|
||||
|
||||
## 获取词类标注
|
||||
|
||||
结巴分词风格的词类标注:
|
||||
|
||||
```javascript
|
||||
// import Segment, { useDefault, cnPOSTag, enPOSTag } from 'segmentit';
|
||||
// const Segment = require('segmentit').default;
|
||||
const { Segment, useDefault, cnPOSTag, enPOSTag } = require('segmentit');
|
||||
|
||||
const segmentit = useDefault(new Segment());
|
||||
|
||||
console.log(segmentit.doSegment('一人得道,鸡犬升天').map(i => `${i.w} <${cnPOSTag(i.p)}> <${enPOSTag(i.p)}>`))
|
||||
// ↑ ["一人得道 <习语,数词 数语素> <l,m>", ", <标点符号> <w>", "鸡犬升天 <成语> <i>"]
|
||||
```
|
||||
|
||||
## 只使用部分词典或使用自定义词典
|
||||
|
||||
useDefault 的具体实现是这样的:
|
||||
|
||||
```javascript
|
||||
// useDefault
|
||||
import { Segment, modules, dicts, synonyms, stopwords } from 'segmentit';
|
||||
|
||||
const segmentit = new Segment();
|
||||
segmentit.use(modules);
|
||||
segmentit.loadDict(dicts);
|
||||
segmentit.loadSynonymDict(synonyms);
|
||||
segmentit.loadStopwordDict(stopwords);
|
||||
```
|
||||
|
||||
因此你实际上可以 import 所需的那部分字典和模块,然后一个个如下载入。没有 import 的那些字典和模块应该会被 webpack 的 tree shaking 去掉。你也可以这样载入自己定义的字典文件,只需要注意 loadDict 的函数签名是 `(dicts: string | string[]): Segment`。
|
||||
|
||||
```javascript
|
||||
// load custom module and dicts
|
||||
import {
|
||||
Segment,
|
||||
ChsNameTokenizer,
|
||||
DictOptimizer,
|
||||
EmailOptimizer,
|
||||
PunctuationTokenizer,
|
||||
URLTokenizer,
|
||||
ChsNameOptimizer,
|
||||
DatetimeOptimizer,
|
||||
DictTokenizer,
|
||||
ForeignTokenizer,
|
||||
SingleTokenizer,
|
||||
WildcardTokenizer,
|
||||
pangu,
|
||||
panguExtend1,
|
||||
panguExtend2,
|
||||
names,
|
||||
wildcard,
|
||||
synonym,
|
||||
stopword,
|
||||
} from 'segmentit';
|
||||
|
||||
const segmentit = new Segment();
|
||||
|
||||
// load them one by one, or by array
|
||||
segmentit.use(ChsNameTokenizer);
|
||||
segmentit.loadDict(pangu);
|
||||
segmentit.loadDict([panguExtend1, panguExtend2]);
|
||||
segmentit.loadSynonymDict(synonym);
|
||||
segmentit.loadStopwordDict(stopword);
|
||||
```
|
||||
|
||||
盘古的词典比较复古了,像「软萌萝莉」这种词都是没有的,请有能力的朋友 PR 一下自己的词库。
|
||||
|
||||
## 创造自己的分词中间件(Tokenizer)和结果优化器(Optimizer)
|
||||
|
||||
### Tokenizer
|
||||
|
||||
Tokenizer 是分词时要经过的一个个中间件,类似于 Redux 的 MiddleWare,它的 split 函数接受分词分到一半的 token 数组,返回一个同样格式的 token 数组(这也就是不要对太长的文本分词的原因,不然这个数组会巨爆大)。
|
||||
|
||||
例子如下:
|
||||
|
||||
```javascript
|
||||
// @flow
|
||||
import { Tokenizer } from 'segmentit';
|
||||
import type { SegmentToken, TokenStartPosition } from 'segmentit';
|
||||
export default class ChsNameTokenizer extends Tokenizer {
|
||||
split(words: Array<SegmentToken>): Array<SegmentToken> {
|
||||
// 可以获取到 this.segment 里的各种信息
|
||||
const POSTAG = this.segment.POSTAG;
|
||||
const TABLE = this.segment.getDict('TABLE');
|
||||
// ...
|
||||
}
|
||||
```
|
||||
|
||||
### Optimizer
|
||||
|
||||
Optimizer 是在分词结束后,发现有些难以利用字典处理的情况,却可以用启发式规则处理时,可以放这些启发式规则的地方,它的 doOptimize 函数同样接收一个 token 数组,返回一个同样格式的 token 数组。
|
||||
|
||||
除了 token 数组以外,你还可以自定义余下的参数,比如在下面的例子里,我们会递归调用自己一次,通过第二个参数判断递归深度:
|
||||
|
||||
```javascript
|
||||
// @flow
|
||||
import { Optimizer } from './BaseModule';
|
||||
import type { SegmentToken } from './type';
|
||||
export default class DictOptimizer extends Optimizer {
|
||||
doOptimize(words: Array<SegmentToken>, isNotFirst: boolean): Array<SegmentToken> {
|
||||
// 可以获取到 this.segment 里的各种信息
|
||||
const POSTAG = this.segment.POSTAG;
|
||||
const TABLE = this.segment.getDict('TABLE');
|
||||
// ...
|
||||
// 针对组合数字后无法识别新组合的数字问题,需要重新扫描一次
|
||||
return isNotFirst === true ? words : this.doOptimize(words, true);
|
||||
}
|
||||
```
|
||||
|
||||
例如目前各种分词工具都没法把「一条红色内裤」中的红色标对词性,但在 segmentit 里我加了个简单的 AdjectiveOptimizer 来处理它:
|
||||
|
||||
```javascript
|
||||
// @flow
|
||||
// https://github.com/linonetwo/segmentit/blob/master/src/module/AdjectiveOptimizer.js
|
||||
import { Optimizer } from './BaseModule';
|
||||
import type { SegmentToken } from './type';
|
||||
|
||||
import { colors } from './COLORS';
|
||||
|
||||
// 把一些错认为名词的词标注为形容词,或者对名词作定语的情况
|
||||
export default class AdjectiveOptimizer extends Optimizer {
|
||||
doOptimize(words: Array<SegmentToken>): Array<SegmentToken> {
|
||||
const { POSTAG } = this.segment;
|
||||
let index = 0;
|
||||
while (index < words.length) {
|
||||
const word = words[index];
|
||||
const nextword = words[index + 1];
|
||||
if (nextword) {
|
||||
// 对于<颜色>+<的>,直接判断颜色是形容词(字典里颜色都是名词)
|
||||
if (nextword.p === POSTAG.D_U && colors.includes(word.w)) {
|
||||
word.p = POSTAG.D_A;
|
||||
}
|
||||
// 如果是连续的两个名词,前一个是颜色,那这个颜色也是形容词
|
||||
if (word.p === POSTAG.D_N && nextword.p === POSTAG.D_N && colors.includes(word.w)) {
|
||||
word.p = POSTAG.D_A;
|
||||
}
|
||||
}
|
||||
// 移到下一个单词
|
||||
index += 1;
|
||||
}
|
||||
return words;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
MIT LICENSED
|
||||
1
themes/keepit/node_modules/segmentit/dist/cjs/index.js
generated
vendored
Normal file
1
themes/keepit/node_modules/segmentit/dist/cjs/index.js
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
"use strict";module.exports="production"===process.env.NODE_ENV?require("./segmentit.min.js"):require("./segmentit.js");
|
||||
2357
themes/keepit/node_modules/segmentit/dist/cjs/segmentit.js
generated
vendored
Normal file
2357
themes/keepit/node_modules/segmentit/dist/cjs/segmentit.js
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
169
themes/keepit/node_modules/segmentit/dist/cjs/segmentit.min.js
generated
vendored
Normal file
169
themes/keepit/node_modules/segmentit/dist/cjs/segmentit.min.js
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
1
themes/keepit/node_modules/segmentit/dist/cjs/segmentit.min.js.map
generated
vendored
Normal file
1
themes/keepit/node_modules/segmentit/dist/cjs/segmentit.min.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
2323
themes/keepit/node_modules/segmentit/dist/esm/segmentit.js
generated
vendored
Normal file
2323
themes/keepit/node_modules/segmentit/dist/esm/segmentit.js
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
1
themes/keepit/node_modules/segmentit/dist/esm/segmentit.js.map
generated
vendored
Normal file
1
themes/keepit/node_modules/segmentit/dist/esm/segmentit.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
168
themes/keepit/node_modules/segmentit/dist/umd/segmentit.js
generated
vendored
Normal file
168
themes/keepit/node_modules/segmentit/dist/umd/segmentit.js
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
1
themes/keepit/node_modules/segmentit/dist/umd/segmentit.js.map
generated
vendored
Normal file
1
themes/keepit/node_modules/segmentit/dist/umd/segmentit.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
106
themes/keepit/node_modules/segmentit/package.json
generated
vendored
Normal file
106
themes/keepit/node_modules/segmentit/package.json
generated
vendored
Normal file
@@ -0,0 +1,106 @@
|
||||
{
|
||||
"name": "segmentit",
|
||||
"main": "dist/cjs",
|
||||
"module": "dist/esm/segmentit.js",
|
||||
"browser": "dist/umd/segmentit.js",
|
||||
"files": [
|
||||
"dist"
|
||||
],
|
||||
"version": "2.0.3",
|
||||
"description": "Chinese word segmentation 中文分词模块 with browser && electron support",
|
||||
"keywords": [
|
||||
"segment",
|
||||
"chinese",
|
||||
"POS",
|
||||
"mlp",
|
||||
"中文",
|
||||
"electron",
|
||||
"浏览器",
|
||||
"browser",
|
||||
"nodejs",
|
||||
"分词"
|
||||
],
|
||||
"homepage": "https://github.com/linonetwo/segmentit#readme",
|
||||
"bugs": {
|
||||
"url": "https://github.com/linonetwo/segmentit/issues"
|
||||
},
|
||||
"scripts": {
|
||||
"clean": "rimraf dist & rimraf build & rimraf coverage",
|
||||
"flow-typed": "rimraf flow-typed/npm && flow-typed install --overwrite || true",
|
||||
"lint": "eslint .",
|
||||
"lintfix": "eslint . --fix",
|
||||
"postlint": "npm run typecheck",
|
||||
"typecheck": "flow check",
|
||||
"test": "npm run test-only",
|
||||
"test-only": "jest --coverage",
|
||||
"test:watch": "jest --watch",
|
||||
"prepare": "npm run build",
|
||||
"prepublishOnly": "npm run test",
|
||||
"build": "NODE_ENV=production && npm run clean && rollup -c",
|
||||
"build:watch": "babel --watch src -d dist"
|
||||
},
|
||||
"authors": [
|
||||
"Lei Zongmin <leizongmin@gmail.com>",
|
||||
"Linonetwo <linonetwo012@gmail.com> (https://onetwo.ren/)"
|
||||
],
|
||||
"contributors": [
|
||||
{
|
||||
"name": "Lei Zongmin",
|
||||
"email": "leizongmin@gmail.com"
|
||||
},
|
||||
{
|
||||
"name": "Linonetwo",
|
||||
"email": "linonetwo012@gmail.com"
|
||||
}
|
||||
],
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git://github.com/linonetwo/segmentit.git"
|
||||
},
|
||||
"browserslist": "> 0.25%, not dead",
|
||||
"jest": {
|
||||
"testEnvironment": "node",
|
||||
"moduleFileExtensions": [
|
||||
"js"
|
||||
],
|
||||
"watchman": false,
|
||||
"coverageDirectory": "coverage",
|
||||
"collectCoverageFrom": [
|
||||
"src/**/*.js"
|
||||
]
|
||||
},
|
||||
"dependencies": {
|
||||
"preval.macro": "^4.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@babel/cli": "^7.7.7",
|
||||
"@babel/core": "^7.7.7",
|
||||
"@babel/plugin-proposal-class-properties": "^7.7.4",
|
||||
"@babel/plugin-transform-runtime": "^7.7.6",
|
||||
"@babel/preset-env": "^7.7.7",
|
||||
"@babel/preset-flow": "^7.7.4",
|
||||
"babel-eslint": "^10.0.3",
|
||||
"babel-jest": "^24.9.0",
|
||||
"babel-plugin-macros": "^2.8.0",
|
||||
"eslint": "^6.8.0",
|
||||
"eslint-config-airbnb": "^18.0.1",
|
||||
"eslint-plugin-compat": "^3.3.0",
|
||||
"eslint-plugin-flowtype": "~4.5.2",
|
||||
"eslint-plugin-import": "^2.19.1",
|
||||
"eslint-plugin-jsx-a11y": "^6.2.3",
|
||||
"eslint-plugin-promise": "^4.2.1",
|
||||
"eslint-plugin-react": "^7.17.0",
|
||||
"flow-bin": "^0.114.0",
|
||||
"flow-typed": "^2.6.2",
|
||||
"jest": "^24.9.0",
|
||||
"mocha": "^6.2.2",
|
||||
"rimraf": "^3.0.0",
|
||||
"rollup": "^1.27.13",
|
||||
"rollup-plugin-babel": "^4.3.3",
|
||||
"rollup-plugin-commonjs": "^10.1.0",
|
||||
"rollup-plugin-replace": "^2.2.0",
|
||||
"rollup-plugin-uglify": "^6.0.4",
|
||||
"should": "^13.2.3",
|
||||
"uglify-es": "^3.3.9"
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user