-
Notifications
You must be signed in to change notification settings - Fork 1k
/
chinesetokenizer.py
68 lines (49 loc) · 2.3 KB
/
chinesetokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#! /usr/bin/env python
#coding:utf-8
#本代码来自网络:http://ashin.sinaapp.com/article/118/
import jieba
from whoosh.analysis import Tokenizer,Token
from whoosh.compat import text_type
class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments text with jieba's search-engine mode.

    A single ``Token`` object is created per call and re-used for every
    segment, so consumers must read the attributes they need before
    advancing the generator.
    """

    def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
        """Yield a ``Token`` for every segment jieba finds in *value*.

        :param value: the unicode text to segment.
        :param positions: when true, ``pos`` is set on each token.
        :param chars: when true, ``startchar``/``endchar`` are set on each
            token.
        :param keeporiginal: accepted for signature compatibility; it is not
            consulted by this tokenizer.
        :param removestops: forwarded to the ``Token`` constructor.
        :param start_pos: offset added to every token's ``pos``.
        :param start_char: offset added to every token's character offsets.
        :param mode: forwarded to the ``Token`` constructor.
        :param kwargs: forwarded to the ``Token`` constructor.
        """
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        # jieba.tokenize reports (word, start, end) for each segment, so the
        # offsets are exact even when the same word occurs several times in
        # the text.  Looking the word up with value.find() would always
        # return the first occurrence and mis-place every later repeat.
        # NOTE(review): search mode is expected to emit the same segments as
        # jieba.cut_for_search -- confirm against the installed jieba version.
        for w, begin, end in jieba.tokenize(value, mode='search'):
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                # As before, the position is the segment's character offset
                # within the text, shifted by start_pos.
                t.pos = start_pos + begin
            if chars:
                t.startchar = start_char + begin
                t.endchar = start_char + end
            yield t  # the same Token instance is yielded for each segment
def ChineseAnalyzer():
    """Build and return a fresh ``ChineseTokenizer`` instance.

    The returned object is what gets passed as the ``analyzer`` argument of
    a whoosh field.
    """
    tokenizer = ChineseTokenizer()
    return tokenizer
"""
测试脚本:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from whoosh.index import create_in
from whoosh.fields import *
from chinesetokenizer import ChineseAnalyzer
#from whoosh.analysis import RegexAnalyzer
#analyzer = RegexAnalyzer(ur"([\u4e00-\u9fa5])|(\w+(\.?\w+)*)")
analyzer = ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
ix = create_in("schema", schema)
writer = ix.writer()
writer.add_document(title=u"First document", path=u"/a", content=u"先生说我们都是好学生")
writer.add_document(title=u"Second document", path=u"/b", content=u"我们要树立科学发展观")
writer.commit()
with ix.searcher() as searcher:
results = searcher.find("content", u"发展")
if 0 != len(results):
for hit in results:
print hit.highlights("content")
运行结果:
先<b class="match term0">生</b>说我们都是好<b class="match term1">学</b><b class="match term0">生</b>
先生说我们都是好<b class="match term0">学生</b>
我们要树立科学<b class="match term0">发</b><b class="match term1">展</b>观
我们要树立科学<b class="match term0">发展</b>观
"""