This article collects typical code examples for the Python method whoosh.analysis.Token.endchar. If you have been wondering how exactly Token.endchar is used and how it works, the curated method examples below may help. You can also explore further usage examples for the containing class, whoosh.analysis.Token.
The following presents 14 code examples of Token.endchar, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
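For orientation before the examples: startchar and endchar are the character offsets of a token's start and end within the original text, and they are filled in when an analyzer is called with chars=True. A quick look using whoosh's built-in StandardAnalyzer:

from whoosh.analysis import StandardAnalyzer

# With chars=True each token records its span in the source string.
tokens = [(t.text, t.startchar, t.endchar)
          for t in StandardAnalyzer()(u"hello world", chars=True)]
# -> [(u'hello', 0, 5), (u'world', 6, 11)]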
Example 1: __call__
# Required imports: from whoosh.analysis import Token [as alias]
# Or: from whoosh.analysis.Token import endchar [as alias]
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0,
             tokenize=True, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        for m in self.tagger.parse(value):
            t.text = m.surface
            t.feature = m.feature
            # TODO: use base form.
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                t.startchar = start_char + m.start
                t.endchar = t.startchar + len(m.surface)
            yield t
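The __call__ methods in these examples normally live on a subclass of whoosh.analysis.Tokenizer; the surrounding class and the tagger setup are not shown. The wiring into a field looks like this, sketched here with the built-in RegexTokenizer standing in for the custom class:

from whoosh.analysis import RegexTokenizer, LowercaseFilter
from whoosh.fields import Schema, TEXT

# Stand-in chain; replace RegexTokenizer() with the custom tokenizer above.
analyzer = RegexTokenizer() | LowercaseFilter()
schema = Schema(body=TEXT(analyzer=analyzer, stored=True))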
Example 2: __call__
# Required imports: from whoosh.analysis import Token [as alias]
# Or: from whoosh.analysis.Token import endchar [as alias]
def __call__(self, text, **kargs):
    token = Token()
    seen = set()
    for (w, start_pos, stop_pos) in jieba.tokenize(text, mode='search'):
        w = w.strip()
        # Skip empty strings, duplicates, and punctuation.
        if not w or w in seen or w in punct:
            continue
        seen.add(w)
        # Skip single characters outside the accepted range.
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        # Yielding inside the tokenize loop keeps start_pos and stop_pos
        # paired with the word they belong to.
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
Example 3: __call__
# Required imports: from whoosh.analysis import Token [as alias]
# Or: from whoosh.analysis.Token import endchar [as alias]
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0,
             tokenize=True, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        offset = start_char
        byte_offset = 0
        # TODO: support other encodings
        byte = value.encode('utf-8')
        m = self.tagger.parseToNode(byte)
        while m:
            if len(m.surface) == 0:
                m = m.next
                continue
            t.text = m.surface.decode('utf-8')
            t.feature = m.feature
            # TODO: use base form.
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                s = byte_offset + m.rlength - m.length
                e = s + m.length
                t.startchar = offset + \
                    len(byte[byte_offset:s].decode('utf-8'))
                t.endchar = t.startchar + len(byte[s:e].decode('utf-8'))
                offset = t.endchar
                byte_offset = e
            m = m.next
            yield t
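The chars branch above has to translate MeCab's byte offsets into character offsets, because Token.startchar and Token.endchar count characters, not bytes. The same arithmetic in isolation, with no MeCab dependency:

value = u'日本語abc'
byte = value.encode('utf-8')   # each of the three kanji takes 3 bytes
s, e = 3, 9                    # byte span of u'本語' within `byte`
startchar = len(byte[:s].decode('utf-8'))             # 1
endchar = startchar + len(byte[s:e].decode('utf-8'))  # 3
assert value[startchar:endchar] == u'本語'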
Example 4: __call__
# Required imports: from whoosh.analysis import Token [as alias]
# Or: from whoosh.analysis.Token import endchar [as alias]
def __call__(self, text, **kargs):
    token = Token()
    start_pos = 0
    for w in group_words(text):
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = start_pos + len(w)
        start_pos = token.endchar
        yield token
Example 5: __call__
# Required imports: from whoosh.analysis import Token [as alias]
# Or: from whoosh.analysis.Token import endchar [as alias]
def __call__(self, text, **kargs):
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
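For reference, jieba.tokenize yields (word, start, end) triples, which is why start_pos and stop_pos map directly onto startchar and endchar; in search mode the longer compound word is emitted in addition to its parts. This is the example from jieba's own documentation:

import jieba

result = list(jieba.tokenize(u'永和服装饰品有限公司', mode='search'))
# -> [(u'永和', 0, 2), (u'服装', 2, 4), (u'饰品', 4, 6),
#     (u'有限', 6, 8), (u'公司', 8, 10), (u'有限公司', 6, 10)]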
Example 6: __call__
# Required imports: from whoosh.analysis import Token [as alias]
# Or: from whoosh.analysis.Token import endchar [as alias]
def __call__(self, text, **kargs):
    words = tokenize_1(text)
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
Example 7: __call__
# Required imports: from whoosh.analysis import Token [as alias]
# Or: from whoosh.analysis.Token import endchar [as alias]
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0,
             tokenize=True, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        if self.strip:
            strip = text_type.strip
        else:
            def strip(s):
                return s
        pos = start_pos
        startchar = start_char
        for s, l in ((strip(s), len(s))
                     for s in tinysegmenter.tokenize(value)):
            t.text = s
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                t.startchar = startchar
                startchar += l
                t.endchar = startchar
            yield t
Example 8: __call__
# Required imports: from whoosh.analysis import Token [as alias]
# Or: from whoosh.analysis.Token import endchar [as alias]
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    seglist = value.split(' ')
    offset = 0
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        # Search from the end of the previous word so that a repeated
        # word gets its own span, not the span of its first occurrence.
        found = value.find(w, offset)
        offset = found + len(w)
        if positions:
            t.pos = start_pos + found
        if chars:
            t.startchar = start_char + found
            t.endchar = start_char + found + len(w)
        yield t
Example 9: __call__
# Required imports: from whoosh.analysis import Token [as alias]
# Or: from whoosh.analysis.Token import endchar [as alias]
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    seglist = jieba.cut_for_search(value)  # segment with the jieba library
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        yield t  # yield each segmented token from the generator
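Note that value.find(w) always locates the first occurrence of w, so a word that appears more than once is given the span of its first occurrence every time. When exact spans matter, prefer a segmenter that reports offsets itself, as jieba.tokenize does in Examples 2 and 5.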
Example 10: __call__
# Required imports: from whoosh.analysis import Token [as alias]
# Or: from whoosh.analysis.Token import endchar [as alias]
def __call__(self, text, **kargs):
    words = _cuttor.tokenize(text, search=True)
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
Example 11: __call__
# Required imports: from whoosh.analysis import Token [as alias]
# Or: from whoosh.analysis.Token import endchar [as alias]
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0, mode='', **kwargs):
    t = Token(positions, chars, removestops=removestops, mode=mode,
              **kwargs)
    seglist = jieba.cut(value, cut_all=False)
    for word in seglist:
        t.original = t.text = word
        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(word)
        if chars:
            t.startchar = start_char + value.find(word)
            t.endchar = t.startchar + len(word)
        yield t
Example 12: _merge_matched_tokens
# Required imports: from whoosh.analysis import Token [as alias]
# Or: from whoosh.analysis.Token import endchar [as alias]
def _merge_matched_tokens(self, tokens):
    # Merge runs of overlapping matched tokens so that they can be
    # highlighted as a single fragment.
    token_ready = False
    for t in tokens:
        if not t.matched:
            yield t
            continue
        if not token_ready:
            token = Token(**t.__dict__)
            token_ready = True
        elif t.startchar <= token.endchar:
            if t.endchar > token.endchar:
                # Append only the part of t that extends past the
                # accumulated token (a negative-index slice).
                token.text += t.text[token.endchar - t.endchar:]
                token.endchar = t.endchar
        else:
            # No overlap: emit the accumulated token and start a new
            # one from the current token.
            yield token
            token = Token(**t.__dict__)
    if token_ready:
        yield token
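The slice token.text[token.endchar - t.endchar:] uses a negative index to keep only the part of the incoming token that extends past the merged one. The same arithmetic on two hand-built tokens:

from whoosh.analysis import Token

a = Token(chars=True)
a.text, a.startchar, a.endchar = u'whoo', 0, 4
b = Token(chars=True)
b.text, b.startchar, b.endchar = u'oosh', 2, 6

# b overlaps a (b.startchar <= a.endchar), so append only b's tail:
a.text += b.text[a.endchar - b.endchar:]   # u'whoo' + u'sh'
a.endchar = b.endchar
assert (a.text, a.startchar, a.endchar) == (u'whoosh', 0, 6)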
Example 13: __call__
# Required imports: from whoosh.analysis import Token [as alias]
# Or: from whoosh.analysis.Token import endchar [as alias]
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode,
              **kwargs)
    pynlpir.open(encoding='utf-8')  # opens and initializes the NLPIR API
    # pos_tagging=False makes segment() return plain strings instead of
    # (word, part-of-speech) tuples.
    seglist = pynlpir.segment(value, pos_tagging=False)
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        yield t  # yield each segmented token from the generator
Example 14: open
# Required imports: from whoosh.analysis import Token [as alias]
# Or: from whoosh.analysis.Token import endchar [as alias]
# 'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
# 'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
# 'to', 'us', 'we', 'when', 'will', 'with', 'yet',
# 'you', 'your', u'的', u'了', u'和', u'的', u'我', u'你', u'地', u'我们', u'我的', u'你们', u'你的', u'', '_'))
STOP_WORDS = frozenset(line.strip() for line in open("stopwords.dic", 'r'))
print 'stopwords'
accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")

class ChineseTokenizer(Tokenizer):
    def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token

def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
    return (ChineseTokenizer() | LowercaseFilter()
            | StopFilter(stoplist=stoplist, minsize=minsize)
            | StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize))