This article collects typical usage examples of the whoosh.analysis module in Python. If you are wondering how whoosh.analysis is used in practice, or are looking for concrete examples, the selected code samples below may help. You can also explore further usage examples from the whoosh package itself.
Four code examples involving whoosh.analysis are shown below, ordered by popularity by default.
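For context, a whoosh analyzer is usually built by chaining a tokenizer with one or more filters and attaching the result to a schema field. The sketch below uses only stock whoosh components (RegexTokenizer, LowercaseFilter); a custom tokenizer such as the ones in the examples that follow plugs in the same way. The field name content is an illustrative choice, not something taken from the examples.

from whoosh.fields import Schema, TEXT
from whoosh.analysis import RegexTokenizer, LowercaseFilter

# Chain a tokenizer and a filter into an analyzer, then attach it to a TEXT field.
my_analyzer = RegexTokenizer() | LowercaseFilter()
schema = Schema(content=TEXT(analyzer=my_analyzer))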
Example 1: __init__
# Required import: import whoosh [as alias]
# Or: from whoosh import analysis [as alias]
# This snippet also needs PyLucene for lucene.initVM():
import lucene
def __init__(self, luceneanalyzer):
"""
:param luceneanalyzer: A PyLucene analyzer class, for example
    StandardAnalyzer from org.apache.lucene.analysis.standard. The
    class is instantiated here and used by __call__ to produce Lucene
    token streams for the text being analyzed.
"""
lucene.initVM()
# luceneanalyzer is an analyzer class, e.g., StandardAnalyzer
# from org.apache.lucene.analysis.standard import StandardAnalyzer
self.lanalyzer = luceneanalyzer()
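A minimal usage sketch for this constructor, assuming it belongs to a tokenizer class named LuceneTokenizer (the name used by the commented-out LuceneAnalyzer helper at the end of Example 2) and that PyLucene is installed; the exact import path for StandardAnalyzer depends on the PyLucene version, and the JVM is started inside __init__ via lucene.initVM():

# Hypothetical usage; LuceneTokenizer is the assumed name of the class this __init__ belongs to.
from org.apache.lucene.analysis.standard import StandardAnalyzer

tokenizer = LuceneTokenizer(StandardAnalyzer)  # pass the analyzer class, not an instance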
Example 2: __call__
# Required import: import whoosh [as alias]
# Or: from whoosh import analysis [as alias]
# Additional imports this snippet relies on (the Java-side paths assume PyLucene 4+):
from whoosh.analysis import Token
from whoosh.compat import text_type
from java.io import StringReader
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
removestops=True, start_pos=0, start_char=0, tokenize=True,
mode='', **kwargs):
"""
:param value: The unicode string to tokenize.
:param positions: Whether to record token positions in the token.
:param chars: Whether to record character offsets in the token.
:param start_pos: The position number of the first token. For example,
if you set start_pos=2, the tokens will be numbered 2,3,4,...
instead of 0,1,2,...
:param start_char: The offset of the first character of the first
token. For example, if you set start_char=2, the text "aaa bbb"
will have chars (2,5),(6,9) instead of (0,3),(4,7).
:param tokenize: if True, the text should be tokenized.
"""
assert isinstance(value, text_type), "%r is not unicode" % value
tokenlist = []
tokenStream = self.lanalyzer.tokenStream("contents", StringReader(value))
#term = tokenStream.addAttribute(lucene.TermAttribute.class_)
tokenStream.reset()
if len(value)>0:
while tokenStream.incrementToken():
tokenlist.append(tokenStream.getAttribute(CharTermAttribute.class_).toString())
#self.tokentext.insert(END, "[%s]" %(term.term()))
t = Token(positions, chars, removestops=removestops, mode=mode,
**kwargs)
if not tokenize:
t.original = t.text = value
t.boost = 1.0
if positions:
t.pos = start_pos
if chars:
t.startchar = start_char
t.endchar = start_char + len(value)
yield t
else:
for (pos,text) in enumerate(tokenlist):
# we may have some off by one errors
# what is the starting character of the token?
#start_char_t = value[start_char:].find(text)+start_char
t.text = text
#print pos, start_char_t, text
if positions:
print "Unsupported!"
#t.pos = start_pos+pos
t.pos = pos
if chars:
print "Unsupported!"
t.startchar = pos
t.endchar = pos
yield t
# make the tokens
# copying from https://bitbucket.org/mchaput/whoosh/src/c9ad870378a0f5167182349b64fc3e09c6ca12df/src/whoosh/analysis/tokenizers.py?at=default
#def LuceneAnalyzer():
# return LuceneTokenizer(luceneanalyzer)
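Once constructed, the tokenizer is called like any other whoosh tokenizer and the returned generator is iterated for tokens. A hypothetical consumption sketch, reusing the tokenizer object from the sketch under Example 1; only token.text is meaningful here, since the positions and chars branches above merely print "Unsupported!":

# Hypothetical: 'tokenizer' is the LuceneTokenizer instance from the Example 1 sketch.
for token in tokenizer(u"The quick brown fox jumps over the lazy dog"):
    print(token.text)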
Example 3: __call__
# Required import: import whoosh [as alias]
# Or: from whoosh import analysis [as alias]
# Additional imports this snippet relies on:
import nielsenstemmer
from whoosh.analysis import Token
from whoosh.compat import text_type
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
removestops=True, start_pos=0, start_char=0, tokenize=True,
mode='', **kwargs):
"""
:param value: The unicode string to tokenize.
:param positions: Whether to record token positions in the token.
:param chars: Whether to record character offsets in the token.
:param start_pos: The position number of the first token. For example,
if you set start_pos=2, the tokens will be numbered 2,3,4,...
instead of 0,1,2,...
:param start_char: The offset of the first character of the first
token. For example, if you set start_char=2, the text "aaa bbb"
will have chars (2,5),(6,9) instead of (0,3),(4,7).
:param tokenize: if True, the text should be tokenized.
"""
assert isinstance(value, text_type), "%r is not unicode" % value
s = nielsenstemmer.stem(value,transliteration=False)
tokenlist = s.split()
t = Token(positions, chars, removestops=removestops, mode=mode,
**kwargs)
if not tokenize:
t.original = t.text = value
t.boost = 1.0
if positions:
t.pos = start_pos
if chars:
t.startchar = start_char
t.endchar = start_char + len(value)
yield t
else:
for (pos,text) in enumerate(tokenlist):
# we may have some off by one errors
# what is the starting character of the token?
#start_char_t = value[start_char:].find(text)+start_char
t.text = text
#print pos, start_char_t, text
if positions:
print "Unsupported!"
#t.pos = start_pos+pos
t.pos = pos
if chars:
print "Unsupported!"
t.startchar = pos
t.endchar = pos
yield t
# make the tokens
# copying from https://bitbucket.org/mchaput/whoosh/src/c9ad870378a0f5167182349b64fc3e09c6ca12df/src/whoosh/analysis/tokenizers.py?at=default
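The start_pos / start_char semantics described in the docstring (but only partially implemented above) can be seen with whoosh's stock RegexTokenizer; this small illustration is not part of the example itself:

from whoosh.analysis import RegexTokenizer

# With start_char=2, "aaa bbb" yields character spans (2, 5) and (6, 9),
# matching the docstring's example.
rt = RegexTokenizer()
for tok in rt(u"aaa bbb", chars=True, start_char=2):
    print(tok.text, tok.startchar, tok.endchar)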
Example 4: __call__
# Required import: import whoosh [as alias]
# Or: from whoosh import analysis [as alias]
# Additional imports this snippet relies on:
from snownlp import SnowNLP
from whoosh.analysis import Token
from whoosh.compat import text_type
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
removestops=True, start_pos=0, start_char=0, tokenize=True,
mode='', **kwargs):
"""
:param value: The unicode string to tokenize.
:param positions: Whether to record token positions in the token.
:param chars: Whether to record character offsets in the token.
:param start_pos: The position number of the first token. For example,
if you set start_pos=2, the tokens will be numbered 2,3,4,...
instead of 0,1,2,...
:param start_char: The offset of the first character of the first
token. For example, if you set start_char=2, the text "aaa bbb"
will have chars (2,5),(6,9) instead of (0,3),(4,7).
:param tokenize: if True, the text should be tokenized.
"""
assert isinstance(value, text_type), "%r is not unicode" % value
# test
#fpath = '/Users/astorer/Dev/txtorg/examples/chinese/1.txt'
#text = open(fpath).read()
#value = unicode(text,encoding='utf-8')
# Thanks, isnowfy!
s = SnowNLP(value)
tokenlist = s.words
t = Token(positions, chars, removestops=removestops, mode=mode,
**kwargs)
if not tokenize:
t.original = t.text = value
t.boost = 1.0
if positions:
t.pos = start_pos
if chars:
t.startchar = start_char
t.endchar = start_char + len(value)
yield t
else:
for (pos,text) in enumerate(tokenlist):
# we may have some off by one errors
# what is the starting character of the token?
start_char_t = value[start_char:].find(text)+start_char
t.text = text
#print pos, start_char_t, text
if positions:
t.pos = start_pos+pos
if chars:
t.startchar = start_char_t
t.endchar = start_char_t + len(text)
yield t
# make the tokens
# copying from https://bitbucket.org/mchaput/whoosh/src/c9ad870378a0f5167182349b64fc3e09c6ca12df/src/whoosh/analysis/tokenizers.py?at=default
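Here the segmentation itself is delegated to SnowNLP, whose words property splits Chinese text into a word list that the loop above wraps in whoosh Token objects. A small standalone sketch of that underlying call (the sample sentence is just an illustration):

from snownlp import SnowNLP

# SnowNLP segments the sentence into a list of words.
s = SnowNLP(u"这是一个简单的分词示例")
print(s.words)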