

Python Token.text Code Examples

This article collects typical usage examples of whoosh.analysis.Token.text from Python open-source projects. If you are unsure what Token.text does or how to use it, the curated examples below should help; note that text is an attribute that tokenizers assign on each yielded Token, not a method you call. You can also explore other usage examples of the whoosh.analysis.Token class.


Six Token.text code examples are shown below, sorted by popularity by default.
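Before the individual examples, here is a minimal sketch of the pattern they all share: a whoosh tokenizer creates one Token, overwrites its text (and optionally pos) for each term, and yields the same object repeatedly. The whitespace splitting below is illustrative only and is not taken from any of the examples.

from whoosh.analysis import Token

def simple_tokens(value, positions=False, start_pos=0):
    # a single Token instance is reused for every yielded term
    t = Token(positions=positions)
    for i, word in enumerate(value.split()):
        t.text = word
        if positions:
            t.pos = start_pos + i
        yield t

print([t.text for t in simple_tokens(u"hello world")])  # -> [u'hello', u'world']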

Example 1: __call__

# Required import: from whoosh.analysis import Token
    def __call__(self, value, start_pos=0, positions=False, **kwargs):
        """
        Tokenizer behaviour:

        Input: u"text/x.moin.wiki;charset=utf-8"
        Output: u"text/x.moin.wiki;charset=utf-8", u"text", u"x.moin.wiki", u"charset=utf-8"

        Input: u"application/pdf"
        Output: u"application/pdf", u"application", u"pdf"

        :param value: the string to tokenize
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param positions: Whether to record token positions in the token.
        """
        assert isinstance(value, unicode), "{0!r} is not unicode".format(value)
        if u'/' not in value:  # add a '/' if the user left it out
            value += u'/'
        pos = start_pos
        tk = Token()
        tp = Type(value)
        # we need to yield the complete contenttype in one piece,
        # so we can find it with Term(CONTENTTYPE, contenttype):
        if tp.type is not None and tp.subtype is not None:
            # note: we do not use "value" directly, so Type.__unicode__ can normalize it:
            tk.text = unicode(tp)
            if positions:
                tk.pos = pos
                pos += 1
            yield tk
        # now yield the pieces:
        tk.text = tp.type
        if positions:
            tk.pos = pos
            pos += 1
        yield tk
        if tp.subtype is not None:
            tk.text = tp.subtype
            if positions:
                tk.pos = pos
                pos += 1
            yield tk
        for key, value in tp.parameters.items():
            tk.text = u"{0}={1}".format(key, value)
            if positions:
                tk.pos = pos
                pos += 1
            yield tk
Developer: pombredanne | Project: moin2 | Lines: 51 | Source: analyzers.py
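Assuming the __call__ above belongs to a whoosh Tokenizer subclass, hypothetically named ContentTypeTokenizer here (the class itself is not shown), iterating over it yields Token.text values matching the docstring:

tokenizer = ContentTypeTokenizer()  # hypothetical name for the class defining __call__ above
for tk in tokenizer(u"text/x.moin.wiki;charset=utf-8", positions=True):
    print(u"{0} {1}".format(tk.text, tk.pos))
# text/x.moin.wiki;charset=utf-8 0
# text 1
# x.moin.wiki 2
# charset=utf-8 3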

Example 2: __call__

# Required import: from whoosh.analysis import Token
 def __call__(self, value, positions=False, chars=False,
              keeporiginal=False, removestops=True,
              start_pos=0, start_char=0,
              tokenize=True, mode='', **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     else:
         pos = start_pos
         for m in self.tagger.parse(value):
             t.text = m.surface
             t.feature = m.feature
             # TODO: use base form.
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
                 pos += 1
             if chars:
                 t.startchar = start_char + m.start
                 t.endchar = t.startchar + len(m.surface)
             yield t
Developer: hideaki-t | Project: whoosh-igo | Lines: 34 | Source: IgoTokenizer.py
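A sketch of wiring this tokenizer into a whoosh schema, assuming (as the project name suggests) that self.tagger is an igo-python Tagger built from a local dictionary directory; the 'ipadic' path and the constructor signature are assumptions, not confirmed by the snippet:

import igo.tagger
from whoosh.fields import Schema, TEXT

tokenizer = IgoTokenizer(igo.tagger.Tagger('ipadic'))  # constructor signature assumed
schema = Schema(body=TEXT(analyzer=tokenizer))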

Example 3: __call__

# Required import: from whoosh.analysis import Token
    def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            # skip single-character tokens that fall outside the accepted character set
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token
Developer: 1ee7 | Project: piRobot | Lines: 16 | Source: analyzer.py
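This method closely mirrors the ChineseAnalyzer that jieba itself ships for whoosh, so in practice you can use that directly; accepted_chars in the example is assumed to be a module-level regex over accepted (e.g. CJK and alphanumeric) characters. A short usage sketch with jieba's bundled analyzer:

from jieba.analyse import ChineseAnalyzer
from whoosh.fields import Schema, TEXT

analyzer = ChineseAnalyzer()
schema = Schema(content=TEXT(stored=True, analyzer=analyzer))
for t in analyzer(u"我来到北京清华大学"):
    print(t.text)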

Example 4: __call__

# Required import: from whoosh.analysis import Token
    def __call__(self, value, mode='', positions=False, **kwargs):
        assert isinstance(value, unicode), "%r is not unicode" % value
        token = Token(**kwargs)
        tagger = MeCab.Tagger('mecabrc')
        result = tagger.parse(value.encode("utf8")).decode('utf8')

        cur = 0
        for match in re.compile(r"(\S+)\s+(\S+)\n").finditer(result):
            category = match.group(2).split(",")
            # keep only nouns (名詞), verbs (動詞), adjectives (形容詞) and adverbs (副詞)
            if category and category[0] in (u'名詞', u'動詞', u'形容詞', u'副詞'):
                token.text = match.group(1)
                token.pos = cur
                yield token
            cur += len(match.group(1))
Developer: faggion | Project: sample-codes | Lines: 18 | Source: mecab.py
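The regex above assumes MeCab's default output format: one surface-TAB-features line per morpheme, followed by a terminating EOS line (which has no second field, so the pattern skips it). With the common ipadic dictionary the parse result looks roughly like this:

import MeCab
tagger = MeCab.Tagger('mecabrc')
print(tagger.parse(u"今日はいい天気です".encode("utf8")).decode("utf8"))
# 今日	名詞,副詞可能,*,*,*,*,今日,キョウ,キョー
# は	助詞,係助詞,*,*,*,*,は,ハ,ワ
# いい	形容詞,自立,*,*,形容詞・イイ,基本形,いい,イイ,イイ
# 天気	名詞,一般,*,*,*,*,天気,テンキ,テンキ
# です	助動詞,*,*,*,特殊・デス,基本形,です,デス,デス
# EOS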

Example 5: __call__

# Required import: from whoosh.analysis import Token
 def __call__(self, value, positions=False, chars=False,
              keeporiginal=False, removestops=True,
              start_pos=0, start_char=0,
              tokenize=True, mode='', **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     else:
         pos = start_pos
         offset = start_char
         byte_offset = 0
         # TODO: support other encodings
         byte = value.encode('utf-8')
         m = self.tagger.parseToNode(byte)
         while m:
             if len(m.surface) == 0:
                 m = m.next
                 continue
             t.text = m.surface.decode('utf-8')
             t.feature = m.feature
             # TODO: use base form.
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
                 pos += 1
             if chars:
                 s = byte_offset + m.rlength - m.length
                 e = s + m.length
                 t.startchar = offset + \
                     len(byte[byte_offset:s].decode('utf-8'))
                 t.endchar = t.startchar + len(byte[s:e].decode('utf-8'))
                 offset = t.endchar
                 byte_offset = e
             m = m.next
             yield t
Developer: knzm | Project: whoosh-igo | Lines: 48 | Source: MeCabTokenizer.py
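The startchar/endchar arithmetic above converts MeCab's byte offsets back to character offsets by decoding byte slices and taking their length (m.rlength - m.length is the number of skipped bytes, such as whitespace, before the surface). A self-contained check of the idea, independent of MeCab:

value = u"日本 語"
byte = value.encode('utf-8')
# u"語" occupies bytes 7..10: two 3-byte characters plus one space precede it
s, e = 7, 10
startchar = len(byte[0:s].decode('utf-8'))            # -> 3
endchar = startchar + len(byte[s:e].decode('utf-8'))  # -> 4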

Example 6: __call__

# Required import: from whoosh.analysis import Token
    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0,
                 tokenize=True, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            if self.strip:
                strip = text_type.strip
            else:
                def strip(s):
                    return s

            pos = start_pos
            startchar = start_char
            for s, l in \
                    ((strip(s), len(s)) for s in
                     tinysegmenter.tokenize(value)):
                t.text = s
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                if chars:
                    t.startchar = startchar
                    startchar += l
                    t.endchar = startchar
                yield t
Developer: hideaki-t | Project: whoosh-igo | Lines: 42 | Source: TinySegmenterTokenizer.py
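For reference, a sketch of the segmenter driving this tokenizer. The PyPI tinysegmenter package exposes a TinySegmenter class, so the module-level tinysegmenter.tokenize() call above is assumed to be an instance or alias set up elsewhere in the project:

import tinysegmenter

segmenter = tinysegmenter.TinySegmenter()
print(segmenter.tokenize(u"私の名前は中野です"))
# -> [u'私', u'の', u'名前', u'は', u'中野', u'です']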


Note: the whoosh.analysis.Token.text examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets come from open-source projects by their respective authors, who retain copyright in the source code; consult each project's license before distributing or reusing the code, and do not republish without permission.