

Python whoosh.analysis Code Examples

This article collects typical usage examples of the whoosh.analysis module in Python. If you are unsure how whoosh.analysis is used in practice, or are looking for concrete examples, the curated code samples below may help. You can also explore further usage examples of the whoosh package it belongs to.


The following presents four code examples based on whoosh.analysis, sorted by popularity by default.
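For orientation, all four examples follow whoosh's custom-tokenizer pattern: subclass whoosh.analysis.Tokenizer, have __call__ yield whoosh.analysis.Token objects, and attach an instance to a TEXT field in a schema. The minimal sketch below (not taken from the project: the whitespace splitting, class name, and field name are illustrative assumptions, and the tokenize=False branch handled in the examples is omitted) shows that pattern end to end.

from whoosh.analysis import Token, Tokenizer
from whoosh.fields import Schema, TEXT

class SimpleWhitespaceTokenizer(Tokenizer):
    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        # whoosh reuses a single Token object and mutates it for each yield
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        pos = start_pos
        local = 0  # offset of the current search position within value
        for word in value.split():
            start = value.index(word, local)
            t.text = word
            if positions:
                t.pos = pos
            if chars:
                t.startchar = start_char + start
                t.endchar = start_char + start + len(word)
            yield t
            pos += 1
            local = start + len(word)

schema = Schema(content=TEXT(analyzer=SimpleWhitespaceTokenizer()))

Examples 1 and 2 replace the whitespace split with a Lucene analyzer; Examples 3 and 4 use an Arabic stemmer and a Chinese segmenter, respectively.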

Example 1: __init__

# Required module: import whoosh [as alias]
# or: from whoosh import analysis [as alias]
def __init__(self, luceneanalyzer):
        """
        :param expression: A regular expression object or string. Each match
            of the expression equals a token. Group 0 (the entire matched text)
            is used as the text of the token. If you require more complicated
            handling of the expression match, simply write your own tokenizer.
        :param gaps: If True, the tokenizer *splits* on the expression, rather
            than matching on the expression.
        """
        lucene.initVM()
        # luceneanalyzer is an analyzer class, e.g., StandardAnalyzer:
        # from org.apache.lucene.analysis.standard import StandardAnalyzer
        self.lanalyzer = luceneanalyzer() 
Developer: ChristopherLucas, Project: txtorg, Lines of code: 15, Source: fromlucene.py
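The enclosing class name does not appear in this excerpt; judging from the commented-out factory at the end of Example 2, it is likely called LuceneTokenizer, and that name is assumed in the hedged instantiation sketch below. A working PyLucene installation is required; note that __init__ starts the JVM itself and instantiates the analyzer class it is given.

import lucene
from org.apache.lucene.analysis.standard import StandardAnalyzer

# LuceneTokenizer is an assumed name for the class whose __init__ is shown above;
# the analyzer *class* is passed in, and __init__ calls it to create an instance.
tokenizer = LuceneTokenizer(StandardAnalyzer)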

Example 2: __call__

# Required module: import whoosh [as alias]
# or: from whoosh import analysis [as alias]
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                  mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: If True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%r is not unicode" % value

        tokenlist = []
        tokenStream = self.lanalyzer.tokenStream("contents", StringReader(value))
        #term = tokenStream.addAttribute(lucene.TermAttribute.class_)
        tokenStream.reset()
        
        if len(value)>0:
            while tokenStream.incrementToken():
                tokenlist.append(tokenStream.getAttribute(CharTermAttribute.class_).toString())
                #self.tokentext.insert(END, "[%s]" %(term.term()))
        

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            for (pos,text) in enumerate(tokenlist):
                # we may have some off by one errors
                # what is the starting character of the token?
                #start_char_t = value[start_char:].find(text)+start_char
                t.text = text
                #print pos, start_char_t, text
                if positions:
                    # Position tracking is not really supported here: start_pos
                    # is ignored and the enumeration index is used directly.
                    print("Unsupported!")
                    #t.pos = start_pos+pos
                    t.pos = pos
                if chars:
                    # Character offsets are not tracked either; the token's
                    # position index is stored as a placeholder offset.
                    print("Unsupported!")
                    t.startchar = pos
                    t.endchar = pos
                yield t
                # make the tokens
                # copying from https://bitbucket.org/mchaput/whoosh/src/c9ad870378a0f5167182349b64fc3e09c6ca12df/src/whoosh/analysis/tokenizers.py?at=default


#def LuceneAnalyzer():    
#    return LuceneTokenizer(luceneanalyzer) 
Developer: ChristopherLucas, Project: txtorg, Lines of code: 63, Source: fromlucene.py
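Continuing the assumptions from Example 1 (class name LuceneTokenizer, PyLucene installed, and StringReader and CharTermAttribute imported from the java.io and org.apache.lucene.analysis.tokenattributes wrappers), the tokenizer can be called directly like any other whoosh analyzer. Position and character-offset tracking are flagged as unsupported in the excerpt, so the sketch below only reads the token text; the sample sentence is an illustrative assumption.

tokenizer = LuceneTokenizer(StandardAnalyzer)
for token in tokenizer(u"The quick brown fox jumps over the lazy dog"):
    print(token.text)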

Example 3: __call__

# Required module: import whoosh [as alias]
# or: from whoosh import analysis [as alias]
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                  mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: If True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%r is not unicode" % value

        s = nielsenstemmer.stem(value,transliteration=False)
        tokenlist = s.split()

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            for (pos,text) in enumerate(tokenlist):
                # we may have some off by one errors
                # what is the starting character of the token?
                #start_char_t = value[start_char:].find(text)+start_char
                t.text = text
                #print pos, start_char_t, text
                if positions:
                    # Position tracking is not really supported here: start_pos
                    # is ignored and the enumeration index is used directly.
                    print("Unsupported!")
                    #t.pos = start_pos+pos
                    t.pos = pos
                if chars:
                    # Character offsets are not tracked either; the token's
                    # position index is stored as a placeholder offset.
                    print("Unsupported!")
                    t.startchar = pos
                    t.endchar = pos
                yield t
                # make the tokens
                # copying from https://bitbucket.org/mchaput/whoosh/src/c9ad870378a0f5167182349b64fc3e09c6ca12df/src/whoosh/analysis/tokenizers.py?at=default 
Developer: ChristopherLucas, Project: txtorg, Lines of code: 51, Source: arabic.py
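Example 3 reduces analysis to one step: stem the entire string with nielsenstemmer and split on whitespace. The sketch below isolates that step; the stem(value, transliteration=False) call mirrors the excerpt, while the Arabic sample string and the final print are illustrative assumptions.

import nielsenstemmer

# Mirrors the stemming + whitespace split performed inside __call__ above.
stemmed = nielsenstemmer.stem(u"النص العربي هنا", transliteration=False)
tokens = stemmed.split()
print(tokens)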

Example 4: __call__

# Required module: import whoosh [as alias]
# or: from whoosh import analysis [as alias]
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                  mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: If True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%r is not unicode" % value

        # test
        #fpath = '/Users/astorer/Dev/txtorg/examples/chinese/1.txt'
        #text = open(fpath).read()
        #value = unicode(text,encoding='utf-8')
        # Thanks, isnowfy!
        s = SnowNLP(value)
        tokenlist = s.words

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            for (pos,text) in enumerate(tokenlist):
                # we may have some off by one errors
                # what is the starting character of the token?
                start_char_t = value[start_char:].find(text)+start_char
                t.text = text
                #print pos, start_char_t, text
                if positions:
                    t.pos = start_pos+pos
                if chars:
                    t.startchar = start_char_t
                    t.endchar = start_char_t + len(text)
                yield t
                # make the tokens
                # copying from https://bitbucket.org/mchaput/whoosh/src/c9ad870378a0f5167182349b64fc3e09c6ca12df/src/whoosh/analysis/tokenizers.py?at=default 
Developer: ChristopherLucas, Project: txtorg, Lines of code: 53, Source: chinese.py
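Example 4 delegates Chinese word segmentation to SnowNLP and otherwise tracks positions and character offsets the way whoosh's built-in tokenizers do. The sketch below shows the segmentation call the tokenizer relies on and a hedged schema hookup; the class name ChineseTokenizer, the field name, and the sample sentence are assumptions, since only the __call__ method appears in the excerpt.

from snownlp import SnowNLP
from whoosh.fields import Schema, TEXT

# s.words is the list of segmented words the tokenizer above iterates over.
s = SnowNLP(u"搜索引擎需要对中文文本先分词")
print(s.words)

# ChineseTokenizer is an assumed name for the class whose __call__ is shown above.
schema = Schema(content=TEXT(analyzer=ChineseTokenizer()))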


Note: The whoosh.analysis examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their respective authors, and the source code copyright remains with those authors. Please consult the corresponding project's license before distributing or using the code; do not reproduce without permission.