

Python Tokenizer.tokenize Method Code Examples

This article collects typical usage examples of the Python tokenizer.Tokenizer.tokenize method. If you have been wondering how to use Tokenizer.tokenize, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of the enclosing tokenizer.Tokenizer class.


The sections below present 15 code examples of the Tokenizer.tokenize method, sorted by popularity by default.
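
Across the examples, the shared pattern is to construct a project-specific Tokenizer and call its tokenize method on a string; the constructor arguments and return types vary from project to project. The following is a minimal sketch of that shared pattern only, assuming a no-argument Tokenizer whose tokenize() returns an iterable of tokens, and is not the API of any particular project below.

# Minimal sketch of the shared pattern (assumed API: a no-argument Tokenizer
# whose tokenize() returns an iterable of tokens; real constructors vary).
from tokenizer import Tokenizer

tokenizer = Tokenizer()
tokens = tokenizer.tokenize("a short line of text to split into tokens")
for token in tokens:
    print(token)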

Example 1: testExecutionTreeWithItemAssignment

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
    def testExecutionTreeWithItemAssignment(self):

        c = ExpressionCompiler()
        tokenizer = Tokenizer()

        tokenizer.tokenize("A[B]= 1 + R")

        tokenizer.next()

        expr = c.compile(tokenizer)

        exec_tree = expr.get_execution_tree()

        print "Expression Tree %s\n" % (exec_tree)

        self.assertEqual(
            "( = ( item_assign ( literal A ) ( index ( literal B ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )", exec_tree
        )

        # a little bit more complex
        tokenizer.tokenize("A[B+(C*3)+1]= 1 + R")

        tokenizer.next()

        expr = c.compile(tokenizer)

        exec_tree = expr.get_execution_tree()

        print "Expression Tree %s\n" % (exec_tree)

        self.assertEqual(
            "( = ( item_assign ( literal A ) ( index ( + ( + ( literal B ) ( * ( literal C ) ( literal 3.0 ) ) ) ( literal 1.0 ) ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )",
            exec_tree,
        )
Author: pombredanne, Project: java-balivernes, Lines: 36, Source: expr_compiler.py

Example 2: testEvaluateFactors

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
    def testEvaluateFactors(self):

        c = ExpressionCompiler()

        tokenizer = Tokenizer()
        tokenizer.tokenize("7*7")
        tokenizer.next()

        expr = c.compile(tokenizer)

        result = expr.evaluate()

        print "result = %s\n" % (result)

        self.assertEqual(49.0, result)

        tokenizer.tokenize("7*7/7")
        tokenizer.next()

        expr = c.compile(tokenizer)

        result = expr.evaluate()

        print "result = %s\n" % (result)

        self.assertEqual(7.0, result)
Author: pombredanne, Project: java-balivernes, Lines: 28, Source: expr_compiler.py

Example 3: testEvaluateNegation

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
    def testEvaluateNegation(self):

        c = ExpressionCompiler()
        tokenizer = Tokenizer()

        tokenizer.tokenize("not 0")
        tokenizer.next()

        expr = c.compile(tokenizer)

        result = expr.evaluate()

        print "result = %s\n" % (result)

        self.assertEqual(1, result)
Author: pombredanne, Project: java-balivernes, Lines: 17, Source: expr_compiler.py

Example 4: _classify

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
    def _classify(self, tokens, languages):
        """
        Internal: Guess language of data

        data      - Array of tokens or String data to analyze.
        languages - Array of language name Strings to restrict to.

        Returns sorted Array of result pairs. Each pair contains the
        String language name and a Float score.
        """
        if tokens is None:
            return []

        if isinstance(tokens, basestring):
            tokens = Tokenizer.tokenize(tokens)

        scores = {}
        if self.verbosity >= 2:
            self.dump_all_tokens(tokens, languages)
        for language in languages:
            scores[language] = self.tokens_probability(tokens, language) + self.language_probability(language)
            if self.verbosity >= 1:
                print '%10s = %10.3f + %7.3f = %10.3f\n' % (language,
                                                            self.tokens_probability(tokens, language),
                                                            self.language_probability(language),
                                                            scores[language])
        return sorted(scores.iteritems(), key=lambda t: t[1], reverse=True)
Author: arnauorriols, Project: plangclassifier, Lines: 29, Source: classifier.py

Example 5: train

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
    def train(cls, db, language, data):
        """
        Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token,
        per-language.  See also dump_all_tokens, below.

        Public: Train classifier that data is a certain language.

          db       - Hash classifier database object
          language - String language of data
          data     - String contents of file

          Examples

            Classifier.train(db, 'Ruby', "def hello; end")

          Returns nothing.
        """
        tokens = Tokenizer.tokenize(data)
        db['tokens_total'] = db.get('tokens_total', 0)
        db['languages_total'] = db.get('languages_total', 0)
        db['tokens'] = db.get('tokens', {})
        db['language_tokens'] = db.get('language_tokens', {})
        db['languages'] = db.get('languages', {})

        for token in tokens:
            db['tokens'][language] = db['tokens'].get(language, {})
            db['tokens'][language][token] = db['tokens'][language].get(token, 0)
            db['tokens'][language][token] += 1
            db['language_tokens'][language] = db['language_tokens'].get(language, 0)
            db['language_tokens'][language] += 1
            db['tokens_total'] += 1

        db['languages'][language] = db['languages'].get(language, 0)
        db['languages'][language] += 1
        db['languages_total'] += 1
Author: arnauorriols, Project: plangclassifier, Lines: 37, Source: classifier.py
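
As a rough usage sketch for the train method above (an assumption, not part of the original project: the db object is taken to be a plain dict, which the db.get(..., default) calls make safe to start empty):

# Hypothetical usage sketch: start from an empty dict and train on two snippets.
db = {}
Classifier.train(db, 'Ruby', "def hello; end")
Classifier.train(db, 'Python', "def hello(): pass")

# After training, db holds the aggregate counts that _classify relies on:
#   db['languages']        - how many samples were seen per language
#   db['language_tokens']  - total token count per language
#   db['tokens'][lang]     - per-token counts within each language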

Example 6: main

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
def main():
    ## args
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--reviews', required=True, help='Review data file')
    parser.add_argument('-o', '--out', required=True, help='Inverted index output file')
    parser.add_argument('-s', '--stop', required=True, help='Stopword list')
    opts = parser.parse_args()

    ## Output file
    csv_writer = csv.writer(open(opts.out, 'w'), delimiter="\t")
    csv_writer.writerow(['token', 'business_id', 'review_id', 'position', '...'])

    ## Tokenizer
    tk = Tokenizer(opts.stop)
    token_map = defaultdict(list)

    ## Tokenize review texts
    # for each word in the vocabulary (in this case all words found in all reviews):
    # business id, review id, and position of each term occurrence
    # instead of using the review id, uses the line on which the review occurs as a unique identifier
    reviews = open(opts.reviews)
    for review_num, line in enumerate(reviews):
        review = json.loads(line)
        business_id = review['business_id'].encode('utf-8')
        tokens = tk.tokenize(review['text'])
        for position, word in enumerate(tokens):
            token_map[word].append((business_id, review_num, position))

    ## Print sorted inverted index
    for token in sorted(token_map):
        row = [token]
        row.extend(token_map[token])
        csv_writer.writerow(row)
Author: jschear, Project: cs1951a-final, Lines: 35, Source: create_index.py

Example 7: _tokenize_tweet

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
 def _tokenize_tweet(self, tweet):
     """
     Input: tweet (String)
     Output: List of tokens
     """
     tok = Tokenizer(preserve_case=False)
     return tok.tokenize(tweet)
Author: Chouffe, Project: senti-tweet, Lines: 9, Source: tools.py

Example 8: train

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
    def train(cls, db, language, data):
        """
        Public: Train classifier that data is a certain language.

          db       - Hash classifier database object
          language - String language of data
          data     - String contents of file

          Examples

            Classifier.train(db, 'Ruby', "def hello; end")

          Returns nothing.
        """
        tokens = Tokenizer.tokenize(data)
        db['tokens_total'] = db.get('tokens_total', 0)
        db['languages_total'] = db.get('languages_total', 0)
        db['tokens'] = db.get('tokens', {})
        db['language_tokens'] = db.get('language_tokens', {})
        db['languages'] = db.get('languages', {})

        for token in tokens:
            db['tokens'][language] = db['tokens'].get(language, {})
            db['tokens'][language][token] = db['tokens'][language].get(token, 0)
            db['tokens'][language][token] += 1
            db['language_tokens'][language] = db['language_tokens'].get(language, 0)
            db['language_tokens'][language] += 1
            db['tokens_total'] += 1

        db['languages'][language] = db['languages'].get(language, 0)
        db['languages'][language] += 1
        db['languages_total'] += 1
Author: arthur503, Project: linguist, Lines: 34, Source: classifier.py

Example 9: __init__

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
class Preprocesser:

    def __init__(self, lower=True, punctuation=True, digits=True, stop=True, min_length=3,
                 pos_tag=False, lemmatization=True):

        self.lemma = lemmatization
        self.pos_tag = pos_tag

        self.tokenizer = Tokenizer(lower, punctuation, digits)
        self.token_filter = TokenFilter(stop, min_length)
        if pos_tag or lemmatization:
            self.postagger = Postagger()
            print dir(self.postagger)
        if lemmatization:
            self.Lemmatizer = Lemmatizer()

    def process(self, text):
        words = self.tokenizer.tokenize(text)
        words = self.token_filter.filter(words)
        if self.lemma:
            tags = self.postagger.tags2lemmatags(self.postagger.tags(words))
            result = self.Lemmatizer.lemma(words, tags)
        if self.pos_tag:
            tags = self.postagger.tags(words)
            result = tags
        return result
Author: largelymfs, Project: en_corpora_predealer, Lines: 28, Source: main.py
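
A hedged usage sketch for the Preprocesser above, assuming the constructor defaults shown (lowercasing, punctuation and digit stripping, stop-word filtering, lemmatization on, POS tagging off), so process() returns lemmatized tokens:

# Hypothetical usage sketch relying only on the defaults shown in __init__.
pre = Preprocesser()
lemmas = pre.process("The cats were running across the gardens")
print(lemmas)  # expected: a list of filtered, lemmatized tokens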

Example 10: InvertedIndex

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
class InvertedIndex():

    def __init__(self):
        self.invertedindex = {}
        self.lexicon = Lexicon()
        self.tokenizer = Tokenizer()
        self.doc_reader = DocReader()
        self.build_index()

    def build_index(self):
        #comments?
        cache = self.doc_reader.get_cache()
        docs = self.doc_reader.read_docs(cache)
        print "\nINVERTEDINDEX : Indexing %d documents..\n" % len(docs)
        for d in range(len(docs)):
            print "Indexing document '%s'" % (settings.PATH_DOCS + str(d))
            self.add_document(docs[d], d)

        print "Indexed total %d unique terms" % self.lexicon.size()

    def get_postinglist(self, lex_id):
        return self.invertedindex[lex_id]
            
    def add_document(self, doc, document_id):
        """FIXME: 
        -Needs doc 
        -Too slow?
        -Remove stop words
        -Reduce number of tokens
        """
        tokens = self.tokenizer.tokenize(doc)
        
        for t in tokens:
            lex_id = self.lexicon.lookup(t.get_value())

            if(lex_id == settings.INVALID):
                lex_id = self.lexicon.add_value(t.get_value())
                pl = PostingList()
                pl.append_posting(Posting(document_id, t.get_position()))
                self.invertedindex[lex_id] = pl
            else:
                pl = self.get_postinglist(lex_id)
    
            if pl.get_last_posting().get_document_id() != document_id:
                pl.append_posting(Posting(document_id, t.get_position()))
            else:
                p = pl.get_last_posting()
                p.append_position(t.get_position())
           
    def size(self):
        return len(self.invertedindex)

    def debugprint(self):
        voc = self.lexicon.get_vocabulary()
        for v in voc:
            lid = self.lexicon.lookup(v)
            pl = self.get_postinglist(lid)
            print "[%s]" % v
            pl.info()
Author: johnro, Project: bacon, Lines: 61, Source: invertedindex.py
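
A hedged usage sketch for the class above; the constructor already builds the index from DocReader's cached documents, so afterwards a query is just a lexicon lookup. The term "bacon" and the use of the project's settings module are placeholders based only on what the snippet shows:

# Hypothetical usage sketch: indexing happens eagerly in __init__.
index = InvertedIndex()
print("indexed terms: %d" % index.size())

lex_id = index.lexicon.lookup("bacon")      # arbitrary example term
if lex_id != settings.INVALID:
    index.get_postinglist(lex_id).info()    # print that term's posting list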

Example 11: tokenize

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
    def tokenize(self, **kwargs):
        """
        Returns the tokenized string using a parser.
        """

        string_tokenizer = Tokenizer()

        return string_tokenizer.tokenize(kwargs.get("text"), kwargs.get("parser"))
Author: DarkmatterVale, Project: regex4dummies, Lines: 10, Source: toolkit.py

Example 12: predict

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
 def predict(self, sentence):
   if self._net is None or self._trainer is None:
     raise Exception('Must train first')
   vectors, characters = Tokenizer.tokenize(sentence)
   if len(vectors) == 0:
     return None
   input = np.array(self._code(vectors[0]),
                    ndmin=2,
                    dtype=np.float32)
   result = self._net.predict(input)
   return self.languages[result[0]]
Author: alpo, Project: examples-in-python, Lines: 13, Source: network.py

Example 13: SimpleTokenizer

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
class SimpleTokenizer(object):
    def __init__(self, convert_to_base_form=True, normalize_number=False,
                 append_pos=False):
        self.tokenizer = Tokenizer(convert_to_base_form=convert_to_base_form,
                                   normalize_number=normalize_number,
                                   append_pos=append_pos)

    def tokenize(self, review):
        nested_tokens = [self.tokenizer.tokenize(sentence)
                         for sentence in review['text']]
        tokens = [token for sublist in nested_tokens for token in sublist]
        return tokens
Author: mrorii, Project: language-of-beauty, Lines: 14, Source: common.py
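
A hedged usage sketch for SimpleTokenizer; the list comprehension above implies that review['text'] is an iterable of sentence strings, which is assumed here:

# Hypothetical usage sketch, assuming review['text'] is a list of sentences.
st = SimpleTokenizer(convert_to_base_form=True)
review = {'text': [u'First sentence of the review.', u'Second sentence.']}
tokens = st.tokenize(review)   # flattened list of tokens from all sentences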

Example 14: testExecutionTreeWithTerms

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
    def testExecutionTreeWithTerms(self):

        c = ExpressionCompiler()
        tokenizer = Tokenizer()

        tokenizer.tokenize("A=1")
        tokenizer.next()

        expr = c.compile(tokenizer)

        self.assertEqual("( = ( literal A ) ( literal 1.0 ) )", expr.get_execution_tree())

        tokenizer.tokenize("A=1 and B>10")
        tokenizer.next()

        expr = c.compile(tokenizer)

        print "Execution Tree = %s\n" % (expr.get_execution_tree())

        self.assertEqual(
            "( and ( = ( literal A ) ( literal 1.0 ) ) ( > ( literal B ) ( literal 10.0 ) ) )",
            expr.get_execution_tree(),
        )

        tokenizer.tokenize("(A=1 and B>10) or (C > 10)")
        tokenizer.next()

        expr = c.compile(tokenizer)

        print "Execution Tree = %s\n" % (expr.get_execution_tree())
Author: pombredanne, Project: java-balivernes, Lines: 32, Source: expr_compiler.py

Example 15: analyze

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
def analyze(string):
    scanner = Tokenizer()
    list_of_tokens= scanner.tokenize(string)
    print "-------------"
    print "TOKEN LIST:"
    print list_of_tokens
    parser = QueryParser()
    print "----------------"
    print "PARSING RESULT"
    print "----------------"
    print parser.parse(list_of_tokens)

    semparser = QuerySemanticParser(parser.parse(list_of_tokens))
    semparser.parse()
Author: dav009, Project: truthgraph, Lines: 16, Source: main.py


Note: The tokenizer.Tokenizer.tokenize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by their respective developers, and copyright remains with the original authors. For redistribution and use, please refer to each project's license; do not reproduce without permission.