This article collects typical usage examples of the tokenizer.Tokenizer.tokenize method in Python. If you have been wondering how exactly Python's Tokenizer.tokenize works, how to call it, or what real-world uses look like, the hand-picked code samples here should help. You can also explore further usage examples of the enclosing class, tokenizer.Tokenizer.
Fifteen code examples of Tokenizer.tokenize are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
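Before looking at the individual examples, here is a minimal sketch of the common calling pattern. It assumes a hypothetical Tokenizer whose tokenize() simply returns a list of string tokens; the real classes in the examples below differ in their constructor arguments, return types, and in whether tokenize() is called on an instance or on the class itself.

# Minimal usage sketch only: the Tokenizer assumed here is hypothetical and
# returns a plain list of string tokens; the projects below vary in their
# constructor arguments and return types.
from tokenizer import Tokenizer

tokenizer = Tokenizer()
tokens = tokenizer.tokenize("A simple piece of text")
print tokens  # e.g. ['A', 'simple', 'piece', 'of', 'text']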
Example 1: testExecutionTreeWithItemAssignment
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
def testExecutionTreeWithItemAssignment(self):
    c = ExpressionCompiler()
    tokenizer = Tokenizer()
    tokenizer.tokenize("A[B]= 1 + R")
    tokenizer.next()
    expr = c.compile(tokenizer)
    exec_tree = expr.get_execution_tree()
    print "Expression Tree %s\n" % (exec_tree)
    self.assertEqual(
        "( = ( item_assign ( literal A ) ( index ( literal B ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )", exec_tree
    )

    # a little bit more complex
    tokenizer.tokenize("A[B+(C*3)+1]= 1 + R")
    tokenizer.next()
    expr = c.compile(tokenizer)
    exec_tree = expr.get_execution_tree()
    print "Expression Tree %s\n" % (exec_tree)
    self.assertEqual(
        "( = ( item_assign ( literal A ) ( index ( + ( + ( literal B ) ( * ( literal C ) ( literal 3.0 ) ) ) ( literal 1.0 ) ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )",
        exec_tree,
    )
Example 2: testEvaluateFactors
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
def testEvaluateFactors(self):
    c = ExpressionCompiler()
    tokenizer = Tokenizer()
    tokenizer.tokenize("7*7")
    tokenizer.next()
    expr = c.compile(tokenizer)
    result = expr.evaluate()
    print "result = %s\n" % (result)
    self.assertEqual(49.0, result)

    tokenizer.tokenize("7*7/7")
    tokenizer.next()
    expr = c.compile(tokenizer)
    result = expr.evaluate()
    print "result = %s\n" % (result)
    self.assertEqual(7.0, result)
Example 3: testEvaluateNegation
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
def testEvaluateNegation(self):
    c = ExpressionCompiler()
    tokenizer = Tokenizer()
    tokenizer.tokenize("not 0")
    tokenizer.next()
    expr = c.compile(tokenizer)
    result = expr.evaluate()
    print "result = %s\n" % (result)
    self.assertEqual(1, result)
Example 4: _classify
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
def _classify(self, tokens, languages):
    """
    Internal: Guess language of data

    data      - Array of tokens or String data to analyze.
    languages - Array of language name Strings to restrict to.

    Returns sorted Array of result pairs. Each pair contains the
    String language name and a Float score.
    """
    if tokens is None:
        return []
    if isinstance(tokens, basestring):
        tokens = Tokenizer.tokenize(tokens)

    scores = {}
    if self.verbosity >= 2:
        self.dump_all_tokens(tokens, languages)
    for language in languages:
        scores[language] = self.tokens_probability(tokens, language) + self.language_probability(language)
        if self.verbosity >= 1:
            print '%10s = %10.3f + %7.3f = %10.3f\n' % (language,
                                                        self.tokens_probability(tokens, language),
                                                        self.language_probability(language),
                                                        scores[language])
    return sorted(scores.iteritems(), key=lambda t: t[1], reverse=True)
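The scoring rule here is worth spelling out: each candidate language receives tokens_probability plus language_probability (log-probabilities in linguist-style classifiers), and candidates come back sorted best-first. The self-contained toy below reproduces just that rule; the probability tables, smoothing floor, and resulting values are assumptions for illustration, not part of the example above.

# Toy re-implementation of the scoring rule used in _classify() above.
# The probability tables and the smoothing floor are made up for illustration.
import math

token_probs = {'Ruby': {'def': 0.4, 'end': 0.4}, 'Python': {'def': 0.5}}
language_probs = {'Ruby': 0.5, 'Python': 0.5}

def score(tokens, language):
    # Unknown tokens get a small floor probability (an assumption here,
    # standing in for whatever smoothing tokens_probability applies).
    token_score = sum(math.log(token_probs[language].get(t, 1e-5)) for t in tokens)
    return token_score + math.log(language_probs[language])

tokens = ['def', 'hello', 'end']
scores = dict((lang, score(tokens, lang)) for lang in ('Ruby', 'Python'))
print sorted(scores.iteritems(), key=lambda t: t[1], reverse=True)
# e.g. [('Ruby', -14.04), ('Python', -24.41)]: higher (less negative) wins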
Example 5: train
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
def train(cls, db, language, data):
    """
    Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token,
    per-language. See also dump_all_tokens, below.

    Public: Train classifier that data is a certain language.

    db       - Hash classifier database object
    language - String language of data
    data     - String contents of file

    Examples

      Classifier.train(db, 'Ruby', "def hello; end")

    Returns nothing.
    """
    tokens = Tokenizer.tokenize(data)

    db['tokens_total'] = db.get('tokens_total', 0)
    db['languages_total'] = db.get('languages_total', 0)
    db['tokens'] = db.get('tokens', {})
    db['language_tokens'] = db.get('language_tokens', {})
    db['languages'] = db.get('languages', {})

    for token in tokens:
        db['tokens'][language] = db['tokens'].get(language, {})
        db['tokens'][language][token] = db['tokens'][language].get(token, 0)
        db['tokens'][language][token] += 1
        db['language_tokens'][language] = db['language_tokens'].get(language, 0)
        db['language_tokens'][language] += 1
        db['tokens_total'] += 1

    db['languages'][language] = db['languages'].get(language, 0)
    db['languages'][language] += 1
    db['languages_total'] += 1
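For reference, here is a sketch of the shape this method gives the database after a single call like the one in its docstring. The dict below is illustrative only: the actual token strings and counts depend on the project's Tokenizer.

# Illustrative shape of db after Classifier.train(db, 'Ruby', "def hello; end"),
# assuming tokenize() yields the three tokens shown; real values depend on the
# project's Tokenizer.
db = {
    'tokens':          {'Ruby': {'def': 1, 'hello': 1, 'end': 1}},
    'language_tokens': {'Ruby': 3},   # tokens seen for this language
    'languages':       {'Ruby': 1},   # samples trained for this language
    'tokens_total':    3,             # tokens seen across all languages
    'languages_total': 1,             # training samples across all languages
}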
Example 6: main
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
def main():
    ## args
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--reviews', required=True, help='Review data file')
    parser.add_argument('-o', '--out', required=True, help='Inverted index output file')
    parser.add_argument('-s', '--stop', required=True, help='Stopword list')
    opts = parser.parse_args()

    ## Output file
    csv_writer = csv.writer(open(opts.out, 'w'), delimiter="\t")
    csv_writer.writerow(['token', 'business_id', 'review_id', 'position', '...'])

    ## Tokenizer
    tk = Tokenizer(opts.stop)
    token_map = defaultdict(list)

    ## Tokenize review texts
    # for each word in the vocabulary (in this case all words found in all reviews):
    #   business id, review id, and position of each term occurrence
    # instead of using the review id, uses the line on which the review occurs as a unique identifier
    reviews = open(opts.reviews)
    for review_num, line in enumerate(reviews):
        review = json.loads(line)
        business_id = review['business_id'].encode('utf-8')
        tokens = tk.tokenize(review['text'])
        for position, word in enumerate(tokens):
            token_map[word].append((business_id, review_num, position))

    ## Print sorted inverted index
    for token in sorted(token_map):
        row = [token]
        row.extend(token_map[token])
        csv_writer.writerow(row)
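The inverted index written out above maps each token to every place it occurs, as described by the comments inside main(). A tiny self-contained illustration of that token_map structure, with made-up IDs and positions:

# Made-up entries showing the token_map layout built in main() above:
# token -> list of (business_id, review_line_number, position) triples.
from collections import defaultdict

token_map = defaultdict(list)
token_map['pizza'].append(('abc123', 0, 7))
token_map['pizza'].append(('def456', 3, 1))
token_map['great'].append(('abc123', 0, 8))

for token in sorted(token_map):
    print [token] + token_map[token]
# ['great', ('abc123', 0, 8)]
# ['pizza', ('abc123', 0, 7), ('def456', 3, 1)]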
Example 7: _tokenize_tweet
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
def _tokenize_tweet(self, tweet):
    """
    Input: tweet (String)
    Output: List of tokens
    """
    tok = Tokenizer(preserve_case=False)
    return tok.tokenize(tweet)
Example 8: train
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
def train(cls, db, language, data):
    """
    Public: Train classifier that data is a certain language.

    db       - Hash classifier database object
    language - String language of data
    data     - String contents of file

    Examples

      Classifier.train(db, 'Ruby', "def hello; end")

    Returns nothing.
    """
    tokens = Tokenizer.tokenize(data)

    db['tokens_total'] = db.get('tokens_total', 0)
    db['languages_total'] = db.get('languages_total', 0)
    db['tokens'] = db.get('tokens', {})
    db['language_tokens'] = db.get('language_tokens', {})
    db['languages'] = db.get('languages', {})

    for token in tokens:
        db['tokens'][language] = db['tokens'].get(language, {})
        db['tokens'][language][token] = db['tokens'][language].get(token, 0)
        db['tokens'][language][token] += 1
        db['language_tokens'][language] = db['language_tokens'].get(language, 0)
        db['language_tokens'][language] += 1
        db['tokens_total'] += 1

    db['languages'][language] = db['languages'].get(language, 0)
    db['languages'][language] += 1
    db['languages_total'] += 1
Example 9: __init__
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
class Preprocesser:
    def __init__(self, lower=True, punctuation=True, digits=True, stop=True, min_length=3,
                 pos_tag=False, lemmatization=True):
        self.lemma = lemmatization
        self.pos_tag = pos_tag
        self.tokenizer = Tokenizer(lower, punctuation, digits)
        self.token_filter = TokenFilter(stop, min_length)
        if pos_tag or lemmatization:
            self.postagger = Postagger()
            print dir(self.postagger)
        if lemmatization:
            self.Lemmatizer = Lemmatizer()

    def process(self, text):
        words = self.tokenizer.tokenize(text)
        words = self.token_filter.filter(words)
        if self.lemma:
            tags = self.postagger.tags2lemmatags(self.postagger.tags(words))
            result = self.Lemmatizer.lemma(words, tags)
        if self.pos_tag:
            tags = self.postagger.tags(words)
            result = tags
        return result
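A hypothetical usage of the class above, assuming the Tokenizer, TokenFilter, Postagger, and Lemmatizer dependencies it wires up in __init__ are available in the surrounding project:

# Hypothetical usage; requires the Tokenizer, TokenFilter, Postagger and
# Lemmatizer classes that Preprocesser depends on.
pre = Preprocesser(lower=True, stop=True, lemmatization=True)
print pre.process("The cats were sitting on the mats")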
Example 10: InvertedIndex
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
class InvertedIndex():
    def __init__(self):
        self.invertedindex = {}
        self.lexicon = Lexicon()
        self.tokenizer = Tokenizer()
        self.doc_reader = DocReader()
        self.build_index()

    def build_index(self):
        #comments?
        cache = self.doc_reader.get_cache()
        docs = self.doc_reader.read_docs(cache)
        print "\nINVERTEDINDEX : Indexing %d documents..\n" % len(docs)
        for d in range(len(docs)):
            print "Indexing document '%s'" % (settings.PATH_DOCS + str(d))
            self.add_document(docs[d], d)
        print "Indexed total %d unique terms" % self.lexicon.size()

    def get_postinglist(self, lex_id):
        return self.invertedindex[lex_id]

    def add_document(self, doc, document_id):
        """FIXME:
           -Needs doc
           -Too slow?
           -Remove stop words
           -Reduce number of tokens
        """
        tokens = self.tokenizer.tokenize(doc)
        for t in tokens:
            lex_id = self.lexicon.lookup(t.get_value())
            if(lex_id == settings.INVALID):
                lex_id = self.lexicon.add_value(t.get_value())
                pl = PostingList()
                pl.append_posting(Posting(document_id, t.get_position()))
                self.invertedindex[lex_id] = pl
            else:
                pl = self.get_postinglist(lex_id)
                if pl.get_last_posting().get_document_id() != document_id:
                    pl.append_posting(Posting(document_id, t.get_position()))
                else:
                    p = pl.get_last_posting()
                    p.append_position(t.get_position())

    def size(self):
        return len(self.invertedindex)

    def debugprint(self):
        voc = self.lexicon.get_vocabulary()
        for v in voc:
            lid = self.lexicon.lookup(v)
            pl = self.get_postinglist(lid)
            print "[%s]" % v
            pl.info()
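A hypothetical driver for the class above; indexing happens in __init__ via build_index(), so constructing the object is enough to index whatever DocReader returns:

# Hypothetical usage; DocReader and settings must be configured as the
# surrounding project expects.
index = InvertedIndex()
print "Index holds %d posting lists" % index.size()
index.debugprint()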
Example 11: tokenize
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
def tokenize(self, **kwargs):
    """
    Returns the tokenized string using a parser.
    """
    string_tokenizer = Tokenizer()
    return string_tokenizer.tokenize(kwargs.get("text"), kwargs.get("parser"))
Example 12: predict
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
def predict(self, sentence):
    if self._net is None or self._trainer is None:
        raise Exception('Must train first')
    vectors, characters = Tokenizer.tokenize(sentence)
    if len(vectors) == 0:
        return None
    input = np.array(self._code(vectors[0]),
                     ndmin=2,
                     dtype=np.float32)
    result = self._net.predict(input)
    return self.languages[result[0]]
Example 13: SimpleTokenizer
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
class SimpleTokenizer(object):
    def __init__(self, convert_to_base_form=True, normalize_number=False,
                 append_pos=False):
        self.tokenizer = Tokenizer(convert_to_base_form=convert_to_base_form,
                                   normalize_number=normalize_number,
                                   append_pos=append_pos)

    def tokenize(self, review):
        nested_tokens = [self.tokenizer.tokenize(sentence)
                         for sentence in review['text']]
        tokens = [token for sublist in nested_tokens for token in sublist]
        return tokens
Example 14: testExecutionTreeWithTerms
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
def testExecutionTreeWithTerms(self):
    c = ExpressionCompiler()
    tokenizer = Tokenizer()
    tokenizer.tokenize("A=1")
    tokenizer.next()
    expr = c.compile(tokenizer)
    self.assertEqual("( = ( literal A ) ( literal 1.0 ) )", expr.get_execution_tree())

    tokenizer.tokenize("A=1 and B>10")
    tokenizer.next()
    expr = c.compile(tokenizer)
    print "Execution Tree = %s\n" % (expr.get_execution_tree())
    self.assertEqual(
        "( and ( = ( literal A ) ( literal 1.0 ) ) ( > ( literal B ) ( literal 10.0 ) ) )",
        expr.get_execution_tree(),
    )

    tokenizer.tokenize("(A=1 and B>10) or (C > 10)")
    tokenizer.next()
    expr = c.compile(tokenizer)
    print "Execution Tree = %s\n" % (expr.get_execution_tree())
Example 15: analyze
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize [as alias]
def analyze(string):
    scanner = Tokenizer()
    list_of_tokens = scanner.tokenize(string)
    print "-------------"
    print "TOKEN LIST:"
    print list_of_tokens
    parser = QueryParser()
    print "----------------"
    print "PARSING RESULT"
    print "----------------"
    print parser.parse(list_of_tokens)
    semparser = QuerySemanticParser(parser.parse(list_of_tokens))
    semparser.parse()