本文整理匯總了 Python 中 tokenizer.tokenize 方法的典型用法代碼示例。如果您正苦於以下問題:Python tokenizer.tokenize 方法的具體用法?Python tokenizer.tokenize 怎麼用?Python tokenizer.tokenize 使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類 tokenizer 的用法示例。
在下文中一共展示了 tokenizer.tokenize 方法的 8 個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的 Python 代碼示例。
示例1: calculate_batchsize_maxlen
# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def calculate_batchsize_maxlen(texts):
    """ Derives a sequence-length cap and a matching batch size for the texts.

    The cap is the 80th percentile of the tokenized lengths (not the absolute
    maximum), rounded up to the nearest multiple of ten.

    # Arguments:
        texts: List of inputs; each one is passed to `tokenize`.

    # Returns:
        Batch size,
        max length
    """
    token_counts = []
    for text in texts:
        token_counts.append(len(tokenize(text)))

    # Length below which 80% of the sequences fall, rounded up to a ten
    percentile_len = np.percentile(token_counts, 80.0)
    maxlen = int(math.ceil(percentile_len / 10.0)) * 10

    # Longer sequences get a smaller batch to prevent GPU overflow
    if maxlen <= 100:
        batch_size = 250
    else:
        batch_size = 50
    return batch_size, maxlen
示例2: numericTagger
# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def numericTagger(instr):
    """Tag numeric tokens with ``'num'``.

    A list is processed in place: every entry whose word contains a number
    (ASCII or Devanagari digits, optionally signed and with ``.``/``,``
    separators) is replaced by the tuple ``(word, 'num')``. Entries may be
    bare tokens or tuples whose first element is the word; for tuples the
    word is kept and only the tag is replaced.

    A string is first tokenized with ``tok.tokenize`` and the resulting
    token sequence is tagged. Any other input type is reported on stdout
    and returned unchanged.

    Args:
        instr: A list of tokens / tuples, or a string.

    Returns:
        The tagged list (the same object that was passed in when ``instr``
        is a list), or the untouched input when it is not supported.
    """
    num_match = re.compile(r'([०१२३४५६७८९]+[\.\,]*)+[०१२३४५६७८९]+|([-+]*\d+[\.\,]*)+\d+|([०१२३४५६७८९]+|\d+)')
    if isinstance(instr, list):
        for index, item in enumerate(instr):
            # For an already-tagged tuple the word lives at position 0.
            word = item[0] if isinstance(item, tuple) else item
            if num_match.search(str(word)):
                # Keep the word itself; the previous code stored item[1]
                # here, which replaced the word with its old tag.
                instr[index] = (word, 'num')
    elif isinstance(instr, str):
        instr = tok.tokenize(instr)
        # The recursive call mutates the token list in place.
        numericTagger(instr)
    else:
        print("not supported")
    return instr
示例3: get_words
# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def get_words(self, sentence):
    """ Turns a sentence into a list of processed words.

    The sentence is stripped and lower-cased, optionally passed through
    `convert_linebreaks` and `remove_variation_selectors` (depending on
    the instance options), and every whitespace-separated word is run
    through `self.convert_unicode_word`. If any word is rejected there,
    an empty list is returned. Otherwise the converted words are
    re-joined, tokenized and each token is passed to `process_word`.

    Raises ValueError when the sentence is not a `unicode` object.
    """
    if not isinstance(sentence, unicode):
        raise ValueError("All sentences should be Unicode-encoded!")

    normalized = sentence.strip().lower()
    if self.break_replacement:
        normalized = convert_linebreaks(normalized)
    if self.remove_variation_selectors:
        normalized = remove_variation_selectors(normalized)

    # Plain whitespace splitting comes first so that the per-word Unicode
    # conversion happens before the real tokenizer sees the text; this
    # prevents word splitting issues with twokenize and Unicode
    converted_words = []
    for raw_word in normalized.split():
        accepted, converted = self.convert_unicode_word(raw_word)
        if not accepted:
            # A single rejected word discards the whole sentence
            return []
        converted_words.append(converted)

    tokens = tokenize(' '.join(converted_words))
    return [process_word(token) for token in tokens]
示例4: process_questions
# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def process_questions(questions, return_score_modifiers = False):
    """Run inference for one or more questions and score the answers.

    A single question is wrapped into a list. Each question is stripped,
    tokenized and BPE-encoded; an empty question is replaced by the
    '##emptyquestion##' placeholder. All prepared questions go through
    `inference_helper` in one call, and every answer set is detokenized,
    post-processed and scored.

    Returns a list with one entry per question: None for an empty
    question, otherwise a dict with 'answers', 'scores', 'best_index' and
    'best_score' (plus 'score_modifiers' when `return_score_modifiers`
    is truthy).
    """
    # Accept a single question as well as a list of them
    if not isinstance(questions, list):
        questions = [questions]

    # Clean and tokenize
    prepared_questions = []
    for raw_question in questions:
        stripped = raw_question.strip()
        if stripped:
            prepared_questions.append(apply_bpe(tokenize(stripped)))
        else:
            prepared_questions.append('##emptyquestion##')

    # Run inference
    answers_list = inference_helper(prepared_questions)

    # Post-process and score every answer set
    prepared_answers_list = []
    for index, raw_answers in enumerate(answers_list):
        answers = normalize_new_lines(replace_in_answers(detokenize(raw_answers)))
        # Scoring uses the question as originally passed in (not stripped)
        answers_score = score_answers(questions[index], answers)
        best_index, best_score = get_best_score(answers_score['score'])

        if prepared_questions[index] == '##emptyquestion##':
            prepared_answers_list.append(None)
            continue

        entry = {
            'answers': answers,
            'scores': answers_score['score'],
            'best_index': best_index,
            'best_score': best_score,
        }
        if return_score_modifiers:
            entry['score_modifiers'] = answers_score['score_modifiers']
        prepared_answers_list.append(entry)

    return prepared_answers_list
# interactive mode
示例5: defaultTagger
# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def defaultTagger(instr):
    """Give every still-untagged token the fallback tag ``'any'``.

    A list is processed in place: each entry that is not already a tuple
    is replaced by ``(entry, 'any')``; tuples (including tuple subclasses)
    are left untouched. A string is first tokenized with ``tok.tokenize``
    and the resulting token sequence is tagged. Any other input type is
    reported on stdout and returned unchanged.

    Args:
        instr: A list of tokens / tuples, or a string.

    Returns:
        The tagged list (the same object that was passed in when ``instr``
        is a list), or the untouched input when it is not supported.
    """
    if isinstance(instr, list):
        for index, item in enumerate(instr):
            # isinstance (rather than an exact type comparison) so that
            # tuple subclasses are not wrapped a second time.
            if not isinstance(item, tuple):
                instr[index] = (item, 'any')
    elif isinstance(instr, str):
        instr = tok.tokenize(instr)
        # The recursive call mutates the token list in place.
        defaultTagger(instr)
    else:
        print("not supported")
    return instr
示例6: lookupTagger
# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def lookupTagger(instr):
    """Tag tokens using the word list returned by ``gndr.drawlist()``.

    Every line of the word list is split on tabs. The first field is the
    word; the tag is the second field, or ``'any'`` when the line has more
    than two fields. A line without any tab raises ``IndexError``. When a
    word occurs on several lines, the first line wins.

    A list is processed in place: every entry whose word is in the word
    list is replaced by ``(word, tag)``. Entries may be bare tokens or
    tuples whose first element is the word. A string is first tokenized
    with ``tok.tokenize`` and the resulting token sequence is tagged. Any
    other input type is reported on stdout and returned unchanged.

    Args:
        instr: A list of tokens / tuples, or a string.

    Returns:
        The tagged list (the same object that was passed in when ``instr``
        is a list), or the untouched input when it is not supported.
    """
    # One dict replaces the parallel word/tag lists and the linear
    # ``list.index`` search; ``setdefault`` keeps the first entry for a
    # repeated word, exactly as a first-match search would.
    tag_by_word = {}
    for entry in gndr.drawlist():
        fields = entry.split("\t")
        tag = "any" if len(fields) > 2 else fields[1]
        tag_by_word.setdefault(fields[0], tag)

    if isinstance(instr, list):
        for index, item in enumerate(instr):
            if isinstance(item, tuple):
                if not item:
                    # Nothing to look up in an empty tuple.
                    continue
                # Look up the word, not the whole tuple: comparing the
                # tuple itself against the word list could never match.
                word = item[0]
            else:
                word = item
            if word in tag_by_word:
                instr[index] = (word, tag_by_word[word])
    elif isinstance(instr, str):
        instr = tok.tokenize(instr)
        # The recursive call mutates the token list in place.
        lookupTagger(instr)
    else:
        print("not supported")
    return instr
示例7: parse_paragraph
# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def parse_paragraph(parag, mim_tags, fast_p):
    """ Tokenize one free-text paragraph, hand the tokens together with the
        MIM POS tags to parse_tokens, and print a one-line summary of how
        many sentences were found and parsed """
    token_list = list(tokenize(parag))
    result = parse_tokens(token_list, mim_tags, fast_p)
    summary = "{0}\n--> {1} sentences, {2} parsed".format(
        parag, result["num_sent"], result["num_parsed_sent"]
    )
    print(summary)
示例8: get_tokens_iterator
# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def get_tokens_iterator(tagger, iter_docs):
    """Build a zero-argument generator function over tokenized documents.

    Args:
        tagger: Passed to ``tokenizer.tokenize`` as the ``tagger`` keyword
            for every document.
        iter_docs: Callable returning an iterable of documents; it is
            invoked anew each time the returned function is iterated.

    Returns:
        A function that, when called, yields ``tokenizer.tokenize(doc,
        tagger=tagger)`` for each document in turn.
    """
    # Bind the tagger once so the generator only has to supply the document
    tokenize_with_tagger = partial(tokenizer.tokenize, tagger=tagger)

    def iter_tokens():
        documents = iter_docs()
        for document in documents:
            yield tokenize_with_tagger(document)

    return iter_tokens