This article collects typical usage examples of the Python method revtok.tokenize. If you have been wondering what revtok.tokenize does and how to use it, the curated code examples below should help. You can also explore the revtok module, where this method lives, for further usage.
Shown below are 7 code examples of revtok.tokenize, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
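For quick orientation, here is a minimal sketch of revtok.tokenize itself. This is an assumed illustration (the exact token strings depend on the revtok version); the key property is that revtok attaches spacing markers to tokens so the companion revtok.detokenize can reconstruct the original string.

import revtok

text = "Hello, world!"
tokens = revtok.tokenize(text)        # tokens carry spacing/caps markers
restored = revtok.detokenize(tokens)  # reversible: round-trips the input
assert restored == text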
Example 1: computeBLEUMSCOCO
# Required import: import revtok  [as alias]
# Or: from revtok import tokenize  [as alias]
def computeBLEUMSCOCO(outputs, targets, corpus=True, tokenizer=None):
    # outputs is a list of 5000 captions
    # targets is a list of 5000 reference lists, each of length 5
    # corpus_bleu / sentence_bleu appear to be project-local BLEU helpers
    # (note the emulate_multibleu keyword and the tuple return), not the
    # bare NLTK API
    if tokenizer is None:
        tokenizer = revtok.tokenize
    outputs = [tokenizer(o) for o in outputs]
    new_targets = []
    for t in targets:
        new_targets.append([tokenizer(tt) for tt in t])
    if corpus:
        return corpus_bleu(new_targets, outputs, emulate_multibleu=True)
    else:
        return [sentence_bleu(new_t, o)[0] for o, new_t in zip(outputs, new_targets)]
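A minimal call sketch for Example 1, with hypothetical captions (it assumes the project-local BLEU helpers used above are in scope):

# hypothetical system captions and per-caption reference lists
outputs = ["a man rides a horse", "two dogs play in the snow"]
targets = [["a man is riding a horse", "a person on a horse"],
           ["two dogs playing in snow", "dogs play in the snow"]]
print(computeBLEUMSCOCO(outputs, targets, corpus=True))  # one corpus-level score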
Example 2: computeGroupBLEU
# Required import: import revtok  [as alias]
# Or: from revtok import tokenize  [as alias]
def computeGroupBLEU(outputs, targets, tokenizer=None, bra=10, maxmaxlen=80):
    if tokenizer is None:
        tokenizer = revtok.tokenize
    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]
    maxlens = max(len(t) for t in targets)
    print(maxlens)
    maxlens = min(maxlens, maxmaxlen)
    nums = int(np.ceil(maxlens / bra))  # number of length buckets of width `bra`
    outputs_buckets = [[] for _ in range(nums)]
    targets_buckets = [[] for _ in range(nums)]
    for o, t in zip(outputs, targets):
        idx = len(o) // bra
        if idx >= len(outputs_buckets):
            idx = -1  # overflow goes into the last bucket
        outputs_buckets[idx] += [o]
        targets_buckets[idx] += [t]
    for k in range(nums):
        print(corpus_bleu([[t] for t in targets_buckets[k]],
                          outputs_buckets[k], emulate_multibleu=True))
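This function buckets hypothesis/reference pairs by hypothesis length (buckets of width bra tokens, capped at maxmaxlen) and prints one corpus BLEU per bucket, which is handy for checking whether quality degrades on longer sentences. A hypothetical call, again assuming the BLEU helpers are in scope:

# hypothetical parallel lists of untokenized strings
outputs = ["the cat sat on the mat", "a quick brown fox"]
targets = ["the cat sat on a mat", "the quick brown fox"]
computeGroupBLEU(outputs, targets, bra=10)  # prints BLEU per length bucket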
Example 3: computeGLEU
# Required import: import revtok  [as alias]
# Or: from revtok import tokenize  [as alias]
def computeGLEU(outputs, targets, corpus=False, tokenizer=None):
    if tokenizer is None:
        tokenizer = revtok.tokenize
    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]
    if not corpus:
        return torch.Tensor([sentence_gleu([t], o)
                             for o, t in zip(outputs, targets)])
    return corpus_gleu([[t] for t in targets], outputs)
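A hypothetical call; with the default corpus=False it returns a torch.Tensor of per-sentence GLEU scores, one per hypothesis/reference pair:

scores = computeGLEU(["a small test"], ["a small test"])
print(scores)  # tensor([...]) with one GLEU value per pair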
Example 4: computeBLEU
# Required import: import revtok  [as alias]
# Or: from revtok import tokenize  [as alias]
def computeBLEU(outputs, targets, corpus=False, tokenizer=None):
    if tokenizer is None:
        tokenizer = revtok.tokenize
    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]
    if corpus:
        return corpus_bleu([[t] for t in targets], outputs, emulate_multibleu=True)
    else:
        return [sentence_bleu([t], o)[0] for o, t in zip(outputs, targets)]
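Unlike Example 1, this variant assumes a single reference string per hypothesis. A hypothetical call, reusing the outputs/targets strings from the Example 2 sketch:

per_sentence = computeBLEU(outputs, targets)               # list of per-sentence scores
corpus_score = computeBLEU(outputs, targets, corpus=True)  # single corpus-level score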
Example 5: computeBLEU
# Required import: import revtok  [as alias]
# Or: from revtok import tokenize  [as alias]
def computeBLEU(outputs, targets, corpus=False, tokenizer=None):
    if tokenizer is None:
        tokenizer = revtok.tokenize
    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]
    if not corpus:
        return torch.Tensor([sentence_gleu([t], o)
                             for o, t in zip(outputs, targets)])
    return corpus_bleu([[t] for t in targets], outputs, emulate_multibleu=True)
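Note a quirk of this variant: despite its name, the corpus=False path returns sentence-level GLEU scores (as a torch.Tensor), and only the corpus=True path actually computes BLEU. Callers who want per-sentence BLEU should use the Example 4 variant instead.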
Example 6: get_tokenizer
# Required import: import revtok  [as alias]
# Or: from revtok import tokenize  [as alias]
def get_tokenizer(tokenizer, decap=False):  # decap is accepted but unused here
    if callable(tokenizer):
        return tokenizer
    if tokenizer == "spacy":
        try:
            import spacy
            spacy_en = spacy.load('en')
            return lambda s: [tok.text for tok in spacy_en.tokenizer(s)]
        except (ImportError, AttributeError):
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
    elif tokenizer == "moses":
        try:
            from nltk.tokenize.moses import MosesTokenizer
            moses_tokenizer = MosesTokenizer()
            return moses_tokenizer.tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at http://nltk.org for more information.")
            raise
        except LookupError:
            print("Please install the necessary NLTK corpora. "
                  "See the docs at http://nltk.org for more information.")
            raise
    elif tokenizer == 'revtok':
        try:
            import revtok
            return revtok.tokenize
        except ImportError:
            print("Please install revtok.")
            raise
    elif tokenizer == 'subword':
        try:
            import revtok
            # decap=True gives the caps-aware behavior that the error
            # message below describes for "subword" (cf. Example 7)
            return lambda x: revtok.tokenize(x, decap=True)
        except ImportError:
            print("Please install revtok.")
            raise
    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"revtok\" for the revtok reversible tokenizer, "
                     "\"subword\" for the revtok caps-aware tokenizer, "
                     "\"spacy\" for the SpaCy English tokenizer, or "
                     "\"moses\" for the NLTK port of the Moses tokenization "
                     "script.".format(tokenizer))
Example 7: get_tokenizer
# Required import: import revtok  [as alias]
# Or: from revtok import tokenize  [as alias]
def get_tokenizer(tokenizer):
    if callable(tokenizer):
        return tokenizer
    if tokenizer == "spacy":
        try:
            import spacy
            spacy_en = spacy.load('en')
            return lambda s: [tok.text for tok in spacy_en.tokenizer(s)]
        except (ImportError, AttributeError):
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
    elif tokenizer == "moses":
        try:
            from nltk.tokenize.moses import MosesTokenizer
            moses_tokenizer = MosesTokenizer()
            return moses_tokenizer.tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at http://nltk.org for more information.")
            raise
        except LookupError:
            print("Please install the necessary NLTK corpora. "
                  "See the docs at http://nltk.org for more information.")
            raise
    elif tokenizer == 'revtok':
        try:
            import revtok
            return revtok.tokenize
        except ImportError:
            print("Please install revtok.")
            raise
    elif tokenizer == 'subword':
        try:
            import revtok
            return lambda x: revtok.tokenize(x, decap=True)  # caps-aware variant
        except ImportError:
            print("Please install revtok.")
            raise
    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"revtok\" for the revtok reversible tokenizer, "
                     "\"subword\" for the revtok caps-aware tokenizer, "
                     "\"spacy\" for the SpaCy English tokenizer, or "
                     "\"moses\" for the NLTK port of the Moses tokenization "
                     "script.".format(tokenizer))