本文整理汇总了Python中sacremoses.MosesTokenizer的典型用法代码示例。如果您正苦于以下问题：Python sacremoses.MosesTokenizer的具体用法？Python sacremoses.MosesTokenizer怎么用？Python sacremoses.MosesTokenizer使用的例子？那么恭喜您，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块sacremoses的用法示例。
在下文中一共展示了sacremoses.MosesTokenizer方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: import sacremoses [as 别名]
# 或者: from sacremoses import MosesTokenizer [as 别名]
def __init__(self, *args, **kwargs):
    """Initialize the encoder with SacreMoses tokenize/detokenize callables.

    ``tokenize`` and ``detokenize`` are supplied by this constructor, so
    callers may not pass them. Every other positional and keyword argument
    is forwarded unchanged to the parent ``__init__``.

    Raises:
        TypeError: If ``tokenize`` or ``detokenize`` is passed as a keyword.
        ImportError: If ``sacremoses`` cannot be imported; an installation
            hint is printed before the error is re-raised.
    """
    # Keywords this constructor fills in itself, paired with the error to
    # raise when a caller tries to override them.
    reserved = (
        ('tokenize',
         '``MosesEncoder`` does not take keyword argument ``tokenize``.'),
        ('detokenize',
         '``MosesEncoder`` does not take keyword argument ``detokenize``.'),
    )
    for keyword, message in reserved:
        if keyword in kwargs:
            raise TypeError(message)

    # Imported lazily so the dependency is only required when this class
    # is actually instantiated.
    try:
        from sacremoses import MosesTokenizer
        from sacremoses import MosesDetokenizer
    except ImportError:
        print("Please install SacreMoses. "
              "See the docs at https://github.com/alvations/sacremoses for more information.")
        raise

    moses_tokenize = MosesTokenizer().tokenize
    # Bind return_str=True so detokenization yields a string.
    moses_detokenize = partial(MosesDetokenizer().detokenize, return_str=True)
    super().__init__(
        *args,
        tokenize=moses_tokenize,
        detokenize=moses_detokenize,
        **kwargs)
示例2: normalize
# 需要导入模块: import sacremoses [as 别名]
# 或者: from sacremoses import MosesTokenizer [as 别名]
def normalize(sentence, lowercase: bool = True, tokenizer: str = '13a', return_str: bool = True):
    """Optionally lowercase ``sentence`` and tokenize it with the chosen scheme.

    Args:
        sentence: Text to normalize.
        lowercase: When true, ``sentence.lower()`` is applied before
            tokenization.
        tokenizer: Scheme name. ``"13a"`` and ``"intl"`` dispatch to
            ``sacrebleu``; ``"moses"`` and ``"penn"`` dispatch to
            ``sacremoses.MosesTokenizer``. Any other value leaves the text
            untokenized.
        return_str: When false, the result is split on whitespace and
            returned as a list instead of a string.

    Returns:
        The normalized text, or its whitespace-split tokens when
        ``return_str`` is false.
    """
    text = sentence.lower() if lowercase else sentence

    if tokenizer == "13a":
        text = sacrebleu.tokenize_13a(text)
    elif tokenizer == "intl":
        text = sacrebleu.tokenize_v14_international(text)
    elif tokenizer == "moses":
        moses = sacremoses.MosesTokenizer()
        text = moses.tokenize(text, return_str=True, escape=False)
    elif tokenizer == "penn":
        moses = sacremoses.MosesTokenizer()
        text = moses.penn_tokenize(text, return_str=True)
    # Unrecognized tokenizer names fall through with the text unchanged.

    return text if return_str else text.split()
示例3: __init__
# 需要导入模块: import sacremoses [as 别名]
# 或者: from sacremoses import MosesTokenizer [as 别名]
def __init__(self, args):
    """Store ``args`` and build the Moses tokenizer and detokenizer.

    If ``args.moses_source_lang`` or ``args.moses_target_lang`` is missing
    or ``None``, it is filled in on ``args`` from ``args.source_lang`` /
    ``args.target_lang`` respectively, defaulting to ``'en'``.

    Raises:
        ImportError: If ``sacremoses`` is not available.
    """
    self.args = args

    # (attribute to fill in, attribute to fall back on)
    language_defaults = (
        ('moses_source_lang', 'source_lang'),
        ('moses_target_lang', 'target_lang'),
    )
    for moses_attr, fallback_attr in language_defaults:
        if getattr(args, moses_attr, None) is None:
            setattr(args, moses_attr, getattr(args, fallback_attr, 'en'))

    try:
        from sacremoses import MosesTokenizer, MosesDetokenizer
        self.tok = MosesTokenizer(args.moses_source_lang)
        self.detok = MosesDetokenizer(args.moses_target_lang)
    except ImportError:
        raise ImportError('Please install Moses tokenizer with: pip install sacremoses')
示例4: __init__
# 需要导入模块: import sacremoses [as 别名]
# 或者: from sacremoses import MosesTokenizer [as 别名]
def __init__(self,
             lang: str = 'en',
             lower_case: bool = True,
             romanize: Optional[bool] = None,
             descape: bool = False):
    """Configure the tokenizer for one language.

    Args:
        lang: Language code. ``'cmn'``, ``'wuu'`` and ``'yue'`` are mapped
            to ``'zh'``; ``'jpn'`` is mapped to ``'ja'``.
        lower_case: Must be truthy; the constructor asserts on it.
        romanize: Explicit romanization flag. When ``None`` it is enabled
            only if the resolved language is ``'el'``.
        descape: Stored as ``self.descape``.

    Raises:
        AssertionError: If ``lower_case`` is falsy.
        ModuleNotFoundError: If the resolved language is ``'zh'`` and
            ``jieba`` is ``None``, or ``'ja'`` and ``MeCab`` is ``None``.
    """
    assert lower_case, 'lower case is needed by all the models'

    # Collapse alternative language codes onto the ones used below.
    if lang in ('cmn', 'wuu', 'yue'):
        lang = 'zh'
    elif lang == 'jpn':
        lang = 'ja'

    is_chinese = lang == 'zh'
    is_japanese = lang == 'ja'

    if is_chinese and jieba is None:
        raise ModuleNotFoundError(
            '''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"'''
        )
    if is_japanese and MeCab is None:
        raise ModuleNotFoundError(
            '''No module named 'MeCab'. Install laserembeddings with 'ja' extra to fix that: "pip install laserembeddings[ja]"'''
        )

    self.lang = lang
    self.lower_case = lower_case
    if romanize is not None:
        self.romanize = romanize
    else:
        self.romanize = lang == 'el'
    self.descape = descape

    self.normalizer = MosesPunctNormalizer(lang=lang)
    self.tokenizer = MosesTokenizer(lang=lang)
    # A MeCab tagger is only created for Japanese.
    if is_japanese:
        self.mecab_tokenizer = MeCab.Tagger("-O wakati -b 50000")
    else:
        self.mecab_tokenizer = None
示例5: tokenize_captions
# 需要导入模块: import sacremoses [as 别名]
# 或者: from sacremoses import MosesTokenizer [as 别名]
def tokenize_captions(captions, lang='en'):
    """Tokenize each caption with a Moses tokenizer for ``lang``.

    A single ``MosesTokenizer`` is built and reused for every caption.

    Args:
        captions: Iterable of caption strings.
        lang: Language code passed to ``MosesTokenizer``.

    Returns:
        A list with one result per caption, each produced by
        ``tokenize(caption, return_str=True)``.
    """
    moses = MosesTokenizer(lang=lang)
    tokenized = []
    for text in captions:
        tokenized.append(moses.tokenize(text, return_str=True))
    return tokenized
示例6: moses_tokenize
# 需要导入模块: import sacremoses [as 别名]
# 或者: from sacremoses import MosesTokenizer [as 别名]
def moses_tokenize(self, text, lang):
    """Tokenize ``text`` with a per-language cached Moses tokenizer.

    Tokenizers are kept in ``self.cache_moses_tokenizer`` keyed by ``lang``;
    one is created and stored the first time a language is seen.

    Args:
        text: Text to tokenize.
        lang: Language code used as the cache key and passed to
            ``sm.MosesTokenizer``.

    Returns:
        The result of ``tokenize(text, return_str=False, escape=False)``.
    """
    cache = self.cache_moses_tokenizer
    if lang in cache:
        return cache[lang].tokenize(text, return_str=False, escape=False)

    # First request for this language: build the tokenizer and remember it.
    tokenizer = sm.MosesTokenizer(lang=lang)
    cache[lang] = tokenizer
    return tokenizer.tokenize(text, return_str=False, escape=False)
示例7: __init__
# 需要导入模块: import sacremoses [as 别名]
# 或者: from sacremoses import MosesTokenizer [as 别名]
def __init__(self, escape: bool = False, *args, **kwargs) -> None:
    """Store the ``escape`` flag and create default Moses helpers.

    Args:
        escape: Saved as ``self.escape``.
        *args: Accepted but not used by this constructor.
        **kwargs: Accepted but not used by this constructor.
    """
    self.escape = escape
    # Both helpers are built with their default constructor arguments.
    self.tokenizer = MosesTokenizer()
    self.detokenizer = MosesDetokenizer()
示例8: enable_moses
# 需要导入模块: import sacremoses [as 别名]
# 或者: from sacremoses import MosesTokenizer [as 别名]
def enable_moses(self, lang='en', tokenize=True, detokenize=True):
    """Create or clear the Moses tokenizer and detokenizer on this object.

    Args:
        lang: Language code passed to the Moses classes.
        tokenize: When truthy, ``self._moses_tok`` becomes a new
            ``MosesTokenizer``; otherwise it is set to ``None``.
        detokenize: When truthy, ``self._moses_detok`` becomes a new
            ``MosesDetokenizer``; otherwise it is set to ``None``.
    """
    self._moses_tok = MosesTokenizer(lang=lang) if tokenize else None
    self._moses_detok = MosesDetokenizer(lang=lang) if detokenize else None