

Python revtok.tokenize Method Code Examples

This article collects typical usage examples of the Python method revtok.tokenize. If you are unsure what revtok.tokenize does, how to call it, or what working uses of it look like, the curated code examples below should help; you can also explore the wider revtok module they come from.


The following presents 8 code examples of the revtok.tokenize method, ordered by popularity by default.
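Before the examples, here is a minimal sketch of what revtok.tokenize itself does (assuming only that the revtok package is installed; the sample string is illustrative):

import revtok

text = "Hello, world!"
tokens = revtok.tokenize(text)
# revtok is a *reversible* tokenizer: whitespace information is preserved in the
# tokens themselves, so detokenize should reconstruct the original string exactly.
assert revtok.detokenize(tokens) == text

This reversibility is what the examples below rely on when they tokenize model outputs and references before scoring.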

Example 1: computeBLEUMSCOCO

# Required module: import revtok [as alias]
# Or: from revtok import tokenize [as alias]
def computeBLEUMSCOCO(outputs, targets, corpus=True, tokenizer=None):
    # outputs: a list of 5000 generated captions
    # targets: a list of 5000 lists, each containing 5 reference captions
    if tokenizer is None:
        tokenizer = revtok.tokenize

    outputs = [tokenizer(o) for o in outputs]
    new_targets = []
    for i, t in enumerate(targets):
        new_targets.append([tokenizer(tt) for tt in t])
        #targets[i] = [tokenizer(tt) for tt in t]

    if corpus:
        return corpus_bleu(new_targets, outputs, emulate_multibleu=True)
    else:
        return [sentence_bleu(new_t, o)[0] for o, new_t in zip(outputs, new_targets)] 
Author: nyu-dl, Project: dl4mt-nonauto, Lines: 18, Source: utils.py
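A hedged usage sketch for this function (the captions below are made up, and corpus_bleu / sentence_bleu are assumed to be the repository's own BLEU utilities, since recent stock NLTK does not accept emulate_multibleu):

outputs = ["a dog runs on the grass"]                       # one generated caption per image
targets = [["a dog is running on grass",
            "a dog runs across a lawn"]]                    # several references per image
score = computeBLEUMSCOCO(outputs, targets, corpus=True)    # single corpus-level BLEU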

Example 2: computeGroupBLEU

# Required module: import revtok [as alias]
# Or: from revtok import tokenize [as alias]
def computeGroupBLEU(outputs, targets, tokenizer=None, bra=10, maxmaxlen=80):
    if tokenizer is None:
        tokenizer = revtok.tokenize

    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]
    maxlens = max([len(t) for t in targets])
    print(maxlens)
    maxlens = min([maxlens, maxmaxlen])
    nums = int(np.ceil(maxlens / bra))
    outputs_buckets = [[] for _ in range(nums)]
    targets_buckets = [[] for _ in range(nums)]
    for o, t in zip(outputs, targets):
        idx = len(o) // bra
        if idx >= len(outputs_buckets):
            idx = -1
        outputs_buckets[idx] += [o]
        targets_buckets[idx] += [t]

    for k in range(nums):
        print(corpus_bleu([[t] for t in targets_buckets[k]], [o for o in outputs_buckets[k]], emulate_multibleu=True)) 
Author: nyu-dl, Project: dl4mt-nonauto, Lines: 23, Source: utils.py
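The bucketing logic above groups sentence pairs by output length in steps of bra tokens; anything at or beyond the last boundary is clamped into the final bucket. For example, with bra=10 and a maxlens that yields nums=3 buckets:

# len(o) == 8   ->  idx = 8 // 10  = 0    (bucket for lengths 0-9)
# len(o) == 23  ->  idx = 23 // 10 = 2    (bucket for lengths 20-29)
# len(o) == 57  ->  idx = 57 // 10 = 5    -> out of range, clamped to the last bucket (idx = -1)

Each bucket then gets its own corpus-level BLEU, which shows how translation quality varies with sentence length.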

Example 3: computeGroupBLEU

# Required module: import revtok [as alias]
# Or: from revtok import tokenize [as alias]
def computeGroupBLEU(outputs, targets, tokenizer=None, bra=10, maxmaxlen=80):
    if tokenizer is None:
        tokenizer = revtok.tokenize

    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]
    maxlens = max([len(t) for t in targets])
    print(maxlens)
    maxlens = min([maxlens, maxmaxlen])
    nums = int(np.ceil(maxlens / bra))
    outputs_buckets = [[] for _ in range(nums)]
    targets_buckets = [[] for _ in range(nums)]
    for o, t in zip(outputs, targets):
        idx = len(o) // bra
        if idx >= len(outputs_buckets):
            idx = -1
        outputs_buckets[idx] += [o]
        targets_buckets[idx] += [t]

    for k in range(nums):
        print(corpus_bleu([[t] for t in targets_buckets[k]], [o for o in outputs_buckets[k]], emulate_multibleu=True))


# load the dataset + reversible tokenization 
Author: salesforce, Project: nonauto-nmt, Lines: 26, Source: utils.py

Example 4: computeGLEU

# Required module: import revtok [as alias]
# Or: from revtok import tokenize [as alias]
def computeGLEU(outputs, targets, corpus=False, tokenizer=None):
    if tokenizer is None:
        tokenizer = revtok.tokenize

    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]

    if not corpus:
        return torch.Tensor([sentence_gleu(
            [t],  o) for o, t in zip(outputs, targets)])
    return corpus_gleu([[t] for t in targets], [o for o in outputs]) 
Author: nyu-dl, Project: dl4mt-nonauto, Lines: 13, Source: utils.py
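A hedged usage sketch (sentence_gleu / corpus_gleu are assumed to be GLEU implementations shipped with the repository, and torch must already be imported):

outputs = ["the cat sat on the mat"]
targets = ["the cat is sitting on the mat"]
per_sentence = computeGLEU(outputs, targets)              # torch.Tensor of sentence-level GLEU scores
overall = computeGLEU(outputs, targets, corpus=True)      # single corpus-level GLEU score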

Example 5: computeBLEU

# Required module: import revtok [as alias]
# Or: from revtok import tokenize [as alias]
def computeBLEU(outputs, targets, corpus=False, tokenizer=None):
    if tokenizer is None:
        tokenizer = revtok.tokenize

    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]

    if corpus:
        return corpus_bleu([[t] for t in targets], [o for o in outputs], emulate_multibleu=True)
    else:
        return [sentence_bleu([t],  o)[0] for o, t in zip(outputs, targets)]
        #return torch.Tensor([sentence_bleu([t],  o)[0] for o, t in zip(outputs, targets)]) 
Author: nyu-dl, Project: dl4mt-nonauto, Lines: 14, Source: utils.py
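Usage mirrors computeGLEU above (again assuming the repository's own sentence_bleu / corpus_bleu; the [0] indexing suggests their sentence_bleu returns a tuple rather than a bare float):

outputs = ["a man rides a horse"]
targets = ["a man is riding a horse"]
per_sentence = computeBLEU(outputs, targets)               # list of per-sentence BLEU scores
overall = computeBLEU(outputs, targets, corpus=True)       # single corpus-level BLEU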

Example 6: computeBLEU

# Required module: import revtok [as alias]
# Or: from revtok import tokenize [as alias]
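# Note: despite its name, this variant returns sentence-level GLEU scores when corpus=False.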
def computeBLEU(outputs, targets, corpus=False, tokenizer=None):
    if tokenizer is None:
        tokenizer = revtok.tokenize

    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]

    if not corpus:
        return torch.Tensor([sentence_gleu(
            [t],  o) for o, t in zip(outputs, targets)])
    return corpus_bleu([[t] for t in targets], [o for o in outputs], emulate_multibleu=True) 
Author: salesforce, Project: nonauto-nmt, Lines: 13, Source: utils.py

Example 7: get_tokenizer

# Required module: import revtok [as alias]
# Or: from revtok import tokenize [as alias]
def get_tokenizer(tokenizer, decap=False):
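    # Note: the decap argument is accepted but never used in this variant,
    # so 'subword' behaves identically to 'revtok' here.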
    if callable(tokenizer):
        return tokenizer
    if tokenizer == "spacy":
        try:
            import spacy
            spacy_en = spacy.load('en')
            return lambda s: [tok.text for tok in spacy_en.tokenizer(s)]
        except ImportError:
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
        except AttributeError:
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
    elif tokenizer == "moses":
        try:
            from nltk.tokenize.moses import MosesTokenizer
            moses_tokenizer = MosesTokenizer()
            return moses_tokenizer.tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at http://nltk.org for more information.")
            raise
        except LookupError:
            print("Please install the necessary NLTK corpora. "
                  "See the docs at http://nltk.org for more information.")
            raise
    elif tokenizer == 'revtok':
        try:
            import revtok
            return revtok.tokenize
        except ImportError:
            print("Please install revtok.")
            raise
    elif tokenizer == 'subword':
        try:
            import revtok
            return revtok.tokenize
        except ImportError:
            print("Please install revtok.")
            raise
    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"revtok\" for the revtok reversible tokenizer, "
                     "\"subword\" for the revtok caps-aware tokenizer, "
                     "\"spacy\" for the SpaCy English tokenizer, or "
                     "\"moses\" for the NLTK port of the Moses tokenization "
                     "script.".format(tokenizer)) 
Author: salesforce, Project: decaNLP, Lines: 52, Source: utils.py
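A hedged usage sketch (assumes revtok is installed; the input string is illustrative). Note that spacy.load('en') and nltk.tokenize.moses reflect the library versions current when this code was written; newer spaCy releases use model names like 'en_core_web_sm', and NLTK has since moved Moses tokenization to the separate sacremoses package.

tokenize = get_tokenizer('revtok')
tokens = tokenize('The quick brown fox.')

# Any callable passes straight through unchanged:
tokenize = get_tokenizer(str.split)
assert tokenize('a b c') == ['a', 'b', 'c']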

Example 8: get_tokenizer

# Required module: import revtok [as alias]
# Or: from revtok import tokenize [as alias]
def get_tokenizer(tokenizer):
    if callable(tokenizer):
        return tokenizer
    if tokenizer == "spacy":
        try:
            import spacy
            spacy_en = spacy.load('en')
            return lambda s: [tok.text for tok in spacy_en.tokenizer(s)]
        except ImportError:
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
        except AttributeError:
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
    elif tokenizer == "moses":
        try:
            from nltk.tokenize.moses import MosesTokenizer
            moses_tokenizer = MosesTokenizer()
            return moses_tokenizer.tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at http://nltk.org for more information.")
            raise
        except LookupError:
            print("Please install the necessary NLTK corpora. "
                  "See the docs at http://nltk.org for more information.")
            raise
    elif tokenizer == 'revtok':
        try:
            import revtok
            return revtok.tokenize
        except ImportError:
            print("Please install revtok.")
            raise
    elif tokenizer == 'subword':
        try:
            import revtok
            return lambda x: revtok.tokenize(x, decap=True)
        except ImportError:
            print("Please install revtok.")
            raise
    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"revtok\" for the revtok reversible tokenizer, "
                     "\"subword\" for the revtok caps-aware tokenizer, "
                     "\"spacy\" for the SpaCy English tokenizer, or "
                     "\"moses\" for the NLTK port of the Moses tokenization "
                     "script.".format(tokenizer)) 
Author: aimagelab, Project: speaksee, Lines: 52, Source: utils.py
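The only difference from Example 7 is the 'subword' branch, which here enables revtok's decap option, the caps-aware mode referred to in the error message:

tokenize = get_tokenizer('subword')
tokens = tokenize('Hello World')    # same as revtok.tokenize('Hello World', decap=True)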


Note: the revtok.tokenize method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. For distribution and use, refer to the corresponding project's license; do not repost without permission.