當前位置: 首頁>>代碼示例>>Python>>正文


Python revtok.tokenize方法代碼示例

本文整理匯總了Python中revtok.tokenize方法的典型用法代碼示例。如果您正苦於以下問題:Python revtok.tokenize方法的具體用法?Python revtok.tokenize怎麽用?Python revtok.tokenize使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在revtok的用法示例。


在下文中一共展示了revtok.tokenize方法的8個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。

示例1: computeBLEUMSCOCO

# 需要導入模塊: import revtok [as 別名]
# 或者: from revtok import tokenize [as 別名]
def computeBLEUMSCOCO(outputs, targets, corpus=True, tokenizer=None):
    """Score MSCOCO-style captions with BLEU.

    Args:
        outputs: list of candidate captions (one string per image).
        targets: list of reference groups; each entry is a list of
            reference caption strings for the same image.
        corpus: if True, return one corpus-level BLEU (multi-bleu
            emulation); otherwise a list of per-sentence BLEU scores.
        tokenizer: callable mapping a string to a token list; defaults
            to revtok.tokenize.
    """
    tokenizer = revtok.tokenize if tokenizer is None else tokenizer

    hyps = [tokenizer(caption) for caption in outputs]
    # Tokenize every reference in every group, keeping the grouping.
    ref_groups = [[tokenizer(ref) for ref in group] for group in targets]

    if corpus:
        return corpus_bleu(ref_groups, hyps, emulate_multibleu=True)
    return [sentence_bleu(group, hyp)[0]
            for hyp, group in zip(hyps, ref_groups)]
開發者ID:nyu-dl,項目名稱:dl4mt-nonauto,代碼行數:18,代碼來源:utils.py

示例2: computeGroupBLEU

# 需要導入模塊: import revtok [as 別名]
# 或者: from revtok import tokenize [as 別名]
def computeGroupBLEU(outputs, targets, tokenizer=None, bra=10, maxmaxlen=80):
    """Print a corpus BLEU per length bucket.

    Hypothesis/reference pairs are assigned to buckets of width `bra`
    tokens by hypothesis length (the longest reference length, capped at
    `maxmaxlen`, determines the number of buckets); pairs longer than the
    last bucket fall into it.
    """
    if tokenizer is None:
        tokenizer = revtok.tokenize

    hyps = [tokenizer(o) for o in outputs]
    refs = [tokenizer(t) for t in targets]
    longest = max(len(r) for r in refs)
    print(longest)
    longest = min(longest, maxmaxlen)
    n_buckets = int(np.ceil(longest / bra))
    hyp_buckets = [[] for _ in range(n_buckets)]
    ref_buckets = [[] for _ in range(n_buckets)]
    for hyp, ref in zip(hyps, refs):
        # Overflow lengths are clamped into the final bucket.
        bucket = min(len(hyp) // bra, n_buckets - 1)
        hyp_buckets[bucket].append(hyp)
        ref_buckets[bucket].append(ref)

    for k in range(n_buckets):
        print(corpus_bleu([[r] for r in ref_buckets[k]],
                          list(hyp_buckets[k]),
                          emulate_multibleu=True))
開發者ID:nyu-dl,項目名稱:dl4mt-nonauto,代碼行數:23,代碼來源:utils.py

示例3: computeGroupBLEU

# 需要導入模塊: import revtok [as 別名]
# 或者: from revtok import tokenize [as 別名]
def computeGroupBLEU(outputs, targets, tokenizer=None, bra=10, maxmaxlen=80):
    """Bucket sentence pairs by hypothesis length and print each bucket's BLEU.

    Buckets are `bra` tokens wide; the bucket count derives from the longest
    reference (capped at `maxmaxlen`), and overlong hypotheses land in the
    last bucket.
    """
    tok = tokenizer if tokenizer is not None else revtok.tokenize

    out_toks = list(map(tok, outputs))
    tgt_toks = list(map(tok, targets))
    max_len = max(map(len, tgt_toks))
    print(max_len)
    max_len = min(max_len, maxmaxlen)
    num_buckets = int(np.ceil(max_len / bra))
    bucketed_out = [[] for _ in range(num_buckets)]
    bucketed_tgt = [[] for _ in range(num_buckets)]
    for out_tok, tgt_tok in zip(out_toks, tgt_toks):
        b = len(out_tok) // bra
        if b >= num_buckets:
            b = -1  # overflow -> last bucket
        bucketed_out[b].append(out_tok)
        bucketed_tgt[b].append(tgt_tok)

    for b in range(num_buckets):
        print(corpus_bleu([[t] for t in bucketed_tgt[b]],
                          [o for o in bucketed_out[b]],
                          emulate_multibleu=True))


# load the dataset + reversible tokenization 
開發者ID:salesforce,項目名稱:nonauto-nmt,代碼行數:26,代碼來源:utils.py

示例4: computeGLEU

# 需要導入模塊: import revtok [as 別名]
# 或者: from revtok import tokenize [as 別名]
def computeGLEU(outputs, targets, corpus=False, tokenizer=None):
    """Compute GLEU between hypotheses and single references.

    Returns a corpus-level GLEU score when `corpus` is True, otherwise a
    torch.Tensor of per-sentence GLEU scores. Inputs are untokenized
    strings; `tokenizer` defaults to revtok.tokenize.
    """
    tok = revtok.tokenize if tokenizer is None else tokenizer

    hyp_tokens = [tok(o) for o in outputs]
    ref_tokens = [tok(t) for t in targets]

    if corpus:
        return corpus_gleu([[r] for r in ref_tokens], list(hyp_tokens))
    scores = [sentence_gleu([r], h) for h, r in zip(hyp_tokens, ref_tokens)]
    return torch.Tensor(scores)
開發者ID:nyu-dl,項目名稱:dl4mt-nonauto,代碼行數:13,代碼來源:utils.py

示例5: computeBLEU

# 需要導入模塊: import revtok [as 別名]
# 或者: from revtok import tokenize [as 別名]
def computeBLEU(outputs, targets, corpus=False, tokenizer=None):
    """Compute BLEU between hypotheses and single references.

    corpus=True  -> one corpus-level BLEU (multi-bleu emulation).
    corpus=False -> list of per-sentence BLEU scores.
    `tokenizer` defaults to revtok.tokenize.
    """
    tok = revtok.tokenize if tokenizer is None else tokenizer

    hyps = [tok(o) for o in outputs]
    refs = [tok(t) for t in targets]

    if not corpus:
        return [sentence_bleu([r], h)[0] for h, r in zip(hyps, refs)]
    return corpus_bleu([[r] for r in refs], list(hyps),
                       emulate_multibleu=True)
開發者ID:nyu-dl,項目名稱:dl4mt-nonauto,代碼行數:14,代碼來源:utils.py

示例6: computeBLEU

# 需要導入模塊: import revtok [as 別名]
# 或者: from revtok import tokenize [as 別名]
def computeBLEU(outputs, targets, corpus=False, tokenizer=None):
    # Score hypotheses `outputs` against single references `targets`
    # (both lists of untokenized strings).
    #
    # NOTE(review): despite the name, the sentence-level branch scores with
    # sentence_gleu (GLEU), not sentence_bleu — only the corpus branch uses
    # BLEU. Looks like a copy/paste slip from computeGLEU; confirm against
    # callers before changing.
    if tokenizer is None:
        # Default to the reversible tokenizer.
        tokenizer = revtok.tokenize

    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]

    if not corpus:
        # Per-sentence scores as a 1-D tensor (GLEU — see NOTE above).
        return torch.Tensor([sentence_gleu(
            [t],  o) for o, t in zip(outputs, targets)])
    # Corpus-level BLEU, emulating multi-bleu.perl behaviour.
    return corpus_bleu([[t] for t in targets], [o for o in outputs], emulate_multibleu=True)
開發者ID:salesforce,項目名稱:nonauto-nmt,代碼行數:13,代碼來源:utils.py

示例7: get_tokenizer

# 需要導入模塊: import revtok [as 別名]
# 或者: from revtok import tokenize [as 別名]
def get_tokenizer(tokenizer, decap=False):
    """Resolve `tokenizer` to a callable mapping a string to a token list.

    Args:
        tokenizer: a callable (returned unchanged) or one of the names
            "spacy", "moses", "revtok", "subword".
        decap: for the revtok-based "subword" tokenizer, forwarded to
            revtok.tokenize as its `decap` flag (caps-aware mode).

    Returns:
        A callable str -> list of tokens.

    Raises:
        ValueError: if `tokenizer` is neither callable nor a known name.
        ImportError / AttributeError / LookupError: re-raised after
            printing an installation hint when a backing package or its
            data is missing.
    """
    if callable(tokenizer):
        return tokenizer
    if tokenizer == "spacy":
        try:
            import spacy
            spacy_en = spacy.load('en')
            return lambda s: [tok.text for tok in spacy_en.tokenizer(s)]
        except ImportError:
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
        except AttributeError:
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
    elif tokenizer == "moses":
        try:
            from nltk.tokenize.moses import MosesTokenizer
            moses_tokenizer = MosesTokenizer()
            return moses_tokenizer.tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at http://nltk.org for more information.")
            raise
        except LookupError:
            print("Please install the necessary NLTK corpora. "
                  "See the docs at http://nltk.org for more information.")
            raise
    elif tokenizer == 'revtok':
        try:
            import revtok
            return revtok.tokenize
        except ImportError:
            print("Please install revtok.")
            raise
    elif tokenizer == 'subword':
        try:
            import revtok
            # BUG FIX: the original returned plain revtok.tokenize here,
            # silently ignoring `decap`, making "subword" identical to
            # "revtok". Forward the flag so the caps-aware mode is reachable;
            # the default decap=False preserves the old behaviour.
            return lambda x: revtok.tokenize(x, decap=decap)
        except ImportError:
            print("Please install revtok.")
            raise
    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"revtok\" for the revtok reversible tokenizer, "
                     "\"subword\" for the revtok caps-aware tokenizer, "
                     "\"spacy\" for the SpaCy English tokenizer, or "
                     "\"moses\" for the NLTK port of the Moses tokenization "
                     "script.".format(tokenizer))
開發者ID:salesforce,項目名稱:decaNLP,代碼行數:52,代碼來源:utils.py

示例8: get_tokenizer

# 需要導入模塊: import revtok [as 別名]
# 或者: from revtok import tokenize [as 別名]
def get_tokenizer(tokenizer):
    """Turn `tokenizer` into a callable that splits a string into tokens.

    Accepts either a callable (returned as-is) or one of the names
    "spacy", "moses", "revtok", "subword". Unknown names raise ValueError;
    missing backing packages print an installation hint and re-raise.
    """
    if callable(tokenizer):
        return tokenizer

    if tokenizer == "spacy":
        try:
            import spacy
            nlp_en = spacy.load('en')
            return lambda text: [token.text for token in nlp_en.tokenizer(text)]
        except (ImportError, AttributeError):
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise

    if tokenizer == "moses":
        try:
            from nltk.tokenize.moses import MosesTokenizer
            return MosesTokenizer().tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at http://nltk.org for more information.")
            raise
        except LookupError:
            print("Please install the necessary NLTK corpora. "
                  "See the docs at http://nltk.org for more information.")
            raise

    if tokenizer == 'revtok':
        try:
            import revtok
        except ImportError:
            print("Please install revtok.")
            raise
        return revtok.tokenize

    if tokenizer == 'subword':
        try:
            import revtok
        except ImportError:
            print("Please install revtok.")
            raise
        # Caps-aware sub-word mode of the reversible tokenizer.
        return lambda text: revtok.tokenize(text, decap=True)

    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"revtok\" for the revtok reversible tokenizer, "
                     "\"subword\" for the revtok caps-aware tokenizer, "
                     "\"spacy\" for the SpaCy English tokenizer, or "
                     "\"moses\" for the NLTK port of the Moses tokenization "
                     "script.".format(tokenizer))
開發者ID:aimagelab,項目名稱:speaksee,代碼行數:52,代碼來源:utils.py


注:本文中的revtok.tokenize方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。