

Python tokenizer.tokenize_line Method Code Examples

This article collects typical usage examples of the Python method fairseq.tokenizer.tokenize_line. If you are wondering what tokenizer.tokenize_line does exactly and how to use it, the curated code examples below may help. You can also explore other usage examples from the fairseq.tokenizer module.


The following presents 15 code examples of tokenizer.tokenize_line, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
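Before the examples, a minimal usage sketch of tokenize_line itself may help: in fairseq it normalizes whitespace and splits a raw line into a list of token strings. The sample line and the printed output below are illustrative, assuming the standard whitespace-splitting behavior.

from fairseq.tokenizer import tokenize_line

# Extra spaces are collapsed; the result is a plain list of string tokens.
line = "Hello   world , this is  fairseq ."
tokens = tokenize_line(line)
print(tokens)  # ['Hello', 'world', ',', 'this', 'is', 'fairseq', '.']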

Example 1: build_dictionary

# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def build_dictionary(
        cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
    ):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = Dictionary()
        for filename in filenames:
            Dictionary.add_file_to_dictionary(
                filename, d, tokenizer.tokenize_line, workers
            )
        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
        return d 
Developer: pytorch, Project: fairseq, Lines of code: 24, Source file: fairseq_task.py
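A hedged usage sketch (not part of the example above): since build_dictionary is a classmethod inherited by concrete tasks, it can be called on a task class such as TranslationTask. The file paths and parameter values here are illustrative.

from fairseq.tasks.translation import TranslationTask

# Build a dictionary from raw token files and save it to disk.
d = TranslationTask.build_dictionary(["train.en", "valid.en"], workers=4, threshold=2)
d.save("dict.en.txt")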

Example 2: build_dictionary

# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def build_dictionary(
        cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
    ):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = MaskedLMDictionary()
        for filename in filenames:
            MaskedLMDictionary.add_file_to_dictionary(
                filename, d, tokenizer.tokenize_line, workers
            )
        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
        return d 
Developer: pytorch, Project: translate, Lines of code: 24, Source file: translation_from_pretrained_xlm.py

Example 3: build_dictionary

# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = Dictionary()
        for filename in filenames:
            Dictionary.add_file_to_dictionary(filename, d, tokenizer.tokenize_line, workers)
        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
        return d 
Developer: kakaobrain, Project: helo_word, Lines of code: 20, Source file: fairseq_task.py

Example 4: encode_line

# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def encode_line(self, line, line_tokenizer=tokenize_line, add_if_not_exist=True,
                    consumer=None, append_eos=True, reverse_order=False):
        words = line_tokenizer(line)
        if reverse_order:
            words = list(reversed(words))
        nwords = len(words)
        ids = torch.IntTensor(nwords + 1 if append_eos else nwords)

        for i, word in enumerate(words):
            if add_if_not_exist:
                idx = self.add_symbol(word)
            else:
                idx = self.index(word)
            if consumer is not None:
                consumer(word, idx)
            ids[i] = idx
        if append_eos:
            ids[nwords] = self.eos_index
        return ids 
Developer: kakaobrain, Project: helo_word, Lines of code: 21, Source file: dictionary.py
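A short usage sketch of encode_line, assuming a plain fairseq Dictionary; the input sentence is illustrative and the resulting indices depend on the dictionary contents.

from fairseq.data import Dictionary
from fairseq.tokenizer import tokenize_line

d = Dictionary()
# With add_if_not_exist=True, unseen words are added to the dictionary on the fly.
ids = d.encode_line("the cat sat", line_tokenizer=tokenize_line, add_if_not_exist=True, append_eos=True)
print(ids)  # IntTensor of word indices, ending with the eos index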

Example 5: encode_labels_line

# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def encode_labels_line(labels_line, append_eos=True, reverse_order=False):
    """Custom helper:
    Encode a string of space-separated binary labels into LongTensor.

    Mimicks fairseq.data.dictionary.Dictionary.encode_line().
    eos always gets a zero token (no change).

    Returns a torch.IntTensor, analogous to dictionary's encode_line() method.
    """
    labels = [int(label) for label in tokenize_line(labels_line)]
    assert all([label in [0, 1] for label in labels]), \
        f"encode_labels_line: token-level labels must be binary!"
    if reverse_order:
        labels = list(reversed(labels))
    if append_eos:
        labels.append(0)
    return torch.tensor(labels, dtype=torch.int) 
Developer: kakaobrain, Project: helo_word, Lines of code: 19, Source file: token_labeled_language_pair_dataset.py
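Hypothetical call of the helper above: a space-separated binary label string becomes an IntTensor, with a trailing 0 appended for eos when append_eos=True.

labels = encode_labels_line("0 1 0", append_eos=True)
print(labels)  # tensor([0, 1, 0, 0], dtype=torch.int32)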

Example 6: replace_unk

# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def replace_unk(hypo_str, src_str, alignment, align_dict, unk):
    from fairseq import tokenizer
    # Tokens are strings here
    hypo_tokens = tokenizer.tokenize_line(hypo_str)
    # TODO: Very rare cases where the replacement is '<eos>' should be handled gracefully
    src_tokens = tokenizer.tokenize_line(src_str) + ['<eos>']
    for i, ht in enumerate(hypo_tokens):
        if ht == unk:
            src_token = src_tokens[alignment[i]]
            # Either take the corresponding value in the aligned dictionary or just copy the original value.
            hypo_tokens[i] = align_dict.get(src_token, src_token)
    return ' '.join(hypo_tokens) 
Developer: nusnlp, Project: crosentgec, Lines of code: 14, Source file: utils.py
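A small illustrative call of replace_unk (all strings and alignments here are made up): the unknown token in the hypothesis is replaced either by the align_dict entry for the aligned source token or, if absent, by the source token itself.

hypo_str = "das ist <unk>"
src_str = "this is great"
alignment = [0, 1, 2]            # hypothesis position i is aligned to source position alignment[i]
align_dict = {"great": "toll"}   # optional source-to-target replacement table
print(replace_unk(hypo_str, src_str, alignment, align_dict, unk="<unk>"))
# -> das ist toll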

Example 7: build_dictionary

# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8):
        d = BertDictionary()
        for filename in filenames:
            Dictionary.add_file_to_dictionary(filename, d, tokenizer.tokenize_line, workers)
        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
        return d 
Developer: pytorch, Project: fairseq, Lines of code: 8, Source file: legacy_masked_lm.py

Example 8: build_dictionary

# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8):
        d = MaskedLMDictionary()
        for filename in filenames:
            Dictionary.add_file_to_dictionary(filename, d, tokenizer.tokenize_line, workers)
        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
        return d 
Developer: pytorch, Project: fairseq, Lines of code: 8, Source file: cross_lingual_lm.py

Example 9: encode_line

# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def encode_line(
        self,
        line,
        line_tokenizer=tokenize_line,
        add_if_not_exist=True,
        consumer=None,
        append_eos=True,
        reverse_order=False,
    ):
        words = line_tokenizer(line)
        if reverse_order:
            words = list(reversed(words))
        nwords = len(words)
        ids = torch.IntTensor(nwords + 1 if append_eos else nwords)

        for i, word in enumerate(words):
            if add_if_not_exist:
                idx = self.add_symbol(word)
            else:
                idx = self.index(word)
            if consumer is not None:
                consumer(word, idx)
            ids[i] = idx
        if append_eos:
            ids[nwords] = self.eos_index
        return ids 
Developer: pytorch, Project: fairseq, Lines of code: 28, Source file: dictionary.py

Example 10: build_dictionary

# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def build_dictionary(
        cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
    ):
        d = MaskedLMDictionary()
        for filename in filenames:
            MaskedLMDictionary.add_file_to_dictionary(
                filename, d, tokenizer.tokenize_line, workers
            )
        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
        return d 
Developer: pytorch, Project: translate, Lines of code: 12, Source file: cross_lingual_lm.py

Example 11: _sent_to_word_ids

# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def _sent_to_word_ids(
        self, sent, word_dict, reverse_order, prepend_inds, append_inds
    ):
        """
        Extract the word ids for words associated with the input sentence.
        """
        words = tokenizer.tokenize_line(sent)
        if reverse_order:
            words.reverse()
        word_inds = [word_dict.index(w) for w in words]
        word_inds = prepend_inds + word_inds + append_inds
        return words, word_inds 
Developer: pytorch, Project: translate, Lines of code: 14, Source file: char_data.py

Example 12: replace_unk

# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def replace_unk(hypo_str, src_str, alignment, align_dict, unk):
    # Tokens are strings here
    hypo_tokens = tokenizer.tokenize_line(hypo_str)
    # TODO: Very rare cases where the replacement is '<eos>' should be handled gracefully
    src_tokens = tokenizer.tokenize_line(src_str) + ['<eos>']
    for i, ht in enumerate(hypo_tokens):
        if ht == unk:
            src_token = src_tokens[alignment[i]]
            # Either take the corresponding value in the aligned dictionary or just copy the original value.
            hypo_tokens[i] = align_dict.get(src_token, src_token)
    return ' '.join(hypo_tokens) 
Developer: EdinburghNLP, Project: XSum, Lines of code: 13, Source file: utils.py

Example 13: binarize

# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def binarize(filename, dict, consumer, tokenize=tokenize_line, append_eos=True, reverse_order=False,
                 offset=0, end=-1):
        nseq, ntok = 0, 0
        replaced = Counter()

        def replaced_consumer(word, idx):
            if idx == dict.unk_index and word != dict.unk_word:
                replaced.update([word])

        with open(filename, 'r', encoding='utf-8') as f:
            f.seek(offset)
            # next(f) breaks f.tell(), hence readline() must be used
            line = safe_readline(f)
            while line:
                if end > 0 and f.tell() > end:
                    break
                ids = dict.encode_line(
                        line=line,
                        line_tokenizer=tokenize,
                        add_if_not_exist=False,
                        consumer=replaced_consumer,
                        append_eos=append_eos,
                        reverse_order=reverse_order,
                )
                nseq += 1
                ntok += len(ids)
                consumer(ids)
                line = f.readline()
        return {'nseq': nseq, 'nunk': sum(replaced.values()), 'ntok': ntok, 'replaced': replaced} 
Developer: kakaobrain, Project: helo_word, Lines of code: 31, Source file: binarizer.py

Example 14: build_dictionary

# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1,
                         padding_factor=8):
        """Build the dictionary from edit-labeled raw text inputs.

        Each file contains tokenized sentences along with their token labels:
        ```text
        My teacher is going to move to change his job .
        0 0 0 0 0 0 0 0 0 0 0
        And he took in my favorite subject like soccer .
        0 0 0 0 0 0 1 0 0 0
        ...
        ```
        A dictionary is built using only the tokens and not token labels.

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = Dictionary()
        for filename in filenames:
            # Write only tokens to a separate file.
            with open(filename) as f_in, \
                    open(f"{filename}.tokens", "w") as f_out:
                f_out.writelines(line for i, line in enumerate(f_in)
                                 if i % 2 == 0)
            # Add tokens to dictionary with multiprocessing.
            Dictionary.add_file_to_dictionary(f"{filename}.tokens", d,
                                              tokenizer.tokenize_line, workers)
        d.finalize(threshold=threshold, nwords=nwords,
                   padding_factor=padding_factor)
        return d 
Developer: kakaobrain, Project: helo_word, Lines of code: 39, Source file: gec.py

Example 15: binarize

# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def binarize(
        filename,
        dict,
        consumer,
        tokenize=tokenize_line,
        append_eos=True,
        reverse_order=False,
        offset=0,
        end=-1,
        already_numberized=False,
    ):
        nseq, ntok = 0, 0
        replaced = Counter()

        def replaced_consumer(word, idx):
            if idx == dict.unk_index and word != dict.unk_word:
                replaced.update([word])

        with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f:
            f.seek(offset)
            # next(f) breaks f.tell(), hence readline() must be used
            line = safe_readline(f)
            while line:
                if end > 0 and f.tell() > end:
                    break
                if already_numberized:
                    id_strings = line.strip().split()
                    id_list = [int(id_string) for id_string in id_strings]
                    if reverse_order:
                        id_list.reverse()
                    if append_eos:
                        id_list.append(dict.eos())
                    ids = torch.IntTensor(id_list)
                else:
                    ids = dict.encode_line(
                        line=line,
                        line_tokenizer=tokenize,
                        add_if_not_exist=False,
                        consumer=replaced_consumer,
                        append_eos=append_eos,
                        reverse_order=reverse_order,
                    )
                nseq += 1
                ntok += len(ids)
                consumer(ids)
                line = f.readline()
        return {
            "nseq": nseq,
            "nunk": sum(replaced.values()),
            "ntok": ntok,
            "replaced": replaced,
        } 
Developer: pytorch, Project: fairseq, Lines of code: 54, Source file: binarizer.py
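A hedged usage sketch of the binarizer (the import path, dictionary file, and corpus file are assumptions based on older fairseq releases, where Binarizer.binarize is a static method with the signature shown above): the consumer simply collects the encoded IntTensors.

from fairseq.data import Dictionary
from fairseq.binarizer import Binarizer

d = Dictionary.load("dict.en.txt")   # hypothetical dictionary file
encoded = []                          # collects one IntTensor per input line
stats = Binarizer.binarize("train.en", d, encoded.append, append_eos=True)
print(stats["nseq"], stats["ntok"], stats["nunk"])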


Note: The fairseq.tokenizer.tokenize_line method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors. For redistribution and use, please refer to the corresponding project's license. Do not reproduce without permission.