This article collects typical usage examples of the Python method fairseq.tokenizer.tokenize_line. If you are unsure how to use tokenizer.tokenize_line, or are looking for real-world examples of it, the curated code samples below may help. You can also explore further usage examples from the fairseq.tokenizer module that contains this method.
The following shows 15 code examples of tokenizer.tokenize_line, sorted by popularity by default. You can vote for the examples you like or find useful; your feedback helps the system recommend better Python code examples.
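For orientation: in the fairseq releases these examples are drawn from, tokenize_line simply normalizes runs of whitespace and splits the line into a list of token strings. A minimal sketch of calling it directly (the sample string and expected output are illustrative):

from fairseq.tokenizer import tokenize_line

print(tokenize_line("Hello   world\tfoo \n"))  # expected: ['Hello', 'world', 'foo']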
Example 1: build_dictionary
# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def build_dictionary(
    cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
):
    """Build the dictionary

    Args:
        filenames (list): list of filenames
        workers (int): number of concurrent workers
        threshold (int): defines the minimum word count
        nwords (int): defines the total number of words in the final dictionary,
            including special symbols
        padding_factor (int): can be used to pad the dictionary size to be a
            multiple of 8, which is important on some hardware (e.g., Nvidia
            Tensor Cores).
    """
    d = Dictionary()
    for filename in filenames:
        Dictionary.add_file_to_dictionary(
            filename, d, tokenizer.tokenize_line, workers
        )
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
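A usage sketch for the pattern above, building a small vocabulary outside of a task class (assuming fairseq is installed and corpus.txt is a whitespace-tokenized text file):

from fairseq import tokenizer
from fairseq.data import Dictionary

d = Dictionary()
Dictionary.add_file_to_dictionary("corpus.txt", d, tokenizer.tokenize_line, 1)  # 1 worker
d.finalize(threshold=-1, nwords=-1, padding_factor=8)
print(len(d), d.symbols[:8])  # vocabulary size and the first few symbols (special tokens come first)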
Example 2: build_dictionary
# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def build_dictionary(
    cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
):
    """Build the dictionary

    Args:
        filenames (list): list of filenames
        workers (int): number of concurrent workers
        threshold (int): defines the minimum word count
        nwords (int): defines the total number of words in the final dictionary,
            including special symbols
        padding_factor (int): can be used to pad the dictionary size to be a
            multiple of 8, which is important on some hardware (e.g., Nvidia
            Tensor Cores).
    """
    d = MaskedLMDictionary()
    for filename in filenames:
        MaskedLMDictionary.add_file_to_dictionary(
            filename, d, tokenizer.tokenize_line, workers
        )
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
Example 3: build_dictionary
# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8):
    """Build the dictionary

    Args:
        filenames (list): list of filenames
        workers (int): number of concurrent workers
        threshold (int): defines the minimum word count
        nwords (int): defines the total number of words in the final dictionary,
            including special symbols
        padding_factor (int): can be used to pad the dictionary size to be a
            multiple of 8, which is important on some hardware (e.g., Nvidia
            Tensor Cores).
    """
    d = Dictionary()
    for filename in filenames:
        Dictionary.add_file_to_dictionary(filename, d, tokenizer.tokenize_line, workers)
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
Example 4: encode_line
# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def encode_line(self, line, line_tokenizer=tokenize_line, add_if_not_exist=True,
                consumer=None, append_eos=True, reverse_order=False):
    words = line_tokenizer(line)
    if reverse_order:
        words = list(reversed(words))
    nwords = len(words)
    ids = torch.IntTensor(nwords + 1 if append_eos else nwords)
    for i, word in enumerate(words):
        if add_if_not_exist:
            idx = self.add_symbol(word)
        else:
            idx = self.index(word)
        if consumer is not None:
            consumer(word, idx)
        ids[i] = idx
    if append_eos:
        ids[nwords] = self.eos_index
    return ids
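A small usage sketch of Dictionary.encode_line as defined above (the exact indices depend on the dictionary's special symbols and insertion order):

from fairseq.data import Dictionary

d = Dictionary()
ids = d.encode_line("hello world hello", add_if_not_exist=True, append_eos=True)
print(ids)            # e.g. tensor([4, 5, 4, 2], dtype=torch.int32), with 2 being eos here
print(d.string(ids))  # "hello world hello" -- string() skips the eos symbol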
Example 5: encode_labels_line
# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def encode_labels_line(labels_line, append_eos=True, reverse_order=False):
    """Custom helper:
    Encode a string of space-separated binary labels into an IntTensor.
    Mimics fairseq.data.dictionary.Dictionary.encode_line();
    eos always gets a zero label (no change).
    Returns a torch.IntTensor, analogous to the dictionary's encode_line() method.
    """
    labels = [int(label) for label in tokenize_line(labels_line)]
    assert all(label in (0, 1) for label in labels), \
        "encode_labels_line: token-level labels must be binary!"
    if reverse_order:
        labels = list(reversed(labels))
    if append_eos:
        labels.append(0)
    return torch.tensor(labels, dtype=torch.int)
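For illustration, calling the helper on a short label line (behavior taken directly from the code above):

print(encode_labels_line("0 0 1 0"))
# tensor([0, 0, 1, 0, 0], dtype=torch.int32) -- the trailing 0 labels the appended eos position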
Example 6: replace_unk
# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def replace_unk(hypo_str, src_str, alignment, align_dict, unk):
    from fairseq import tokenizer
    # Tokens are strings here
    hypo_tokens = tokenizer.tokenize_line(hypo_str)
    # TODO: Very rare cases where the replacement is '<eos>' should be handled gracefully
    src_tokens = tokenizer.tokenize_line(src_str) + ['<eos>']
    for i, ht in enumerate(hypo_tokens):
        if ht == unk:
            src_token = src_tokens[alignment[i]]
            # Either take the corresponding value from the alignment dictionary or just copy the original value.
            hypo_tokens[i] = align_dict.get(src_token, src_token)
    return ' '.join(hypo_tokens)
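A toy call showing the intent; the sentences, alignment, and align_dict below are made up for illustration:

align_dict = {"Katze": "cat"}  # hypothetical source-to-target lexicon
hypo = replace_unk(
    hypo_str="the <unk> sat",
    src_str="die Katze sass",
    alignment=[0, 1, 2],       # hypothesis position i is aligned to source position alignment[i]
    align_dict=align_dict,
    unk="<unk>",
)
print(hypo)  # "the cat sat"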
Example 7: build_dictionary
# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8):
    d = BertDictionary()
    for filename in filenames:
        Dictionary.add_file_to_dictionary(filename, d, tokenizer.tokenize_line, workers)
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
Example 8: build_dictionary
# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8):
    d = MaskedLMDictionary()
    for filename in filenames:
        Dictionary.add_file_to_dictionary(filename, d, tokenizer.tokenize_line, workers)
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
Example 9: encode_line
# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def encode_line(
    self,
    line,
    line_tokenizer=tokenize_line,
    add_if_not_exist=True,
    consumer=None,
    append_eos=True,
    reverse_order=False,
):
    words = line_tokenizer(line)
    if reverse_order:
        words = list(reversed(words))
    nwords = len(words)
    ids = torch.IntTensor(nwords + 1 if append_eos else nwords)
    for i, word in enumerate(words):
        if add_if_not_exist:
            idx = self.add_symbol(word)
        else:
            idx = self.index(word)
        if consumer is not None:
            consumer(word, idx)
        ids[i] = idx
    if append_eos:
        ids[nwords] = self.eos_index
    return ids
Example 10: build_dictionary
# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def build_dictionary(
    cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
):
    d = MaskedLMDictionary()
    for filename in filenames:
        MaskedLMDictionary.add_file_to_dictionary(
            filename, d, tokenizer.tokenize_line, workers
        )
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
Example 11: _sent_to_word_ids
# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def _sent_to_word_ids(
    self, sent, word_dict, reverse_order, prepend_inds, append_inds
):
    """
    Extract the word ids for words associated with the input sentence.
    """
    words = tokenizer.tokenize_line(sent)
    if reverse_order:
        words.reverse()
    word_inds = [word_dict.index(w) for w in words]
    word_inds = prepend_inds + word_inds + append_inds
    return words, word_inds
Example 12: replace_unk
# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def replace_unk(hypo_str, src_str, alignment, align_dict, unk):
    # Tokens are strings here
    hypo_tokens = tokenizer.tokenize_line(hypo_str)
    # TODO: Very rare cases where the replacement is '<eos>' should be handled gracefully
    src_tokens = tokenizer.tokenize_line(src_str) + ['<eos>']
    for i, ht in enumerate(hypo_tokens):
        if ht == unk:
            src_token = src_tokens[alignment[i]]
            # Either take the corresponding value from the alignment dictionary or just copy the original value.
            hypo_tokens[i] = align_dict.get(src_token, src_token)
    return ' '.join(hypo_tokens)
Example 13: binarize
# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def binarize(filename, dict, consumer, tokenize=tokenize_line, append_eos=True, reverse_order=False,
             offset=0, end=-1):
    nseq, ntok = 0, 0
    replaced = Counter()

    def replaced_consumer(word, idx):
        if idx == dict.unk_index and word != dict.unk_word:
            replaced.update([word])

    with open(filename, 'r', encoding='utf-8') as f:
        f.seek(offset)
        # next(f) breaks f.tell(), hence readline() must be used
        line = safe_readline(f)
        while line:
            if end > 0 and f.tell() > end:
                break
            ids = dict.encode_line(
                line=line,
                line_tokenizer=tokenize,
                add_if_not_exist=False,
                consumer=replaced_consumer,
                append_eos=append_eos,
                reverse_order=reverse_order,
            )
            nseq += 1
            ntok += len(ids)
            consumer(ids)
            line = f.readline()
    return {'nseq': nseq, 'nunk': sum(replaced.values()), 'ntok': ntok, 'replaced': replaced}
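A usage sketch for binarize, assuming the function above and fairseq's safe_readline helper are importable in the current scope and that corpus.txt exists; the consumer simply collects the per-sentence tensors:

from fairseq.data import Dictionary
from fairseq.tokenizer import tokenize_line

d = Dictionary()
Dictionary.add_file_to_dictionary("corpus.txt", d, tokenize_line, 1)
d.finalize()

tensors = []
stats = binarize("corpus.txt", d, consumer=tensors.append, tokenize=tokenize_line)
print(stats["nseq"], stats["ntok"], stats["nunk"])  # sentence, token, and unknown-token counts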
Example 14: build_dictionary
# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1,
                     padding_factor=8):
    """Build the dictionary from edit-labeled raw text inputs.

    Each file contains tokenized sentences along with their token labels:
    ```text
    My teacher is going to move to change his job .
    0 0 0 0 0 0 0 0 0 0 0
    And he took in my favorite subject like soccer .
    0 0 0 0 0 0 1 0 0 0
    ...
    ```
    The dictionary is built using only the tokens, not the token labels.

    Args:
        filenames (list): list of filenames
        workers (int): number of concurrent workers
        threshold (int): defines the minimum word count
        nwords (int): defines the total number of words in the final dictionary,
            including special symbols
        padding_factor (int): can be used to pad the dictionary size to be a
            multiple of 8, which is important on some hardware (e.g., Nvidia
            Tensor Cores).
    """
    d = Dictionary()
    for filename in filenames:
        # Write only the token lines to a separate file.
        with open(filename) as f_in, \
                open(f"{filename}.tokens", "w") as f_out:
            f_out.writelines(line for i, line in enumerate(f_in)
                             if i % 2 == 0)
        # Add tokens to the dictionary with multiprocessing.
        Dictionary.add_file_to_dictionary(f"{filename}.tokens", d,
                                          tokenizer.tokenize_line, workers)
    d.finalize(threshold=threshold, nwords=nwords,
               padding_factor=padding_factor)
    return d
Example 15: binarize
# Required import: from fairseq import tokenizer [as alias]
# Or: from fairseq.tokenizer import tokenize_line [as alias]
def binarize(
    filename,
    dict,
    consumer,
    tokenize=tokenize_line,
    append_eos=True,
    reverse_order=False,
    offset=0,
    end=-1,
    already_numberized=False,
):
    nseq, ntok = 0, 0
    replaced = Counter()

    def replaced_consumer(word, idx):
        if idx == dict.unk_index and word != dict.unk_word:
            replaced.update([word])

    with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f:
        f.seek(offset)
        # next(f) breaks f.tell(), hence readline() must be used
        line = safe_readline(f)
        while line:
            if end > 0 and f.tell() > end:
                break
            if already_numberized:
                id_strings = line.strip().split()
                id_list = [int(id_string) for id_string in id_strings]
                if reverse_order:
                    id_list.reverse()
                if append_eos:
                    id_list.append(dict.eos())
                ids = torch.IntTensor(id_list)
            else:
                ids = dict.encode_line(
                    line=line,
                    line_tokenizer=tokenize,
                    add_if_not_exist=False,
                    consumer=replaced_consumer,
                    append_eos=append_eos,
                    reverse_order=reverse_order,
                )
            nseq += 1
            ntok += len(ids)
            consumer(ids)
            line = f.readline()
    return {
        "nseq": nseq,
        "nunk": sum(replaced.values()),
        "ntok": ntok,
        "replaced": replaced,
    }