This article collects typical usage examples of the Python method transformers.BertTokenizer. If you have been wondering what exactly transformers.BertTokenizer does and how to use it in practice, the curated code examples here may help. You can also explore further usage examples from the transformers module that this method belongs to.

Below are 15 code examples of transformers.BertTokenizer, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
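Before the examples, here is a minimal standalone sketch of the method itself, assuming any one-wordpiece-per-line vocab file (the path and sentence are placeholders, not taken from the examples below):

    from transformers import BertTokenizer

    # Build a tokenizer directly from a local vocab file, as most examples
    # below do; BertTokenizer.from_pretrained("bert-base-uncased") also works.
    tokenizer = BertTokenizer("vocab.txt")
    ids = tokenizer.encode("hello world", add_special_tokens=True)
    # ids is [cls_token_id, ...wordpiece ids..., sep_token_id]
    tokens = tokenizer.convert_ids_to_tokens(ids)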
Example 1: build_loader

# Required import: import transformers [as alias]
# Or: from transformers import BertTokenizer [as alias]
def build_loader(pairs, dictionary_or_tokenizer, label2id, config):
    # Map string labels to integer ids.
    pairs = [(text, label2id[label]) for text, label in pairs]
    # Pick the collate function that matches the vocabulary object.
    if isinstance(dictionary_or_tokenizer, Dictionary):
        col_fn = partial(collate_fn, dictionary_or_tokenizer, config.max_len)
    elif isinstance(dictionary_or_tokenizer, BertTokenizer):
        col_fn = partial(bert_collate_fn, dictionary_or_tokenizer, config.max_len)
    loader = DataLoader(
        dataset=TextClfDataset(pairs),
        collate_fn=col_fn,
        batch_size=config.batch_size,
        shuffle=config.shuffle,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory,
        drop_last=config.drop_last
    )
    return loader
Example 2: bert_collate_fn

# Required import: import transformers [as alias]
# Or: from transformers import BertTokenizer [as alias]
def bert_collate_fn(
        tokenizer: BertTokenizer,
        max_len: int,
        pairs: Iterable[Tuple[str, int]]
):
    # Truncate each whitespace-tokenized text to max_len words.
    pairs = [(text.split()[:max_len], label) for text, label in pairs]
    texts, labels = zip(*pairs)
    labels = torch.LongTensor(labels)
    # +1 for the [CLS] token prepended by the tokenizer.
    text_lens = torch.LongTensor([len(text) + 1 for text in texts])
    max_len = text_lens.max().item()
    # Pre-fill the id matrix with the pad token id.
    ids = torch.ones(len(texts), max_len).long() * tokenizer.pad_token_id
    for i, text in enumerate(texts):
        # encode(..., add_special_tokens=True) yields [CLS] + pieces + [SEP];
        # slicing off the last id drops the trailing [SEP].
        ids[i][:len(text) + 1] = torch.LongTensor(
            tokenizer.encode(text, add_special_tokens=True)[:-1])
    return ids, text_lens, labels
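A quick standalone check of the slicing trick above (the vocab path is a placeholder): encode(..., add_special_tokens=True) returns [CLS] + wordpieces + [SEP], so dropping the last id keeps only the leading [CLS]. Note the implicit assumption in the collate function: each whitespace token must map to a single wordpiece, otherwise the encoded sequence is longer than len(text) + 1 and the in-place assignment fails.

    from transformers import BertTokenizer

    tokenizer = BertTokenizer("vocab.txt")  # placeholder path
    ids = tokenizer.encode(["hello", "world"], add_special_tokens=True)
    ids_no_sep = ids[:-1]                   # [CLS] hello world, trailing [SEP] removed
    assert ids_no_sep[0] == tokenizer.cls_token_id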
Example 3: __init__

# Required import: import transformers [as alias]
# Or: from transformers import BertTokenizer [as alias]
def __init__(self, perf_count=None, cache_path='eval_features.pickle'):
    print("Constructing QSL...")
    eval_features = []
    # Load features if cached, convert from examples otherwise.
    if os.path.exists(cache_path):
        print("Loading cached features from '%s'..." % cache_path)
        with open(cache_path, 'rb') as cache_file:
            eval_features = pickle.load(cache_file)
    else:
        print("No cached features at '%s'... converting from examples..." % cache_path)
        print("Creating tokenizer...")
        tokenizer = BertTokenizer("build/data/bert_tf_v1_1_large_fp32_384_v2/vocab.txt")
        print("Reading examples...")
        eval_examples = read_squad_examples(input_file="build/data/dev-v1.1.json",
                                            is_training=False, version_2_with_negative=False)
        print("Converting examples to features...")

        def append_feature(feature):
            eval_features.append(feature)

        convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=False,
            output_fn=append_feature,
            verbose_logging=False)
        print("Caching features at '%s'..." % cache_path)
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(eval_features, cache_file)
    self.eval_features = eval_features
    self.count = len(self.eval_features)
    self.perf_count = perf_count if perf_count is not None else self.count
    self.qsl = lg.ConstructQSL(self.count, self.perf_count, self.load_query_samples, self.unload_query_samples)
    print("Finished constructing QSL.")
Example 4: __init__

# Required import: import transformers [as alias]
# Or: from transformers import BertTokenizer [as alias]
def __init__(self, config_path):
    super(transformer_gpt2, self).__init__()
    # Reuse a BertTokenizer wordpiece vocabulary for the GPT2 model.
    self.tokenizer = BertTokenizer(vocab_file='config/vocab_en.txt')
    self.vocab_size = len(self.tokenizer)
    self.model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(config_path)
    self.model = GPT2LMHeadModel(config=self.model_config)
    # Resize the embedding matrix to match the tokenizer's vocabulary.
    self.model.resize_token_embeddings(self.vocab_size)
    self.n_ctx = self.model.config.to_dict().get('n_ctx')
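The resize_token_embeddings call is what makes this mixed setup work: GPT2's embedding matrix must match the BertTokenizer vocabulary size. A minimal standalone sketch, with placeholder config values:

    from transformers import GPT2Config, GPT2LMHeadModel

    config = GPT2Config()                  # default GPT2 hyperparameters
    model = GPT2LMHeadModel(config=config)
    model.resize_token_embeddings(30522)   # e.g. the size of a BERT vocab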
Example 5: __init__

# Required import: import transformers [as alias]
# Or: from transformers import BertTokenizer [as alias]
def __init__(self, vocab_path, do_lower_case):
    self.tokenizer = BertTokenizer(vocab_path, do_lower_case)
Example 6: load_ubuntucorpus_bert

# Required import: import transformers [as alias]
# Or: from transformers import BertTokenizer [as alias]
def load_ubuntucorpus_bert():
    def _load_ubuntucorpus(min_rare_vocab_times=0):
        from transformers import BertTokenizer
        toker = PretrainedTokenizer(BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt'))
        return UbuntuCorpus("./tests/dataloader/dummy_ubuntucorpus#Ubuntu",
                            min_rare_vocab_times=min_rare_vocab_times, tokenizer=toker, pretrained="bert")
    return _load_ubuntucorpus
Example 7: load_switchboardcorpus_bert

# Required import: import transformers [as alias]
# Or: from transformers import BertTokenizer [as alias]
def load_switchboardcorpus_bert():
    def _load_switchboardcorpus(min_rare_vocab_times=0):
        from transformers import BertTokenizer
        toker = PretrainedTokenizer(BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt'))
        return SwitchboardCorpus("./tests/dataloader/dummy_switchboardcorpus#SwitchboardCorpus",
                                 min_rare_vocab_times=min_rare_vocab_times, tokenizer=toker, pretrained="bert")
    return _load_switchboardcorpus
Example 8: load_opensubtitles_bert

# Required import: import transformers [as alias]
# Or: from transformers import BertTokenizer [as alias]
def load_opensubtitles_bert():
    def _load_opensubtitles(invalid_vocab_times=0):
        from transformers import BertTokenizer
        toker = PretrainedTokenizer(BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt'))
        return OpenSubtitles("./tests/dataloader/dummy_opensubtitles#OpenSubtitles",
                             tokenizer=toker, pretrained='bert', min_rare_vocab_times=invalid_vocab_times)
    return _load_opensubtitles
Example 9: load_sst_bert

# Required import: import transformers [as alias]
# Or: from transformers import BertTokenizer [as alias]
def load_sst_bert():
    def _load_sst(min_rare_vocab_times=0):
        from transformers import BertTokenizer
        toker = PretrainedTokenizer(BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt'))
        return SST("./tests/dataloader/dummy_sst#SST",
                   tokenizer=toker, min_rare_vocab_times=min_rare_vocab_times, pretrained="bert")
    return _load_sst
Example 10: load_mscoco_bert

# Required import: import transformers [as alias]
# Or: from transformers import BertTokenizer [as alias]
def load_mscoco_bert():
    def _load_mscoco(invalid_vocab_times=0):
        from transformers import BertTokenizer
        toker = PretrainedTokenizer(BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt'))
        return MSCOCO("./tests/dataloader/dummy_mscoco#MSCOCO",
                      tokenizer=toker, pretrained='bert', min_rare_vocab_times=invalid_vocab_times)
    return _load_mscoco
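Examples 6 through 10 all build a BertTokenizer from a dummy test vocabulary. That works because BertTokenizer only needs a one-token-per-line file; a self-contained illustration (the file name and contents below are made up):

    from transformers import BertTokenizer

    # Fabricate a tiny vocab file on the fly, as test fixtures often do.
    with open("dummy_vocab.txt", "w") as f:
        f.write("\n".join(["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "hello", "world"]))
    tokenizer = BertTokenizer("dummy_vocab.txt")
    ids = tokenizer.encode("hello world", add_special_tokens=True)
    assert ids[0] == tokenizer.cls_token_id and ids[-1] == tokenizer.sep_token_id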
Example 11: __init__

# Required import: import transformers [as alias]
# Or: from transformers import BertTokenizer [as alias]
def __init__(self, vocab_file, do_lower_case=False,
             do_basic_tokenize=True, do_wordpiece_tokenize=True,
             mecab_dict_path=None, unk_token='[UNK]', sep_token='[SEP]',
             pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
    """Constructs a MecabBertTokenizer.

    Args:
        **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
        **do_lower_case**: (`optional`) boolean (default False)
            Whether to lower case the input.
            Only has an effect when do_basic_tokenize=True.
        **do_basic_tokenize**: (`optional`) boolean (default True)
            Whether to do basic tokenization with MeCab before wordpiece.
        **mecab_dict_path**: (`optional`) string
            Path to a directory of a MeCab dictionary.
    """
    super(BertTokenizer, self).__init__(
        unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
        cls_token=cls_token, mask_token=mask_token, **kwargs)
    self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
    self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'.".format(vocab_file))
    self.vocab = load_vocab(vocab_file)
    self.ids_to_tokens = collections.OrderedDict(
        [(ids, tok) for tok, ids in self.vocab.items()])
    self.do_basic_tokenize = do_basic_tokenize
    self.do_wordpiece_tokenize = do_wordpiece_tokenize
    if do_basic_tokenize:
        self.basic_tokenizer = MecabBasicTokenizer(do_lower_case=do_lower_case,
                                                   mecab_dict_path=mecab_dict_path)
    if do_wordpiece_tokenize:
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                      unk_token=self.unk_token)
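Note the design choice in the super() call: super(BertTokenizer, self).__init__(...) deliberately skips BertTokenizer's own constructor and dispatches straight to its parent, so that this subclass can redo the vocabulary loading and tokenizer wiring itself with MeCab inserted into the pipeline.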
Example 12: __init__

# Required import: import transformers [as alias]
# Or: from transformers import BertTokenizer [as alias]
def __init__(self, vocab_file: str,
             do_lower_case: bool = False,
             max_seq_length: int = 512,
             tokenize_chinese_chars: bool = True,
             **kwargs):
    vocab_file = expand_path(vocab_file)
    self.tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case,
                                   tokenize_chinese_chars=tokenize_chinese_chars)
    self.max_seq_length = max_seq_length
Example 13: _preprocess

# Required import: import transformers [as alias]
# Or: from transformers import BertTokenizer [as alias]
def _preprocess(self, text):
    text_tokenized = self.tokenizer(text)
    if isinstance(self.dictionary, Dictionary):
        text_processed = self.dictionary.tokens_to_tensor(
            text_tokenized, max_len=self.config.max_len
        )
        text_len = (text_processed != self.dictionary.pad()).sum()
    elif isinstance(self.dictionary, BertTokenizer):
        # Keep [CLS], drop the trailing [SEP] produced by encode().
        text_processed = torch.LongTensor(
            self.dictionary.encode(text_tokenized, add_special_tokens=True)[:-1])
        max_len = self.config.max_len
        pad_id = self.dictionary.pad_token_id
        # Truncate long sequences, right-pad short ones to a fixed length.
        if len(text_processed) >= max_len:
            text_processed = text_processed[:max_len]
        else:
            text_processed = torch.cat([
                text_processed,
                torch.ones(max_len - len(text_processed)).long() * pad_id
            ])
        text_len = (text_processed != pad_id).sum()
    if self.use_cuda:
        text_processed = text_processed.cuda()
        text_len = text_len.cuda()
    return text_processed.unsqueeze(0), text_len.unsqueeze(0)
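The truncate-or-pad branch above is a common pattern; here it is isolated into a standalone helper (the name and signature are illustrative, not from the source):

    import torch

    def pad_or_truncate(ids: torch.Tensor, max_len: int, pad_id: int) -> torch.Tensor:
        # Clip sequences that are too long; right-pad the rest with pad_id.
        if len(ids) >= max_len:
            return ids[:max_len]
        padding = torch.full((max_len - len(ids),), pad_id, dtype=torch.long)
        return torch.cat([ids, padding])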
Example 14: main

# Required import: import transformers [as alias]
# Or: from transformers import BertTokenizer [as alias]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_file", default="build/data/bert_tf_v1_1_large_fp32_384_v2/vocab.txt", help="Path to vocab.txt")
    parser.add_argument("--val_data", default="build/data/dev-v1.1.json", help="Path to validation data")
    parser.add_argument("--log_file", default="build/logs/mlperf_log_accuracy.json", help="Path to LoadGen accuracy log")
    parser.add_argument("--out_file", default="build/result/predictions.json", help="Path to output predictions file")
    parser.add_argument("--features_cache_file", default="eval_features.pickle", help="Path to features' cache file")
    parser.add_argument("--output_transposed", action="store_true", help="Transpose the output")
    args = parser.parse_args()

    print("Reading examples...")
    eval_examples = read_squad_examples(input_file=args.val_data,
                                        is_training=False, version_2_with_negative=False)

    eval_features = []
    # Load features if cached, convert from examples otherwise.
    cache_path = args.features_cache_file
    if os.path.exists(cache_path):
        print("Loading cached features from '%s'..." % cache_path)
        with open(cache_path, 'rb') as cache_file:
            eval_features = pickle.load(cache_file)
    else:
        print("No cached features at '%s'... converting from examples..." % cache_path)
        print("Creating tokenizer...")
        tokenizer = BertTokenizer(args.vocab_file)
        print("Converting examples to features...")

        def append_feature(feature):
            eval_features.append(feature)

        convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=False,
            output_fn=append_feature,
            verbose_logging=False)
        print("Caching features at '%s'..." % cache_path)
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(eval_features, cache_file)

    print("Loading LoadGen logs...")
    results = load_loadgen_log(args.log_file, eval_features, args.output_transposed)

    print("Post-processing predictions...")
    write_predictions(eval_examples, eval_features, results, 20, 30, True, args.out_file)

    print("Evaluating predictions...")
    cmd = "python3 build/data/evaluate-v1.1.py build/data/dev-v1.1.json build/result/predictions.json"
    subprocess.check_call(cmd, shell=True)
Example 15: transformer_preprocess

# Required import: import transformers [as alias]
# Or: from transformers import BertTokenizer [as alias]
def transformer_preprocess(src_path, tgt_path, tokenized_file,
                           vocab_file='./config/vocab_en.txt', ctx=200):
    '''
    Tokenize the dataset for NLG (GPT2) and write the token ids into tokenized_file.
    More details can be found at https://github.com/yangjianxin1/GPT2-chitchat
    '''
    def clean_inside(s):
        s = s.replace('<user0>', '')
        s = s.replace('<user1>', '')
        s = s.strip()
        s = clean(s)
        return s

    # Create the Bert tokenizer for the GPT2 model.
    tokenizer = BertTokenizer(vocab_file=vocab_file)
    src_data, tgt_data = read_file(src_path), read_file(tgt_path)
    src_data = [' '.join(i) for i in src_data]
    tgt_data = [' '.join(i) for i in tgt_data]
    assert len(src_data) == len(tgt_data), f'[!] length of src and tgt: {len(src_data)}/{len(tgt_data)}'

    # Combine source and target into one token stream per dialogue.
    corpus = []
    longest = 0
    for s, t in tqdm(list(zip(src_data, tgt_data))):
        item = [tokenizer.cls_token_id]  # [CLS] at the beginning of each dialogue
        s = s + ' __eou__ ' + t
        s = clean_inside(s)
        utterances = s.split('__eou__')
        for utterance in utterances:
            words = nltk.word_tokenize(utterance)
            item.extend([tokenizer.convert_tokens_to_ids(word) for word in words])
            item.append(tokenizer.sep_token_id)
        if len(item) > longest:
            longest = len(item)
        item = item[:ctx]
        corpus.append(item)

    # Write the token ids into the output file, one dialogue per line.
    with open(tokenized_file, 'w') as f:
        for i in range(len(corpus)):
            words = [str(word) for word in corpus[i]]
            f.write(f'{" ".join(words)}')
            if i < len(corpus) - 1:
                f.write('\n')
    print(f'[!] Preprocessed the data for the transformers (GPT2); longest sentence: {longest}; wrote the data into {tokenized_file}.')
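One caveat in the loop above: convert_tokens_to_ids does a plain vocabulary lookup with no wordpiece splitting, so any nltk word outside the vocab maps to the [UNK] id. A short hedged check (the vocab path is the one from the example and may not exist locally):

    from transformers import BertTokenizer

    tokenizer = BertTokenizer('./config/vocab_en.txt')
    ids = [tokenizer.convert_tokens_to_ids(w) for w in ['hello', 'zzz-unseen']]
    # Out-of-vocabulary words become tokenizer.unk_token_id.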