This article collects typical usage examples of the Python method pytorch_transformers.BertTokenizer.from_pretrained. If you are asking what exactly BertTokenizer.from_pretrained does, how to use it, or where to find examples of it in use, the curated code samples below should help. You can also explore further usage examples of the class it belongs to, pytorch_transformers.BertTokenizer.
The following shows 15 code examples of BertTokenizer.from_pretrained, sorted by popularity by default.
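Before the examples, here is a minimal sketch of the method itself, assuming pytorch-transformers v1.0 is installed and that the 'bert-base-uncased' vocabulary can be downloaded (or is already cached):

from pytorch_transformers import BertTokenizer

# Download (or load from cache) the vocabulary for the given shortcut name or local path
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer.tokenize("Hello, how are you?")  # WordPiece tokens: ['hello', ',', 'how', 'are', 'you', '?']
ids = tokenizer.convert_tokens_to_ids(tokens)       # integer ids into BERT's 30,522-entry vocabulary
print(tokens, ids)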
Example 1: __init__
# Required import: from pytorch_transformers import BertTokenizer [as alias]
# Or: from pytorch_transformers.BertTokenizer import from_pretrained [as alias]
def __init__(self, args):
    try:
        from pytorch_transformers import BertTokenizer
        from pytorch_transformers.tokenization_utils import clean_up_tokenization
    except ImportError:
        raise ImportError(
            'Please install version 1.0.0 of pytorch_transformers '
            'with: pip install pytorch-transformers'
        )

    if 'bpe_vocab_file' in args:
        self.bert_tokenizer = BertTokenizer(
            args.bpe_vocab_file,
            do_lower_case=not args.bpe_cased
        )
    else:
        vocab_file_name = 'bert-base-cased' if args.bpe_cased else 'bert-base-uncased'
        self.bert_tokenizer = BertTokenizer.from_pretrained(vocab_file_name)

    self.clean_up_tokenization = clean_up_tokenization
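Hypothetical usage of this wrapper (the enclosing class name and the args namespace are assumptions; the snippet does not name them):

import argparse

args = argparse.Namespace(bpe_cased=False)  # no bpe_vocab_file set, so the from_pretrained branch runs
bpe = BertBPEWrapper(args)                  # hypothetical name for the class this __init__ belongs to
print(bpe.bert_tokenizer.tokenize("Tokenize me"))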
Example 2: __init__
# Required import: from pytorch_transformers import BertTokenizer [as alias]
# Or: from pytorch_transformers.BertTokenizer import from_pretrained [as alias]
def __init__(self, model_path: str = None) -> None:
    super().__init__()
    "Requires the BertTokenizer from pytorch_transformers"
    # pip install pytorch_transformers
    import os
    import torch
    from pytorch_transformers import BertTokenizer, cached_path
    from training.transformer_utils.model import TransformerWithClfHeadAndAdapters
    try:
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.config = torch.load(cached_path(os.path.join(model_path, "model_training_args.bin")))
        self.model = TransformerWithClfHeadAndAdapters(self.config["config"],
                                                       self.config["config_ft"]).to(self.device)
        state_dict = torch.load(cached_path(os.path.join(model_path, "model_weights.pth")),
                                map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    except Exception:  # avoid a bare except; it would also swallow KeyboardInterrupt
        raise Exception("Require a valid transformer model file ({0}/model_weights.pth) "
                        "and its config file ({0}/model_training_args.bin)."
                        .format(model_path))
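Once loaded, the tokenizer turns raw text into the id tensor the classifier consumes; a sketch under the assumption that the model's forward pass accepts a batch of token ids (the class name and call signature here are assumptions, since the snippet shows neither):

import torch

clf = SentimentClassifier('path/to/model')  # hypothetical class the __init__ above belongs to
text = "This movie was surprisingly good."
ids = clf.tokenizer.convert_tokens_to_ids(clf.tokenizer.tokenize(text))
inputs = torch.tensor([ids]).to(clf.device)  # shape: (1, seq_len)
with torch.no_grad():
    logits = clf.model(inputs)               # assumed call signature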
Example 3: __init__
# Required import: from pytorch_transformers import BertTokenizer [as alias]
# Or: from pytorch_transformers.BertTokenizer import from_pretrained [as alias]
def __init__(self, model_file: str = None) -> None:
    "Requires the BertTokenizer from pytorch_transformers"
    # pip install pytorch_transformers
    import os
    import torch
    from pytorch_transformers import BertTokenizer, cached_path
    from training.transformer_utils.model import TransformerWithClfHeadAndAdapters
    try:
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.config = torch.load(cached_path(os.path.join(model_file, "model_training_args.bin")))
        self.model = TransformerWithClfHeadAndAdapters(self.config["config"],
                                                       self.config["config_ft"]).to(self.device)
        state_dict = torch.load(cached_path(os.path.join(model_file, "model_weights.pth")),
                                map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    except Exception:  # avoid a bare except; it would also swallow KeyboardInterrupt
        raise Exception("Require a valid transformer model file ({0}/model_weights.pth) "
                        "and its config file ({0}/model_training_args.bin)."
                        .format(model_file))
Example 4: __init__
# Required import: from pytorch_transformers import BertTokenizer [as alias]
# Or: from pytorch_transformers.BertTokenizer import from_pretrained [as alias]
def __init__(self, args=None, device='cuda', bert_model_path='bert-base-uncased', batch_size=10,
             learning_rate=5e-5, weight_decay=0, additional_features=None):
    if args is not None:
        self.args = vars(args)

    assert device in ['cuda', 'cpu']

    if not args:
        self.args = {}
        self.args['bert_model_path'] = bert_model_path
        self.args['device'] = device
        self.args['learning_rate'] = learning_rate
        self.args['weight_decay'] = weight_decay
        self.args['batch_size'] = batch_size

    self.log = logging.getLogger()
    self.bert_tokenizer = BertTokenizer.from_pretrained(self.args['bert_model_path'])

    if os.path.exists(self.args['bert_model_path']):
        if os.path.exists(os.path.join(self.args['bert_model_path'], CONFIG_NAME)):
            config = BertConfig.from_json_file(os.path.join(self.args['bert_model_path'], CONFIG_NAME))
        elif os.path.exists(os.path.join(self.args['bert_model_path'], 'bert_config.json')):
            config = BertConfig.from_json_file(os.path.join(self.args['bert_model_path'], 'bert_config.json'))
        else:
            raise ValueError("Cannot find a configuration for the BERT model you are attempting to load.")

    self.loss_function = torch.nn.MSELoss()
    config.pretrained_config_archive_map['additional_features'] = additional_features
    self.regressor_net = BertSimilarityRegressor.from_pretrained(self.args['bert_model_path'], config=config)
    self.optimizer = torch.optim.Adam(
        self.regressor_net.parameters(),
        weight_decay=self.args['weight_decay'],
        lr=self.args['learning_rate']
    )
    self.log.info('Initialized BertSentencePairSimilarity model from %s' % self.args['bert_model_path'])
Example 5: load_from_file
# Required import: from pytorch_transformers import BertTokenizer [as alias]
# Or: from pytorch_transformers.BertTokenizer import from_pretrained [as alias]
@classmethod
def load_from_file(cls, vocab_file):
    from pytorch_transformers import BertTokenizer
    return cls(BertTokenizer.from_pretrained(vocab_file))
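from_pretrained accepts a local path as well as a shortcut name, which is what this classmethod relies on; a hedged usage sketch (the owning class name is hypothetical):

vocab = SomeVocabulary.load_from_file('bert-base-uncased')      # shortcut name
vocab = SomeVocabulary.load_from_file('checkpoints/vocab.txt')  # or a vocab file saved locally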
Example 6: __init__
# Required import: from pytorch_transformers import BertTokenizer [as alias]
# Or: from pytorch_transformers.BertTokenizer import from_pretrained [as alias]
def __init__(self, mode: str = 'bert-base-uncased'):
    """Initialization."""
    super().__init__()
    self._tokenizer = BertTokenizer.from_pretrained(mode)
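The mode argument passes straight through to from_pretrained, so any pretrained shortcut name or local path works; for example (the unit's class name is assumed, since the snippet omits it):

unit = BertTokenizeUnit(mode='bert-base-cased')  # hypothetical class name
print(unit._tokenizer.tokenize("Cased BERT keeps Capitals"))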
Example 7: validate
# Required import: from pytorch_transformers import BertTokenizer [as alias]
# Or: from pytorch_transformers.BertTokenizer import from_pretrained [as alias]
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}

    valid_loss = abs_loss(model.generator, symbols, model.vocab_size, train=False, device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
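The symbols dict repurposes BERT's reserved [unusedN] vocabulary entries as generation control tokens. A quick check of the ids involved; in the standard bert-base-uncased vocabulary, [PAD] is 0 and [unused0], [unused1], [unused2] are 1, 2, 3:

from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
print(tokenizer.vocab['[PAD]'],      # 0 -> PAD
      tokenizer.vocab['[unused0]'],  # 1 -> BOS
      tokenizer.vocab['[unused1]'],  # 2 -> EOS
      tokenizer.vocab['[unused2]'])  # 3 -> EOQ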
Example 8: test_abs
# Required import: from pytorch_transformers import BertTokenizer [as alias]
# Or: from pytorch_transformers.BertTokenizer import from_pretrained [as alias]
def test_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}

    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Example 9: test_text_abs
# Required import: from pytorch_transformers import BertTokenizer [as alias]
# Or: from pytorch_transformers.BertTokenizer import from_pretrained [as alias]
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}

    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Example 10: __init__
# Required import: from pytorch_transformers import BertTokenizer [as alias]
# Or: from pytorch_transformers.BertTokenizer import from_pretrained [as alias]
def __init__(self, pretrained_model_name_for_tokenizer, max_vocabulary_size,
             max_tokenization_length, embedding_dim, num_classes=1, num_recurrent_layers=1,
             use_bidirectional=False, hidden_size=128, dropout_rate=0.10, use_gpu=False):
    super(SimpleRNN, self).__init__()
    self.num_recurrent_layers = num_recurrent_layers
    self.use_bidirectional = use_bidirectional
    self.hidden_size = hidden_size
    self.use_gpu = use_gpu

    # Configure tokenizer
    self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_for_tokenizer)
    self.tokenizer.max_len = max_tokenization_length

    # Define additional layers & utilities specific to the finetuned task
    # Embedding Layer
    self.embedding = nn.Embedding(num_embeddings=max_vocabulary_size,
                                  embedding_dim=embedding_dim)
    # Dropout to prevent overfitting
    self.dropout = nn.Dropout(p=dropout_rate)
    # Recurrent Layer
    self.lstm = nn.LSTM(input_size=embedding_dim,
                        hidden_size=hidden_size,
                        num_layers=num_recurrent_layers,
                        bidirectional=use_bidirectional,
                        batch_first=True)
    # Dense Layer for Classification
    self.clf = nn.Linear(in_features=hidden_size*2 if use_bidirectional else hidden_size,
                         out_features=num_classes)
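A sketch of how the tokenizer feeds the embedding layer (the constructor values are illustrative; 30522 is the bert-base-uncased vocabulary size):

import torch

model = SimpleRNN('bert-base-uncased', max_vocabulary_size=30522,
                  max_tokenization_length=128, embedding_dim=100)
ids = model.tokenizer.convert_tokens_to_ids(model.tokenizer.tokenize("a great movie"))
embedded = model.embedding(torch.tensor([ids]))  # shape: (1, seq_len, 100)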
Example 11: __init__
# Required import: from pytorch_transformers import BertTokenizer [as alias]
# Or: from pytorch_transformers.BertTokenizer import from_pretrained [as alias]
def __init__(self, config, logger):
    super().__init__(config, logger)
    bert_model = bert_models.get_model(config['bert_base'], self.logger)
    self.tokenizer = BertTokenizer.from_pretrained(bert_model)
    # HACK! Until the transformers library adopts tokenizers, save and re-load vocab
    with tempfile.TemporaryDirectory() as d:
        self.tokenizer.save_vocabulary(d)
        # this tokenizer is ~4x faster than the BertTokenizer, per my measurements
        self.tokenizer = tk.BertWordPieceTokenizer(os.path.join(d, 'vocab.txt'))
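The save-and-reload round trip works because BertWordPieceTokenizer only needs the vocab file; a usage sketch, assuming the tokenizers library is the one imported as tk above:

from tokenizers import BertWordPieceTokenizer  # the library aliased as `tk` in the snippet

fast = BertWordPieceTokenizer('vocab.txt')     # path to a vocab saved by save_vocabulary
encoding = fast.encode("fast wordpiece tokenization")
print(encoding.tokens)                         # WordPiece tokens, with [CLS]/[SEP] added by default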
Example 12: __init__
# Required import: from pytorch_transformers import BertTokenizer [as alias]
# Or: from pytorch_transformers.BertTokenizer import from_pretrained [as alias]
def __init__(self, pretrain_path, max_length):
    nn.Module.__init__(self)
    # self.bert = BertModel.from_pretrained(pretrain_path)
    self.bert = BertForSequenceClassification.from_pretrained(
        pretrain_path,
        num_labels=2)
    self.max_length = max_length
    self.tokenizer = BertTokenizer.from_pretrained(os.path.join(
        pretrain_path, 'bert_vocab.txt'))
    self.modelName = 'Bert'
Example 13: __init__
# Required import: from pytorch_transformers import BertTokenizer [as alias]
# Or: from pytorch_transformers.BertTokenizer import from_pretrained [as alias]
def __init__(self, tagged_sents, tag_to_index, config, word_to_embid=None):
    sents, tags_li, values_li = [], [], []  # list of lists
    self.config = config

    for sent in tagged_sents:
        words = [word_tag[0] for word_tag in sent]
        tags = [word_tag[1] for word_tag in sent]
        values = [word_tag[3] for word_tag in sent]  # +++HANDE
        if self.config.model != 'LSTM' and self.config.model != 'BiLSTM':
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<pad>"] + tags + ["<pad>"])
            values_li.append(["<pad>"] + values + ["<pad>"])
        else:
            sents.append(words)
            tags_li.append(tags)
            values_li.append(values)

    self.sents, self.tags_li, self.values_li = sents, tags_li, values_li

    if self.config.model == 'BertUncased':
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    else:
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

    self.tag_to_index = tag_to_index
    self.word_to_embid = word_to_embid
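The manual [CLS]/[SEP] wrapping mirrors what BERT expects at its input boundaries; a quick illustration with the cased tokenizer:

from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
print(tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]']))  # [101, 102] in both cased and uncased vocabs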
Example 14: read_chinese
# Required import: from pytorch_transformers import BertTokenizer [as alias]
# Or: from pytorch_transformers.BertTokenizer import from_pretrained [as alias]
def read_chinese(path):
    output_path = path + '_output.txt'
    content = path + '_sentence.txt'
    aspect = path + '_target.txt'
    polarity = path + '_label.txt'

    fin = open(content, 'r', encoding='utf-8', newline='\n', errors='ignore')
    reviews = fin.readlines()
    fin.close()
    for i in range(len(reviews)):
        reviews[i] = reviews[i].strip()

    fin = open(aspect, 'r', encoding='utf-8', newline='\n', errors='ignore')
    aspects = fin.readlines()
    fin.close()
    for i in range(len(aspects)):
        aspects[i] = aspects[i].strip()

    fin = open(polarity, 'r', encoding='utf-8', newline='\n', errors='ignore')
    polarities = fin.readlines()
    fin.close()
    for i in range(len(polarities)):
        polarities[i] = polarities[i].strip()

    from pytorch_transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert_for_global_context-base-chinese', do_lower_case=True)

    with open(output_path, 'w', encoding='utf-8', newline='\n', errors='ignore') as f_out:
        print(len(reviews))
        print(len(aspects))
        print(len(polarities))
        for i in range(len(reviews)):
            # use == for string comparison; `is` compares object identity and is unreliable here
            if aspects[i] == '0':  # '0' marks a missing target; fall back to the full review
                aspects[i] = reviews[i]
            if aspects[i].replace(' ', '') not in reviews[i]:
                print(aspects[i].replace(' ', ''))
                continue
            reviews[i] = reviews[i].replace(aspects[i].replace(' ', ''), ' $T$ ')
            f_out.write(' '.join(tokenizer.tokenize(reviews[i])) + '\n')
            f_out.write(' '.join(tokenizer.tokenize(aspects[i].replace(' ', ''))) + '\n')
            if polarities[i].strip() == '0':
                f_out.write('1' + '\n')
            else:
                f_out.write('-1' + '\n')
    # the with-block closes f_out automatically; no explicit close() needed
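BERT's Chinese vocabularies tokenize at the character level, which is why joining tokens with spaces is safe here; a small check using the stock bert-base-chinese weights (the snippet's 'bert_for_global_context-base-chinese' appears to be a local or custom model path):

from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
print(tokenizer.tokenize('这部电影很好看'))  # ['这', '部', '电', '影', '很', '好', '看']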
Example 15: train_abs_single
# Required import: from pytorch_transformers import BertTokenizer [as alias]
# Or: from pytorch_transformers.BertTokenizer import from_pretrained [as alias]
def train_abs_single(args, device_id):
    init_logger(args.log_file)
    logger.info(str(args))

    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    if args.load_from_extractive != '':
        logger.info('Loading bert from extractive model %s' % args.load_from_extractive)
        bert_from_extractive = torch.load(args.load_from_extractive, map_location=lambda storage, loc: storage)
        bert_from_extractive = bert_from_extractive['model']
    else:
        bert_from_extractive = None

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=True), args.batch_size, device,
                                      shuffle=True, is_test=False)

    model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)
    if args.sep_optim:
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]
    logger.info(model)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}

    train_loss = abs_loss(model.generator, symbols, model.vocab_size, device, train=True,
                          label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)
    trainer.train(train_iter_fct, args.train_steps)