This article collects typical usage examples of the BertTokenizer.from_pretrained method from pytorch_transformers.tokenization_bert in Python. If you are unsure how to use BertTokenizer.from_pretrained, the curated code examples below may help. You can also look further into the class that the method belongs to, pytorch_transformers.tokenization_bert.BertTokenizer, for more usage examples.
The sections below present 15 code examples of BertTokenizer.from_pretrained, sorted by popularity by default.
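Before the collected examples, here is a minimal, self-contained sketch of the method itself. It assumes the pytorch_transformers package is installed and that the 'bert-base-uncased' vocabulary can be downloaded or is already cached; the sentence and variable names are illustrative only.
from pytorch_transformers.tokenization_bert import BertTokenizer

# Load a pretrained WordPiece vocabulary by name (a local vocab file or directory also works).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokens = tokenizer.tokenize("Hello, World!")           # WordPiece tokens
token_ids = tokenizer.convert_tokens_to_ids(tokens)    # integer ids into the vocabulary
print(tokens, token_ids)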
Example 1: __init__
# Required import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def __init__(self, params):
    super(BiEncoderModule, self).__init__()
    ctxt_bert = BertModel.from_pretrained(params["bert_model"])
    cand_bert = BertModel.from_pretrained(params["bert_model"])
    self.context_encoder = BertEncoder(
        ctxt_bert,
        params["out_dim"],
        layer_pulled=params["pull_from_layer"],
        add_linear=params["add_linear"],
    )
    self.cand_encoder = BertEncoder(
        cand_bert,
        params["out_dim"],
        layer_pulled=params["pull_from_layer"],
        add_linear=params["add_linear"],
    )
    self.config = ctxt_bert.config
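As a hedged illustration of the constructor's contract, the dictionary below lists the keys it reads. The key names come from the code above, the concrete values are hypothetical, and BiEncoderModule/BertEncoder are assumed to come from the surrounding project, so the actual construction is left as a comment.
# Hypothetical values; only the key names are taken from the constructor above.
params = {
    "bert_model": "bert-base-uncased",  # model name or path passed to BertModel.from_pretrained
    "out_dim": 100,                     # output dimension of each BertEncoder
    "pull_from_layer": -1,              # which hidden layer the encoder pulls from
    "add_linear": True,                 # whether to add a linear projection head
}
# biencoder = BiEncoderModule(params)  # requires the project that defines BiEncoderModule and BertEncoder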
Example 2: bertForPreTraining
# Required import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertForPreTraining(*args, **kwargs):
    """
    BERT model with pre-training heads.
    This module comprises the BERT model followed by the two pre-training heads:
        - the masked language modeling head, and
        - the next sentence classification head.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        # Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForPreTraining
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
        masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
    """
    model = BertForPreTraining.from_pretrained(*args, **kwargs)
    return model
Example 3: bertModel
# Required import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertModel(*args, **kwargs):
    """
    BertModel is the basic BERT Transformer model with a layer of summed token,
    position and sequence embeddings followed by a series of identical
    self-attention blocks (12 for BERT-base, 24 for BERT-large).
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        # Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
        model.eval()
        # Predict hidden states features for each layer
        with torch.no_grad():
            encoded_layers, _ = model(tokens_tensor, segments_tensors)
    """
    model = BertModel.from_pretrained(*args, **kwargs)
    return model
Example 4: bertForNextSentencePrediction
# Required import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertForNextSentencePrediction(*args, **kwargs):
    """
    BERT model with next sentence prediction head.
    This module comprises the BERT model followed by the next sentence
    classification head.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        # Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForNextSentencePrediction
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
        model.eval()
        # Predict the next sentence classification logits
        with torch.no_grad():
            next_sent_classif_logits = model(tokens_tensor, segments_tensors)
    """
    model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
    return model
Example 5: bertForMaskedLM
# Required import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertForMaskedLM(*args, **kwargs):
    """
    BertForMaskedLM includes the BertModel Transformer followed by the
    (possibly) pre-trained masked language modeling head.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        # Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        masked_index = 8
        tokenized_text[masked_index] = '[MASK]'
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForMaskedLM
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
        model.eval()
        # Predict all tokens
        with torch.no_grad():
            predictions = model(tokens_tensor, segments_tensors)
        predicted_index = torch.argmax(predictions[0, masked_index]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        'henson'
    """
    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
    return model
Example 6: bertForMultipleChoice
# Required import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertForMultipleChoice(*args, **kwargs):
    """
    BertForMultipleChoice is a fine-tuning model that includes BertModel and a
    linear layer on top of the BertModel. Note that the multiple choice head is
    only initialized and has to be trained.
    Args:
        num_choices: the number (>=2) of classes for the classifier.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        # Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
        segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
        # Load bertForMultipleChoice
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
        model.eval()
        # Predict the multiple choice logits
        with torch.no_grad():
            multiple_choice_logits = model(tokens_tensor, segments_tensors)
        # Or get the multiple choice loss
        labels = torch.tensor([1])
        multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels)  # call model.train() first if training with this loss
    """
    model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
    return model
Example 7: bertForQuestionAnswering
# Required import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertForQuestionAnswering(*args, **kwargs):
    """
    BertForQuestionAnswering is a fine-tuning model that includes BertModel
    with a token-level classifier on top of the full sequence of last hidden
    states. Note that the classification head is only initialized
    and has to be trained.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        # Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForQuestionAnswering
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
        model.eval()
        # Predict the start and end position logits
        with torch.no_grad():
            start_logits, end_logits = model(tokens_tensor, segments_tensors)
        # Or get the total loss, which is the sum of the CrossEntropy losses for the start and end token positions
        start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
        # call model.train() first if training with this loss
        total_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions)
    """
    model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)
    return model
Example 8: bertForTokenClassification
# Required import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertForTokenClassification(*args, **kwargs):
    """
    BertForTokenClassification is a fine-tuning model that includes BertModel
    and a token-level classifier on top of the BertModel. Note that the classification
    head is only initialized and has to be trained.
    The token-level classifier is a linear layer that takes as input the last
    hidden state of the sequence.
    Args:
        num_labels: the number (>=2) of classes for the classifier.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        # Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForTokenClassification
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
        model.eval()
        # Predict the token classification logits
        with torch.no_grad():
            classif_logits = model(tokens_tensor, segments_tensors)
        # Or get the token classification loss
        labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]])
        classif_loss = model(tokens_tensor, segments_tensors, labels=labels)  # call model.train() first if training with this loss
    """
    model = BertForTokenClassification.from_pretrained(*args, **kwargs)
    return model
Example 9: get_model
# Required import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def get_model(parameters):
    model = BertForReranking.from_pretrained(
        parameters["path_to_model"],
        num_labels=parameters["top_k"],
        cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), "local"),
    )

    if parameters["dataparallel_bert"]:
        model.bert = torch.nn.DataParallel(model.bert)
        print("Data parallel Bert")

    return model
Example 10: get_tokenizer
# Required import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def get_tokenizer(parameters):
    tokenizer = BertTokenizer.from_pretrained(
        parameters["path_to_model"], do_lower_case=parameters["lowercase_flag"]
    )
    return tokenizer
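A brief usage sketch for get_tokenizer: it assumes the function above is defined and the model is downloadable or cached. The parameter keys match the function, while the values are hypothetical ("path_to_model" may be a model name or a local directory containing a vocab file).
# Hypothetical parameter values for the function above.
parameters = {
    "path_to_model": "bert-base-uncased",
    "lowercase_flag": True,
}
tokenizer = get_tokenizer(parameters)
print(tokenizer.tokenize("some reranking input text"))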
Example 11: __init__
# Required import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def __init__(self, params, tokenizer):
    super(CrossEncoderModule, self).__init__()
    model_path = params["bert_model"]
    if params.get("roberta"):
        encoder_model = RobertaModel.from_pretrained(model_path)
    else:
        encoder_model = BertModel.from_pretrained(model_path)
    encoder_model.resize_token_embeddings(len(tokenizer))
    self.encoder = BertEncoder(
        encoder_model,
        params["out_dim"],
        layer_pulled=params["pull_from_layer"],
        add_linear=params["add_linear"],
    )
    self.config = self.encoder.bert_model.config
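The resize_token_embeddings call above matters when extra tokens have been added to the tokenizer, since the embedding matrix must grow to cover the new vocabulary size. A minimal, hedged sketch: the marker token names are made up, while add_tokens and resize_token_embeddings are standard pytorch_transformers APIs.
from pytorch_transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.add_tokens(['[ENT_START]', '[ENT_END]'])  # hypothetical marker tokens added to the vocab

model = BertModel.from_pretrained('bert-base-uncased')
model.resize_token_embeddings(len(tokenizer))  # grow the embedding matrix to the new vocabulary size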
Example 12: __init__
# Required import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def __init__(self) -> None:
    os.environ['CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(os.environ['HOME'])
    self.client: CoreNLPClient = CoreNLPClient()
    self.client.ensure_alive()

    self.do_lower_case = '-cased' not in config.bert_model
    self.basic_tokenizer: BasicTokenizer \
        = BertTokenizer.from_pretrained(config.bert_model, do_lower_case=self.do_lower_case).basic_tokenizer
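If only the rule-based pre-tokenization is needed, as in the constructor above, the basic_tokenizer attribute can be used on its own. A small sketch assuming only pytorch_transformers is installed and the cased vocab is available; the sample sentence is illustrative.
from pytorch_transformers.tokenization_bert import BertTokenizer

do_lower_case = '-cased' not in 'bert-base-cased'  # False for cased checkpoints, mirroring the code above
basic_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=do_lower_case).basic_tokenizer
print(basic_tokenizer.tokenize("Mr. Henson's puppets, 1976."))  # punctuation-split tokens, no WordPiece pieces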
Example 13: __init__
# Required import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def __init__(self, config, mode, *args, **params):
    super().__init__(config, mode, *args, **params)
    self.tokenizer = BertTokenizer.from_pretrained(config.get("model", "bert_path"))
    self.max_len = config.getint("data", "max_seq_length")
    self.mode = mode
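The config object above behaves like a ConfigParser read from an .ini file (it exposes get and getint with section/option arguments). Below is a hedged sketch of such a configuration and the resulting tokenizer call; the section and option names follow the code above, while the values and the sample sentence are hypothetical.
import configparser
from pytorch_transformers.tokenization_bert import BertTokenizer

# Hypothetical .ini-style configuration; section/option names follow the constructor above.
config = configparser.ConfigParser()
config.read_string("""
[model]
bert_path = bert-base-uncased

[data]
max_seq_length = 128
""")

tokenizer = BertTokenizer.from_pretrained(config.get("model", "bert_path"))
max_len = config.getint("data", "max_seq_length")
tokens = tokenizer.tokenize("a fairly long input sentence to be truncated")[:max_len - 2]  # leave room for [CLS]/[SEP]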
Example 14: bertTokenizer
# Required import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertTokenizer(*args, **kwargs):
    """
    Instantiate a BertTokenizer from a pre-trained/customized vocab file.
    Args:
        pretrained_model_name_or_path: Path to a pretrained model archive
            or one of the pre-trained vocab configs below:
                * bert-base-uncased
                * bert-large-uncased
                * bert-base-cased
                * bert-large-cased
                * bert-base-multilingual-uncased
                * bert-base-multilingual-cased
                * bert-base-chinese
    Keyword args:
        cache_dir: an optional path to a specific directory to download and cache
            the pre-trained model weights.
            Default: None
        do_lower_case: Whether to lower case the input.
            Only has an effect when do_wordpiece_only=False
            Default: True
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
            Default: True
        max_len: An artificial maximum length to truncate tokenized sequences to;
            the effective maximum length is always the minimum of this
            value (if specified) and the underlying BERT model's
            sequence length.
            Default: None
        never_split: List of tokens which will never be split during tokenization.
            Only has an effect when do_wordpiece_only=False
            Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]
    Example:
        import torch
        sentence = 'Hello, World!'
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        toks = tokenizer.tokenize(sentence)
        ['Hello', '##,', 'World', '##!']
        ids = tokenizer.convert_tokens_to_ids(toks)
        [8667, 28136, 1291, 28125]
    """
    tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
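Since this hub entry point simply forwards its arguments to BertTokenizer.from_pretrained, the same keyword arguments documented above can be passed to the class directly. A short sketch, assuming pytorch_transformers is installed and the cased vocab is downloadable or cached.
from pytorch_transformers.tokenization_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(
    'bert-base-cased',
    do_lower_case=False,      # keep casing for a cased checkpoint
    do_basic_tokenize=True,   # basic tokenization before WordPiece
    max_len=128,              # artificial cap on tokenized sequence length
)
print(tokenizer.tokenize("Hello, World!"))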
Example 15: main
# Required import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def main():
    parser = ArgumentParser()
    parser.add_argument('--train_corpus', type=Path, required=True)
    parser.add_argument("--output_dir", type=Path, required=True)
    parser.add_argument("--bert_model", type=str, required=True,
                        choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
                                 "bert-base-multilingual-uncased", "bert-base-chinese", "bert-base-multilingual-cased"])
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--do_whole_word_mask", action="store_true",
                        help="Whether to use whole word masking rather than per-WordPiece masking.")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Reduce memory usage for large datasets by keeping data on disc rather than in memory")
    parser.add_argument("--num_workers", type=int, default=1,
                        help="The number of workers to use to write the files")
    parser.add_argument("--epochs_to_generate", type=int, default=3,
                        help="Number of epochs of data to pregenerate")
    parser.add_argument("--max_seq_len", type=int, default=128)
    parser.add_argument("--short_seq_prob", type=float, default=0.1,
                        help="Probability of making a short sentence as a training example")
    parser.add_argument("--masked_lm_prob", type=float, default=0.15,
                        help="Probability of masking each token for the LM task")
    parser.add_argument("--max_predictions_per_seq", type=int, default=20,
                        help="Maximum number of tokens to mask in each sequence")
    args = parser.parse_args()

    if args.num_workers > 1 and args.reduce_memory:
        raise ValueError("Cannot use multiple workers while reducing memory")

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    vocab_list = list(tokenizer.vocab.keys())
    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
        with args.train_corpus.open() as f:
            doc = []
            for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
                line = line.strip()
                if line == "":
                    docs.add_document(doc)
                    doc = []
                else:
                    tokens = tokenizer.tokenize(line)
                    doc.append(tokens)
            if doc:
                docs.add_document(doc)  # If the last doc didn't end on a newline, make sure it still gets added

        if len(docs) <= 1:
            exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
                 "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
                 "indicate breaks between documents in your input file. If your dataset does not contain multiple "
                 "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
                 "sections or paragraphs.")

        args.output_dir.mkdir(exist_ok=True)

        if args.num_workers > 1:
            writer_workers = Pool(min(args.num_workers, args.epochs_to_generate))
            arguments = [(docs, vocab_list, args, idx) for idx in range(args.epochs_to_generate)]
            writer_workers.starmap(create_training_file, arguments)
        else:
            for epoch in trange(args.epochs_to_generate, desc="Epoch"):
                create_training_file(docs, vocab_list, args, epoch)
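The loading loop above expects a plain-text corpus with one sentence per line and blank lines marking document breaks. Below is a hedged, self-contained sketch that writes such a toy corpus and tokenizes it the same way; the file name and sentences are made up, and a plain list stands in for the script's DocumentDatabase.
from pathlib import Path
from pytorch_transformers.tokenization_bert import BertTokenizer

# Toy corpus in the format main() expects: one sentence per line, blank line = document break.
corpus = Path("toy_corpus.txt")
corpus.write_text(
    "Jim Henson was a puppeteer.\n"
    "He created the Muppets.\n"
    "\n"
    "BERT is a bidirectional Transformer.\n"
    "It is pretrained with masked language modeling.\n"
)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
docs, doc = [], []  # plain lists stand in for DocumentDatabase here
with corpus.open() as f:
    for line in f:
        line = line.strip()
        if line == "":
            docs.append(doc)
            doc = []
        else:
            doc.append(tokenizer.tokenize(line))
    if doc:
        docs.append(doc)
print(len(docs), "documents loaded")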