This article collects typical usage examples of the Python method pytorch_pretrained_bert.tokenization.BertTokenizer.from_pretrained. If you have been wondering what exactly BertTokenizer.from_pretrained does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also read further about the class the method belongs to, pytorch_pretrained_bert.tokenization.BertTokenizer.
The following 15 code examples of BertTokenizer.from_pretrained are sorted by popularity by default. You can upvote the examples you find useful; your votes help the system recommend better Python code examples.
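Before diving into the examples, here is a minimal illustrative sketch (not part of the original collection) of what BertTokenizer.from_pretrained does by itself: it downloads, or loads from a local cache, the vocabulary of a named pretrained BERT checkpoint and returns a ready-to-use WordPiece tokenizer. The model name and sample sentence are arbitrary.

from pytorch_pretrained_bert.tokenization import BertTokenizer

# The identifier may be a shortcut name such as "bert-base-uncased" or a path
# to a local vocab file / directory.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

tokens = tokenizer.tokenize("Hello, World!")      # WordPiece tokens
ids = tokenizer.convert_tokens_to_ids(tokens)     # vocabulary indices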
Example 1: __init__
# Required import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def __init__(self, language=Language.ENGLISH, num_labels=2, cache_dir="."):
    """Initializes the classifier and the underlying pretrained model.

    Args:
        language (Language, optional): The pretrained model's language.
            Defaults to Language.ENGLISH.
        num_labels (int, optional): The number of unique labels in the
            training data. Defaults to 2.
        cache_dir (str, optional): Location of BERT's cache directory.
            Defaults to ".".
    """
    if num_labels < 2:
        raise ValueError("Number of labels should be at least 2.")

    self.language = language
    self.num_labels = num_labels
    self.cache_dir = cache_dir

    # create classifier
    self.model = BertForSequenceClassification.from_pretrained(
        language, cache_dir=cache_dir, num_labels=num_labels
    )
    self.has_cuda = self.cuda
Example 2: load_model
# Required import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def load_model(config, num_train_steps, label_list):
    # device = torch.device(torch.cuda.is_available())
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
    model = BertTagger(config, num_labels=len(label_list))
    # model = BertForTagger.from_pretrained(config.bert_model, num_labels=13)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}]
    # optimizer = Adam(optimizer_grouped_parameters, lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters, lr=config.learning_rate, warmup=config.warmup_proportion,
                         t_total=num_train_steps, max_grad_norm=config.clip_grad)
    return model, optimizer, device, n_gpu
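For readers who want to try the optimizer setup above without the project-specific BertTagger and config objects, here is a standalone sketch (not from the original page) of the same weight-decay grouping applied to a plain BertModel, with arbitrary hyperparameter values.

from pytorch_pretrained_bert import BertModel, BertAdam

model = BertModel.from_pretrained("bert-base-uncased")

# Exclude biases and LayerNorm parameters from weight decay, as in load_model above.
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     "weight_decay": 0.01},
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0},
]
optimizer = BertAdam(optimizer_grouped_parameters, lr=5e-5, warmup=0.1, t_total=1000)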
Example 3: __init__
# Required import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def __init__(self, archive_file, model_file=None, use_cuda=False):
    if not os.path.isfile(archive_file):
        if not model_file:
            raise Exception("No model for DA-predictor is specified!")
        archive_file = cached_path(model_file)
    model_dir = os.path.dirname(os.path.abspath(__file__))
    if not os.path.exists(os.path.join(model_dir, 'checkpoints')):
        archive = zipfile.ZipFile(archive_file, 'r')
        archive.extractall(model_dir)

    load_dir = os.path.join(model_dir, "checkpoints/predictor/save_step_15120")
    if not os.path.exists(load_dir):
        archive = zipfile.ZipFile(f'{load_dir}.zip', 'r')
        archive.extractall(os.path.dirname(load_dir))

    self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=False)
    self.max_seq_length = 256
    self.domain = 'restaurant'
    self.model = BertForSequenceClassification.from_pretrained(
        load_dir,
        cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(-1)),
        num_labels=44)
    self.device = 'cuda' if use_cuda else 'cpu'
    self.model.to(self.device)
Example 4: __init__
# Required import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def __init__(self, pretrained_model: str,
             requires_grad: bool = False,
             dropout: float = 0.1,
             layer_dropout: float = 0.1,
             combine_layers: str = "mix") -> None:
    model = BertModel.from_pretrained(pretrained_model)

    for param in model.parameters():
        param.requires_grad = requires_grad

    super().__init__(bert_model=model,
                     layer_dropout=layer_dropout,
                     combine_layers=combine_layers)

    self.model = model
    self.dropout = dropout
    self.set_dropout(dropout)
Example 5: __init__
# Required import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def __init__(self, conversations, labels, conversation_length, sentence_length, data=None):
    # [total_data_size, max_conversation_length, max_sentence_length]
    # tokenized raw text of sentences
    self.conversations = conversations
    self.labels = labels

    # conversation length of each batch
    # [total_data_size]
    self.conversation_length = conversation_length

    # list of length of sentences
    # [total_data_size, max_conversation_length]
    self.sentence_length = sentence_length
    self.data = data
    self.len = len(conversations)

    # Prepare for BERT
    self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    self.prepare_BERT()
Example 6: __init__
# Required import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def __init__(self, path, batch_size=32, gpu=True, labels=None,
             has_labels=True, is_train=True, dropout_w=0.005, maxlen=128):
    self.batch_size = batch_size
    self.has_labels = has_labels
    self.gpu = gpu
    self.labels = labels
    self.is_train = is_train
    # Explicit cache dir required for some reason -- default doesn't exist in the docker
    # container, maybe?
    self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', cache_dir='/tmp')
    self.data = self.load(path, maxlen, has_labels)
    if self.is_train:
        indices = list(range(len(self.data)))
        random.shuffle(indices)
        self.data = [self.data[i] for i in indices]  # shuffle training examples
    self.data = GobbliBatchGen.make_batches(self.data, batch_size)
    self.offset = 0
    self.dropout_w = dropout_w
Example 7: main
# Required import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    args.use_gpu = use_gpu
    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    bert_model = BertModel.from_pretrained("bert-base-chinese")
    if use_gpu:
        bert_model = bert_model.cuda()
    processor = Preprocess(args, tokenizer, bert_model)
    processor.do_preprocess()
Example 8: _prepare_model
# Required import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def _prepare_model(self) -> BertPreTrainedModel:
    if self.args.cache_dir:
        cache_dir = self.args.cache_dir
    else:
        cache_dir = os.path.join(
            str(PYTORCH_PRETRAINED_BERT_CACHE),
            f"distributed_{self.args.local_rank}",
        )
    model = BertForSequenceClassification.from_pretrained(
        self.args.bert_model, cache_dir=cache_dir, num_labels=self.num_labels
    )
    model.to(self.device)
    return model
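As a supplement (not part of the original examples), the following sketch shows a minimal forward pass through the BertForSequenceClassification model that examples 1, 3, and 8 instantiate; the input sentence and num_labels are arbitrary. In pytorch_pretrained_bert the model returns logits when no labels argument is given.

import torch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.eval()

tokens = ["[CLS]"] + tokenizer.tokenize("the movie was great") + ["[SEP]"]
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

with torch.no_grad():
    logits = model(input_ids)            # shape [1, num_labels]
prediction = logits.argmax(dim=-1).item()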
Example 9: bertTokenizer
# Required import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def bertTokenizer(*args, **kwargs):
    """
    Instantiate a BertTokenizer from a pre-trained/customized vocab file.

    Args:
        pretrained_model_name_or_path: Path to a pretrained model archive
            or one of the pre-trained vocab configs below.
            * bert-base-uncased
            * bert-large-uncased
            * bert-base-cased
            * bert-large-cased
            * bert-base-multilingual-uncased
            * bert-base-multilingual-cased
            * bert-base-chinese
    Keyword args:
        cache_dir: an optional path to a specific directory to download and cache
            the pre-trained model weights.
            Default: None
        do_lower_case: Whether to lower case the input.
            Only has an effect when do_wordpiece_only=False.
            Default: True
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
            Default: True
        max_len: An artificial maximum length to truncate tokenized sequences to;
            the effective maximum length is always the minimum of this
            value (if specified) and the underlying BERT model's
            sequence length.
            Default: None
        never_split: List of tokens which will never be split during tokenization.
            Only has an effect when do_wordpiece_only=False.
            Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]

    Example:
        >>> sentence = 'Hello, World!'
        >>> tokenizer = torch.hub.load('ailzhang/pytorch-pretrained-BERT:hubconf', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False)
        >>> toks = tokenizer.tokenize(sentence)
        ['Hello', '##,', 'World', '##!']
        >>> ids = tokenizer.convert_tokens_to_ids(toks)
        [8667, 28136, 1291, 28125]
    """
    tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
Example 10: bertModel
# Required import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def bertModel(*args, **kwargs):
    """
    BertModel is the basic BERT Transformer model with a layer of summed token,
    position and sequence embeddings followed by a series of identical
    self-attention blocks (12 for BERT-base, 24 for BERT-large).
    """
    model = BertModel.from_pretrained(*args, **kwargs)
    return model
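The hub entry points in examples 9 through 15 are intended to be called through torch.hub. Below is a brief sketch (mirroring the repo reference in example 9's docstring, so its availability is assumed rather than guaranteed) of loading bertModel that way and extracting hidden states; shapes assume a bert-base checkpoint.

import torch

# Same hub repo as in the bertTokenizer docstring above.
tokenizer = torch.hub.load('ailzhang/pytorch-pretrained-BERT:hubconf', 'bertTokenizer', 'bert-base-cased')
model = torch.hub.load('ailzhang/pytorch-pretrained-BERT:hubconf', 'bertModel', 'bert-base-cased')
model.eval()

ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('Hello, World!'))
input_ids = torch.tensor([ids])

with torch.no_grad():
    # encoded_layers: one [batch, seq_len, 768] tensor per transformer layer (12 for bert-base)
    encoded_layers, pooled_output = model(input_ids)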
Example 11: bertForNextSentencePrediction
# Required import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def bertForNextSentencePrediction(*args, **kwargs):
    """
    BERT model with the next sentence prediction head.
    This module comprises the BERT model followed by the next sentence
    classification head.
    """
    model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
    return model
Example 12: bertForPreTraining
# Required import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def bertForPreTraining(*args, **kwargs):
    """
    BERT model with pre-training heads.
    This module comprises the BERT model followed by the two pre-training heads:
        - the masked language modeling head, and
        - the next sentence classification head.
    """
    model = BertForPreTraining.from_pretrained(*args, **kwargs)
    return model
Example 13: bertForMaskedLM
# Required import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def bertForMaskedLM(*args, **kwargs):
    """
    BertForMaskedLM includes the BertModel Transformer followed by the
    (possibly) pre-trained masked language modeling head.
    """
    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
    return model
Example 14: bertForMultipleChoice
# Required import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def bertForMultipleChoice(*args, **kwargs):
    """
    BertForMultipleChoice is a fine-tuning model that includes BertModel and a
    linear layer on top of the BertModel.
    """
    model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
    return model
Example 15: bertForQuestionAnswering
# Required import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def bertForQuestionAnswering(*args, **kwargs):
    """
    BertForQuestionAnswering is a fine-tuning model that includes BertModel
    with a token-level classifier on top of the full sequence of last hidden
    states.
    """
    model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)
    return model