本文整理汇总了Python中pytorch_pretrained_bert.BertTokenizer.from_pretrained方法的典型用法代码示例。如果您正苦于以下问题：Python BertTokenizer.from_pretrained方法的具体用法？Python BertTokenizer.from_pretrained怎么用？Python BertTokenizer.from_pretrained使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pytorch_pretrained_bert.BertTokenizer的用法示例。
在下文中一共展示了BertTokenizer.from_pretrained方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _get_single_embedding
# 需要导入模块: from pytorch_pretrained_bert import BertTokenizer [as 别名]
# 或者: from pytorch_pretrained_bert.BertTokenizer import from_pretrained [as 别名]
def _get_single_embedding(model, text, device):
    """Embed a single sentence with the embedding layer of a BERT model.

    The sentence is tokenized, wrapped in the ``BertTokens.CLS`` and
    ``BertTokens.SEP`` markers, mapped to vocabulary ids and fed to
    ``model.bert.embeddings`` as a batch containing one sequence.

    :param model: object exposing a callable ``model.bert.embeddings``
    :type model: torch.nn
    :param text: the sentence to embed
    :type text: str
    :param device: device on which the id tensor is created
    :type device: torch.device
    :return: ``(embedding, words)`` -- element 0 of the embedding output and
        the token list including the two marker tokens
    :rtype: tuple
    """
    # NOTE(review): the tokenizer is rebuilt on every call; consider caching
    # it at the call site if this function is used in a loop.
    bert_tokenizer = BertTokenizer.from_pretrained(Language.ENGLISH)
    words = [BertTokens.CLS, *bert_tokenizer.tokenize(text), BertTokens.SEP]
    ids = bert_tokenizer.convert_tokens_to_ids(words)
    # The extra list level gives the tensor a leading axis of length one.
    id_batch = torch.tensor([ids], device=device)
    return model.bert.embeddings(id_batch)[0], words
示例2: __init__
# 需要导入模块: from pytorch_pretrained_bert import BertTokenizer [as 别名]
# 或者: from pytorch_pretrained_bert.BertTokenizer import from_pretrained [as 别名]
def __init__(self, embed_dim: int, hidden_dim: int, num_embeddings: int, num_max_positions: int, num_heads: int,
             num_layers: int, dropout: float, causal: bool):
    """Create the embedding tables and ``num_layers`` transformer sub-layers.

    :param embed_dim: size passed to the embeddings, attention and layer norms
    :param hidden_dim: inner size of each feed-forward block
    :param num_embeddings: number of rows of the token embedding table
    :param num_max_positions: number of rows of the position embedding table
    :param num_heads: number of heads given to every ``MultiheadAttention``
    :param num_layers: how many attention / feed-forward / norm groups to build
    :param dropout: rate used for the ``Dropout`` module and inside attention
    :param causal: stored on the instance as ``self.causal``
    """
    super().__init__()
    nn = torch.nn

    self.causal: bool = causal
    self.tokens_embeddings: torch.nn.Embedding = nn.Embedding(num_embeddings, embed_dim)
    self.position_embeddings: torch.nn.Embedding = nn.Embedding(num_max_positions, embed_dim)
    self.dropout: torch.nn.Dropout = nn.Dropout(dropout)

    # Containers are registered in this fixed order; each one receives one
    # entry per layer in the loop below.
    self.attentions = nn.ModuleList()
    self.feed_forwards = nn.ModuleList()
    self.layer_norms_1 = nn.ModuleList()
    self.layer_norms_2 = nn.ModuleList()

    for _layer in range(num_layers):
        attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
        self.attentions.append(attention)

        feed_forward = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embed_dim),
        )
        self.feed_forwards.append(feed_forward)

        self.layer_norms_1.append(nn.LayerNorm(embed_dim, eps=1e-12))
        self.layer_norms_2.append(nn.LayerNorm(embed_dim, eps=1e-12))

    # No attention mask yet; presumably filled in later -- TODO confirm.
    self.attn_mask = None
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
示例3: __init__
# 需要导入模块: from pytorch_pretrained_bert import BertTokenizer [as 别名]
# 或者: from pytorch_pretrained_bert.BertTokenizer import from_pretrained [as 别名]
def __init__(self, data_dir, bert_model_dir, params, token_pad_idx=0):
    """Set up the loader: hyper-parameters, tag vocabularies and tokenizer.

    :param data_dir: stored as ``self.data_dir``
    :param bert_model_dir: passed to ``BertTokenizer.from_pretrained``
    :param params: object providing ``batch_size``, ``max_len``, ``device``
        and ``seed``; it is also given ``tag2idx`` / ``idx2tag`` attributes
    :param token_pad_idx: id used to pad token sequences (default ``0``)
    """
    self.data_dir = data_dir
    self.batch_size = params.batch_size
    self.max_len = params.max_len
    self.device = params.device
    self.seed = params.seed
    # Use the caller-supplied padding id instead of a hard-coded 0, so the
    # ``token_pad_idx`` argument is no longer silently ignored.
    self.token_pad_idx = token_pad_idx

    tags = self.load_tags()
    self.tag2idx = {tag: idx for idx, tag in enumerate(tags)}
    self.idx2tag = {idx: tag for idx, tag in enumerate(tags)}
    # Publish both mappings on ``params`` as well.
    params.tag2idx = self.tag2idx
    params.idx2tag = self.idx2tag
    # Tag sequences are padded with the id of the 'O' tag.
    self.tag_pad_idx = self.tag2idx['O']

    self.tokenizer = BertTokenizer.from_pretrained(bert_model_dir, do_lower_case=True)
示例4: __init__
# 需要导入模块: from pytorch_pretrained_bert import BertTokenizer [as 别名]
# 或者: from pytorch_pretrained_bert.BertTokenizer import from_pretrained [as 别名]
def __init__(self, opt):
    """Load a BERT tokenizer from a vocab file and register special tokens.

    :param opt: options mapping; ``opt['datapath']`` is handed to
        ``download`` and used as the root of the vocabulary path
    """
    super().__init__(opt)

    # initialize from vocab path
    # ``download`` is defined elsewhere; presumably it fetches the vocabulary
    # file into the data directory -- TODO confirm.
    download(opt['datapath'])
    vocab_file = os.path.join(opt['datapath'], 'models', 'bert_models', VOCAB_PATH)
    self.tokenizer = BertTokenizer.from_pretrained(vocab_file)

    self.start_token = '[CLS]'
    self.end_token = '[SEP]'
    self.null_token = '[PAD]'

    def token_id(token):
        # The tokenizer converts lists, so wrap the token and unwrap the id.
        return self.tokenizer.convert_tokens_to_ids([token])[0]

    self.start_idx = token_id('[CLS]')  # should be 101
    self.end_idx = token_id('[SEP]')  # should be 102
    self.pad_idx = token_id('[PAD]')  # should be 0

    special_pairs = (
        (self.start_token, self.start_idx),
        (self.end_token, self.end_idx),
        (self.null_token, self.pad_idx),
    )
    # set tok2ind for special tokens
    for token, index in special_pairs:
        self.tok2ind[token] = index
    # set ind2tok for special tokens
    for token, index in special_pairs:
        self.ind2tok[index] = token
示例5: inspect_sampler_squad_examples
# 需要导入模块: from pytorch_pretrained_bert import BertTokenizer [as 别名]
# 或者: from pytorch_pretrained_bert.BertTokenizer import from_pretrained [as 别名]
def inspect_sampler_squad_examples():
    """Build forward items from the SQuAD 2.0 training file and print their count.

    Debugging helper: loads the uncased base BERT tokenizer, preprocesses the
    file referenced by ``config.SQUAD_TRAIN_2_0`` and prints the length of
    the item list returned by ``eitems_to_fitems``.
    """
    bert_model_name = "bert-base-uncased"
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    do_lower_case = True
    max_pre_context_length = 315
    doc_stride = 128
    debug = True

    tokenizer = BertTokenizer.from_pretrained(
        bert_model_name,
        do_lower_case=do_lower_case,
        cache_dir=bert_pretrain_path,
    )

    squad_train_v2 = common.load_json(config.SQUAD_TRAIN_2_0)
    train_eitem_list = preprocessing_squad(squad_train_v2)
    # Only the list half of the result is inspected here.
    # NOTE(review): ``is_training=False`` is used on the training split --
    # confirm this is intentional.
    _, train_fitem_list = eitems_to_fitems(
        train_eitem_list,
        tokenizer,
        is_training=False,
        max_tokens_for_doc=max_pre_context_length,
        doc_stride=doc_stride,
        debug=debug,
    )
    print(len(train_fitem_list))
示例6: _load_model
# 需要导入模块: from pytorch_pretrained_bert import BertTokenizer [as 别名]
# 或者: from pytorch_pretrained_bert.BertTokenizer import from_pretrained [as 别名]
def _load_model(self) -> None:
    """Pick a device, then load the classifier and tokenizer from ``self.model_dir``.

    Sets ``self.device``, ``self.n_gpu``, ``self.model`` and
    ``self.tokenizer``; the model is moved to the selected device.
    """
    # CUDA is used only when it is available and not disabled via ``no_cuda``.
    use_cuda = torch.cuda.is_available() and not self.no_cuda
    self.device = torch.device("cuda" if use_cuda else "cpu")
    self.n_gpu = torch.cuda.device_count()

    # Load a trained model and vocabulary that you have fine-tuned
    classifier = BertForSequenceClassification.from_pretrained(
        self.model_dir, num_labels=self.num_labels
    )
    self.model = classifier
    self.tokenizer = BertTokenizer.from_pretrained(
        self.model_dir, do_lower_case=self.do_lower_case
    )
    self.model.to(self.device)
示例7: eval_semantic_sim_score
# 需要导入模块: from pytorch_pretrained_bert import BertTokenizer [as 别名]
# 或者: from pytorch_pretrained_bert.BertTokenizer import from_pretrained [as 别名]
def eval_semantic_sim_score(instances: List[CFRInstance], bert_model_type="bert-base-uncased"):
    """Score predicted endings by their best drift similarity to gold endings.

    For every instance the original story and the predicted counterfactual
    story are embedded, ``drift_similarity`` is evaluated against each gold
    counterfactual ending, and the maximum is kept.

    :param instances: instances providing ``original_context``,
        ``original_ending``, ``cf_context``, ``predicted_ending`` and
        ``gold_cf_endings``
    :param bert_model_type: name passed to both ``from_pretrained`` calls
    :return: dict with the mean score (``"drift_similarity"``) and the
        per-instance scores as floats (``"drift_similarity_by_instance"``)
    """
    tokenizer = BertTokenizer.from_pretrained(bert_model_type)
    model = BertModel.from_pretrained(bert_model_type)
    model.eval()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    def gold_embedding(inst, gold_ending):
        # Embed the counterfactual context followed by one gold ending.
        cleaned = _clean_text(inst.cf_context + ' ' + gold_ending)
        return _bert_embed_sentence(cleaned, model, tokenizer)

    per_instance_scores = []
    for instance in instances:
        original_text = _clean_text(instance.original_context + ' ' + instance.original_ending)
        predicted_text = _clean_text(instance.cf_context + ' ' + instance.predicted_ending)
        original_emb = _bert_embed_sentence(original_text, model, tokenizer)
        predicted_emb = _bert_embed_sentence(predicted_text, model, tokenizer)

        candidate_scores = [
            drift_similarity(original_emb, predicted_emb, gold_embedding(instance, gold_cf))
            for gold_cf in instance.gold_cf_endings
        ]
        # Keep the best match among the gold endings.
        per_instance_scores.append(np.max(candidate_scores))

    return {
        "drift_similarity": np.mean(per_instance_scores),
        "drift_similarity_by_instance": [float(score) for score in per_instance_scores],
    }
示例8: __init__
# 需要导入模块: from pytorch_pretrained_bert import BertTokenizer [as 别名]
# 或者: from pytorch_pretrained_bert.BertTokenizer import from_pretrained [as 别名]
def __init__(self, data_file: str, bert_version: str):
    """Load the tokenizer and build the target vocabulary from a CSV file.

    :param data_file: CSV path; forwarded to the parent constructor and read
        with pandas -- its ``category`` column supplies the target labels
    :param bert_version: name or path passed to ``BertTokenizer.from_pretrained``
    """
    super().__init__(data_file=data_file)
    self.tokenizer = BertTokenizer.from_pretrained(bert_version)

    frame = pd.read_csv(data_file)
    label_vocab = Vocabulary(add_unk=False)
    # One entry per distinct category value.
    label_vocab.add_many(set(frame.category))
    self.target_vocab = label_vocab
示例9: bert_model
# 需要导入模块: from pytorch_pretrained_bert import BertTokenizer [as 别名]
# 或者: from pytorch_pretrained_bert.BertTokenizer import from_pretrained [as 别名]
def bert_model(pretrained_model_name_or_path: str = 'bert-base-uncased', num_labels: int = 4):
    """Return a ``BertForSequenceClassification`` loaded via ``from_pretrained``.

    :param pretrained_model_name_or_path: forwarded unchanged to ``from_pretrained``
    :param num_labels: forwarded unchanged to ``from_pretrained``
    """
    classifier = BertForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        num_labels=num_labels,
    )
    return classifier
示例10: get_bert
# 需要导入模块: from pytorch_pretrained_bert import BertTokenizer [as 别名]
# 或者: from pytorch_pretrained_bert.BertTokenizer import from_pretrained [as 别名]
def get_bert(bert_model, bert_do_lower_case):
    """Load the BERT tokenizer and model identified by ``bert_model``.

    When ``bert_model`` ends in ``.tar.gz`` the tokenizer is loaded from the
    sibling file whose name replaces that suffix with ``-vocab.txt``;
    otherwise ``bert_model`` itself is used for the tokenizer.

    :param bert_model: model name or path (string)
    :param bert_do_lower_case: forwarded as ``do_lower_case`` to the tokenizer
    :return: ``(tokenizer, bert)``
    """
    # Avoid a hard dependency on BERT by only importing it if it's being used
    from pytorch_pretrained_bert import BertTokenizer, BertModel

    archive_suffix = '.tar.gz'
    if bert_model.endswith(archive_suffix):
        # Swap only the trailing suffix; ``str.replace`` would also rewrite
        # any earlier '.tar.gz' occurring inside the path.
        vocab_source = bert_model[:-len(archive_suffix)] + '-vocab.txt'
    else:
        vocab_source = bert_model

    tokenizer = BertTokenizer.from_pretrained(vocab_source, do_lower_case=bert_do_lower_case)
    bert = BertModel.from_pretrained(bert_model)
    return tokenizer, bert
# %%
示例11: __init__
# 需要导入模块: from pytorch_pretrained_bert import BertTokenizer [as 别名]
# 或者: from pytorch_pretrained_bert.BertTokenizer import from_pretrained [as 别名]
def __init__(self, bert_type: str, do_basic_tokenize=True):
    """Validate ``bert_type`` and load the matching BERT tokenizer.

    For SciBERT types the vocabulary is read from ``vocab.txt`` inside the
    mapped folder under ``EMBEDDING_CACHE_DIR``; for the other types the
    type name itself is handed to ``BertTokenizer.from_pretrained``.

    :param bert_type: one of the names in ``self.allowed_bert_types``
    :param do_basic_tokenize: forwarded to ``BertTokenizer.from_pretrained``
    """
    super(TokenizerForBert, self).__init__()
    self.bert_type = bert_type
    self.do_basic_tokenize = do_basic_tokenize
    self.msg_printer = wasabi.Printer()

    self.scibert_foldername_mapping = {
        "scibert-base-cased": "scibert_basevocab_cased",
        "scibert-sci-cased": "scibert_scivocab_cased",
        "scibert-base-uncased": "scibert_basevocab_uncased",
        "scibert-sci-uncased": "scibert_scivocab_uncased",
    }
    self.allowed_bert_types = [
        "bert-base-uncased",
        "bert-large-uncased",
        "bert-base-cased",
        "bert-large-cased",
        "scibert-base-cased",
        "scibert-sci-cased",
        "scibert-base-uncased",
        "scibert-sci-uncased",
    ]

    # NOTE(review): ``assert`` is skipped under ``python -O``, and the two
    # message fragments join without a space; both kept as-is so behaviour
    # is unchanged.
    assert bert_type in self.allowed_bert_types, self.msg_printer.fail(
        f"You passed {bert_type} for attribute bert_type."
        f"The allowed types are {self.allowed_bert_types}"
    )

    if "scibert" not in self.bert_type:
        vocab_source = self.bert_type
    else:
        scibert_folder = self.scibert_foldername_mapping[self.bert_type]
        vocab_source = os.path.join(EMBEDDING_CACHE_DIR, scibert_folder, "vocab.txt")
    self.vocab_type_or_filename = vocab_source

    with self.msg_printer.loading("Loading Bert model"):
        self.tokenizer = BertTokenizer.from_pretrained(
            self.vocab_type_or_filename, do_basic_tokenize=do_basic_tokenize
        )
示例12: __init__
# 需要导入模块: from pytorch_pretrained_bert import BertTokenizer [as 别名]
# 或者: from pytorch_pretrained_bert.BertTokenizer import from_pretrained [as 别名]
def __init__(self, bert_type_name='') -> None:
    """Load the tokenizer and model for ``bert_type_name`` and switch the model to eval mode.

    :param bert_type_name: name or path passed to both ``from_pretrained``
        calls (defaults to the empty string)
    """
    super().__init__()
    self.bert_type_name = bert_type_name

    pretrained_name = self.bert_type_name
    self.bert_tokenizer = BertTokenizer.from_pretrained(pretrained_name)
    self.bert_model: BertModel = BertModel.from_pretrained(pretrained_name)
    # The model is put in evaluation mode right after loading.
    self.bert_model.eval()
示例13: load_bert
# 需要导入模块: from pytorch_pretrained_bert import BertTokenizer [as 别名]
# 或者: from pytorch_pretrained_bert.BertTokenizer import from_pretrained [as 别名]
def load_bert():
    """Load the cased base BERT tokenizer and model.

    The model is switched to eval mode and moved to ``device``, a name this
    function reads from the enclosing module scope.

    :return: ``(tokenizer, model)``
    """
    pretrained_name = 'bert-base-cased'
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained(pretrained_name)
    model = BertModel.from_pretrained(pretrained_name)
    model.eval()
    model.to(device)
    return tokenizer, model
示例14: __init__
# 需要导入模块: from pytorch_pretrained_bert import BertTokenizer [as 别名]
# 或者: from pytorch_pretrained_bert.BertTokenizer import from_pretrained [as 别名]
def __init__(self, args):
    """Seed the RNGs, load the tokenizer and build the training features.

    :param args: namespace providing ``random_seed``, ``bert_model``,
        ``do_lower_case``, ``debug`` and ``train_folder``
    """
    self.args = args
    self.set_random_seed(random_seed=args.random_seed)

    self.tokenizer = BertTokenizer.from_pretrained(
        args.bert_model, do_lower_case=args.do_lower_case
    )

    if args.debug:
        print("Debugging mode on.")
    # The debug flag is forwarded to the feature builder as well.
    self.features_lst = self.get_features(self.args.train_folder, self.args.debug)
示例15: inspect_upstream_eval_v1
# 需要导入模块: from pytorch_pretrained_bert import BertTokenizer [as 别名]
# 或者: from pytorch_pretrained_bert.BertTokenizer import from_pretrained [as 别名]
def inspect_upstream_eval_v1():
    """Build open-QA reader items from saved upstream results and print their counts.

    Debugging helper: loads the ground-truth dev data and one hard-coded
    evaluation-result file, runs ``get_open_qa_item_with_upstream_paragraphs``
    and prints the sizes of the returned list and dict.
    """
    # Tokenizer settings.
    bert_model_name = "bert-base-uncased"
    cache_dir = config.PRO_ROOT / '.pytorch_pretrained_bert'
    do_lower_case = True

    # Item-building settings.
    max_pre_context_length = 315
    max_query_length = 64
    doc_stride = 128
    is_training = True
    debug_mode = True

    d_list = common.load_jsonl(config.OPEN_SQUAD_DEV_GT)
    # The result path is split over two literals only for readability.
    results_path = config.PRO_ROOT / (
        'saved_models/05-12-08:44:38_mtr_open_qa_p_level_(num_train_epochs:3)/'
        'i(2000)|e(2)|squad|top10(0.6909176915799432)|top20(0.7103122043519394)|seed(12)_eval_results.jsonl'
    )
    upstream_results = common.load_jsonl(results_path)

    # Upstream paragraph selection settings.
    top_k = 10
    filter_value = 0.1
    match_type = 'string'

    tokenizer = BertTokenizer.from_pretrained(
        bert_model_name, do_lower_case=do_lower_case, cache_dir=cache_dir
    )

    fitems_dict, read_fitems_list, _ = get_open_qa_item_with_upstream_paragraphs(
        d_list, upstream_results, is_training,
        tokenizer, max_pre_context_length, max_query_length, doc_stride,
        debug_mode, top_k, filter_value, match_type,
    )

    print(len(read_fitems_list))
    print(len(fitems_dict))