This article collects and summarizes typical usage examples of the Python method transformers.BertTokenizer.from_pretrained. If you are wondering what exactly BertTokenizer.from_pretrained does, how to call it, or how it is used in practice, the curated method examples below should help. You can also read further about its containing class, transformers.BertTokenizer.
The following shows 15 code examples of BertTokenizer.from_pretrained, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
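Before the examples, here is a minimal, self-contained sketch of the call itself ('bert-base-uncased' is the standard public checkpoint; the first call needs network access or a local cache):

from transformers import BertTokenizer

# Downloads the vocabulary on first use, then loads it from the local cache.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
ids = tokenizer.encode("Hello, world!", add_special_tokens=True)
print(tokenizer.convert_ids_to_tokens(ids))  # ['[CLS]', 'hello', ',', 'world', '!', '[SEP]']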
Example 1: test_TFXLNet
# Required module: from transformers import BertTokenizer [as alias]
# Or: from transformers.BertTokenizer import from_pretrained [as alias]
def test_TFXLNet(self):
    if enable_full_transformer_test:
        from transformers import XLNetConfig, TFXLNetModel, TFXLNetLMHeadModel, \
            TFXLNetForSequenceClassification, TFXLNetForTokenClassification, \
            TFXLNetForQuestionAnsweringSimple, XLNetTokenizer
        model_list = [TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification,
                      TFXLNetForTokenClassification, TFXLNetForQuestionAnsweringSimple]
    else:
        from transformers import XLNetConfig, TFXLNetModel, XLNetTokenizer
        model_list = [TFXLNetModel]
    # XLNetTokenizer needs SentencePiece, so the pickle file does not work here.
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
    config = XLNetConfig(n_layer=2)
    # The model with an input mask uses MatrixDiagV3, which is not a registered function/op.
    token = np.asarray(tokenizer.encode(self.text_str, add_special_tokens=True), dtype=np.int32)
    inputs_onnx = {'input_1': np.expand_dims(token, axis=0)}
    inputs = tf.constant(token)[None, :]  # Batch size 1
    for model_instance_ in model_list:
        keras.backend.clear_session()
        model = model_instance_(config)
        predictions = model.predict(inputs)
        onnx_model = keras2onnx.convert_keras(model)
        self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                                         predictions, self.model_files, rtol=1.e-2, atol=1.e-4))
Example 2: __init__
# Required module: from transformers import BertTokenizer [as alias]
# Or: from transformers.BertTokenizer import from_pretrained [as alias]
def __init__(self, intent_vocab, tag_vocab, pretrained_weights):
    """
    :param intent_vocab: list of all intents
    :param tag_vocab: list of all tags
    :param pretrained_weights: which BERT checkpoint to use, e.g. 'bert-base-uncased'
    """
    self.intent_vocab = intent_vocab
    self.tag_vocab = tag_vocab
    self.intent_dim = len(intent_vocab)
    self.tag_dim = len(tag_vocab)
    self.id2intent = {i: x for i, x in enumerate(intent_vocab)}
    self.intent2id = {x: i for i, x in enumerate(intent_vocab)}
    self.id2tag = {i: x for i, x in enumerate(tag_vocab)}
    self.tag2id = {x: i for i, x in enumerate(tag_vocab)}
    self.tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    self.data = {}
    self.intent_weight = [1] * len(self.intent2id)
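A hedged instantiation sketch (the enclosing class name Dataloader and the vocabulary contents are assumptions; the snippet above only shows __init__):

loader = Dataloader(intent_vocab=['inform', 'request'],
                    tag_vocab=['O', 'B-city', 'I-city'],
                    pretrained_weights='bert-base-uncased')
print(loader.tag2id['B-city'])  # 1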
Example 3: save_to_onnx
# Required module: from transformers import BertTokenizer [as alias]
# Or: from transformers.BertTokenizer import from_pretrained [as alias]
def save_to_onnx(model):
    # Note: the tokenizer is loaded here but not used in the export itself.
    tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
    model.eval()
    dummy_input = torch.ones((1, 384), dtype=torch.int64)
    torch.onnx.export(
        model,
        (dummy_input, dummy_input, dummy_input),
        "build/data/bert_tf_v1_1_large_fp32_384_v2/model.onnx",
        verbose=True,
        input_names=["input_ids", "input_mask", "segment_ids"],
        output_names=["output_start_logits", "output_end_logits"],
        opset_version=11,
        dynamic_axes={"input_ids": {0: "batch_size"},
                      "input_mask": {0: "batch_size"},
                      "segment_ids": {0: "batch_size"},
                      "output_start_logits": {0: "batch_size"},
                      "output_end_logits": {0: "batch_size"}},
    )
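A hedged usage sketch for the function above (assumes torch and transformers are installed and the build/data/... output directory already exists; the checkpoint name matches the tokenizer used in the snippet):

import torch
from transformers import BertForQuestionAnswering

model = BertForQuestionAnswering.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad")
save_to_onnx(model)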
Example 4: __init__
# Required module: from transformers import BertTokenizer [as alias]
# Or: from transformers.BertTokenizer import from_pretrained [as alias]
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertTokenizer, BertForSequenceClassification
    # Download the models or resolve the cached model paths.
    path_emotion = download_model('bert.emotion', cache_dir,
                                  process_func=_unzip_process_func,
                                  verbose=verbose)
    path_emotion = os.path.join(path_emotion, 'bert.emotion')
    path_reject = download_model('bert.noemotion', cache_dir,
                                 process_func=_unzip_process_func,
                                 verbose=verbose)
    path_reject = os.path.join(path_reject, 'bert.noemotion')
    # Load the models.
    self.tokenizer_reject = BertTokenizer.from_pretrained(path_reject)
    self.model_reject = BertForSequenceClassification.from_pretrained(path_reject)
    self.tokenizer = BertTokenizer.from_pretrained(path_emotion)
    self.model = BertForSequenceClassification.from_pretrained(path_emotion)
    # Mapping from class index to (Danish) emotion label.
    self.categories = {5: 'Foragt/Modvilje', 2: 'Forventning/Interrese',
                       0: 'Glæde/Sindsro', 3: 'Overasket/Målløs',
                       1: 'Tillid/Accept', 4: 'Vrede/Irritation',
                       6: 'Sorg/trist', 7: 'Frygt/Bekymret'}
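Worth noting in this example: from_pretrained accepts a local directory as well as a hub name. A minimal sketch (the path is hypothetical; the directory must contain vocab.txt, plus config.json and weights for the model class):

from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('/path/to/bert.emotion')
model = BertForSequenceClassification.from_pretrained('/path/to/bert.emotion')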
Example 5: __init__
# Required module: from transformers import BertTokenizer [as alias]
# Or: from transformers.BertTokenizer import from_pretrained [as alias]
def __init__(self, max_length, pretrain_path, blank_padding=True, mask_entity=False):
    """
    Args:
        max_length: max length of sentence
        pretrain_path: path of the pretrained model
        blank_padding: whether to pad sentences to max_length
        mask_entity: whether to mask entity mentions
    """
    super().__init__()
    self.max_length = max_length
    self.blank_padding = blank_padding
    self.hidden_size = 768
    self.mask_entity = mask_entity
    logging.info('Loading BERT pre-trained checkpoint.')
    self.bert = BertModel.from_pretrained(pretrain_path)
    self.tokenizer = BertTokenizer.from_pretrained(pretrain_path)
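A hedged instantiation sketch (the class name BERTEncoder is an assumption; only __init__ is shown above):

encoder = BERTEncoder(max_length=128, pretrain_path='bert-base-uncased')
print(encoder.hidden_size)  # 768, the hidden size of BERT-base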
Example 6: get_tokenizer
# Required module: from transformers import BertTokenizer [as alias]
# Or: from transformers.BertTokenizer import from_pretrained [as alias]
def get_tokenizer(self) -> BertTokenizer:
    """Return a BERT tokenizer.

    :return: the pretrained tokenizer
    :rtype: BertTokenizer
    """
    return BertTokenizer.from_pretrained("bert-base-uncased")
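A short sketch of what the returned tokenizer provides (standard BertTokenizer API):

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer.tokenize("Hello, world!")    # ['hello', ',', 'world', '!']
ids = tokenizer.convert_tokens_to_ids(tokens)   # vocabulary indices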
Example 7: test_processor_functions
# Required module: from transformers import BertTokenizer [as alias]
# Or: from transformers.BertTokenizer import from_pretrained [as alias]
def test_processor_functions():
    download_squad(dir="./data")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    max_seq_length = 256
    max_query_length = 32
    doc_stride = 128
    is_training = False
    verbose = False
    examples = read_squad_examples(
        "./data/SQuAD_1.1/dev-v1.1.json",
        is_training=is_training,
        version_2_with_negative=False,
    )
    assert len(examples) == 10570  # SQuAD 1.1 dev set size
    features = convert_examples_to_features(
        examples,
        tokenizer,
        max_seq_length,
        doc_stride,
        max_query_length,
        is_training,
        verbose,
    )
    assert len(features) == 12006
Example 8: __init__
# Required module: from transformers import BertTokenizer [as alias]
# Or: from transformers.BertTokenizer import from_pretrained [as alias]
def __init__(
    self,
    bert_model="bert-base-uncased",
    do_lower_case=True,
    is_training=False,
    version_2_with_negative=False,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    verbose=False,
    tokenizer=None,
):
    self.bert_model = bert_model
    self.do_lower_case = do_lower_case
    self.is_training = is_training
    self.version_2_with_negative = version_2_with_negative
    self.max_seq_length = max_seq_length
    self.doc_stride = doc_stride
    self.max_query_length = max_query_length
    self.verbose = verbose
    if tokenizer is None:
        self.tokenizer = BertTokenizer.from_pretrained(
            self.bert_model, do_lower_case=self.do_lower_case
        )
    else:
        self.tokenizer = tokenizer
        logger.info("loading custom tokenizer")
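A hedged sketch of the two construction paths (the enclosing class name Processor is an assumption):

proc = Processor()  # builds its own BertTokenizer from bert_model
custom = BertTokenizer.from_pretrained("bert-base-cased", do_lower_case=False)
proc2 = Processor(bert_model="bert-base-cased", tokenizer=custom)  # logs 'loading custom tokenizer'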
Example 9: __init__
# Required module: from transformers import BertTokenizer [as alias]
# Or: from transformers.BertTokenizer import from_pretrained [as alias]
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             do_lower_case: Optional[bool] = None, model_args: Dict = {},
             tokenizer_args: Dict = {}):
    super(BERT, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case
    if max_seq_length > 510:
        logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). "
                        "Value will be set to 510")
        max_seq_length = 510
    self.max_seq_length = max_seq_length
    if self.do_lower_case is not None:
        tokenizer_args['do_lower_case'] = do_lower_case
    self.bert = BertModel.from_pretrained(model_name_or_path, **model_args)
    self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args)
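A hedged construction sketch (the class is named BERT per the super() call above; the keyword values here are illustrative):

module = BERT('bert-base-uncased',
              max_seq_length=256,
              do_lower_case=True,  # forwarded into tokenizer_args above
              model_args={'output_hidden_states': True})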
Example 10: call
# Required module: from transformers import BertTokenizer [as alias]
# Or: from transformers.BertTokenizer import from_pretrained [as alias]
def call(self, inputs, **kwargs):
    r"""
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token),
            further processed by a Linear layer and a Tanh activation function. The Linear
            layer weights are trained from the next sentence prediction (classification)
            objective during BERT pretraining. This output is usually *not* a good summary
            of the semantic content of the input; you're often better off averaging or pooling
            the sequence of hidden-states for the whole input sequence.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_attentions=True`):
            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
            Attention weights after the attention softmax, used to compute the weighted average
            in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertModel

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertModel.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """
    outputs = self.bert(inputs, **kwargs)
    return outputs
Example 11: test_3layer_gpt2
# Required module: from transformers import BertTokenizer [as alias]
# Or: from transformers.BertTokenizer import from_pretrained [as alias]
def test_3layer_gpt2(self):
    from transformers import GPT2Config, TFGPT2Model, BertTokenizer
    keras2onnx.proto.keras.backend.set_learning_phase(0)
    config = GPT2Config(n_layer=3)
    model = TFGPT2Model(config)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    # Re-encode the same text as a dict of tf tensors for predict().
    inputs = tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='tf')
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                                     predictions, self.model_files))
Example 12: load
# Required module: from transformers import BertTokenizer [as alias]
# Or: from transformers.BertTokenizer import from_pretrained [as alias]
def load(self, path):
    """Load a saved model and vectorizer.

    :param path: Path of the directory where the model was saved.
    """
    self.tokenizer = BertTokenizer.from_pretrained(self.pretrained_model)
    vectorizer_values = torch.load(path + '/vectorizer.pt')
    self.vectorizer = Vectorizer(device=self.device)
    self.vectorizer.load_values(vectorizer_values)
    model_class = BertCrfForTokenClassification if self.using_crf else BertForTokenClassification
    self.model = model_class.from_pretrained(
        path,
        num_labels=len(self.vectorizer.tag_to_index) - 1  # ignore 'X'
    )
    self.model = self.model.to(self.device)
Example 13: __init__
# Required module: from transformers import BertTokenizer [as alias]
# Or: from transformers.BertTokenizer import from_pretrained [as alias]
def __init__(
    self,
    class_size=None,
    pretrained_model="gpt2-medium",
    classifier_head=None,
    cached_mode=False,
    device='cpu'
):
    super(Discriminator, self).__init__()
    if pretrained_model.startswith("gpt2"):
        self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
        self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
        self.embed_size = self.encoder.transformer.config.hidden_size
    elif pretrained_model.startswith("bert"):
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
        self.encoder = BertModel.from_pretrained(pretrained_model)
        self.embed_size = self.encoder.config.hidden_size
    else:
        raise ValueError(
            "{} model not yet supported".format(pretrained_model)
        )
    if classifier_head:
        self.classifier_head = classifier_head
    else:
        if not class_size:
            raise ValueError("must specify class_size")
        self.classifier_head = ClassificationHead(
            class_size=class_size,
            embed_size=self.embed_size
        )
    self.cached_mode = cached_mode
    self.device = device
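A hedged instantiation sketch (class_size=2 is an illustrative value; the class name Discriminator comes from the super() call above):

disc = Discriminator(class_size=2, pretrained_model="bert-base-uncased")
# For a name starting with "bert", self.tokenizer is a BertTokenizer
# and self.encoder a BertModel.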
Example 14: __init__
# Required module: from transformers import BertTokenizer [as alias]
# Or: from transformers.BertTokenizer import from_pretrained [as alias]
def __init__(self, bert_model, lower, max_src_tokens, max_tgt_tokens):
    self.max_src_tokens = max_src_tokens
    self.max_tgt_tokens = max_tgt_tokens
    self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=lower)
    self.sep_token = '[SEP]'
    self.cls_token = '[CLS]'
    self.pad_token = '[PAD]'
    self.tgt_bos = '[unused1] '
    self.tgt_eos = ' [unused2]'
    self.tgt_sent_split = ' [unused3] '
    self.sep_vid = self.tokenizer.vocab[self.sep_token]
    self.cls_vid = self.tokenizer.vocab[self.cls_token]
    self.pad_vid = self.tokenizer.vocab[self.pad_token]
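The id lookups at the end rely on the tokenizer's vocab mapping; the same pattern in isolation (the ids shown are the standard ones for bert-base-uncased):

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(tokenizer.vocab['[CLS]'], tokenizer.vocab['[SEP]'], tokenizer.vocab['[PAD]'])  # 101 102 0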
Example 15: __init__
# Required module: from transformers import BertTokenizer [as alias]
# Or: from transformers.BertTokenizer import from_pretrained [as alias]
def __init__(self, pretrain_path, max_length, cat_entity_rep=False):
    nn.Module.__init__(self)
    self.bert = BertModel.from_pretrained(pretrain_path)
    self.max_length = max_length
    # Note: the tokenizer is hard-coded to 'bert-base-uncased', while the
    # encoder weights come from pretrain_path.
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.cat_entity_rep = cat_entity_rep