This article collects typical usage examples of the Python method transformers.GPT2Tokenizer.from_pretrained. If you have been wondering what GPT2Tokenizer.from_pretrained does, how to call it, or what real-world usage looks like, the curated code samples below may help. You can also explore further usage examples of the class this method belongs to, transformers.GPT2Tokenizer.
Seven code examples of GPT2Tokenizer.from_pretrained are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
Example 1: __init__
# Required import: from transformers import GPT2Tokenizer [as alias]
# Or: from transformers.GPT2Tokenizer import from_pretrained [as alias]
def __init__(
    self,
    pretrained_model=None,
    vocab_file=None,
    merges_file=None,
    errors='replace',
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
):
    if pretrained_model:
        self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
        self.vocab_size = self.tokenizer.vocab_size
        special_tokens_dict = {}
        if self.tokenizer.unk_token is None:
            self.tokenizer.unk_token = "<|unk|>"
            special_tokens_dict["unk_token"] = "<|unk|>"
        if self.tokenizer.bos_token is None:
            special_tokens_dict["bos_token"] = bos_token
        if self.tokenizer.eos_token is None:
            special_tokens_dict["eos_token"] = eos_token
        if self.tokenizer.pad_token is None:
            special_tokens_dict["pad_token"] = "<|pad|>"
        self.tokenizer.add_special_tokens(special_tokens_dict)
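A minimal usage sketch of the wrapper above. The enclosing class name is not shown in the snippet, so GPT2TokenizerWrapper below is an assumed placeholder; the key point is that because new special tokens such as "<|pad|>" may be added, any model sharing this tokenizer needs its embeddings resized.

# Hypothetical usage; the wrapper class name is assumed, not shown in the snippet.
wrapper = GPT2TokenizerWrapper(pretrained_model="gpt2")
ids = wrapper.tokenizer.encode("Hello world")
# Tokens like "<|pad|>" (and possibly "<|unk|>") were just added, so a model
# sharing this tokenizer must resize its embedding matrix, e.g.:
# model.resize_token_embeddings(len(wrapper.tokenizer))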
Example 2: get_tokenizer
# Required import: from transformers import GPT2Tokenizer [as alias]
# Or: from transformers.GPT2Tokenizer import from_pretrained [as alias]
def get_tokenizer(self, opt):
    """
    Instantiate tokenizer.
    """
    model_sz = opt['gpt2_size']
    fle_key = 'gpt2' if model_sz == 'small' else f'gpt2-{model_sz}'
    return GPT2Tokenizer.from_pretrained(fle_key)
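The special-casing exists because the smallest GPT-2 checkpoint is published under the bare name 'gpt2', while the larger ones carry a size suffix. A sketch of the resulting keys, assuming the standard Hugging Face checkpoint names:

# 'small'  -> 'gpt2'
# 'medium' -> 'gpt2-medium'
# 'large'  -> 'gpt2-large'
# 'xl'     -> 'gpt2-xl'
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')  # e.g. opt['gpt2_size'] == 'medium'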
Example 3: __init__
# Required import: from transformers import GPT2Tokenizer [as alias]
# Or: from transformers.GPT2Tokenizer import from_pretrained [as alias]
# (The snippet also needs GPT2LMHeadModel from transformers and the project's ClassificationHead.)
def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"):
    super().__init__()
    self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
    self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
    self.embed_size = self.encoder.transformer.config.hidden_size
    self.classifier_head = ClassificationHead(class_size=class_size, embed_size=self.embed_size)
    self.cached_mode = cached_mode
    self.device = device
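The forward pass is not shown in this snippet. A hedged sketch of how such a discriminator is typically driven, following the common PPLM-style pattern of mean-pooling the encoder's final hidden states before the classification head (the helper below is hypothetical, written against the attributes defined above):

# Hypothetical helper, assuming a discriminator instance `disc` built as above.
import torch

def classify(disc, input_ids):
    with torch.no_grad():
        # transformer(...)[0] is the final hidden state, shape (batch, seq, hidden)
        hidden = disc.encoder.transformer(input_ids.to(disc.device))[0]
    avg_hidden = hidden.mean(dim=1)           # mean-pool over the sequence
    return disc.classifier_head(avg_hidden)   # (batch, class_size) logits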
Example 4: call
# Required import: from transformers import GPT2Tokenizer [as alias]
# Or: from transformers.GPT2Tokenizer import from_pretrained [as alias]
def call(self, inputs, **kwargs):
    r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the last layer of the model.
        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past
            given to this model should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
            Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import GPT2Tokenizer, TFGPT2Model

        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = TFGPT2Model.from_pretrained('gpt2')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """
    outputs = self.transformer(inputs, **kwargs)
    return outputs
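The `past` output documented above enables incremental decoding: feed the cache back in and pass only the newly generated token. A minimal sketch, assuming the tuple-return API and `past` keyword of the transformers version this docstring was written for (newer versions rename the argument to past_key_values):

import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2Model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2Model.from_pretrained('gpt2')

input_ids = tf.constant([tokenizer.encode("Hello, my dog")])
hidden, past = model(input_ids)[:2]           # full prompt; the cache comes back as `past`
next_id = tf.constant([[tokenizer.encode(" is")[0]]])
hidden, past = model(next_id, past=past)[:2]  # only the new token is passed in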
Example 5: __init__
# Required import: from transformers import GPT2Tokenizer [as alias]
# Or: from transformers.GPT2Tokenizer import from_pretrained [as alias]
# (The snippet also needs GPT2LMHeadModel, BertTokenizer, and BertModel from transformers, plus the project's ClassificationHead.)
def __init__(
    self,
    class_size=None,
    pretrained_model="gpt2-medium",
    classifier_head=None,
    cached_mode=False,
    device='cpu'
):
    super(Discriminator, self).__init__()
    if pretrained_model.startswith("gpt2"):
        self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
        self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
        self.embed_size = self.encoder.transformer.config.hidden_size
    elif pretrained_model.startswith("bert"):
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
        self.encoder = BertModel.from_pretrained(pretrained_model)
        self.embed_size = self.encoder.config.hidden_size
    else:
        raise ValueError(
            "{} model not yet supported".format(pretrained_model)
        )
    if classifier_head:
        self.classifier_head = classifier_head
    else:
        if not class_size:
            raise ValueError("must specify class_size")
        self.classifier_head = ClassificationHead(
            class_size=class_size,
            embed_size=self.embed_size
        )
    self.cached_mode = cached_mode
    self.device = device
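Both branches are exercised the same way; the only difference is which encoder family backs the hidden states. A short sketch, assuming the imports listed in the header above:

disc_gpt2 = Discriminator(class_size=2, pretrained_model="gpt2-medium")
disc_bert = Discriminator(class_size=2, pretrained_model="bert-base-uncased")
print(disc_gpt2.embed_size, disc_bert.embed_size)  # 1024 and 768 for these checkpoints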
Example 6: main
# Required import: from transformers import GPT2Tokenizer [as alias]
# Or: from transformers.GPT2Tokenizer import from_pretrained [as alias]
# (The snippet also needs: argparse, pickle, random, time, numpy as np, a configured
# `logger`, and BertTokenizer / RobertaTokenizer from transformers.)
def main():
    parser = argparse.ArgumentParser(
        description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids)."
    )
    parser.add_argument("--file_path", type=str, default="data/dump.txt", help="The path to the data.")
    parser.add_argument("--tokenizer_type", type=str, default="bert", choices=["bert", "roberta", "gpt2"])
    parser.add_argument("--tokenizer_name", type=str, default="bert-base-uncased", help="The tokenizer to use.")
    parser.add_argument("--dump_file", type=str, default="data/dump", help="The dump file prefix.")
    args = parser.parse_args()

    logger.info(f"Loading Tokenizer ({args.tokenizer_name})")
    if args.tokenizer_type == "bert":
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map["cls_token"]  # `[CLS]`
        sep = tokenizer.special_tokens_map["sep_token"]  # `[SEP]`
    elif args.tokenizer_type == "roberta":
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map["cls_token"]  # `<s>`
        sep = tokenizer.special_tokens_map["sep_token"]  # `</s>`
    elif args.tokenizer_type == "gpt2":
        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map["bos_token"]  # `<|endoftext|>`
        sep = tokenizer.special_tokens_map["eos_token"]  # `<|endoftext|>`

    logger.info(f"Loading text from {args.file_path}")
    with open(args.file_path, "r", encoding="utf8") as fp:
        data = fp.readlines()

    logger.info("Start encoding")
    logger.info(f"{len(data)} examples to process.")

    rslt = []
    iter = 0
    interval = 10000
    start = time.time()
    for text in data:
        text = f"{bos} {text.strip()} {sep}"
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        rslt.append(token_ids)

        iter += 1
        if iter % interval == 0:
            end = time.time()
            logger.info(f"{iter} examples processed. - {(end - start):.2f}s/{interval}expl")
            start = time.time()

    logger.info("Finished binarization")
    logger.info(f"{len(data)} examples processed.")

    dp_file = f"{args.dump_file}.{args.tokenizer_name}.pickle"
    vocab_size = tokenizer.vocab_size
    if vocab_size < (1 << 16):
        rslt_ = [np.uint16(d) for d in rslt]
    else:
        rslt_ = [np.int32(d) for d in rslt]
    random.shuffle(rslt_)
    logger.info(f"Dump to {dp_file}")
    with open(dp_file, "wb") as handle:
        pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)
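To consume the dump later, unpickle it and work with the per-example arrays directly. A read-back sketch; the file name below assumes the argument defaults shown above:

# Read-back sketch; the path assumes --dump_file and --tokenizer_name defaults.
import pickle
import numpy as np

with open("data/dump.bert-base-uncased.pickle", "rb") as handle:
    sequences = pickle.load(handle)  # list of per-example token-id arrays
lengths = np.array([len(s) for s in sequences])
print(f"{len(sequences)} sequences, mean length {lengths.mean():.1f} tokens")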
Example 7: main
# Required import: from transformers import GPT2Tokenizer [as alias]
# Or: from transformers.GPT2Tokenizer import from_pretrained [as alias]
# (The snippet also needs: argparse, pickle, random, time, numpy as np, a configured
# `logger`, BertTokenizer / RobertaTokenizer from transformers, and a project-local
# KoBertTokenizer.)
def main():
    parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
    parser.add_argument('--file_path', type=str, default='data/dump.txt',
                        help='The path to the data.')
    parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2', 'kobert'])
    parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
                        help="The tokenizer to use.")
    parser.add_argument('--dump_file', type=str, default='data/dump',
                        help='The dump file prefix.')
    args = parser.parse_args()

    logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
    if args.tokenizer_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['cls_token']  # `[CLS]`
        sep = tokenizer.special_tokens_map['sep_token']  # `[SEP]`
    elif args.tokenizer_type == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['cls_token']  # `<s>`
        sep = tokenizer.special_tokens_map['sep_token']  # `</s>`
    elif args.tokenizer_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['bos_token']  # `<|endoftext|>`
        sep = tokenizer.special_tokens_map['eos_token']  # `<|endoftext|>`
    elif args.tokenizer_type == 'kobert':
        tokenizer = KoBertTokenizer.from_pretrained('kobert')
        bos = tokenizer.special_tokens_map['cls_token']
        sep = tokenizer.special_tokens_map['sep_token']

    logger.info(f'Loading text from {args.file_path}')
    with open(args.file_path, 'r', encoding='utf8') as fp:
        data = fp.readlines()

    logger.info('Start encoding')
    logger.info(f'{len(data)} examples to process.')

    rslt = []
    iter = 0
    interval = 10000
    start = time.time()
    for text in data:
        text = f'{bos} {text.strip()} {sep}'
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        rslt.append(token_ids)

        iter += 1
        if iter % interval == 0:
            end = time.time()
            logger.info(f'{iter} examples processed. - {(end - start) / interval:.2f}s/expl')
            start = time.time()

    logger.info('Finished binarization')
    logger.info(f'{len(data)} examples processed.')

    dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle'
    rslt_ = [np.uint16(d) for d in rslt]
    random.shuffle(rslt_)
    logger.info(f'Dump to {dp_file}')
    with open(dp_file, 'wb') as handle:
        pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)
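One caveat relative to Example 6: this variant always casts to np.uint16 and drops the vocab-size check, so token ids at or above 65536 would silently overflow. If reusing it, the guard from Example 6 is worth keeping; a sketch using the same variables as above:

# Defensive rewrite of the final cast, mirroring Example 6's vocab-size guard.
dtype = np.uint16 if tokenizer.vocab_size < (1 << 16) else np.int32
rslt_ = [np.array(d, dtype=dtype) for d in rslt]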