This article collects and summarizes typical usage examples of the Python method pytorch_pretrained_bert.GPT2Tokenizer.from_pretrained. If you have been wondering what exactly GPT2Tokenizer.from_pretrained does and how to use it in Python, the curated code examples below may help. You can also read further about the class this method belongs to, pytorch_pretrained_bert.GPT2Tokenizer.
The following presents 4 code examples of GPT2Tokenizer.from_pretrained, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
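Before the individual examples, here is a minimal sketch of the basic call pattern, assuming the pytorch_pretrained_bert package is installed and the 'gpt2' vocabulary files can be downloaded or are already cached:

from pytorch_pretrained_bert import GPT2Tokenizer

# Download (or load from the local cache) the GPT-2 BPE vocabulary and merges files.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode text into BPE token ids and decode the ids back to text.
token_ids = tokenizer.encode("Hello, world!")
print(token_ids)                    # a short list of integers
print(tokenizer.decode(token_ids))  # "Hello, world!"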
Example 1: transform
# Required import: from pytorch_pretrained_bert import GPT2Tokenizer [as alias]
# Or: from pytorch_pretrained_bert.GPT2Tokenizer import from_pretrained [as alias]
def transform(self, X):
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    # Load pre-trained model (weights)
    model = GPT2Model.from_pretrained('gpt2', cache_dir='tmp/gpt2/')
    model.eval()

    output = []
    for idx, row in tqdm(X.iterrows(), total=len(X)):
        # Encode some inputs
        indexed_tokens_1 = tokenizer.encode(row.text)

        # If you have a GPU, put everything on cuda
        # Convert inputs to PyTorch tensors
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_1 = tokens_tensor_1.to('cuda')
        model.to('cuda')

        # Predict hidden states features for each layer
        with torch.no_grad():
            hidden_states_1, past = model(tokens_tensor_1)

        tokens = [tokenizer.decoder[token].replace('Ġ', '') for token in indexed_tokens_1]
        output.append([tokens, hidden_states_1.cpu()[0]])

    output = pd.DataFrame(output, columns=['tokens', 'layer_-1'])

    res = []
    for idx, row in X.iterrows():
        res.append(self.get_sample_props(output.loc[idx], **row)[1:])

    res = pd.DataFrame(res, columns=['tokens', 'pronoun_offset_token',
                                     'a_offset_token', 'b_offset_token', 'a_span',
                                     'b_span', 'pronoun_token', 'a_tokens', 'b_tokens',
                                     'bert', 'cls'])

    cols = set(X.columns).difference(res.columns)
    return {'X': pd.concat([X[cols], res], axis=1)}
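Note that this example moves the model and tensors to 'cuda' unconditionally, so it fails on CPU-only machines. Stripped of the DataFrame plumbing, a device-agnostic sketch of the same hidden-state extraction could look as follows; the CPU fallback and the sample sentence are additions for illustration, not part of the original example:

import torch
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
model.eval()

# Fall back to CPU when no GPU is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

indexed_tokens = tokenizer.encode("She gave the book to her sister.")
tokens_tensor = torch.tensor([indexed_tokens]).to(device)

with torch.no_grad():
    # GPT2Model returns the last layer's hidden states and the cached key/value "presents".
    hidden_states, past = model(tokens_tensor)

print(hidden_states.shape)  # (1, sequence_length, 768) for the base 'gpt2' model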
Example 2: __init__
# Required import: from pytorch_pretrained_bert import GPT2Tokenizer [as alias]
# Or: from pytorch_pretrained_bert.GPT2Tokenizer import from_pretrained [as alias]
def __init__(self, cuda_device=-1):
    super(GPT2Embedder, self).__init__()
    self.cuda_device = 'cpu' if cuda_device == -1 else f'cuda:{cuda_device}'

    # Load pre-trained model tokenizer (vocabulary)
    self.enc = GPT2Tokenizer.from_pretrained('gpt2')
    # Load pre-trained model (weights)
    self.model = GPT2Model.from_pretrained('gpt2')
    self.model.to(self.cuda_device)
    self.model.eval()  # we only use the evaluation mode of the pretrained model

    self._bos_id = self.enc.encoder['<|endoftext|>']
    self._bos_past = None
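The example shows only the constructor; the cached _bos_id and _bos_past suggest the embedder prefixes every input with the <|endoftext|> token and reuses its cached state. Purely as an illustration, a hypothetical embed helper built on the model's past argument might look like the sketch below; the method name and return value are assumptions, not part of the original class:

def embed(self, text):
    # Hypothetical helper, not part of the original example.
    # Compute and cache the key/value state of the <|endoftext|> prefix once,
    # then feed only the new tokens together with that cached `past`.
    if self._bos_past is None:
        bos_tensor = torch.tensor([[self._bos_id]], device=self.cuda_device)
        with torch.no_grad():
            _, self._bos_past = self.model(bos_tensor)
    token_ids = self.enc.encode(text)
    tokens_tensor = torch.tensor([token_ids], device=self.cuda_device)
    with torch.no_grad():
        hidden_states, _ = self.model(tokens_tensor, past=self._bos_past)
    return hidden_states[0]  # (sequence_length, hidden_size)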
Example 3: tokenizeGpt2
# Required import: from pytorch_pretrained_bert import GPT2Tokenizer [as alias]
# Or: from pytorch_pretrained_bert.GPT2Tokenizer import from_pretrained [as alias]
def tokenizeGpt2(extraction_file_paths, args, min_length=20):
    """Tokenize text using GPT-2's pretrained BPE encoder.

    Saves as compressed npz files that can be loaded using
    `with np.load('filename.npz') as a: a['arr_0']`.
    Omit files smaller than min_length tokens, which are likely low quality.
    """
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    EOT = tokenizer.encoder['<|endoftext|>']
    omitted_files = 0
    combined = []

    p = current_process()
    index = p._identity[0] if p._identity else 0
    bar = tqdm.tqdm(extraction_file_paths, position=index, desc=f'proc {index}')
    for extraction_file_path in bar:
        _, filename = os.path.split(extraction_file_path)
        text_file = os.path.join(
            args.output_dir, filename.replace('.txt', '.tokenized.npz'))

        with io.open(extraction_file_path, 'r', encoding='utf-8') as fi:
            # Suppress warnings about length.
            with open(os.devnull, "w") as f, contextlib.redirect_stderr(f):
                # Safe to concat by adding EOT.
                out = tokenizer.encode(fi.read()) + [EOT]
            if len(out) < min_length:
                omitted_files += 1
                continue
            combined += out
            if len(combined) > args.combine:
                np.savez_compressed(text_file, combined)
                combined = []

    # Save the rest.
    if combined:
        np.savez_compressed(text_file, combined)
    return omitted_files, bar.total
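As the docstring notes, each saved shard can be read back with numpy. A minimal loading sketch, with a placeholder filename:

import numpy as np

# Read one shard back; the filename here is illustrative.
with np.load('some_article.tokenized.npz') as a:
    token_ids = a['arr_0']
print(token_ids.shape, token_ids.dtype)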
Example 4: run_model
# Required import: from pytorch_pretrained_bert import GPT2Tokenizer [as alias]
# Or: from pytorch_pretrained_bert.GPT2Tokenizer import from_pretrained [as alias]
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='gpt2',
                        help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')
    args = parser.parse_args()
    print(args)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
    model.to(device)
    model.eval()

    if args.length == -1:
        args.length = model.config.n_ctx // 2
    elif args.length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)

    while not args.unconditional:
        if not args.unconditional:
            raw_text = input("Model prompt >>> ")
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input("Model prompt >>> ")
            context_tokens = enc.encode(raw_text)
        generated = 0
        for _ in range(args.nsamples // args.batch_size):
            out = sample_sequence(
                model=model, length=args.length,
                context=context_tokens if not args.unconditional else None,
                start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None,
                batch_size=args.batch_size,
                temperature=args.temperature, top_k=args.top_k, device=device
            )
            out = out[:, len(context_tokens):].tolist()
            for i in range(args.batch_size):
                generated += 1
                text = enc.decode(out[i])
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                print(text)
        print("=" * 80)