本文整理汇总了Python中bert.tokenization.FullTokenizer方法的典型用法代码示例。如果您正苦于以下问题:Python tokenization.FullTokenizer方法的具体用法?Python tokenization.FullTokenizer怎么用?Python tokenization.FullTokenizer使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块bert.tokenization
的用法示例。
在下文中一共展示了tokenization.FullTokenizer方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import FullTokenizer [as 别名]
def __init__(self, id, args, worker_address, sink_address):
    """Worker process hosting a BERT encoder Estimator pinned to one GPU.

    Args:
      id: Integer worker index; doubles as the CUDA device index.
      args: Parsed CLI namespace (model_dir, max_seq_len, pooling options,
        gpu_memory_fraction).
      worker_address: Address this worker receives jobs from.
      sink_address: Address results are pushed to.
    """
    super().__init__()
    self.worker_id = id
    self.daemon = True
    self.worker_address = worker_address
    self.sink_address = sink_address
    self.exit_flag = multiprocessing.Event()
    self.logger = set_logger('WORKER-%d' % self.worker_id)

    # All checkpoint artifacts are resolved relative to the model directory.
    self.model_dir = args.model_dir
    self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
    self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
    self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
    self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
    self.max_seq_len = args.max_seq_len

    self.model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(self.config_fp),
        init_checkpoint=self.checkpoint_fp,
        pooling_strategy=args.pooling_strategy,
        pooling_layer=args.pooling_layer)

    # Pin this worker to its own GPU and cap/grow memory as configured.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction
    self.estimator = Estimator(self.model_fn, config=RunConfig(session_config=sess_config))
示例2: test_full_tokenizer
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import FullTokenizer [as 别名]
def test_full_tokenizer(self):
    """End-to-end check of FullTokenizer: WordPiece split plus id lookup."""
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
        "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        # NamedTemporaryFile opens in binary mode by default, so the vocab
        # text must be encoded; writing a str here raises TypeError on
        # Python 3 (the original code did exactly that).
        vocab_writer.write("".join(
            [x + "\n" for x in vocab_tokens]).encode("utf-8"))
        vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
    self.assertAllEqual(
        tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
示例3: main
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import FullTokenizer [as 别名]
def main():
    """Tokenize the arXiv PeerRead corpus and serialize it to one tf_record."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--review-json-dir', type=str, default='../dat/PeerRead/arxiv.all/all/reviews')
    arg_parser.add_argument('--parsedpdf-json-dir', type=str, default='../dat/PeerRead/arxiv.all/all/parsed_pdfs')
    arg_parser.add_argument('--out-dir', type=str, default='../dat/PeerRead/proc')
    arg_parser.add_argument('--out-file', type=str, default='arxiv-all.tf_record')
    arg_parser.add_argument('--vocab-file', type=str, default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
    arg_parser.add_argument('--max-abs-len', type=int, default=250)
    arg_parser.add_argument('--venue', type=int, default=0)
    arg_parser.add_argument('--year', type=int, default=2017)
    opts = arg_parser.parse_args()

    # Lower-casing matches the uncased pre-trained BERT vocabulary above.
    bert_tokenizer = tokenization.FullTokenizer(
        vocab_file=opts.vocab_file, do_lower_case=True)
    clean_PeerRead_dataset(
        opts.review_json_dir, opts.parsedpdf_json_dir,
        opts.venue, opts.year,
        opts.out_dir, opts.out_file,
        opts.max_abs_len, bert_tokenizer, is_arxiv=True)
示例4: main
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import FullTokenizer [as 别名]
def main():
    """Tokenize the reddit dump and write it as a tf_record file."""
    def _str2bool(v):
        # argparse's `type=bool` treats ANY non-empty string (including
        # "False") as True, so `--use-latest-reddit False` silently parsed
        # as True. Parse common boolean spellings explicitly instead.
        if isinstance(v, bool):
            return v
        if v.lower() in ('yes', 'true', 't', '1'):
            return True
        if v.lower() in ('no', 'false', 'f', '0'):
            return False
        raise argparse.ArgumentTypeError('Boolean value expected.')

    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', type=str, default=None)
    parser.add_argument('--out-dir', type=str, default='../dat/reddit')
    parser.add_argument('--out-file', type=str, default='proc.tf_record')
    parser.add_argument('--vocab-file', type=str, default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
    parser.add_argument('--max-abs-len', type=int, default=128)
    parser.add_argument('--subsample', type=int, default=0)
    parser.add_argument('--use-latest-reddit', type=_str2bool, default=True)
    args = parser.parse_args()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)
    process_reddit_dataset(args.data_dir, args.out_dir, args.out_file,
                           args.max_abs_len, tokenizer, args.subsample, args.use_latest_reddit)
示例5: __init__
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import FullTokenizer [as 别名]
def __init__(self, model_fname="/notebooks/embedding/data/sentence-embeddings/bert/tune-ckpt",
             bertconfig_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/bert_config.json",
             vocab_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/vocab.txt",
             max_seq_length=32, dimension=768, num_labels=2, use_notebook=False):
    """Load a fine-tuned BERT sentence-embedding model for inference."""
    super().__init__("bert", dimension, use_notebook)

    bert_config = BertConfig.from_json_file(bertconfig_fname)
    self.max_seq_length = max_seq_length
    # Multilingual cased checkpoint: casing must be preserved.
    self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)

    # Build the inference graph; keep-prob 1.0 disables dropout, tune=False
    # builds it for inference only.
    graph_outputs = make_bert_graph(bert_config,
                                    max_seq_length,
                                    1.0,
                                    num_labels,
                                    tune=False)
    self.model, self.input_ids, self.input_mask, self.segment_ids, self.probs = graph_outputs

    # Restore the most recent fine-tuned checkpoint into a fresh session.
    saver = tf.train.Saver(tf.global_variables())
    self.sess = tf.Session()
    saver.restore(self.sess, tf.train.latest_checkpoint(model_fname))
示例6: get_bert
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import FullTokenizer [as 别名]
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    """Load the (model, tokenizer, config) triple for a given BERT variant.

    Args:
      BERT_PT_PATH: Directory holding config/vocab/checkpoint files.
      bert_type: Suffix identifying the variant, e.g. artifacts are named
        bert_config_<bert_type>.json and so on.
      do_lower_case: Whether the tokenizer lower-cases input.
      no_pretraining: If True, skip loading the pre-trained state dict.
    """
    bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if not no_pretraining:
        # Checkpoint is loaded onto CPU first; .to(device) moves it after.
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
示例7: get_bert
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import FullTokenizer [as 别名]
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    """Build the BERT encoder, tokenizer and config for `bert_type`."""
    # All artifacts in BERT_PT_PATH follow the `<name>_<bert_type>` convention.
    bert_config = BertConfig.from_json_file(
        os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json'))
    tokenizer = tokenization.FullTokenizer(
        vocab_file=os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt'),
        do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if not no_pretraining:
        state_dict = torch.load(
            os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin'),
            map_location='cpu')
        model_bert.load_state_dict(state_dict)
        print("Load pre-trained parameters.")
    model_bert.to(device)
    return model_bert, tokenizer, bert_config
示例8: __init__
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import FullTokenizer [as 别名]
def __init__(self, label_map, vocab_file,
             max_seq_length, do_lower_case,
             converter):
    """Initializes an instance of BertExampleBuilder.

    Args:
      label_map: Mapping from tags to tag IDs.
      vocab_file: Path to BERT vocabulary file.
      max_seq_length: Maximum sequence length.
      do_lower_case: Whether to lower case the input text. Should be True
        for uncased models and False for cased models.
      converter: Converter from text targets to tags.
    """
    self._label_map = label_map
    self._max_seq_length = max_seq_length
    self._converter = converter
    self._tokenizer = tokenization.FullTokenizer(
        vocab_file, do_lower_case=do_lower_case)
    # Cache frequently used ids up front.
    self._pad_id = self._get_pad_id()
    self._keep_tag_id = self._label_map['KEEP']
示例9: test_full_tokenizer
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import FullTokenizer [as 别名]
def test_full_tokenizer(self):
    """Round-trips a tiny vocabulary through FullTokenizer."""
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
        "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        contents = "".join([x + "\n" for x in vocab_tokens])
        # Python 2 file objects accept str directly; Python 3 opens the
        # tempfile in binary mode and needs explicit encoding.
        if six.PY2:
            vocab_writer.write(contents)
        else:
            vocab_writer.write(contents.encode("utf-8"))
        vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
    self.assertAllEqual(
        tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
示例10: create_tokenizer_from_hub_module
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import FullTokenizer [as 别名]
def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    # Run the module's tokenization_info signature in a throwaway graph so
    # the exported vocab path and casing flag can be fetched as constants.
    with tf.Graph().as_default():
        bert_module = hub.Module(BERT_MODEL_HUB)
        info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [info["vocab_file"], info["do_lower_case"]])
    return FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
示例11: __init__
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import FullTokenizer [as 别名]
def __init__(self, vocab_file=None, **kwargs):
    """Wrap bert-tensorflow's FullTokenizer behind this tokenizer interface.

    Raises:
      ValueError: If no vocab file was given, or if bert-tensorflow is not
        installed.
    """
    super().__init__()
    if vocab_file is None:
        raise ValueError(
            'Vocabulary file is required to initialize BERT tokenizer'
        )
    # Import lazily so the dependency is only required when this tokenizer
    # is actually constructed.
    try:
        from bert.tokenization import FullTokenizer
    except ImportError:
        raise ValueError(
            "Please install bert-tensorflow: pip install bert-tensorflow"
        )
    self.tokenizer = FullTokenizer(vocab_file)
示例12: main
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import FullTokenizer [as 别名]
def main(_):
    """Build masked-LM pre-training instances and write tf_record shards."""
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # Expand the comma-separated glob patterns into concrete input shards.
    input_files = []
    for pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(pattern))

    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info(" %s", input_file)

    rng = random.Random(FLAGS.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
        FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
        rng)

    output_files = FLAGS.output_file.split(",")
    tf.logging.info("*** Writing to output files ***")
    for output_file in output_files:
        tf.logging.info(" %s", output_file)

    write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                    FLAGS.max_predictions_per_seq, output_files)
示例13: buzzy_title_based_sim_dfs
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import FullTokenizer [as 别名]
def buzzy_title_based_sim_dfs(treat_strength, con_strength, noise_level, setting="simple", seed=0,
                              base_output_dir='../dat/sim/peerread_buzzytitle_based/',
                              data_file='../dat/reddit/proc.tf_record',
                              vocab_file='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt'):
    """Simulate buzzy-title-based outcomes over a dataset and dump to TSV.

    Args:
      treat_strength: Treatment-effect strength passed to the labeler.
      con_strength: Confounding strength passed to the labeler.
      noise_level: Outcome-noise level passed to the labeler.
      setting: Simulation mode; used by the labeler and in the output dir name.
      seed: Random seed for the labeler and dataset shuffling.
      base_output_dir: Directory under which per-mode output is written.
      data_file: Processed tf_record dataset. In the original code this was
        an undefined free variable (the path existed only in a comment),
        which raised NameError on every call; it is now a parameter with
        that path as default — TODO confirm the intended default is the
        reddit record rather than a PeerRead one.
      vocab_file: BERT vocabulary file used to build the tokenizer. Same
        fix: formerly an undefined free variable.
    """
    labeler = make_buzzy_based_simulated_labeler(treat_strength, con_strength, noise_level,
                                                 setting=setting, seed=seed)
    num_splits = 10
    dev_splits = [0]
    test_splits = [0]

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    input_dataset_from_filenames = make_input_fn_from_file(data_file,
                                                           250,
                                                           num_splits,
                                                           dev_splits,
                                                           test_splits,
                                                           tokenizer,
                                                           is_training=False,
                                                           filter_test=False,
                                                           shuffle_buffer_size=25000,
                                                           seed=seed,
                                                           labeler=labeler)
    output_df = dataset_fn_to_df(input_dataset_from_filenames)
    output_df = output_df.rename(index=str, columns={'theorem_referenced': 'treatment'})

    output_dir = os.path.join(base_output_dir, "mode{}".format(setting))
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(
        output_dir, "beta0{}.beta1{}.gamma{}.tsv".format(treat_strength, con_strength, noise_level))
    output_df.to_csv(output_path, '\t')
示例14: main
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import FullTokenizer [as 别名]
def main():
    """Tokenize every configured PeerRead sub-dataset into tf_record files."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--datasets-dir', type=str, default='../dat/PeerRead')
    parser.add_argument('--vocab-file', type=str, default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
    args = parser.parse_args()
    datasets_dir = args.datasets_dir

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    def proc_dataset(dataset):
        # Each dataset lives under <datasets_dir>/<path>/all with reviews
        # and parsed_pdfs side by side.
        all_dir = os.path.join(datasets_dir, dataset_paths[dataset], 'all')
        review_json_dir = os.path.join(all_dir, 'reviews')
        parsedpdf_json_dir = os.path.join(all_dir, 'parsed_pdfs')
        clean_PeerRead_dataset(review_json_dir, parsedpdf_json_dir,
                               dataset_venues[dataset], dataset_years[dataset],
                               os.path.join(datasets_dir, 'proc'),
                               dataset + '.tf_record',
                               250,  # max abstract length in tokens
                               tokenizer)

    # Processed sequentially; a 4-worker mp.Pool was tried and left disabled.
    for dataset in dataset_names:
        proc_dataset(dataset)
示例15: main
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import FullTokenizer [as 别名]
def main():
    """Smoke test: tokenize the abstract of the first review JSON found."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--review-json-dir', type=str, default=None)
    parser.add_argument('--vocab-file', type=str, default=None)
    args = parser.parse_args()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    review_json_dir = args.review_json_dir
    print('Reading reviews from...', review_json_dir)
    paper_json_filenames = sorted(glob.glob('{}/*.json'.format(review_json_dir)))

    # Only the first paper is inspected; this is a sanity check rather than
    # a full conversion pass over every file.
    with io.open(paper_json_filenames[0]) as json_file:
        loaded = json.load(json_file)
    abstract = loaded['abstract']
    print(abstract)

    tokens = tokenizer.tokenize(abstract)
    print(tokens)
    print(tokenizer.convert_tokens_to_ids(tokens))