本文整理汇总了Python中tokenization.convert_to_unicode方法的典型用法代码示例。如果您正苦于以下问题:Python tokenization.convert_to_unicode方法的具体用法?Python tokenization.convert_to_unicode怎么用?Python tokenization.convert_to_unicode使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块tokenization的用法示例。
在下文中一共展示了tokenization.convert_to_unicode方法的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _create_examples
# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def _create_examples(self, lines, set_type):
    """Build `InputExample` objects from already-parsed rows.

    The first row of `lines` is skipped. For every remaining row, column 0
    is formatted into the guid together with `set_type`, columns 8 and 9
    become `text_a` and `text_b`, and the last column becomes the label.
    When `set_type` is "test" the label is the fixed string "contradiction"
    and the last column is not read.

    Args:
        lines: Iterable of indexable rows.
        set_type: Prefix for the guid; the value "test" selects the
            placeholder label.

    Returns:
        A list of `InputExample` objects, one per non-skipped row.
    """
    is_test = set_type == "test"
    built = []
    for row_index, row in enumerate(lines):
        # Row 0 is skipped, exactly as in the original loop.
        if row_index == 0:
            continue
        row_id = tokenization.convert_to_unicode(row[0])
        first_text = tokenization.convert_to_unicode(row[8])
        second_text = tokenization.convert_to_unicode(row[9])
        label = (
            "contradiction"
            if is_test
            else tokenization.convert_to_unicode(row[-1])
        )
        built.append(
            InputExample(
                guid="%s-%s" % (set_type, row_id),
                text_a=first_text,
                text_b=second_text,
                label=label))
    return built
示例2: get_train_examples
# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def get_train_examples(self, data_dir):
    """Collect training examples from every "train" file in `data_dir`.

    A directory entry is used when its name, minus the extension, contains
    "train", so the data may be spread over several files. Each line is
    stripped and split on tabs. A line must yield exactly four fields:
    fields 1 and 2 become `text_a` and `text_b`, field 3 becomes the label.
    Lines with any other field count are printed and skipped. The guid
    counter advances for every line read (skipped ones included) and keeps
    counting across files.

    Args:
        data_dir: Directory that holds the training files.

    Returns:
        A list of `InputExample` objects in sorted-filename order.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # example order and the guid numbering reproducible between runs.
    train_paths = [
        os.path.join(data_dir, name)
        for name in sorted(os.listdir(data_dir))
        if 'train' in os.path.splitext(name)[0]
    ]

    examples = []
    line_no = 0
    for path in train_paths:
        # Decode explicitly as UTF-8 rather than with the platform default
        # encoding, which differs between systems.
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                guid = 'train-%d' % line_no
                line_no += 1
                split_line = line.strip().split('\t')
                # Data cleaning: report and drop malformed lines.
                if len(split_line) != 4:
                    print("脏数据:", split_line)
                    continue
                text_a = tokenization.convert_to_unicode(split_line[1])
                text_b = tokenization.convert_to_unicode(split_line[2])
                label = str(split_line[3])
                examples.append(InputExample(guid, text_a, text_b, label))
    return examples
示例3: read_examples
# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def read_examples(input_file):
    """Read a list of `InputExample`s from `input_file`, one per line.

    Lines are read until `readline()` yields an empty value. Each line is
    stripped and matched against the pattern `^(.*) \\|\\|\\| (.*)$`. On a
    match the two groups become `text_a` and `text_b`; otherwise the whole
    stripped line is `text_a` and `text_b` is None. `unique_id` counts up
    from 0 in reading order.

    Args:
        input_file: Path passed to `tf.gfile.GFile`.

    Returns:
        A list of `InputExample` objects.
    """
    collected = []
    with tf.gfile.GFile(input_file, "r") as reader:
        raw = tokenization.convert_to_unicode(reader.readline())
        while raw:
            stripped = raw.strip()
            pair = re.match(r"^(.*) \|\|\| (.*)$", stripped)
            if pair is not None:
                first_text, second_text = pair.group(1), pair.group(2)
            else:
                first_text, second_text = stripped, None
            # Every iteration appends exactly once, so the current length
            # is the running id.
            collected.append(
                InputExample(
                    unique_id=len(collected),
                    text_a=first_text,
                    text_b=second_text))
            raw = tokenization.convert_to_unicode(reader.readline())
    return collected
开发者ID:Nagakiran1,项目名称:Extending-Google-BERT-as-Question-and-Answering-model-and-Chatbot,代码行数:24,代码来源:extract_features.py
示例4: get_train_examples
# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def get_train_examples(self, data_dir):
    """Load training examples from the per-language MultiNLI TSV file.

    Reads `<data_dir>/multinli/multinli.train.<self.language>.tsv` through
    `self._read_tsv` and skips the first row. Columns 0, 1 and 2 of each
    remaining row become `text_a`, `text_b` and the label. A label equal to
    "contradictory" is replaced by "contradiction". The guid is "train-<i>",
    where i is the row's index in the file (first row counted as 0).

    Args:
        data_dir: Root directory containing the `multinli` folder.

    Returns:
        A list of `InputExample` objects.
    """
    tsv_name = "multinli.train.%s.tsv" % self.language
    rows = self._read_tsv(os.path.join(data_dir, "multinli", tsv_name))

    result = []
    for row_number, fields in enumerate(rows):
        if row_number == 0:
            continue
        first_text = tokenization.convert_to_unicode(fields[0])
        second_text = tokenization.convert_to_unicode(fields[1])
        label = tokenization.convert_to_unicode(fields[2])
        if label == tokenization.convert_to_unicode("contradictory"):
            label = tokenization.convert_to_unicode("contradiction")
        result.append(
            InputExample(
                guid="train-%d" % row_number,
                text_a=first_text,
                text_b=second_text,
                label=label))
    return result
开发者ID:Nagakiran1,项目名称:Extending-Google-BERT-as-Question-and-Answering-model-and-Chatbot,代码行数:20,代码来源:run_classifier.py
示例5: get_dev_examples
# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def get_dev_examples(self, data_dir):
    """Load dev examples for `self.language` from `xnli.dev.tsv`.

    Reads `<data_dir>/xnli.dev.tsv` through `self._read_tsv` and skips the
    first row. A row is kept only when column 0 equals `self.language`.
    For kept rows, columns 6 and 7 become `text_a` and `text_b`, and
    column 1 becomes the label. The guid is "dev-<i>", where i is the row's
    index in the file, so guids of kept rows are not necessarily contiguous.

    Args:
        data_dir: Directory containing `xnli.dev.tsv`.

    Returns:
        A list of `InputExample` objects for the configured language.
    """
    rows = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))

    kept = []
    indexed_rows = enumerate(rows)
    next(indexed_rows, None)  # drop row 0 while keeping file-based indices
    for idx, cols in indexed_rows:
        row_language = tokenization.convert_to_unicode(cols[0])
        if row_language == tokenization.convert_to_unicode(self.language):
            kept.append(
                InputExample(
                    guid="dev-%d" % idx,
                    text_a=tokenization.convert_to_unicode(cols[6]),
                    text_b=tokenization.convert_to_unicode(cols[7]),
                    label=tokenization.convert_to_unicode(cols[1])))
    return kept
开发者ID:Nagakiran1,项目名称:Extending-Google-BERT-as-Question-and-Answering-model-and-Chatbot,代码行数:19,代码来源:run_classifier.py
示例6: _create_examples
# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def _create_examples(self, lines, set_type):
    """Turn parsed rows into `InputExample` objects.

    Row 0 of `lines` is skipped. In each later row, column 0 is combined
    with `set_type` to form the guid, columns 8 and 9 provide `text_a` and
    `text_b`, and the final column provides the label. When `set_type` is
    "test" the label is the constant "contradiction" and the final column
    is not read.

    Args:
        lines: Iterable of indexable rows.
        set_type: Guid prefix; "test" switches to the placeholder label.

    Returns:
        A list of `InputExample` objects.
    """
    output = []
    numbered = enumerate(lines)
    next(numbered, None)  # discard row 0
    for _, fields in numbered:
        guid = "%s-%s" % (
            set_type, tokenization.convert_to_unicode(fields[0]))
        sentence_one = tokenization.convert_to_unicode(fields[8])
        sentence_two = tokenization.convert_to_unicode(fields[9])
        if set_type != "test":
            label = tokenization.convert_to_unicode(fields[-1])
        else:
            label = "contradiction"
        output.append(
            InputExample(
                guid=guid,
                text_a=sentence_one,
                text_b=sentence_two,
                label=label))
    return output
开发者ID:Nagakiran1,项目名称:Extending-Google-BERT-as-Question-and-Answering-model-and-Chatbot,代码行数:18,代码来源:run_classifier.py
示例7: get_train_examples
# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def get_train_examples(self, data_dir):
    """Load training examples from `<data_dir>/train.csv`.

    Rows come from `self._read_csv`; every row, including the first, yields
    one example with guid "train-<i>". Column 2 is the label. Columns 0 and
    1 become `text_a` and `text_b`, or `text_b` and `text_a` when
    `self.swap_input` is truthy.

    Args:
        data_dir: Directory containing `train.csv`.

    Returns:
        A list of `InputExample` objects.
    """
    rows = self._read_csv(os.path.join(data_dir, "train.csv"))

    # One loop serves both orientations; only the source columns differ.
    if self.swap_input:
        a_col, b_col = 1, 0
    else:
        a_col, b_col = 0, 1

    collected = []
    for idx, row in enumerate(rows):
        first_text = tokenization.convert_to_unicode(row[a_col])
        second_text = tokenization.convert_to_unicode(row[b_col])
        label = tokenization.convert_to_unicode(row[2])
        collected.append(
            InputExample(
                guid="train-%d" % idx,
                text_a=first_text,
                text_b=second_text,
                label=label))
    return collected
示例8: get_test_examples
# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def get_test_examples(self, data_dir):
    """Load test examples from `<data_dir>/test.csv`.

    Rows come from `self._read_csv(..., istrain=False)`; every row yields
    one example with guid "test-<i>" and the fixed label "unrelated".
    Columns 0 and 1 become `text_a` and `text_b`, or `text_b` and `text_a`
    when `self.swap_input` is truthy.

    Args:
        data_dir: Directory containing `test.csv`.

    Returns:
        A list of `InputExample` objects.
    """
    rows = self._read_csv(os.path.join(data_dir, "test.csv"), istrain=False)
    a_col, b_col = (1, 0) if self.swap_input else (0, 1)
    return [
        InputExample(
            guid="test-%d" % idx,
            text_a=tokenization.convert_to_unicode(row[a_col]),
            text_b=tokenization.convert_to_unicode(row[b_col]),
            label="unrelated")
        for idx, row in enumerate(rows)
    ]
示例9: get_train_examples
# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def get_train_examples(self, data_dir):
    """Load training examples plus per-example extra features.

    Rows come from `self._read_csv` on `<data_dir>/train.csv`; every row,
    including the first, yields one example with guid "train-<i>". Column 2
    is the label. Columns 0 and 1 become `text_a` and `text_b`, or `text_b`
    and `text_a` when `self.swap_input` is truthy. Row i of the array
    loaded from `FLAGS.extra_train_tensor` is attached as `extra_feats`.

    Args:
        data_dir: Directory containing `train.csv`.

    Returns:
        A list of `InputExample` objects.

    Raises:
        IndexError: If the feature file has fewer rows than the CSV.
    """
    lines = self._read_csv(os.path.join(data_dir, "train.csv"))
    # ndmin=2 keeps one feature row per example. Without it np.loadtxt
    # squeezes a single-row or single-column file to 1-D, X[i] becomes a
    # scalar, and list(X[i]) raises TypeError.
    X = np.loadtxt(FLAGS.extra_train_tensor, ndmin=2)

    # Both orientations share one loop; only the source columns differ.
    a_col, b_col = (1, 0) if self.swap_input else (0, 1)

    examples = []
    for (i, line) in enumerate(lines):
        guid = "train-%d" % (i)
        text_a = tokenization.convert_to_unicode(line[a_col])
        text_b = tokenization.convert_to_unicode(line[b_col])
        label = tokenization.convert_to_unicode(line[2])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b,
                         label=label, extra_feats=list(X[i])))
    return examples
示例10: get_test_examples
# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def get_test_examples(self, data_dir):
    """Load test examples plus per-example extra features.

    Rows come from `self._read_csv(..., istrain=False)` on
    `<data_dir>/test.csv`; every row yields one example with guid
    "test-<i>" and the fixed label "unrelated". Columns 0 and 1 become
    `text_a` and `text_b`, or `text_b` and `text_a` when `self.swap_input`
    is truthy. Row i of the array loaded from `FLAGS.extra_test_tensor` is
    attached as `extra_feats`.

    Args:
        data_dir: Directory containing `test.csv`.

    Returns:
        A list of `InputExample` objects.

    Raises:
        IndexError: If the feature file has fewer rows than the CSV.
    """
    lines = self._read_csv(os.path.join(data_dir, "test.csv"), istrain=False)
    # ndmin=2 keeps one feature row per example. Without it np.loadtxt
    # squeezes a single-row or single-column file to 1-D, X[i] becomes a
    # scalar, and list(X[i]) raises TypeError.
    X = np.loadtxt(FLAGS.extra_test_tensor, ndmin=2)

    # Both orientations share one loop; only the source columns differ.
    a_col, b_col = (1, 0) if self.swap_input else (0, 1)

    examples = []
    for (i, line) in enumerate(lines):
        guid = "test-%d" % (i)
        text_a = tokenization.convert_to_unicode(line[a_col])
        text_b = tokenization.convert_to_unicode(line[b_col])
        label = "unrelated"
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b,
                         label=label, extra_feats=list(X[i])))
    return examples