This article collects typical usage examples of Python's bert.tokenization.convert_to_unicode method. If you have been wondering what tokenization.convert_to_unicode does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore the containing module, bert.tokenization, for further usage examples.
Nine code examples of tokenization.convert_to_unicode are shown below, ordered by popularity by default.
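As a quick orientation before the examples: on Python 3, convert_to_unicode returns str input unchanged and decodes bytes input as UTF-8 (ignoring undecodable sequences). A minimal sketch, assuming the google-research/bert repository is on PYTHONPATH:

# Minimal demonstration of convert_to_unicode's normalization behavior.
from bert import tokenization

print(tokenization.convert_to_unicode("already unicode"))  # -> already unicode
print(tokenization.convert_to_unicode(b"caf\xc3\xa9"))     # -> café

The snippets below additionally assume import os, import re, TensorFlow 1.x (for tf.gfile), and the InputExample class from the same repository.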
Example 1: read_examples
# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import convert_to_unicode [as alias]
def read_examples(input_file):
  """Read a list of `InputExample`s from an input file."""
  examples = []
  unique_id = 0
  with tf.gfile.GFile(input_file, "r") as reader:
    while True:
      line = tokenization.convert_to_unicode(reader.readline())
      if not line:
        break
      line = line.strip()
      text_a = None
      text_b = None
      m = re.match(r"^(.*) \|\|\| (.*)$", line)
      if m is None:
        text_a = line
      else:
        text_a = m.group(1)
        text_b = m.group(2)
      examples.append(
          InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
      unique_id += 1
  return examples
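This is the reader pattern used for feature extraction: each input line is either a single sentence or a sentence pair separated by " ||| ". A hypothetical end-to-end sketch; the file path and the namedtuple stand-in for InputExample are illustrative, not part of bert:

import collections
import tensorflow as tf  # TF 1.x, for tf.gfile

# Illustrative stand-in for the InputExample class the snippet expects.
InputExample = collections.namedtuple(
    "InputExample", ["unique_id", "text_a", "text_b"])

with tf.gfile.GFile("/tmp/pairs.txt", "w") as f:
  f.write("How old are you? ||| I am six.\n")  # "A ||| B" -> sentence pair
  f.write("Just one sentence on its own.\n")   # no "|||" -> single sentence

for ex in read_examples("/tmp/pairs.txt"):
  print(ex.unique_id, ex.text_a, "//", ex.text_b)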
Example 2: read_examples
# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import convert_to_unicode [as alias]
def read_examples(lst_strs):
  """Yield `InputExample`s built from a list of strings."""
  unique_id = 0
  for ss in lst_strs:
    line = tokenization.convert_to_unicode(ss)
    if not line:
      continue
    line = line.strip()
    text_a = None
    text_b = None
    m = re.match(r"^(.*) \|\|\| (.*)$", line)
    if m is None:
      text_a = line
    else:
      text_a = m.group(1)
      text_b = m.group(2)
    yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
    unique_id += 1
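Unlike Example 1, this variant is a generator: it reads from in-memory strings and yields examples lazily instead of building a list, which suits serving-style code that tokenizes requests one at a time. Illustrative usage:

for ex in read_examples(["How old are you? ||| I am six.", "One sentence."]):
  print(ex.unique_id, ex.text_a, ex.text_b)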
Example 3: get_train_examples
# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import convert_to_unicode [as alias]
def get_train_examples(self, data_dir):
  """See base class."""
  lines = self._read_tsv(
      os.path.join(data_dir, "multinli",
                   "multinli.train.%s.tsv" % self.language))
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = "train-%d" % (i)
    text_a = tokenization.convert_to_unicode(line[0])
    text_b = tokenization.convert_to_unicode(line[1])
    label = tokenization.convert_to_unicode(line[2])
    if label == tokenization.convert_to_unicode("contradictory"):
      label = tokenization.convert_to_unicode("contradiction")
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples
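The label remap exists because the translated MultiNLI training files shipped with XNLI spell the class "contradictory", while the dev set uses "contradiction" (an assumption based on the XNLI data release; the fabricated row below just follows the column indices the code uses):

# A fabricated row in the multinli.train.<lang>.tsv layout assumed above
# (premise at index 0, hypothesis at index 1, label at index 2).
row = ["He is reading a book.", "He is reading a newspaper.", "contradictory"]
label = "contradiction" if row[2] == "contradictory" else row[2]
print(label)  # contradiction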
Example 4: get_dev_examples
# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import convert_to_unicode [as alias]
def get_dev_examples(self, data_dir):
  """See base class."""
  lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = "dev-%d" % (i)
    language = tokenization.convert_to_unicode(line[0])
    if language != tokenization.convert_to_unicode(self.language):
      continue
    text_a = tokenization.convert_to_unicode(line[6])
    text_b = tokenization.convert_to_unicode(line[7])
    label = tokenization.convert_to_unicode(line[1])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples
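The hard-coded indices follow the xnli.dev.tsv layout: column 0 is the language code, column 1 the gold label, and columns 6 and 7 the two sentences; rows in languages other than self.language are skipped. A fabricated row illustrating the filter:

# A fabricated row in the xnli.dev.tsv layout assumed above.
row = ["zh", "neutral", "", "", "", "", "他在看书。", "他在学习。"]
if row[0] == "zh":  # keep only the target language
  print(row[6], "/", row[7], "->", row[1])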
Example 5: _create_examples
# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import convert_to_unicode [as alias]
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0]))
    text_a = tokenization.convert_to_unicode(line[8])
    text_b = tokenization.convert_to_unicode(line[9])
    if set_type == "test":
      label = "contradiction"
    else:
      label = tokenization.convert_to_unicode(line[-1])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples
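The indices match the GLUE MNLI TSV layout: the sentence pair sits at columns 8 and 9 and the gold label in the last column; test files carry no gold label, so a "contradiction" placeholder is used. A fabricated row:

# A fabricated row in the GLUE MNLI layout assumed above (sentence1 at
# index 8, sentence2 at index 9, gold label last).
row = [""] * 8 + ["The man is eating.", "A person eats.", "entailment"]
print(row[8], "/", row[9], "->", row[-1])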
Example 6: _create_examples
# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import convert_to_unicode [as alias]
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  sentence_index = 0
  for (i, line) in enumerate(lines):
    if i == 0:
      # Identify the "sentence" column from the header row.
      for j, token in enumerate(line):
        if token.strip() == "sentence":
          sentence_index = j
      continue
    guid = "%s-%s" % (set_type, i)
    if set_type == "test":
      text_a = tokenization.convert_to_unicode(line[sentence_index])
      label = "true"
    else:
      text_a = tokenization.convert_to_unicode(line[sentence_index])
      label = tokenization.convert_to_unicode(line[1])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
  return examples
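The header-sniffing loop makes the processor robust to TSV files whose column order varies: row 0 is scanned for a column literally named "sentence". In isolation:

# Isolated sketch of the header sniffing above, with a made-up header row.
header = ["index", "label", "annotator_notes", "sentence"]
sentence_index = 0
for j, token in enumerate(header):
  if token.strip() == "sentence":
    sentence_index = j
print(sentence_index)  # 3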
Example 7: _create_examples
# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import convert_to_unicode [as alias]
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  sentence_index = 0
  for (i, line) in enumerate(lines):
    if i == 0:
      # Identify the "sentence" column from the header row.
      for j, token in enumerate(line):
        if token.strip() == "sentence":
          sentence_index = j
      continue
    guid = "%s-%s" % (set_type, i)
    if set_type == "test":
      text_a = tokenization.convert_to_unicode(line[sentence_index])
      label = [1.0, 0]
    else:
      text_a = tokenization.convert_to_unicode(line[sentence_index])
      label = [float(line[2]), float(line[3])]
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
  return examples
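Here the label is a two-way probability distribution rather than a class string, read from columns 2 and 3; test examples get a [1.0, 0] placeholder. Fabricated parsing example:

# A fabricated row: soft label probabilities at indices 2 and 3.
row = ["id-1", "ignored", "0.3", "0.7", "A sentence to score."]
label = [float(row[2]), float(row[3])]
print(label)  # [0.3, 0.7]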
Example 8: _create_examples
# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import convert_to_unicode [as alias]
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = "%s-%s" % (set_type, i)
    text_a = tokenization.convert_to_unicode(line[3])
    text_b = tokenization.convert_to_unicode(line[4])
    if set_type == "test":
      label = "0"
    else:
      label = tokenization.convert_to_unicode(line[0])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples
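The indices follow the GLUE MRPC TSV layout: the Quality label in column 0, the two sentence IDs in columns 1 and 2, and the sentence pair in columns 3 and 4; test rows get a "0" placeholder label. A fabricated row:

# A fabricated row in the MRPC-style layout assumed above:
# [label, id1, id2, sentence1, sentence2]
row = ["1", "101", "102", "He bought a car.", "He purchased a car."]
print(row[3], "/", row[4], "->", row[0])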
Example 9: _create_examples
# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import convert_to_unicode [as alias]
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  print("length of lines:", len(lines))
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = "%s-%s" % (set_type, i)
    try:
      label = tokenization.convert_to_unicode(line[2])
      text_a = tokenization.convert_to_unicode(line[0])
      text_b = tokenization.convert_to_unicode(line[1])
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    except Exception:  # pylint: disable=broad-except
      print("###error.i:", i, line)
  return examples
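The broad except here is a pragmatic guard for noisy data: a malformed row is printed and skipped rather than aborting the whole preprocessing run. In a production pipeline you would typically log the failure and count skipped rows instead of printing.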