本文整理匯總了Python中tokenization.convert_to_unicode方法的典型用法代碼示例。如果您正苦於以下問題:Python tokenization.convert_to_unicode方法的具體用法?Python tokenization.convert_to_unicode怎麼用?Python tokenization.convert_to_unicode使用的例子?那麼,這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類tokenization的用法示例。
在下文中一共展示了tokenization.convert_to_unicode方法的10個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: _create_examples
# 需要導入模塊: import tokenization [as 別名]
# 或者: from tokenization import convert_to_unicode [as 別名]
def _create_examples(self, lines, set_type):
    """Build ``InputExample`` objects from already-parsed rows.

    The first row of ``lines`` is always skipped. For every other row the
    guid is ``"<set_type>-<column 0>"``, ``text_a`` comes from column 8 and
    ``text_b`` from column 9. The label is the last column, except when
    ``set_type`` is ``"test"``, where the fixed placeholder
    ``"contradiction"`` is used and the last column is not read.

    Args:
        lines: Iterable of indexable rows (each with at least 10 columns).
        set_type: Name of the split; used as the guid prefix.

    Returns:
        A list of ``InputExample`` in input order.
    """
    is_test_split = set_type == "test"
    built = []
    for row_idx, row in enumerate(lines):
        if row_idx == 0:
            # First row is never turned into an example
            # (presumably a header row -- confirm against the data files).
            continue
        guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(row[0]))
        first_text = tokenization.convert_to_unicode(row[8])
        second_text = tokenization.convert_to_unicode(row[9])
        if is_test_split:
            label = "contradiction"
        else:
            label = tokenization.convert_to_unicode(row[-1])
        built.append(
            InputExample(guid=guid, text_a=first_text, text_b=second_text,
                         label=label))
    return built
示例2: get_train_examples
# 需要導入模塊: import tokenization [as 別名]
# 或者: from tokenization import convert_to_unicode [as 別名]
def get_train_examples(self, data_dir):
    """Collect training examples from every "train" file in ``data_dir``.

    A directory entry is used when its name, with the extension removed,
    contains ``'train'``. Each line is stripped and split on tabs; lines
    that do not yield exactly four fields are printed and skipped. Fields
    1 and 2 become ``text_a`` / ``text_b`` and field 3 (as ``str``) becomes
    the label; field 0 is not used. The guid counter advances for every
    line read, including skipped ones.

    Args:
        data_dir: Directory that is listed for training files.

    Returns:
        A list of ``InputExample`` objects.
    """
    # The data may be spread over several files named like train*.txt.
    train_paths = [
        os.path.join(data_dir, entry)
        for entry in os.listdir(data_dir)
        if 'train' in os.path.splitext(entry)[0]
    ]

    collected = []
    line_no = 0
    for path in train_paths:
        with open(path, 'r') as handle:
            rows = handle.readlines()
        for row in rows:
            guid = 'train-%d' % line_no
            line_no += 1
            fields = row.strip().split('\t')
            # Data cleaning: report and drop malformed lines.
            if len(fields) != 4:
                print("髒數據:",fields)
                continue
            text_a = tokenization.convert_to_unicode(fields[1])
            text_b = tokenization.convert_to_unicode(fields[2])
            label = str(fields[3])
            collected.append(InputExample(guid, text_a, text_b, label))
    return collected
示例3: read_examples
# 需要導入模塊: import tokenization [as 別名]
# 或者: from tokenization import convert_to_unicode [as 別名]
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file.

    The file is read line by line until ``readline`` yields an empty
    value. Each line is stripped; a line of the form ``A ||| B`` produces
    ``text_a=A`` and ``text_b=B``, any other line produces ``text_a=<line>``
    and ``text_b=None``. ``unique_id`` is the zero-based position of the
    example in the returned list.

    Args:
        input_file: Path handed to ``tf.gfile.GFile`` for reading.

    Returns:
        A list of ``InputExample`` in file order.
    """
    collected = []
    with tf.gfile.GFile(input_file, "r") as reader:
        while True:
            raw = tokenization.convert_to_unicode(reader.readline())
            if not raw:
                # Empty read means end of input.
                break
            stripped = raw.strip()
            pair = re.match(r"^(.*) \|\|\| (.*)$", stripped)
            if pair is not None:
                first, second = pair.group(1), pair.group(2)
            else:
                first, second = stripped, None
            # One id per appended example, so the list length is the next id.
            collected.append(
                InputExample(unique_id=len(collected), text_a=first,
                             text_b=second))
    return collected
開發者ID:Nagakiran1,項目名稱:Extending-Google-BERT-as-Question-and-Answering-model-and-Chatbot,代碼行數:24,代碼來源:extract_features.py
示例4: get_train_examples
# 需要導入模塊: import tokenization [as 別名]
# 或者: from tokenization import convert_to_unicode [as 別名]
def get_train_examples(self, data_dir):
    """Load training examples from the per-language MultiNLI TSV file.

    Reads ``<data_dir>/multinli/multinli.train.<self.language>.tsv`` through
    ``self._read_tsv`` and skips the first row. Columns 0, 1 and 2 supply
    ``text_a``, ``text_b`` and the label; a label of ``"contradictory"`` is
    rewritten to ``"contradiction"``. Guids are ``"train-<row index>"``.

    Args:
        data_dir: Root directory containing the ``multinli`` folder.

    Returns:
        A list of ``InputExample`` in file order.
    """
    tsv_name = "multinli.train.%s.tsv" % self.language
    rows = self._read_tsv(os.path.join(data_dir, "multinli", tsv_name))

    built = []
    for row_idx, row in enumerate(rows):
        if row_idx == 0:
            # First row is never turned into an example.
            continue
        guid = "train-%d" % (row_idx)
        text_a = tokenization.convert_to_unicode(row[0])
        text_b = tokenization.convert_to_unicode(row[1])
        label = tokenization.convert_to_unicode(row[2])
        # Normalise the alternative spelling of the label.
        if label == tokenization.convert_to_unicode("contradictory"):
            label = tokenization.convert_to_unicode("contradiction")
        built.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b,
                         label=label))
    return built
開發者ID:Nagakiran1,項目名稱:Extending-Google-BERT-as-Question-and-Answering-model-and-Chatbot,代碼行數:20,代碼來源:run_classifier.py
示例5: get_dev_examples
# 需要導入模塊: import tokenization [as 別名]
# 或者: from tokenization import convert_to_unicode [as 別名]
def get_dev_examples(self, data_dir):
    """Load dev examples for ``self.language`` from ``xnli.dev.tsv``.

    Reads ``<data_dir>/xnli.dev.tsv`` through ``self._read_tsv`` and skips
    the first row. Rows whose column 0 differs from ``self.language`` are
    dropped. For the remaining rows, columns 6 and 7 supply ``text_a`` and
    ``text_b`` and column 1 supplies the label. Guids are
    ``"dev-<row index>"``, where the index counts every row in the file,
    not only the kept ones.

    Args:
        data_dir: Directory containing ``xnli.dev.tsv``.

    Returns:
        A list of ``InputExample`` for the configured language.
    """
    rows = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))

    built = []
    for row_idx, row in enumerate(rows):
        if row_idx == 0:
            # First row is never turned into an example.
            continue
        guid = "dev-%d" % (row_idx)
        row_language = tokenization.convert_to_unicode(row[0])
        wanted_language = tokenization.convert_to_unicode(self.language)
        if row_language == wanted_language:
            text_a = tokenization.convert_to_unicode(row[6])
            text_b = tokenization.convert_to_unicode(row[7])
            label = tokenization.convert_to_unicode(row[1])
            built.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b,
                             label=label))
    return built
開發者ID:Nagakiran1,項目名稱:Extending-Google-BERT-as-Question-and-Answering-model-and-Chatbot,代碼行數:19,代碼來源:run_classifier.py
示例6: _create_examples
# 需要導入模塊: import tokenization [as 別名]
# 或者: from tokenization import convert_to_unicode [as 別名]
def _create_examples(self, lines, set_type):
    """Turn parsed rows into ``InputExample`` objects for one data split.

    Row 0 is skipped. Each later row yields one example whose guid is
    ``"<set_type>-<column 0>"``, whose ``text_a`` / ``text_b`` are columns
    8 and 9, and whose label is the final column. For the ``"test"`` split
    the label is instead the constant ``"contradiction"``.

    Args:
        lines: Iterable of indexable rows (each with at least 10 columns).
        set_type: Split name, used as the guid prefix and to detect "test".

    Returns:
        A list of ``InputExample`` in input order.
    """
    results = []
    for position, fields in enumerate(lines):
        if position != 0:
            guid = "%s-%s" % (
                set_type, tokenization.convert_to_unicode(fields[0]))
            text_a = tokenization.convert_to_unicode(fields[8])
            text_b = tokenization.convert_to_unicode(fields[9])
            # The test split gets a fixed placeholder label; the last
            # column is only read for the other splits.
            label = (
                "contradiction" if set_type == "test"
                else tokenization.convert_to_unicode(fields[-1]))
            results.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b,
                             label=label))
    return results
開發者ID:Nagakiran1,項目名稱:Extending-Google-BERT-as-Question-and-Answering-model-and-Chatbot,代碼行數:18,代碼來源:run_classifier.py
示例7: get_train_examples
# 需要導入模塊: import tokenization [as 別名]
# 或者: from tokenization import convert_to_unicode [as 別名]
def get_train_examples(self, data_dir):
    """Load training examples from ``<data_dir>/train.csv``.

    Rows come from ``self._read_csv``; every row is used (no row is
    skipped). Columns 0 and 1 supply ``text_a`` and ``text_b``; when
    ``self.swap_input`` is truthy the two columns are exchanged. Column 2
    supplies the label. Guids are ``"train-<row index>"``.

    Args:
        data_dir: Directory containing ``train.csv``.

    Returns:
        A list of ``InputExample`` in file order.
    """
    rows = self._read_csv(os.path.join(data_dir, "train.csv"))
    built = []
    # Pick which column feeds text_a / text_b once, instead of duplicating
    # the whole loop for the swapped case.
    a_col, b_col = (1, 0) if self.swap_input else (0, 1)
    for row_idx, row in enumerate(rows):
        built.append(
            InputExample(
                guid="train-%d" % (row_idx),
                text_a=tokenization.convert_to_unicode(row[a_col]),
                text_b=tokenization.convert_to_unicode(row[b_col]),
                label=tokenization.convert_to_unicode(row[2])))
    return built
示例8: get_test_examples
# 需要導入模塊: import tokenization [as 別名]
# 或者: from tokenization import convert_to_unicode [as 別名]
def get_test_examples(self, data_dir):
    """Load test examples from ``<data_dir>/test.csv``.

    Rows come from ``self._read_csv(..., istrain=False)``; every row is
    used. Columns 0 and 1 supply ``text_a`` and ``text_b``; when
    ``self.swap_input`` is truthy the two columns are exchanged. Every
    example gets the fixed placeholder label ``"unrelated"``. Guids are
    ``"test-<row index>"``.

    Args:
        data_dir: Directory containing ``test.csv``.

    Returns:
        A list of ``InputExample`` in file order.
    """
    rows = self._read_csv(os.path.join(data_dir, "test.csv"), istrain=False)
    built = []
    # Pick which column feeds text_a / text_b once, instead of duplicating
    # the whole loop for the swapped case.
    a_col, b_col = (1, 0) if self.swap_input else (0, 1)
    for row_idx, row in enumerate(rows):
        built.append(
            InputExample(
                guid="test-%d" % (row_idx),
                text_a=tokenization.convert_to_unicode(row[a_col]),
                text_b=tokenization.convert_to_unicode(row[b_col]),
                label="unrelated"))
    return built
示例9: get_train_examples
# 需要導入模塊: import tokenization [as 別名]
# 或者: from tokenization import convert_to_unicode [as 別名]
def get_train_examples(self, data_dir):
    """Load training examples plus per-row extra features.

    Text rows come from ``self._read_csv`` on ``<data_dir>/train.csv``;
    every row is used. Columns 0 and 1 supply ``text_a`` and ``text_b``
    (exchanged when ``self.swap_input`` is truthy) and column 2 supplies
    the label. Row ``i`` of the numeric table loaded from
    ``FLAGS.extra_train_tensor`` is attached as ``extra_feats`` (a list)
    to example ``i``. Guids are ``"train-<row index>"``.

    Args:
        data_dir: Directory containing ``train.csv``.

    Returns:
        A list of ``InputExample`` in file order.

    Raises:
        IndexError: If the feature table has fewer rows than the CSV.
    """
    rows = self._read_csv(os.path.join(data_dir, "train.csv"))
    # ndmin=2 keeps the table two-dimensional. Without it, loadtxt squeezes
    # a file holding a single row or a single column down to 1-D, so
    # extra[i] is a scalar and list(extra[i]) raises TypeError.
    extra = np.loadtxt(FLAGS.extra_train_tensor, ndmin=2)
    examples = []
    # Pick which column feeds text_a / text_b once, instead of duplicating
    # the whole loop for the swapped case.
    a_col, b_col = (1, 0) if self.swap_input else (0, 1)
    for i, row in enumerate(rows):
        examples.append(
            InputExample(
                guid="train-%d" % (i),
                text_a=tokenization.convert_to_unicode(row[a_col]),
                text_b=tokenization.convert_to_unicode(row[b_col]),
                label=tokenization.convert_to_unicode(row[2]),
                extra_feats=list(extra[i])))
    return examples
示例10: get_test_examples
# 需要導入模塊: import tokenization [as 別名]
# 或者: from tokenization import convert_to_unicode [as 別名]
def get_test_examples(self, data_dir):
    """Load test examples plus per-row extra features.

    Text rows come from ``self._read_csv(..., istrain=False)`` on
    ``<data_dir>/test.csv``; every row is used. Columns 0 and 1 supply
    ``text_a`` and ``text_b`` (exchanged when ``self.swap_input`` is
    truthy). Every example gets the fixed placeholder label
    ``"unrelated"``. Row ``i`` of the numeric table loaded from
    ``FLAGS.extra_test_tensor`` is attached as ``extra_feats`` (a list) to
    example ``i``. Guids are ``"test-<row index>"``.

    Args:
        data_dir: Directory containing ``test.csv``.

    Returns:
        A list of ``InputExample`` in file order.

    Raises:
        IndexError: If the feature table has fewer rows than the CSV.
    """
    rows = self._read_csv(os.path.join(data_dir, "test.csv"), istrain=False)
    # ndmin=2 keeps the table two-dimensional. Without it, loadtxt squeezes
    # a file holding a single row or a single column down to 1-D, so
    # extra[i] is a scalar and list(extra[i]) raises TypeError.
    extra = np.loadtxt(FLAGS.extra_test_tensor, ndmin=2)
    examples = []
    # Pick which column feeds text_a / text_b once, instead of duplicating
    # the whole loop for the swapped case.
    a_col, b_col = (1, 0) if self.swap_input else (0, 1)
    for i, row in enumerate(rows):
        examples.append(
            InputExample(
                guid="test-%d" % (i),
                text_a=tokenization.convert_to_unicode(row[a_col]),
                text_b=tokenization.convert_to_unicode(row[b_col]),
                label="unrelated",
                extra_feats=list(extra[i])))
    return examples