当前位置: 首页>>代码示例>>Python>>正文


Python tokenization.convert_to_unicode方法代码示例

本文整理汇总了Python中tokenization.convert_to_unicode方法的典型用法代码示例。如果您正苦于以下问题：Python tokenization.convert_to_unicode方法的具体用法？Python tokenization.convert_to_unicode怎么用？Python tokenization.convert_to_unicode使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块tokenization的用法示例。


在下文中一共展示了tokenization.convert_to_unicode方法的10个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: _create_examples

# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def _create_examples(self, lines, set_type):
        """Build the list of `InputExample`s for one data split.

        The first row of `lines` is skipped. Column 0 of each remaining row is
        embedded in the guid, and columns 8 and 9 supply `text_a` and `text_b`.
        When `set_type` is "test" the label is the fixed string
        "contradiction"; otherwise it is read from the last column.
        """
        is_test = set_type == "test"
        results = []
        for row_idx, row in enumerate(lines):
            if row_idx == 0:
                # First row is skipped (presumably a header -- confirm with caller).
                continue
            example_id = "%s-%s" % (set_type, tokenization.convert_to_unicode(row[0]))
            first_text = tokenization.convert_to_unicode(row[8])
            second_text = tokenization.convert_to_unicode(row[9])
            # The last column is only touched for non-test splits.
            label = ("contradiction" if is_test
                     else tokenization.convert_to_unicode(row[-1]))
            results.append(
                InputExample(guid=example_id, text_a=first_text,
                             text_b=second_text, label=label))
        return results
开发者ID:Socialbird-AILab,项目名称:BERT-Classification-Tutorial,代码行数:18,代码来源:run_classifier.py

示例2: get_train_examples

# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def get_train_examples(self, data_dir):
        """Load training examples from every "train" file in `data_dir`.

        Training data may be spread over several files: any directory entry
        whose name (with the extension removed) contains "train" is read.
        Each line must hold exactly four tab-separated fields; fields 1 and 2
        become `text_a` and `text_b`, and field 3 becomes the label. Lines
        with any other field count are printed and skipped.

        Args:
            data_dir: Directory holding the training files.

        Returns:
            A list of `InputExample` objects. Guids have the form
            "train-<n>", where <n> counts every line read, including the
            skipped ones.
        """
        # os.listdir() returns entries in arbitrary order; sort the names so
        # that guids and example order are reproducible across runs.
        train_paths = [
            os.path.join(data_dir, name)
            for name in sorted(os.listdir(data_dir))
            if 'train' in os.path.splitext(name)[0]
        ]
        examples = []
        line_no = 0
        for path in train_paths:
            with open(path, 'r') as f:
                rows = f.readlines()
            for raw in rows:
                guid = 'train-%d' % line_no
                line_no += 1
                fields = raw.strip().split('\t')
                # Data cleaning: report and drop rows without exactly 4 fields.
                if len(fields) != 4:
                    print("脏数据:", fields)
                    continue
                text_a = tokenization.convert_to_unicode(fields[1])
                text_b = tokenization.convert_to_unicode(fields[2])
                label = str(fields[3])
                examples.append(InputExample(guid, text_a, text_b, label))
        return examples
开发者ID:fennuDetudou,项目名称:tudouNLP,代码行数:27,代码来源:run_classifier.py

示例3: read_examples

# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def read_examples(input_file):
  """Parse `input_file` into a list of `InputExample`s, one per line read.

  A stripped line of the form "first ||| second" fills both `text_a` and
  `text_b`; any other line is used whole as `text_a` with `text_b` left as
  None. Unique ids are assigned sequentially starting from 0.
  """
  pair_pattern = r"^(.*) \|\|\| (.*)$"
  parsed = []
  with tf.gfile.GFile(input_file, "r") as reader:
    while True:
      raw = tokenization.convert_to_unicode(reader.readline())
      if not raw:
        # Nothing more was read: stop.
        break
      stripped = raw.strip()
      # Default to a single-text example; override if the separator matches.
      first, second = stripped, None
      found = re.match(pair_pattern, stripped)
      if found is not None:
        first, second = found.group(1), found.group(2)
      # One example is appended per iteration, so the current length is the
      # next sequential id.
      parsed.append(
          InputExample(unique_id=len(parsed), text_a=first, text_b=second))
  return parsed
开发者ID:Nagakiran1,项目名称:Extending-Google-BERT-as-Question-and-Answering-model-and-Chatbot,代码行数:24,代码来源:extract_features.py

示例4: get_train_examples

# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def get_train_examples(self, data_dir):
    """Read training examples from the per-language MultiNLI TSV file.

    The file is `<data_dir>/multinli/multinli.train.<language>.tsv`. The first
    row is skipped; columns 0, 1 and 2 of every other row supply `text_a`,
    `text_b` and the label. A label of "contradictory" is rewritten to
    "contradiction".
    """
    train_path = os.path.join(
        data_dir, "multinli", "multinli.train.%s.tsv" % self.language)
    rows = self._read_tsv(train_path)
    results = []
    for row_num, row in enumerate(rows):
      if row_num == 0:
        # First row is skipped (presumably a header -- confirm).
        continue
      first_text = tokenization.convert_to_unicode(row[0])
      second_text = tokenization.convert_to_unicode(row[1])
      label = tokenization.convert_to_unicode(row[2])
      # Normalise the alternative spelling of the label.
      if label == tokenization.convert_to_unicode("contradictory"):
        label = tokenization.convert_to_unicode("contradiction")
      results.append(
          InputExample(guid="train-%d" % (row_num), text_a=first_text,
                       text_b=second_text, label=label))
    return results
开发者ID:Nagakiran1,项目名称:Extending-Google-BERT-as-Question-and-Answering-model-and-Chatbot,代码行数:20,代码来源:run_classifier.py

示例5: get_dev_examples

# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def get_dev_examples(self, data_dir):
    """Read dev examples for `self.language` from `<data_dir>/xnli.dev.tsv`.

    The first row is skipped, as is every row whose column 0 differs from
    `self.language`. For the remaining rows, columns 6 and 7 supply `text_a`
    and `text_b`, and column 1 supplies the label. Guids use the row's
    position in the file, so they are not contiguous after filtering.
    """
    rows = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
    results = []
    for row_num, row in enumerate(rows):
      # Short-circuit keeps the first row from being inspected at all.
      if row_num == 0 or (
          tokenization.convert_to_unicode(row[0])
          != tokenization.convert_to_unicode(self.language)):
        continue
      first_text = tokenization.convert_to_unicode(row[6])
      second_text = tokenization.convert_to_unicode(row[7])
      label = tokenization.convert_to_unicode(row[1])
      results.append(
          InputExample(guid="dev-%d" % (row_num), text_a=first_text,
                       text_b=second_text, label=label))
    return results
开发者ID:Nagakiran1,项目名称:Extending-Google-BERT-as-Question-and-Answering-model-and-Chatbot,代码行数:19,代码来源:run_classifier.py

示例6: _create_examples

# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def _create_examples(self, lines, set_type):
    """Turn parsed rows into `InputExample`s for the given split.

    Skips the first row. For each other row, column 0 goes into the guid and
    columns 8 and 9 become `text_a` and `text_b`. The label is the constant
    "contradiction" when `set_type` is "test", and the last column otherwise.
    """
    to_unicode = tokenization.convert_to_unicode
    built = []
    for position, fields in enumerate(lines):
      if position == 0:
        # First row is skipped (presumably a header -- confirm with caller).
        continue
      guid = "%s-%s" % (set_type, to_unicode(fields[0]))
      text_a = to_unicode(fields[8])
      text_b = to_unicode(fields[9])
      if set_type != "test":
        label = to_unicode(fields[-1])
      else:
        # Test rows get a fixed label; the last column is not read.
        label = "contradiction"
      built.append(
          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return built
开发者ID:Nagakiran1,项目名称:Extending-Google-BERT-as-Question-and-Answering-model-and-Chatbot,代码行数:18,代码来源:run_classifier.py

示例7: get_train_examples

# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def get_train_examples(self, data_dir):
    """Read training examples from `<data_dir>/train.csv`.

    Columns 0 and 1 of each row supply `text_a` and `text_b` (swapped when
    `self.swap_input` is truthy) and column 2 supplies the label. Every row
    returned by `_read_csv` is used; none is skipped here.
    """
    rows = self._read_csv(os.path.join(data_dir, "train.csv"))
    # Pick once which column feeds text_a and which feeds text_b.
    if self.swap_input:
        col_a, col_b = 1, 0
    else:
        col_a, col_b = 0, 1
    return [
        InputExample(
            guid="train-%d" % (idx),
            text_a=tokenization.convert_to_unicode(row[col_a]),
            text_b=tokenization.convert_to_unicode(row[col_b]),
            label=tokenization.convert_to_unicode(row[2]))
        for idx, row in enumerate(rows)
    ]
开发者ID:lampts,项目名称:wsdm19cup,代码行数:23,代码来源:run_classifier_v2.py

示例8: get_test_examples

# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def get_test_examples(self, data_dir):
    """Read test examples from `<data_dir>/test.csv`.

    Columns 0 and 1 of each row supply `text_a` and `text_b` (swapped when
    `self.swap_input` is truthy). Every example gets the fixed label
    "unrelated"; no label column is read.
    """
    rows = self._read_csv(os.path.join(data_dir, "test.csv"), istrain=False)
    # Pick once which column feeds text_a and which feeds text_b.
    if self.swap_input:
        col_a, col_b = 1, 0
    else:
        col_a, col_b = 0, 1
    return [
        InputExample(
            guid="test-%d" % (idx),
            text_a=tokenization.convert_to_unicode(row[col_a]),
            text_b=tokenization.convert_to_unicode(row[col_b]),
            label="unrelated")
        for idx, row in enumerate(rows)
    ]
开发者ID:lampts,项目名称:wsdm19cup,代码行数:23,代码来源:run_classifier_v2.py

示例9: get_train_examples

# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def get_train_examples(self, data_dir):
    """Read training examples and attach a row of extra features to each.

    Text and label come from `<data_dir>/train.csv`: columns 0 and 1 supply
    `text_a` and `text_b` (swapped when `self.swap_input` is truthy) and
    column 2 supplies the label. Extra features are loaded with `np.loadtxt`
    from the path in `FLAGS.extra_train_tensor`; row `i` of that array is
    attached to example `i` as a list.

    Raises:
        IndexError: If the feature file has fewer rows than `train.csv`.
    """
    lines = self._read_csv(os.path.join(data_dir, "train.csv"))
    # ndmin=2 keeps the array two-dimensional even when the file has a single
    # row or a single column; without it loadtxt squeezes to 1-D, X[i] is a
    # scalar, and list(X[i]) raises TypeError.
    X = np.loadtxt(FLAGS.extra_train_tensor, ndmin=2)
    # Pick once which column feeds text_a and which feeds text_b.
    if self.swap_input:
        col_a, col_b = 1, 0
    else:
        col_a, col_b = 0, 1
    examples = []
    for (i, line) in enumerate(lines):
        guid = "train-%d" % (i)
        text_a = tokenization.convert_to_unicode(line[col_a])
        text_b = tokenization.convert_to_unicode(line[col_b])
        label = tokenization.convert_to_unicode(line[2])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label,
                         extra_feats=list(X[i])))
    return examples
开发者ID:lampts,项目名称:wsdm19cup,代码行数:24,代码来源:run_classifier_v3.py

示例10: get_test_examples

# 需要导入模块: import tokenization [as 别名]
# 或者: from tokenization import convert_to_unicode [as 别名]
def get_test_examples(self, data_dir):
    """Read test examples and attach a row of extra features to each.

    Text comes from `<data_dir>/test.csv`: columns 0 and 1 supply `text_a`
    and `text_b` (swapped when `self.swap_input` is truthy). Every example
    gets the fixed label "unrelated". Extra features are loaded with
    `np.loadtxt` from the path in `FLAGS.extra_test_tensor`; row `i` of that
    array is attached to example `i` as a list.

    Raises:
        IndexError: If the feature file has fewer rows than `test.csv`.
    """
    lines = self._read_csv(os.path.join(data_dir, "test.csv"), istrain=False)
    # ndmin=2 keeps the array two-dimensional even when the file has a single
    # row or a single column; without it loadtxt squeezes to 1-D, X[i] is a
    # scalar, and list(X[i]) raises TypeError.
    X = np.loadtxt(FLAGS.extra_test_tensor, ndmin=2)
    # Pick once which column feeds text_a and which feeds text_b.
    if self.swap_input:
        col_a, col_b = 1, 0
    else:
        col_a, col_b = 0, 1
    examples = []
    for (i, line) in enumerate(lines):
        guid = "test-%d" % (i)
        text_a = tokenization.convert_to_unicode(line[col_a])
        text_b = tokenization.convert_to_unicode(line[col_b])
        label = "unrelated"
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label,
                         extra_feats=list(X[i])))
    return examples
开发者ID:lampts,项目名称:wsdm19cup,代码行数:24,代码来源:run_classifier_v3.py


注:本文中的tokenization.convert_to_unicode方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。