Python tokenization.convert_to_unicode方法代码示例

本文整理汇总了Python中bert.tokenization.convert_to_unicode方法的典型用法代码示例。如果您正苦于以下问题：Python tokenization.convert_to_unicode方法的具体用法？Python tokenization.convert_to_unicode怎么用？Python tokenization.convert_to_unicode使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块bert.tokenization的用法示例。

在下文中一共展示了tokenization.convert_to_unicode方法的12个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: read_examples

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import convert_to_unicode [as 别名]
def read_examples(input_file):
  """Build a list of `InputExample`s, one per line of `input_file`.

  Every line is converted to unicode and stripped. A line shaped like
  `A ||| B` becomes `text_a=A`, `text_b=B`; because the first regex group is
  greedy, the split happens at the last ` ||| ` on the line. Any other line
  is stored whole as `text_a` with `text_b=None`.

  Args:
    input_file: Path handed to `tf.gfile.GFile` and opened in "r" mode.

  Returns:
    A list of `InputExample` in file order, with `unique_id` counting up
    from 0.
  """
  examples = []
  with tf.gfile.GFile(input_file, "r") as reader:
    while True:
      raw = tokenization.convert_to_unicode(reader.readline())
      if not raw:
        # A falsy (empty) read ends the loop.
        break
      stripped = raw.strip()
      match = re.match(r"^(.*) \|\|\| (.*)$", stripped)
      if match is not None:
        first, second = match.group(1), match.group(2)
      else:
        first, second = stripped, None
      # Ids are consecutive from 0, i.e. the number of examples so far.
      examples.append(
          InputExample(unique_id=len(examples), text_a=first, text_b=second))
  return examples

开发者ID:ZhangShiyue，项目名称:QGforQA，代码行数:24，代码来源:extract_features.py

示例2: read_examples

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import convert_to_unicode [as 别名]
def read_examples(lst_strs):
    """Lazily convert each string in `lst_strs` into an `InputExample`.

    Each string is converted to unicode; falsy (empty) results are skipped
    before stripping and do not consume an id. The remaining text is
    stripped and, if it looks like `A ||| B`, split into `text_a=A` and
    `text_b=B` (at the last ` ||| `, since the first group is greedy).
    Otherwise the whole text is `text_a` and `text_b` is None.

    Args:
        lst_strs: Iterable of strings to convert.

    Yields:
        `InputExample` objects whose `unique_id` counts up from 0.
    """
    next_id = 0
    for raw in lst_strs:
        text = tokenization.convert_to_unicode(raw)
        if text:
            text = text.strip()
            match = re.match(r"^(.*) \|\|\| (.*)$", text)
            if match is None:
                first, second = text, None
            else:
                first, second = match.groups()
            yield InputExample(unique_id=next_id, text_a=first, text_b=second)
            next_id += 1

开发者ID:a414351664，项目名称:Bert-TextClassification，代码行数:20，代码来源:extract_features.py

示例3: get_train_examples

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import convert_to_unicode [as 别名]
def get_train_examples(self, data_dir):
  """Build training `InputExample`s from the per-language MultiNLI TSV.

  Reads `<data_dir>/multinli/multinli.train.<self.language>.tsv` through
  `self._read_tsv`, skips the first row, and uses columns 0, 1 and 2 of every
  other row as `text_a`, `text_b` and `label`. A label of "contradictory" is
  rewritten to "contradiction".

  Args:
    data_dir: Directory that contains the `multinli` sub-directory.

  Returns:
    A list of `InputExample` whose guids are "train-<row index>".
  """
  lines = self._read_tsv(
      os.path.join(data_dir, "multinli",
                   "multinli.train.%s.tsv" % self.language))
  # These two label constants do not depend on the row, so convert them once
  # instead of on every loop iteration.
  contradictory = tokenization.convert_to_unicode("contradictory")
  contradiction = tokenization.convert_to_unicode("contradiction")
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      # The first row is skipped (presumably the TSV header).
      continue
    guid = "train-%d" % (i)
    text_a = tokenization.convert_to_unicode(line[0])
    text_b = tokenization.convert_to_unicode(line[1])
    label = tokenization.convert_to_unicode(line[2])
    if label == contradictory:
      label = contradiction
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples

开发者ID:ZhangShiyue，项目名称:QGforQA，代码行数:20，代码来源:run_classifier.py

示例4: get_dev_examples

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import convert_to_unicode [as 别名]
def get_dev_examples(self, data_dir):
  """Build dev `InputExample`s for `self.language` from `xnli.dev.tsv`.

  Reads `<data_dir>/xnli.dev.tsv` through `self._read_tsv`, skips the first
  row, and keeps only rows whose column 0 equals `self.language` (both sides
  converted to unicode). Columns 6, 7 and 1 supply `text_a`, `text_b` and
  `label`.

  Args:
    data_dir: Directory that contains `xnli.dev.tsv`.

  Returns:
    A list of `InputExample` whose guids are "dev-<row index>".
  """
  rows = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
  examples = []
  for (row_num, row) in enumerate(rows):
    if row_num == 0:
      # The first row is skipped (presumably the TSV header).
      continue
    row_language = tokenization.convert_to_unicode(row[0])
    wanted_language = tokenization.convert_to_unicode(self.language)
    if row_language == wanted_language:
      examples.append(
          InputExample(
              guid="dev-%d" % (row_num),
              text_a=tokenization.convert_to_unicode(row[6]),
              text_b=tokenization.convert_to_unicode(row[7]),
              label=tokenization.convert_to_unicode(row[1])))
  return examples

开发者ID:ZhangShiyue，项目名称:QGforQA，代码行数:19，代码来源:run_classifier.py

示例5: _create_examples

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import convert_to_unicode [as 别名]
def _create_examples(self, lines, set_type):
  """Turn parsed rows into `InputExample`s for the given split.

  The first row is skipped. For every other row, column 0 becomes part of
  the guid, columns 8 and 9 become `text_a` and `text_b`, and the last
  column is the label. For the "test" split the label is fixed to
  "contradiction" and the last column is not read.

  Args:
    lines: Iterable of rows, each an indexable sequence of column values.
    set_type: Split name; used as the guid prefix and compared with "test".

  Returns:
    A list of `InputExample` with guids "<set_type>-<column 0>".
  """
  examples = []
  numbered_rows = enumerate(lines)
  # Consume the first row (presumably the header) before the main loop.
  next(numbered_rows, None)
  for (_, row) in numbered_rows:
    guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(row[0]))
    text_a = tokenization.convert_to_unicode(row[8])
    text_b = tokenization.convert_to_unicode(row[9])
    label = ("contradiction" if set_type == "test"
             else tokenization.convert_to_unicode(row[-1]))
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples

开发者ID:ZhangShiyue，项目名称:QGforQA，代码行数:18，代码来源:run_classifier.py

示例6: _create_examples

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import convert_to_unicode [as 别名]
def _create_examples(self, lines, set_type):
  """Turn parsed rows into single-sentence `InputExample`s.

  The first row is scanned for a column whose stripped value is "sentence";
  the last such column (or column 0 when there is none) supplies `text_a`
  for every later row. The label is column 1, or the fixed string "true"
  for the "test" split. `text_b` is always None.

  Args:
    lines: Iterable of rows, each an indexable sequence of strings.
    set_type: Split name; used as the guid prefix and compared with "test".

  Returns:
    A list of `InputExample` with guids "<set_type>-<row index>".
  """
  examples = []
  sentence_index = 0
  for (i, line) in enumerate(lines):
    if i == 0:
      # Locate the "sentence" column; the last match wins.
      hits = [j for j, token in enumerate(line)
              if token.strip() == "sentence"]
      if hits:
        sentence_index = hits[-1]
      continue

    guid = "%s-%s" % (set_type, i)
    text_a = tokenization.convert_to_unicode(line[sentence_index])
    if set_type == "test":
      label = "true"
    else:
      label = tokenization.convert_to_unicode(line[1])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
  return examples

开发者ID:google-research，项目名称:language，代码行数:25，代码来源:run_classifier_membership.py

示例7: _create_examples

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import convert_to_unicode [as 别名]
def _create_examples(self, lines, set_type):
  """Turn parsed rows into `InputExample`s with two-element float labels.

  The first row is scanned for a column whose stripped value is "sentence";
  the last such column (or column 0 when there is none) supplies `text_a`
  for every later row. The label is `[float(col 2), float(col 3)]`, or the
  fixed list `[1.0, 0]` for the "test" split. `text_b` is always None.

  Args:
    lines: Iterable of rows, each an indexable sequence of strings.
    set_type: Split name; used as the guid prefix and compared with "test".

  Returns:
    A list of `InputExample` with guids "<set_type>-<row index>".
  """
  is_test = set_type == "test"
  examples = []
  sentence_index = 0
  for (row_num, row) in enumerate(lines):
    if row_num == 0:
      # Find which column holds the sentence text; the last match wins.
      for col, name in enumerate(row):
        if name.strip() == "sentence":
          sentence_index = col
      continue

    guid = "%s-%s" % (set_type, row_num)
    text_a = tokenization.convert_to_unicode(row[sentence_index])
    label = [1.0, 0] if is_test else [float(row[2]), float(row[3])]
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
  return examples

开发者ID:google-research，项目名称:language，代码行数:25，代码来源:run_classifier_distillation.py

示例8: _create_examples

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import convert_to_unicode [as 别名]
def _create_examples(self, lines, set_type):
  """Turn parsed rows into sentence-pair `InputExample`s.

  The first row is skipped. For every other row, columns 3 and 4 become
  `text_a` and `text_b`, and column 0 is the label. For the "test" split the
  label is fixed to "0" and column 0 is not read.

  Args:
    lines: Iterable of rows, each an indexable sequence of column values.
    set_type: Split name; used as the guid prefix and compared with "test".

  Returns:
    A list of `InputExample` with guids "<set_type>-<row index>".
  """
  examples = []
  numbered_rows = enumerate(lines)
  # Consume the first row (presumably the header) before the main loop.
  next(numbered_rows, None)
  for (row_num, row) in numbered_rows:
    guid = "%s-%s" % (set_type, row_num)
    text_a = tokenization.convert_to_unicode(row[3])
    text_b = tokenization.convert_to_unicode(row[4])
    label = "0" if set_type == "test" else tokenization.convert_to_unicode(
        row[0])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples

开发者ID:google-research，项目名称:language，代码行数:18，代码来源:run_classifier.py

示例9: _create_examples

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import convert_to_unicode [as 别名]
def _create_examples(self, lines, set_type):
  """Turn parsed rows into `InputExample`s, reporting rows that fail.

  Prints the number of rows, skips the first one, then reads columns 0, 1
  and 2 of each remaining row as `text_a`, `text_b` and `label`. Any
  exception raised while converting a row or building its example is caught;
  the row index and row are printed and the row is left out of the result.

  Args:
    lines: Sized sequence of rows (its `len()` is printed).
    set_type: Split name; used as the guid prefix.

  Returns:
    A list of `InputExample` with guids "<set_type>-<row index>" for the
    rows that converted without error.
  """
  examples = []
  print("length of lines:", len(lines))
  numbered_rows = enumerate(lines)
  # Consume the first row (presumably the header) before the main loop.
  next(numbered_rows, None)
  for (row_num, row) in numbered_rows:
    guid = "%s-%s" % (set_type, row_num)
    try:
      # The label is read first, as in the original column access order.
      label = tokenization.convert_to_unicode(row[2])
      example = InputExample(
          guid=guid,
          text_a=tokenization.convert_to_unicode(row[0]),
          text_b=tokenization.convert_to_unicode(row[1]),
          label=label)
      examples.append(example)
    except Exception:  # pylint: disable=broad-except
      # Best effort: report the offending row and carry on with the rest.
      print("###error.i:", row_num, row)
  return examples

开发者ID:google-research，项目名称:language，代码行数:19，代码来源:classifier_utils.py

示例10: get_train_examples

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import convert_to_unicode [as 别名]
def get_train_examples(self, data_dir):
    """Build training `InputExample`s from the per-language MultiNLI TSV.

    Reads `<data_dir>/multinli/multinli.train.<self.language>.tsv` through
    `self._read_tsv`, skips the first row, and uses columns 0, 1 and 2 of
    every other row as `text_a`, `text_b` and `label`. A label of
    "contradictory" is rewritten to "contradiction".

    Args:
        data_dir: Directory that contains the `multinli` sub-directory.

    Returns:
        A list of `InputExample` whose guids are "train-<row index>".
    """
    lines = self._read_tsv(
        os.path.join(data_dir, "multinli",
                     "multinli.train.%s.tsv" % self.language))
    # These two label constants do not depend on the row, so convert them
    # once instead of on every loop iteration.
    contradictory = tokenization.convert_to_unicode("contradictory")
    contradiction = tokenization.convert_to_unicode("contradiction")
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            # The first row is skipped (presumably the TSV header).
            continue
        guid = "train-%d" % (i)
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[2])
        if label == contradictory:
            label = contradiction
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

开发者ID:sliderSun，项目名称:pynlp，代码行数:20，代码来源:run_text_classification.py

示例11: get_dev_examples

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import convert_to_unicode [as 别名]
def get_dev_examples(self, data_dir):
    """Build dev `InputExample`s for `self.language` from `xnli.dev.tsv`.

    Reads `<data_dir>/xnli.dev.tsv` through `self._read_tsv`, skips the
    first row, and keeps only rows whose column 0 equals `self.language`
    (both sides converted to unicode). Columns 6, 7 and 1 supply `text_a`,
    `text_b` and `label`.

    Args:
        data_dir: Directory that contains `xnli.dev.tsv`.

    Returns:
        A list of `InputExample` whose guids are "dev-<row index>".
    """
    rows = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
    examples = []
    for (row_num, row) in enumerate(rows):
        if row_num == 0:
            # The first row is skipped (presumably the TSV header).
            continue
        row_language = tokenization.convert_to_unicode(row[0])
        wanted_language = tokenization.convert_to_unicode(self.language)
        if row_language == wanted_language:
            examples.append(
                InputExample(
                    guid="dev-%d" % (row_num),
                    text_a=tokenization.convert_to_unicode(row[6]),
                    text_b=tokenization.convert_to_unicode(row[7]),
                    label=tokenization.convert_to_unicode(row[1])))
    return examples

开发者ID:sliderSun，项目名称:pynlp，代码行数:19，代码来源:run_text_classification.py

示例12: _create_examples

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import convert_to_unicode [as 别名]
def _create_examples(self, lines, set_type):
    """Turn parsed rows into sentence-pair `InputExample`s.

    The first row is skipped. For every other row, columns 3 and 4 become
    `text_a` and `text_b`, and column 0 is the label. For the "test" split
    the label is fixed to "0" and column 0 is not read.

    Args:
        lines: Iterable of rows, each an indexable sequence of column values.
        set_type: Split name; used as the guid prefix and compared with
            "test".

    Returns:
        A list of `InputExample` with guids "<set_type>-<row index>".
    """
    is_test = set_type == "test"
    examples = []
    for (row_num, row) in enumerate(lines):
        if row_num == 0:
            # The first row is skipped (presumably the header).
            continue
        guid = "%s-%s" % (set_type, row_num)
        text_a = tokenization.convert_to_unicode(row[3])
        text_b = tokenization.convert_to_unicode(row[4])
        label = "0" if is_test else tokenization.convert_to_unicode(row[0])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

开发者ID:sliderSun，项目名称:pynlp，代码行数:18，代码来源:run_text_classification.py

注：本文中的bert.tokenization.convert_to_unicode方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。