本文整理匯總了Python中bert.tokenization.convert_to_unicode方法的典型用法代碼示例。如果您正苦於以下問題:Python tokenization.convert_to_unicode方法的具體用法?Python tokenization.convert_to_unicode怎麽用?Python tokenization.convert_to_unicode使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類bert.tokenization
的用法示例。
在下文中一共展示了tokenization.convert_to_unicode方法的12個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: read_examples
# 需要導入模塊: from bert import tokenization [as 別名]
# 或者: from bert.tokenization import convert_to_unicode [as 別名]
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file.

    Each line becomes one example. A line of the form "A ||| B" is split
    into a sentence pair (text_a, text_b); any other line is a single
    sentence. Reading stops at end of file.
    """
    results = []
    next_id = 0
    with tf.gfile.GFile(input_file, "r") as reader:
        while True:
            raw = tokenization.convert_to_unicode(reader.readline())
            if not raw:
                break  # EOF: readline() returned an empty string
            stripped = raw.strip()
            match = re.match(r"^(.*) \|\|\| (.*)$", stripped)
            if match:
                first, second = match.group(1), match.group(2)
            else:
                first, second = stripped, None
            results.append(
                InputExample(unique_id=next_id, text_a=first, text_b=second))
            next_id += 1
    return results
示例2: read_examples
# 需要導入模塊: from bert import tokenization [as 別名]
# 或者: from bert.tokenization import convert_to_unicode [as 別名]
def read_examples(lst_strs):
    """Yield an `InputExample` for each non-empty string in `lst_strs`.

    Strings of the form "A ||| B" become sentence pairs; anything else
    becomes a single-sentence example. Empty strings are skipped and do
    not consume an id.
    """
    next_id = 0
    for raw in lst_strs:
        text = tokenization.convert_to_unicode(raw)
        if not text:
            continue
        text = text.strip()
        match = re.match(r"^(.*) \|\|\| (.*)$", text)
        if match:
            first, second = match.group(1), match.group(2)
        else:
            first, second = text, None
        yield InputExample(unique_id=next_id, text_a=first, text_b=second)
        next_id += 1
示例3: get_train_examples
# 需要導入模塊: from bert import tokenization [as 別名]
# 或者: from bert.tokenization import convert_to_unicode [as 別名]
def get_train_examples(self, data_dir):
    """See base class."""
    path = os.path.join(data_dir, "multinli",
                        "multinli.train.%s.tsv" % self.language)
    rows = self._read_tsv(path)
    # XNLI's MultiNLI training files use "contradictory" where the label
    # set expects "contradiction"; normalize it.
    contradictory = tokenization.convert_to_unicode("contradictory")
    examples = []
    for idx, row in enumerate(rows):
        if idx == 0:
            continue  # header row
        label = tokenization.convert_to_unicode(row[2])
        if label == contradictory:
            label = tokenization.convert_to_unicode("contradiction")
        examples.append(
            InputExample(
                guid="train-%d" % (idx),
                text_a=tokenization.convert_to_unicode(row[0]),
                text_b=tokenization.convert_to_unicode(row[1]),
                label=label))
    return examples
示例4: get_dev_examples
# 需要導入模塊: from bert import tokenization [as 別名]
# 或者: from bert.tokenization import convert_to_unicode [as 別名]
def get_dev_examples(self, data_dir):
    """See base class."""
    rows = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
    # The dev file mixes all XNLI languages; keep only ours.
    wanted = tokenization.convert_to_unicode(self.language)
    examples = []
    for idx, row in enumerate(rows):
        if idx == 0:
            continue  # header row
        if tokenization.convert_to_unicode(row[0]) != wanted:
            continue
        examples.append(
            InputExample(
                guid="dev-%d" % (idx),
                text_a=tokenization.convert_to_unicode(row[6]),
                text_b=tokenization.convert_to_unicode(row[7]),
                label=tokenization.convert_to_unicode(row[1])))
    return examples
示例5: _create_examples
# 需要導入模塊: from bert import tokenization [as 別名]
# 或者: from bert.tokenization import convert_to_unicode [as 別名]
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    is_test = set_type == "test"
    examples = []
    for idx, row in enumerate(lines):
        if idx == 0:
            continue  # header row
        # Test rows carry no gold label; a placeholder keeps the pipeline uniform.
        if is_test:
            label = "contradiction"
        else:
            label = tokenization.convert_to_unicode(row[-1])
        examples.append(
            InputExample(
                guid="%s-%s" % (set_type, tokenization.convert_to_unicode(row[0])),
                text_a=tokenization.convert_to_unicode(row[8]),
                text_b=tokenization.convert_to_unicode(row[9]),
                label=label))
    return examples
示例6: _create_examples
# 需要導入模塊: from bert import tokenization [as 別名]
# 或者: from bert.tokenization import convert_to_unicode [as 別名]
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    col = 0
    for idx, row in enumerate(lines):
        if idx == 0:
            # Header row: find which column is named "sentence".
            for pos, cell in enumerate(row):
                if cell.strip() == "sentence":
                    col = pos
            continue
        text = tokenization.convert_to_unicode(row[col])
        # Test rows have no gold label; use a fixed placeholder.
        if set_type == "test":
            label = "true"
        else:
            label = tokenization.convert_to_unicode(row[1])
        examples.append(
            InputExample(
                guid="%s-%s" % (set_type, idx),
                text_a=text,
                text_b=None,
                label=label))
    return examples
示例7: _create_examples
# 需要導入模塊: from bert import tokenization [as 別名]
# 或者: from bert.tokenization import convert_to_unicode [as 別名]
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    col = 0
    for idx, row in enumerate(lines):
        if idx == 0:
            # Header row: find which column is named "sentence".
            for pos, cell in enumerate(row):
                if cell.strip() == "sentence":
                    col = pos
            continue
        text = tokenization.convert_to_unicode(row[col])
        # Labels here are two-way soft probabilities; test rows get a
        # fixed placeholder distribution.
        if set_type == "test":
            label = [1.0, 0]
        else:
            label = [float(row[2]), float(row[3])]
        examples.append(
            InputExample(
                guid="%s-%s" % (set_type, idx),
                text_a=text,
                text_b=None,
                label=label))
    return examples
示例8: _create_examples
# 需要導入模塊: from bert import tokenization [as 別名]
# 或者: from bert.tokenization import convert_to_unicode [as 別名]
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for idx, row in enumerate(lines):
        if idx == 0:
            continue  # header row
        # Test rows carry no gold label; "0" is a placeholder.
        label = "0" if set_type == "test" else tokenization.convert_to_unicode(row[0])
        examples.append(
            InputExample(
                guid="%s-%s" % (set_type, idx),
                text_a=tokenization.convert_to_unicode(row[3]),
                text_b=tokenization.convert_to_unicode(row[4]),
                label=label))
    return examples
示例9: _create_examples
# 需要導入模塊: from bert import tokenization [as 別名]
# 或者: from bert.tokenization import convert_to_unicode [as 別名]
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    print("length of lines:", len(lines))
    for idx, row in enumerate(lines):
        if idx == 0:
            continue  # header row
        try:
            # Malformed rows (missing columns) are reported and skipped
            # rather than aborting the whole read.
            examples.append(
                InputExample(
                    guid="%s-%s" % (set_type, idx),
                    text_a=tokenization.convert_to_unicode(row[0]),
                    text_b=tokenization.convert_to_unicode(row[1]),
                    label=tokenization.convert_to_unicode(row[2])))
        except Exception:  # pylint: disable=broad-except
            print("###error.i:", idx, row)
    return examples
示例10: get_train_examples
# 需要導入模塊: from bert import tokenization [as 別名]
# 或者: from bert.tokenization import convert_to_unicode [as 別名]
def get_train_examples(self, data_dir):
    """See base class."""
    rows = self._read_tsv(
        os.path.join(data_dir, "multinli",
                     "multinli.train.%s.tsv" % self.language))
    examples = []
    for idx, row in enumerate(rows):
        if idx == 0:
            continue  # header row
        label = tokenization.convert_to_unicode(row[2])
        # Normalize the training file's "contradictory" spelling to the
        # canonical "contradiction" label.
        if label == tokenization.convert_to_unicode("contradictory"):
            label = tokenization.convert_to_unicode("contradiction")
        examples.append(
            InputExample(
                guid="train-%d" % (idx),
                text_a=tokenization.convert_to_unicode(row[0]),
                text_b=tokenization.convert_to_unicode(row[1]),
                label=label))
    return examples
示例11: get_dev_examples
# 需要導入模塊: from bert import tokenization [as 別名]
# 或者: from bert.tokenization import convert_to_unicode [as 別名]
def get_dev_examples(self, data_dir):
    """See base class."""
    rows = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
    examples = []
    for idx, row in enumerate(rows):
        if idx == 0:
            continue  # header row
        # The dev file contains every XNLI language; keep only ours.
        row_lang = tokenization.convert_to_unicode(row[0])
        if row_lang != tokenization.convert_to_unicode(self.language):
            continue
        examples.append(
            InputExample(
                guid="dev-%d" % (idx),
                text_a=tokenization.convert_to_unicode(row[6]),
                text_b=tokenization.convert_to_unicode(row[7]),
                label=tokenization.convert_to_unicode(row[1])))
    return examples
示例12: _create_examples
# 需要導入模塊: from bert import tokenization [as 別名]
# 或者: from bert.tokenization import convert_to_unicode [as 別名]
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    is_test = set_type == "test"
    examples = []
    for idx, row in enumerate(lines):
        if idx == 0:
            continue  # header row
        if is_test:
            # Test rows carry no gold label; "0" is a placeholder.
            label = "0"
        else:
            label = tokenization.convert_to_unicode(row[0])
        examples.append(
            InputExample(
                guid="%s-%s" % (set_type, idx),
                text_a=tokenization.convert_to_unicode(row[3]),
                text_b=tokenization.convert_to_unicode(row[4]),
                label=label))
    return examples