当前位置: 首页>>代码示例>>Python>>正文


Python tokenizers.Token方法代码示例

本文整理汇总了Python中allennlp.data.tokenizers.Token类的典型用法代码示例。如果您正苦于以下问题:Python tokenizers.Token类的具体用法?Python tokenizers.Token怎么用?Python tokenizers.Token使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块allennlp.data.tokenizers的用法示例。


在下文中一共展示了tokenizers.Token方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: text_to_instance

# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def text_to_instance(self, context_tokens: List[Token], tokens: List[Token], tags: List[str] = None,
        intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
        """
        Build an ``Instance`` from `pre-tokenized` input (this class has no tokenizer).

        Fields produced:

        - ``context_tokens`` / ``tokens``: ``TextField`` objects indexed with
          ``self._token_indexers``.
        - ``metadata``: a ``MetadataField`` holding the token texts under ``"words"`` and
          ``dialog_act`` under ``"dialog_act"`` (an empty dict when ``dialog_act`` is None).
        - ``tags``: a ``SequenceLabelField`` over ``tokens``, only when ``tags`` is not None.
        - ``intents``: a ``MultiLabelField`` in the ``intent_labels`` namespace, only when
          ``intents`` is not None.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
        fields["tokens"] = TextField(tokens, self._token_indexers)
        # The metadata field is built exactly once. Previously a words-only version was
        # stored here and then always overwritten further down; building the final value
        # at this point keeps the key order of ``fields`` unchanged.
        fields["metadata"] = MetadataField({
            "words": [x.text for x in tokens],
            "dialog_act": dialog_act if dialog_act is not None else {},
        })
        if tags is not None:
            fields["tags"] = SequenceLabelField(tags, fields["tokens"])
        if intents is not None:
            fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
        return Instance(fields)
开发者ID:ConvLab,项目名称:ConvLab,代码行数:23,代码来源:dataset_reader.py

示例2: text_to_instance

# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
        intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
        """
        Turn `pre-tokenized` input into an ``Instance`` (no tokenizer lives in this class).

        ``tags``, ``domain`` and ``intent`` each add a field only when truthy. The
        ``metadata`` field is always present and carries the token texts plus
        ``dialog_act`` (an empty dict when ``dialog_act`` is None).
        """
        # pylint: disable=arguments-differ
        token_field = TextField(tokens, self._token_indexers)
        fields: Dict[str, Field] = {"tokens": token_field}

        if tags:
            fields["tags"] = SequenceLabelField(tags, token_field)
        if domain:
            fields["domain"] = LabelField(domain, label_namespace="domain_labels")
        if intent:
            fields["intent"] = LabelField(intent, label_namespace="intent_labels")

        words = [token.text for token in tokens]
        act = {} if dialog_act is None else dialog_act
        fields["metadata"] = MetadataField({"words": words, "dialog_act": act})
        return Instance(fields)
开发者ID:ConvLab,项目名称:ConvLab,代码行数:23,代码来源:dataset_reader.py

示例3: _read

# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def _read(self, file_path: str) -> Iterable[Instance]:
        """
        Yield one instance per sentence chunk of the file at ``file_path``.

        Lines are grouped with ``_is_divider``; divider groups are skipped. Each remaining
        line is whitespace-split into exactly four columns, of which the first (token) and
        the last (tag) are used.
        """
        # A URL is resolved to its locally cached copy first.
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)

            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                if is_divider:
                    # Divider chunks carry no sentence content.
                    continue
                rows = [line.strip().split() for line in lines]
                # Transpose rows into columns; zip yields tuples, Fields want lists.
                columns = [list(column) for column in zip(*rows)]
                words, _, _, pico_tags = columns
                # TextField requires ``Token`` objects
                tokens = [Token(word) for word in words]
                yield self.text_to_instance(tokens, pico_tags)
开发者ID:allenai,项目名称:scibert,代码行数:22,代码来源:ebmnlp.py

示例4: preprocess

# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def preprocess(self, token_batch):
        """
        Build one indexed ``Batch`` per indexer in ``self.indexers``.

        Every sequence is cut to the length of the longest non-empty sequence (capped at
        ``self.max_len``) and prefixed with a ``'$START'`` token. Returns an empty list
        when the batch contains no non-empty sequence.
        """
        lengths = [len(sequence) for sequence in token_batch if sequence]
        if len(lengths) == 0:
            return []

        cutoff = min(max(lengths), self.max_len)
        batches = []
        for indexer in self.indexers:
            instances = []
            for sequence in token_batch:
                words = ['$START'] + sequence[:cutoff]
                text_field = TextField([Token(word) for word in words], indexer)
                instances.append(Instance({'tokens': text_field}))
            indexed = Batch(instances)
            indexed.index_instances(self.vocab)
            batches.append(indexed)
        return batches
开发者ID:plkmo,项目名称:NLP_Toolkit,代码行数:19,代码来源:gec_model.py

示例5: _read

# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def _read(self, file_path):
        """
        Yield one instance per non-blank line of the file at ``file_path``.

        A line is split on ``self._token_delimiter`` into word/tag pairs, and each pair is
        split on its last ``self._word_tag_delimiter`` into the word and its tag.
        """
        # A URL is resolved to its locally cached copy first.
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)

            for raw_line in data_file:
                line = raw_line.strip("\n")
                if line:
                    # rsplit with maxsplit=1 keeps delimiters inside the word intact.
                    pairs = [
                        chunk.rsplit(self._word_tag_delimiter, 1)
                        for chunk in line.split(self._token_delimiter)
                    ]
                    tokens = [Token(word) for word, _ in pairs]
                    tags = [label for _, label in pairs]
                    yield self.text_to_instance(tokens, tags)
开发者ID:allenai,项目名称:allennlp,代码行数:23,代码来源:sequence_tagging.py

示例6: _read

# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def _read(self, file_path: str) -> Iterable[Instance]:
        """
        Yield one instance per sentence chunk of the file at ``file_path``.

        Lines are grouped with ``_is_divider``; divider groups are skipped. Each remaining
        line is whitespace-split into exactly four columns: token, POS tag, chunk tag and
        NER tag.
        """
        # A URL is resolved to its locally cached copy first.
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)

            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                if is_divider:
                    # Divider chunks carry no sentence content.
                    continue
                rows = [line.strip().split() for line in lines]
                # Transpose rows into columns; zip yields tuples, Fields want lists.
                columns = [list(column) for column in zip(*rows)]
                words, pos_tags, chunk_tags, ner_tags = columns
                # TextField requires `Token` objects
                tokens = [Token(word) for word in words]
                yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags)
开发者ID:allenai,项目名称:allennlp,代码行数:22,代码来源:conll2003.py

示例7: _read

# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def _read(self, file_path):
        """
        Yield one instance per non-blank line of the file at ``file_path``.

        A line is split on ``self._token_delimiter`` into word/tag pairs, and each pair is
        split on its last ``self._word_tag_delimiter`` into the word and its tag.
        """
        # A URL is resolved to its locally cached copy first.
        file_path = cached_path(file_path)

        with open(file_path, u"r") as data_file:
            logger.info(u"Reading instances from lines in file at: %s", file_path)

            for raw_line in data_file:
                line = raw_line.strip(u"\n")
                if line:
                    # rsplit with maxsplit=1 keeps delimiters inside the word intact.
                    pairs = [chunk.rsplit(self._word_tag_delimiter, 1)
                             for chunk in line.split(self._token_delimiter)]
                    tokens = [Token(word) for word, _ in pairs]
                    tags = [label for _, label in pairs]
                    yield self.text_to_instance(tokens, tags)
开发者ID:plasticityai,项目名称:magnitude,代码行数:21,代码来源:sequence_tagging.py

示例8: _read

# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def _read(self, file_path):
        """
        Yield one instance per sentence chunk of the file at ``file_path``.

        Lines are grouped with ``_is_divider``; divider groups are skipped. Each remaining
        line is whitespace-split into exactly four columns: token, POS tag, chunk tag and
        NER tag.
        """
        # A URL is resolved to its locally cached copy first.
        file_path = cached_path(file_path)

        with open(file_path, u"r") as data_file:
            logger.info(u"Reading instances from lines in file at: %s", file_path)

            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                if is_divider:
                    # Divider chunks carry no sentence content.
                    continue
                rows = [line.strip().split() for line in lines]
                # Transpose rows into columns; izip yields tuples, Fields want lists.
                words, pos_tags, chunk_tags, ner_tags = map(list, izip(*rows))
                # TextField requires ``Token`` objects
                tokens = [Token(word) for word in words]
                yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags)
开发者ID:plasticityai,项目名称:magnitude,代码行数:21,代码来源:conll2003.py

示例9: _read

# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def _read(self, file_path):
        """
        Yield SRL instances for the sentences selected by ``self._ontonotes_subset``.

        A sentence without SRL frames yields a single instance whose tags are all ``"O"``
        and whose verb indicator is all zeros. Otherwise one instance is yielded per
        frame, with the verb indicator set to 1 where the tag ends in ``"-V"``.
        """
        # A URL is resolved to its locally cached copy first.
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info(u"Reading SRL instances from dataset files at: %s", file_path)
        domain = self._domain_identifier
        if domain is not None:
            logger.info(u"Filtering to only include file paths containing the %s domain", domain)

        sentences = self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier)
        for sentence in sentences:
            tokens = [Token(word) for word in sentence.words]
            if sentence.srl_frames:
                for _, frame_tags in sentence.srl_frames:
                    verb_indicator = [int(label[-2:] == u"-V") for label in frame_tags]
                    yield self.text_to_instance(tokens, verb_indicator, frame_tags)
            else:
                # Sentence contains no predicates.
                outside_tags = [u"O"] * len(tokens)
                no_verbs = [0] * len(tokens)
                yield self.text_to_instance(tokens, no_verbs, outside_tags)
开发者ID:plasticityai,项目名称:magnitude,代码行数:21,代码来源:semantic_role_labeling.py

示例10: test_world_adds_numbers_from_question

# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def test_world_adds_numbers_from_question(self):
        """Numbers mentioned in the question must show up as valid ``n`` actions."""
        words = [u'what', u'2007', u'2,107', u'0.2', u'1800s', u'1950s', u'?']
        question_tokens = [Token(word) for word in words]
        table_path = self.FIXTURES_ROOT / u"data" / u"wikitables" / u"sample_table.tsv"
        table_kg = TableQuestionKnowledgeGraph.read_from_file(table_path, question_tokens)
        valid_actions = WikiTablesWorld(table_kg).get_valid_actions()

        expected_actions = (
            u'n -> 2007',
            u'n -> 2107',
            # It appears that sempre normalizes floating point numbers.
            u'n -> 0.200',
            # We want to add the end-points to things like "1800s": 1800 and 1900.
            u'n -> 1800',
            u'n -> 1900',
            u'n -> 1950',
            u'n -> 1960',
        )
        for action in expected_actions:
            assert action in valid_actions[u'n']
开发者ID:plasticityai,项目名称:magnitude,代码行数:19,代码来源:wikitables_world_test.py

示例11: text_to_instance

# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def text_to_instance(self, tokens: List[Token], tags: List[str]=None) -> Instance:
        """
        Build an ``Instance`` with a ``tokens`` field and, when ``tags`` is truthy, a
        ``tags`` field.

        Inputs longer than ``self._max_token_len`` are truncated to that length (tokens
        and tags alike) and a notice is printed.
        """
        too_long = len(tokens) > self._max_token_len
        if too_long:
            tokens = tokens[:self._max_token_len]
            print(f'Length of tokens exceeded the limit {self._max_token_len}. Truncating...')
            if tags:
                tags = tags[:self._max_token_len]

        text_field = TextField(tokens, self._token_indexers)
        fields = {'tokens': text_field}
        if tags:
            fields['tags'] = SequenceLabelField(tags, text_field)
        return Instance(fields)
开发者ID:mhagiwara,项目名称:nanigonet,代码行数:18,代码来源:dataset_reader.py

示例12: split_tokens_by_hyphen

# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def split_tokens_by_hyphen(tokens: List[Token]) -> List[Token]:
    """
    Split every token that contains a hyphen-like character into smaller tokens.

    The characters ``-``, ``–`` and ``~`` are handled one after another, so a token
    containing several of them is split on each in turn. Tokens without any of these
    characters are passed through unchanged.

    The actual splitting is delegated to ``split_token_by_delimiter`` (defined elsewhere;
    presumably it returns the pieces of the token as a list of ``Token`` objects -
    confirm against its definition).
    """
    hyphens = ["-", "–", "~"]
    new_tokens: List[Token] = []

    for token in tokens:
        if not any(hyphen in token.text for hyphen in hyphens):
            new_tokens.append(token)
            continue

        pieces: List[Token] = [token]
        for hyphen in hyphens:
            next_pieces: List[Token] = []
            for piece in pieces:
                # Test the piece being processed, not the original token: a piece that
                # no longer contains this hyphen is kept as-is instead of being handed
                # to the splitter again.
                if hyphen in piece.text:
                    next_pieces += split_token_by_delimiter(piece, hyphen)
                else:
                    next_pieces.append(piece)
            pieces = next_pieces
        new_tokens += pieces

    return new_tokens
开发者ID:huminghao16,项目名称:MTMSN,代码行数:22,代码来源:drop_utils.py

示例13: get_strings_from_utterance

# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Map database strings to the indices of the utterance tokens that trigger them.

    Lower-cased unigrams, bigrams and trigrams of the utterance are looked up in
    ``atis_tables.ATIS_TRIGGER_DICT``; every match records the indices of all tokens in
    the matching n-gram. A trigram whose first word is ``"st"`` is looked up as
    ``"st. <third word>"``.
    """
    trigger_dict = atis_tables.ATIS_TRIGGER_DICT
    words = [token.text for token in tokenized_utterance]
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for position, word in enumerate(words):
        for string in trigger_dict.get(word.lower(), []):
            string_linking_scores[string].append(position)

    for position, word_pair in enumerate(bigrams(words)):
        pair_key = " ".join(word_pair).lower()
        for string in trigger_dict.get(pair_key, []):
            string_linking_scores[string].extend([position, position + 1])

    for position, word_triple in enumerate(ngrams(words, 3)):
        triple_key = (
            f"st. {word_triple[2]}".lower()
            if word_triple[0] == "st"
            else " ".join(word_triple).lower()
        )
        for string in trigger_dict.get(triple_key, []):
            string_linking_scores[string].extend([position, position + 1, position + 2])

    return string_linking_scores
开发者ID:allenai,项目名称:allennlp-semparse,代码行数:27,代码来源:atis_world.py

示例14: get_time_range_end_from_utterance

# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def get_time_range_end_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    """
    Map time-range-end values (as strings) to the token indices that mention them.

    Single tokens and bigrams are looked up in ``TIME_RANGE_END_DICT``. A single-token
    match is ignored when the token directly before it is ``"early"``; bigram matches
    record both token indices. ``utterance`` itself is not used.
    """
    words = [token.text for token in tokenized_utterance]
    early_positions = {position for position, word in enumerate(words) if word == "early"}

    linking: Dict[str, List[int]] = defaultdict(list)
    for position, word in enumerate(words):
        follows_early = (position - 1) in early_positions
        for time in TIME_RANGE_END_DICT.get(word, []):
            if not follows_early:
                linking[str(time)].append(position)

    for position, word_pair in enumerate(ngrams(words, 2)):
        for time in TIME_RANGE_END_DICT.get(" ".join(word_pair), []):
            linking[str(time)].extend([position, position + 1])

    return linking
开发者ID:allenai,项目名称:allennlp-semparse,代码行数:21,代码来源:atis_tables.py

示例15: get_flight_numbers_from_utterance

# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def get_flight_numbers_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    """
    Map all-digit tokens that look like flight numbers to their token indices.

    A digit token is linked when the token before it is ``"flight"``, ``"number"``, an
    entry of ``AIRLINE_CODE_LIST`` (upper-cased) or a key of ``AIRLINE_CODES``
    (lower-cased), and also when the token after it is ``"flight"``. If both conditions
    hold, the index is recorded twice. ``utterance`` itself is not used.
    """
    may_precede_number = set()
    may_follow_number = set()
    for position, token in enumerate(tokenized_utterance):
        if (
            token.text in {"flight", "number"}
            or token.text.upper() in AIRLINE_CODE_LIST
            or token.text.lower() in AIRLINE_CODES.keys()
        ):
            may_precede_number.add(position)
        if token.text == "flight":
            may_follow_number.add(position)

    linking: Dict[str, List[int]] = defaultdict(list)
    for position, token in enumerate(tokenized_utterance):
        if not token.text.isdigit():
            continue
        if position - 1 in may_precede_number:
            linking[token.text].append(position)
        if position + 1 in may_follow_number:
            linking[token.text].append(position)
    return linking
开发者ID:allenai,项目名称:allennlp-semparse,代码行数:25,代码来源:atis_tables.py


注:本文中的allennlp.data.tokenizers.Token方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。