當前位置: 首頁>>代碼示例>>Python>>正文


Python word_splitter.JustSpacesWordSplitter方法代碼示例

本文整理匯總了Python中allennlp.data.tokenizers.word_splitter.JustSpacesWordSplitter方法的典型用法代碼示例。如果您正苦於以下問題:Python word_splitter.JustSpacesWordSplitter方法的具體用法?Python word_splitter.JustSpacesWordSplitter怎麽用?Python word_splitter.JustSpacesWordSplitter使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在allennlp.data.tokenizers.word_splitter的用法示例。


在下文中一共展示了word_splitter.JustSpacesWordSplitter方法的5個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: __init__

# 需要導入模塊: from allennlp.data.tokenizers import word_splitter [as 別名]
# 或者: from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter [as 別名]
def __init__(self, lazy: bool = False,
                 max_bag_size: int = 25,
                 negative_exampels_percentage: int = 100,
                 with_direct_supervision: bool = True) -> None:
        """Store the reader options and set up tokenization and bookkeeping state.

        Args:
            lazy: lazy reading of the dataset; forwarded to the parent constructor.
            max_bag_size: maximum number of sentences per bag.
            negative_exampels_percentage: percentage of negative examples to keep
                (the parameter keeps its existing spelling for compatibility).
            with_direct_supervision: keep or ignore direct supervision examples.
        """
        super().__init__(lazy=lazy)

        # Reader options are stored unchanged on the instance.
        self.with_direct_supervision = with_direct_supervision
        self.negative_exampels_percentage = negative_exampels_percentage
        self.max_bag_size = max_bag_size

        # Word tokenizer built on JustSpacesWordSplitter; tokens are indexed
        # under the "tokens" key.
        space_splitter = JustSpacesWordSplitter()
        self._tokenizer = WordTokenizer(word_splitter=space_splitter)
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}

        # Bookkeeping used for logging and input validation.
        self._failed_mentions_count: int = 0  # mentions with wrong formatting
        self._count_direct_supervised_inst: int = 0
        self._pairs: Set = set()  # pairs of entities seen so far
        self._inst_counts: Dict = defaultdict(int)  # instances per relation type
        self._bag_sizes: Dict = defaultdict(int)  # relation types per bag (per the original note)
        # NOTE(review): presumably counts co-occurring relation types -- confirm against usage.
        self._relation_coocur: Dict = defaultdict(int)
        self._count_bag_labels: Dict = defaultdict(int)
開發者ID:allenai,項目名稱:comb_dist_direct_relex,代碼行數:29,代碼來源:relation_instances_reader.py

示例2: __init__

# 需要導入模塊: from allennlp.data.tokenizers import word_splitter [as 別名]
# 或者: from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter [as 別名]
def __init__(self, lazy=False, tokenizer=None, token_indexers=None):
        """Initialise the reader with optional tokenizer and token indexers.

        Args:
            lazy: passed positionally to the parent constructor.
            tokenizer: tokenizer to use; when falsy, a ``WordTokenizer`` built
                on ``JustSpacesWordSplitter`` is created instead.
            token_indexers: indexer mapping to use; when falsy, a single
                ``SingleIdTokenIndexer`` under the key ``"tokens"`` is used.
        """
        super(QuoraParaphraseDatasetReader, self).__init__(lazy)

        # Any falsy argument (not only None) triggers the default below.
        if not tokenizer:
            tokenizer = WordTokenizer(JustSpacesWordSplitter())
        self._tokenizer = tokenizer

        if not token_indexers:
            token_indexers = {u"tokens": SingleIdTokenIndexer()}
        self._token_indexers = token_indexers

    #overrides 
開發者ID:plasticityai,項目名稱:magnitude,代碼行數:11,代碼來源:quora_paraphrase.py

示例3: multiprocess_training_loader

# 需要導入模塊: from allennlp.data.tokenizers import word_splitter [as 別名]
# 或者: from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter [as 別名]
def multiprocess_training_loader(process_number: int, _config, _queue: mp.Queue,
                                 _wait_for_exit: mp.Event, _local_file,
                                 _fasttext_vocab_cached_mapping,
                                 _fasttext_vocab_cached_data):
    """Read training triples from ``_local_file`` and put batches on ``_queue``.

    Args:
        process_number: index of this worker; not used inside the function.
        _config: mapping indexed with the keys "preprocessed_tokenized",
            "token_embedder_type", "max_doc_length", "max_query_length" and
            "batch_size_train", plus "vocab_directory" (embedding) or
            "fasttext_max_subwords" (fasttext).
        _queue: queue that receives every produced batch; closed at the end.
        _wait_for_exit: event this function blocks on after closing the queue.
        _local_file: handed to the dataset reader's ``read``.
        _fasttext_vocab_cached_mapping: passed to ``FastTextVocab`` (fasttext only).
        _fasttext_vocab_cached_data: passed to ``FastTextVocab`` (fasttext only).

    Raises:
        ValueError: if ``_config["token_embedder_type"]`` is not one of
            "embedding", "fasttext" or "elmo".
    """
    # workflow: we tokenize the data files with the costly spacy before training in a preprocessing step
    # (and concat the tokens with single whitespaces), so here we only split on the whitespaces
    _tokenizer = None
    # Kept as an equality test so that only True (or 1) enables the tokenizer.
    if _config["preprocessed_tokenized"] == True:
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    _embedder_type = _config["token_embedder_type"]
    if _embedder_type == "embedding":
        _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])

    elif _embedder_type == "fasttext":
        _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping,
                               _fasttext_vocab_cached_data,
                               _config["fasttext_max_subwords"])

    elif _embedder_type == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None  # the iterator below is indexed with None in this case

    else:
        # Fail early with a clear message instead of an UnboundLocalError
        # on _token_indexers / _vocab further down.
        raise ValueError(
            "unknown token_embedder_type {!r}; expected 'embedding', 'fasttext' or 'elmo'".format(_embedder_type))

    _triple_loader = IrTripleDatasetReader(lazy=True, tokenizer=_tokenizer,
                                           token_indexers=_token_indexers,
                                           max_doc_length=_config["max_doc_length"],
                                           max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                               sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")])

    _iterator.index_with(_vocab)

    for training_batch in _iterator(_triple_loader.read(_local_file), num_epochs=1):

        _queue.put(training_batch)  # this moves the tensors in to shared memory

    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore

#
# validation instance generator
#   - filling the _queue with ready to run validation batches
#   - everything is defined thread local
# 
開發者ID:sebastian-hofstaetter,項目名稱:sigir19-neural-ir,代碼行數:41,代碼來源:multiprocess_input_pipeline.py

示例4: multiprocess_validation_loader

# 需要導入模塊: from allennlp.data.tokenizers import word_splitter [as 別名]
# 或者: from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter [as 別名]
def multiprocess_validation_loader(process_number: int, _config, _queue: mp.Queue,
                                   _wait_for_exit: mp.Event, _local_file,
                                   _fasttext_vocab_cached_mapping,
                                   _fasttext_vocab_cached_data):
    """Read labeled tuples from ``_local_file`` and put batches on ``_queue``.

    Args:
        process_number: index of this worker; not used inside the function.
        _config: mapping indexed with the keys "preprocessed_tokenized",
            "token_embedder_type", "max_doc_length", "max_query_length" and
            "batch_size_eval", plus "vocab_directory" (embedding) or
            "fasttext_max_subwords" (fasttext).
        _queue: queue that receives every produced batch; closed at the end.
        _wait_for_exit: event this function blocks on after closing the queue.
        _local_file: handed to the dataset reader's ``read``.
        _fasttext_vocab_cached_mapping: passed to ``FastTextVocab`` (fasttext only).
        _fasttext_vocab_cached_data: passed to ``FastTextVocab`` (fasttext only).

    Raises:
        ValueError: if ``_config["token_embedder_type"]`` is not one of
            "embedding", "fasttext" or "elmo".
    """
    # workflow: we tokenize the data files with the costly spacy before training in a preprocessing step
    # (and concat the tokens with single whitespaces), so here we only split on the whitespaces
    _tokenizer = None
    # A falsy _config short-circuits here, but the lookups below still need a
    # mapping. The equality test means only True (or 1) enables the tokenizer.
    if _config and _config["preprocessed_tokenized"] == True:
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    _embedder_type = _config["token_embedder_type"]
    if _embedder_type == "embedding":
        _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])

    elif _embedder_type == "fasttext":
        _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping,
                               _fasttext_vocab_cached_data,
                               _config["fasttext_max_subwords"])

    elif _embedder_type == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None  # the iterator below is indexed with None in this case

    else:
        # Fail early with a clear message instead of an UnboundLocalError
        # on _token_indexers / _vocab further down.
        raise ValueError(
            "unknown token_embedder_type {!r}; expected 'embedding', 'fasttext' or 'elmo'".format(_embedder_type))

    _tuple_loader = IrLabeledTupleDatasetReader(lazy=True, tokenizer=_tokenizer,
                                                token_indexers=_token_indexers,
                                                max_doc_length=_config["max_doc_length"],
                                                max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                               sorting_keys=[("doc_tokens", "num_tokens"), ("query_tokens", "num_tokens")])

    _iterator.index_with(_vocab)

    for validation_batch in _iterator(_tuple_loader.read(_local_file), num_epochs=1):

        _queue.put(validation_batch)  # this moves the tensors in to shared memory

    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore
開發者ID:sebastian-hofstaetter,項目名稱:sigir19-neural-ir,代碼行數:35,代碼來源:multiprocess_input_pipeline.py

示例5: setUp

# 需要導入模塊: from allennlp.data.tokenizers import word_splitter [as 別名]
# 或者: from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter [as 別名]
def setUp(self):
        """Create the vocabulary, the entity indexer and two field dictionaries.

        After this runs the test case exposes ``self.vocab``,
        ``self.entity_indexer``, ``self.instance1_fields`` and
        ``self.instance2_fields``.
        """
        super(TestDictField, self).setUp()

        # Tokenizer handed to the entity indexer as its character tokenizer.
        space_tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

        self.vocab = Vocabulary()
        for entity_name in ("entity1", "entity2", "entity3"):
            self.vocab.add_token_to_namespace(entity_name, "entity")

        self.entity_indexer = {
            "entity": TokenCharactersIndexerTokenizer(
                "entity", character_tokenizer=space_tokenizer),
        }

        # First instance: a three-token sentence with two candidate spans.
        sentence1_field = TextField(
            [Token(word) for word in "The sentence .".split()],
            token_indexers={'tokens': SingleIdTokenIndexer()},
        )
        entities1 = TextField(
            [Token("entity1 entity2"), Token("entity_unk")],
            token_indexers=self.entity_indexer,
        )
        priors1 = ArrayField(np.array([[0.5, 0.5], [1.0, 0.0]]))
        spans1 = ListField([
            SpanField(0, 0, sentence1_field),
            SpanField(1, 2, sentence1_field),
        ])
        self.instance1_fields = {
            "candidate_entities": entities1,
            "candidate_entity_prior": priors1,
            "candidate_spans": spans1,
        }

        # Second instance: a two-token sentence with a single candidate span.
        sentence2_field = TextField(
            [Token(word) for word in "The sentence".split()],
            token_indexers={'tokens': SingleIdTokenIndexer()},
        )
        entities2 = TextField(
            [Token("entity1")],
            token_indexers=self.entity_indexer,
        )
        priors2 = ArrayField(np.array([[1.0]]))
        spans2 = ListField([SpanField(1, 1, sentence2_field)])
        self.instance2_fields = {
            "candidate_entities": entities2,
            "candidate_entity_prior": priors2,
            "candidate_spans": spans2,
        }
開發者ID:allenai,項目名稱:kb,代碼行數:47,代碼來源:test_dict_field.py


注:本文中的allennlp.data.tokenizers.word_splitter.JustSpacesWordSplitter方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。