本文整理汇总了Python中allennlp.data.tokenizers.word_splitter.JustSpacesWordSplitter方法的典型用法代码示例。如果您正苦于以下问题:Python word_splitter.JustSpacesWordSplitter方法的具体用法?Python word_splitter.JustSpacesWordSplitter怎么用?Python word_splitter.JustSpacesWordSplitter使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块allennlp.data.tokenizers.word_splitter的用法示例。
在下文中一共展示了word_splitter.JustSpacesWordSplitter方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from allennlp.data.tokenizers import word_splitter [as 别名]
# 或者: from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter [as 别名]
def __init__(
        self,
        lazy: bool = False,
        max_bag_size: int = 25,
        negative_exampels_percentage: int = 100,
        with_direct_supervision: bool = True,
) -> None:
    """
    Store the reader configuration and initialise tokenization and bookkeeping state.

    args:
        lazy: lazy reading of the dataset (forwarded to the parent constructor)
        max_bag_size: maximum number of sentences per bag
        negative_exampels_percentage: percentage of negative examples to keep
        with_direct_supervision: keep or ignore direct supervision examples
    """
    super().__init__(lazy=lazy)

    # --- configuration supplied by the caller ---
    self.max_bag_size = max_bag_size
    self.negative_exampels_percentage = negative_exampels_percentage
    self.with_direct_supervision = with_direct_supervision

    # --- text processing: split on spaces only, one id per token ---
    self._tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
    self._token_indexers = {"tokens": SingleIdTokenIndexer()}

    # --- statistics used for logging and input validation ---
    # number of instances seen per relation type
    self._inst_counts: Dict = defaultdict(int)
    # entity pairs encountered so far
    self._pairs: Set = set()
    # number of relation types per bag
    self._bag_sizes: Dict = defaultdict(int)
    # NOTE(review): presumably co-occurrence counts of relation types -- confirm against usage
    self._relation_coocur: Dict = defaultdict(int)
    # number of mentions with wrong formatting
    self._failed_mentions_count: int = 0
    self._count_direct_supervised_inst: int = 0
    self._count_bag_labels: Dict = defaultdict(int)
示例2: __init__
# 需要导入模块: from allennlp.data.tokenizers import word_splitter [as 别名]
# 或者: from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter [as 别名]
def __init__(self, lazy=False, tokenizer=None, token_indexers=None):
    """
    Initialise the reader with an optional tokenizer and token indexers.

    Any falsy ``tokenizer`` / ``token_indexers`` argument (e.g. ``None``) is
    replaced by a default: a ``WordTokenizer`` that splits on spaces only,
    and a single ``"tokens"`` entry backed by ``SingleIdTokenIndexer``.
    """
    super(QuoraParaphraseDatasetReader, self).__init__(lazy)

    if tokenizer:
        self._tokenizer = tokenizer
    else:
        self._tokenizer = WordTokenizer(JustSpacesWordSplitter())

    if token_indexers:
        self._token_indexers = token_indexers
    else:
        self._token_indexers = {u"tokens": SingleIdTokenIndexer()}
#overrides
示例3: multiprocess_training_loader
# 需要导入模块: from allennlp.data.tokenizers import word_splitter [as 别名]
# 或者: from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter [as 别名]
def multiprocess_training_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event,
                                 _local_file, _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):
    """
    Read training triples from ``_local_file``, batch them and put every batch on ``_queue``.

    After the last batch the queue is closed and the function blocks on
    ``_wait_for_exit`` so that the process stays alive while its batches are
    still in use.

    Args:
        process_number: index of this loader process (not used in the body).
        _config: mapping with the keys ``preprocessed_tokenized``,
            ``token_embedder_type``, ``max_doc_length``, ``max_query_length``
            and ``batch_size_train``; depending on the embedder type also
            ``vocab_directory`` or ``fasttext_max_subwords``.
        _queue: queue that receives the ready-to-run training batches.
        _wait_for_exit: event this function waits on before returning.
        _local_file: path handed to the dataset reader's ``read``.
        _fasttext_vocab_cached_mapping: passed to ``FastTextVocab`` (fasttext only).
        _fasttext_vocab_cached_data: passed to ``FastTextVocab`` (fasttext only).

    Raises:
        ValueError: if ``_config["token_embedder_type"]`` is not one of
            ``"embedding"``, ``"fasttext"`` or ``"elmo"``.
    """
    # workflow: we tokenize the data files with the costly spacy before training in a preprocessing step
    # (and concat the tokens with single whitespaces), so here we only split on the whitespaces
    _tokenizer = None
    # explicit comparison kept on purpose: truthy non-boolean config values must not enable this path
    if _config["preprocessed_tokenized"] == True:  # noqa: E712
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    embedder_type = _config["token_embedder_type"]
    if embedder_type == "embedding":
        _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])
    elif embedder_type == "fasttext":
        _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data,
                               _config["fasttext_max_subwords"])
    elif embedder_type == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None
    else:
        # fail early with a clear message instead of hitting unbound
        # _token_indexers / _vocab further down
        raise ValueError("unsupported token_embedder_type: {!r} "
                         "(expected 'embedding', 'fasttext' or 'elmo')".format(embedder_type))

    _triple_loader = IrTripleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                           max_doc_length=_config["max_doc_length"],
                                           max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                               sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")])
    _iterator.index_with(_vocab)

    for training_batch in _iterator(_triple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors in to shared memory

    _queue.close()  # indicate this local process is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore
#
# validation instance generator
# - filling the _queue with ready to run validation batches
# - everything is defined thread local
#
示例4: multiprocess_validation_loader
# 需要导入模块: from allennlp.data.tokenizers import word_splitter [as 别名]
# 或者: from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter [as 别名]
def multiprocess_validation_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event,
                                   _local_file, _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):
    """
    Read labeled tuples from ``_local_file``, batch them and put every batch on ``_queue``.

    After the last batch the queue is closed and the function blocks on
    ``_wait_for_exit`` so that the process stays alive while its batches are
    still in use.

    Args:
        process_number: index of this loader process (not used in the body).
        _config: mapping with the keys ``preprocessed_tokenized``,
            ``token_embedder_type``, ``max_doc_length``, ``max_query_length``
            and ``batch_size_eval``; depending on the embedder type also
            ``vocab_directory`` or ``fasttext_max_subwords``.
        _queue: queue that receives the ready-to-run validation batches.
        _wait_for_exit: event this function waits on before returning.
        _local_file: path handed to the dataset reader's ``read``.
        _fasttext_vocab_cached_mapping: passed to ``FastTextVocab`` (fasttext only).
        _fasttext_vocab_cached_data: passed to ``FastTextVocab`` (fasttext only).

    Raises:
        ValueError: if ``_config["token_embedder_type"]`` is not one of
            ``"embedding"``, ``"fasttext"`` or ``"elmo"``.
    """
    # workflow: we tokenize the data files with the costly spacy before training in a preprocessing step
    # (and concat the tokens with single whitespaces), so here we only split on the whitespaces
    _tokenizer = None
    # explicit comparison kept on purpose: truthy non-boolean config values must not enable this path
    if _config and _config["preprocessed_tokenized"] == True:  # noqa: E712
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    embedder_type = _config["token_embedder_type"]
    if embedder_type == "embedding":
        _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])
    elif embedder_type == "fasttext":
        _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data,
                               _config["fasttext_max_subwords"])
    elif embedder_type == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None
    else:
        # fail early with a clear message instead of hitting unbound
        # _token_indexers / _vocab further down
        raise ValueError("unsupported token_embedder_type: {!r} "
                         "(expected 'embedding', 'fasttext' or 'elmo')".format(embedder_type))

    _tuple_loader = IrLabeledTupleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                max_doc_length=_config["max_doc_length"],
                                                max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                               sorting_keys=[("doc_tokens", "num_tokens"), ("query_tokens", "num_tokens")])
    _iterator.index_with(_vocab)

    for validation_batch in _iterator(_tuple_loader.read(_local_file), num_epochs=1):
        _queue.put(validation_batch)  # this moves the tensors in to shared memory

    _queue.close()  # indicate this local process is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore
示例5: setUp
# 需要导入模块: from allennlp.data.tokenizers import word_splitter [as 别名]
# 或者: from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter [as 别名]
def setUp(self):
    """Create the entity vocabulary, the entity indexer and two sets of instance fields."""
    super(TestDictField, self).setUp()

    entity_tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    # Register the three known entities in the "entity" namespace.
    self.vocab = Vocabulary()
    for entity_name in ("entity1", "entity2", "entity3"):
        self.vocab.add_token_to_namespace(entity_name, "entity")

    self.entity_indexer = {
        "entity": TokenCharactersIndexerTokenizer("entity", character_tokenizer=entity_tokenizer)
    }

    def build_sentence_field(sentence):
        # Whitespace-split the sentence and wrap it in a single-id indexed TextField.
        words = sentence.split()
        return TextField(
            [Token(word) for word in words],
            token_indexers={'tokens': SingleIdTokenIndexer()},
        )

    # ---- first instance: three-token sentence, two candidate spans ----
    first_sentence_field = build_sentence_field("The sentence .")
    first_entities = TextField(
        [Token("entity1 entity2"), Token("entity_unk")],
        token_indexers=self.entity_indexer,
    )
    first_priors = ArrayField(np.array([[0.5, 0.5], [1.0, 0.0]]))
    first_spans = ListField([
        SpanField(0, 0, first_sentence_field),
        SpanField(1, 2, first_sentence_field),
    ])
    self.instance1_fields = {
        "candidate_entities": first_entities,
        "candidate_entity_prior": first_priors,
        "candidate_spans": first_spans,
    }

    # ---- second instance: two-token sentence, one candidate span ----
    second_sentence_field = build_sentence_field("The sentence")
    second_entities = TextField(
        [Token("entity1")],
        token_indexers=self.entity_indexer,
    )
    second_priors = ArrayField(np.array([[1.0]]))
    second_spans = ListField([SpanField(1, 1, second_sentence_field)])
    self.instance2_fields = {
        "candidate_entities": second_entities,
        "candidate_entity_prior": second_priors,
        "candidate_spans": second_spans,
    }