This article collects typical usage examples of the Python method allennlp.data.tokenizers.word_splitter.SimpleWordSplitter. If you are unsure what word_splitter.SimpleWordSplitter does, or how and where to use it, the curated code examples below should help. You can also read more about the module the method belongs to, allennlp.data.tokenizers.word_splitter.
Below are 8 code examples of word_splitter.SimpleWordSplitter, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
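Before the collected examples, here is a minimal standalone sketch of the two typical ways to use SimpleWordSplitter: calling it directly, and plugging it into a WordTokenizer. It is not taken from any of the projects below and assumes a pre-1.0 AllenNLP release, where the word_splitter module still exists.

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

# Direct use: split_words returns a list of AllenNLP Token objects.
splitter = SimpleWordSplitter()
print([t.text for t in splitter.split_words("This sentence has some punctuation.")])

# More common in the examples below: use it as the word_splitter of a WordTokenizer,
# which is typically faster than the default spaCy-based splitter.
tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
print([t.text for t in tokenizer.tokenize("Another example sentence.")])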
Example 1: __init__
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             min_doc_length: int = -1,
             min_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # word_splitter=SimpleWordSplitter() is a bit faster, useful for multicore preprocessing
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
    self.min_doc_length = min_doc_length
    self.min_query_length = min_query_length
    self.padding_value = Token(text="@@PADDING@@", text_id=0)
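The inline comment above hints at how SimpleWordSplitter is meant to enter this reader: through the tokenizer argument. The sketch below only builds the arguments that this __init__ expects; the reader class itself is not shown in the snippet, so it is omitted here.

from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

# Arguments that would be passed to the reader's __init__ shown above.
tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}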
Example 2: __init__
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # word_splitter=SimpleWordSplitter() is a bit faster, useful for multicore preprocessing
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
Example 3: setUp
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def setUp(self):
    super(TestSimpleWordSplitter, self).setUp()
    self.word_splitter = SimpleWordSplitter()
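For context, a test built on this setUp might look like the sketch below. The expected token boundaries are an assumption based on SimpleWordSplitter separating trailing punctuation, not a test copied from AllenNLP.

def test_splits_trailing_punctuation(self):
    # Assumption: the trailing period is split off into its own token.
    tokens = [t.text for t in self.word_splitter.split_words("a simple sentence.")]
    assert tokens == ["a", "simple", "sentence", "."]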
Example 4: __init__
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def __init__(self,
             source_tokenizer: Tokenizer = None,
             target_tokenizer: Tokenizer = None,
             source_token_indexers: Dict[str, TokenIndexer] = None,
             target_token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             lowercase: bool = True,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._source_tokenizer = source_tokenizer or WordTokenizer()  # word_splitter=SimpleWordSplitter()
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=lowercase)}
    self._target_token_indexers = target_token_indexers or self._source_token_indexers
    self._source_add_start_token = source_add_start_token
Example 5: __init__
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_seq_length: int = -1,
             min_seq_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # word_splitter=SimpleWordSplitter() is a bit faster, useful for multicore preprocessing
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self.max_seq_length = max_seq_length
    self.min_seq_length = min_seq_length
    self.padding_value = Token(text="@@PADDING@@", text_id=0)
Developer: sebastian-hofstaetter, Project: transformer-kernel-ranking, Lines of code: 15, Source file: ir_single_sequence_loader.py
Example 6: test_cnn_dailymail_reader
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def test_cnn_dailymail_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = CNNDailyMailReader(tokenizer, cnn_tokenized_dir=TEST_STORIES_DIR, separate_namespaces=False)
    dataset = reader.read(TEST_URLS_FILE)
    for sample in dataset:
        self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["source_tokens"]), 2)
        self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["target_tokens"]), 2)
Example 7: test_ria_reader
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def test_ria_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = RIAReader(tokenizer)
    dataset = reader.read(RIA_EXAMPLE_FILE)
    for sample in dataset:
        self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["source_tokens"]), 2)
        self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["target_tokens"]), 2)
Example 8: test_ria_copy_reader
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def test_ria_copy_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = RIAReader(tokenizer, separate_namespaces=True, save_copy_fields=True)
    dataset = reader.read(RIA_EXAMPLE_FILE)
    vocabulary = Vocabulary.from_instances(dataset)
    for sample in dataset:
        sample.index_fields(vocabulary)
        self.assertIsNotNone(sample.fields["source_tokens"])
        self.assertIsNotNone(sample.fields["target_tokens"])
        self.assertIsNotNone(sample.fields["metadata"].metadata)
        self.assertIsNotNone(sample.fields["source_token_ids"].array)
        self.assertIsNotNone(sample.fields["target_token_ids"].array)
        self.assertIsNotNone(sample.fields["source_to_target"]._mapping_array)
        self.assertIsNotNone(sample.fields["source_to_target"]._target_namespace)