This article collects typical usage examples of the Python class allennlp.data.tokenizers.word_splitter.SimpleWordSplitter. If you are wondering how word_splitter.SimpleWordSplitter is used, or what it is used for, the hand-picked code examples below may help. You can also read more about the module it lives in, allennlp.data.tokenizers.word_splitter.
A total of 8 code examples of word_splitter.SimpleWordSplitter are shown below, sorted by popularity by default.
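Before the examples, here is a minimal, self-contained sketch of what SimpleWordSplitter does (this assumes a pre-1.0 AllenNLP release, where the word_splitter module still exists; the sample sentence is purely illustrative):

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

# SimpleWordSplitter splits on whitespace and then peels punctuation and common
# English contractions off each word, without requiring spaCy.
splitter = SimpleWordSplitter()
tokens = splitter.split_words("This sentence, for example, gets split.")
print([token.text for token in tokens])

# In practice it is usually wrapped in a WordTokenizer, as the examples below do:
tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())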
Example 1: __init__
# Required module: from allennlp.data.tokenizers import word_splitter
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             min_doc_length: int = -1,
             min_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # a little faster and useful for multi-core preprocessing: word_splitter=SimpleWordSplitter()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
    self.min_doc_length = min_doc_length
    self.min_query_length = min_query_length
    self.padding_value = Token(text="@@PADDING@@", text_id=0)
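The inline comment above notes that SimpleWordSplitter is a little faster than the default spaCy-based splitter. A short sketch of building the arguments this constructor accepts instead of relying on its defaults (imports assume a pre-1.0 AllenNLP release):

from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

# Explicit tokenizer and indexers that could be passed to the reader above.
tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}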
Example 2: __init__
# Required module: from allennlp.data.tokenizers import word_splitter
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # a little faster and useful for multi-core preprocessing: word_splitter=SimpleWordSplitter()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
Example 3: setUp
# Required module: from allennlp.data.tokenizers import word_splitter
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter
def setUp(self):
    super(TestSimpleWordSplitter, self).setUp()
    self.word_splitter = SimpleWordSplitter()
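For context, here is a self-contained sketch of the kind of test that follows such a fixture (the class name, test name, and expected tokens are illustrative and not taken from the original test file; AllenNlpTestCase is the test base class shipped with pre-1.0 AllenNLP):

from allennlp.common.testing import AllenNlpTestCase
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

class TestSimpleWordSplitterSketch(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.word_splitter = SimpleWordSplitter()

    def test_handles_trailing_punctuation(self):
        # Illustrative assertion: whitespace split with end-of-word punctuation peeled off.
        tokens = [t.text for t in self.word_splitter.split_words("this is a sentence.")]
        assert tokens == ["this", "is", "a", "sentence", "."]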
Example 4: __init__
# Required module: from allennlp.data.tokenizers import word_splitter
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter
def __init__(self,
             source_tokenizer: Tokenizer = None,
             target_tokenizer: Tokenizer = None,
             source_token_indexers: Dict[str, TokenIndexer] = None,
             target_token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             lowercase: bool = True,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._source_tokenizer = source_tokenizer or WordTokenizer()  # word_splitter=SimpleWordSplitter()
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=lowercase)}
    self._target_token_indexers = target_token_indexers or self._source_token_indexers
    self._source_add_start_token = source_add_start_token
Example 5: __init__
# Required module: from allennlp.data.tokenizers import word_splitter
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_seq_length: int = -1,
             min_seq_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # a little faster and useful for multi-core preprocessing: word_splitter=SimpleWordSplitter()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self.max_seq_length = max_seq_length
    self.min_seq_length = min_seq_length
    self.padding_value = Token(text="@@PADDING@@", text_id=0)
Author: sebastian-hofstaetter; Project: transformer-kernel-ranking; Lines: 15; Source file: ir_single_sequence_loader.py
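The padding token built in this constructor (text "@@PADDING@@" with text_id=0), together with max_seq_length and min_seq_length, suggests the usual truncate-then-pad pattern. A hedged sketch of that pattern, not the project's actual code (the helper name _truncate_and_pad is hypothetical):

def _truncate_and_pad(tokens, max_len, min_len, padding_token):
    # Truncate to max_len when it is set, then pad short sequences up to
    # min_len with the padding token; -1 disables either limit.
    if max_len > -1:
        tokens = tokens[:max_len]
    if min_len > -1 and len(tokens) < min_len:
        tokens = tokens + [padding_token] * (min_len - len(tokens))
    return tokens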
Example 6: test_cnn_dailymail_reader
# Required module: from allennlp.data.tokenizers import word_splitter
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter
def test_cnn_dailymail_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = CNNDailyMailReader(tokenizer, cnn_tokenized_dir=TEST_STORIES_DIR, separate_namespaces=False)
    dataset = reader.read(TEST_URLS_FILE)
    for sample in dataset:
        self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["source_tokens"]), 2)
        self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["target_tokens"]), 2)
Example 7: test_ria_reader
# Required module: from allennlp.data.tokenizers import word_splitter
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter
def test_ria_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = RIAReader(tokenizer)
    dataset = reader.read(RIA_EXAMPLE_FILE)
    for sample in dataset:
        self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["source_tokens"]), 2)
        self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["target_tokens"]), 2)
Example 8: test_ria_copy_reader
# Required module: from allennlp.data.tokenizers import word_splitter
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter
def test_ria_copy_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = RIAReader(tokenizer, separate_namespaces=True, save_copy_fields=True)
    dataset = reader.read(RIA_EXAMPLE_FILE)
    vocabulary = Vocabulary.from_instances(dataset)
    for sample in dataset:
        sample.index_fields(vocabulary)
        self.assertIsNotNone(sample.fields["source_tokens"])
        self.assertIsNotNone(sample.fields["target_tokens"])
        self.assertIsNotNone(sample.fields["metadata"].metadata)
        self.assertIsNotNone(sample.fields["source_token_ids"].array)
        self.assertIsNotNone(sample.fields["target_token_ids"].array)
        self.assertIsNotNone(sample.fields["source_to_target"]._mapping_array)
        self.assertIsNotNone(sample.fields["source_to_target"]._target_namespace)