

Python word_splitter.SimpleWordSplitter Method Code Examples

This article collects typical usage examples of the Python method allennlp.data.tokenizers.word_splitter.SimpleWordSplitter. If you are unsure what word_splitter.SimpleWordSplitter does, how to call it, or what working examples look like, the curated snippets below may help. You can also explore further usage examples from the containing module, allennlp.data.tokenizers.word_splitter.


Eight code examples of the word_splitter.SimpleWordSplitter method are shown below, sorted by popularity by default.
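
Before the examples, here is a minimal usage sketch of SimpleWordSplitter on its own. It assumes an AllenNLP 0.x release, where the word_splitter module and its split_words method still exist (the module was removed in later AllenNLP releases); the sample sentence is purely illustrative:

from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

# Rule-based splitting on whitespace, punctuation, and common contractions (AllenNLP 0.x assumed).
splitter = SimpleWordSplitter()
tokens = splitter.split_words("Mr. Smith doesn't like green eggs.")
print([t.text for t in tokens])  # tokens is a list of allennlp Token objects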

Example 1: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length: int = -1,
                 max_query_length: int = -1,
                 min_doc_length: int = -1,
                 min_query_length: int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        # word_splitter=SimpleWordSplitter() is a little faster and useful for multicore preprocessing
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length
        self.min_doc_length = min_doc_length
        self.min_query_length = min_query_length

        self.padding_value = Token(text="@@PADDING@@", text_id=0)
Author: sebastian-hofstaetter, Project: transformer-kernel-ranking, Lines: 21, Source: ir_labeled_tuple_loader.py
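
The snippet above falls back to the default WordTokenizer(); the inline comment notes that passing SimpleWordSplitter explicitly is slightly faster. A hedged sketch of wiring that in (AllenNLP 0.x assumed; the reader class name below is a hypothetical stand-in for the class this __init__ belongs to):

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

# Build the faster tokenizer referred to in the comment above and pass it to the reader.
fast_tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
# reader = IrLabeledTupleReader(tokenizer=fast_tokenizer)  # hypothetical class name, for illustration only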

Example 2: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length: int = -1,
                 max_query_length: int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        # word_splitter=SimpleWordSplitter() is a little faster and useful for multicore preprocessing
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length
Author: sebastian-hofstaetter, Project: teaching, Lines: 15, Source: data_loading.py

Example 3: setUp

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def setUp(self):
        super(TestSimpleWordSplitter, self).setUp()
        self.word_splitter = SimpleWordSplitter() 
Author: plasticityai, Project: magnitude, Lines: 5, Source: word_splitter_test.py
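
For context, this is the kind of assertion such a test class might add after the setUp above. It is a hypothetical sketch, not taken from the original test file, and the expected contraction split reflects SimpleWordSplitter's rule-based behaviour in AllenNLP 0.x; verify it against your installed version:

def test_handles_contractions(self):  # hypothetical test, not from word_splitter_test.py
        tokens = self.word_splitter.split_words("don't stop")
        assert [t.text for t in tokens] == ["do", "n't", "stop"]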

Example 4: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def __init__(self,
                 source_tokenizer: Tokenizer = None,
                 target_tokenizer: Tokenizer = None,
                 source_token_indexers: Dict[str, TokenIndexer] = None,
                 target_token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 lowercase: bool = True,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        # alternatively: WordTokenizer(word_splitter=SimpleWordSplitter())
        self._source_tokenizer = source_tokenizer or WordTokenizer()
        self._target_tokenizer = target_tokenizer or self._source_tokenizer
        self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=lowercase)}
        self._target_token_indexers = target_token_indexers or self._source_token_indexers
        self._source_add_start_token = source_add_start_token
Author: sebastian-hofstaetter, Project: sigir19-neural-ir, Lines: 16, Source: ir_tuple_loader.py

Example 5: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_seq_length: int = -1,
                 min_seq_length: int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        # word_splitter=SimpleWordSplitter() is a little faster and useful for multicore preprocessing
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self.max_seq_length = max_seq_length
        self.min_seq_length = min_seq_length

        self.padding_value = Token(text="@@PADDING@@", text_id=0)
Author: sebastian-hofstaetter, Project: transformer-kernel-ranking, Lines: 15, Source: ir_single_sequence_loader.py

Example 6: test_cnn_dailymail_reader

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def test_cnn_dailymail_reader(self):
        tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
        reader = CNNDailyMailReader(tokenizer, cnn_tokenized_dir=TEST_STORIES_DIR, separate_namespaces=False)
        dataset = reader.read(TEST_URLS_FILE)
        for sample in dataset:
            self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["source_tokens"]), 2)

            self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["target_tokens"]), 2) 
Author: IlyaGusev, Project: summarus, Lines: 14, Source: test_readers.py

Example 7: test_ria_reader

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def test_ria_reader(self):
        tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
        reader = RIAReader(tokenizer)
        dataset = reader.read(RIA_EXAMPLE_FILE)
        for sample in dataset:
            self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["source_tokens"]), 2)

            self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["target_tokens"]), 2) 
Author: IlyaGusev, Project: summarus, Lines: 14, Source: test_readers.py

Example 8: test_ria_copy_reader

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def test_ria_copy_reader(self):
        tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
        reader = RIAReader(tokenizer, separate_namespaces=True, save_copy_fields=True)
        dataset = reader.read(RIA_EXAMPLE_FILE)
        vocabulary = Vocabulary.from_instances(dataset)

        for sample in dataset:
            sample.index_fields(vocabulary)
            self.assertIsNotNone(sample.fields["source_tokens"])
            self.assertIsNotNone(sample.fields["target_tokens"])
            self.assertIsNotNone(sample.fields["metadata"].metadata)
            self.assertIsNotNone(sample.fields["source_token_ids"].array)
            self.assertIsNotNone(sample.fields["target_token_ids"].array)
            self.assertIsNotNone(sample.fields["source_to_target"]._mapping_array)
            self.assertIsNotNone(sample.fields["source_to_target"]._target_namespace) 
Author: IlyaGusev, Project: summarus, Lines: 17, Source: test_readers.py


Note: The allennlp.data.tokenizers.word_splitter.SimpleWordSplitter method examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Please consult each project's license before distributing or using the code, and do not republish this article without permission.