

Python word_splitter.SimpleWordSplitter Usage Examples

This article collects typical usage examples of the Python method allennlp.data.tokenizers.word_splitter.SimpleWordSplitter. If you are unsure what word_splitter.SimpleWordSplitter does, how to call it, or what it looks like in practice, the curated code examples below should help. You can also explore further usage of the enclosing module, allennlp.data.tokenizers.word_splitter.


Below are 8 code examples of word_splitter.SimpleWordSplitter, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
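Before turning to the project examples, here is a minimal, self-contained sketch of how SimpleWordSplitter is typically used, both on its own and plugged into a WordTokenizer. It assumes an AllenNLP 0.x installation (the word_splitter module was removed in later AllenNLP releases), and the sample sentences are arbitrary placeholders.

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

# SimpleWordSplitter is a rule-based splitter: it splits on whitespace and then
# peels off common English punctuation and contractions, without loading a spaCy model.
splitter = SimpleWordSplitter()
tokens = splitter.split_words("Mr. Smith doesn't like green eggs, does he?")
print([t.text for t in tokens])

# In the examples below it is usually passed to a WordTokenizer:
tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
print([t.text for t in tokenizer.tokenize("A quick self-contained check.")])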

Example 1: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length: int = -1,
                 max_query_length: int = -1,
                 min_doc_length: int = -1,
                 min_query_length: int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()  # passing word_splitter=SimpleWordSplitter() is a bit faster, useful for multicore preprocessing
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length
        self.min_doc_length = min_doc_length
        self.min_query_length = min_query_length

        self.padding_value = Token(text="@@PADDING@@", text_id=0)
Author: sebastian-hofstaetter, Project: transformer-kernel-ranking, Lines: 21, Source: ir_labeled_tuple_loader.py

Example 2: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length: int = -1,
                 max_query_length: int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()  # passing word_splitter=SimpleWordSplitter() is a bit faster, useful for multicore preprocessing
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length 
Author: sebastian-hofstaetter, Project: teaching, Lines: 15, Source: data_loading.py

Example 3: setUp

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def setUp(self):
        super(TestSimpleWordSplitter, self).setUp()
        self.word_splitter = SimpleWordSplitter() 
Author: plasticityai, Project: magnitude, Lines: 5, Source: word_splitter_test.py

Example 4: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def __init__(self,
                 source_tokenizer: Tokenizer = None,
                 target_tokenizer: Tokenizer = None,
                 source_token_indexers: Dict[str, TokenIndexer] = None,
                 target_token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 lowercase: bool = True,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._source_tokenizer = source_tokenizer or WordTokenizer()  # e.g. pass word_splitter=SimpleWordSplitter() for a faster splitter
        self._target_tokenizer = target_tokenizer or self._source_tokenizer
        self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=lowercase)}
        self._target_token_indexers = target_token_indexers or self._source_token_indexers
        self._source_add_start_token = source_add_start_token 
Author: sebastian-hofstaetter, Project: sigir19-neural-ir, Lines: 16, Source: ir_tuple_loader.py

Example 5: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_seq_length: int = -1,
                 min_seq_length: int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()  # passing word_splitter=SimpleWordSplitter() is a bit faster, useful for multicore preprocessing
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self.max_seq_length = max_seq_length
        self.min_seq_length = min_seq_length

        self.padding_value = Token(text="@@PADDING@@", text_id=0)
Author: sebastian-hofstaetter, Project: transformer-kernel-ranking, Lines: 15, Source: ir_single_sequence_loader.py

Example 6: test_cnn_dailymail_reader

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def test_cnn_dailymail_reader(self):
        tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
        reader = CNNDailyMailReader(tokenizer, cnn_tokenized_dir=TEST_STORIES_DIR, separate_namespaces=False)
        dataset = reader.read(TEST_URLS_FILE)
        for sample in dataset:
            self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["source_tokens"]), 2)

            self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["target_tokens"]), 2) 
Author: IlyaGusev, Project: summarus, Lines: 14, Source: test_readers.py

Example 7: test_ria_reader

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def test_ria_reader(self):
        tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
        reader = RIAReader(tokenizer)
        dataset = reader.read(RIA_EXAMPLE_FILE)
        for sample in dataset:
            self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["source_tokens"]), 2)

            self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["target_tokens"]), 2) 
Author: IlyaGusev, Project: summarus, Lines: 14, Source: test_readers.py

Example 8: test_ria_copy_reader

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter [as alias]
def test_ria_copy_reader(self):
        tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
        reader = RIAReader(tokenizer, separate_namespaces=True, save_copy_fields=True)
        dataset = reader.read(RIA_EXAMPLE_FILE)
        vocabulary = Vocabulary.from_instances(dataset)

        for sample in dataset:
            sample.index_fields(vocabulary)
            self.assertIsNotNone(sample.fields["source_tokens"])
            self.assertIsNotNone(sample.fields["target_tokens"])
            self.assertIsNotNone(sample.fields["metadata"].metadata)
            self.assertIsNotNone(sample.fields["source_token_ids"].array)
            self.assertIsNotNone(sample.fields["target_token_ids"].array)
            self.assertIsNotNone(sample.fields["source_to_target"]._mapping_array)
            self.assertIsNotNone(sample.fields["source_to_target"]._target_namespace) 
Author: IlyaGusev, Project: summarus, Lines: 17, Source: test_readers.py


Note: The allennlp.data.tokenizers.word_splitter.SimpleWordSplitter examples in this article were collected by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Please consult each project's license before distributing or using the code, and do not republish without permission.