

Python word_splitter.SpacyWordSplitter code examples

This article collects typical usage examples of allennlp.data.tokenizers.word_splitter.SpacyWordSplitter in Python. If you are wondering what SpacyWordSplitter does, how to call it, or what real-world usage looks like, the hand-picked code examples below should help. You can also browse further usage examples from the containing module, allennlp.data.tokenizers.word_splitter.


The following presents 15 code examples of word_splitter.SpacyWordSplitter, sorted by popularity by default.
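Before the individual examples, here is a minimal, self-contained sketch of the basic API. It is illustrative only: it assumes an AllenNLP 0.x release that still ships the word_splitter module and an installed en_core_web_sm spaCy model, and the sample sentences are made up.

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

# Split a sentence directly; split_words returns a list of AllenNLP Token objects.
splitter = SpacyWordSplitter(language="en_core_web_sm", pos_tags=True)
tokens = splitter.split_words("AllenNLP uses spaCy under the hood.")
print([token.text for token in tokens])

# Or wrap the splitter in a WordTokenizer, as most of the dataset readers below do.
tokenizer = WordTokenizer(word_splitter=splitter)
print([token.text for token in tokenizer.tokenize("Where is Mersin?")])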

Example 1: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self,
                archive_file=DEFAULT_ARCHIVE_FILE,
                cuda_device=DEFAULT_CUDA_DEVICE,
                model_file=None):
        """ Constructor for NLU class. """
        check_for_gpu(cuda_device)

        if not os.path.isfile(archive_file):
            if not model_file:
                raise Exception("No model for JointNLU is specified!")
            archive_file = cached_path(model_file)


        archive = load_archive(archive_file,
                            cuda_device=cuda_device)
        self.tokenizer = SpacyWordSplitter(language="en_core_web_sm")
        dataset_reader_params = archive.config["dataset_reader"]
        self.dataset_reader = DatasetReader.from_params(dataset_reader_params)
        self.model = archive.model
        self.model.eval() 
Author: ConvLab, Project: ConvLab, Lines: 22, Source file: nlu.py
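The constructor above only loads the archive, tokenizer, and dataset reader. A typical inference path then tokenizes an utterance with the splitter, builds an Instance via the dataset reader, and runs the model. The method below is a sketch of that common AllenNLP pattern, not ConvLab's actual parse code; whether text_to_instance accepts pre-tokenized input depends on the concrete dataset reader.

# Sketch only (not part of the original nlu.py); assumes the reader's
# text_to_instance accepts a list of Tokens.
def predict(self, utterance):
    tokens = self.tokenizer.split_words(utterance)
    instance = self.dataset_reader.text_to_instance(tokens)
    return self.model.forward_on_instance(instance)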

Example 2: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self,
                 word_splitter=None,
                 word_filter=PassThroughWordFilter(),
                 word_stemmer=PassThroughWordStemmer(),
                 start_tokens=None,
                 end_tokens=None):
        self._word_splitter = word_splitter or SpacyWordSplitter()
        self._word_filter = word_filter
        self._word_stemmer = word_stemmer
        self._start_tokens = start_tokens or []
        # We reverse the tokens here because we're going to insert them with `insert(0)` later;
        # this makes sure they show up in the right order.
        self._start_tokens.reverse()
        self._end_tokens = end_tokens or []

    #overrides 
Author: plasticityai, Project: magnitude, Lines: 18, Source file: word_tokenizer.py

Example 3: load_data

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def load_data(data_path: str, tokenize: bool = False, tokenizer_type: str = "just_spaces") -> List[str]:
    if tokenizer_type == "just_spaces":
        tokenizer = SpacyWordSplitter()
    elif tokenizer_type == "spacy":
        nlp = spacy.load('en')
        tokenizer = Tokenizer(nlp.vocab)
    tokenized_examples = []
    with tqdm(open(data_path, "r"), desc=f"loading {data_path}") as f:
        for line in f:
            if data_path.endswith(".jsonl") or data_path.endswith(".json"):
                example = json.loads(line)
            else:
                example = {"text": line}
            if tokenize:
                if tokenizer_type == 'just_spaces':
                    tokens = list(map(str, tokenizer.split_words(example['text'])))
                elif tokenizer_type == 'spacy':
                    tokens = list(map(str, tokenizer(example['text'])))
                text = ' '.join(tokens)
            else:
                text = example['text']
            tokenized_examples.append(text)
    return tokenized_examples 
Author: allenai, Project: vampire, Lines: 25, Source file: preprocess_data.py
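As a quick usage note for load_data above, a call might look like this (hypothetical; the file path is made up):

# Hypothetical usage of load_data from Example 3; "train.jsonl" is a made-up path.
texts = load_data("train.jsonl", tokenize=True, tokenizer_type="just_spaces")
print(len(texts), texts[:2])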

Example 4: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self,
                archive_file=DEFAULT_ARCHIVE_FILE,
                cuda_device=DEFAULT_CUDA_DEVICE,
                model_file=None,
                context_size=3):
        """ Constructor for NLU class. """

        self.context_size = context_size

        check_for_gpu(cuda_device)

        if not os.path.isfile(archive_file):
            if not model_file:
                raise Exception("No model for MILU is specified!")

            archive_file = cached_path(model_file)

        archive = load_archive(archive_file,
                            cuda_device=cuda_device)
        self.tokenizer = SpacyWordSplitter(language="en_core_web_sm")
        _special_case = [{ORTH: u"id", LEMMA: u"id"}]
        self.tokenizer.spacy.tokenizer.add_special_case(u"id", _special_case)

        dataset_reader_params = archive.config["dataset_reader"]
        self.dataset_reader = DatasetReader.from_params(dataset_reader_params)
        self.model = archive.model
        self.model.eval() 
Author: ConvLab, Project: ConvLab, Lines: 29, Source file: nlu.py

Example 5: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self,
                 dataset_reader: DatasetReader,
                 tokenizer: WordSplitter = None) -> None:
        super().__init__(lazy=dataset_reader.lazy)
        self.dataset_reader = dataset_reader
        if tokenizer:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = SpacyWordSplitter(language="xx_ent_wiki_sm") 
Author: Hyperparticle, Project: udify, Lines: 11, Source file: universal_dependencies.py

Example 6: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self,
                 lazy=False,
                 tables_directory=None,
                 dpd_output_directory=None,
                 max_dpd_logical_forms=10,
                 sort_dpd_logical_forms=True,
                 max_dpd_tries=20,
                 keep_if_no_dpd=False,
                 tokenizer=None,
                 question_token_indexers=None,
                 table_token_indexers=None,
                 use_table_for_vocab=False,
                 linking_feature_extractors=None,
                 include_table_metadata=False,
                 max_table_tokens=None,
                 output_agendas=False):
        super(WikiTablesDatasetReader, self).__init__(lazy=lazy)
        self._tables_directory = tables_directory
        self._dpd_output_directory = dpd_output_directory
        self._max_dpd_logical_forms = max_dpd_logical_forms
        self._sort_dpd_logical_forms = sort_dpd_logical_forms
        self._max_dpd_tries = max_dpd_tries
        self._keep_if_no_dpd = keep_if_no_dpd
        self._tokenizer = tokenizer or WordTokenizer(SpacyWordSplitter(pos_tags=True))
        self._question_token_indexers = question_token_indexers or {u"tokens": SingleIdTokenIndexer()}
        self._table_token_indexers = table_token_indexers or self._question_token_indexers
        self._use_table_for_vocab = use_table_for_vocab
        self._linking_feature_extractors = linking_feature_extractors
        self._include_table_metadata = include_table_metadata
        self._basic_types = set(unicode(type_) for type_ in wt_types.BASIC_TYPES)
        self._max_table_tokens = max_table_tokens
        self._output_agendas = output_agendas

    #overrides 
Author: plasticityai, Project: magnitude, Lines: 36, Source file: wikitables.py

Example 7: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self,
                 token_indexers=None,
                 lazy=False,
                 tokenizer=None):
        super(AtisDatasetReader, self).__init__(lazy)
        self._token_indexers = token_indexers or {u'tokens': SingleIdTokenIndexer()}
        self._tokenizer = tokenizer or WordTokenizer(SpacyWordSplitter(pos_tags=True))


    #overrides 
Author: plasticityai, Project: magnitude, Lines: 12, Source file: atis.py

Example 8: setUp

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def setUp(self):
        super(TestDepLabelIndexer, self).setUp()
        self.tokenizer = SpacyWordSplitter(parse=True) 
Author: plasticityai, Project: magnitude, Lines: 5, Source file: dep_label_indexer_test.py

Example 9: setUp

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def setUp(self):
        super(TestNerTagIndexer, self).setUp()
        self.tokenizer = SpacyWordSplitter(ner=True) 
Author: plasticityai, Project: magnitude, Lines: 5, Source file: ner_tag_indexer_test.py

Example 10: setUp

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def setUp(self):
        super(TestSpacyWordSplitter, self).setUp()
        self.word_splitter = SpacyWordSplitter() 
Author: plasticityai, Project: magnitude, Lines: 5, Source file: word_splitter_test.py

Example 11: setUp

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def setUp(self):
        self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
        self.utterance = self.tokenizer.tokenize(u"where is mersin?")
        self.token_indexers = {u"tokens": SingleIdTokenIndexer(u"tokens")}

        json = {
                u'question': self.utterance,
                u'columns': [u'Name in English', u'Location in English'],
                u'cells': [[u'Paradeniz', u'Mersin'],
                          [u'Lake Gala', u'Edirne']]
                }
        self.graph = TableQuestionKnowledgeGraph.read_from_json(json)
        self.vocab = Vocabulary()
        self.name_index = self.vocab.add_token_to_namespace(u"name", namespace=u'tokens')
        self.in_index = self.vocab.add_token_to_namespace(u"in", namespace=u'tokens')
        self.english_index = self.vocab.add_token_to_namespace(u"english", namespace=u'tokens')
        self.location_index = self.vocab.add_token_to_namespace(u"location", namespace=u'tokens')
        self.paradeniz_index = self.vocab.add_token_to_namespace(u"paradeniz", namespace=u'tokens')
        self.mersin_index = self.vocab.add_token_to_namespace(u"mersin", namespace=u'tokens')
        self.lake_index = self.vocab.add_token_to_namespace(u"lake", namespace=u'tokens')
        self.gala_index = self.vocab.add_token_to_namespace(u"gala", namespace=u'tokens')
        self.negative_one_index = self.vocab.add_token_to_namespace(u"-1", namespace=u'tokens')
        self.zero_index = self.vocab.add_token_to_namespace(u"0", namespace=u'tokens')
        self.one_index = self.vocab.add_token_to_namespace(u"1", namespace=u'tokens')

        self.oov_index = self.vocab.get_token_index(u'random OOV string', namespace=u'tokens')
        self.edirne_index = self.oov_index
        self.field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer)

        super(KnowledgeGraphFieldTest, self).setUp() 
Author: plasticityai, Project: magnitude, Lines: 32, Source file: knowledge_graph_field_test.py

Example 12: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self, model, dataset_reader):
        super(ConstituencyParserPredictor, self).__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=u'en_core_web_sm', pos_tags=True) 
Author: plasticityai, Project: magnitude, Lines: 5, Source file: constituency_parser.py

Example 13: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self, model, dataset_reader):
        super(SemanticRoleLabelerPredictor, self).__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=u'en_core_web_sm', pos_tags=True) 
Author: plasticityai, Project: magnitude, Lines: 5, Source file: semantic_role_labeler.py

Example 14: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self, model, dataset_reader):
        super(SentenceTaggerPredictor, self).__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=u'en_core_web_sm', pos_tags=True) 
Author: plasticityai, Project: magnitude, Lines: 5, Source file: sentence_tagger.py

Example 15: __init__

# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self.tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(pos_tags=True)) 
Author: allenai, Project: propara, Lines: 7, Source file: prostruct_prediction.py


Note: The allennlp.data.tokenizers.word_splitter.SpacyWordSplitter examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective authors; copyright remains with the original authors, and distribution and use are subject to each project's license. Do not reproduce without permission.