This article collects typical usage examples of the SpacyWordSplitter method from the Python module allennlp.data.tokenizers.word_splitter. If you are asking what word_splitter.SpacyWordSplitter does, how to call it, or where to find working examples, the hand-picked code samples below should help. You can also explore the containing module, allennlp.data.tokenizers.word_splitter, in more depth.
The sections below show 15 code examples of word_splitter.SpacyWordSplitter, ordered by popularity by default.
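Before the numbered examples, here is a minimal standalone sketch of the splitter on its own, assuming an AllenNLP 0.x installation (where SpacyWordSplitter exposes split_words) and an installed en_core_web_sm spaCy model; the sample sentence is made up for illustration.

from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

# Build a spaCy-backed splitter; pos_tags=True also attaches POS tags to each Token.
splitter = SpacyWordSplitter(language="en_core_web_sm", pos_tags=True)
tokens = splitter.split_words("AllenNLP wraps spaCy for tokenization.")
print([(token.text, token.tag_) for token in tokens])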
Example 1: __init__
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self,
             archive_file=DEFAULT_ARCHIVE_FILE,
             cuda_device=DEFAULT_CUDA_DEVICE,
             model_file=None):
    """ Constructor for NLU class. """
    check_for_gpu(cuda_device)
    if not os.path.isfile(archive_file):
        if not model_file:
            raise Exception("No model for JointNLU is specified!")
        archive_file = cached_path(model_file)
    archive = load_archive(archive_file,
                           cuda_device=cuda_device)
    self.tokenizer = SpacyWordSplitter(language="en_core_web_sm")
    dataset_reader_params = archive.config["dataset_reader"]
    self.dataset_reader = DatasetReader.from_params(dataset_reader_params)
    self.model = archive.model
    self.model.eval()
Example 2: __init__
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self,
             word_splitter=None,
             word_filter=PassThroughWordFilter(),
             word_stemmer=PassThroughWordStemmer(),
             start_tokens=None,
             end_tokens=None):
    self._word_splitter = word_splitter or SpacyWordSplitter()
    self._word_filter = word_filter
    self._word_stemmer = word_stemmer
    self._start_tokens = start_tokens or []
    # We reverse the tokens here because we're going to insert them with `insert(0)` later;
    # this makes sure they show up in the right order.
    self._start_tokens.reverse()
    self._end_tokens = end_tokens or []
Example 3: load_data
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def load_data(data_path: str, tokenize: bool = False, tokenizer_type: str = "just_spaces") -> List[str]:
    if tokenizer_type == "just_spaces":
        tokenizer = SpacyWordSplitter()
    elif tokenizer_type == "spacy":
        nlp = spacy.load('en')
        tokenizer = Tokenizer(nlp.vocab)
    tokenized_examples = []
    with tqdm(open(data_path, "r"), desc=f"loading {data_path}") as f:
        for line in f:
            # JSON/JSONL files carry the document under a "text" key; plain files have one document per line.
            if data_path.endswith(".jsonl") or data_path.endswith(".json"):
                example = json.loads(line)
            else:
                example = {"text": line}
            if tokenize:
                if tokenizer_type == 'just_spaces':
                    tokens = list(map(str, tokenizer.split_words(example['text'])))
                elif tokenizer_type == 'spacy':
                    tokens = list(map(str, tokenizer(example['text'])))
                text = ' '.join(tokens)
            else:
                text = example['text']
            tokenized_examples.append(text)
    return tokenized_examples
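A hypothetical call to load_data above might look like the following; the path is made up, and the file is expected to contain either one raw document per line or one JSON object per line with a "text" field.

# Hypothetical usage of load_data; "data/train.jsonl" is an illustrative path.
examples = load_data("data/train.jsonl", tokenize=True, tokenizer_type="just_spaces")
print(len(examples), examples[0][:80])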
Example 4: __init__
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self,
             archive_file=DEFAULT_ARCHIVE_FILE,
             cuda_device=DEFAULT_CUDA_DEVICE,
             model_file=None,
             context_size=3):
    """ Constructor for NLU class. """
    self.context_size = context_size
    check_for_gpu(cuda_device)
    if not os.path.isfile(archive_file):
        if not model_file:
            raise Exception("No model for MILU is specified!")
        archive_file = cached_path(model_file)
    archive = load_archive(archive_file,
                           cuda_device=cuda_device)
    self.tokenizer = SpacyWordSplitter(language="en_core_web_sm")
    # Tell the underlying spaCy tokenizer to keep "id" as a single token with lemma "id".
    _special_case = [{ORTH: u"id", LEMMA: u"id"}]
    self.tokenizer.spacy.tokenizer.add_special_case(u"id", _special_case)
    dataset_reader_params = archive.config["dataset_reader"]
    self.dataset_reader = DatasetReader.from_params(dataset_reader_params)
    self.model = archive.model
    self.model.eval()
Example 5: __init__
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self,
             dataset_reader: DatasetReader,
             tokenizer: WordSplitter = None) -> None:
    super().__init__(lazy=dataset_reader.lazy)
    self.dataset_reader = dataset_reader
    if tokenizer:
        self.tokenizer = tokenizer
    else:
        # Fall back to spaCy's multilingual model when no tokenizer is supplied.
        self.tokenizer = SpacyWordSplitter(language="xx_ent_wiki_sm")
Example 6: __init__
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self,
             lazy=False,
             tables_directory=None,
             dpd_output_directory=None,
             max_dpd_logical_forms=10,
             sort_dpd_logical_forms=True,
             max_dpd_tries=20,
             keep_if_no_dpd=False,
             tokenizer=None,
             question_token_indexers=None,
             table_token_indexers=None,
             use_table_for_vocab=False,
             linking_feature_extractors=None,
             include_table_metadata=False,
             max_table_tokens=None,
             output_agendas=False):
    super(WikiTablesDatasetReader, self).__init__(lazy=lazy)
    self._tables_directory = tables_directory
    self._dpd_output_directory = dpd_output_directory
    self._max_dpd_logical_forms = max_dpd_logical_forms
    self._sort_dpd_logical_forms = sort_dpd_logical_forms
    self._max_dpd_tries = max_dpd_tries
    self._keep_if_no_dpd = keep_if_no_dpd
    self._tokenizer = tokenizer or WordTokenizer(SpacyWordSplitter(pos_tags=True))
    self._question_token_indexers = question_token_indexers or {u"tokens": SingleIdTokenIndexer()}
    self._table_token_indexers = table_token_indexers or self._question_token_indexers
    self._use_table_for_vocab = use_table_for_vocab
    self._linking_feature_extractors = linking_feature_extractors
    self._include_table_metadata = include_table_metadata
    self._basic_types = set(unicode(type_) for type_ in wt_types.BASIC_TYPES)
    self._max_table_tokens = max_table_tokens
    self._output_agendas = output_agendas
Example 7: __init__
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self,
             token_indexers=None,
             lazy=False,
             tokenizer=None):
    super(AtisDatasetReader, self).__init__(lazy)
    self._token_indexers = token_indexers or {u'tokens': SingleIdTokenIndexer()}
    self._tokenizer = tokenizer or WordTokenizer(SpacyWordSplitter(pos_tags=True))
Example 8: setUp
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def setUp(self):
    super(TestDepLabelIndexer, self).setUp()
    self.tokenizer = SpacyWordSplitter(parse=True)
Example 9: setUp
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def setUp(self):
    super(TestNerTagIndexer, self).setUp()
    self.tokenizer = SpacyWordSplitter(ner=True)
Example 10: setUp
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def setUp(self):
    super(TestSpacyWordSplitter, self).setUp()
    self.word_splitter = SpacyWordSplitter()
Example 11: setUp
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def setUp(self):
    self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
    self.utterance = self.tokenizer.tokenize(u"where is mersin?")
    self.token_indexers = {u"tokens": SingleIdTokenIndexer(u"tokens")}
    json = {
        u'question': self.utterance,
        u'columns': [u'Name in English', u'Location in English'],
        u'cells': [[u'Paradeniz', u'Mersin'],
                   [u'Lake Gala', u'Edirne']]
    }
    self.graph = TableQuestionKnowledgeGraph.read_from_json(json)
    self.vocab = Vocabulary()
    self.name_index = self.vocab.add_token_to_namespace(u"name", namespace=u'tokens')
    self.in_index = self.vocab.add_token_to_namespace(u"in", namespace=u'tokens')
    self.english_index = self.vocab.add_token_to_namespace(u"english", namespace=u'tokens')
    self.location_index = self.vocab.add_token_to_namespace(u"location", namespace=u'tokens')
    self.paradeniz_index = self.vocab.add_token_to_namespace(u"paradeniz", namespace=u'tokens')
    self.mersin_index = self.vocab.add_token_to_namespace(u"mersin", namespace=u'tokens')
    self.lake_index = self.vocab.add_token_to_namespace(u"lake", namespace=u'tokens')
    self.gala_index = self.vocab.add_token_to_namespace(u"gala", namespace=u'tokens')
    self.negative_one_index = self.vocab.add_token_to_namespace(u"-1", namespace=u'tokens')
    self.zero_index = self.vocab.add_token_to_namespace(u"0", namespace=u'tokens')
    self.one_index = self.vocab.add_token_to_namespace(u"1", namespace=u'tokens')
    # "edirne" is never added to the vocabulary, so it maps to the out-of-vocabulary index.
    self.oov_index = self.vocab.get_token_index(u'random OOV string', namespace=u'tokens')
    self.edirne_index = self.oov_index
    self.field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer)
    super(KnowledgeGraphFieldTest, self).setUp()
Example 12: __init__
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self, model, dataset_reader):
    super(ConstituencyParserPredictor, self).__init__(model, dataset_reader)
    self._tokenizer = SpacyWordSplitter(language=u'en_core_web_sm', pos_tags=True)
Example 13: __init__
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self, model, dataset_reader):
    super(SemanticRoleLabelerPredictor, self).__init__(model, dataset_reader)
    self._tokenizer = SpacyWordSplitter(language=u'en_core_web_sm', pos_tags=True)
Example 14: __init__
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self, model, dataset_reader):
    super(SentenceTaggerPredictor, self).__init__(model, dataset_reader)
    self._tokenizer = SpacyWordSplitter(language=u'en_core_web_sm', pos_tags=True)
Example 15: __init__
# Required import: from allennlp.data.tokenizers import word_splitter [as alias]
# Or: from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter [as alias]
def __init__(self,
             model: Model,
             dataset_reader: DatasetReader) -> None:
    super().__init__(model, dataset_reader)
    self.tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(pos_tags=True))
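To round off the examples, here is a minimal sketch of how the wrapped tokenizer built in Example 15 would typically be used, again assuming the AllenNLP 0.x API; the sentence is illustrative.

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

# WordTokenizer delegates the actual splitting to SpacyWordSplitter.
tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(pos_tags=True))
tokens = tokenizer.tokenize("Show me flights from Denver to Boston.")
print([(token.text, token.tag_) for token in tokens])  # tag_ is populated because pos_tags=True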