This article collects typical usage examples of the spacy.tokenizer method in Python. If you are wondering what spacy.tokenizer does, how to call it, or what real-world uses look like, the hand-picked code samples below may help. You can also explore further usage examples from the containing module, spacy.
Seven code examples of the spacy.tokenizer method are shown below, ordered by popularity by default.
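As a point of reference before the examples: spacy.tokenizer usually appears either as the tokenizer attribute of a loaded pipeline or as a bare spacy.tokenizer.Tokenizer built from a shared vocab. A minimal, self-contained sketch of both (the model name en_core_web_sm is an assumption, not taken from the examples below):

import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.load('en_core_web_sm')
doc = nlp.tokenizer("This is a sentence.")   # tokenizer attached to a full pipeline
print([token.text for token in doc])

bare = Tokenizer(nlp.vocab)                  # rule-less tokenizer sharing the same vocab
print([token.text for token in bare("Another sentence, split on whitespace only.")])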
Example 1: run
# Required import: import spacy [as alias]
# or: from spacy import tokenizer [as alias]
def run(entry):
    text = entry['text']
    summary = entry['summary']
    text = ' '.join([_.text for _ in tokenizer(remove_non_ascii(text))])
    summary = ' '.join([_.text for _ in tokenizer(remove_non_ascii(summary))])
    text = nlp(text)
    summary = nlp(summary)
    text = '\n'.join([' '.join([_.text for _ in s]) for s in text.sents])
    summary = '\n'.join([' '.join([_.text for _ in s]) for s in summary.sents])
    # run pre-processing
    line_text, pos_text, ner_text = pre_processing(text)
    line_summary, pos_summary, ner_summary = pre_processing(summary)
    entry['processed'] = {}
    entry['processed']['text'] = line_text
    entry['processed']['pos_text'] = pos_text
    entry['processed']['ner_text'] = ner_text
    entry['processed']['summary'] = line_summary
    entry['processed']['pos_summary'] = pos_summary
    entry['processed']['ner_summary'] = ner_summary
    entry['text'] = text.lower()
    entry['summary'] = summary.lower()
    return entry
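Example 1 relies on module-level tokenizer, nlp, remove_non_ascii and pre_processing objects defined elsewhere in the source project. A plausible setup for the first two (an assumption, not part of the original snippet) would be:

import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.load('en_core_web_sm')   # assumed model; a full pipeline is needed for text.sents
tokenizer = Tokenizer(nlp.vocab)     # bare tokenizer used for the initial cleanup pass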
Example 2: transform
# Required import: import spacy [as alias]
# or: from spacy import tokenizer [as alias]
def transform(self, data):
    tokenizer = Tokenizer(nlp.vocab)
    return np.array(
        [
            np.mean(
                [
                    self.model[w.text.lower()]
                    for w in words
                    if w.text.lower() in self.model
                ]
                or [np.zeros(self.dim)],
                axis=0,
            )
            for words in tokenizer.pipe(data)
        ]
    )
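Example 2 averages pre-trained word vectors over the tokens of each document and falls back to a zero vector when no token is known; nlp, self.model (a token-to-vector mapping) and self.dim are set up elsewhere in the class. The same idea outside the class, as a self-contained sketch with a toy embedding table (all names and values here are assumptions):

import numpy as np
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank('en')
tokenizer = Tokenizer(nlp.vocab)

model = {'hello': np.array([1.0, 0.0]), 'world': np.array([0.0, 1.0])}  # toy vectors
dim = 2

features = np.array([
    np.mean([model[w.text.lower()] for w in words if w.text.lower() in model]
            or [np.zeros(dim)], axis=0)
    for words in tokenizer.pipe(["Hello world", "nothing known here"])
])
print(features)  # first row is the mean of the two vectors, second row is all zeros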
Example 3: get_empty_model
# Required import: import spacy [as alias]
# or: from spacy import tokenizer [as alias]
def get_empty_model(load_labels_for_training: bool) -> French:
    """
    Generate an empty NER model
    :rtype: object
    """
    # Important to set up the right language because it impacts the tokenizer, sentence splitting, ...
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    nlp.add_pipe(prevent_sentence_boundary_detection, name='prevent-sbd', first=True)
    ner = nlp.create_pipe('ner')
    # add labels
    if load_labels_for_training:
        for token_type in list(colors.keys()):
            ner.add_label(token_type)
    nlp.add_pipe(ner, last=True)
    return nlp
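Example 3 depends on get_tokenizer, prevent_sentence_boundary_detection and colors being defined elsewhere, and uses the spaCy v2 add_pipe API, which takes the component object itself rather than a registered string name. The custom component is typically the standard v2 recipe for disabling sentence splitting; a sketch of what it presumably looks like (an assumption about this project's helper):

def prevent_sentence_boundary_detection(doc):
    # Mark every token as not starting a sentence so the parser cannot add boundaries.
    for token in doc:
        token.is_sent_start = False
    return doc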
Example 4: load_data
# Required import: import spacy [as alias]
# or: from spacy import tokenizer [as alias]
def load_data(data_path: str, tokenize: bool = False, tokenizer_type: str = "just_spaces") -> List[str]:
    if tokenizer_type == "just_spaces":
        tokenizer = SpacyWordSplitter()
    elif tokenizer_type == "spacy":
        nlp = spacy.load('en')
        tokenizer = Tokenizer(nlp.vocab)
    tokenized_examples = []
    with tqdm(open(data_path, "r"), desc=f"loading {data_path}") as f:
        for line in f:
            if data_path.endswith(".jsonl") or data_path.endswith(".json"):
                example = json.loads(line)
            else:
                example = {"text": line}
            if tokenize:
                if tokenizer_type == 'just_spaces':
                    tokens = list(map(str, tokenizer.split_words(example['text'])))
                elif tokenizer_type == 'spacy':
                    tokens = list(map(str, tokenizer(example['text'])))
                text = ' '.join(tokens)
            else:
                text = example['text']
            tokenized_examples.append(text)
    return tokenized_examples
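In Example 4, SpacyWordSplitter comes from AllenNLP, and spacy.load('en') is the spaCy v2 shortcut for the small English model; the bare Tokenizer(nlp.vocab) then splits on whitespace only. A usage sketch (the file name is hypothetical):

examples = load_data('corpus.jsonl', tokenize=True, tokenizer_type='spacy')
print(examples[0])   # whitespace-joined spaCy tokens of the first document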
Example 5: _spacy_tokenize
# Required import: import spacy [as alias]
# or: from spacy import tokenizer [as alias]
def _spacy_tokenize(x, spacy):
    return [tok.text for tok in spacy.tokenizer(x)]
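Despite its name, the spacy parameter in Example 5 is a loaded Language object rather than the module; the helper simply runs that object's tokenizer and collects the token texts. Usage sketch (the model name is an assumption):

import spacy as spacy_module   # alias only to avoid clashing with the parameter name above

nlp = spacy_module.load('en_core_web_sm')
print(_spacy_tokenize("Hello, world!", nlp))   # ['Hello', ',', 'world', '!']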
Example 6: is_tokenizer_serializable
# Required import: import spacy [as alias]
# or: from spacy import tokenizer [as alias]
def is_tokenizer_serializable(tokenizer, language):
    """Extend with other tokenizers which are found to not be serializable
    """
    if tokenizer == 'spacy':
        return False
    return True
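Example 6 treats only the string 'spacy' as non-serializable; any other tokenizer value passes through:

>>> is_tokenizer_serializable('spacy', 'en')
False
>>> is_tokenizer_serializable(str.split, 'en')
True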
Example 7: __init__
# Required import: import spacy [as alias]
# or: from spacy import tokenizer [as alias]
def __init__(self, *args, **kwargs):
    # Detect when the spaCy optional dependency is missing
    if not HAS_OPTIONAL_DEPENDENCIES:
        raise NotImplementedError(OPT_MSG_MISSING)
    super().__init__(tokenizer=spacy_token_lemmatizer, *args, **kwargs)
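In Example 7, spacy_token_lemmatizer is defined elsewhere in the source project and is passed to the parent class as its tokenizer. A minimal sketch of what a tokenize-and-lemmatize callable of that shape might look like (an assumption, not the project's actual implementation):

import spacy

_nlp = spacy.load('en_core_web_sm')   # assumed model; the real project may configure this differently

def spacy_token_lemmatizer(text):
    # Tokenize with spaCy and return one lemma string per token.
    return [token.lemma_ for token in _nlp(text)]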