This article collects typical code examples of Python's spacy.tokenizer.Tokenizer. If you have been wondering how exactly tokenizer.Tokenizer is used, or what working examples of it look like, the curated examples below should help. You can also explore further usage examples for the containing module, spacy.tokenizer.
The following presents 8 code examples of tokenizer.Tokenizer, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
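As a quick orientation before the examples, here is a minimal sketch of constructing a bare Tokenizer over a blank English vocab and splitting a string; the sample sentence and variable names are only illustrative.

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()                    # blank English pipeline, provides the shared Vocab
tokenizer = Tokenizer(nlp.vocab)   # whitespace splitting only; punctuation rules must be passed in explicitly
doc = tokenizer("spaCy tokenizers return Doc objects.")
print([token.text for token in doc])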
Example 1: transform
# Required import: from spacy import tokenizer [as alias]
# Or: from spacy.tokenizer import Tokenizer [as alias]
def transform(self, data):
    # Tokenize each document with spaCy's default rules over the shared vocab,
    # then average the word vectors of the tokens found in self.model.
    tokenizer = Tokenizer(nlp.vocab)
    return np.array(
        [
            np.mean(
                [
                    self.model[w.text.lower()]
                    for w in words
                    if w.text.lower() in self.model
                ]
                or [np.zeros(self.dim)],  # zero-vector fallback when no token is in the model
                axis=0,
            )
            for words in tokenizer.pipe(data)
        ]
    )
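A possible way to exercise transform(); the wrapper class, the module-level nlp, and the tiny word-vector table are assumptions made for illustration and are not part of the original snippet.

import numpy as np
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()  # transform() expects a module-level nlp providing the vocab

class MeanVectorizer:  # hypothetical host class for the method above
    def __init__(self, word_vectors, dim):
        self.model = word_vectors  # mapping: lowercased token -> vector
        self.dim = dim             # dimensionality of the zero-vector fallback
    transform = transform          # reuse the function defined above as a method

vec = MeanVectorizer({"hello": np.ones(3)}, dim=3)
features = vec.transform(["Hello world", "nothing known here"])
# features has shape (2, 3); the second row is the zero-vector fallback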
Example 2: count_frequencies
# Required import: from spacy import tokenizer [as alias]
# Or: from spacy.tokenizer import Tokenizer [as alias]
def count_frequencies(language_class: Language, input_path: Path):
    """
    Given a file containing single documents per line
    (in this case, sentences for the ICLR case law corpus), split the text
    using a science specific tokenizer and compute word and
    document frequencies for all words.
    """
    print(f"Processing {input_path}.")
    nlp = English()
    # tokenizer = combined_rule_tokenizer(language_class())
    tokenizer = Tokenizer(nlp.vocab)
    counts = Counter()
    doc_counts = Counter()
    for line in tqdm.tqdm(open(input_path, "r")):
        words = [t.text for t in tokenizer(line)]
        counts.update(words)
        doc_counts.update(set(words))
    return counts, doc_counts
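A hedged usage sketch for this function; the file path is hypothetical, and note that in this snippet the language_class argument is accepted but only the default English() tokenizer is actually used.

from pathlib import Path
from spacy.lang.en import English

counts, doc_counts = count_frequencies(English, Path("sentences.txt"))  # hypothetical corpus file
print(counts.most_common(10))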
Example 3: load_data
# Required import: from spacy import tokenizer [as alias]
# Or: from spacy.tokenizer import Tokenizer [as alias]
def load_data(data_path: str, tokenize: bool = False, tokenizer_type: str = "just_spaces") -> List[str]:
    if tokenizer_type == "just_spaces":
        tokenizer = SpacyWordSplitter()
    elif tokenizer_type == "spacy":
        nlp = spacy.load('en')
        tokenizer = Tokenizer(nlp.vocab)
    tokenized_examples = []
    with tqdm(open(data_path, "r"), desc=f"loading {data_path}") as f:
        for line in f:
            if data_path.endswith(".jsonl") or data_path.endswith(".json"):
                example = json.loads(line)
            else:
                example = {"text": line}
            if tokenize:
                if tokenizer_type == 'just_spaces':
                    tokens = list(map(str, tokenizer.split_words(example['text'])))
                elif tokenizer_type == 'spacy':
                    tokens = list(map(str, tokenizer(example['text'])))
                text = ' '.join(tokens)
            else:
                text = example['text']
            tokenized_examples.append(text)
    return tokenized_examples
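For context, a possible call; the file name is hypothetical, and the "spacy" branch assumes the English model is available so that spacy.load('en') succeeds.

# Hypothetical input: one JSON object per line with a "text" field.
examples = load_data("train.jsonl", tokenize=True, tokenizer_type="spacy")
print(examples[0])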
Example 4: __init__
# Required import: from spacy import tokenizer [as alias]
# Or: from spacy.tokenizer import Tokenizer [as alias]
def __init__(self, args):
    if args.lang == 'cn':
        import jieba
        if args.dict:
            if not os.path.exists(args.dict):
                print('Segmentor dictionary not found.')
                exit(1)
            jieba.load_userdict(args.dict)
        self.cut = jieba.cut
    else:  # en
        from spacy.tokenizer import Tokenizer
        from spacy.lang.en import English
        nlp = English()
        self.tokenizer = Tokenizer(nlp.vocab)
        self.cut = self.cut_en
Example 5: __init__
# Required import: from spacy import tokenizer [as alias]
# Or: from spacy.tokenizer import Tokenizer [as alias]
def __init__(self, nlp):
    if not isinstance(nlp, Language):
        raise ValueError("NLP must be an instance of spacy.lang")
    self.nlp = nlp
    self.tokenizer = Tokenizer(
        nlp.vocab,
        nlp.Defaults.tokenizer_exceptions,
        prefix_search=self._get_prefix_regex().search,
        infix_finditer=self._get_infix_regex().finditer,
        suffix_search=self._get_suffix_regex().search,
        token_match=None
    )
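The _get_prefix_regex(), _get_infix_regex() and _get_suffix_regex() helpers are not shown in this snippet; a plausible shape for one of them, assuming it simply wraps spaCy's default patterns, might look like this.

import spacy

def _get_prefix_regex(self):
    # Compile the language's default prefix patterns into a single regex;
    # project-specific patterns could be appended to the sequence first.
    return spacy.util.compile_prefix_regex(self.nlp.Defaults.prefixes)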
Example 6: init_model
# Required import: from spacy import tokenizer [as alias]
# Or: from spacy.tokenizer import Tokenizer [as alias]
def init_model(lang, output_dir, freqs_loc=None,
               vectors_loc=None, no_expand_vectors=False,
               meta_overrides=None, prune_vectors=-1, min_word_frequency=50):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors.
    """
    output_dir = ensure_path(output_dir)
    if vectors_loc is not None:
        vectors_loc = cached_path(vectors_loc)
        vectors_loc = ensure_path(vectors_loc)
    if freqs_loc is not None:
        freqs_loc = cached_path(freqs_loc)
        freqs_loc = ensure_path(freqs_loc)

    if freqs_loc is not None and not freqs_loc.exists():
        msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
    probs, oov_prob = read_freqs(freqs_loc, min_freq=min_word_frequency) if freqs_loc is not None else ({}, -20)
    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
    nlp = create_model(lang, probs, oov_prob, vectors_data, vector_keys, not no_expand_vectors, prune_vectors)

    # Insert our custom tokenizer into the base model.
    # nlp.tokenizer = combined_rule_tokenizer(nlp)
    nlp.tokenizer = Tokenizer(nlp.vocab)
    if meta_overrides is not None:
        metadata = json.load(open(meta_overrides))
        nlp.meta.update(metadata)
        nlp.meta["version"] = VERSION

    if not output_dir.exists():
        os.makedirs(output_dir, exist_ok=True)
    nlp.to_disk(output_dir)
    return nlp
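A hedged invocation sketch; the paths are placeholders, and helpers such as cached_path, read_freqs, read_vectors, create_model and msg come from the surrounding project rather than from spaCy itself.

nlp = init_model(
    "en",
    "packaged_model",                   # hypothetical output directory
    freqs_loc="word_frequencies.txt",   # hypothetical frequency file
    vectors_loc="vectors.txt.gz",       # hypothetical word-vector file
)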
Example 7: get_tokenizer
# Required import: from spacy import tokenizer [as alias]
# Or: from spacy.tokenizer import Tokenizer [as alias]
def get_tokenizer(model: French) -> Tokenizer:
    split_char = r"[ ,\\.()-/\\|:;'\"+=!’?_+#“’']"
    extended_infix = [r'[:\\(\\)-\./#"“’\'—'] + model.Defaults.infixes
    infix_re = spacy.util.compile_infix_regex(extended_infix)
    prefix_re = spacy.util.compile_prefix_regex(tuple(list(model.Defaults.prefixes) + [split_char]))
    suffix_re = spacy.util.compile_suffix_regex(tuple(list(model.Defaults.suffixes) + [split_char]))

    tok = Tokenizer(model.vocab,
                    prefix_search=prefix_re.search,
                    suffix_search=suffix_re.search,
                    infix_finditer=infix_re.finditer,
                    token_match=None)
    return tok
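One way this tokenizer might be wired into a pipeline; the French sentence is only illustrative.

from spacy.lang.fr import French

model = French()
model.tokenizer = get_tokenizer(model)
doc = model("L'article 12-3 s'applique au dossier n°456/789.")
print([token.text for token in doc])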
Example 8: biomedical_tokenizer
# Required import: from spacy import tokenizer [as alias]
# Or: from spacy.tokenizer import Tokenizer [as alias]
def biomedical_tokenizer(nlp):
    """
    Customizes spaCy's tokenizer class for better handling of biomedical text.
    """
    return Tokenizer(nlp.vocab, infix_finditer=INFIX_RE.finditer)
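INFIX_RE is assumed to be a module-level compiled regex that is not shown in this snippet; a plausible definition plus usage, assuming the goal is extra splitting on hyphens and slashes, might look like this.

import spacy
from spacy.lang.en import English

# Hypothetical infix pattern: also split on hyphens and slashes inside tokens,
# which is common in biomedical strings such as "IL-2/IL-4".
INFIX_RE = spacy.util.compile_infix_regex(list(English.Defaults.infixes) + [r"[-/]"])

nlp = English()
nlp.tokenizer = biomedical_tokenizer(nlp)
print([t.text for t in nlp("IL-2/IL-4 signalling")])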