This article collects typical usage examples of spacy.lang.en.English in Python. If you have been wondering what en.English is for, how to use it, or what working examples look like, the curated code samples below may help. You can also explore further usage examples from the containing module, spacy.lang.en.
13 code examples of en.English are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
Example 1: __init__
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def __init__(
    self,
    parser,
    stop_words=spacy.lang.en.stop_words.STOP_WORDS,
    punctuations=string.punctuation,
):
    """Initialize the BOWTokenizer object.

    Arguments:
        parser {spacy.lang.en.English - by default} -- Any parser object
            that supports a parser(sentence) call on it.

    Keyword Arguments:
        stop_words {iterable over str} -- Set of stop words to be removed.
            (default: {spacy.lang.en.stop_words.STOP_WORDS})
        punctuations {iterable over str} -- Set of punctuations to be
            removed. (default: {string.punctuation})
    """
    self.parser = parser
    # List of stop words and punctuation marks to filter out.
    self.stop_words = stop_words
    self.punctuations = punctuations
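A minimal usage sketch for this constructor, assuming the surrounding class is the BOWTokenizer named in the docstring; the variable names and the sample call below are illustrative, not part of the original code:

from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import string

# Any callable that supports parser(sentence) works; a blank English pipeline is the usual choice.
parser = English()
tokenizer = BOWTokenizer(parser, stop_words=STOP_WORDS, punctuations=string.punctuation)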
Example 2: normalize_batch
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def normalize_batch(p_iter, p_batch_size=1000, p_thread_count=5):
    """Normalize and tokenize strings.

    Args:
        p_iter (iter): iterator over strings to normalize and tokenize.
        p_batch_size (int): number of strings per batch passed to spaCy's pipe.
        p_thread_count (int): number of threads to run.

    Returns:
        iter: iterator over normalized, tokenized strings.
    """
    global NLP
    if not NLP:
        NLP = NlpEnglish(parser=False)
    output_iter = NLP.pipe(p_iter,
                           batch_size=p_batch_size,
                           n_threads=p_thread_count)
    for doc in output_iter:
        tokens = [str(w).strip().lower() for w in doc]
        yield ' '.join(tokens)
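A hedged usage sketch: it assumes NlpEnglish is an alias for spacy.lang.en.English and NLP is a module-level cache initialized to None, both implied but not shown in the snippet. Note also that the n_threads argument of Language.pipe is a no-op from spaCy 2.1 onward and was removed in v3, so this code targets older spaCy releases.

from spacy.lang.en import English as NlpEnglish

NLP = None  # module-level cache that normalize_batch fills on first use

texts = ["Hello,   World!", "spaCy handles the   tokenization."]
for normalized in normalize_batch(texts, p_batch_size=2, p_thread_count=1):
    print(normalized)  # e.g. "hello , world !"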
Example 3: __init__
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
    super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)
    try:
        import ftfy
        from spacy.lang.en import English
        _nlp = English()
        self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True)
        self.fix_text = None
    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
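The try/except above is an optional-dependency fallback: when ftfy and spaCy are importable, text is repaired with ftfy and split with spaCy's rule-based English tokenizer; otherwise a BERT-style BasicTokenizer takes over. A stripped-down sketch of the same pattern outside the tokenizer class (the function name and the naive fallback are illustrative, not part of the original code):

def build_word_tokenizer():
    """Prefer ftfy + spaCy when installed, otherwise fall back to naive splitting."""
    try:
        import ftfy
        from spacy.lang.en import English

        spacy_tok = English().tokenizer  # rule-based English tokenizer

        def tokenize(text):
            return [t.text for t in spacy_tok(ftfy.fix_text(text))]
    except ImportError:
        def tokenize(text):
            return text.lower().split()  # crude stand-in for BasicTokenizer
    return tokenize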
Example 4: fasttext_preprocess
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def fasttext_preprocess(texts: List[str]) -> List[str]:
    """
    Apply preprocessing appropriate for a fastText model to a set of texts.

    Args:
        texts: Texts to preprocess.

    Returns:
        List of preprocessed texts.
    """
    nlp = English()
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    processed_texts = []
    for doc in tokenizer.pipe(texts, batch_size=500):
        processed_texts.append(" ".join(tok.lower_ for tok in doc if tok.is_alpha))
    return processed_texts
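A quick usage sketch; it assumes typing.List and spacy.lang.en.English are imported in the surrounding module, and that a spaCy v2 release is in use, since nlp.Defaults.create_tokenizer was dropped in v3 (where nlp.tokenizer provides the same default tokenizer):

texts = ["Numbers like 42 are dropped!", "Only alphabetic tokens survive."]
print(fasttext_preprocess(texts))
# -> ['numbers like are dropped', 'only alphabetic tokens survive']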
Example 5: _generate_text
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def _generate_text(sentences, fname, threshold=0.9):
    """Iterate through sentences and write each one that is not a signature
    block to file. A sentence counts as a signature block when
    probability(signature block) > threshold.

    The part-of-speech tagger (spaCy's en_core_web_sm) is loaded inside the
    function; other POS taggers such as NLTK's may be explored later.

    Parameters
    ----------
    sentences : iterable of str
        Lines in the email block.
    fname : str
        Name of the new corpus file, written without signature blocks.
    threshold : float
        Lower thresholds result in more false positives.
    """
    tagger = spacy.load('en_core_web_sm')
    with open(fname, "w") as new_file:
        for sentence in sentences:
            if _prob_block(sentence, tagger) < threshold:
                new_file.write(sentence)
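_prob_block is a helper from the original project that is not reproduced here, so the stub below is a purely hypothetical stand-in (it simply flags lines starting with the conventional "--" signature delimiter); the sketch also requires the en_core_web_sm model to be installed for spacy.load to succeed.

# Hypothetical stand-in for the project's _prob_block helper, for illustration only.
def _prob_block(sentence, tagger):
    return 1.0 if sentence.strip().startswith("--") else 0.0

lines = ["Hi team,\n", "The report is attached.\n", "--\n", "Jane Doe\n"]
_generate_text(lines, "email_without_signature.txt", threshold=0.9)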
Example 6: count_frequencies
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def count_frequencies(language_class: Language, input_path: Path):
    """
    Given a file containing one document per line
    (in this case, sentences for the ICLR case law corpus), split the text
    with a science-specific tokenizer and compute word and
    document frequencies for all words.
    """
    print(f"Processing {input_path}.")
    nlp = English()
    # tokenizer = combined_rule_tokenizer(language_class())
    tokenizer = Tokenizer(nlp.vocab)
    counts = Counter()
    doc_counts = Counter()
    for line in tqdm.tqdm(open(input_path, "r")):
        words = [t.text for t in tokenizer(line)]
        counts.update(words)
        doc_counts.update(set(words))
    return counts, doc_counts
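A brief usage sketch, assuming the surrounding module already imports English, Tokenizer, Language, Path, Counter and tqdm, and that sentences.txt is a hypothetical file with one document per line:

from pathlib import Path
from spacy.lang.en import English

word_counts, doc_counts = count_frequencies(English, Path("sentences.txt"))
print(word_counts.most_common(5))  # most frequent tokens overall
print(doc_counts.most_common(5))   # tokens that appear in the most documents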
Example 7: __init__
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def __init__(self):
    nlp = English()
    # Create a Tokenizer with the default settings for English,
    # including punctuation rules and exceptions.
    self._tokenizer = nlp.Defaults.create_tokenizer(nlp)
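nlp.Defaults.create_tokenizer is a spaCy v2 API; it is no longer available in v3, where the blank pipeline already exposes the default English tokenizer as nlp.tokenizer. A hedged sketch of the same constructor on recent spaCy versions (the class name is illustrative, not from the original project):

from spacy.lang.en import English

class DefaultEnglishTokenizer:  # illustrative wrapper
    def __init__(self):
        nlp = English()
        # The blank pipeline carries the default English tokenizer,
        # including punctuation rules and tokenizer exceptions.
        self._tokenizer = nlp.tokenizer

    def __call__(self, text):
        return [tok.text for tok in self._tokenizer(text)]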
Example 8: spacy_nlp
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def spacy_nlp():
    if getattr(spacy_nlp, '_nlp', None) is None:
        try:
            from spacy.lang.en import English
            spacy_nlp._nlp = English()
        except ImportError:
            raise ImportError('Please install spacy with: pip install spacy')
    return spacy_nlp._nlp
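A small usage sketch showing the lazy, cached construction (purely illustrative):

nlp = spacy_nlp()
assert spacy_nlp() is nlp  # English() is built once and cached on the function object
doc = nlp("Lazy loading keeps start-up cheap.")
print([token.text for token in doc])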
Example 9: __init__
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def __init__(self, language=English):
    self.nlp = language()
    self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
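A sketch of what this constructor enables once wrapped in a class, here called SentenceSplitter purely for illustration; note that nlp.create_pipe is the spaCy v2 idiom, while v3 accepts the component name directly via nlp.add_pipe('sentencizer'):

from spacy.lang.en import English

class SentenceSplitter:  # hypothetical wrapper around the constructor above
    def __init__(self, language=English):
        self.nlp = language()
        self.nlp.add_pipe('sentencizer')  # v3 style; v2: nlp.add_pipe(nlp.create_pipe('sentencizer'))

    def split(self, text):
        return [sent.text for sent in self.nlp(text).sents]

print(SentenceSplitter().split("First sentence. Second one!"))
# -> ['First sentence.', 'Second one!']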
Example 10: __init__
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
             sep_token="</s>", pad_token="<pad>", cls_token="</s>",
             mask_token="<special1>", additional_special_tokens=["<special0>",
             "<special1>", "<special2>", "<special3>", "<special4>", "<special5>",
             "<special6>", "<special7>", "<special8>", "<special9>"], **kwargs):
    super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
                                       sep_token=sep_token, pad_token=pad_token,
                                       cls_token=cls_token, mask_token=mask_token,
                                       additional_special_tokens=additional_special_tokens,
                                       **kwargs)
    try:
        import ftfy
        from spacy.lang.en import English
        _nlp = English()
        self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True)
        self.fix_text = None
    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
    merges = [tuple(merge.split()[:2]) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
Example 11: test_detokenize_doc
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def test_detokenize_doc(text):
    # Initialize the spaCy extension needed to detokenize text.
    WordNet()
    nlp = English()
    doc = nlp(text)
    # Fill out the replacement attribute as WordNet would.
    for tok in doc:
        tok._.replacement = tok.text
    assert _detokenize_doc(doc) == text
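The replacement attribute used here is a custom token extension that the project's WordNet() constructor registers; outside that project you would register it yourself. A hedged sketch of that registration and of the detokenization idea (the join below mirrors what a detokenizer typically does, not the project's exact _detokenize_doc):

from spacy.tokens import Token
from spacy.lang.en import English

# Register the custom attribute that tok._.replacement relies on.
Token.set_extension("replacement", default=None, force=True)

nlp = English()
doc = nlp("Round-tripping text through tokens.")
for tok in doc:
    tok._.replacement = tok.text
print("".join(tok._.replacement + tok.whitespace_ for tok in doc))  # reconstructs the input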
Example 12: nlp
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def nlp():
    return English()
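This reads like a pytest fixture whose decorator the aggregator stripped; a sketch of how such a fixture is typically declared and consumed (the test function below is illustrative):

import pytest
from spacy.lang.en import English

@pytest.fixture
def nlp():
    return English()

def test_blank_pipeline_tokenizes(nlp):
    doc = nlp("A blank English pipeline still tokenizes text.")
    assert [t.text for t in doc][:2] == ["A", "blank"]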
Example 13: __init__
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
    super().__init__(unk_token=unk_token, **kwargs)
    self.max_len_single_sentence = (
        self.max_len
    )  # no default special tokens - you can update this value if you add special tokens
    self.max_len_sentences_pair = (
        self.max_len
    )  # no default special tokens - you can update this value if you add special tokens
    try:
        import ftfy
        from spacy.lang.en import English

        _nlp = English()
        self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True)
        self.fix_text = None
    with open(vocab_file, encoding="utf-8") as vocab_handle:
        self.encoder = json.load(vocab_handle)
    self.decoder = {v: k for k, v in self.encoder.items()}
    with open(merges_file, encoding="utf-8") as merges_handle:
        merges = merges_handle.read().split("\n")[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}