This article collects typical usage examples of spacy.lang.en.English in Python. If you have been wondering what en.English is for, how to use it, or what working examples look like, the curated code samples below may help. You can also explore further usage examples from the containing module, spacy.lang.en.
13 code examples of en.English are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
Example 1: __init__
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def __init__(
    self,
    parser,
    stop_words=spacy.lang.en.stop_words.STOP_WORDS,
    punctuations=string.punctuation,
):
    """Initialize the BOWTokenizer object.

    Arguments:
        parser {spacy.lang.en.English - by default} -- Any parser object
            that supports a parser(sentence) call on it.

    Keyword Arguments:
        stop_words {iterable over str} -- Set of stop words to be removed.
            (default: {spacy.lang.en.stop_words.STOP_WORDS})
        punctuations {iterable over str} -- Set of punctuations to be
            removed. (default: {string.punctuation})
    """
    self.parser = parser
    # List of stop words and punctuation marks to filter out.
    self.stop_words = stop_words
    self.punctuations = punctuations
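A minimal usage sketch for this constructor, assuming the surrounding class is the BOWTokenizer named in the docstring; the variable names and the sample call below are illustrative, not part of the original code:

from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import string

# Any callable that supports parser(sentence) works; a blank English pipeline is the usual choice.
parser = English()
tokenizer = BOWTokenizer(parser, stop_words=STOP_WORDS, punctuations=string.punctuation)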
Example 2: normalize_batch
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def normalize_batch(p_iter, p_batch_size=1000, p_thread_count=5):
    """Normalize and tokenize strings.

    Args:
        p_iter (iter): iterator over strings to normalize and tokenize.
        p_batch_size (int): number of strings per batch passed to spaCy's pipe.
        p_thread_count (int): number of threads to run.

    Returns:
        iter: iterator over normalized, tokenized strings.
    """
    global NLP
    if not NLP:
        NLP = NlpEnglish(parser=False)
    output_iter = NLP.pipe(p_iter,
                           batch_size=p_batch_size,
                           n_threads=p_thread_count)
    for doc in output_iter:
        tokens = [str(w).strip().lower() for w in doc]
        yield ' '.join(tokens)
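A hedged usage sketch: it assumes NlpEnglish is an alias for spacy.lang.en.English and NLP is a module-level cache initialized to None, both implied but not shown in the snippet. Note also that the n_threads argument of Language.pipe is a no-op from spaCy 2.1 onward and was removed in v3, so this code targets older spaCy releases.

from spacy.lang.en import English as NlpEnglish

NLP = None  # module-level cache that normalize_batch fills on first use

texts = ["Hello,   World!", "spaCy handles the   tokenization."]
for normalized in normalize_batch(texts, p_batch_size=2, p_thread_count=1):
    print(normalized)  # e.g. "hello , world !"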
Example 3: __init__
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
    super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)
    try:
        import ftfy
        from spacy.lang.en import English
        _nlp = English()
        self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True)
        self.fix_text = None
    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
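The try/except above is an optional-dependency fallback: when ftfy and spaCy are importable, text is repaired with ftfy and split with spaCy's rule-based English tokenizer; otherwise a BERT-style BasicTokenizer takes over. A stripped-down sketch of the same pattern outside the tokenizer class (the function name and the naive fallback are illustrative, not part of the original code):

def build_word_tokenizer():
    """Prefer ftfy + spaCy when installed, otherwise fall back to naive splitting."""
    try:
        import ftfy
        from spacy.lang.en import English

        spacy_tok = English().tokenizer  # rule-based English tokenizer

        def tokenize(text):
            return [t.text for t in spacy_tok(ftfy.fix_text(text))]
    except ImportError:
        def tokenize(text):
            return text.lower().split()  # crude stand-in for BasicTokenizer
    return tokenize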
Example 4: fasttext_preprocess
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def fasttext_preprocess(texts: List[str]) -> List[str]:
    """
    Apply preprocessing appropriate for a fastText model to a set of texts.

    Args:
        texts: Texts to preprocess.

    Returns:
        List of preprocessed texts.
    """
    nlp = English()
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    processed_texts = []
    for doc in tokenizer.pipe(texts, batch_size=500):
        processed_texts.append(" ".join(tok.lower_ for tok in doc if tok.is_alpha))
    return processed_texts
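A quick usage sketch; it assumes typing.List and spacy.lang.en.English are imported in the surrounding module, and that a spaCy v2 release is in use, since nlp.Defaults.create_tokenizer was dropped in v3 (where nlp.tokenizer provides the same default tokenizer):

texts = ["Numbers like 42 are dropped!", "Only alphabetic tokens survive."]
print(fasttext_preprocess(texts))
# -> ['numbers like are dropped', 'only alphabetic tokens survive']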
Example 5: _generate_text
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def _generate_text(sentences, fname, threshold=0.9):
    """Iterate through sentences and write each one that is not a signature
    block to file. A sentence counts as a signature block when
    probability(signature block) > threshold.

    The part-of-speech tagger (spaCy's en_core_web_sm) is loaded inside the
    function; other POS taggers such as NLTK's may be explored later.

    Parameters
    ----------
    sentences : iterable of str
        Lines in the email block.
    fname : str
        Name of the new corpus file, written without signature blocks.
    threshold : float
        Lower thresholds result in more false positives.
    """
    tagger = spacy.load('en_core_web_sm')
    with open(fname, "w") as new_file:
        for sentence in sentences:
            if _prob_block(sentence, tagger) < threshold:
                new_file.write(sentence)
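_prob_block is a helper from the original project that is not reproduced here, so the stub below is a purely hypothetical stand-in (it simply flags lines starting with the conventional "--" signature delimiter); the sketch also requires the en_core_web_sm model to be installed for spacy.load to succeed.

# Hypothetical stand-in for the project's _prob_block helper, for illustration only.
def _prob_block(sentence, tagger):
    return 1.0 if sentence.strip().startswith("--") else 0.0

lines = ["Hi team,\n", "The report is attached.\n", "--\n", "Jane Doe\n"]
_generate_text(lines, "email_without_signature.txt", threshold=0.9)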
Example 6: count_frequencies
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def count_frequencies(language_class: Language, input_path: Path):
    """
    Given a file containing one document per line
    (in this case, sentences for the ICLR case law corpus), split the text
    with a science-specific tokenizer and compute word and
    document frequencies for all words.
    """
    print(f"Processing {input_path}.")
    nlp = English()
    # tokenizer = combined_rule_tokenizer(language_class())
    tokenizer = Tokenizer(nlp.vocab)
    counts = Counter()
    doc_counts = Counter()
    for line in tqdm.tqdm(open(input_path, "r")):
        words = [t.text for t in tokenizer(line)]
        counts.update(words)
        doc_counts.update(set(words))
    return counts, doc_counts
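A brief usage sketch, assuming the surrounding module already imports English, Tokenizer, Language, Path, Counter and tqdm, and that sentences.txt is a hypothetical file with one document per line:

from pathlib import Path
from spacy.lang.en import English

word_counts, doc_counts = count_frequencies(English, Path("sentences.txt"))
print(word_counts.most_common(5))  # most frequent tokens overall
print(doc_counts.most_common(5))   # tokens that appear in the most documents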
Example 7: __init__
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def __init__(self):
    nlp = English()
    # Create a Tokenizer with the default settings for English,
    # including punctuation rules and exceptions.
    self._tokenizer = nlp.Defaults.create_tokenizer(nlp)
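nlp.Defaults.create_tokenizer is a spaCy v2 API; it is no longer available in v3, where the blank pipeline already exposes the default English tokenizer as nlp.tokenizer. A hedged sketch of the same constructor on recent spaCy versions (the class name is illustrative, not from the original project):

from spacy.lang.en import English

class DefaultEnglishTokenizer:  # illustrative wrapper
    def __init__(self):
        nlp = English()
        # The blank pipeline carries the default English tokenizer,
        # including punctuation rules and tokenizer exceptions.
        self._tokenizer = nlp.tokenizer

    def __call__(self, text):
        return [tok.text for tok in self._tokenizer(text)]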
Example 8: spacy_nlp
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def spacy_nlp():
    if getattr(spacy_nlp, '_nlp', None) is None:
        try:
            from spacy.lang.en import English
            spacy_nlp._nlp = English()
        except ImportError:
            raise ImportError('Please install spacy with: pip install spacy')
    return spacy_nlp._nlp
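A small usage sketch showing the lazy, cached construction (purely illustrative):

nlp = spacy_nlp()
assert spacy_nlp() is nlp  # English() is built once and cached on the function object
doc = nlp("Lazy loading keeps start-up cheap.")
print([token.text for token in doc])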
Example 9: __init__
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def __init__(self, language=English):
    self.nlp = language()
    self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
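A sketch of what this constructor enables once wrapped in a class, here called SentenceSplitter purely for illustration; note that nlp.create_pipe is the spaCy v2 idiom, while v3 accepts the component name directly via nlp.add_pipe('sentencizer'):

from spacy.lang.en import English

class SentenceSplitter:  # hypothetical wrapper around the constructor above
    def __init__(self, language=English):
        self.nlp = language()
        self.nlp.add_pipe('sentencizer')  # v3 style; v2: nlp.add_pipe(nlp.create_pipe('sentencizer'))

    def split(self, text):
        return [sent.text for sent in self.nlp(text).sents]

print(SentenceSplitter().split("First sentence. Second one!"))
# -> ['First sentence.', 'Second one!']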
Example 10: __init__
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
             sep_token="</s>", pad_token="<pad>", cls_token="</s>",
             mask_token="<special1>", additional_special_tokens=["<special0>",
             "<special1>", "<special2>", "<special3>", "<special4>", "<special5>",
             "<special6>", "<special7>", "<special8>", "<special9>"], **kwargs):
    super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
                                       sep_token=sep_token, pad_token=pad_token,
                                       cls_token=cls_token, mask_token=mask_token,
                                       additional_special_tokens=additional_special_tokens,
                                       **kwargs)
    try:
        import ftfy
        from spacy.lang.en import English
        _nlp = English()
        self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True)
        self.fix_text = None
    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
    merges = [tuple(merge.split()[:2]) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
Example 11: test_detokenize_doc
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def test_detokenize_doc(text):
    # Initialize the spaCy extension needed to detokenize text.
    WordNet()
    nlp = English()
    doc = nlp(text)
    # Fill out the replacement attribute as WordNet would.
    for tok in doc:
        tok._.replacement = tok.text
    assert _detokenize_doc(doc) == text
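The replacement attribute used here is a custom token extension that the project's WordNet() constructor registers; outside that project you would register it yourself. A hedged sketch of that registration and of the detokenization idea (the join below mirrors what a detokenizer typically does, not the project's exact _detokenize_doc):

from spacy.tokens import Token
from spacy.lang.en import English

# Register the custom attribute that tok._.replacement relies on.
Token.set_extension("replacement", default=None, force=True)

nlp = English()
doc = nlp("Round-tripping text through tokens.")
for tok in doc:
    tok._.replacement = tok.text
print("".join(tok._.replacement + tok.whitespace_ for tok in doc))  # reconstructs the input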
Example 12: nlp
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def nlp():
    return English()
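This reads like a pytest fixture whose decorator the aggregator stripped; a sketch of how such a fixture is typically declared and consumed (the test function below is illustrative):

import pytest
from spacy.lang.en import English

@pytest.fixture
def nlp():
    return English()

def test_blank_pipeline_tokenizes(nlp):
    doc = nlp("A blank English pipeline still tokenizes text.")
    assert [t.text for t in doc][:2] == ["A", "blank"]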
Example 13: __init__
# Required import: from spacy.lang import en [as alias]
# Or: from spacy.lang.en import English [as alias]
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
    super().__init__(unk_token=unk_token, **kwargs)
    self.max_len_single_sentence = (
        self.max_len
    )  # no default special tokens - you can update this value if you add special tokens
    self.max_len_sentences_pair = (
        self.max_len
    )  # no default special tokens - you can update this value if you add special tokens
    try:
        import ftfy
        from spacy.lang.en import English

        _nlp = English()
        self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True)
        self.fix_text = None
    with open(vocab_file, encoding="utf-8") as vocab_handle:
        self.encoder = json.load(vocab_handle)
    self.decoder = {v: k for k, v in self.encoder.items()}
    with open(merges_file, encoding="utf-8") as merges_handle:
        merges = merges_handle.read().split("\n")[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}