當前位置: 首頁>>代碼示例>>Python>>正文


Python en.English方法代碼示例

本文整理匯總了Python中spacy.lang.en.English方法的典型用法代碼示例。如果您正苦於以下問題:Python en.English方法的具體用法?Python en.English怎麼用?Python en.English使用的例子?那麼, 這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在spacy.lang.en的用法示例。


在下文中一共展示了en.English方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: __init__

# 需要導入模塊: from spacy.lang import en [as 別名]
# 或者: from spacy.lang.en import English [as 別名]
def __init__(
        self,
        parser,
        stop_words=spacy.lang.en.stop_words.STOP_WORDS,
        punctuations=string.punctuation,
    ):
        """Store the parser and the two token filters on the instance.

        Nothing is validated or copied here; the arguments are kept as given.

        Arguments:
            parser -- Parser object, required (the signature has no default).
                The original notes expect something that supports a
                ``parser(sentence)`` call, e.g. spacy.lang.en.English.

        Keyword Arguments:
            stop_words {iterable over str} -- Stop words intended to be
                filtered out.
                (default: {spacy.lang.en.stop_words.STOP_WORDS})
            punctuations {iterable over str} -- Punctuation marks intended
                to be filtered out. (default: {string.punctuation})
        """
        # Single unpacking assignment; targets are bound left to right, so the
        # attributes are set in the order parser, stop_words, punctuations.
        self.parser, self.stop_words, self.punctuations = (
            parser,
            stop_words,
            punctuations,
        )
開發者ID:interpretml,項目名稱:interpret-text,代碼行數:24,代碼來源:utils_classical.py

示例2: normalize_batch

# 需要導入模塊: from spacy.lang import en [as 別名]
# 或者: from spacy.lang.en import English [as 別名]
def normalize_batch(p_iter, p_batch_size=1000, p_thread_count=5):
    """Lazily normalize and tokenize a stream of strings.

    This is a generator: nothing runs until it is first advanced. At that
    point the module-level ``NLP`` pipeline is created if it is still falsy,
    and it is reused afterwards.

    Args:
        p_iter (iter): iterable of strings to normalize and tokenize.
        p_batch_size (int): forwarded to ``NLP.pipe`` as ``batch_size``.
        p_thread_count (int): forwarded to ``NLP.pipe`` as ``n_threads``.

    Yields:
        str: for each document produced by the pipeline, its tokens
        stripped, lower-cased and joined with single spaces.
    """
    global NLP
    if not NLP:
        NLP = NlpEnglish(parser=False)

    pipe_options = {'batch_size': p_batch_size, 'n_threads': p_thread_count}
    for parsed in NLP.pipe(p_iter, **pipe_options):
        yield ' '.join(str(token).strip().lower() for token in parsed)
開發者ID:spacemanidol,項目名稱:MSMARCO,代碼行數:24,代碼來源:eval_exp.py

示例3: normalize_batch

# 需要導入模塊: from spacy.lang import en [as 別名]
# 或者: from spacy.lang.en import English [as 別名]
def normalize_batch(p_iter, p_batch_size=1000, p_thread_count=5):
    """Yield one normalized, space-joined token string per parsed document.

    The shared module-level ``NLP`` object is built on demand (when falsy)
    the first time this generator is advanced.

    Args:
        p_iter (iter): iterable of strings to normalize and tokenize.
        p_batch_size (int): passed to ``NLP.pipe`` as ``batch_size``.
        p_thread_count (int): passed to ``NLP.pipe`` as ``n_threads``.

    Yields:
        str: the document's tokens, each converted with ``str``, stripped
        and lower-cased, separated by single spaces.
    """
    global NLP
    if not NLP:
        NLP = NlpEnglish(parser=False)

    documents = NLP.pipe(
        p_iter,
        batch_size=p_batch_size,
        n_threads=p_thread_count,
    )
    for document in documents:
        yield ' '.join(map(lambda tok: str(tok).strip().lower(), document))
開發者ID:spacemanidol,項目名稱:MSMARCO,代碼行數:25,代碼來源:ms_marco_eval.py

示例4: __init__

# 需要導入模塊: from spacy.lang import en [as 別名]
# 或者: from spacy.lang.en import English [as 別名]
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        """Set up the tokenizer from a vocabulary file and a merges file.

        Args:
            vocab_file: Path to a UTF-8 JSON file. Its parsed content becomes
                ``self.encoder``; ``self.decoder`` is the inverted mapping.
            merges_file: Path to a UTF-8 text file with one merge per line.
                The first line and the last newline-split element are
                skipped; every remaining line is split on whitespace.
            unk_token: Forwarded to the parent initializer.
            **kwargs: Forwarded unchanged to the parent initializer.
        """
        super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)

        try:
            # Optional dependencies: use SpaCy + ftfy when both import.
            import ftfy
            from spacy.lang.en import English
            _nlp = English()
            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True)
            self.fix_text = None

        # Context managers close both files deterministically instead of
        # leaving the handles to the garbage collector.
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        # A merge's rank is its position in the file (after the skipped line).
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}
開發者ID:linhaow,項目名稱:TextClassify,代碼行數:22,代碼來源:tokenization_openai.py

示例5: fasttext_preprocess

# 需要導入模塊: from spacy.lang import en [as 別名]
# 或者: from spacy.lang.en import English [as 別名]
def fasttext_preprocess(texts: List[str]) -> List[str]:
    """
    Reduce each text to its lower-cased alphabetic tokens.

    The texts are run through the default English tokenizer in batches of
    500; tokens whose ``is_alpha`` flag is false are dropped.

    Args:
      texts: Texts to preprocess.

    Returns:
      One string per document yielded by the tokenizer, made of the kept
      tokens' ``lower_`` forms joined by single spaces.
    """
    english = English()
    tokenize = english.Defaults.create_tokenizer(english)
    return [
        " ".join(token.lower_ for token in parsed if token.is_alpha)
        for parsed in tokenize.pipe(texts, batch_size=500)
    ]
開發者ID:RTIInternational,項目名稱:gobbli,代碼行數:19,代碼來源:benchmark_util.py

示例6: _generate_text

# 需要導入模塊: from spacy.lang import en [as 別名]
# 或者: from spacy.lang.en import English [as 別名]
def _generate_text(sentences, fname, threshold=0.9):
    """Write the sentences that are not signature-block lines to a file.

    A sentence is kept when ``_prob_block(sentence, tagger)`` is strictly
    below ``threshold`` (the score is presumably the probability that the
    sentence belongs to a signature block); all other sentences are dropped.

    Parameters
    ----------
    sentences : iterable of str
        Lines of the email. Kept lines are written verbatim; no separator
        or newline is added.
    fname : str
        Path of the output file. It is opened in "w" mode, so any existing
        content is replaced.
    threshold: float
        Cut-off for dropping a sentence. Lower values drop more sentences,
        i.e. produce more false positives.
    """
    # The spaCy model is loaded before the file is opened, so a failed load
    # never creates or truncates the output file.
    pos_tagger = spacy.load('en_core_web_sm')

    with open(fname, "w") as out_file:
        out_file.writelines(
            line
            for line in sentences
            if _prob_block(line, pos_tagger) < threshold
        )
開發者ID:mynameisvinn,項目名稱:EmailParser,代碼行數:26,代碼來源:Parser.py

示例7: count_frequencies

# 需要導入模塊: from spacy.lang import en [as 別名]
# 或者: from spacy.lang.en import English [as 別名]
def count_frequencies(language_class: Language, input_path: Path):
    """
    Compute word and document frequencies for a one-document-per-line file.

    Each line of ``input_path`` is treated as one document and tokenized
    with a ``Tokenizer`` built only from a blank English vocabulary (no
    extra prefix/suffix/infix rules are supplied).

    Args:
      language_class: Currently unused; kept so existing callers still work.
      input_path: Path of the text file to read, one document per line.

    Returns:
      A ``(counts, doc_counts)`` tuple of ``Counter`` objects: ``counts``
      holds the total occurrences of every token text, ``doc_counts`` the
      number of lines in which each token text appears at least once.
    """
    print(f"Processing {input_path}.")
    nlp = English()
    # NOTE(review): an earlier variant built the tokenizer from
    # `language_class` via `combined_rule_tokenizer`; that path is disabled,
    # which is why `language_class` is not used below.
    tokenizer = Tokenizer(nlp.vocab)
    counts = Counter()
    doc_counts = Counter()
    # The context manager closes the input file even when tokenization
    # raises; previously the handle was never closed.
    with open(input_path, "r") as input_file:
        for line in tqdm.tqdm(input_file):
            words = [t.text for t in tokenizer(line)]
            counts.update(words)
            # De-duplicate so each line contributes at most 1 per word.
            doc_counts.update(set(words))

    return counts, doc_counts
開發者ID:ICLRandD,項目名稱:Blackstone,代碼行數:21,代碼來源:word_freqs.py

示例8: normalize_batch

# 需要導入模塊: from spacy.lang import en [as 別名]
# 或者: from spacy.lang.en import English [as 別名]
def normalize_batch(p_iter, p_batch_size=1000, p_thread_count=5):
    """Generate normalized, tokenized versions of the input strings.

    On the first advance of this generator the module-level ``NLP``
    pipeline is instantiated if it is falsy; later calls reuse it.

    Args:
        p_iter (iter): iterable of strings to normalize and tokenize.
        p_batch_size (int): handed to ``NLP.pipe`` as ``batch_size``.
        p_thread_count (int): handed to ``NLP.pipe`` as ``n_threads``.

    Yields:
        str: one string per parsed document, built from its tokens after
        ``str()``, ``strip()`` and ``lower()``, joined by single spaces.
    """

    def _clean(token):
        # Text form of the token without surrounding whitespace, lower-cased.
        return str(token).strip().lower()

    global NLP
    if not NLP:
        NLP = NlpEnglish(parser=False)

    for parsed_doc in NLP.pipe(
        p_iter, batch_size=p_batch_size, n_threads=p_thread_count
    ):
        pieces = []
        for word in parsed_doc:
            pieces.append(_clean(word))
        yield ' '.join(pieces)
開發者ID:xycforgithub,項目名稱:MultiTask-MRC,代碼行數:24,代碼來源:ms_marco_eval_pretoken.py

示例9: __init__

# 需要導入模塊: from spacy.lang import en [as 別名]
# 或者: from spacy.lang.en import English [as 別名]
def __init__(self):
        """Build the default English tokenizer and keep it on the instance."""
        english = English()
        # The tokenizer comes from the language defaults, so it carries the
        # standard English punctuation rules and exceptions.
        default_tokenizer = english.Defaults.create_tokenizer(english)
        self._tokenizer = default_tokenizer
開發者ID:tokern,項目名稱:piicatcher,代碼行數:7,代碼來源:tokenizer.py

示例10: spacy_nlp

# 需要導入模塊: from spacy.lang import en [as 別名]
# 或者: from spacy.lang.en import English [as 別名]
def spacy_nlp():
    """Return a shared ``English`` pipeline, creating it on first use.

    The instance is cached on the function object itself as
    ``spacy_nlp._nlp`` and returned unchanged on later calls.

    Raises:
        ImportError: if importing or constructing the pipeline raises
            ImportError; the message tells the user how to install spacy.
    """
    cached = getattr(spacy_nlp, '_nlp', None)
    if cached is not None:
        return cached

    try:
        from spacy.lang.en import English
        spacy_nlp._nlp = English()
    except ImportError:
        raise ImportError('Please install spacy with: pip install spacy')
    return spacy_nlp._nlp
開發者ID:pytorch,項目名稱:fairseq,代碼行數:10,代碼來源:alignment_utils.py

示例11: __init__

# 需要導入模塊: from spacy.lang import en [as 別名]
# 或者: from spacy.lang.en import English [as 別名]
def __init__(self, language=English):
        """Instantiate the language pipeline and add a sentencizer to it.

        Args:
            language: Callable returning the pipeline object; it is called
                with no arguments. Defaults to ``English``.
        """
        self.nlp = language()
        # Create the component first, then register it on the same pipeline.
        sentencizer = self.nlp.create_pipe('sentencizer')
        self.nlp.add_pipe(sentencizer)
開發者ID:dmmiller612,項目名稱:bert-extractive-summarizer,代碼行數:5,代碼來源:sentence_handler.py

示例12: __init__

# 需要導入模塊: from spacy.lang import en [as 別名]
# 或者: from spacy.lang.en import English [as 別名]
def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
                 sep_token="</s>", pad_token="<pad>", cls_token="</s>",
                 mask_token="<special1>", additional_special_tokens=["<special0>",
                 "<special1>", "<special2>", "<special3>", "<special4>", "<special5>",
                 "<special6>", "<special7>", "<special8>", "<special9>"], **kwargs):
        """Set up the tokenizer from a vocabulary file and a merges file.

        Args:
            vocab_file: Path to a UTF-8 JSON file. Its parsed content becomes
                ``self.encoder``; ``self.decoder`` is the inverted mapping.
            merges_file: Path to a UTF-8 text file with one merge per line.
                The last newline-split element is skipped and only the first
                two whitespace-separated fields of each line are kept.
            unk_token, bos_token, sep_token, pad_token, cls_token,
            mask_token, additional_special_tokens: Special-token settings,
                forwarded unchanged to the parent initializer.
            **kwargs: Forwarded unchanged to the parent initializer.

        NOTE(review): ``additional_special_tokens`` defaults to a list
        literal, which is a single object shared by every call; this is only
        safe while nothing mutates it in place. Confirm against the parent
        class. The default is left as-is to keep the signature unchanged.
        """
        super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
                                           sep_token=sep_token, pad_token=pad_token,
                                           cls_token=cls_token, mask_token=mask_token,
                                           additional_special_tokens=additional_special_tokens,
                                           **kwargs)
        try:
            # Optional dependencies: use SpaCy + ftfy when both import.
            import ftfy
            from spacy.lang.en import English
            _nlp = English()
            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True)
            self.fix_text = None

        # Context managers close both files deterministically instead of
        # leaving the handles to the garbage collector.
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(merges_file, encoding='utf-8') as merges_handle:
            merges = merges_handle.read().split('\n')[:-1]
        # Anything after the first two fields on a line is ignored.
        merges = [tuple(merge.split()[:2]) for merge in merges]
        # A merge's rank is its position in the file.
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}
開發者ID:linhaow,項目名稱:TextClassify,代碼行數:29,代碼來源:tokenization_xlm.py

示例13: test_detokenize_doc

# 需要導入模塊: from spacy.lang import en [as 別名]
# 或者: from spacy.lang.en import English [as 別名]
def test_detokenize_doc(text):
    """Check that detokenizing an unmodified document reproduces its text."""
    # Instantiated only for its side effect: per the original note, this
    # initializes the spaCy extension that detokenization relies on.
    WordNet()

    parsed = English()(text)

    # Mimic what WordNet would fill in, using each token's own text as its
    # replacement.
    for token in parsed:
        token._.replacement = token.text

    restored = _detokenize_doc(parsed)
    assert restored == text
開發者ID:RTIInternational,項目名稱:gobbli,代碼行數:13,代碼來源:test_wordnet.py

示例14: nlp

# 需要導入模塊: from spacy.lang import en [as 別名]
# 或者: from spacy.lang.en import English [as 別名]
def nlp():
    """Return a newly constructed ``English`` pipeline."""
    pipeline = English()
    return pipeline
開發者ID:mpuig,項目名稱:spacy-lookup,代碼行數:4,代碼來源:test_entities.py

示例15: __init__

# 需要導入模塊: from spacy.lang import en [as 別名]
# 或者: from spacy.lang.en import English [as 別名]
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        """Set up the tokenizer from a vocabulary file and a merges file.

        Args:
            vocab_file: Path to a UTF-8 JSON file. Its parsed content becomes
                ``self.encoder``; ``self.decoder`` is the inverted mapping.
            merges_file: Path to a UTF-8 text file with one merge per line.
                The first line and the last newline-split element are
                skipped; every remaining line is split on whitespace.
            unk_token: Forwarded to the parent initializer.
            **kwargs: Forwarded unchanged to the parent initializer.
        """
        super().__init__(unk_token=unk_token, **kwargs)

        # No default special tokens, so both limits start out equal to
        # max_len; update these values if you add special tokens.
        self.max_len_single_sentence = self.max_len
        self.max_len_sentences_pair = self.max_len

        try:
            # Optional dependencies: use SpaCy + ftfy when both import.
            import ftfy
            from spacy.lang.en import English

            english = English()
            self.nlp = english.Defaults.create_tokenizer(english)
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True)
            self.fix_text = None

        with open(vocab_file, encoding="utf-8") as vocab_fp:
            self.encoder = json.load(vocab_fp)
        self.decoder = {value: key for key, value in self.encoder.items()}

        with open(merges_file, encoding="utf-8") as merges_fp:
            merge_lines = merges_fp.read().split("\n")[1:-1]
        # A merge's rank is its position among the retained lines; if a
        # merge repeats, the later position wins.
        self.bpe_ranks = {
            tuple(line.split()): rank for rank, line in enumerate(merge_lines)
        }
        self.cache = {}
開發者ID:bhoov,項目名稱:exbert,代碼行數:32,代碼來源:tokenization_openai.py


注:本文中的spacy.lang.en.English方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。