當前位置: 首頁>>代碼示例>>Python>>正文


Python tokenizer.Tokenizer方法代碼示例

本文整理匯總了Python中spacy.tokenizer.Tokenizer方法的典型用法代碼示例。如果您正苦於以下問題:Python tokenizer.Tokenizer方法的具體用法?Python tokenizer.Tokenizer怎麼用?Python tokenizer.Tokenizer使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在spacy.tokenizer的用法示例。


在下文中一共展示了tokenizer.Tokenizer方法的8個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: transform

# 需要導入模塊: from spacy import tokenizer [as 別名]
# 或者: from spacy.tokenizer import Tokenizer [as 別名]
def transform(self, data):
        """Turn each item of ``data`` into the mean of its known token vectors.

        Every item is tokenized with a spaCy ``Tokenizer`` built on
        ``nlp.vocab``. The lower-cased text of each token is looked up in
        ``self.model``; tokens that are not present are skipped. When an item
        has no token present in ``self.model``, ``np.zeros(self.dim)`` is
        averaged instead so the item still yields a vector.

        Returns:
            An ``np.array`` holding one averaged vector per item of ``data``.
        """
        tok = Tokenizer(nlp.vocab)
        averaged = []
        for doc in tok.pipe(data):
            found = []
            for token in doc:
                if token.text.lower() in self.model:
                    found.append(self.model[token.text.lower()])
            # Fall back to a single zero vector so np.mean never sees an empty list.
            if not found:
                found = [np.zeros(self.dim)]
            averaged.append(np.mean(found, axis=0))
        return np.array(averaged)
開發者ID:mozilla,項目名稱:bugbug,代碼行數:18,代碼來源:nlp.py

示例2: count_frequencies

# 需要導入模塊: from spacy import tokenizer [as 別名]
# 或者: from spacy.tokenizer import Tokenizer [as 別名]
def count_frequencies(language_class: Language, input_path: Path):
    """
    Compute word and document frequencies for a one-document-per-line file.

    Each line of ``input_path`` is treated as a single document and is
    tokenized with a bare spaCy ``Tokenizer`` built on the ``English`` vocab.

    Args:
        language_class: Not used by the current implementation; kept so that
            existing callers keep working.
        input_path: Path of the text file to read, one document per line.

    Returns:
        A ``(counts, doc_counts)`` tuple of ``Counter`` objects. ``counts``
        holds the total number of occurrences of each token text, and
        ``doc_counts`` holds the number of lines each token text appears in.
    """
    print(f"Processing {input_path}.")
    nlp = English()
    tokenizer = Tokenizer(nlp.vocab)
    counts = Counter()
    doc_counts = Counter()
    # The context manager closes the file even if tokenization raises midway.
    with open(input_path, "r") as corpus:
        for line in tqdm.tqdm(corpus):
            words = [t.text for t in tokenizer(line)]
            counts.update(words)
            # A set ensures each word is counted at most once per document.
            doc_counts.update(set(words))

    return counts, doc_counts
開發者ID:ICLRandD,項目名稱:Blackstone,代碼行數:21,代碼來源:word_freqs.py

示例3: load_data

# 需要導入模塊: from spacy import tokenizer [as 別名]
# 或者: from spacy.tokenizer import Tokenizer [as 別名]
def load_data(data_path: str, tokenize: bool = False, tokenizer_type: str = "just_spaces") -> List[str]:
    """Read one example per line from ``data_path`` and return their texts.

    Files whose name ends in ``.jsonl`` or ``.json`` are parsed line by line
    with ``json.loads`` and the ``'text'`` key of each object is used; for any
    other file the raw line is the text.

    Args:
        data_path: Path of the file to read.
        tokenize: When true, each text is tokenized and the tokens are
            re-joined with single spaces.
        tokenizer_type: ``"just_spaces"`` (uses ``SpacyWordSplitter``) or
            ``"spacy"`` (uses a spaCy ``Tokenizer`` on the ``'en'`` vocab).
            Only consulted when ``tokenize`` is true.

    Returns:
        The list of (optionally tokenized) texts, in file order.

    Raises:
        ValueError: If ``tokenize`` is true and ``tokenizer_type`` is not one
            of the supported values.
    """
    split = None
    if tokenize:
        # Build the tokenizer only when it is going to be used; loading a
        # spaCy model is expensive and pointless for tokenize=False.
        if tokenizer_type == "just_spaces":
            split = SpacyWordSplitter().split_words
        elif tokenizer_type == "spacy":
            nlp = spacy.load('en')
            split = Tokenizer(nlp.vocab)
        else:
            raise ValueError(
                f"Unknown tokenizer_type {tokenizer_type!r}; "
                "expected 'just_spaces' or 'spacy'"
            )

    # The file extension does not change between lines, so test it once.
    is_json = data_path.endswith((".jsonl", ".json"))
    tokenized_examples = []
    # Open the file in its own context manager: tqdm's context manager only
    # closes the progress bar, not the underlying file handle.
    with open(data_path, "r") as handle, tqdm(handle, desc=f"loading {data_path}") as lines:
        for line in lines:
            example = json.loads(line) if is_json else {"text": line}
            text = example['text']
            if split is not None:
                text = ' '.join(map(str, split(text)))
            tokenized_examples.append(text)
    return tokenized_examples
開發者ID:allenai,項目名稱:vampire,代碼行數:25,代碼來源:preprocess_data.py

示例4: __init__

# 需要導入模塊: from spacy import tokenizer [as 別名]
# 或者: from spacy.tokenizer import Tokenizer [as 別名]
def __init__(self, args):
        """Choose the word-segmentation function according to ``args.lang``.

        For ``args.lang == 'cn'`` the segmenter is ``jieba.cut``; if
        ``args.dict`` is set it is loaded as a jieba user dictionary first.
        For any other language value a spaCy ``Tokenizer`` on the ``English``
        vocab is stored in ``self.tokenizer`` and ``self.cut`` is bound to
        ``self.cut_en``.

        Args:
            args: Object exposing ``lang`` and, for ``'cn'``, ``dict``.

        Raises:
            SystemExit: With status 1 when ``args.dict`` is set but the path
                does not exist.
        """
        if args.lang == 'cn':
            import jieba
            if args.dict:
                if not os.path.exists(args.dict):
                    # sys.exit is always available; the bare ``exit`` builtin
                    # is injected by the ``site`` module and is missing under
                    # ``python -S`` or in frozen applications.
                    import sys
                    print('Segmentor dictionary not found.')
                    sys.exit(1)
                jieba.load_userdict(args.dict)
            self.cut = jieba.cut
        else:  # en
            from spacy.tokenizer import Tokenizer
            from spacy.lang.en import English
            nlp = English()
            self.tokenizer = Tokenizer(nlp.vocab)
            self.cut = self.cut_en
開發者ID:RandyPen,項目名稱:TextCluster,代碼行數:17,代碼來源:segmentor.py

示例5: __init__

# 需要導入模塊: from spacy import tokenizer [as 別名]
# 或者: from spacy.tokenizer import Tokenizer [as 別名]
def __init__(self, nlp):
        """Store ``nlp`` and build a custom spaCy ``Tokenizer`` for it.

        The tokenizer uses ``nlp.vocab`` and the language's default tokenizer
        exceptions, with prefix, infix and suffix matching supplied by this
        object's ``_get_prefix_regex``, ``_get_infix_regex`` and
        ``_get_suffix_regex`` helpers. ``token_match`` is disabled.

        Args:
            nlp: The pipeline to tokenize for; must be a ``Language`` instance.

        Raises:
            ValueError: If ``nlp`` is not a ``Language`` instance.
        """
        if not isinstance(nlp, Language):
            raise ValueError("NLP must be an instance of spacy.lang")

        self.nlp = nlp

        vocab = nlp.vocab
        exceptions = nlp.Defaults.tokenizer_exceptions
        find_prefix = self._get_prefix_regex().search
        find_infixes = self._get_infix_regex().finditer
        find_suffix = self._get_suffix_regex().search

        self.tokenizer = Tokenizer(
            vocab,
            exceptions,
            prefix_search=find_prefix,
            infix_finditer=find_infixes,
            suffix_search=find_suffix,
            token_match=None,
        )
開發者ID:NLPatVCU,項目名稱:medaCy,代碼行數:15,代碼來源:character_tokenizer.py

示例6: init_model

# 需要導入模塊: from spacy import tokenizer [as 別名]
# 或者: from spacy.tokenizer import Tokenizer [as 別名]
def init_model(lang, output_dir, freqs_loc=None,
               vectors_loc=None, no_expand_vectors=False,
               meta_overrides=None, prune_vectors=-1, min_word_frequency=50):
    """
    Create a new model from raw data (word frequencies and word vectors),
    give it a bare ``Tokenizer`` and save it to ``output_dir``.

    Args:
        lang: Language identifier forwarded to ``create_model``.
        output_dir: Directory the model is written to; created if missing.
        freqs_loc: Optional location of a word-frequencies file. When omitted,
            empty probabilities and an OOV probability of -20 are used.
        vectors_loc: Optional location of a word-vectors file.
        no_expand_vectors: Negated and forwarded to ``create_model``.
        meta_overrides: Optional path to a JSON file whose contents are merged
            into ``nlp.meta``; ``nlp.meta["version"]`` is then set to
            ``VERSION``.
        prune_vectors: Forwarded to ``create_model``.
        min_word_frequency: Forwarded to ``read_freqs`` as ``min_freq``.

    Returns:
        The created ``nlp`` object, after it has been written to disk.
    """
    output_dir = ensure_path(output_dir)
    if vectors_loc is not None:
        vectors_loc = cached_path(vectors_loc)
        vectors_loc = ensure_path(vectors_loc)
    if freqs_loc is not None:
        freqs_loc = cached_path(freqs_loc)
        freqs_loc = ensure_path(freqs_loc)

    if freqs_loc is not None and not freqs_loc.exists():
        msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
    probs, oov_prob = read_freqs(freqs_loc, min_freq=min_word_frequency) if freqs_loc is not None else ({}, -20)
    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
    nlp = create_model(lang, probs, oov_prob, vectors_data, vector_keys, not no_expand_vectors, prune_vectors)

    # Replace the base model's tokenizer with a bare Tokenizer on its vocab.
    nlp.tokenizer = Tokenizer(nlp.vocab)

    if meta_overrides is not None:
        # Context manager so the overrides file is closed deterministically.
        with open(meta_overrides) as meta_file:
            metadata = json.load(meta_file)
        nlp.meta.update(metadata)
        nlp.meta["version"] = VERSION

    if not output_dir.exists():
        os.makedirs(output_dir, exist_ok=True)
    nlp.to_disk(output_dir)
    return nlp
開發者ID:ICLRandD,項目名稱:Blackstone,代碼行數:37,代碼來源:init_model.py

示例7: get_tokenizer

# 需要導入模塊: from spacy import tokenizer [as 別名]
# 或者: from spacy.tokenizer import Tokenizer [as 別名]
def get_tokenizer(model: French) -> Tokenizer:
    """Build a spaCy ``Tokenizer`` for ``model`` with extra split characters.

    The prefix and suffix patterns are the model's defaults plus the
    ``split_char`` character class; the infix patterns are one extra pattern
    followed by the model's default infixes. ``token_match`` is disabled.

    Args:
        model: Pipeline whose ``vocab`` and ``Defaults`` are used.

    Returns:
        The configured ``Tokenizer``.
    """
    # Extra character class appended to the default prefix and suffix patterns.
    split_char = r"[ ,\\.()-/\\|:;'\"+=!’?_+#“’']"
    # NOTE(review): the literal below opens a character class with `[` but ends
    # without a closing `]`. It is concatenated with the default infix patterns
    # and compiled together, so the effective pattern depends on what follows.
    # Confirm whether this is intended before changing it — any change alters
    # tokenization.
    extended_infix = [r'[:\\(\\)-\./#"“’\'—'] + model.Defaults.infixes
    infix_re = spacy.util.compile_infix_regex(extended_infix)
    prefix_re = spacy.util.compile_prefix_regex(tuple(list(model.Defaults.prefixes) + [split_char]))
    suffix_re = spacy.util.compile_suffix_regex(tuple(list(model.Defaults.suffixes) + [split_char]))

    tok = Tokenizer(model.vocab,
                    prefix_search=prefix_re.search,
                    suffix_search=suffix_re.search,
                    infix_finditer=infix_re.finditer,
                    token_match=None)
    return tok
開發者ID:ELS-RD,項目名稱:anonymisation,代碼行數:15,代碼來源:model_factory.py

示例8: biomedical_tokenizer

# 需要導入模塊: from spacy import tokenizer [as 別名]
# 或者: from spacy.tokenizer import Tokenizer [as 別名]
def biomedical_tokenizer(nlp):
    """
    Build a spaCy ``Tokenizer`` on ``nlp.vocab`` whose infix splitting is
    driven by the module-level ``INFIX_RE`` pattern, for better handling of
    biomedical text.
    """
    vocab = nlp.vocab
    find_infixes = INFIX_RE.finditer
    return Tokenizer(vocab, infix_finditer=find_infixes)
開發者ID:BaderLab,項目名稱:saber,代碼行數:7,代碼來源:text_utils.py


注:本文中的spacy.tokenizer.Tokenizer方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。