当前位置: 首页>>代码示例>>Python>>正文


Python spacy.blank方法代码示例

本文整理汇总了Python中spacy.blank方法的典型用法代码示例。如果您正苦于以下问题：Python spacy.blank方法的具体用法？Python spacy.blank怎么用？Python spacy.blank使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块spacy的用法示例。


在下文中一共展示了spacy.blank方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: load_nlp

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def load_nlp(vectors_loc, lang=None):
    """Build a spaCy pipeline whose vocab holds the vectors of a text file.

    The file at ``vectors_loc`` is read in binary mode. Its first line is a
    header with two whitespace-separated fields (row count, vector width);
    each following line is a word followed by its space-separated components.

    :param vectors_loc: path of the vectors file.
    :param lang: language code passed to ``spacy.blank``; when ``None`` a
        bare ``Language()`` is created instead.
    :return: the pipeline, with every vector added to ``nlp.vocab``.
    """
    # A "lang" setting is required if the model is going to be saved to disk
    # and loaded back later; 'xx' gives the blank multi-language class.
    nlp = Language() if lang is None else spacy.blank(lang)
    with open(vectors_loc, 'rb') as vectors_file:
        _nr_row, nr_dim = vectors_file.readline().split()
        width = int(nr_dim)
        nlp.vocab.reset_vectors(width=width)
        for raw_line in vectors_file:
            # Split from the right, at most `width` times, so everything left
            # of the components is kept together as the word.
            fields = raw_line.rstrip().decode('utf8').rsplit(' ', width)
            token = fields[0]
            values = numpy.asarray([float(item) for item in fields[1:]], dtype='f')
            nlp.vocab.set_vector(token, values)  # add the vectors to the vocab
    return nlp
开发者ID:sonvx,项目名称:word2vecVN,代码行数:21,代码来源:spacy-fastext.py

示例2: main

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def main(args):
    """Tokenize every object of a JSON-lines file and print the results.

    Reads ``args.path`` (decoded with ``args.encoding``), parsing one JSON
    object per line, tokenizes the objects in a process pool using a blank
    ``'id'`` spaCy pipeline, optionally drops the documents for which
    ``has_long_summary`` is true, and prints every remaining document as one
    JSON line with sorted keys.

    :param args: namespace providing ``path``, ``encoding``, ``max_workers``,
        ``chunk_size`` and ``discard_long_summary``.
    :raises RuntimeError: if a line cannot be parsed; the message carries the
        1-based line number and the original error is chained as the cause.
    """
    objs = []
    with open(args.path, encoding=args.encoding) as f:
        for linum, line in enumerate(f, start=1):
            try:
                objs.append(json.loads(line.strip()))
            except Exception as e:
                # Chain explicitly so the traceback reports the parse error
                # as the direct cause instead of an incidental context.
                raise RuntimeError(f'line {linum}: {e}') from e

    nlp = spacy.blank('id')
    with ProcessPoolExecutor(max_workers=args.max_workers) as exc:
        tok_objs = exc.map(partial(tokenize_obj, nlp), objs, chunksize=args.chunk_size)
        # Materialize inside the `with`: the lazy map result needs the pool.
        docs = [Document.from_mapping(obj) for obj in tok_objs]

    # The workers are no longer needed for filtering and printing.
    if args.discard_long_summary:
        docs = [doc for doc in docs if not has_long_summary(doc)]
    print('\n'.join(json.dumps(doc.to_dict(), sort_keys=True) for doc in docs))
开发者ID:kata-ai,项目名称:indosum,代码行数:19,代码来源:tokenize_jsonl.py

示例3: test_spacy_featurizer_using_empty_model

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def test_spacy_featurizer_using_empty_model():
    """Featurizing a doc from a blank spaCy model must leave the dense text
    features of the message unset."""
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
    import spacy

    text = "This test is using an empty spaCy model"

    # Blank English pipeline: created here, nothing is loaded from a package.
    blank_nlp = spacy.blank("en")
    parsed = blank_nlp(text)

    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    msg = Message(text)
    msg.set(SPACY_DOCS[TEXT], parsed)
    featurizer._set_spacy_features(msg)

    assert msg.get(DENSE_FEATURE_NAMES[TEXT]) is None
开发者ID:botfront,项目名称:rasa-for-botfront,代码行数:21,代码来源:test_spacy_featurizer.py

示例4: main

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def main(data_folder: str, output_folder: str, model_folder: str) -> None:
    """Tag every ``.txt`` file of a folder and write one HTML page per file.

    :param data_folder: folder whose ``.txt`` entries are processed.
    :param output_folder: folder receiving the ``<case name>.html`` pages.
    :param model_folder: folder containing the ``best-model.pt`` tagger.
    """
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [entry for entry in os.listdir(data_folder) if entry.endswith(".txt")]
    model_file = os.path.join(model_folder, 'best-model.pt')
    tagger: SequenceTagger = SequenceTagger.load(model_file)

    for filename in tqdm(iterable=filenames, unit=" txt", desc="anonymize cases"):
        with open(os.path.join(data_folder, filename), 'r') as source:
            lines = source.readlines()

        sentences = tagger.predict(sentences=lines,
                                   mini_batch_size=32,
                                   verbose=False,
                                   use_tokenizer=tokenizer)
        # The case name is whatever precedes the first dot of the file name.
        case_name = filename.partition('.')[0]
        page_html = render_ner_html(sentences, colors=colors, title=case_name)

        target_path = os.path.join(output_folder, case_name + ".html")
        with open(target_path, "w") as target:
            target.write(page_html)
开发者ID:ELS-RD,项目名称:anonymisation,代码行数:20,代码来源:flair_generate_html_from_txt.py

示例5: get_empty_model

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def get_empty_model(load_labels_for_training: bool) -> French:
    """
    Assemble a blank French pipeline with a custom tokenizer, the
    ``prevent-sbd`` component first and a freshly created NER component last.

    :param load_labels_for_training: when true, every key of ``colors`` is
        registered as a label of the NER component.
    :return: the assembled pipeline.
    """
    # The language matters: it impacts the tokenizer, the sentence split, ...
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    nlp.add_pipe(prevent_sentence_boundary_detection, name='prevent-sbd', first=True)

    ner = nlp.create_pipe('ner')
    # Labels are only registered when the model is going to be trained.
    labels = list(colors.keys()) if load_labels_for_training else []
    for label in labels:
        ner.add_label(label)
    nlp.add_pipe(ner, last=True)

    return nlp
开发者ID:ELS-RD,项目名称:anonymisation,代码行数:22,代码来源:model_factory.py

示例6: spacy_model_with_data

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def spacy_model_with_data():
    """Return a ``ModelWithData`` pairing a trained text classifier with a
    ``DataFrame`` built from the test inputs.

    The classifier is a ``textcat`` pipe on a blank English pipeline, trained
    to tell computer graphics from baseball (20newsgroups categories).
    """
    categories = ['comp.graphics', 'rec.sport.baseball']

    # Blank model with a single text-categorizer pipe
    nlp = spacy.blank("en")
    textcat_config = {"exclusive_classes": True, "architecture": "simple_cnn"}
    textcat = nlp.create_pipe("textcat", config=textcat_config)
    nlp.add_pipe(textcat, last=True)
    for label in categories:
        textcat.add_label(label)

    # Fetch the train/test split and train on (text, {"cats": ...}) pairs
    train_x, train_y, test_x, _ = _get_train_test_dataset(categories)
    train_data = [(text, {"cats": cats}) for text, cats in zip(train_x, train_y)]
    _train_model(nlp, train_data)
    return ModelWithData(nlp, pd.DataFrame(test_x))
开发者ID:mlflow,项目名称:mlflow,代码行数:20,代码来源:test_spacy_model_export.py

示例7: test_model_log_without_pyfunc_flavor

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def test_model_log_without_pyfunc_flavor():
    """A logged pipeline that contains an NER pipe must expose only the
    ``spacy`` flavor once it is downloaded and loaded again."""
    artifact_path = "model"
    nlp = spacy.blank("en")

    # NER is a component not compatible with pyfunc
    nlp.add_pipe(nlp.create_pipe("ner"), last=True)

    with mlflow.start_run():
        mlflow.spacy.log_model(spacy_model=nlp, artifact_path=artifact_path)
        run_id = mlflow.active_run().info.run_id
        model_uri = "runs:/{run_id}/{artifact_path}".format(
            run_id=run_id, artifact_path=artifact_path)
        model_path = _download_artifact_from_uri(model_uri)

        # Only the spacy flavor may be present after the round trip
        flavors = Model.load(model_path).flavors
        assert flavors.keys() == {"spacy"}
开发者ID:mlflow,项目名称:mlflow,代码行数:18,代码来源:test_spacy_model_export.py

示例8: __init__

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def __init__(self, lang: str):
        """Store a blank spaCy pipeline for ``lang`` in ``self.tok``.

        :param lang: language code handed to ``spacy.blank``.
        """
        disabled_pipes = ["parser", "tagger", "ner"]
        self.tok = spacy.blank(lang, disable=disabled_pipes)
开发者ID:jrzaurin,项目名称:pytorch-widedeep,代码行数:4,代码来源:fastai_transforms.py

示例9: main

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def main(data_folder: str, model_folder: str, dev_size: float, nb_epochs: int,
         nb_segment: Optional[int], segment: Optional[int]) -> None:
    """Train a flair NER sequence tagger and store it under ``model_folder``.

    :param data_folder: forwarded to ``prepare_flair_train_test_corpus``.
    :param model_folder: output location given to the trainer.
    :param dev_size: forwarded to ``prepare_flair_train_test_corpus``.
    :param nb_epochs: maximum number of training epochs.
    :param nb_segment: forwarded to ``prepare_flair_train_test_corpus``.
    :param segment: forwarded to ``prepare_flair_train_test_corpus``.
    """
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)

    corpus: Corpus = prepare_flair_train_test_corpus(
        spacy_model=nlp,
        data_folder=data_folder,
        dev_size=dev_size,
        nb_segment=nb_segment,
        segment=segment,
    )
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
    print(tag_dictionary.idx2item)

    # Word embeddings stacked with forward and backward flair embeddings
    stacked: StackedEmbeddings = StackedEmbeddings(embeddings=[
        WordEmbeddings('fr'),
        FlairEmbeddings('fr-forward'),
        FlairEmbeddings('fr-backward'),
    ])

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=stacked,
        use_crf=True,
        tag_dictionary=tag_dictionary,
        tag_type='ner',
    )

    trainer: ModelTrainer = ModelTrainer(model=tagger, corpus=corpus, use_tensorboard=True)
    train_options = {
        "max_epochs": nb_epochs,
        "mini_batch_size": 32,
        "embeddings_storage_mode": "cpu",
        "checkpoint": False,
    }
    trainer.train(model_folder, **train_options)
开发者ID:ELS-RD,项目名称:anonymisation,代码行数:34,代码来源:flair_train.py

示例10: main

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def main(data_folder: str, model_folder: str, top_n: int) -> None:
    """Tag the first paragraphs of each XML file and write ``sentence.html``.

    :param data_folder: folder whose ``.xml`` entries are parsed.
    :param model_folder: folder containing the ``best-model.pt`` tagger.
    :param top_n: number of leading paragraphs taken from each file.
    :raises Exception: if no sentence could be built from the folder.
    """
    print(f"keep only top {top_n} examples per file")
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    xml_names = [entry for entry in os.listdir(data_folder) if entry.endswith(".xml")]
    sentences: List[Sentence] = []
    with tqdm(total=len(xml_names), unit=" XML", desc="Parsing XML") as progress_bar:
        for xml_name in xml_names:
            paragraphs: List[Paragraph] = get_paragraph_from_file(
                path=os.path.join(data_folder, xml_name),
                keep_paragraph_without_annotation=True)
            # Only files holding more than top_n paragraphs contribute;
            # empty paragraphs are skipped.
            if len(paragraphs) > top_n:
                sentences.extend(
                    Sentence(text=paragraph.text, tokenizer=tokenizer)
                    for paragraph in paragraphs[:top_n]
                    if len(paragraph.text) > 0
                )
            progress_bar.update()
    if not sentences:
        raise Exception("No example loaded, causes: no cases in provided path or sample size is to high")

    model_file = os.path.join(model_folder, 'best-model.pt')
    tagger: SequenceTagger = SequenceTagger.load(model_file)
    # The return value is unused; the sentences are rendered below.
    tagger.predict(sentences=sentences,
                   mini_batch_size=32,
                   verbose=True)

    print("prepare html")
    page_html = render_ner_html(sentences, colors=colors)
    print("write html")
    with open("sentence.html", "w") as html_file:
        html_file.write(page_html)
开发者ID:ELS-RD,项目名称:anonymisation,代码行数:32,代码来源:flair_generate_html_from_xml.py

示例11: interactive

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def interactive(opt):
    """Run a console chat loop against the model described by ``opt``.

    Each line typed by the user is tokenized, mapped to vocabulary indices,
    translated, and the first returned hypothesis is shown together with its
    score as the next prompt. Typing ``exit`` ends the loop.

    :param opt: options object; ``opt.prepro_file`` is read here and the
        whole object is handed to ``Interactive``.
    """
    def prepare_seq(seq, max_seq_len, word2idx, device):
        """Turn raw text into (token ids, positions) tensors on ``device``."""
        doc = nlp(seq)
        words = [token.text for token in doc[:max_seq_len]]
        ids = [Constants.BOS]
        ids.extend(word2idx.get(word.lower(), Constants.UNK) for word in words)
        ids.append(Constants.EOS)
        # Pad up to max_seq_len; nothing is added once the ids reach it.
        padding = [Constants.PAD] * (max_seq_len - len(ids))
        id_array = np.array(ids + padding)
        # Positions are 1-based, with 0 wherever the id equals PAD.
        pos_array = np.array([
            0 if token_id == Constants.PAD else index + 1
            for index, token_id in enumerate(id_array)
        ])

        seq_tensor = torch.LongTensor(id_array).unsqueeze(0)
        pos_tensor = torch.LongTensor(pos_array).unsqueeze(0)
        return seq_tensor.to(device), pos_tensor.to(device)

    #- Load preprocessing file for vocabulary
    prepro = torch.load(opt.prepro_file)
    src_word2idx = prepro['dict']['src']
    tgt_idx2word = {}
    for word, idx in prepro['dict']['tgt'].items():
        tgt_idx2word[idx] = word
    del prepro # to save memory

    #- Prepare interactive shell
    nlp = spacy.blank('en')
    s2s = Interactive(opt)
    max_seq_len = s2s.model_opt.max_subseq_len
    print('[Info] Model opts: {}'.format(s2s.model_opt))

    #- Interact with console
    prompt = '[Seq2Seq](score:--.--) human , what do you have to say ( type \' exit \' to quit ) ?\n[Human] '
    while True:
        user_text = input(prompt) # get user input
        if user_text == 'exit':
            break
        seq, pos = prepare_seq(user_text, max_seq_len, src_word2idx, s2s.device)
        hypotheses, scores = s2s.translate_batch(seq, pos)
        best_hypothesis = hypotheses[0][0]
        best_score = scores[0][0]
        header = '[Seq2Seq](score:{score:2.2f}) '.format(score=best_score.item())
        reply = ' '.join([tgt_idx2word.get(word, Constants.UNK_WORD) for word in best_hypothesis])
        # The model's answer becomes the prompt of the next turn.
        prompt = header + reply + '\n[Human] '

    print('[Seq2Seq](score:--.--) thanks for talking with me !')
开发者ID:vliu15,项目名称:dialogue-seq2seq,代码行数:43,代码来源:interactive.py

示例12: tokenizer

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
_BLANK_EN_NLP = None


def _get_blank_en_nlp():
    """Return the shared blank English pipeline, building it on first use."""
    global _BLANK_EN_NLP
    if _BLANK_EN_NLP is None:
        nlp = spacy.blank('en')
        nlp.add_pipe(nlp.create_pipe('sentencizer'))  # Very basic NLP pipeline in spaCy
        _BLANK_EN_NLP = nlp
    return _BLANK_EN_NLP


def tokenizer(text: str) -> str:
    """Tokenize input string using a spaCy pipeline.

    The pipeline is created once and reused, instead of being rebuilt on
    every call.

    :param text: raw input string.
    :return: the token texts joined by single spaces.
    """
    doc = _get_blank_en_nlp()(text)
    return ' '.join(token.text for token in doc)
开发者ID:prrao87,项目名称:fine-grained-sentiment,代码行数:9,代码来源:explainer.py

示例13: check_spacy_models

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def check_spacy_models(main, lang, pipeline):
    """Ensure ``main`` holds a spaCy pipeline for ``lang`` suited to ``pipeline``.

    Pipelines are stored in ``main.__dict__`` under ``spacy_nlp_<lang>``.

    :param main: object whose ``__dict__`` holds the pipelines.
    :param lang: language code used in the storage key; codes listed in
        ``spacy_langs`` load the named model package, any other code gets a
        blank pipeline.
    :param pipeline: one of ``'word_tokenization'``,
        ``'sentence_tokenization'``, ``'tokenization'``, ``'pos_tagging'`` or
        ``'lemmatization'``.
    :raises ValueError: if ``pipeline`` is none of the names above. An
        unknown name used to leave the local settings unbound and fail later
        with ``UnboundLocalError``.
    """
    spacy_langs = {
        'nld': 'nl_core_news_sm',
        'eng': 'en_core_web_sm',
        'fra': 'fr_core_news_sm',
        'deu': 'de_core_news_sm',
        'ell': 'el_core_news_sm',
        'ita': 'it_core_news_sm',
        'lit': 'lt_core_news_sm',
        'nob': 'nb_core_news_sm',
        'por': 'pt_core_news_sm',
        'spa': 'es_core_news_sm',
        'other': 'en_core_web_sm'
    }

    # Remove unused pipelines to boost speed
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']
    else:
        # Fail early, before anything is loaded or stored.
        raise ValueError(f'Unsupported spaCy pipeline: {pipeline!r}')

    # Languages with models
    if lang in spacy_langs:
        # Drop a stored pipeline whose pipes differ from the requested ones
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            model = importlib.import_module(spacy_langs[lang])

            main.__dict__[f'spacy_nlp_{lang}'] = model.load(disable = nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin)
        if lang in ['srp_cyrl', 'srp_latn']:
            main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('sr')
            main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('sr')
        else:
            main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(wl_conversion.to_iso_639_1(main, lang))

    # Add the sentencizer when it is requested and not in the pipeline yet
    if 'sentencizer' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
开发者ID:BLKSerene,项目名称:Wordless,代码行数:52,代码来源:wl_text_utils.py

示例14: main

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def main(data_folder: str, model_folder: str, dev_size: float, entities_to_remove: List[str],
         nb_segment: Optional[int], segment: Optional[int]) -> None:
    """Evaluate a trained tagger and print the sentences it tags differently.

    The detailed evaluation results on the test split are printed first.
    Then every train and test sentence is re-tagged from a copy, and each
    sentence for which the prediction contains an entity absent from the
    annotation is reported.

    :param data_folder: forwarded to ``prepare_flair_train_test_corpus``.
    :param model_folder: folder containing the ``best-model.pt`` tagger.
    :param dev_size: forwarded to ``prepare_flair_train_test_corpus``.
    :param entities_to_remove: tags left out of the comparison.
    :param nb_segment: forwarded to ``prepare_flair_train_test_corpus``.
    :param segment: forwarded to ``prepare_flair_train_test_corpus``.
    """
    def entity_texts(sentence):
        """Collect "<text> <tag>" for each NER span whose tag is kept."""
        return {f"{span.text} {span.tag}"
                for span in sentence.get_spans('ner')
                if span.tag not in entities_to_remove}

    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)

    corpus: Corpus = prepare_flair_train_test_corpus(
        spacy_model=nlp, data_folder=data_folder, dev_size=dev_size,
        nb_segment=nb_segment, segment=segment)
    model_file = os.path.join(model_folder, 'best-model.pt')
    tagger: SequenceTagger = SequenceTagger.load(model=model_file)
    test_results, _ = tagger.evaluate(data_loader=DataLoader(corpus.test, batch_size=32))
    print(test_results.detailed_results)

    sentences_original = corpus.train.sentences + corpus.test.sentences
    sentences_predict = copy.deepcopy(sentences_original)
    # clean tokens in case there is a bug
    for sentence in sentences_predict:
        for token in sentence:
            token.tags = {}

    tagger.predict(sentences=sentences_predict,
                   mini_batch_size=32,
                   embedding_storage_mode="none",
                   verbose=True)

    pairs = zip(sentences_original, sentences_predict)
    for index, (sentence_original, sentence_predict) in enumerate(pairs):
        expected_entities_text = entity_texts(sentence_original)
        predicted_entities_text = entity_texts(sentence_predict)

        diff_expected = expected_entities_text - predicted_entities_text
        diff_predicted = predicted_entities_text - expected_entities_text

        # Only entities the tagger added trigger a report.
        if not diff_predicted:
            continue
        print("------------")
        print(f"source {index}: [{sentence_original.to_plain_string()}]")
        print(f"expected missing: [{diff_expected}]")
        print(f"predicted missing: [{diff_predicted}]")
        print(f"common: [{set(predicted_entities_text).intersection(set(expected_entities_text))}]")
开发者ID:ELS-RD,项目名称:anonymisation,代码行数:44,代码来源:flair_display_errors.py

示例15: pipeline

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def pipeline():
    """Create a blank model, train a tagger on it, then save and reload it.

    The language and the training-file encoding come from ``get_args()``.
    A ``tagger`` pipe is added to a blank pipeline of that language, its
    labels are taken from ``TAG_MAP``, and it is trained for ``n_iter``
    iterations on the data read from
    ``data/train/<language>/train.<encoding>.conll`` next to this file. The
    trained model is written to the ``outputs`` folder next to this file and
    loaded back from there as a check.
    """
    args = get_args()
    print(args)
    curr_dir = path.dirname(path.abspath(__file__))
    lang = args.language
    print(lang)

    # Create the output folder up front so a permission problem shows up
    # before training starts; exist_ok avoids a separate existence check.
    output_dir = Path(path.join(curr_dir, "outputs"))
    output_dir.mkdir(parents=True, exist_ok=True)

    data_path = "%s/data/train/%s/train.%s.conll" % (curr_dir, args.language, args.encoding)

    # The reader is given the path; no file handle is opened here.
    TRAIN_DATA = spacy_data_reader.spacy_load_data(data_path)

    nlp = spacy.blank(lang)
    # add the tagger to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    tagger = nlp.create_pipe('tagger')
    # Add the tags. This needs to be done before you start training.
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for _ in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
        print('Losses', losses)

    # test the trained model
    test_text = "నా నా కధ అందరి అందరి ఆడపిల్లల కధే ."
    doc = nlp(test_text)
    print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])

    # save model to output directory
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # test the saved model
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    doc = nlp2(test_text)
    print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])
开发者ID:avineshpvs,项目名称:indic_tagger,代码行数:60,代码来源:spacy_tagger.py


注:本文中的spacy.blank方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。