本文整理汇总了 Python 中 spacy.blank 方法的典型用法代码示例。如果您正苦于以下问题：Python spacy.blank 方法的具体用法？Python spacy.blank 怎么用？Python spacy.blank 使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 spacy 的用法示例。
在下文中一共展示了spacy.blank方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: load_nlp
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def load_nlp(vectors_loc, lang=None):
    """Build a pipeline whose vocab holds the vectors read from ``vectors_loc``.

    The file is opened in binary mode. Its first line is split into exactly
    two whitespace-separated fields, the second of which is used as the
    vector width. Every following line is decoded as UTF-8 and split from the
    right into a word followed by up to that many space-separated floats.

    :param vectors_loc: path of the vectors file to read.
    :param lang: language code handed to ``spacy.blank``; when ``None`` a bare
        ``Language()`` is created instead.
    :return: the pipeline object with the vectors added to its vocab.
    """
    # A blank language class is required if the model is going to be saved to
    # disk and loaded back later (models always need a "lang" setting).
    # Use 'xx' for the blank multi-language class.
    nlp = Language() if lang is None else spacy.blank(lang)

    with open(vectors_loc, 'rb') as vectors_file:
        _row_field, dim_field = vectors_file.readline().split()
        width = int(dim_field)
        nlp.vocab.reset_vectors(width=width)
        for raw_line in vectors_file:
            text = raw_line.rstrip().decode('utf8')
            # Split from the right so that a word containing spaces stays whole.
            word, *components = text.rsplit(' ', width)
            values = numpy.asarray([float(c) for c in components], dtype='f')
            nlp.vocab.set_vector(word, values)  # add the vectors to the vocab
    return nlp
示例2: main
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def main(args):
    """Tokenize a JSON-lines file and print the resulting documents as JSON.

    Each line of ``args.path`` (decoded with ``args.encoding``) is parsed as
    one JSON object. The objects are passed through ``tokenize_obj`` in a
    process pool, converted with ``Document.from_mapping`` and printed to
    standard output, one JSON document per line with sorted keys.

    :param args: namespace providing ``path``, ``encoding``, ``max_workers``,
        ``chunk_size`` and ``discard_long_summary``.
    :raises RuntimeError: when a line cannot be loaded; the message starts
        with the 1-based number of the offending line.
    """
    objs = []
    with open(args.path, encoding=args.encoding) as source:
        linum = -1
        for raw in source:
            linum += 1
            try:
                objs.append(json.loads(raw.strip()))
            except Exception as e:
                raise RuntimeError(f'line {linum+1}: {e}')

    nlp = spacy.blank('id')
    tokenize = partial(tokenize_obj, nlp)
    with ProcessPoolExecutor(max_workers=args.max_workers) as pool:
        tokenized = pool.map(tokenize, objs, chunksize=args.chunk_size)
        # Consume the lazy map while the pool is still alive.
        docs = [Document.from_mapping(item) for item in tokenized]

    if args.discard_long_summary:
        docs = [doc for doc in docs if not has_long_summary(doc)]

    serialized = (json.dumps(doc.to_dict(), sort_keys=True) for doc in docs)
    print('\n'.join(serialized))
示例3: test_spacy_featurizer_using_empty_model
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def test_spacy_featurizer_using_empty_model():
    """Check that a blank spaCy model leaves the message without dense text features."""
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
    import spacy

    sentence = "This test is using an empty spaCy model"
    blank_doc = spacy.blank("en")(sentence)
    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    # Attach the doc produced by the blank pipeline before featurizing.
    message = Message(sentence)
    message.set(SPACY_DOCS[TEXT], blank_doc)
    featurizer._set_spacy_features(message)

    assert message.get(DENSE_FEATURE_NAMES[TEXT]) is None
示例4: main
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def main(data_folder: str, output_folder: str, model_folder: str) -> None:
    """Tag every ``.txt`` file of a folder and write one HTML rendering per file.

    :param data_folder: directory whose ``.txt`` entries are processed.
    :param output_folder: directory receiving one ``<case name>.html`` per input.
    :param model_folder: directory containing the ``best-model.pt`` tagger.
    """
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [filename for filename in os.listdir(data_folder) if filename.endswith(".txt")]
    tagger: SequenceTagger = SequenceTagger.load(os.path.join(model_folder, 'best-model.pt'))
    for filename in tqdm(iterable=filenames, unit=" txt", desc="anonymize cases"):
        # Read the whole case first so the input handle is released before tagging.
        with open(os.path.join(data_folder, filename), 'r') as input_f:
            lines = input_f.readlines()
        sentences = tagger.predict(sentences=lines,
                                   mini_batch_size=32,
                                   verbose=False,
                                   use_tokenizer=tokenizer)
        # Strip only the final extension: splitting on the first dot would map
        # "case.1.txt" and "case.2.txt" to the same "case.html" output.
        case_name = os.path.splitext(filename)[0]
        page_html = render_ner_html(sentences, colors=colors, title=case_name)
        with open(os.path.join(output_folder, case_name + ".html"), "w") as output:
            output.write(page_html)
示例5: get_empty_model
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def get_empty_model(load_labels_for_training: bool) -> French:
    """
    Build a blank French pipeline carrying a freshly created NER component.

    :param load_labels_for_training: when true, every key of ``colors`` is registered as an NER label
    :return: the pipeline, with 'prevent-sbd' added first and the NER component added last
    """
    # The language choice matters: it impacts the tokenizer, sentence splitting, ...
    model = spacy.blank('fr')
    model.tokenizer = get_tokenizer(model)
    model.add_pipe(prevent_sentence_boundary_detection, name='prevent-sbd', first=True)

    entity_recognizer = model.create_pipe('ner')
    if load_labels_for_training:
        # Labels are registered before the component joins the pipeline.
        for label in colors.keys():
            entity_recognizer.add_label(label)
    model.add_pipe(entity_recognizer, last=True)
    return model
示例6: spacy_model_with_data
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def spacy_model_with_data():
    """Train a two-category text classifier and pair it with held-out texts.

    :return: a ``ModelWithData`` holding the trained pipeline and a DataFrame
        built from the test texts.
    """
    # The model learns to tell computer graphics from baseball posts
    # of the 20newsgroups dataset.
    categories = ['comp.graphics', 'rec.sport.baseball']

    # Blank English pipeline with a single text-categorizer component.
    nlp = spacy.blank("en")
    textcat_config = {"exclusive_classes": True, "architecture": "simple_cnn"}
    textcat = nlp.create_pipe("textcat", config=textcat_config)
    nlp.add_pipe(textcat, last=True)
    for label in categories:
        textcat.add_label(label)

    # Split train/test, wrap the training targets the way the trainer expects.
    train_x, train_y, test_x, _ = _get_train_test_dataset(categories)
    annotations = [{"cats": cats} for cats in train_y]
    _train_model(nlp, list(zip(train_x, annotations)))

    return ModelWithData(nlp, pd.DataFrame(test_x))
示例7: test_model_log_without_pyfunc_flavor
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def test_model_log_without_pyfunc_flavor():
    """Check that a logged pipeline with an NER component exposes only the "spacy" flavor."""
    artifact_path = "model"
    nlp = spacy.blank("en")
    # NER is a component that is not compatible with pyfunc.
    nlp.add_pipe(nlp.create_pipe("ner"), last=True)

    with mlflow.start_run():
        mlflow.spacy.log_model(spacy_model=nlp, artifact_path=artifact_path)
        # The run id must be read while the run is still active.
        run_id = mlflow.active_run().info.run_id
        model_uri = "runs:/{run_id}/{artifact_path}".format(
            run_id=run_id, artifact_path=artifact_path)
        model_path = _download_artifact_from_uri(model_uri)

    flavors = Model.load(model_path).flavors
    assert flavors.keys() == {"spacy"}
示例8: __init__
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def __init__(self, lang: str):
    """Store a blank spaCy pipeline for ``lang`` on ``self.tok``.

    :param lang: language code forwarded to ``spacy.blank``.
    """
    disabled_components = ["parser", "tagger", "ner"]
    self.tok = spacy.blank(lang, disable=disabled_components)
示例9: main
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def main(data_folder: str, model_folder: str, dev_size: float, nb_epochs: int,
         nb_segment: Optional[int], segment: Optional[int]) -> None:
    """Train a Flair NER sequence tagger on the corpus built from ``data_folder``.

    :param data_folder: forwarded to ``prepare_flair_train_test_corpus``.
    :param model_folder: first argument of ``ModelTrainer.train``.
    :param dev_size: forwarded to ``prepare_flair_train_test_corpus``.
    :param nb_epochs: maximum number of training epochs.
    :param nb_segment: forwarded to ``prepare_flair_train_test_corpus``.
    :param segment: forwarded to ``prepare_flair_train_test_corpus``.
    """
    french = spacy.blank('fr')
    french.tokenizer = get_tokenizer(french)

    corpus: Corpus = prepare_flair_train_test_corpus(
        spacy_model=french,
        data_folder=data_folder,
        dev_size=dev_size,
        nb_segment=nb_segment,
        segment=segment,
    )
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
    print(tag_dictionary.idx2item)

    # Word embeddings stacked with forward and backward Flair embeddings.
    stacked: StackedEmbeddings = StackedEmbeddings(embeddings=[
        WordEmbeddings('fr'),
        FlairEmbeddings('fr-forward'),
        FlairEmbeddings('fr-backward'),
    ])
    sequence_tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=stacked,
        use_crf=True,
        tag_dictionary=tag_dictionary,
        tag_type='ner',
    )

    model_trainer: ModelTrainer = ModelTrainer(model=sequence_tagger, corpus=corpus, use_tensorboard=True)
    model_trainer.train(
        model_folder,
        max_epochs=nb_epochs,
        mini_batch_size=32,
        embeddings_storage_mode="cpu",
        checkpoint=False,
    )
示例10: main
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def main(data_folder: str, model_folder: str, top_n: int) -> None:
    """Tag the first paragraphs of each XML case and render them to ``sentence.html``.

    :param data_folder: directory whose ``.xml`` entries are parsed.
    :param model_folder: directory containing the ``best-model.pt`` tagger.
    :param top_n: number of leading paragraphs kept per file.
    :raises Exception: when no sentence at all could be collected.
    """
    print(f"keep only top {top_n} examples per file")
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)

    xml_names = [name for name in os.listdir(data_folder) if name.endswith(".xml")]
    sentences: List[Sentence] = []
    with tqdm(total=len(xml_names), unit=" XML", desc="Parsing XML") as progress_bar:
        for xml_name in xml_names:
            paragraphs: List[Paragraph] = get_paragraph_from_file(
                path=os.path.join(data_folder, xml_name),
                keep_paragraph_without_annotation=True)
            # Only files holding more than top_n paragraphs contribute;
            # empty paragraphs are skipped.
            if len(paragraphs) > top_n:
                sentences.extend(
                    Sentence(text=paragraph.text, tokenizer=tokenizer)
                    for paragraph in paragraphs[:top_n]
                    if len(paragraph.text) > 0
                )
            progress_bar.update()

    if not sentences:
        raise Exception("No example loaded, causes: no cases in provided path or sample size is to high")

    tagger: SequenceTagger = SequenceTagger.load(os.path.join(model_folder, 'best-model.pt'))
    # The return value is ignored; `sentences` itself is rendered afterwards.
    tagger.predict(sentences=sentences,
                   mini_batch_size=32,
                   verbose=True)

    print("prepare html")
    page_html = render_ner_html(sentences, colors=colors)
    print("write html")
    with open("sentence.html", "w") as html_file:
        html_file.write(page_html)
示例11: interactive
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def interactive(opt):
    """Chat with a seq2seq model through the console until 'exit' is typed.

    :param opt: options object; ``opt.prepro_file`` is loaded with
        ``torch.load`` and ``opt`` itself is handed to ``Interactive``.
    """
    def prepare_seq(seq, max_seq_len, word2idx, device):
        """Turn raw text into index and position tensors with a leading axis of size 1."""
        words = [token.text for token in nlp(seq)[:max_seq_len]]
        indices = [Constants.BOS]
        indices.extend(word2idx.get(word.lower(), Constants.UNK) for word in words)
        indices.append(Constants.EOS)
        # Right-pad up to max_seq_len; a non-positive count adds nothing.
        indices.extend([Constants.PAD] * (max_seq_len - len(indices)))
        seq_array = np.array(indices)
        # Positions are 1-based; padding slots get position 0.
        pos_array = np.array([0 if idx == Constants.PAD else rank
                              for rank, idx in enumerate(seq_array, start=1)])
        seq_tensor = torch.LongTensor(seq_array).unsqueeze(0)
        pos_tensor = torch.LongTensor(pos_array).unsqueeze(0)
        return seq_tensor.to(device), pos_tensor.to(device)

    # Load the preprocessing file for its vocabularies only.
    prepro = torch.load(opt.prepro_file)
    src_word2idx = prepro['dict']['src']
    tgt_idx2word = {idx: word for word, idx in prepro['dict']['tgt'].items()}
    del prepro  # to save memory

    # Prepare the interactive shell.
    nlp = spacy.blank('en')
    s2s = Interactive(opt)
    max_seq_len = s2s.model_opt.max_subseq_len
    print('[Info] Model opts: {}'.format(s2s.model_opt))

    prompt = '[Seq2Seq](score:--.--) human , what do you have to say ( type \' exit \' to quit ) ?\n[Human] '
    # iter(callable, sentinel) keeps prompting until the user types 'exit';
    # the lambda reads `prompt` at call time, so each turn shows the latest reply.
    for console_input in iter(lambda: input(prompt), 'exit'):
        seq, pos = prepare_seq(console_input, max_seq_len, src_word2idx, s2s.device)
        hypotheses, scores = s2s.translate_batch(seq, pos)
        best_hypothesis = hypotheses[0][0]
        best_score = scores[0][0]
        header = '[Seq2Seq](score:{score:2.2f}) '.format(score=best_score.item())
        reply = ' '.join([tgt_idx2word.get(word, Constants.UNK_WORD) for word in best_hypothesis])
        prompt = header + reply + '\n[Human] '

    print('[Seq2Seq](score:--.--) thanks for talking with me !')
示例12: tokenizer
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def tokenizer(text: str) -> str:
    "Return the input string with its spaCy tokens joined by single spaces"
    pipeline = spacy.blank('en')
    sentencizer = pipeline.create_pipe('sentencizer')
    pipeline.add_pipe(sentencizer)  # Very basic NLP pipeline in spaCy
    tokens = [token.text for token in pipeline(text)]
    return ' '.join(tokens)
示例13: check_spacy_models
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def check_spacy_models(main, lang, pipeline):
    """Make sure ``main`` caches a spaCy pipeline for ``lang`` fit for ``pipeline``.

    The pipeline object is stored in ``main.__dict__`` under the key
    ``spacy_nlp_<lang>``. For languages listed in the model table, a cached
    object whose ``pipe_names`` differ from the requested components is
    dropped and the packaged model is loaded again; any other language gets a
    blank pipeline. A sentencizer is appended when the task needs one and the
    pipeline does not have it yet.

    :param main: object whose ``__dict__`` is used as the cache.
    :param lang: language code used in the cache key and to select the model.
    :param pipeline: one of 'word_tokenization', 'sentence_tokenization',
        'tokenization', 'pos_tagging' or 'lemmatization'.
    :raises ValueError: if ``pipeline`` is none of the values above
        (previously this surfaced later as an ``UnboundLocalError``).
    """
    spacy_langs = {
        'nld': 'nl_core_news_sm',
        'eng': 'en_core_web_sm',
        'fra': 'fr_core_news_sm',
        'deu': 'de_core_news_sm',
        'ell': 'el_core_news_sm',
        'ita': 'it_core_news_sm',
        'lit': 'lt_core_news_sm',
        'nob': 'nb_core_news_sm',
        'por': 'pt_core_news_sm',
        'spa': 'es_core_news_sm',
        'other': 'en_core_web_sm'
    }

    # Remove unused pipelines to boost speed
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']
    else:
        # Fail early and explicitly instead of hitting unbound locals below.
        raise ValueError(f'Unsupported pipeline: {pipeline!r}')

    cache = main.__dict__
    key = f'spacy_nlp_{lang}'

    # Languages with models
    if lang in spacy_langs:
        # Drop a cached model whose components do not match the request
        if key in cache and cache[key].pipe_names != nlp_pipelines:
            del cache[key]

        if key not in cache:
            model = importlib.import_module(spacy_langs[lang])
            cache[key] = model.load(disable=nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin)
        if lang in ['srp_cyrl', 'srp_latn']:
            cache['spacy_nlp_srp_cyrl'] = spacy.blank('sr')
            cache['spacy_nlp_srp_latn'] = spacy.blank('sr')
        else:
            cache[key] = spacy.blank(wl_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = cache[key]

        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
示例14: main
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def main(data_folder: str, model_folder: str, dev_size: float, entities_to_remove: List[str],
         nb_segment: Optional[int], segment: Optional[int]) -> None:
    """Evaluate a saved tagger and print sentences with entities not in the annotations.

    The tagger is first evaluated on the test split. Train and test sentences
    are then deep-copied, stripped of their tags and tagged again, and each
    prediction is compared with the original annotation.

    :param data_folder: forwarded to ``prepare_flair_train_test_corpus``.
    :param model_folder: directory containing the ``best-model.pt`` tagger.
    :param dev_size: forwarded to ``prepare_flair_train_test_corpus``.
    :param entities_to_remove: tags left out of the comparison.
    :param nb_segment: forwarded to ``prepare_flair_train_test_corpus``.
    :param segment: forwarded to ``prepare_flair_train_test_corpus``.
    """
    def kept_entities(sentence):
        """Return the "text tag" strings of the NER spans whose tag is not excluded."""
        return {f"{s.text} {s.tag}"
                for s in sentence.get_spans('ner')
                if s.tag not in entities_to_remove}

    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    corpus: Corpus = prepare_flair_train_test_corpus(
        spacy_model=nlp,
        data_folder=data_folder,
        dev_size=dev_size,
        nb_segment=nb_segment,
        segment=segment,
    )
    # flair.device = torch.device('cpu')  # (4mn 28)
    tagger: SequenceTagger = SequenceTagger.load(model=os.path.join(model_folder, 'best-model.pt'))
    test_results, _ = tagger.evaluate(data_loader=DataLoader(corpus.test, batch_size=32))
    print(test_results.detailed_results)

    sentences_original = corpus.train.sentences + corpus.test.sentences
    sentences_predict = copy.deepcopy(sentences_original)
    # clean tokens in case there is a bug
    for sentence in sentences_predict:
        for token in sentence:
            token.tags = {}

    tagger.predict(sentences=sentences_predict,
                   mini_batch_size=32,
                   embedding_storage_mode="none",
                   verbose=True)

    # Each pair holds the annotated sentence and its re-tagged copy.
    pairs = zip(sentences_original, sentences_predict)
    for index, (sentence_original, sentence_predict) in enumerate(pairs):
        expected_entities_text = kept_entities(sentence_original)
        predicted_entities_text = kept_entities(sentence_predict)
        diff_predicted = predicted_entities_text - expected_entities_text
        # Only sentences with predicted entities absent from the annotation are reported.
        if not diff_predicted:
            continue
        diff_expected = expected_entities_text - predicted_entities_text
        print("------------")
        print(f"source {index}: [{sentence_original.to_plain_string()}]")
        print(f"expected missing: [{diff_expected}]")
        print(f"predicted missing: [{diff_predicted}]")
        print(f"common: [{set(predicted_entities_text).intersection(set(expected_entities_text))}]")
示例15: pipeline
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import blank [as 别名]
def pipeline():
    """Train a tagger on a blank pipeline, try it, save it and reload it.

    Reads the training data from
    ``<this dir>/data/train/<language>/train.<encoding>.conll``, adds a tagger
    with the labels of ``TAG_MAP`` to a blank pipeline for the requested
    language, trains it for ``n_iter`` passes, prints the tags predicted for a
    sample sentence, stores the pipeline in ``<this dir>/outputs`` and prints
    the tags again from the reloaded copy.

    ``get_args``, ``TAG_MAP`` and ``n_iter`` are not defined in this function
    and are expected to be provided at module level.
    """
    args = get_args()
    print(args)
    curr_dir = path.dirname(path.abspath(__file__))
    lang = args.language
    print(lang)

    # Create the output directory up front; exist_ok avoids a check-then-create race.
    output_dir = Path(path.join(curr_dir, "outputs"))
    output_dir.mkdir(parents=True, exist_ok=True)

    data_path = "%s/data/train/%s/train.%s.conll" % (curr_dir, args.language, args.encoding)
    # The reader is given the path itself, so no file handle is opened here
    # (an unused handle used to be opened and never closed).
    TRAIN_DATA = spacy_data_reader.spacy_load_data(data_path)

    nlp = spacy.blank(lang)
    # add the tagger to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    tagger = nlp.create_pipe('tagger')
    # Add the tags. This needs to be done before you start training.
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for i in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
        print('Losses', losses)

    # test the trained model
    test_text = "నా నా కధ అందరి అందరి ఆడపిల్లల కధే ."
    doc = nlp(test_text)
    print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])

    # save model to output directory
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # test the saved model
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    doc = nlp2(test_text)
    print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])