This article collects typical usage examples of the Python method spacy.load. If you have been wondering what spacy.load does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples from the spacy module itself.
The following shows 15 code examples of the spacy.load method, sorted by popularity by default.
Example 1: build_vocab
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def build_vocab(tokens, cache='vocab.pkl', max_size=50000):
    if not osp.isfile(cache):
        counter = Counter(tokens)
        words, _ = zip(*counter.most_common(max_size))
        words = [PAD_TOKEN, UNK_TOKEN] + list(words)
        token_to_index = dict(zip(words, range(len(words))))
        if START_TOKEN not in token_to_index:
            token_to_index[START_TOKEN] = len(token_to_index)
            words += [START_TOKEN]
        if END_TOKEN not in token_to_index:
            token_to_index[END_TOKEN] = len(token_to_index)
            words += [END_TOKEN]
        with open(cache, 'wb') as f:
            pickle.dump((token_to_index, words), f)
    else:
        with open(cache, 'rb') as f:
            token_to_index, words = pickle.load(f)
    return token_to_index, words
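For orientation, here is a minimal call sketch for the snippet above. The special-token constants, the toy corpus, and the cache file name are assumptions introduced for illustration; the original project defines its own.

import pickle
import os.path as osp
from collections import Counter

import spacy

# Assumed placeholder values; the original project defines its own constants.
PAD_TOKEN, UNK_TOKEN = '<pad>', '<unk>'
START_TOKEN, END_TOKEN = '<s>', '</s>'

nlp = spacy.load('en_core_web_sm')
corpus = ["A small example sentence.", "Another short sentence."]
tokens = [tok.text.lower() for doc in nlp.pipe(corpus) for tok in doc]
token_to_index, words = build_vocab(tokens, cache='vocab_demo.pkl', max_size=100)
print(len(words), token_to_index[UNK_TOKEN])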
Example 2: tokenize
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def tokenize(data, process_text=True, process_da=True, process_ref=True):
    print('Begin tokenization:')
    print('=' * 50)
    nlp = spacy.load('en_core_web_sm')
    cnt = 0
    for no, session in data.items():
        cnt += 1
        if cnt % 1000 == 0:
            print('[%d|%d]' % (cnt, len(data)))
        for turn in session['log']:
            if process_text:
                doc = nlp(turn['text'])
                turn['text'] = ' '.join([token.text for token in doc]).strip()
            if process_da:
                for da, svs in turn['dialog_act'].items():
                    for i in range(len(svs)):
                        if svs[i][0] == 'Ref' and not process_ref:
                            continue
                        svs[i][1] = ' '.join([token.text for token in nlp(svs[i][1])]).strip()
    print('=' * 50)
    print('Finish tokenization')
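A minimal, hedged call sketch follows; the dialog structure below mirrors the MultiWOZ-style layout implied by the snippet ('log', 'text', 'dialog_act') and is purely illustrative.

# Illustrative session dict; the field names follow the structure the snippet expects.
data = {
    'SNG001.json': {
        'log': [
            {'text': "I need a cheap restaurant in the centre.",
             'dialog_act': {'Restaurant-Inform': [['Price', 'cheap'], ['Area', 'centre']]}},
        ]
    }
}
tokenize(data)
print(data['SNG001.json']['log'][0]['text'])  # tokens are now space-separated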
Example 3: __init__
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def __init__(self, model_path):
    weights, biases = [], []
    for file in sorted(os.listdir(model_path)):
        if file.startswith("single_mention_weights"):
            w = np.load(os.path.join(model_path, file))
            weights.append(w)
        if file.startswith("single_mention_bias"):
            w = np.load(os.path.join(model_path, file))
            biases.append(w)
    self.single_mention_model = list(zip(weights, biases))
    weights, biases = [], []
    for file in sorted(os.listdir(model_path)):
        if file.startswith("pair_mentions_weights"):
            w = np.load(os.path.join(model_path, file))
            weights.append(w)
        if file.startswith("pair_mentions_bias"):
            w = np.load(os.path.join(model_path, file))
            biases.append(w)
    self.pair_mentions_model = list(zip(weights, biases))
Example 4: one_shot_coref
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def one_shot_coref(
    self,
    utterances,
    utterances_speakers_id=None,
    context=None,
    context_speakers_id=None,
    speakers_names=None,
):
    """ Clear the history, load a list of utterances and an optional context, and run the coreference model on them.

    Args:
        - `utterances`: iterator or list of strings corresponding to successive utterances (in a dialogue) or sentences.
            Can be a single string for non-dialogue text.
        - `utterances_speakers_id=None`: iterator or list of speaker ids, one per utterance (in the case of a dialogue).
            - if not provided, assumes two speakers speaking alternately.
            - if `utterances` and `utterances_speakers_id` are not of the same length, the shorter is padded with None.
        - `context=None`: iterator or list of strings corresponding to additional utterances/sentences sent prior to `utterances`. Coreferences are not computed for the mentions identified in `context`; those mentions are only used as possible antecedents of mentions in `utterances`. This reduces computation when we are only interested in resolving coreference in the last sentences/utterances.
        - `context_speakers_id=None`: same as `utterances_speakers_id`, but for `context`.
        - `speakers_names=None`: dictionary mapping each speaker_id in `utterances_speakers_id` and `context_speakers_id` to a list of acceptable speaker names (strings).

    Return:
        clusters of entities with coreference resolved
    """
    self.data.set_utterances(context, context_speakers_id, speakers_names)
    self.continuous_coref(utterances, utterances_speakers_id, speakers_names)
    return self.get_clusters()
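A hedged call sketch: `coref` below stands for an already-constructed instance of the coreference class this method belongs to (its constructor is not shown in the snippet), and the example utterances are made up.

# `coref` is assumed to be an instance of the class defining one_shot_coref.
clusters = coref.one_shot_coref(
    utterances=u"My sister has a dog. She loves him.",
    context=u"I was at my sister's house yesterday.",
)
print(clusters)  # clusters of mentions with coreference resolved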
Example 5: pipeline
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def pipeline(args):
    '''
    Runs the model loop.
    '''
    df = pd.read_csv(args.filename)
    df.loc[:, args.x_label] = df[args.x_label].fillna("None")
    if args.dedupe:
        df = df.drop_duplicates(subset='content')
    if args.reduce:
        df = restrict_sources(df)
    X = df[args.x_label]
    y = df[args.y_label]
    parser = spacy.load('en')
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    loop = ModelLoop(X_train, X_test, y_train, y_test, args.models,
                     args.iterations, args.output_dir,
                     thresholds=args.thresholds, ks=args.ks,
                     setting=args.features[0])
    loop.run()
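The function expects an argparse-style namespace. The sketch below builds one by hand; every field value (file name, column labels, model list, output directory) is a placeholder assumption, and only the attribute names are taken from the snippet.

from argparse import Namespace

# Placeholder values; only the attribute names come from the snippet above.
args = Namespace(
    filename='articles.csv',
    x_label='content', y_label='label',
    dedupe=True, reduce=False,
    models=['logistic_regression'], iterations=1,
    output_dir='output/',
    thresholds=[0.5], ks=[10],
    features=['tfidf'],
)
pipeline(args)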
Example 6: build_dataset
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def build_dataset(args):
    print("Building dataset from : {}".format(args.input))
    print("-> Building {} random splits".format(args.nb_splits))
    nlp = spacy.load('en', create_pipeline=custom_pipeline)
    gen_a, gen_b = itertools.tee(data_generator(args.input), 2)
    data = [(z["reviewerID"], z["asin"], tok, z["overall"])
            for z, tok in zip(tqdm((z for z in gen_a), desc="reading file"),
                              nlp.pipe((x["reviewText"] for x in gen_b), batch_size=1000000, n_threads=8))]
    print(data[0])
    shuffle(data)
    splits = [randint(0, args.nb_splits - 1) for _ in range(0, len(data))]
    count = Counter(splits)
    print("Split distribution is the following:")
    print(count)
    return {"data": data, "splits": splits, "rows": ("user_id", "item_id", "review", "rating")}
Example 7: test_construct_query
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def test_construct_query(self):
    sql_man = SqLiteManager()
    en_nlp_l = spacy.load(EN_MODEL_MD)
    result = sql_man.get_questions_between(5, 7)
    for row in result:
        qid = row[0]
        with self.subTest(qid):
            question = row[1]
            question_type = row[2]
            question_feat = json.loads(row[3])
            if question_feat is not None:
                en_doc = en_nlp_l(u'' + question)
                query = construct_query(question_feat, en_doc)
                print("{0}){1} :\nQuery: {2}".format(qid, question, repr(query)))
                js_query = json.dumps(repr(query))
                sql_man.update_search_query(qid, js_query)
                assert query is not None
    # sql_man.close_db()
Example 8: test_umls2
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def test_umls2():
    nlp = spacy.load("en_core_sci_sm")
    negex = Negex(
        nlp, language="en_clinical_sensitive", ent_types=["ENTITY"], chunk_prefix=["no"]
    )
    nlp.add_pipe(negex, last=True)
    docs = build_med_docs()
    for d in docs:
        doc = nlp(d[0])
        for i, e in enumerate(doc.ents):
            print(e.text, e._.negex)
            assert (e.text, e._.negex) == d[1][i]

# blocked by spacy 2.1.8 issue. Adding back after spacy 2.2.
# def test_no_ner():
#     nlp = spacy.load("en_core_web_sm", disable=["ner"])
#     negex = Negex(nlp)
#     nlp.add_pipe(negex, last=True)
#     with pytest.raises(ValueError):
#         doc = nlp("this doc has not been NERed")
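For a quick standalone run outside the test harness, here is a sketch using the same spaCy 2.x-style pipeline calls as the test above. It assumes negspacy and the en_core_sci_sm model are installed, and that Negex is imported from negspacy.negation as in negspacy's pre-1.0 releases.

import spacy
from negspacy.negation import Negex  # import path assumed from negspacy's spaCy 2.x releases

nlp = spacy.load("en_core_sci_sm")
negex = Negex(nlp, language="en_clinical", ent_types=["ENTITY"])
nlp.add_pipe(negex, last=True)

doc = nlp("No evidence of pneumonia or pleural effusion.")
for ent in doc.ents:
    print(ent.text, ent._.negex)  # True when the entity is negated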
Example 9: __init__
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    # return pkgutil.get_data('scattertext',
    #                         'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    # tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    # tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
    # http://www.nltk.org/book/ch05.html
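For comparison, the same tokenizer/tagger pair can be exercised with NLTK's stock objects; this sketch assumes the standard NLTK data package 'averaged_perceptron_tagger' has already been downloaded, rather than loading bundled pickles as the snippet above does.

from nltk.tag import PerceptronTagger
from nltk.tokenize import TreebankWordTokenizer

tagger = PerceptronTagger()  # loads NLTK's default averaged perceptron model
tokens = TreebankWordTokenizer().tokenize("The quick brown fox jumps over the lazy dog.")
print(tagger.tag(tokens))    # list of (token, POS tag) pairs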
Example 10: get_stdeng_spacy_tagger
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def get_stdeng_spacy_tagger(suppress_errors=False):
    global SPACY_WRAPPER
    if SPACY_WRAPPER is not None:
        return SPACY_WRAPPER
    try:
        import spacy
        SPACY_WRAPPER = SpacyTagger()
        SPACY_WRAPPER.spacy_object = spacy.load('en', parser=False, entity=False)
        return SPACY_WRAPPER
    except ImportError:
        if not suppress_errors:
            raise
    except RuntimeError:
        ## this seems to happen if the 'en' model is not installed. it might
        ## look like this:
        # RuntimeError: Model 'en' not installed. Please run 'python -m spacy.en.download' to install latest compatible model.
        if not suppress_errors:
            raise
    return None
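A short call sketch showing the degradation behaviour: with suppress_errors=True the function returns None instead of raising when spaCy or the 'en' model is missing.

tagger = get_stdeng_spacy_tagger(suppress_errors=True)
if tagger is None:
    print("spaCy tagger unavailable; fall back to another tagger")
else:
    print("loaded:", tagger.spacy_object)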
Example 11: buildMapVec
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def buildMapVec(text):
    """
    An example wrapper function for text2mapVec(): it reads in the necessary collections and then runs text2mapVec().
    Feel free to modify it to your preference and task objective.
    :param text: the text to create the Map Vector from, encoded as unicode.
    :return: currently only prints the vector; add 'return map_vector' or whatever you prefer.
    """
    ENCODING_MAP = cPickle.load(open(u"data/1x1_encode_map.pkl"))    # the resolution of the map
    OUTLIERS_MAP = cPickle.load(open(u"data/1x1_outliers_map.pkl"))  # dimensions must match the above
    nlp = spacy.load(u'en_core_web_lg')  # or spacy.load(u'en') depending on your spaCy download (simple or full)
    conn = sqlite3.connect(u'../data/geonames.db').cursor()  # this DB can be downloaded using the GitHub link
    map_vector = text2mapvec(doc=nlp(text), mapping=ENCODING_MAP, outliers=OUTLIERS_MAP,
                             polygon_size=1, db=conn, exclude=u"Cairo")
    print(map_vector)

# text = u"The Giza pyramid complex is an archaeological site on the Giza Plateau, on the outskirts of Cairo, Egypt."
# buildMapVec(text)
Example 12: test_morph_exception
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def test_morph_exception() -> None:
    assert spacy.__version__ <= SPACY_VERSION
    lang = RO
    text = "Ce mai faci?"
    download(lang=lang)
    try:
        nlp = load(lang=lang)
        assert nlp._meta["lang"] == f"udpipe_{lang}"
        doc = nlp(text)
    except ValueError:
        nlp = load(lang=lang, ignore_tag_map=True)
        assert nlp._meta["lang"] == f"udpipe_{lang}"
        doc = nlp(text)
    assert doc
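Outside the test, the same load pattern can be sketched directly. The assumption here is that download() and load() come from the spacy_udpipe package, which the udpipe_ prefix in nlp._meta["lang"] suggests.

import spacy_udpipe  # assumed source of the download()/load() used in the test

spacy_udpipe.download(lang="ro")   # fetch the Romanian UDPipe model once
nlp = spacy_udpipe.load(lang="ro")
doc = nlp("Ce mai faci?")
print([(token.text, token.pos_) for token in doc])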
Example 13: build_vocab
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def build_vocab(dataset_name, labels):
    raw_dir = _util.getRelRawPath(dataset_name)
    labels_path = os.path.join(raw_dir, labels)
    try:
        with open(labels_path) as label_file:
            labels = str(''.join(json.load(label_file)))
    except:
        labels = _labels
        _getSharedLogger().warning("Could not open '%s'... \n\tUsing hardcoded labels: '%s'", labels_path, labels)
    char2idx = {}
    for k, v in _markers2Id.items():
        char2idx[k] = v
    for char in labels:
        char2idx[char] = len(char2idx)
    return char2idx
Example 14: split_sentences
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def split_sentences(dataviews, captions):
    nlp = spacy.load('en')
    new_frames, new_captions = [], []
    for frames, caps in zip(dataviews, captions):
        new_fs, new_caps = [], []
        left = 0
        right = 1
        while left < len(caps) and right < len(caps):
            cap = " ".join(caps[left:right])
            doc = nlp(cap)
            sentences = [x.string.strip() for x in doc.sents]
            if len(sentences) >= 2 and right - 1 - left > 0:
                cap = " ".join(caps[left:right - 1])
                new_fs.append(np.concatenate(frames[left:right - 1]))
                print("sentence:", cap)
                new_caps.append(cap)
                left = right - 1
            right += 1
        new_frames.append(new_fs)
        new_captions.append(new_caps)
    return new_frames, new_captions
# REVIEW josephz: This is a copy of `FrameCaptionDataset.parse_caption`.
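A toy call sketch (spaCy 2.x API, matching the snippet's use of the 'en' shortcut and Span.string); the frame arrays and word lists below are invented for illustration.

import numpy as np

captions = [["the", "dog", "barked.", "then", "it", "slept."]]
dataviews = [[np.zeros((1, 4)) for _ in captions[0]]]  # one dummy frame array per word

frames_out, caps_out = split_sentences(dataviews, captions)
print(caps_out)  # word spans regrouped at detected sentence boundaries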
Example 15: __init__
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
    try:
        import ftfy
        import spacy
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed; using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True,
                                  never_split=special_tokens if special_tokens is not None else [])
        self.fix_text = None
    self.max_len = max_len if max_len is not None else int(1e12)
    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
    self.special_tokens = {}
    self.special_tokens_decoder = {}
    self.set_special_tokens(special_tokens)