This article collects typical usage examples of the Python method spacy.load. If you have been wondering what spacy.load does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples from the spacy module itself.
The following shows 15 code examples of the spacy.load method, sorted by popularity by default.
Example 1: build_vocab
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def build_vocab(tokens, cache='vocab.pkl', max_size=50000):
    if not osp.isfile(cache):
        counter = Counter(tokens)
        words, _ = zip(*counter.most_common(max_size))
        words = [PAD_TOKEN, UNK_TOKEN] + list(words)
        token_to_index = dict(zip(words, range(len(words))))
        if START_TOKEN not in token_to_index:
            token_to_index[START_TOKEN] = len(token_to_index)
            words += [START_TOKEN]
        if END_TOKEN not in token_to_index:
            token_to_index[END_TOKEN] = len(token_to_index)
            words += [END_TOKEN]
        with open(cache, 'wb') as f:
            pickle.dump((token_to_index, words), f)
    else:
        with open(cache, 'rb') as f:
            token_to_index, words = pickle.load(f)
    return token_to_index, words
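For orientation, here is a minimal call sketch for the snippet above. The special-token constants, the toy corpus, and the cache file name are assumptions introduced for illustration; the original project defines its own.

import pickle
import os.path as osp
from collections import Counter

import spacy

# Assumed placeholder values; the original project defines its own constants.
PAD_TOKEN, UNK_TOKEN = '<pad>', '<unk>'
START_TOKEN, END_TOKEN = '<s>', '</s>'

nlp = spacy.load('en_core_web_sm')
corpus = ["A small example sentence.", "Another short sentence."]
tokens = [tok.text.lower() for doc in nlp.pipe(corpus) for tok in doc]
token_to_index, words = build_vocab(tokens, cache='vocab_demo.pkl', max_size=100)
print(len(words), token_to_index[UNK_TOKEN])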
Example 2: tokenize
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def tokenize(data, process_text=True, process_da=True, process_ref=True):
    print('Begin tokenization:')
    print('=' * 50)
    nlp = spacy.load('en_core_web_sm')
    cnt = 0
    for no, session in data.items():
        cnt += 1
        if cnt % 1000 == 0:
            print('[%d|%d]' % (cnt, len(data)))
        for turn in session['log']:
            if process_text:
                doc = nlp(turn['text'])
                turn['text'] = ' '.join([token.text for token in doc]).strip()
            if process_da:
                for da, svs in turn['dialog_act'].items():
                    for i in range(len(svs)):
                        if svs[i][0] == 'Ref' and not process_ref:
                            continue
                        svs[i][1] = ' '.join([token.text for token in nlp(svs[i][1])]).strip()
    print('=' * 50)
    print('Finish tokenization')
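A minimal, hedged call sketch follows; the dialog structure below mirrors the MultiWOZ-style layout implied by the snippet ('log', 'text', 'dialog_act') and is purely illustrative.

# Illustrative session dict; the field names follow the structure the snippet expects.
data = {
    'SNG001.json': {
        'log': [
            {'text': "I need a cheap restaurant in the centre.",
             'dialog_act': {'Restaurant-Inform': [['Price', 'cheap'], ['Area', 'centre']]}},
        ]
    }
}
tokenize(data)
print(data['SNG001.json']['log'][0]['text'])  # tokens are now space-separated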
Example 3: __init__
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def __init__(self, model_path):
    weights, biases = [], []
    for file in sorted(os.listdir(model_path)):
        if file.startswith("single_mention_weights"):
            w = np.load(os.path.join(model_path, file))
            weights.append(w)
        if file.startswith("single_mention_bias"):
            w = np.load(os.path.join(model_path, file))
            biases.append(w)
    self.single_mention_model = list(zip(weights, biases))
    weights, biases = [], []
    for file in sorted(os.listdir(model_path)):
        if file.startswith("pair_mentions_weights"):
            w = np.load(os.path.join(model_path, file))
            weights.append(w)
        if file.startswith("pair_mentions_bias"):
            w = np.load(os.path.join(model_path, file))
            biases.append(w)
    self.pair_mentions_model = list(zip(weights, biases))
Example 4: one_shot_coref
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def one_shot_coref(
    self,
    utterances,
    utterances_speakers_id=None,
    context=None,
    context_speakers_id=None,
    speakers_names=None,
):
    """ Clear the history, load a list of utterances and an optional context, and run the coreference model on them.

    Args:
        - `utterances`: iterator or list of strings corresponding to successive utterances (in a dialogue) or sentences.
            Can be a single string for non-dialogue text.
        - `utterances_speakers_id=None`: iterator or list of speaker ids, one per utterance (in the case of a dialogue).
            - if not provided, assumes two speakers speaking alternately.
            - if `utterances` and `utterances_speakers_id` are not of the same length, the shorter is padded with None.
        - `context=None`: iterator or list of strings corresponding to additional utterances/sentences sent prior to `utterances`. Coreferences are not computed for the mentions identified in `context`; those mentions are only used as possible antecedents of mentions in `utterances`. This reduces computation when we are only interested in resolving coreference in the last sentences/utterances.
        - `context_speakers_id=None`: same as `utterances_speakers_id`, but for `context`.
        - `speakers_names=None`: dictionary mapping each speaker_id in `utterances_speakers_id` and `context_speakers_id` to a list of acceptable speaker names (strings).

    Return:
        clusters of entities with coreference resolved
    """
    self.data.set_utterances(context, context_speakers_id, speakers_names)
    self.continuous_coref(utterances, utterances_speakers_id, speakers_names)
    return self.get_clusters()
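A hedged call sketch: `coref` below stands for an already-constructed instance of the coreference class this method belongs to (its constructor is not shown in the snippet), and the example utterances are made up.

# `coref` is assumed to be an instance of the class defining one_shot_coref.
clusters = coref.one_shot_coref(
    utterances=u"My sister has a dog. She loves him.",
    context=u"I was at my sister's house yesterday.",
)
print(clusters)  # clusters of mentions with coreference resolved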
Example 5: pipeline
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def pipeline(args):
    '''
    Runs the model loop.
    '''
    df = pd.read_csv(args.filename)
    df.loc[:, args.x_label] = df[args.x_label].fillna("None")
    if args.dedupe:
        df = df.drop_duplicates(subset='content')
    if args.reduce:
        df = restrict_sources(df)
    X = df[args.x_label]
    y = df[args.y_label]
    parser = spacy.load('en')
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    loop = ModelLoop(X_train, X_test, y_train, y_test, args.models,
                     args.iterations, args.output_dir,
                     thresholds=args.thresholds, ks=args.ks,
                     setting=args.features[0])
    loop.run()
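The function expects an argparse-style namespace. The sketch below builds one by hand; every field value (file name, column labels, model list, output directory) is a placeholder assumption, and only the attribute names are taken from the snippet.

from argparse import Namespace

# Placeholder values; only the attribute names come from the snippet above.
args = Namespace(
    filename='articles.csv',
    x_label='content', y_label='label',
    dedupe=True, reduce=False,
    models=['logistic_regression'], iterations=1,
    output_dir='output/',
    thresholds=[0.5], ks=[10],
    features=['tfidf'],
)
pipeline(args)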
Example 6: build_dataset
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def build_dataset(args):
    print("Building dataset from : {}".format(args.input))
    print("-> Building {} random splits".format(args.nb_splits))
    nlp = spacy.load('en', create_pipeline=custom_pipeline)
    gen_a, gen_b = itertools.tee(data_generator(args.input), 2)
    data = [(z["reviewerID"], z["asin"], tok, z["overall"])
            for z, tok in zip(tqdm((z for z in gen_a), desc="reading file"),
                              nlp.pipe((x["reviewText"] for x in gen_b), batch_size=1000000, n_threads=8))]
    print(data[0])
    shuffle(data)
    splits = [randint(0, args.nb_splits - 1) for _ in range(0, len(data))]
    count = Counter(splits)
    print("Split distribution is the following:")
    print(count)
    return {"data": data, "splits": splits, "rows": ("user_id", "item_id", "review", "rating")}
Example 7: test_construct_query
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def test_construct_query(self):
    sql_man = SqLiteManager()
    en_nlp_l = spacy.load(EN_MODEL_MD)
    result = sql_man.get_questions_between(5, 7)
    for row in result:
        qid = row[0]
        with self.subTest(qid):
            question = row[1]
            question_type = row[2]
            question_feat = json.loads(row[3])
            if question_feat is not None:
                en_doc = en_nlp_l(u'' + question)
                query = construct_query(question_feat, en_doc)
                print("{0}){1} :\nQuery: {2}".format(qid, question, repr(query)))
                js_query = json.dumps(repr(query))
                sql_man.update_search_query(qid, js_query)
                assert query is not None
    # sql_man.close_db()
Example 8: test_umls2
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def test_umls2():
    nlp = spacy.load("en_core_sci_sm")
    negex = Negex(
        nlp, language="en_clinical_sensitive", ent_types=["ENTITY"], chunk_prefix=["no"]
    )
    nlp.add_pipe(negex, last=True)
    docs = build_med_docs()
    for d in docs:
        doc = nlp(d[0])
        for i, e in enumerate(doc.ents):
            print(e.text, e._.negex)
            assert (e.text, e._.negex) == d[1][i]

# blocked by spacy 2.1.8 issue. Adding back after spacy 2.2.
# def test_no_ner():
#     nlp = spacy.load("en_core_web_sm", disable=["ner"])
#     negex = Negex(nlp)
#     nlp.add_pipe(negex, last=True)
#     with pytest.raises(ValueError):
#         doc = nlp("this doc has not been NERed")
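For a quick standalone run outside the test harness, here is a sketch using the same spaCy 2.x-style pipeline calls as the test above. It assumes negspacy and the en_core_sci_sm model are installed, and that Negex is imported from negspacy.negation as in negspacy's pre-1.0 releases.

import spacy
from negspacy.negation import Negex  # import path assumed from negspacy's spaCy 2.x releases

nlp = spacy.load("en_core_sci_sm")
negex = Negex(nlp, language="en_clinical", ent_types=["ENTITY"])
nlp.add_pipe(negex, last=True)

doc = nlp("No evidence of pneumonia or pleural effusion.")
for ent in doc.ents:
    print(ent.text, ent._.negex)  # True when the entity is negated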
Example 9: __init__
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    # return pkgutil.get_data('scattertext',
    #                         'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    # tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    # tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
    # http://www.nltk.org/book/ch05.html
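For comparison, the same tokenizer/tagger pair can be exercised with NLTK's stock objects; this sketch assumes the standard NLTK data package 'averaged_perceptron_tagger' has already been downloaded, rather than loading bundled pickles as the snippet above does.

from nltk.tag import PerceptronTagger
from nltk.tokenize import TreebankWordTokenizer

tagger = PerceptronTagger()  # loads NLTK's default averaged perceptron model
tokens = TreebankWordTokenizer().tokenize("The quick brown fox jumps over the lazy dog.")
print(tagger.tag(tokens))    # list of (token, POS tag) pairs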
Example 10: get_stdeng_spacy_tagger
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def get_stdeng_spacy_tagger(suppress_errors=False):
    global SPACY_WRAPPER
    if SPACY_WRAPPER is not None:
        return SPACY_WRAPPER
    try:
        import spacy
        SPACY_WRAPPER = SpacyTagger()
        SPACY_WRAPPER.spacy_object = spacy.load('en', parser=False, entity=False)
        return SPACY_WRAPPER
    except ImportError:
        if not suppress_errors:
            raise
    except RuntimeError:
        ## this seems to happen if the 'en' model is not installed. it might
        ## look like this:
        # RuntimeError: Model 'en' not installed. Please run 'python -m spacy.en.download' to install latest compatible model.
        if not suppress_errors:
            raise
    return None
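A short call sketch showing the degradation behaviour: with suppress_errors=True the function returns None instead of raising when spaCy or the 'en' model is missing.

tagger = get_stdeng_spacy_tagger(suppress_errors=True)
if tagger is None:
    print("spaCy tagger unavailable; fall back to another tagger")
else:
    print("loaded:", tagger.spacy_object)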
Example 11: buildMapVec
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def buildMapVec(text):
    """
    An example wrapper function for text2mapVec(): it reads in the necessary collections and then runs text2mapVec().
    Feel free to modify it to your preference and task objective.
    :param text: the text to create the Map Vector from, encoded as unicode.
    :return: currently only prints the vector; add 'return map_vector' or whatever you prefer.
    """
    ENCODING_MAP = cPickle.load(open(u"data/1x1_encode_map.pkl"))    # the resolution of the map
    OUTLIERS_MAP = cPickle.load(open(u"data/1x1_outliers_map.pkl"))  # dimensions must match the above
    nlp = spacy.load(u'en_core_web_lg')  # or spacy.load(u'en') depending on your spaCy download (simple or full)
    conn = sqlite3.connect(u'../data/geonames.db').cursor()  # this DB can be downloaded using the GitHub link
    map_vector = text2mapvec(doc=nlp(text), mapping=ENCODING_MAP, outliers=OUTLIERS_MAP,
                             polygon_size=1, db=conn, exclude=u"Cairo")
    print(map_vector)

# text = u"The Giza pyramid complex is an archaeological site on the Giza Plateau, on the outskirts of Cairo, Egypt."
# buildMapVec(text)
Example 12: test_morph_exception
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def test_morph_exception() -> None:
    assert spacy.__version__ <= SPACY_VERSION
    lang = RO
    text = "Ce mai faci?"
    download(lang=lang)
    try:
        nlp = load(lang=lang)
        assert nlp._meta["lang"] == f"udpipe_{lang}"
        doc = nlp(text)
    except ValueError:
        nlp = load(lang=lang, ignore_tag_map=True)
        assert nlp._meta["lang"] == f"udpipe_{lang}"
        doc = nlp(text)
    assert doc
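Outside the test, the same load pattern can be sketched directly. The assumption here is that download() and load() come from the spacy_udpipe package, which the udpipe_ prefix in nlp._meta["lang"] suggests.

import spacy_udpipe  # assumed source of the download()/load() used in the test

spacy_udpipe.download(lang="ro")   # fetch the Romanian UDPipe model once
nlp = spacy_udpipe.load(lang="ro")
doc = nlp("Ce mai faci?")
print([(token.text, token.pos_) for token in doc])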
Example 13: build_vocab
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def build_vocab(dataset_name, labels):
    raw_dir = _util.getRelRawPath(dataset_name)
    labels_path = os.path.join(raw_dir, labels)
    try:
        with open(labels_path) as label_file:
            labels = str(''.join(json.load(label_file)))
    except:
        labels = _labels
        _getSharedLogger().warning("Could not open '%s'... \n\tUsing hardcoded labels: '%s'", labels_path, labels)
    char2idx = {}
    for k, v in _markers2Id.items():
        char2idx[k] = v
    for char in labels:
        char2idx[char] = len(char2idx)
    return char2idx
Example 14: split_sentences
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def split_sentences(dataviews, captions):
    nlp = spacy.load('en')
    new_frames, new_captions = [], []
    for frames, caps in zip(dataviews, captions):
        new_fs, new_caps = [], []
        left = 0
        right = 1
        while left < len(caps) and right < len(caps):
            cap = " ".join(caps[left:right])
            doc = nlp(cap)
            sentences = [x.string.strip() for x in doc.sents]
            if len(sentences) >= 2 and right - 1 - left > 0:
                cap = " ".join(caps[left:right - 1])
                new_fs.append(np.concatenate(frames[left:right - 1]))
                print("sentence:", cap)
                new_caps.append(cap)
                left = right - 1
            right += 1
        new_frames.append(new_fs)
        new_captions.append(new_caps)
    return new_frames, new_captions
# REVIEW josephz: This is a copy of `FrameCaptionDataset.parse_caption`.
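A toy call sketch (spaCy 2.x API, matching the snippet's use of the 'en' shortcut and Span.string); the frame arrays and word lists below are invented for illustration.

import numpy as np

captions = [["the", "dog", "barked.", "then", "it", "slept."]]
dataviews = [[np.zeros((1, 4)) for _ in captions[0]]]  # one dummy frame array per word

frames_out, caps_out = split_sentences(dataviews, captions)
print(caps_out)  # word spans regrouped at detected sentence boundaries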
Example 15: __init__
# Required module: import spacy [as alias]
# Or: from spacy import load [as alias]
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
    try:
        import ftfy
        import spacy
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed; using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True,
                                  never_split=special_tokens if special_tokens is not None else [])
        self.fix_text = None
    self.max_len = max_len if max_len is not None else int(1e12)
    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
    self.special_tokens = {}
    self.special_tokens_decoder = {}
    self.set_special_tokens(special_tokens)