本文整理汇总了Python中gensim.models.keyedvectors.KeyedVectors.load_word2vec_format方法的典型用法代码示例。如果您正苦于以下问题：Python KeyedVectors.load_word2vec_format方法的具体用法？Python KeyedVectors.load_word2vec_format怎么用？Python KeyedVectors.load_word2vec_format使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类gensim.models.keyedvectors.KeyedVectors的用法示例。
在下文中一共展示了KeyedVectors.load_word2vec_format方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: build
# 需要导入模块: from gensim.models.keyedvectors import KeyedVectors [as 别名]
# 或者: from gensim.models.keyedvectors.KeyedVectors import load_word2vec_format [as 别名]
def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    """Train a word2vec model and dump its vectors as a pickled dict.

    The sentences extracted from the two segmented files are written to
    ``sentence_path``, a ``Word2Vec`` model (``sg=1``) is trained on that
    file, its vectors are saved in binary word2vec format to
    ``w2v_bin_path``, then reloaded and stored as a ``{word: vector}`` dict
    through ``save_pkl``.

    Args:
        train_seg_path: First input handed to ``extract_sentence``.
        test_seg_path: Second input handed to ``extract_sentence``.
        out_path: Destination passed to ``save_pkl`` (``overwrite=True``).
        sentence_path: File the extracted sentences are saved to and then
            streamed from with ``LineSentence``.
        w2v_bin_path: Where the binary word2vec file is written and read.
        min_count: Forwarded to ``Word2Vec`` as ``min_count``.
        col_sep: Column separator forwarded to ``extract_sentence``.
    """
    corpus = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(corpus, sentence_path)

    print('train w2v model...')
    # NOTE(review): `size`, `iter` and `.vocab` look like the gensim 3.x
    # spellings -- confirm against the gensim version this project pins.
    trainer = Word2Vec(
        sg=1,
        sentences=LineSentence(sentence_path),
        size=256,
        window=5,
        min_count=min_count,
        iter=40,
    )
    trainer.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)

    # Reload from disk so the exported dict reflects what was actually saved.
    keyed = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {token: keyed[token] for token in keyed.vocab}
    save_pkl(word_dict, out_path, overwrite=True)
示例2: get_init_embedding
# 需要导入模块: from gensim.models.keyedvectors import KeyedVectors [as 别名]
# 或者: from gensim.models.keyedvectors.KeyedVectors import load_word2vec_format [as 别名]
def get_init_embedding(reversed_dict, embedding_size):
    """Build an initial embedding matrix from GloVe vectors.

    The GloVe text file is converted to word2vec format in a temporary file
    and loaded with gensim. Words are visited in ascending key order of
    ``reversed_dict``; a word missing from the vectors gets a zero row.

    Args:
        reversed_dict: Mapping whose values are the words to look up
            (presumably index -> word; verify against the caller).
        embedding_size: Length of the fallback zero vectors and of the
            random vectors written to rows 2 and 3.

    Returns:
        ``np.array`` built from the per-word vectors.
    """
    glove_path = "glove/glove.42B.300d.txt"
    converted_path = get_tmpfile("word2vec_format.vec")
    glove2word2vec(glove_path, converted_path)

    print("Loading Glove vectors...")
    glove = KeyedVectors.load_word2vec_format(converted_path)

    def lookup(token):
        # Unknown words fall back to an all-zero float32 vector.
        try:
            return glove.word_vec(token)
        except KeyError:
            return np.zeros([embedding_size], dtype=np.float32)

    rows = [lookup(token) for _, token in sorted(reversed_dict.items())]

    # Assign random vectors to the <s> and </s> tokens (rows 2 and 3).
    rows[2] = np.random.normal(0, 1, embedding_size)
    rows[3] = np.random.normal(0, 1, embedding_size)
    return np.array(rows)
示例3: load_word2vec
# 需要导入模块: from gensim.models.keyedvectors import KeyedVectors [as 别名]
# 或者: from gensim.models.keyedvectors.KeyedVectors import load_word2vec_format [as 别名]
def load_word2vec(filename=None, path=None, binary=False, limit=None):
    """Load word vectors from an explicit path or from the asset search dirs.

    ``path`` takes priority when both arguments are given. Otherwise
    ``filename`` is tried under each directory of ``ASSET_SEARCH_DIRS`` in
    order and the first file that loads is returned.

    Args:
        filename: File name to look for in ``ASSET_SEARCH_DIRS``.
        path: Full path of the vector file.
        binary: Forwarded to ``KeyedVectors.load_word2vec_format``.
        limit: Forwarded to ``KeyedVectors.load_word2vec_format``.

    Returns:
        Whatever ``KeyedVectors.load_word2vec_format`` returns.

    Raises:
        TypeError: If neither ``filename`` nor ``path`` is given.
        FileNotFoundError: If ``filename`` is not found in any search
            directory (or ``path`` itself does not exist).
    """
    if path is not None:
        return KeyedVectors.load_word2vec_format(
            path, binary=binary, limit=limit)

    if filename is None:
        raise TypeError(
            "load_word2vec() requires either 'filename' or 'path' to be set.")

    for dir_path in ASSET_SEARCH_DIRS:
        candidate = os.path.join(dir_path, filename)
        # Only the load itself is expected to raise FileNotFoundError.
        try:
            return KeyedVectors.load_word2vec_format(
                candidate, binary=binary, limit=limit)
        except FileNotFoundError:
            continue

    # Implicit literal concatenation keeps source indentation out of the
    # message (a backslash continuation inside the string would embed it).
    raise FileNotFoundError(
        "Please make sure that 'filename' specifies the word vector binary "
        "name in default search paths or 'path' specifies file path of "
        "the binary")
示例4: __init__
# 需要导入模块: from gensim.models.keyedvectors import KeyedVectors [as 别名]
# 或者: from gensim.models.keyedvectors.KeyedVectors import load_word2vec_format [as 别名]
def __init__(self, dataset, p=1, q=4, walk_length=100,
             num_walks=50, dimensions=200, window_size=30, workers=8, iterations=5):
    """Set up node2vec for ``dataset`` and load its embedding file.

    The embedding file name encodes the hyper-parameters. When no file of
    that name is listed under ``datasets/<dataset>/node2vec/``, ``self.run``
    is called with that directory's ``altogether.edgelist`` and the target
    path. The file at ``self.path`` is then loaded in binary word2vec format
    into ``self.node2vec_model``.
    """
    Node2Vec.__init__(
        self, False, True, False, p, q, walk_length, num_walks,
        dimensions, window_size, workers, iterations
    )
    self.dataset = dataset

    emb_name = 'num%d_p%d_q%d_l%d_d%d_iter%d_winsize%d.emd' % (
        num_walks, p, q, walk_length, dimensions, iterations, window_size)
    emb_dir = 'datasets/%s/node2vec/' % self.dataset
    self.path = emb_dir + emb_name

    # Generate the embeddings only when they are not already on disk.
    if emb_name not in os.listdir(emb_dir):
        self.run('datasets/%s/node2vec/altogether.edgelist' % self.dataset,
                 self.path)

    self.node2vec_model = KeyedVectors.load_word2vec_format(self.path, binary=True)
示例5: convert
# 需要导入模块: from gensim.models.keyedvectors import KeyedVectors [as 别名]
# 或者: from gensim.models.keyedvectors.KeyedVectors import load_word2vec_format [as 别名]
def convert(fname, save_file):
    """Convert a binary word2vec file into a torch-saved tuple.

    The saved object is ``(stoi, vectors, dim)``: a mapping from stripped
    word to its index, the vectors as a tensor viewed as ``(-1, dim)``, and
    the dimension read from the file's first line.

    Args:
        fname: Path of the binary word2vec file.
        save_file: Destination passed to ``torch.save``.
    """
    # The first line of the file carries two integers: vocab size and dim.
    with open(fname, 'rb') as header_file:
        _vocab_size, dim = map(int, header_file.readline().split())

    keyed = KeyedVectors.load_word2vec_format(fname, binary=True)
    print("Loading vectors from {}".format(fname))

    # Flatten row by row (with a progress bar), then reshape on the tensor.
    flat_values = []
    for row in tqdm(keyed.syn0, total=len(keyed.syn0)):
        flat_values.extend(row.tolist())
    vectors = torch.Tensor(flat_values).view(-1, dim)

    stoi = {}
    for token, entry in keyed.vocab.items():
        stoi[token.strip()] = entry.index

    print('saving vectors to', save_file)
    torch.save((stoi, vectors, dim), save_file)
示例6: gensim_w2v_handler
# 需要导入模块: from gensim.models.keyedvectors import KeyedVectors [as 别名]
# 或者: from gensim.models.keyedvectors.KeyedVectors import load_word2vec_format [as 别名]
def gensim_w2v_handler(url):
    """Return a loader for the binary word2vec vectors published at ``url``.

    The returned callable takes a ``logger`` (used for its ``duration``
    context manager), downloads the file into a temporary directory, loads
    it with gensim, re-saves it as text, and parses that text file.

    Returns:
        A function ``wrapped(logger) -> (terms, weights)`` where ``terms`` is
        the list of first columns and ``weights`` the array of the remaining
        float columns, one row per term.
    """
    def wrapped(logger):
        with tempfile.TemporaryDirectory() as workdir:
            vocab_path = os.path.join(workdir, 'vocab')
            with logger.duration(f'downloading {url}'):
                util.download(url, vocab_path)
            with logger.duration(f'loading binary {vocab_path}'):
                vectors = KeyedVectors.load_word2vec_format(vocab_path, binary=True)
            vocab_path += '.txt'
            with logger.duration(f'saving text {vocab_path}'):
                vectors.save_word2vec_format(vocab_path)
            with logger.duration('reading embedding'):
                weights = None
                terms = []
                rows = iter(plaintext.read_sv(vocab_path, sep=' '))
                # The first row is the header: "<row count> <column count>".
                header = next(rows, None)
                if header is not None:
                    weights = np.ndarray((int(header[0]), int(header[1])))
                    for row_idx, fields in enumerate(rows):
                        terms.append(fields[0])
                        weights[row_idx] = [float(v) for v in fields[1:]]
        return terms, np.array(weights)
    return wrapped
示例7: load_pretrained_vectors
# 需要导入模块: from gensim.models.keyedvectors import KeyedVectors [as 别名]
# 或者: from gensim.models.keyedvectors.KeyedVectors import load_word2vec_format [as 别名]
def load_pretrained_vectors(
    dir_path, file_name="GoogleNews-vectors-negative300.bin", limit=None
):
    """Load word2vec vectors, downloading them first if they are missing.

    Args:
        dir_path (str): Path to the directory where the word2vec vectors
            exist or will be downloaded.
        file_name (str): Name of the word2vec file.
        limit (int): Number of word vectors loaded from gensim. Lowering it
            saves RAM and helps avoid memory errors.

    Returns:
        gensim.models.keyedvectors.Word2VecKeyedVectors: Loaded word vectors.
    """
    vectors_path = _maybe_download_and_extract(dir_path, file_name)
    return KeyedVectors.load_word2vec_format(
        vectors_path, binary=True, limit=limit
    )
示例8: load_word2vec_model
# 需要导入模块: from gensim.models.keyedvectors import KeyedVectors [as 别名]
# 或者: from gensim.models.keyedvectors.KeyedVectors import load_word2vec_format [as 别名]
def load_word2vec_model(file):
    """Load a node embedding model.

    Args:
        file: Passed to ``KeyedVectors.load_word2vec_format`` with
            ``binary=False``.

    Returns:
        The loaded ``KeyedVectors`` model.
    """
    return KeyedVectors.load_word2vec_format(file, binary=False)
示例9: load_word2vec_model
# 需要导入模块: from gensim.models.keyedvectors import KeyedVectors [as 别名]
# 或者: from gensim.models.keyedvectors.KeyedVectors import load_word2vec_format [as 别名]
def load_word2vec_model(file):
    """Return the node embedding model read from ``file``.

    The file is loaded through ``KeyedVectors.load_word2vec_format`` with
    ``binary=False``.
    """
    node_vectors = KeyedVectors.load_word2vec_format(file, binary=False)
    return node_vectors
示例10: __init__
# 需要导入模块: from gensim.models.keyedvectors import KeyedVectors [as 别名]
# 或者: from gensim.models.keyedvectors.KeyedVectors import load_word2vec_format [as 别名]
def __init__(self, model_path):
    """Load the fastText vectors at ``model_path`` and set up text helpers.

    The vectors are read with ``encoding='utf-8'`` and
    ``unicode_errors='ignore'``. A tokenizer, a stemmer, the symbol set and a
    shortcut to the model's vocabulary are stored on the instance.
    """
    self.model_path = model_path

    print("loading fastText model ...")
    self.model = KeyedVectors.load_word2vec_format(
        self.model_path,
        encoding='utf-8',
        unicode_errors='ignore',
    )
    print("done fastText loading model")

    self.tokenizer = WordPunctTokenizer()
    self.stemmer = ARLSTem()
    # Punctuation characters kept on the instance as a single string.
    self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
    self.vocab = self.model.vocab
示例11: set_model
# 需要导入模块: from gensim.models.keyedvectors import KeyedVectors [as 别名]
# 或者: from gensim.models.keyedvectors.KeyedVectors import load_word2vec_format [as 别名]
def set_model(self, filename, embed_type='glove'):
    """Load embeddings from ``filename`` into ``self._model``.

    Args:
        filename: Embedding file to load.
        embed_type: ``'glove'`` builds a ``GloveModel``; any other value
            loads through ``KeyedVectors.load_word2vec_format``, in binary
            mode only when the value is ``'word2vec'``.
    """
    timer = Timer('Load {}'.format(filename))

    if embed_type == 'glove':
        loaded = GloveModel(filename)
    else:
        is_binary = embed_type == 'word2vec'
        loaded = KeyedVectors.load_word2vec_format(filename, binary=is_binary)
    self._model = loaded

    summary = 'Embeddings: vocab = {}, embed_size = {}'.format(
        len(self._model.vocab), self._model.vector_size)
    print(summary)
    timer.finish()
示例12: load_word_embeddings
# 需要导入模块: from gensim.models.keyedvectors import KeyedVectors [as 别名]
# 或者: from gensim.models.keyedvectors.KeyedVectors import load_word2vec_format [as 别名]
def load_word_embeddings(path, binary=True):
    """Load word embeddings stored in word2vec format.

    Args:
        path: Location of the embedding file.
        binary: Forwarded as the ``binary`` flag of
            ``KeyedVectors.load_word2vec_format``.

    Returns:
        The loaded ``KeyedVectors`` model.
    """
    return KeyedVectors.load_word2vec_format(path, binary=binary)
示例13: sim_mat_and_kernel_d2d
# 需要导入模块: from gensim.models.keyedvectors import KeyedVectors [as 别名]
# 或者: from gensim.models.keyedvectors.KeyedVectors import load_word2vec_format [as 别名]
def sim_mat_and_kernel_d2d(relevance_file, topic_file, corpus_file, topk_corpus_file, embedding_file, stop_file,
                           sim_output_path, kernel_output_path, kernel_mu_list, kernel_sigma_list,
                           topk_supervised, d2d, test):
    """Compute similarity matrices and RBF kernel features for every query.

    All inputs are loaded once, then ``sim_mat_and_kernel_per_query`` is
    called for each query id found in the relevance dict.

    Args:
        relevance_file: A dumped (pickled) relevance dict file.
        topic_file: Single-line-format topic file: ``qid term1 term2 ...``.
        corpus_file: Corpus corresponding to the docno list file, format
            ``docno\\tdoclen\\tterm1 term2``.
        topk_corpus_file: Corpus holding only the top-k terms of each
            document; same format as ``corpus_file``.
        embedding_file: Output file of the word2vec toolkit, loaded with
            ``binary=True``.
        stop_file: Stopword list file, one word per line.
        sim_output_path: Forwarded to ``sim_mat_and_kernel_per_query``.
        kernel_output_path: Forwarded to ``sim_mat_and_kernel_per_query``.
        kernel_mu_list: Forwarded to ``sim_mat_and_kernel_per_query``.
        kernel_sigma_list: Forwarded to ``sim_mat_and_kernel_per_query``.
        topk_supervised: Number of top-n documents for each query.
        d2d: True for NPRF, False for simple query-document matching as used
            by e.g. DRMM, K-NRM.
        test: Controls the temporary output; set to False.

    Returns:
        None.
    """
    relevance = load_pickle(relevance_file)
    topics = parse_topic(topic_file)
    docs = parse_corpus(corpus_file)
    topk_docs = parse_corpus(topk_corpus_file)
    vectors = KeyedVectors.load_word2vec_format(embedding_file, binary=True)
    stopwords = parse_stoplist(stop_file)

    for qid in relevance.keys():
        sim_mat_and_kernel_per_query(
            relevance, topics, docs, topk_docs, vectors, stopwords,
            sim_output_path, kernel_output_path,
            kernel_mu_list, kernel_sigma_list,
            topk_supervised, d2d, test, qid)
示例14: check_for_similar_words
# 需要导入模块: from gensim.models.keyedvectors import KeyedVectors [as 别名]
# 或者: from gensim.models.keyedvectors.KeyedVectors import load_word2vec_format [as 别名]
def check_for_similar_words(self,):
    """Print the most similar words for a fixed set of Hindi probe words.

    The vectors are loaded from ``../../temp_results/word2vec_hindi.txt`` in
    text word2vec format and each probe word is reported through
    ``self.pretty_print``.
    """
    from gensim.models.keyedvectors import KeyedVectors

    hindi_vectors = KeyedVectors.load_word2vec_format(
        "../../temp_results/word2vec_hindi.txt", binary=False)

    probe_words = (u"भारत", u"सिंह", u"क्रिकेट", u"रुपये")
    for probe in probe_words:
        self.pretty_print(probe, hindi_vectors.most_similar(probe))
示例15: add_embedding
# 需要导入模块: from gensim.models.keyedvectors import KeyedVectors [as 别名]
# 或者: from gensim.models.keyedvectors.KeyedVectors import load_word2vec_format [as 别名]
def add_embedding(self, property, embedding_file):
    """Load ``embedding_file`` and register it under ``property``.

    The file is read with ``KeyedVectors.load_word2vec_format`` using
    ``self.binary`` as the ``binary`` flag, and the result is stored in
    ``self.embedding_files[property]``.
    """
    loaded_vectors = KeyedVectors.load_word2vec_format(
        embedding_file, binary=self.binary)
    self.embedding_files[property] = loaded_vectors