This article collects typical usage examples of the Python method gensim.models.word2vec.Word2Vec.load_word2vec_format. If you are wondering what Word2Vec.load_word2vec_format does or how to use it, the curated code examples below should help. You can also read further about the class this method belongs to, gensim.models.word2vec.Word2Vec.
Four code examples of Word2Vec.load_word2vec_format are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
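Before the examples, here is a minimal sketch of the call itself. The path vectors.bin is hypothetical; any model saved in the C word2vec binary format (such as the GoogleNews vectors) would do. Note that gensim 1.0 deprecated this method on Word2Vec and moved it to KeyedVectors.load_word2vec_format, so prefer that on newer versions.

from gensim.models.word2vec import Word2Vec

# Hypothetical path to a model in the C word2vec binary format.
model = Word2Vec.load_word2vec_format("vectors.bin", binary=True)
print(model["king"].shape)               # the raw vector for a word
print(model.most_similar("king", topn=3))

# On gensim >= 1.0 the equivalent loader is:
# from gensim.models import KeyedVectors
# kv = KeyedVectors.load_word2vec_format("vectors.bin", binary=True)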
Example 1: loadW2V
# Required import: from gensim.models.word2vec import Word2Vec [as alias]
# Or: from gensim.models.word2vec.Word2Vec import load_word2vec_format [as alias]
# Assumed imports for this snippet: gzip, gc, numpy as np
def loadW2V(self, emb_path, type="bin"):
    print("Loading W2V data...")
    num_keys = 0
    if type == "textgz":
        # this seems faster than gensim non-binary load
        for line in gzip.open(emb_path):
            l = line.strip().split()
            st = l[0].lower()
            # parse the components as floats (a bare np.asarray would keep strings)
            self.pre_emb[st] = np.asarray(l[1:], dtype=np.float32)
        num_keys = len(self.pre_emb)
    elif type == "text":  # elif, so the binary branch below does not also run
        # this seems faster than gensim non-binary load
        for line in open(emb_path):
            l = line.strip().split()
            st = l[0].lower()
            self.pre_emb[st] = np.asarray(l[1:], dtype=np.float32)
        num_keys = len(self.pre_emb)
    else:
        self.pre_emb = Word2Vec.load_word2vec_format(emb_path, binary=True)
        self.pre_emb.init_sims(replace=True)
        num_keys = len(self.pre_emb.vocab)
    print("loaded word2vec len ", num_keys)
    gc.collect()
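The "text" and "textgz" branches above expect one token per line followed by its vector components, space-separated. A minimal sketch of parsing such a line (toy data, matching the float conversion noted in the code):

import numpy as np

# One line of the plain-text embedding format: token, then components.
line = b"apple 0.12 -0.05 0.33\n"
parts = line.strip().split()
token = parts[0].lower()
vec = np.asarray(parts[1:], dtype=np.float32)
print(token, vec)   # b'apple' [ 0.12 -0.05  0.33]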
Example 2: loadEmbeddings
# Required import: from gensim.models.word2vec import Word2Vec [as alias]
# Or: from gensim.models.word2vec.Word2Vec import load_word2vec_format [as alias]
# Assumed imports for this snippet: os, codecs, numpy as np
def loadEmbeddings(self, filepath, data_path, vocab_size, binary_val):
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    embed_short = os.path.normpath("%s/embed.dat" % data_path)
    if not os.path.exists(embed_short):
        print("Caching word embeddings in memmapped format...")
        print(binary_val, filepath)
        wv = Word2Vec.load_word2vec_format(filepath, binary=binary_val)
        # copy the embedding matrix into a disk-backed memmap
        fp = np.memmap(embed_short, dtype=np.double, mode='w+', shape=wv.syn0.shape)
        fp[:] = wv.syn0[:]
        # write the vocabulary, one word per line, ordered by row index
        with open(os.path.normpath("%s/embed.vocab" % data_path), "w", encoding='utf-8') as f:
            for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                f.write('%s\n' % w)
        del fp, wv
    self.W = np.memmap(os.path.normpath("%s/embed.dat" % data_path), dtype=np.double,
                       mode="r", shape=(vocab_size, self.embedding_size))
    with codecs.open(os.path.normpath("%s/embed.vocab" % data_path), 'r', 'utf-8') as f:
        vocab_list = [x.strip() for x in f.readlines()]
    self.vocab_dict = {w: k for k, w in enumerate(vocab_list)}
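Once cached, the embeddings can be reopened lazily: np.memmap maps the file into memory, so a lookup reads only the rows it touches. A sketch of the read side, with hypothetical paths and sizes standing in for the caller's arguments:

import numpy as np

# Hypothetical sizes; in loadEmbeddings they come from the caller and the model.
vocab_size, embedding_size = 100000, 300
W = np.memmap("data/embed.dat", dtype=np.double, mode="r",
              shape=(vocab_size, embedding_size))
with open("data/embed.vocab", encoding="utf-8") as f:
    vocab_dict = {w.strip(): i for i, w in enumerate(f)}
vec = W[vocab_dict["king"]]   # a single row read from disk, not the whole matrix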
Example 3: check
# Required import: from gensim.models.word2vec import Word2Vec [as alias]
# Or: from gensim.models.word2vec.Word2Vec import load_word2vec_format [as alias]
# Assumed import for this snippet: a project-local ElapsedTimeIndicator helper
def check(self, model):
    assert model.contains(['topics_term', 'sentences_term'])
    with ElapsedTimeIndicator('load ' + self._word2vec_model + ' [{elapsed}]') as indicator:
        self._word2vec = Word2Vec.load_word2vec_format(self._word2vec_model, binary=True)
        self._word2vec.init_sims(replace=True)
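init_sims(replace=True), used here and in the other examples, discards the raw vectors and keeps only L2-normalized ones, saving memory and making cosine similarity a plain dot product. A rough sketch of the effect (not gensim's actual implementation):

import numpy as np

# What init_sims(replace=True) does, in effect: L2-normalize each row in place.
syn0 = np.random.rand(5, 3)
syn0 /= np.linalg.norm(syn0, axis=1, keepdims=True)
print(np.linalg.norm(syn0, axis=1))   # ~1.0 for every row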
Example 4: loadW2V
# Required import: from gensim.models.word2vec import Word2Vec [as alias]
# Or: from gensim.models.word2vec.Word2Vec import load_word2vec_format [as alias]
# Assumed imports for this snippet: gzip, gc, numpy as np
def loadW2V(self, emb_path, type="textgz"):
    print("Loading W2V data...")
    num_keys = 0
    if type == "textgz":
        # this seems faster than gensim non-binary load
        for line in gzip.open(emb_path):
            l = line.strip().split()
            # parse the components as floats (a bare np.asarray would keep strings)
            self.pre_emb[l[0]] = np.asarray(l[1:], dtype=np.float32)
        num_keys = len(self.pre_emb)
    else:
        self.pre_emb = Word2Vec.load_word2vec_format(emb_path, binary=True)
        self.pre_emb.init_sims(replace=True)
        num_keys = len(self.pre_emb.vocab)
    print("loaded word2vec len ", num_keys)
    gc.collect()
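For completeness, a toy input for the "textgz" branch above can be produced like this (the file name is hypothetical):

import gzip

# A minimal gzipped text embedding file the branch above can parse.
with gzip.open("toy_vectors.txt.gz", "wb") as f:
    f.write(b"apple 0.1 0.2 0.3\n")
    f.write(b"pear -0.4 0.5 0.6\n")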