本文整理汇总了Python中gensim.models.word2vec.Word2Vec.load_word2vec_format方法的典型用法代码示例。如果您正苦于以下问题:Python Word2Vec.load_word2vec_format方法的具体用法?Python Word2Vec.load_word2vec_format怎么用?Python Word2Vec.load_word2vec_format使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类gensim.models.word2vec.Word2Vec
的用法示例。
在下文中一共展示了Word2Vec.load_word2vec_format方法的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: loadW2V
# 需要导入模块: from gensim.models.word2vec import Word2Vec [as 别名]
# 或者: from gensim.models.word2vec.Word2Vec import load_word2vec_format [as 别名]
def loadW2V(self, emb_path, type="bin"):
    """Populate self.pre_emb with pre-trained word vectors.

    Parameters
    ----------
    emb_path : str
        Path to the embedding file.
    type : str
        "textgz" for a gzipped text file, "text" for a plain text file;
        any other value (e.g. the default "bin") loads a binary word2vec
        file through gensim, replacing self.pre_emb entirely.

    Side effects: mutates/replaces self.pre_emb and prints a summary.
    """
    print("Loading W2V data...")
    num_keys = 0
    if type == "textgz":
        # Open in text mode ('rt'): binary mode would produce bytes keys
        # on Python 3 that never match str lookups downstream.
        with gzip.open(emb_path, "rt") as f:
            for line in f:
                parts = line.strip().split()
                # Store the vector as floats, not raw text tokens;
                # a string array is useless for any arithmetic later.
                self.pre_emb[parts[0].lower()] = np.asarray(parts[1:], dtype=np.float32)
        num_keys = len(self.pre_emb)
    elif type == "text":
        # BUG FIX: this was a separate `if`, so after the textgz branch
        # ran, the `else` below ALSO executed and the binary gensim load
        # clobbered the dict that was just built.  `elif` makes the three
        # branches mutually exclusive.
        with open(emb_path) as f:
            for line in f:
                parts = line.strip().split()
                self.pre_emb[parts[0].lower()] = np.asarray(parts[1:], dtype=np.float32)
        num_keys = len(self.pre_emb)
    else:
        self.pre_emb = Word2Vec.load_word2vec_format(emb_path, binary=True)
        self.pre_emb.init_sims(replace=True)
        num_keys = len(self.pre_emb.vocab)
    print("loaded word2vec len ", num_keys)
    gc.collect()
示例2: loadEmbeddings
# 需要导入模块: from gensim.models.word2vec import Word2Vec [as 别名]
# 或者: from gensim.models.word2vec.Word2Vec import load_word2vec_format [as 别名]
def loadEmbeddings(self, filepath, data_path, vocab_size, binary_val):
    """Memory-map pre-trained word embeddings, caching them on disk.

    On the first call the word2vec model at `filepath` is loaded
    (binary format when `binary_val` is True) and two cache files are
    written under `data_path`: embed.dat (a float64 memmap of the
    embedding matrix) and embed.vocab (one word per line, in index
    order).  Subsequent calls reuse the cache and never touch gensim.

    Sets self.W to a read-only memmap of shape
    (vocab_size, self.embedding_size) and self.vocab_dict to a
    word -> row-index mapping.
    """
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    embed_short = os.path.normpath("%s/embed.dat" % data_path)
    if not os.path.exists(embed_short):
        print("Caching word embeddings in memmapped format...")
        print(binary_val, filepath)
        wv = Word2Vec.load_word2vec_format("%s" % (filepath), binary=binary_val)
        # BUG FIX: the memmap and the vocab file handle previously shared
        # the name `fp`, so the memmap lost its only reference mid-function
        # and was never explicitly flushed; give each resource its own
        # name and flush the memmap before moving on.
        mm = np.memmap(embed_short, dtype=np.double, mode='w+', shape=wv.syn0.shape)
        mm[:] = wv.syn0[:]
        mm.flush()
        with open(os.path.normpath("%s/embed.vocab" % data_path), "w", encoding='utf-8') as vocab_file:
            # wv.vocab maps word -> entry with an .index; sort by index so
            # line number in embed.vocab equals the row in embed.dat.
            for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                vocab_file.write('%s\n' % w)
        # Release the large objects before the read-only mapping below.
        del mm, wv
    self.W = np.memmap(os.path.normpath("%s/embed.dat" % data_path), dtype=np.double, mode="r", shape=(vocab_size, self.embedding_size))
    # Read the vocab back with the same encoding it was written with;
    # plain open(..., encoding=...) replaces the legacy codecs.open call.
    with open(os.path.normpath("%s/embed.vocab" % data_path), 'r', encoding='utf-8') as f:
        vocab_list = [x.strip() for x in f.readlines()]
    self.vocab_dict = {w: k for k, w in enumerate(vocab_list)}
示例3: check
# 需要导入模块: from gensim.models.word2vec import Word2Vec [as 别名]
# 或者: from gensim.models.word2vec.Word2Vec import load_word2vec_format [as 别名]
def check(self, model):
    """Validate the model, then load and L2-normalize the word2vec vectors.

    Requires the model to contain both 'topics_term' and
    'sentences_term'; loads the binary word2vec file named by
    self._word2vec_model into self._word2vec, timing the load.
    """
    # Precondition on the incoming model: both term tables must exist.
    assert model.contains(['topics_term', 'sentences_term'])
    label = 'load ' + self._word2vec_model + ' [{elapsed}]'
    with ElapsedTimeIndicator(label):
        self._word2vec = Word2Vec.load_word2vec_format(self._word2vec_model, binary=True)
        # replace=True normalizes in place, halving memory use.
        self._word2vec.init_sims(replace=True)
示例4: loadW2V
# 需要导入模块: from gensim.models.word2vec import Word2Vec [as 别名]
# 或者: from gensim.models.word2vec.Word2Vec import load_word2vec_format [as 别名]
def loadW2V(self, emb_path, type="textgz"):
    """Populate self.pre_emb with pre-trained word vectors.

    Parameters
    ----------
    emb_path : str
        Path to the embedding file.
    type : str
        "textgz" (default) reads a gzipped text file; any other value
        loads a binary word2vec file through gensim, replacing
        self.pre_emb entirely.

    Side effects: mutates/replaces self.pre_emb and prints a summary.
    """
    print("Loading W2V data...")
    num_keys = 0
    if type == "textgz":
        # Open in text mode ('rt'): binary mode would produce bytes keys
        # on Python 3 that never match str lookups downstream.  The
        # `with` block also guarantees the handle is closed.
        with gzip.open(emb_path, "rt") as f:
            for line in f:
                parts = line.strip().split()
                # Store the vector as floats, not raw text tokens;
                # a string array is useless for any arithmetic later.
                self.pre_emb[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
        num_keys = len(self.pre_emb)
    else:
        self.pre_emb = Word2Vec.load_word2vec_format(emb_path, binary=True)
        self.pre_emb.init_sims(replace=True)
        num_keys = len(self.pre_emb.vocab)
    print("loaded word2vec len ", num_keys)
    gc.collect()