This article collects typical usage examples of the Python method gensim.models.KeyedVectors.load_word2vec_format. If you have been wondering what KeyedVectors.load_word2vec_format does and how to use it in practice, the curated examples below may help. You can also read further about its containing class, gensim.models.KeyedVectors.
Fifteen code examples of KeyedVectors.load_word2vec_format follow, ordered by popularity.
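None of the snippets shows the call in isolation, so here is a minimal sketch of the basic pattern first. The file names are placeholders; binary=True reads the C binary .bin format, binary=False (the default) reads the plain-text format:
from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)
small = KeyedVectors.load_word2vec_format('vectors.txt', binary=False, limit=50000)  # cap how many vectors are read
print(vectors['king'].shape)         # one word vector
print(vectors.most_similar('king'))  # nearest neighbours by cosine similarity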
Example 1: load_word2vec_embeddings
# Required import: from gensim.models import KeyedVectors [as alias]
# Or: from gensim.models.KeyedVectors import load_word2vec_format [as alias]
# Also needed here: import numpy as np
def load_word2vec_embeddings(filepath, tokenizer, max_features, embedding_size):
    model = KeyedVectors.load_word2vec_format(filepath, binary=True)
    # Match the random initialisation to the pre-trained distribution.
    # (model.vectors is the gensim 4 name; the original used model.wv.syn0.)
    emb_mean, emb_std = model.vectors.mean(), model.vectors.std()
    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
    for word, i in word_index.items():
        # word_index is 1-based, so guard against the matrix bound (nb_words)
        # rather than max_features to avoid an off-by-one IndexError.
        if i >= nb_words:
            continue
        try:
            embedding_vector = model[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            continue
    return embedding_matrix
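A hedged usage sketch for this helper, assuming a fitted Keras Tokenizer and the 300-dimensional GoogleNews binary vectors; the import path, file path, and train_texts are placeholders, not part of the original example:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer  # assumed Keras location

tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(train_texts)  # train_texts: your corpus, assumed defined
embedding_matrix = load_word2vec_embeddings(
    'GoogleNews-vectors-negative300.bin', tokenizer,
    max_features=50000, embedding_size=300)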
Example 2: load_poincare_model
# Required import: from gensim.models import KeyedVectors [as alias]
# Or: from gensim.models.KeyedVectors import load_word2vec_format [as alias]
# Also needed here: from gensim.models.poincare import PoincareKeyedVectors, PoincareModel
def load_poincare_model(path, word2vec_format=True, binary=False):
    """Load a pre-trained Poincare embedding model.

    :param path: path to the file holding the pre-trained Poincare embeddings
    :param word2vec_format: whether to load from word2vec format (default: True)
    :param binary: whether the word2vec-format file is binary (default: False)
    :return: a pre-trained Poincare embedding model
    :type path: str
    :type word2vec_format: bool
    :type binary: bool
    :rtype: gensim.models.poincare.PoincareKeyedVectors
    """
    if word2vec_format:
        return PoincareKeyedVectors.load_word2vec_format(path, binary=binary)
    # Otherwise the file is a saved PoincareModel; return its keyed vectors.
    return PoincareModel.load(path).kv
Example 3: __init__
# Required import: from gensim.models import KeyedVectors [as alias]
# Or: from gensim.models.KeyedVectors import load_word2vec_format [as alias]
def __init__(self, all_texts):
    # Load the pre-trained model.
    print("Reading the pretrained model for Word2VecEmbedder")
    self.sk_model = KeyedVectors.load_word2vec_format(
        '/data/verejne/datautils/embedding_data/slovak.vec',
        encoding='utf-8', unicode_errors='ignore')
    # .key_to_index is the gensim 4 name (formerly .vocab).
    print("Model contains", len(self.sk_model.key_to_index), "tokens")
    print(self.sk_model.similarity("mesto", "mesta"))
    self.dimension = len(self.sk_model["auto"])
    print("sídlisk" in self.sk_model)
    print("sídlisk".encode('utf8') in self.sk_model)
    print("Dimension of embedding of 'auto' is", self.dimension)
    # Build a word-frequency table over the corpus.
    if all_texts is None:
        return
    for text in all_texts:
        self.add_text_to_corpus(text)
    self.print_corpus_stats()
Example 4: _get_embedding_matrix
# Required import: from gensim.models import KeyedVectors [as alias]
# Or: from gensim.models.KeyedVectors import load_word2vec_format [as alias]
# Also needed here: import numpy as np
def _get_embedding_matrix(self, tokenizer):
    model = KeyedVectors.load_word2vec_format(self.pretrained_filepath, binary=True)
    # model.vectors is the gensim 4 name; the original used model.syn0.
    emb_mean, emb_std = model.vectors.mean(), model.vectors.std()
    word_index = tokenizer.word_index
    nb_words = min(self.max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, self.embedding_size))
    for word, i in word_index.items():
        if i >= nb_words:  # guard the matrix bound (word_index is 1-based)
            continue
        try:
            embedding_matrix[i] = model[word]
        except KeyError:
            continue
    return embedding_matrix
Example 5: word2vec
# Required import: from gensim.models import KeyedVectors [as alias]
# Or: from gensim.models.KeyedVectors import load_word2vec_format [as alias]
# Also needed here: import os; from os import path; import numpy as np;
# from gensim.scripts.glove2word2vec import glove2word2vec
def word2vec(word2vec_path):
    # Download and convert the word2vec data if it is not present yet.
    if not path.exists(word2vec_path):
        glove_file_path = get_glove_file_path()
        print('Converting GloVe to word2vec...', end='')
        glove2word2vec(glove_file_path, word2vec_path)  # convert GloVe to word2vec format
        os.remove(glove_file_path)  # keep only the word2vec file
        print('Done')
    print('Reading word2vec data... ', end='')
    model = KeyedVectors.load_word2vec_format(word2vec_path)
    print('Done')

    def get_word_vector(word):
        try:
            return model[word]
        except KeyError:
            # Out-of-vocabulary words map to the zero vector.
            return np.zeros(model.vector_size)

    return get_word_vector
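The function returns a closure, so subsequent lookups are plain calls; a brief usage sketch (the path is a placeholder):
get_vector = word2vec('data/glove.word2vec.txt')
vec = get_vector('apple')       # 1-D numpy array of length model.vector_size
unk = get_vector('qwertyzxcv')  # zero vector for out-of-vocabulary words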
Example 6: loadEmbeddings
# Required import: from gensim.models import KeyedVectors [as alias]
# Or: from gensim.models.KeyedVectors import load_word2vec_format [as alias]
# Also needed here: import os; import codecs; import numpy as np
def loadEmbeddings(self, filepath, data_path, vocab_size, binary_val):
    embed_short = os.path.normpath("%s/embed.dat" % data_path)
    if not os.path.exists(embed_short):
        # Cache the embedding matrix and vocabulary on disk once.
        print("Caching word embeddings in memmapped format...")
        print(binary_val, filepath)
        wv = KeyedVectors.load_word2vec_format(filepath, binary=binary_val)
        # wv.vectors / wv.index_to_key are the gensim 4 names
        # (formerly wv.syn0 / wv.vocab).
        fp = np.memmap(embed_short, dtype=np.double, mode='w+', shape=wv.vectors.shape)
        fp[:] = wv.vectors[:]
        with open(os.path.normpath("%s/embed.vocab" % data_path), "w",
                  encoding="utf-8") as fp:
            for w in wv.index_to_key:  # words in index order
                fp.write("%s\n" % w)
        del fp, wv
    self.W = np.memmap(os.path.normpath("%s/embed.dat" % data_path),
                       dtype=np.double, mode="r",
                       shape=(vocab_size, self.embedding_size))
    with codecs.open(os.path.normpath("%s/embed.vocab" % data_path), 'r', 'utf-8') as f:
        vocab_list = [x.strip() for x in f.readlines()]
    self.vocab_dict = {w: k for k, w in enumerate(vocab_list)}
Example 7: main
# Required import: from gensim.models import KeyedVectors [as alias]
# Or: from gensim.models.KeyedVectors import load_word2vec_format [as alias]
# Also needed here: import pickle; import numpy as np
def main(opt):
    vocab = pickle.load(open(opt.vocab_path, 'rb'))
    num = len(vocab)
    print(num)
    model = KeyedVectors.load_word2vec_format(opt.embed_weight, binary=True)
    weights_matrix = np.zeros((num, 300))
    words_found = 0
    mask = np.zeros(num, dtype=int)  # 1 where a pre-trained vector exists
    for i, word in enumerate(vocab.idx2word):
        try:
            weights_matrix[i] = model[word]
            words_found += 1
            mask[i] = 1
        except KeyError:
            # Words missing from word2vec get small random vectors.
            weights_matrix[i] = np.random.normal(scale=0.1, size=(300,))
    print(words_found)
    np.save("./embed/f30kword2vec300dim_3.npy", weights_matrix)
    np.save("./embed/f30kword2vecmask_3.npy", mask)
Example 8: get_summary_nlp
# Required import: from gensim.models import KeyedVectors [as alias]
# Or: from gensim.models.KeyedVectors import load_word2vec_format [as alias]
def get_summary_nlp(self):
    """
    Return a summary of an NLP model file.
    :return: list of summary lines
    """
    ret = []
    try:
        word2vec = KeyedVectors.load_word2vec_format(self.filename, binary=False)
        mtime = self._get_mtime()
        dim_vectors = word2vec.vector_size
        word_count = len(word2vec.vectors)
        ret.append("---------------------------")
        ret.append("Summary for NLP model file:")
        ret.append("---------------------------")
        ret.append(self.FILE_NAME_OUTPUT.format(self.filename))
        ret.append(self.LAST_MODIFICATION_TIME.format(mtime))
        ret.append(self.FEATURE_DIMENSION.format(dim_vectors))
        # Note: the word count is reported through the NUM_SENTENCES template.
        ret.append(self.NUM_SENTENCES.format(word_count))
        ret.append("\n")
    except Exception as e:
        ret.append("Failed to read NLP model {}.".format(self.filename))
        ret.append("Error: {}".format(e))
    return ret
Example 9: load
# Required import: from gensim.models import KeyedVectors [as alias]
# Or: from gensim.models.KeyedVectors import load_word2vec_format [as alias]
# Also needed here: from gensim.models import FastText; a module-level logger
def load(cls, np2vec_model_file, binary=False, word_ngrams=0, word2vec_format=True):
    """
    Load the np2vec model.

    Args:
        np2vec_model_file (str): the file containing the np2vec model to load
        binary (bool): whether the np2vec model to load is in binary format
        word_ngrams (int {0, 1}): if 1, the model to load uses word vectors
            with subword (ngram) information
        word2vec_format (bool): whether the model to load was stored in the
            original word2vec format

    Returns:
        the loaded np2vec model
    """
    if word_ngrams == 0:
        if word2vec_format:
            return KeyedVectors.load_word2vec_format(np2vec_model_file, binary=binary)
        # Native gensim format, memory-mapped read-only.
        return KeyedVectors.load(np2vec_model_file, mmap="r")
    if word_ngrams == 1:
        return FastText.load(np2vec_model_file)
    logger.error("invalid value for 'word_ngrams'")
    return None
Example 10: _prepare_embedding_index
# Required import: from gensim.models import KeyedVectors [as alias]
# Or: from gensim.models.KeyedVectors import load_word2vec_format [as alias]
def _prepare_embedding_index(self, binary=True):
    """Return an embedding index for pre-trained token embeddings.

    For the pre-trained word embeddings at `self.filepath`, returns a
    dictionary mapping words to their embeddings (an 'embedding index').
    If `self.debug` is True, only the first ten thousand vectors are loaded.

    Args:
        binary (bool): True if the pre-trained embeddings are in C binary
            format, False if they are in C text format. Defaults to True.

    Returns:
        Dictionary mapping words to pre-trained word embeddings, known as
        an 'embedding index'.
    """
    limit = 10000 if self.__dict__.get("debug", False) else None
    vectors = KeyedVectors.load_word2vec_format(self.filepath, binary=binary, limit=limit)
    # .key_to_index is the gensim 4 name (formerly .vocab).
    embedding_idx = {word: vectors[word] for word in vectors.key_to_index}
    return embedding_idx
Example 11: get_word_embeddings
# Required import: from gensim.models import KeyedVectors [as alias]
# Or: from gensim.models.KeyedVectors import load_word2vec_format [as alias]
# Also needed here: import numpy as np; from tqdm import tqdm
def get_word_embeddings(w2vfile, word_to_index, embedsize=300):
    '''
    For each word in our vocabulary, get the word2vec encoding of the word.
    Inputs:
        w2vfile (string) : path to the file containing the pre-trained word embeddings
        word_to_index (dict) : mapping from each vocabulary word to its index
        embedsize (int) : length of each word vector
    Returns:
        word_embeddings : dictionary mapping each word to its embedding
    '''
    word_embeddings = {}
    if w2vfile.endswith('.txt'):
        # Plain-text embeddings: parse one "word v1 v2 ..." line at a time.
        with open(w2vfile) as f:
            for line in tqdm(f):
                values = line.split(" ")
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                if word in word_to_index:
                    word_embeddings[word] = coefs
    elif w2vfile.endswith('.bin'):
        # Binary embeddings: let gensim parse them (capped at 1M vectors).
        word2vec = KeyedVectors.load_word2vec_format(w2vfile, binary=True, limit=1000000)
        for word in tqdm(word_to_index):
            try:
                word_embeddings[word] = word2vec[word.lower()]
            except KeyError:
                pass
    else:
        print("Can't load word embeddings.")
        exit(-1)
    print('Found {0}/{1} word vectors.'.format(len(word_embeddings), len(word_to_index)))
    if len(word_to_index) > len(word_embeddings):
        print('Initializing remaining {} word vectors with zeros.'.format(
            len(word_to_index) - len(word_embeddings)))
        for word in word_to_index:
            if word not in word_embeddings:
                word_embeddings[word] = np.zeros((embedsize,))
    return word_embeddings
Example 12: load_word2vec_model
# Required import: from gensim.models import KeyedVectors [as alias]
# Or: from gensim.models.KeyedVectors import load_word2vec_format [as alias]
def load_word2vec_model(path, binary=True):
    """Load a pre-trained Word2Vec model.

    :param path: path of the file of the pre-trained Word2Vec model
    :param binary: whether the file is in binary format (default: True)
    :return: a pre-trained Word2Vec model
    :type path: str
    :type binary: bool
    :rtype: gensim.models.keyedvectors.KeyedVectors
    """
    return KeyedVectors.load_word2vec_format(path, binary=binary)
Example 13: load_text
# Required import: from gensim.models import KeyedVectors [as alias]
# Or: from gensim.models.KeyedVectors import load_word2vec_format [as alias]
def load_text(self):
    try:
        # The second positional argument is fvocab, a separate vocabulary
        # file carrying word frequencies.
        model = KeyedVectors.load_word2vec_format(self.model_path, self.vocab_path, binary=False)
    except FileNotFoundError:
        model = None
    return model
Example 14: load_bin_word2vec
# Required import: from gensim.models import KeyedVectors [as alias]
# Or: from gensim.models.KeyedVectors import load_word2vec_format [as alias]
# Also assumed here: a module-level logger `log` and the _select_vectors helper
# (a sketch of the helper follows Example 15)
def load_bin_word2vec(word_index, word2vec_path, max_index):
    log.info('Begin load word vector from bin...')
    word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    embeddings = _select_vectors(word2vec, word_index, max_index)
    log.info('End load word vector from bin...')
    return embeddings
# Load word vectors (select only the vectors needed, from your own pre-trained text vectors)
Example 15: load_text_vector
# Required import: from gensim.models import KeyedVectors [as alias]
# Or: from gensim.models.KeyedVectors import load_word2vec_format [as alias]
def load_text_vector(word_index, word2vec_path, max_index):
    log.info('Begin load word vector from text...')
    word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=False)
    embeddings = _select_vectors(word2vec, word_index, max_index)
    log.info('End load word vector from text...')
    return embeddings