This article collects typical usage examples of the Python method gensim.models.word2vec. If you have been wondering how models.word2vec is used in practice, the curated code examples below may help. You can also explore further usage examples for its containing package, gensim.models.
The following presents 12 code examples of models.word2vec, sorted by popularity by default.
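Before the examples, here is a minimal end-to-end sketch of training and saving a model that the loaders below can consume. It assumes gensim 3.x, matching the pre-4.0 API these examples use (size=, model.wv.vocab, init_sims); corpus.txt is a hypothetical whitespace-tokenized file with one sentence per line.

from gensim.models import word2vec

# Train a small model from a plain-text corpus, one sentence per line.
sentences = word2vec.LineSentence('corpus.txt')
model = word2vec.Word2Vec(sentences, size=100, min_count=5, workers=4)

# Save in gensim's native format (what Word2Vec.load() expects) ...
model.save('word2vec_model')
# ... or in the binary C word2vec format (what load_word2vec_format expects).
model.wv.save_word2vec_format('word2vec_model.bin', binary=True)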
Example 1: load_model
# Required imports: from gensim import models [as alias]
# Or: from gensim.models import word2vec [as alias]
def load_model(self, path):
    """
    Load a trained word2vec model.
    Args:
        path: the path of the model.
    """
    try:
        self.model = models.Word2Vec.load(path)  # current loading method (gensim native format)
    except FileNotFoundError as file_not_found_err:
        print("[Gensim] FileNotFoundError", file_not_found_err)
        exit()
    except UnicodeDecodeError as unicode_decode_err:
        print("[Gensim] UnicodeDecodeError", unicode_decode_err)
        # old loading method: treat the file as a binary C-format word2vec model
        self.model = models.KeyedVectors.load_word2vec_format(path, binary=True)
    except Exception as ex:
        print("[Gensim] Exception", ex)
        exit()
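The design tries the native gensim format first and only falls back to the binary C format when the bytes fail to decode. A hedged usage sketch (the owning class is not shown in this excerpt; W2VLoader is a hypothetical stand-in):

loader = W2VLoader()
loader.load_model('word2vec_model')      # native gensim format
loader.load_model('word2vec_model.bin')  # binary C format, caught by the fallback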
Example 2: create_metadata_file
# Required imports: from gensim import models [as alias]
# Or: from gensim.models import word2vec [as alias]
def create_metadata_file(word2vec_file, output_file):
    """
    Create the metadata file based on the corpus file (used for the embedding visualization later).
    Args:
        word2vec_file: The word2vec file
        output_file: The metadata file path
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = gensim.models.Word2Vec.load(word2vec_file)
    word2idx = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    word2idx_sorted = [(k, word2idx[k]) for k in sorted(word2idx, key=word2idx.get, reverse=False)]
    with open(output_file, 'w+') as fout:
        for word in word2idx_sorted:
            if word[0] is None:
                # An empty line must be replaced by a placeholder, or it triggers a TensorBoard bug.
                print("[Warning] Empty line found; writing a placeholder instead.")
                fout.write('<Empty Line>' + '\n')
            else:
                fout.write(word[0] + '\n')
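The metadata file feeds the TensorBoard embedding projector: row i of the metadata must line up with row i of the embedding matrix, which is why the vocabulary is written in index order. A minimal usage sketch (file names are illustrative):

create_metadata_file('word2vec_model', 'metadata.tsv')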
Example 3: load_word2vec_matrix
# Required imports: from gensim import models [as alias]
# Or: from gensim.models import word2vec [as alias]
def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.
    Args:
        word2vec_file: The word2vec file
    Returns:
        The word2vec model matrix
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = gensim.models.Word2Vec.load(word2vec_file)
    vocab_size = model.wv.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            embedding_matrix[value] = model.wv[key]  # model[key] is deprecated in newer gensim
    return vocab_size, embedding_size, embedding_matrix
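One way to consume the returned matrix is as frozen initial weights of an embedding layer. A sketch with tf.keras (an assumption; the surrounding project may wire it differently):

import tensorflow as tf

vocab_size, embedding_size, embedding_matrix = load_word2vec_matrix('word2vec_model')
embedding_layer = tf.keras.layers.Embedding(
    vocab_size, embedding_size,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False)  # freeze the pretrained vectors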
Example 4: load_data_and_labels
# Required imports: from gensim import models [as alias]
# Or: from gensim.models import word2vec [as alias]
def load_data_and_labels(data_file, word2vec_file):
    """
    Load research data from files, split the data into words and generate labels.
    Return split sentences, labels and the max sentence length of the research data.
    Args:
        data_file: The research data
        word2vec_file: The word2vec model file
    Returns:
        The class Data
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    # Load word2vec file
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = word2vec.Word2Vec.load(word2vec_file)
    # Load data from files and split by words
    data = data_word2vec(input_file=data_file, word2vec_model=model)
    # plot_seq_len(data_file, data)
    return data
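data_word2vec is a project-internal helper not shown here, so only the call shape can be sketched (paths are illustrative):

data = load_data_and_labels('train_data.json', 'word2vec_model')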
Example 5: train_model
# Required imports: from gensim import models [as alias]
# Or: from gensim.models import word2vec [as alias]
def train_model(file_input, file_output):
    # strip the extension to get the stem used for the intermediate files
    file_intermediate = os.path.splitext(file_input)[0]
    process_corpus_extraction(
        file_input, file_intermediate + '.extracted')
    process_chinese_filtering(
        file_intermediate + '.extracted',
        file_intermediate + '.filtered')
    process_chinese_transformation(
        file_intermediate + '.filtered',
        file_intermediate + '.transformed')
    # word segmentation (assumed helper name; this step writes the '.segmented' file)
    process_chinese_segmentation(
        file_intermediate + '.transformed',
        file_intermediate + '.segmented')
    # we can train either word2vec or doc2vec
    # process_word_training(
    #     file_intermediate + '.segmented', file_output)
    process_doc_training(
        file_intermediate + '.segmented', file_output)
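A hedged usage sketch; the process_* helpers are project-internal and the file names are illustrative:

train_model('corpus_dump.txt', 'doc2vec_model')
# produces corpus_dump.extracted / .filtered / .transformed / .segmented along the way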
Example 6: load_model
# Required imports: from gensim import models [as alias]
# Or: from gensim.models import word2vec [as alias]
def load_model(self, save_model_name):
    """
    Load a model into the object (self.model).
    """
    self.model = word2vec.Word2Vec.load(save_model_name)
    self.len_vector = self.model.trainables.layer1_size  # embedding dimensionality (gensim 3.x internals)
    try:
        self.renew_label_vec()
    except Exception:
        self.safe_renew_label_vec()
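self.model.trainables.layer1_size reaches into gensim 3.x internals; model.vector_size reads the same dimensionality through the public API and also survives gensim 4.x (a version-portability note, not part of the original code):

self.len_vector = self.model.vector_size  # portable across gensim 3.x and 4.x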
Example 7: safe_nlp_vector
# Required imports: from gensim import models [as alias]
# Or: from gensim.models import word2vec [as alias]
def safe_nlp_vector(self, words):
    """
    Parameters
    ----------
    words : str or list of str
        the word bag
    Returns
    ----------
    ndarray(float)
        the corresponding vectors of the words in the word bag;
        each vector contains the similarities calculated by word2vec and WordNet.
    """
    if isinstance(words, string_types):
        synonym = self.synonym_label(words)
        similarity = self.similarity_label(words)
    else:
        synonym = np.empty((len(self.Label_index), len(words)))
        similarity = np.empty((len(self.Label_index), len(words)))
        for i in range(len(words)):
            try:
                synonym[:, i] = self.synonym_label(words[i])
            except Exception:  # e.g. out-of-vocabulary word: fall back to a zero column
                synonym[:, i] = np.zeros(len(self.Label_index))
            try:
                similarity[:, i] = self.similarity_label(words[i])[:, 0]
            except Exception:
                similarity[:, i] = np.zeros(len(self.Label_index))
    vector = np.concatenate((similarity, synonym))
    return vector
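The result stacks the two feature blocks row-wise: the first len(self.Label_index) rows hold the word2vec similarities and the next len(self.Label_index) rows the synonym scores, one column per input word. A hypothetical shape check (obj stands in for an instance of the class, which the excerpt does not show):

vec = obj.safe_nlp_vector(['apple', 'pear'])
assert vec.shape == (2 * len(obj.Label_index), 2)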
Example 8: load_data_and_labels
# Required imports: from gensim import models [as alias]
# Or: from gensim.models import word2vec [as alias]
def load_data_and_labels(data_file, num_labels, word2vec_file, data_aug_flag):
    """
    Load research data from files, split the data into words and generate labels.
    Return split sentences, labels and the max sentence length of the research data.
    Args:
        data_file: The research data
        num_labels: The number of classes
        word2vec_file: The word2vec model file
        data_aug_flag: The flag for data augmentation
    Returns:
        The class _Data()
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    # Load word2vec file
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = word2vec.Word2Vec.load(word2vec_file)
    # Load data from files and split by words
    data = data_word2vec(input_file=data_file, num_labels=num_labels, word2vec_model=model)
    if data_aug_flag:
        data = data_augmented(data)
    # plot_seq_len(data_file, data)
    return data
Example 9: process_word_training
# Required imports: from gensim import models [as alias]
# Or: from gensim.models import word2vec [as alias]
def process_word_training(file_input, file_output):
    model = gensim.models.Word2Vec(
        gensim.models.word2vec.LineSentence(file_input),
        size=400, workers=multiprocessing.cpu_count())
    # trim unneeded model memory = use (much) less RAM
    model.init_sims(replace=True)
    model.save(file_output)
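size= and init_sims() belong to the pre-4.0 gensim API. Under gensim 4.x the same training step would look roughly like this (a sketch, not from the original repo):

import multiprocessing
import gensim

model = gensim.models.Word2Vec(
    gensim.models.word2vec.LineSentence(file_input),
    vector_size=400, workers=multiprocessing.cpu_count())  # 'size' was renamed to 'vector_size'
model.save(file_output)  # init_sims() is deprecated and no longer needed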
Example 10: load_data_and_labels
# Required imports: from gensim import models [as alias]
# Or: from gensim.models import word2vec [as alias]
def load_data_and_labels(data_file, num_classes_list, total_classes, word2vec_file, data_aug_flag):
    """
    Load research data from files, split the data into words and generate labels.
    Return split sentences, labels and the max sentence length of the research data.
    Args:
        data_file: The research data
        num_classes_list: <list> The number of classes
        total_classes: The total number of classes
        word2vec_file: The word2vec file
        data_aug_flag: The flag for data augmentation
    Returns:
        The class _Data()
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    # Load word2vec file
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = word2vec.Word2Vec.load(word2vec_file)
    # Load data from files and split by words
    data = data_word2vec(data_file, num_classes_list, total_classes, word2vec_model=model)
    if data_aug_flag:
        data = data_augmented(data)
    # plot_seq_len(data_file, data)
    return data
Example 11: train_Word2Vec
# Required imports: from gensim import models [as alias]
# Or: from gensim.models import word2vec [as alias]
def train_Word2Vec(self, train_corpus, saveflag=False, save_model_name='NLP_model', Size=100, Min_count=5):
    """
    Train the word2vec model on the processed corpus.
    Parameters
    ----------
    train_corpus : str or list of lists
        the name (absolute path) of the training corpus,
        or a list of sentences (a sentence is a list of words).
    saveflag : bool
        whether to save the trained model locally
    save_model_name : str
        the model name (absolute path), default: 'NLP_model'
    Size : int
        the length of the word vectors
    Min_count : int
        the minimum frequency for a word to be kept in the vocabulary
    Returns
    ----------
    Nothing
    """
    print('start training...')
    prev_time = datetime.datetime.now()  # time before training
    self.len_vector = Size
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    if isinstance(train_corpus, string_types):
        sentences = self.txt2sentence(train_corpus)
    else:
        sentences = train_corpus
    self.model = gensim.models.Word2Vec(sentences, size=Size, min_count=Min_count)  # word vectors in R^Size
    if saveflag:
        self.save_model(save_model_name)  # save the model locally
    try:
        self.renew_label_vec()
    except Exception:
        self.safe_renew_label_vec()
    cur_time = datetime.datetime.now()  # time after training
    h, remainder = divmod((cur_time - prev_time).seconds, 3600)
    m, s = divmod(remainder, 60)
    print('done.')
    print("It took %02d:%02d:%02d to train the word2vec model." % (h, m, s))
    # model.wv.save_word2vec_format(save_model_name + ".bin", binary=True)
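A hypothetical usage with an in-memory corpus (NLPModel stands in for the owning class, which the excerpt does not show; Min_count=1 keeps every word of the tiny corpus):

nlp = NLPModel()
corpus = [['我', '喜欢', '自然语言处理'], ['word2vec', '很', '有用']]
nlp.train_Word2Vec(corpus, saveflag=True, save_model_name='NLP_model', Size=100, Min_count=1)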
Example 12: show_Word2Vec
# Required imports: from gensim import models [as alias]
# Or: from gensim.models import word2vec [as alias]
def show_Word2Vec(self, s, k=1, mode='topk'):
    """
    Not often used now.
    Parameters
    ----------
    s : str
        the query word
    k : int or str
        if mode='similarity', it is a string;
        if mode='topk', it is an integer, defaulting to 1.
    mode : str
        'similarity' : calculate the similarity between s and k (note that k is a string).
        'topk' (default) : find the top k words most similar to s (note that k is an integer).
    Returns
    ----------
    float
        if mode='similarity', the similarity between s and k.
        if mode='return_topk', a list of (word, similarity) pairs rather than a number.
        if mode='topk', nothing is returned; the k most similar words are printed.
    """
    if self.model is None:
        raise Exception("no model")
    if mode == 'topk':
        y = self.model.wv.most_similar(s, topn=k)
        print('Words most similar to "%s":\n' % s)
        for item in y:
            print(item[0], item[1])
    elif mode == 'return_topk':
        return self.model.wv.most_similar(s, topn=k)
    elif mode == 'similarity':
        y = self.model.wv.similarity(s, k)
        # cosine similarity: unit-normalize the two vectors, then take their inner product
        print('Similarity between "%s" and "%s": %f%%' % (s, k, y * 100))
        return y
    elif mode == 'vector':
        print(self.model.wv[s])
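Continuing the hypothetical instance from Example 11:

nlp.show_Word2Vec('word2vec', k=5, mode='topk')                 # print the 5 nearest words
pairs = nlp.show_Word2Vec('word2vec', k=5, mode='return_topk')  # get them as (word, score) pairs
sim = nlp.show_Word2Vec('很', k='有用', mode='similarity')       # cosine similarity of two words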