This article collects typical code examples of using gensim.models in Python. If you are wondering what gensim.models is for, how to use it, or what it looks like in practice, the curated examples below may help. You can also explore the gensim package further for related usage.
The following 15 code examples of gensim.models are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
Example 1: create_metadata_file
# Required module: import gensim [as alias]
# Or: from gensim import models [as alias]
def create_metadata_file(word2vec_file, output_file):
"""
    Create the metadata file based on the corpus file (used later for embedding visualization).
Args:
word2vec_file: The word2vec file
output_file: The metadata file path
Raises:
IOError: If word2vec model file doesn't exist
"""
if not os.path.isfile(word2vec_file):
raise IOError("[Error] The word2vec file doesn't exist.")
model = gensim.models.Word2Vec.load(word2vec_file)
word2idx = dict([(k, v.index) for k, v in model.wv.vocab.items()])
word2idx_sorted = [(k, word2idx[k]) for k in sorted(word2idx, key=word2idx.get, reverse=False)]
with open(output_file, 'w+') as fout:
for word in word2idx_sorted:
if word[0] is None:
print("[Warning] Empty Line, should replaced by any thing else, or will cause a bug of tensorboard")
fout.write('<Empty Line>' + '\n')
else:
fout.write(word[0] + '\n')
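A minimal usage sketch (the file names below are placeholders, assuming a Word2Vec model previously saved with model.save()):

create_metadata_file('word2vec.model', 'metadata.tsv')
# metadata.tsv now lists one vocabulary word per line, ordered by embedding
# index, ready to serve as the label file for a TensorBoard embedding projector.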
Example 2: load_poincare_model
# Required module: import gensim [as alias]
# Or: from gensim import models [as alias]
def load_poincare_model(path, word2vec_format=True, binary=False):
""" Load a Poincare embedding model.
:param path: path of the file of the pre-trained Poincare embedding model
:param word2vec_format: whether to load from word2vec format (default: True)
    :param binary: whether the file is in binary format (default: False)
:return: a pre-trained Poincare embedding model
:type path: str
:type word2vec_format: bool
:type binary: bool
:rtype: gensim.models.poincare.PoincareKeyedVectors
"""
if word2vec_format:
return PoincareKeyedVectors.load_word2vec_format(path, binary=binary)
else:
return PoincareModel.load(path).kv
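A hedged usage sketch; the file name and the WordNet-style key are placeholders for whatever Poincare embedding you have on disk:

pkv = load_poincare_model('poincare_wordnet.txt', word2vec_format=True, binary=False)
print(pkv.most_similar('dog.n.01', topn=5))  # nearest neighbours under the Poincare distance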
Example 3: shorttext_to_avgvec
# Required module: import gensim [as alias]
# Or: from gensim import models [as alias]
def shorttext_to_avgvec(shorttext, wvmodel):
""" Convert the short text into an averaged embedded vector representation.
    Given a short sentence, it converts all the tokens into embedded vectors according to
    the given word-embedding model, sums them up, and normalizes the resulting vector.
    The returned vector represents the whole short sentence.
:param shorttext: a short sentence
:param wvmodel: word-embedding model
:return: an embedded vector that represents the short sentence
:type shorttext: str
:type wvmodel: gensim.models.keyedvectors.KeyedVectors
:rtype: numpy.ndarray
"""
vec = np.sum([wvmodel[token] for token in tokenize(shorttext) if token in wvmodel], axis=0)
# normalize
norm = np.linalg.norm(vec)
if norm != 0:
vec /= norm
return vec
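Because the returned vectors are unit-normalized, the dot product of two of them is directly their cosine similarity. A usage sketch, assuming wvmodel is a loaded KeyedVectors instance and numpy is imported as np, as in the function above:

v1 = shorttext_to_avgvec('the cat sat', wvmodel)
v2 = shorttext_to_avgvec('a cat was sitting', wvmodel)
print(np.dot(v1, v2))  # cosine similarity of the two short texts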
Example 4: load_word2vec_matrix
# Required module: import gensim [as alias]
# Or: from gensim import models [as alias]
def load_word2vec_matrix(word2vec_file):
"""
Return the word2vec model matrix.
Args:
word2vec_file: The word2vec file
Returns:
The word2vec model matrix
Raises:
IOError: If word2vec model file doesn't exist
"""
if not os.path.isfile(word2vec_file):
raise IOError("[Error] The word2vec file doesn't exist. ")
model = gensim.models.Word2Vec.load(word2vec_file)
vocab_size = model.wv.vectors.shape[0]
embedding_size = model.vector_size
vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
embedding_matrix = np.zeros([vocab_size, embedding_size])
for key, value in vocab.items():
if key is not None:
            embedding_matrix[value] = model.wv[key]  # index through wv; indexing the model directly is deprecated in gensim
return vocab_size, embedding_size, embedding_matrix
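One common consumer of the returned matrix is a frozen Keras embedding layer; a sketch, assuming TensorFlow/Keras is available and 'word2vec.model' is a placeholder path:

from tensorflow.keras.layers import Embedding

vocab_size, embedding_size, embedding_matrix = load_word2vec_matrix('word2vec.model')
embedding_layer = Embedding(input_dim=vocab_size,
                            output_dim=embedding_size,
                            weights=[embedding_matrix],
                            trainable=False)  # keep the pre-trained vectors fixed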
Example 5: compute_epoch_accuracies
# Required module: import gensim [as alias]
# Or: from gensim import models [as alias]
def compute_epoch_accuracies(root, prefix, analogy_file):
filenames = glob.glob(os.path.join(root, prefix+"_epoch*.model"))
nr_epochs = len(filenames)
accuracies = dict()
losses = [0] * nr_epochs
for filename in filenames:
        epoch = int(re.search(r"\d+\.model", filename).group()[:-6])
m = Word2Vec.load(filename)
losses[epoch] = m.get_latest_training_loss()
        sections = m.wv.accuracy(analogy_file)  # deprecated in gensim >= 3.6 in favour of wv.evaluate_word_analogies
for sec in sections:
if sec["section"] not in accuracies:
accuracies[sec["section"]] = [0] * nr_epochs
correct, incorrect = len(sec["correct"]), len(sec["incorrect"])
if incorrect > 0:
accuracy = correct / (correct + incorrect)
else:
accuracy = 0
accuracies[sec["section"]][epoch] = (correct, incorrect, accuracy)
save_obj(accuracies, os.path.join("models", prefix + "_accuracies"))
save_obj(np.concatenate([np.array([losses[0]]), np.diff(losses)]), os.path.join("models", prefix + "_loss"))
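The per-epoch .model files this function globs for could be produced with a gensim training callback; a sketch (EpochSaver is a hypothetical name here, and compute_loss=True is required for get_latest_training_loss to report anything):

from gensim.models.callbacks import CallbackAny2Vec

class EpochSaver(CallbackAny2Vec):
    """Save a snapshot after every epoch so the glob above finds prefix_epochN.model files."""
    def __init__(self, root, prefix):
        self.root, self.prefix, self.epoch = root, prefix, 0

    def on_epoch_end(self, model):
        model.save(os.path.join(self.root, '%s_epoch%d.model' % (self.prefix, self.epoch)))
        self.epoch += 1

# model = Word2Vec(sentences, compute_loss=True, callbacks=[EpochSaver('models', 'w2v')])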
Example 6: train_artistsong2vec_model
# Required module: import gensim [as alias]
# Or: from gensim import models [as alias]
def train_artistsong2vec_model(fout_path, input_datas=None, data_path=None,
min_count=5, sorted_vocab=1, window=10,
size=250,
iter_n=50):
if not input_datas and data_path:
input_datas = pickle.load(open(data_path, 'rb'))
full_data = []
for i in input_datas:
tmp = []
for j in i:
tmp.append(j[0])
tmp.append(j[1])
full_data.append(tmp)
data_process_logger.info('start training')
wv_model = gensim.models.Word2Vec(full_data, min_count=min_count, sorted_vocab=sorted_vocab, window=window,
size=size, iter=iter_n)
with open(fout_path, 'wb') as fout:
data_process_logger.info('start saving model')
pickle.dump(wv_model, fout)
    print('model saved')
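A usage sketch with toy data (min_count=1 so the tiny vocabulary survives pruning; the playlist structure matches the (artist, song) pairs unpacked above):

playlists = [[('artist_a', 'song_1'), ('artist_b', 'song_2')]]
train_artistsong2vec_model('artistsong2vec.pkl', input_datas=playlists, min_count=1)
with open('artistsong2vec.pkl', 'rb') as fin:
    wv_model = pickle.load(fin)
print(wv_model.wv.most_similar('artist_a'))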
Example 7: get_word
# Required module: import gensim [as alias]
# Or: from gensim import models [as alias]
def get_word(word):
inst = re.search(r"_\(([A-Za-z0-9_]+)\)", word)
    if inst is None:
length = len(word.split("_"))
if length < 5:
return True, word
else:
if inst.group(1) != "disambiguation":
            word2 = re.sub(r'_\(.+\)', '', word)
            if len(word2.split(" ")) < 5:
                return True, word
    return False, word
# Load the trained doc2vec and word2vec models.
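A quick demonstration on typical Wikipedia-style titles (hedged guesses at the intended inputs; note the original word is returned unchanged in every case):

print(get_word('Python_(programming_language)'))  # (True, 'Python_(programming_language)'): qualifier is not 'disambiguation', stripped title is short
print(get_word('Foo_(disambiguation)'))           # (False, 'Foo_(disambiguation)'): disambiguation pages are rejected
print(get_word('New_York_City'))                  # (True, 'New_York_City'): no qualifier, fewer than 5 underscore-separated tokens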
Example 8: online_lda
# Required module: import gensim [as alias]
# Or: from gensim import models [as alias]
def online_lda(corpus, dictionary, k=25, alpha="symmetric", chunk_size=10000, update_every=1, passes=1):
"""
Build the standard online LDA topic model (see gensim:
http://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation)
Updates model every 'update_every' chunks, does 'passes' full passes over the corpus (updating
every 'update_every' time each pass), and breaks corpus into 'chunk_size' document chunks.
    E.g. chunk_size=100, update_every=1, passes=1 does one full pass over the corpus, updating the
    model after every chunk and breaking the whole corpus into corpus_size/chunk_size chunks
    (500 documents => 5 chunks, so the model updates on every chunk).
Alpha values can be "symmetric", "asymmetric", and "auto". See:
http://radimrehurek.com/gensim/models/ldamodel.html
"""
return models.ldamodel.LdaModel(corpus=corpus,
id2word=dictionary,
num_topics=k,
alpha=alpha,
chunksize=chunk_size,
update_every=update_every,
passes=passes)
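A usage sketch with a toy corpus (the helper APIs come from gensim.corpora; the texts themselves are arbitrary):

from gensim import corpora

texts = [['human', 'computer', 'interaction'],
         ['graph', 'trees', 'network'],
         ['graph', 'minors', 'survey']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = online_lda(corpus, dictionary, k=2)
print(lda.print_topics())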
Example 9: batch_lda
# Required module: import gensim [as alias]
# Or: from gensim import models [as alias]
def batch_lda(corpus, dictionary, k=25, alpha="symmetric", passes=20):
"""
Build basic batch LDA topic model (see gensim:
http://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation)
Does 'passes' number of passes over the whole corpus, no chunking, and updates the model
at the end of every full pass.
Alpha values can be "symmetric", "asymmetric", and "auto". See:
http://radimrehurek.com/gensim/models/ldamodel.html
"""
return models.ldamodel.LdaModel(corpus=corpus,
id2word=dictionary,
num_topics=k,
alpha=alpha,
update_every=0,
passes=passes)
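Reusing the toy corpus from the previous sketch, the batch variant differs only in update_every=0, which makes gensim defer all updates to the end of each full pass:

lda = batch_lda(corpus, dictionary, k=2, passes=10)
print(lda.print_topics())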
Example 10: predictData
# Required module: import gensim [as alias]
# Or: from gensim import models [as alias]
def predictData():
"""
使用模型预测真实数据
"""
input_texts = ["很好很满意","不好不满意","质量有问题","商家态度很差","售后很渣,渣渣"]
# word_model = word2vec.Word2Vec.load('./models/Word2vec_model.model')
# w2indx, w2vec, texts = create_dictionaries(word_model, texts)
# print(texts)
texts = predict_wordtoVect(input_texts)
model = get_model()
    # Predict
    pred_result = model.predict_classes(texts)  # note: predict_classes was removed in TF >= 2.6
print(pred_result)
labels = [int(round(x[0])) for x in pred_result]
    label2word = {1: '正面', 0: '负面'}  # 1: positive, 0: negative
for i in range(len(pred_result)):
print('{0} -------- {1}'.format(label2word[labels[i]], input_texts[i]))
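predict_wordtoVect and get_model are defined elsewhere in the original project. A plausible sketch of the preprocessing half, assuming jieba for tokenization, the Word2Vec path from the commented-out line above, and Keras pad_sequences (every name here is a hypothetical reconstruction):

import jieba
from gensim.models import word2vec
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict_wordtoVect(input_texts, maxlen=100):
    wv_model = word2vec.Word2Vec.load('./models/Word2vec_model.model')
    w2indx = {w: i + 1 for i, w in enumerate(wv_model.wv.index2word)}  # 0 is reserved for padding/unknown
    seqs = [[w2indx.get(tok, 0) for tok in jieba.lcut(text)] for text in input_texts]
    return pad_sequences(seqs, maxlen=maxlen)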
Example 11: predict_phrase
# Required module: import gensim [as alias]
# Or: from gensim import models [as alias]
def predict_phrase(phrase):
#load the model
#preprocess the phrase
#phrase_clean = clean_str(phrase)
phrase_clean = phrase
#load the dictionary
    char_dict = np.load('EARL/models/char_dict.npy', allow_pickle=True).item()  # allow_pickle is required in NumPy >= 1.16.4
#phrase_clean = [char for char in phrase_clean]
#print phrase_clean
phrase_clean = [char_dict[char] for char in phrase_clean]
#print phrase_clean
#print np.concatenate((np.zeros(max_len-len(phrase_clean)), phrase_clean) )
prediction = model.predict(np.concatenate((np.zeros((270-len(phrase_clean))), phrase_clean)).reshape(1,270))
    print(prediction[0])
pred = np.argmax(prediction[0])
return 'R' if pred == 0 else 'E'
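The char_dict loaded above could have been built once from the training corpus and saved with NumPy; a hedged sketch (training_phrases is a hypothetical list of strings):

char_dict = {char: idx + 1 for idx, char in enumerate(sorted(set(''.join(training_phrases))))}
np.save('EARL/models/char_dict.npy', char_dict)  # np.save pickles the dict inside a 0-d object array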
Example 12: _expand_from
# Required module: import gensim [as alias]
# Or: from gensim import models [as alias]
def _expand_from(self, corpus, prefix=None, labels=None):
"""
        Pass through the dataset once to add the new labels to the model.
        These labels stand in for each document/sentence; they do not
        add new vocabulary.
"""
if prefix is None:
prefix = 'SENT'
num_lines = sum(1 for _ in corpus)
# Expand syn0
shape = (self.syn0.shape[0] + num_lines, self.syn0.shape[1])
syn0 = (np.random.random(shape).astype(self.syn0.dtype) - 0.5)
syn0 /= self.layer1_size
syn0[:self.syn0.shape[0]] = self.syn0
self.syn0 = syn0
index2word_start = len(self.index2word)
for j, line_no in enumerate(range(num_lines)):
# Expand vocab
newvocab = gensim.models.doc2vec.Vocab()
newvocab.index = len(self.index2word)
newvocab.sample_probability = 1.0
# We insert each sentence at the root of the
# Huffman tree. It's a hack.
newvocab.code = [1, ] * int(math.log(line_no + 1, 2) + 1)
label = Document2Vec._make_label(prefix, str(j))
self.vocab[label] = newvocab
# Expand index2word
self.index2word.append(label)
assert len(self.index2word) == newvocab.index + 1
return index2word_start
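_make_label is not shown in this excerpt; in this pattern it is usually just the prefix joined to the running index. A hypothetical reconstruction, as it would appear inside the Document2Vec class:

@staticmethod
def _make_label(prefix, suffix):
    return '%s_%s' % (prefix, suffix)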
Example 13: load_word2vec_model
# Required module: import gensim [as alias]
# Or: from gensim import models [as alias]
def load_word2vec_model(path, binary=True):
""" Load a pre-trained Word2Vec model.
:param path: path of the file of the pre-trained Word2Vec model
:param binary: whether the file is in binary format (Default: True)
:return: a pre-trained Word2Vec model
:type path: str
:type binary: bool
:rtype: gensim.models.keyedvectors.KeyedVectors
"""
return KeyedVectors.load_word2vec_format(path, binary=binary)
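Usage sketch ('GoogleNews-vectors-negative300.bin' is the classic pre-trained binary; substitute any word2vec-format file):

wv = load_word2vec_model('GoogleNews-vectors-negative300.bin', binary=True)
print(wv.most_similar('king', topn=3))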
Example 14: load_fasttext_model
# Required module: import gensim [as alias]
# Or: from gensim import models [as alias]
def load_fasttext_model(path, encoding='utf-8'):
""" Load a pre-trained FastText model.
    :param path: path of the file of the pre-trained FastText model
    :param encoding: encoding of the model file (default: 'utf-8')
    :return: a pre-trained FastText model
    :type path: str
    :type encoding: str
    :rtype: gensim.models.keyedvectors.FastTextKeyedVectors
"""
return gensim.models.fasttext.load_facebook_vectors(path, encoding=encoding)
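Usage sketch ('cc.en.300.bin' is one of the Facebook-published native FastText binaries; any such file works):

ft = load_fasttext_model('cc.en.300.bin')
print(ft['unseenwordxyz'][:5])  # FastText composes vectors for out-of-vocabulary words from character n-grams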
Example 15: save
# Required module: import gensim [as alias]
# Or: from gensim import models [as alias]
def save(self, fname_or_handle, **kwargs):
"""
:param fname_or_handle:
:param kwargs:
:return:
"""
raise IOError('The class RESTfulKeyedVectors do not persist models to a file.')