本文整理匯總了Python中gensim.models方法的典型用法代碼示例。如果您正苦於以下問題:Python gensim.models方法的具體用法?Python gensim.models怎麼用?Python gensim.models使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類gensim的用法示例。
在下文中一共展示了gensim.models方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: create_metadata_file
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import models [as 別名]
def create_metadata_file(word2vec_file, output_file):
    """
    Create the metadata file based on the word2vec model (used for the Embedding Visualization later).

    One vocabulary word is written per line, in ascending order of the
    word's index in the model.

    Args:
        word2vec_file: The word2vec file
        output_file: The metadata file path
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")

    model = gensim.models.Word2Vec.load(word2vec_file)
    keyed_vectors = model.wv
    # gensim >= 4.0 exposes the word -> index mapping directly as
    # `key_to_index`; older releases keep the index on `vocab` entries.
    if hasattr(keyed_vectors, 'key_to_index'):
        word2idx = dict(keyed_vectors.key_to_index)
    else:
        word2idx = {k: v.index for k, v in keyed_vectors.vocab.items()}
    words_by_index = sorted(word2idx, key=word2idx.get)

    # Explicit UTF-8 so non-ASCII vocabulary does not depend on the
    # platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as fout:
        for word in words_by_index:
            if word is None:
                print("[Warning] Empty Line, should replaced by any thing else, or will cause a bug of tensorboard")
                fout.write('<Empty Line>' + '\n')
            else:
                fout.write(word + '\n')
示例2: load_poincare_model
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import models [as 別名]
def load_poincare_model(path, word2vec_format=True, binary=False):
    """ Load a pre-trained Poincare embedding model.

    When `word2vec_format` is truthy the file is read with
    `PoincareKeyedVectors.load_word2vec_format`; otherwise a full
    `PoincareModel` is loaded and its `kv` attribute is returned.

    :param path: path of the file of the pre-trained Poincare embedding model
    :param word2vec_format: whether to load from word2vec format (default: True)
    :param binary: binary format, only used for the word2vec format (default: False)
    :return: a pre-trained Poincare embedding model
    :type path: str
    :type word2vec_format: bool
    :type binary: bool
    :rtype: gensim.models.poincare.PoincareKeyedVectors
    """
    if not word2vec_format:
        return PoincareModel.load(path).kv
    return PoincareKeyedVectors.load_word2vec_format(path, binary=binary)
示例3: shorttext_to_avgvec
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import models [as 別名]
def shorttext_to_avgvec(shorttext, wvmodel):
    """ Convert the short text into an averaged embedded vector representation.

    Given a short sentence, it converts all the tokens into embedded vectors according to
    the given word-embedding model, sums them up, and normalizes the resulting vector.
    Tokens that are not in the model are skipped. If none of the tokens is in the
    model, a zero vector of length `wvmodel.vector_size` is returned.

    :param shorttext: a short sentence
    :param wvmodel: word-embedding model
    :return: an embedded vector that represents the short sentence
    :type shorttext: str
    :type wvmodel: gensim.models.keyedvectors.KeyedVectors
    :rtype: numpy.ndarray
    """
    token_vectors = [wvmodel[token] for token in tokenize(shorttext) if token in wvmodel]
    if not token_vectors:
        # np.sum([], axis=0) would yield the scalar 0.0 rather than a vector,
        # which breaks callers that expect an array of the embedding size.
        return np.zeros(wvmodel.vector_size)

    vec = np.sum(token_vectors, axis=0)
    # normalize (skipped for an all-zero sum to avoid dividing by zero)
    norm = np.linalg.norm(vec)
    if norm != 0:
        vec /= norm
    return vec
示例4: load_word2vec_matrix
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import models [as 別名]
def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        A tuple ``(vocab_size, embedding_size, embedding_matrix)`` where row
        ``i`` of ``embedding_matrix`` holds the vector of the word whose
        vocabulary index is ``i``.
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist. ")

    model = gensim.models.Word2Vec.load(word2vec_file)
    keyed_vectors = model.wv
    vocab_size = keyed_vectors.vectors.shape[0]
    embedding_size = model.vector_size
    # gensim >= 4.0 exposes the word -> index mapping directly as
    # `key_to_index`; older releases keep the index on `vocab` entries.
    if hasattr(keyed_vectors, 'key_to_index'):
        vocab = dict(keyed_vectors.key_to_index)
    else:
        vocab = {k: v.index for k, v in keyed_vectors.vocab.items()}

    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            # Look the vector up on `wv`: indexing the Word2Vec model itself
            # is deprecated in gensim 3.x and removed in 4.0.
            embedding_matrix[value] = keyed_vectors[key]
    return vocab_size, embedding_size, embedding_matrix
示例5: compute_epoch_accuracies
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import models [as 別名]
def compute_epoch_accuracies(root, prefix, analogy_file):
    """
    Evaluate every per-epoch Word2Vec checkpoint and save accuracies and losses.

    Checkpoints are the files matching ``<root>/<prefix>_epoch*.model``. The
    epoch number is parsed from the digits directly before ``.model`` and is
    used as a list index, so it must lie in ``0 .. number_of_files - 1``.

    For each analogy section a list of ``(correct, incorrect, accuracy)``
    tuples (one slot per epoch) is collected and saved with ``save_obj``
    under ``models/<prefix>_accuracies``. The per-epoch loss differences are
    saved under ``models/<prefix>_loss``.

    Args:
        root: Directory containing the checkpoint files
        prefix: File-name prefix of the checkpoints
        analogy_file: Analogy questions file passed to ``wv.accuracy``
    """
    filenames = glob.glob(os.path.join(root, prefix + "_epoch*.model"))
    nr_epochs = len(filenames)
    accuracies = dict()
    losses = [0] * nr_epochs
    for filename in filenames:
        # Raw string so `\d` and `\.` are regex escapes, not string escapes.
        epoch = int(re.search(r"\d+\.model", filename).group()[:-6])
        m = Word2Vec.load(filename)
        losses[epoch] = m.get_latest_training_loss()
        sections = m.wv.accuracy(analogy_file)
        for sec in sections:
            if sec["section"] not in accuracies:
                accuracies[sec["section"]] = [0] * nr_epochs
            correct, incorrect = len(sec["correct"]), len(sec["incorrect"])
            # Guard on the total, not on `incorrect`: a section answered
            # entirely correctly must score 1.0 rather than 0.
            total = correct + incorrect
            accuracy = correct / total if total > 0 else 0
            accuracies[sec["section"]][epoch] = (correct, incorrect, accuracy)
    save_obj(accuracies, os.path.join("models", prefix + "_accuracies"))
    # The stored losses are differenced here, keeping the first value as-is.
    save_obj(np.concatenate([np.array([losses[0]]), np.diff(losses)]), os.path.join("models", prefix + "_loss"))
示例6: train_artistsong2vec_model
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import models [as 別名]
def train_artistsong2vec_model(fout_path, input_datas=None, data_path=None,
                               min_count=5, sorted_vocab=1, window=10,
                               size=250,
                               iter_n=50):
    """
    Train a Word2Vec model on flattened pair sequences and pickle it.

    Each element of ``input_datas`` is a sequence of items; the first two
    entries of every item are appended, in order, to one training sentence.

    Args:
        fout_path: Path the trained model is pickled to
        input_datas: Training sequences; if falsy, loaded from ``data_path``
        data_path: Path of a pickle file holding the training sequences
        min_count: Passed to ``Word2Vec`` as ``min_count``
        sorted_vocab: Passed to ``Word2Vec`` as ``sorted_vocab``
        window: Passed to ``Word2Vec`` as ``window``
        size: Passed to ``Word2Vec`` as ``size``
        iter_n: Passed to ``Word2Vec`` as ``iter``
    """
    if not input_datas and data_path:
        # NOTE: pickle can execute arbitrary code on load; only use trusted files.
        # The context manager closes the handle that was previously leaked.
        with open(data_path, 'rb') as fin:
            input_datas = pickle.load(fin)

    full_data = []
    for record in input_datas:
        sentence = []
        for pair in record:
            sentence.extend((pair[0], pair[1]))
        full_data.append(sentence)

    data_process_logger.info('start training')
    # NOTE(review): gensim >= 4.0 renamed `size`/`iter` to `vector_size`/`epochs`;
    # confirm the gensim version this is expected to run against.
    wv_model = gensim.models.Word2Vec(full_data, min_count=min_count, sorted_vocab=sorted_vocab, window=window,
                                      size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        # Function-call form prints the same text on Python 2 and Python 3.
        print('model saved')
示例7: get_word
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import models [as 別名]
def get_word(word):
    """
    Decide whether an underscore-separated title should be kept.

    - Without a trailing-style ``_(qualifier)`` part, the word is accepted
      when it has fewer than five underscore-separated pieces.
    - With a qualifier other than ``disambiguation``, the qualifier is
      stripped and the remainder is accepted when it has fewer than five
      space-separated pieces.
    - Everything else is rejected.

    Args:
        word: The title to check
    Returns:
        A tuple ``(accepted, word)`` with the unchanged input word.
    """
    inst = re.search(r"_\(([A-Za-z0-9_]+)\)", word)
    if inst is None:
        if len(word.split("_")) < 5:
            return True, word
    elif inst.group(1) != "disambiguation":
        word2 = re.sub(r'_\(.+\)', '', word)
        # NOTE(review): this splits on a space while the branch above splits
        # on "_"; kept as-is to preserve behavior -- confirm the intent.
        if len(word2.split(" ")) < 5:
            return True, word
    return False, word
# Load the trained doc2vec and word2vec models.
示例8: online_lda
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import models [as 別名]
def online_lda(corpus, dictionary, k=25, alpha="symmetric", chunk_size=10000, update_every=1, passes=1):
    """
    Build the standard online LDA topic model (see gensim:
    http://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation)

    The corpus is split into chunks of 'chunk_size' documents, the model is
    updated every 'update_every' chunks, and 'passes' full passes are made
    over the corpus.

    EG: chunk_size=100, update_every=1, passes=1 on 500 documents => 5 chunks,
    one pass, with a model update after every chunk.

    Alpha values can be "symmetric", "asymmetric", and "auto". See:
    http://radimrehurek.com/gensim/models/ldamodel.html
    """
    lda_options = dict(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        alpha=alpha,
        chunksize=chunk_size,
        update_every=update_every,
        passes=passes,
    )
    return models.ldamodel.LdaModel(**lda_options)
示例9: batch_lda
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import models [as 別名]
def batch_lda(corpus, dictionary, k=25, alpha="symmetric", passes=20):
    """
    Build basic batch LDA topic model (see gensim:
    http://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation)

    Makes 'passes' passes over the whole corpus with `update_every=0`, i.e.
    no chunked updates; the model is updated at the end of every full pass.

    Alpha values can be "symmetric", "asymmetric", and "auto". See:
    http://radimrehurek.com/gensim/models/ldamodel.html
    """
    lda_options = dict(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        alpha=alpha,
        update_every=0,
        passes=passes,
    )
    return models.ldamodel.LdaModel(**lda_options)
示例10: predictData
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import models [as 別名]
def predictData():
    """
    Use the model to predict on real data.

    A fixed list of sample sentences is vectorized with `predict_wordtoVect`,
    classified with the model returned by `get_model`, and each sentence is
    printed next to its predicted label word.
    """
    input_texts = ["很好很滿意","不好不滿意","質量有問題","商家態度很差","售後很渣,渣渣"]
    # Maps the rounded class value to the word printed for it.
    label2word = {1: '正麵', 0: '負麵'}

    texts = predict_wordtoVect(input_texts)
    model = get_model()

    # Predict
    pred_result = model.predict_classes(texts)
    print(pred_result)

    labels = [int(round(row[0])) for row in pred_result]
    for position, label in enumerate(labels):
        print('{0} -------- {1}'.format(label2word[label], input_texts[position]))
示例11: predict_phrase
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import models [as 別名]
def predict_phrase(phrase):
    """
    Classify a phrase with the module-level `model` and return 'R' or 'E'.

    Each character is mapped to its value in the dictionary stored at
    'EARL/models/char_dict.npy', the sequence is left-padded with zeros to
    270 entries, and the class with the highest score decides the result.

    Args:
        phrase: The phrase to classify; every character must be a key of the
            character dictionary and the phrase must not exceed 270 characters
    Returns:
        'R' if class 0 has the highest score, otherwise 'E'
        (NOTE(review): presumably relation vs. entity -- confirm).
    """
    max_len = 270
    phrase_clean = phrase
    # The .npy file stores a pickled dict; NumPy >= 1.16.3 refuses to load
    # object arrays unless allow_pickle=True is passed explicitly.
    char_dict = np.load('EARL/models/char_dict.npy', allow_pickle=True).item()
    encoded = [char_dict[char] for char in phrase_clean]
    # Left-pad with zeros so the input is always of shape (1, max_len).
    padded = np.concatenate((np.zeros(max_len - len(encoded)), encoded)).reshape(1, max_len)
    prediction = model.predict(padded)
    # Function-call form prints the same value on Python 2 and Python 3.
    print(prediction[0])
    pred = np.argmax(prediction[0])
    return 'R' if pred == 0 else 'E'
示例12: _expand_from
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import models [as 別名]
def _expand_from(self, corpus, prefix=None, labels=None):
    """
    Pass through the dataset once to add the new labels to the model.

    One label is added per document/sentence in `corpus`; no new vocabulary
    words are added. `self.syn0` grows by one randomly initialized row per
    label, and `self.vocab` / `self.index2word` get one new entry each.
    The `labels` argument is not used.

    Returns the length of `self.index2word` before the new labels were added.
    """
    prefix = 'SENT' if prefix is None else prefix
    num_lines = sum(1 for _ in corpus)

    # Expand syn0: random values in [-0.5, 0.5) scaled by layer1_size, with
    # the existing rows copied back over the top of the new array.
    old_rows = self.syn0.shape[0]
    grown = (np.random.random((old_rows + num_lines, self.syn0.shape[1])).astype(self.syn0.dtype) - 0.5)
    grown /= self.layer1_size
    grown[:old_rows] = self.syn0
    self.syn0 = grown

    index2word_start = len(self.index2word)
    for line_no in range(num_lines):
        # Expand vocab
        entry = gensim.models.doc2vec.Vocab()
        entry.index = len(self.index2word)
        entry.sample_probability = 1.0
        # We insert each sentence at the root of the
        # Huffman tree. It's a hack.
        entry.code = [1] * int(math.log(line_no + 1, 2) + 1)
        label = Document2Vec._make_label(prefix, str(line_no))
        self.vocab[label] = entry
        # Expand index2word
        self.index2word.append(label)
        assert len(self.index2word) == entry.index + 1
    return index2word_start
示例13: load_word2vec_model
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import models [as 別名]
def load_word2vec_model(path, binary=True):
    """ Load pre-trained word vectors stored in the word2vec format.

    Thin wrapper around `KeyedVectors.load_word2vec_format`.

    :param path: path of the file of the pre-trained Word2Vec model
    :param binary: whether the file is in binary format (Default: True)
    :return: a pre-trained Word2Vec model
    :type path: str
    :type binary: bool
    :rtype: gensim.models.keyedvectors.KeyedVectors
    """
    loader = KeyedVectors.load_word2vec_format
    return loader(path, binary=binary)
示例14: load_fasttext_model
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import models [as 別名]
def load_fasttext_model(path, encoding='utf-8'):
    """ Load a pre-trained FastText model.

    Thin wrapper around `gensim.models.fasttext.load_facebook_vectors`.

    :param path: path of the file of the pre-trained FastText model
    :param encoding: encoding forwarded to the loader (Default: 'utf-8')
    :return: a pre-trained FastText model
    :type path: str
    :type encoding: str
    :rtype: gensim.models.keyedvectors.FastTextKeyedVectors
    """
    fasttext_module = gensim.models.fasttext
    return fasttext_module.load_facebook_vectors(path, encoding=encoding)
示例15: save
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import models [as 別名]
def save(self, fname_or_handle, **kwargs):
    """
    Refuse to save: this operation is not supported and always raises.

    :param fname_or_handle: ignored
    :param kwargs: ignored
    :return: never returns
    :raises IOError: always
    """
    message = 'The class RESTfulKeyedVectors do not persist models to a file.'
    raise IOError(message)