本文整理匯總了Python中gensim.corpora方法的典型用法代碼示例。如果您正苦於以下問題:Python gensim.corpora方法的具體用法?Python gensim.corpora怎麼用?Python gensim.corpora使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類gensim的用法示例。
在下文中一共展示了gensim.corpora方法的3個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: build
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import corpora [as 別名]
def build(self):
    """Build the review dictionary, prune it, save it, and return it.

    Rewinds ``self.cursor``, feeds the ``"words"`` entry of every review
    the cursor yields into a ``corpora.Dictionary``, prunes it with
    ``filter_extremes(keep_n=10000)``, compacts the ids, and writes the
    result to ``self.dictionary_path``.

    Returns:
        The pruned, compacted dictionary that was saved.
    """
    # Start from the first record in case the cursor was already consumed.
    self.cursor.rewind()

    token_lists = (entry["words"] for entry in self.cursor)
    vocab = corpora.Dictionary(token_lists)

    vocab.filter_extremes(keep_n=10000)
    vocab.compactify()

    vocab.save(self.dictionary_path)
    return vocab
示例2: run
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import corpora [as 別名]
def run(lda_model_path, corpus_path, num_topics, id2word):
    """Train an LDA model on the corpus at ``corpus_path`` and persist it.

    Args:
        lda_model_path: Destination passed to the model's ``save``.
        corpus_path: Location handed to ``corpora.BleiCorpus``.
        num_topics: Forwarded to ``LdaModel`` as ``num_topics``.
        id2word: Forwarded to ``LdaModel`` as ``id2word``.

    Returns:
        The trained ``gensim.models.LdaModel`` instance.
    """
    model = gensim.models.LdaModel(
        corpora.BleiCorpus(corpus_path),
        num_topics=num_topics,
        id2word=id2word,
    )
    model.save(lda_model_path)
    return model
示例3: wmdistance
# 需要導入模塊: import gensim [as 別名]
# 或者: from gensim import corpora [as 別名]
def wmdistance(self, document1, document2, all_distances, distance_metric="cosine"):
    """Compute an ``emd``-based distance between two tokenised documents.

    A joint dictionary is built from both documents, a token-to-token
    distance matrix is filled in for every (token of ``document1``, token of
    ``document2``) pair, and the two documents' normalised bag-of-words
    vectors are passed to ``emd`` together with that matrix.

    Args:
        document1: Sequence of tokens for the first document.
        document2: Sequence of tokens for the second document.
        all_distances: Two-dimensional lookup of precomputed distances; only
            read when ``distance_metric`` is ``"cosine"``.
        distance_metric: ``"cosine"`` (read from ``all_distances``) or
            ``"euclidean"`` (computed from ``self.w2vmodel.wv`` vectors).
            Any other value leaves the distance matrix at zero, so the
            function reports an all-zero matrix and returns ``inf``.

    Returns:
        ``float("inf")`` if either document is empty or the distance matrix
        sums to zero; otherwise whatever ``emd`` returns.
    """
    model = self.w2vmodel

    if len(document1) == 0 or len(document2) == 0:
        print(
            "At least one of the documents had no words that were in the vocabulary. Aborting (returning inf)."
        )
        return float("inf")

    dictionary = gensim.corpora.Dictionary(documents=[document1, document2])
    vocab_len = len(dictionary)

    # Sets for faster look-up.
    docset1 = set(document1)
    docset2 = set(document2)

    # Only cells pairing a token of document1 (row) with a token of
    # document2 (column) are filled; everything else stays 0.
    distance_matrix = np.zeros((vocab_len, vocab_len), dtype=np.double)
    for i, t1 in dictionary.items():
        # Row-level check hoisted out of the inner loop: it does not depend
        # on t2, so a non-matching row can be skipped in one step.
        if t1 not in docset1:
            continue
        for j, t2 in dictionary.items():
            if t2 not in docset2:
                continue
            if distance_metric == "euclidean":
                distance_matrix[i, j] = np.sqrt(
                    np.sum((model.wv[t1] - model.wv[t2]) ** 2)
                )
            elif distance_metric == "cosine":
                # NOTE(review): the column index here is the local dictionary
                # id ``i`` of t1, not a model vocabulary index -- confirm
                # this matches how the caller lays out ``all_distances``.
                distance_matrix[i, j] = all_distances[model.wv.vocab[t2].index, i]

    if np.sum(distance_matrix) == 0.0:
        print("The distance matrix is all zeros. Aborting (returning inf).")
        return float("inf")

    def nbow(document):
        """Return the length-normalised bag-of-words vector of ``document``."""
        weights = np.zeros(vocab_len, dtype=np.double)
        doc_len = len(document)
        for idx, freq in dictionary.doc2bow(document):
            weights[idx] = freq / float(doc_len)
        return weights

    d1 = nbow(document1)
    d2 = nbow(document2)
    return emd(d1, d2, distance_matrix)