本文整理汇总了 Python 中 gensim.corpora 模块的典型用法代码示例。如果您正苦于以下问题：Python gensim.corpora 的具体用法？Python gensim.corpora 怎么用？Python gensim.corpora 使用的例子？那么，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该模块所在包 gensim 的用法示例。
在下文中一共展示了 gensim.corpora 模块的 3 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: build
# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import corpora [as 别名]
def build(self):
    """Build, prune, and persist a gensim dictionary from ``self.cursor``.

    The cursor is rewound, the ``"words"`` entry of every record it yields
    is fed to ``corpora.Dictionary``, the vocabulary is pruned with
    ``filter_extremes(keep_n=10000)`` (all other thresholds stay at the
    library defaults), ids are compacted, and the result is saved to
    ``self.dictionary_path``.

    Returns:
        The dictionary that was built and saved.
    """
    # Reset the cursor before iterating over it.
    self.cursor.rewind()

    # Lazily stream each record's token list into the dictionary.
    token_lists = (entry["words"] for entry in self.cursor)
    vocab = corpora.Dictionary(token_lists)

    vocab.filter_extremes(keep_n=10000)
    vocab.compactify()
    vocab.save(self.dictionary_path)
    return vocab
示例2: run
# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import corpora [as 别名]
def run(lda_model_path, corpus_path, num_topics, id2word):
    """Train an LDA model on a Blei-format corpus and save it to disk.

    Args:
        lda_model_path: Destination passed to the model's ``save`` method.
        corpus_path: Location handed to ``corpora.BleiCorpus``.
        num_topics: Forwarded to ``LdaModel`` as ``num_topics``.
        id2word: Forwarded to ``LdaModel`` as ``id2word``.

    Returns:
        The trained ``gensim.models.LdaModel`` instance.
    """
    blei_corpus = corpora.BleiCorpus(corpus_path)
    topic_model = gensim.models.LdaModel(
        blei_corpus,
        num_topics=num_topics,
        id2word=id2word,
    )
    topic_model.save(lda_model_path)
    return topic_model
示例3: wmdistance
# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import corpora [as 别名]
def wmdistance(self, document1, document2, all_distances, distance_metric="cosine"):
    """Return the ``emd`` score between two tokenised documents.

    A dictionary is built over both documents, a token-to-token cost matrix
    is filled in for pairs (token of ``document1``, token of ``document2``),
    and each document is turned into a normalised bag-of-words vector. The
    two vectors and the cost matrix are then passed to ``emd``.

    Args:
        document1: Sequence of tokens for the first document.
        document2: Sequence of tokens for the second document.
        all_distances: Precomputed lookup used by the ``"cosine"`` metric,
            read as ``all_distances[model.wv.vocab[t2].index, i]``.
        distance_metric: ``"euclidean"`` or ``"cosine"``. Any other value
            leaves the cost matrix at zero.

    Returns:
        ``float("inf")`` when either document is empty or the cost matrix
        sums to zero; otherwise whatever ``emd`` returns.
    """
    w2v = self.w2vmodel

    if len(document1) == 0 or len(document2) == 0:
        print(
            "At least one of the documents had no words that were in the vocabulary. Aborting (returning inf)."
        )
        return float("inf")

    dictionary = gensim.corpora.Dictionary(documents=[document1, document2])
    vocab_len = len(dictionary)

    # Sets give constant-time membership tests while selecting rows/columns.
    words1 = set(document1)
    words2 = set(document2)

    # Only (id, token) pairs whose row token is in document1 and whose
    # column token is in document2 receive a cost; pre-filter both axes once.
    id_token_pairs = list(dictionary.items())
    row_entries = [(i, tok) for i, tok in id_token_pairs if tok in words1]
    col_entries = [(j, tok) for j, tok in id_token_pairs if tok in words2]

    distance_matrix = np.zeros((vocab_len, vocab_len), dtype=np.double)
    for i, t1 in row_entries:
        for j, t2 in col_entries:
            if distance_metric == "euclidean":
                delta = w2v.wv[t1] - w2v.wv[t2]
                distance_matrix[i, j] = np.sqrt(np.sum(delta ** 2))
            elif distance_metric == "cosine":
                # NOTE(review): the second index is the local dictionary id
                # of t1, not a model vocabulary index -- confirm this matches
                # how all_distances is laid out by the caller.
                distance_matrix[i, j] = all_distances[w2v.wv.vocab[t2].index, i]

    if np.sum(distance_matrix) == 0.0:
        print("The distance matrix is all zeros. Aborting (returning inf).")
        return float("inf")

    def _normalized_bow(document):
        # Dense vector of term frequencies divided by the document length.
        weights = np.zeros(vocab_len, dtype=np.double)
        doc_len = len(document)
        for token_id, count in dictionary.doc2bow(document):
            weights[token_id] = count / float(doc_len)
        return weights

    d1 = _normalized_bow(document1)
    d2 = _normalized_bow(document2)
    return emd(d1, d2, distance_matrix)