本文整理汇总了Python中util.Util.tfidf方法的典型用法代码示例。如果您正苦于以下问题:Python Util.tfidf方法的具体用法?Python Util.tfidf怎么用?Python Util.tfidf使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类util.Util
的用法示例。
在下文中一共展示了Util.tfidf方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: tfidf
# 需要导入模块: from util import Util [as 别名]
# 或者: from util.Util import tfidf [as 别名]
def tfidf(self, tf, df, num_docs):
    """Return the cumulative tf-idf score of this sentence's stemmed terms.

    The score is memoized on the instance: once a non-zero score has been
    computed it is returned directly on subsequent calls.

    Args:
        tf: term-frequency lookup passed through to Util.tfidf.
        df: document-frequency lookup passed through to Util.tfidf.
        num_docs: total number of documents in the corpus.

    Returns:
        The summed tf-idf score over self.stemmed, or 0 if the sentence
        has no stemmed terms.
    """
    # Nothing to score for an empty sentence.
    if not self.stemmed:
        return 0
    # Memoized result from a previous call (idiomatic `!= 0` replaces
    # the original `not self.score == 0`).
    if self.score != 0:
        return self.score
    for term in self.stemmed:
        self.score += Util.tfidf(term, tf, df, num_docs)
    return self.score
示例2: summarize
# 需要导入模块: from util import Util [as 别名]
# 或者: from util.Util import tfidf [as 别名]
def summarize(self, document_path):
    """Produce an extractive summary (~100 words) via k-means clustering.

    Sentences are embedded as L2-normalized tf-idf vectors, clustered with
    k-means, and from each cluster the sentence closest to the centroid is
    selected until the word budget is exceeded.

    Args:
        document_path: path to the plain-text document to summarize.

    Returns:
        The selected Sentence objects in their order of appearance.
    """
    allwords = {}   # stemmed term -> column index in the tf-idf matrix
    sentences = []
    with open(document_path, 'r') as f:
        index = 0
        for line in f:
            s = Util.tokenize(line, Summarizer.non_space)
            sentence = []   # NOTE: reset per line, so sentences never span lines
            for w in s:
                sentence.append(w)
                if Summarizer.sentence_terminator.search(w):
                    sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                    sentences.append(sent)
                    for t in sent.stemmed:
                        if t not in allwords:
                            allwords[t] = index
                            index += 1
                    sentence = []
    matrix = np.zeros((len(sentences), len(allwords)))
    for i, sent in enumerate(sentences):
        for t in sent.stemmed:
            matrix[i, allwords[t]] = Util.tfidf(t, self.tf, self.df, Summarizer.NUM_DOCS)
    # L2-normalize each row. Guard all-zero rows: the original divided by 0,
    # producing NaN rows that leaked into the centroid dot products below
    # (nan_to_num only cleaned the copy given to fit_predict).
    normalizer = np.reshape(np.sum(matrix**2, axis=1)**0.5, (len(matrix), 1))
    normalizer[normalizer == 0] = 1.0
    matrix /= normalizer
    model = KMeans(n_clusters=Cluster.NUM_CLUSTERS, tol=1e-9)
    model.fit_predict(matrix)
    labels = model.labels_
    totalWords = 0
    selected = []
    # From each cluster, pick the sentence that is nearest to the cluster
    # centroid
    for i in range(Cluster.NUM_CLUSTERS):
        member_indices = np.where(labels == i)
        if member_indices[0].size == 0:
            continue  # k-means can leave a cluster empty; argmax would raise
        similarities = np.dot(matrix[member_indices], model.cluster_centers_[i])
        # BUG FIX: the dot product with the centroid is a SIMILARITY, not a
        # distance. For unit rows, ||x - c||^2 = 1 + |c|^2 - 2*x.c, so the
        # nearest sentence maximizes x.c -- the original argmin picked the
        # FARTHEST sentence from each centroid.
        closest_index = np.argmax(similarities, 0)
        # 'closest_index' is the index into the member_indices array
        member_index = member_indices[0][closest_index]
        selected.append((member_index, sentences[member_index]))  # stash the index of the sentence as well
        totalWords += sentences[member_index].getLength()
        if totalWords > 100:
            break
    # return the selected sentences in their order of appearance in the document
    return [s[1] for s in sorted(selected, key=lambda x: x[0])]
示例3: summarize
# 需要导入模块: from util import Util [as 别名]
# 或者: from util.Util import tfidf [as 别名]
def summarize(self, document_path):
    """Produce an extractive summary (~100 words) via latent semantic analysis.

    Builds a term-by-sentence tf-idf matrix, factors it with SVD, and for
    each latent topic (row of V^T) takes the sentence with the strongest
    loading, until the word budget is exceeded.

    Args:
        document_path: path to the plain-text document to summarize.

    Returns:
        The selected Sentence objects in their order of appearance.
    """
    term_row = {}    # stemmed term -> row index in the term/sentence matrix
    sentences = []
    with open(document_path, 'r') as doc:
        next_row = 0
        for line in doc:
            current = []  # tokens of the sentence being assembled (per line)
            for token in Util.tokenize(line, Summarizer.non_space):
                current.append(token)
                if not Summarizer.sentence_terminator.search(token):
                    continue
                # Token ends a sentence: wrap it up and register new stems.
                sent = Sentence(current, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                sentences.append(sent)
                for stem in sent.stemmed:
                    if stem not in term_row:
                        term_row[stem] = next_row
                        next_row += 1
                current = []
    matrix = np.zeros((len(term_row), len(sentences)))
    for col, sent in enumerate(sentences):
        for stem in sent.stemmed:
            matrix[term_row[stem], col] = Util.tfidf(stem, self.tf, self.df, Summarizer.NUM_DOCS)
    # V is returned already transposed by np.linalg.svd.
    U, sigma, V_T = np.linalg.svd(matrix, full_matrices=False)
    # Rows of V_T are latent topics; columns are sentences. For each topic,
    # take the sentence with the highest strength in that row.
    strongest_per_topic = V_T.argmax(axis=1)
    chosen = set()
    selected = []
    totalWords = 0
    for col in strongest_per_topic:
        if col in chosen:
            continue
        chosen.add(col)
        selected.append((col, sentences[col]))  # keep the index for ordering later
        totalWords += sentences[col].getLength()
        if totalWords > 100:
            break
    # Emit the picks in their order of appearance in the document.
    selected.sort(key=lambda pair: pair[0])
    return [pair[1] for pair in selected]