当前位置: 首页>>代码示例>>Python>>正文


Python Util.tfidf方法代码示例

本文整理汇总了Python中util.Util.tfidf方法的典型用法代码示例。如果您正苦于以下问题:Python Util.tfidf方法的具体用法?Python Util.tfidf怎么用?Python Util.tfidf使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在util.Util的用法示例。


在下文中一共展示了Util.tfidf方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: tfidf

# 需要导入模块: from util import Util [as 别名]
# 或者: from util.Util import tfidf [as 别名]
 def tfidf(self, tf, df, num_docs):
     """Return this sentence's tf-idf score, computing and caching it on first use.

     The score is the sum of Util.tfidf over every stemmed term of the
     sentence. A sentence with no stemmed terms scores 0; once a non-zero
     score has been computed it is returned from the cache (self.score)
     without recomputation.
     """
     if not self.stemmed:
         return 0
     if self.score != 0:
         # Already computed on a previous call -- reuse the cached value.
         return self.score
     self.score += sum(Util.tfidf(term, tf, df, num_docs) for term in self.stemmed)
     return self.score
开发者ID:ChenluJi,项目名称:cs224n-project,代码行数:10,代码来源:sentence.py

示例2: summarize

# 需要导入模块: from util import Util [as 别名]
# 或者: from util.Util import tfidf [as 别名]
    def summarize(self, document_path):
        """Summarize a document via k-means clustering of tf-idf sentence vectors.

        Reads the file at `document_path`, splits it into sentences on the
        sentence-terminator regex, builds a sentences-by-vocabulary tf-idf
        matrix, L2-normalizes the rows, clusters them with k-means, and picks
        one sentence per cluster (the member nearest its centroid) until the
        running word count exceeds 100.

        Returns the selected Sentence objects in their original document order.
        """
        allwords = {}
        sentences = []

        with open(document_path, 'r') as f:
            index = 0
            for line in f:
                s = Util.tokenize(line, Summarizer.non_space)
                sentence = []
                for w in s:
                    sentence.append(w)
                    if Summarizer.sentence_terminator.search(w):
                        sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                        sentences.append(sent)
                        # Assign each new stemmed term the next column index.
                        for t in sent.stemmed:
                            if t not in allwords:
                                allwords[t] = index
                                index += 1
                        sentence = []

        matrix = np.zeros((len(sentences), len(allwords)))
        for i, sent in enumerate(sentences):
            for t in sent.stemmed:
                matrix[i, allwords[t]] = Util.tfidf(t, self.tf, self.df, Summarizer.NUM_DOCS)

        # L2-normalize each row. Guard zero-norm rows (sentences whose terms
        # all scored 0) so the division cannot produce NaNs that would later
        # poison the centroid-similarity computation.
        normalizer = np.reshape(np.sum(matrix**2, axis=1)**0.5, (len(matrix), 1))
        normalizer[normalizer == 0] = 1.0
        matrix /= normalizer

        model = KMeans(n_clusters=Cluster.NUM_CLUSTERS, tol=1e-9)
        model.fit_predict(matrix)
        labels = model.labels_

        totalWords = 0
        selected = []

        # From each cluster, pick the member nearest the centroid. Rows are
        # unit-length, so the dot product with the centroid is proportional to
        # cosine similarity: the NEAREST member has the LARGEST dot product.
        # (Bug fix: np.argmin here previously selected the FARTHEST member,
        # contradicting the stated intent.)
        for i in range(Cluster.NUM_CLUSTERS):
            member_indices = np.where(labels == i)
            if member_indices[0].size == 0:
                continue  # k-means can leave a cluster empty; skip it
            similarities = np.dot(matrix[member_indices], model.cluster_centers_[i])
            closest_index = np.argmax(similarities, 0)
            # 'closest_index' is an index into the member_indices array
            member_index = member_indices[0][closest_index]
            selected.append((member_index, sentences[member_index]))  # stash the index of the sentence as well
            totalWords += sentences[member_index].getLength()
            if totalWords > 100:
                break

        # Return the selected sentences in their order of appearance in the document.
        return [s[1] for s in sorted(selected, key=lambda x: x[0])]
开发者ID:ChenluJi,项目名称:cs224n-project,代码行数:53,代码来源:cluster.py

示例3: summarize

# 需要导入模块: from util import Util [as 别名]
# 或者: from util.Util import tfidf [as 别名]
    def summarize(self, document_path):
        """Summarize a document with latent semantic analysis (truncated SVD).

        Reads the file at `document_path`, splits it into sentences on the
        sentence-terminator regex, builds a vocabulary-by-sentences tf-idf
        matrix, and decomposes it with SVD. Each row of V^T is treated as an
        independent topic; for each topic the sentence with the highest
        strength is selected (skipping duplicates) until the running word
        count exceeds 100.

        Returns the selected Sentence objects in their original document order.
        """
        word_index = {}
        parsed = []

        with open(document_path, 'r') as fh:
            next_id = 0
            for raw_line in fh:
                tokens = Util.tokenize(raw_line, Summarizer.non_space)
                current = []
                for token in tokens:
                    current.append(token)
                    if not Summarizer.sentence_terminator.search(token):
                        continue
                    sent = Sentence(current, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                    parsed.append(sent)
                    # Assign each previously-unseen stemmed term the next row index.
                    for stem in sent.stemmed:
                        if stem not in word_index:
                            word_index[stem] = next_id
                            next_id += 1
                    current = []

        matrix = np.zeros((len(word_index), len(parsed)))
        for col, sent in enumerate(parsed):
            for stem in sent.stemmed:
                matrix[word_index[stem], col] = Util.tfidf(stem, self.tf, self.df, Summarizer.NUM_DOCS)

        # V is already transposed by numpy; U and sigma are not needed.
        _, _, V_T = np.linalg.svd(matrix, full_matrices=False)

        # Rows of V_T are 'independent topics', columns are sentences: for
        # each topic take the sentence with the highest strength in that row.
        best_cols = V_T.argmax(axis=1)

        seen = set()
        word_budget = 0
        picked = []

        for col in best_cols:
            if col in seen:
                continue
            seen.add(col)
            picked.append((col, parsed[col]))  # stash the index of the sentence as well
            word_budget += parsed[col].getLength()
            if word_budget > 100:
                break

        # Return the selected sentences in their order of appearance in the document.
        return [pair[1] for pair in sorted(picked, key=lambda pair: pair[0])]
开发者ID:ChenluJi,项目名称:cs224n-project,代码行数:47,代码来源:svd.py


注:本文中的util.Util.tfidf方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。