当前位置: 首页>>代码示例>>Python>>正文


Python Parser.fullParse方法代码示例

本文整理汇总了Python中Parser.Parser.fullParse方法的典型用法代码示例。如果您正苦于以下问题：Python Parser.fullParse方法的具体用法是什么？Python Parser.fullParse怎么用？有哪些Python Parser.fullParse的使用例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类Parser.Parser的用法示例。


在下文中一共展示了Parser.fullParse方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __init__

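# Required import: from Parser import Parser
# fullParse is an instance method, so it cannot be imported on its own;
# call it on a Parser object, e.g. Parser().fullParse(words).
# The snippet also uses TFIDF, whose import is not shown here.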
class InvertedIndex:
    """In-memory inverted index plus a per-document collections index.

    Attributes (all created in ``__init__``):
        inverted_index: dict mapping each parsed word to a dict of
            ``{doc_id: occurrence_count}``.
        collections_index: dict mapping a doc ID to
            ``[doc_name, preview_text]``.
        unique_id: next doc ID to assign in the collections index.
        parser: ``Parser`` instance whose ``fullParse`` normalises words.
        tfidf: most recent result of ``TFIDF().docHandler(...)``.

    NOTE: ``addDocument`` registers the document in the collections index
    (which advances ``unique_id``) *before* indexing its words, so the
    postings of the document stored under collections key ``k`` are
    recorded under ID ``k + 1``.  That numbering is kept unchanged here.
    """

    def __init__(self):
        self.inverted_index = {}
        self.collections_index = {}
        self.unique_id = 0  # for assigning doc ID to docs held in the collections index
        self.parser = Parser()
        self.tfidf = {}

    def addDocument(self, doc_name, doc_text):
        """Index one document and recompute the TF-IDF data.

        Args:
            doc_name: name stored for the document (per the original
                author's note, the document URL).
            doc_text: document text (per the original author's note,
                already stripped of HTML); it is split on whitespace.
        """
        # split the doc_text into a list of words as they appear in the document
        input_text = doc_text.strip().split()

        # keep a short preview of the document and register it
        twenty_words = self.extractTwentyWords(input_text)
        self.addToCollectionsIndex(doc_name, twenty_words)

        # original note: fullParse removes stop words, lower-cases,
        # cleans punctuation symbols, and stems
        parsedWords = self.parser.fullParse(input_text)

        # store words in index
        self.addWordsToInvertedIndex(parsedWords)

        # self.unique_id -- at this point -- is equivalent to len(collections_index)
        self.calcTFIDF()

    def calcTFIDF(self):
        """Recompute ``self.tfidf`` from the current inverted index."""
        t = TFIDF()
        self.tfidf = t.docHandler(self.inverted_index, self.unique_id)

    def extractTwentyWords(self, words_list):
        """Return the first twenty words joined by single spaces.

        Documents shorter than twenty words yield all of their words (an
        empty list yields ``''``) instead of raising ``IndexError``.
        """
        return ' '.join(words_list[:20])

    def addToCollectionsIndex(self, doc_name, twenty_words):
        """Store ``[doc_name, twenty_words]`` under the next doc ID.

        Advances ``self.unique_id`` after storing.
        """
        self.collections_index[self.unique_id] = [doc_name, twenty_words]
        self.unique_id += 1

    def addWordsToInvertedIndex(self, words_list):
        """Count each word in ``words_list`` against ``self.unique_id``.

        Creates the word's posting dict on first sight and increments the
        per-document occurrence count for every appearance.
        """
        for word in words_list:
            postings = self.inverted_index.setdefault(word, {})
            postings[self.unique_id] = postings.get(self.unique_id, 0) + 1

    def printCollectionsIndex(self):
        """Print every collections-index entry as ``key value``."""
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("****************************************************************")
        print("                     COLLECTIONS INDEX")
        print("****************************************************************")
        for key, value in self.collections_index.items():
            print("%s %s" % (key, value))

    def printInvertedIndex(self):
        """Print every inverted-index entry as ``key value``."""
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("****************************************************************")
        print("                      INVERTED INDEX")
        print("****************************************************************")
        for key, value in self.inverted_index.items():
            print("%s %s" % (key, value))
开发者ID:zshainsky,项目名称:CSE7337WebSearch,代码行数:74,代码来源:InvertedIndex.py

示例2: __init__

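# Required import: from Parser import Parser
# fullParse is an instance method, so it cannot be imported on its own;
# call it on a Parser object, e.g. Parser().fullParse(words).
# The snippet also uses TFIDF and pickle, whose imports are not shown here.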
class InvertedIndex:


    def __init__(self):
        """Start with empty indexes, doc ID counter at 0, and a new Parser."""
        # Next doc ID to assign to a document in the collections index.
        self.unique_id = 0
        self.collections_index = dict()
        self.inverted_index = dict()
        self.tfidf = dict()
        self.parser = Parser()

    '''
    # doc_name == url
    # doc_text == stripped of html
    '''
    def addDocument(self, doc_name, doc_text):
        """Index a single document, then recompute the TF-IDF data.

        doc_name -- stored next to the preview text in the collections index
        doc_text -- document text; stripped and split on whitespace
        """
        tokens = doc_text.strip().split()

        # Register the document (with its preview text) first.  This
        # advances self.unique_id, and the advanced value is the ID that
        # addWordsToInvertedIndex records below.
        self.addToCollectionsIndex(doc_name, self.extractTwentyWords(tokens))

        # Index the parser's output rather than the raw tokens (original
        # note: stop words removed, lower-cased, punctuation cleaned, stemmed).
        self.addWordsToInvertedIndex(self.parser.fullParse(tokens))

        # Refresh TF-IDF now that the new document is in the index.
        self.calcTFIDF()

    def calcTFIDF(self):
        """Recompute ``self.tfidf`` from the inverted index and ``unique_id``."""
        self.tfidf = TFIDF().docHandler(self.inverted_index, self.unique_id)
    
    def extractTwentyWords(self, words_list):
        twenty_words = ''
        for i in range(20):
            twenty_words += words_list[i]
            if (i < 19):
                twenty_words += ' '
        return twenty_words

    def addToCollectionsIndex(self, doc_name, twenty_words):
        self.collections_index[self.unique_id] = [doc_name, twenty_words]
        self.unique_id += 1

    def addWordsToInvertedIndex(self, words_list):
        for word in words_list:
            self.inverted_index[word] = self.inverted_index.get(word, {})
            if not self.inverted_index[word] or self.unique_id not in self.inverted_index[word]:  # if empty list
                self.inverted_index[word][self.unique_id] = 1
            elif self.unique_id in self.inverted_index[word]:
                self.inverted_index[word][self.unique_id] += 1

    def printCollectionsIndex(self):
        print "****************************************************************"
        print "                     COLLECTIONS INDEX"
        print "****************************************************************"
        for key, value in self.collections_index.iteritems():
            print key, value

    def printInvertedIndex(self):
        print "****************************************************************"
        print "                      INVERTED INDEX"
        print "****************************************************************"
        for key, value in self.inverted_index.iteritems():
            print key, value

    def pickleThis(self):
        pickle.dump(self.inverted_index, open("Pickled_InvertedIndex.p", "wb"))
        pickle.dump(self.collections_index, open("Pickled_CollectionsIndex.p", "wb"))


    def loadPickles(self):
        self.inverted_index = pickle.load(open("../zCrawler/Pickled_InvertedIndex.p", "rb"))
        self.collections_index = pickle.load(open("../zCrawler/Pickled_CollectionsIndex.p", "rb"))


    def createTermFrequencyMatrix(self):
        tempDocMatrix = {}
        numDocs = len(self.collections_index)
        for key, value in self.inverted_index.iteritems():
            wordFreq = [0] * numDocs
            for k, v in value.iteritems():
                wordFreq[k-1] = v
            tempDocMatrix[key] = wordFreq


        docHeading = ""
        for i in range (0, numDocs):
            docHeading = docHeading + "Doc " + str(i) + "   "

        print "{:45s} {:60s}".format('Word', docHeading)
        for word, freq in tempDocMatrix.iteritems():
            frequency = " "
            for i in range(0, len(freq)):
                frequency = frequency + "       " + str(freq[i])
#.........这里部分代码省略.........
开发者ID:zshainsky,项目名称:CSE7337WebSearch,代码行数:103,代码来源:InvertedIndex.py


注:本文中的Parser.Parser.fullParse方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。