当前位置: 首页>>代码示例>>Python>>正文


Python Concepts.getConcepts方法代码示例

本文整理汇总了Python中nl_lib.Concepts.Concepts.getConcepts方法的典型用法代码示例。如果您正苦于以下问题:Python Concepts.getConcepts方法的具体用法?Python Concepts.getConcepts怎么用?Python Concepts.getConcepts使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在nl_lib.Concepts.Concepts的用法示例。


在下文中一共展示了Concepts.getConcepts方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: Collocations

# 需要导入模块: from nl_lib.Concepts import Concepts [as 别名]
# 或者: from nl_lib.Concepts.Concepts import getConcepts [as 别名]
class Collocations(object):
    """Collect bigram/trigram collocations from the names of loaded concepts."""

    # Source concepts; __init__ replaces this with the result of
    # Concepts.loadConcepts().
    concepts         = None

    # Result collections; __init__ replaces each with a fresh Concepts object.
    conceptsNGram        = None
    conceptsNGramScore   = None
    conceptsNGramSubject = None

    # Legacy spelling of ``conceptsNGramScore``. No visible code in this class
    # assigns it; it is kept so that anything reading the old name still
    # resolves instead of raising AttributeError.
    conceptNGramScore    = None

    # Default concepts file name (the same value __init__ falls back to).
    conceptFile      = u"documents.p"

    # File names for n-gram data; their use is not visible in this part of
    # the file.
    ngramFile        = u"ngrams.p"
    ngramScoreFile   = u"ngramscore.p"
    ngramSubjectFile = u"ngramsubject.p"
    
    def __init__(self, conceptFile=None):
        """Load the source concepts and create the empty result collections.

        Args:
            conceptFile: File name handed to ``Concepts.loadConcepts``. When
                ``None``, the class-level ``conceptFile`` default
                (``u"documents.p"``) is used.
        """
        # Identity test: ``== None`` would dispatch to a custom ``__eq__``.
        if conceptFile is None:
            # Single source of truth for the default file name.
            conceptFile = self.conceptFile

        logger.info(u"Load Concepts from %s " % (conceptFile))
        self.concepts = Concepts.loadConcepts(conceptFile)
        logger.info(u"Loaded Concepts")

        # Containers that find_collocations() fills in.
        self.conceptsNGram = Concepts(u"n-gram", u"NGRAM")
        self.conceptsNGramScore = Concepts(u"NGram_Score", u"Score")
        self.conceptsNGramSubject = Concepts(u"Subject", u"Subjects")

    def getCollocationConcepts(self):
        return self.conceptsNGram, self.conceptsNGramScore, self.conceptsNGramSubject
        
    def find_collocations(self):
        lemmatizer = WordNetLemmatizer()

        stopset = set(stop)
        filter_stops = lambda w: len(w) < 3 or w in stopset

        words = list()

        dictWords = dict()

        for document in self.concepts.getConcepts().values():
            logger.debug(document.name)
            for concept in document.getConcepts().values():
               logger.debug(concept.name)

               for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(concept.name)):
                    logger.debug(u"Word: " + word + u" POS: " + pos)
                    lemmaWord = lemmatizer.lemmatize(word.lower())
                    logger.debug(u"Word: " + word + u" Lemma: " + lemmaWord)
                    words.append(lemmaWord)

                    if pos[0] == u"N":
                        dictWords[lemmaWord] = word


        for x in dictWords.keys():
            logger.info(u"noun : %s" % x)

        bcf = BigramCollocationFinder.from_words(words)
        tcf = TrigramCollocationFinder.from_words(words)

        bcf.apply_word_filter(filter_stops)
        tcf.apply_word_filter(filter_stops)
        tcf.apply_freq_filter(3)

        listBCF = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 100)

        for bigram in listBCF:
            concept = u' '.join([bg for bg in bigram])
            e = self.conceptsNGram.addConceptKeyType(concept, u"BiGram")
            logger.info(u"Bigram  : %s" % concept)
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(concept)):
                e.addConceptKeyType(word, pos)

        listTCF = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 100)

        for trigram in listTCF:
            concept = u' '.join([bg for bg in trigram])
            e = self.conceptsNGram.addConceptKeyType(concept, u"TriGram")
            logger.info(u"Trigram : %s" % concept)
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(concept)):
                e.addConceptKeyType(word, pos)
            
        bcfscored = bcf.score_ngrams(BigramAssocMeasures.likelihood_ratio)
        lt = sorted(bcfscored, key=lambda c: c[1], reverse=True)
        for score in lt:
            name = ' '.join([w for w in score[0]])
            count = float(score[1])
            e = self.conceptsNGramScore.addConceptKeyType(name, u"BiGram")
            for x in score[0]:
                e.addConceptKeyType(x, u"BWord")
            e.count = count
            logger.debug(u"bcfscored: %s=%s" % (name, count))

        tcfscored = tcf.score_ngrams(TrigramAssocMeasures.likelihood_ratio)
        lt = sorted(tcfscored, key=lambda c: c[1], reverse=True)
        for score in lt:
            name = ' '.join([w for w in score[0]])
            count = float(score[1])
            e = self.conceptsNGramScore.addConceptKeyType(name, u"TriGram")
            for x in score[0]:
                e.addConceptKeyType(x, u"TWord")
#.........这里部分代码省略.........
开发者ID:Darth-Neo,项目名称:DirCrawler,代码行数:103,代码来源:nl_phase_d_find_collocations.py


注:本文中的nl_lib.Concepts.Concepts.getConcepts方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。