当前位置: 首页>>代码示例>>Python>>正文


Python LazyCorpusLoader.freqs方法代码示例

本文整理汇总了Python中nltk.corpus.util.LazyCorpusLoader.freqs方法的典型用法代码示例。如果您正苦于以下问题:Python LazyCorpusLoader.freqs方法的具体用法?Python LazyCorpusLoader.freqs怎么用?Python LazyCorpusLoader.freqs使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在nltk.corpus.util.LazyCorpusLoader的用法示例。


在下文中一共展示了LazyCorpusLoader.freqs方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: LangDetector

# 需要导入模块: from nltk.corpus.util import LazyCorpusLoader [as 别名]
# 或者: from nltk.corpus.util.LazyCorpusLoader import freqs [as 别名]
	class LangDetector(object):
		"""Trigram-based language scorer built on the ``langid`` corpus.

		For every requested language the trigram counts are read from the
		``<lang>-3grams.txt`` file of the corpus. ``detect`` then compares the
		trigram distribution of a text against each language's relative
		trigram frequencies.
		"""

		def __init__(self, languages=LangIDDict().keys()):
			"""Load trigram counts and relative frequencies per language.

			:param languages: iterable of language identifiers; each one
				selects the corpus file ``<lang>-3grams.txt``.
			"""
			self.language_trigrams = {}
			self.langid = LazyCorpusLoader('langid', LangIdReader, r'(?!\.).*\.txt')

			for lang in languages:
				fdist = FreqDist()
				for f in self.langid.freqs(fileids=lang+"-3grams.txt"):
					fdist.inc(f[0], f[1])
				self.language_trigrams[lang] = fdist

			# Built once, after all languages are loaded. Building it inside
			# the loop above redid the work for every language and left the
			# attribute undefined when ``languages`` was empty.
			self.language_dicts = {}
			for lang, fdist in self.language_trigrams.items():
				lang_total = float(fdist.N())
				self.language_dicts[lang] = dict(
					(trigram, float(value) / lang_total)
					for trigram, value in fdist.items()
				)

		def detect(self, text):
			"""Score ``text`` against every loaded language.

			:param text: the text to classify; it is lower-cased before
				tokenizing.
			:return: list of ``(language, score)`` pairs sorted by score,
				highest first. Languages sharing no trigram with the text
				keep a score of 0.
			"""
			words = nltk_word_tokenize(text.lower())
			scores = dict((lang, 0) for lang in self.language_trigrams)

			# NOTE(review): get_word_trigrams is not defined in the visible
			# part of this class -- confirm it is provided elsewhere.
			trigcount = [(trigram, 1.0) for match in words for trigram in self.get_word_trigrams(match)]
			if trigcount:
				# Sum the per-occurrence 1.0 weights to get a count per trigram.
				trigdf = pandas.DataFrame(trigcount, columns=["key", "value"])
				trigrams = trigdf.groupby("key")["value"].sum().to_dict()
			else:
				trigrams = {}

			total = float(sum(trigrams.values()))
			for trigram, count in trigrams.items():
				# Share of this trigram among all trigrams of the text.
				trishare = float(count) / total
				for lang, frequencies in self.language_dicts.items():
					if trigram in frequencies:
						scores[lang] += frequencies[trigram] * trishare

			return sorted(scores.items(), key=lambda x: x[1], reverse=True)
开发者ID:harixxy,项目名称:solutions,代码行数:41,代码来源:f7_language.py

示例2: LangDetect

# 需要导入模块: from nltk.corpus.util import LazyCorpusLoader [as 别名]
# 或者: from nltk.corpus.util.LazyCorpusLoader import freqs [as 别名]
class LangDetect(object):
    """Trigram-based language detector with a lazily created shared instance.

    Trigram counts for every requested language are read from the
    ``<lang>-3grams.txt`` file of the ``langid`` corpus. ``detect`` scores a
    text against those counts and returns the best matching language.
    """

    _instance = None
    # Serializes creation of the shared instance in instance().
    _instanceMutex = threading.Semaphore()

    def __init__(self, languages=('nl', 'en', 'fr', 'de', 'es', 'th', 'pt', 'pl', "id", "ru", "it", "tr")):
        """Load the trigram counts of every requested language.

        :param languages: iterable of language identifiers; each one selects
            the corpus file ``<lang>-3grams.txt``. The default is an
            immutable tuple without the duplicated ``"ru"`` entry, so that
            corpus is not loaded twice.
        """
        logger.info("Build " + self.__class__.__name__ + " ... ")
        self.language_trigrams = {}
        self.langid = LazyCorpusLoader('langid', LangIdCorpusReader, r'(?!\.).*\.txt')
        # Serializes calls to detect() on this instance.
        self.__mutex = threading.Semaphore()
        for lang in languages:
            fdist = FreqDist()
            for f in self.langid.freqs(fileids=lang+"-3grams.txt"):
                fdist.inc(f[0], f[1])
            self.language_trigrams[lang] = fdist
        logger.info("Build " + self.__class__.__name__ + ": done!")

    @staticmethod
    def instance():
        """Return the shared ``LangDetect``, creating it on first use."""
        if LangDetect._instance is None:
            # ``with`` releases the semaphore only if it was acquired.
            with LangDetect._instanceMutex:
                # Re-check: another thread may have created the instance
                # while this one was waiting for the semaphore.
                if LangDetect._instance is None:
                    LangDetect._instance = LangDetect()
        return LangDetect._instance

    def detect(self, text):
        """Detect the language of ``text``.

        :param text: non-empty text; it is NFC-normalized and lower-cased
            before tokenizing.
        :return: the identifier of the highest-scoring language whose score
            exceeds 0.0001, or ``None`` when no language reaches that score.
        :raises ValueError: if ``text`` is empty or otherwise falsy.
        :raises ZeroDivisionError: if a loaded language has no trigram
            counts; the operands are logged before re-raising.
        """
        with self.__mutex:
            if not text:
                raise ValueError(u"Text: %s" % (text,))
            text = unicodedata.normalize("NFC", text)
            words = nltk_word_tokenize(text.lower())
            scores = dict((lang, 0) for lang in self.language_trigrams)

            # Count how often each trigram occurs in the text.
            trigrams = {}
            for match in words:
                for trigram in self.__get_word_trigrams(match):
                    trigrams[trigram] = trigrams.get(trigram, 0) + 1

            total = float(sum(trigrams.values()))

            for lang, frequencies in self.language_trigrams.items():
                # Read once per language instead of once per trigram.
                lang_total = float(frequencies.N())
                for trigram, count in trigrams.items():
                    # Normalize both sides and add to the language's score.
                    try:
                        scores[lang] += (float(frequencies[trigram]) / lang_total) * (float(count) / total)
                    except ZeroDivisionError:
                        logger.error(u"Div: %s %s" % (lang_total, total))
                        raise

            sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            for lang, score in sorted_scores:
                if score > 0.0001:
                    return lang
            return None

    def __get_word_trigrams(self, match):
        """Return the character trigrams of ``match`` as joined strings."""
        return [''.join(trigram) for trigram in nltk_trigrams(match) if trigram is not None]
开发者ID:soldierkam,项目名称:pynews,代码行数:76,代码来源:lang.py


注:本文中的nltk.corpus.util.LazyCorpusLoader.freqs方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。