本文整理汇总了Python中nltk.corpus.util.LazyCorpusLoader.freqs方法的典型用法代码示例。如果您正苦于以下问题：Python LazyCorpusLoader.freqs方法的具体用法？Python LazyCorpusLoader.freqs怎么用？Python LazyCorpusLoader.freqs使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类nltk.corpus.util.LazyCorpusLoader的用法示例。
在下文中一共展示了LazyCorpusLoader.freqs方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: LangDetector
# 需要导入模块: from nltk.corpus.util import LazyCorpusLoader [as 别名]
# 或者: from nltk.corpus.util.LazyCorpusLoader import freqs [as 别名]
class LangDetector(object):
    """Score a text against per-language character-trigram frequencies.

    Trigram counts are read from the ``langid`` corpus (one
    ``<lang>-3grams.txt`` file per language) and converted to relative
    frequencies once, at construction time.

    NOTE(review): ``detect`` calls ``self.get_word_trigrams``, which is not
    defined in the part of the class visible here -- confirm it is provided
    elsewhere (e.g. by a subclass or a later definition).
    """

    def __init__(self, languages=None):
        """Load the trigram tables for ``languages``.

        Args:
            languages: iterable of language ids. For each id the corpus file
                ``<id>-3grams.txt`` is read. When omitted, the keys of
                ``LangIDDict()`` are used.
        """
        if languages is None:
            # Resolved per call rather than in the signature, so that
            # LangIDDict is not instantiated as a side effect of importing
            # this module.
            languages = LangIDDict().keys()

        self.language_trigrams = {}
        self.langid = LazyCorpusLoader('langid', LangIdReader, r'(?!\.).*\.txt')
        for lang in languages:
            fdist = FreqDist()
            for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
                # FreqDist.inc() was removed in NLTK 3; item increment is
                # supported by both the NLTK 2 and NLTK 3 FreqDist.
                fdist[f[0]] += f[1]
            self.language_trigrams[lang] = fdist

        # Relative frequency of every trigram, per language.
        self.language_dicts = {}
        for lang, fdist in self.language_trigrams.items():
            sample_total = float(fdist.N())
            self.language_dicts[lang] = dict(
                (trigram, float(value) / sample_total)
                for trigram, value in fdist.items()
            )

    def detect(self, text):
        """Rank the known languages by how well they match ``text``.

        The text is lower-cased and tokenized; every trigram of every token
        is counted, and each language accumulates
        ``language_frequency(trigram) * share_of_trigram_in_text``.

        Args:
            text: the string to classify.

        Returns:
            A list of ``(language, score)`` pairs sorted by descending score.
            Every loaded language is present; languages sharing no trigram
            with the text keep a score of 0.
        """
        words = nltk_word_tokenize(text.lower())

        # Occurrences of each trigram in the text (kept as floats so the
        # share computation below is a true division on Python 2 as well).
        trigrams = {}
        for word in words:
            for trigram in self.get_word_trigrams(word):
                trigrams[trigram] = trigrams.get(trigram, 0.0) + 1.0

        scores = dict((lang, 0) for lang in self.language_trigrams)
        total = sum(trigrams.values())
        # When no trigram was found the loop body never runs, so the
        # division by ``total`` cannot hit zero.
        for trigram, count in trigrams.items():
            trishare = float(count) / float(total)
            for lang, frequencies in self.language_dicts.items():
                if trigram in frequencies:
                    scores[lang] += frequencies[trigram] * trishare

        return sorted(scores.items(), key=lambda x: x[1], reverse=True)
示例2: LangDetect
# 需要导入模块: from nltk.corpus.util import LazyCorpusLoader [as 别名]
# 或者: from nltk.corpus.util.LazyCorpusLoader import freqs [as 别名]
class LangDetect(object):
    """Trigram-based language detector with a lazily created shared instance.

    Trigram counts for each language are read from the ``langid`` corpus
    (one ``<lang>-3grams.txt`` file per language) when the object is built.
    Use ``LangDetect.instance()`` to obtain the shared instance.
    """

    _instance = None
    _instanceMutex = threading.Semaphore()

    def __init__(self, languages=['nl', 'en', 'fr', 'de', 'es', 'th', 'pt', 'pl', "id", "ru", "it", "ru", "tr"]):
        """Load the trigram counts for ``languages``.

        Args:
            languages: iterable of language ids; for each id the corpus file
                ``<id>-3grams.txt`` is read. The default list is only
                iterated, never mutated.
        """
        logger.info("Build " + self.__class__.__name__ + " ... ")
        self.language_trigrams = {}
        self.langid = LazyCorpusLoader('langid', LangIdCorpusReader, r'(?!\.).*\.txt')
        self.__mutex = threading.Semaphore()
        for lang in languages:
            if lang in self.language_trigrams:
                # A repeated id (the default list contains "ru" twice) would
                # only rebuild the identical table from the same file.
                continue
            fdist = FreqDist()
            for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
                # FreqDist.inc() was removed in NLTK 3; item increment is
                # supported by both the NLTK 2 and NLTK 3 FreqDist.
                fdist[f[0]] += f[1]
            self.language_trigrams[lang] = fdist
        logger.info("Build " + self.__class__.__name__ + ": done!")

    @staticmethod
    def instance():
        """Return the shared ``LangDetect``, building it on first use.

        The instance is re-checked after ``_instanceMutex`` is acquired so
        that only one caller constructs it.
        """
        if LangDetect._instance is not None:
            return LangDetect._instance
        # ``with`` releases the semaphore only if it was actually acquired.
        with LangDetect._instanceMutex:
            if LangDetect._instance is None:
                LangDetect._instance = LangDetect()
            return LangDetect._instance

    def detect(self, text):
        """Detect the language of ``text``.

        Args:
            text: unicode string to classify; must be non-empty.

        Returns:
            The id of the best-scoring language whose score exceeds 0.0001,
            or ``None`` when no language reaches that threshold.

        Raises:
            ValueError: if ``text`` is empty or otherwise falsy.
            ZeroDivisionError: if a loaded language table has no samples
                (logged before being re-raised).
        """
        with self.__mutex:
            if not text:
                raise ValueError(u"Text: %s" % (text,))
            text = unicodedata.normalize("NFC", text)
            words = nltk_word_tokenize(text.lower())

            # Occurrences of each trigram over all tokens of the text.
            trigrams = {}
            for match in words:
                for trigram in self.__get_word_trigrams(match):
                    trigrams[trigram] = trigrams.get(trigram, 0) + 1

            scores = dict((lang, 0) for lang in self.language_trigrams)
            total = sum(trigrams.values())
            for trigram, count in trigrams.items():
                share = float(count) / float(total)
                for lang, frequencies in self.language_trigrams.items():
                    # Relative frequency in the language, weighted by the
                    # trigram's share of the text.
                    try:
                        scores[lang] += (float(frequencies[trigram]) / float(frequencies.N())) * share
                    except ZeroDivisionError:
                        logger.error(u"Div: %s %s", float(frequencies.N()), float(total))
                        # Bare raise keeps the original traceback.
                        raise

            sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            for lang, score in sorted_scores:
                if score > 0.0001:
                    return lang
            return None

    def __get_word_trigrams(self, match):
        """Return the trigrams of ``match`` joined into strings."""
        return [''.join(trigram) for trigram in nltk_trigrams(match) if trigram is not None]