This page collects typical usage examples of the Python method nltk.FreqDist.max. If you are wondering what FreqDist.max does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples for the containing class, nltk.FreqDist.
Fifteen code examples of the FreqDist.max method are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps surface better Python code examples.
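Before the sourced examples, here is a minimal, self-contained sketch of the pattern they all rely on: build a FreqDist from some samples and call max() to get the sample with the highest count. The word list is made up for illustration.

from nltk import FreqDist

words = ["the", "cat", "sat", "on", "the", "mat", "the"]  # toy data for illustration
fd = FreqDist(words)      # count each distinct word
print(fd.max())           # 'the' -- the sample with the greatest count
print(fd[fd.max()])       # 3    -- how many times it occurred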
Example 1: mostprobableparse
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
def mostprobableparse(self, sent, sample=None):
    """Warning: this problem is NP-complete. Using an unsorted
    chart parser avoids unnecessary sorting (since we need all
    derivations anyway).

    @param sent: a sequence of terminals
    @param sample: None or int; if int, sample that many parses"""
    p = FreqDist()
    for a in self.parser.nbest_parse(sent, sample):
        # accumulate the probability mass of each derivation under its tree shape
        # (FreqDist.inc() and Tree.node are the NLTK 2 API)
        p.inc(removeids(a).freeze(), a.prob())
    if p.max():
        return ProbabilisticTree(p.max().node, p.max(), prob=p[p.max()])
    else:
        raise ValueError("no parse")
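FreqDist.inc() was removed in NLTK 3; counts (including fractional weights such as the parse probabilities above) are now accumulated with item assignment. A minimal, self-contained sketch of that idiom, using made-up keys and weights:

from nltk import FreqDist

p = FreqDist()
# pretend these are (tree shape, derivation probability) pairs from a parser
for key, prob in [("tree_a", 0.4), ("tree_b", 0.25), ("tree_a", 0.2)]:
    p[key] += prob              # NLTK 3 replacement for p.inc(key, prob)
print(p.max(), p[p.max()])      # tree_a 0.6 (approximately) -- the shape with the most probability mass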
Example 2: plot_freq
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
def plot_freq(productions):
    prod_fd = FreqDist(productions)
    # map each production to its count, then count how many productions share each count
    prod_to_dist = [prod_fd[key] for key in prod_fd]
    dist_fd = FreqDist(prod_to_dist)
    # candidate frequency values, bounded by the count of the most frequent production (prod_fd.max())
    X_vec = list(range(prod_fd[prod_fd.max()]))[1:]
    Y_vec = [dist_fd[x] for x in X_vec]
    py.plot(X_vec, Y_vec)  # py is the module-level plotting backend (presumably pylab/matplotlib)
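A hedged usage sketch for plot_freq, assuming the intended backend is matplotlib's pylab interface and taking productions from NLTK's sample of the Penn Treebank; the corpus choice is an assumption for illustration, not part of the original.

import pylab as py                     # assumed backend for py.plot above
from nltk import FreqDist
from nltk.corpus import treebank

productions = [prod
               for tree in treebank.parsed_sents()[:200]
               for prod in tree.productions()]
plot_freq(productions)                 # how many productions occur 1, 2, 3, ... times
py.show()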
Example 3: choose_tag
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
def choose_tag(self, tokens, index, history):
    word = tokens[index]
    fd = FreqDist()
    # count how often each WordNet POS (n, v, a, s, r) appears among the word's synsets
    for synset in wordnet.synsets(word):
        fd[synset.pos()] += 1
    if fd:
        # map the most common WordNet POS to the tagger's tag set
        return self.wordnet_tag_map.get(fd.max())
    else:
        return None
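A small, self-contained sketch of the same idea outside the tagger class: pick the WordNet POS that covers the most synsets of a word and map it to a Penn-style tag. The tag mapping here is an assumption for illustration, not taken from the original class.

from nltk import FreqDist
from nltk.corpus import wordnet

wordnet_tag_map = {"n": "NN", "v": "VB", "a": "JJ", "s": "JJ", "r": "RB"}  # hypothetical mapping

def guess_tag(word):
    fd = FreqDist(synset.pos() for synset in wordnet.synsets(word))
    return wordnet_tag_map.get(fd.max()) if fd else None

print(guess_tag("book"))   # 'NN' -- most of book's synsets are nouns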
Example 4: handle
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
def handle(self, *args, **options):
    # Python 2 / NLTK 2 idioms throughout: print statements, fdist.inc(),
    # and fdist.keys() returning samples sorted by frequency.
    fdist = FreqDist()
    print "Analyzing raw data"
    limit = 10
    if args:
        raw_datas = RawData.objects.filter(pk__in=args)
    else:
        raw_datas = RawData.objects.all()[:limit]
    tagged_data = []
    for raw_data in raw_datas:
        words = nltk.word_tokenize(raw_data.data)
        tagged_data.extend(nltk.pos_tag(words))
        for word in words:
            word = word.strip()
            if word:
                fdist.inc(word)
    print "Analyzed %s items" % len(raw_datas)
    print
    print "Top word: %s" % fdist.max()
    print
    print "Top 10 words"
    for word in fdist.keys()[:10]:
        times = fdist[word]
        print " -- %s occurred %s times" % (word, times)
    print
    print "Bottom 10 words"
    for word in fdist.keys()[-10:]:
        times = fdist[word]
        print " -- %s occurred %s times" % (word, times)
    print
    print "Words occurring between 50-100 times"
    words = [word for word in fdist.keys() if fdist[word] >= 50 and fdist[word] <= 100]
    print ", ".join(words)
    cfdist = ConditionalFreqDist()
    for (word, tag) in tagged_data:
        cfdist[tag].inc(word)
    print "Most popular noun: %s" % cfdist["NN"].max()
    print
    print "Top 50 nouns"
    for word in cfdist["NN"].keys()[:50]:
        times = cfdist["NN"][word]
        print " -- %s occurred %s times" % (word, times)
    print
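For readers on current NLTK, a hedged sketch of the same reporting logic in Python 3 / NLTK 3, using plain strings instead of the Django models; most_common() stands in for the frequency-sorted keys() of NLTK 2, and counts are incremented with item assignment.

import nltk
from nltk import FreqDist, ConditionalFreqDist

documents = ["some raw text ...", "more raw text ..."]   # stand-ins for the RawData objects
fdist = FreqDist()
tagged_data = []
for doc in documents:
    words = nltk.word_tokenize(doc)
    tagged_data.extend(nltk.pos_tag(words))
    for word in words:
        word = word.strip()
        if word:
            fdist[word] += 1                      # replaces fdist.inc(word)

print("Top word:", fdist.max())
print("Top 10 words:", fdist.most_common(10))     # replaces fdist.keys()[:10]

cfdist = ConditionalFreqDist()
for word, tag in tagged_data:
    cfdist[tag][word] += 1                        # replaces cfdist[tag].inc(word)
print("Most popular noun:", cfdist["NN"].max())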
Example 5: main
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
def main():
    """ a basic REPL for testing """
    corpus = """(S (NP John) (VP (V likes) (NP Mary)))
(S (NP Peter) (VP (V hates) (NP Susan)))
(S (NP Harry) (VP (V eats) (NP pizza)))
(S (NP Hermione) (VP (V eats)))""".splitlines()
    corpus = """(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT the) (JJ hungry) (NN dog))))
(S (NP (DT The) (JJ little) (NN mouse)) (VP (VBP ate) (NP (DT the) (NN cat))))""".splitlines()
    #corpus = """(S (NP mary) (VP walks) (AP quickly))""".splitlines()
    #(S (NP Harry) (VP (V likes) (NP Susan) (ADVP (RB very) (RB much))))
    corpus = [Tree(a) for a in corpus]
    #d = GoodmanDOP(corpus, rootsymbol='S')
    from bitpar import BitParChartParser
    d = GoodmanDOP(corpus, rootsymbol='TOP', wrap='TOP',
                   parser=BitParChartParser)
    #d = GoodmanDOP(corpus, rootsymbol='TOP', wrap='TOP')
    #print d.grammar
    print "corpus"
    for a in corpus:
        print a
    w = "foo!"
    while w:
        print "sentence:",
        w = raw_input().split()
        try:
            p = FreqDist()
            for n, a in enumerate(d.parser.nbest_parse(w)):
                if n > 1000:
                    break
                print a
                p.inc(ImmutableTree.convert(removeids(a)), a.prob())
            #for b, a in sorted((b,a) for (a,b) in p.items()):
            #    print a, b
            print
            print 'best', p.max(), p[p.max()]
            #print d.parse(w)
        except Exception:  # as e:
            print "error",  # e
Example 6: stem
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
# stem of word
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

def lexical_diversity(text):
    return len(text) / len(set(text))

# text_title and the lemma()/remove_stopwords() helpers are defined elsewhere in the project (see Example 12)
nostop_title = lemma(remove_stopwords(text_title))
nltk.Text(nostop_title).collocations()
# Frequency distribution of text
fdist_title = FreqDist(nostop_title)
fdist_title.most_common(50)
fdist_title.max()
fdist_title.plot(50, cumulative=True)  # plot
fdist_title.plot(50)
total_words = len(set(nostop_title))
print("The total number of distinct words in the KD titles is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("Each word in the KD titles appears on average " + str(int(avg_words)) + " times")
# process for text
f = open('kdtext.txt', encoding="latin-1")
raw_text = f.read()
# check the types
type(raw_text)
tokens = word_tokenize(raw_text)
type(tokens)
Example 7: FreqDist
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
# text1 is a sample text loaded from nltk.book (Moby Dick)
word_len = [len(w) for w in text1]
print word_len
# Example                        Description
# fdist = FreqDist(samples)      create a frequency distribution containing the given samples
# fdist[sample] += 1             increment the count for this sample
# fdist['monstrous']             count of the number of times a given sample occurred
# fdist.freq('monstrous')        frequency of a given sample
# fdist.N()                      total number of samples
# fdist.most_common(n)           the n most common samples and their frequencies
# for sample in fdist:           iterate over the samples
# fdist.max()                    sample with the greatest count
# fdist.tabulate()               tabulate the frequency distribution
# fdist.plot()                   graphical plot of the frequency distribution
# fdist.plot(cumulative=True)    cumulative plot of the frequency distribution
# fdist1 |= fdist2               update fdist1 with counts from fdist2
# fdist1 < fdist2                test if samples in fdist1 occur less frequently than in fdist2
fdlist = FreqDist(len(w) for w in text1)   # distribution over word lengths
print dict(fdlist)
print fdlist.most_common(3)
print fdlist.max()                         # the most common word length
print fdlist[2]
print fdlist.tabulate()
fdlist.plot()
fdlist.plot(cumulative=True)
Example 8: multi_sentence
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
def multi_sentence(context_sentences, ambiguous_word):
    fdist = FreqDist()
    for sentence in context_sentences:
        fdist.inc(lesk(sentence, ambiguous_word))   # NLTK 2 API; tally the sense chosen for each sentence
    return fdist.max()                              # the sense picked most often across sentences
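A hedged NLTK 3 version of the same idea, assuming the Lesk implementation shipped in nltk.wsd (the original may have used its own lesk helper); each context sentence votes for a sense, and the most frequent sense wins.

from nltk import FreqDist, word_tokenize
from nltk.wsd import lesk

def multi_sentence(context_sentences, ambiguous_word):
    fdist = FreqDist()
    for sentence in context_sentences:
        sense = lesk(word_tokenize(sentence), ambiguous_word)
        if sense is not None:
            fdist[sense] += 1                # replaces fdist.inc(...)
    return fdist.max() if fdist else None

sentences = ["I went to the bank to deposit money.",
             "The river bank was muddy after the rain."]
print(multi_sentence(sentences, "bank"))     # the Synset chosen most often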
Example 9: FreqDist
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
#!/usr/bin/python
# coding: utf-8
# 2013/03/20
from nltk import FreqDist
fdist = FreqDist(samples)          # build a frequency distribution from the given samples
fdist.inc(sample)                  # increment the count for sample by 1 (NLTK 2 API, removed in NLTK 3)
fdist['data']                      # number of occurrences of the given sample
fdist.freq('data')                 # relative frequency of the given sample
fdist.N()                          # total number of samples
fdist.keys()                       # samples sorted by frequency (NLTK 2 behaviour)
for sample in fdist:               # iterate over the samples in frequency order (NLTK 2 behaviour)
    pass
fdist.max()                        # the sample with the greatest count
fdist.tabulate()                   # tabulate the frequency distribution
fdist.plot()                       # plot the frequency distribution
fdist.plot(cumulative=True)        # cumulative frequency plot
fdist1 < fdist2                    # test whether samples in fdist1 occur less frequently than in fdist2
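The cheat sheet above is not runnable on its own (samples is undefined and inc() no longer exists in NLTK 3), so here is a short self-contained sketch exercising the same calls on a toy word list:

from nltk import FreqDist

samples = "the quick brown fox jumps over the lazy dog the end".split()
fdist = FreqDist(samples)
print(fdist['the'])            # 3
print(fdist.freq('the'))       # 3/11, about 0.27
print(fdist.N())               # 11 samples in total
print(fdist.max())             # 'the' -- greatest count
fdist.tabulate()               # textual table of counts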
Example 10: FreqDist
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
### What is the most frequent tag?
### Which word has the most number of distinct tags?
from nltk.corpus import brown
from nltk import FreqDist, ConditionalFreqDist

fd = FreqDist()
cfd = ConditionalFreqDist()
# for each tagged sentence in the corpus, get the (token, tag) pair and update
# both count(tag) and count(tag given token)
for sentence in brown.tagged_sents():
    for (token, tag) in sentence:
        fd[tag] += 1
        cfd[token][tag] += 1
# Find the most frequent tag
fd.max()
# Initialize a list to hold (numtags, word) tuples
wordbins = []
# Append each (number of unique tags for token, token) tuple to the list
for token in cfd.conditions():
    wordbins.append((cfd[token].B(), token))
# sort tuples by number of unique tags (highest first)
wordbins.sort(reverse=True)
print(wordbins[0])  # token with max. no. of tags is ...
### What is the ratio of masculine to feminine pronouns?
male = ['he', 'his', 'him', 'himself']  # masculine pronouns
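The masculine/feminine pronoun question is cut off above; one plausible continuation, using the per-token counts already stored in cfd. This is an assumption about where the original was heading, not the original code.

female = ['she', 'hers', 'her', 'herself']       # feminine pronouns (assumed counterpart list)
n_male = sum(cfd[w].N() for w in male)           # total occurrences of masculine pronouns
n_female = sum(cfd[w].N() for w in female)
print("masculine : feminine = %.2f : 1" % (n_male / n_female))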
Example 11: FreqDist
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
title = nostop_title_dsc + nostop_title_kd   # pre-cleaned title tokens from the two corpora
nltk.Text(title).collocations()
fdist_title = FreqDist(title)
fdist_title.most_common(50)
fdist_title.plot(50, cumulative=True)
fdist_title.plot(50)
total_words = len(set(title))
print("The total number of distinct words in the dsc titles is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("Each word in the dsc titles appears on average " + str(int(avg_words)) + " times")
text = nostop_text_dsc + nostop_text_kd
nltk.Text(text).collocations()
fdist_text = FreqDist(text)
fdist_text.most_common(50)
fdist_text.max()
fdist_text.plot(50, cumulative=True)
fdist_text.plot(50)
total_textwords = len(set(text))
print("The total number of distinct words in the text is: " + str(total_textwords))
avg_text = fdist_text.N() / total_textwords
print("Each word in the text appears on average " + str(int(avg_text)) + " times")
# bigrams and trigrams
word_pair_text = list(bigrams(text))
word_triple_text = list(trigrams(text))
bigrams_text = FreqDist(word_pair_text)
trigrams_text = FreqDist(word_triple_text)
bigrams_text.most_common(50)
bigrams_text.plot(50)
bigrams_text.plot(50, cumulative=True)
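Since this page is about max(), note that the same call works on the bigram and trigram distributions built above; a brief sketch reusing the variable names from the example:

print(bigrams_text.max())                   # the single most frequent word pair in the text
print(trigrams_text.max())                  # the single most frequent word triple
print(bigrams_text[bigrams_text.max()])     # and how many times that pair occurs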
Example 12: lemma
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
# The opening lines of this example were cut off; the return below is the body of the
# project's remove_stopwords(word) helper, with its signature and imports reconstructed.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def remove_stopwords(word):
    return [w for w in word if w not in stopwords.words('english') and w != '']

# lemma
def lemma(text):
    lmtzr = WordNetLemmatizer()
    return [lmtzr.lemmatize(w) for w in text]

nostop_title = lemma(remove_stopwords(text_title))
# check the collocations of text
nostop_title = nltk.Text(nostop_title)
nostop_title.collocations()
fdist_title = FreqDist(nostop_title)     # Frequency distribution of text
fdist_title.most_common(50)              # most common 50
fdist_title['science']                   # return count of a given word
fdist_title.max()                        # the most frequent word
fdist_title.plot(50, cumulative=True)    # plot
fdist_title.plot(50)
fdist_title.tabulate(50)                 # tabulate
total_words = len(set(nostop_title))
print("The total number of distinct words in the dsc titles is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("Each word in the dsc titles appears on average " + str(int(avg_words)) + " times")
# bigrams, trigrams
from nltk import bigrams
from nltk import trigrams
word_pair = list(bigrams(nostop_title))
word_triple = list(trigrams(nostop_title))
bigrams_title = FreqDist(word_pair)
trigrams_title = FreqDist(word_triple)
Example 13: FreqDist
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
#!/usr/bin/python3
# coding: utf-8
import nltk
from nltk.corpus import gutenberg            # import the gutenberg corpus
##################################################################
## FreqDist tracks the sample frequencies of a distribution
from nltk import FreqDist                    # import the FreqDist class
fd = FreqDist(gutenberg.words('austen-persuasion.txt'))  # build a frequency distribution over the tokens of the text
print(fd)            # <FreqDist with 6132 samples and 98171 outcomes>: 6132 distinct samples, 98171 tokens
print(type(fd))      # <class 'nltk.probability.FreqDist'>
print(fd['the'])     # 3120; occurrences of the word; a FreqDist behaves like a dict
print(fd.N())        # 98171; total number of word tokens (with repetition), not characters
print(fd.B())        # 6132; number of bins, i.e. unique samples; identical words share one bin
print(len(fd.keys()), type(fd.keys()))   # 6132 <class 'dict_keys'>
print(fd.keys())     # fd.B() only gives the count; this prints the full vocabulary
print(fd.max())      # the most frequent word
print(fd.freq('the'))    # 0.03178127960395636; relative frequency, 3120 / 98171
print(fd.hapaxes())  # ['[', 'Persuasion', 'Jane', ...] words that occur only once (hapaxes)
# The most frequent words are mostly function words, while the rarest (hapaxes) can only be understood
# in context; neither extreme tells you much about what the text is actually about.
for idx, word in enumerate(fd):   # enumerate iterates in order of first occurrence
    if idx == 5:
        break
    print(idx, word)  # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen
##################################################################
## Frequency distribution over word lengths
fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt'))
print(fdist)          # <FreqDist with 16 samples and 98171 outcomes>
print(fdist.items())  # dict_items([(1, 16274), (10, 1615), (2, 16165), (4, 15613), (6, 6538), (7, 5714), (3, 20013), (8, 3348), (13, 230), (9, 2887), (5, 8422), (11, 768), (12, 486), (14, 69), (15, 25), (16, 4)])
print(fdist.most_common(3))  # [(3, 20013), (1, 16274), (2, 16165)]
##################################################################
## Frequency distribution over (lower-cased) alphabetic characters
fdist = nltk.FreqDist(ch.lower() for ch in gutenberg.raw('austen-persuasion.txt') if ch.isalpha())  # no need to wrap the generator in a list
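The character distribution built on the last line is never inspected in the excerpt; a small follow-up using the method this page is about (the expected letter is an assumption, though 'e' is the usual winner for English text):

print(fdist.max())               # most frequent letter, presumably 'e'
print(fdist.freq(fdist.max()))   # its share of all alphabetic characters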
Example 14: FreqDist
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
#!/usr/bin/env python
from nltk.corpus import brown
from nltk import FreqDist, ConditionalFreqDist

fd = FreqDist()
cfd = ConditionalFreqDist()
# for each tagged sentence in the corpus, get the (token, tag) pair and update
# both count(tag) and count(tag given token)
for sentence in brown.tagged_sents():
    for (token, tag) in sentence:
        fd[tag] += 1
        cfd[token][tag] += 1
# The most frequent tag is ...
print(fd.max())
# Initialize a list to hold (numtags, word) tuples
wordbins = []
# Append each (n(unique tags for token), token) tuple to the list
for token in cfd.conditions():
    wordbins.append((cfd[token].B(), token))
# Sort tuples by number of unique tags (highest first)
wordbins.sort(reverse=True)
# The token with max. no. of tags is ...
print(wordbins[0])
# masculine pronouns
Example 15: Document
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
# (module-level imports and the db connection are set up in code omitted from this excerpt)
class Document(object):
    def __init__(self, doc_id):
        # rename metadata something more general?
        self.metadata = {"doc_title": None, "author_lastname": None, "author_first_middle": None,
                         "year_written": None, "year_published": None,
                         "pub_title": None, "pub_type": None, "Type-Token Ratio": None,
                         "Hapax Dislegomena": None, "Honore's R": None, "Yule's K": None,
                         "tokenized_doc": []}
        self.doc_id = doc_id
        self.fdist = None
        self.frequencies = []
        self.metadata_getter()
        self.tokenized_doc_getter()
        self.thrk_getter()
        self.frequency_dist_getter()
        # method?
        # self.timestamp()

    def timestamp(self):
        ts = time.time()
        return datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d_%H%M%S_')

    def metadata_getter(self):
        # move to object?
        cursor = db.cursor()
        c = cursor.execute('SELECT author_lastname, author_first_middle, doc_title, original_publication_title, original_publication_type, year_written, year_published FROM metadata WHERE doc_id = (?)', (self.doc_id,))
        for row in c:
            self.metadata["author_lastname"] = row[0]
            self.metadata["author_first_middle"] = row[1]
            self.metadata["doc_title"] = row[2]
            self.metadata["pub_title"] = row[3]
            self.metadata["pub_type"] = row[4]
            self.metadata["year_written"] = row[5]
            self.metadata["year_published"] = row[6]
        # print "Metadata Found for Doc ", (self.doc_id)

    def tokenized_doc_getter(self):
        # assumes we're connected to db
        doc_name = 'document_' + str(self.doc_id)
        cursor = db.execute('SELECT * FROM {}'.format(doc_name))
        text = []
        for i in cursor:
            text.append(str(i[0]))
        self.metadata["tokenized_doc"] = text
        # print "Tokenized Document ", (self.doc_id)

    def type_token_ratio(self):
        self.metadata["Type-Token Ratio"] = float(self.V / self.N)

    def hap_dis_ratio(self):
        self.metadata["Hapax Dislegomena"] = float(self.hapaxes[2] / self.V)

    # assignments can go in methods
    def honore_r(self):
        if self.hapaxes[1] != 0:
            self.metadata["Honore's R"] = float((100 * math.log(self.N, 10)) / (1 - (self.hapaxes[1] / self.V)))
        else:
            self.metadata["Honore's R"] = 'NA'

    def yule_k(self):
        # sum i**2 * (number of word types occurring i times) over all observed frequencies i
        summation = []
        for i in self.hapaxes:
            summation.append(float(i ** 2 * self.hapaxes[i]))
        # with the summation, find K
        self.metadata["Yule's K"] = float((10 ** 4 * (sum(summation) - self.N)) / (self.N ** 2))

    def frequency_dist(self):
        self.fdist = FreqDist(self.metadata["tokenized_doc"])

    def frequency_dist_getter(self):
        if self.fdist is None:
            self.frequency_dist()
        self.frequencies = self.fdist.items()

    def hapaxes_summation(self):
        self.frequency_dist()
        # the count of the most frequent word bounds the range of observed frequencies
        max_count = self.fdist[self.fdist.max()]
        # hapaxes method (only gets called if you hit else here)
        hapaxes = {}
        for n in range(1, max_count + 1):
            hapaxes[n] = 0
        for i in self.fdist:
            hapaxes[self.fdist[i]] += 1
        self.hapaxes = hapaxes

    def thrk_getter(self):
        cursor = db.cursor()
        c = cursor.execute('SELECT doc_id, t, h, r, k FROM thrk WHERE doc_id = (?)', (self.doc_id,))
        count = 0
        for i in c:
            count += 1
        if count > 0:
            c = cursor.execute('SELECT doc_id, t, h, r, k FROM thrk WHERE doc_id = (?)', (self.doc_id,))
            for i in c:
                self.metadata["Type-Token Ratio"] = i[1]
                self.metadata["Hapax Dislegomena"] = i[2]
                self.metadata["Honore's R"] = i[3]
                self.metadata["Yule's K"] = i[4]
        else:
            self.hapaxes_summation()
            # make these instance variables
            self.N = float(self.fdist.N())
# ......... some of the code is omitted here .........