This page collects typical usage examples of the Python method nltk.FreqDist.max. If you are wondering what FreqDist.max does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples for the containing class, nltk.FreqDist.
Fifteen code examples of the FreqDist.max method are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps surface better Python code examples.
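Before the sourced examples, here is a minimal, self-contained sketch of the pattern they all rely on: build a FreqDist from some samples and call max() to get the sample with the highest count. The word list is made up for illustration.

from nltk import FreqDist

words = ["the", "cat", "sat", "on", "the", "mat", "the"]  # toy data for illustration
fd = FreqDist(words)      # count each distinct word
print(fd.max())           # 'the' -- the sample with the greatest count
print(fd[fd.max()])       # 3    -- how many times it occurred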
Example 1: mostprobableparse
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
def mostprobableparse(self, sent, sample=None):
    """Warning: this problem is NP-complete. Using an unsorted
    chart parser avoids unnecessary sorting (since we need all
    derivations anyway).

    @param sent: a sequence of terminals
    @param sample: None or int; if int, sample that many parses"""
    p = FreqDist()
    for a in self.parser.nbest_parse(sent, sample):
        # accumulate the probability mass of each derivation under its tree shape
        # (FreqDist.inc() and Tree.node are the NLTK 2 API)
        p.inc(removeids(a).freeze(), a.prob())
    if p.max():
        return ProbabilisticTree(p.max().node, p.max(), prob=p[p.max()])
    else:
        raise ValueError("no parse")
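FreqDist.inc() was removed in NLTK 3; counts (including fractional weights such as the parse probabilities above) are now accumulated with item assignment. A minimal, self-contained sketch of that idiom, using made-up keys and weights:

from nltk import FreqDist

p = FreqDist()
# pretend these are (tree shape, derivation probability) pairs from a parser
for key, prob in [("tree_a", 0.4), ("tree_b", 0.25), ("tree_a", 0.2)]:
    p[key] += prob              # NLTK 3 replacement for p.inc(key, prob)
print(p.max(), p[p.max()])      # tree_a 0.6 (approximately) -- the shape with the most probability mass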
Example 2: plot_freq
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
def plot_freq(productions):
    prod_fd = FreqDist(productions)
    # map each production to its count, then count how many productions share each count
    prod_to_dist = [prod_fd[key] for key in prod_fd]
    dist_fd = FreqDist(prod_to_dist)
    # candidate frequency values, bounded by the count of the most frequent production (prod_fd.max())
    X_vec = list(range(prod_fd[prod_fd.max()]))[1:]
    Y_vec = [dist_fd[x] for x in X_vec]
    py.plot(X_vec, Y_vec)  # py is the module-level plotting backend (presumably pylab/matplotlib)
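A hedged usage sketch for plot_freq, assuming the intended backend is matplotlib's pylab interface and taking productions from NLTK's sample of the Penn Treebank; the corpus choice is an assumption for illustration, not part of the original.

import pylab as py                     # assumed backend for py.plot above
from nltk import FreqDist
from nltk.corpus import treebank

productions = [prod
               for tree in treebank.parsed_sents()[:200]
               for prod in tree.productions()]
plot_freq(productions)                 # how many productions occur 1, 2, 3, ... times
py.show()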
Example 3: choose_tag
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
def choose_tag(self, tokens, index, history):
    word = tokens[index]
    fd = FreqDist()
    # count how often each WordNet POS (n, v, a, s, r) appears among the word's synsets
    for synset in wordnet.synsets(word):
        fd[synset.pos()] += 1
    if fd:
        # map the most common WordNet POS to the tagger's tag set
        return self.wordnet_tag_map.get(fd.max())
    else:
        return None
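A small, self-contained sketch of the same idea outside the tagger class: pick the WordNet POS that covers the most synsets of a word and map it to a Penn-style tag. The tag mapping here is an assumption for illustration, not taken from the original class.

from nltk import FreqDist
from nltk.corpus import wordnet

wordnet_tag_map = {"n": "NN", "v": "VB", "a": "JJ", "s": "JJ", "r": "RB"}  # hypothetical mapping

def guess_tag(word):
    fd = FreqDist(synset.pos() for synset in wordnet.synsets(word))
    return wordnet_tag_map.get(fd.max()) if fd else None

print(guess_tag("book"))   # 'NN' -- most of book's synsets are nouns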
Example 4: handle
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
def handle(self, *args, **options):
    # Python 2 / NLTK 2 idioms throughout: print statements, fdist.inc(),
    # and fdist.keys() returning samples sorted by frequency.
    fdist = FreqDist()
    print "Analyzing raw data"
    limit = 10
    if args:
        raw_datas = RawData.objects.filter(pk__in=args)
    else:
        raw_datas = RawData.objects.all()[:limit]
    tagged_data = []
    for raw_data in raw_datas:
        words = nltk.word_tokenize(raw_data.data)
        tagged_data.extend(nltk.pos_tag(words))
        for word in words:
            word = word.strip()
            if word:
                fdist.inc(word)
    print "Analyzed %s items" % len(raw_datas)
    print
    print "Top word: %s" % fdist.max()
    print
    print "Top 10 words"
    for word in fdist.keys()[:10]:
        times = fdist[word]
        print " -- %s occurred %s times" % (word, times)
    print
    print "Bottom 10 words"
    for word in fdist.keys()[-10:]:
        times = fdist[word]
        print " -- %s occurred %s times" % (word, times)
    print
    print "Words occurring between 50-100 times"
    words = [word for word in fdist.keys() if fdist[word] >= 50 and fdist[word] <= 100]
    print ", ".join(words)
    cfdist = ConditionalFreqDist()
    for (word, tag) in tagged_data:
        cfdist[tag].inc(word)
    print "Most popular noun: %s" % cfdist["NN"].max()
    print
    print "Top 50 nouns"
    for word in cfdist["NN"].keys()[:50]:
        times = cfdist["NN"][word]
        print " -- %s occurred %s times" % (word, times)
    print
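For readers on current NLTK, a hedged sketch of the same reporting logic in Python 3 / NLTK 3, using plain strings instead of the Django models; most_common() stands in for the frequency-sorted keys() of NLTK 2, and counts are incremented with item assignment.

import nltk
from nltk import FreqDist, ConditionalFreqDist

documents = ["some raw text ...", "more raw text ..."]   # stand-ins for the RawData objects
fdist = FreqDist()
tagged_data = []
for doc in documents:
    words = nltk.word_tokenize(doc)
    tagged_data.extend(nltk.pos_tag(words))
    for word in words:
        word = word.strip()
        if word:
            fdist[word] += 1                      # replaces fdist.inc(word)

print("Top word:", fdist.max())
print("Top 10 words:", fdist.most_common(10))     # replaces fdist.keys()[:10]

cfdist = ConditionalFreqDist()
for word, tag in tagged_data:
    cfdist[tag][word] += 1                        # replaces cfdist[tag].inc(word)
print("Most popular noun:", cfdist["NN"].max())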
Example 5: main
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
def main():
    """ a basic REPL for testing """
    corpus = """(S (NP John) (VP (V likes) (NP Mary)))
(S (NP Peter) (VP (V hates) (NP Susan)))
(S (NP Harry) (VP (V eats) (NP pizza)))
(S (NP Hermione) (VP (V eats)))""".splitlines()
    corpus = """(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT the) (JJ hungry) (NN dog))))
(S (NP (DT The) (JJ little) (NN mouse)) (VP (VBP ate) (NP (DT the) (NN cat))))""".splitlines()
    #corpus = """(S (NP mary) (VP walks) (AP quickly))""".splitlines()
    #(S (NP Harry) (VP (V likes) (NP Susan) (ADVP (RB very) (RB much))))
    corpus = [Tree(a) for a in corpus]
    #d = GoodmanDOP(corpus, rootsymbol='S')
    from bitpar import BitParChartParser
    d = GoodmanDOP(corpus, rootsymbol='TOP', wrap='TOP',
                   parser=BitParChartParser)
    #d = GoodmanDOP(corpus, rootsymbol='TOP', wrap='TOP')
    #print d.grammar
    print "corpus"
    for a in corpus:
        print a
    w = "foo!"
    while w:
        print "sentence:",
        w = raw_input().split()
        try:
            p = FreqDist()
            for n, a in enumerate(d.parser.nbest_parse(w)):
                if n > 1000:
                    break
                print a
                p.inc(ImmutableTree.convert(removeids(a)), a.prob())
            #for b, a in sorted((b,a) for (a,b) in p.items()):
            #    print a, b
            print
            print 'best', p.max(), p[p.max()]
            #print d.parse(w)
        except Exception:  # as e:
            print "error",  # e
Example 6: stem
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
# stem of word
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

def lexical_diversity(text):
    return len(text) / len(set(text))

# text_title and the lemma()/remove_stopwords() helpers are defined elsewhere in the project (see Example 12)
nostop_title = lemma(remove_stopwords(text_title))
nltk.Text(nostop_title).collocations()
# Frequency distribution of text
fdist_title = FreqDist(nostop_title)
fdist_title.most_common(50)
fdist_title.max()
fdist_title.plot(50, cumulative=True)  # plot
fdist_title.plot(50)
total_words = len(set(nostop_title))
print("The total number of distinct words in the KD titles is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("Each word in the KD titles appears on average " + str(int(avg_words)) + " times")
# process for text
f = open('kdtext.txt', encoding="latin-1")
raw_text = f.read()
# check the types
type(raw_text)
tokens = word_tokenize(raw_text)
type(tokens)
Example 7: FreqDist
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
# text1 is a sample text loaded from nltk.book (Moby Dick)
word_len = [len(w) for w in text1]
print word_len
# Example                        Description
# fdist = FreqDist(samples)      create a frequency distribution containing the given samples
# fdist[sample] += 1             increment the count for this sample
# fdist['monstrous']             count of the number of times a given sample occurred
# fdist.freq('monstrous')        frequency of a given sample
# fdist.N()                      total number of samples
# fdist.most_common(n)           the n most common samples and their frequencies
# for sample in fdist:           iterate over the samples
# fdist.max()                    sample with the greatest count
# fdist.tabulate()               tabulate the frequency distribution
# fdist.plot()                   graphical plot of the frequency distribution
# fdist.plot(cumulative=True)    cumulative plot of the frequency distribution
# fdist1 |= fdist2               update fdist1 with counts from fdist2
# fdist1 < fdist2                test if samples in fdist1 occur less frequently than in fdist2
fdlist = FreqDist(len(w) for w in text1)   # distribution over word lengths
print dict(fdlist)
print fdlist.most_common(3)
print fdlist.max()                         # the most common word length
print fdlist[2]
print fdlist.tabulate()
fdlist.plot()
fdlist.plot(cumulative=True)
Example 8: multi_sentence
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
def multi_sentence(context_sentences, ambiguous_word):
    fdist = FreqDist()
    for sentence in context_sentences:
        fdist.inc(lesk(sentence, ambiguous_word))   # NLTK 2 API; tally the sense chosen for each sentence
    return fdist.max()                              # the sense picked most often across sentences
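A hedged NLTK 3 version of the same idea, assuming the Lesk implementation shipped in nltk.wsd (the original may have used its own lesk helper); each context sentence votes for a sense, and the most frequent sense wins.

from nltk import FreqDist, word_tokenize
from nltk.wsd import lesk

def multi_sentence(context_sentences, ambiguous_word):
    fdist = FreqDist()
    for sentence in context_sentences:
        sense = lesk(word_tokenize(sentence), ambiguous_word)
        if sense is not None:
            fdist[sense] += 1                # replaces fdist.inc(...)
    return fdist.max() if fdist else None

sentences = ["I went to the bank to deposit money.",
             "The river bank was muddy after the rain."]
print(multi_sentence(sentences, "bank"))     # the Synset chosen most often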
Example 9: FreqDist
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
#!/usr/bin/python
# coding: utf-8
# 2013/03/20
from nltk import FreqDist
fdist = FreqDist(samples)          # build a frequency distribution from the given samples
fdist.inc(sample)                  # increment the count for sample by 1 (NLTK 2 API, removed in NLTK 3)
fdist['data']                      # number of occurrences of the given sample
fdist.freq('data')                 # relative frequency of the given sample
fdist.N()                          # total number of samples
fdist.keys()                       # samples sorted by frequency (NLTK 2 behaviour)
for sample in fdist:               # iterate over the samples in frequency order (NLTK 2 behaviour)
    pass
fdist.max()                        # the sample with the greatest count
fdist.tabulate()                   # tabulate the frequency distribution
fdist.plot()                       # plot the frequency distribution
fdist.plot(cumulative=True)        # cumulative frequency plot
fdist1 < fdist2                    # test whether samples in fdist1 occur less frequently than in fdist2
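The cheat sheet above is not runnable on its own (samples is undefined and inc() no longer exists in NLTK 3), so here is a short self-contained sketch exercising the same calls on a toy word list:

from nltk import FreqDist

samples = "the quick brown fox jumps over the lazy dog the end".split()
fdist = FreqDist(samples)
print(fdist['the'])            # 3
print(fdist.freq('the'))       # 3/11, about 0.27
print(fdist.N())               # 11 samples in total
print(fdist.max())             # 'the' -- greatest count
fdist.tabulate()               # textual table of counts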
Example 10: FreqDist
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
### What is the most frequent tag?
### Which word has the most number of distinct tags?
from nltk.corpus import brown
from nltk import FreqDist, ConditionalFreqDist

fd = FreqDist()
cfd = ConditionalFreqDist()
# for each tagged sentence in the corpus, get the (token, tag) pair and update
# both count(tag) and count(tag given token)
for sentence in brown.tagged_sents():
    for (token, tag) in sentence:
        fd[tag] += 1
        cfd[token][tag] += 1
# Find the most frequent tag
fd.max()
# Initialize a list to hold (numtags, word) tuples
wordbins = []
# Append each (number of unique tags for token, token) tuple to the list
for token in cfd.conditions():
    wordbins.append((cfd[token].B(), token))
# sort tuples by number of unique tags (highest first)
wordbins.sort(reverse=True)
print(wordbins[0])  # token with max. no. of tags is ...
### What is the ratio of masculine to feminine pronouns?
male = ['he', 'his', 'him', 'himself']  # masculine pronouns
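The masculine/feminine pronoun question is cut off above; one plausible continuation, using the per-token counts already stored in cfd. This is an assumption about where the original was heading, not the original code.

female = ['she', 'hers', 'her', 'herself']       # feminine pronouns (assumed counterpart list)
n_male = sum(cfd[w].N() for w in male)           # total occurrences of masculine pronouns
n_female = sum(cfd[w].N() for w in female)
print("masculine : feminine = %.2f : 1" % (n_male / n_female))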
Example 11: FreqDist
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
title = nostop_title_dsc + nostop_title_kd   # pre-cleaned title tokens from the two corpora
nltk.Text(title).collocations()
fdist_title = FreqDist(title)
fdist_title.most_common(50)
fdist_title.plot(50, cumulative=True)
fdist_title.plot(50)
total_words = len(set(title))
print("The total number of distinct words in the dsc titles is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("Each word in the dsc titles appears on average " + str(int(avg_words)) + " times")
text = nostop_text_dsc + nostop_text_kd
nltk.Text(text).collocations()
fdist_text = FreqDist(text)
fdist_text.most_common(50)
fdist_text.max()
fdist_text.plot(50, cumulative=True)
fdist_text.plot(50)
total_textwords = len(set(text))
print("The total number of distinct words in the text is: " + str(total_textwords))
avg_text = fdist_text.N() / total_textwords
print("Each word in the text appears on average " + str(int(avg_text)) + " times")
# bigrams and trigrams
word_pair_text = list(bigrams(text))
word_triple_text = list(trigrams(text))
bigrams_text = FreqDist(word_pair_text)
trigrams_text = FreqDist(word_triple_text)
bigrams_text.most_common(50)
bigrams_text.plot(50)
bigrams_text.plot(50, cumulative=True)
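Since this page is about max(), note that the same call works on the bigram and trigram distributions built above; a brief sketch reusing the variable names from the example:

print(bigrams_text.max())                   # the single most frequent word pair in the text
print(trigrams_text.max())                  # the single most frequent word triple
print(bigrams_text[bigrams_text.max()])     # and how many times that pair occurs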
Example 12: lemma
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
# The opening lines of this example were cut off; the return below is the body of the
# project's remove_stopwords(word) helper, with its signature and imports reconstructed.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def remove_stopwords(word):
    return [w for w in word if w not in stopwords.words('english') and w != '']

# lemma
def lemma(text):
    lmtzr = WordNetLemmatizer()
    return [lmtzr.lemmatize(w) for w in text]

nostop_title = lemma(remove_stopwords(text_title))
# check the collocations of text
nostop_title = nltk.Text(nostop_title)
nostop_title.collocations()
fdist_title = FreqDist(nostop_title)     # Frequency distribution of text
fdist_title.most_common(50)              # most common 50
fdist_title['science']                   # return count of a given word
fdist_title.max()                        # the most frequent word
fdist_title.plot(50, cumulative=True)    # plot
fdist_title.plot(50)
fdist_title.tabulate(50)                 # tabulate
total_words = len(set(nostop_title))
print("The total number of distinct words in the dsc titles is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("Each word in the dsc titles appears on average " + str(int(avg_words)) + " times")
# bigrams, trigrams
from nltk import bigrams
from nltk import trigrams
word_pair = list(bigrams(nostop_title))
word_triple = list(trigrams(nostop_title))
bigrams_title = FreqDist(word_pair)
trigrams_title = FreqDist(word_triple)
Example 13: FreqDist
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
#!/usr/bin/python3
# coding: utf-8
import nltk
from nltk.corpus import gutenberg            # import the gutenberg corpus
##################################################################
## FreqDist tracks the sample frequencies of a distribution
from nltk import FreqDist                    # import the FreqDist class
fd = FreqDist(gutenberg.words('austen-persuasion.txt'))  # build a frequency distribution over the tokens of the text
print(fd)            # <FreqDist with 6132 samples and 98171 outcomes>: 6132 distinct samples, 98171 tokens
print(type(fd))      # <class 'nltk.probability.FreqDist'>
print(fd['the'])     # 3120; occurrences of the word; a FreqDist behaves like a dict
print(fd.N())        # 98171; total number of word tokens (with repetition), not characters
print(fd.B())        # 6132; number of bins, i.e. unique samples; identical words share one bin
print(len(fd.keys()), type(fd.keys()))   # 6132 <class 'dict_keys'>
print(fd.keys())     # fd.B() only gives the count; this prints the full vocabulary
print(fd.max())      # the most frequent word
print(fd.freq('the'))    # 0.03178127960395636; relative frequency, 3120 / 98171
print(fd.hapaxes())  # ['[', 'Persuasion', 'Jane', ...] words that occur only once (hapaxes)
# The most frequent words are mostly function words, while the rarest (hapaxes) can only be understood
# in context; neither extreme tells you much about what the text is actually about.
for idx, word in enumerate(fd):   # enumerate iterates in order of first occurrence
    if idx == 5:
        break
    print(idx, word)  # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen
##################################################################
## Frequency distribution over word lengths
fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt'))
print(fdist)          # <FreqDist with 16 samples and 98171 outcomes>
print(fdist.items())  # dict_items([(1, 16274), (10, 1615), (2, 16165), (4, 15613), (6, 6538), (7, 5714), (3, 20013), (8, 3348), (13, 230), (9, 2887), (5, 8422), (11, 768), (12, 486), (14, 69), (15, 25), (16, 4)])
print(fdist.most_common(3))  # [(3, 20013), (1, 16274), (2, 16165)]
##################################################################
## Frequency distribution over (lower-cased) alphabetic characters
fdist = nltk.FreqDist(ch.lower() for ch in gutenberg.raw('austen-persuasion.txt') if ch.isalpha())  # no need to wrap the generator in a list
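The character distribution built on the last line is never inspected in the excerpt; a small follow-up using the method this page is about (the expected letter is an assumption, though 'e' is the usual winner for English text):

print(fdist.max())               # most frequent letter, presumably 'e'
print(fdist.freq(fdist.max()))   # its share of all alphabetic characters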
Example 14: FreqDist
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
#!/usr/bin/env python
from nltk.corpus import brown
from nltk import FreqDist, ConditionalFreqDist

fd = FreqDist()
cfd = ConditionalFreqDist()
# for each tagged sentence in the corpus, get the (token, tag) pair and update
# both count(tag) and count(tag given token)
for sentence in brown.tagged_sents():
    for (token, tag) in sentence:
        fd[tag] += 1
        cfd[token][tag] += 1
# The most frequent tag is ...
print(fd.max())
# Initialize a list to hold (numtags, word) tuples
wordbins = []
# Append each (n(unique tags for token), token) tuple to the list
for token in cfd.conditions():
    wordbins.append((cfd[token].B(), token))
# Sort tuples by number of unique tags (highest first)
wordbins.sort(reverse=True)
# The token with max. no. of tags is ...
print(wordbins[0])
# masculine pronouns
Example 15: Document
# Module to import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import max [as alias]
# (module-level imports and the db connection are set up in code omitted from this excerpt)
class Document(object):
    def __init__(self, doc_id):
        # rename metadata something more general?
        self.metadata = {"doc_title": None, "author_lastname": None, "author_first_middle": None,
                         "year_written": None, "year_published": None,
                         "pub_title": None, "pub_type": None, "Type-Token Ratio": None,
                         "Hapax Dislegomena": None, "Honore's R": None, "Yule's K": None,
                         "tokenized_doc": []}
        self.doc_id = doc_id
        self.fdist = None
        self.frequencies = []
        self.metadata_getter()
        self.tokenized_doc_getter()
        self.thrk_getter()
        self.frequency_dist_getter()
        # method?
        # self.timestamp()

    def timestamp(self):
        ts = time.time()
        return datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d_%H%M%S_')

    def metadata_getter(self):
        # move to object?
        cursor = db.cursor()
        c = cursor.execute('SELECT author_lastname, author_first_middle, doc_title, original_publication_title, original_publication_type, year_written, year_published FROM metadata WHERE doc_id = (?)', (self.doc_id,))
        for row in c:
            self.metadata["author_lastname"] = row[0]
            self.metadata["author_first_middle"] = row[1]
            self.metadata["doc_title"] = row[2]
            self.metadata["pub_title"] = row[3]
            self.metadata["pub_type"] = row[4]
            self.metadata["year_written"] = row[5]
            self.metadata["year_published"] = row[6]
        # print "Metadata Found for Doc ", (self.doc_id)

    def tokenized_doc_getter(self):
        # assumes we're connected to db
        doc_name = 'document_' + str(self.doc_id)
        cursor = db.execute('SELECT * FROM {}'.format(doc_name))
        text = []
        for i in cursor:
            text.append(str(i[0]))
        self.metadata["tokenized_doc"] = text
        # print "Tokenized Document ", (self.doc_id)

    def type_token_ratio(self):
        self.metadata["Type-Token Ratio"] = float(self.V / self.N)

    def hap_dis_ratio(self):
        self.metadata["Hapax Dislegomena"] = float(self.hapaxes[2] / self.V)

    # assignments can go in methods
    def honore_r(self):
        if self.hapaxes[1] != 0:
            self.metadata["Honore's R"] = float((100 * math.log(self.N, 10)) / (1 - (self.hapaxes[1] / self.V)))
        else:
            self.metadata["Honore's R"] = 'NA'

    def yule_k(self):
        # sum i**2 * (number of word types occurring i times) over all observed frequencies i
        summation = []
        for i in self.hapaxes:
            summation.append(float(i ** 2 * self.hapaxes[i]))
        # with the summation, find K
        self.metadata["Yule's K"] = float((10 ** 4 * (sum(summation) - self.N)) / (self.N ** 2))

    def frequency_dist(self):
        self.fdist = FreqDist(self.metadata["tokenized_doc"])

    def frequency_dist_getter(self):
        if self.fdist is None:
            self.frequency_dist()
        self.frequencies = self.fdist.items()

    def hapaxes_summation(self):
        self.frequency_dist()
        # the count of the most frequent word bounds the range of observed frequencies
        max_count = self.fdist[self.fdist.max()]
        # hapaxes method (only gets called if you hit else here)
        hapaxes = {}
        for n in range(1, max_count + 1):
            hapaxes[n] = 0
        for i in self.fdist:
            hapaxes[self.fdist[i]] += 1
        self.hapaxes = hapaxes

    def thrk_getter(self):
        cursor = db.cursor()
        c = cursor.execute('SELECT doc_id, t, h, r, k FROM thrk WHERE doc_id = (?)', (self.doc_id,))
        count = 0
        for i in c:
            count += 1
        if count > 0:
            c = cursor.execute('SELECT doc_id, t, h, r, k FROM thrk WHERE doc_id = (?)', (self.doc_id,))
            for i in c:
                self.metadata["Type-Token Ratio"] = i[1]
                self.metadata["Hapax Dislegomena"] = i[2]
                self.metadata["Honore's R"] = i[3]
                self.metadata["Yule's K"] = i[4]
        else:
            self.hapaxes_summation()
            # make these instance variables
            self.N = float(self.fdist.N())
# ......... some of the code is omitted here .........