This article collects typical usage examples of the Python method nltk.FreqDist.values. If you have been wondering what FreqDist.values does, how to call it, or how it is used in real code, the curated examples below should help. You can also read more about the class it belongs to, nltk.FreqDist.
The following section shows 13 code examples of the FreqDist.values method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
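Before the examples, here is a minimal, self-contained sketch (not taken from the examples below) of what FreqDist.values returns. Note a version difference: in current NLTK 3.x, FreqDist is a collections.Counter subclass, so values() is an unordered view of the raw counts; in the older NLTK 2.x code used by several snippets below, values() returned a list already sorted by decreasing frequency, which is why indexing such as fd.values()[:10] or fd.values()[i] appears there.

# Minimal sketch, assuming NLTK 3.x; the token list is made up for illustration.
from nltk import FreqDist

tokens = "the cat sat on the mat and the dog sat too".split()
fd = FreqDist(tokens)

print(list(fd.values()))                      # raw counts, in arbitrary order
print(sum(fd.values()) == fd.N())             # True: values() sums to the total token count
print(sorted(fd.values(), reverse=True)[:3])  # top counts, as several examples below compute by hand
print(fd.most_common(3))                      # (sample, count) pairs, the idiomatic NLTK 3 ranking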
Example 1: BootstrapFD
# Required module import: from nltk import FreqDist  [as alias]
# Or: from nltk.FreqDist import values  [as alias]
# Likely imports for this snippet (not shown in the original fragment):
from math import ceil
from numpy import array, arange
from numpy.random import choice, random_integers
from scipy.stats import binom
from nltk import FreqDist

def BootstrapFD(samp):
    fd = FreqDist(samp)
    f1 = float(fd.Nr(1))   # number of species observed exactly once (NLTK 2.x API)
    f2 = float(fd.Nr(2))   # number of species observed exactly twice
    N = float(fd.N())      # total sample size
    B = fd.B()             # number of observed species (bins)
    # Undetected species & Coverage
    if f2 > 0.0:
        f0 = ceil(((N - 1.0) / N) * (f1 ** 2.0) / (2.0 * f2))
        C = 1.0 - f1 / N * (N - 1.0) * f1 / ((N - 1.0) * f1 + 2.0 * f2)
    else:
        f0 = ceil(((N - 1.0) / N) * f1 * (f1 - 1.0) / 2.0)
        C = 1.0 - f1 / N * (N - 1.0) * f1 / ((N - 1.0) * f1 + 2.0)
    # Correct abundances
    probs = array(fd.values()) / N
    lambdah = (1 - C) / sum(probs * (1 - probs) ** N)
    probs = probs * (1 - lambdah * (1 - probs) ** N)
    # P for unseen
    # paux = (1-C)/f0
    yield fd.values()
    popO = arange(B)
    dist = binom(n=N, p=1 - C)
    probsA = probs / sum(probs)
    while True:
        ns2 = dist.rvs()
        ns1 = int(N) - ns2
        if ns1 > 0:
            samp1 = list(choice(popO, size=ns1, replace=True, p=probsA))
        else:
            samp1 = []   # the original fragment assigned samp2 here, which would leave samp1 undefined
        if ns2 > 0:
            samp2 = list(random_integers(B, B + int(f0) - 1, ns2))
        else:
            samp2 = []
        yield FreqDist(samp1 + samp2).values()
Example 2: tf_measure
# Required module import: from nltk import FreqDist  [as alias]
# Or: from nltk.FreqDist import values  [as alias]
# Assumed imports (not shown in the original fragment):
from math import log
from nltk import FreqDist

def tf_measure(word_tokens, query_tokens, N):
    tfscore = 0.0
    freq = FreqDist(word_tokens)
    try:
        wf = max(freq.values())      # highest raw count in the document
    except:
        wf = 0.0                     # empty document
    for token in query_tokens:
        try:
            tf = freq[token]
            tf = 1.0 + log(tf)       # log(0) for unseen tokens falls through to the except
            #tfscore = 0.5 + (0.5 * (0.0 + tf))/(0.0 + wf)
        except:
            tf = 0.0
        tfscore += tf
    # IDF measures
    #print tfscore
    return tfscore
Example 3: get_probs
# Required module import: from nltk import FreqDist  [as alias]
# Or: from nltk.FreqDist import values  [as alias]
from nltk import FreqDist

def get_probs(filename):
    """Read the given text and calculate the probabilities for all symbols."""
    with open(filename) as file_in:
        text = file_in.read()
    probs = FreqDist(text)                      # character-level frequency distribution
    count_sum = sum(v for v in probs.values())
    for k, v in probs.items():
        probs[k] = v * 1.0 / count_sum          # overwrite counts with relative frequencies
    return probs
Example 4: FreqDist
# Required module import: from nltk import FreqDist  [as alias]
# Or: from nltk.FreqDist import values  [as alias]
# How many times does a word appear in the text?
from nltk.book import text1        # Moby Dick
text1.count("do")
# Percentage of the text occupied by a word, see E28 below for a better function.
from nltk.book import text5        # Chat conversations
100*text5.count("call")/len(text5)
100*text5.count("whatever")/len(text5)
# Frequency distribution
from nltk import FreqDist
fdist1 = FreqDist(text1)
vocabulary = fdist1.keys()
frequencies = fdist1.values()
fdist1['whale']
# Define a function that computes lexical diversity
def lexical_diversity(text):
    return len(text)/len(set(text))
# Note that our new function can be used on any text, even your own:
lexical_diversity(myText)
# You can combine two lists with text (the addition operator concatenates strings and lists):
myText1 = ["This", "is", "my", "text", "and"]
myText2 = ["there", "is", "nothing", "you", "can", "do", "about", "it", "!"]
Example 5: tag
# Required module import: from nltk import FreqDist  [as alias]
# Or: from nltk.FreqDist import values  [as alias]
newtuple = (new_words[i], t[1])     # Each new tuple uses same POS tag (t[1])
text.insert(position+i, newtuple)
position += 1
#==============================================================================
text = [(w, p) for w, p in text if re.match(r"[\'a-z]", w[0])]
nonlemwords = [w for w, p in text]
#==============================================================================
# Create non-lemmatized version to use if the lemmatized version doesn't have
# matches (because of differences in POS tagging)
#==============================================================================
bigrams = FreqDist(zip(nonlemwords[:-1], nonlemwords[1:]))
unigram = FreqDist(nonlemwords)
sbig = float(sum(bigrams.values()))
suni = float(sum(unigram.values()))
nonlemassoc = {}
for b0, b1 in bigrams:
    p1 = unigram[b0] / suni
    p2 = unigram[b1] / suni
    p12 = bigrams[b0, b1] / sbig
    nonlemassoc[b0, b1] = log(p12) - log(p1) - log(p2)   # pointwise mutual information
#==============================================================================
# Write SBC bigram association scores to file
#==============================================================================
f = open("/Users/heathersimpson/Documents/Dissertation/Articles/Chp3_IUvsClauseBoundaries/BigramStrength/SBC-nonlembigrams.txt", "w")
# Give it headers first
f.write("Word1\tWord2\tpwMI\n")
f.close()
Example 6: set
# Required module import: from nltk import FreqDist  [as alias]
# Or: from nltk.FreqDist import values  [as alias]
import matplotlib
import string
from nltk import FreqDist

exclude = set(string.punctuation)
with open("YT_Comment_Output.txt", "rb") as f:
    lines = [line.rstrip() for line in f]
splits = [line.split() for line in lines]
some_upper = [item for sublist in splits for item in sublist]
# replace BOM with a known stopword
BOM_gone = [word.replace('\xef\xbb\xbf', 'i') for word in some_upper]
punct_gone = []
for word in BOM_gone:
    punct_gone.append(''.join(ch for ch in word if ch not in exclude))
YT_comment_words = [word.lower() for word in punct_gone]
with open('stopwords.txt', 'rb') as f:
    stopwords = [line.rstrip() for line in f]
print YT_comment_words[:10]
print stopwords[:10]
filtered_words = [w for w in YT_comment_words if not w in stopwords]
print filtered_words[:10]
fd = FreqDist(filtered_words)
print fd.values()[:10]    # NLTK 2.x: values() is a list sorted by decreasing count
print fd
fd.plot(30)
Example 7: while
# Required module import: from nltk import FreqDist  [as alias]
# Or: from nltk.FreqDist import values  [as alias]
# vocab (a FreqDist) and BUF_SIZE are defined earlier in the original script (not shown here).
in_str = sys.stdin.read(BUF_SIZE)
rest = ''
read_count = 0
while (rest + in_str).strip() != '':
    read_count += 1
    if read_count % 100 == 0:
        sys.stderr.write('.')
        sys.stderr.flush()
    tokens = (rest + in_str).split()
    rest = tokens.pop()              # keep the possibly incomplete last token for the next chunk
    if not tokens:
        vocab.update(rest)
        break
    else:
        vocab.update(tokens)
    in_str = sys.stdin.read(BUF_SIZE)
print
for i in [1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000]:
    if i > len(vocab.values()):
        break
    print "vocab size %7d - cutoff = %d" % (i, vocab.values()[i])
Example 8: len
# Required module import: from nltk import FreqDist  [as alias]
# Or: from nltk.FreqDist import values  [as alias]
words = (word for word in words if word not in nltk.corpus.stopwords.words('english'))
#a = nltk.word_tokenize(word_list)
b = nltk.pos_tag(word_list)
c = nltk.ne_chunk(b, binary=True)
tokencount = tokencount + len(word_list)
fdist = FreqDist()
for x in c.subtrees():
    if x.node == "NE":                      # named-entity subtrees (NLTK 2.x tree API)
        words = [w[0] for w in x.leaves()]
        name = " ".join(words)
        #print name
        fdist.inc(name)
        bigfdist.inc(name)
        nercount = nercount + 1
a = [f, tokencount, nercount, fdist.keys(), fdist.values()]
print a
#mycsv = csv.writer(ofile)
mycsv.writerow(a)
mycsv2 = csv.writer(namefile)
for word in bigfdist:
    thepair = word + ',' + str(bigfdist[word])
    mycsv2.writerow(thepair)
mycsv.close()
mycsv2.close()
Example 9: FreqDist
# Required module import: from nltk import FreqDist  [as alias]
# Or: from nltk.FreqDist import values  [as alias]
print "building Text format"
text = nltk.Text(tokens)
print "building freqdist."
fdist = FreqDist(text)
print "freqdist done."
# output result to csv file
print "opening csv file."
csvfile = file("/Users/Zhao/Documents/gone_with_the_wind.csv", "aw")
writer = csv.writer(csvfile)
print "writing csv file"
# no repeated item
arr = []
arr.append((1, fdist.keys()[1], fdist.values()[1], 1))
pre_value = fdist.values()[1]
cur_num = 2
for i in xrange(0, 24903):
    # print i+1, fdist.keys()[i], fdist.values()[i]
    if fdist.values()[i] != pre_value:        # NLTK 2.x: keys()/values() are sorted by decreasing count
        if pattern.match(fdist.keys()[i]) != None:
            item = (i + 1, fdist.keys()[i], fdist.values()[i], cur_num)
            pre_value = fdist.values()[i]
            cur_num += 1
            arr.append(item)
    print i + 1, "done."
arr.append((24903, fdist.keys()[-1], fdist.values()[-1], cur_num))
print arr
Example 10: Model
# Required module import: from nltk import FreqDist  [as alias]
# Or: from nltk.FreqDist import values  [as alias]
#......... part of the code omitted here .........
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set(['dr', 'vs', 'n', 'v', 'etc', 'art', 'p', 'Cost', 'ss', 'pag'])
        sentence_splitter = PunktSentenceTokenizer(punkt_param)
        text = re.sub(ur'[\'\<\>`’]', ' ', self.__rawText)
        #text = re.sub('(\d+)', r' \1 ', text)
        sentences = sentence_splitter.tokenize(text)
        # TOKENS
        self.__tokens = [[token, ''] for token in list(itertools.chain(*[customWordtokenize(sent) for sent in sentences]))]
        wordTokenizer = RegexpTokenizer('[a-zA-Z0-9\xe0\xe1\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa]+')
        #wordTokenizer = RegexpTokenizer('[\w]+')
        sentences = [wordTokenizer.tokenize(sent.lower()) for sent in sentences if len(wordTokenizer.tokenize(sent)) > 0]
        words = list(itertools.chain(*sentences))
        self.__words = words
        self.__sentences = sentences
        self.__avgSentLength = round(np.mean([len(sent) for sent in sentences]), 3)
        self.__avgWordLength = round(np.mean([len(word) for word in words]), 3)
        self.__freqDist = FreqDist(words)
        self.__wordCount = len(words)
        self.__lexicalDiversity = round(len(self.__freqDist.items()) / float(len(words)), 5)
        ### resetting members
        self.__concordanceIndex = None
        self.__bigrams = None
        return encoding

    def computeZipf(self, unit):
        if unit == 'word':
            self.__logx = np.array([math.log(i, 10) for i in range(1, len(self.__freqDist.values()) + 1)])
            self.__logfreqDist = np.array([math.log(i, 10) for i in self.__freqDist.values()])
        if unit == 'bigram':
            bigramFreqDist = dict()
            for first in self.__letters:
                for second in self.__letters:
                    bigramFreqDist[first + second] = 0
            for token in self.__freqDist.items():
                for ii in range(len(token[0]) - 1):
                    try:
                        bigram = token[0][ii] + token[0][ii + 1]
                        bigramFreqDist[bigram] += token[1]
                    except KeyError:
                        print "Key error on token: ", token
            self.__sortedBigrams = sorted([x for x in bigramFreqDist.items() if x[1] > 0], key=itemgetter(1))
            self.__sortedBigrams.reverse()
            self.__logx = np.array([math.log(i, 10) for i in range(1, len(self.__sortedBigrams) + 1)])
            self.__logfreqDist = np.array([math.log(i[1], 10) for i in self.__sortedBigrams])
        if unit == 'letter':
            letterFreqDist = dict()
            for letter in self.__letters:
                letterFreqDist[letter] = 0
            for token in self.__freqDist.items():
                for ii in range(len(token[0])):
                    try:
                        letter = token[0][ii]
Example 11: FreqDist
# Required module import: from nltk import FreqDist  [as alias]
# Or: from nltk.FreqDist import values  [as alias]
# -*- coding: utf-8 -*-
from nltk import FreqDist
from nltk.corpus import reuters

yen = reuters.words(categories='yen')
fd1 = FreqDist(i.lower() for i in yen)
sfd1 = sorted(fd1.values(), reverse=True)
# ---
# Under Zipf's law, rank * frequency stays roughly constant for the top-ranked words.
# (The original fragment sliced fd1 here, which is not subscriptable by ranges.)
for i, v in enumerate(sfd1[0:100], 1): print('%d, %d, %d' % (i, v, i*v))
# ---
import pylab
pylab.plot(sfd1, color='red')
pylab.xscale('log')
pylab.yscale('log')
pylab.show()
# ---
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')
yen_exclude_stops = [i for i in yen if i.lower() not in english_stopwords]
fd2 = FreqDist(i.lower() for i in yen_exclude_stops)
sfd2 = sorted(fd2.values(), reverse=True)
Example 12: get_frequency_distribution
# Required module import: from nltk import FreqDist  [as alias]
# Or: from nltk.FreqDist import values  [as alias]
from nltk import FreqDist

def get_frequency_distribution(words):
    fd = FreqDist(i.lower() for i in words)
    print(fd)
    sorted_fd = sorted(fd.values(), reverse=True)
    print(sorted_fd[0:10])
    return sorted_fd
Example 13: buildcorpus
# Required module import: from nltk import FreqDist  [as alias]
# Or: from nltk.FreqDist import values  [as alias]
def buildcorpus(corpus, rootpath, filelimit=0):
    #rootpath = corpus.rootpath
    fileids = os.listdir(rootpath)
    hugewordlist = []
    hugewordlist.extend(corpus.words)   # will contain distinct Word instances
    numoffiles = 0
    corpus.set_corpusname(str(max(filelimit, len(fileids))) + "texts")
    for fileid in fileids:
        allwords = FreqDist()           # will contain all words in this text
        doc_id = fileid.split(".")[0]
        # corpus.inserttext(doc_id)     ##### ! should pass the text itself
        newtext = Text(doc_id)
        path = rootpath + os.sep + fileid
        #lines = readtextlines(path)
        #rawtext = texter.readtxtfile(path)
        rawtext = texter.readnewstext(path)
        lines = texter.splitToSentences(rawtext)
        sntindex = 0
        # each line is a sentence
        for line in lines:
            words = []                  # words in this sentence
            words = line.split()
            words = texter.eliminatepunctuation(words)
            words = [word for word in words if not word.isspace()]
            for word in words:
                allwords.inc(word)
                newword = Word(word)
                newword.insertsentenceid(doc_id + "_" + str(sntindex))
                if allwords[word] <= 1:   # if this was not added to the hugelist before, add it
                    hugewordlist.append(newword)
            sentence = Sentence(sntindex)
            sntindex = sntindex + 1
            # should we assign a Word or a word index to the sentence?
            for word in words:
                index = hugewordlist.index(Word(word))
                hugewordlist[index].insertsentenceid(doc_id + "_" + str(sntindex - 1))
                sentence.insertword(index)
            newtext.insertsentence(sentence)
        if (not rawtext.isspace()) or (len(allwords) != 0):
            corpus.inserttext(newtext)
        print str(numoffiles), " : finished handling the words-snts-txts ", doc_id
        numofwords = reduce(lambda x, y: x + y, allwords.values())
        for word in hugewordlist:
            cnt = allwords[word.literal]
            #freq = cnt / float(numofwords)
            word.assigntermfreq(cnt, numofwords, doc_id)
            #hugewordlist[index].toscreen()
        numoffiles = numoffiles + 1
        if filelimit == numoffiles:
            break
    # end for - docs
    numofdocs = len(fileids)
    print "computing tf*idf"
    for word in hugewordlist:
        word.computeinvdocfreq(numofdocs)
        word.computeTFIDF()
        #word.toscreen()
    corpus.assignwords(hugewordlist)
    print "corpus length ", str(len(corpus.words)), " words"
    print "huges length ", str(len(hugewordlist)), " words"
    print "exiting buildcorpus()"
    print "pickle-dumping words"
    corpus.pickledumpwords()