

Python FreqDist.values Method Code Examples

This article collects typical usage examples of the Python method nltk.FreqDist.values from open-source projects. If you are wondering what FreqDist.values does, how to call it, or want working examples, the curated code samples below should help. You can also browse the other usage examples for nltk.FreqDist on this site.


The following 13 code examples of FreqDist.values are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
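A note before the examples: in NLTK 3, FreqDist subclasses collections.Counter, so values() returns an unordered view of the sample counts; in NLTK 2 it returned a list of counts sorted in decreasing frequency, and several of the examples below rely on that older behavior. A minimal sketch of the NLTK 3 semantics:

from nltk import FreqDist

fd = FreqDist("abracadabra")
print(sum(fd.values()))    # 11 -- total observations, same as fd.N()
print(list(fd.values()))   # counts in insertion order: [5, 2, 2, 1, 1]
print(fd.most_common(2))   # [('a', 5), ('b', 2)] -- the NLTK 3 way to get sorted counts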

Example 1: BootstrapFD

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import values [as alias]
# Assumed imports for this Python 2-era snippet (note that
# numpy.random.random_integers has long been deprecated in favor of randint):
from math import ceil
from numpy import array, arange
from numpy.random import choice, random_integers
from scipy.stats import binom
from nltk import FreqDist

def BootstrapFD(samp):
    fd = FreqDist(samp)
    f1 = float(fd.Nr(1))
    f2 = float(fd.Nr(2))
    N = float(fd.N())
    B = fd.B()
    # Undetected species & Coverage
    if f2 > 0.0:
        f0 = ceil(((N - 1.0) / N) * (f1 ** 2.0) / (2.0 * f2))
        C = 1.0 - f1 / N * (N - 1.0) * f1 / ((N - 1.0) * f1 + 2.0 * f2)
    else:
        f0 = ceil(((N - 1.0) / N) * f1 * (f1 - 1.0) / 2.0)
        C = 1.0 - f1 / N * (N - 1.0) * f1 / ((N - 1.0) * f1 + 2.0)
    # Correct abundances
    probs = array(fd.values()) / N
    lambdah = (1 - C) / sum(probs * (1 - probs) ** N)
    probs = probs * (1 - lambdah * (1 - probs) ** N)
    # P for unseen
    # paux = (1-C)/f0
    yield fd.values()
    popO = arange(B)
    dist = binom(n=N, p=1 - C)
    probsA = probs / sum(probs)
    while True:
        ns2 = dist.rvs()
        ns1 = int(N) - ns2
        if ns1 > 0:
            samp1 = list(choice(popO, size=ns1, replace=True, p=probsA))
        else:
            samp1 = []  # bug fix: this branch must reset samp1, not samp2
        if ns2 > 0:
            samp2 = list(random_integers(B, B + int(f0) - 1, ns2))
        else:
            samp2 = []
        yield FreqDist(samp1 + samp2).values()
Author: hsimpson22 | Project: DissChp3_AssociationStrength | Lines: 37 | Source: entropies.py
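A usage sketch for the generator above, under the Python 2 / older-NLTK environment the snippet targets; the first next() yields the observed counts, and every later value is a bootstrap replicate:

from itertools import islice

sample = list("aaaabbbccddef")        # toy sample of species labels
gen = BootstrapFD(sample)
observed = next(gen)                  # the observed frequency counts
for counts in islice(gen, 3):         # three bootstrap resamples
    print(sorted(counts, reverse=True))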

Example 2: tf_measure

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import values [as alias]
# Assumed import: math.log (the except-clauses below rely on log(0) raising ValueError)
from math import log
from nltk import FreqDist

def tf_measure(word_tokens, query_tokens, N):
    tfscore = 0.0

    freq = FreqDist(word_tokens)
    try:
        wf = max(freq.values())   # highest raw count; only used by the commented-out normalization below
    except ValueError:            # empty document
        wf = 0.0

    for token in query_tokens:
        try:
            tf = freq[token]          # FreqDist returns 0 for unseen tokens
            tf = 1.0 + log(tf)        # sublinear tf scaling; log(0) raises ValueError
            #tfscore = 0.5 + (0.5 * (0.0 + tf))/(0.0 + wf)
        except ValueError:
            tf = 0.0

        tfscore += tf

    # IDF measures could be added here

    #print tfscore
    return tfscore
Author: rakshit-agrawal | Project: search_relevance | Lines: 29 | Source: data_fetch.py
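A quick usage sketch with toy inputs (note that the N parameter is accepted but unused by the function as written):

doc_tokens = "the cat sat on the mat with the other cat".split()
query_tokens = "cat mat dog".split()
print(tf_measure(doc_tokens, query_tokens, N=1))   # (1 + log 2) + 1 + 0, about 2.693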

Example 3: get_probs

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import values [as alias]
from nltk import FreqDist
def get_probs(filename):
    """read the given text and calculate the probabilities for all symbols."""
    with open(filename) as file_in:
        text = file_in.read()
    probs = FreqDist(text)
    count_sum = sum(probs.values())
    for k, v in probs.items():
        probs[k] = v * 1.0 / count_sum
    return probs
Author: jhb86253817 | Project: ITM-exercise | Lines: 11 | Source: shannon_fano2.py
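A self-contained check (the temporary file is only for illustration):

import tempfile

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
    tmp.write("abab")
    path = tmp.name

probs = get_probs(path)
print(dict(probs))   # {'a': 0.5, 'b': 0.5}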

Example 4: FreqDist

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import values [as alias]
# How many times does a word appear in the text?
from nltk.book import text1 # Moby Dick
text1.count("do")

# Percentage of the text occupied by a word, see E28 below for a better function.
from nltk.book import text5 # Chat conversations
100 * text5.count("call") / len(text5)       # note: integer division under Python 2
100 * text5.count("whatever") / len(text5)

# Frequency distribution
from nltk import FreqDist

fdist1 = FreqDist(text1)

vocabulary = fdist1.keys()     # the distinct word types

frequencies = fdist1.values()  # their counts (sorted by frequency in NLTK 2, unsorted in NLTK 3)

fdist1['whale']

# Define a function that computes lexical diversity
def lexical_diversity(text):
    return len(text) / len(set(text))

# You can combine two lists with text (the addition operator concatenates strings and lists):
myText1 = ["This", "is", "my", "text", "and"]

myText2 = ["there", "is", "nothing", "you", "can", "do", "about", "it", "!"]

myText = myText1 + myText2

# Note that our new function can be used on any text, even your own:
lexical_diversity(myText)
Author: STIMALiU | Project: TextMiningCourse | Lines: 32 | Source: Intro2NLTK.py

Example 5: tag

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import values [as alias]
# (excerpt from the middle of a loop that splits words and re-inserts them into `text`)
            newtuple = (new_words[i], t[1]) # Each new tuple uses same POS tag (t[1])
            text.insert(position+i, newtuple)
    position += 1

#==============================================================================  

text = [(w,p) for w,p in text if re.match(r"[\'a-z]",w[0])]

nonlemwords = [w for w,p in text]
#==============================================================================
# Create non-lemmatized version to use if the lemmatized version doesn't have matches (because of differences in POS tagging)
#==============================================================================
bigrams = FreqDist(zip(nonlemwords[:-1],nonlemwords[1:]))
unigram = FreqDist(nonlemwords)

sbig = float(sum(bigrams.values()))
suni = float(sum(unigram.values()))

nonlemassoc = {}
for b0, b1 in bigrams:
    p1 = unigram[b0] / suni
    p2 = unigram[b1] / suni
    p12 = bigrams[b0, b1] / sbig
    nonlemassoc[b0, b1] = log(p12) - log(p1) - log(p2)
#==============================================================================
# #Write SBC Bigram association scores to file
#==============================================================================
f = open("/Users/heathersimpson/Documents/Dissertation/Articles/Chp3_IUvsClauseBoundaries/BigramStrength/SBC-nonlembigrams.txt", "w")
# Write the header row first
f.write("Word1\tWord2\tpwMI\n")
f.close()
Author: hsimpson22 | Project: DissChp3_AssociationStrength | Lines: 33 | Source: SBC-bigrams.py
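The association score computed above is pointwise mutual information, log p(w1,w2) - log p(w1) - log p(w2), estimated from relative frequencies. A compact, self-contained sketch of the same computation on a toy sentence:

from math import log
from nltk import FreqDist

words = "the dog chased the cat and the dog barked".split()
bigrams = FreqDist(zip(words[:-1], words[1:]))
unigrams = FreqDist(words)
nbig, nuni = float(sum(bigrams.values())), float(sum(unigrams.values()))

pmi = {}
for (w1, w2), c in bigrams.items():
    pmi[w1, w2] = log(c / nbig) - log(unigrams[w1] / nuni) - log(unigrams[w2] / nuni)

print(sorted(pmi.items(), key=lambda kv: -kv[1])[:3])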

Example 6: set

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import values [as alias]
from nltk import FreqDist
import matplotlib
import string

exclude = set(string.punctuation)

with open("YT_Comment_Output.txt", "rb") as f:
	lines = [line.rstrip() for line in f]
	splits = [line.split() for line in lines]
	some_upper = [item for sublist in splits for item in sublist]
	#replace BOM w known stopword
	BOM_gone = [word.replace('\xef\xbb\xbf', 'i') for word in some_upper]
	punct_gone = []
	for word in BOM_gone: 		
		punct_gone.append(''.join(ch for ch in word if ch not in exclude))
	YT_comment_words = [word.lower() for word in punct_gone]

with open('stopwords.txt', 'rb') as f:
    stopwords = [line.rstrip() for line in f]

print YT_comment_words[:10]
print stopwords[:10]

filtered_words = [w for w in YT_comment_words if not w in stopwords]

print filtered_words[:10]

fd = FreqDist(filtered_words)
print fd.values()[:10]
print fd
fd.plot(30)
Author: andymckenzie | Project: youtube | Lines: 32 | Source: get_freq.py
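This is Python 2 / NLTK 2 code: fd.values()[:10] only works there because values() returned a list sorted by decreasing frequency. A Python 3 / NLTK 3 equivalent sketch of the last few lines, reusing filtered_words from above:

fd = FreqDist(filtered_words)
print([count for word, count in fd.most_common(10)])   # top-10 counts, sorted
print(fd)
fd.plot(30)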

Example 7: while

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import values [as alias]
# (excerpt: `vocab = FreqDist()` and BUF_SIZE are defined earlier in the file)
    in_str = sys.stdin.read(BUF_SIZE)
    rest = ''

    read_count = 0

    while (rest + in_str).strip() != '':
        read_count += 1

        if read_count % 100 == 0:
            sys.stderr.write('.')
            sys.stderr.flush()

        tokens = (rest + in_str).split()
        rest = tokens.pop()

        if not tokens:
            vocab.update([rest])  # bug fix: update() on a bare string would count its characters
            break
        else:
            vocab.update(tokens)

        in_str = sys.stdin.read(BUF_SIZE)

    print

    for i in [1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000]:
        if i > len(vocab.values()):
            break

        print "vocab size %7d - cutoff = %d" % (i, vocab.values()[i])
Author: andrely | Project: sublexical-features | Lines: 32 | Source: find_vocab_cutoff.py
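The cutoff lookup vocab.values()[i] again assumes NLTK 2's frequency-sorted values(). Under NLTK 3, the same table could be sketched by sorting once up front:

counts = sorted(vocab.values(), reverse=True)   # vocab is the FreqDist filled above
for i in [1000, 2000, 5000, 10000, 20000, 50000]:
    if i >= len(counts):
        break
    print("vocab size %7d - cutoff = %d" % (i, counts[i]))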

Example 8: len

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import values [as alias]
# (excerpt: `bigfdist`, `mycsv`, `ofile`, `namefile`, and the counters are defined earlier in the file)
            words = (word for word in words if word not in nltk.corpus.stopwords.words('english'))

            #a = nltk.word_tokenize(word_list)
            b = nltk.pos_tag(word_list)
            c = nltk.ne_chunk(b,binary=True)
            tokencount = tokencount + len(word_list)
            fdist = FreqDist()
            for x in c.subtrees():
                if x.node == "NE":   # NLTK 2 API; use x.label() in NLTK 3
                    words = [w[0] for w in x.leaves()]
                    name = " ".join(words)
                    #print name

                    fdist.inc(name)     # NLTK 2 API; use fdist[name] += 1 in NLTK 3
                    bigfdist.inc(name)
                    nercount = nercount + 1
            a = [f, tokencount, nercount, fdist.keys(), fdist.values()]
            print a

            #mycsv = csv.writer(ofile)
            mycsv.writerow(a)

mycsv2 = csv.writer(namefile)
for word in bigfdist:
    mycsv2.writerow([word, str(bigfdist[word])])   # bug fix: writerow() takes a sequence of fields, not a pre-joined string

# close the underlying file handles (csv writers themselves have no close())
ofile.close()
namefile.close()
   
Author: kirkhess | Project: Digitizing-Serialized-Fiction | Lines: 31 | Source: ner.py

Example 9: FreqDist

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import values [as alias]
# (excerpt: `tokens`, `pattern`, and `import csv` appear earlier in the file)
print "building Text format"
text = nltk.Text(tokens)
print "building freqdist."
fdist = FreqDist(text)
print "freqdist done."

# output result to csv file
print "opening csv file."

csvfile = open("/Users/Zhao/Documents/gone_with_the_wind.csv", "a")  # "a", not "aw": append mode; open() instead of the Python 2-only file()
writer = csv.writer(csvfile)
print "writing csv file"

# no repeated item
# NLTK 2: keys() and values() are both sorted by decreasing frequency, so index i is rank i
arr = []
arr.append((1, fdist.keys()[1], fdist.values()[1], 1))
pre_value = fdist.values()[1]
cur_num = 2
for i in xrange(0, 24903):
    # print i+1, fdist.keys()[i], fdist.values()[i]
    if fdist.values()[i] != pre_value:
        if pattern.match(fdist.keys()[i]) != None:
            item = (i + 1, fdist.keys()[i], fdist.values()[i], cur_num)
            pre_value = fdist.values()[i]
            cur_num += 1
            arr.append(item)
    print i + 1, "done."
arr.append((24903, fdist.keys()[-1], fdist.values()[-1], cur_num))

print arr
Author: ZHAOTING | Project: LangStat | Lines: 32 | Source: gone_with_the_wind_statistic.py
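The repeated fdist.keys()[i] / fdist.values()[i] indexing depends on NLTK 2, where both came back sorted by decreasing frequency. Under NLTK 3, a roughly equivalent rank table could be sketched once with most_common() (pattern is the regex compiled earlier in the original file):

ranked = fdist.most_common()          # [(word, count), ...] sorted by decreasing count
arr = []
pre_value = None
cur_num = 1
for rank, (word, count) in enumerate(ranked, 1):
    if count != pre_value and pattern.match(word):
        arr.append((rank, word, count, cur_num))
        pre_value = count
        cur_num += 1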

Example 10: Model

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import values [as alias]

#......... some of this class's code is omitted here .........
            # bug fix: create PunktParameters first, then attach the abbreviations
            # (the original re-created punkt_param afterwards, discarding abbrev_types)
            punkt_param = PunktParameters()
            punkt_param.abbrev_types = set(['dr', 'vs', 'n', 'v', 'etc', 'art', 'p', 'Cost', 'ss', 'pag'])
            sentence_splitter = PunktSentenceTokenizer(punkt_param)
            text = re.sub(ur'[\'\<\>`’]', ' ', self.__rawText)
            #text = re.sub('(\d+)', r' \1 ', text)
            sentences = sentence_splitter.tokenize(text)
            
            #TOKENS
            self.__tokens = [[token, ''] for token in list(itertools.chain(*[ customWordtokenize(sent) for sent in sentences]))]
            wordTokenizer = RegexpTokenizer('[a-zA-Z0-9\xe0\xe1\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa]+')
            #wordTokenizer = RegexpTokenizer('[\w]+')
            
            sentences = [wordTokenizer.tokenize(sent.lower()) for sent in sentences if len(wordTokenizer.tokenize(sent)) > 0]
            words =  list(itertools.chain(*sentences))
            self.__words = words
            self.__sentences = sentences
            
            self.__avgSentLength = round(np.mean( [len(sent) for sent in sentences]), 3)
            self.__avgWordLength = round(np.mean( [len(word) for word in words]), 3)
            self.__freqDist = FreqDist(words)
            self.__wordCount = len(words)
            self.__lexicalDiversity = round(len(self.__freqDist.items())/float(len(words)), 5)
            
            ### resetting members
            self.__concordanceIndex = None
            self.__bigrams = None
                 
        return encoding
    
    def computeZipf(self, unit):
        
        if unit == 'word':
            self.__logx = np.array([math.log(i, 10) for i in range(1, len(self.__freqDist.values()) + 1)])
            self.__logfreqDist = np.array([math.log(i, 10) for i in self.__freqDist.values()])
        
        if unit == 'bigram':
            
            bigramFreqDist = dict()
            for first in self.__letters:
                for second in self.__letters:
                    bigramFreqDist[first+second] = 0
            
            for token in self.__freqDist.items():
                for ii in range(len(token[0])-1):
                    try:
                        bigram = token[0][ii]+token[0][ii+1]
                        bigramFreqDist[bigram] += token[1]
                    except KeyError:
                        print "Key error on token: ", token

            self.__sortedBigrams = sorted([x for x in bigramFreqDist.items() if x[1]>0], key=itemgetter(1))
            self.__sortedBigrams.reverse()
            self.__logx = np.array([math.log(i, 10) for i in range(1, len(self.__sortedBigrams) + 1)])
            self.__logfreqDist = np.array([math.log(i[1], 10) for i in self.__sortedBigrams])
            
        if unit == 'letter':

            letterFreqDist = dict()
            for letter in self.__letters:
                    letterFreqDist[letter] = 0
                
            for token in self.__freqDist.items():
                for ii in range(len(token[0])):
                    try:
                        letter = token[0][ii]
Author: gabruszka | Project: SAIL | Lines: 70 | Source: Model.py

Example 11: FreqDist

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import values [as alias]
# -*- coding: utf-8 -*-
from nltk import FreqDist
from nltk.corpus import reuters


yen = reuters.words(categories='yen')
fd1 = FreqDist(i.lower() for i in yen)
sfd1 = sorted(fd1.values(), reverse=True)

# ---

# Zipf's law check: rank * frequency should stay roughly constant
for i, v in enumerate(sfd1[0:100], 1): print('%d, %d, %d' % (i, v, i * v))   # bug fix: enumerate the sorted counts sfd1, not the FreqDist fd1

# ---

import pylab
pylab.plot(sfd1, color='red')

pylab.xscale('log')
pylab.yscale('log')
pylab.show()

# ---

from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')

yen_exclude_stops = [i for i in yen if i.lower() not in english_stopwords]
fd2 = FreqDist(i.lower() for i in yen_exclude_stops)
sfd2 = sorted(fd2.values(), reverse=True)
Author: t2y | Project: learnnlp | Lines: 32 | Source: demo.py

Example 12: get_frequency_distribution

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import values [as alias]
from nltk import FreqDist
def get_frequency_distribution(words):
    fd = FreqDist(i.lower() for i in words)
    print(fd)
    sorted_fd = sorted(fd.values(), reverse=True)
    print(sorted_fd[0:10])
    return sorted_fd
Author: t2y | Project: learnnlp | Lines: 8 | Source: practice23_a.py

Example 13: buildcorpus

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import values [as alias]
def buildcorpus(corpus, rootpath, filelimit = 0):
    
    #rootpath = corpus.rootpath
    fileids = os.listdir(rootpath)
    
    hugewordlist = []   
    hugewordlist.extend(corpus.words)   # will contain distinct Word instances

    numoffiles = 0
    
    corpus.set_corpusname(str(max(filelimit, len(fileids)))+"texts")
    
    for fileid in fileids:

        allwords = FreqDist()    # will contain all words in this text

        doc_id = fileid.split(".")[0]
        # corpus.inserttext(doc_id)    ##### ! should pass the Text object itself
        newtext = Text(doc_id)
        
        path = rootpath + os.sep + fileid
        #lines = readtextlines(path)
    
        #rawtext = texter.readtxtfile(path)
        rawtext = texter.readnewstext(path)
        lines = texter.splitToSentences(rawtext)
        
        sntindex = 0
        # each line is a sentence
        for line in lines:
            words = []   # words in this sentence
            words = line.split()
            words = texter.eliminatepunctuation(words)
            words = [word for word in words if not word.isspace()]

            for word in words:
                allwords.inc(word)   # NLTK 2 API; in NLTK 3 use allwords[word] += 1

                
                newword = Word(word)
                newword.insertsentenceid(doc_id+"_"+str(sntindex))
                
                if allwords[word] <= 1:    # if this was not added to the hugelist before, add it
                    hugewordlist.append(newword)
                
                    
            sentence = Sentence(sntindex)
            sntindex = sntindex + 1
            
            # should the sentence store Word objects or word indices?
            for word in words:
                index = hugewordlist.index(Word(word))
                hugewordlist[index].insertsentenceid(doc_id+"_"+str(sntindex-1))
                sentence.insertword(index)
                
            newtext.insertsentence(sentence)
            
        if (not rawtext.isspace()) or (len(allwords) != 0):
            corpus.inserttext(newtext)

            print str(numoffiles), " : finished handling the words-snts-txts ", doc_id
    
                
            numofwords = sum(allwords.values())   # total token count in this text

            for word in hugewordlist:
                cnt = allwords[word.literal]
                #freq = cnt / float(numofwords)
                word.assigntermfreq(cnt, numofwords, doc_id)
                #hugewordlist[index].toscreen()
        
        numoffiles = numoffiles + 1
        if filelimit == numoffiles:
            break       

        
    # end for - docs
    

    numofdocs = len(fileids)
    print "computing tf*idf"
    for word in hugewordlist:
        word.computeinvdocfreq(numofdocs)
        word.computeTFIDF()
        #word.toscreen()
        
    corpus.assignwords(hugewordlist)
    print "corpus length ",str(len(corpus.words))," words"
    print "huges length ",str(len(hugewordlist))," words"
    print "exiting buildcorpus()"
    
    print "pickle-dumping words"
    corpus.pickledumpwords()
Author: dicleoztur | Project: subjectivity_detection | Lines: 98 | Source: ReaderV2.py
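buildcorpus leans on NLTK 2 APIs (FreqDist.inc) and Python 2 prints. The counting core maps directly onto the modern interface; a minimal sketch of the term-frequency step under NLTK 3:

from nltk import FreqDist

allwords = FreqDist()
for word in "a rose is a rose".split():
    allwords[word] += 1                 # NLTK 3 replacement for allwords.inc(word)

numofwords = sum(allwords.values())
tf = {w: count / float(numofwords) for w, count in allwords.items()}
print(tf)   # {'a': 0.4, 'rose': 0.4, 'is': 0.2}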


Note: the nltk.FreqDist.values method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by their authors and remain their copyright; consult each project's License before reusing them, and do not republish without permission.