This article collects typical usage examples of the Python method nltk.probability.FreqDist.freq. If you are wondering what FreqDist.freq does, how to use it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of the containing class, nltk.probability.FreqDist.
The following presents 15 code examples of the FreqDist.freq method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
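Before diving into the examples, here is a minimal sketch of what FreqDist.freq itself computes: for a sample x, fd.freq(x) returns the relative frequency fd[x] / fd.N(), i.e. the count of x divided by the total number of observed outcomes, and 0.0 for unseen samples. The tiny corpus below is made up purely for illustration.

from nltk.probability import FreqDist

fd = FreqDist(['a', 'b', 'a', 'c'])  # toy data: 4 outcomes, 3 distinct samples
print(fd['a'])       # absolute count of 'a': 2
print(fd.N())        # total number of outcomes: 4
print(fd.freq('a'))  # relative frequency: 2 / 4 = 0.5
print(fd.freq('z'))  # unseen sample: 0.0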
Example 1: statsText
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def statsText(text, words):
    fdist = FreqDist()
    # formatted prints will work with Python2 and Python3
    for word in word_tokenize(text):
        fdist[word.lower()] += 1
    # Loop over the words in fdist and see if you can find them among the keys of the words list. Since some words
    # in the words list also carry a wildcard * at the end to denote anything after the initial word, we use a regex
    # to match those rather than matching on equality; e.g. wrong* will match wrong, wrongful, wrongfully, wronged etc.
    frequencies = []
    for word in words:
        if '*' in word:  # if the word has a *, we need to compare it with each item in fdist...
            wordRegEx = word.replace('*', '.*')  # make it suitable for a regular expression...
            for k in fdist:
                m = re.match(wordRegEx, k)
                if m:
                    frequencies.append((word, fdist.freq(m.group())))
        else:
            frequencies.append((word, fdist.freq(word)))
    return frequencies
Example 2: text_to_vector
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def text_to_vector(docs, metric):
    """Create a frequency-based feature vector from text.
    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # distribution over how many documents each word appears in
    tf_dists = []  # list of TF distributions, one per document
    # Create a frequency distribution for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd[word] += 1
        doc_freqs.update(fd.keys())
        tf_dists.append(fd)
    all_tokens = list(doc_freqs.keys())
    num_docs = len(docs)
    num_features = len(all_tokens)
    # Build the feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs) / doc_freqs[word]) for word in all_tokens]
        else:
            raise ValueError("No such feature type: %s" % metric)
        matrix[:, i] = v
    return matrix
Example 3: text_to_dict
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def text_to_dict(docs, metric):
    """Create dictionaries of term frequencies based on documents.
    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # distribution over how many documents each word appears in
    tf_dists = []  # list of TF distributions, one per document
    # Create a frequency distribution for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd[word] += 1
        doc_freqs.update(fd.keys())
        tf_dists.append(fd)
    num_docs = len(docs)
    # Build the dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i % 100 == 0:
            print(' dict %d/%d' % (i, len(tf_dists)))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd:
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd:
                d[word] = fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
        else:
            raise ValueError("No such feature type: %s" % metric)
        dicts.append(d)
    return dicts
Example 4: fun14
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def fun14():
    """counting other things"""
    # print([len(w) for w in text1])
    fdist1 = FreqDist([len(w) for w in text1])
    # print(fdist1.keys())
    # print(fdist1.items())
    # count of words of length 3 => 50223
    print(fdist1[3])
    # the most common word length
    print(fdist1.max())
    # relative frequency of length 3, roughly 20%
    print(fdist1.freq(3))
Example 5: get_best_answers
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def get_best_answers(self, passage_list, q):
    logger = logging.getLogger("qa_logger")
    logger.info("%s:\tAnswer Processing", q.id_q)
    empty = passage_list == []
    logger.info("%s:\t\tAnswer Extraction", q.id_q)
    answer_list = []
    for passage in passage_list:
        a = passage.find_answer(q)
        if a.is_successful():
            answer_list.append(a)
    if not answer_list:
        return ([], empty)
    logger.info("%s:\t\tAnswer Filtering", q.id_q)
    # Obtain answer frequency
    fd = FreqDist(answer_list)
    # Normalize frequencies
    normalize = fd.freq(fd.max())
    # Modify scores by frequency
    for answer in answer_list:
        answer.score = int(answer.score * (fd.freq(answer) / normalize))
    # Sort answers by score
    answer_list.sort(key=lambda x: x.score, reverse=True)
    # Filter bad answers
    try:
        threshold = int(MyConfig.get("answer_filtering", "threshold"))
    except:
        logger = logging.getLogger("qa_logger")
        logger.error("answer quality threshold not found")
        threshold = 50
    answer_list = filter(lambda x: x.score > threshold, answer_list)
    final_answers = []
    for a in answer_list:
        if a not in final_answers:
            final_answers.append(a)
        if len(final_answers) == 3:
            break
    return (final_answers, empty)
Example 6: zipfity
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def zipfity(lst):
    unigram = FreqDist()
    for sent in lst:
        for word in sent:
            unigram[word.lower()] += 1  # the task didn't specify case handling, so everything is lowercased
    sorted_unigram = sorted(unigram, key=unigram.get, reverse=True)
    top10 = sorted_unigram[:10]
    most_freq = unigram.freq(top10[0])
    count = 1
    print('{0:7s}{1:10s}{2:10s}'.format('word', 'obs.freq(%) ', 'zipf-law(%)'))
    print('----------------------------')
    for word in top10:
        print('{0:7s}{1:10.2f}{2:10.2f}'.format(word, unigram.freq(word) * 100, (most_freq / count) * 100))
        count += 1
Example 7: statsText
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def statsText(text, words):
    fdist = FreqDist()
    # formatted prints will work with Python2 and Python3
    for word in word_tokenize(text):
        fdist[word.lower()] += 1
    return [(k, fdist.freq(k)) for k in words]
Example 8: generate_weight_dictionary
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def generate_weight_dictionary(self, service, words):
    df = open(self.dictionary.get_dict_service_file_name(service), "w+")
    t = Text(words)
    freq_dist = FreqDist(t)
    for w in freq_dist:
        weight = 100 * freq_dist.freq(w)
        df.write(w + helper.results_field_separator + str(weight) + "\n")
    df.close()
Example 9: main
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def main():
    argparser = argparse.ArgumentParser(description='text file')
    argparser.add_argument('file', type=str, help='file to produce frequency distribution for')
    args = argparser.parse_args()
    # toker = WhitespaceTokenizer()
    f = open(args.file)
    text = f.read()
    print(text)
    fdist = FreqDist(text)
    print(fdist.freq('28') * 100)
    fdist.plot()
Example 10: freq_lema_ngrams
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def freq_lema_ngrams(list_monograms, list_lemas):
    fdist1 = FreqDist(list_monograms)
    # fdist2 = FreqDist(list_lemas)
    vocabulary1 = fdist1.keys()  # distinct values
    frec_grams = []
    for tag in vocabulary1:
        temp1 = []
        for i in range(len(list_monograms)):
            if list_monograms[i] == tag:
                temp1.append(list_lemas[i])
        temp2 = set(temp1)
        frec_grams.append([tag, fdist1[tag], fdist1.freq(tag), '-'.join(temp2)])
    frec_grams_sort = sorted(frec_grams, key=itemgetter(1), reverse=True)
    return frec_grams_sort
Example 11: _entity_ranking
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def _entity_ranking(self, entities):
    if len(entities) == 0:
        return "", "", int(0)
    # Obtain frequency of entities
    entities_freq = FreqDist(entities)
    # Our answer is the sample with the greatest number of outcomes
    exact = entities_freq.max()
    # Our window is empty because this algorithm generates exact answers
    window = ""
    # Our score is the entity frequency
    score = int(entities_freq.freq(exact) * 1000)
    return exact, window, score
Example 12: create_enhanced_dale_chall_list
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def create_enhanced_dale_chall_list(self):
    # list of sites used to create the list of most frequent words
    alexa_list = ['Google', 'Facebook', 'YouTube', 'Yahoo!', 'Wikipedia', 'Microsoft', 'Amazon', 'Twitter', 'LinkedIn', 'Wordpress', 'Ebay', 'Apple', 'Paypal', 'Imdb', 'Tumblr', 'Disney', 'BBC', 'Livejasmin', 'Craigslist', 'Ask']
    # bring all privacy texts into one list
    corpus = []
    data = get_all_policies()
    for site in data:
        if site in alexa_list:
            corpus.append(data[site]["text"])
    # get the words of this corpus into a list of words
    t = textanalyzer("eng")
    words = t.getWords("".join(corpus))
    # open the Dale-Chall word list
    dale_chall_list = open('../nltk_contrib/dale_chall_wordlist.txt').read().split(';')
    # build a corpus from the words of the 20 privacy policies, dropping all words on the Dale-Chall list of easy words
    new_corpus = []
    for word in words:
        if word.lower() not in dale_chall_list and word not in alexa_list:
            new_corpus.append(word.lower())
    # create a frequency distribution of the remaining words
    fdist = FreqDist(new_corpus)
    # plot it
    fdist.plot(80, cumulative=True)
    # collect the words that make up 33% of the words not on the Dale-Chall list (cumulative), most frequent first
    most_frequ = []
    cum_percentage = 0.0
    for sample, _ in fdist.most_common():
        cum_percentage += fdist.freq(sample)
        most_frequ.append(sample)
        if cum_percentage > 0.33:
            break
    # write those into a file
    privacy_file = open("privacy_wordlist.txt", "w")
    privacy_file.write(";".join(most_frequ))
Example 13: next
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def next(self, s, method=MOST_LIKELY):
    # Pick a transition leaving state s and return a state that would
    # likely follow. The next state is chosen according to the method
    # specified. The default is to choose and return the most likely
    # transition state.
    # determine all states adjacent to s
    transitions = self._adjacentVertices[s]
    freqDist = FreqDist()
    # determine the weights of the edges between state s and all adjacent states
    for state in transitions:
        freqDist[state] += 1
    if method == MarkovChain.MOST_LIKELY:
        return freqDist.max()
    elif method == MarkovChain.LEAST_LIKELY:
        # NLTK provides no built-in method to return the minimum of a
        # frequency distribution, so we take the last entry of the samples
        # sorted in decreasing order of frequency.
        return freqDist.most_common()[-1][0]
    else:
        # choose a real number between 0 and 1
        x = uniform(0, 1)
        # choose the next state based on the weights of the edges; randomness plays a part here
        for i in range(len(transitions)):
            probability = freqDist.freq(transitions[i])
            if x < probability:
                return transitions[i]
            x = x - probability
        raise RuntimeError("Error in MarkovChain.next(). Did not find next state.")
Example 14: get_content_avg_entropy
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def get_content_avg_entropy(self):
    '''
    :return: avg entropy of text/<mime> parts for multipart bodies
    '''
    n = 0
    txt_avg_ent = INIT_SCORE
    # todo: make n-grams
    tokens_list = tuple(self.pattern.get_stemmed_tokens())
    # logger.debug(tokens_list)
    for tokens in tokens_list:
        # logger.debug(tokens)
        n += 1
        freqdist = FreqDist(tokens)
        probs = [freqdist.freq(l) for l in freqdist]
        txt_avg_ent += -sum([p * math.log(p, 2) for p in probs])
    # logger.debug(n)
    if n != 0:
        txt_avg_ent = txt_avg_ent / n
    return txt_avg_ent
Example 15: listdir
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
unigrams_path = reu_path + unigramsFrom
# count word length frequencies
for f in listdir(samples_path):
    if isfile(join(samples_path, f)):
        output_path = reu_path + toDir + f
        output = open(output_path, "w")
        thisfile = open(samples_path + f).read()
        tokens = tokenize(thisfile)
        fd_words = FreqDist([len(w) for w in tokens])
        for a in range(1, 21):
            output.write(str(a) + '\t' + str(fd_words.freq(a)) + '\n')
        count_20 = 0
        # count words of length 20+
        for w in tokens:
            if len(w) >= 20:
                count_20 += 1
        output.write("20+\t" + str(count_20 / len(fd_words)) + '\n')
# count POS tag frequencies
for f in listdir(unigrams_path):
    if isfile(join(unigrams_path, f)):
        output_path = reu_path + toDir + f
        output = open(output_path, "a")
        thisfile = open(unigrams_path + f).read()
        tokens = tokenize(thisfile)