

Python FreqDist.items Method Code Examples

This article collects typical usage examples of the Python method nltk.FreqDist.items. If you are wondering what FreqDist.items does, how to call it, or what real-world usage looks like, the curated examples below may help. You can also explore further usage examples of nltk.FreqDist, the class this method belongs to.


The sections below present 15 code examples of FreqDist.items, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
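
Note: most of the snippets below target Python 2 with NLTK 2.x, where FreqDist.items() returned a list of (sample, count) pairs sorted by decreasing frequency, making expressions such as fd.items()[:n] a common idiom. In NLTK 3, FreqDist subclasses collections.Counter, items() returns an unordered dict view, and the same intent is expressed with most_common(). A minimal sketch of the modern equivalent (toy token list for illustration):

from nltk import FreqDist

tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat', 'the']
fd = FreqDist(tokens)

# NLTK 2.x idiom (fails on NLTK 3, where items() is a dict view):
#     top = fd.items()[:2]
# NLTK 3 equivalent: most_common() returns (sample, count) pairs
# sorted by decreasing frequency.
top = fd.most_common(2)
print(top)  # e.g. [('the', 3), ('cat', 1)]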

Example 1: make_cutOff

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def make_cutOff(flatList, bottomCutOff, topCutOff):
    '''
    INPUT:
    flatList is a 1-d list of all tokens in a set of tweets; bottomCutOff
    and topCutOff are integers
    OUTPUT:
    newVocab = a 1-d list of all tokens we want to keep
    thrownOut = a 1-d list of all tokens to throw out
    '''
    fd = FreqDist(flatList)
    newVocab = []
    thrownOut = []
    
    for item in fd.items()[:topCutOff]:
        # append most common words
        thrownOut.append(item)

    for item in fd.items()[topCutOff:]:
        if item[1] > bottomCutOff:
            # append good words
            newVocab.append(item[0])
        else:
            # append uncommon words
            thrownOut.append(item)

    print 'Cutoffs made...'
    return newVocab, thrownOut
Author: JRMeyer, Project: twitter, Lines: 29, Source: twitter_lda.py
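
A minimal usage sketch (hypothetical token list; assumes NLTK 2.x, where fd.items() is sorted by decreasing count):

tokens = ['rt', 'rt', 'rt', 'obama', 'obama', 'vote', 'xyzzy']
vocab, thrown_out = make_cutOff(tokens, bottomCutOff=1, topCutOff=1)
# 'rt' (too common), 'vote' and 'xyzzy' (too rare) land in thrown_out
print(vocab)  # ['obama']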

Example 2: get_most_frequent

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
    def get_most_frequent(self, rawText, number=None, cleaning_level=3):
        cleaned_tokens_levels = TokensCleaner.clean(self, rawText, cleaning_level)
        freq_distributions_levels = dict()
        for level, cleaned_tokens in cleaned_tokens_levels.items():
            all_words = FreqDist(cleaned_tokens)
            if number is None:
                freq_distributions_levels[level] = all_words.items()
            else:
                freq_distributions_levels[level] = all_words.items()[:number]
        return freq_distributions_levels
Author: ssteku, Project: NLPRelatedPhenomenon, Lines: 12, Source: MostFrequentWordsExtractor.py

Example 3: main

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def main():
    fileName = '../data/deals.txt'
    words,lines = get_filter(fileName)
    word_dist = FreqDist(words)  # get distribution, in descending order
    print("Most Popular Term: ", word_dist.items()[0])   # question 1
    print("Least Popular Term: ", word_dist.items()[-1]) # question 2
    # solution 1 for question 3
    # print("Types of Guitars Found: ", len(count_guitar_types.count(lines)))
    # solution 2: better and more reasonable, but could still be improved
    print("Type of Guitars mentioned", count_guitar_types2.count(lines))
Author: TigerDeng, Project: exercises, Lines: 12, Source: task1.py

Example 4: __getTimelineFeatures

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
    def __getTimelineFeatures(self, timeline):
        logger.info(u"Get timeline features")
        tweets = []
        self.__changePhase(PHASE["GET_TIMELINE_URLS"])
        for t in timeline:
            try:
                tweet = TweetText(t, self.__urlBuilder, self.__userBuilder)
            except:
                logger.exception(u"Error: \"" + unicode(t) + u"\"")
                raise ValueError(t)
            logger.debug(u"Tweet:" + unicode(tweet))
            tweets.append(tweet)

        urls = []
        ti = 0
        for tweet in tweets:
            for url in tweet.urls():
                self.__breakIfStopped()
                self.__urlResolver.addUrlToQueue(url)
                urls.append(url)
            logger.info(u"Tweet:" + unicode(tweet))
            ti += 1
            self.__proc = 100 * float(ti) / float(len(tweets))

        # Categories
        self.__changePhase(PHASE["GET_TIMELINE_FEATURES"])
        url2labels = {}
        ui = 0
        for url in urls:
            self.__breakIfStopped()
            if not url.isError():
                logger.debug(u"Classify " + unicode(url.getUrl()))
                url2labels[url.getExpandedUrl()] = self._classifier().classify(url.getText())
            ui += 1
            self.__proc = 100 * float(ui) / float(len(urls))

        labelsFreq = FreqDist()
        for labels in url2labels.values():
            for label in labels:
                labelsFreq.inc(label)
        self.__catFreq = labelsFreq.items()
        logger.info(u"Categories: "  + unicode(labelsFreq.items()))
        labelsFreqValues = [(item[0], item[1]) for item in labelsFreq.items() if item[0] not in ['short', 'medium', 'long']]
        # normalization
        labelsFreqValues = {label: float(freq) / float(max([f for l,f in labelsFreqValues])) for label, freq in labelsFreqValues}
        logger.info(u"Category factors: "  + unicode(labelsFreqValues))

        # Languages
        langFreq = FreqDist()
        for u in urls:
            langFreq.inc(u.lang())
        self.__langFreq = langFreq.items()
        logger.info(u"Languages: " + unicode(langFreq.items()))

        return labelsFreqValues
Author: soldierkam, Project: pynews, Lines: 57, Source: user_tools.py

Example 5: __extract_bigram_words

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
    def __extract_bigram_words(self, bigrams, values):
        bigrams_number_per_value = self.__configuration_map["most_frequent_bigrams_number_per_value"]
        most_frequent_bigrams = {}
        for value in values:
            fdist = FreqDist(bigrams[value])
            most_frequent_bigrams[value] = fdist.items()[:bigrams_number_per_value]
        return most_frequent_bigrams
Author: ssteku, Project: NLPRelatedPhenomenon, Lines: 9, Source: MostFrequentWordsExtractor.py

Example 6: findBestWords

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def findBestWords(wordsInCategories, scoreFunction=BigramAssocMeasures.chi_sq, max_words=1000):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for category, words in wordsInCategories:
        word_fd.update(words)
        label_word_fd[category].update(words)

    word_counts = {}
    for condition in label_word_fd.conditions():
        word_counts[condition] = label_word_fd[condition].N()

    total_word_count = 0
    for condition, count in word_counts.items():
        total_word_count += count

    word_scores = {}

    for word, freq in word_fd.items():
        score = 0
        for condition, count in word_counts.items():
            score += scoreFunction(label_word_fd[condition][word], (freq, word_counts[condition]), total_word_count)
        word_scores[word] = score

    best = sorted(word_scores.items(), key=lambda t: t[1], reverse=True)[:max_words]
    return set([w for w, s in best])
Author: ekedziora, Project: sentiment, Lines: 28, Source: utils.py
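
A usage sketch with toy labeled data (hypothetical category/word lists; BigramAssocMeasures.chi_sq is the default scorer, so no extra arguments are needed):

labeled_words = [('pos', ['good', 'great', 'fine']),
                 ('neg', ['bad', 'awful', 'fine'])]
best = findBestWords(labeled_words, max_words=4)
# 'fine' occurs in both categories, so it scores lowest and is dropped
print(best)  # e.g. {'good', 'great', 'bad', 'awful'}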

Example 7: freq_dist

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def freq_dist(input, filtering_functions=[], plot=False, limit=None, return_counts=False):
    """Takes a list of words (hashtags, keywords, anything) and plots a frequency distribution

       filtering_functions is an ORDERED list of functions to call on the raw input list before the freq dist is built.
       That is, each item in input is run through f1,f2,...,fn where filtering_functions = [f1,...,fn]

       limit truncates the freq_dist to the limit most common items

       return_counts determines whether a list of tuples (word, count) is returned,
          or whether a list of just the limit most used words is returned
    """
    for f in filtering_functions + [str.lower, str.strip]:
        input = map(f, input)

    nltk_fdist = FreqDist(list(input))

    if plot:  # use nltk's built-in plotting function before destroying the data structure
        nltk_fdist.plot(limit) if limit else nltk_fdist.plot()

    fdist = sorted(nltk_fdist.items(), key=lambda x: (-x[1], x[0]))  # sort equally counted items alphabetically
    fdist = fdist[0:limit] if limit else fdist                       # apply limit
    fdist = [i[0] for i in fdist] if not return_counts else fdist    # remove counts if desired

    return fdist
Author: SumAll, Project: python3-analysis-tools, Lines: 28, Source: statistical_functions.py
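
A usage sketch (hypothetical hashtag list; an extra filter strips the leading '#'):

hashtags = ['#NLP', '#nlp', '#python', '#Python', '#nlp']
strip_hash = lambda s: s.lstrip('#')
top = freq_dist(hashtags, filtering_functions=[strip_hash],
                limit=2, return_counts=True)
print(top)  # [('nlp', 3), ('python', 2)]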

Example 8: count_pos

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def count_pos(input, language):
    if language == 'english-nltk':
        words = word_tokenize(input)
        pos = pos_tag(words)

    elif language == 'english':
        s = pattern.en.parsetree(input, relations=True, lemmata=True)
        words = []
        pos = []
        for sentence in s:
            for w in sentence.words:
                words.append(w.string)
                pos.append((w.string, clean_text.clean_pos(w.type)))

    elif language == 'spanish':
        s = pattern.es.parsetree(input, relations=True, lemmata=True)
        words = []
        pos = []
        for sentence in s:
            for w in sentence.words:
                words.append(w.string)
                pos.append((w.string, clean_text.clean_pos(w.type)))

    elif language == 'dutch':
        words = word_tokenize(input, 'dutch')
        tagger = nltk.data.load('taggers/alpino_aubt.pickle')
        pos = tagger.tag(words)

    tags = FreqDist(tag for (word, tag) in pos)
    relative_frequency = []
    for item in tags.items():
        relative_frequency.append((item[0], float(item[1])/tags.N()))
    return relative_frequency
Author: constanr, Project: gender, Lines: 35, Source: pos.py
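
A usage sketch exercising only the 'english-nltk' branch, which avoids the pattern dependency (toy sentence; the exact tags depend on the tagger):

rel_freq = count_pos('The quick brown fox jumps over the lazy dog', 'english-nltk')
print(rel_freq)  # e.g. [('DT', 0.22), ('JJ', 0.33), ('NN', 0.22), ('VBZ', 0.11), ('IN', 0.11)]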

Example 9: preprocess

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def preprocess(content):
	stopset = set(stopwords.words('english'))
	# replace punctuation and HTML tags with spaces
	tokens = word_tokenize(re.sub(r'<p>|</p>|[^A-Za-z ]', ' ', content.lower())) 
	pos_list = pos_tag(tokens)
	s_tokens = list()

	#noun and verb only
	for pos in pos_list:
		#print pos[1]
		#if pos[1] in ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
		if pos[1] in ['NN', 'NNS']:
			s_tokens.append(pos[0])

	wordfreq = FreqDist(s_tokens)
	stemfreq = dict()
	st = LancasterStemmer()
	for word, freq in wordfreq.items():
		#stopwords
		if word in stopset:
			del wordfreq[word]
			continue
		#tiny words
		if len(word) <= 2:
			del wordfreq[word]
			continue
		#stemmer
		stem = st.stem(word)
		try:
			stemfreq[stem]+=freq
		except:
			stemfreq[stem]=freq
	return stemfreq
Author: TorchmanX, Project: TARS, Lines: 35, Source: nc.py
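
A usage sketch (toy HTML snippet; the exact stems depend on the POS tagger and the Lancaster stemmer):

stems = preprocess('<p>The cats sat on the mats. Dogs chase cats.</p>')
print(stems)  # e.g. {'cat': 2, 'mat': 1, 'dog': 1}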

Example 10: summarize

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
    def summarize(self, text):
        # get words from text
        words = word_tokenize(text)

        # filter out stop words and lower case
        words = [word.lower() for word in words if word not in self.stopwords]

        # filter non-alphameric chars from words
        words = [filter(unicode.isalnum, word) for word in words]
        words = filter(lambda w: len(w) > 0, words)  # Remove empty words

        # stemming
        words = [self.pst.stem(word) for word in words]
        word_frequencies = FreqDist(words)
        most_frequent = [word[0] for word in word_frequencies.items()[:self.top_words_count]]

        # get sentences
        sentences = sent_tokenize(text)

        sentence_score = defaultdict(int)

        for i in range(len(sentences)):
            sentence = sentences[i]
            sentence_words = word_tokenize(sentence)
            sentence_words = [self.pst.stem(word).lower() for word in sentence_words if word not in self.stopwords]

            for sentence_word in sentence_words:
                if sentence_word in most_frequent:
                    sentence_score[i] += 1

        sorted_wordcounts = sorted(sentence_score.iteritems(), key=operator.itemgetter(1), reverse=True)[:self.number_of_sentences]
        summary = "\n".join([sentences[num] for num, count in sorted_wordcounts])

        return summary
Author: joshnewnham, Project: document-mining, Lines: 36, Source: summariser.py

Example 11: termfreq

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def termfreq(storytext, filename):
    '''
    This function takes a speech/text/article, preprocesses it into tokens, 
    removes stopwords, and outputs a csv of term counts and frequencies 
    relative to the size of the speech/text/article
    '''
    
    # Split into tokens, remove stopwords
    tokens = make.preprocess(storytext)
    stops = make.filter_stopwords(tokens)
    numstops = len(stops)    
    
    # Create a FreqDist and turn it into a list of tuples
    freq = FreqDist(stops)
    data = freq.items()[:numstops]
    
    # Build a pandas DataFrame of that list
    df = pd.DataFrame(data)
    df.columns = ['word', 'count']
    
    # Add a 'relative frequency' column to the DataFrame
    a = []
    for i in df['count']:
        a.append(float(i)/numstops)
    df['pct'] = a
    
    # Write the file to csv
    df.to_csv('%s.csv' % filename, sep=',')
    print df
    print 'Check your files for the csv!'    
Author: SpacaB, Project: Concordance-Collages, Lines: 32, Source: termfreq.py

Example 12: top_words_from_corpus

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
    def top_words_from_corpus(self, num_words, test_name):
        corpus_tokens = []
        for i in self.corpus_vars["corpus_member_ids"]:
            title = 'document_' + str(i)
            doc_tokens = Library.document_instances[title].metadata["tokenized_doc"]
            corpus_tokens += doc_tokens
        top_words = []
        fdist_corpus = FreqDist(corpus_tokens)
        fdist_list = fdist_corpus.items()
        if test_name == "Function Word PCA":
            function_pos = ['IN', 'TO', 'CC', 'DT', 'PDT', 'WDT']
            for i in fdist_list:
                top_words.append(i[0])
                if len(top_words) == num_words:
                    tagged_top = nltk.pos_tag(top_words)
                    for j, k in tagged_top:
                        if k not in function_pos:
                            top_words.remove(j)
                    if len(top_words) == num_words:
                        break
        elif test_name == "Burrows's Delta":
            for i in fdist_list:
                top_words.append(i[0])
                if len(top_words) == num_words:
                    break
        return top_words
Author: mjlavin80, Project: py_style, Lines: 28, Source: py_styleModel.py

Example 13: palavrasChaves

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
    def palavrasChaves(self):
        # NLTK function that returns the stopwords for English
        stopE = stopwords.words('english')

        # NLTK function that returns the stopwords for Portuguese
        stop = stopwords.words('portuguese')

        stopS = stopwords.words('spanish')

        palavrasChaves = []
        textoArtigo = []

        # strip punctuation from the text and split it into words
        for i in self.titulo.lower().replace(',','').replace('.','').replace('-','').replace('(','').replace(')','').split():
            # drop Portuguese stopwords from the text of the current article
            if i not in stop:
                # drop English stopwords from the text of the current article
                if i not in stopE:
                    # drop Spanish stopwords as well
                    if i not in stopS:
                        # ignore words shorter than 3 characters (handles words such as the verb 'é')
                        if len(i) > 2:
                            textoArtigo.append(i)

        # frequency of word repetitions in the body of the article
        freq = FreqDist(textoArtigo)

        # take the four most frequent words
        items = freq.items()[:4]

        # put the most frequent words of the text into palavrasChaves
        for i in range(0, len(items)):
            palavrasChaves.append(items[i][0])

        return palavrasChaves
Author: dienerpiske, Project: QSabe, Lines: 36, Source: models.py

Example 14: posAnalysis

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def posAnalysis(collection):

	reviews = collection.find(timeout=False)

	__reportProgress.counter = 0

	skip = 1

	for rev in reviews:
		if skip%200 == 0:
			print 'skip'+str(skip)
		__reportProgress()
		if rev.has_key('tags'):
			skip += 1
			if rev['tags'].has_key('NN'):				
				continue

		sents = sent_tokenize(rev['text'])
		tokens = [word for sent in sents for word in word_tokenize(sent)]
		pos = tagger.tag([tok for tok in tokens if tok not in ',.-$\" '])
		tag_fd = FreqDist(tag for (word, tag) in pos)
		tags = dict()
		for (key,value) in tag_fd.items():
			k = key.replace('$','S')
			out = key.translate(string.maketrans("",""), string.punctuation)
			if len(out)>0:
				tags[k] = value
		collection.update({'_id':rev['_id']},{"$set": {"tags": tags}})		
Author: ecsark, Project: Yelp-Recruiting, Lines: 30, Source: trueRating.py

Example 15: get_probs

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def get_probs(filename):
    """read the given text and calculate the probabilities for all symbols."""
    with open(filename) as file_in:
        text = file_in.read()
    probs = FreqDist(text)
    count_sum = sum(probs.values())
    for k,v in probs.items():
        probs[k] = v * 1.0 / count_sum
    return probs
Author: jhb86253817, Project: ITM-exercise, Lines: 11, Source: shannon_fano2.py
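
A usage sketch (hypothetical file name; the resulting per-symbol probabilities sum to 1):

probs = get_probs('corpus.txt')
print(sum(probs.values()))  # 1.0, up to floating-point rounding
print(probs.max())          # the most probable symbol, e.g. ' '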


Note: the nltk.FreqDist.items examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open source code and documentation platforms. The snippets were selected from open source projects contributed by many developers; copyright of the source code remains with the original authors, and distribution and use are subject to each project's license. Do not reproduce without permission.