

Python FreqDist.update Method Code Examples

This article collects typical usage examples of the Python method nltk.probability.FreqDist.update. If you are wondering what FreqDist.update does, how to call it, or where it is useful, the curated examples below should help. You can also explore further usage examples of nltk.probability.FreqDist, the class this method belongs to.


Fifteen code examples of FreqDist.update are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
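
Before the collected examples, here is a minimal self-contained sketch of the method's behavior (standard NLTK 3 API): FreqDist subclasses collections.Counter, so update adds counts from any iterable of samples, or merges the counts of another distribution.

from nltk.probability import FreqDist

fd = FreqDist("the cat sat on the mat".split())
fd.update(["the", "dog"])            # add counts from an iterable of tokens
fd.update(FreqDist(["the", "cat"]))  # or merge counts from another FreqDist

print(fd["the"])  # 4
print(fd.N())     # 10 samples counted in total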

Example 1: buildCategoryDictionary

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def buildCategoryDictionary(category):
    tweetList = twitter_fetch.get_tweets_text(classn=category)
    freq = FreqDist()
    for tweet in tweetList:
        freq.update(tokenizeTweet(tweet))
    saveDictionaryToFile(freq, category + categoryDictFilePath)
    return freq
Developer: elms1990, Project: twitter-ml, Lines: 9, Source: textMining.py

Example 2: high_words

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    pos = 0
    neg = 0
    for review in posids:
        pos += 1
        if pos != cutoff:
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['pos'].update(token_helpers.tokenize_simple(word))
 
    for review in negids:
        neg += 1
        if neg != cutoff:
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['neg'].update(token_helpers.tokenize_simple(word))
    
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    return bestwords
    
    """
Developer: efrenaguilar95, Project: Yelp_Analyzer, Lines: 36, Source: classifiers.py

Example 3: text_to_vector

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist() # Distribution over how many documents each word appears in.
    tf_dists = [] # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist(doc)           # term frequencies for this document (FreqDist.inc was removed in NLTK 3)
        doc_freqs.update(fd.keys())  # count each word once per document (document frequency)
        tf_dists.append(fd)


    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)


    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs)/doc_freqs[word]) for word in all_tokens]
        else:
            raise ValueError("No such frequency metric: %s" % metric)
        matrix[:,i] = v

    return matrix
Developer: himanshusapra9, Project: TextNet, Lines: 36, Source: freq_representation.py
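
As a quick sanity check on the TF-IDF weighting used above (relative term frequency scaled by log(N/df)), here is a tiny worked example; the numbers are made up for illustration:

import math

# Hypothetical values: a word with relative frequency 0.02 in one document,
# appearing in 5 of 100 documents, gets weight 0.02 * log(100/5) ≈ 0.0599.
tf, num_docs, doc_freq = 0.02, 100, 5
print(tf * math.log(num_docs / doc_freq))  # ≈ 0.0599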

Example 4: evaluate_html

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def evaluate_html(content, html_conf):
    fdist = FreqDist()
    if not html_conf['usehtml']:
        logging.info('Discarding HTML tags')
        return fdist
 
    logging.info("\tEvaluating HTML")
     
    # try with TITLE tag
    titles = re.findall("<title>[A-Za-z0-9 ]+</title>", content)
    for title in titles:
        root = etree.fromstring(title)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [ x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)

        # weight TITLE terms by adding their stems html_conf['title'] times
        for i in range(html_conf['title']):
            fdist.update(stems)

    # try with H1 tag
    headers = re.findall("<h1>[A-Za-z0-9 ]+</h1>", content)
    for header in headers:
        root = etree.fromstring(header)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [ x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)

        # weight H1 terms by adding their stems html_conf['h1'] times
        for i in range(html_conf['h1']):
            fdist.update(stems)

    return fdist
Developer: pejotr, Project: doc-clustering, Lines: 33, Source: preprocessing.py

Example 5: text_to_dict

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def text_to_dict(docs, metric):
    """ Create dictionaries of term frequencies based on documents

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist() # Distribution over how many documents each word appears in.
    tf_dists = [] # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist(doc)           # term frequencies for this document (FreqDist.inc was removed in NLTK 3)
        doc_freqs.update(fd.keys())  # count each word once per document (document frequency)
        tf_dists.append(fd)


    num_docs = len(docs)
    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i % 100 == 0: print('    dict %d/%d' % (i, len(tf_dists)))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd:
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd:
                d[word] = fd.freq(word) * math.log(float(num_docs)/doc_freqs[word])
        else:
            raise ValueError("No such frequency metric: %s" % metric)
        dicts.append(d)
    return dicts
Developer: himanshusapra9, Project: TextNet, Lines: 35, Source: freq_representation.py

Example 6: reduce_text

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def reduce_text(t1, t2):
    words = FreqDist(t1[0])
    words.update(t2[0])

    bigrams = FreqDist()  # fall back to an empty distribution if the reduce fails
    try:
        bigrams = FreqDist(t1[1])
        bigrams.update(t2[1])
    except Exception:
        logger.error('problem in reducing..')
        logger.error('t1: %s' % str(t1))
        logger.error('t2: %s' % str(t2))
    
    return words, bigrams
Developer: ayat-rashad, Project: eg_twitter, Lines: 15, Source: process_data_spark.py

Example 7: analyze

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def analyze(data, out_dir):
    summary = {}
    freq = FreqDist()
    sentence_length = defaultdict(list)
    year_freq_dist = defaultdict(FreqDist)
    year_dist = defaultdict(int)
    year_month_dist = defaultdict(int)
    year_quarter_dist = defaultdict(int)

    has_date = no_date = sentences = words = 0

    for year, date_str, title, text in data:
        date = parsedate(date_str)
        logger.debug('%s -> %s' % (date_str, str(date)))
        freq.update(ngram_phrases(text,3))
        if date:
            # strftime can't handle years before 1900, so use isoformat instead
            year_str = date.isoformat()[:4]
            year_mo_str = date.isoformat()[:7]
            has_date += 1
        else:
            no_date += 1
            year_str = year_mo_str = ''

        if year_str:
            year_range = get_year_range(year_str)
            sentence_length[ year_range ].extend( sentence_lengths(text) )
            year_freq_dist[ year_range ].update( ngram_phrases(text,3) )
            year_dist[year] += 1

        if year_mo_str:
            year_month_dist[year_mo_str] += 1
            year_quarter_dist[ year_quarter(year_mo_str) ] += 1

        sentences += count_sentences(text)
        words += count_words(text)

    logger.debug('Documents with a valid date: %d Documents without a valid date: %d' % (has_date, no_date))
    logger.debug('Total # Sentences: %d' % sentences)
    logger.debug('Total # Words: %d' % words)

    generate_dict_csv(['year', 'cnt'], year_dist, os.path.join(out_dir, 'year-data.csv'))
    generate_dict_csv(['yearmo', 'cnt'], year_month_dist, os.path.join(out_dir, 'year-mo-data.csv'))
    generate_dict_csv(['yearq', 'cnt'], year_quarter_dist, os.path.join(out_dir, 'year-quarter-data.csv'))
    generate_stream_js(year_freq_dist, os.path.join(out_dir, 'stream-data.json'))
    generate_cloud_csv(year_freq_dist, os.path.join(out_dir, 'year-phrase-data.csv'))
    generate_sentence_length_csv(sentence_length, os.path.join(out_dir, 'data-sentence-lengths.csv'))
Developer: dangoldin, Project: lincoln-text-analysis, Lines: 49, Source: analyze.py

Example 8: updateCategoryDictionary

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def updateCategoryDictionary(category):
    tweetList = twitter_fetch.get_new_tweets(classn=category)
    freq = FreqDist()

    for tweet in tweetList:
        freq.update(tokenizeTweet(tweet))

    try:
        oldDict = readDictionaryFromFile(category + categoryDictFilePath)
    except Exception:
        # no saved dictionary yet, so build one from scratch
        return buildCategoryDictionary(category)

    oldDict.update(freq)
    saveDictionaryToFile(oldDict, category + categoryDictFilePath)
    return oldDict
Developer: elms1990, Project: twitter-ml, Lines: 19, Source: textMining.py

Example 9: cnc

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def cnc(phrase_lists, c_value_threshold=0, include_unigrams=False, weight_by_length=True):
    """given a list of phrases, run the cnc algorithm and return a dictionary of word, c-value (ranking) pairs"""
    frequency_dists_by_length = {}
    for phrase in phrase_lists:
        l = len(phrase)
        if l not in frequency_dists_by_length:
            frequency_dists_by_length[l] = FreqDist()
        frequency_dists_by_length[l][tuple(phrase)] += 1  # FreqDist.inc was removed in NLTK 3

    # word -> C-value(word)
    phrase_scores = {}

    # word -> num occurrences(word)
    phrase_frequencies = FreqDist()

    # word -> (t(word), c(word))
    sub_phrase_scores = {}

    # traverse from longest phrases to shortest
    for length, frequency_dist in sorted(frequency_dists_by_length.items(), key=lambda pair: pair[0], reverse=True):
        # update global frequency counts with all counts of this length
        phrase_frequencies.update(frequency_dist)
        # within each phrase length, traverse from most common phrases to least
        for phrase, frequency in frequency_dist.most_common():  # iteritems() is Python 2; most_common() keeps descending order
            if phrase in sub_phrase_scores:
                t, c = sub_phrase_scores[phrase]
                subtractive = 1.0 / c * t
            else:
                subtractive = 0
            if weight_by_length:
                if include_unigrams:
                    weight = log(length + 1, 2)
                else:
                    weight = log(length, 2)
            else:
                weight = 1
            c_value = weight * (frequency - subtractive)
            if c_value >= c_value_threshold:
                phrase_scores[phrase] = c_value
                for sub_phrase in utils.sub_lists(phrase):
                    if sub_phrase in sub_phrase_scores:
                        t, c = sub_phrase_scores[sub_phrase]
                    else:
                        t, c = 0, 0
                    sub_phrase_scores[sub_phrase] = t + frequency, c + 1
    return phrase_scores, phrase_frequencies
Developer: zzx88991, Project: mocs, Lines: 48, Source: ranking.py
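
A hypothetical invocation of cnc, assuming the module's helpers (utils.sub_lists) are importable; the phrase lists and ranking here are made up for illustration:

phrases = [["machine", "learning"], ["machine", "learning"],
           ["learning", "model"], ["machine", "learning", "model"]]
scores, freqs = cnc(phrases, c_value_threshold=0, include_unigrams=False)
# rank candidate terms by C-value, highest first
for phrase, c_value in sorted(scores.items(), key=lambda p: p[1], reverse=True):
    print(phrase, round(c_value, 3))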

Example 10: reduce_tweets

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def reduce_tweets(t1, t2):
    tags = FreqDist(t1[0])
    tags.update(t2[0])
    
    words = FreqDist(t1[1])
    words.update(t2[1])
    
    places = FreqDist(t1[2])
    places.update(t2[2])
    
    bigrams = FreqDist(t1[3])
    bigrams.update(t2[3])
    
    return tags, words, places, bigrams
Developer: ayat-rashad, Project: eg_twitter, Lines: 16, Source: process_data_spark.py

Example 11: word_count

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def word_count(drug=None,limit=None,pos_filter=False,lemma=True):
	"""Scans comment texts (from drug_mentions.texts) for selected drug, 
	calculates most common words.

	KWARGS:
		drug: string or None.
			Drug selector.  Allows three cases:
			* None: scrape all comments in database, regardless of drug.
			* 'antidepressant': select comments speaking generically about
				drug, not referencing specific drug.
			* [drug name]: comments referencing specific drug.
			Default None.  Passed to drug_mentions.texts.
		limit: int or None.
			Optional limit on SQL queries retrieved by drug_mentions.texts. 
			Defaults to None (returns all hits).
		pos_filter: boolean.
			Passed to tokenize(), set True to use part-of-speech filtering.
		lemma: boolean.
			Passed to tokenize(), set True to use lemmatization.

	RETURNS:
		freq: nltk.probability.FreqDist object.
			Frequency distribution of words from comments.

	RAISES:
		ValueError:
			for invalid drug name.
	"""
	try:
		texts = dm.texts(drug=drug,limit=limit)
	except ValueError:
		raise ValueError('Invalid drug name.')

	freq = FreqDist()
	for text in texts:
		freq.update(tokenize(text,drug,pos_filter=pos_filter,lemma=lemma))

	return freq
Developer: jrwalk, Project: empath, Lines: 40, Source: word_count.py
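
A hypothetical call, assuming the drug_mentions database behind dm.texts is populated (the limit value here is made up; 'antidepressant' is a selector documented in the docstring):

freq = word_count(drug='antidepressant', limit=500, pos_filter=True, lemma=True)
# inspect the 20 most frequent words across the matching comments
for word, count in freq.most_common(20):
    print(word, count)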

Example 12: buildGoogleUnigram

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def buildGoogleUnigram( ):
    DirPrefix = "/home/jcavalie/googleNgrams_unigrams/"

    unigramFiles = os.listdir( DirPrefix )

    unigramFiles = list( map( lambda _fileName: DirPrefix + _fileName, unigramFiles ) )

    masterUnigram = FreqDist( )

    with multiprocessing.Pool( 8, initializer = initProcess ) as ProcessPool:
        resAsync = ProcessPool.map_async( _buildUnigram, unigramFiles )
        results = resAsync.get( )

    ProcessPool.join( )

    print( "all jobs finished, building master unigram" )
    for freqdist in results:
        masterUnigram.update( freqdist )

    with open( "PickledData/GoogleUnigram.pickle", 'wb' ) as pklFile:
        pickle.dump( masterUnigram, pklFile, pickle.HIGHEST_PROTOCOL )

    return
Developer: jcavalieri8619, Project: OCRerror_correct, Lines: 25, Source: buildWordLangModel.py

Example 13: AddAlphaBigramModel

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
class AddAlphaBigramModel():
    def __init__(self, alpha=0.1):
        self.vocabulary=set()
        self.V = 0
        self.bigrams=ConditionalFreqDist([])
        self.unigrams=FreqDist([])
        self.alpha = alpha  # honor the constructor argument instead of hard-coding 0.1
    def train(self):
        self.vocabulary=set()
        
        this_bigrams=[]
        self.unigrams = FreqDist([])
        
        for fileid in gutenberg.fileids():
            for sentence in gutenberg.sents(fileid):
                words=["<s>",] + [x.lower() for x in sentence if wordRE.search(x)] + ["</s>",]
                this_bigrams += bigrams(words)
                self.vocabulary.update(words)
                self.unigrams.update(words)
        self.bigrams=ConditionalFreqDist(this_bigrams)
        self.V = len(self.vocabulary)
        
    def bigram_prob(self, w1, w2):
        numerator = self.bigrams[w1][w2] + self.alpha
        denominator = self.bigrams[w1].N() + (self.alpha * self.V)
        retval= math.log(numerator / denominator)

        return retval

    def unigram_prob(self, w):
        numerator = self.unigrams[w] + self.alpha
        denominator = self.unigrams.N() + (self.alpha * self.V)
        return math.log(numerator/denominator)
    
    def __contains__(self, w):
        return w in self.vocabulary
Developer: slee17, Project: NLP, Lines: 38, Source: LanguageModel.py
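
A hypothetical usage sketch; note that train() walks the entire NLTK Gutenberg corpus (downloaded via nltk.download('gutenberg')), so it takes a while, and the probe words below are arbitrary:

model = AddAlphaBigramModel(alpha=0.1)
model.train()

# add-alpha smoothing: log P(w2|w1) = log((count(w1,w2) + alpha) / (count(w1) + alpha*|V|))
print(model.bigram_prob("<s>", "the"))
print(model.unigram_prob("whale"))
print("whale" in model)  # vocabulary membership via __contains__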

Example 14: process_documents

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def process_documents(path, html_conf):
    logging.info("Using documents from \"" + path + "\" directory ")
    
    if path[-1] != "/" :
        path + "/"

    documents = {}
    allterms  = {}
    listing   = os.listdir(path)
    allfreq   = FreqDist()

    # retrieving document content - discarding structure
    logging.info("Processing files...")
    for infile in listing:
        logging.info("\tReading document " + infile)
        raw_doc     = open(path + infile, 'r').read()
        nonhtml_doc = nltk.clean_html(raw_doc)  # NLTK 2 API; removed in NLTK 3 (use BeautifulSoup there)
        word_list   = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', nonhtml_doc))
        terms_list  = [ x.lower() for x in word_list if x.lower() not in stopwords.words('english')]

        stemmes = steming(terms_list)

        for stem in stemmes :
            allterms[stem] = 0

        fdist = FreqDist(word.lower() for word in stemmes)
        allfreq.update(word.lower() for word in stemmes)

        htmldist = evaluate_html(raw_doc.lower(), html_conf)
        fdist.update(htmldist)
        allfreq.update(htmldist)
    
        documents[infile] = { 'docname': infile,  'terms': stemmes, 'tf': fdist, 'tfidf': None  }

    for key, doc in documents.items():  # iteritems() is Python 2 only
        doctfidf = compute_tfidf(doc ,documents)
        documents[key]['tfidf'] = {**allterms, **doctfidf}  # dict(a.items() + b.items()) is Python 2 only

    return documents, allfreq
Developer: pejotr, Project: doc-clustering, Lines: 41, Source: preprocessing.py

Example 15: EditDistanceFinder

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
class EditDistanceFinder():  
    def __init__(self):
        self.char_probs = ConditionalProbDist([],MLEProbDist)
        self.bichar_freqs = ConditionalFreqDist([])
        self.transp_freqs = FreqDist()
        self.DOWN,self.LEFT,self.DIAG,self.DOUBLE_DIAG = range(4)
        self.INSERT, self.DELETE, self.SUBST, self.TRANSP = range(4)
        
    def train(self, fname):
        misspellings=[]
        for line in open(fname):
            line=line.strip()
            if not(line): continue
            w1, w2 = line.split(",")
            misspellings.append((w1.strip(),w2.strip()))
       
        last_alignments = None
        done = False
        while not done:
            print("Iteration")
            alignments, bigrams = self.train_alignments(misspellings)
            self.train_costs(alignments, bigrams)
            done = (alignments == last_alignments)
            last_alignments = alignments
            
    def train_alignments(self, misspellings):
        alignments = []
        self.bichar_freqs = FreqDist()

        for error, corrected in misspellings:
            distance, this_alignments = self.align(corrected, error)
            alignments += this_alignments
            bigrams = [corrected[i:i+2] for i in range(len(corrected)-1)]
            self.bichar_freqs.update(bigrams)
            
        return alignments,bigrams
    
    def train_costs(self, alignments,bigrams):
        add_one_aligns = [(a,b) for a in string.ascii_lowercase for b in string.ascii_lowercase]
        single_aligns = [(a,b) for a,b in alignments if len(a) < 2]
        
        char_aligns = ConditionalFreqDist(single_aligns + add_one_aligns)
        self.char_probs = ConditionalProbDist(char_aligns, MLEProbDist)
        
        double_aligns = [a for a,b in alignments if len(a) >= 2]
        self.transp_freqs = FreqDist(double_aligns)

    def align(self, w1, w2, verbose=False):
        M = len(w1) +1
        N = len(w2) +1
        table = numpy.zeros((M,N))
        backtrace = numpy.zeros((M,N))
    
        for i in range(1,M):
            w1_char = w1[i-1]
            table[i,0] = table[i-1,0] + self.del_cost(w1_char)
            backtrace[i,0] = self.DOWN
        for j in range(1,N):
            w2_char = w2[j-1]
            backtrace[0,j] = self.LEFT
            table[0,j] = table[0,j-1] + self.ins_cost(w2_char)   
    
        for i in range(1,M):
            w1_char = w1[i-1]
            for j in range(1,N):
                w2_char = w2[j-1]

                this_del = table[i-1,j] + self.del_cost(w1_char)
                this_ins = table[i,j-1] + self.ins_cost(w2_char)
                this_sub = table[i-1,j-1] + self.sub_cost(w1_char,w2_char)
                
                if j > 1 and i > 1 and w1[i-1] == w2[j-2] and w1[i-2]==w2[j-1] and w1[i-1] != w1[i-2]:
                    this_transp = table[i-2,j-2] + self.transp_cost(w1_char, w2_char)
                else:
                    this_transp = 999999
            
                min_cost = min(this_del, this_ins, this_sub, this_transp)
                table[i,j] = min_cost

                if this_sub == min_cost:
                    backtrace[i,j] = self.DIAG
                elif this_transp == min_cost:
                    backtrace[i,j] = self.DOUBLE_DIAG
                elif this_ins == min_cost:
                    backtrace[i,j] = self.LEFT
                else: # delete (DOWN)
                    backtrace[i,j] = self.DOWN

                
        alignments = []
        i = M - 1    
        j = N - 1
        while (j or i):
            this_backtrace = backtrace[i,j]
            if this_backtrace == self.DIAG: # sub
                alignments.append((w1[i-1],w2[j-1]))
                i -= 1
                j -= 1
            elif this_backtrace == self.DOUBLE_DIAG:
                alignments.append((w1[i-2:i],w2[j-2:j]))
#......... (remaining code omitted) .........
Developer: slee17, Project: NLP, Lines: 103, Source: EditDistance.py


Note: The nltk.probability.FreqDist.update examples in this article were collected by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective authors, and copyright of the source code remains with the original authors. Refer to each project's License before distributing or using the code; do not republish without permission.