This article collects typical usage examples of the Python method nltk.FreqDist.update. If you have been wondering what FreqDist.update does, how to call it, or what real-world code that uses it looks like, the curated examples below should help. You can also read more about the class it belongs to, nltk.FreqDist.
The following shows 15 code examples of FreqDist.update, ordered by popularity by default.
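Before the individual examples, here is a minimal sketch of what FreqDist.update does: it adds the counts of an iterable of tokens (or of another counter-like mapping) to the distribution in place. The sentences below are invented for illustration, and word_tokenize assumes the NLTK 'punkt' tokenizer data has been downloaded.

from nltk import FreqDist
from nltk.tokenize import word_tokenize

fd = FreqDist()
for sentence in ["the cat sat on the mat", "the dog sat"]:
    # update() adds these token counts to the existing distribution
    fd.update(word_tokenize(sentence))

print(fd.most_common(3))  # e.g. [('the', 3), ('sat', 2), ('cat', 1)]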
Example 1: findBestWords
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def findBestWords(wordsInCategories, scoreFunction=BigramAssocMeasures.chi_sq, max_words=1000):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for category, words in wordsInCategories:
        word_fd.update(words)
        label_word_fd[category].update(words)

    word_counts = {}
    for condition in label_word_fd.conditions():
        word_counts[condition] = label_word_fd[condition].N()

    total_word_count = 0
    for condition, count in word_counts.items():
        total_word_count += count

    word_scores = {}
    for word, freq in word_fd.items():
        score = 0
        for condition, count in word_counts.items():
            score += scoreFunction(label_word_fd[condition][word], (freq, word_counts[condition]), total_word_count)
        word_scores[word] = score

    best = sorted(word_scores.items(), key=lambda t: t[1], reverse=True)[:max_words]
    return set([w for w, s in best])
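A quick usage sketch for Example 1. The toy categories and the call below are not from the original project; they assume findBestWords and the names it references (FreqDist, ConditionalFreqDist, BigramAssocMeasures) are already imported.

# Toy labelled data, invented for illustration.
wordsInCategories = [
    ('pos', ['good', 'great', 'good', 'fine']),
    ('neg', ['bad', 'awful', 'bad', 'poor']),
]
best = findBestWords(wordsInCategories, max_words=4)
print(best)  # the 4 highest-scoring discriminative words, as a set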
Example 2: get_stats
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def get_stats(self, output_fname):
    # document-frequency counts (computed here but not written out below)
    fd = FreqDist()
    for text in self.texts:
        fd.update(set(text))

    fh = open(output_fname, 'w')
    text = Text(self.paragraph_tokens)
    fdist = FreqDist(text)
    for (w, f) in fdist.iteritems():
        print >> fh, "%s\t%i" % (w, f)
    fh.close()
Example 3: get_ngrams
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def get_ngrams(fileLines, n, pos_tag_dict):
    # Get n-gram counts for the corpus
    tokens = []
    ngram_counts = FreqDist()
    for excerpt in fileLines:
        ngram_counts_exp = get_ngram_counts_per_excerpt(excerpt, n, pos_tag_dict)
        for ngram in ngram_counts_exp:
            if ngram in ngram_counts:
                val = ngram_counts[ngram]
            else:
                val = 0
            ngram_counts[ngram] = val + ngram_counts_exp[ngram]
        # Note: update() adds the per-excerpt counts a second time, so combined
        # with the manual accumulation above each n-gram ends up double-counted.
        ngram_counts.update(ngram_counts_exp)
    return ngram_counts
Example 4: check_svc_bef_aft
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def check_svc_bef_aft(list_line, command):
    # check the freq of words before and after a bus service
    # check the freq of words before and after a word (number) that is not a bus svc
    text = ''
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0
        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')

        for k in range(0, len(split_second)):
            if command == 'before_svc':
                if int(split_second[k]) == 1:  # means bus svc
                    if command == 'before_svc':
                        if k > 0:  # bus svc doesn't appear at the first position of the sentence
                            text = text + split_first[k - 1].lower() + ' '  # take the word before
                            print i, k, split_first[k]
            if command == 'after_svc':
                if int(split_second[k]) == 1:  # means bus svc
                    if command == 'after_svc':
                        if k < len(split_second) - 1:
                            text = text + split_first[k + 1].lower() + ' '  # take the word after
            if command == 'before_notsvc':
                if RepresentsInt(split_first[k]) is True and int(split_second[k]) != 1:  # text is a number and not a bus svc
                    if k > 0:  # the number doesn't appear at the first position of the sentence
                        text = text + split_first[k - 1].lower() + ' '
            if command == 'after_notsvc':
                if RepresentsInt(split_first[k]) is True and int(split_second[k]) != 1:  # text is a number and not a bus svc
                    if k < len(split_second) - 1:  # the number doesn't appear at the last position of the sentence
                        text = text + split_first[k + 1].lower() + ' '

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print value[0], '\t', value[1]
    print text
Example 5: get_vocab
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def get_vocab(series, addtional_tokens=[], top=None):
    """
    Extract the vocabulary from an array of sentences, optionally adding extra
    tokens to the vocabulary and keeping only the top n most frequent words.
    :param series: array of sentences
    :param addtional_tokens: additional tokens to include in the vocabulary
    :param top: number of most frequent words to include in the vocabulary
    :return: a map from each word to its numeric id, and the reverse map
    """
    # Note: binding rev_vocab to the mutable default argument means repeated calls
    # without an explicit addtional_tokens share and grow the same list.
    rev_vocab = addtional_tokens
    freq_vocab = FreqDist()
    for s in tqdm(series):
        freq_vocab.update(word_tokenize(decontracted(s)))
    print("Original vocab size %s" % len(freq_vocab))

    all_words_sorted = sorted(freq_vocab, key=freq_vocab.get, reverse=True)
    top_words = all_words_sorted[:top]

    rev_vocab += top_words
    vocab = {word: index for index, word in enumerate(rev_vocab)}
    return vocab, rev_vocab
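A hypothetical call for Example 5. The sentences and special tokens are made up; decontracted is a text-cleaning helper from the original project that is not shown here and is assumed to simply expand contractions.

sentences = ["I can't stop reading", "reading is fun"]
vocab, rev_vocab = get_vocab(sentences, addtional_tokens=['<PAD>', '<UNK>'], top=50)
print(vocab['<PAD>'], vocab['reading'])  # numeric ids assigned to the tokens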
Example 6: term_freq_all
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def term_freq_all(path, name):
    file = path + '/' + name
    fdist = FreqDist()
    list_line = []
    with open(file) as f:
        for line in f:
            split_line = line.split('\t')
            words = nltk.word_tokenize(split_line[1].decode('utf-8').lower().strip())
            fdist.update(words)
            print split_line[0]

            # list_stem = []
            # for token in words:
            #     # st = LancasterStemmer()
            #     # try:
            #     #     list_stem.append(st.stem(token).decode('utf-8'))
            #     # except:
            #     #     print (split_line[0])
            #
            #     st = PorterStemmer()
            #     try:
            #         list_stem.append(st.stem(token).decode('utf-8'))
            #     except:
            #         print (split_line[0])
            # fdist.update(list_stem)
            # print (line)

    print ('==========================================')
    print ('==========================================')
    print (len(fdist))

    stop = stopwords.words('english')
    for value in fdist.most_common(15000):
        # if (value[0] not in stop and (len(value[0]) >= 4)):
        if (value[0] not in stop):
            print (str(value[0].encode('utf-8')) + '\t' + str(value[1]))
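A hypothetical call for Example 6; the directory and file name are invented. The function expects each line of the input file to contain at least two tab-separated fields, with an identifier first and the text second.

# Each input line is expected to look like: "<post_id>\t<text>..."
term_freq_all('/path/to/forum_dump', 'posts.tsv')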
Example 7: token_aft
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def token_aft(list_line, command):
    # check the token after a label; the label belongs to the command ('svc', 'road', 'busstop')
    text = ''
    list_length = []
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0
        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')  # list of sentences
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')  # list of labels for each word
        list_length.append(len(split_first))

        if command == 'svc':
            for k in range(0, len(split_second)):
                # check the frequency of the token after a bus service
                if int(split_second[k]) == 1:  # means bus svc
                    if k < len(split_second) - 1:  # bus svc doesn't appear at the last position of the sentence
                        # try:  # don't use stemming here
                        #     stem_word = port.stem(connect_token(split_first[k - 1].lower()))  # take the token before
                        # except UnicodeDecodeError:
                        #     stem_word = connect_token(split_first[k - 1].lower())
                        stem_word = connect_token(split_first[k + 1].lower())  # take the token after the label
                        if is_int(stem_word) is False:
                            text = text + stem_word + ' '
                        # if stem_word == 'sd' or stem_word == 'dd':
                        #     print list_line[i]

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        # print value[0], '\t', value[1]
        print value[0]
    print text
Example 8: term_freq_time
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def term_freq_time(first, last):
    ## get posts from the sgforums dump created within the given time window
    db = MySQLdb.connect(host="localhost",    # your host, usually localhost
                         user="root",         # your username
                         passwd="ducthong",   # your password
                         db="sgforums_singaporebuses")  # name of the database

    # you must create a Cursor object. It will let
    # you execute all the queries you need
    cur = db.cursor()

    # Use all the SQL you like
    sql = "select p.post_id, s.createdAtSecond, p.summary from posts_filter p, posts_createatsecond s where p.post_id = s.post_id and s.createdAtSecond >= " \
          + str(first) + " and s.createdAtSecond <= " + str(last) + " order by s.createdAtSecond;"
    cur.execute(sql)  # run the query against the posts tables

    fdist = FreqDist()
    for row in cur.fetchall():
        post_id = str(row[0])
        createdAtSecond = str(row[1])
        summary = unicode(str(row[2]), errors='ignore')
        # print (post_id + '\t' + createdAtSecond + '\t' + summary)
        words = nltk.word_tokenize(summary.lower().strip().decode('utf-8'))
        # try:
        #     words = nltk.word_tokenize(summary.lower().strip().decode('utf-8'))
        # except:
        #     print (post_id + '\t' + summary)
        fdist.update(words)
    cur.close()

    print ('==========================================')
    print ('==========================================')
    print (len(fdist))

    stop = stopwords.words('english')
    for value in fdist.most_common(200):
        if (value[0] not in stop and len(value[0]) >= 3):
            print (str(value[0]).encode('utf-8') + '\t' + str(value[1]))
Example 9: check_bef_aft_roadBusStop
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def check_bef_aft_roadBusStop(list_line, command):
    text = ''
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0
        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')

        k = 0
        while True:
            if k >= len(split_second):
                break
            if command == 'bef_road':
                try:
                    if int(split_second[k]) == 2:  # take road
                        if k > 0:
                            text = text + connect_token(split_first[k - 1].lower()) + ' '  # take the word before
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 2:
                                    break
                    else:
                        k += 1
                except ValueError:
                    k += 1
            if command == 'aft_road':
                try:
                    if int(split_second[k]) == 2:  # take road
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 2:
                                    break
                        if k < len(split_second) - 1:
                            if is_int(split_first[k]) is False:
                                text = text + connect_token(split_first[k].lower()) + ' '  # take the token after the label
                    else:
                        k += 1
                except ValueError:
                    k += 1
            if command == 'bef_busstop':
                try:
                    if int(split_second[k]) == 3:  # take busstop
                        if k > 0:
                            text = text + connect_token(split_first[k - 1].lower()) + ' '  # take the word before
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 3:
                                    break
                    else:
                        k += 1
                except ValueError:
                    k += 1
            if command == 'aft_busstop':
                try:
                    if int(split_second[k]) == 3:  # take busstop
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 3:
                                    break
                        if k < len(split_second) - 1:
                            if is_int(split_first[k]) is False:
                                text = text + connect_token(split_first[k].lower()) + ' '  # take the token after the label
                    else:
                        k += 1
                except ValueError:
                    k += 1

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print value[0], '\t', value[1]
    print text
Example 10: ngrams
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk import FreqDist

sentence = 'this is a foo bar sentences and i want to ngramize it this this'

# n = 3
# list_grams = ngrams(sentence.split(), n)
#
# for grams in list_grams:
#     string = ''
#     for value in grams:
#         string = string + ' ' + value
#     print (string.strip())

fdist = FreqDist()
tokens = word_tokenize(str(sentence))
fdist.update(tokens)
for value in fdist.most_common():
    print value

i = 11
for i in range(0, 10):
    i = i + 2
print 'testing'

text = 'Mount Batten Rd Haig Rd Sims Ave'
split_text = text.split('Rd')
for value in split_text:
    print value
Example 11: __init__
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
class BiWordExtractor:

    def __init__(self, pickle_file):
        self._statuses = pickle.load(open(pickle_file, 'rb'))
        self._averages = dict()
        self._gender_stats = dict()
        self.fdistneuro = FreqDist()
        self.fdistnonneuro = FreqDist()
        self.highneuro = defaultdict()
        self.highnonneuro = defaultdict()

    """
    Processes statuses. (For information on how the different data structures
    are set up, look at the comments for the getters.)
    """
    def wordprocess(self):
        lengths = dict()
        row = 0
        for status in self._statuses[1:]:
            row += 1
            print row
            user = status[0]
            filtered_status = status[1].translate(string.maketrans("", ""), string.punctuation)
            tokens = pattern_split.split(filtered_status.lower())
            filtered_tokens = [w for w in tokens if w not in stopwordslist and w not in filterlist]
            bitokens = nltk.bigrams(filtered_tokens)
            if status[5] == '+':
                self.fdistneuro.update(bitokens)
            elif status[5] == '-':
                self.fdistnonneuro.update(bitokens)

    def neuro_word_frequency(self):
        vocneuro = self.fdistneuro.keys()
        highvocneuro = vocneuro[:300]
        return highvocneuro

    def highneuro_word_frequency(self):
        for w in self.neuro_word_frequency():
            if self.fdistneuro[w] >= 5:
                self.highneuro[w] = self.fdistneuro[w]
        print self.highneuro.items()
        print self.highneuro.keys()
        return self.highneuro.keys()
Example 12: token_bef
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def token_bef(list_line, command):
    # check the token before a label; the label belongs to the command ('svc', 'road', 'busstop')
    port = PorterStemmer()
    text = ''
    list_length = []
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0
        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')  # list of sentences
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')  # list of labels for each word
        list_length.append(len(split_first))

        if command == 'svc':
            for k in range(0, len(split_second)):
                # check the frequency of the token before a bus service
                if int(split_second[k]) == 1:  # means bus svc
                    if k > 0:  # bus svc doesn't appear at the first position of the sentence
                        # try:  # don't use stemming here
                        #     stem_word = port.stem(connect_token(split_first[k - 1].lower()))  # take the token before
                        # except UnicodeDecodeError:
                        #     stem_word = connect_token(split_first[k - 1].lower())
                        stem_word = connect_token(split_first[k - 1].lower())
                        if is_int(stem_word) is False:
                            text = text + stem_word + ' '

        elif command == 'road':
            k = 0
            while True:
                if k >= len(split_second):
                    break
                else:
                    try:
                        if int(split_second[k]) == 2:  # means road
                            if k > 0:
                                stem_word = connect_token(split_first[k - 1].lower())
                                if is_int(stem_word) is False:
                                    text = text + stem_word + ' '  # take the word before
                            while True:
                                k += 1
                                if k == len(split_second):
                                    break
                                else:
                                    if int(split_second[k]) != 2:
                                        break
                        else:
                            k += 1
                    except ValueError:
                        k += 1

        elif command == 'busstop':
            k = 0
            while True:
                if k >= len(split_second):
                    break
                else:
                    try:
                        if int(split_second[k]) == 3:  # means bus stop
                            if k > 0:
                                stem_word = connect_token(split_first[k - 1].lower())
                                if is_int(stem_word) is False:
                                    text = text + stem_word + ' '  # take the word before
                            while True:
                                k += 1
                                if k == len(split_second):
                                    break
                                else:
                                    if int(split_second[k]) != 3:
                                        break
                        else:
                            k += 1
                    except ValueError:
                        k += 1

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print value[0], '\t', value[1]
        # print value[0]
    print text
Example 13: while
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
import sys
from nltk import FreqDist

# Assumed setup (not shown in the original excerpt): a read buffer size and an
# empty frequency distribution that collects the vocabulary counts.
BUF_SIZE = 2 ** 20
vocab = FreqDist()

in_str = sys.stdin.read(BUF_SIZE)
rest = ''
read_count = 0

while (rest + in_str).strip() != '':
    read_count += 1
    if read_count % 100 == 0:
        sys.stderr.write('.')
        sys.stderr.flush()

    tokens = (rest + in_str).split()
    rest = tokens.pop()
    if not tokens:
        # note: rest is a plain string here, so update() counts its characters;
        # vocab.update([rest]) was probably intended
        vocab.update(rest)
        break
    else:
        vocab.update(tokens)
    in_str = sys.stdin.read(BUF_SIZE)

print

for i in [1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000]:
    if i > len(vocab.values()):
        break
    # relies on the old NLTK 2 FreqDist, whose values() are ordered by decreasing count
    print "vocab size %7d - cutoff = %d" % (i, vocab.values()[i])
Example 14: FreqDist
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
all_words = FreqDist(w.lower() for w in train_set_words).keys()

def tweet_features(tweet):
    tweet_words = word_tokenize(tweet)
    features = {}
    for word in all_words:
        features['contains({})'.format(word)] = (word in tweet_words)
    return features

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for item in train_set:
    tweet = item[0].lower()
    words = word_tokenize(item[0])
    word_fd.update(words)
    label_word_fd[item[1]].update(words)

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}
for word, freq in word_fd.iteritems():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                           (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                           (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score
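The excerpt ends once word_scores is filled; a possible continuation (not part of the original code) mirrors Example 1 and keeps only the highest-scoring words as the feature vocabulary. The cut-off of 1000 is an arbitrary choice for illustration.

best = sorted(word_scores.items(), key=lambda t: t[1], reverse=True)[:1000]
best_words = set(w for w, s in best)  # hypothetical 1000-word feature set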
Example 15: parse_ngram_order
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
if word_limit:
    logging.info('Word limit %d' % word_limit)

order = parse_ngram_order(opts.ngram_order)
logging.info('Char n-gram order (%d, %d)' % order)
cutoff = opts.min_count

corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), order=order, word_limit=word_limit)

tf = FreqDist()
df = FreqDist()
n_docs = 0
for text in corpus:
    n_docs += 1
    tf.update(text)        # term frequencies
    df.update(set(text))   # document frequencies

print "###TOTAL###\t%d\t%d" % (tf.N(), n_docs)

# relies on the old NLTK FreqDist, whose items() are ordered by decreasing count
for token, freq in tf.items():
    if freq < cutoff:
        break
    print "%s\t%d\t%d\t%.6f" % (token, freq, df[token], math.log(float(n_docs)/df[token]))