

Python FreqDist.keys Method Code Examples

This article collects and summarizes typical usage examples of the nltk.FreqDist.keys method in Python. If you are wondering what FreqDist.keys does, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples for the class it belongs to, nltk.FreqDist.


The following presents 15 code examples of the FreqDist.keys method, sorted by popularity by default.
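Most of the examples below target NLTK 2 and Python 2, where FreqDist.keys() returned the samples sorted from most to least frequent, so a slice such as fdist.keys()[:50] yielded the 50 most frequent words. In NLTK 3, FreqDist subclasses collections.Counter: keys() returns an unsorted, unsliceable dict view, and the ranking idiom is most_common(). The following minimal sketch (not taken from any of the projects below, and assuming NLTK 3) contrasts the two readings:

from nltk import FreqDist

tokens = "the cat sat on the mat with the cat".split()
fdist = FreqDist(tokens)

# NLTK 3: keys() is a plain dict view; it is not sorted by frequency
# and cannot be sliced directly.
print(list(fdist.keys()))

# Modern equivalent of the old NLTK 2 idiom fdist.keys()[:2]:
top_two = [word for word, count in fdist.most_common(2)]
print(top_two)  # ['the', 'cat']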

Example 1: prepare_pos_features

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def prepare_pos_features(Language_model_set, output_file):
    corpus_root = '/home1/c/cis530/data-hw2/' + Language_model_set
    texts = PlaintextCorpusReader(corpus_root, '.*')
    text = texts.words()
    tagged_text = nltk.pos_tag(text)
    merged_tag_text = mergeTags(tagged_text)
    lists = seperate_pos(merged_tag_text)
    nouns_dist = FreqDist(lists[0])
    top_nouns = nouns_dist.keys()[:200]
    verbs_dist = FreqDist(lists[1])
    top_verbs = verbs_dist.keys()[:200]
    advs_dist = FreqDist(lists[2])
    top_advs = advs_dist.keys()[:100]
    prep_dist = FreqDist(lists[3])
    top_preps = prep_dist.keys()[:100]
    adjs_dist = FreqDist(lists[4])
    top_adjs = adjs_dist.keys()[:200]


    out = open(output_file, 'w')

    for n in top_nouns:
        out.write('NN'+ n + '\n')
    for v in top_verbs:
        out.write('VV'+ v + '\n')
    for av in top_advs:
        out.write('ADV'+ av + '\n')
    for p in top_preps:
        out.write('PREP'+ p + '\n')
    for aj in top_adjs:
        out.write('ADJ'+ aj + '\n')
Developer: madhuraraju, Project: NLP_Class_Code_Samples, Lines: 33, Source: CL_Two_Code_rmadhura.py

Example 2: main

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def main():
    keyword_list = ["Top Secret", "Secret Service", "Classified", "Targeted", "Assassination",
                    "Kill Program", "NSA", "wire", "CIA", "FBI", "DEA", "DOJ", "hackers",
                    "hacker", "exploit code", "Defense", "Intelligence", "Agency"]
    file_name = "tweets_output.txt"
    pickle_words_file = "words.pickle"
    pickle_words(file_name, pickle_words_file, keyword_list)
    pickle_tweets_file = "tweets.pickle"
    pickle_tweets(file_name, pickle_tweets_file)
    words = load(open("words.pickle"))
    tweets = load(open("tweets.pickle"))
    freq_dist = FreqDist(words)
    print tweets
    print("===")
    print("Conducting Frequency and Lexical Diversity Analysis of Twitter Search Space: ")
    print("===")
    print("Number of words within the twitter search space: ")
    print(len(words))
    print("Number of unique words within twitter search space: ")
    print(len(set(words)))
    print("Lexical Diversity of unique words within twitter search space: ")
    print(lexical_diversity(words))
    print("===")
    print("Conducting Native Language Processing Analysis Utilizing Python NLTK")
    print("===")
    print("Top 50 Frequent Words within the Twitter Search Space: ")
    print(freq_dist.keys()[:50])
    print("===")
    print("Bottom 50 Frequent Words within the Twitter Search Space: ")
    print(freq_dist.keys()[-50:])
    print("===")
Developer: 0day1day, Project: OSINT, Lines: 33, Source: nltk_tweet_analysis.py

Example 3: handle

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
    def handle(self, *args, **options):
    	fdist = FreqDist()
    	print "Analyzing raw data"
    	limit = 10
    	if args:
    		raw_datas = RawData.objects.filter(pk__in=args)
    	else:
    		raw_datas = RawData.objects.all()[:limit]
    	tagged_data = []
    	for raw_data in raw_datas:
    		words = nltk.word_tokenize(raw_data.data)
    		tagged_data.extend(nltk.pos_tag(words))
    		for word in words:
    			word = word.strip()
    			if word:
	    			fdist.inc(word)

    	print "Anaylzed %s items" % len(raw_datas)
    	print

    	print "Top word: %s" % fdist.max()
    	print 

    	print "Top 10 words"
    	for word in fdist.keys()[:10]:
    		times = fdist[word]
    		print " -- %s occurred %s times" % (word, times)
    	print

    	
    	print "Bottom 10 words"
    	for word in fdist.keys()[-10:]:
    		times = fdist[word]
    		print " -- %s occurred %s times" % (word, times)
    	print

    	print "Words occurring between 50-100 times"
    	words = [ word for word in fdist.keys() if fdist[word] >= 50 and fdist[word] <= 100 ]
    	print ", ".join(words)


    	cfdist = ConditionalFreqDist()
    	for (word, tag) in tagged_data:
    		cfdist[tag].inc(word)
    	
    	print "Most popular noun: %s" % cfdist["NN"].max()
    	print 

    	print "Top 50 nouns"
    	for word in cfdist["NN"].keys()[:50]:
    		times = cfdist["NN"][word]
    		print " -- %s occurred %s times" % (word, times)
    	print
Developer: jaywhy13, Project: mapstream, Lines: 55, Source: analyze.py
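The handle command above leans on two NLTK 2 idioms that were removed in NLTK 3: FreqDist.inc() no longer exists (counts are updated by item assignment, since FreqDist is a Counter subclass), and keys() is no longer frequency-sorted. A rough NLTK 3 equivalent of the counting portion, offered only as a sketch (it assumes the punkt and averaged_perceptron_tagger data have been downloaded and uses a toy sentence in place of the RawData model):

import nltk
from nltk import FreqDist, ConditionalFreqDist

words = nltk.word_tokenize("the quick brown fox jumps over the lazy dog near the fox")

fdist = FreqDist()
for word in words:
    fdist[word] += 1                    # replaces fdist.inc(word)

print("Top word: %s" % fdist.max())
print("Top 3 words: %s" % [w for w, _ in fdist.most_common(3)])

cfdist = ConditionalFreqDist()
for word, tag in nltk.pos_tag(words):
    cfdist[tag][word] += 1              # replaces cfdist[tag].inc(word)

print("Most popular noun: %s" % cfdist["NN"].max())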

Example 4: entropy

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def entropy(tokens):
    """
    Get the Shannon entropy of a document using its token distribution.
    :param tokens: A document represented as a list of tokens.
    :return: The Shannon entropy of the document, in bits.
    """
    doc_len = len(tokens)
    frq = FreqDist(tokens)
    for key in frq.keys():
        frq[key] /= doc_len
    ent = 0.0
    for key in frq.keys():
        ent += frq[key] * math.log(frq[key], 2)
    ent = -ent
    return ent
Developer: rug-compling, Project: glad, Lines: 17, Source: glad-main.py
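One caveat about the entropy example: under Python 2, the in-place division frq[key] /= doc_len truncates to zero unless from __future__ import division is in effect, and it also mutates the FreqDist counts. A defensive variant of the same computation, shown here only as a sketch, builds the probabilities separately and behaves the same under Python 2.7 and Python 3:

from __future__ import division
import math
from nltk import FreqDist

def shannon_entropy(tokens):
    """Shannon entropy (in bits) of a token list, from its frequency distribution."""
    doc_len = len(tokens)
    frq = FreqDist(tokens)
    # Turn raw counts into probabilities without modifying the FreqDist itself.
    probs = [count / doc_len for count in frq.values()]
    return -sum(p * math.log(p, 2) for p in probs)

print(shannon_entropy("a a b b b c".split()))  # ~1.459 bits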

Example 5: draw_word2vec

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def draw_word2vec():
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load("output/data_cache.pk")
    print "Read in finished"

    ### Load pre-train word2vector model
    word2vec = get_word2vec(model="data/GoogleNews-vectors-negative300.bin", binary=True, size=300)
    print "Pretrained word2vec loaded"

    all_tokens = sum(dataloader.data.viewvalues(), [])
    print "#Tokens: " + str(len(all_tokens))
    fdist = FreqDist(all_tokens)
    tokens = fdist.keys()[1:500]
    print tokens
    tokens_has_vectors = []
    for token in tokens:
        if word2vec[token] is not None:
            tokens_has_vectors.append(token)

    print "#Unique Tokens \w Vectors: " + str(len(tokens_has_vectors))
    vectors = word2vec.encode(tokens_has_vectors)
    print "#Unique Vectors: " + str(len(vectors))

    print ("Computing MDS embedding")
    clf = manifold.MDS(n_components=2, n_init=1, max_iter=2000)
    # clf = manifold.Isomap(n_components=2, max_iter=100)
    vectors_mds = clf.fit_transform(vectors)
    print ("Done. Stress: %f" % clf.stress_)
    plot_embedding(vectors_mds, tokens_has_vectors, "MDS embedding of the words")
Developer: vdavid70619, Project: Prediction-of-Derpression-using-NLP, Lines: 32, Source: draw_word2vec.py

Example 6: compress_term_matrix

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def compress_term_matrix(matrix, words):
    initials = [item[0] for item in words]
    
    fdist = FreqDist(initials)
    
    letterindices = []
    for letter in sorted(fdist.keys()):
        letterindices.append((letter, fdist[letter]))
    
    indexmatrix = []
    start = 0
    for letter, occ in letterindices:
        newocc = occ / 5
        
        print letter,"  ",occ
        print " range: ", start,"  ", start+occ,"  ",newocc
        indexes = np.random.random_integers(start, start+occ, newocc)
        indexmatrix.append((letter, indexes.tolist()))
        start = start+ occ
    
    allindices = []
    for _,v in indexmatrix:
        allindices.extend(v)
    smatrix = matrix[allindices, :]
    return indexmatrix, smatrix                
Developer: dicleoztur, Project: subjectivity_detection, Lines: 27, Source: numericutils.py

Example 7: find_abbreviations

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def find_abbreviations():
    import db
    from tokenizers import es
    from nltk import FreqDist

    corpus = db.connect()
    #text = '\n'.join([a['text'] for a in corpus.articles.find().limit(10)])
    text = '\n'.join([a['text'] for a in corpus.articles.find()])
    tokens = es.tokenize(text, ignore_abbreviations=True)

    fd = FreqDist()
    fd_abbr = FreqDist()
    fd_n_abbr = FreqDist()
    n_tokens = len(tokens)
    for i in range(n_tokens):
        fd.inc(tokens[i])
        if i < (n_tokens - 1) and tokens[i + 1] == u'.':
            fd_abbr.inc(tokens[i])
        else:
            fd_n_abbr.inc(tokens[i])

    adjusted = {}
    f_avg = len(fd.keys()) / fd.N()
    for t, n in fd_abbr.iteritems():
        f = fd.get(t, 0) / fd.N()
        deviation = 1 + (f - f_avg)
        adjusted[t] = n * deviation / fd_n_abbr.get(t, 1) / len(t)

    items = adjusted.items()
    items.sort(key=lambda i: i[1], reverse=True)
    for t, n in items[:100]:
        print u'%s. %f (%d, %d)' % (t, n, fd_abbr[t], fd_n_abbr.get(t, 0))
Developer: nosamanuel, Project: nlp, Lines: 34, Source: punctuation.py

Example 8: parse

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def parse(filename):
    outfilename = filename + ".freq"
    entry_string = open(filename, 'r').read()
 
    # convert to lower case 
    entry_string = entry_string.lower() 

    # remove punctuation 
    for c in string.punctuation: 
            entry_string = entry_string.replace(c, " ") 

    # remove everything except letters and spaces
    entry_string = re.sub("[^a-z ]", " ", entry_string) 

    # strip out multiple spaces 
    entry_string = re.sub(r'\s+', r' ', entry_string) 

    # make the string into a list and remove stopwords from it 
    entry_string_split = entry_string.split() 
    entry_string_no_stopwords = remove_stopwords(entry_string_split) 

    fd = FreqDist(entry_string_no_stopwords)

    fout = open(outfilename, "w")
    sys.stdout.write(outfilename + "\n")
    fout.write(" ".join(fd.keys()))
    fout.close() 
Developer: KemingChen, Project: most_frequent_words, Lines: 29, Source: ohsumed_most_frequent.py

Example 9: category_by_movie

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories() for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    word_features = all_words.keys()[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    #print document_features(mr.words('pos/cv957_8737.txt'))
    #print documents[0]

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    print classify.accuracy(classifier, train_set)
Developer: brenden17, Project: infinity, Lines: 32, Source: category_nltk.py

Example 10: category_by_pos

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])

    common_suffixes = suffix_fdist.keys()[:100]
#    print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = DecisionTreeClassifier.train(train_set)
#    print 'Decision Tree %f' % classify.accuracy(classifier, test_set)

    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBay %f' % classify.accuracy(classifier, test_set)
Developer: brenden17, Project: infinity, Lines: 34, Source: category_nltk.py

Example 11: top

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
   def top(self, tokens, lowest_rank=50):
      ''' A list of the most frequent (non-stopword) tokens '''
      from operator import itemgetter
      content = self.words(tokens)

      fdist = FreqDist(content)
      vocab = iter(fdist.keys())

      # Forget all previous ranking
      self.lower_words = {}
      frequency = 0
      while frequency < lowest_rank:
         try:
            word = vocab.next()
         except StopIteration:
            break

         word_lower = word.lower()
         if word_lower in self.lower_words:
            self.lower_words[word_lower] = self.lower_words[word_lower] + fdist[word]
         else:
            self.lower_words[word_lower] = fdist[word]

         frequency = frequency + 1

#      return sorted(self.lower_words, key=itemgetter(1), reverse=True)
      return map(itemgetter(0), sorted(self.lower_words.items(), key=itemgetter(1), reverse=True))
Developer: colgur, Project: reader_pipeline, Lines: 29, Source: nlp.py

Example 12: main

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def main():
    userInput = parser.getInput()
    fileList = parser.getFiles(userInput['train'])
    pdata = parser.parseFiles(fileList)





    allsent = ''
    for f in pdata:
        allsent += f[3]

    all_words = FreqDist(w.lower()
                    for w in word_tokenize(allsent)
                        if w not in stopwords.words('english') )

    global top_words
    top_words = all_words.keys()[:500]

    # pdata = getParseData()
    featdata = featureAggregator(pdata)







    print featdata[:10]
Developer: seekshreyas, Project: nlp-reviews-classifier, Lines: 32, Source: extractor.py

Example 13: get_word_features

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def get_word_features(wordlist):

    wordlist = FreqDist(wordlist)

    word_features = wordlist.keys()

    return word_features
Developer: toshi09, Project: UserProfilingInSocialMedia, Lines: 9, Source: naive_bayes_nltk.py

Example 14: features

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def features(word_list):
	freq = FreqDist(word_list)
	f = freq.keys()
	return {
		'biology': 'biolog' in word_list,
		'engineering': 'engin' in word_list,
		'animal' : 'anim' in word_list,
		'behavior': 'behavy' in word_list,
		'chemistry': 'chem' in word_list,
		'health': 'heal' in word_list,
		'physics': 'phys' in word_list,
		'math': 'math' in word_list,
		'plant': 'plant' in word_list,
		'earth': 'earth' in word_list,
		'biochemistry': 'biochem' in word_list,
		'social': 'soc' in word_list,
		'planet': 'planet' in word_list,
		'temperature': 'temperature' in word_list,
		'blood': 'blood' in word_list,
		'tube': 'tube' in word_list,
		'pyschology': 'pyscholog' in word_list,
		'protein': 'protein' in word_list,
		'gene': 'gen' in word_list,
		'most_0': f[0],
		'most_1': f[1],
		'most_2': f[2],
		'most_3': f[3],
		'most_4': f[4],
		'most_5': f[5],
		'most_6': f[6],
		'most_7': f[7],
		}
Developer: CamBurris, Project: Capstone, Lines: 34, Source: classify.py

Example 15: bag_of_words

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def bag_of_words(data, label_codebook, feature_codebook, theta):
    """"""
    word_dict = Alphabet()
    stopset = set(stopwords.words('english'))
    for key, value in data.items():
        label_codebook.add(key)
        for doc in value:
            doc_tokens = set(nltk.regexp_tokenize(doc, pattern="\w+"))
            for word in doc_tokens:
                if word not in stopset:
                    word_dict.add(word)
                    
    all_words = word_dict._label_to_index.keys()
    fdict = FreqDist([w for w in all_words])
    word_feature = fdict.keys()[theta:]
    for word in all_words:
        if word in word_feature:
            feature_codebook.add(word)
    
    instance_list = {}
    for label, document_list in data.items():
        instance_list[label] = []
        for document in document_list:
            vector = np.zeros(feature_codebook.size())
            tokens = set(nltk.regexp_tokenize(document, pattern="\w+"))
            indice = 0
            
            for word in tokens:
                if feature_codebook.has_label(word):
                    indice = feature_codebook.get_index(word)
                    vector[indice] = 1.
            instance_list[label].append(vector)
    return instance_list
Developer: Juicechuan, Project: workspace, Lines: 35, Source: naive_bayes.py


Note: The nltk.FreqDist.keys method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective developers; copyright of the source code belongs to the original authors. Please consult each project's license before distributing or reusing the code, and do not reproduce this article without permission.