This article collects typical usage examples of nltk.FreqDist in Python. If you are wondering what exactly nltk.FreqDist does and how to use it in practice, the curated code examples below may help. You can also explore further usage examples from the nltk module, where FreqDist is defined.
Below are 15 code examples of nltk.FreqDist, ordered by popularity by default.

Example 1: do_analysis

# Required import: import nltk
# Or: from nltk import FreqDist
def do_analysis(dataset_obj):
    # 1. all sample classification distribution
    # 2. all sentence sample classification distribution
    sample_num = dataset_obj.sample_num
    collect = []
    sent_collect = []
    for trees in dataset_obj.nn_data:
        for sample in trees:
            sentiment_float = sample['root_node']['sentiment_label']
            sentiment_int = cfg.sentiment_float_to_int(sentiment_float)
            if sample['is_sent']:
                sent_collect.append(sentiment_int)
            collect.append(sentiment_int)
    all_pdf = nltk.FreqDist(collect)
    sent_pdf = nltk.FreqDist(sent_collect)
    print('sample_num:', sample_num)
    print('all')
    print(all_pdf.tabulate())
    print('sent')
    print(sent_pdf.tabulate())
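
dataset_obj and cfg above are project-specific helpers, so the snippet is not runnable on its own. A minimal standalone sketch of the same FreqDist/tabulate pattern, using made-up sentiment labels:

import nltk

labels = [0, 1, 1, 2, 2, 2, 4]        # hypothetical sentiment classes
pdf = nltk.FreqDist(labels)
pdf.tabulate()                        # prints each label with its count, most frequent first
print(pdf.most_common(2))             # [(2, 3), (1, 2)]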

Example 2: profile

# Required import: import nltk
# Or: from nltk import FreqDist
def profile(self, text):
    ''' Create FreqDist of trigrams within text '''
    from nltk import word_tokenize, trigrams, FreqDist
    clean_text = self.remove_punctuation(text)
    tokens = word_tokenize(clean_text)
    fingerprint = FreqDist()
    for t in tokens:
        token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
        token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
        for cur_trigram in token_trigrams:
            if cur_trigram in fingerprint:
                fingerprint[cur_trigram] += 1
            else:
                fingerprint[cur_trigram] = 1
    return fingerprint
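
Since remove_punctuation and the class's start/end markers are not shown, here is a self-contained sketch of the same character-trigram fingerprint idea, using '_' as a stand-in boundary character and a plain split() instead of word_tokenize:

from nltk import trigrams, FreqDist

fingerprint = FreqDist()
for token in "the quick brown fox".split():
    for tri in trigrams('_' + token + '_'):   # character trigrams of '_the_', '_quick_', ...
        fingerprint[''.join(tri)] += 1
print(fingerprint.most_common(5))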

Example 3: create_freq_dist

# Required import: import nltk
# Or: from nltk import FreqDist
def create_freq_dist(in_lst, exclude):
    """Create a frequency distribution.

    Parameters
    ----------
    in_lst : list of str
        Words to create the frequency distribution from.
    exclude : list of str
        Words to exclude from the frequency distribution.

    Returns
    -------
    freqs : nltk.FreqDist
        Frequency distribution of the words.

    Examples
    --------
    Compute the frequency distribution of a collection of words:

    >>> ArticlesAll.create_freq_dist(in_lst=['brain', 'brain', 'head', 'body'], exclude=['body'])
    FreqDist({'brain': 2, 'head': 1})

    If you want to visualize a frequency distribution, you can plot them as a wordcloud:

    >>> from lisc.plts.words import plot_wordcloud
    >>> freq_dist = nltk.FreqDist({'frontal': 26, 'brain': 26, 'lobe': 23, 'patients': 19})
    >>> plot_wordcloud(freq_dist, len(freq_dist))
    """
    freqs = nltk.FreqDist(in_lst)

    for excl in exclude:
        try:
            freqs.pop(excl.lower())
        except KeyError:
            pass

    return freqs
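
Outside the ArticlesAll class, the same behaviour can be reproduced with a plain FreqDist; a quick sketch mirroring the doctest above:

import nltk

freqs = nltk.FreqDist(['brain', 'brain', 'head', 'body'])
for excl in ['body']:
    freqs.pop(excl.lower(), None)     # silently skip words that are not present
print(freqs)                          # FreqDist({'brain': 2, 'head': 1})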

Example 4: bigram_counts

# Required import: import nltk
# Or: from nltk import FreqDist
from collections import Counter

def bigram_counts(word_list):
    bgs = nltk.bigrams(word_list)
    fdist = nltk.FreqDist(bgs)
    d = Counter()
    for k, v in fdist.items():
        d[k] = v
    return d
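
A quick usage sketch with a toy token list (assuming `import nltk` and the Counter import shown above):

counts = bigram_counts("the cat sat on the mat the cat".split())
print(counts[('the', 'cat')])         # 2
print(counts.most_common(2))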

Example 5: describe

# Required import: import nltk
# Or: from nltk import FreqDist
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and
    returns a dictionary with a variety of metrics
    concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()

    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        for sent in para:
            for word, tag in sent:
                counts['words'] += 1
                tokens[word] += 1

    # Return data structure with information
    return {
        'words':  counts['words'],
        'vocab':  len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
    }
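
paras() belongs to a custom corpus reader, but the words/vocab/lexical-diversity arithmetic can be checked with a plain token list; a minimal sketch:

import nltk

tokens = "the cat sat on the mat".split()
freq = nltk.FreqDist(tokens)
words, vocab = len(tokens), len(freq)
print({'words': words, 'vocab': vocab, 'lexdiv': words / vocab})   # 6 words, 5 types, 1.2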

Example 6: _calculate_word_scores

# Required import: import nltk
# Or: from nltk import FreqDist
def _calculate_word_scores(self, phrase_list):
    word_freq = nltk.FreqDist()
    word_degree = nltk.FreqDist()
    for phrase in phrase_list:
        # degree = len(filter(lambda x: not isNumeric(x), phrase)) - 1
        # The line above fails in Python 3, where filter() returns an iterator:
        degree = len(list(filter(lambda x: not isNumeric(x), phrase))) - 1
        for word in phrase:
            # word_freq.inc(word)  # old NLTK API, replaced by item assignment:
            word_freq[word] += 1
            # word_degree.inc(word, degree)  # other words
            word_degree[word] = degree
    for word in word_freq.keys():
        word_degree[word] = word_degree[word] + word_freq[word]  # itself
    # word score = deg(w) / freq(w)
    word_scores = {}
    for word in word_freq.keys():
        word_scores[word] = word_degree[word] / word_freq[word]
    return word_scores

Example 7: load_data

# Required import: import nltk
# Or: from nltk import FreqDist
def load_data():
    global N, words, labels

    posts = corpus.xml_posts()[:10000]
    freqs = [ FreqDist(post.text) for post in posts ]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and
                        word not in punctuation))
    labels = list(set([ post.get('class') for post in posts ]))

    data = []
    N = len(words)
    for post, dist in zip(posts, freqs):
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, labels.index(post.get('class'))))
    return data
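
Vol and the rest of the network plumbing are project-specific, but dist.freq() is plain NLTK: it returns the relative frequency of a sample rather than its raw count. A tiny illustration:

from nltk import FreqDist

dist = FreqDist("to be or not to be".split())
print(dist['to'], dist.freq('to'))    # 2 0.333... (2 of 6 tokens)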

Example 8: load_data

# Required import: import nltk
# Or: from nltk import FreqDist
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and
                        word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, V.w))
    return data

Example 9: load_data

# Required import: import nltk
# Or: from nltk import FreqDist
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and
                        word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        x = volumize(dist)
        data.append((x, x.w))
    return data

Example 10: test

# Required import: import nltk
# Or: from nltk import FreqDist
def test():
    gt = GetTweets()
    documents = gt.get_hashtag('ferguson', count=20)
    documents += gt.get_hashtag('police', count=21)
    print('Query:', documents[-1])
    tokenizer = RegexpTokenizer(r'\w+')
    vols = []
    for doc in documents:
        samples = []
        for token in tokenizer.tokenize(doc):
            word = token.lower()
            if word not in ENGLISH_STOP_WORDS and word not in punctuation:
                samples.append(word)
        vols.append(volumize(FreqDist(samples)))

    vectors = [ doc_code(v) for v in vols[:-1] ]
    query_vec = doc_code(vols[-1])

    sims = [ cos(v, query_vec) for v in vectors ]
    m = max(sims)
    print(m, documents[sims.index(m)])
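
GetTweets, volumize, doc_code and cos are project helpers, but the tokenise/filter/count front end can be tried in isolation; a sketch with a hard-coded document and a hypothetical stop-word set (the regex tokenizer already drops punctuation):

from nltk import FreqDist
from nltk.tokenize import RegexpTokenizer

stop_words = {'in', 'the'}            # stand-in for ENGLISH_STOP_WORDS
tokenizer = RegexpTokenizer(r'\w+')
doc = "Police in #Ferguson, police in the streets"
samples = [t.lower() for t in tokenizer.tokenize(doc) if t.lower() not in stop_words]
print(FreqDist(samples).most_common())   # [('police', 2), ('ferguson', 1), ('streets', 1)]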

Example 11: gene_token_freq_info

# Required import: import nltk
# Or: from nltk import FreqDist
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0

    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))

    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token),
                           look_up_dict(question_token_dict, token)))

    # question tokens in context and question dicts
    question_tf = []
    for token in question_token:
        question_tf.append((look_up_dict(context_token_dict, token),
                            look_up_dict(question_token_dict, token)))

    return {'context': context_tf, 'question': question_tf}
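
A usage sketch with toy token lists (only `import nltk` is needed); each entry pairs a token's count in the context with its count in the question:

context = "the cat sat on the mat".split()
question = "where did the cat sit".split()
tf = gene_token_freq_info(context, question)
print(tf['context'][:3])              # [(2, 1), (1, 1), (1, 0)] for 'the', 'cat', 'sat'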

Example 12: __init__

# Required import: import nltk
# Or: from nltk import FreqDist
def __init__(self, documents, terms, classes, class_types, frequency, main_class, min_docs):
    self.terms = terms  # the terms used to build the lexicon
    self.documents = documents
    self.classes = classes
    self.terms_frequency = frequency
    self.terms_frequency_per_class = dict()
    self.main_class = main_class
    # the minimum support for a term (i.e., number of documents in the class of interest
    # in order to be considered)
    self.min_docs = min_docs
    self.class_occ = dict()
    for c in class_types:
        self.terms_frequency_per_class[c] = nltk.FreqDist()
        self.class_occ[c] = classes.count(c)
    for i, doc in enumerate(self.documents):
        cls = self.classes[i]
        for t in doc:
            # self.terms_frequency_per_class[cls].inc(t)  # old NLTK API, replaced below:
            self.terms_frequency_per_class[cls][t] += 1

# the scoring functions return the list of discriminative terms for the class of interest according to each metric

Example 13: __init__

# Required import: import nltk
# Or: from nltk import FreqDist
def __init__(self, order, alpha, sentences):
    self.order = order
    self.alpha = alpha
    if order > 1:
        self.backoff = LangModel(order - 1, alpha, sentences)
        self.lexicon = None
    else:
        self.backoff = None
        self.n = 0
    self.ngramFD = nltk.FreqDist()
    lexicon = set()
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        wordNGrams = nltk.ngrams(words, order)
        for wordNGram in wordNGrams:
            self.ngramFD[wordNGram] += 1
            # self.ngramFD.inc(wordNGram)
            if order == 1:
                lexicon.add(wordNGram)
                self.n += 1
    self.v = len(lexicon)
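
A usage sketch building a bigram model over two toy sentences (assuming the NLTK tokenizer data used by word_tokenize has been downloaded):

sentences = ["the cat sat on the mat.", "the cat ate the fish."]
lm = LangModel(2, 0.4, sentences)
print(lm.ngramFD.most_common(3))      # e.g. [(('the', 'cat'), 2), ...]
print(lm.backoff.v)                   # vocabulary size of the unigram back-off model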

Example 14: build_vocab

# Required import: import nltk
# Or: from nltk import FreqDist
import itertools
import pickle

def build_vocab(docs, save_path):
    print('Building vocab ...')

    sents = itertools.chain(*[text.split('<sssss>') for text in docs])
    tokenized_sents = [sent.split() for sent in sents]

    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sents))
    print("%d unique words found" % len(word_freq.items()))

    # Cut-off
    retained_words = [w for (w, f) in word_freq.items() if f > WORD_CUT_OFF]
    print("%d words retained" % len(retained_words))

    # Get the most common words and build index_to_word and word_to_index vectors
    # Word index starts from 2, 1 is reserved for UNK, 0 is reserved for padding
    word_to_index = {'PAD': 0, 'UNK': 1}
    for i, w in enumerate(retained_words):
        word_to_index[w] = i + 2
    index_to_word = {i: w for (w, i) in word_to_index.items()}
    print("Vocabulary size = %d" % len(word_to_index))

    with open('{}-w2i.pkl'.format(save_path), 'wb') as f:
        pickle.dump(word_to_index, f)
    with open('{}-i2w.pkl'.format(save_path), 'wb') as f:
        pickle.dump(index_to_word, f)

    return word_to_index
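
A usage sketch with two toy documents; WORD_CUT_OFF is a module-level constant in the original code, so the value below is only an assumption, and the save path is hypothetical (two pickle files are written next to it):

WORD_CUT_OFF = 1
docs = ["the cat sat <sssss> the cat ate",
        "a dog barked <sssss> the dog slept"]
word_to_index = build_vocab(docs, save_path='/tmp/toy-vocab')
print(word_to_index)                  # only 'the', 'cat' and 'dog' survive the cut-off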

Example 15: get_min_count

# Required import: import nltk
# Or: from nltk import FreqDist
def get_min_count(sents):
    '''
    Args:
      sents: A list of lists. E.g., [["I", "am", "a", "boy", "."], ["You", "are", "a", "girl", "."]]

    Returns:
      min_count: A uint. Should be set as the parameter value of word2vec `min_count`.
    '''
    global vocab_size
    from itertools import chain

    fdist = nltk.FreqDist(chain.from_iterable(sents))
    min_count = fdist.most_common(vocab_size)[-1][1]  # the count of the top-kth word
    return min_count
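
A usage sketch; vocab_size is a module-level global in the original code, so the value here is only an assumption:

import nltk

vocab_size = 2
sents = [["I", "am", "a", "boy", "."], ["You", "are", "a", "girl", "."]]
print(get_min_count(sents))           # 2 (count of the 2nd most common token)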