

Python FreqDist.inc Method Code Examples

This article collects typical usage examples of the Python method nltk.FreqDist.inc. If you are unsure what FreqDist.inc does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of the containing class, nltk.FreqDist.


The following presents 15 code examples of the FreqDist.inc method, ordered by popularity by default.
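Before turning to the examples, here is a minimal sketch of the basic pattern, assuming NLTK 2.x (where FreqDist.inc still exists). Note that inc() was removed in NLTK 3.0, where FreqDist became a subclass of collections.Counter; the modern equivalents are item assignment, update(), or constructing the FreqDist directly from an iterable. The word list below is made up purely for illustration.

from nltk import FreqDist

words = ['the', 'cat', 'sat', 'on', 'the', 'mat']

# NLTK 2.x style, as used throughout the examples below
fd = FreqDist()
for w in words:
    fd.inc(w)             # increment the count for w by 1
# fd.inc(w, count=5)      # inc() also accepts an explicit count

# NLTK 3.x equivalent (inc() no longer exists)
fd3 = FreqDist()
for w in words:
    fd3[w] += 1
# or simply: fd3 = FreqDist(words), or fd3.update(words)

print fd['the']           # -> 2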

Example 1: category_by_pos

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])

    # In NLTK 2, FreqDist.keys() returns samples sorted by decreasing
    # frequency, so this picks the 100 most common suffixes.
    common_suffixes = suffix_fdist.keys()[:100]
#    print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = DecisionTreeClassifier.train(train_set)
#    print 'Decision Tree %f' % classify.accuracy(classifier, test_set)

    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBayes %f' % classify.accuracy(classifier, test_set)
Developer: brenden17 | Project: infinity | Lines: 34 | Source: category_nltk.py

Example 2: process

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def process(f, return_tokens=True, return_freqdist=True):
    """
    Function to process deals data.
    Splits text into sentences. FreqDist is incremented from tokenization.
    Using PunktWordTokenizer, since it is a decent regexp-based tokenizer.
    Deals are also about domain names. Not intending to split it up

    :rtype : (FreqDist, list of str, list of list of str) -- the frequency
        distribution, the raw sentences, and the filtered tokens per sentence
    :param f: Input file with a deal per line
    """
    fd = FreqDist()
    tokens = []
    fh = open(f, 'r')
    sentences = [line.strip() for line in fh.readlines()]
    # build the stopword and punctuation sets (and the tokenizer) once,
    # rather than re-creating them for every token
    stop_words = set(stopwords.words('english'))
    punct = set(string.punctuation)
    tokenizer = PunktWordTokenizer()
    for line in sentences:
        t = []
        for word in tokenizer.tokenize(line.lower()):
            if word not in stop_words and word not in punct:
                if return_tokens:
                    t.append(word)
                if return_freqdist:
                    fd.inc(word)
        tokens.append(t)
    fh.close()
    return fd, sentences, tokens
Developer: ypandit | Project: exercises | Lines: 27 | Source: task1.py
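A hypothetical call to process() might look like the following; the file name deals.txt is a placeholder, and the snippet assumes NLTK 2.x, where PunktWordTokenizer and the frequency-sorted FreqDist.keys() are still available.

# hypothetical usage: 'deals.txt' is a made-up file with one deal per line
fd, sentences, tokens = process('deals.txt')

print fd.N()            # total number of counted tokens
print fd.keys()[:10]    # ten most frequent tokens (NLTK 2 sorts keys by frequency)
print sentences[0]
print tokens[0]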

Example 3: dotranslate

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def dotranslate(sent, parser, tdop):
	# todo: tokenize sentence by maximizing unigram probabilities
	# in training corpus, to detect multiword units
	sent = sent.split()

	# parse sentence with bitpar, gives an n-best list
	try:
		parsetrees1 = list(parser.nbest_parse(sent))
	except Exception as e:
		parsetrees1 = []
		print "parsing failed", e
		return (), {}

	# undo binarization and auxiliary POS tags introduced to accommodate bitpar:
	parsetrees = FreqDist()
	for tree in parsetrees1:
		tree.un_chomsky_normal_form()
		parsetrees.inc(removeforcepos(tree).freeze(), count=tree.prob())

	# for each parsetree, get a list of translations
	resultfd = {}
	for m, tree in enumerate(parsetrees):
		print "parse tree", tree
		for nn, (result, prob) in enumerate(
			tdop.get_mlt_deriv_multi(tree, smoothing=True, verbose=False)):
			if not result: continue
			key = (undecorate_with_ids(result).freeze(),
				sum(1 if "@" in a.node else 0 for a in result.subtrees()))
			resultfd[key] = resultfd.get(key, 0.0) + prob
	return parsetrees, resultfd
Developer: andreasvc | Project: dop-transformations | Lines: 32 | Source: compsem.py

Example 4: __init__

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
class Index:
    """
    The Index class stores an index for a document.
    """
    def __init__(self):
        self._freq_dist = None
        self._document = None

    def index(self, document):
        self._document = document
        if self._freq_dist is None:
            self._freq_dist = FreqDist()
            for term in self.terms():
                self._freq_dist.inc(term)

    def reset(self):
        "Reset the index"
        self._freq_dist = None

    def freq_dist(self):
        if self._freq_dist is None:
            # index() requires the document; re-index the stored one
            self.index(self._document)
        return self._freq_dist

    # return the number of times a term appears in this document
    def freq(self, term):
        if not self._freq_dist:
            self.index(self._document)
        return self._freq_dist[term]

    def tf(self, term):
        if not self._freq_dist:
            self.index(self._document)
        return float(self._freq_dist[term]) / float(self._freq_dist.N())
Developer: jgerrish | Project: nltk_ext | Lines: 36 | Source: index.py
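The class above depends on a terms() helper that is not shown in this snippet. As a rough, self-contained sketch of the same pattern (count terms once, cache the FreqDist, then answer freq and tf queries from it), one could write something like the following. It is a simplification for illustration, not the nltk_ext API.

from nltk import FreqDist

class SimpleIndex(object):
    """Minimal stand-in: index a list of tokens, then serve freq/tf queries."""
    def __init__(self):
        self._freq_dist = None

    def index(self, tokens):
        self._freq_dist = FreqDist()
        for term in tokens:
            self._freq_dist.inc(term)   # NLTK 2.x; on NLTK 3 use self._freq_dist[term] += 1

    def freq(self, term):
        return self._freq_dist[term]

    def tf(self, term):
        return float(self._freq_dist[term]) / float(self._freq_dist.N())

idx = SimpleIndex()
idx.index(['to', 'be', 'or', 'not', 'to', 'be'])
print idx.freq('to'), idx.tf('be')      # -> 2 0.333...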

Example 5: proto

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
    def proto(self, num, language, authors, token_vocab, token_df, lemma_vocab,
              pos_vocab, synset_vocab, stemmer):
        d = Document()
        assert language == self.lang

        if self._id:
            d.id = self._id
        else:
            d.id = num

        d.language = language
        d.title = self.title.strip()
        num_sentences = max(self._sentences) + 1

        tf_token = FreqDist()
        for ii in self.tokens():
            tf_token.inc(ii)

        for ii in xrange(num_sentences):
            s = d.sentences.add()
            for jj in self._sentences[ii]:
                w = s.words.add()
                w.token = token_vocab[jj.word]
                w.lemma = lemma_vocab[jj.lemma]
                w.pos = pos_vocab[jj.pos]
                w.relation = pos_vocab[jj.rel]
                w.parent = jj.parent
                w.offset = jj.offset
                w.tfidf = token_df.compute_tfidf(jj.word,
                                                 tf_token.freq(jj.word))
        return d
Developer: NetBUG | Project: topicmod | Lines: 33 | Source: wacky.py

Example 6: word_fdist_single

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def word_fdist_single(address, exclude=excludes(), corpus=inaugural):
	fd = FreqDist()
	
	for word in corpus.words(address):
		if not word.lower() in exclude:
			fd.inc(word.lower())
	
	return fd
Developer: c-w | Project: ug2_NaturalLanguage | Lines: 10 | Source: tts.py

Example 7: sent_length_fdist_single

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def sent_length_fdist_single(address, exclude=excludePuncts(), corpus=inaugural):
	fd = FreqDist()
	
	for sent in corpus.sents(address):
		nopunct_sent = [word for word in sent if not word in exclude]
		fd.inc(len(nopunct_sent))
	
	return fd
Developer: c-w | Project: ug2_NaturalLanguage | Lines: 10 | Source: tts.py

Example 8: sent_length_fdist

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def sent_length_fdist(address_list, exclude=excludePuncts(), corpus=inaugural):
	total_fd = FreqDist()
	
	for address in address_list:
		fd = sent_length_fdist_single(address, exclude, corpus)
		for length in fd.keys():
			total_fd.inc(length, fd[length])
	
	return total_fd
Developer: c-w | Project: ug2_NaturalLanguage | Lines: 11 | Source: tts.py

Example 9: content_FreqDist_generator

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def content_FreqDist_generator(articles_list):
    # get the FreqDist of all articles
    all_fdist = FreqDist()
    for article in articles_list:
        for item in article.content_freqDist().iteritems():
            key = item[0]
            value = item[1]
            all_fdist.inc(key, value)
    return all_fdist
Developer: LiuyinC | Project: MDLab | Lines: 11 | Source: Project_1_1.py

Example 10: word_fdist

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def word_fdist(address_list, exclude=excludes(), corpus=inaugural):
	total_fd = FreqDist()
	
	for address in address_list:
		fd = word_fdist_single(address, exclude, corpus)
		for word in fd.keys():
			total_fd.inc(word, fd[word])
	
	return total_fd
Developer: c-w | Project: ug2_NaturalLanguage | Lines: 11 | Source: tts.py
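Examples 8 and 10 above merge per-address distributions into a total by looping over keys and calling inc() with an explicit count. On NLTK 3, where FreqDist is a collections.Counter subclass, the same merge can be written more directly; a brief sketch, with made-up word lists:

from nltk import FreqDist

fd_a = FreqDist(['war', 'peace', 'war'])
fd_b = FreqDist(['peace', 'liberty'])

# NLTK 2.x style, as in the examples above
total = FreqDist()
for fd in (fd_a, fd_b):
    for sample in fd.keys():
        total.inc(sample, fd[sample])

# NLTK 3.x: update() adds counts in place, and fd_a + fd_b builds a new merged distribution
total3 = FreqDist()
total3.update(fd_a)
total3.update(fd_b)
# total3 == fd_a + fd_b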

Example 11: __extract_level_words

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
    def __extract_level_words(self, levels_db, level, values):
        words_number_per_value = self.__configuration_map["most_frequent_words_number_per_value"]
        most_freq_words = {}
        for value in values:
            fdist = FreqDist()
            for word_dist in levels_db[level][value]:
                fdist.inc(word_dist[0], count = word_dist[1])

            most_freq_words[value] = fdist.items()[:words_number_per_value]
        return most_freq_words
Developer: ssteku | Project: NLPRelatedPhenomenon | Lines: 12 | Source: MostFrequentWordsExtractor.py

Example 12: kneser_ney

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
    def kneser_ney(self, context, word):
        """
        Return the log probability of a word given a context given
        Kneser Ney backoff
        """

        bgram = (context, word)
        unigram_freq = FreqDist()

        theta = self._kn_concentration
        # use float division: under Python 2, 1 / len(...) would truncate to 0
        vocabulary = 1.0 / len(self._vocab_freq)
        discount_delta = self._kn_discount
        unigram_T = len(self._context_freq.keys())
        bigram_T = self._context_freq[context]

        for i in self._gram_freq:
            unigram_freq.inc(i[1])

        # Unigram Restaurant
        # C_0,x
        count_unirest_wordTable = unigram_freq[word]
        # C_0,.
        count_unirest_allTable = unigram_freq.N()

        # u_Bigram Restaurant
        # C_u,x
        count_birest_wordTable = self._gram_freq[bgram]

        # C_u,.
        count_birest_allTable = self._context_freq[context]

        existingTable_numer = count_birest_wordTable - discount_delta
        existingTable_denom = theta + count_birest_allTable
        existingTable = existingTable_numer / existingTable_denom

        if existingTable < 0:
            existingTable = 0

        newTable_numer = theta + (bigram_T * discount_delta)
        newTable_denom = theta + count_birest_allTable
        newTable = newTable_numer / newTable_denom

        back_a_numer = count_unirest_wordTable - discount_delta
        back_a_denom = count_unirest_allTable + theta
        back_a = back_a_numer / back_a_denom
        if back_a < 0:
            back_a = 0

        back_b_numer = theta + (unigram_T * discount_delta)
        back_b_denom = count_unirest_allTable + theta
        back_b = back_b_numer / back_b_denom
        back_b = back_b * vocabulary

        result = existingTable + (newTable * (back_a + back_b))
        return lg(result)
Developer: ReedAnders | Project: StatisticalNLP | Lines: 57 | Source: language_model.py

Example 13: __getTimelineFeatures

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
    def __getTimelineFeatures(self, timeline):
        logger.info(u"Get timeline features")
        tweets = []
        self.__changePhase(PHASE["GET_TIMELINE_URLS"])
        for t in timeline:
            try:
                tweet = TweetText(t, self.__urlBuilder, self.__userBuilder)
            except:
                logger.exception(u"Error: \"" + unicode(t) + u"\"")
                raise ValueError(t)
            logger.debug(u"Tweet:" + unicode(tweet))
            tweets.append(tweet)

        urls = []
        ti = 0
        for tweet in tweets:
            for url in tweet.urls():
                self.__breakIfStopped()
                self.__urlResolver.addUrlToQueue(url)
                urls.append(url)
            logger.info(u"Tweet:" + unicode(tweet))
            ti += 1
            self.__proc = 100 * float(ti) / float(len(tweets))

        # Categories
        self.__changePhase(PHASE["GET_TIMELINE_FEATURES"])
        url2labels = {}
        ui = 0
        for url in urls:
            self.__breakIfStopped()
            if not url.isError():
                logger.debug(u"Classify " + unicode(url.getUrl()))
                url2labels[url.getExpandedUrl()] = self._classifier().classify(url.getText())
            ui += 1
            self.__proc = 100 * float(ui) / float(len(urls))

        labelsFreq = FreqDist()
        for labels in url2labels.values():
            for label in labels:
                labelsFreq.inc(label)
        self.__catFreq = labelsFreq.items()
        logger.info(u"Categories: "  + unicode(labelsFreq.items()))
        labelsFreqValues = [(item[0], item[1]) for item in labelsFreq.items() if item[0] not in ['short', 'medium', 'long']]
        # normalization
        labelsFreqValues = {label: float(freq) / float(max([f for l,f in labelsFreqValues])) for label, freq in labelsFreqValues}
        logger.info(u"Category factors: "  + unicode(labelsFreqValues))

        # Languages
        langFreq = FreqDist()
        for u in urls:
            langFreq.inc(u.lang())
        self.__langFreq = langFreq.items()
        logger.info(u"Languages: " + unicode(langFreq.items()))

        return labelsFreqValues
Developer: soldierkam | Project: pynews | Lines: 57 | Source: user_tools.py

Example 14: train_supervised

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
    def train_supervised(self, labelled_sequences, **kwargs):
        """
        Supervised training maximising the joint probability of the symbol and
        state sequences. This is done via collecting frequencies of
        transitions between states, symbol observations while within each
        state and which states start a sentence. These frequency distributions
        are then normalised into probability estimates, which can be
        smoothed if desired.

        @return: the trained model
        @rtype: HiddenMarkovModelTagger
        @param labelled_sequences: the training data, a set of
            labelled sequences of observations
        @type labelled_sequences: list
        @param kwargs: may include an 'estimator' parameter, a function taking
            a C{FreqDist} and a number of bins and returning a C{ProbDistI};
            otherwise a MLE estimate is used
        """

        # default to the MLE estimate
        estimator = kwargs.get('estimator')
        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # count occurrences of starting states, transitions out of each state
        # and output symbols observed in each state
        starting = FreqDist()
        transitions = ConditionalFreqDist()
        outputs = ConditionalFreqDist()
        for sequence in labelled_sequences:
            lasts = None
            for token in sequence:
                state = token[_TAG]
                symbol = token[_TEXT]
                if lasts is None:
                    starting.inc(state)
                else:
                    transitions[lasts].inc(state)
                outputs[state].inc(symbol)
                lasts = state

                # update the state and symbol lists
                if state not in self._states:
                    self._states.append(state)
                if symbol not in self._symbols:
                    self._symbols.append(symbol)

        # create probability distributions (with smoothing)
        N = len(self._states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, False, N)
        B = ConditionalProbDist(outputs, estimator, False, len(self._symbols))
                               
        return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
Developer: DrDub | Project: icsisumm | Lines: 56 | Source: hmm.py
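The method above mirrors NLTK's own HiddenMarkovModelTrainer.train_supervised. For reference, a rough sketch of driving supervised HMM training through the public trainer class; it assumes the treebank sample corpus has been downloaded and uses it purely as an illustration.

from nltk.tag import hmm
from nltk.corpus import treebank   # assumes nltk.download('treebank') has been run

# labelled sequences are lists of (word, tag) sentences
tagged_sents = treebank.tagged_sents()

trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(tagged_sents[100:])   # MLE estimates by default

print tagger.tag(['The', 'dog', 'barked', '.'])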

Example 15: handle

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
    def handle(self, *args, **options):
        fdist = FreqDist()
        print "Analyzing raw data"
        limit = 10
        if args:
            raw_datas = RawData.objects.filter(pk__in=args)
        else:
            raw_datas = RawData.objects.all()[:limit]
        tagged_data = []
        for raw_data in raw_datas:
            words = nltk.word_tokenize(raw_data.data)
            tagged_data.extend(nltk.pos_tag(words))
            for word in words:
                word = word.strip()
                if word:
                    fdist.inc(word)

        print "Analyzed %s items" % len(raw_datas)
        print

        print "Top word: %s" % fdist.max()
        print

        print "Top 10 words"
        for word in fdist.keys()[:10]:
            times = fdist[word]
            print " -- %s occurred %s times" % (word, times)
        print

        print "Bottom 10 words"
        for word in fdist.keys()[-10:]:
            times = fdist[word]
            print " -- %s occurred %s times" % (word, times)
        print

        print "Words occurring between 50-100 times"
        words = [word for word in fdist.keys() if fdist[word] >= 50 and fdist[word] <= 100]
        print ", ".join(words)

        cfdist = ConditionalFreqDist()
        for (word, tag) in tagged_data:
            cfdist[tag].inc(word)

        print "Most popular noun: %s" % cfdist["NN"].max()
        print

        print "Top 50 nouns"
        for word in cfdist["NN"].keys()[:50]:
            times = cfdist["NN"][word]
            print " -- %s occurred %s times" % (word, times)
        print
Developer: jaywhy13 | Project: mapstream | Lines: 55 | Source: analyze.py
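On NLTK 3, where both inc() and the frequency-sorted keys() behaviour used above are gone, the same tallies would be written with item assignment and most_common(). A brief, self-contained sketch (it assumes the punkt and pos-tagger models have been downloaded):

import nltk
from nltk import FreqDist, ConditionalFreqDist

# assumes nltk.download('punkt') and nltk.download('averaged_perceptron_tagger')
words = nltk.word_tokenize("The quick brown fox jumps over the lazy dog .")
tagged = nltk.pos_tag(words)

fdist = FreqDist()
cfdist = ConditionalFreqDist()
for word, tag in tagged:
    fdist[word] += 1          # replaces fdist.inc(word)
    cfdist[tag][word] += 1    # replaces cfdist[tag].inc(word)

print(fdist.most_common(10))          # replaces fdist.keys()[:10]
print(cfdist['NN'].most_common(5))    # most frequent nouns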


Note: The nltk.FreqDist.inc method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by various developers; copyright remains with the original authors, and any redistribution or use should follow the corresponding project's license. Do not reproduce without permission.