当前位置: 首页>>代码示例>>Python>>正文


Python collocations.TrigramCollocationFinder类代码示例

本文整理汇总了Python中nltk.collocations.TrigramCollocationFinder的典型用法代码示例。如果您正苦于以下问题:Python TrigramCollocationFinder类的具体用法?Python TrigramCollocationFinder怎么用?Python TrigramCollocationFinder使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了TrigramCollocationFinder类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: ngram_collocation

def ngram_collocation(words, sents, n, support=10, topK=200):
    """Find the best n-gram collocations in ``words``, ranked by PMI.

    ``n == 2`` yields bigram collocations and ``n == 3`` trigram
    collocations.  For ``n >= 4`` the trigram collocations are computed
    first and handed, together with ``sents``, to
    ``NgramCollocationExtender``; the extender's result is returned instead.
    The selected n-grams are passed to ``print_ngrams`` before returning.

    Args:
        words: Token sequence handed to the collocation finder.
        sents: Sentences forwarded to ``NgramCollocationExtender``; only
            used when ``n >= 4``.
        n: Requested n-gram size; must be 2, 3, or at least 4.
        support: Minimum frequency passed to ``apply_freq_filter``.
        topK: Number of n-grams taken from the PMI ranking.

    Returns:
        The result of ``finder.nbest`` for ``n`` of 2 or 3, otherwise the
        result of ``NgramCollocationExtender``.

    Raises:
        ValueError: If ``n`` is not 2, 3, or >= 4.  Previously such values
            fell through to an unbound ``finder`` variable.
    """
    # Validate first so unsupported sizes fail with a clear message instead
    # of an UnboundLocalError further down.
    if not (n == 2 or n == 3 or n >= 4):
        raise ValueError("n must be 2, 3, or >= 4, got %r" % (n,))

    if n == 2:
        finder = BigramCollocationFinder.from_words(words)
        ngram_measures = BigramAssocMeasures()
    else:
        # n == 3 uses trigrams directly; n >= 4 uses them as seeds below.
        finder = TrigramCollocationFinder.from_words(words)
        ngram_measures = TrigramAssocMeasures()

    finder.apply_freq_filter(support)
    # The current collocation measure is PMI.
    pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)

    if n >= 4:
        # NOTE(review): the meaning of ``support/3`` and ``0.3`` is defined
        # by NgramCollocationExtender, which is not visible here -- confirm.
        ext_ngrams = NgramCollocationExtender(pmi_ngrams, sents, support/3, 0.3)
        print_ngrams(ext_ngrams)
        return ext_ngrams

    print_ngrams(pmi_ngrams)
    return pmi_ngrams
开发者ID:chqsark,项目名称:hightext,代码行数:25,代码来源:nlp_module.py

示例2: collocations

def collocations(stream, top_n=10000, min_bigram_freq=50, min_trigram_freq=20):
    """Extract text collocations (bigrams and trigrams), from a stream of words.

    Parameters
    ----------
    stream: iterable object
        An iterable of words

    top_n: int
        Maximum number of bigrams and of trigrams to retrieve from the stream
        of words, taken from the top of the association-score ranking
        (chi-square for trigrams, PMI for bigrams). Default is 10000

    min_bigram_freq: int
        Minimum frequency of a bigram in order to retrieve it. Default is 50.

    min_trigram_freq: int
        Minimum frequency of a trigram in order to retrieve it. Default is 20.

    Returns
    -------
    (bigrams_patterns, trigrams_patterns): tuple of compiled regular expressions
        Each pattern is a single capturing group that alternates over the
        space-joined n-grams, matched literally. When no n-gram survives the
        filters the pattern is ``()``, which matches the empty string.

    """
    tcf = TrigramCollocationFinder.from_words(stream)

    tcf.apply_freq_filter(min_trigram_freq)
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]
    logging.info("%i trigrams found: %s...", len(trigrams), trigrams[:20])

    # The bigram finder reuses the counts gathered by the trigram finder, so
    # the stream is only read once; it gets its own frequency filter.
    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_bigram_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
    logging.info("%i bigrams found: %s...", len(bigrams), bigrams[:20])

    # Escape every n-gram: tokens may contain regex metacharacters ("c++",
    # "(", "?") that would otherwise break or change the pattern.
    bigrams_patterns = re.compile(
        '(%s)' % '|'.join(re.escape(gram) for gram in bigrams), re.UNICODE)
    trigrams_patterns = re.compile(
        '(%s)' % '|'.join(re.escape(gram) for gram in trigrams), re.UNICODE)

    return bigrams_patterns, trigrams_patterns
开发者ID:lewismc,项目名称:topik,代码行数:33,代码来源:tokenizers.py

示例3: trigramFeats

def trigramFeats(thesewords, n=100):
    """Build a ``{feature: True}`` dict from word triples and their collocations.

    The input is consumed three items at a time; each group is joined with
    single spaces into one string (missing trailing items become empty
    strings).  Those strings are the "words" given to the trigram finder.
    ``n`` serves both as the minimum frequency and as the number of
    collocations requested, ranked by likelihood ratio.

    Returns a dict whose keys are the joined triples followed by the
    selected collocation tuples, each mapped to ``True``.
    """
    token_iter = iter(thesewords)
    chunks = []
    for head in token_iter:
        # Pull the next two items from the same iterator to form a triple.
        second = next(token_iter, '')
        third = next(token_iter, '')
        chunks.append(head + " " + second + " " + third)

    finder = TrigramCollocationFinder.from_words(chunks)
    finder.apply_freq_filter(n)
    best = finder.nbest(TrigramAssocMeasures.likelihood_ratio, n)

    return dict.fromkeys(itertools.chain(chunks, best), True)
开发者ID:levidehaan,项目名称:securitynowbigdataproject,代码行数:7,代码来源:frameparser.py

示例4: get_frequencies

    def get_frequencies(self, desc):
        """Print and return the most prominent words, bigrams and trigrams.

        ``desc`` is tokenized with ``word_tokenize``.  Tokens shorter than
        three characters or listed as English stop words are treated as
        noise: they are left out of the single-word counts, and any bigram
        or trigram containing one is discarded.

        Args:
            desc: Text to analyse.

        Returns:
            A list ``[single, bigrm, trigrm]`` holding the 20 most common
            words with their counts, up to 15 bigrams and up to 10 trigrams
            (both ranked by likelihood ratio).
        """
        stopset = set(stopwords.words('english'))

        def filter_stops(w):
            return len(w) < 3 or w in stopset

        words = word_tokenize(desc)

        # Single-argument print() calls behave the same on Python 2 and 3.
        print('------gram--------')
        words_to_count = [word for word in words if not filter_stops(word)]
        single = Counter(words_to_count).most_common(20)
        print(single)

        print('------bigram--------')
        bcf = BigramCollocationFinder.from_words(words)
        bcf.apply_word_filter(filter_stops)
        bigrm = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 15)
        print(bigrm)

        print('------trigram--------')
        tcf = TrigramCollocationFinder.from_words(words)
        tcf.apply_word_filter(filter_stops)
        tcf.apply_freq_filter(3)  # drop trigrams seen fewer than 3 times
        trigrm = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
        print(trigrm)

        return [single, bigrm, trigrm]
开发者ID:amac441,项目名称:Metten,代码行数:28,代码来源:grapher.py

示例5: trigram

def trigram(words, score_fn=TrigramAssocMeasures.likelihood_ratio, n=1500, freq=1):
    """Return the top-scoring trigram collocations of ``words`` as a dict.

    Args:
        words: Sized collection of hashable string tokens.
        score_fn: Association measure used to rank trigrams (likelihood
            ratio by default).
        n: Number of trigrams taken from the ranking.
        freq: Minimum frequency passed to ``apply_freq_filter``.

    Returns:
        A dict keyed by the three tokens of each selected trigram
        concatenated without a separator; the value counts how many
        selected trigrams produced that key.  Empty when ``words`` is empty
        or contains fewer than three distinct tokens.
    """
    if len(words) == 0:
        return {}

    # Fewer than three distinct tokens: nothing useful to score.
    if len(set(words)) < 3:
        return {}

    # Collect trigram statistics, drop rare trigrams, keep the n best.
    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigram_finder.apply_freq_filter(freq)
    trigrams = trigram_finder.nbest(score_fn, n)

    res = {}
    for first, second, third in trigrams:
        key = first + second + third
        # dict.get replaces the Python-2-only has_key() lookup.
        res[key] = res.get(key, 0) + 1

    return res
开发者ID:cysjtu,项目名称:SentimentAnalysis,代码行数:34,代码来源:nlp_machine2222.py

示例6: best_ngrams

def best_ngrams(words, top_n=10, min_freq=5):
    """
    Extract `top_n` most salient collocations (bigrams and trigrams),
    from a stream of words. Ignore collocations with frequency
    lower than `min_freq`.

    This fnc uses NLTK for the collocation detection itself -- not very scalable!

    Return the detected ngrams as compiled regular expressions, for their faster
    detection later on: a tuple `(pat_gram2, pat_gram3)` for bigrams and
    trigrams. Each pattern is one capturing group alternating over the
    space-joined ngrams, matched literally. The bigram pattern is also
    printed to stdout.

    """
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(min_freq)
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]
    logging.info("%i trigrams found: %s...", len(trigrams), trigrams[:20])

    # Reuse the counts already gathered by the trigram finder.
    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
    logging.info("%i bigrams found: %s...", len(bigrams), bigrams[:20])

    # Escape each ngram so tokens containing regex metacharacters ("c++",
    # "(", "?") are matched literally instead of corrupting the pattern.
    pat_gram2 = re.compile(
        '(%s)' % '|'.join(re.escape(gram) for gram in bigrams), re.UNICODE)
    pat_gram3 = re.compile(
        '(%s)' % '|'.join(re.escape(gram) for gram in trigrams), re.UNICODE)

    # Single-argument print() works on both Python 2 and Python 3.
    print(pat_gram2)

    return pat_gram2, pat_gram3
开发者ID:ninamiriamjnana,项目名称:topic,代码行数:28,代码来源:get_data.py

示例7: create_tri_collocations

def create_tri_collocations(features_words,document_preprocess):
    """Return preprocessed trigram collocations of the movie-review words.

    The 1000 best trigrams by PMI (minimum frequency 3) are taken from
    ``movie_reviews.words()``.  Each callable in ``document_preprocess`` is
    then applied in turn to every word of every trigram; a trigram is kept
    only while all three transformed words are truthy.

    ``features_words`` is not used by this function.  ``movie_reviews`` and
    ``trigram_measures`` are names resolved outside this function.
    """
    finder = TrigramCollocationFinder.from_words(movie_reviews.words())
    finder.apply_freq_filter(3)
    tricoll = finder.nbest(trigram_measures.pmi,1000)

    for transform in document_preprocess:
        kept = []
        for first, second, third in tricoll:
            # Skip trigrams where any transformed word comes back falsy.
            if not (transform(first) and transform(second) and transform(third)):
                continue
            kept.append((transform(first), transform(second), transform(third)))
        tricoll = kept

    return tricoll
开发者ID:katomaso,项目名称:LiU-TextMining,代码行数:7,代码来源:Lab5-2.py

示例8: __init__

    def __init__(self, words, sentences, language):
        """Pre-compute word, sentence and collocation statistics for a text.

        Args:
            words: Sequence of word tokens.
            sentences: Sequence of sentences (see the review note at the end
                of this method about the expected element type).
            language: Language name passed to ``stopwords.words``.

        Raises:
            ZeroDivisionError: If ``words`` or ``sentences`` is empty.
        """
        self.num_words = len(words)
        self.unique_words = len(set(words))
        self.num_sentences = len(sentences)
        self.average_sentence_length = round(self.num_words / self.num_sentences)
        self.lexical_diversity = round(self.num_words / self.unique_words)

        fdist = FreqDist(words)
        # A set gives constant-time membership tests; the list returned by
        # stopwords.words() would be scanned once per token.
        stop_words = set(stopwords.words(language))
        not_stopwords = [w for w in words if w not in stop_words]
        fdist2 = FreqDist(not_stopwords)
        self.fifty_first_words = fdist.most_common(50)
        self.hundreds_nsw = fdist2.most_common(300)

        # 50 best bigrams by PMI among those seen at least 10 times.
        bigram_measures = BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(words)
        finder.apply_freq_filter(10)
        self.fifty_collocations = finder.nbest(bigram_measures.pmi, 50)

        # 50 best trigrams by PMI among those seen at least 10 times.
        trigram_measures = TrigramAssocMeasures()
        finder3 = TrigramCollocationFinder.from_words(words)
        finder3.apply_freq_filter(10)
        self.fifty_collocations3 = finder3.nbest(trigram_measures.pmi, 50)

        # NOTE(review): ``sent.lower()`` requires each sentence to be a
        # string, whereas ``' '.join(sent)`` only makes sense for a list of
        # tokens (on a string it puts a space between every character).
        # Confirm which type callers pass before changing this line.
        self.stcs_width_words = [' '.join(sent) for sent in sentences
                                 if "malheureusement" in sent.lower()]
开发者ID:Raveline,项目名称:journal-imaginaire,代码行数:26,代码来源:analyst.py

示例9: extract_trigrams

 def extract_trigrams(self, sent):
    """Return ``{trigram: True}`` for the known trigrams found in ``sent``.

    ``sent`` is first run through ``self._preprocess_sent``.  Up to 10000
    trigrams ranked by PMI are joined with single spaces, and only those
    also present in ``self._trigrams_set`` are kept.
    """
    tokens = self._preprocess_sent(sent)
    measures = TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(tokens)
    candidates = {' '.join(gram) for gram in finder.nbest(measures.pmi, 10000)}
    # Keep only the trigrams this extractor already knows about.
    known = candidates & self._trigrams_set
    return dict.fromkeys(known, True)
开发者ID:aginiewicz,项目名称:EmoClassifier,代码行数:8,代码来源:feature_extraction.py

示例10: set_trigramas

	def set_trigramas(self, freq=2, best=20):
		"""Store the best trigram collocations of ``self.palavras``.

		Trigrams containing a token shorter than three characters or a
		Portuguese stop word are discarded, as are trigrams seen fewer than
		``freq`` times.  The ``best`` highest-PMI survivors are assigned to
		``self.trigramas``.
		"""
		finder = TrigramCollocationFinder.from_words(self.palavras)
		stop_words = set(stopwords.words('portuguese'))

		def is_noise(token):
			return len(token) < 3 or token in stop_words

		finder.apply_word_filter(is_noise)
		finder.apply_freq_filter(freq)
		self.trigramas = finder.nbest(TrigramAssocMeasures.pmi, best)
开发者ID:eric011,项目名称:SemantikaCrawler,代码行数:8,代码来源:lingprocessador.py

示例11: calc_trigrams

def calc_trigrams(text, min_freq=50):
	"""Append the trigram frequencies of ``text`` to ``trigram_list``.

	Every item of ``text`` is lower-cased, trigrams seen fewer than
	``min_freq`` times are dropped, and the remaining (trigram, count)
	items are appended as one entry to ``trigram_list``, which is then
	returned.  ``trigram_list`` is not defined in this function and must
	exist in an enclosing scope.
	"""
	lowered = []
	for token in text:
		lowered.append(token.lower())

	finder = TrigramCollocationFinder.from_words(lowered)
	finder.apply_freq_filter(min_freq)
	counts = finder.ngram_fd.items()

	trigram_list.append(counts)
	return trigram_list
开发者ID:djkn0x,项目名称:GA_homework,代码行数:8,代码来源:trigrams.py

示例12: trigram_word_feats

def trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq, n=50):
    """Build a ``{feature: True}`` dict from words and their best trigrams.

    Args:
        words: Sequence of tokens.
        score_fn: Association measure used to rank trigrams (chi-square by
            default).
        n: Number of trigrams taken from the ranking.

    Returns:
        A dict mapping every word, followed by every selected trigram
        tuple, to ``True``.  If scoring fails, a message is printed and
        only the words are used as features.
    """
    trigram_finder = TrigramCollocationFinder.from_words(words)
    try:
        trigrams = trigram_finder.nbest(score_fn, n)
    except Exception:
        # Best-effort fallback.  ``except Exception`` (rather than a bare
        # ``except``) lets KeyboardInterrupt and SystemExit propagate.
        # The %-formatted single argument prints identically on Python 2
        # and Python 3.
        print("lost trigrams %s" % (words,))
        return dict.fromkeys(words, True)

    return dict.fromkeys(itertools.chain(words, trigrams), True)
开发者ID:zhouxiaofan,项目名称:Projects,代码行数:9,代码来源:sentimentMapper.py

示例13: getTrigrams

    def getTrigrams(self):
        """Return up to six trigram collocations of ``self.text``.

        The text is tokenized and lower-cased.  Trigrams containing a token
        shorter than three characters or an English stop word are
        discarded, and the rest are ranked by likelihood ratio.
        """
        tokens = [token.lower() for token in nltk.word_tokenize(self.text)]
        finder = TrigramCollocationFinder.from_words(tokens)
        english_stops = set(stopwords.words('english'))

        def is_noise(token):
            return len(token) < 3 or token in english_stops

        finder.apply_word_filter(is_noise)
        finder.apply_freq_filter(1)
        best = finder.nbest(TrigramAssocMeasures.likelihood_ratio, 6)
        return best
开发者ID:DevilDante88,项目名称:MyCogs,代码行数:9,代码来源:postagger.py

示例14: getTrigram

def getTrigram(haystack):
    """Return up to four trigram collocations found in ``haystack``.

    The text is split with ``WordPunctTokenizer``.  Trigrams containing a
    token shorter than three characters or an English stop word are
    discarded, and the rest are ranked by likelihood ratio.
    """
    tokens = WordPunctTokenizer().tokenize(haystack)
    finder = TrigramCollocationFinder.from_words(tokens)
    english_stops = set(stopwords.words('english'))

    def is_noise(token):
        return len(token) < 3 or token in english_stops

    finder.apply_word_filter(is_noise)
    return finder.nbest(TrigramAssocMeasures.likelihood_ratio, 4)
开发者ID:blorenz,项目名称:cms,代码行数:9,代码来源:seo.py

示例15: best_n_trigrams

    def best_n_trigrams(self, n, method="pmi"):
        """Return the ``n`` best trigrams of ``self.get_word_lst()``.

        ``method`` selects the ranking: ``"pmi"`` or ``"raw_freq"``.  Any
        other value makes the method return ``None``.
        """
        measures = TrigramAssocMeasures()
        word_list = self.get_word_lst()
        collocation_finder = TrigramCollocationFinder.from_words(word_list)

        if method == "pmi":
            score_fn = measures.pmi
        elif method == "raw_freq":
            score_fn = measures.raw_freq
        else:
            # Unsupported ranking name: no result.
            return None

        return collocation_finder.nbest(score_fn, n)
开发者ID:WilliamHammond,项目名称:fbcanalyzer,代码行数:9,代码来源:ChatStream.py


注:本文中的nltk.collocations.TrigramCollocationFinder类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。