

Python nltk.ngrams Function Code Examples

This article collects typical usage examples of the Python nltk.ngrams function. If you are wondering what nltk.ngrams does, how to call it, or what real-world usage looks like, the hand-picked code examples below should help.


Fifteen code examples of the ngrams function are shown below, ordered by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code examples.
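Before diving into the project code, here is a minimal, self-contained sketch (written for this article, not taken from any of the projects below) of what nltk.ngrams does: it slides a window of n items over a sequence and yields the resulting n-tuples, optionally padding the sequence boundaries.

from nltk import ngrams

tokens = "the quick brown fox".split()

# plain bigrams over the token list
print(list(ngrams(tokens, 2)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]

# trigrams with left padding markers at the start of the sequence
print(list(ngrams(tokens, 3, pad_left=True, left_pad_symbol='<s>')))
# [('<s>', '<s>', 'the'), ('<s>', 'the', 'quick'), ('the', 'quick', 'brown'), ('quick', 'brown', 'fox')]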

Example 1: ngrams

 def ngrams(self, ns=[2, 3, 5]):
     # features from suffix n-grams and from joined suffix/POS n-grams such as "suf/NN"
     _p = ["/".join(t) for t in zip(self.SUF, self.POS)]
     for n in ns:
         ngf = {"Ngram(N={})_{}".format(n, "_".join(t)): 1 for t in ngrams(self.SUF, n)}
         ngfp = {"NgramP(N={})_{}".format(n, "_".join(t)): 1 for t in ngrams(_p, n)}
         # update inside the loop so the features for every n are kept,
         # not only those of the last n
         self.features.update(ngf)
         self.features.update(ngfp)
Developer: tuxedocat, Project: precure, Lines: 7, Source: feature_extractor.py

Example 2: update_freqs

 def update_freqs(self, doc_text, id_str):
     for bigram in list(ngrams(doc_text, 2)):
         k = bigram[0] + u"_" + bigram[1]
         self.bicount.update([k])
         self.bigram_to_ids[k] = self.bigram_to_ids.get(k, []) + [id_str]
     for trigram in list(ngrams(doc_text, 3)):
         k = trigram[0] + u"_" + trigram[1] + u"_" + trigram[2]
         self.tricount.update([k])
         self.trigram_to_ids[k] = self.trigram_to_ids.get(k, []) + [id_str]
Developer: jtmurphy89, Project: twitter_challenge, Lines: 9, Source: part1.py

Example 3: get_gram_ratio

def get_gram_ratio(w2v, text1, text2, n_grams_1=1, n_grams_2=1, n_jobs=1):
    # iter_product is presumably itertools.product, and similarity is a project
    # helper that scores one (n-gram, n-gram) pair with the word2vec model
    t1 = list(ngrams(text1.split(), n_grams_1))
    t2 = list(ngrams(text2.split(), n_grams_2))
    pairs = list(iter_product(t1, t2, repeat=1))
    res = list(map(lambda x: similarity(w2v, x), pairs))
    if len(res) == 0:
        return 0
    else:
        return np.mean(res)
Developer: KhaoticMind, Project: kaggle-homedepot, Lines: 9, Source: helper_processing.py

Example 4: ngrams_extract

def ngrams_extract(string):
    # occasionally echo the input for inspection
    if random.random() < SAMPLE_RATE:
        print('[*]', string)
    # character 2- to 5-grams of the string
    l = list
    grams = l(ngrams(string, 2)) + l(ngrams(string, 3)) + l(ngrams(string, 4)) + l(ngrams(string, 5))
    # feature hashing: bucket each n-gram into a fixed-size count vector
    SIZE = 1024
    vec = zeros((SIZE,))
    for t in grams:
        vec[hash(t) % SIZE] += 1
    # log-scale the counts (zeros and log come from numpy)
    return log(vec + 1.0)
Developer: joshsaxe, Project: eXposeDeepNeuralNetwork, Lines: 10, Source: features.py

Example 5: build_ngram

 def build_ngram(source):
     ngram_set = {}
     for key, value in source.items():
         ngram = []
         for line in value:
             if IS_PAD:
                 # note: older NLTK accepted pad_symbol=...; recent versions use
                 # left_pad_symbol / right_pad_symbol instead
                 ngram.extend(nltk.ngrams(line.strip(), NGRAM_LEVEL, pad_left=True, pad_right=True, pad_symbol='SSS'))
             else:
                 ngram.extend(nltk.ngrams(line.strip(), NGRAM_LEVEL))
         ngram_set[key] = ngram
     return ngram_set
Developer: Tiotao, Project: CS3245HW1, Lines: 11, Source: build_test_LM.py

Example 6: read_data

def read_data(type):
    datapath = '../data/' + type + '/'
    data = {}
    maxindex = 500
    count = 0
    unigrams = []
    bigrams = []
    dependecies = []
    for c in string.ascii_uppercase:
        data[c] = {}
        for i in range(1, maxindex):
            filename = datapath + c + str(i)
            txtpath = filename + '.data'
            metapath = filename + '.meta'
            text = read_file(txtpath)

            meta = read_file(metapath)
            if text is not None:
                count += 1
                # print (count)
                data[c][i] = {'text': text[0], 'meta': parse_meta(meta)}
                tokens = nltk.word_tokenize(text[0])

                data[c][i]['tokens'] = tokens
                data[c][i]['length'] = len(tokens)
                s = remove_punct(text[0])
                tokens = nltk.word_tokenize(remove_punct(s.lower()))

                data[c][i]['unigrams'] = list(nltk.ngrams(tokens, 1))
                data[c][i]['bigrams'] = list(nltk.ngrams(tokens, 2))

                # data[c][i]['dependencies'] = dependency_parse(text[0])
                # deppath = filename + '.dep'
                # with open (deppath, 'w') as f:
                #     json.dump(data[c][i]['dependencies'],f)
                # with open (deppath, 'r') as f:
                #     data[c][i]['dependencies'] = json.load(f)


                unigrams.extend(data[c][i]['unigrams'])
                bigrams.extend(data[c][i]['bigrams'])
                # dependecies.extend(data[c][i]['dependencies'])

        data[c]['sequences'] = gen_sequences(data[c])
        data['unigram_model'] = create_model(unigrams, maxfeat=5000, minfreq=3)
        data['bigram_model'] = create_model(bigrams, maxfeat=5000, minfreq=3)
        # data['dependencies'] = create_model(dependecies, maxfeat=5000, minfreq=3)

    # pprint.pprint (data['unigram_model'])
    # pprint.pprint (data['bigram_model'])
    # pprint.pprint (data['dependencies'])

    # print(type, count)
    return data
Developer: patwaria, Project: stance_classification, Lines: 54, Source: stance_classification.py

Example 7: lookup_phrases

def lookup_phrases(sentence, noun_types, ignore_case=False):
    # ngrams() returns a generator in NLTK 3, so materialise each call before
    # concatenating; longer phrases are tried first
    phrases = list(ngrams(sentence, 3)) + list(ngrams(sentence, 2)) + list(ngrams(sentence, 1))
    matches = []
    for phrase in phrases:
        if contains_noun(phrase):
            phrase_str = u' '.join(w.form for w in phrase)
            if ignore_case:
                phrase_str = phrase_str.lower()
            types = noun_types.get(phrase_str)
            if types:
                matches.append((phrase, types))
    return sorted(matches)
Developer: Noahs-ARK, Project: semafor, Lines: 12, Source: markup_sentence.py

Example 8: extract_ngrams

    def extract_ngrams(self, memes):
        for meme_type in memes:
            for meme in memes[meme_type]:
                top_unigrams = meme[0]
                bottom_unigrams = meme[1]
                all_unigrams = top_unigrams + bottom_unigrams

                # materialise the generators so the lists can be concatenated
                top_bigrams = list(ngrams(meme[0], 2))
                bottom_bigrams = list(ngrams(meme[1], 2))
                all_bigrams = top_bigrams + bottom_bigrams

                # `key` was undefined in the original snippet; meme_type is the
                # most plausible intent
                self.add_ngrams(meme_type, top_unigrams, bottom_unigrams, all_unigrams, top_bigrams, bottom_bigrams, all_bigrams)
Developer: AlexeyMK, Project: DATASS, Lines: 12, Source: NgramsManager.py

Example 9: get_gram_ratio

def get_gram_ratio(text1, text2, w2v, n_grams_1=1, n_grams_2=1, w=30, h=2000):
    # pairwise word2vec similarity between the n-grams of the two texts,
    # written into a fixed-size (w, h) matrix
    arr = np.zeros((w, h), np.float32)
    t1 = list(ngrams(text1.split(), n_grams_1))
    t2 = list(ngrams(text2.split(), n_grams_2))
    for i in range(len(t1)):
        for j in range(len(t2)):
            try:
                arr[i, j] = w2v.n_similarity(t1[i], t2[j])
            except (KeyError, IndexError):
                # out-of-vocabulary n-grams or texts longer than the matrix are skipped
                pass
    return arr
Developer: KhaoticMind, Project: kaggle-homedepot, Lines: 12, Source: neural_test.py

Example 10: generate_location_vector

    def generate_location_vector(self, branch, index):
        if branch.text is not None:
            branch.text = branch.text.encode('ascii', 'ignore')

            if not branch.getchildren():
                sentences = branch.text.split('. ')
                for sentence in range(0, len(sentences)):
                    #sentence_location = (("{0}[{1}]".format(index, sentence)), sentences[sentence])
                    words = sentences[sentence].split()

                    for doc_word in range(0, len(words)):
                        word_location = (("{0}[{1}][{2}]".format(index, sentence, doc_word)), words[doc_word])
                        # any change in line below should be replicated in corpus.py also
                        symbols = ".,[]();:<>+=&+%[email protected]#~?{}|"
                        whitespace = "                       "
                        replace = maketrans(symbols, whitespace)
                        doc_word = word_location[1].translate(replace)
                        doc_word = doc_word.lstrip()
                        doc_word = doc_word.rstrip()
                        if len(doc_word) > 1 and not len(doc_word) > 16:
                            self.doc_words.append(doc_word)

                    doc_bigrams = bigrams(words)
                    if not len(doc_bigrams) < 1:
                        doc_bigrams = self.n_gram_cleaner(doc_bigrams)
                        for bi_gram in doc_bigrams:
                            bi_gram = ' '.join(bi_gram)
                            self.bi_grams.append(bi_gram)

                    doc_trigrams = trigrams(words)
                    if not len(doc_trigrams) < 1:
                        doc_trigrams = self.n_gram_cleaner(doc_trigrams)
                        for tri_gram in doc_trigrams:
                            tri_gram = ' '.join(tri_gram)
                            self.tri_grams.append(tri_gram)

                    doc_fourgrams = ngrams(words, 4)
                    if not len(doc_fourgrams) < 1:
                        doc_fourgrams = self.n_gram_cleaner(doc_fourgrams)
                        for four_gram in doc_fourgrams:
                            four_gram = ' '.join(four_gram)
                            self.four_grams.append(four_gram)

                    doc_fivegrams = ngrams(words, 5)
                    if not len(doc_fivegrams) < 1:
                        doc_fivegrams = self.n_gram_cleaner(doc_fivegrams)
                        for five_gram in doc_fivegrams:
                            five_gram = ' '.join(five_gram)
                            self.five_grams.append(five_gram)

            else:
                for subtree in range(0, len(branch)):
                    LocationVector.generate_location_vector(self, branch[subtree], ("{0}[{1}]".format(index, subtree)))
Developer: arunenigma, Project: deva_algo, Lines: 53, Source: doc_analyzer.py

Example 11: get_top_ngrams_tfidf

def get_top_ngrams_tfidf(text, collection, NGRAM=2, cutoff=100, docs=None):
    # remove_website_stopwords is a helper defined elsewhere in the project
    bigs = list(nltk.ngrams(text, NGRAM))
    print('totally', len(bigs), 'bigrams')
    bigs = remove_website_stopwords(bigs)
    freqdist = nltk.FreqDist(bigs)
    topwords = [w for w, _ in freqdist.most_common(cutoff)]
    if True:  # do_tfidf
        # document frequencies at three granularities: document id, "les" id and date
        df, df_les, df_time, tfidf = {}, {}, {}, {}
        for doc_id, doc_text in docs.items():
            words = list(nltk.ngrams(doc_text, NGRAM))
            les_id, time_id = doc_id.split(':')
            time_id = time_id.replace('.csv', '')[0:8]
            for w in words:
                df.setdefault(w, set()).add(doc_id)
                df_les.setdefault(w, set()).add(les_id)
                df_time.setdefault(w, set()).add(time_id)
        _cutoff = 10000
        _topwords = [w for w, _ in freqdist.most_common(_cutoff)]
        df0, df1, df2 = {}, {}, {}
        for w in _topwords:
            df0[w] = len(df.get(w, ()))
            df1[w] = len(df_les.get(w, ()))
            df2[w] = len(df_time.get(w, ()))
            tfidf[w] = freqdist[w] / (1 + df0[w])
        # sort n-grams by decreasing tf-idf and keep the top `cutoff`
        sortedwords = sorted(tfidf.items(), key=itemgetter(1), reverse=True)[:cutoff]
        topwords = [w for w, s in sortedwords]
        sortedwords0 = sorted(df0.items(), key=itemgetter(1), reverse=True)
        sortedwords1 = sorted(df1.items(), key=itemgetter(1), reverse=True)
        sortedwords2 = sorted(df2.items(), key=itemgetter(1), reverse=True)
        print('TF-IDF topwords:')
        print(len(topwords), 'topwords:', sortedwords[:50], freqdist[topwords[0]], freqdist[topwords[1]])
        print(sortedwords0[:30])
        print(sortedwords1[:30])
        print(sortedwords2[:30])
        return topwords, freqdist, df0, df1, df2
    return topwords, freqdist
Developer: iamhighman, Project: GoogleNewsAnalysis, Lines: 52, Source: nltk_utils.py

Example 12: __call__

 def __call__(self, words):
     # candidate bigram and trigram collocations present in `words`
     grams = list(ngrams(words, 2)) + list(ngrams(words, 3))
     positives = [
         (i, len(gram), gram) for i, gram in enumerate(grams)
         if self.colls[len(gram)][gram]
     ]
     if not positives:
         return words
     # prefer longer collocations, and among equals the earlier position
     positives.sort(key=lambda x: (x[1], len(words) - x[0]), reverse=True)
     matches, covered = self.__non_overlapping(positives)
     unigrams = [(i, w) for i, w in enumerate(words) if i not in covered]
     catted = sorted(matches + unigrams)
     # zip() returns an iterator in Python 3, so materialise it before indexing
     return list(zip(*catted))[1]
Developer: JordiCarreraVentura, Project: wlp, Lines: 13, Source: Collocations.py

Example 13: generateLocationVector

    def generateLocationVector(self, branch, index):
        if branch.text is not None:
            branch.text = branch.text.encode('ascii', 'ignore')

            if not branch.getchildren():
                sentences = branch.text.split('. ')

                for sentence in range(0, len(sentences)):
                    #sentence_location = (("{0}[{1}]".format(index, sentence)), sentences[sentence])
                    words = sentences[sentence].split()

                    for word in range(0, len(words)):
                        word_location = (("{0}[{1}][{2}]".format(index, sentence, word)), words[word])
                        symbols = ",[]();:<>+=&+%[email protected]#~?{}|"
                        whitespace = "                      "
                        replace = maketrans(symbols, whitespace)
                        spec_word = word_location[1].translate(replace)
                        spec_word = spec_word.lstrip()
                        spec_word = spec_word.rstrip()

                        if len(spec_word) > 1 and not len(spec_word) > 16:
                            self.spec_words.append(spec_word)

                    bi_grams = bigrams(words)
                    if not len(bi_grams) < 1:
                        for bi_gram in bi_grams:
                            bi_gram = ' '.join(bi_gram)
                            self.bi_grams.append(bi_gram)

                    tri_grams = trigrams(words)
                    if not len(tri_grams) < 1:
                        for tri_gram in tri_grams:
                            tri_gram = ' '.join(tri_gram)
                            self.tri_grams.append(tri_gram)

                    four_grams = ngrams(words, 4)
                    if not len(four_grams) < 1:
                        for four_gram in four_grams:
                            four_gram = ' '.join(four_gram)
                            self.four_grams.append(four_gram)

                    five_grams = ngrams(words, 5)
                    if not len(five_grams) < 1:
                        for five_gram in five_grams:
                            five_gram = ' '.join(five_gram)
                            self.five_grams.append(five_gram)                    

            else:
                for subtree in range(0, len(branch)):
                    Corpus.generateLocationVector(self, branch[subtree], ("{0}[{1}]".format(index, subtree)))
Developer: arunenigma, Project: Scenario-Mining, Lines: 50, Source: corpus.py

Example 14: __init__

    def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
        split_text = text.split()
        if len(split_text) < shingle_length:
            raise ValueError(u'input text is too short for specified shingle length of {}'.format(shingle_length))

        self.minhash = []
        # materialise the shingles so the attribute can be iterated more than once
        self.shingles = list(ngrams(split_text, shingle_length))

        # one minimum hash value per seed: the minhash signature of the text
        for hash_seed in generate_random_seeds(minhash_size, random_seed):
            min_value = float('inf')
            for shingle in ngrams(split_text, shingle_length):
                value = mmh3.hash(' '.join(shingle), hash_seed)
                min_value = min(min_value, value)
            self.minhash.append(min_value)
Developer: steven-s, Project: text-shingles, Lines: 14, Source: shingles.py

Example 15: train

 def train(self, words, tagged=False):
     if tagged is True:
         # `words` is already a list of (word, tag) pairs; keep only the tags
         tags = []
         for i in range(len(words)):
             tags.append(words[i][1])
         self.ngrams = list(nltk.ngrams(tags, self.n))
     else:
         # POS-tag the raw words and map Penn Treebank tags to the universal tagset
         # text = nltk.word_tokenize(words)
         tagged_words = nltk.pos_tag(words)
         universal_tags = [nltk.map_tag('en-ptb', 'universal', tag) for word, tag in tagged_words]
         self.ngrams = list(nltk.ngrams(universal_tags, self.n))
     self.frequencies = nltk.FreqDist(self.ngrams)
     self.probs_ng = nltk.MLEProbDist(self.frequencies)
     print(self.probs_ng)
Developer: sofiabroome, Project: wordpredictor, Lines: 14, Source: GrammarModel.py


Note: The nltk.ngrams function examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective authors; copyright remains with the original authors, and any use or redistribution should follow the corresponding project's license. Please do not republish without permission.