

Python util.bigrams Function Code Examples

This article collects and summarizes typical usage examples of the Python function nltk.util.bigrams. If you have been asking yourself how the bigrams function is used, how it behaves, or what real calls look like in practice, the curated code examples below should help.


The following presents 15 code examples of the bigrams function, sorted by popularity by default.
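As a quick orientation before the examples, here is a minimal, self-contained demonstration of what nltk.util.bigrams produces (in NLTK 3 it returns a generator, so it is usually wrapped in list()):

from nltk.util import bigrams

tokens = ["the", "quick", "brown", "fox"]
# bigrams() yields adjacent token pairs; materialize with list() to reuse or index.
print(list(bigrams(tokens)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]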

Example 1: wiki_to_feature

def wiki_to_feature(wiki):
    """
    Extracts features from a single wiki document.
    :param wiki: dict of wiki fields
    :type wiki: dict
    :return: tuple of wiki id and list of feature strings
    :rtype: tuple
    """
    # Requires: from traceback import format_exc; from nltk.util import bigrams;
    # from textblob import TextBlob. `normalize` is a project-level tokenizer.
    try:
        features = []
        bow = []
        features += [u'ORIGINAL_HUB:%s' % wiki.get(u'hub_s', u'')]
        features += [u'TOP_CAT:%s' % u'_'.join(normalize(c)) for c in wiki.get(u'top_categories_mv_en', [])]
        bow += [u"_".join(normalize(c)) for c in wiki.get(u'top_categories_mv_en', [])]
        features += [u'TOP_ART:%s' % u"_".join(normalize(a)) for a in wiki.get(u'top_articles_mv_en', [])]
        bow += [u"_".join(normalize(a)) for a in wiki.get(u'top_articles_mv_en', [])]
        # Bigrams over every noun phrase in the wiki description
        desc_ngrams = [u"_".join(n) for grouping in
                       [bigrams(normalize(np))
                       for np in TextBlob(wiki.get(u'description_txt', [u''])[0]).noun_phrases]
                       for n in grouping]
        bow += desc_ngrams
        features += [u'DESC:%s' % d for d in desc_ngrams]
        bow += [u"_".join(b) for b in bigrams(normalize(wiki[u'sitename_txt'][0]))]
        mp_nps = TextBlob(wiki.get(u'main_page_text', u'')).noun_phrases
        bow += [u"_".join(bg) for grouping in [bigrams(normalize(n)) for n in mp_nps] for bg in grouping]
        bow += [u''.join(normalize(w)) for words in [np.split(u" ") for np in mp_nps] for w in words]
        return wiki[u'id'], bow + features
    except Exception as e:
        print(e, format_exc())  # Python 3 print call (the original used the Python 2 statement)
        raise  # re-raise without resetting the traceback
Developer: Wikia, Project: data-science-toolkit, Lines: 30, Source: extract_wiki_data.py

Example 2: getFeatures

def getFeatures(tokens, typefeat='unigrams'):
    # Requires: from nltk import FreqDist; from nltk.util import bigrams

    if typefeat == 'unigrams':
        _features = FreqDist(tokens)

    elif typefeat == 'bigrams':
        _bigrams = bigrams(tokens)
        _features = FreqDist(_bigrams)

    elif typefeat == 'uni+bigrams':
        # In NLTK 3, bigrams() returns a generator, so it must be
        # materialized before concatenating with the token list.
        _bigrams = list(bigrams(tokens))
        _features = FreqDist(_bigrams + tokens)

    else:
        raise ValueError("unknown feature type: %s" % typefeat)

    return _features
Developer: diegocaro, Project: opinionapp, Lines: 14, Source: features.py
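A brief usage sketch for getFeatures above (the token list here is purely illustrative):

tokens = ["the", "cat", "sat", "on", "the", "mat"]
feats = getFeatures(tokens, typefeat='bigrams')
# FreqDist maps each bigram tuple to its count:
print(feats.most_common(2))
# e.g. [(('the', 'cat'), 1), (('cat', 'sat'), 1)] -- order among ties may vary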

Example 3: score_by_topic

def score_by_topic(pkg, scores):
    '''Examines the pkg and adds scores according to the topics found in it.'''
    # Requires: from nltk.util import bigrams, trigrams. `Themes`, `package_text`,
    # `normalize_text`, `LEVELS` and `log` are project-level helpers.
    themes = Themes.instance()
    for level in range(3):
        pkg_text = package_text(pkg, level)
        words, words_without_stopwords = normalize_text(pkg_text)
        for num_words in (1, 2, 3):
            if num_words == 1:
                ngrams = words_without_stopwords
                topic_ngrams = themes.topic_words
                topic_ngrams_set = themes.topic_words_set
            elif num_words == 2:
                # Materialize the generator: it is consumed twice below
                # (set intersection and .count()).
                ngrams = list(bigrams(words))
                topic_ngrams = themes.topic_bigrams
                topic_ngrams_set = themes.topic_bigrams_set
            elif num_words == 3:
                ngrams = list(trigrams(words))
                topic_ngrams = themes.topic_trigrams
                topic_ngrams_set = themes.topic_trigrams_set
            matching_ngrams = set(ngrams) & topic_ngrams_set
            if matching_ngrams:
                for ngram in matching_ngrams:
                    occurrences = ngrams.count(ngram)
                    score = (3 - level) * occurrences * num_words
                    theme = topic_ngrams[ngram]
                    ngram_printable = ' '.join(ngram) if isinstance(ngram, tuple) else ngram
                    reason = '"%s" matched %s' % (ngram_printable, LEVELS[level])
                    if occurrences > 1:
                        reason += ' (%s times)' % occurrences
                    scores[theme].append((score, reason))
                    log.debug(' %s %s %s', theme, score, reason)
Developer: palcu, Project: ckanext-dgu, Lines: 31, Source: theme.py

Example 4: aggregate_topics_of_segmented_reports

def aggregate_topics_of_segmented_reports(self, cut_of_segmented_reports, topics):
    # Requires: from nltk.util import bigrams. The generator is materialized
    # so it can be iterated repeatedly; each topic is assumed to be bytes.
    aggregated_topics = []
    bigrams_of_topics = list(bigrams([t.decode('utf-8')] for t in topics))
    for start_topic, end_topic in bigrams_of_topics:
        for report in cut_of_segmented_reports:
            # Collect everything between consecutive topic markers.
            aggregated_topics.extend(report[report.index(start_topic):report.index(end_topic)])
    return aggregated_topics
Developer: EduardoCarvalho, Project: nltkPhraseDetector, Lines: 7, Source: extractPhrases.py

Example 5: autocorrect_query

def autocorrect_query(query, df, cutoff=0.8, warning_on=True):
    """
    Autocorrect a query based on the training set.
    """
    # Requires: import re, difflib; from bs4 import BeautifulSoup;
    # from nltk.util import bigrams. `df` is a pandas DataFrame whose rows
    # hold the search term followed by two product-text columns.
    train_data = df.values[df['search_term'].values == query, :]
    s = ""
    for r in train_data:
        s = "%s %s %s" % (s, BeautifulSoup(r[1]).get_text(" ", strip=True),
                          BeautifulSoup(r[2]).get_text(" ", strip=True))
    s = re.findall(r'[\'\"\w]+', s.lower())
    s_bigram = [' '.join(i) for i in bigrams(s)]
    s.extend(s_bigram)
    corrected_query = []
    for q in query.lower().split():
        if len(q) <= 2:
            corrected_query.append(q)
            continue
        if bool(re.search(r'\d', q)):  # skip words containing digits, like 4.5in
            corrected_query.append(q)
            continue
        corrected_word = difflib.get_close_matches(q, s, n=1, cutoff=cutoff)
        if len(corrected_word) > 0:
            corrected_query.append(corrected_word[0])
        else:
            if warning_on:
                print("WARNING: cannot find matched word for '%s' -> used the original word" % q)
            corrected_query.append(q)
    return ' '.join(corrected_query)
Developer: aaxwaz, Project: Kaggle_HomeDepot_Stacking, Lines: 28, Source: utils.py
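The correction itself is delegated to difflib.get_close_matches; a standalone illustration of how the cutoff parameter behaves (toy vocabulary, not the actual training data):

import difflib

vocab = ["angle", "bracket", "galvanized", "steel"]
print(difflib.get_close_matches("anlge", vocab, n=1, cutoff=0.8))   # ['angle']
print(difflib.get_close_matches("anlge", vocab, n=1, cutoff=0.95))  # [] -- nothing clears the stricter cutoff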

Example 6: generate_unibitrigrams

def generate_unibitrigrams(key_score_file):
    # Requires: import re, string; import nltk; from nltk.util import bigrams, trigrams
    with open(key_score_file, 'r') as infile:
        infile.readline()  # skip the header row
        key_list = []
        for line in infile:
            row = line.split(',')
            key_list.append(row[0])
    uni_bi_trigrams = []
    for phrase in key_list:
        words = []
        for word in nltk.word_tokenize(phrase):
            # Strip ASCII punctuation. (The character class in the original was
            # mangled by the page's email-obfuscation filter; this reconstruction
            # assumes the full punctuation set was intended.)
            word = re.sub('[%s]' % re.escape(string.punctuation), '', word)
            words.append(word)
        unigrams_ls = words
        bigrams_ls = [x[0] + ' ' + x[1] for x in bigrams(words)]
        trigrams_ls = [x[0] + ' ' + x[1] + ' ' + x[2] for x in trigrams(words)]
        uni_bi_trigrams = uni_bi_trigrams + unigrams_ls + bigrams_ls + trigrams_ls
    return uni_bi_trigrams
Developer: neethukurian, Project: keyextract, Lines: 28, Source: rake_stem.py

Example 7: gender_feature

def gender_feature(text, feature_vect):
    """
    Extract the gender features.
    :param text: input text
    :param feature_vect: a bag of words followed by a list of bigrams
    :return: a dictionary mapping each feature to its computed value
    """
    # Requires: import numpy as np; import nltk;
    # from nltk import word_tokenize, sent_tokenize; from nltk.tag import map_tag;
    # from nltk.util import bigrams. `parse` is assumed to come from pattern.en.

    # Sentence-length and vocabulary features
    tokens = word_tokenize(text.lower())
    sentences = sent_tokenize(text.lower())
    words_per_sent = np.asarray([len(word_tokenize(s)) for s in sentences])

    # Bag-of-words features (first 29 entries of the feature vector)
    bag_dict = {}
    for bag in feature_vect[:29]:
        bag_dict[bag] = bag in tokens

    # Bigram features (remaining entries)
    bigram_dict = {}
    for big in feature_vect[29:]:
        bigram_dict[big] = big in bigrams(tokens)

    # POS-tagging features, mapped to the universal tagset
    POS_tag = ['ADJ', 'ADV', 'DET', 'NOUN', 'PRT', 'VERB', '.']
    tagged_word = parse(text, chunks=False, tagset='UNIVERSAL').split()
    simplified_tagged_word = [(tag[0], map_tag('en-ptb', 'universal', tag[1])) for s in tagged_word for tag in s]
    freq_POS = nltk.FreqDist(tag[1] for tag in simplified_tagged_word if tag[1] in POS_tag)

    d = dict({'sentence_length_variation': words_per_sent.std()}, **bag_dict)

    return dict(dict(d, **bigram_dict), **freq_POS)
Developer: kouki01, Project: Text_Mining_University_Project, Lines: 31, Source: Evaluation.py

Example 8: get_bigram

def get_bigram(text_list):
    # `text_list` is a list of strings. Iterating a string yields characters,
    # so each entry is converted to its character-level bigrams.
    # Requires: from nltk.util import bigrams
    return [list(bigrams(text)) for text in text_list]
Developer: sheshant, Project: project-information-retrieval, Lines: 7, Source: new.py
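Note that because each entry of text_list is a string, the result is character-level bigrams; tokenize first if word pairs are wanted:

from nltk.util import bigrams

print(get_bigram(["cat"]))
# [[('c', 'a'), ('a', 't')]]
print(list(bigrams("the cat sat".split())))  # word-level instead
# [('the', 'cat'), ('cat', 'sat')]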

Example 9: BigramAll

def BigramAll():
    # Requires: import os; from nltk import word_tokenize, FreqDist;
    # from nltk.util import bigrams. `make_sure_path_exists` is a project helper.
    to_save_folder = "./#Bigram[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if "." in folder:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)

        big = list(bigrams(w for w in words if len(w) > 1 and w != "``"))
        myBig = []
        for bi in big:
            myBig.append(bi[0] + " " + bi[1])

        fdist = FreqDist(str(w) for w in myBig)

        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]).strip() + "," + str(key[1]).strip() + "\n"

        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[bigram_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Developer: olee12, Project: Stylogenetics, Lines: 29, Source: MakeNormalData.py

Example 10: generate_ds

def generate_ds(self, words):
    # Turn the training ConditionalFreqDist into plain {language: {ngram: count}}
    # dicts, and count character bigrams over the test words.
    learning_info_dict = {lang: {w: float(t)
                          for w, t in self._language_model_cfd[lang].most_common()}
                          for lang in self._language_model_cfd.keys()}
    testing_info_dict = {w: float(t)
                         for w, t in FreqDist([tpl for word in words for tpl in bigrams(word)]).most_common()}
    return learning_info_dict, testing_info_dict
Developer: PyWilhelm, Project: FoLT2014, Lines: 7, Source: core.py

Example 11: bigramsPhi

def bigramsPhi(comment):
    """The basis for a bigrams feature function.
    """
    # Requires: from collections import Counter; from nltk.util import bigrams.
    # `stemmer` is assumed to be an NLTK stemmer (e.g. PorterStemmer).
    sent = [stemmer.stem(tok) for tok in comment.split()]  # stem each token
    feats = Counter()
    sent = ["<<START>>"] + sent + ["<<END>>"]  # pad with boundary markers
    feats.update(bigrams(sent))  # count bigrams, including boundary pairs
    return feats
Developer: alexsax, Project: abusive-comment-detection, Lines: 8, Source: baseline.py
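A sketch of the resulting feature counter, assuming stemmer is NLTK's PorterStemmer as noted in the comment above:

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

print(bigramsPhi("dogs bark"))
# Counter({('<<START>>', 'dog'): 1, ('dog', 'bark'): 1, ('bark', '<<END>>'): 1})
# (ordering among equal counts may vary)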

Example 12: perplexity

def perplexity(self, sentence, method):
    """
    Compute the perplexity of a sentence given an estimation method.

    You do not need to modify this code.
    """
    # `method(context, word)` is expected to return a log2 probability;
    # `mean` comes from numpy (or statistics.mean).
    return 2.0 ** (-1.0 * mean([method(context, word) for context, word in
                                bigrams(self.tokenize_and_censor(sentence))]))
Developer: sangheestyle, Project: cl1-hw, Lines: 8, Source: language_model.py
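Assuming method(context, word) returns a log2 probability estimate, the expression implements the standard per-word perplexity

PP(W) = 2^{-\frac{1}{N} \sum_{i=1}^{N} \log_2 P(w_i \mid w_{i-1})}

so lower values mean the model finds the sentence more predictable.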

Example 13: bigram_format

def bigram_format(test_corpus):
    """
    >>> bigram_format(["the dog runs STOP", "the cat walks STOP", "the dog runs STOP"])
    [[('the', 'dog'), ('dog', 'runs'), ('runs', 'STOP')], [('the', 'cat'), ('cat', 'walks'), ('walks', 'STOP')], [('the', 'dog'), ('dog', 'runs'), ('runs', 'STOP')]]
    """
    # Requires: from nltk import util. The list() call matters for the doctest:
    # in NLTK 3, util.bigrams returns a generator, not a list.
    wl = [sentence.split() for sentence in test_corpus]
    return [list(util.bigrams(l)) for l in wl]
Developer: manniche, Project: nlangp, Lines: 8, Source: ngram_utilities.py

Example 14: get_ngram_tokens

def get_ngram_tokens(self, line):
    # Requires: import nltk; from nltk.util import bigrams.
    # `self.stemmer` and `self.stops` hold a stemmer and a stopword set.
    tokens = nltk.wordpunct_tokenize(line)
    message = [self.stemmer.stem(x) for x in tokens if len(x) > 2 and x not in self.stops]
    # Materialize the bigrams before appending: in NLTK 3, bigrams() is lazy,
    # and appending to `message` while it iterates would never terminate.
    for pair in list(bigrams(message)):
        message.append(" ".join(pair))
    return list(set(message))
Developer: johnnysparks, Project: feelsbro, Lines: 8, Source: johnnyprocess.py

Example 15: sentProbaility

def sentProbaility(self, sent, smooth_const):
    # Requires: import math; from nltk.util import bigrams.
    # `MyToolKit` is a project helper; V is the vocabulary size.
    V = 217847
    tool = MyToolKit()
    bigrs = bigrams(tool.words(sent))
    p = 1
    for prev_word, word in bigrs:
        # Multiply in log space: exp(log p + log q) == p * q
        p = math.exp(math.log(p) + math.log(self.LaplaceSmoothing(word, prev_word, smooth_const, V)))
        # Alternative: self.AbsoluteDiscountingSmoothing(word, prev_word, smooth_const, V)
    return p
Developer: djidan10, Project: Arabic-Diacritizer, Lines: 9, Source: Vocaliser.py
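For reference, the LaplaceSmoothing method is not shown in this snippet, but presumably implements add-k smoothing over a vocabulary of size V, whose standard form is

P(w_i \mid w_{i-1}) = \frac{c(w_{i-1} w_i) + k}{c(w_{i-1}) + kV}

with k = smooth_const and V = 217847 here.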


Note: The nltk.util.bigrams examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from community-contributed open-source projects; copyright remains with the original authors, and redistribution and use are subject to each project's license. Do not reproduce without permission.