

Python TfidfVectorizer.set_params Method Code Examples

This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.set_params. If you are unsure what TfidfVectorizer.set_params does or how to use it in practice, the curated examples below may help. You can also explore further usage of the class it belongs to, sklearn.feature_extraction.text.TfidfVectorizer.


The following presents 7 code examples of TfidfVectorizer.set_params, ordered by popularity.
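
Examples 1, 3, and 7 below share a common trick: the vectorizer is fitted with input='filename', so that fit() reads its vocabulary from files on disk, and is then switched back to input='content' so that transform() accepts in-memory strings. The remaining examples use set_params to inject stop word lists or whole parameter dictionaries after construction. A minimal sketch of the filename/content toggle (the file names are hypothetical):

# A minimal sketch of the set_params input toggle, not taken from any of
# the projects below; corpus_a.txt and corpus_b.txt are placeholder files.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
tfidf.set_params(input='filename')   # fit() arguments are now file paths
tfidf.fit(['corpus_a.txt', 'corpus_b.txt'])
tfidf.set_params(input='content')    # transform() takes raw strings again
X = tfidf.transform(['some query text', 'another document'])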

Example 1: extract_tfidf_nmf_feats

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import set_params [as alias]
 def extract_tfidf_nmf_feats(self, df_data, n_components):
     """
     Extract tfidf features using nmf.     
     """        
     df_feat = pd.DataFrame(index=range(df_data.shape[0]))
     tfidf = TfidfVectorizer(ngram_range=(2, 3), stop_words='english')
     nmf = NMF(solver='cd', n_components=n_components, init='nndsvda',
               random_state=0, tol=1e-3)
     df_data['q'].to_csv('q', index=False)
     df_data['t'].to_csv('t', index=False)
     df_data['d'].to_csv('d', index=False)
     print('fitting tfidf')
     # Fit the vocabulary from the dumped column files, then switch back
     # to in-memory strings for the per-column transforms below.
     tfidf.set_params(input='filename')
     tfidf.fit(['q', 't', 'd'])
     tfidf.set_params(input='content')
     for col in ['d', 't', 'q', 'b']:
         print('process column', col)
         txt = df_data[col]
         tfidf_mat = tfidf.transform(txt)
         nd_feat = nmf.fit_transform(tfidf_mat)
         tmp = pd.DataFrame(nd_feat, columns=[col+'_tfidf_nmf_comp'+str(i) \
                                     for i in range(n_components)])
         df_feat = pd.merge(df_feat, tmp, left_index=True, right_index=True)
     saveit(df_feat, 'df_tfidf_nmf_feats')
Developer: amsqr | Project: hd | Lines: 27 | Source: feature_generator.py

Example 2: nmf_topic_extraction

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import set_params [as alias]
def nmf_topic_extraction(corpus, bv_stop_tokens, n_features=5000, n_top_words=5, n_topics=3, data={}):
    n_samples = len(corpus)

    # ensure we don't ask for more topics than we have samples;
    # this happens when there are only a few bible verses in a cluster
    n_topics = min(n_samples, n_topics)
    if n_topics == 2:
        n_topics = 1


    # vectorize the tweet text into TF-IDF-weighted bigram features,
    # capped at the n_features most frequent terms
    vectorizer = TfidfVectorizer(max_features=n_features,
                                 ngram_range=(2, 2))
    stoplist = ['retweet', 'rt', 'http', 'things', 'christ', 'lord', 'god', 'shall', 'jesus',
                'nlt', 'kjv', 'prov']

    try:
        vectorizer.set_params(stop_words=set(list(ENGLISH_STOP_WORDS) + stoplist + bv_stop_tokens))
        # TfidfVectorizer already applies tf-idf weighting, so a separate
        # TfidfTransformer pass over its output is redundant.
        tfidf = vectorizer.fit_transform(corpus)
        feature_names = vectorizer.get_feature_names()
    except Exception as ex:
        logger.exception("Tfidf analysis failed ex={}, bv={}, data={}, n_topics={}".format(ex, bv_stop_tokens, data, n_topics))
        return []
Developer: evethandar | Project: habakkuk | Lines: 29 | Source: topic_extraction.py
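
The listing above is truncated before the topic-extraction step. A plausible continuation of the function body (an assumption, not the original code; it reuses tfidf, feature_names, n_topics, and n_top_words from above and assumes NMF is imported from sklearn.decomposition):

    # Hypothetical continuation, not from the original source: factor the
    # tf-idf matrix and read off the top-weighted terms of each component.
    nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)
    topics = []
    for component in nmf.components_:
        top_idx = component.argsort()[:-n_top_words - 1:-1]  # largest weights first
        topics.append([feature_names[i] for i in top_idx])
    return topics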

Example 3: extract_tsne_gather_feat

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import set_params [as alias]
def extract_tsne_gather_feat(stage):
    """
    Extract tsne gather features.
    Note: python2 only.    
    Better than func:extract_tsne_feat in cv, but worst in submission.
    """  
    df_w2vlem_join = pd.read_csv('tmp2/df_w2vlem_join.csv', index_col=0)
        
    if stage <= 1:
        tfidf = TfidfVectorizer(ngram_range=(2, 4), stop_words='english', min_df=2)
        
        df_w2vlem_join['t_w2v'].to_csv('tmp2/t_w2v', index=False)
        df_w2vlem_join['q_w2v'].to_csv('tmp2/q_w2v', index=False)
        df_w2vlem_join['d_w2v'].to_csv('tmp2/d_w2v', index=False)
        
        # Fit the vocabulary from the dumped files, then switch back to
        # in-memory strings for the transforms below.
        tfidf.set_params(input='filename')
        tfidf.fit(['tmp2/t_w2v', 'tmp2/q_w2v', 'tmp2/d_w2v'])
        tfidf.set_params(input='content')
        
        cPickle.dump(tfidf, open('tmp2/tfidf_obj','wb'))
    
    tfidf = cPickle.load(open('tmp2/tfidf_obj','rb'))
    X_t = tfidf.transform(df_w2vlem_join['t_w2v'].tolist())
    if stage <= 2:
        # Create the feature frame here so that entering at stage 2
        # does not hit an undefined df_feat.
        df_feat = pd.DataFrame(index=df_w2vlem_join.index.values)
        svd = TruncatedSVD(n_components=100, random_state=2016)
        X_svd = svd.fit_transform(X_t)
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = bh_sne(X_scaled)
        df_feat['tsne_t_1'] = X_tsne[:len(df_w2vlem_join), 0]
        df_feat['tsne_t_2'] = X_tsne[:len(df_w2vlem_join), 1]
        df_feat.to_csv('tmp2/tsne_t', index=False)
    
    df_feat = pd.read_csv('tmp2/tsne_t')    
    if stage <= 3:
        print(df_feat)
        X_q = tfidf.transform(df_w2vlem_join['q_w2v'].tolist())
        X_tq = sp.hstack([X_t, X_q]).tocsr()
        svd = TruncatedSVD(n_components=50, random_state=2016)
        X_svd = svd.fit_transform(X_tq)
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = bh_sne(X_scaled)
        df_feat['tsne_qt_1'] = X_tsne[:len(df_w2vlem_join), 0]
        df_feat['tsne_qt_2'] = X_tsne[:len(df_w2vlem_join), 1]
        df_feat.to_csv('tmp2/tsne_qt', index=False)
    
    df_feat = pd.read_csv('tmp2/tsne_qt')    
    if stage <= 4:
        print(df_feat)    
        X_d = tfidf.transform(df_w2vlem_join['d_w2v'].tolist())
        svd = TruncatedSVD(n_components=100, random_state=2016)
        X_svd = svd.fit_transform(X_d)
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = bh_sne(X_scaled)
        df_feat['tsne_desc_1'] = X_tsne[:len(df_w2vlem_join), 0]
        df_feat['tsne_desc_2'] = X_tsne[:len(df_w2vlem_join), 1]
        
        df_tsne_feats = df_feat
        df_tsne_feats.to_csv('tmp2/df_tsne_gather_feats.csv')
Developer: amsqr | Project: hd | Lines: 61 | Source: python2_tsne.py
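
bh_sne comes from the Python 2-only tsne package, which is why the docstring flags this function as Python 2 only. Under Python 3, the embedding step in each stage could be swapped for scikit-learn's own t-SNE; a substitute sketch, not the original code:

# Python 3 substitute for the bh_sne(X_scaled) calls above, using
# sklearn.manifold.TSNE instead of the standalone tsne package.
from sklearn.manifold import TSNE

X_tsne = TSNE(n_components=2, random_state=2016).fit_transform(X_scaled)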

Example 4: GroceryFeatureGenerator

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import set_params [as alias]
class GroceryFeatureGenerator(object):
    def __init__(self):
        self.stop_words = set([])
        self.tfidf = TfidfVectorizer(max_features=4000,
                                     ngram_range=(1, 3), sublinear_tf=True)

    def settfidf(self, stopwords_path=None, max_features=4000):
        if stopwords_path is not None:
            self.get_stopwords(stopwords_path)
            self.tfidf.set_params(stop_words=list(self.stop_words))
        if max_features != 4000:
            self.tfidf.set_params(max_features=max_features)
        return self

    def get_stopwords(self, stop_words_path):
        try:
            with open(stop_words_path, 'r') as fin:
                contents = fin.read().decode('utf-8')
        except IOError:
            raise ValueError("the given stop words path %s is invalid." % stop_words_path)
        for line in contents.splitlines():
            self.stop_words.add(line.strip())
        print("\nsuccess in getting stopwords\n")

    def fit_transform(self, path, textlist):
        self.settfidf(os.path.join(path, 'stop_words.txt'))
        return self.tfidf.fit_transform(textlist)

                
    def transform(self, textlist):
        if isinstance(textlist, list):
            return self.tfidf.transform(textlist)

    def save(self, dest_file):
        # Binary mode is required when pickling the fitted vectorizer.
        cPickle.dump(self.tfidf, open(dest_file, 'wb'), -1)

    def load(self, src_file):
        self.tfidf = cPickle.load(open(src_file, 'rb'))
        return self
Developer: lovetimil | Project: TextGrocery | Lines: 55 | Source: converter.py
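
A hypothetical round trip through the class above (the directory, file names, and texts are made up; fit_transform expects a stop_words.txt file inside the given path):

# Hypothetical usage of GroceryFeatureGenerator; 'resource/stop_words.txt'
# is assumed to exist, one stop word per line.
gen = GroceryFeatureGenerator()
X_train = gen.fit_transform('resource', ['first document', 'second document'])
gen.save('tfidf.pkl')

X_test = GroceryFeatureGenerator().load('tfidf.pkl').transform(['unseen document'])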

Example 5: word_vectorizer

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import set_params [as alias]
def word_vectorizer(isScore, isStemming, **kwargs):
    params = {"strip_accents": 'unicode', "max_features": 1500}

    if isScore:
        vect = TfidfVectorizer()

        if 'scoreType' in kwargs:
            # Setting score type
            if kwargs['scoreType'] == 'TF':
                params["use_idf"] = False
            elif kwargs['scoreType'] == 'TF-IDF':
                params["use_idf"] = True
    else:
        vect = CountVectorizer()

        if 'isBinary' in kwargs:
            # Setting binary frequency
            if kwargs['isBinary']:
                params["binary"] = True
            else:
                params["binary"] = False

    if 'stopwordPath' in kwargs:
        # Reading stopwords
        with open(kwargs.get('stopwordPath')) as f:
            sw = f.read().splitlines()
        params["stop_words"] = sw

    if isStemming:
        params["tokenizer"] = stem_tokenize
    else:
        params["tokenizer"] = tokenize

    vect.set_params(**params)

    return vect
Developer: dgacitua | Project: SeminarioInfo1FIUBA | Lines: 39 | Source: vectorizador.py
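
A hypothetical call to the factory above, assuming a list of raw text documents named corpus is already in scope:

# Hypothetical usage: TF-IDF weighting with stemming and no stopword file.
vect = word_vectorizer(isScore=True, isStemming=True, scoreType='TF-IDF')
X = vect.fit_transform(corpus)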

Example 6: word_vectorizer2

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import set_params [as alias]
def word_vectorizer2(**args):
    params = {"strip_accents": 'unicode'}
    prefijo = ''

    # Set the weighting scheme for the vectorized words
    if 'scoreType' in args:
        if args['scoreType'] == 'TFIDF':
            vector = TfidfVectorizer()
            params["use_idf"] = True
            prefijo += args.get('scoreType') + '_'
        elif args['scoreType'] == 'TF':
            vector = TfidfVectorizer()
            params["use_idf"] = False
            prefijo += args.get('scoreType') + '_'
        elif args['scoreType'] == 'BTO':
            vector = CountVectorizer()
            params["binary"] = True
            prefijo += args.get('scoreType') + '_'
        elif args['scoreType'] == 'TO':
            vector = CountVectorizer()
            params["binary"] = False
            prefijo += args.get('scoreType') + '_'
        else:
            vector = CountVectorizer()
            params["binary"] = False
            prefijo += args.get('scoreType') + '_'
            print('WARNING: invalid weighting type, falling back to TO')
    else:
        vector = CountVectorizer()
        params["binary"] = False
        prefijo += 'TO_'
        print('WARNING: no weighting type given, falling back to TO')

    # Set the maximum vocabulary size
    if 'maxFeatures' in args:
        params["max_features"] = args.get('maxFeatures')
    else:
        params["max_features"] = 1500

    # Set up stemming
    if 'stemming' in args:
        if args['stemming'] == 'spanish':
            params["tokenizer"] = spanish_tokenize
            prefijo += 'SpanishStem_'
        elif args['stemming'] == 'english':
            params["tokenizer"] = english_tokenize
            prefijo += 'EnglishStem_'
        else:
            params["tokenizer"] = tokenize
            prefijo += 'NoStem_'
            print('WARNING: invalid stemmer, no stemming will be applied')
    else:
        params["tokenizer"] = tokenize
        prefijo += 'NoStem_'

    # Set the stopword list (optional)
    if 'stopwordPath' in args:
        # Reading stopwords
        with open(args.get('stopwordPath')) as f:
            sw = f.read().splitlines()
        params["stop_words"] = sw
        prefijo += 'Stopwords'
    else:
        prefijo += 'NoStopwords'

    vector.set_params(**params)

    return vector, prefijo


# def vectorizer1(object):
#     cv = CountVectorizer(strip_accents='unicode', max_features=1500, tokenizer=tokenize)
#     X_train_counts = cv.fit_transform(object.data)
#     X_words = object.target
#     X_labels = object.target_names
#
#     return X_train_counts, X_words, X_labels
#
#
# def vectorizer2(object, stopwords):
#     fsw = open(stopwords, 'r')
#     sw = fsw.readlines()
#     fsw.close()
#
#     cv = CountVectorizer(strip_accents='unicode', max_features=1500, tokenizer=tokenize, stop_words=sw)
#     X_train_counts = cv.fit_transform(object.data)
#     X_words = object.target
#     X_labels = object.target_names
#
#     return X_train_counts, X_words, X_labels
#
#
# def vectorizer3(object):
#     cv = CountVectorizer(strip_accents='unicode', max_features=1500, tokenizer=stem_tokenize)
#     X_train_counts = cv.fit_transform(object.data)
#     X_words = object.target
#     X_labels = object.target_names
#......... (part of the code omitted here) .........
Developer: dgacitua | Project: SeminarioInfo1FIUBA | Lines: 103 | Source: vectorizador.py
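
A hypothetical call to word_vectorizer2, again assuming a corpus list is in scope; the returned prefix string encodes the chosen configuration:

# Hypothetical usage: binary term occurrence with Spanish stemming.
vector, prefijo = word_vectorizer2(scoreType='BTO', stemming='spanish', maxFeatures=2000)
X = vector.fit_transform(corpus)
print(prefijo)  # 'BTO_SpanishStem_NoStopwords'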

Example 7: extract_cosinedist_feat

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import set_params [as alias]
 def extract_cosinedist_feat(self, df_data, df_w2vlem_join):
     """
     Extract cosine distance features.
     Note: this func is very slow. It would cost a few hours.       
     """
     df_feat = pd.DataFrame(index=df_data.index.values)
     tfv = TfidfVectorizer(ngram_range=(2,3), min_df=2)
     
     print('computing qt_w2v_cosdist')
     df_w2vlem_join['q_w2v'].to_csv('q_w2v', index=False)
     df_w2vlem_join['t_w2v'].to_csv('t_w2v', index=False)
     tfv.set_params(input='filename')
     tfv.fit(['q_w2v', 't_w2v'])  # equivalently: list(df_w2vlem_join['q_w2v'].values) + list(df_w2vlem_join['t_w2v'].values)
     tfv.set_params(input='content')
     print('done fitting')
     qt_unigram_func = lambda x: self._cosine_dist(x['q_w2v'], x['t_w2v'], tfv)
     df_feat['qt_w2v_cosdist'] = df_w2vlem_join.apply(qt_unigram_func, axis=1)   
     
     if True:  # This feature can be skipped: it is very expensive to compute (more than 24 hours).
         print('computing qd_w2v_cosdist')
         df_w2vlem_join['d_w2v'].to_csv('d_w2v', index=False)
         tfv.set_params(input='filename')
         tfv.fit(['q_w2v', 'd_w2v'])  # equivalently: list(df_w2vlem_join['q_w2v'].values) + list(df_w2vlem_join['d_w2v'].values)
         tfv.set_params(input='content')
         print('done fitting')
         qd_unigram_func = lambda x: self._cosine_dist(x['q_w2v'], x['d_w2v'], tfv)
         df_feat['qd_w2v_cosdist'] = df_w2vlem_join.apply(qd_unigram_func, axis=1)   
     
     print('computing qt_cosdist')
     df_data['q'].to_csv('q', index=False)
     df_data['t'].to_csv('t', index=False)
     tfv.set_params(input='filename')
     tfv.fit(['q', 't'])  # equivalently: list(df_data['q'].values) + list(df_data['t'].values)
     tfv.set_params(input='content')
     print('done fitting')
     qt_func = lambda x: self._cosine_dist(x['q'], x['t'], tfv)
     df_feat['qt_cosdist'] = df_data.apply(qt_func, axis=1)              
     
     df_cosdist_feats = df_feat 
     saveit(df_cosdist_feats, 'df_cosdist_feats')   
Developer: amsqr | Project: hd | Lines: 42 | Source: feature_generator.py
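
The helper self._cosine_dist is not shown in this listing. A plausible implementation (an assumption, not the original) transforms both texts with the fitted vectorizer and returns one minus their cosine similarity:

# Hypothetical sketch of the _cosine_dist helper used above.
from sklearn.metrics.pairwise import cosine_similarity

def _cosine_dist(self, text_a, text_b, tfv):
    vecs = tfv.transform([text_a, text_b])
    return 1.0 - cosine_similarity(vecs[0], vecs[1])[0, 0]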


Note: The sklearn.feature_extraction.text.TfidfVectorizer.set_params examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by the community; copyright of the source code remains with the original authors. For distribution and use, please refer to the corresponding project's license. Do not repost without permission.