

Python TfidfVectorizer.get_params Method Code Examples

This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.get_params. If you are wondering what TfidfVectorizer.get_params does, how to call it, or what real usage looks like, the curated code examples below should help. You can also read further about the class this method belongs to, sklearn.feature_extraction.text.TfidfVectorizer, and its other usage examples.


Six code examples of TfidfVectorizer.get_params are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
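Before looking at the full examples, here is a minimal, self-contained sketch (written for illustration, not taken from any of the projects below) of what get_params returns: a plain dict of the vectorizer's constructor arguments, which can be inspected, logged, or used to rebuild an equivalent vectorizer.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=2, stop_words='english', ngram_range=(1, 2))
params = vectorizer.get_params()

# get_params returns the keyword arguments the vectorizer was constructed with
print(params['ngram_range'])        # (1, 2)
print(params['stop_words'])         # english

# the same dict can be passed back to the constructor to rebuild an equivalent vectorizer
rebuilt = TfidfVectorizer(**params)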

Example 1: get_data_with_dandelion

# Module to import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_params [as alias]
    def get_data_with_dandelion(self, relevance_threshold=0.75, min_df=2,
                              gamma=0.89, filter=False):
        only_text, ent, data = self.get_data_with_abstract_2(relevance_threshold)
        entities_sparse = sparse.csr_matrix(ent)

        tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                           max_features=200000,
                                           min_df=min_df,
                                           stop_words='english',
                                           strip_accents='unicode',
                                           use_idf=True,
                                           ngram_range=(1, 1),
                                           norm='l2',
                                           tokenizer=TextUtils.tokenize_and_stem)

        tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)

        print 'tf-idf matrix dimension: %s x %s' % (tfidf_matrix.shape[0],
                                                    tfidf_matrix.shape[1])
        print 'entities matrix dimension: %s x %s' % (entities_sparse.shape[0],
                                                      entities_sparse.shape[1])
        print 'non zero elements in entities matrix: %s' \
              % len(entities_sparse.data)

        '''print tfidf_matrix[tfidf_matrix > 0].mean()
        print tfidf_matrix[tfidf_matrix > 0].max()

        print entities_sparse[entities_sparse > 0].mean()
        print entities_sparse[entities_sparse > 0].max()
        print '#' * 80'''
        #print 'after balancing'

        tfidf_matrix = tfidf_matrix * 1
        entities_sparse = entities_sparse * (1 - gamma)

        #print tfidf_matrix[tfidf_matrix > 0].mean()
        #print tfidf_matrix[tfidf_matrix > 0].max()

        #print entities_sparse[entities_sparse > 0].mean()
        #print entities_sparse[entities_sparse > 0].max()

        f_score_dict = self.labels_dict(data)
        params = tfidf_vectorizer.get_params()
        params['dandelion_entities'] = entities_sparse.shape[1]
        params['original_terms'] = tfidf_matrix.shape[1]  # number of tf-idf terms (vocabulary size)
        params['gamma'] = gamma
        params['relevance_threshold'] = relevance_threshold
        params['classes'] = len(f_score_dict)
        params['tokenizer'] = 'TextUtils.tokenize_and_stem'
        del params['dtype']

        params['avg_nnz_row'] = (entities_sparse > 0).sum(1).mean()

        return sparse.hstack([tfidf_matrix, entities_sparse]), f_score_dict, params
Developer: Neuro17, Project: LOD-doc-clustering, Lines of code: 56, Source file: document_processor.py
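Note how the example uses get_params() to seed an experiment-metadata dict and then post-processes two entries: dtype is deleted and the tokenizer callable is replaced by its name as a string. A plausible reason (the original code does not show how params is ultimately consumed) is that both values are objects rather than plain data, so they would break serialization. A minimal sketch of that pattern, assuming JSON as the target format:

import json
from sklearn.feature_extraction.text import TfidfVectorizer

params = TfidfVectorizer(tokenizer=str.split).get_params()
params['tokenizer'] = 'str.split'   # replace the callable with its name, as the example above does
del params['dtype']                 # a NumPy type object, not JSON-serializable

print(json.dumps(params, sort_keys=True, indent=2))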

Example 2: get_data_only_with_abstract

# Module to import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_params [as alias]
    def get_data_only_with_abstract(self, relevance_threshold=0.75, min_df=0.01,
                              gamma=0.89, filter=False):
        only_text, ent, data = self.get_data_with_abstract_2(relevance_threshold)
        tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                           max_features=200000,
                                           min_df=min_df,
                                           stop_words='english',
                                           strip_accents='unicode',
                                           use_idf=True,
                                           ngram_range=(1, 1),
                                           norm='l2',
                                           tokenizer=TextUtils.tokenize_and_stem)

        tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)
        f_score_dict = self.labels_dict(data)
        params = tfidf_vectorizer.get_params()
        params['original_terms'] = tfidf_matrix.shape[1]  # number of tf-idf terms (vocabulary size)
        params['gamma'] = gamma
        params['relevance_threshold'] = relevance_threshold
        params['classes'] = len(f_score_dict)
        params['tokenizer'] = 'TextUtils.tokenize_and_stem'

        return tfidf_matrix, f_score_dict, params
Developer: Neuro17, Project: LOD-doc-clustering, Lines of code: 25, Source file: document_processor.py

Example 3: get_data_only_with_entities

# Module to import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_params [as alias]
    def get_data_only_with_entities(self, relevance_threshold=0.75, gamma=0.89, filter=False):
        data = self.mongo.get_all(order_by='id_doc')

        data = [doc for doc in data]
        only_text = [doc['text'] for doc in data]

        ent_dict, ent_set = self.get_dandelion_entities(data)

        if filter:
            entities_set = set([k for k, v in ent_dict.iteritems()])
        else:
            entities_set = ent_set
        entities = {e: i for i, e in enumerate(entities_set)}
        dandelion_entities = np.zeros((len(data), len(entities_set)))

        for doc in data[:]:
            text = doc['text']
            if 'dandelion' in doc:
                for e in doc['dandelion']['annotations']:
                    rel = np.float64(e['confidence'])
                    name = e['title']
                    if rel > relevance_threshold:
                        dandelion_entities[doc['id_doc']][entities[name]] = rel

        entities_sparse = sparse.csr_matrix(dandelion_entities)

        tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                           max_features=200000,
                                           min_df=2,
                                           stop_words='english',
                                           strip_accents='unicode',
                                           use_idf=True,
                                           ngram_range=(1, 1),
                                           norm='l2',
                                           tokenizer=TextUtils.tokenize_and_stem)

        tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)

        print 'tf-idf matrix dimension: %s x %s' % (tfidf_matrix.shape[0],
                                                    tfidf_matrix.shape[1])
        print 'entities matrix dimension: %s x %s' % (entities_sparse.shape[0],
                                                      entities_sparse.shape[1])
        print 'non zero elements in entities matrix: %s' \
              % len(entities_sparse.data)

        '''print tfidf_matrix[tfidf_matrix > 0].mean()
        print tfidf_matrix[tfidf_matrix > 0].max()

        print entities_sparse[entities_sparse > 0].mean()
        print entities_sparse[entities_sparse > 0].max()
        print '#' * 80'''
        #print 'after balancing'

        tfidf_matrix = tfidf_matrix * 1
        entities_sparse = entities_sparse * (1 - gamma)

        #print tfidf_matrix[tfidf_matrix > 0].mean()
        #print tfidf_matrix[tfidf_matrix > 0].max()

        #print entities_sparse[entities_sparse > 0].mean()
        #print entities_sparse[entities_sparse > 0].max()

        f_score_dict = self.labels_dict(data)
        params = tfidf_vectorizer.get_params()
        params['dandelion_entities'] = entities_sparse.shape[1]
        params['original_terms'] = tfidf_matrix.shape[1]  # number of tf-idf terms (vocabulary size)
        params['gamma'] = gamma
        params['relevance_threshold'] = relevance_threshold
        params['classes'] = len(f_score_dict)
        params['tokenizer'] = 'TextUtils.tokenize_and_stem'
        del params['dtype']

        params['avg_nnz_row'] = (entities_sparse > 0).sum(1).mean()

        return sparse.hstack([tfidf_matrix, entities_sparse]), f_score_dict,\
               params
Developer: Neuro17, Project: LOD-doc-clustering, Lines of code: 78, Source file: document_processor.py

Example 4: get_data_fabio

# Module to import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_params [as alias]
    def get_data_fabio(self, gamma=0.89, rank_metric='r'):
        data = self.mongo.get_all(order_by='id_doc')

        data = [doc for doc in data]
        only_text = [doc['text'] for doc in data]

        entitySet = set()
        for d in data:
            if 'isa' in d:
                for e in d['isa']:
                    entitySet.add(e['entity'])

        current = np.zeros((len(data), len(entitySet)), dtype=np.float)
        count = 0
        invIndex = {}
        countFeatures = 0
        for i,d in enumerate(data):
            if 'isa' in d:
                for f in d['isa']:
                    if f['entity'] not in invIndex:
                       invIndex[f['entity']] = countFeatures
                       countFeatures += 1
                    current[count, invIndex[f['entity']]] = f[rank_metric]
            count += 1
        current = np.nan_to_num(current)
        current_sparse = sparse.csr_matrix(current)

        tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                           max_features=200000,
                                           min_df=2,
                                           stop_words='english',
                                           strip_accents='unicode',
                                           use_idf=True,
                                           ngram_range=(1, 1),
                                           norm='l2',
                                           tokenizer=TextUtils.tokenize_and_stem)

        tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)

        print 'tf-idf matrix dimension: %s x %s' % (tfidf_matrix.shape[0],
                                                    tfidf_matrix.shape[1])
        print 'entities matrix dimension: %s x %s' % (current_sparse.shape[0],
                                                      current_sparse.shape[1])
        print 'non zero elements in entities matrix: %s' \
              % len(current_sparse.data)

        tfidf_matrix = tfidf_matrix * 1
        entities_sparse = current_sparse * (1 - gamma)

        f_score_dict = self.labels_dict(data)
        params = tfidf_vectorizer.get_params()
        params['dandelion_entities'] = entities_sparse.shape[1]
        params['original_terms'] = tfidf_matrix.shape[1]  # number of tf-idf terms (vocabulary size)
        params['gamma'] = gamma
        params['rank_metric'] = rank_metric
        params['classes'] = len(f_score_dict)
        params['tokenizer'] = 'TextUtils.tokenize_and_stem'
        del params['dtype']

        params['avg_nnz_row'] = (entities_sparse > 0).sum(1).mean()

        return sparse.hstack([tfidf_matrix, entities_sparse]), f_score_dict,\
               params
Developer: Neuro17, Project: LOD-doc-clustering, Lines of code: 67, Source file: document_processor.py

Example 5: newsgroups

# Module to import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_params [as alias]
        
    #20 newsgroups (part of sklearn)
    print "loading 20 newsgroups dataset..."
    tic = time()
    dataset = fetch_20newsgroups(shuffle=True, random_state=0, remove=('headers','footers','quotes'))
    train_corpus = dataset.data  # a list of 11314 documents / entries
    toc = time()
    print "elapsed time: %.4f sec" %(toc - tic)    
    
    #compute tf-idf (equivalent to CountVectorizer followed by TfidfTransformer)
    #CountVectorizer produces the term-document matrix; tf-idf scales the tf counts by log N/nt
    #(N: number of docs, nt: number of docs in which the term occurs)
    #if min_df/max_df is a float it is a proportion of docs (min_df < nt/N < max_df); if an int it is the raw count nt, e.g. min_df = 2
    tfidf = TfidfVectorizer(max_features = num_features, max_df=0.95, min_df=2, stop_words = 'english')
    print "tfidf parameters:"
    print tfidf.get_params()    
        
    #generate tf-idf term-document matrix
    A_tfidf_sp = tfidf.fit_transform(train_corpus)  #size D x V
    
    print "number of docs: %d" %A_tfidf_sp.shape[0]
    print "dictionary size: %d" %A_tfidf_sp.shape[1]

    #tf-idf dictionary    
    tfidf_dict = tfidf.get_feature_names()
             
    #fit LDA model
    print "Fitting LDA model..."
    lda_vb = LatentDirichletAllocation(n_topics = num_topics, max_iter=10, learning_method='online', batch_size = 512, random_state=0, n_jobs=-1)

    tic = time()
Developer: vsmolyakov, Project: ml, Lines of code: 33, Source file: lda_vb.py
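The comments in Example 5 note that TfidfVectorizer is equivalent to a CountVectorizer followed by a TfidfTransformer. The short sketch below illustrates that equivalence under matching default parameters; the toy documents are illustrative and not part of the original script.

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

docs = ["the cat sat on the mat", "the dog sat on the log"]

direct = TfidfVectorizer().fit_transform(docs)            # one step
counts = CountVectorizer().fit_transform(docs)            # raw term-document counts
two_step = TfidfTransformer().fit_transform(counts)       # idf weighting + l2 normalisation

print(np.allclose(direct.toarray(), two_step.toarray()))  # True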

Example 6: CommentsAnalyzer

# Module to import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_params [as alias]
class CommentsAnalyzer(pmlutil.Configurable):
    
    def configTypes(self):
        return dict(amount=int, min_ngram=int, max_ngram=int, min_df=int, max_df=float, use_idf=int, alpha=readArray, l1_ratio=readArray, n_folds=int)

    def _loadData(self):
        logging.info("loading data")
        self.data = []
        count = 0
        for fn in os.listdir(self._datafolder):
            if not self._amount < 1 and count >= self._amount:
                break
            if fn.endswith(self._metaextension):
                mfn = self._datafolder + "/" + fn
                ddm = pml.Datum(mfn,None)
                if len(ddm.meta()['comments'])>0:
                    self.data.append(ddm)
                    count +=1
        logging.info("loaded %d data" % count)

    def __init__(self):
        self.data=[]

    def _aggregateComments(self, subset):
        allcomments = []
        for datum in subset:
            comments = []
            for comment in datum.meta()['comments']:
                comments.append(comment['text'])
            allcomments.append(" ".join(comments))
        return np.array(allcomments)

    def _buildDictionary(self, allcomments):
        print allcomments
        self.vectorizer = TfidfVectorizer(analyzer=self._analyzer, ngram_range=(self._min_ngram,self._max_ngram),
                                     min_df=self._min_df, max_df=self._max_df, norm='l2', smooth_idf=True, use_idf=bool(self._use_idf))
        self.vectorizer.fit(allcomments)

    def run(self):
        allcomments = self._aggregateComments(self.data)
        self._buildDictionary(allcomments)

        # create representation of documents
        tfidfArray = self.vectorizer.transform(allcomments)

        # create labelling
        labels = []
        for datum in self.data:
            labels.append(len(datum.meta()['favorites']))
        labels = np.array(labels)

        print self.vectorizer.get_params()
        print self.vectorizer.get_feature_names()

        # training
        self.elasticNet = ElasticNetCV(alphas=self._alpha, l1_ratio=self._l1_ratio, fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, copy_X=True, tol=0.0001, rho=None, cv=self._n_folds)
        self.elasticNet.fit(tfidfArray,labels)

        for i,l1_ratio in enumerate(self._l1_ratio):
            for j,alpha in enumerate(self._alpha):
                print "alpha: %f, l1_ratio: %f --> %f" % (alpha,l1_ratio,np.mean(self.elasticNet.mse_path_[i,j,:]))

        print self.vectorizer.inverse_transform(self.elasticNet.coef_)
Developer: yk, Project: pml14publish, Lines of code: 65, Source file: commentsshell.py


Note: The sklearn.feature_extraction.text.TfidfVectorizer.get_params examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective authors, who retain copyright of the source code; please consult each project's license before redistributing or reusing it, and do not republish without permission.