This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.get_params. If you are wondering what TfidfVectorizer.get_params does in practice, or how to call it, the curated code examples below may help. You can also read more about its class, sklearn.feature_extraction.text.TfidfVectorizer.
The following shows 6 code examples of TfidfVectorizer.get_params, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
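Before the examples, here is a minimal orientation sketch (not taken from any of the examples below): get_params() is the standard scikit-learn estimator method that returns the vectorizer's constructor parameters as a dict, which several of the examples below extend with their own experiment metadata.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=2, stop_words='english', ngram_range=(1, 1))
params = vectorizer.get_params()      # dict of constructor arguments
print(params['min_df'])               # -> 2
print(params['stop_words'])           # -> 'english'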
Example 1: get_data_with_dandelion
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# or: from sklearn.feature_extraction.text.TfidfVectorizer import get_params [as alias]
def get_data_with_dandelion(self, relevance_threshold=0.75, min_df=2,
                            gamma=0.89, filter=False):
    only_text, ent, data = self.get_data_with_abstract_2(relevance_threshold)
    entities_sparse = sparse.csr_matrix(ent)
    tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                       max_features=200000,
                                       min_df=min_df,
                                       stop_words='english',
                                       strip_accents='unicode',
                                       use_idf=True,
                                       ngram_range=(1, 1),
                                       norm='l2',
                                       tokenizer=TextUtils.tokenize_and_stem)
    tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)
    print 'tf-idf matrix dimension: %s x %s' % (tfidf_matrix.shape[0],
                                                tfidf_matrix.shape[1])
    print 'entities matrix dimension: %s x %s' % (entities_sparse.shape[0],
                                                  entities_sparse.shape[1])
    print 'non-zero elements in entities matrix: %s' \
        % len(entities_sparse.data)
    # Down-weight the entity block by (1 - gamma); the tf-idf block is kept unscaled
    # before the two feature blocks are stacked side by side.
    tfidf_matrix = tfidf_matrix * 1
    entities_sparse = entities_sparse * (1 - gamma)
    f_score_dict = self.labels_dict(data)
    # get_params() returns the vectorizer's constructor arguments as a dict;
    # the experiment metadata below is added on top of it.
    params = tfidf_vectorizer.get_params()
    params['dandelion_entities'] = entities_sparse.shape[1]
    params['original_terms'] = tfidf_matrix.shape[1]  # vocabulary size of the tf-idf block
    params['gamma'] = gamma
    params['relevance_threshold'] = relevance_threshold
    params['classes'] = len(f_score_dict)
    params['tokenizer'] = 'TextUtils.tokenize_and_stem'  # store the name, not the function
    del params['dtype']  # drop the numpy dtype class before the dict is used as plain metadata
    params['avg_nnz_row'] = (entities_sparse > 0).sum(1).mean()
    return sparse.hstack([tfidf_matrix, entities_sparse]), f_score_dict, params
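The last block above illustrates a pattern that recurs on this page: get_params() returns the constructor arguments, but the dict contains a numpy dtype class and a tokenizer function object, so the example deletes dtype and stores the tokenizer's name as a string before returning the dict as experiment metadata. One plausible reason is that the dict is easier to log or serialize that way; a minimal sketch of that idea (json and the my_tokenizer stand-in are assumptions, not part of the original code):

import json
from sklearn.feature_extraction.text import TfidfVectorizer

def my_tokenizer(text):               # stand-in for something like TextUtils.tokenize_and_stem
    return text.split()

vectorizer = TfidfVectorizer(min_df=2, stop_words='english', tokenizer=my_tokenizer)
params = vectorizer.get_params()
params['tokenizer'] = 'my_tokenizer'  # replace the function object with its name
del params['dtype']                   # drop the numpy dtype class
print(json.dumps(params, sort_keys=True, indent=2))  # everything left is JSON-serializable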
Example 2: get_data_only_with_abstract
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# or: from sklearn.feature_extraction.text.TfidfVectorizer import get_params [as alias]
def get_data_only_with_abstract(self, relevance_threshold=0.75, min_df=0.01,
                                gamma=0.89, filter=False):
    only_text, ent, data = self.get_data_with_abstract_2(relevance_threshold)
    tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                       max_features=200000,
                                       min_df=min_df,
                                       stop_words='english',
                                       strip_accents='unicode',
                                       use_idf=True,
                                       ngram_range=(1, 1),
                                       norm='l2',
                                       tokenizer=TextUtils.tokenize_and_stem)
    tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)
    f_score_dict = self.labels_dict(data)
    params = tfidf_vectorizer.get_params()
    params['original_terms'] = tfidf_matrix.shape[1]  # vocabulary size of the tf-idf matrix
    params['gamma'] = gamma
    params['relevance_threshold'] = relevance_threshold
    params['classes'] = len(f_score_dict)
    params['tokenizer'] = 'TextUtils.tokenize_and_stem'
    return tfidf_matrix, f_score_dict, params
Example 3: get_data_only_with_entities
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# or: from sklearn.feature_extraction.text.TfidfVectorizer import get_params [as alias]
def get_data_only_with_entities(self, relevance_threshold=0.75, gamma=0.89, filter=False):
    data = self.mongo.get_all(order_by='id_doc')
    data = [doc for doc in data]
    only_text = [doc['text'] for doc in data]
    ent_dict, ent_set = self.get_dandelion_entities(data)
    if filter:
        entities_set = set([k for k, v in ent_dict.iteritems()])
    else:
        entities_set = ent_set
    entities = {e: i for i, e in enumerate(entities_set)}
    dandelion_entities = np.zeros((len(data), len(entities_set)))
    for doc in data[:]:
        text = doc['text']
        if 'dandelion' in doc:
            for e in doc['dandelion']['annotations']:
                rel = np.float64(e['confidence'])
                name = e['title']
                if rel > relevance_threshold:
                    dandelion_entities[doc['id_doc']][entities[name]] = rel
    entities_sparse = sparse.csr_matrix(dandelion_entities)
    tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                       max_features=200000,
                                       min_df=2,
                                       stop_words='english',
                                       strip_accents='unicode',
                                       use_idf=True,
                                       ngram_range=(1, 1),
                                       norm='l2',
                                       tokenizer=TextUtils.tokenize_and_stem)
    tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)
    print 'tf-idf matrix dimension: %s x %s' % (tfidf_matrix.shape[0],
                                                tfidf_matrix.shape[1])
    print 'entities matrix dimension: %s x %s' % (entities_sparse.shape[0],
                                                  entities_sparse.shape[1])
    print 'non-zero elements in entities matrix: %s' \
        % len(entities_sparse.data)
    # Down-weight the entity block by (1 - gamma); the tf-idf block is kept unscaled.
    tfidf_matrix = tfidf_matrix * 1
    entities_sparse = entities_sparse * (1 - gamma)
    f_score_dict = self.labels_dict(data)
    params = tfidf_vectorizer.get_params()
    params['dandelion_entities'] = entities_sparse.shape[1]
    params['original_terms'] = tfidf_matrix.shape[1]  # vocabulary size of the tf-idf block
    params['gamma'] = gamma
    params['relevance_threshold'] = relevance_threshold
    params['classes'] = len(f_score_dict)
    params['tokenizer'] = 'TextUtils.tokenize_and_stem'
    del params['dtype']
    params['avg_nnz_row'] = (entities_sparse > 0).sum(1).mean()
    return sparse.hstack([tfidf_matrix, entities_sparse]), f_score_dict, \
        params
Example 4: get_data_fabio
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# or: from sklearn.feature_extraction.text.TfidfVectorizer import get_params [as alias]
def get_data_fabio(self, gamma=0.89, rank_metric='r'):
    data = self.mongo.get_all(order_by='id_doc')
    data = [doc for doc in data]
    only_text = [doc['text'] for doc in data]
    entitySet = set()
    for d in data:
        if 'isa' in d:
            for e in d['isa']:
                entitySet.add(e['entity'])
    current = np.zeros((len(data), len(entitySet)), dtype=np.float)
    count = 0
    invIndex = {}
    countFeatures = 0
    for i, d in enumerate(data):
        if 'isa' in d:
            for f in d['isa']:
                if f['entity'] not in invIndex:
                    invIndex[f['entity']] = countFeatures
                    countFeatures += 1
                current[count, invIndex[f['entity']]] = f[rank_metric]
        count += 1
    current = np.nan_to_num(current)
    current_sparse = sparse.csr_matrix(current)
    tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                       max_features=200000,
                                       min_df=2,
                                       stop_words='english',
                                       strip_accents='unicode',
                                       use_idf=True,
                                       ngram_range=(1, 1),
                                       norm='l2',
                                       tokenizer=TextUtils.tokenize_and_stem)
    tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)
    print 'tf-idf matrix dimension: %s x %s' % (tfidf_matrix.shape[0],
                                                tfidf_matrix.shape[1])
    print 'entities matrix dimension: %s x %s' % (current_sparse.shape[0],
                                                  current_sparse.shape[1])
    print 'non-zero elements in entities matrix: %s' \
        % len(current_sparse.data)
    # Down-weight the entity block by (1 - gamma); the tf-idf block is kept unscaled.
    tfidf_matrix = tfidf_matrix * 1
    entities_sparse = current_sparse * (1 - gamma)
    f_score_dict = self.labels_dict(data)
    params = tfidf_vectorizer.get_params()
    params['dandelion_entities'] = entities_sparse.shape[1]
    params['original_terms'] = tfidf_matrix.shape[1]  # vocabulary size of the tf-idf block
    params['gamma'] = gamma
    params['rank_metric'] = rank_metric
    params['classes'] = len(f_score_dict)
    params['tokenizer'] = 'TextUtils.tokenize_and_stem'
    del params['dtype']
    params['avg_nnz_row'] = (entities_sparse > 0).sum(1).mean()
    return sparse.hstack([tfidf_matrix, entities_sparse]), f_score_dict, \
        params
Example 5: newsgroups
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# or: from sklearn.feature_extraction.text.TfidfVectorizer import get_params [as alias]
# 20 newsgroups (part of sklearn)
print "loading 20 newsgroups dataset..."
tic = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=0,
                             remove=('headers', 'footers', 'quotes'))
train_corpus = dataset.data  # a list of 11314 documents / entries
toc = time()
print "elapsed time: %.4f sec" % (toc - tic)

# Compute tf-idf (equivalent to CountVectorizer followed by TfidfTransformer):
# CountVectorizer produces the term-document matrix, and tf-idf scales the tf counts
# by roughly log(N / nt) (N: number of docs, nt: number of docs containing the word;
# the exact smoothed formula is checked in the short sketch after this example).
# If min_df / max_df are floats they are proportions of documents (min_df < nt/N < max_df);
# if ints they are absolute counts, e.g. min_df = 2.
# num_features and num_topics are assumed to be defined earlier in the script.
tfidf = TfidfVectorizer(max_features=num_features, max_df=0.95, min_df=2,
                        stop_words='english')
print "tfidf parameters:"
print tfidf.get_params()

# Generate the tf-idf term-document matrix
A_tfidf_sp = tfidf.fit_transform(train_corpus)  # size D x V
print "number of docs: %d" % A_tfidf_sp.shape[0]
print "dictionary size: %d" % A_tfidf_sp.shape[1]

# tf-idf dictionary
tfidf_dict = tfidf.get_feature_names()

# Fit the LDA model
print "Fitting LDA model..."
lda_vb = LatentDirichletAllocation(n_topics=num_topics, max_iter=10,
                                   learning_method='online', batch_size=512,
                                   random_state=0, n_jobs=-1)
tic = time()
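As a quick check of the weighting described in the comments of this example (a minimal sketch, not part of the original script): with scikit-learn's defaults (use_idf=True, smooth_idf=True) the idf factor is ln((1 + N) / (1 + nt)) + 1, and the fitted vectorizer exposes it through the idf_ attribute.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["apple banana", "apple cherry", "banana cherry", "apple apple"]
vec = TfidfVectorizer()                        # smooth_idf=True, use_idf=True by default
X = vec.fit_transform(docs)

N = X.shape[0]                                 # number of documents
nt = np.asarray((X > 0).sum(axis=0)).ravel()   # document frequency of each term
expected_idf = np.log((1.0 + N) / (1.0 + nt)) + 1.0
print(np.allclose(vec.idf_, expected_idf))     # -> True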
Example 6: CommentsAnalyzer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# or: from sklearn.feature_extraction.text.TfidfVectorizer import get_params [as alias]
class CommentsAnalyzer(pmlutil.Configurable):

    def configTypes(self):
        return dict(amount=int, min_ngram=int, max_ngram=int, min_df=int,
                    max_df=float, use_idf=int, alpha=readArray,
                    l1_ratio=readArray, n_folds=int)

    def _loadData(self):
        logging.info("loading data")
        self.data = []
        count = 0
        for fn in os.listdir(self._datafolder):
            if not self._amount < 1 and count >= self._amount:
                break
            if fn.endswith(self._metaextension):
                mfn = self._datafolder + "/" + fn
                ddm = pml.Datum(mfn, None)
                if len(ddm.meta()['comments']) > 0:
                    self.data.append(ddm)
                    count += 1
        logging.info("loaded %d data" % count)

    def __init__(self):
        self.data = []

    def _aggregateComments(self, subset):
        allcomments = []
        for datum in subset:
            comments = []
            for comment in datum.meta()['comments']:
                comments.append(comment['text'])
            allcomments.append(" ".join(comments))
        return np.array(allcomments)

    def _buildDictionary(self, allcomments):
        print allcomments
        self.vectorizer = TfidfVectorizer(analyzer=self._analyzer,
                                          ngram_range=(self._min_ngram, self._max_ngram),
                                          min_df=self._min_df, max_df=self._max_df,
                                          norm='l2', smooth_idf=True,
                                          use_idf=bool(self._use_idf))
        self.vectorizer.fit(allcomments)

    def run(self):
        allcomments = self._aggregateComments(self.data)
        self._buildDictionary(allcomments)
        # Create the tf-idf representation of the documents
        tfidfArray = self.vectorizer.transform(allcomments)
        # Create the labelling (number of favourites per datum)
        labels = []
        for datum in self.data:
            labels.append(len(datum.meta()['favorites']))
        labels = np.array(labels)
        print self.vectorizer.get_params()
        print self.vectorizer.get_feature_names()
        # Training
        self.elasticNet = ElasticNetCV(alphas=self._alpha, l1_ratio=self._l1_ratio,
                                       fit_intercept=True, normalize=False,
                                       precompute='auto', max_iter=1000, copy_X=True,
                                       tol=0.0001, rho=None, cv=self._n_folds)
        self.elasticNet.fit(tfidfArray, labels)
        for i, l1_ratio in enumerate(self._l1_ratio):
            for j, alpha in enumerate(self._alpha):
                print "alpha: %f, l1_ratio: %f --> %f" % (
                    alpha, l1_ratio, np.mean(self.elasticNet.mse_path_[i, j, :]))
        print self.vectorizer.inverse_transform(self.elasticNet.coef_)