This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.set_params. If you are unsure what TfidfVectorizer.set_params does, or how and when to use it, the curated code samples below should help. You can also explore further usage of the class it belongs to, sklearn.feature_extraction.text.TfidfVectorizer.
The following shows 7 code examples of TfidfVectorizer.set_params, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
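Before the examples, a quick illustrative sketch (not taken from any of the repositories below) of what set_params does: it updates the estimator's parameters in place and returns the estimator itself, so the change takes effect on subsequent fit/transform calls.

from sklearn.feature_extraction.text import TfidfVectorizer

# Build a vectorizer, then adjust its parameters afterwards with set_params.
tfidf = TfidfVectorizer(ngram_range=(1, 2))
tfidf.set_params(stop_words='english', max_features=5000)  # returns the estimator itself

docs = ["the quick brown fox", "jumps over the lazy dog"]
X = tfidf.fit_transform(docs)  # fitting now uses the updated parameters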
Example 1: extract_tfidf_nmf_feats
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import set_params [as alias]
def extract_tfidf_nmf_feats(self, df_data, n_components):
    """
    Extract tfidf features using NMF.
    """
    # Assumes: import pandas as pd; from sklearn.decomposition import NMF, TruncatedSVD
    df_feat = pd.DataFrame(index=range(df_data.shape[0]))
    tfidf = TfidfVectorizer(ngram_range=(2, 3), stop_words='english')
    tsvd = TruncatedSVD(n_components=n_components, random_state=2016)  # note: created but not used below
    nmf = NMF(solver='cd', n_components=n_components, init='nndsvda',
              random_state=0, tol=1e-3)
    # Dump the text columns to files so tfidf can be fitted with input='filename'
    df_data['q'].to_csv('q', index=False)
    df_data['t'].to_csv('t', index=False)
    df_data['d'].to_csv('d', index=False)
    print('fitting in tfidf')
    tfidf.set_params(input='filename')
    tfidf.fit(['q', 't', 'd'])
    tfidf.set_params(input='content')
    for col in ['d', 't', 'q', 'b']:
        print('process column', col)
        txt = df_data[col]
        tfidf_mat = tfidf.transform(txt)
        nd_feat = nmf.fit_transform(tfidf_mat)
        tmp = pd.DataFrame(nd_feat, columns=[col+'_tfidf_nmf_comp'+str(i)
                                             for i in range(n_components)])
        df_feat = pd.merge(df_feat, tmp, left_index=True, right_index=True)
    saveit(df_feat, 'df_tfidf_nmf_feats')
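A note on the pattern above, which several of the examples below reuse: with input='filename', fit() treats each element of the list as a path and reads the training text from disk, which keeps memory low while the vocabulary and IDF weights are learned; switching back to input='content' lets the later transform() calls accept in-memory strings directly.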
Example 2: nmf_topic_extraction
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import set_params [as alias]
def nmf_topic_extraction(corpus, bv_stop_tokens, n_features=5000, n_top_words=5, n_topics=3, data={}):
    n_samples = len(corpus)
    # ensure we don't ask for more topics than we have samples
    # (this happens when there are only a few bible verses in a cluster)
    n_topics = min(n_samples, n_topics)
    if n_topics == 2:
        n_topics = 1
    # vectorize the tweet text using the most common word
    # frequency with TF-IDF weighting (without the top 5% stop words)
    vectorizer = TfidfVectorizer(max_features=n_features,
                                 ngram_range=(2, 2))
    stoplist = ['retweet', 'rt', 'http', 'things', 'christ', 'lord', 'god', 'shall', 'jesus',
                'nlt', 'kjv', 'prov']
    try:
        vectorizer.set_params(stop_words=set(list(ENGLISH_STOP_WORDS) + stoplist + bv_stop_tokens))
        counts = vectorizer.fit_transform(corpus)
        tfidf = TfidfTransformer().fit_transform(counts)
        feature_names = vectorizer.get_feature_names()
    except Exception as ex:
        logger.exception("Tfidf analysis failed ex={}, bv={}, data={}, n_topics={}".format(ex, bv_stop_tokens, data, n_topics))
        return []
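The snippet stops right after the TF-IDF step; the NMF part of the function is not shown on this page. A minimal sketch of how the topic extraction could continue, reusing the names defined above (the original repository's code may differ):

from sklearn.decomposition import NMF

# Factorize the TF-IDF matrix into n_topics topics.
nmf = NMF(n_components=n_topics, random_state=1)
nmf.fit(tfidf)

# For each topic, collect the n_top_words highest-weighted terms.
topics = []
for topic in nmf.components_:
    topics.append([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])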
Example 3: extract_tsne_gather_feat
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import set_params [as alias]
def extract_tsne_gather_feat(stage):
    """
    Extract tsne gather features.
    Note: python2 only.
    Better than func:extract_tsne_feat in cv, but worse in submission.
    """
    df_w2vlem_join = pd.read_csv('tmp2/df_w2vlem_join.csv', index_col=0)
    if stage <= 1:
        df_feat = pd.DataFrame(index=df_w2vlem_join.index.values)
        tfidf = TfidfVectorizer(ngram_range=(2, 4), stop_words='english', min_df=2)
        df_w2vlem_join['t_w2v'].to_csv('tmp2/t_w2v', index=False)
        df_w2vlem_join['q_w2v'].to_csv('tmp2/q_w2v', index=False)
        df_w2vlem_join['d_w2v'].to_csv('tmp2/d_w2v', index=False)
        # fit on the dumped files, then switch back to in-memory content for transform
        tfidf.set_params(input='filename')
        tfidf.fit(['tmp2/t_w2v', 'tmp2/q_w2v', 'tmp2/d_w2v'])
        tfidf.set_params(input='content')
        cPickle.dump(tfidf, open('tmp2/tfidf_obj', 'wb'))
    tfidf = cPickle.load(open('tmp2/tfidf_obj', 'rb'))
    X_t = tfidf.transform(df_w2vlem_join['t_w2v'].tolist())
    if stage <= 2:
        svd = TruncatedSVD(n_components=100, random_state=2016)
        X_svd = svd.fit_transform(X_t)
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = bh_sne(X_scaled)
        df_feat['tsne_t_1'] = X_tsne[:len(df_w2vlem_join), 0]
        df_feat['tsne_t_2'] = X_tsne[:len(df_w2vlem_join), 1]
        df_feat.to_csv('tmp2/tsne_t', index=False)
    df_feat = pd.read_csv('tmp2/tsne_t')
    if stage <= 3:
        print(df_feat)
        X_q = tfidf.transform(df_w2vlem_join['q_w2v'].tolist())
        X_tq = sp.hstack([X_t, X_q]).tocsr()
        svd = TruncatedSVD(n_components=50, random_state=2016)
        X_svd = svd.fit_transform(X_tq)
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = bh_sne(X_scaled)
        df_feat['tsne_qt_1'] = X_tsne[:len(df_w2vlem_join), 0]
        df_feat['tsne_qt_2'] = X_tsne[:len(df_w2vlem_join), 1]
        df_feat.to_csv('tmp2/tsne_qt', index=False)
    df_feat = pd.read_csv('tmp2/tsne_qt')
    if stage <= 4:
        print(df_feat)
        X_d = tfidf.transform(df_w2vlem_join['d_w2v'].tolist())
        svd = TruncatedSVD(n_components=100, random_state=2016)
        X_svd = svd.fit_transform(X_d)
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = bh_sne(X_scaled)
        df_feat['tsne_desc_1'] = X_tsne[:len(df_w2vlem_join), 0]
        df_feat['tsne_desc_2'] = X_tsne[:len(df_w2vlem_join), 1]
    df_tsne_feats = df_feat
    df_tsne_feats.to_csv('tmp2/df_tsne_gather_feats.csv')
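A note on dependencies: bh_sne is presumably the Barnes-Hut t-SNE binding from the old tsne package; together with cPickle, that is most likely why the docstring flags this function as Python 2 only.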
Example 4: GroceryFeatureGenerator
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import set_params [as alias]
class GroceryFeatureGenerator(object):
    # Python 2 style code (uses cPickle and str.decode).
    #def __init__(self, name):
    def __init__(self):
        #self.name = name
        self.stop_words = set([])
        self.tfidf = TfidfVectorizer(max_features=4000,
                                     ngram_range=(1, 3), sublinear_tf=True)

    def settfidf(self, stopwords_path=None, max_features=4000):
        if stopwords_path is not None:
            self.get_stopwords(stopwords_path)
            self.tfidf.set_params(stop_words=list(self.stop_words))
        if max_features != 4000:
            self.tfidf.set_params(max_features=max_features)
        return self

    def get_stopwords(self, stop_words_path):
        try:
            with open(stop_words_path, 'r') as fin:
                contents = fin.read().decode('utf-8')
        except IOError:
            raise ValueError("the given stop words path %s is invalid." % (stop_words_path))
        for line in contents.splitlines():
            self.stop_words.add(line.strip())
        print("\nsuccess in getting stopwords\n")

    def fit_transform(self, path, textlist):
        #self.settfidf('resource/stop_words.txt')
        self.settfidf(os.path.join(path, 'stop_words.txt'))
        #if isinstance(textlist, list):
        if 1:
            tf = self.tfidf.fit_transform(textlist)
            #with open(self.tfidfpath, 'w') as fout:
            #    pickle.dump(tf)
            return tf

    # def load_tfidf(self):
    #     try:
    #         with open(self.tfidfpath, 'r') as fin:
    #             self.tfidf = pickle.load(fin)
    #     except IOError:
    #         raise ValueError("the %s path is invalid." % (self.tfidfpath))

    def transform(self, textlist):
        #self.load_tfidf()
        if isinstance(textlist, list):
            return self.tfidf.transform(textlist)

    def save(self, dest_file):
        config = self.tfidf
        cPickle.dump(config, open(dest_file, 'w'), -1)

    def load(self, src_file):
        self.tfidf = cPickle.load(open(src_file, 'r'))
        return self
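A hypothetical usage of the class (the path and documents are illustrative; fit_transform expects a stop_words.txt file under the given directory):

gen = GroceryFeatureGenerator()
docs = [u"fresh organic apples", u"frozen pizza on sale"]
X = gen.fit_transform('resource', docs)        # reads resource/stop_words.txt for stop words
gen.save('tfidf.pkl')                          # persist the fitted vectorizer
X_again = gen.load('tfidf.pkl').transform(docs)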
Example 5: word_vectorizer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import set_params [as alias]
def word_vectorizer(isScore, isStemming, **kwargs):
    params = {"strip_accents": 'unicode', "max_features": 1500}
    if isScore:
        vect = TfidfVectorizer()
        if 'scoreType' in kwargs:
            # Setting Score Type
            if kwargs['scoreType'] == 'TF':
                params["use_idf"] = False
            elif kwargs['scoreType'] == 'TF-IDF':
                params["use_idf"] = True
    else:
        vect = CountVectorizer()
        if 'isBinary' in kwargs:
            # Setting Binary Frequency
            if kwargs['isBinary']:
                params["binary"] = True
            else:
                params["binary"] = False
    if 'stopwordPath' in kwargs:
        # Reading stopwords
        with open(kwargs.get('stopwordPath')) as f:
            sw = f.read().splitlines()
        params["stop_words"] = sw
    if isStemming:
        params["tokenizer"] = stem_tokenize
    else:
        params["tokenizer"] = tokenize
    # print(params)
    vect.set_params(**params)
    return vect
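A hypothetical call, assuming the repository's tokenize/stem_tokenize helpers are importable and that 'stopwords.txt' exists (both names are illustrative):

vect = word_vectorizer(True, False, scoreType='TF-IDF', stopwordPath='stopwords.txt')
X = vect.fit_transform(["some training text", "another document"])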
Example 6: word_vectorizer2
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import set_params [as alias]
def word_vectorizer2(**args):
    params = {"strip_accents": 'unicode'}
    prefijo = ''
    # Set the weighting scheme for the vectorized words
    if 'scoreType' in args:
        if args['scoreType'] == 'TFIDF':
            vector = TfidfVectorizer()
            params["use_idf"] = True
            prefijo += args.get('scoreType') + '_'
        elif args['scoreType'] == 'TF':
            vector = TfidfVectorizer()
            params["use_idf"] = False
            prefijo += args.get('scoreType') + '_'
        elif args['scoreType'] == 'BTO':
            vector = CountVectorizer()
            params["binary"] = True
            prefijo += args.get('scoreType') + '_'
        elif args['scoreType'] == 'TO':
            vector = CountVectorizer()
            params["binary"] = False
            prefijo += args.get('scoreType') + '_'
        else:
            vector = CountVectorizer()
            params["binary"] = False
            prefijo += args.get('scoreType') + '_'
            print('WARNING: invalid weighting type, TO will be used')
    else:
        vector = CountVectorizer()
        params["binary"] = False
        prefijo += 'TO_'
        print('WARNING: weighting type not specified, TO will be used')
    # Set the size of the vocabulary
    if 'maxFeatures' in args:
        params["max_features"] = args.get('maxFeatures')
    else:
        params["max_features"] = 1500
        #print('WARNING: no maximum vocabulary size specified, 1500 will be used')
    # Set stemming
    if 'stemming' in args:
        if args['stemming'] == 'spanish':
            params["tokenizer"] = spanish_tokenize
            prefijo += 'SpanishStem_'
        elif args['stemming'] == 'english':
            params["tokenizer"] = english_tokenize
            prefijo += 'EnglishStem_'
        else:
            params["tokenizer"] = tokenize
            prefijo += 'NoStem_'
            print('WARNING: invalid stemmer, no stemming will be performed')
    else:
        params["tokenizer"] = tokenize
        prefijo += 'NoStem_'
    # Set the stop-word list (optional)
    if 'stopwordPath' in args:
        # Reading stopwords
        with open(args.get('stopwordPath')) as f:
            sw = f.read().splitlines()
        params["stop_words"] = sw
        prefijo += 'Stopwords'
    else:
        prefijo += 'NoStopwords'
    # print(params)
    vector.set_params(**params)
    return vector, prefijo
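A hypothetical call (the corpus and parameter values are illustrative; the spanish_tokenize helper must exist in the module):

vector, prefijo = word_vectorizer2(scoreType='TFIDF', maxFeatures=2000, stemming='spanish')
X = vector.fit_transform(corpus)   # corpus: a list of documents
# prefijo is now 'TFIDF_SpanishStem_NoStopwords', handy for naming output files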
# def vectorizer1(object):
# cv = CountVectorizer(strip_accents='unicode', max_features=1500, tokenizer=tokenize)
# X_train_counts = cv.fit_transform(object.data)
# X_words = object.target
# X_labels = object.target_names
#
# return X_train_counts, X_words, X_labels
#
#
# def vectorizer2(object, stopwords):
# fsw = open(stopwords, 'r')
# sw = fsw.readlines()
# fsw.close()
#
# cv = CountVectorizer(strip_accents='unicode', max_features=1500, tokenizer=tokenize, stop_words=sw)
# X_train_counts = cv.fit_transform(object.data)
# X_words = object.target
# X_labels = object.target_names
#
# return X_train_counts, X_words, X_labels
#
#
# def vectorizer3(object):
# cv = CountVectorizer(strip_accents='unicode', max_features=1500, tokenizer=stem_tokenize)
# X_train_counts = cv.fit_transform(object.data)
# X_words = object.target
# X_labels = object.target_names
#......... remaining code omitted .........
Example 7: extract_cosinedist_feat
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import set_params [as alias]
def extract_cosinedist_feat(self, df_data, df_w2vlem_join):
    """
    Extract cosine distance features.
    Note: this func is very slow. It can take a few hours.
    """
    df_feat = pd.DataFrame(index=df_data.index.values)
    tfv = TfidfVectorizer(ngram_range=(2, 3), min_df=2)
    print('computing qt_w2v_cosdist')
    df_w2vlem_join['q_w2v'].to_csv('q_w2v', index=False)
    df_w2vlem_join['t_w2v'].to_csv('t_w2v', index=False)
    tfv.set_params(input='filename')
    tfv.fit(['q_w2v', 't_w2v'])  # list(df_w2vlem_join['q_w2v'].values)+list(df_w2vlem_join['t_w2v'].values)
    tfv.set_params(input='content')
    print('done fitting')
    qt_unigram_func = lambda x: self._cosine_dist(x['q_w2v'], x['t_w2v'], tfv)
    df_feat['qt_w2v_cosdist'] = df_w2vlem_join.apply(qt_unigram_func, axis=1)
    if True:  # You can skip this feature because it costs too much time (more than 24 hours).
        print('computing qd_w2v_cosdist')
        df_w2vlem_join['d_w2v'].to_csv('d_w2v', index=False)
        tfv.set_params(input='filename')
        tfv.fit(['q_w2v', 'd_w2v'])  # list(df_w2vlem_join['q_w2v'].values)+list(df_w2vlem_join['d_w2v'].values)
        tfv.set_params(input='content')
        print('done fitting')
        qd_unigram_func = lambda x: self._cosine_dist(x['q_w2v'], x['d_w2v'], tfv)
        df_feat['qd_w2v_cosdist'] = df_w2vlem_join.apply(qd_unigram_func, axis=1)
    print('computing qt_cosdist')
    df_data['q'].to_csv('q', index=False)
    df_data['t'].to_csv('t', index=False)
    tfv.set_params(input='filename')
    tfv.fit(['q', 't'])  # list(df_data['q'].values) + list(df_data['t'].values)
    tfv.set_params(input='content')
    print('done fitting')
    qt_func = lambda x: self._cosine_dist(x['q'], x['t'], tfv)
    df_feat['qt_cosdist'] = df_data.apply(qt_func, axis=1)
    df_cosdist_feats = df_feat
    saveit(df_cosdist_feats, 'df_cosdist_feats')
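The helper self._cosine_dist is not shown on this page. A minimal sketch of what such a helper typically looks like (a guess, not code from the repository): transform the two strings with the fitted vectorizer and compare the resulting TF-IDF vectors.

from sklearn.metrics.pairwise import cosine_similarity

def _cosine_dist(self, text_a, text_b, tfv):
    # Transform both strings with the already-fitted vectorizer and compare them.
    vecs = tfv.transform([text_a, text_b])
    # Returns cosine similarity; the original may return 1 - similarity as a distance.
    return cosine_similarity(vecs[0], vecs[1])[0][0]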