This article collects typical usage examples of Python's sklearn.feature_extraction.text.TfidfVectorizer.max_features. If you are wondering what TfidfVectorizer.max_features does or how to use it in practice, the curated code examples below may help. Strictly speaking, max_features is a constructor parameter of TfidfVectorizer rather than a method. You can also read more about the containing class, sklearn.feature_extraction.text.TfidfVectorizer.
Four code examples of TfidfVectorizer.max_features are shown below, sorted by popularity by default.
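Since scikit-learn estimators expose their constructor parameters as public attributes, max_features can be set either at construction time or by assignment before fitting; the examples below all use the latter style. A minimal sketch of both (the toy corpus here is purely illustrative):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the quick brown fox", "the lazy dog", "the quick dog"]

# Style 1: pass max_features to the constructor
vec = TfidfVectorizer(max_features=2)

# Style 2: assign it as an attribute before fitting (the pattern in all examples below)
vec = TfidfVectorizer()
vec.max_features = 2

X = vec.fit_transform(docs)  # vocabulary capped at the 2 most frequent terms
print(X.shape)               # (3, 2)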
Example 1: to_vector
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import max_features [as alias]
# Also requires: from sklearn.decomposition import TruncatedSVD
def to_vector(self, title_list):
    # TF-IDF over the titles; `analyzer` is a module-level tokenizer defined elsewhere
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=self.MAX_DF)
    # Cap the vocabulary size by assigning the parameter as an attribute
    vectorizer.max_features = self.MAX_FEATURES
    vectorizer.fit(title_list)
    tf = vectorizer.transform(title_list)
    # Dimensionality reduction by LSA (truncated SVD)
    lsa = TruncatedSVD(self.LSA_DIM)
    lsa.fit(tf)
    tf = lsa.transform(tf)
    return tf, vectorizer, lsa
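Example 1 depends on names defined elsewhere in its project: a module-level analyzer (a MeCab-based Japanese tokenizer in the original) and the instance constants MAX_DF, MAX_FEATURES, and LSA_DIM. Below is a hypothetical, self-contained context in which the method runs; every name and value is an illustrative assumption, not part of the original source:

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

def analyzer(text):
    # Stand-in tokenizer; the original project tokenized Japanese with MeCab
    return text.split()

class TitleVectorizer:
    MAX_DF = 0.9        # drop terms that appear in more than 90% of titles
    MAX_FEATURES = 500  # vocabulary cap, applied via the attribute assignment
    LSA_DIM = 2         # kept tiny so the toy corpus below has enough terms

TitleVectorizer.to_vector = to_vector  # attach the function above as a method

titles = ["python tips", "machine learning news",
          "python machine learning", "deep learning in python"]
tf, vectorizer, lsa = TitleVectorizer().to_vector(titles)
print(tf.shape)  # (4, 2)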
Example 2: to_vector
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import max_features [as alias]
# Also requires: from sklearn.decomposition import TruncatedSVD
def to_vector(self, text_set, MAX_DF, MAX_FEATURES, LSA_DIM):
    '''Convert to a bag of words, then reduce the dimensionality.'''
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=MAX_DF, stop_words=stopwords)
    vectorizer.max_features = MAX_FEATURES
    # Fit and transform in one pass
    X = vectorizer.fit_transform(text_set)
    # LSA (truncated SVD) for dimensionality reduction
    lsa = TruncatedSVD(LSA_DIM)
    X = lsa.fit_transform(X)
    return X, lsa, vectorizer
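Compared with Example 1, this version threads the hyperparameters through as arguments and uses fit_transform, which is equivalent to fit followed by transform on the same data. One caveat worth noting: per the scikit-learn documentation, stop_words only takes effect when analyzer='word', so with a callable analyzer, as here, the stop-word list passed to the constructor is ignored and any filtering has to happen inside the analyzer itself.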
Example 3: transform_data
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import max_features [as alias]
# Also requires: pandas as pd, plus sklearn's TruncatedSVD and Normalizer
def transform_data(filename, MAX_DF=0.9, MAX_FEATURES=500, LSA_DIM=100):
    '''Read the file, tokenize the titles with MeCab, compress the
    dimensionality, and normalize. Returns the dataset and the title matrix.'''
    data = pd.read_csv(filename)
    # Collect the Title column as a list of strings
    title = []
    for i in data.index:
        title.append(data.loc[i, 'Title'])
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=MAX_DF, stop_words=stopwords)
    vectorizer.max_features = MAX_FEATURES
    X = vectorizer.fit_transform(title)
    # LSA (truncated SVD) for dimensionality reduction
    lsa = TruncatedSVD(LSA_DIM)
    X = lsa.fit_transform(X)
    # Rescale each row to unit L2 norm
    X = Normalizer(copy=False).fit_transform(X)
    return data, X
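The new step relative to Example 2 is the Normalizer, which rescales each row of the LSA output to unit L2 norm so that Euclidean distance between documents becomes a monotone function of their cosine similarity; this is a standard preparation for the k-means clustering in Example 4. A one-line illustration of the effect:

import numpy as np
from sklearn.preprocessing import Normalizer

row = np.array([[3.0, 4.0]])
print(Normalizer().fit_transform(row))  # [[0.6 0.8]] -- the L2 norm is now 1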
Example 4: main
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import max_features [as alias]
# Also requires: numpy as np, plus sklearn's TruncatedSVD, Normalizer, KMeans, MiniBatchKMeans
def main(filename):
    # load tweets
    tweets = get_tweets_from_csv(filename)
    # feature extraction
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=MAX_DF)
    vectorizer.max_features = MAX_FEATURES
    X = vectorizer.fit_transform(tweets)
    # dimensionality reduction by LSA, then row-wise L2 normalization
    lsa = TruncatedSVD(LSA_DIM)
    X = lsa.fit_transform(X)
    X = Normalizer(copy=False).fit_transform(X)
    # clustering by KMeans (mini-batch variant for large inputs)
    if MINIBATCH:
        km = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, init='k-means++', batch_size=1000,
                             n_init=10, max_no_improvement=10, verbose=True)
    else:
        km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1, verbose=True)
    km.fit(X)
    labels = km.labels_
    # transform() gives each sample's distance to every centroid;
    # pick out the distance to the sample's own cluster
    transformed = km.transform(X)
    dists = np.zeros(labels.shape)
    for i in range(len(labels)):
        dists[i] = transformed[i, labels[i]]
    # within each cluster, sort tweets by distance to the centroid (closest first)
    clusters = []
    for i in range(NUM_CLUSTERS):
        cluster = []
        ii = np.where(labels == i)[0]
        dd = dists[ii]
        di = np.vstack([dd, ii]).transpose().tolist()
        di.sort()
        for d, j in di:
            cluster.append(tweets[int(j)])
        clusters.append(cluster)
    return clusters
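The distance bookkeeping works because KMeans.transform returns, for each sample, its distance to every cluster center, so indexing row i at labels[i] yields the distance to that sample's own centroid; each cluster therefore comes back sorted from most to least central tweet. The function also assumes several module-level names that the original source elides. A hypothetical setup that would make it runnable; all names and values below are illustrative assumptions:

import csv
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer

MAX_DF = 0.9
MAX_FEATURES = 500
LSA_DIM = 100
NUM_CLUSTERS = 8
MINIBATCH = True

def analyzer(text):
    # Stand-in tokenizer; the original project tokenized Japanese with MeCab
    return text.split()

def get_tweets_from_csv(filename):
    # Assumed layout: one tweet text per row, in the first column
    with open(filename, newline='', encoding='utf-8') as f:
        return [row[0] for row in csv.reader(f) if row]

# Example usage (requires a suitable CSV file):
# clusters = main('tweets.csv')
# for i, cluster in enumerate(clusters):
#     print('cluster %d: %d tweets (closest first)' % (i, len(cluster)))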