This article collects typical usage examples of the Python method sklearn.decomposition.TruncatedSVD.fit_transform. If you have been wondering what TruncatedSVD.fit_transform does, how to use it, or what real-world code calling it looks like, the curated examples below should help. You can also read further about the containing class, sklearn.decomposition.TruncatedSVD.
Below are 15 code examples of TruncatedSVD.fit_transform, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
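Before the examples, here is a minimal, self-contained sketch of the method on synthetic data (illustration only, not taken from any example below):

# Minimal sketch: reduce a random 100x20 matrix to 5 components
import numpy as np
from sklearn.decomposition import TruncatedSVD

X = np.random.RandomState(0).rand(100, 20)
svd = TruncatedSVD(n_components=5, random_state=0)
X_reduced = svd.fit_transform(X)      # shape: (100, 5)
print(X_reduced.shape, svd.explained_variance_ratio_.sum())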
Example 1: cook
# Required import: from sklearn.decomposition import TruncatedSVD [as alias]
# Or: from sklearn.decomposition.TruncatedSVD import fit_transform [as alias]
def cook():
    # load_data and weighted are project-specific helpers (not shown here)
    x, y, weights = load_data()
    n_components = 200
    svd = TruncatedSVD(n_components, random_state=42)
    x_unweighted = svd.fit_transform(x)
    x_weighted = svd.fit_transform(weighted(x, weights))

    # Compare the two representations at training fractions from 1% to 9%
    for i in range(9):
        frac = 1 - (i * 0.01 + 0.01)
        print(frac)

        x_train, x_test, y_train, y_test = train_test_split(x_unweighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print("Unweighted: ", classifier.score(x_test, y_test))

        x_train, x_test, y_train, y_test = train_test_split(x_weighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print("Weighted: ", classifier.score(x_test, y_test))

        print('--------------------------')
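load_data and weighted are helpers from this example's own project and are not shown. A purely hypothetical reading of weighted is per-feature (column) scaling of the matrix before the SVD:

# Hypothetical sketch of the unshown `weighted` helper:
# scale each feature column by its weight (len(weights) == x.shape[1])
from scipy import sparse

def weighted(x, weights):
    return x @ sparse.diags(weights)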
Example 2: extract_tsne_gather_feat
# Required import: from sklearn.decomposition import TruncatedSVD [as alias]
# Or: from sklearn.decomposition.TruncatedSVD import fit_transform [as alias]
def extract_tsne_gather_feat(stage):
    """
    Extract t-SNE gather features.
    Note: Python 2 only (cPickle, bh_sne).
    Better than extract_tsne_feat in cross-validation, but worse in submission.
    """
    df_w2vlem_join = pd.read_csv('tmp2/df_w2vlem_join.csv', index_col=0)
    if stage <= 1:
        df_feat = pd.DataFrame(index=df_w2vlem_join.index.values)
        tfidf = TfidfVectorizer(ngram_range=(2, 4), stop_words='english', min_df=2)
        df_w2vlem_join['t_w2v'].to_csv('tmp2/t_w2v', index=False)
        df_w2vlem_join['q_w2v'].to_csv('tmp2/q_w2v', index=False)
        df_w2vlem_join['d_w2v'].to_csv('tmp2/d_w2v', index=False)
        # Fit on the dumped files, then switch back to in-memory input
        tfidf.set_params(input='filename')
        tfidf.fit(['tmp2/t_w2v', 'tmp2/q_w2v', 'tmp2/d_w2v'])
        tfidf.set_params(input='content')
        cPickle.dump(tfidf, open('tmp2/tfidf_obj', 'wb'))
    tfidf = cPickle.load(open('tmp2/tfidf_obj', 'rb'))
    X_t = tfidf.transform(df_w2vlem_join['t_w2v'].tolist())
    if stage <= 2:
        svd = TruncatedSVD(n_components=100, random_state=2016)
        X_svd = svd.fit_transform(X_t)
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = bh_sne(X_scaled)
        df_feat['tsne_t_1'] = X_tsne[:len(df_w2vlem_join), 0]
        df_feat['tsne_t_2'] = X_tsne[:len(df_w2vlem_join), 1]
        df_feat.to_csv('tmp2/tsne_t', index=False)
    df_feat = pd.read_csv('tmp2/tsne_t')
    if stage <= 3:
        print(df_feat)
        X_q = tfidf.transform(df_w2vlem_join['q_w2v'].tolist())
        X_tq = sp.hstack([X_t, X_q]).tocsr()
        svd = TruncatedSVD(n_components=50, random_state=2016)
        X_svd = svd.fit_transform(X_tq)
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = bh_sne(X_scaled)
        df_feat['tsne_qt_1'] = X_tsne[:len(df_w2vlem_join), 0]
        df_feat['tsne_qt_2'] = X_tsne[:len(df_w2vlem_join), 1]
        df_feat.to_csv('tmp2/tsne_qt', index=False)
    df_feat = pd.read_csv('tmp2/tsne_qt')
    if stage <= 4:
        print(df_feat)
        X_d = tfidf.transform(df_w2vlem_join['d_w2v'].tolist())
        svd = TruncatedSVD(n_components=100, random_state=2016)
        X_svd = svd.fit_transform(X_d)
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = bh_sne(X_scaled)
        df_feat['tsne_desc_1'] = X_tsne[:len(df_w2vlem_join), 0]
        df_feat['tsne_desc_2'] = X_tsne[:len(df_w2vlem_join), 1]
    df_tsne_feats = df_feat
    df_tsne_feats.to_csv('tmp2/df_tsne_gather_feats.csv')
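bh_sne comes from the Python-2-only tsne package, hence the docstring's note. Under Python 3 the same SVD-then-t-SNE step could be sketched with scikit-learn's own t-SNE; this is a substitute, not the author's code:

# Python 3 alternative sketch: sklearn's TSNE in place of bh_sne
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

def svd_tsne(X, n_components=100, random_state=2016):
    X_svd = TruncatedSVD(n_components=n_components,
                         random_state=random_state).fit_transform(X)
    X_scaled = StandardScaler().fit_transform(X_svd)
    return TSNE(n_components=2, random_state=random_state).fit_transform(X_scaled)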
Example 3: test_singular_values
# Required import: from sklearn.decomposition import TruncatedSVD [as alias]
# Or: from sklearn.decomposition.TruncatedSVD import fit_transform [as alias]
def test_singular_values():
    # Check that the TruncatedSVD output has the correct singular values

    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80

    X = rng.randn(n_samples, n_features)

    apca = TruncatedSVD(n_components=2, algorithm='arpack',
                        random_state=rng).fit(X)
    rpca = TruncatedSVD(n_components=2, algorithm='randomized',
                        random_state=rng).fit(X)
    assert_array_almost_equal(apca.singular_values_, rpca.singular_values_, 12)

    # Compare to the Frobenius norm
    X_apca = apca.transform(X)
    X_rpca = rpca.transform(X)
    assert_array_almost_equal(np.sum(apca.singular_values_**2.0),
                              np.linalg.norm(X_apca, "fro")**2.0, 12)
    assert_array_almost_equal(np.sum(rpca.singular_values_**2.0),
                              np.linalg.norm(X_rpca, "fro")**2.0, 12)

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(apca.singular_values_,
                              np.sqrt(np.sum(X_apca**2.0, axis=0)), 12)
    assert_array_almost_equal(rpca.singular_values_,
                              np.sqrt(np.sum(X_rpca**2.0, axis=0)), 12)

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = rng.randn(n_samples, n_features)

    apca = TruncatedSVD(n_components=3, algorithm='arpack',
                        random_state=rng)
    rpca = TruncatedSVD(n_components=3, algorithm='randomized',
                        random_state=rng)
    X_apca = apca.fit_transform(X)
    X_rpca = rpca.fit_transform(X)

    X_apca /= np.sqrt(np.sum(X_apca**2.0, axis=0))
    X_rpca /= np.sqrt(np.sum(X_rpca**2.0, axis=0))
    X_apca[:, 0] *= 3.142
    X_apca[:, 1] *= 2.718
    X_rpca[:, 0] *= 3.142
    X_rpca[:, 1] *= 2.718

    X_hat_apca = np.dot(X_apca, apca.components_)
    X_hat_rpca = np.dot(X_rpca, rpca.components_)
    apca.fit(X_hat_apca)
    rpca.fit(X_hat_rpca)
    assert_array_almost_equal(apca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(rpca.singular_values_, [3.142, 2.718, 1.0], 14)
Example 4: perform_emsamble_model
# Required import: from sklearn.decomposition import TruncatedSVD [as alias]
# Or: from sklearn.decomposition.TruncatedSVD import fit_transform [as alias]
def perform_emsamble_model():
    # Get the training data from the CSV file
    x, y_votes, y_comments, y_views, lat = read_train_data()
    # Convert to numpy arrays for easier handling
    y_votes = np.array(y_votes)
    y_comments = np.array(y_comments)
    y_views = np.array(y_views)
    # Get the test data
    x_test, ids, lat = read_test_data()
    # Parameters on these objects come from a grid search
    vec_votes = CountVectorizer(stop_words=None, strip_accents='unicode', analyzer='word', ngram_range=(1, 2), min_df=2)
    vec_comments = CountVectorizer(stop_words=None, strip_accents='unicode', analyzer='word', ngram_range=(1, 2), min_df=2)
    vec_views = CountVectorizer(stop_words=None, strip_accents='unicode', analyzer='word', ngram_range=(1, 2), min_df=2)
    # Transform x and x_test into document-term matrices to feed the classifiers
    x_votes = vec_votes.fit_transform(x)
    x_comments = vec_comments.fit_transform(x)
    x_views = vec_views.fit_transform(x)
    x_test_transformed_votes = vec_votes.transform(x_test)
    x_test_transformed_comments = vec_comments.transform(x_test)
    x_test_transformed_views = vec_views.transform(x_test)
    print("Term matrices generated")
    print("LSA transforming")
    lsa_votes = TruncatedSVD(500)
    lsa_comments = TruncatedSVD(500)
    lsa_views = TruncatedSVD(500)
    x_votes = lsa_votes.fit_transform(x_votes)
    print("LSA votes done..")
    print()
    x_comments = lsa_comments.fit_transform(x_comments)
    print("LSA comments done..")
    print()
    x_views = lsa_views.fit_transform(x_views)
    print("LSA views done..")
    print()
    x_test_transformed_votes = lsa_votes.transform(x_test_transformed_votes)
    x_test_transformed_comments = lsa_comments.transform(x_test_transformed_comments)
    x_test_transformed_views = lsa_views.transform(x_test_transformed_views)
    print("LSA finished..")
    ada_votes = AdaBoostClassifier(base_estimator=RandomForestClassifier())
    ada_comments = AdaBoostClassifier(base_estimator=RandomForestClassifier())
    ada_views = AdaBoostClassifier(base_estimator=RandomForestClassifier())
    ada_votes.fit(x_votes, y_votes)
    ada_comments.fit(x_comments, y_comments)
    ada_views.fit(x_views, y_views)
    print("Fitting done")
    print()
    # Predict the number of votes, comments, and views
    pred_votes = ada_votes.predict(x_test_transformed_votes)
    pred_comments = ada_comments.predict(x_test_transformed_comments)
    pred_views = ada_views.predict(x_test_transformed_views)
    # Generate the submission response CSV file
    create_csv_response(len(x_test), ids, pred_views, pred_votes, pred_comments)
Example 5: test_algorithms
# Required import: from sklearn.decomposition import TruncatedSVD [as alias]
# Or: from sklearn.decomposition.TruncatedSVD import fit_transform [as alias]
def test_algorithms():
    svd_a = TruncatedSVD(30, algorithm="arpack")
    svd_r = TruncatedSVD(30, algorithm="randomized", random_state=42)

    Xa = svd_a.fit_transform(X)[:, :6]
    Xr = svd_r.fit_transform(X)[:, :6]
    assert_array_almost_equal(Xa, Xr, decimal=5)

    comp_a = np.abs(svd_a.components_)
    comp_r = np.abs(svd_r.components_)
    # All elements are equal, but some elements are more equal than others.
    assert_array_almost_equal(comp_a[:9], comp_r[:9])
    assert_array_almost_equal(comp_a[9:], comp_r[9:], decimal=2)
Example 6: benchmark
# Required import: from sklearn.decomposition import TruncatedSVD [as alias]
# Or: from sklearn.decomposition.TruncatedSVD import fit_transform [as alias]
def benchmark(k, epochs):
    print("*" * 80)
    print("k: %d, epochs: %d\n" % (k, epochs))

    # select = SelectKBest(score_func=chi2, k=k)
    select = TruncatedSVD(n_components=k)
    X_train_trunc = select.fit_transform(X_train, Y_train)
    X_test_trunc = select.transform(X_test)

    print('done truncating')

    clf = DBN([X_train_trunc.shape[1], k, 4], learn_rates=0.3, learn_rate_decays=0.9, epochs=epochs, verbose=1)
    clf.fit(X_train_trunc, Y_train)
    pred = clf.predict(X_test_trunc)

    if CREATE_SUBMISSION:
        X_submit_trunc = select.transform(X_submit)
        pred_submit = clf.predict(X_submit_trunc)
        dump_csv(pred_submit, k, epochs)

    score = metrics.f1_score(Y_test, pred)
    print("f1-score: %0.3f" % score)
    print("classification report:")
    print(metrics.classification_report(Y_test, pred))
    print("confusion matrix:")
    print(metrics.confusion_matrix(Y_test, pred))
Example 7: benchmark
# Required import: from sklearn.decomposition import TruncatedSVD [as alias]
# Or: from sklearn.decomposition.TruncatedSVD import fit_transform [as alias]
def benchmark(k, epochs):
    print("*" * 80)
    print("k: %d, epochs: %d\n" % (k, epochs))

    # select = SelectKBest(score_func=chi2, k=k)
    select = TruncatedSVD(n_components=k)
    X_train_trunc = select.fit_transform(X_train, Y_train)
    X_test_trunc = select.transform(X_test)

    print('done truncating')

    parameters = {'C': [1, 10, 100, 1000, 10000], 'class_weight': ['auto', None], 'tol': [0.001, 0.0001]}
    clf = LinearSVC(C=100000)
    # clf = grid_search.GridSearchCV(svc, parameters)
    clf.fit(X_train_trunc, Y_train)
    pred = clf.predict(X_test_trunc)

    if CREATE_SUBMISSION:
        X_submit_trunc = select.transform(X_submit)
        pred_submit = clf.predict(X_submit_trunc)
        dump_csv(pred_submit, k, epochs)

    score = metrics.f1_score(Y_test, pred)
    print("f1-score: %0.3f" % score)
    print("classification report:")
    print(metrics.classification_report(Y_test, pred))
    print("confusion matrix:")
    print(metrics.confusion_matrix(Y_test, pred))
Example 8: tfIDFeats
# Required import: from sklearn.decomposition import TruncatedSVD [as alias]
# Or: from sklearn.decomposition.TruncatedSVD import fit_transform [as alias]
def tfIDFeats(ids, data):
    # The infamous tfidf vectorizer (do you remember this one?)
    tfv = TfidfVectorizer(min_df=3, max_features=None,
                          strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                          ngram_range=(1, 5), use_idf=1, smooth_idf=1, sublinear_tf=1,
                          stop_words='english')
    # Fit TFIDF
    tfv.fit(data)
    X = tfv.transform(data)

    # Initialize SVD
    svd = TruncatedSVD(n_components=350)
    # Initialize the standard scaler
    scl = StandardScaler(with_mean=False)

    # Only reduce dimensionality when there are more than 350 features
    if X.shape[1] > 350:
        X = svd.fit_transform(X)
    X = scl.fit_transform(X, ids)
    if plotData:
        X = PCA(n_components=2).fit_transform(X)
    return (X, ids)
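The shape check above matters because TruncatedSVD requires n_components to be strictly smaller than the number of input features. A more general guard (a hypothetical variant, not the author's code) clamps the component count instead of skipping the reduction:

# Hypothetical guard: never request more components than features allow
n_comp = min(350, X_tfidf.shape[1] - 1)   # X_tfidf: any TF-IDF matrix
X_reduced = TruncatedSVD(n_components=n_comp).fit_transform(X_tfidf)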
Example 9: preprocess
# Required import: from sklearn.decomposition import TruncatedSVD [as alias]
# Or: from sklearn.decomposition.TruncatedSVD import fit_transform [as alias]
def preprocess(data, n_components, use_tf_idf=True):
    """
    Preprocess the data for clustering by running SVD and
    normalizing the results. This process is also known as LSA.

    arguments:
    data -- Dataset; if use_tf_idf is True, the object must contain a
            tf_idf table alongside a raw frequencies dataframe.
    n_components -- int, the number of components to use for the SVD;
            a minimum of 100 is recommended.
    use_tf_idf -- bool, whether to use the tf-idf frequencies for the
            preprocessing.

    returns:
    e -- float, a measure of variance explained by the SVD.
    X -- np.array, an array with the data reduced to n_components.
    """
    if use_tf_idf:
        d = data.tf_idf.as_matrix()
    else:
        d = data.df.as_matrix()
    svd = TruncatedSVD(n_components=n_components)
    X = svd.fit_transform(d)
    norm = Normalizer()
    # Record a measure of explained variance
    e = svd.explained_variance_ratio_.sum() * 100
    # Normalize the SVD output (not the raw input) before returning
    return e, norm.fit_transform(X)
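The same SVD-plus-normalization (LSA) preprocessing can be expressed as a single scikit-learn pipeline; a short sketch, assuming any sparse TF-IDF matrix as input:

# LSA preprocessing as a pipeline (sketch)
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
# X_lsa = lsa.fit_transform(tfidf_matrix)   # tfidf_matrix is an assumed input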
Example 10: solve
# Required import: from sklearn.decomposition import TruncatedSVD [as alias]
# Or: from sklearn.decomposition.TruncatedSVD import fit_transform [as alias]
def solve(self, X, missing_mask):
    observed_mask = ~missing_mask
    X_filled = X
    for i in range(self.max_iters):
        # Deviation from the original svdImpute algorithm:
        # gradually increase the rank of the approximation
        if self.gradual_rank_increase:
            curr_rank = min(2 ** i, self.rank)
        else:
            curr_rank = self.rank
        tsvd = TruncatedSVD(curr_rank, algorithm=self.svd_algorithm)
        X_reduced = tsvd.fit_transform(X_filled)
        X_reconstructed = tsvd.inverse_transform(X_reduced)
        X_reconstructed = self.clip(X_reconstructed)
        mae = masked_mae(
            X_true=X,
            X_pred=X_reconstructed,
            mask=observed_mask)
        if self.verbose:
            print(
                "[IterativeSVD] Iter %d: observed MAE=%0.6f" % (
                    i + 1, mae))
        converged = self._converged(
            X_old=X_filled,
            X_new=X_reconstructed,
            missing_mask=missing_mask)
        X_filled[missing_mask] = X_reconstructed[missing_mask]
        if converged:
            break
    return X_filled
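solve is a method of an iterative SVD imputer; the surrounding class (with rank, max_iters, clip, _converged, and svd_algorithm) is not shown. Stripped of those helpers, one imputation pass looks roughly like this standalone sketch:

# Standalone sketch of one SVD-imputation pass (assumptions: dense X,
# boolean missing_mask, missing entries pre-filled with e.g. column means)
import numpy as np
from sklearn.decomposition import TruncatedSVD

def svd_impute_step(X_filled, missing_mask, rank):
    tsvd = TruncatedSVD(n_components=rank)
    X_reconstructed = tsvd.inverse_transform(tsvd.fit_transform(X_filled))
    X_next = X_filled.copy()
    X_next[missing_mask] = X_reconstructed[missing_mask]
    return X_next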
Example 11: main
# Required import: from sklearn.decomposition import TruncatedSVD [as alias]
# Or: from sklearn.decomposition.TruncatedSVD import fit_transform [as alias]
def main():
    svd = TruncatedSVD()
    Z = svd.fit_transform(X)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(D):
        plt.annotate(index_word_map[i], xy=(Z[i, 0], Z[i, 1]))
    plt.show()
Example 12: prepare_query_vectors
# Required import: from sklearn.decomposition import TruncatedSVD [as alias]
# Or: from sklearn.decomposition.TruncatedSVD import fit_transform [as alias]
def prepare_query_vectors(df_train, df_test):
    search_term = df_train[key_query].values
    search_term = np.concatenate([search_term, df_test[key_query].values])
    count = TfidfVectorizer(tokenizer=lambda x: x.split(' '))
    count.fit(search_term)
    vocab = list(count.vocabulary_.keys())
    sparse = count.transform(vocab)
    tsvd = TruncatedSVD(n_components=50, random_state=configs['seed'])
    vocab_vectors = tsvd.fit_transform(sparse)
    word_to_nnindex = {}
    for i, word in enumerate(vocab):
        word_to_nnindex[word] = i + 1

    # Convert words to indexes
    query_indexes = []
    for i, query in enumerate(search_term):
        words = query.split(' ')
        indexes = []
        for word in words:
            indexes.append(word_to_nnindex[word])
        query_indexes.append(indexes)

    # Pad the rest of each query with zeros
    max_words = max([len(q) for q in query_indexes])
    for i, query_ind in enumerate(query_indexes):
        l = len(query_ind)
        query_indexes[i].extend([0] * (max_words - l))

    query_train = np.array(query_indexes[:len(df_train)], dtype=np.int32)
    query_test = np.array(query_indexes[len(df_train):], dtype=np.int32)
    return vocab_vectors, query_train, query_test
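The manual zero-padding loop is equivalent to post-padding with Keras, if that dependency is acceptable (an alternative sketch, not part of the original):

# Alternative sketch: zero post-padding via Keras
from keras.preprocessing.sequence import pad_sequences

padded = pad_sequences(query_indexes, padding='post', value=0)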
Example 13: td_svd_q
# Required import: from sklearn.decomposition import TruncatedSVD [as alias]
# Or: from sklearn.decomposition.TruncatedSVD import fit_transform [as alias]
def td_svd_q(features):
    x_train = csr_matrix(hstack((
        features.title_tf_idf_train,
        features.description_tf_idf_train
    )))
    x_test = csr_matrix(hstack((
        features.title_tf_idf_test,
        features.description_tf_idf_test
    )))

    svd = TruncatedSVD(n_components=250, n_iter=5)
    x_train = svd.fit_transform(x_train)
    x_test = svd.transform(x_test)

    x_train = np.hstack((
        x_train,
        features.queries_tf_idf_train.toarray(),
        features.features_train
    ))
    x_test = np.hstack((
        x_test,
        features.queries_tf_idf_test.toarray(),
        features.features_test
    ))

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    y_train = features.train['median_relevance'].values
    return x_train, x_test, y_train
Example 14: lsa_summarizer
# Required import: from sklearn.decomposition import TruncatedSVD [as alias]
# Or: from sklearn.decomposition.TruncatedSVD import fit_transform [as alias]
def lsa_summarizer(text, num_sen=5):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())

    tfvectorizer = TfidfVectorizer(tokenizer=tokenizeText)
    sparse = tfvectorizer.fit_transform(sentenceTokens).A

    # Project every sentence onto the first latent concept
    lsa = TruncatedSVD(n_components=1)
    concept = lsa.fit_transform(sparse)

    # Rank sentences by concept strength, keep the top num_sen,
    # then restore the original sentence order
    pos = np.array(list(range(len(sentenceTokens))))
    listlist = [list(x) for x in zip(sentenceTokens, concept, pos)]
    listlist.sort(key=lambda x: x[1], reverse=True)
    summarysentences = listlist[0:num_sen]
    summarysentences.sort(key=lambda x: x[2], reverse=False)

    summary = ""
    for n in range(num_sen):
        summary += ' ' + summarysentences[n][0]
    summary = " ".join(summary.replace(u"\xa0", u" ").strip().split())
    return summary
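A possible invocation, assuming tokenizeText is defined and the NLTK punkt model has been downloaded (hypothetical usage):

# Hypothetical usage of lsa_summarizer
text = open('article.txt').read()   # article.txt is an assumed input file
print(lsa_summarizer(text, num_sen=3))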
Example 15: kfold
# Required import: from sklearn.decomposition import TruncatedSVD [as alias]
# Or: from sklearn.decomposition.TruncatedSVD import fit_transform [as alias]
def kfold(agetext, k, model, k2):
    import collections
    out = []
    for i in range(k):
        print("iteration: " + str(i))
        agetext = shuffle(agetext)
        datatb = agetext.iloc[:, 1:]
        label = agetext["agegroup"].tolist()
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            datatb, label, test_size=0.15, random_state=i * 6)
        data = X_train.values
        counter = collections.Counter(y_train)
        print(counter)
        testdata = X_test.values

        # LSA: SVD followed by L2 normalization, fit on the training fold only
        lsa = TruncatedSVD(k2, algorithm='arpack')
        normalizer = Normalizer(copy=False)
        X = lsa.fit_transform(data)
        X = normalizer.fit_transform(X)
        X_test = lsa.transform(testdata)
        X_test = normalizer.transform(X_test)

        model.fit(X, y_train)
        pred = model.predict(X_test)
        counter = collections.Counter(y_test)
        print(counter)
        counter = collections.Counter(pred)
        print(counter)
        out.append(round(accuracy_score(y_test, pred), 5))
    print(str(out))
    print(np.mean(out))