本文整理汇总了Python中sklearn.decomposition.TruncatedSVD类的典型用法代码示例。如果您正苦于以下问题:Python TruncatedSVD类的具体用法?Python TruncatedSVD怎么用?Python TruncatedSVD使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了TruncatedSVD类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: tfIDFeats
def tfIDFeats(ids, data):
    """Turn raw text `data` into dense TF-IDF/SVD features.

    Fits a word-level TF-IDF vectorizer on `data`, then — when the vocabulary
    exceeds 350 features — reduces to 350 SVD components and standard-scales
    them.  With the global `plotData` flag set, the result is further projected
    to 2 PCA components for visualisation.  Returns the tuple (X, ids).

    NOTE(review): leading indentation was lost in this source; the scaling step
    is assumed to live inside the `shape[1] > 350` branch — confirm upstream.
    """
    # The well-known TF-IDF vectorizer configuration.
    tfv = TfidfVectorizer(min_df=3, max_features=None,
                          strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 5),
                          use_idf=1, smooth_idf=1, sublinear_tf=1,
                          stop_words='english')
    # Fit TF-IDF, then transform the same corpus.
    tfv.fit(data)
    X = tfv.transform(data)
    # Dimensionality reduction + scaling (with_mean=False keeps sparsity safe).
    svd = TruncatedSVD(n_components=350)
    scl = StandardScaler(with_mean=False)
    if X.shape[1] > 350:
        X = svd.fit_transform(X)
        X = scl.fit_transform(X, ids)
    if plotData:
        # 2-D projection purely for plotting.
        X = PCA(n_components=2).fit_transform(X)
    return (X, ids)
示例2: find_k
def find_k(self, rank=None, max_clusters=1, vertline=None):
    """Elbow-plot helper for choosing the number of k-means clusters.

    Optionally reduces ``self.X`` to `rank` SVD components (then L2-normalises
    rows), fits KMeans for k = 1 .. max_clusters-1, and plots the explained
    variance (BSS) and within-cluster sum of squares (WCSS) against k.

    Parameters
    ----------
    rank : int or None
        If given, reduce ``self.X`` with TruncatedSVD(rank) first.
    max_clusters : int
        Exclusive upper bound on the number of clusters tried.
    vertline : int or None
        If given, draw a red vertical guide line at this k.
    """
    if rank is not None:  # idiom fix: identity test against None
        svd = TruncatedSVD(rank)
        self.X = svd.fit_transform(self.X)
        self.X = Normalizer(copy=False).fit_transform(self.X)
    k_range = range(1, max_clusters)
    clusters = [KMeans(n_clusters=k).fit(self.X) for k in k_range]
    centroids = [cluster.cluster_centers_ for cluster in clusters]
    # Cosine distance of every sample to every centroid, per k.
    k_cosine = [cdist(self.X, cent, metric='cosine') for cent in centroids]
    dist = [np.min(k_cos, axis=1) for k_cos in k_cosine]
    # Within-cluster sum of squares; NaN distances are dropped.
    wcss = [sum(d[~np.isnan(d)]**2) for d in dist]
    tss = sum(pdist(self.X)**2)/self.X.shape[0]  # Total sum of squares
    bss = tss - wcss  # Explained variance (numpy broadcasts over the list)
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(10, 3)
    plt.tight_layout()
    ax1.set_title('BSS')
    ax1.plot(np.arange(1, len(bss)+1), bss)
    ax1.scatter(np.arange(1, len(bss)+1), bss)
    ax2.set_title('WCSS')
    ax2.plot(np.arange(1, len(wcss)+1), wcss)
    ax2.scatter(np.arange(1, len(wcss)+1), wcss)
    if vertline is not None:  # was a side-effecting conditional expression
        plt.axvline(vertline, c='red', alpha=0.75)
    plt.show()
示例3: test_feature_union
def test_feature_union():
    """Sanity checks for FeatureUnion: dense, sparse, set_params, TransfT."""
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # 2 SVD components + 1 selected feature = 3 output columns.
    assert_equal(X_transformed.shape, (X.shape[0], 3))
    # The union's columns must match the individual transformers' outputs.
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    # Sparse input should give the same result as dense input.
    # A separate union keeps the random_state stream under control.
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())
    # Nested parameter setting via set_params.
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))
    # Transformers without fit_transform must still work inside the union.
    fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))
示例4: train_manual
def train_manual():
    """Fit TF-IDF + 500-component SVD on the tweet training corpus and print
    the cosine similarity of the first test document against all training docs.

    Fix: the original used Python 2 `print` statements (a SyntaxError on
    Python 3); they are now `print()` calls, which behave identically for a
    single argument on both versions.
    """
    # Read both corpora, one line per document.
    with open("../data/f_hashtag_prediction/train_data_tweets_processed_0_to_500K.txt") as ftrain, \
         open("../data/f_hashtag_prediction/test_data_tagged_processed_manual.txt") as ftest:
        test_set = ftest.read().splitlines()
        train_set = ftrain.read().splitlines()
    # vectorizer = CountVectorizer()
    vectorizer = TfidfVectorizer(min_df=5, max_df=500, max_features=None,
                                 strip_accents='unicode', analyzer='word',
                                 token_pattern=r'\w{1,}', ngram_range=(1, 4),
                                 use_idf=1, smooth_idf=1, sublinear_tf=1,
                                 stop_words='english')
    # vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(train_set)
    print(tfidf_matrix.shape)
    smatrix = vectorizer.transform(test_set)
    print(smatrix.shape)
    # Project both matrices into the same 500-dimensional LSA space.
    svd = TruncatedSVD(n_components=500, random_state=42)
    svd.fit(tfidf_matrix)
    truncated_train_svd = svd.transform(tfidf_matrix)
    truncated_test_svd = svd.transform(smatrix)
    print(truncated_train_svd.shape)
    print(truncated_test_svd.shape)
    # NOTE(review): passes a 1-D row to cosine_similarity — recent sklearn
    # versions require 2-D input (truncated_test_svd[0:1]); confirm version.
    cosine = cosine_similarity(truncated_test_svd[0], truncated_train_svd)
    print(cosine)
    print("TEST SET: ")
示例5: cook
def cook():
    """Compare AdaBoost accuracy on SVD features with and without weighting,
    over a sweep of shrinking training-set fractions.

    Fix: Python 2 `print` statements converted to `print()` calls so the
    example is valid Python 3.
    """
    x, y, weights = load_data()
    n_components = 200
    svd = TruncatedSVD(n_components, random_state=42)
    # Two parallel feature sets: raw and sample-weighted input to the SVD.
    x_unweighted = svd.fit_transform(x)
    x_weighted = svd.fit_transform(weighted(x, weights))
    for i in range(9):
        # test_size sweeps 0.99, 0.98, ... 0.91 (train set grows each step).
        frac = 1 - (i * 0.01 + 0.01)
        print(frac)
        x_train, x_test, y_train, y_test = train_test_split(x_unweighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print("Unweighted: ", classifier.score(x_test, y_test))
        x_train, x_test, y_train, y_test = train_test_split(x_weighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print("Weighted: ", classifier.score(x_test, y_test))
        print('--------------------------')
'''
示例6: SVD_CV
def SVD_CV(counts, scores, n_comp=range(10, 611, 100)):
    """Cross-validate out-of-sample MSE of a linear model over a sweep of
    TruncatedSVD component counts, averaged over 16 random splits, and plot
    the averaged error curve.

    Fix: the original reused the loop variable `n` for both the outer split
    loop and the inner component loop; the outer variable is renamed `trial`
    (it also seeds random_state, so shadowing was an accident waiting to
    happen).

    Parameters
    ----------
    counts : feature matrix (e.g. term counts)
    scores : regression targets
    n_comp : iterable of int, SVD component counts to try
    """
    n_avg = 16
    avg_err = []
    for trial in range(n_avg):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            counts, scores, test_size=0.2, random_state=trial)
        test_err = []
        for n in n_comp:
            TruncTrans = TruncatedSVD(n_components=n)
            X_trunc_train = TruncTrans.fit_transform(X_train, scores)
            regr = linear_model(X_trunc_train, y_train)
            X_trunc_test = TruncTrans.transform(X_test)
            # NOTE(review): the 1e-12 rescale + 3 offset is unexplained in the
            # source — presumably undoing a target transform; confirm.
            y_pred = regr.predict(X_trunc_test)*10**(-12)+3
            test_err.append(metrics.mean_squared_error(y_test, y_pred))
        # Running average of the per-split error curves.
        if not avg_err:
            avg_err = test_err
        else:
            avg_err = [avg_err[i]+(test_err[i]*(1.0/n_avg)) for i in range(len(test_err))]
    plt.plot(n_comp, avg_err, label='Out-of-Sample Error')
    plt.xlabel('n components')
    plt.ylabel('MSE')
    plt.show()
示例7: kfold
def kfold(agetext, k, model, k2):
    """Run `k` shuffled train/test evaluations of `model` on LSA-reduced
    features and print the mean accuracy.

    Fix: Python 2 `print` statements converted to `print()` calls so the
    example is valid Python 3 (each call passes a single argument, so py2
    output is unchanged too).

    Parameters
    ----------
    agetext : DataFrame with an "agegroup" label column, features from col 1 on
    k : number of shuffle-split iterations
    model : any fitted-on-demand classifier with fit/predict
    k2 : number of TruncatedSVD components
    """
    import collections
    out = []
    for i in range(k):
        print("iteration: " + str(i))
        agetext = shuffle(agetext)
        datatb = agetext.iloc[:, 1:]
        label = agetext["agegroup"].tolist()
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            datatb, label, test_size=0.15, random_state=i*6)
        data = X_train.values
        counter = collections.Counter(y_train)
        print(counter)
        testdata = X_test.values
        # LSA: SVD followed by row L2-normalisation; test set reuses the
        # transforms fitted on train only.
        lsa = TruncatedSVD(k2, algorithm='arpack')
        normalizer = Normalizer(copy=False)
        X = lsa.fit_transform(data)
        X = normalizer.fit_transform(X)
        X_test = lsa.transform(testdata)
        X_test = normalizer.transform(X_test)
        model.fit(X, y_train)
        pred = model.predict(X_test)
        counter = collections.Counter(y_test)
        print(counter)
        counter = collections.Counter(pred)
        print(counter)
        out.append(round(accuracy_score(y_test, pred), 5))
    print(str(out))
    print(np.mean(out))
示例8: test_sparse_formats
def test_sparse_formats(fmt):
    """TruncatedSVD must accept dense arrays and every sparse format alike."""
    # "dense" uses the precomputed Xdense; otherwise convert X via to<fmt>().
    if fmt == "dense":
        Xfmt = Xdense
    else:
        Xfmt = getattr(X, "to" + fmt)()
    tsvd = TruncatedSVD(n_components=11)
    # Both the combined fit_transform and a separate transform must
    # produce (n_samples, 11) outputs.
    Xtrans = tsvd.fit_transform(Xfmt)
    assert_equal(Xtrans.shape, (n_samples, 11))
    Xtrans = tsvd.transform(Xfmt)
    assert_equal(Xtrans.shape, (n_samples, 11))
示例9: compute_svd
def compute_svd(Xs):
    """Return the first principal component of `Xs` via truncated SVD."""
    first_pc_svd = TruncatedSVD(n_components=1, n_iter=20, random_state=0)
    first_pc_svd.fit(Xs)
    pc = first_pc_svd.components_
    print(pc.shape, first_pc_svd.explained_variance_ratio_)
    return pc
示例10: lsa_summarizer
def lsa_summarizer(text, num_sen=5):
    """Extractive LSA summary: score sentences by their loading on the first
    latent concept of a TF-IDF matrix, keep the top `num_sen`, and return them
    re-joined in original document order."""
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = sent_detector.tokenize(text.strip())
    vectorizer = TfidfVectorizer(tokenizer=tokenizeText)
    term_matrix = vectorizer.fit_transform(sentences).A
    # Project every sentence onto the single strongest SVD component.
    concept = TruncatedSVD(n_components=1).fit_transform(term_matrix)
    positions = np.array(list(range(len(sentences))))
    scored = [list(triple) for triple in zip(sentences, concept, positions)]
    # Rank by concept weight, take the best, then restore document order.
    scored.sort(key=lambda row: row[1], reverse=True)
    chosen = scored[0:num_sen]
    chosen.sort(key=lambda row: row[2], reverse=False)
    summary = ""
    for idx in range(num_sen):
        summary += ' ' + chosen[idx][0]
    # Normalise non-breaking spaces and collapse whitespace runs.
    summary = " ".join(summary.replace(u"\xa0", u" ").strip().split())
    return summary
示例11: fit_document_matrix
def fit_document_matrix(self, X):
    """Reduce the dimension of sparse matrix X with Latent Semantic
    Analysis and build the nearest-neighbor model over the result.

    Parameters
    ----------
    X: sparse csr matrix, sparse term frequency matrix or
       other weighting matrix derived from documents

    Returns
    -------
    self, with ``vectors`` and ``nbrs_model`` populated.
    """
    # SVD configured entirely from instance settings.
    lsa_model = TruncatedSVD(n_components=self.n_components,
                             n_iter=self.n_iter,
                             algorithm=self.algorithm)
    # LSA projection of the documents.
    self.vectors = lsa_model.fit_transform(X)
    # Nearest-neighbor index over the reduced vectors.
    self.nbrs_model = build_nearest_neighbors(self.vectors,
                                              n_recommend=self.n_recommend)
    return self
示例12: basic_lsi
def basic_lsi(df, n_components=200, max_df=0.5, min_df=5):
    '''
    Basic LSI model for album recommendations
    Args:
        df: dataframe with Pitchfork reviews
        n_components: number of lsi dimensions
        max_df: max_df in TfidfVectorizer
        min_df: min_df in TfidfVectorizer
    Returns:
        tfidf: sklearn fitted TfidfVectorizer
        tfidf_trans: sparse matrix with tfidf transformed data
        svd: sklearn fitted TruncatedSVD
        svd_trans: dense array with lsi transformed data
    '''
    reviews = df['review']
    # English stopword list from NLTK rather than sklearn's builtin.
    stopwords = nltk.corpus.stopwords.words('english')
    tfidf = TfidfVectorizer(stop_words=stopwords,
                            max_df=max_df, min_df=min_df)
    tfidf_trans = tfidf.fit_transform(reviews)
    # LSI = SVD applied to the TF-IDF matrix.
    svd = TruncatedSVD(n_components=n_components)
    svd_trans = svd.fit_transform(tfidf_trans)
    return tfidf, tfidf_trans, svd, svd_trans
示例13: buildKB16
def buildKB16(n_comp=200, seed_value=123):
    """Read the kb6099 train/test CSVs, project the features onto `n_comp`
    SVD components, re-attach IDs/targets, and write the result back to
    ../input as xtrain_kb16c<n>/xtest_kb16c<n> CSVs.

    Fix: the original wrapped `xtrain`/`xtest` in ``pd.DataFrame`` twice in a
    row; the redundant second conversion is removed.

    Parameters
    ----------
    n_comp : int, number of SVD components
    seed_value : int, random_state for the SVD
    """
    # read the training/test data
    print('Importing Data')
    xtrain = pd.read_csv('../input/xtrain_kb6099.csv')
    xtest = pd.read_csv('../input/xtest_kb6099.csv')
    # separate IDs and target from the feature columns
    id_train = xtrain.ID; xtrain.drop('ID', axis=1, inplace=True)
    ytrain = xtrain.target; xtrain.drop('target', axis=1, inplace=True)
    id_test = xtest.ID; xtest.drop('ID', axis=1, inplace=True)
    # fit SVD on train only, apply to both
    svd = TruncatedSVD(n_components=n_comp, n_iter=5, random_state=seed_value)
    svd.fit(xtrain)
    xtrain = svd.transform(xtrain)
    xtest = svd.transform(xtest)
    # back to DataFrames with indices re-attached
    xtrain = pd.DataFrame(xtrain)
    xtrain['ID'] = id_train
    xtrain['target'] = ytrain
    xtest = pd.DataFrame(xtest)
    xtest['ID'] = id_test
    # save the files
    xtrain.to_csv('../input/xtrain_kb16c'+str(n_comp)+'.csv', index=False, header=True)
    xtest.to_csv('../input/xtest_kb16c'+str(n_comp)+'.csv', index=False, header=True)
    return
示例14: truncatedSVD
def truncatedSVD(data, labels, new_dimension):
    """Reduce `data` to `new_dimension` components with TruncatedSVD.

    Fixes: Python 2 `print` statement → `print()` call; the local was named
    `pca`, which misdescribes a TruncatedSVD — renamed `svd`.

    Parameters
    ----------
    data : feature matrix to reduce
    labels : unused here; kept for interface parity with sibling reducers
    new_dimension : target number of components

    Returns
    -------
    (reduced, seconds) : the reduced matrix and the wall-clock time taken.
    """
    print("start truncatedSVD...")
    start = time.time()
    svd = TruncatedSVD(n_components=new_dimension)
    reduced = svd.fit_transform(data)
    end = time.time()
    return (reduced, end-start)
示例15: test_inverse_transform
def test_inverse_transform(algo):
    """inverse_transform of fit_transform should reconstruct Xdense.

    Many components are needed for the reconstruction to be "almost equal"
    everywhere (checked only to 1 decimal). XXX Test means or sums instead?
    """
    tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm=algo)
    reconstructed = tsvd.inverse_transform(tsvd.fit_transform(X))
    assert_array_almost_equal(reconstructed, Xdense, decimal=1)