This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.transform. If you are unsure what TfidfVectorizer.transform does, or how to use it in practice, the curated examples below may help. You can also explore other methods of the containing class, sklearn.feature_extraction.text.TfidfVectorizer.

The following shows 15 code examples of TfidfVectorizer.transform, sorted by popularity by default.
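All of the examples follow the same two-step pattern: fit a TfidfVectorizer on a corpus, then call transform to map documents to a sparse TF-IDF matrix. Here is a minimal sketch of that pattern, using a made-up corpus purely for illustration:

from sklearn.feature_extraction.text import TfidfVectorizer

# Tiny hypothetical corpus; in practice these would be your training documents
corpus = ["the cat sat on the mat", "the dog chased the cat"]

vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)            # learn the vocabulary and IDF weights
X = vectorizer.transform(corpus)  # sparse matrix of shape (n_documents, n_terms)
print(X.shape)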
Example 1: TFID

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def TFID(data, choice):
    # Again, removing stop words increased the efficiency; same for Snowball
    if choice == 1:
        tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                              analyzer='word', token_pattern=r'\w{2,}', ngram_range=(1, 3),
                              use_idf=1, smooth_idf=1, sublinear_tf=1)
        # , tokenizer=Snowball()
        print("fitting pipeline and transforming for", len(data), "entries")
        tfv.fit(data)
        vect = tfv.transform(data)
        print(vect.shape)
        return vect
    elif choice == 2:
        print('Fitting char pipeline')
        tfvc = TfidfVectorizer(norm='l2', min_df=3, max_df=1.0, strip_accents='unicode',
                               analyzer='char', ngram_range=(2, 7),
                               use_idf=1, smooth_idf=1, sublinear_tf=1)
        tfvc.fit(data)
        vectc = tfvc.transform(data)
        print('vectc', vectc.shape)
        return vectc
    elif choice == 3:
        # LancasterTokenizer is a custom tokenizer defined elsewhere in the source project
        tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                              analyzer='word', token_pattern=r'\w{2,}', ngram_range=(1, 2),
                              use_idf=1, smooth_idf=1, sublinear_tf=1,
                              tokenizer=LancasterTokenizer())
        # , tokenizer=Snowball()
        print("fitting pipeline and transforming for", len(data), "entries")
        tfv.fit(data)
        vect = tfv.transform(data)
        print(vect.shape)
        return vect
    elif choice == 4:
        tfv = CountVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                              analyzer='word', token_pattern=r'\w{2,}', ngram_range=(1, 3),
                              binary=True)
        print("fitting count pipeline and transforming for", len(data), "entries")
        tfv.fit(data)
        vect = tfv.transform(data)
        print(vect.shape)
        return vect
    else:
        return []
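A hypothetical call to the function above. Note that min_df=3 discards any term appearing in fewer than three documents, so a toy corpus must be large enough to leave a non-empty vocabulary:

# Made-up corpus, repeated so that terms survive the min_df=3 cutoff
docs = ["the quick brown fox", "the quick brown dog",
        "the lazy brown dog", "the lazy quick fox"] * 3
word_vecs = TFID(docs, choice=1)  # word 1-3-gram TF-IDF matrix
char_vecs = TFID(docs, choice=2)  # char 2-7-gram TF-IDF matrix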
Example 2: tfidf_ize

from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_ize(train, test, node_info):
    vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    vectorizer.fit(node_info.abstract.values)  # .values replaces the deprecated .as_matrix()
    for table in [train, test]:
        table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna(''))
        table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna(''))
        table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna(''))
        table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna(''))
        # table['temp27'] = table_tfidf_abstract_1.multiply(table_tfidf_abstract_2).sum(1)
        table.loc[:, 'temp22'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1)  # intersection kernel
        table.loc[:, 'temp23'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1)
        table.loc[:, 'temp24'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \
            + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1)
    vectorizer = TfidfVectorizer(ngram_range=(2, 2))
    vectorizer.fit(node_info.abstract.values)
    for table in [train, test]:
        table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna(''))
        table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna(''))
        table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna(''))
        table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna(''))
        table.loc[:, 'temp27'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1)  # intersection kernel
        table.loc[:, 'temp28'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1)
        table.loc[:, 'temp29'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \
            + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1)
    return train, test
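The minimum(...).sum(1) pattern above is a histogram-intersection kernel between pairs of TF-IDF rows: for each vocabulary term it takes the smaller of the two weights, then sums. A worked sketch on made-up vectors:

import numpy as np
from scipy.sparse import csr_matrix

# Two made-up TF-IDF rows over the same three-term vocabulary
a = csr_matrix(np.array([[0.2, 0.0, 0.5]]))
b = csr_matrix(np.array([[0.1, 0.3, 0.7]]))

# min(0.2, 0.1) + min(0.0, 0.3) + min(0.5, 0.7) = 0.6
intersection = a.minimum(b).sum(1)
print(float(intersection))  # 0.6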
Example 3: trainTFIDF2

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

def trainTFIDF2(bow21features, bow2kfold, test):
    idx = (test[0][:, 0]).astype(int)
    tfv = TfidfVectorizer(min_df=5, max_df=500, max_features=None, strip_accents='ascii',
                          analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 2),
                          use_idf=True, smooth_idf=True, sublinear_tf=True,
                          stop_words='english')
    pipeline = Pipeline(
        [('svd', TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5,
                              random_state=None, tol=0.0)),
         ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)),
         ('svm', SVC(C=10.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True,
                     probability=False, tol=0.001, cache_size=200, class_weight=None,
                     verbose=False, max_iter=-1, random_state=None))])
    tfidf2CrossValidationTest = None
    # toTestModel and tfidfCrossValidation are defined elsewhere in the source project
    if toTestModel:
        tfidf2CrossValidationTest = tfidfCrossValidation(tfv, pipeline, bow2kfold)
    trainData, lblsTrain, testData, lblstest = bow21features
    tfv.fit(trainData)
    X_train = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    if isinstance(lblsTrain, list):
        lblsTrain = lblsTrain[0]
    lblsTrain = lblsTrain.astype(int)
    pipeline.fit(X_train, lblsTrain)
    predictions = pipeline.predict(X_test)
    finalResults = pd.DataFrame({"id": idx, "prediction": predictions})
    return tfidf2CrossValidationTest, finalResults
Example 4: word_count_transform

def word_count_transform(X_train, X_test):
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_vectorizer.fit(X_train)  # learn vocabulary and IDF weights from the training text only
    X_train_tfidf = tfidf_vectorizer.transform(X_train).todense()
    X_test_tfidf = tfidf_vectorizer.transform(X_test).todense()
    return X_train_tfidf, X_test_tfidf
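A hypothetical call, with the caveat that .todense() materializes the full matrix and can exhaust memory when the vocabulary is large:

train_texts = ["spam spam spam", "meeting at noon", "free prize inside"]
test_texts = ["prize meeting"]
Xtr, Xte = word_count_transform(train_texts, test_texts)
print(Xtr.shape, Xte.shape)  # same column count: the vocabulary is learned on train only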
Example 5: extract

import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

def extract(max_gram, feat_dims, save_model=False):
    # reviews_train, reviews_unsup, reviews_test, labels_train and labels_test
    # are module-level data defined elsewhere in the source project
    print("extract feature")
    vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, max_features=None,
                                 ngram_range=(1, max_gram), sublinear_tf=True)
    vectorizer = vectorizer.fit(reviews_train + reviews_unsup)
    feats_train_ori = vectorizer.transform(reviews_train)
    feats_test_ori = vectorizer.transform(reviews_test)
    print("size of original train features", feats_train_ori.shape)
    for feat_dim in feat_dims:
        print("perform feature selection")
        fselect = SelectKBest(chi2, k=feat_dim)
        feats_train = fselect.fit_transform(feats_train_ori, labels_train)
        feats_test = fselect.transform(feats_test_ori)
        print("save features")
        np.savez("feats/%d_%d.npz" % (max_gram, feat_dim),
                 feats_train=feats_train, feats_test=feats_test,
                 labels_train=labels_train, labels_test=labels_test)
        if save_model:
            print("save models")
            with open("models/vectorizer_%d.pkl" % max_gram, "wb") as fout:
                pickle.dump(vectorizer, fout, -1)
            with open("models/fselect_%d_%d.pkl" % (max_gram, feat_dim), "wb") as fout:
                pickle.dump(fselect, fout, -1)
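A sketch of loading the saved models back for inference; the paths mirror those written above, and the feat_dim of 10000 is a made-up value:

import pickle

with open("models/vectorizer_2.pkl", "rb") as fin:      # assumes extract(2, ...) was run
    vectorizer = pickle.load(fin)
with open("models/fselect_2_10000.pkl", "rb") as fin:   # hypothetical feat_dim
    fselect = pickle.load(fin)

new_docs = ["an unseen review to score"]
feats = fselect.transform(vectorizer.transform(new_docs))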
Example 6: num_feat_select

import time
import numpy as np
from sklearn import neighbors
from sklearn.feature_extraction.text import TfidfVectorizer

def num_feat_select(n, k):
    # train, test and MyTokenizer are defined elsewhere in the source project
    tfidf = TfidfVectorizer(max_features=n, strip_accents='unicode',
                            tokenizer=MyTokenizer(), analyzer='word')
    tfidf.fit(train['tweet'])
    trainf = tfidf.transform(train['tweet'])
    testf = tfidf.transform(test['tweet'])
    trainlab = np.array(train.iloc[:, 4:])  # .iloc replaces the deprecated .ix
    knn = neighbors.KNeighborsRegressor(n_neighbors=k)
    knn.fit(trainf, trainlab)
    print('here')
    tim = time.time()
    n = 10  # predict in batches of 1000 over the first n*1000 test rows
    pred = []
    for i in range(0, n):
        pred.extend(knn.predict(testf[(i * 1000):((i + 1) * 1000)]))
        print(i)
    print("time: " + str(time.time() - tim))
    # RMSE over the 24 label columns:
    testlab = np.array(test.iloc[:, 4:])
    err = format(np.sqrt(np.sum(np.array(np.array(pred - testlab) ** 2) / (testf.shape[0] * 24.0))))
    print(err)
Example 7: processEssay

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def processEssay(self, testidx, trainidx):
    # process essay; clean() is a text-cleaning helper defined elsewhere in the source project
    self.rawdata['essay'] = self.rawdata['essay'].apply(clean)
    self.trdata = self.rawdata['essay'].loc[trainidx]  # .loc replaces the deprecated .ix
    self.testdata = self.rawdata['essay'].loc[testidx]
    trainessay = np.array(self.trdata.fillna('Missing'))
    testessay = np.array(self.testdata.fillna('Missing'))
    tfidfEs = TfidfVectorizer(min_df=4, max_features=500)
    tfidfEs.fit(trainessay)
    # =======================================================================
    # # process need statement
    # self.rawdata['need_statement'] = self.rawdata['need_statement'].apply(clean)
    # self.trdata = self.rawdata['need_statement'].ix[trainidx]
    # self.testdata = self.rawdata['need_statement'].ix[testidx]
    # trainneedst = np.array(self.trdata.fillna('Missing'))
    # testneedst = np.array(self.testdata.fillna('Missing'))
    # tfidfNs = TfidfVectorizer(min_df=3, max_features=20)
    # tfidfNs.fit(trainneedst)
    #
    # # process short description
    # self.rawdata['short_description'] = self.rawdata['short_description'].apply(clean)
    # self.trdata = self.rawdata['short_description'].ix[trainidx]
    # self.testdata = self.rawdata['short_description'].ix[testidx]
    # trainshortd = np.array(self.trdata.fillna('Missing'))
    # testshortd = np.array(self.testdata.fillna('Missing'))
    # tfidfSd = TfidfVectorizer(min_df=3, max_features=20)
    # tfidfSd.fit(trainshortd)
    #
    # self.exdata_train = sp.hstack((tfidfEs.transform(trainessay), tfidfNs.transform(trainneedst), tfidfSd.transform(trainshortd)))
    # self.exdata_test = sp.hstack((tfidfEs.transform(testessay), tfidfNs.transform(testneedst), tfidfSd.transform(testshortd)))
    # =======================================================================
    self.exdata_train = tfidfEs.transform(trainessay)  # only use the essay
    self.exdata_test = tfidfEs.transform(testessay)
Example 8: ridge_003

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def ridge_003():
    # train_set, test_set, model and output are defined elsewhere in the source project
    print('*** CLEANING ***')
    tfidf_wrd = TfidfVectorizer(max_features=10000, strip_accents='unicode', analyzer='word',
                                ngram_range=(1, 3), lowercase=True, stop_words='english',
                                min_df=3, max_df=0.5)
    tfidf_wrd.fit(train_set['tweet'])
    X_train_wrd = tfidf_wrd.transform(train_set['tweet'])
    X_test_wrd = tfidf_wrd.transform(test_set['tweet'])
    tfidf_char = TfidfVectorizer(max_features=10000, strip_accents='unicode', analyzer='char',
                                 ngram_range=(4, 10), lowercase=True, stop_words='english',
                                 min_df=3, max_df=0.5)
    tfidf_char.fit(train_set['tweet'])
    X_train_char = tfidf_char.transform(train_set['tweet'])
    X_test_char = tfidf_char.transform(test_set['tweet'])
    y_train = np.array(train_set.iloc[:, 4:])  # .iloc replaces the deprecated .ix
    print('*** TRAINING ***')
    mdl_wrd = model.ridge(X_train_wrd, y_train)
    mdl_char = model.ridge(X_train_char, y_train)
    print('*** PREDICTING ***')
    test_prediction_wrd = mdl_wrd.predict(X_test_wrd)
    test_prediction_char = mdl_char.predict(X_test_char)
    test_prediction = (test_prediction_wrd + test_prediction_char) / 2  # average the two models
    print('*** OUTPUTTING ***')
    output('results/ridge_003.csv', test_prediction)
Example 9: __init__

import pandas
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

class NaiveBayes:
    def __init__(self):
        self.clf = MultinomialNB()
        self.pattern = '(?u)\\b[A-Za-z]{3,}'
        self.tfidf = TfidfVectorizer(sublinear_tf=False, use_idf=True, smooth_idf=True,
                                     stop_words='english', token_pattern=self.pattern,
                                     ngram_range=(2, 2))

    def train(self, fileName):
        print("Naive Bayes classifier is being trained")
        table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
        X_train = self.tfidf.fit_transform(table.message)
        Y_train = [int(item) for item in table.cat]
        self.clf.fit(X_train, Y_train)
        print("Naive Bayes classifier has been trained")

    def classify(self, cFileName, rFileName):
        table = pandas.read_table(cFileName, names=["message"])
        X_test = self.tfidf.transform(table.message)
        print("Data have been classified")
        with open(rFileName, 'w') as f:
            for item in self.clf.predict(X_test).astype(str):
                f.write(item + '\n')

    def validate(self, fileName):
        table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
        X_validate = self.tfidf.transform(table.message)
        Y_validated = self.clf.predict(X_validate).astype(str)
        totalNum = len(table.cat)
        errorCount = 0
        for i in range(0, totalNum):
            if int(table.cat[i]) != int(Y_validated[i]):
                errorCount += 1
        # the reported "precision" is, strictly speaking, accuracy
        print("Data have been validated! Precision={}".format((totalNum - errorCount) / float(totalNum)))
Example 10: train_and_predict_m5

from nltk.stem.porter import PorterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

def train_and_predict_m5(train, test, labels):
    # BeautifulSoup cleanup and stemming (just to mix it up)
    stemmer = PorterStemmer()
    trainData = modified_cleanup(train, stemmer, is_train=True, pretag='full')
    testData = modified_cleanup(test, stemmer, is_train=False, pretag='full')
    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 3), smooth_idf=1,
                          sublinear_tf=1, stop_words=ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    ## Create the classifier
    print("Fitting Multinomial Naive Bayes...")
    clf = MultinomialNB(alpha=0.03)
    ## Parameter grid to search for the best parameters for everything in the pipeline
    # param_grid = {'alpha': [0.01, 0.03, 0.1, 0.3, 1]}
    param_grid = {'alpha': [0.01, 0.03]}
    ## Predict with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
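perform_grid_search and gridSearch in Examples 10 through 13 are project-specific and not shown in the source. A plausible stand-in using scikit-learn's GridSearchCV with a quadratic weighted kappa scorer, offered only as an assumption about what the helper does:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, cohen_kappa_score

# Quadratic weighted kappa, the metric the comments say the search optimizes
kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

def perform_grid_search(clf, param_grid, X, labels):
    search = GridSearchCV(clf, param_grid, scoring=kappa_scorer, cv=5)
    search.fit(X, labels)
    return search.best_estimator_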
Example 11: train_and_predict_m6

from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

def train_and_predict_m6(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM6, stemmer_type='snowball')
    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 3), smooth_idf=1,
                          sublinear_tf=1, stop_words=ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    ## Create the classifier
    print("Fitting K-Nearest Neighbors...")
    clf = KNeighborsClassifier(p=2, n_neighbors=5)
    ## Parameter grid to search for the best parameters for everything in the pipeline
    # Note: minkowski with p > 2 does not work for sparse matrices
    param_grid = {'n_neighbors': [3, 4, 5, 6, 7], 'weights': ['uniform', 'distance'],
                  'leaf_size': [1, 3, 5, 10]}
    ## Predict with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
Example 12: train_and_predict_m3

from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

def train_and_predict_m3(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM3, stemmer_type='porter')
    """
    # BeautifulSoup cleanup and stemming
    stemmer = PorterStemmer()
    trainData = modified_cleanup(train, stemmer, is_train=True)
    testData = modified_cleanup(test, stemmer, is_train=False)
    """
    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 6), smooth_idf=1,
                          sublinear_tf=1, stop_words=ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    ## Create the classifier (n_iter and class_weight='auto' follow the older scikit-learn API)
    clf = SGDClassifier(random_state=randomState, n_jobs=1, penalty='l2', loss='huber',
                        n_iter=50, class_weight='auto', learning_rate='optimal', epsilon=1)
    ## Parameter grid to search for the best parameters for everything in the pipeline
    param_grid = {'n_iter': [30, 50, 80, 100, 200], 'loss': ['huber'],
                  'epsilon': [0.3, 1], 'alpha': [0.0001, 0.0003, 0.001]}
    ## Predict with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
Example 13: train_and_predict_m4

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

def train_and_predict_m4(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM4, stemmer_type='porter')
    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 6), smooth_idf=1,
                          sublinear_tf=1, stop_words=ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    ## Create the classifier (class_weight='auto' follows the older scikit-learn API)
    clf = LogisticRegression(random_state=randomState, penalty='l2', C=12, class_weight='auto')
    ## Parameter grid to search for the best parameters for everything in the pipeline
    # param_grid = {'C': [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 30], 'penalty': ['l2']}
    param_grid = {'C': [1, 3, 5, 6, 7, 8, 10, 11, 12], 'penalty': ['l2']}
    ## Predict with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
Example 14: train_and_predict_m1

from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

def train_and_predict_m1(train, test, labels):
    print("Training M1 (randomState = %d) ..." % randomState)
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM1, stemmer_type='porter')
    ## TF-IDF transform with sub-linear TF and stop-word removal
    vectorizer = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                                 analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 3),
                                 smooth_idf=1, sublinear_tf=1, stop_words=ML_STOP_WORDS)
    vectorizer.fit(trainData)
    X = vectorizer.transform(trainData)
    X_test = vectorizer.transform(testData)
    ## Use Stemmer post TF-IDF to check if things change
    # print(X)
    print("X.shape: ", X.shape)
    print("X_test.shape: ", X_test.shape)
    ## Create the pipeline
    # 07/02 - RandomizedPCA/PCA does not work on sparse input (so cannot be applied to the Vectorizer output)
    # JimingYe says LDA did not give much benefit.
    clf = Pipeline([('svd', TruncatedSVD(random_state=randomState, n_components=330)),
                    ('scl', StandardScaler()),
                    ('svm', SVC(random_state=randomState, cache_size=500, C=12))])
    ## Parameter grid to search for the best parameters for everything in the pipeline
    param_grid = {'svd__n_components': [200, 250, 300], 'svm__C': [10, 12]}
Example 15: doTFIDF

import scipy.io
from sklearn.feature_extraction.text import TfidfVectorizer

def doTFIDF(train, test1, test2):
    # stemIt and processIt are tweet-preprocessing helpers defined elsewhere in the source project
    steemedTrain = stemIt(train)
    steemedTest1 = stemIt(test1)
    steemedTest2 = stemIt(test2)
    print("done stemming tweets")
    regTrain = processIt(train)
    regTest1 = processIt(test1)
    regTest2 = processIt(test2)
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=1)
    X = vectorizer.fit_transform(regTrain)
    Xtest1 = vectorizer.transform(regTest1)
    Xtest2 = vectorizer.transform(regTest2)
    scipy.io.mmwrite('train_reg_dataM', X, field='real')
    scipy.io.mmwrite('test1_reg_dataM', Xtest1, field='real')
    scipy.io.mmwrite('test2_reg_dataM', Xtest2, field='real')
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=1)
    X = vectorizer.fit_transform(steemedTrain)
    Xtest1 = vectorizer.transform(steemedTest1)
    Xtest2 = vectorizer.transform(steemedTest2)
    scipy.io.mmwrite('train_stem_dataM', X, field='real')
    scipy.io.mmwrite('test1_stem_dataM', Xtest1, field='real')
    scipy.io.mmwrite('test2_stem_dataM', Xtest2, field='real')
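The Matrix Market files written above can be read back with scipy.io.mmread; mmwrite appends the .mtx extension when the target name has none, so the read path includes it. A minimal sketch:

import scipy.io

# mmread returns a COO matrix; convert to CSR for row slicing
X = scipy.io.mmread('train_reg_dataM.mtx').tocsr()
print(X.shape)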