This article collects typical usage examples of the ExtraTreesClassifier.transform method from Python's sklearn.ensemble module: what ExtraTreesClassifier.transform does, how to call it, and how it is used in practice. The curated code samples below may help; see also the documentation of the containing class, sklearn.ensemble.ExtraTreesClassifier.
The following 15 code examples of ExtraTreesClassifier.transform are sorted by popularity by default.
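Note: the estimator-level transform shown throughout these examples was deprecated in scikit-learn 0.17 and removed in 0.19, where SelectFromModel took over importance-based selection. A minimal sketch of the modern equivalent (the sample data here is made up for illustration):
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Toy data, for illustration only
X = np.random.rand(100, 20)
y = np.random.randint(0, 2, size=100)

clf = ExtraTreesClassifier(n_estimators=100, random_state=0).fit(X, y)
# prefit=True wraps the already-fitted forest; the default threshold is the
# mean feature importance, matching the old clf.transform(X) behaviour
selector = SelectFromModel(clf, prefit=True)
X_reduced = selector.transform(X)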
Example 1: feature_engineering_common
# Required import: from sklearn.ensemble import ExtraTreesClassifier [as alias]
# Or: from sklearn.ensemble.ExtraTreesClassifier import transform [as alias]
def feature_engineering_common(Y, X, X1):
print "### Shape of training set (X)", X.shape
print "### Shape of labels (Y)", Y.shape
print "### Shape of Kaggle Test set (X1)", X1.shape
# Scale features
scaler = preprocessing.StandardScaler()
X_SCALED = scaler.fit_transform(X)
X1_SCALED = scaler.transform(X1)
print "### (After scaling) Shape of training set", X_SCALED.shape
print "### (After scaling ) Shape of Kaggle Test set", X1_SCALED.shape
# Find Important Features using Random Forest
xtClf = ExtraTreesClassifier().fit(X_SCALED, Y)
X_SCALED_SUBSET = xtClf.transform(X_SCALED)
X1_SCALED_SUBSET = xtClf.transform(X1_SCALED)
importances = xtClf.feature_importances_
print xtClf.feature_importances_
print "### (After scaling & feature selection using Random Forrest) Shape of training set", X_SCALED_SUBSET.shape
print "### (After scaling & feature selection using Random Forrest) Shape of Kaggle Test set", X1_SCALED_SUBSET.shape
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in xrange(10):
print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
Example 2: model
# Required import: from sklearn.ensemble import ExtraTreesClassifier [as alias]
# Or: from sklearn.ensemble.ExtraTreesClassifier import transform [as alias]
def model(TRAIN_FX, KAGGLE_TEST_FX):
# Feature Selection & Scaling
TRAIN_FX_FS, KAGGLE_TEST_FX_FS = feature_selection(TRAIN_FX, KAGGLE_TEST_FX)
Y = TRAIN_FX_FS['count'].values
X = TRAIN_FX_FS.drop('count', axis=1).values
X1 = KAGGLE_TEST_FX_FS.values
print "### Shape of training set (X)", X.shape
print "### Shape of labels (Y)", Y.shape
print "### Shape of Kaggle Test set (X1)", X1.shape
# Scale features
scaler = preprocessing.StandardScaler()
X_SCALED = scaler.fit_transform(X)
X1_SCALED = scaler.transform(X1)
print "### (After scaling) Shape of training set", X_SCALED.shape
print "### (After scaling ) Shape of Kaggle Test set", X1_SCALED.shape
# Find Important Features using Random Forest
xtClf = ExtraTreesClassifier().fit(X_SCALED, Y)
X_SCALED_SUBSET = xtClf.transform(X_SCALED)
X1_SCALED_SUBSET = xtClf.transform(X1_SCALED)
importances = xtClf.feature_importances_
print xtClf.feature_importances_
print "### (After scaling & feature selection using Random Forrest) Shape of training set", X_SCALED_SUBSET.shape
print "### (After scaling & feature selection using Random Forrest) Shape of Kaggle Test set", X1_SCALED_SUBSET.shape
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in xrange(10):
print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Random Forest with cross-validation
rf = ensemble.RandomForestRegressor(n_estimators=100)
ss = cross_validation.ShuffleSplit(X.shape[0], n_iter=5, test_size=0.25, random_state=0)
count = 1
for train_index, test_index in ss:
print("\n [Iteration:%d] Num of Training: %s, Num of Test: %s" % (count, len(train_index), len(test_index)))
# Train the model using the training sets
rf.fit(X[train_index], Y[train_index])
# Model Summary
output_model_summary(rf, X[test_index], Y[test_index])
count += 1
# Train the model using the entire data set
rf.fit(X, Y)
pred = rf.predict(X1)
generate_kaggle_submission(pred, "rf.csv")
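Note that X_SCALED_SUBSET and X1_SCALED_SUBSET computed above never reach the regressor: rf is trained and evaluated on the raw X. If the scaled, tree-selected subset is meant to feed the model, a sketch of that variant (the output file name is made up):
# Hypothetical variant: train and predict on the scaled, tree-selected subset
rf.fit(X_SCALED_SUBSET, Y)
pred = rf.predict(X1_SCALED_SUBSET)
generate_kaggle_submission(pred, "rf_selected.csv")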
Example 3: multiclass_predict
# Required import: from sklearn.ensemble import ExtraTreesClassifier [as alias]
# Or: from sklearn.ensemble.ExtraTreesClassifier import transform [as alias]
def multiclass_predict(train_data,labels,valid_data,test_data,output_dir,time_budget,target_num, is_sparse):
print(strftime("%Y-%m-%d %H:%M:%S"))
print("make multiclass prediction\n")
np_seed = int(time.time())
np.random.seed(np_seed)
print ("np seed = " , np_seed)
print(train_data.shape)
print("train_data.shape == (%d,%d)\n"%train_data.shape)
n_features = train_data.shape[1]
n_samples = train_data.shape[0]
start_time = time.time()
if is_sparse:
print("no FS, it is sparse data\n")
train_data=train_data.toarray()
valid_data=valid_data.toarray()
test_data=test_data.toarray()
# train_data = select_clf.transform(train_data,threshold=my_mean )
# valid_data = select_clf.transform(valid_data,threshold=my_mean )
# test_data = select_clf.transform(test_data,threshold=my_mean)
print("sparse converting time = ", time.time() - start_time)
start_time = time.time()
FS_iterations = max(1, int(5000. / target_num * (5000. / n_samples) * 2000. / n_features))
print ("FS_iterations = %d\n" % FS_iterations)
select_clf = ExtraTreesClassifier(n_estimators=FS_iterations,max_depth=3)
select_clf.fit(train_data, labels)
print("FS time = ", time.time() - start_time)
my_mean = 1. / (10 * n_features)
train_data = select_clf.transform(train_data, threshold=my_mean)
valid_data = select_clf.transform(valid_data, threshold=my_mean)
test_data = select_clf.transform(test_data, threshold=my_mean)
print(my_mean)
print(train_data.shape)
######################### Make validation/test predictions
n_features=train_data.shape[1]
if n_features < 100:
gbt_features=n_features
else:
gbt_features=int(n_features**0.5)
gbt_iterations = int((time_budget / 3000.) * 3000000 / (gbt_features * target_num) * (7000. / n_samples))
gbt_params=GBT_params(n_iterations=gbt_iterations,depth=int(10 * np.log2(gbt_iterations)/14.3), learning_rate=0.01,subsample_part=0.6,n_max_features=gbt_features,min_samples_split=5, min_samples_leaf=3)
gbt_params.print_params()
(y_valid, y_test) = make_classification(gbt_params, train_data, labels, valid_data, test_data)
print("y_valid.shape = ",y_valid.shape )
print("y_test.shape = ",y_test.shape )
return (y_valid, y_test)
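For reference, the explicit threshold=my_mean arguments above map directly onto SelectFromModel in current scikit-learn; a hedged sketch reusing the fitted select_clf from this example:
from sklearn.feature_selection import SelectFromModel

# prefit=True reuses the already-fitted forest; threshold accepts the same
# explicit float that the legacy transform() took
selector = SelectFromModel(select_clf, threshold=my_mean, prefit=True)
train_data = selector.transform(train_data)
valid_data = selector.transform(valid_data)
test_data = selector.transform(test_data)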
Example 4: remove_feature_tree_based
# Required import: from sklearn.ensemble import ExtraTreesClassifier [as alias]
# Or: from sklearn.ensemble.ExtraTreesClassifier import transform [as alias]
def remove_feature_tree_based(train_X,train_Y):
'''
Removes features based on trees - see sklearn:
http://scikit-learn.org/dev/auto_examples/ensemble/plot_forest_importances.html#example-ensemble-plot-forest-importances-py
Actually removes based on "importance"
'''
forest = ExtraTreesClassifier(n_estimators=1000,
compute_importances = True,
random_state = 0)
forest.fit(train_X, train_Y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
axis=0)
indices = np.argsort(importances)[::-1]
x_labels = ['rc1', 'rc2', 'dca1', 'dca2','dcm1', 'dcm2','ace1','ace2','acsc1', 'acsc2', 'acsv1', 'acsv2', 'acss1','acss2', 'acsk1', 'acsk2', 'taca1', 'taca2', 'tdc1', 'tdc2', 'gmin', 'gmean', 'trd','ep111','ep112','ep211', 'ep212', 'ep311','ep312', 'ep411','ep412','ep511','ep512','ep611','ep612','ep121','ep122','ep221', 'ep222', 'ep321','ep322', 'ep421','ep422','ep521','ep522','ep621','ep622']
# Print the feature ranking
print "Feature ranking:"
for f in xrange(46):
print "%d. feature %s (%f)" % (f + 1, x_labels[indices[f]], importances[indices[f]])
# Transform the data to have only the features that are important
x_new = forest.transform(train_X)
return (forest, x_new)
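The std array computed above is never used; the linked scikit-learn example plots it as error bars. A sketch of that plot, assuming matplotlib is available:
import matplotlib.pyplot as plt

# Importances in descending order, with the inter-tree standard deviation
# as error bars; labels follow the x_labels list defined above
plt.figure(figsize=(12, 4))
plt.title("Feature importances")
plt.bar(range(len(importances)), importances[indices], color="r",
        yerr=std[indices], align="center")
plt.xticks(range(len(importances)), [x_labels[i] for i in indices], rotation=90)
plt.tight_layout()
plt.show()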
Example 5: process_data
# Required import: from sklearn.ensemble import ExtraTreesClassifier [as alias]
# Or: from sklearn.ensemble.ExtraTreesClassifier import transform [as alias]
def process_data(self):
test = pandas.read_csv("test.csv")
testMat = test.as_matrix()
train = pandas.read_csv("train.csv")
trainMat = train.as_matrix()
trainResult = trainMat[:, 0]
trainMat = trainMat[:, 1:]
# trainInd = np.where(trainResult == 0)[0]
# how_many = (trainResult == 1).sum() - len(trainInd)
# np.random.shuffle(trainInd)
# addedResult = trainResult[trainInd[:how_many],:]
# addedData = trainMat[trainInd[:how_many],:]
# trainResult = np.append(trainResult,addedResult)
# trainMat = np.vstack((trainMat,addedData))
cv = StratifiedKFold(trainResult, 2)
# cv = KFold(n=trainResult.shape[0],k=2)
reduceFeatures = ExtraTreesClassifier(
compute_importances=True, random_state=1234, n_jobs=self.cpus, n_estimators=1000, criterion="gini"
)
reduceFeatures.fit(trainMat, trainResult)
trainScaler = Scaler()
self.cv_data = []
self.cv_data_nonreduced = []
for train, test in cv:
X_train, X_test, Y_train, Y_test = (
trainMat[train, :],
trainMat[test, :],
trainResult[train],
trainResult[test],
)
X_train = trainScaler.fit_transform(X_train)
X_test = trainScaler.transform(X_test)
self.cv_data_nonreduced.append((X_train, X_test, Y_train, Y_test))
X_train = reduceFeatures.transform(X_train)
X_test = reduceFeatures.transform(X_test)
self.cv_data.append((X_train, X_test, Y_train, Y_test))
testMat = trainScaler.transform(testMat)
self.testMat_nonreduced = testMat
self.testMat = reduceFeatures.transform(testMat)
allData = self.testMat, self.cv_data, self.testMat_nonreduced, self.cv_data_nonreduced
data_handle = open("allData.pkl", "wb")
pickle.dump(allData, data_handle)
data_handle.close()
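One caveat with the block above: reduceFeatures and its importances are learned from the full training matrix before the folds are split, so each fold's held-out part has already influenced the feature selection. A hedged sketch of the stricter per-fold variant, reusing the names defined above:
# Stricter variant: fit scaler and selector inside each fold so the held-out
# part never leaks into the feature selection
for train, test in cv:
    X_train, X_test = trainMat[train, :], trainMat[test, :]
    Y_train, Y_test = trainResult[train], trainResult[test]
    X_train = trainScaler.fit_transform(X_train)
    X_test = trainScaler.transform(X_test)
    fold_reducer = ExtraTreesClassifier(compute_importances=True, random_state=1234,
                                        n_jobs=self.cpus, n_estimators=1000, criterion="gini")
    fold_reducer.fit(X_train, Y_train)
    self.cv_data.append((fold_reducer.transform(X_train),
                         fold_reducer.transform(X_test), Y_train, Y_test))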
Example 6: getSelectedValues
# Required import: from sklearn.ensemble import ExtraTreesClassifier [as alias]
# Or: from sklearn.ensemble.ExtraTreesClassifier import transform [as alias]
def getSelectedValues(self):
(train, trainLabels, test) = self.getScaledValues()
selector = ExtraTreesClassifier(compute_importances=True, random_state=0)
train = selector.fit_transform(train, trainLabels)
test = selector.transform(test)
return (train, trainLabels, test)
Example 7: tree_select
# Required import: from sklearn.ensemble import ExtraTreesClassifier [as alias]
# Or: from sklearn.ensemble.ExtraTreesClassifier import transform [as alias]
def tree_select(trainSet, testSet): # input as numpy array
from sklearn.ensemble import ExtraTreesClassifier
#import matplotlib.pyplot as plt
import numpy
X, y = trainSet[:,1:], trainSet[:,0]
#print [X.shape, y.shape]
clf = ExtraTreesClassifier(max_depth=10, n_jobs=-1, bootstrap=True, n_estimators=25)
clf.fit(X, y)
importances = clf.feature_importances_
#std = numpy.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
indices = numpy.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(importances.shape[0]):
print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
'''
plt.figure()
plt.title("Feature importances")
plt.bar(range(10), importances[indices],
color="r", yerr=std[indices], align="center")
plt.xticks(range(10), indices)
plt.xlim([-1, 10])
plt.show()
#clf.feature_importances_
#print X_new.shape
'''
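# Note: the transform below assumes testSet holds feature columns only; if it
# still carried the label in column 0 (like trainSet), the column count would
# not match the fitted forest and the selection mask would fail to apply.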
testSet = clf.transform(testSet)
X_new = clf.transform(X)
#raw_input('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
#
return numpy.hstack((numpy.reshape(y,(y.shape[0],1)), X_new)), testSet
Example 8: ExtraTreesClassifier
# Required import: from sklearn.ensemble import ExtraTreesClassifier [as alias]
# Or: from sklearn.ensemble.ExtraTreesClassifier import transform [as alias]
import numpy as np
from sklearn import preprocessing as pp
from sklearn import cross_validation as cv
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
workDir = r'C:\users\Akshay\Downloads\kaggle\\'
# Read data
train = np.genfromtxt(open(workDir + 'train.csv','rb'), delimiter=',')
target = np.genfromtxt(open(workDir + 'trainLabels.csv','rb'), delimiter=',')
test = np.genfromtxt(open(workDir + 'test.csv','rb'), delimiter=',')
# Scale data
train = pp.scale(train)
test = pp.scale(test)
# Select features
selector = ExtraTreesClassifier(compute_importances=True, random_state=0)
train = selector.fit_transform(train, target)
test = selector.transform(test)
# Estimate score
classifier = SVC(C=8, gamma=0.17)
scores = cv.cross_val_score(classifier, train, target, cv=30)
print('Estimated score: %0.5f (+/- %0.5f)' % (scores.mean(), scores.std() / 2))
# Predict and save
result = classifier.fit(train, target).predict(test)
np.savetxt(workDir + 'a.csv', result, fmt='%d')
Example 9: ExtraTreesPreprocessor
# Required import: from sklearn.ensemble import ExtraTreesClassifier [as alias]
# Or: from sklearn.ensemble.ExtraTreesClassifier import transform [as alias]
class ExtraTreesPreprocessor(AutoSklearnPreprocessingAlgorithm):
def __init__(self, n_estimators, criterion, min_samples_leaf,
min_samples_split, max_features,
max_leaf_nodes_or_max_depth="max_depth",
bootstrap=False, max_leaf_nodes=None, max_depth="None",
min_weight_fraction_leaf=0.0,
oob_score=False, n_jobs=1, random_state=None, verbose=0,
class_weight=None):
self.n_estimators = int(n_estimators)
self.estimator_increment = 10
if criterion not in ("gini", "entropy"):
raise ValueError("'criterion' is not in ('gini', 'entropy'): "
"%s" % criterion)
self.criterion = criterion
if max_leaf_nodes_or_max_depth == "max_depth":
self.max_leaf_nodes = None
if max_depth == "None":
self.max_depth = None
else:
self.max_depth = int(max_depth)
# if use_max_depth == "True":
# self.max_depth = int(max_depth)
#elif use_max_depth == "False":
# self.max_depth = None
else:
if max_leaf_nodes == "None":
self.max_leaf_nodes = None
else:
self.max_leaf_nodes = int(max_leaf_nodes)
self.max_depth = None
self.min_samples_leaf = int(min_samples_leaf)
self.min_samples_split = int(min_samples_split)
self.max_features = float(max_features)
if bootstrap == "True":
self.bootstrap = True
elif bootstrap == "False":
self.bootstrap = False
self.oob_score = oob_score
self.n_jobs = int(n_jobs)
self.random_state = random_state
self.verbose = int(verbose)
self.class_weight = class_weight
self.preprocessor = None
def fit(self, X, Y, sample_weight=None):
from sklearn.ensemble import ExtraTreesClassifier
num_features = X.shape[1]
max_features = int(
float(self.max_features) * (np.log(num_features) + 1))
# Use at most half of the features
max_features = max(1, min(int(X.shape[1] / 2), max_features))
self.preprocessor = ExtraTreesClassifier(
n_estimators=0, criterion=self.criterion,
max_depth=self.max_depth, min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap,
max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
random_state=self.random_state, class_weight=self.class_weight,
warm_start=True
)
# JTS TODO: I think we might have to copy here if we want self.estimator
# to always be consistent on sigabort
while len(self.preprocessor.estimators_) < self.n_estimators:
tmp = self.preprocessor # TODO copy ?
tmp.n_estimators += self.estimator_increment
tmp.fit(X, Y, sample_weight=sample_weight)
self.preprocessor = tmp
return self
def transform(self, X):
if self.preprocessor is None:
raise NotImplementedError
return self.preprocessor.transform(X)
@staticmethod
def get_properties(dataset_properties=None):
return {'shortname': 'ET',
'name': 'Extra Trees Classifier Preprocessing',
'handles_missing_values': False,
'handles_nominal_values': False,
'handles_numerical_features': True,
'prefers_data_scaled': False,
# TODO find out if this is good because of sparcity...
'prefers_data_normalized': False,
'handles_regression': False,
'handles_classification': True,
'handles_multiclass': True,
'handles_multilabel': True,
'is_deterministic': True,
'handles_sparse': False,
'input': (DENSE, SPARSE, UNSIGNED_DATA),
'output': (INPUT,),
# TODO find out what is best used here!
#......... the rest of this example is omitted .........
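The while-loop in fit() grows the forest in steps of estimator_increment by exploiting warm_start, which keeps the trees already built and only trains the newly requested ones. A standalone sketch of the same pattern on made-up data:
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

# Toy data, for illustration only
X = np.random.rand(200, 10)
Y = np.random.randint(0, 2, size=200)

clf = ExtraTreesClassifier(n_estimators=10, warm_start=True, random_state=0)
clf.fit(X, Y)                  # builds the first 10 trees
while clf.n_estimators < 100:  # grow toward the target ensemble size
    clf.n_estimators += 10
    clf.fit(X, Y)              # warm_start: only the 10 new trees are trained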
Example 10: ExtraTreesClassifier
# Required import: from sklearn.ensemble import ExtraTreesClassifier [as alias]
# Or: from sklearn.ensemble.ExtraTreesClassifier import transform [as alias]
X = train[:, :-1]
y = train[:, -1]
del labels
del train
# Parameters for trees
random_state = 5342
n_jobs = 8
verbose = 1
n_estimators = 89
# ExtraTreesClassifier - feature selection
clf1 = ExtraTreesClassifier(criterion='gini', random_state=random_state, n_jobs=n_jobs, verbose=verbose, n_estimators=n_estimators, max_features=None)
clf1.fit(X, y)
X_new = clf1.transform(X, '0.5*median')
X = X_new
# Initialize classifier
clf = KNeighborsClassifier(n_neighbors=20, p=1)
# Start training
print('training started')
############################
# test log loss
print('computing log loss')
kf = cross_validation.KFold(ntrain, n_folds=4)
_logloss = 0.0
for trainIndex, testIndex in kf:
Example 11: LogisticRegression
# Required import: from sklearn.ensemble import ExtraTreesClassifier [as alias]
# Or: from sklearn.ensemble.ExtraTreesClassifier import transform [as alias]
if(algo=='l1' or algo=='l2'):
clf2 = LogisticRegression(C=reg,penalty=algo,random_state=0)
elif(algo=='lsvm'):
clf2 = svm.SVC(kernel='linear', C=reg)
elif(algo=='ksvm'):
clf2 = svm.SVC(kernel='rbf', C=reg, gamma=1e-5)
clf2.fit(X_train, y_train)
print "Validation set score filtered coeff: " , clf2.score(X_val, y_val)
elif(fe==5): #Tree based feature selection
forest = ExtraTreesClassifier(n_estimators=20, random_state=144)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
pl.figure(2)
pl.bar(range(10), importances, color="r", yerr=std, align="center")
X_train = forest.transform(X_train, threshold="mean")
X_val = forest.transform(X_val, threshold="mean")
if(algo=='l1' or algo=='l2'):
clf2 = LogisticRegression(C=reg,penalty=algo,random_state=0)
elif(algo=='lsvm'):
clf2 = svm.SVC(kernel='linear', C=reg)
elif(algo=='ksvm'):
clf2 = svm.SVC(kernel='rbf', C=reg, gamma=1e-5)
clf2.fit(X_train, y_train)
print "Validation set score filtered coeff: " , clf2.score(X_val, y_val)
pl.show()
if(resfile != ''):
print "Creating the testset."
subjects_test = range(17, 24)
Example 12: PCA
# Required import: from sklearn.ensemble import ExtraTreesClassifier [as alias]
# Or: from sklearn.ensemble.ExtraTreesClassifier import transform [as alias]
file_label.close()
# normalize the features in the train and test dataset
train_data_array_norm = preprocessing.scale(train_data_array)
test_data_array_norm = preprocessing.scale(test_data_array)
# run the module of PCA
#pca = PCA(n_components = 10)
#train_data_array_norm_pca = pca.fit_transform(train_data_array_norm, train_result_array)
#test_data_array_norm_pca = pca.transform(test_data_array_norm)
#print 'train data shape', train_data_array_norm_pca.shape
# tree-based feature selection
classifier = ExtraTreesClassifier()
train_data_array_norm_pca = classifier.fit_transform(train_data_array_norm, np.ravel(train_result_array))
test_data_array_norm_pca = classifier.transform(test_data_array_norm)
print 'train data shape', train_data_array_norm_pca.shape
## build SVM
# random shuffle
np.random.seed(0)
indices = np.random.permutation(len(train_result_array))
classifier_svm = svm.SVC(C=20, gamma=0.05)
# cross validation on the SVM (the original mixed up "classifier"/"classifer")
scores = cv.cross_val_score(classifier_svm, train_data_array_norm_pca, np.ravel(train_result_array), cv=30)
classifier_svm.fit(train_data_array_norm_pca[indices[:-200], :], np.ravel(train_result_array[indices[:-200]]))
Example 13: ExtraTreesClassifier
# Required import: from sklearn.ensemble import ExtraTreesClassifier [as alias]
# Or: from sklearn.ensemble.ExtraTreesClassifier import transform [as alias]
#X=np.hstack((np.hstack((np.real(np.fft.fft(X,axis=-1)),np.imag(np.fft.fft(X,axis=-1)))),X))
#X_test = np.hstack((np.hstack((np.real(np.fft.fft(X_test,axis=-1)),np.imag(np.fft.fft(X_test,axis=-1)))),X_test))
clf = ExtraTreesClassifier()
X = clf.fit(X, y).transform(X)
X_test = clf.transform(X_test)
#drop features
#features= [34,48,16,39,62,68,60,67,22,18,14,11,43,87,75,42,59,45,15,55,26,1,56,38,64,70,29,85,32,50,21,40,69,9,86,72,91,36,33,90,41,73,23,74,93,53,77]
#feature =[ item-1 for item in features]
'''
import random
feature = range(93)
feature = random.sample(feature,60)
X=X[:,feature]
X_test=X_test[:,feature]
'''
nb_classes = y.shape[1]
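For reference, the fit-then-transform chain above can be collapsed into one call; on the legacy API, fit_transform (as used in Example 12) is equivalent:
X = clf.fit_transform(X, y)   # same as clf.fit(X, y).transform(X)
X_test = clf.transform(X_test)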
Example 14: ExtraTreesClassifier
# Required import: from sklearn.ensemble import ExtraTreesClassifier [as alias]
# Or: from sklearn.ensemble.ExtraTreesClassifier import transform [as alias]
del labels
del train
# Parameters for trees
random_state = 5342
n_jobs = 8
verbose = 1
n_estimators = 89
estimator = ExtraTreesClassifier(criterion='gini', random_state=random_state, n_jobs=n_jobs, verbose=verbose, n_estimators=n_estimators, max_features=None)
clf = AdaBoostClassifier(base_estimator=estimator, random_state=random_state, learning_rate=0.8)
# Start training
print('training started')
estimator.fit(X, y)
X = estimator.transform(X, '1.25*median')
############################
# test log loss
print('computing log loss')
kf = cross_validation.KFold(ntrain, n_folds=4)
_logloss = 0.0
for trainIndex, testIndex in kf:
print("TRAIN:", trainIndex, "TEST:", testIndex)
X_train, X_test = X[trainIndex], X[testIndex]
y_train, y_test = y[trainIndex], y[testIndex]
clf.fit(X_train, y_train)
pred = clf.predict_proba(X_test)
Example 15: print
# Required import: from sklearn.ensemble import ExtraTreesClassifier [as alias]
# Or: from sklearn.ensemble.ExtraTreesClassifier import transform [as alias]
import numpy as np
import pandas as pd
from sklearn import preprocessing as pp
from sklearn.ensemble import ExtraTreesClassifier
print("Preparing the data")
train = pd.io.parsers.read_csv(r"D:\shared\datascience\phy_train_clean.csv", sep=',', header=0)
test = pd.io.parsers.read_csv(r"D:\shared\datascience\phy_test_clean.csv", sep=',', header=0)
test_index = test.Id
test = test.iloc[:,2:]
target = train.kind
train_index = train.Id
train = train.iloc[:,2:]
print("Preparing an Feature classifier")
selector = ExtraTreesClassifier(compute_importances=True, random_state=0)
print("Transforming the original dataset")
train = pd.DataFrame(selector.fit_transform(train, target), index = train_index)
test = pd.DataFrame(selector.transform(test), index = test_index)
train['kind'] = target
print("Storing the data...")
train.to_csv(r"D:\shared\datascience\phy_train.csv", sep=',')
test.to_csv(r"D:\shared\datascience\phy_test.csv", sep=',')
print("Job finished")