本文整理汇总了Python中sklearn.ensemble.RandomForestClassifier.transform方法的典型用法代码示例。如果您正苦于以下问题:Python RandomForestClassifier.transform方法的具体用法?Python RandomForestClassifier.transform怎么用?Python RandomForestClassifier.transform使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.ensemble.RandomForestClassifier
的用法示例。
在下文中一共展示了RandomForestClassifier.transform方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: RFSelection
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
class RFSelection(TransformerMixin):
    """Feature-selection transformer backed by a RandomForestClassifier.

    Fits a forest, ranks the columns by impurity-based importance and
    keeps roughly ``n_features`` of them (defaults to half the columns).

    Parameters:
        n_features: number of features to keep, or None for half.
        n_estimators: number of trees in the forest.
        random_state: seed forwarded to the forest.
    """

    def __init__(self, n_features=None, n_estimators=100, random_state=0):
        # Bug fix: the original hard-coded random_state=0, silently
        # ignoring the constructor argument.
        self.rf = RandomForestClassifier(
            n_estimators=n_estimators, random_state=random_state)
        self.n_features = n_features

    def fit(self, X, y=None):
        """Fit the forest on (X, y) and derive the importance threshold."""
        X_local = np.array(X)
        self.rf.fit(X_local, y)
        importances = self.rf.feature_importances_
        ranking = np.argsort(importances)  # ascending by importance
        if self.n_features is None:
            self.n_features = round(X_local.shape[1] / 2)
        # NOTE(review): ranking is ascending, so this picks the
        # (n_features+1)-th *smallest* importance as the cut-off; to keep
        # the top n_features one would expect ranking[-self.n_features].
        # Preserved as-is pending confirmation of the intended behaviour.
        self.threshold = importances[ranking[self.n_features]]
        return self

    def transform(self, X, y=None):
        # RandomForestClassifier.transform was deprecated in sklearn 0.17
        # and removed in 0.19; SelectFromModel is the modern replacement.
        return self.rf.transform(X, self.threshold)
示例2: test_importances
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def test_importances():
    """Check variable importances."""
    X, y = datasets.make_classification(n_samples=1000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X, y)
    importances = clf.feature_importances_
    n_important = sum(importances > 0.1)
    assert_equal(importances.shape[0], 10)
    assert_equal(n_important, 3)
    X_new = clf.transform(X, threshold="mean")
    # Bug fix: the original wrote assert_less(0 < X_new.shape[1], X.shape[1]),
    # which compares the *boolean* (0 < ncols) against the column count and
    # therefore always passes.  The intent is 0 < X_new.shape[1] < X.shape[1].
    assert_less(0, X_new.shape[1])
    assert_less(X_new.shape[1], X.shape[1])
    # Check with sample weights
    sample_weight = np.ones(y.shape)
    sample_weight[y == 1] *= 100
    clf = RandomForestClassifier(n_estimators=50, random_state=0)
    clf.fit(X, y, sample_weight=sample_weight)
    importances = clf.feature_importances_
    assert np.all(importances >= 0.0)
    # Importances must be invariant under a uniform scaling of the weights.
    clf = RandomForestClassifier(n_estimators=50, random_state=0)
    clf.fit(X, y, sample_weight=3*sample_weight)
    importances_bis = clf.feature_importances_
    assert_almost_equal(importances, importances_bis)
示例3: main
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def main():
if len(sys.argv) < 6:
print "Usage: [program] train test tree_num seed output [thread_num]"
return
elif len(sys.argv) == 6:
threadNum = 1
elif len(sys.argv) == 7:
threadNum = int(sys.argv[6])
treeNum = int(sys.argv[3])
seed = int(sys.argv[4])
print "try",treeNum,"trees with",threadNum,"threads","and seed",seed
trainData = dataProcessor(sys.argv[1])
testData = dataProcessor(sys.argv[2])
target = [data.type for data in trainData]
train = [data.matrix for data in trainData]
test = [data.matrix for data in testData]
# target = numpy.genfromtxt("./data/target.csv",delimiter=",")
# train = numpy.genfromtxt("./data/train.csv",delimiter=",")
# test = numpy.genfromtxt("./data/test.csv",delimiter=",")
print "Data load over, start to generate trees"
rf = RandomForestClassifier(n_estimators = treeNum,n_jobs=threadNum,oob_score=True)
rf.fit(train,target)
train_r = rf.transform(train)
test_r = rf.transform(test)
numpy.savetxt("train.csv",train_r,fmt="%d")
numpy.savetxt("test.csv",test_r,fmt="%d")
numpy.savetxt("target.csv",target,fmt="%d")
numpy.savetxt("ans.csv",[0]*len(test_r),fmt="%d")
print "fit done, # of class:",rf.n_classes_,", oob score:",rf.oob_score_
result = rf.predict(test)
fout = open(sys.argv[5],"w")
for i in result:
tmp = int(i)
fout.write(`tmp`+"\n")
示例4: test_importances
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def test_importances():
    """Check variable importances."""
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0
    )
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X, y)
    importances = clf.feature_importances_
    n_important = sum(importances > 0.1)
    assert_equal(importances.shape[0], 10)
    assert_equal(n_important, 3)
    X_new = clf.transform(X, threshold="mean")
    # Bug fix: assert_less(0 < X_new.shape[1], X.shape[1]) compared a
    # boolean to the column count and always passed; assert the intended
    # chain 0 < X_new.shape[1] < X.shape[1] explicitly.
    assert_less(0, X_new.shape[1])
    assert_less(X_new.shape[1], X.shape[1])
示例5: featureSelection
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def featureSelection(matrixX, matrixY, seed, fileName):
    """Fit a 240-tree random forest on (matrixX, matrixY), write the
    OOB score, training accuracy and per-feature importances to a CSV,
    and return the fitted model with the importance-reduced matrix.
    """
    clf = RandomForestClassifier(n_estimators=240,
                                 random_state=seed,
                                 oob_score=True)
    clf.fit(matrixX, numpy.ravel(matrixY))
    featureMatrix = clf.transform(matrixX)
    # Lead the report with the overall scores, then one row per feature.
    rows = [{"ID": "oob_score", "Value": clf.oob_score_},
            {"ID": "Accuracy", "Value": clf.score(matrixX, matrixY)}]
    for idx, imp in enumerate(clf.feature_importances_, start=1):
        rows.append({"ID": idx, "Value": imp})
    writeFileArray(rows, "%s_featureImportance_seed-%i.csv" % (fileName, seed))
    return [clf, featureMatrix]
示例6: rf_classify
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def rf_classify(self):
print "Random Forest"
clf = RandomForestClassifier()
clf.fit(self.descr, self.target)
mean = clf.score(self.test_descr, self.test_target)
pred = clf.predict(self.test_descr)
pred_df = pd.DataFrame(pred)
pred_df.to_csv("pred_val_rf.csv")
print "Pred ", pred
print "Mean : %3f" % mean
print "Feature Importances ", clf.feature_importances_
print "Predict Probability ", clf.predict_proba(self.descr)
print "Transform ", clf.transform(self.descr)
示例7: main
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def main():
X, y = loadData("breast-cancer-wisconsin.data")
feature_names = [
"Clump Thickness",
"Uniformity of Cell Size",
"Uniformity of Cell Shape",
"Marginal Adhesion",
"Single Epithelial Cell Size",
"Bare Nuclei",
"Bland Chromatin",
"Normal Nucleoli",
"Mitoses",
]
np.random.seed(1)
# split the data into training and test sets
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
# find the best classifier from the grid search
params = gridSearchCV(X_train, y_train)
n_estimators = params["n_estimators"]
max_features = params["max_features"]
# fit the classifier to the training data with the best parameters
clf = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features)
clf.fit(X_train, y_train)
# report the training and test scores
print "Accuracy on the test set: %.2f%%" % (clf.score(X_test, y_test) * 100)
print "Accuracy on the training set: %.2f%%\n" % (clf.score(X_train, y_train) * 100)
# report the most important features
feature_importances = clf.feature_importances_
sorted_feature_importances = np.argsort(feature_importances)[::-1]
j = clf.transform(X).shape[1]
print "The most important %d features are:" % j
for i, index in enumerate(sorted_feature_importances[:j]):
print "%d) %s (%.2f%%)" % (i + 1, feature_names[index], feature_importances[index] * 100)
示例8: main
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def main(args):
    """Entry point: fit a random forest on a wide-format dataset, export
    the ranked feature importances and the importance-filtered data matrix.

    Relies on project helpers (``wideToDesign``, ``logger``) supplied by
    the surrounding module; ``args`` is an argparse-style namespace with
    fname/dname/uniqID/group/num/oname/oname2 attributes.
    """
    # Import data and transpose
    logger.info(u'Importing data with following parameters: \n\tWide: {0}\n\tDesign: {1}\n\tUnique ID: {2}\n\tGroup Column: {3}'.format(args.fname, args.dname, args.uniqID, args.group))
    dat = wideToDesign(args.fname, args.dname, args.uniqID, args.group, clean_string=True)
    data = dat.transpose()
    # Drop any column containing missing values up front (see TODO below).
    data.dropna(axis=1, inplace=True)
    # Pull classifications out of dataset
    classes = data[dat.group].copy()
    data.drop(dat.group, axis=1, inplace=True)
    #TODO: Random forest does not handle NaNs, need to figure out the proper way to impute values.
    # Build Random Forest classifier
    logger.info('Creating classifier')
    model = RandomForestClassifier(n_estimators=args.num)
    model.fit(data, classes)
    # Identify features
    # NOTE(review): DataFrame.sort(columns=...) was removed in pandas 0.20;
    # this code targets an older pandas (sort_values is the modern form).
    importance = pd.DataFrame([data.columns, model.feature_importances_]).T.sort(columns=1, ascending=False)
    # Export features ranked by importance
    logger.info('Exporting features')
    rev = importance.applymap(lambda x: dat.revertStr(x))
    rev.columns = ('feature', 'ranked_importance')
    rev.to_csv(args.oname2, index=False, sep='\t')
    # Select data based on features
    # Reorder the columns by importance, then keep every feature with
    # importance above 0 via the (deprecated) transform(threshold=0).
    data = data[importance.ix[:, 0].tolist()]
    selected_data = pd.DataFrame(model.transform(data, threshold=0))
    selected_data.columns = [dat.revertStr(x) for x in data.columns]
    # Merge on classes and export
    logger.info('Exporting transformed data')
    clDf = pd.DataFrame(classes)
    clDf.reset_index(inplace=True)
    out = clDf.join(selected_data)
    out.to_csv(args.oname, index=False, sep='\t', float_format="%.4f")
示例9: RFClassifier
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def RFClassifier(criteria, maxN):
fhin = open('train.csv', 'rU')
header = fhin.readline()
fhin.close()
patientTags = header.split(',')[1:]
y = []
for each in patientTags:
if re.search('CON', each):
y.append(1)
else:
y.append(0)
data = joblib.load('GS_pickles/meanCenteredData.pkl')
RFModel = RFC(criterion=criteria, max_features = "auto", compute_importances=True, n_jobs=6)
classifier = RFModel.fit(data, y)
testData = joblib.load('GS_pickles/imputed_test_data.pkl')
predictions = RFModel.predict(testData)
realLabels = []
for each in predictions:
if each==0:
realLabels.append('AD')
else:
realLabels.append('Normal')
print predictions
print realLabels
# sys.exit()
featureImp = classifier.feature_importances_
# print fsorted
print 'feature importance', featureImp.shape, featureImp
featureInd = []
for i in range(8650):
featureInd.append([featureImp[i], i])
fBest = []
fBestInd = []
# for i in range(8650):
# if featureInd[i][0]>0.001:
# fBest.append(featureInd[i])
# fBestInd.append(featureInd[i][1])
fSorted = sorted(featureInd, key=sortFun)
#
# print featureInd[:500]
print fSorted[-44:]
for each in fSorted[-44:]:
fBest.append(each[1])
fBestInd = sorted(fBest)
print fBestInd
# print 'len(fBest)', len(fBest)
# print fBest
# print fBestInd
joblib.dump(fBestInd, 'randomForest_features_44v2.pkl')
#
# scor = classifier.oob_score_
#
# df = classifier.oob_decision_function_
# sys.exit()
skf = cross_validation.StratifiedKFold(y, 10)
cv_scores = cross_validation.cross_val_score(RFModel, data, y, cv=skf, n_jobs=1)
print "Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() / 2)
#
#
#
#
################################################################################
## Classification and ROC analysis
#
## Run classifier with crossvalidation and plot ROC curves
##cv = StratifiedKFold(y, k=6)
##classifier = svm.SVC(kernel='linear', probability=True)
#
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []
y = np.array(y).transpose()
print y
dataAr = np.array(data)
#
dataTrans = RFModel.transform(dataAr)
for i, (train, test) in enumerate(skf):
# print dataTrans[train].shape
### print y[train].shape
## dataTrans[test].shape
## print y
#.........这里部分代码省略.........
示例10: RandomForestClassifier
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
# Rank every feature of the training set with a large forest, print and
# plot the importances, then keep only the columns whose importance
# exceeds 0.15.  X_train, y_train and feat_labels come from code omitted
# by the tutorial excerpt.
forest = RandomForestClassifier(n_estimators=10000,
                                random_state=0,
                                n_jobs=-1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
# Feature indices, most important first.
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]),
        importances[indices],
        color='lightblue',
        align='center')
plt.xticks(range(X_train.shape[1]),
           feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
# plt.tight_layout()
# plt.savefig('./random_forest.png', dpi=300)
plt.show()
# transform() keeps the columns above the threshold (deprecated sklearn
# API; SelectFromModel is the modern replacement).
X_selected = forest.transform(X_train, threshold=0.15)
X_selected.shape
示例11: gen_data
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def gen_data():
    """Assemble the final train/test feature matrices by merging pickled
    base features with frequency, instruction, n-gram, dll and asm-image
    feature sets, all joined on the ``Id`` column.

    Returns:
        (train, test): two pandas DataFrames with matching feature
        columns; ``train`` additionally carries the ``Class`` label.
    """
    # the 4k features!
    the_train = pickle.load(open('X33_train_reproduce.p','rb'))
    the_test = pickle.load(open('X33_test_reproduce.p','rb'))
    # corresponding id and labels
    Id = pickle.load(open('xid.p','rb'))
    labels = pickle.load(open('y.p','rb'))
    Id_test = pickle.load(open('Xt_id.p','rb'))
    # merge them into pandas
    join_train = np.column_stack((Id, the_train, labels))
    join_test = np.column_stack((Id_test, the_test))
    train = pd.DataFrame(join_train, columns=['Id']+['the_fea%i'%i for i in xrange(the_train.shape[1])] + ['Class'])
    test = pd.DataFrame(join_test, columns=['Id']+['the_fea%i'%i for i in xrange(the_train.shape[1])])
    del join_train, join_test
    # convert into numeric features
    # NOTE(review): convert_objects(), .ix[] and as_matrix() were removed
    # in modern pandas — this code targets an old pandas / Python 2 stack.
    train = train.convert_objects(convert_numeric=True)
    test = test.convert_objects(convert_numeric=True)
    # including more things
    train_count = pd.read_csv("train_frequency.csv")
    test_count = pd.read_csv("test_frequency.csv")
    train = pd.merge(train, train_count, on='Id')
    test = pd.merge(test, test_count, on='Id')
    # instr count
    train_instr_count = pd.read_csv("train_instr_frequency.csv")
    test_instr_count = pd.read_csv("test_instr_frequency.csv")
    # Drop instruction columns that never occur in the training set.
    for n in list(train_instr_count)[1:]:
        if np.sum(train_instr_count[n]) == 0:
            del train_instr_count[n]
            del test_instr_count[n]
    train_instr_freq = train_instr_count.copy()
    test_instr_freq = test_instr_count.copy()
    # Normalise instruction counts to per-row frequencies.
    train_instr_freq.ix[:,1:] = train_instr_freq.ix[:,1:].apply(lambda x: x/np.sum(x), axis = 1)
    #train_instr_freq = train_instr_freq.replace(np.inf, 0)
    train_instr_freq = train_instr_freq.replace(np.nan, 0)
    test_instr_freq.ix[:,1:]=test_instr_freq.ix[:,1:].apply(lambda x: x/np.sum(x), axis = 1)
    #test_instr_freq = test_instr_freq.replace(np.inf, 0)
    test_instr_freq = test_instr_freq.replace(np.nan, 0)
    train = pd.merge(train, train_instr_freq, on='Id')
    test = pd.merge(test, test_instr_freq, on='Id')
    ## all right, include more!
    grams_train = pd.read_csv("train_data_750.csv")
    grams_test = pd.read_csv("test_data_750.csv")
    # daf features
    #train_daf = pd.read_csv("train_daf.csv")
    #test_daf = pd.read_csv("test_daf.csv")
    #daf_list = [0,165,91,60,108,84,42,93,152,100] #daf list for 500 grams.
    # dll features
    train_dll = pd.read_csv("train_dll.csv")
    test_dll = pd.read_csv("test_dll.csv")
    # merge all them
    #mine = pd.merge(grams_train, train_daf,on='Id')
    mine = grams_train
    mine = pd.merge(mine, train_dll, on='Id')
    mine_labels = pd.read_csv("trainLabels.csv")
    mine = pd.merge(mine, mine_labels, on='Id')
    mine_labels = mine.Class
    mine_Id = mine.Id
    del mine['Class']
    del mine['Id']
    mine = mine.as_matrix()
    #mine_test = pd.merge(grams_test, test_daf,on='Id')
    mine_test = grams_test
    mine_test = pd.merge(mine_test, test_dll,on='Id')
    mine_test_id = mine_test.Id
    del mine_test['Id']
    # Feature-select the n-gram/dll block with a random forest: keep the
    # columns whose importance exceeds 1.25x the mean importance
    # (deprecated estimator.transform() API).
    clf_se = RF(n_estimators=500, n_jobs=-1,random_state = 0)
    clf_se.fit(mine,mine_labels)
    mine_train = np.array(clf_se.transform(mine, '1.25*mean'))
    mine_test = np.array(clf_se.transform(mine_test, '1.25*mean'))
    train_mine = pd.DataFrame(np.column_stack((mine_Id, mine_train)), columns=['Id']+['mine_'+str(x) for x in xrange(mine_train.shape[1])]).convert_objects(convert_numeric=True)
    test_mine = pd.DataFrame(np.column_stack((mine_test_id, mine_test)), columns=['Id']+['mine_'+str(x) for x in xrange(mine_test.shape[1])]).convert_objects(convert_numeric=True)
    train = pd.merge(train, train_mine, on='Id')
    test = pd.merge(test, test_mine, on='Id')
    # asm image features: only the first 800 pixel columns are loaded.
    train_image = pd.read_csv("train_asm_image.csv", usecols=['Id']+['asm_%i'%i for i in xrange(800)])
    test_image = pd.read_csv("test_asm_image.csv", usecols=['Id']+['asm_%i'%i for i in xrange(800)])
    train = pd.merge(train, train_image, on='Id')
    test = pd.merge(test, test_image, on='Id')
    return train, test
示例12: len
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features='auto',
max_leaf_nodes=None,
bootstrap=True,
oob_score=False,
random_state=None,
verbose=0,
warm_start=False,
class_weight=None)
# Fit the forest; rf, predictor, classes, test and data are created in
# code omitted from this excerpt.
fit = rf.fit(predictor, classes)
# Side Step: Attribute Importance ???
# NOTE(review): with threshold=None the deprecated transform() falls back
# to the estimator's default threshold, returning a *reduced feature
# matrix* — not an importance vector as the variable name suggests.
var_importance = rf.transform(predictor, threshold=None)
# Step 3: Test set probability and prediction
# Columns 1-4 of each test row are predictors; column 5 is the label.
test_predictor = [x[1:5] for x in test]
test_classes = [x[5] for x in test]
# Provides probability of belonging to each class
rf_prob = rf.predict_proba(test_predictor).tolist()
# Predicts the class
rf_cls = rf.predict(test_predictor).tolist()
# Step 4: Evaluating Accuracy
pop = len(data)
error = 0
示例13: feature_selection
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def feature_selection(fl, nofl, method='Fscore', N_features=10):
    """
    Selects most important features according to F-score, entropy etc
    Args:
        fl: 2D np array (region, feature) of flaring regions
        nofl: 2D np array (region, feature) of nonflaring regions
        method: string, choose one of: 'Fscore', 'RandomForest', 'RFE', 'chi2', 'pca', 'DecisionTree'
        N_features: integer, number of features to be selected
    Returns:
        fl: 2D transformed array (region, only important features) of flaring regions
        nofl: 2D transformed array of nonflaring regions
        scores: 1D array with size N_features which has the scores of the features (e.g. F score)
               float for pca that shows variance explained
    """
    # Labels: 1 for flaring, 0 for non-flaring regions.
    Nfl = fl.shape[0]; Nnofl = nofl.shape[0]
    yfl = np.ones(Nfl); ynofl = np.zeros(Nnofl)
    if method == 'RandomForest':
        # Rank by impurity importance; keep the N_features best columns.
        selector = RandomForestClassifier(n_estimators=10000, criterion='entropy', \
                                          class_weight='auto', max_features = 0.5)
        selector.fit(np.concatenate((fl,nofl),axis=0), np.concatenate((yfl, ynofl), axis=0))
        scores=selector.feature_importances_
        #threshold = sorted(scores, reverse=True)[N_features-1]
        #fl = selector.transform(fl, threshold=threshold)
        #nofl = selector.transform(nofl, threshold=threshold)
        fl = fl[:, np.argsort(scores)[::-1][0:N_features]]
        nofl = nofl[:, np.argsort(scores)[::-1][0:N_features]]
    elif method == 'DecisionTree':
        # Same idea as RandomForest but with a single tree's importances.
        selector = DecisionTreeClassifier(criterion='entropy', class_weight='auto')
        selector.fit(np.concatenate((fl,nofl),axis=0), np.concatenate((yfl, ynofl), axis=0))
        scores=selector.feature_importances_
        fl = fl[:, np.argsort(scores)[::-1][0:N_features]]
        nofl = nofl[:, np.argsort(scores)[::-1][0:N_features]]
    elif method == 'RFE':
        # Recursive feature elimination around an L1 logistic regression;
        # ranking_ is 1 for selected features, so sort ascending.
        estimator = LogisticRegression(penalty='l1', class_weight='auto')
        selector = RFE(estimator, n_features_to_select=N_features, step=1)
        selector = selector.fit(np.concatenate((fl,nofl),axis=0), np.concatenate((yfl, ynofl), axis=0))
        scores = selector.ranking_
        fl = fl[:, np.argsort(scores)[0:N_features]]
        nofl = nofl[:, np.argsort(scores)[0:N_features]]
    elif method == 'Fscore':
        # ANOVA F-test between the two classes, highest scores kept.
        selector = SelectKBest(f_classif, k=N_features)
        selector.fit(np.concatenate((fl,nofl),axis=0), np.concatenate((yfl, ynofl), axis=0))
        scores=selector.scores_
        #fl = selector.transform(fl); nofl = selector.transform(nofl)
        fl = fl[:, np.argsort(scores)[::-1][0:N_features]]
        nofl = nofl[:, np.argsort(scores)[::-1][0:N_features]]
    elif method == 'chi2':
        # chi2 requires non-negative input: shift each column with a
        # negative minimum up before scoring, then undo the shift on the
        # surviving columns (minim holds the negative offsets).
        data = np.concatenate((fl,nofl),axis=0)
        minim = np.zeros(fl.shape[1])
        for i in range(fl.shape[1]):
            minim[i] = np.min(data[:,i])
            if minim[i] < 0:
                fl[:,i] = fl[:,i] - minim[i]; nofl[:,i] = nofl[:,i] - minim[i]
        selector = SelectKBest(chi2, k=N_features)
        selector.fit(np.concatenate((fl,nofl),axis=0), np.concatenate((yfl, ynofl), axis=0))
        scores=selector.scores_
        #fl = selector.transform(fl); nofl = selector.transform(nofl)
        fl = fl[:, np.argsort(scores)[::-1][0:N_features]]
        nofl = nofl[:, np.argsort(scores)[::-1][0:N_features]]
        minim = minim[np.argsort(scores)[::-1][0:N_features]]
        for i in range(fl.shape[1]):
            if minim[i] < 0:
                fl[:,i] = fl[:,i] + minim[i]; nofl[:,i] = nofl[:,i] + minim[i]
    elif method == 'pca':
        # Project onto the first N_features principal components; here
        # 'scores' is the per-component explained-variance ratio.
        selector = PCA(n_components=N_features)
        selector.fit(np.concatenate((fl,nofl), axis=0))
        fl = selector.transform(fl); nofl = selector.transform(nofl)
        scores = selector.explained_variance_ratio_
        print "PCA was applied and ", np.shape(fl)[1], " components were kept."
        print "Variance explained: ", np.sum(selector.explained_variance_ratio_)
        #for i in range(N_features):
            #print zip(np.arange(scores.shape[0]), (scores.argsort())[::-1], sorted(scores)[::-1])[i]
    return fl, nofl, scores
示例14: RandomForestRegressor
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
# In[234]:
# compute feature importances
# rfreg, X, y and feature_cols are defined in notebook cells omitted from
# this excerpt; rfreg is presumably a fitted RandomForestRegressor
# (TODO confirm against the preceding cells).
pd.DataFrame({'feature':feature_cols, 'importance':rfreg.feature_importances_}).sort('importance')
# In[235]:
# compute the out-of-bag R-squared score
rfreg.oob_score_
# In[236]:
# set a threshold for which features to include
# (deprecated estimator.transform() API; each call reports how many
# columns survive the given importance threshold)
print rfreg.transform(X, threshold=0.1).shape
print rfreg.transform(X, threshold='mean').shape
print rfreg.transform(X, threshold='median').shape
# In[237]:
# create a new feature matrix that only includes important features
X_important = rfreg.transform(X, threshold='mean')
# In[238]:
# check the RMSE for a Random Forest that only includes important features
rfreg = RandomForestRegressor(n_estimators=30, max_features=2, random_state=1)
scores = cross_val_score(rfreg, X_important, y, cv=10, scoring='mean_squared_error')
示例15: print
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
# Select the requested columns and shuffle the rows; data_dfi, cols,
# features and to_predict are defined in code omitted from this excerpt.
data_df=data_dfi[cols]
print (data_df.head())
data_df = data_df.reset_index()
dfc = data_df.reindex(np.random.permutation(data_df.index))
# Sentinel value for missing data, far outside the feature range.
dfc.fillna(value=-99999, inplace=True)
forecast_out = int(math.ceil(0.01 * len(dfc)))
dfx = dfc[features]
X = np.array(dfx)
y = np.array(dfc[to_predict])
names = np.array(dfc.columns.values)
clf = RandomForestClassifier(n_estimators=100,n_jobs=-1)
clf.fit(X,y)
# Deprecated sklearn API: keeps the columns above the default threshold.
X_selected = clf.transform(X)
# Pair each rounded importance with its column name, best first.
feat_list = sorted(zip(map(lambda x: round(x, 4), clf.feature_importances_), names),
                   reverse=True)
print (feat_list)
# Collect feature names in importance order, stopping at the synthetic
# 'index' column added by reset_index above.
good_feats = []
for i in feat_list:
    array = np.asarray(i)
    print
    if array[1] == 'index':
        break
    good_feats.append(array[1])
print (good_feats)
# Rebuild the design matrix from the retained features only.
dfx = dfc[good_feats]
X = np.array(dfx)