This article collects typical usage examples of the Python class sklearn.ensemble.RandomForestClassifier. If you are struggling with questions like what the RandomForestClassifier class is for, or how to use it in your own code, then congratulations: the curated class code examples here may help you.
Below, 15 code examples of the RandomForestClassifier class are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
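All of the snippets below assume RandomForestClassifier has already been imported. As a baseline for reading them, here is a minimal, self-contained sketch (the data is made up purely for illustration) showing the import path and the basic fit/predict cycle that every example on this page relies on:

from sklearn.ensemble import RandomForestClassifier

# Toy data, purely illustrative
X = [[0, 0], [1, 1], [0, 1], [1, 0]]  # feature matrix: one row per sample
y = [0, 1, 0, 1]                      # class label for each row

clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X, y)                         # train the forest
print(clf.predict([[0.9, 0.9]]))      # predict the class of a new sample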
Example 1: rand_forest
def rand_forest(train_bow, train_labels, test_bow, test_labels, bow_indexes):
    print("Training rndForest")
    rf_classifier = RandomForestClassifier()
    rf_classifier.fit(train_bow, train_labels)
    print("Testing rndForest")
    test(rf_classifier, "rf", test_bow, test_labels, bow_indexes)
Example 2: fit_rf
def fit_rf(path, index_filter=None, class_filter=None, feature_filter=None, folds=10,
inverse=False, lc_filter=None):
"""
path: Dirección del dataset a ocupar para entrenar
index_filter: Pandas index para filtrar las filas del dataset que se quieren utilizar
class_filter: Lista de clases que se quiere utilizar
feature_filter: Lista de features que se quiere utilizar
"""
data = pd.read_csv(path, index_col=0)
data, y = utils.filter_data(data, index_filter, class_filter, feature_filter, lc_filter)
skf = cross_validation.StratifiedKFold(y, n_folds=folds)
results = []
for train_index, test_index in skf:
        if inverse:
            train_index, test_index = test_index, train_index
train_X, test_X = data.iloc[train_index], data.iloc[test_index]
train_y, test_y = y.iloc[train_index], y.iloc[test_index]
        clf = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=14,
                                     min_samples_split=5)
clf.fit(train_X, train_y)
results.append(metrics.predict_table(clf, test_X, test_y))
return pd.concat(results)
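Note that this example targets the pre-0.18 scikit-learn API, where StratifiedKFold lived in sklearn.cross_validation and received the labels in its constructor. On scikit-learn 0.18 and later, a roughly equivalent setup (a sketch, not a drop-in patch for the code above) would be:

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=folds)                # n_folds was renamed n_splits
for train_index, test_index in skf.split(data, y):   # labels now go to split(), not the constructor
    ...  # same fold handling as above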
Example 3: __init__
def __init__(self, data, classes, tree_features, n_trees=100):
self.n_features = np.shape(data)[1]
n_rows = np.shape(data)[0]
n_nans = np.sum(np.isnan(data), 0)
data = data[:, n_nans < n_rows]
self.n_features = np.shape(data)[1]
n_nans = np.sum(np.isnan(data), 1)
data = data[n_nans < self.n_features, :]
self.n_rows = np.shape(data)[0]
    if tree_features > self.n_features:
        tree_features = self.n_features
self.col_list = np.zeros((n_trees, tree_features), dtype='int')
self.n_trees = n_trees
self.bags = []
for i in range(n_trees):
cols = sample(range(self.n_features), tree_features)
cols.sort()
self.col_list[i, :] = cols
data_temp = data[:, cols]
n_nans = np.sum(np.isnan(data_temp), 1)
data_temp = data_temp[n_nans == 0, :]
classes_temp = classes[n_nans == 0]
#bag = BaggingClassifier(n_estimators=1, max_features=tree_features)
bag = RandomForestClassifier(n_estimators=1, max_features=tree_features)
bag.fit(data_temp, classes_temp)
self.bags.append(bag)
print(np.shape(data_temp))
Example 4: random_forest_classify
def random_forest_classify(train_data, train_label, test_data):
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(train_data, ravel(train_label))
    test_label = rf.predict(test_data)
    save_result(test_label, 'sklearn_random_forest_classify_Result.csv')
    return test_label
Example 5: get_randomforest_classifier
def get_randomforest_classifier(X_train, y_train, params=None):
param_grid = {"max_depth": [4, 5, 6, 7],
"max_features": [3, 5],
"criterion": ["gini", "entropy"]}
if params is None:
log = RandomForestClassifier()
t = start("training random forest ")
        cv = cross_validation.ShuffleSplit(X_train.shape[0], n_iter=10, test_size=0.2, random_state=123)
        clf = grid_search.GridSearchCV(log, param_grid, cv=cv, n_jobs=4, scoring='roc_auc')
        clf = clf.fit(X_train, y_train)
report(t, nitems=10*len(param_grid))
print("Best score:{} with scorer {}".format(clf.best_score_, clf.scorer_))
print "With parameters:"
best_parameters = clf.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
else:
clf = RandomForestClassifier(**params)
        clf = clf.fit(X_train, y_train)
return clf
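This example also uses the old cross_validation and grid_search modules, which were merged into sklearn.model_selection in scikit-learn 0.18. A sketch of the modern equivalent of the search setup above (same param_grid, same scoring):

from sklearn.model_selection import ShuffleSplit, GridSearchCV

cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=123)  # n_iter was renamed n_splits; no sample count needed
clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=cv, n_jobs=4, scoring='roc_auc')
clf.fit(X_train, y_train)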
Example 6: cls_create
def cls_create(xs, ys):
    if algo == "SVM":
        classifier = svm.SVC(C=self.parm, probability=True)
    elif algo == "RF":
        classifier = RandomForestClassifier(n_estimators=int(self.parm), criterion='entropy', n_jobs=1)
#
#classifier = LDA()
new_xs = xs
"""
positive_count = len([y for y in ys if y > 0])
if positive_count >= 20:
#self.selector = svm.LinearSVC(C = 1, dual = False, penalty="l1")
self.selector = LDA()
new_xs = self.selector.fit_transform(xs, ys)
else:
self.selector = None
"""
classifier.fit(new_xs, ys)
probs = classifier.predict_proba(new_xs)
#self.pclassifier = svm.SVC(parm_val = 1.0)
#self.pclassifier.fit(probs, ys)
self.threshold, self.positive, self.negative = best_threshold_for_f1(probs, 20, ys)
return classifier
Author: simonhughes22, Project: PythonNlpResearch, Lines: 31, Source file: Codes_ClassifyUsingVectorComposition_WordSpace.py
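Example 6 tunes a custom decision threshold over the predicted probabilities rather than using predict directly. predict_proba returns one column per class, so applying a threshold of this kind to the positive-class column looks roughly like the following sketch (best_threshold_for_f1 is the example's own helper; how its output is applied is an assumption here):

probs = classifier.predict_proba(new_xs)[:, 1]  # probability of the positive class
preds = (probs >= self.threshold).astype(int)   # custom threshold instead of the default 0.5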
Example 7: train_model_on_gestures
def train_model_on_gestures(wav_list):
gestures = {'vattene':0, 'vieniqui':1, 'perfetto':2, 'furbo':3, 'cheduepalle':4,
'chevuoi':5, 'daccordo':6, 'seipazzo':7, 'combinato':8, 'freganiente':9,
'ok':10, 'cosatifarei':11, 'basta':12, 'prendere':13, 'noncenepiu':14,
'fame':15, 'tantotempo':16, 'buonissimo':17, 'messidaccordo':18, 'sonostufo':19}
dataX = []
i = 0
for wav in wav_list:
        path = re.sub(r'_audio\.wav$', '', wav)
        print('\n' + '##############')
        print(path[-25:])
sample = VideoMat(path, True)
sk = Skelet(sample)
rate, data = get_data(wav)
data_frame = np.asarray(create_features(data, sample.labels, sample.numFrames, sk))
#print 'data_frame !', data_frame.shape
#data_frame2 = np.asarray(Head_inter(path, sample.labels).data_frame)
#data_frame = np.hstack((data_frame, data_frame2))
dataX += copy.copy(data_frame)
    # 1 target / 19 * 6 joint infos / 8 head/hand distances / 5 head-box values = 128 features
    # Train model: don't use the head-box features, they don't really improve the model
data_frame = np.asarray(dataX)
Y = data_frame[:, 0]
Y = np.asarray([gestures[i] for i in Y])
X = data_frame[:, 1:]
X = X.astype(np.float32, copy=False)
X = X[:, :122]
clf = RandomForestClassifier(n_estimators=300, criterion='entropy', min_samples_split=10,
min_samples_leaf=1, verbose=2, random_state=1) #n_jobs=2
clf = clf.fit(X, Y)
pickle.dump(clf, open('gradient_boosting_model_gestures.pkl','wb'))
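A model pickled this way can be loaded back for prediction later. A minimal sketch (note that the saved object is the random forest above, despite the 'gradient_boosting' file name; X_new is a hypothetical new feature matrix):

import pickle

with open('gradient_boosting_model_gestures.pkl', 'rb') as f:
    clf = pickle.load(f)
predictions = clf.predict(X_new)  # X_new (hypothetical) must have the same 122 features used in training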
Example 8: cross_validate
def cross_validate():
print("Reading the data")
data = cu.get_dataframe(train_file)
print("Cross-Validating")
rf = RandomForestClassifier(n_estimators=10,
verbose=1,
compute_importances=True,
n_jobs=2)
cv = cross_validation.KFold(len(data),
k=10,
indices=False)
results = []
    for traincv, testcv in cv:
        print("\t-- cv [%d]" % len(results))
        print("\t", "extracting features")
        #...
        feacv = features.extract_features(feature_names,
                                          traincv)
        print("\t", "learning")
        rf.fit(feacv, data["OpenStatus"])
        print("\t", "predicting")
        probas = rf.predict_proba(testcv)
        print("\t", "evaluating")
        results.append(llfun(target[testcv],
                             [x["OpenStatus"] for x in probas]))
    print("LogLoss: " + str(np.array(results).mean()))
Example 9: crossValIteration
def crossValIteration(dat,classes,cutoff,prop=0.9,reshuffle=False):
if reshuffle:
dat.samples = sampleReshuffle(dat)
saved_samples = [i for i in dat.samples]
dat.samples = ["{0}_$$_{1}".format(i,v) for i,v in enumerate(dat.samples)]
    train, test = dat.splitTraining(prop, classes)
    print(test.samples)
    selectedSampleIndicies = [int(i.split("_$$_")[0]) for i in test.samples]
    dat.samples = saved_samples
    print(test.samples)
    test.samples = [i.split("_$$_")[1] for i in test.samples]
    train.samples = [i.split("_$$_")[1] for i in train.samples]
    print("Training set has {0} samples from classes: {1}".format(len(train.samples), ",".join(set(train.samples))))
    print("Test set has {0} samples from classes: {1}".format(len(test.samples), ",".join(set(test.samples))))
    print("Selecting data...")
    # select features for each disease
    print("Number of selections made for each class:")
    print("Setting up SVM...")
Xtrain = train.values.transpose()
Ytrain = train.samples
    clf = RandomForestClassifier(n_estimators=1000)
    clf.fit(Xtrain, Ytrain)
Xtest = test.values.transpose()
Ytest = test.samples
print "Predicting R-forest..."
#classification results versus actual
acc = zip(Ytest,clf.predict(Xtest)) # (actual,predicted)... for each sample
print acc # this is the elemental form of the "result" lists processed below
print sum([i[0] == i[1] for i in acc])*1.0/len(acc)
return acc
Example 10: get_preds
def get_preds(features, trees=3000, depth=19):  # features is the number of latent features that I want the NMF to run on
# Create dataframes
df = get_nmf(k=features)
df_full = add_yahoo_to_df(df)
df_train = add_dummies(df_full) # Why aren't you using df_full?
df_test = get_nmf('data_wednesday', k=features) # put in folder name where the json data is
df_test_full = add_yahoo_to_df(df_test)
df_test_full = add_dummies(df_test_full)
# Create models
X_model_class, y_model_class = get_classifier_data(df_full)
rf_class = RandomForestClassifier(n_estimators=trees, max_depth=depth)
rf_class.fit(X_model_class, y_model_class)
#
X_model_regress, y_model_regress = get_regressor_data(df_full)
rf_regress = RandomForestRegressor(n_estimators=trees, max_depth=depth)
rf_regress.fit(X_model_regress, y_model_regress)
# Get X and y values
    X_classify, y_classify = get_classifier_data(pd.DataFrame(df_test_full.loc['2016-04-11']))
    X_regress, y_regress = get_regressor_data(pd.DataFrame(df_test_full.loc['2016-04-11']))
# Run models
classifier_preds = rf_class.predict(X_classify)
classifier_accuracy = accuracy_score(classifier_preds, y_classify)
regressor_preds = rf_regress.predict(X_regress)
regressor_mse = mean_squared_error(regressor_preds, y_regress)
# I want to return the number of features, k, along with the accuracy of the classifier
# and the MSE of the regressor. This will give me an idea of how well things are doing
# based on the number of features.
return [features, classifier_accuracy, regressor_mse]
Example 11: myforest
def myforest(train, test, trees=250):
#Training data prep-------------------------------------------------------------------------------------------
    csv_file_object = csv.reader(open(train, 'r'))  # Load in the training csv file
    header = next(csv_file_object)                  # Skip the first line as it is a header
    output_header = header[0:2]
    train_data = []
    for row in csv_file_object:                     # Step through each row in the csv file
        train_data.append(row[1:])                  # adding each row to the train_data list
    train_data = np.array(train_data)               # Then convert from a list to an array
#Test data prep-----------------------------------------------------------------------------------------------
    test_file_object = csv.reader(open(test, 'r'))  # Load in the test csv file
    header = next(test_file_object)                 # Skip the first line as it is a header
    test_data = []                                  # Create a variable called 'test_data'
    ids = []
    for row in test_file_object:                    # Step through each row in the csv file
        ids.append(row[0])
        test_data.append(row[1:])                   # adding each row to the test_data list
    test_data = np.array(test_data)                 # Then convert from a list to an array
#Train the forest
    print('Training')
    forest = RandomForestClassifier(n_estimators=trees)
    forest = forest.fit(train_data[0::, 1::], train_data[0::, 0])
    print('Predicting')
    output = forest.predict(test_data)
    open_file_object = csv.writer(open("result.csv", "w", newline=''))
    open_file_object.writerow([output_header[0], output_header[1]])
    open_file_object.writerows(zip(ids, output))
Example 12: Random_Forest_classifier
def Random_Forest_classifier(train_input_data, train_output_data, test_input_data, test_output_data):
    tree_list = []
    accuracy_percent = []
    for trees in range(10, 200, 10):
        clf = RandomForestClassifier(trees)
        clf.fit(train_input_data, train_output_data)
        predicted_output = clf.predict(test_input_data)
        error_list = []
        if not isinstance(predicted_output, list):
            predicted_output = predicted_output.tolist()
        if not isinstance(test_output_data, list):
            test_output_data = test_output_data.tolist()
for i in range(len(test_output_data)):
cur_univ_similarities = similar_univs[similar_univs['univName'] == predicted_output[i]]
cur_univ_similarity_list = cur_univ_similarities.values.tolist()
cur_univ_similarity_list = [item for sublist in cur_univ_similarity_list for item in sublist]
if test_output_data[i] in cur_univ_similarity_list[1:]:
error_list.append(0)
else:
error_list.append(1)
tree_list.append(trees)
        accuracy_percent.append(100 - ((sum(error_list) / float(len(error_list))) * 100))
tree_list = np.array(tree_list)
accuracy_percent = np.array(accuracy_percent)
    plt.plot(tree_list, accuracy_percent)
    plt.xlabel('Number of trees')
    plt.ylabel('Accuracy (percent)')
    plt.title('Variation of accuracy with number of trees')
plt.grid(True)
plt.savefig("rf1.png")
plt.show()
return predicted_output
Example 13: main
def main():
S, col_names_S = load_data(config.paths.training_data,
config.paths.cache_folder)
Xs, Ys, col_names_S = extract_xy(S, col_names_S)
a = RandomForestClassifier(n_estimators=1)
a.fit(Xs.toarray(), Ys.toarray().ravel())
best_features = a.feature_importances_
max_ind, max_val = max(enumerate(best_features), key=operator.itemgetter(1))
    print(best_features)
    print(max_ind, max_val)
    print(Xs.shape)
    print(Ys.shape)
param_range = [1, 3, 5, 7, 10, 15, 20, 30, 60, 80]
train_scores, test_scores = validation_curve(RandomForestClassifier(criterion='entropy'), Xs, Ys.toarray().ravel(),
'n_estimators', param_range)
    print(train_scores)
    print(test_scores)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.title("Validation Curve for Random Forest")
plt.xlabel("Number of Trees")
plt.ylabel("Score")
plt.plot(param_range, train_mean, label="Training Score", color='r')
plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, alpha=0.2, color='r')
plt.plot(param_range, test_mean, label="Test Score", color='b')
plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, alpha=0.2, color='b')
plt.legend(loc="best")
plt.show()
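As with the earlier examples, validation_curve has since moved: on scikit-learn 0.18+ it is imported from sklearn.model_selection, and recent versions take the parameter name and range as keyword arguments. A sketch of the modern call for the curve above:

from sklearn.model_selection import validation_curve

train_scores, test_scores = validation_curve(
    RandomForestClassifier(criterion='entropy'), Xs, Ys.toarray().ravel(),
    param_name='n_estimators', param_range=param_range)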
Example 14: randomForest_eval_func
def randomForest_eval_func(self, chromosome):
n_estimators, max_features, window_size = self.decode_chromosome(chromosome)
if self.check_log(n_estimators, max_features, window_size):
return self.get_means_from_log(n_estimators, max_features, window_size)[0]
folded_dataset = self.create_folded_dataset(window_size)
indim = 21 * (2 * window_size + 1)
mean_AUC = 0
mean_decision_value = 0
mean_mcc = 0
sample_size_over_thousand_flag = False
    for test_fold in range(self.fold):
test_labels, test_dataset, train_labels, train_dataset = folded_dataset.get_test_and_training_dataset(test_fold)
if len(test_labels) + len(train_labels) > 1000:
sample_size_over_thousand_flag = True
clf = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features)
clf.fit(train_dataset, train_labels)
probas = clf.predict_proba(test_dataset)
        decision_values = [x[1] for x in probas]  # Probability of being a binding residue
AUC, decision_value_and_max_mcc = validate_performance.calculate_AUC(decision_values, test_labels)
mean_AUC += AUC
mean_decision_value += decision_value_and_max_mcc[0]
mean_mcc += decision_value_and_max_mcc[1]
if sample_size_over_thousand_flag:
break
if not sample_size_over_thousand_flag:
mean_AUC /= self.fold
mean_decision_value /= self.fold
mean_mcc /= self.fold
self.write_log(n_estimators, max_features, window_size, mean_AUC, mean_decision_value, mean_mcc)
self.add_log(n_estimators, max_features, window_size, mean_AUC, mean_decision_value, mean_mcc)
return mean_AUC
Example 15: model_and_predict
def model_and_predict(self, X_train, y_train, X_test):
district_idx = self.columns.index('PdDistrict')
    districts = set(X_train[:, district_idx])
district_ys = {}
# Grow forest and predict separately for each district's records
for d in districts:
district_X_train = X_train[X_train[:, district_idx] == d]
district_X_train = np.delete(district_X_train, district_idx, 1)
district_y_train = y_train[X_train[:, district_idx] == d]
district_X_test = X_test[X_test[:, district_idx] == d]
district_X_test = np.delete(district_X_test, district_idx, 1)
print "Growing forest for", d
# Not saving output in Git so make this deterministic
# with random_state
rf = RandomForestClassifier(n_estimators=self.n_trees, n_jobs=-1,
random_state=782629)
rf.fit(district_X_train, district_y_train)
district_ys[d] = list(rf.predict(district_X_test))
print "Finished", d
print "All predictions made"
y_hat = []
for row in X_test:
d_ys = district_ys[row[district_idx]]
y_hat.append(d_ys.pop(0))
return y_hat