本文整理汇总了Python中sklearn.ensemble.RandomForestClassifier.input_features的典型用法代码示例。如果您正苦于以下问题：Python RandomForestClassifier.input_features的具体用法？Python RandomForestClassifier.input_features怎么用？Python RandomForestClassifier.input_features使用的例子？那么恭喜您，这里精选的代码示例或许可以为您提供帮助。需要注意的是，input_features并不是RandomForestClassifier自带的方法，而是下面示例代码给分类器实例动态添加的自定义属性（用来保存输入特征的列名）。您也可以进一步了解该属性所附着的类sklearn.ensemble.RandomForestClassifier的用法示例。
在下文中一共展示了RandomForestClassifier.input_features方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import input_features [as 别名]
# Command-line entry point: reads the CSV named by --data, splits it at random into
# training and test rows, trains a RandomForestClassifier (or unpickles one when --load
# is given), writes the predicted probabilities to a file, and prints a confusion
# matrix, a classification report, the accuracy and the feature importances.
# NOTE(review): this listing has lost its indentation and its tail is elided (see the
# last comment), so the nesting of the statements below has to be inferred — confirm
# against the original file before running it.
def main():
p = optparse.OptionParser()
p.add_option('--model', '-m', default = 'model', type = str, help = 'model filename prefix')
p.add_option('--load', '-L', default = False, action = 'store_true', help = 'load model from file')
p.add_option('--features', '-f', default = 'features.txt', type = str, help = 'feature filename')
p.add_option('--data', '-d', default = 'data.csv', type = str, help = 'marked data filename')
p.add_option('--verbose', '-v', default = False, action = 'store_true', help = 'verbosity flag')
p.add_option('--thresh', '-T', default = 0.5, type = float, help = 'probability threshold to classify True')
p.add_option('--n_estimators', '-n', default = 100, type = int, help = 'number of random forest estimators')
p.add_option('--test_fraction', '-t', default = 0.25, type = float, help = 'fraction of data to use for testing')
p.add_option('--seed', '-s', default = None, type = int, help = 'random seed')
p.add_option('--jobs', '-j', default = -1, type = int, help = 'number of jobs (-1 if maximum)')
p.add_option('--probs', '-p', default = None, type = str, help = 'filename for output probabilities')
opts, args = p.parse_args()
# when a seed is given it is appended to the model filename and to the default
# probabilities filename; an explicit --probs value is used unchanged
model_filename = opts.model + '%s.pickle' % ('' if opts.seed is None else str(opts.seed))
probs_filename = ('predicted_probs%s.dat' % ('' if opts.seed is None else str(opts.seed))) if opts.probs is None else opts.probs
# seed NumPy's global RNG with --seed (None when the option is omitted)
np.random.seed(opts.seed)
if opts.verbose:
print("\nReading marked data from %s..." % opts.data)
# establish data frame
df = pd.read_csv(opts.data)
n_lines = len(df)
# choose test set as random test_fraction of data, leaving the remainder for training
n_test = int(opts.test_fraction * n_lines)
if opts.verbose:
print("Read %d lines of data -> %d lines (training), %d lines (test)" % (n_lines, n_lines - n_test, n_test))
# the first n_test positions of a random permutation become the test rows;
# is_train is a boolean mask that is False exactly at those positions
test_subset = np.random.permutation(range(n_lines))[:n_test]
is_train = np.ones(n_lines, dtype = bool)
for i in test_subset:
is_train[i] = False
# establish training and test sets
train, test = df[is_train], df[~is_train]
if opts.load:
# NOTE(review): the file object passed to pickle.load is never explicitly closed;
# unpickling can execute arbitrary code, so presumably only self-produced model
# files are expected here — confirm
rfc = pickle.load(open(model_filename, 'rb'))
if opts.verbose:
print("\nLoaded model from '%s'.\n" % model_filename)
else:
# set the random forest instance
rfc = RandomForestClassifier(n_estimators = opts.n_estimators, n_jobs = opts.jobs)
# set list of features (all the uncommented features above dotted line in feature file; leading/trailing whitespace is stripped)
with open(opts.features, 'r') as f:
lines = f.readlines()
# exactly one line may begin with '-': input features are listed above it,
# the single output feature below it
line_starts_with_dash = [(line[0] == '-') for line in lines]
assert (line_starts_with_dash.count(True) == 1), "Feature file must have a single dashed line separating input/output features."
dashed_line_index = line_starts_with_dash.index(True)
# the feature names are stored as ad-hoc attributes on the classifier object
# (presumably so that they are pickled together with the model — confirm)
rfc.input_features = []
for i in range(dashed_line_index):
# drop everything from '#' onwards, strip whitespace, skip lines left empty
feature = lines[i].partition('#')[0].strip()
if (len(feature) > 0):
rfc.input_features.append(feature)
output_features = []
for i in range(dashed_line_index + 1, len(lines)):
feature = lines[i].partition('#')[0].strip()
if (len(feature) > 0):
output_features.append(feature)
assert (len(output_features) == 1), "Feature file must have exactly one output feature."
rfc.output_feature = output_features[0]
num_features = len(rfc.input_features)
assert (num_features > 0), "Feature file must have at least one input feature."
# training matrix and target column, selected by the feature names held on rfc
X = train[rfc.input_features]
y = train[rfc.output_feature]
if (not opts.load):
# train the forest
if opts.verbose:
print("\nTraining %d random forests..." % opts.n_estimators)
rfc.fit(X, y)
# save off the model
# NOTE(review): the file object passed to pickle.dump is never explicitly closed
pickle.dump(rfc, open(model_filename, 'wb'))
if opts.verbose:
print("\nSaved model to '%s'.\n" % model_filename)
# make predictions on the test data
# column 1 of predict_proba is used as the score — presumably the probability of the
# positive class of a binary target (TODO confirm)
probs = rfc.predict_proba(test[rfc.input_features])[:, 1]
probs_series = pd.Series(probs)
probs_series.to_csv(probs_filename, index = False)
# a row is predicted True when its probability reaches --thresh
test_preds = (probs >= opts.thresh)
conf_df = pd.crosstab(test[rfc.output_feature], test_preds, rownames = ['actual'], colnames = ['predicted'])
conf_mat = np.asarray(conf_df)
class_report = classification_report(test[rfc.output_feature], test_preds)
print("\nConfusion Matrix")
print(conf_df)
print("\nClassification Report")
print(class_report)
# accuracy = sum of the two diagonal cells / total count; the [1, 1] index assumes
# the confusion matrix is 2x2
accuracy = (conf_mat[0, 0] + conf_mat[1, 1]) / float(np.sum(conf_mat))
print("Accuracy = %.3f%%" % (100. * accuracy))
print("\nFeature Importances")
# (index, feature name, importance) triples, sorted by descending importance
triples = [(i, rfc.input_features[i], rfc.feature_importances_[i]) for i in range(num_features)]
triples.sort(key = lambda pair : pair[2], reverse = True)
indices, features, importances = zip(*triples)
for i in range(num_features):
#......... part of the code is omitted here (elided by the source listing) .........
示例2: main
# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import input_features [as 别名]
def _parse_feature_names(lines):
    """Return the feature names found in *lines*.

    Everything from a '#' to the end of a line is treated as a comment,
    surrounding whitespace is stripped, and lines left empty are skipped.
    """
    stripped = (line.partition('#')[0].strip() for line in lines)
    return [name for name in stripped if name]


def _read_feature_file(path):
    """Read a feature file and return ``(input_features, output_feature)``.

    The file must contain exactly one line starting with '-'.  Feature names
    above that line are the inputs; the single name below it is the output.

    Raises:
        AssertionError: if there is not exactly one dashed line, or not
            exactly one output feature.
    """
    with open(path, 'r') as f:
        lines = f.readlines()
    dash_indices = [i for i, line in enumerate(lines) if line.startswith('-')]
    assert (len(dash_indices) == 1), "Feature file must have a single dashed line separating input/output features."
    dashed_line_index = dash_indices[0]
    input_features = _parse_feature_names(lines[:dashed_line_index])
    output_features = _parse_feature_names(lines[dashed_line_index + 1:])
    assert (len(output_features) == 1), "Feature file must have exactly one output feature."
    return input_features, output_features[0]


def _format_importance(name, importance):
    """Format one report line: right-aligned feature name and importance in percent."""
    # A single float conversion replaces the former "%3d.%03d" split into an
    # integer part and a rounded fraction, which printed e.g. "1.1000%" whenever
    # the fraction rounded up to 1000.  The field width (7) is unchanged.
    return "%17s %7.3f%%\n" % (name, 100. * importance)


def main():
    """Train (or load) a random forest on the yoochoose session features and report on the test set.

    Reads the training, dev and test CSV files from ``yoochoose/data/``, fits a
    RandomForestClassifier on training + dev rows (or unpickles a previously
    saved model when --load is given), writes the predicted probabilities to
    ``test_probs<seed>`` and a text report (confusion matrix, classification
    report, accuracy, feature importances) to ``test_report<seed>``.

    Raises:
        AssertionError: if the feature file is malformed or lists no input feature.
    """
    p = optparse.OptionParser()
    p.add_option('--load', '-L', default = False, action = 'store_true', help = 'load model from file')
    p.add_option('--features', '-f', default = 'features.txt', type = str, help = 'feature filename')
    p.add_option('--verbose', '-v', default = False, action = 'store_true', help = 'verbosity flag')
    p.add_option('--thresh', '-T', default = 0.5, type = float, help = 'probability threshold to classify True')
    p.add_option('--n_estimators', '-n', default = 100, type = int, help = 'number of random forest estimators')
    p.add_option('--seed', '-s', default = None, type = int, help = 'random seed')
    p.add_option('--jobs', '-j', default = -1, type = int, help = 'number of jobs (-1 if maximum)')
    opts, _args = p.parse_args()

    # The seed (when given) tags every file this run reads or writes.
    seed_suffix = '' if opts.seed is None else str(opts.seed)
    model_filename = 'model%s.pickle' % seed_suffix
    np.random.seed(opts.seed)

    if opts.verbose:
        print("\nReading data set...")
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
    # same way (original row labels are kept).
    train = pd.concat([
        pd.read_csv('yoochoose/data/training_session_features.csv'),
        pd.read_csv('yoochoose/data/dev_session_features.csv'),
    ])
    test = pd.read_csv('yoochoose/data/test_session_features.csv')

    if opts.load:
        # NOTE: unpickling can execute arbitrary code -- only load model files
        # that were produced by a trusted run of this script.
        with open(model_filename, 'rb') as f:
            rfc = pickle.load(f)
        if opts.verbose:
            print("\nLoaded model from '%s'.\n" % model_filename)
    else:
        # set the random forest instance
        rfc = RandomForestClassifier(n_estimators = opts.n_estimators, n_jobs = opts.jobs)
        # The feature names are stored on the classifier itself so that they are
        # pickled with it and are available again after --load.
        rfc.input_features, rfc.output_feature = _read_feature_file(opts.features)

    num_features = len(rfc.input_features)
    assert (num_features > 0), "Feature file must have at least one input feature."

    if not opts.load:
        # train the forest
        if opts.verbose:
            print("\nTraining %d random forests..." % opts.n_estimators)
        rfc.fit(train[rfc.input_features], train[rfc.output_feature])
        # save off the model; the with-block guarantees the file is flushed and closed
        with open(model_filename, 'wb') as f:
            pickle.dump(rfc, f)
        if opts.verbose:
            print("\nSaved model to '%s'.\n" % model_filename)

    # make predictions on the test data; column 1 of predict_proba is the
    # probability of rfc.classes_[1]
    probs = rfc.predict_proba(test[rfc.input_features])[:, 1]
    pd.Series(probs).to_csv('test_probs%s' % seed_suffix, index = False)
    test_preds = (probs >= opts.thresh)

    actual = test[rfc.output_feature]
    conf_df = pd.crosstab(actual, test_preds, rownames = ['actual'], colnames = ['predicted'])
    class_report = classification_report(actual, test_preds)
    # Accuracy is computed directly from labels and predictions rather than from
    # the confusion-matrix diagonal: the crosstab is not 2x2 when every
    # prediction (or every actual label) falls in a single class, and indexing
    # [1, 1] then raises IndexError.  For a full 2x2 matrix both give the same value.
    actual_positive = (np.asarray(actual) == rfc.classes_[1])
    accuracy = float(np.mean(actual_positive == test_preds))

    # features ranked by descending importance (stable for ties)
    ranked = sorted(zip(rfc.input_features, rfc.feature_importances_),
                    key = lambda pair : pair[1], reverse = True)

    report = [
        "\nConfusion Matrix\n",
        str(conf_df) + '\n',
        "\nClassification Report\n",
        class_report + '\n',
        "Accuracy = %.3f%%\n" % (100. * accuracy),
        "\nFeature Importances\n",
    ]
    report.extend(_format_importance(name, importance) for name, importance in ranked)
    with open('test_report%s' % seed_suffix, 'w') as f:
        f.write(''.join(report))