本文整理汇总了Python中sklearn.tree.DecisionTreeClassifier类的典型用法代码示例。如果您正苦于以下问题：Python DecisionTreeClassifier类的具体用法？Python DecisionTreeClassifier怎么用？Python DecisionTreeClassifier使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了DecisionTreeClassifier类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: decision_tree_entropy
def decision_tree_entropy(training_data):
    """Fit an entropy-criterion decision tree and report its node count.

    Args:
        training_data: Indexable pair; element 0 is passed to ``fit`` as the
            samples and element 1 as the targets.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    clf = DecisionTreeClassifier(criterion="entropy", random_state=0)
    clf.fit(training_data[0], training_data[1])
    # A single-argument print call emits the same text on Python 2 and 3,
    # whereas the print statement is a SyntaxError on Python 3.
    print("entropy:Number of Nodes {0}".format(clf.tree_.node_count))
    return clf
示例2: __init__
class Transformer:
    """Keep the features a decision tree finds useful, optionally followed by PCA.

    ``fit`` trains the tree, remembers the indices of all features whose
    importance is strictly positive, and (when ``use_PCA`` is true) fits a PCA
    on the reduced data. ``transform`` applies the same selection, and the
    fitted PCA if enabled, to new data.
    """

    def __init__(self, use_PCA=True):
        """Create the unfitted tree, scaler and PCA.

        Args:
            use_PCA: When true, PCA is applied after feature selection.
        """
        self._clf = DecisionTreeClassifier(min_samples_leaf=10)
        # Indices of the selected features; populated by fit().
        self._idx = None
        # Instantiated but not used by fit() or transform().
        self._scaler = StandardScaler()
        self._trans = PCA(n_components='mle')
        self._use_PCA = use_PCA

    def fit(self, X, y):
        """Fit the selector on ``X``/``y`` and return the reduced training data.

        Args:
            X: 2-D array-like of samples; converted with ``np.array``.
            y: Targets passed to the decision tree.

        Returns:
            A list with one reduced row per sample, or the PCA-transformed
            array when ``use_PCA`` is enabled.
        """
        X = np.array(X)
        self._clf.fit(X, y)
        importances = self._clf.feature_importances_
        # Materialise the selection as a list: on Python 3 ``filter`` returns a
        # one-shot iterator, which can neither index a NumPy row nor be reused
        # later by transform().
        self._idx = [i for i, weight in enumerate(importances) if weight > 0]
        new_set = [row[self._idx] for row in X]
        if self._use_PCA:
            new_set = self._trans.fit_transform(new_set)
        return new_set

    def transform(self, features):
        """Apply the fitted feature selection (and PCA, if enabled).

        Args:
            features: Array indexed directly with the selected indices, so
                presumably a single 1-D sample -- TODO confirm with callers.

        Returns:
            The selected features, PCA-transformed when ``use_PCA`` is enabled.
        """
        features = features[self._idx]
        if self._use_PCA:
            features = self._trans.transform(features)
        return features
示例3: quize1
def quize1(data):
    """Train a decision tree on four Titanic features and print their importances.

    Assignment this function follows (translated from the original notes):
      1. Load the sample from titanic.csv with pandas.
      2. Keep four features: passenger class (Pclass), ticket fare (Fare),
         passenger age (Age) and sex (Sex).
      3. Note that the Sex feature holds string values.
      4. Extract the target variable; it is stored in the Survived column.
      5. The data has missing values, e.g. the age of some passengers is
         unknown; pandas reads such entries as nan. Find every object with a
         missing feature and remove it from the sample.
      6. Train a decision tree with random_state=241 and all other parameters
         left at their defaults.
      7. Compute the feature importances and find the two most important
         features; their names are the answer (order does not matter).

    Args:
        data: DataFrame containing at least the columns Pclass, Fare, Age,
            Sex and Survived.

    Returns:
        None. The encoded feature frame and the importances are printed.
    """
    feature_columns = ['Pclass', 'Fare', 'Age', 'Sex']
    complete_rows = data[feature_columns + ['Survived']].dropna()
    Y = complete_rows['Survived']
    # Work on an explicit copy so the encoding below never writes into a slice
    # of the caller's frame (avoids pandas' SettingWithCopyWarning).
    dataF = complete_rows[feature_columns].copy()
    # Encode Sex in one step: 1 for 'male', 0 for every other value.
    dataF['Sex'] = (dataF['Sex'] == 'male').astype(int)
    clf = DecisionTreeClassifier(random_state=241)
    print(dataF)
    clf.fit(dataF, Y)
    importances = clf.feature_importances_
    print(importances)
示例4: evaluateDecisionTree
def evaluateDecisionTree(train_x, train_y, test_x, test_y):
    """Fit an entropy decision tree, plot its ROC via ``plotAUC`` and return the AUC.

    Args:
        train_x: Training samples.
        train_y: Training targets.
        test_x: Samples to score.
        test_y: True targets for ``test_x``.

    Returns:
        The value of ``roc_auc_score(test_y, scores)``, where ``scores`` is
        column 1 of the tree's ``predict_proba`` output.
    """
    clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=5, max_depth=20)
    clf.fit(train_x, train_y)
    # Score the test set once and reuse the result for both the metric and the
    # plot instead of running predict_proba a second time.
    p = clf.predict_proba(test_x)[:, 1]
    auc = roc_auc_score(test_y, p)
    plotAUC(test_y, p, 'DT')
    return auc
示例5: decision_tree
def decision_tree(train_bow, train_labels, test_bow, test_labels, bow_indexes):
    """Train a default decision tree and hand it to ``test`` for evaluation.

    Args:
        train_bow: Training feature data passed to ``fit``.
        train_labels: Training targets passed to ``fit``.
        test_bow: Forwarded to ``test``.
        test_labels: Forwarded to ``test``.
        bow_indexes: Forwarded to ``test``.
    """
    print("Training decision tree")
    model = DecisionTreeClassifier()
    model.fit(train_bow, train_labels)

    print("Testing decision tree")
    # "dt" is the tag passed through to the shared evaluation helper.
    test(model, "dt", test_bow, test_labels, bow_indexes)
示例6: train_adaboost
def train_adaboost(features, labels, learning_rate, n_lab, n_runs, n_estim, n_samples):
    """Train ``n_runs`` AdaBoost classifiers for each of the first ``n_lab`` labels.

    Args:
        features: Feature data forwarded to ``get_binary_sets``.
        labels: Label data; its unique values are the candidate targets.
        learning_rate: ``learning_rate`` of every ``AdaBoostClassifier``.
        n_lab: How many unique labels (in ``np.unique`` order) to train for.
        n_runs: Number of classifiers trained per label.
        n_estim: ``n_estimators`` of every ``AdaBoostClassifier``.
        n_samples: Forwarded to ``get_binary_sets``.

    Returns:
        Tuple ``(allLearners, used_labels)``. ``allLearners`` holds one list
        per used label, each containing the ``n_runs`` fitted classifiers;
        ``used_labels`` is the array of labels that were trained for.
    """
    uniqLabels = np.unique(labels)
    # Joined with single spaces to reproduce the output of the former
    # Python 2 print statement while remaining valid on Python 3.
    print(' '.join(['Taking ', str(n_lab), ' labels']))
    uniqLabels = uniqLabels[:n_lab]
    used_labels = uniqLabels
    pbar = start_progressbar(len(uniqLabels), 'training adaboost for %i labels' % len(uniqLabels))
    allLearners = []
    for yy, targetLab in enumerate(uniqLabels):
        runs = []
        # range works on both Python 2 and 3; xrange exists only on Python 2.
        for _ in range(n_runs):
            feats, labs = get_binary_sets(features, labels, targetLab, n_samples)
            baseClf = DecisionTreeClassifier(max_depth=4, min_samples_leaf=10, min_samples_split=10)
            # NOTE(review): AdaBoostClassifier presumably clones its base
            # estimator, which would make this pre-fit redundant -- confirm
            # before removing, since it may also advance the global RNG.
            baseClf.fit(feats, labs)
            ada_real = AdaBoostClassifier(base_estimator=baseClf,
                                          learning_rate=learning_rate,
                                          n_estimators=n_estim,
                                          algorithm="SAMME.R")
            runs.append(ada_real.fit(feats, labs))
        allLearners.append(runs)
        update_progressbar(pbar, yy)
    end_progressbar(pbar)
    return allLearners, used_labels
示例7: test_importances
def test_importances():
    """Check variable importances."""
    X, y = datasets.make_classification(
        n_samples=2000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )

    for name, Tree in CLF_TREES.items():
        # Shared failure message identifying the tree class under test.
        failure = "Failed with {0}".format(name)

        tree = Tree(random_state=0)
        tree.fit(X, y)
        importances = tree.feature_importances_
        n_important = np.sum(importances > 0.1)

        # All ten features are reported and exactly three exceed 0.1.
        assert_equal(importances.shape[0], 10, failure)
        assert_equal(n_important, 3, failure)

        # Thresholding at the mean keeps some, but not all, columns.
        X_new = tree.transform(X, threshold="mean")
        assert_less(0, X_new.shape[1], failure)
        assert_less(X_new.shape[1], X.shape[1], failure)

    # Check on iris that importances are the same for all builders
    default_tree = DecisionTreeClassifier(random_state=0)
    default_tree.fit(iris.data, iris.target)
    capped_tree = DecisionTreeClassifier(random_state=0,
                                         max_leaf_nodes=len(iris.data))
    capped_tree.fit(iris.data, iris.target)

    assert_array_equal(default_tree.feature_importances_,
                       capped_tree.feature_importances_)
示例8: MultEstimator
class MultEstimator(BaseEstimator):
    """One decision tree per category, backed by a tree fitted on all rows.

    The last column of ``X`` is read as the category of each row; all other
    columns are the features. Rows whose category has no dedicated tree are
    predicted by the global tree.
    """

    def __init__(self, categories):
        """Store the category values that get a dedicated tree.

        Args:
            categories: Iterable of hashable category values.
        """
        self.categories = categories

    @staticmethod
    def _split(X):
        """Return ``(category_column, feature_columns)`` of ``X``."""
        return X[:, -1], X[:, :-1]

    def fit(self, X, y, **params):
        """Fit the global tree and one tree per category present in ``X``.

        Args:
            X: 2-D array whose last column holds the category.
            y: Targets; must support boolean-mask indexing.
            **params: Accepted for interface compatibility and ignored.

        Returns:
            ``self``, so calls can be chained as with other estimators.
        """
        # None marks a category without its own tree (yet).
        self.models = dict.fromkeys(self.categories)
        self.tot_model = DecisionTreeClassifier(max_depth=8, min_samples_leaf=100)
        categ, data = self._split(X)
        self.tot_model.fit(data, y)
        for c in self.models:
            mask = categ == c
            if not mask.any():
                # No training rows for this category: leave the entry as None
                # so prediction falls back to the global tree instead of
                # failing on an empty fit.
                continue
            m = DecisionTreeClassifier(max_depth=8, min_samples_leaf=100)
            m.fit(data[mask], y[mask])
            self.models[c] = m
        return self

    def predict(self, X):
        """Predict with the per-category tree where one exists.

        Args:
            X: 2-D array whose last column holds the category.

        Returns:
            Array of predictions, one per row of ``X``.
        """
        categ, data = self._split(X)
        p = self.tot_model.predict(data)
        for c, model in self.models.items():
            if model is None:
                continue
            mask = categ == c
            if mask.any():
                p[mask] = model.predict(data[mask])
        return p

    def predict_proba(self, X):
        """Predict class probabilities, aligned to the global tree's classes.

        Args:
            X: 2-D array whose last column holds the category.

        Returns:
            Array with one row per sample and one column per class of the
            global tree.
        """
        categ, data = self._split(X)
        p = self.tot_model.predict_proba(data)
        all_classes = self.tot_model.classes_
        for c, model in self.models.items():
            if model is None:
                continue
            rows = (categ == c).nonzero()[0]
            if rows.size == 0:
                continue
            # A per-category tree may have seen only a subset of the classes,
            # so place its columns at the matching positions of the global
            # class list and leave unseen classes at probability zero.
            cols = all_classes.searchsorted(model.classes_)
            p[rows] = 0.0
            p[rows[:, None], cols] = model.predict_proba(data[rows])
        return p
示例9: main
def main(percentage):
    """Given a percentage for splitting the dataset, fit the training set and apply the rest as a test set.

    The true locations of the test rows are written as the first line of
    ``golden.csv`` and the predicted locations as the second line; the file is
    then scored with ``get_accuracy``.

    Args:
        percentage: Probability (0..1) with which each row is assigned to the
            training set.

    Returns:
        The result of ``get_accuracy('golden.csv')``, or ``None`` when fitting
        the tree raises ``ValueError``.
    """
    df = pd.read_csv('cellStrength.log')
    # axis is passed by keyword: newer pandas rejects it as a positional arg.
    df.drop('SSID', axis=1, inplace=True)
    processed = preprocess(df)
    location_col = processed[0].shape[1] - 4
    # Invert the mapping returned by preprocess so predictions can be decoded.
    hash_to_location = {value: key for key, value in processed[1].items()}
    df2, _targets = encode_target(processed[0], location_col)
    # Assumes df2 has as many rows as df -- TODO confirm against preprocess.
    msk = np.random.rand(len(df)) < percentage
    test = df2[~msk].copy()
    train = df2[msk].copy()
    # Context managers guarantee the data is flushed and the handle closed
    # before get_accuracy reads the file back.
    with open('golden.csv', 'w') as golden:
        golden.write(','.join([hash_to_location[p] for p in test['Target'].tolist()]) + '\n')
    # NOTE(review): 186 is presumably the label of the location column that
    # `features` skips below -- confirm it always equals location_col.
    test.drop(186, axis=1, inplace=True)
    test.drop('Target', axis=1, inplace=True)
    features = list(df2.columns[:location_col]) + list(df2.columns[location_col + 1:-1])
    y = train['Target']
    X = train[features]
    dt = DecisionTreeClassifier(min_samples_split=3, random_state=99)
    try:
        dt.fit(X, y)
    except ValueError:
        # Best effort: a split the tree cannot be fitted on yields no score.
        return
    predictions = dt.predict(test).tolist()
    with open('golden.csv', 'a') as golden:
        golden.write(','.join([hash_to_location[p] for p in predictions]))
    return get_accuracy('golden.csv')
示例10: programmer_2
def programmer_2():
    """Train a CART tree on ``data/model.xls``, persist it and plot its ROC curve.

    The first three columns are used as features and the fourth as the label.
    The fitted tree is dumped to ``tmp/tree.pkl``, a confusion-matrix plot for
    the training split is shown, the ROC curve for the test split is plotted,
    and the ROC thresholds are printed.
    """
    datafile = 'data/model.xls'
    data = pd.read_excel(datafile)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
    # array and exists in old and new pandas alike.
    data = data.values
    # Shuffle the rows randomly.
    # NOTE(review): which `shuffle` this is cannot be seen here; if it is
    # random.shuffle, it is known to corrupt 2-D NumPy arrays -- verify.
    shuffle(data)

    # Train/test split ratio of 8:2.
    p = 0.8
    split = int(len(data) * p)
    train = data[:split, :]
    test = data[split:, :]

    # Build the CART decision tree model.
    treefile = 'tmp/tree.pkl'
    tree = DecisionTreeClassifier()
    tree.fit(train[:, :3], train[:, 3])

    joblib.dump(tree, treefile)

    # Show the confusion-matrix visualisation for the training split.
    cm_plot(train[:, 3], tree.predict(train[:, :3])).show()

    # Note that scikit-learn's predict method returns the predictions directly.
    fpr, tpr, thresholds = roc_curve(
        test[:, 3], tree.predict_proba(test[:, :3])[:, 1], pos_label=1)
    plt.plot(fpr, tpr, linewidth=2, label='ROC of CART', color='green')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Set the axis limits.
    plt.ylim(0, 1.05)
    plt.xlim(0, 1.05)
    plt.legend(loc=4)
    plt.show()
    print(thresholds)
示例11: test_graphviz_errors
def test_graphviz_errors():
    """Check the errors raised by export_graphviz."""
    tree = DecisionTreeClassifier(max_depth=3, min_samples_split=2)

    # An unfitted tree must be rejected.
    assert_raises(NotFittedError, export_graphviz, tree, StringIO())

    tree.fit(X, y)

    # The length of feature_names must match the number of features.
    mismatches = (
        (["a"],
         ("Length of feature_names, "
          "1 does not match number of features, 2")),
        (["a", "b", "c"],
         ("Length of feature_names, "
          "3 does not match number of features, 2")),
    )
    for names, message in mismatches:
        assert_raise_message(ValueError, message, export_graphviz, tree, None,
                             feature_names=names)

    # An empty class_names list must raise IndexError.
    assert_raises(IndexError, export_graphviz, tree, StringIO(), class_names=[])

    # Invalid precision values are rejected; both checks share one buffer.
    buf = StringIO()
    assert_raises_regex(ValueError, "should be greater or equal",
                        export_graphviz, tree, buf, precision=-1)
    assert_raises_regex(ValueError, "should be an integer",
                        export_graphviz, tree, buf, precision="1")
示例12: decision_trees
def decision_trees(features, labels):
    """Cross-validate an entropy decision tree and print the averaged scores.

    Args:
        features: Sample data for fitting and cross-validation.
        labels: Targets matching ``features``.
    """
    model = DecisionTreeClassifier(criterion="entropy", random_state=0)
    # NOTE(review): this fit looks redundant if cross_val_score refits the
    # estimator per fold -- confirm before removing.
    model.fit(features, labels)

    fold_scores = cross_validation.cross_val_score(
        model,
        features,
        labels,
        cv=10,
        score_func=metrics.precision_recall_fscore_support,
    )
    # Average over the folds, then round to two decimals for display.
    averaged = numpy.mean(fold_scores, axis=0)
    print_table("Decision Trees", numpy.around(averaged, 2))
示例13: text_learning_experiment
def text_learning_experiment(words_to_remove=None):
    """Fit a decision tree on TF-IDF email features and report its top feature.

    Only the first 150 training samples are used for fitting. Train and test
    accuracy and the most important feature are printed.

    Args:
        words_to_remove: Optional list of words forwarded to
            ``vectorize_emails``; defaults to an empty list.

    Returns:
        Tuple ``(feature_name, feature_importance)`` for the feature with the
        highest importance in the fitted tree.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if words_to_remove is None:
        words_to_remove = []

    # Context managers close both files even if vectorisation fails.
    # Assumes vectorize_emails finishes reading before it returns -- TODO confirm.
    with open("../text_learning/from_sara.txt", "r") as from_sara, \
            open("../text_learning/from_chris.txt", "r") as from_chris:
        word_data, authors = vectorize_emails(from_sara, from_chris, max_emails=300,
                                              words_to_remove=words_to_remove)

    features_train, features_test, labels_train, labels_test = \
        cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)

    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train = vectorizer.fit_transform(features_train)
    features_test = vectorizer.transform(features_test).toarray()

    # Deliberately fit on only the first 150 training samples.
    features_train = features_train[:150].toarray()
    labels_train = labels_train[:150]

    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    predict_train = clf.predict(features_train)
    predict_test = clf.predict(features_test)

    # Single-argument print calls reproduce the spacing of the former Python 2
    # print statements and are also valid on Python 3.
    print("train acc: %s" % accuracy_score(labels_train, predict_train))
    print("test acc:  %s" % accuracy_score(labels_test, predict_test))

    feature_index = np.argmax(clf.feature_importances_)
    feature_importance = clf.feature_importances_[feature_index]
    feature_name = vectorizer.get_feature_names()[feature_index]

    print("Most important feature, and relative importance: %s : %s"
          % (feature_name, feature_importance))
    return feature_name, feature_importance
示例14: train_dtc
def train_dtc(X, y):
    """Build a default ``DecisionTreeClassifier``, fit it on ``X``/``y`` and return it.

    Args:
        X: Training samples.
        y: Training targets.

    Returns:
        The fitted classifier.
    """
    model = DecisionTreeClassifier()
    model.fit(X, y)
    return model
示例15: decision_tree_prediction
def decision_tree_prediction(features_train, labels_train, features_test, ids):
    """Fit a constrained decision tree and write test-set scores to a CSV file.

    The tree is fitted on the full training data. For every row of
    ``features_test``, column 1 of ``predict_proba`` is written next to the
    matching entry of ``ids`` in ``data/canivel_decision_tree.csv`` under the
    header ``ID,TARGET``.

    Args:
        features_train: Training samples.
        labels_train: Training targets.
        features_test: Samples to score.
        ids: Identifiers, one per row of ``features_test``.
    """
    clf = DecisionTreeClassifier(criterion='gini',
                                 min_samples_split=10,
                                 max_depth=10,
                                 max_leaf_nodes=16,
                                 max_features=2)
    clf = clf.fit(features_train, labels_train)
    pred = clf.predict_proba(features_test)[:, 1]

    # csv.writer needs a text-mode file opened with newline="" on Python 3;
    # the former "wb" mode raises TypeError there. The context manager closes
    # the file even if writing fails.
    with open("data/canivel_decision_tree.csv", "w", newline="") as predictions_file:
        predictions_file_object = csv.writer(predictions_file)
        predictions_file_object.writerow(["ID", "TARGET"])
        predictions_file_object.writerows(zip(ids, pred))