当前位置: 首页>>代码示例>>Python>>正文


Python tree.DecisionTreeClassifier类代码示例

本文整理汇总了Python中sklearn.tree.DecisionTreeClassifier的典型用法代码示例。如果您正苦于以下问题:Python DecisionTreeClassifier类的具体用法?Python DecisionTreeClassifier怎么用?Python DecisionTreeClassifier使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了DecisionTreeClassifier类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: decision_tree_entropy

def decision_tree_entropy(training_data):
    """Fit an entropy-criterion decision tree and print its node count.

    Args:
        training_data: Indexable pair; element 0 is passed to ``fit`` as the
            samples and element 1 as the targets.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    clf = DecisionTreeClassifier(criterion="entropy", random_state=0)
    clf.fit(training_data[0], training_data[1])
    #with open("/media/deeksha/e/Deeksha/Dropbox/Coursework/MachineLearning/HW3/entropy.dot", 'w') as f:
    #    f = tree.export_graphviz(clf, out_file=f)
    # A single pre-formatted argument prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print("entropy:Number of Nodes %s" % clf.tree_.node_count)
    return clf
开发者ID:deekshachugh,项目名称:MachineLearning,代码行数:7,代码来源:DecisionTreeUsingGiniand+Entropy.py

示例2: __init__

class Transformer:
    """Keep the features a decision tree finds useful, then optionally run PCA.

    ``fit`` trains a ``DecisionTreeClassifier`` and remembers the indices of
    the columns whose importance is strictly positive; ``transform`` applies
    the same column selection (and the fitted PCA, if enabled) to one
    feature vector.
    """

    def __init__(self, use_PCA=True):
        """Create the helper estimators.

        Args:
            use_PCA: When true, the selected columns are additionally passed
                through ``PCA('mle')``.
        """
        self._clf = DecisionTreeClassifier(min_samples_leaf=10)
        # Indices of the selected columns; populated by fit().
        self._idx = None
        # NOTE(review): the scaler is constructed but every call that would
        # use it is commented out below - confirm whether scaling is wanted.
        self._scaler = StandardScaler()
        self._trans = PCA('mle')
        self._use_PCA = use_PCA

    def fit(self, X, y):
        """Fit the selector (and PCA) and return the transformed training set.

        Args:
            X: Two-dimensional samples, anything ``np.array`` accepts.
            y: Targets passed to the decision tree.

        Returns:
            A list with one selected-column array per sample when PCA is
            disabled, otherwise the output of ``PCA.fit_transform``.
        """
        X = np.array(X)
        self._clf.fit(X, y)

        # Build a real list: on Python 3 ``filter`` is lazy and cannot be
        # used as an index (nor reused later by transform()).
        importances = self._clf.feature_importances_
        self._idx = [i for i, weight in enumerate(importances) if weight > 0]

        new_set = [row[self._idx] for row in X]

#        new_set = self._scaler.fit_transform(new_set)

        if self._use_PCA:
            new_set = self._trans.fit_transform(new_set)
        return new_set

    def transform(self, features):
        """Apply the fitted column selection (and PCA) to ``features``.

        Args:
            features: Array indexable with a list of column indices.

        Returns:
            The selected columns, passed through the fitted PCA if enabled.
        """
        features = features[self._idx]
#        features = self._scaler.transform(features.astype(float))
        if self._use_PCA:
            features = self._trans.transform(features)
        return features
开发者ID:ItsLastDay,项目名称:Opinion-mining-from-reviews,代码行数:29,代码来源:solution.py

示例3: quize1

def quize1(data):
    """Fit a decision tree on four Titanic features and print their importances.

    Args:
        data: Table indexable by column name that provides the columns
            Pclass, Fare, Age, Sex and Survived (per the task statement
            below, a pandas DataFrame read from titanic.csv).

    Returns:
        None. The encoded feature table and the importances are printed.
    """
    # Task statement (translated from the original Russian):
    # 1. Load the sample from titanic.csv using pandas.
    # 2. Keep four features: passenger class (Pclass), ticket fare (Fare),
    #    passenger age (Age) and sex (Sex).
    # 3. Note that the Sex feature has string values.
    # 4. Extract the target variable - it is stored in the Survived column.
    # 5. The data has missing values - e.g. the age of some passengers is
    #    unknown.
    # 6. Such entries become nan when read by pandas. Find every object with
    #    missing features and drop it from the sample.
    # Train a decision tree with random_state=241 and all other parameters
    # at their defaults. Compute the feature importances and find the two
    # most important features; their names are the answer (comma- or
    # space-separated, order does not matter).
    feature_columns = ['Pclass', 'Fare', 'Age', 'Sex']
    dataF = data[feature_columns + ['Survived']].dropna()
    Y = dataF['Survived']
    # Work on an explicit copy so the recoding below never writes through to
    # (or warns about) the frame the caller passed in.
    dataF = dataF[feature_columns].copy()
    # One vectorised encoding: 'male' -> 1, anything else -> 0, with a
    # numeric dtype instead of a mixed object column.
    dataF['Sex'] = (dataF['Sex'] == 'male').astype(int)
    clf = DecisionTreeClassifier(random_state=241)
    print(dataF)
    clf.fit(dataF, Y)
    importances = clf.feature_importances_
    print(importances)
    return
开发者ID:BlinJin,项目名称:Machine-Learning,代码行数:26,代码来源:decision_trees.py

示例4: evaluateDecisionTree

def evaluateDecisionTree(train_x,train_y,test_x,test_y):
    """Train an entropy decision tree and score it by ROC AUC on the test set.

    Args:
        train_x: Training samples.
        train_y: Training targets.
        test_x: Test samples.
        test_y: Test targets.

    Returns:
        The value returned by ``roc_auc_score`` for the second column of
        ``predict_proba`` on the test set.
    """
    clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=5, max_depth=20)
    clf.fit(train_x, train_y)
    # Score the test set once and reuse the result for both the metric and
    # the plot instead of running predict_proba a second time.
    p = clf.predict_proba(test_x)[:, 1]
    auc = roc_auc_score(test_y, p)
    plotAUC(test_y, p, 'DT')
    return auc
开发者ID:ds-ga-1001-final,项目名称:project,代码行数:7,代码来源:decision_tree.py

示例5: decision_tree

def decision_tree(train_bow,train_labels,test_bow,test_labels,bow_indexes):
    """Train a default decision tree and pass it to ``test`` under the tag "dt".

    Args:
        train_bow: Training samples given to ``fit``.
        train_labels: Training targets given to ``fit``.
        test_bow: Forwarded to ``test``.
        test_labels: Forwarded to ``test``.
        bow_indexes: Forwarded to ``test``.
    """
    print("Training decision tree")
    model = DecisionTreeClassifier()
    model.fit(train_bow, train_labels)

    print("Testing decision tree")
    test(model, "dt", test_bow, test_labels, bow_indexes)
开发者ID:wangk1,项目名称:research,代码行数:7,代码来源:classifiers_func.py

示例6: train_adaboost

def train_adaboost(features, labels, learning_rate, n_lab, n_runs, n_estim, n_samples):
    """Train ``n_runs`` AdaBoost classifiers for each of the first ``n_lab`` labels.

    Args:
        features: Samples forwarded to ``get_binary_sets``.
        labels: Targets; their sorted unique values define the label order.
        learning_rate: ``learning_rate`` of every ``AdaBoostClassifier``.
        n_lab: Number of unique labels (taken from the front) to train for.
        n_runs: Number of classifiers trained per label.
        n_estim: ``n_estimators`` of every ``AdaBoostClassifier``.
        n_samples: Forwarded to ``get_binary_sets``.

    Returns:
        Tuple ``(allLearners, used_labels)``: a list holding, per label, the
        list of fitted classifiers, and the array of labels that were used.
    """
    uniqLabels = np.unique(labels)
    # Built by concatenation so the output matches the original Python 2
    # print statement (including its doubled spaces) on either version.
    print('Taking  ' + str(n_lab) + '  labels')
    uniqLabels = uniqLabels[:n_lab]
    used_labels = uniqLabels
    pbar = start_progressbar(len(uniqLabels), 'training adaboost for %i labels' %len(uniqLabels))
    allLearners = []
    for yy, targetLab in enumerate(uniqLabels):
        runs = []
        for rrr in range(n_runs):
            feats, labs = get_binary_sets(features, labels, targetLab, n_samples)
            baseClf = DecisionTreeClassifier(max_depth=4, min_samples_leaf=10, min_samples_split=10)
            # NOTE(review): AdaBoost appears to fit its own copies of the base
            # estimator, so this pre-fit looks redundant; kept so behaviour
            # (including random-state consumption) is unchanged.
            baseClf.fit(feats, labs)
            ada_real = AdaBoostClassifier( base_estimator=baseClf, learning_rate=learning_rate,
                                      n_estimators=n_estim,
                                      algorithm="SAMME.R")
            runs.append(ada_real.fit(feats, labs))
        allLearners.append(runs)
        update_progressbar(pbar, yy)
    end_progressbar(pbar)

    return allLearners, used_labels
开发者ID:aarslan,项目名称:action_rec,代码行数:26,代码来源:classifier_wrappers.py

示例7: test_importances

def test_importances():
    """Verify feature importances for every tree class and across builders."""
    X, y = datasets.make_classification(
        n_samples=2000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    for name, Tree in CLF_TREES.items():
        failure = "Failed with {0}".format(name)

        estimator = Tree(random_state=0)
        estimator.fit(X, y)
        weights = estimator.feature_importances_

        # Ten features were generated, three of them informative.
        assert_equal(weights.shape[0], 10, failure)
        assert_equal(np.sum(weights > 0.1), 3, failure)

        # Thresholding at the mean must keep some, but not all, columns.
        reduced = estimator.transform(X, threshold="mean")
        assert_less(0, reduced.shape[1], failure)
        assert_less(reduced.shape[1], X.shape[1], failure)

    # Check on iris that importances are the same for all builders
    default_tree = DecisionTreeClassifier(random_state=0)
    default_tree.fit(iris.data, iris.target)
    leaf_limited_tree = DecisionTreeClassifier(
        random_state=0, max_leaf_nodes=len(iris.data))
    leaf_limited_tree.fit(iris.data, iris.target)

    assert_array_equal(default_tree.feature_importances_,
                       leaf_limited_tree.feature_importances_)
开发者ID:Carol-Hu,项目名称:scikit-learn,代码行数:33,代码来源:test_tree.py

示例8: MultEstimator

class MultEstimator(BaseEstimator):
    """One decision tree per category, with a global tree as fallback.

    The last column of ``X`` is read as the category id and the remaining
    columns as the features. A global tree is trained on all rows; each
    category listed in ``categories`` additionally gets its own tree, which
    overrides the global prediction for rows of that category.
    """

    def __init__(self, categories):
        """Store the category ids that get a dedicated model."""
        self.categories = categories

    def fit(self, X, y, **params):
        """Fit the global model and one model per category present in ``X``.

        Args:
            X: 2-D array; last column is the category id.
            y: Targets, indexable with a boolean mask.
            **params: Accepted for interface compatibility; unused.

        Returns:
            ``self``, so calls can be chained.
        """
        categ = X[:, -1]
        data = X[:, :-1]
        self.tot_model = DecisionTreeClassifier(max_depth=8, min_samples_leaf=100)
        self.tot_model.fit(data, y)
        # A category keeps ``None`` when it has no training rows; prediction
        # then falls back to the global model instead of fit() crashing on
        # an empty sample.
        self.models = {c: None for c in self.categories}
        for c in self.models:
            mask = categ == c
            if not mask.any():
                continue
            m = DecisionTreeClassifier(max_depth=8, min_samples_leaf=100)
            m.fit(data[mask], y[mask])
            self.models[c] = m
        return self

    def predict(self, X):
        """Predict labels, preferring the per-category model where one exists."""
        categ = X[:, -1]
        data = X[:, :-1]
        p = self.tot_model.predict(data)
        for c, model in self.models.items():
            if model is None:
                continue
            mask = categ == c
            if mask.any():
                p[mask] = model.predict(data[mask])
        return p

    def predict_proba(self, X):
        """Predict class probabilities with columns ordered as in the global model.

        A per-category model may have seen only a subset of the classes; its
        probabilities are written into the matching global columns and the
        classes it never saw get probability 0.
        """
        categ = X[:, -1]
        data = X[:, :-1]
        p = self.tot_model.predict_proba(data)
        column_of = {label: i for i, label in enumerate(self.tot_model.classes_)}
        for c, model in self.models.items():
            if model is None:
                continue
            mask = categ == c
            if not mask.any():
                continue
            rows = mask.nonzero()[0]
            cols = [column_of[label] for label in model.classes_]
            p[rows] = 0.0
            p[rows[:, None], cols] = model.predict_proba(data[mask])
        return p
开发者ID:alfiya400,项目名称:kaggle-avitoDuplicatesDetection,代码行数:35,代码来源:model.py

示例9: main

def main(percentage):
    """Given a percentage for splitting the dataset, fit the training set and apply the rest as a test set.

    Args:
        percentage: Threshold compared against uniform random draws; rows
            whose draw is below it go to the training set.

    Returns:
        The value of ``get_accuracy('golden.csv')``, or ``None`` when
        fitting raises ``ValueError``.
    """
    df = pd.read_csv('cellStrength.log')
    df.drop('SSID', axis=1, inplace=True)
    processed = preprocess(df)
    location_col = processed[0].shape[1]-4

    hash_to_location = {y:x for x,y in processed[1].items()}

    # ``targets`` is only needed by the disabled get_code() call below.
    df2, targets = encode_target(processed[0], location_col)
    msk = np.random.rand(len(df)) < percentage
    test = df2[~msk].copy()
    train = df2[msk].copy()

    # First line of golden.csv: the true locations of the test rows.
    with open('golden.csv', 'w') as golden:
        golden.write(','.join([hash_to_location[p] for p in test['Target'].tolist()]) + '\n')

    test.drop(186, axis=1, inplace=True)
    test.drop('Target', axis=1, inplace=True)

    features = list(df2.columns[:location_col]) + list(df2.columns[location_col+1:-1])

    y = train['Target']
    X = train[features]

    dt = DecisionTreeClassifier(min_samples_split=3, random_state=99)
    try:
        dt.fit(X, y)
    except ValueError:
        # Best-effort as in the original: give up on this split without
        # raising. golden.csv then holds only the first line.
        return
    predictions = dt.predict(test).tolist()
    # Second line of golden.csv: the predicted locations.
    with open('golden.csv', 'a') as golden:
        golden.write(','.join([hash_to_location[p] for p in predictions]))

    # get_code(dt, features, targets)
    return get_accuracy('golden.csv')
开发者ID:elahi-arman,项目名称:Python,代码行数:34,代码来源:router_association.py

示例10: programmer_2

def programmer_2():
    """Train a CART tree on data/model.xls, save it, and plot its ROC curve.

    Reads the spreadsheet, uses the first 80% of rows for training and the
    rest for testing, dumps the fitted tree to tmp/tree.pkl, shows the
    training confusion matrix and the test ROC curve, and prints the ROC
    thresholds.
    """
    datafile = 'data/model.xls'
    data = pd.read_excel(datafile)
    # ``.values`` replaces ``as_matrix()``, which pandas 1.0 removed.
    data = data.values
    # Randomly shuffle the data.
    # NOTE(review): if ``shuffle`` is ``random.shuffle``, shuffling a 2-D
    # array row-wise can duplicate rows - confirm which shuffle is imported.
    shuffle(data)

    # Train/test ratio of 8:2.
    p = 0.8
    train = data[:int(len(data) * p), :]
    test = data[int(len(data) * p):, :]

    # Build the CART decision tree model; columns 0-2 are the features and
    # column 3 is the target.
    treefile = 'tmp/tree.pkl'
    tree = DecisionTreeClassifier()
    tree.fit(train[:, :3], train[:, 3])

    joblib.dump(tree, treefile)

    # Show the confusion-matrix visualisation for the training set.
    cm_plot(train[:, 3], tree.predict(train[:, :3])).show()
    # Note that scikit-learn's predict method returns the predictions directly.

    fpr, tpr, thresholds = roc_curve(
        test[:, 3], tree.predict_proba(test[:, :3])[:, 1], pos_label=1)
    plt.plot(fpr, tpr, linewidth=2, label='ROC of CART', color='green')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Set the axis limits.
    plt.ylim(0, 1.05)
    plt.xlim(0, 1.05)
    plt.legend(loc=4)
    plt.show()
    print(thresholds)
开发者ID:Ctipsy,项目名称:python_data_analysis_and_mining_action,代码行数:32,代码来源:code.py

示例11: test_graphviz_errors

def test_graphviz_errors():
    """Exercise the error paths of export_graphviz."""
    tree_clf = DecisionTreeClassifier(min_samples_split=2, max_depth=3)

    # Exporting before fitting must fail.
    assert_raises(NotFittedError, export_graphviz, tree_clf, StringIO())

    tree_clf.fit(X, y)

    # A feature_names list whose length differs from the number of
    # features (2) must be rejected, whether too short or too long.
    mismatch_cases = (
        (["a"],
         ("Length of feature_names, "
          "1 does not match number of features, 2")),
        (["a", "b", "c"],
         ("Length of feature_names, "
          "3 does not match number of features, 2")),
    )
    for names, expected in mismatch_cases:
        assert_raise_message(ValueError, expected, export_graphviz,
                             tree_clf, None, feature_names=names)

    # An empty class_names list cannot be indexed.
    assert_raises(IndexError, export_graphviz, tree_clf, StringIO(),
                  class_names=[])

    # precision must be a non-negative integer.
    buf = StringIO()
    assert_raises_regex(ValueError, "should be greater or equal",
                        export_graphviz, tree_clf, buf, precision=-1)
    assert_raises_regex(ValueError, "should be an integer",
                        export_graphviz, tree_clf, buf, precision="1")
开发者ID:Lavanya-Basavaraju,项目名称:scikit-learn,代码行数:32,代码来源:test_export.py

示例12: decision_trees

def decision_trees(features, labels):
    """Cross-validate an entropy decision tree and print the averaged scores.

    Args:
        features: Samples passed to ``cross_val_score``.
        labels: Targets passed to ``cross_val_score``.
    """
    classifier = DecisionTreeClassifier(random_state=0, criterion="entropy")
    # No explicit fit on the full data: cross_val_score fits its own clone
    # of the estimator on every fold, so a prior fit is discarded work.
    scores = cross_validation.cross_val_score(
        classifier, features, labels, cv=10, score_func=metrics.precision_recall_fscore_support
    )
    print_table("Decision Trees", numpy.around(numpy.mean(scores, axis=0), 2))
开发者ID:pelluch,项目名称:data-mining,代码行数:7,代码来源:main.py

示例13: text_learning_experiment

def text_learning_experiment(words_to_remove=None):
    """Train a decision tree on e-mail text and report its top feature.

    Args:
        words_to_remove: Words forwarded to ``vectorize_emails``; ``None``
            (the default) means an empty list.

    Returns:
        Tuple ``(feature_name, feature_importance)`` for the feature with
        the highest importance in the fitted tree.
    """
    # A fresh list per call: a mutable default would be shared between calls.
    if words_to_remove is None:
        words_to_remove = []

    # Context managers close both inputs even if vectorisation raises.
    with open("../text_learning/from_sara.txt", "r") as from_sara:
        with open("../text_learning/from_chris.txt", "r") as from_chris:
            word_data, authors = vectorize_emails(
                from_sara, from_chris, max_emails=300, words_to_remove=words_to_remove)

    features_train, features_test, labels_train, labels_test = \
        cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train = vectorizer.fit_transform(features_train)
    features_test  = vectorizer.transform(features_test).toarray()

    # Train on the first 150 samples only.
    features_train = features_train[:150].toarray()
    labels_train   = labels_train[:150]

    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    predict_train = clf.predict(features_train)
    predict_test = clf.predict(features_test)
    # Pre-formatted strings give the same output as the original Python 2
    # print statements and are valid on Python 3 as well.
    print("train acc: %s" % (accuracy_score(labels_train, predict_train),))
    print("test acc:  %s" % (accuracy_score(labels_test, predict_test),))
    feature_index = np.argmax(clf.feature_importances_)
    feature_importance = clf.feature_importances_[feature_index]
    feature_name = vectorizer.get_feature_names()[feature_index]
    print("Most important feature, and relative importance: %s : %s"
          % (feature_name, feature_importance))
    return feature_name, feature_importance
开发者ID:andrei-iusan,项目名称:ud120-projects,代码行数:25,代码来源:poi_id.py

示例14: train_dtc

def train_dtc(X, y):
    """
    Build a default-configured Decision Tree Classifier, fit it on
    (X, y), and hand back the fitted estimator.
    """
    # fit() returns the estimator itself, so construction and training chain.
    return DecisionTreeClassifier().fit(X, y)
开发者ID:texaspandaa,项目名称:Text-Mining,代码行数:7,代码来源:1.py

示例15: decision_tree_prediction

def decision_tree_prediction(features_train, labels_train, features_test, ids):
    """Fit a gini decision tree on the full training set and write predictions to CSV.

    Writes ``data/canivel_decision_tree.csv`` with a header row
    ``ID,TARGET`` followed by one row per test sample pairing its id with
    the second column of ``predict_proba``.

    Args:
        features_train: Training samples.
        labels_train: Training targets.
        features_test: Samples to score.
        ids: Identifiers zipped with the predictions, one per test sample.
    """
    # The original also computed a 70/30 hold-out split here, but every
    # statement that used it was commented out, so the split is dropped.
    clf = DecisionTreeClassifier(criterion='gini',
                                 min_samples_split=10,
                                 max_depth=10,
                                 max_leaf_nodes=16,
                                 max_features=2)

    clf = clf.fit(features_train, labels_train)

    pred = clf.predict_proba(features_test)[:, 1]

    # NOTE(review): "wb" is the mode csv.writer expects on Python 2; on
    # Python 3 it needs text mode with newline="" - confirm the target version.
    # The context manager closes the file even if writing fails.
    with open("data/canivel_decision_tree.csv", "wb") as predictions_file:
        predictions_file_object = csv.writer(predictions_file)
        predictions_file_object.writerow(["ID", "TARGET"])
        predictions_file_object.writerows(zip(ids, pred))
开发者ID:canivel,项目名称:Kaggle-Santander,代码行数:30,代码来源:regular_classifiers.py


注:本文中的sklearn.tree.DecisionTreeClassifier类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。