当前位置: 首页>>代码示例>>Python>>正文


Python RandomForestClassifier.transform方法代码示例

本文整理汇总了Python中sklearn.ensemble.RandomForestClassifier.transform方法的典型用法代码示例。如果您正苦于以下问题:Python RandomForestClassifier.transform方法的具体用法?Python RandomForestClassifier.transform怎么用?Python RandomForestClassifier.transform使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sklearn.ensemble.RandomForestClassifier的用法示例。


在下文中一共展示了RandomForestClassifier.transform方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: RFSelection

# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
class RFSelection(TransformerMixin):
    """Feature-selection transformer backed by random-forest importances.

    Fits a RandomForestClassifier, ranks features by importance, and in
    transform() keeps only the columns whose importance is at least the
    threshold learned in fit().
    """

    def __init__(self, n_features=None, n_estimators=100, random_state=0):
        # Bug fix: the original hard-coded random_state=0, silently
        # ignoring the caller's argument.
        self.rf = RandomForestClassifier(
            n_estimators=n_estimators, random_state=random_state)
        self.n_features = n_features

    def fit(self, X, y=None):
        """Fit the forest on (X, y) and derive the importance threshold."""
        X_local = np.array(X)
        self.rf.fit(X_local, y)

        importances = self.rf.feature_importances_
        ranking = np.argsort(importances)  # ascending by importance

        if self.n_features is None:
            # Default: keep roughly half of the columns.
            self.n_features = round(X_local.shape[1] / 2)

        # Importance of the feature at position n_features of the
        # ascending ranking; columns at or above it survive transform().
        self.threshold = importances[ranking[self.n_features]]
        return self

    def transform(self, X, y=None):
        """Return X restricted to columns with importance >= threshold.

        Implemented with an explicit column mask because the old
        RandomForestClassifier.transform(X, threshold) API was removed
        from scikit-learn (0.19+); the selection rule is the same one
        that API applied (importance >= threshold).
        """
        mask = self.rf.feature_importances_ >= self.threshold
        return np.asarray(X)[:, mask]
开发者ID:rloliveirajr,项目名称:sklearn_transformers,代码行数:27,代码来源:rf_selection.py

示例2: test_importances

# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def test_importances():
    """Check variable importances."""
    X, y = datasets.make_classification(n_samples=1000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X, y)
    importances = clf.feature_importances_
    n_important = sum(importances > 0.1)

    assert_equal(importances.shape[0], 10)
    assert_equal(n_important, 3)

    # Bug fix: the original wrote assert_less(0 < X_new.shape[1], X.shape[1]),
    # which compares a *bool* with the column count, so the intended lower
    # bound could never fail.  Assert the two bounds separately.
    X_new = clf.transform(X, threshold="mean")
    assert_less(0, X_new.shape[1])
    assert_less(X_new.shape[1], X.shape[1])

    # Check with sample weights
    sample_weight = np.ones(y.shape)
    sample_weight[y == 1] *= 100

    clf = RandomForestClassifier(n_estimators=50, random_state=0)
    clf.fit(X, y, sample_weight=sample_weight)
    importances = clf.feature_importances_
    assert np.all(importances >= 0.0)

    # Scaling all weights by a constant must not change the importances.
    clf = RandomForestClassifier(n_estimators=50, random_state=0)
    clf.fit(X, y, sample_weight=3*sample_weight)
    importances_bis = clf.feature_importances_
    assert_almost_equal(importances, importances_bis)
开发者ID:Arezou1,项目名称:scikit-learn,代码行数:36,代码来源:test_forest.py

示例3: main

# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def main():
	if len(sys.argv) < 6:
		print "Usage: [program] train test tree_num seed output [thread_num]"
		return
	elif len(sys.argv) == 6:
		threadNum = 1
	elif len(sys.argv) == 7:
		threadNum = int(sys.argv[6])

	treeNum = int(sys.argv[3])
	seed = int(sys.argv[4])

	print "try",treeNum,"trees with",threadNum,"threads","and seed",seed

	trainData = dataProcessor(sys.argv[1])
	testData = dataProcessor(sys.argv[2])
	target = [data.type for data in trainData]
	train = [data.matrix for data in trainData]
	test = [data.matrix for data in testData]

#	target = numpy.genfromtxt("./data/target.csv",delimiter=",")
#	train = numpy.genfromtxt("./data/train.csv",delimiter=",")
#	test = numpy.genfromtxt("./data/test.csv",delimiter=",")
	print "Data load over, start to generate trees"

	rf = RandomForestClassifier(n_estimators = treeNum,n_jobs=threadNum,oob_score=True)
	rf.fit(train,target)
	train_r = rf.transform(train)
	test_r = rf.transform(test)
	
	numpy.savetxt("train.csv",train_r,fmt="%d")
	numpy.savetxt("test.csv",test_r,fmt="%d")
	numpy.savetxt("target.csv",target,fmt="%d")
	numpy.savetxt("ans.csv",[0]*len(test_r),fmt="%d")

	print "fit done, # of class:",rf.n_classes_,", oob score:",rf.oob_score_

	result = rf.predict(test)
	fout = open(sys.argv[5],"w")
	for i in result:
		tmp = int(i)
		fout.write(`tmp`+"\n")
开发者ID:raychin4563,项目名称:MLFinal,代码行数:44,代码来源:randomForestTry.py

示例4: test_importances

# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def test_importances():
    """Check variable importances."""
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0
    )

    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X, y)
    importances = clf.feature_importances_
    n_important = sum(importances > 0.1)

    assert_equal(importances.shape[0], 10)
    assert_equal(n_important, 3)

    # Bug fix: the original wrote assert_less(0 < X_new.shape[1], X.shape[1]),
    # comparing a *bool* with the column count, so the intended lower bound
    # could never fail.  Assert the two bounds separately.
    X_new = clf.transform(X, threshold="mean")
    assert_less(0, X_new.shape[1])
    assert_less(X_new.shape[1], X.shape[1])
开发者ID:vd4mmind,项目名称:scikit-learn,代码行数:18,代码来源:test_forest.py

示例5: featureSelection

# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def featureSelection(matrixX, matrixY, seed, fileName):
        """Fit a 240-tree random forest, write its OOB score, training
        accuracy, and per-feature importances to a CSV, and return the
        fitted model together with the importance-filtered matrix."""
        model = RandomForestClassifier(n_estimators=240,
            random_state=seed,
            oob_score=True)
        model.fit(matrixX, numpy.ravel(matrixY))
        reduced = model.transform(matrixX)

        # Collect summary metrics first, then one row per feature.
        rows = [{"ID": "oob_score", "Value": model.oob_score_},
                {"ID": "Accuracy", "Value": model.score(matrixX, matrixY)}]
        for idx, importance in enumerate(model.feature_importances_):
            rows.append({"ID": idx + 1, "Value": importance})
        writeFileArray(rows, "%s_featureImportance_seed-%i.csv" % (fileName, seed))
        return [model, reduced]
开发者ID:noraw,项目名称:CyborgBugs,代码行数:18,代码来源:transformFeatures.py

示例6: rf_classify

# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
	def rf_classify(self):
		print "Random Forest"

		clf = RandomForestClassifier()
		clf.fit(self.descr, self.target)
		mean = clf.score(self.test_descr, self.test_target)
		pred = clf.predict(self.test_descr)

		pred_df = pd.DataFrame(pred)
		pred_df.to_csv("pred_val_rf.csv")

		print "Pred ", pred
		print "Mean : %3f" % mean
		print "Feature Importances ", clf.feature_importances_
		print "Predict Probability ", clf.predict_proba(self.descr)
		print "Transform ", clf.transform(self.descr)
开发者ID:raghav297,项目名称:crunchbase,代码行数:18,代码来源:classify.py

示例7: main

# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def main():
    X, y = loadData("breast-cancer-wisconsin.data")
    feature_names = [
        "Clump Thickness",
        "Uniformity of Cell Size",
        "Uniformity of Cell Shape",
        "Marginal Adhesion",
        "Single Epithelial Cell Size",
        "Bare Nuclei",
        "Bland Chromatin",
        "Normal Nucleoli",
        "Mitoses",
    ]
    np.random.seed(1)

    # split the data into training and test sets
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

    # find the best classifier from the grid search
    params = gridSearchCV(X_train, y_train)
    n_estimators = params["n_estimators"]
    max_features = params["max_features"]

    # fit the classifier to the training data with the best parameters
    clf = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features)
    clf.fit(X_train, y_train)

    # report the training and test scores
    print "Accuracy on the test set: %.2f%%" % (clf.score(X_test, y_test) * 100)
    print "Accuracy on the training set: %.2f%%\n" % (clf.score(X_train, y_train) * 100)

    # report the most important features
    feature_importances = clf.feature_importances_
    sorted_feature_importances = np.argsort(feature_importances)[::-1]
    j = clf.transform(X).shape[1]
    print "The most important %d features are:" % j
    for i, index in enumerate(sorted_feature_importances[:j]):
        print "%d) %s (%.2f%%)" % (i + 1, feature_names[index], feature_importances[index] * 100)
开发者ID:rodyou,项目名称:Machine-Learning,代码行数:40,代码来源:random_forest.py

示例8: main

# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def main(args):
    """Rank metabolomics features with a random forest and export both the
    ranking and the importance-filtered data matrix.

    Reads a wide dataset + design file via wideToDesign, fits a
    RandomForestClassifier on group labels, writes features ranked by
    importance to args.oname2, and the transformed (selected) data joined
    with the class column to args.oname.
    """
    # Import data and transpose
    logger.info(u'Importing data with following parameters: \n\tWide: {0}\n\tDesign: {1}\n\tUnique ID: {2}\n\tGroup Column: {3}'.format(args.fname, args.dname, args.uniqID, args.group))
    dat = wideToDesign(args.fname, args.dname, args.uniqID, args.group, clean_string=True)
    data = dat.transpose()
    # Drop feature columns containing any NaN (forests cannot handle them).
    data.dropna(axis=1, inplace=True)

    # Pull classifications out of dataset
    classes = data[dat.group].copy()
    data.drop(dat.group, axis=1, inplace=True)
    #TODO: Random forest does not handle NaNs, need to figure out the proper way to impute values.

    # Build Random Forest classifier
    logger.info('Creating classifier')
    model = RandomForestClassifier(n_estimators=args.num)
    model.fit(data, classes)

    # Identify features
    # NOTE(review): DataFrame.sort(columns=...) is the legacy pandas (<0.20)
    # spelling of sort_values; kept as-is for the original environment.
    importance = pd.DataFrame([data.columns, model.feature_importances_]).T.sort(columns=1, ascending=False)

    # Export features ranked by importance
    logger.info('Exporting features')
    # revertStr undoes the clean_string renaming applied by wideToDesign.
    rev = importance.applymap(lambda x: dat.revertStr(x))
    rev.columns = ('feature', 'ranked_importance')
    rev.to_csv(args.oname2, index=False, sep='\t')

    # Select data based on features
    # Reorder columns by importance, then keep every column (threshold=0).
    data = data[importance.ix[:, 0].tolist()]
    selected_data = pd.DataFrame(model.transform(data, threshold=0))
    selected_data.columns = [dat.revertStr(x) for x in data.columns]

    # Merge on classes and export
    logger.info('Exporting transformed data')
    clDf = pd.DataFrame(classes)
    clDf.reset_index(inplace=True)
    out = clDf.join(selected_data)
    out.to_csv(args.oname, index=False, sep='\t', float_format="%.4f")
开发者ID:secimTools,项目名称:GalaxyTools,代码行数:39,代码来源:RandomForest.py

示例9: RFClassifier

# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def RFClassifier(criteria, maxN):

    fhin = open('train.csv', 'rU')
    header = fhin.readline()
    fhin.close()
    patientTags = header.split(',')[1:]
    y = []
    for each in patientTags:
        if re.search('CON', each):
            y.append(1)
        else:
            y.append(0)
    data = joblib.load('GS_pickles/meanCenteredData.pkl')

    RFModel = RFC(criterion=criteria, max_features = "auto",  compute_importances=True, n_jobs=6)

    classifier = RFModel.fit(data, y)


    testData = joblib.load('GS_pickles/imputed_test_data.pkl')

    predictions = RFModel.predict(testData)

    realLabels = []
    for each in predictions:
        if each==0:
            realLabels.append('AD')
        else:
            realLabels.append('Normal')

    print predictions
    print realLabels

#    sys.exit()
    featureImp = classifier.feature_importances_

#    print fsorted
    print 'feature importance', featureImp.shape, featureImp

    featureInd = []

    for i in range(8650):
        featureInd.append([featureImp[i], i])

    fBest = []
    fBestInd = []
#    for i in range(8650):
#        if featureInd[i][0]>0.001:
#            fBest.append(featureInd[i])
#            fBestInd.append(featureInd[i][1])
    fSorted = sorted(featureInd, key=sortFun)
#
#    print featureInd[:500]
    print fSorted[-44:]

    for each in fSorted[-44:]:
        fBest.append(each[1])

    fBestInd = sorted(fBest)
    print fBestInd

#    print 'len(fBest)',  len(fBest)
#    print fBest
#    print fBestInd
    joblib.dump(fBestInd, 'randomForest_features_44v2.pkl')
#
#    scor = classifier.oob_score_
#
#    df = classifier.oob_decision_function_
#    sys.exit()
    skf = cross_validation.StratifiedKFold(y, 10)
    cv_scores = cross_validation.cross_val_score(RFModel, data, y, cv=skf, n_jobs=1)
    print "Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() / 2)
#
#
#
#
################################################################################
## Classification and ROC analysis
#
## Run classifier with crossvalidation and plot ROC curves
##cv = StratifiedKFold(y, k=6)
##classifier = svm.SVC(kernel='linear', probability=True)
#
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    y = np.array(y).transpose()
    print y

    dataAr = np.array(data)
#
    dataTrans = RFModel.transform(dataAr)

    for i, (train, test) in enumerate(skf):
#        print dataTrans[train].shape
###        print y[train].shape
##        dataTrans[test].shape
##        print y

#.........这里部分代码省略.........
开发者ID:B-Rich,项目名称:gsinghal_python_src,代码行数:103,代码来源:parser_GS.py

示例10: RandomForestClassifier

# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
# Fit a large forest purely to rank features by importance; n_jobs=-1 uses
# all cores, random_state=0 makes the ranking reproducible.
forest = RandomForestClassifier(n_estimators=10000,
                                random_state=0,
                                n_jobs=-1)

forest.fit(X_train, y_train)
importances = forest.feature_importances_

# Indices of features sorted from most to least important.
indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))

# Bar chart of the ranked importances.
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]),
        importances[indices],
        color='lightblue',
        align='center')

plt.xticks(range(X_train.shape[1]),
           feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
# plt.tight_layout()
# plt.savefig('./random_forest.png', dpi=300)
plt.show()

# Keep only the columns whose importance is >= 0.15.
# NOTE(review): estimator.transform() was removed in scikit-learn 0.19;
# modern code would use SelectFromModel instead.
X_selected = forest.transform(X_train, threshold=0.15)
X_selected.shape
开发者ID:amdshameer,项目名称:python-machine-learning-book,代码行数:31,代码来源:ch04.py

示例11: gen_data

# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def gen_data():
    """Assemble the train/test feature matrices for the malware-classification
    model by merging several precomputed feature files on 'Id'.

    Combines: pickled 4k base features, byte/instruction frequency CSVs,
    750-gram counts + DLL features (forest-selected down to the columns
    whose importance clears 1.25*mean), and asm-image features.

    Returns:
        (train, test): two pandas DataFrames keyed by 'Id' (train also
        carries a 'Class' column).
    """

    # the 4k features!
    the_train = pickle.load(open('X33_train_reproduce.p','rb'))  
    the_test = pickle.load(open('X33_test_reproduce.p','rb'))
    # corresponding id and labels
    Id = pickle.load(open('xid.p','rb'))
    labels = pickle.load(open('y.p','rb'))    
    Id_test = pickle.load(open('Xt_id.p','rb'))

    # merge them into pandas
    join_train = np.column_stack((Id, the_train, labels))
    join_test = np.column_stack((Id_test, the_test))
    train = pd.DataFrame(join_train, columns=['Id']+['the_fea%i'%i for i in xrange(the_train.shape[1])] + ['Class'])
    test = pd.DataFrame(join_test, columns=['Id']+['the_fea%i'%i for i in xrange(the_train.shape[1])])
    del join_train, join_test
    # convert into numeric features
    # NOTE(review): convert_objects is the legacy pandas API (removed in
    # 0.23); kept as-is for the original environment.
    train = train.convert_objects(convert_numeric=True)
    test = test.convert_objects(convert_numeric=True)
    
    # including more things
    train_count = pd.read_csv("train_frequency.csv")
    test_count = pd.read_csv("test_frequency.csv") 
    train = pd.merge(train, train_count, on='Id')
    test = pd.merge(test, test_count, on='Id')


    
    # instr count
    train_instr_count = pd.read_csv("train_instr_frequency.csv")
    test_instr_count = pd.read_csv("test_instr_frequency.csv")
    # Drop instruction columns that never occur in the training set (from
    # both sides, so train/test stay aligned).
    for n in list(train_instr_count)[1:]:
        if np.sum(train_instr_count[n]) == 0:
            del train_instr_count[n]
            del test_instr_count[n]
    
    train_instr_freq = train_instr_count.copy()
    test_instr_freq = test_instr_count.copy()
    
    # Normalise each row of counts to relative frequencies; rows summing to
    # zero produce NaN, replaced with 0 below.
    train_instr_freq.ix[:,1:] = train_instr_freq.ix[:,1:].apply(lambda x: x/np.sum(x), axis = 1)
    #train_instr_freq = train_instr_freq.replace(np.inf, 0)
    train_instr_freq = train_instr_freq.replace(np.nan, 0)
    test_instr_freq.ix[:,1:]=test_instr_freq.ix[:,1:].apply(lambda x: x/np.sum(x), axis = 1)
    #test_instr_freq = test_instr_freq.replace(np.inf, 0)
    test_instr_freq = test_instr_freq.replace(np.nan, 0)
    
    train = pd.merge(train, train_instr_freq, on='Id')
    test = pd.merge(test, test_instr_freq, on='Id')    
    
    ## all right, include more!
    grams_train = pd.read_csv("train_data_750.csv")
    grams_test = pd.read_csv("test_data_750.csv")
    
    # daf features
    #train_daf = pd.read_csv("train_daf.csv")
    #test_daf = pd.read_csv("test_daf.csv")
    #daf_list = [0,165,91,60,108,84,42,93,152,100] #daf list for 500 grams.
    # dll features
    train_dll = pd.read_csv("train_dll.csv")
    test_dll = pd.read_csv("test_dll.csv")

    
    # merge all them
    #mine = pd.merge(grams_train, train_daf,on='Id')
    mine = grams_train
    mine = pd.merge(mine, train_dll, on='Id')
    
    mine_labels = pd.read_csv("trainLabels.csv")
    mine = pd.merge(mine, mine_labels, on='Id')
    mine_labels = mine.Class
    mine_Id = mine.Id
    del mine['Class']
    del mine['Id']
    mine = mine.as_matrix()

    #mine_test = pd.merge(grams_test, test_daf,on='Id')
    mine_test = grams_test
    mine_test = pd.merge(mine_test, test_dll,on='Id')

    mine_test_id = mine_test.Id
    del mine_test['Id']
    # Feature-select the gram/dll block: keep columns whose forest
    # importance is >= 1.25 * mean importance (legacy transform API).
    clf_se = RF(n_estimators=500, n_jobs=-1,random_state = 0)
    clf_se.fit(mine,mine_labels)
    mine_train = np.array(clf_se.transform(mine, '1.25*mean'))
    mine_test = np.array(clf_se.transform(mine_test, '1.25*mean'))

    train_mine = pd.DataFrame(np.column_stack((mine_Id, mine_train)), columns=['Id']+['mine_'+str(x) for x in xrange(mine_train.shape[1])]).convert_objects(convert_numeric=True)
    test_mine = pd.DataFrame(np.column_stack((mine_test_id, mine_test)), columns=['Id']+['mine_'+str(x) for x in xrange(mine_test.shape[1])]).convert_objects(convert_numeric=True)
    train = pd.merge(train, train_mine, on='Id')
    test = pd.merge(test, test_mine, on='Id')

    # Finally, append the first 800 asm-image features.
    train_image = pd.read_csv("train_asm_image.csv", usecols=['Id']+['asm_%i'%i for i in xrange(800)])
    test_image = pd.read_csv("test_asm_image.csv", usecols=['Id']+['asm_%i'%i for i in xrange(800)])
    train = pd.merge(train, train_image, on='Id')
    test = pd.merge(test, test_image, on='Id')

    return train, test
开发者ID:ChenglongChen,项目名称:kaggle_Microsoft_Malware,代码行数:99,代码来源:semi_model.py

示例12: len

# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
                            min_samples_split=2,
                            min_samples_leaf=1,
                            min_weight_fraction_leaf=0.0,
                            max_features='auto',
                            max_leaf_nodes=None,
                            bootstrap=True,
                            oob_score=False,
                            random_state=None,
                            verbose=0,
                            warm_start=False,
                            class_weight=None)

fit = rf.fit(predictor, classes)

# Side Step: Attribute Importance ???
# NOTE(review): rf.transform() returns the *selected feature columns*
# (those whose importance clears the threshold), not an importance vector,
# so the name var_importance is misleading.  threshold=None falls back to
# the estimator's default threshold.
var_importance = rf.transform(predictor, threshold=None)


# Step 3: Test set probability and prediction
# assumes each row of `test` is laid out as [id, f1..f4, label] — TODO confirm
test_predictor = [x[1:5] for x in test]
test_classes = [x[5] for x in test]

# Provides probability of belonging to each class
rf_prob = rf.predict_proba(test_predictor).tolist()
# Predicts the class
rf_cls = rf.predict(test_predictor).tolist()


# Step 4: Evaluating Accuracy
pop = len(data)
error = 0
开发者ID:RandallShane,项目名称:BoiseDataScienceMeetup,代码行数:33,代码来源:randomforest.py

示例13: feature_selection

# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
def feature_selection(fl, nofl, method='Fscore', N_features=10):
  """
  Selects most important features according to F-score, entropy etc
  
  Args:
  fl: 2D np array (region, feature) of flaring regions
  nofl: 2D np array (region, feature) of nonflaring regions
  method: string, choose one of: 'Fscore', 'RandomForest', 'RFE', 'chi2', 'pca', 'DecisionTree'
  N_features: integer, number of features to be selected
  
  Returns:
  fl: 2D transformed array (region, only important features) of flaring regions
  nofl: 2D transformed array of nonflaring regions
  scores: 1D array with size N_features which has the scores of the features (e.g. F score)  
          float for pca that shows variance explained  
  """
  # Binary labels: 1 for flaring rows, 0 for nonflaring rows.
  Nfl = fl.shape[0]; Nnofl = nofl.shape[0] 
  yfl = np.ones(Nfl); ynofl = np.zeros(Nnofl)
  if method == 'RandomForest':  
    selector = RandomForestClassifier(n_estimators=10000, criterion='entropy', \
                                      class_weight='auto', max_features = 0.5)
    selector.fit(np.concatenate((fl,nofl),axis=0), np.concatenate((yfl, ynofl), axis=0))
    scores=selector.feature_importances_
    #threshold = sorted(scores, reverse=True)[N_features-1]
    #fl = selector.transform(fl, threshold=threshold) 
    #nofl = selector.transform(nofl, threshold=threshold)
    # Keep the N_features columns with the highest importance.
    fl = fl[:, np.argsort(scores)[::-1][0:N_features]]
    nofl = nofl[:, np.argsort(scores)[::-1][0:N_features]]
  elif method == 'DecisionTree':
    selector = DecisionTreeClassifier(criterion='entropy', class_weight='auto')
    selector.fit(np.concatenate((fl,nofl),axis=0), np.concatenate((yfl, ynofl), axis=0))
    scores=selector.feature_importances_
    fl = fl[:, np.argsort(scores)[::-1][0:N_features]]
    nofl = nofl[:, np.argsort(scores)[::-1][0:N_features]]
  elif method == 'RFE':
    # Recursive feature elimination around an L1 logistic regression;
    # ranking_ is 1 for selected features, so sort ascending here.
    estimator = LogisticRegression(penalty='l1', class_weight='auto')
    selector = RFE(estimator, n_features_to_select=N_features, step=1)
    selector = selector.fit(np.concatenate((fl,nofl),axis=0), np.concatenate((yfl, ynofl), axis=0))
    scores = selector.ranking_
    fl = fl[:, np.argsort(scores)[0:N_features]]
    nofl = nofl[:, np.argsort(scores)[0:N_features]]
  elif method == 'Fscore':
    selector = SelectKBest(f_classif, k=N_features)
    selector.fit(np.concatenate((fl,nofl),axis=0), np.concatenate((yfl, ynofl), axis=0))
    scores=selector.scores_
    #fl = selector.transform(fl); nofl = selector.transform(nofl)
    fl = fl[:, np.argsort(scores)[::-1][0:N_features]]
    nofl = nofl[:, np.argsort(scores)[::-1][0:N_features]]
  elif method == 'chi2':
    # chi2 requires non-negative features: shift each column with a
    # negative minimum up by -min before scoring, then shift the selected
    # columns back down afterwards.  NOTE(review): the in-place column
    # updates mutate the caller's arrays.
    data = np.concatenate((fl,nofl),axis=0)
    minim = np.zeros(fl.shape[1])
    for i in range(fl.shape[1]):
      minim[i] = np.min(data[:,i])
      if minim[i] < 0:
        fl[:,i] = fl[:,i] - minim[i]; nofl[:,i] = nofl[:,i] - minim[i]  
    selector = SelectKBest(chi2, k=N_features)
    selector.fit(np.concatenate((fl,nofl),axis=0), np.concatenate((yfl, ynofl), axis=0))
    scores=selector.scores_
    #fl = selector.transform(fl); nofl = selector.transform(nofl)
    fl = fl[:, np.argsort(scores)[::-1][0:N_features]]
    nofl = nofl[:, np.argsort(scores)[::-1][0:N_features]]
    minim = minim[np.argsort(scores)[::-1][0:N_features]]
    for i in range(fl.shape[1]):
      if minim[i] < 0:
        fl[:,i] = fl[:,i] + minim[i]; nofl[:,i] = nofl[:,i] + minim[i]
  elif method == 'pca':
    # PCA projects onto N_features components instead of selecting columns;
    # scores holds per-component explained-variance ratios.
    selector = PCA(n_components=N_features)
    selector.fit(np.concatenate((fl,nofl), axis=0))
    fl = selector.transform(fl); nofl = selector.transform(nofl)
    scores = selector.explained_variance_ratio_
    print "PCA was applied and ", np.shape(fl)[1], " components were kept."
    print "Variance explained: ", np.sum(selector.explained_variance_ratio_)
  #for i in range(N_features):  
    #print zip(np.arange(scores.shape[0]), (scores.argsort())[::-1], sorted(scores)[::-1])[i]      
  return fl, nofl, scores
开发者ID:cedar10b,项目名称:flare_prediction,代码行数:77,代码来源:functions.py

示例14: RandomForestRegressor

# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
# In[234]:

# compute feature importances
# NOTE(review): DataFrame.sort(columns=...) is the legacy pandas (<0.20)
# spelling of sort_values; kept as-is for the original environment.
pd.DataFrame({'feature':feature_cols, 'importance':rfreg.feature_importances_}).sort('importance')


# In[235]:

# compute the out-of-bag R-squared score
rfreg.oob_score_


# In[236]:

# set a threshold for which features to include
# transform() keeps columns whose importance clears the threshold; the
# printed shapes show how many features survive each cutoff.
print rfreg.transform(X, threshold=0.1).shape
print rfreg.transform(X, threshold='mean').shape
print rfreg.transform(X, threshold='median').shape


# In[237]:

# create a new feature matrix that only includes important features
X_important = rfreg.transform(X, threshold='mean')


# In[238]:

# check the RMSE for a Random Forest that only includes important features
rfreg = RandomForestRegressor(n_estimators=30, max_features=2, random_state=1)
scores = cross_val_score(rfreg, X_important, y, cv=10, scoring='mean_squared_error')
开发者ID:erichseamon,项目名称:stat504,代码行数:33,代码来源:landslides-project-proposal.py

示例15: print

# 需要导入模块: from sklearn.ensemble import RandomForestClassifier [as 别名]
# 或者: from sklearn.ensemble.RandomForestClassifier import transform [as 别名]
# Shuffle the rows, fill missing values with a sentinel, and split into a
# feature matrix X and target vector y.
data_df=data_dfi[cols]
print (data_df.head())
data_df = data_df.reset_index()
dfc = data_df.reindex(np.random.permutation(data_df.index))
# -99999 acts as an outlier sentinel for missing values.
dfc.fillna(value=-99999, inplace=True)
forecast_out = int(math.ceil(0.01 * len(dfc)))
dfx = dfc[features]
X = np.array(dfx)
y = np.array(dfc[to_predict])
names = np.array(dfc.columns.values)



# Fit a forest and rank features by (rounded) importance, highest first.
clf = RandomForestClassifier(n_estimators=100,n_jobs=-1)
clf.fit(X,y)
X_selected = clf.transform(X)
feat_list = sorted(zip(map(lambda x: round(x, 4), clf.feature_importances_), names), 
             reverse=True)
print (feat_list)
# Collect feature names from the top of the ranking, stopping at the
# synthetic 'index' column introduced by reset_index() above.
good_feats = []
for i in feat_list:
	array = np.asarray(i)
	print 
	if array[1] == 'index':
		break
	good_feats.append(array[1])

print (good_feats)

# Rebuild X using only the retained feature names.
dfx = dfc[good_feats]
X = np.array(dfx)
开发者ID:themandalore,项目名称:btc_trader,代码行数:33,代码来源:ml_data1.py


注:本文中的sklearn.ensemble.RandomForestClassifier.transform方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。