当前位置: 首页>>代码示例>>Python>>正文


Python preprocessing.LabelEncoder类代码示例

本文整理汇总了Python中sklearn.preprocessing.LabelEncoder的典型用法代码示例。如果您正苦于以下问题:Python LabelEncoder类的具体用法?Python LabelEncoder怎么用?Python LabelEncoder使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了LabelEncoder类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: train

    def train(cls, X, y, word_sim_metric, classifier=LinearSVC,
              feature_num=10, feature_type='sim', verbose=True):

        if isinstance(classifier, type):
            classifier = classifier()

        labels = LabelEncoder()
        y_train = labels.fit_transform(y)

        @timeit
        def build():

            corpus = zip(X, y)
            model = Pipeline([
                ('preprocessor', TextPreprocessor(corpus, word_sim_metric, feature_num, feature_type)),
                ('vectorizer', DictVectorizer()),
                ('classifier', classifier),
            ])

            model.fit(X, y_train)
            return model

        if verbose: print("Building the model")
        model, secs = build()
        if verbose: print("Complete model building in {:0.3f} seconds".format(secs))

        return cls(labels, model)
开发者ID:gsi-upm,项目名称:sematch,代码行数:27,代码来源:application.py

示例2: load_otto_group

def load_otto_group():
    """
    Loads and returns several variables for the data set from Kaggle's Otto Group Product Classification competition.
    Link: https://www.kaggle.com/c/otto-group-product-classification-challenge

    Returns
    ----------
    data : array-like
        Pandas data frame containing the entire data set.

    X : array-like
        Training input samples.

    y : array-like
        Target values.
    """
    file_location = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'otto_group.zip')
    z = ZipFile(file_location)
    data = pd.read_csv(z.open('train.csv'))
    data = data.set_index('id')

    # move the label to the first position
    cols = data.columns.tolist()
    cols = cols[-1:] + cols[0:-1]
    data = data[cols]

    X = data.iloc[:, 1:].values

    y = data.iloc[:, 0].values

    # transform the labels from strings to integers
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    return data, X, y
开发者ID:jdwittenauer,项目名称:ionyx,代码行数:35,代码来源:datasets.py

示例3: buildTreeClassifier

def buildTreeClassifier(predictorColumns, structurestable = 'structures.csv',  targetcolumn = 'pointGroup', md = None):
    """
    Build a random forest-classifier model to predict some structure feature from compositional data.  Will return the model trained on all data, a confusion matrix calculated , and an average accuracy score. Also returns a label encoder object
    """
    df = pd.read_csv(structurestable)
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    
    s = StandardScaler()
    le = LabelEncoder()
    
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = le.fit_transform(df[targetcolumn].values)

    rfc = RandomForestClassifier(max_depth = md)
    acc = mean(cross_val_score(rfc, X, y))

    X_train, X_test, y_train, y_test = train_test_split(X,y)
    rfc.fit(X_train,y_train)
    y_predict = rfc.predict(X_test)
    cm = confusion_matrix(y_test, y_predict)
    
    cm = pd.DataFrame(cm, columns=le.classes_, index=le.classes_)

    rfc.fit(X, y)

    return rfc, cm, round(acc,2), le
开发者ID:rhsimplex,项目名称:matprojgeom,代码行数:28,代码来源:modelbuilder.py

示例4: test_multiclass_classifier_class_weight

def test_multiclass_classifier_class_weight():
    """tests multiclass with classweights for each class"""
    alpha = .1
    n_samples = 20
    tol = .00001
    max_iter = 50
    class_weight = {0: .45, 1: .55, 2: .75}
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)

    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=max_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              class_weight=class_weight)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = class_weight_[le.fit_transform(y)]

    coef1 = []
    intercept1 = []
    coef2 = []
    intercept2 = []
    for cl in classes:
        y_encoded = np.ones(n_samples)
        y_encoded[y != cl] = -1

        spweights1, spintercept1 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight)
        spweights2, spintercept2 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight,
                                              sparse=True)
        coef1.append(spweights1)
        intercept1.append(spintercept1)
        coef2.append(spweights2)
        intercept2.append(spintercept2)

    coef1 = np.vstack(coef1)
    intercept1 = np.array(intercept1)
    coef2 = np.vstack(coef2)
    intercept2 = np.array(intercept2)

    for i, cl in enumerate(classes):
        assert_array_almost_equal(clf1.coef_[i].ravel(),
                                  coef1[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1)

        assert_array_almost_equal(clf2.coef_[i].ravel(),
                                  coef2[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)
开发者ID:AlexisMignon,项目名称:scikit-learn,代码行数:60,代码来源:test_sag.py

示例5: process_one_cell

def process_one_cell(df_cell_train, df_cell_test):
    
    #Working on df_train
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 8).values
    df_cell_train = df_cell_train.loc[mask]
    
    #Working on df_test
    row_ids = df_cell_test.index
    
    #Feature engineering on x and y
    df_cell_train.loc[:,'x'] *= 500.0
    df_cell_train.loc[:,'y'] *= 1000.0
    df_cell_test.loc[:,'x'] *= 500.0
    df_cell_test.loc[:,'y'] *= 1000.0
    
    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    X_test = df_cell_test.values

    #Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=36, weights=calculate_distance, 
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3]) 
    
    return pred_labels, row_ids
开发者ID:kaustubh0mani,项目名称:Facebook-Predicting-Check-Ins,代码行数:30,代码来源:facebook.py

示例6: plot_model_decision_surface

def plot_model_decision_surface(clf, train_features, train_labels,
                                plot_step=0.02, cmap=plt.cm.RdYlBu,
                                markers=None, alphas=None, colors=None):
    
    if train_features.shape[1] != 2:
        raise ValueError("X_train should have exactly 2 columnns!")
    
    x_min, x_max = train_features[:, 0].min() - plot_step, train_features[:, 0].max() + plot_step
    y_min, y_max = train_features[:, 1].min() - plot_step, train_features[:, 1].max() + plot_step
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    clf_est = clone(clf)
    clf_est.fit(train_features,train_labels)
    if hasattr(clf_est, 'predict_proba'):
        Z = clf_est.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:,1]
    else:
        Z = clf_est.predict(np.c_[xx.ravel(), yy.ravel()])    
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=cmap)
    
    le = LabelEncoder()
    y_enc = le.fit_transform(train_labels)
    n_classes = len(le.classes_)
    plot_colors = ''.join(colors) if colors else [None] * n_classes
    label_names = le.classes_
    markers = markers if markers else [None] * n_classes
    alphas = alphas if alphas else [None] * n_classes
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y_enc == i)
        plt.scatter(train_features[idx, 0], train_features[idx, 1], c=color,
                    label=label_names[i], cmap=cmap, edgecolors='black', 
                    marker=markers[i], alpha=alphas[i])
    plt.legend()
    plt.show()
开发者ID:Zoery,项目名称:practical-machine-learning-with-python,代码行数:35,代码来源:model_evaluation_utils.py

示例7: main

def main():
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')

    enc = LabelEncoder()
    joined = pd.concat((train['Product_Info_2'],
                        test['Product_Info_2']), axis=0)
    enc.fit(joined)
    train['Product_Info_2'] = enc.transform(train['Product_Info_2'])
    test['Product_Info_2'] = enc.transform(test['Product_Info_2'])


    X_train = train.drop('Response', axis=1).values
    y_train = train['Response'].values
    X_test = test.values

    mdl = xgb.XGBRegressor(learning_rate=0.05,
                           n_estimators=200,
                           subsample=0.5,
                           max_depth=6,
                           silent=False)
    mdl.fit(X_train, y_train)

    preds = mdl.predict(X_test)
    preds = [min(max(1, int(round(pred))), 8) for pred in preds]

    sub = pd.DataFrame({'Id': test['Id'], 'Response': preds})
    sub.to_csv('submissions/xgb.csv', index=False)
开发者ID:xiaoyubai,项目名称:kaggle-prudential,代码行数:28,代码来源:base_mdl.py

示例8: test_vote_soft

def test_vote_soft():
    X,y,test_X,test_Y =get_test_data()

    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X,y,test_X,prefix="t")

    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X,y,test_X,prefix="t")

    probs = zip(*[item for p in [bow_probs,da_probs] for item in p])
    train_probs = probs[0]
    test_probs = probs[1]
    print(len(train_probs))
    for prob in train_probs:
        print(prob.shape)
        print(type(prob))
    #train_attr = reduce(lambda a,b:a+b,train_probs)
    test_attr = reduce(lambda a,b:a+b,test_probs)

    pred = test_attr.idxmax(1)
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)

    print(metrics.accuracy_score(test_Y,pred))
开发者ID:ZaydH,项目名称:recipe_cuisine_type_classifier,代码行数:28,代码来源:predict.py

示例9: test_hard_vote

def test_hard_vote():
    X,y,test_X,test_Y =get_test_data()

    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X,y,test_X,prefix="t")

    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X,y,test_X,prefix="t")

    probs = zip(*[item for p in [bow_probs,da_probs] for item in p])
    #train_probs = probs[0]
    test_probs = probs[1]
    print(len(test_probs))
    preds = [x.idxmax(1) for x in test_probs]
    pred = np.zeros(len(preds[0]),dtype=np.int8)
    print(len(pred))
    for i in range(len(preds[0])):
        votes = [p[i] for p in preds]
        print(votes)
        pred[i]= max(set(votes),key=votes.count)
        print(pred[i])
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)

    print(metrics.accuracy_score(test_Y,pred))

    """
开发者ID:ZaydH,项目名称:recipe_cuisine_type_classifier,代码行数:31,代码来源:predict.py

示例10: train

    def train(self):
        input_dir = get_config().get('Classification', 'TrainingInputPath')
        self.logger.info("Loading features")
        file_name = os.path.join(input_dir, 'labels.csv')
        labels = pd.read_csv(file_name, header=None).as_matrix()[:, 1]
        labels = map(itemgetter(1),
                     map(os.path.split,
                         map(os.path.dirname, labels)))
        label_encoder = LabelEncoder().fit(labels)
        labels_encoded = label_encoder.transform(labels)
        num_classes = len(label_encoder.classes_)

        file_name = os.path.join(input_dir, 'reps.csv')
        features = pd.read_csv(file_name, header=None).as_matrix()

        self.logger.info("Training for {} classes.".format(num_classes))

        clf = SVC(C=1, kernel='linear', probability=True)

        # TODO: Try a previous LDA
        try:
            lda = int(get_config().get("Classification", "LDADim"))
        except ValueError:
            lda = None
        if lda:
            clf_final = clf
            clf = Pipeline([('lda', LDA(n_components=lda)),
                            ('clf', clf_final)])
        clf.fit(features, labels_encoded)

        file_name = os.path.join(input_dir, 'classifier.pkl')
        self.logger.info("Saving classifier to '{}'".format(file_name))
        with open(file_name, 'w') as f:
            pickle.dump((label_encoder, clf), f)
开发者ID:albertorepo,项目名称:jarvis,代码行数:34,代码来源:training.py

示例11: loadData

def loadData(path="../data/",k=5,log='add',pca_n=0,SEED=34):
	from pandas import DataFrame, read_csv
	from numpy import log as ln
	from sklearn.cross_validation import KFold
	from sklearn.preprocessing import LabelEncoder
	from sklearn.preprocessing import StandardScaler
	train = read_csv(path+"train.csv")
	test = read_csv(path+"test.csv")
	id = test.id
	target = train.target
	encoder = LabelEncoder()
	target_nnet = encoder.fit_transform(target).astype('int32')
	feat_names = [x for x in train.columns if x.startswith('feat')]
	train = train[feat_names].astype(float)
	test = test[feat_names]
	if log == 'add':
		for v in train.columns:
			train[v+'_log'] = ln(train[v]+1)
			test[v+'_log'] = ln(test[v]+1)
	elif log == 'replace':
		for v in train.columns:
			train[v] = ln(train[v]+1)
			test[v] = ln(test[v]+1)      
	if pca_n > 0:
		from sklearn.decomposition import PCA
		pca = PCA(pca_n)
		train = pca.fit_transform(train)
		test = pca.transform(test)
	scaler = StandardScaler()
	scaler.fit(train)
	train = DataFrame(scaler.transform(train),columns=['feat_'+str(x) for x in range(train.shape[1])])
	test = DataFrame(scaler.transform(test),columns=['feat_'+str(x) for x in range(train.shape[1])])
	cv = KFold(len(train), n_folds=k, shuffle=True, random_state=SEED)
	return train, test, target, target_nnet, id, cv, encoder
开发者ID:dmcgarry,项目名称:kaggle_otto_group,代码行数:34,代码来源:helperFunctions.py

示例12: to_numeric

	def to_numeric(self, columns=[]):
		le = LabelEncoder()
		for i, c in enumerate(columns):
			le.fit(self.M[:, c])
			self.M[:, c] = le.transform(self.M[:, c])
		self.M = self.M.astype(np.float)
		return self
开发者ID:makgyver,项目名称:pyros,代码行数:7,代码来源:binarizer.py

示例13: prepare_labels

def prepare_labels(y):
    # From here: https://www.kaggle.com/pestipeti/keras-cnn-starter
    values = np.array(y)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)

    return integer_encoded, label_encoder
开发者ID:melgor,项目名称:kaggle-whale-tail,代码行数:7,代码来源:training.py

示例14: labele

def labele(tbl,cols='all'):
    from sklearn.preprocessing import LabelEncoder as LE
    if cols=='all':cols=tbl.columns
    le=LE()
    for ac in tbl.columns:
        tbl.loc[:,ac]=le.fit(tbl[ac]).transform(tbl[ac]) #might have to return le
    return tbl
开发者ID:DistrictDataLabs,项目名称:transportation-project-1,代码行数:7,代码来源:analysis.py

示例15: ml_target

def ml_target(dataset):
    """ Takes a dataset and retuns the target in a numpy.array ready for
    machine learning.
    Mainly transforms non-numerical variables(columns) to numbers.

    Parameters
    ----------
    copper.Dataset

    Returns
    -------
    (label_encoder, np.array)

    Notes
    -----
    If dataset has more than one variable with role=TARGET then the first one
    is selected.
    """
    cols = dataset.filter_cols(role=dataset.TARGET)
    assert len(cols) > 0, 'No target variables on Dataset'
    if len(cols) > 1:
        import warnings
        warnings.warn("Dataset contains more than one target, %s was choosed" % cols[0])

    if dataset[cols[0]].dtype in (np.int, np.float):
        return None, dataset[cols[0]].values
    else:
        le = LabelEncoder()
        encoded = le.fit_transform(dataset[cols[0]].values)
        return le, encoded
开发者ID:GeorgeMcIntire,项目名称:copper,代码行数:30,代码来源:transform.py


注:本文中的sklearn.preprocessing.LabelEncoder类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。