

Python LabelEncoder.fit Method Code Examples

This article collects typical usage examples of the sklearn.preprocessing.LabelEncoder.fit method in Python. If you have been wondering how LabelEncoder.fit is used in practice, how to call it, or what real-world examples look like, the curated code examples below should help. You can also explore further usage examples of the sklearn.preprocessing.LabelEncoder class that this method belongs to.


The following shows 15 code examples of the LabelEncoder.fit method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
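Before looking at the examples, here is a minimal, self-contained sketch of the fit/transform pattern that most of them follow; the label values below are invented purely for illustration.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(["paris", "tokyo", "paris", "amsterdam"])   # learns the sorted set of unique labels
print(le.classes_)                                 # ['amsterdam' 'paris' 'tokyo']
print(le.transform(["tokyo", "paris"]))            # [2 1]
print(le.inverse_transform([0, 2]))                # ['amsterdam' 'tokyo']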

Example 1: preprocess

# Required import: from sklearn.preprocessing import LabelEncoder [as alias]
# Or: from sklearn.preprocessing.LabelEncoder import fit [as alias]
def preprocess():
    train = pd.read_csv("../data/train.csv")
    test = pd.read_csv("../data/test.csv")

    train['Date'] = pd.to_datetime(pd.Series(train['Original_Quote_Date']))
    train = train.drop('Original_Quote_Date', axis=1)

    test['Date'] = pd.to_datetime(pd.Series(test['Original_Quote_Date']))
    test = test.drop('Original_Quote_Date', axis=1)

    train['Year'] = train['Date'].apply(lambda x: x.year)
    train['Month'] = train['Date'].apply(lambda x: x.month)
    train['weekday'] = train['Date'].apply(lambda x: x.dayofweek)

    test['Year'] = test['Date'].apply(lambda x: x.year)
    test['Month'] = test['Date'].apply(lambda x: x.month)
    test['weekday'] = test['Date'].apply(lambda x: x.dayofweek)

    train = train.drop('Date', axis=1)
    test = test.drop('Date', axis=1)

    for f in train.columns:
        if train[f].dtype == 'object':
            lbl = LabelEncoder()
            # note: fit on the combined train/test values; watch how missing values get labeled
            lbl.fit(list(train[f].values) + list(test[f].values))
            train[f] = lbl.transform(list(train[f].values))
            test[f] = lbl.transform(list(test[f].values))

    train = train.fillna(-1)
    test = test.fillna(-1)

    return train, test
Developer: EricTing, Project: HomesiteInsur, Lines: 35, Source: homesite.py

Example 2: test_hard_vote

# Required import: from sklearn.preprocessing import LabelEncoder [as alias]
# Or: from sklearn.preprocessing.LabelEncoder import fit [as alias]
def test_hard_vote():
    X, y, test_X, test_Y = get_test_data()

    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X,y,test_X,prefix="t")

    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X,y,test_X,prefix="t")

    # zip returns an iterator in Python 3, so materialize it before indexing
    probs = list(zip(*[item for p in [bow_probs,da_probs] for item in p]))
    #train_probs = probs[0]
    test_probs = probs[1]
    print(len(test_probs))
    preds = [x.idxmax(1) for x in test_probs]
    pred = np.zeros(len(preds[0]),dtype=np.int8)
    print(len(pred))
    for i in range(len(preds[0])):
        votes = [p[i] for p in preds]
        print(votes)
        pred[i]= max(set(votes),key=votes.count)
        print(pred[i])
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)

    print(metrics.accuracy_score(test_Y,pred))

    """
Developer: ZaydH, Project: recipe_cuisine_type_classifier, Lines: 33, Source: predict.py

Example 3: main

# Required import: from sklearn.preprocessing import LabelEncoder [as alias]
# Or: from sklearn.preprocessing.LabelEncoder import fit [as alias]
def main():
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')

    enc = LabelEncoder()
    joined = pd.concat((train['Product_Info_2'],
                        test['Product_Info_2']), axis=0)
    enc.fit(joined)
    train['Product_Info_2'] = enc.transform(train['Product_Info_2'])
    test['Product_Info_2'] = enc.transform(test['Product_Info_2'])


    X_train = train.drop('Response', axis=1).values
    y_train = train['Response'].values
    X_test = test.values

    mdl = xgb.XGBRegressor(learning_rate=0.05,
                           n_estimators=200,
                           subsample=0.5,
                           max_depth=6,
                           silent=False)
    mdl.fit(X_train, y_train)

    preds = mdl.predict(X_test)
    preds = [min(max(1, int(round(pred))), 8) for pred in preds]

    sub = pd.DataFrame({'Id': test['Id'], 'Response': preds})
    sub.to_csv('submissions/xgb.csv', index=False)
Developer: xiaoyubai, Project: kaggle-prudential, Lines: 30, Source: base_mdl.py

Example 4: clean_country

# Required import: from sklearn.preprocessing import LabelEncoder [as alias]
# Or: from sklearn.preprocessing.LabelEncoder import fit [as alias]
def clean_country(df:pd.DataFrame):
    df['country'] = df['country'].map(lambda x: str(x).upper())

    unique = np.unique(df.country.values)
    max_num_countries = len(unique)
    print("Unique Countries : ", unique)
    print("Num Unique Countries : ", max_num_countries)
    print()

    countries = df['country'].values

    if os.path.exists('data/countries.pkl'):
        with open('data/countries.pkl', 'rb') as f:
            encoder = pickle.load(f)

    else:
        encoder = LabelEncoder()
        encoder.fit(countries)

        with open('data/countries.pkl', 'wb') as f:
            pickle.dump(encoder, f)

    # encode the values
    countries = encoder.transform(countries)

    return df, countries, max_num_countries
Developer: titu1994, Project: Python-Work, Lines: 28, Source: data_utils.py

Example 5: to_numeric

# Required import: from sklearn.preprocessing import LabelEncoder [as alias]
# Or: from sklearn.preprocessing.LabelEncoder import fit [as alias]
	def to_numeric(self, columns=()):  # immutable default avoids the shared mutable-default pitfall
		le = LabelEncoder()
		for i, c in enumerate(columns):
			le.fit(self.M[:, c])
			self.M[:, c] = le.transform(self.M[:, c])
		self.M = self.M.astype(float)  # np.float was removed from NumPy; use the builtin float
		return self
Developer: makgyver, Project: pyros, Lines: 9, Source: binarizer.py

Example 6: fit

# Required import: from sklearn.preprocessing import LabelEncoder [as alias]
# Or: from sklearn.preprocessing.LabelEncoder import fit [as alias]
    def fit(self, X, y=None):
        if self.categorical: # Need to one hot encode labels
            label_encoder = LabelEncoder()
            one_hot = OneHotEncoder()
            label_encoder.fit(y)
            one_hot.fit([[v] for v in label_encoder.transform(y)])
            self.stack_encoder = lambda x: one_hot.transform([[v] for v in label_encoder.transform(x)]).toarray()

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-self.hold_out_percent)

        predictions = []
        for (name, clf) in self.base_classifiers:
            print("Ensemble currently fitting:",name)
            clf.fit(X_train,y_train)
            if self.categorical:
                predictions.append(self.stack_encoder(clf.predict(X_test)))
            else:
                predictions.append(list(map(lambda x:[x], clf.predict(X_test))))

        predictions = np.hstack(predictions)

        print("Fitting stack classifier")
        self.stack_classifier[1].fit(predictions,y_test)

        if self.refit_base:
            for (name, clf) in self.base_classifiers:
                print("Ensemble currently refitting:",name)
                clf.fit(X,y)

        return self
Developer: dnola, Project: 145_whats_cooking, Lines: 32, Source: pipeline_helpers.py

Example 7: OutputLabelColumn

# Required import: from sklearn.preprocessing import LabelEncoder [as alias]
# Or: from sklearn.preprocessing.LabelEncoder import fit [as alias]
class OutputLabelColumn(BaseEstimator, TransformerMixin):
    '''
    Take a string or key categorical column and transform it to integer labels.
    '''

    def __init__(self):
        '''
        Set up the internal transformation.
        '''
        self._labeler = LabelEncoder()

    def fit(self, X, y=None):
        '''
        Fit the label and encoding
        '''
        handle_none = list(map(str, X))
        self._labeler.fit(handle_none)
        return self

    def transform(self, X):
        '''
        Transform a column of data into integer labels.

        Parameters
        ----------
        X : pandas series or numpy array
        '''
        handle_none = list(map(str, X))
        return self._labeler.transform(handle_none).astype(np.int32)
Developer: wballard, Project: tableclassifier, Lines: 31, Source: table_model.py

Example 8: data_processing

# Required import: from sklearn.preprocessing import LabelEncoder [as alias]
# Or: from sklearn.preprocessing.LabelEncoder import fit [as alias]
def data_processing(train,test,features):
    # train['StreetNo'] = train['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    # test['StreetNo'] = test['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    # train['Address'] = train['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    # test['Address'] = test['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    # train['hour'] = train['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    # test['hour'] = test['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    # train['dark'] = train['Dates'].apply(lambda x: 1 if (len(x) > 4 and int(x[11:13]) >= 18 and int(x[11:13]) < 6) else 0)
    # test['dark'] = test['Dates'].apply(lambda x: 1 if (len(x) > 4 and int(x[11:13]) >= 18 and int(x[11:13]) < 6) else 0)
    # features += ['hour','dark','StreetNo']

    print("Filling NAs")
    # print(train.mode())
    train = train.fillna(train.median().iloc[0])
    test = test.fillna(test.median().iloc[0])
    print("Label Encoder")
    le=LabelEncoder()
    for col in features:
        le.fit(list(train[col])+list(test[col]))
        train[col]=le.transform(train[col])
        test[col]=le.transform(test[col])

    le.fit(list(train[target]))
    train[target]=le.transform(train[target])

    print("Standard Scalaer")
    scaler=StandardScaler()
    for col in features:
        scaler.fit(list(train[col]))
        train[col]=scaler.transform(train[col])
        test[col]=scaler.transform(test[col])

    return train,test,features
Developer: ssdf93, Project: kaggle, Lines: 35, Source: xgboost_native.py

Example 9: encode_categorical_data

# Required import: from sklearn.preprocessing import LabelEncoder [as alias]
# Or: from sklearn.preprocessing.LabelEncoder import fit [as alias]
def encode_categorical_data(train, test, fill_missing = False):
    '''
    Encoding is an extremely slow process;
    the encoder is fit on the combined train and test categorical values below.
    '''
    le = LabelEncoder()

    ## this step creates separate train and test dataFrame
    if fill_missing:
        train = train.fillna(value='missing')
        test = test.fillna(value='missing')

    counter = 0
    start_time = time.time()
    for col, dtype in zip(train.columns, train.dtypes):
        if dtype == 'object':
            le.fit(pd.concat([train[col], test[col]], axis=0))
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])

        counter += 1
        if counter % 20 == 0:
            print('{} out of {} is processed using {} seconds...'.format(str(counter), str(train.shape[1]), round((time.time() - start_time), 0)))

    end_time = time.time()
    print('encoding process takes', round(end_time - start_time), 'seconds')

    ## train and test are newly created
    return train, test
Developer: mengyx-work, Project: xgboost_hyperopt, Lines: 31, Source: data_munge.py

Example 10: EncodingText

# Required import: from sklearn.preprocessing import LabelEncoder [as alias]
# Or: from sklearn.preprocessing.LabelEncoder import fit [as alias]
class EncodingText():
    """
    LabelEncoder.transform的主要作用是把语料库的一个个单词转换成vocabulary对应的数字
    LabelEncoder.fit的主要作用是去重,并把vocabulary里的单词映射到数字。
    """
    def __init__(self,vocabulary):
        from sklearn.preprocessing import LabelEncoder
        self.le=LabelEncoder()
        self.vocabulary=vocabulary

    def fit(self,X,y=None):
        self.le.fit(self.vocabulary)
        return self

    def transform(self,X):
        # transform each document's tokens into their vocabulary indices
        return [self.le.transform(x.split()) for x in X]
        #return [self.getSparseM(x) for x in X]

    def getSparseM(self,x):
        from scipy.sparse import coo_matrix
        import numpy as np
        sent=x.split()
        ind=self.le.transform(sent)
        a=coo_matrix((np.ones([len(sent)]),(ind,range(len(sent)))),shape=(len(self.vocabulary),len(sent)))
        return a
Developer: iron-fe, Project: stanford_Rnn, Lines: 29, Source: transformations.py

Example 11: process_data

# Required import: from sklearn.preprocessing import LabelEncoder [as alias]
# Or: from sklearn.preprocessing.LabelEncoder import fit [as alias]
def process_data(train,test,features,features_non_numeric):
    train['StreetNo'] = train['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    test['StreetNo'] = test['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    train['Address'] = train['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    test['Address'] = test['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    train['hour'] = train['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    test['hour'] = test['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    # "dark" flags nighttime records; the hours must be compared as integers, and the 18:00-06:00 window needs "or"
    train['dark'] = train['Dates'].apply(lambda x: 1 if (len(x) > 4 and (int(x[11:13]) >= 18 or int(x[11:13]) < 6)) else 0)
    test['dark'] = test['Dates'].apply(lambda x: 1 if (len(x) > 4 and (int(x[11:13]) >= 18 or int(x[11:13]) < 6)) else 0)
    features += ['hour','dark','StreetNo']

    print "Filling N/As: " + str(datetime.datetime.now())
    train = train.fillna(train.mode().iloc[0])
    test = test.fillna(test.mode().iloc[0])
    # Pre-processing non-numberic values
    print "Label Encoder: " + str(datetime.datetime.now())
    le = LabelEncoder()
    for col in features:
        # print col
        le.fit(list(train[col])+list(test[col]))
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])
    # Xgb requires goal to be numeric...
    le.fit(list(train[goal]))
    train[goal] = le.transform(train[goal])

    # Neural networks and stochastic gradient descent are sensitive to feature scaling, so scaling the data is highly recommended.
    print("Standard Scaler: " + str(datetime.datetime.now()))
    scaler = StandardScaler()
    for col in set(features): # - set(features_non_numeric):
        # print col
        # note: recent scikit-learn releases expect 2D input here, e.g. train[[col]]
        scaler.fit(list(train[col])+list(test[col]))
        train[col] = scaler.transform(list(train[col]))
        test[col] = scaler.transform(list(test[col]))
    return (train,test,features)
Developer: AdityaRon, Project: kaggle-for-fun, Lines: 37, Source: sf-crime-classification-xgb-native.py

Example 12: main

# Required import: from sklearn.preprocessing import LabelEncoder [as alias]
# Or: from sklearn.preprocessing.LabelEncoder import fit [as alias]
def main():
    org_type = pd.read_csv('../input/application_train.csv', usecols=['ORGANIZATION_TYPE'], nrows=None)
    print(org_type.shape)
    print(org_type.nunique())

    print(org_type.head())

    lbl = LabelEncoder()
    # LabelEncoder expects a 1D array, so pass the column rather than the single-column DataFrame
    lbl.fit(org_type['ORGANIZATION_TYPE'])
    org_type_label = lbl.transform(org_type['ORGANIZATION_TYPE'])

    print(type(org_type_label))
    print(org_type_label.shape)
    print(org_type_label[:5])

    model = Sequential([
        Dense(32, input_shape=(784,)),  # Dense(32, units=784) would conflict with the positional units argument
        Activation('relu'),
        Dense(10),
        Activation('softmax'),
    ])

    

    model.compile(loss='mean_absolute_error', optimizer='adam')
Developer: aptx4869yuyang2017, Project: Home_Credit_0729, Lines: 27, Source: org_type.py

Example 13: main

# Required import: from sklearn.preprocessing import LabelEncoder [as alias]
# Or: from sklearn.preprocessing.LabelEncoder import fit [as alias]
    def main(self, args=None):
        self.data = self.parser.parse_args(args=args)
        assert not self.data.update_klass
        best = load_json(self.data.params_fname)
        if isinstance(best, list):
            best = best[0]
        best = clean_params(best)
        print(self.data.params_fname, self.data.training_set)
        corpus, labels = read_data_labels(self.data.training_set)
        le = LabelEncoder()
        le.fit(labels)
        y = le.transform(labels)
        t = TextModel(corpus, **best)
        X = [t[x] for x in corpus]
        hy = [None for x in y]
        for tr, ts in KFold(n_splits=self.data.kratio,
                            shuffle=True, random_state=self.data.seed).split(X):
            c = SVC(model=t)
            c.fit([X[x] for x in tr], [y[x] for x in tr])
            _ = c.decision_function([X[x] for x in ts])
            for k, v in zip(ts, _):
                hy[k] = v

        i = 0
        with open(self.get_output(), 'w') as fpt:
            for tweet in tweet_iterator(self.data.training_set):
                tweet['decision_function'] = hy[i].tolist()
                i += 1
                fpt.write(json.dumps(tweet)+"\n")
        return hy
Developer: INGEOTEC, Project: b4msa, Lines: 31, Source: command_line.py

Example 14: encode_dataset

# Required import: from sklearn.preprocessing import LabelEncoder [as alias]
# Or: from sklearn.preprocessing.LabelEncoder import fit [as alias]
def encode_dataset(train,test,meta,target_model='xgb'):
    y_train = train[meta['target']]
    train = train.drop([meta['target']],axis=1)
    assert train.shape[1] == test.shape[1]
    for i in range(train.shape[1]):
        assert train.columns[i] == test.columns[i]
    train_obs = len(train)
    #
    all_data = pd.concat([train,test],axis=0)
    for i,f in enumerate(meta['cols'].keys()):
        print(i,f,meta['cols'][f])
        if meta['cols'][f] == 'CAT':
            all_data[f] = all_data[f].astype('str')
            encoder = LabelEncoder()
            encoder.fit(np.unique(all_data[f].unique().tolist()))
            if target_model == 'xgb':
                all_data[f] = encoder.transform(all_data[f])
            else:
                all_data[f] = encoder.transform(all_data[f]).astype(int)
        elif meta['cols'][f] == 'NUM':
            all_data[f] = all_data[f].fillna(-1)
        elif meta['cols'][f] == 'REM':
            all_data = all_data.drop(f,axis=1)
        else:
            raise Exception(str(meta['cols'][f])+":unknown mapping")
    assert train_obs == len(y_train)
    return all_data , y_train
Developer: gtesei, Project: fast-furious, Lines: 29, Source: base_lightGBM.py

Example 15: __init__

# Required import: from sklearn.preprocessing import LabelEncoder [as alias]
# Or: from sklearn.preprocessing.LabelEncoder import fit [as alias]
    def __init__(self,filename='train.json'):
        self.filename_tr=filename

        # Read JSON data using pandas
        # columns are: id, cuisine, ingredients
        data  = pd.read_json(filename)
        
        # Label Encoders
        labels = LabelEncoder()
        labels.fit(data.cuisine)
        self.classes = labels.classes_
        self.class_encode = labels.transform
        self.class_decode = labels.inverse_transform        

        # Get numerical labels for ytrain 
        y_train = labels.transform(data.cuisine)

        # Vectorization of ingredients using WordNet lemmatization & TF-IDF
        data['ingredients_clean_string'] = [' , '.join(z).strip() for z in data['ingredients']]  
        data['ingredients_string'] = [' '.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line)) for line in lists]).strip() for lists in data['ingredients']]

        vectorizer  = TfidfVectorizer(stop_words='english', ngram_range=(1,1), max_df=0.57, analyzer='word', token_pattern=r'\w+')
        x_train     = vectorizer.fit_transform(data.ingredients_string).todense()
        ingred_dict = vectorizer.vocabulary_
        self.vectorizer = vectorizer

        self.y_train = y_train
        self.x_train = x_train

        self.tsdata  = pd.DataFrame()
Developer: justiceamoh, Project: whatscooking, Lines: 32, Source: DataInterface.py


Note: The sklearn.preprocessing.LabelEncoder.fit examples in this article were collected by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors. Please refer to the corresponding project's License before distributing or using the code, and do not reproduce this article without permission.