本文整理汇总了Python中sklearn.preprocessing.LabelEncoder.fit方法的典型用法代码示例。如果您正苦于以下问题：Python LabelEncoder.fit方法的具体用法？Python LabelEncoder.fit怎么用？Python LabelEncoder.fit使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.preprocessing.LabelEncoder的用法示例。
在下文中一共展示了LabelEncoder.fit方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: preprocess
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit [as 别名]
def preprocess():
    """Load the train/test CSV files and return them as encoded frames.

    For both frames the ``Original_Quote_Date`` column is parsed and replaced
    by ``Year``, ``Month`` and ``weekday`` columns.  Every column that has
    object dtype in the training frame is then label-encoded in both frames
    with an encoder fitted on the train and test values together, and all
    remaining missing values are replaced by ``-1``.

    Returns:
        tuple: ``(train, test)`` data frames.
    """
    date_parts = (
        ('Year', lambda stamp: stamp.year),
        ('Month', lambda stamp: stamp.month),
        ('weekday', lambda stamp: stamp.dayofweek),
    )

    def _expand_quote_date(frame):
        # Swap the raw quote date for its calendar components.
        frame['Date'] = pd.to_datetime(pd.Series(frame['Original_Quote_Date']))
        frame = frame.drop('Original_Quote_Date', axis=1)
        for column, extract in date_parts:
            frame[column] = frame['Date'].apply(extract)
        return frame.drop('Date', axis=1)

    train = pd.read_csv("../data/train.csv")
    test = pd.read_csv("../data/test.csv")
    train = _expand_quote_date(train)
    test = _expand_quote_date(test)

    text_columns = [name for name in train.columns if train[name].dtype == 'object']
    for name in text_columns:
        # NOTE(review): missing values are still present at this point (they
        # are only filled below), so they reach the encoder as-is -- confirm
        # that is the intended labelling of missing entries.
        train_values = list(train[name].values)
        test_values = list(test[name].values)
        encoder = LabelEncoder()
        encoder.fit(train_values + test_values)
        train[name] = encoder.transform(train_values)
        test[name] = encoder.transform(test_values)

    return train.fillna(-1), test.fillna(-1)
示例2: test_hard_vote
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit [as 别名]
def test_hard_vote():
    """Combine two classifiers by majority vote and print the accuracy.

    Probabilities are requested from ``BagOfWordsClassifier`` and
    ``DirectAttributeClassifier``, each test-side probability table is reduced
    to its most likely column per row, the per-row votes are combined by
    majority, and the winning codes are mapped back to labels with a
    ``LabelEncoder`` fitted on ``y`` before scoring against ``test_Y``.
    """
    X, y, test_X, test_Y = get_test_data()
    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X, y, test_X, prefix="t")
    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X, y, test_X, prefix="t")
    # zip() returns a one-shot iterator on Python 3 and cannot be indexed, so
    # the transposed result is materialised before element 1 is picked out.
    # NOTE(review): presumably every item is a (train_probs, test_probs) pair;
    # probs[0] is not used here -- confirm against get_proba.
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    test_probs = probs[1]
    print(len(test_probs))
    # Most likely column label for every row of each probability table.
    preds = [x.idxmax(1) for x in test_probs]
    # NOTE(review): int8 only holds codes up to 127 -- confirm the class count.
    pred = np.zeros(len(preds[0]), dtype=np.int8)
    print(len(pred))
    for i in range(len(preds[0])):
        votes = [p[i] for p in preds]
        print(votes)
        # Majority vote; ties are resolved by max() over the set of votes.
        pred[i] = max(set(votes), key=votes.count)
        print(pred[i])
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)
    print(metrics.accuracy_score(test_Y, pred))
"""
示例3: main
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit [as 别名]
def main():
    """Fit an XGBoost regressor on ``data/train.csv`` and write a submission.

    ``Product_Info_2`` is label-encoded with an encoder fitted on the train
    and test values together.  Predictions for ``data/test.csv`` are rounded
    to integers, clamped to the range 1..8 and written, together with the
    test ``Id`` column, to ``submissions/xgb.csv``.
    """
    train_frame = pd.read_csv('data/train.csv')
    test_frame = pd.read_csv('data/test.csv')

    column = 'Product_Info_2'
    encoder = LabelEncoder()
    encoder.fit(pd.concat((train_frame[column], test_frame[column]), axis=0))
    for frame in (train_frame, test_frame):
        frame[column] = encoder.transform(frame[column])

    features = train_frame.drop('Response', axis=1).values
    target = train_frame['Response'].values
    unseen = test_frame.values

    settings = {
        'learning_rate': 0.05,
        'n_estimators': 200,
        'subsample': 0.5,
        'max_depth': 6,
        'silent': False,
    }
    model = xgb.XGBRegressor(**settings)
    model.fit(features, target)

    responses = []
    for raw in model.predict(unseen):
        # Round to the nearest integer, then clamp into the 1..8 range.
        rounded = int(round(raw))
        responses.append(min(max(1, rounded), 8))

    submission = pd.DataFrame({'Id': test_frame['Id'], 'Response': responses})
    submission.to_csv('submissions/xgb.csv', index=False)
示例4: clean_country
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit [as 别名]
def clean_country(df: pd.DataFrame):
    """Upper-case the ``country`` column and label-encode it.

    The encoder is read from ``data/countries.pkl`` when that file exists;
    otherwise a new encoder is fitted on the current values and written to
    that path.

    Args:
        df: Frame with a ``country`` column; the column is overwritten in
            place with its upper-cased string form.

    Returns:
        tuple: ``(df, encoded_countries, number_of_unique_countries)``.
    """
    cache_path = 'data/countries.pkl'

    df['country'] = df['country'].map(lambda raw: str(raw).upper())
    labels = df['country'].values
    distinct = np.unique(labels)
    max_num_countries = len(distinct)
    print("Unique Countries : ", distinct)
    print("Num Unique Countries : ", max_num_countries)
    print()

    if not os.path.exists(cache_path):
        encoder = LabelEncoder()
        encoder.fit(labels)
        with open(cache_path, 'wb') as sink:
            pickle.dump(encoder, sink)
    else:
        # NOTE(review): pickle.load executes whatever the file contains, so
        # this path must never point at untrusted data.  A cached encoder is
        # also not refitted, so labels it has not seen fail in transform().
        with open(cache_path, 'rb') as source:
            encoder = pickle.load(source)

    # encode the values
    return df, encoder.transform(labels), max_num_countries
示例5: to_numeric
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit [as 别名]
def to_numeric(self, columns=()):
    """Label-encode the given columns of ``self.M`` and cast it to float.

    Args:
        columns: Iterable of column indices of ``self.M`` whose values are
            replaced by integer labels.  Defaults to no columns.

    Returns:
        ``self``, so that calls can be chained.
    """
    le = LabelEncoder()
    for c in columns:
        # The encoder is refitted for each column, so codes are per-column.
        le.fit(self.M[:, c])
        self.M[:, c] = le.transform(self.M[:, c])
    # ``np.float`` was only an alias of the builtin ``float`` and was removed
    # in NumPy 1.24; the builtin gives the same float64 result.
    self.M = self.M.astype(float)
    return self
示例6: fit
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit [as 别名]
def fit(self, X, y=None):
    """Fit the base classifiers and the stacking classifier.

    The data is split with ``test_size=1 - self.hold_out_percent``.  Every
    base classifier is fitted on the first part and predicts the second; the
    predictions are stacked column-wise and used, with the second part's
    targets, to fit ``self.stack_classifier[1]``.  When ``self.categorical``
    is set, predictions are one-hot encoded first and the encoding function
    is kept as ``self.stack_encoder``.  When ``self.refit_base`` is set, the
    base classifiers are finally refitted on all of ``X`` and ``y``.

    Returns:
        ``self``.
    """
    if self.categorical:  # Need to one hot encode labels
        label_encoder = LabelEncoder()
        one_hot = OneHotEncoder()
        label_encoder.fit(y)
        one_hot.fit([[code] for code in label_encoder.transform(y)])

        def _one_hot_labels(labels):
            # labels -> integer codes -> dense one-hot rows
            codes = label_encoder.transform(labels)
            return one_hot.transform([[code] for code in codes]).toarray()

        self.stack_encoder = _one_hot_labels

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1 - self.hold_out_percent)

    predictions = []
    for name, clf in self.base_classifiers:
        print("Ensemble currently fitting:", name)
        clf.fit(X_train, y_train)
        held_out = clf.predict(X_test)
        if self.categorical:
            block = self.stack_encoder(held_out)
        else:
            # One single-value row per prediction so hstack builds columns.
            block = [[value] for value in held_out]
        predictions.append(block)
    predictions = np.hstack(predictions)

    print("Fitting stack classifier")
    self.stack_classifier[1].fit(predictions, y_test)

    if self.refit_base:
        for name, clf in self.base_classifiers:
            print("Ensemble currently refitting:", name)
            clf.fit(X, y)
    return self
示例7: OutputLabelColumn
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit [as 别名]
class OutputLabelColumn(BaseEstimator, TransformerMixin):
    '''
    Turn a string or key categorical column into integer labels.
    '''

    def __init__(self):
        '''
        Create the label encoder that performs the mapping.
        '''
        self._labeler = LabelEncoder()

    def fit(self, X, y=None):
        '''
        Learn the label mapping from the string form of every value in X.

        Values are passed through str() first, so None becomes 'None'.
        Returns self.
        '''
        self._labeler.fit([str(value) for value in X])
        return self

    def transform(self, X):
        '''
        Map a column of data to its int32 label codes.

        Parameters
        ----------
        X : pandas series or numpy array
        '''
        as_text = [str(value) for value in X]
        codes = self._labeler.transform(as_text)
        return codes.astype(np.int32)
示例8: data_processing
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit [as 别名]
def data_processing(train, test, features):
    """Fill missing values, label-encode and standardise the feature columns.

    Args:
        train: Training frame; must contain every column in ``features`` and
            the column named by the module-level ``target``.
        test: Test frame; must contain every column in ``features``.
        features: Names of the columns to encode and scale.

    Returns:
        tuple: ``(train, test, features)``.

    NOTE(review): ``target`` is not a parameter; it is read from the
    enclosing module scope.
    """
    # NOTE: the Address/Dates derived features (hour, dark, StreetNo) were
    # commented out in the original version and are not computed here.
    print("Filling NAs")
    # Fill each numeric column with its own median.  The earlier
    # ``median().iloc[0]`` reduced this to one scalar (the first column's
    # median) applied to every column, and fails on non-numeric columns in
    # recent pandas.
    # NOTE(review): missing values in non-numeric columns are left as they are.
    train = train.fillna(train.median(numeric_only=True))
    test = test.fillna(test.median(numeric_only=True))
    print("Label Encoder")
    le = LabelEncoder()
    for col in features:
        # Fit on both frames so values that occur only in test get a code.
        le.fit(list(train[col]) + list(test[col]))
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])
    le.fit(list(train[target]))
    train[target] = le.transform(train[target])
    print("Standard Scalaer")
    scaler = StandardScaler()
    for col in features:
        # StandardScaler requires 2-D input of shape (n_samples, n_features);
        # a single column is passed as (n, 1) and flattened again afterwards.
        scaler.fit(train[[col]].values)
        train[col] = scaler.transform(train[[col]].values).ravel()
        test[col] = scaler.transform(test[[col]].values).ravel()
    return train, test, features
示例9: encode_categorical_data
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit [as 别名]
def encode_categorical_data(train, test, fill_missing=False):
    '''
    Label-encode every column that has object dtype in train, in both frames.

    The encoder is refitted per column on the concatenation of the train and
    test values, so labels that occur only in test are still known.

    Args:
        train: Training frame.
        test: Test frame with the same object columns as train.
        fill_missing: When true, missing values in both frames are first
            replaced by the string 'missing'.

    Returns:
        tuple: (train, test).  With fill_missing the frames are new objects
        created by fillna; otherwise the inputs are modified in place and
        returned.
    '''
    le = LabelEncoder()
    if fill_missing:
        # fillna returns new frames, so the caller's inputs stay untouched.
        train = train.fillna(value='missing')
        test = test.fillna(value='missing')
    start_time = time.time()
    # NOTE(review): the counter is taken to count every column, matching the
    # "out of train.shape[1]" progress message -- confirm.
    for counter, (col, dtype) in enumerate(zip(train.columns, train.dtypes), 1):
        if dtype == 'object':
            le.fit(pd.concat([train[col], test[col]], axis=0))
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])
        if counter % 20 == 0:
            print('{} out of {} is processed using {} seconds...'.format(str(counter), str(train.shape[1]), round((time.time() - start_time), 0)))
    end_time = time.time()
    print('encoding process takes ', round((end_time - start_time)), 'seconds')
    return train, test
示例10: EncodingText
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit [as 别名]
class EncodingText:
    """
    Encode whitespace-separated sentences as integer codes of a vocabulary.

    LabelEncoder.fit de-duplicates the vocabulary and maps each word to an
    integer; LabelEncoder.transform turns the words of a corpus into the
    integers of the fitted vocabulary.
    """

    def __init__(self, vocabulary):
        """Store the vocabulary and create the (not yet fitted) encoder."""
        from sklearn.preprocessing import LabelEncoder
        self.le = LabelEncoder()
        self.vocabulary = vocabulary

    def fit(self, X, y=None):
        """Fit the encoder on the stored vocabulary; X and y are ignored."""
        self.le.fit(self.vocabulary)
        return self

    def transform(self, X):
        """Return one array of word codes per sentence in X.

        Each sentence is encoded exactly once, so one-shot iterables such as
        generators are handled correctly.
        """
        return [self.le.transform(x.split()) for x in X]

    def getSparseM(self, x):
        """Return a sparse (vocabulary size x sentence length) indicator matrix.

        Entry (i, j) is 1 when the j-th word of x has code i.
        """
        from scipy.sparse import coo_matrix
        import numpy as np
        sent = x.split()
        ind = self.le.transform(sent)
        a = coo_matrix((np.ones([len(sent)]), (ind, range(len(sent)))), shape=(len(self.vocabulary), len(sent)))
        return a
示例11: process_data
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit [as 别名]
def _split_address(address):
    """Return ``(street_number, rest)``; the number is 0 when there is none."""
    head, _, rest = address.partition(' ')
    if head.isdigit():
        return int(head), rest
    return 0, address


def _hour_of(date_text):
    """Return characters 11-12 of ``date_text`` as an int, or 12 if not digits."""
    # NOTE(review): assumes timestamps look like 'YYYY-MM-DD HH:MM:SS' -- confirm.
    hour_text = date_text[11:13]
    return int(hour_text) if hour_text.isdigit() else 12


def process_data(train, test, features, features_non_numeric):
    """Derive address/time features, then fill, encode and scale the features.

    Args:
        train: Training frame with ``Address`` and ``Dates`` columns, the
            columns in ``features`` and the column named by the module-level
            ``goal``.  Modified in place before being replaced by fillna.
        test: Test frame with the same feature columns.
        features: List of feature column names; ``'hour'``, ``'dark'`` and
            ``'StreetNo'`` are appended to this list in place.
        features_non_numeric: Not used by this function.

    Returns:
        tuple: ``(train, test, features)``.
    """
    for frame in (train, test):
        # StreetNo has to be read before Address is overwritten.  It is kept
        # as an int throughout: mixing strings with the fallback 0 cannot be
        # label-encoded consistently.
        frame['StreetNo'] = frame['Address'].apply(lambda a: _split_address(a)[0])
        frame['Address'] = frame['Address'].apply(lambda a: _split_address(a)[1])
        frame['hour'] = frame['Dates'].apply(_hour_of)
        # Dark means evening or night.  The earlier ">= 18 and < 6" test could
        # never be true and compared strings with ints.
        frame['dark'] = frame['hour'].apply(lambda hour: 1 if (hour >= 18 or hour < 6) else 0)
    features += ['hour', 'dark', 'StreetNo']
    print("Filling N/As: " + str(datetime.datetime.now()))
    # mode().iloc[0] is the row of first modes, i.e. one fill value per column.
    train = train.fillna(train.mode().iloc[0])
    test = test.fillna(test.mode().iloc[0])
    # Pre-processing non-numeric values
    print("Label Encoder: " + str(datetime.datetime.now()))
    le = LabelEncoder()
    for col in features:
        le.fit(list(train[col]) + list(test[col]))
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])
    # Xgb requires goal to be numeric...
    le.fit(list(train[goal]))
    train[goal] = le.transform(train[goal])
    # Neural networks and stochastic gradient descent are sensitive to
    # feature scaling, so every feature is standardised.
    print("Standard Scaler: " + str(datetime.datetime.now()))
    scaler = StandardScaler()
    for col in set(features):
        # StandardScaler requires 2-D input of shape (n_samples, n_features),
        # so the combined values are passed as one single-value row each.
        scaler.fit([[value] for value in list(train[col]) + list(test[col])])
        train[col] = scaler.transform(train[[col]].values).ravel()
        test[col] = scaler.transform(test[[col]].values).ravel()
    return (train, test, features)
示例12: main
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit [as 别名]
def main():
    """Label-encode ``ORGANIZATION_TYPE`` and build a small Keras model.

    Reads the ``ORGANIZATION_TYPE`` column of
    ``../input/application_train.csv``, prints a few summaries of it and of
    its label-encoded form, then defines and compiles a two-layer dense
    network.  The model is only compiled here, not trained.
    """
    org_type = pd.read_csv('../input/application_train.csv', usecols=['ORGANIZATION_TYPE'], nrows=None)
    print(org_type.shape)
    print(org_type.nunique())
    print(org_type.head())
    lbl = LabelEncoder()
    # LabelEncoder works on a 1-D array of labels, so the column is passed
    # rather than the whole single-column frame.
    labels = org_type['ORGANIZATION_TYPE']
    lbl.fit(labels)
    org_type_label = lbl.transform(labels)
    print(type(org_type_label))
    print(org_type_label.shape)
    print(org_type_label[:5])
    model = Sequential([
        # ``units`` is Dense's first positional argument, so the earlier
        # ``Dense(32, units=784)`` supplied it twice and raised TypeError.
        # 784 is taken to be the input width.
        Dense(32, input_shape=(784,)),
        Activation('relu'),
        Dense(10),
        Activation('softmax'),
    ])
    model.compile(loss='mean_absolute_error', optimizer='adam')
示例13: main
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit [as 别名]
def main(self, args=None):
    """Write cross-validated decision-function values for the training set.

    Parameters are loaded from ``self.data.params_fname`` (the first entry is
    used when the file holds a list).  For every K-fold split an ``SVC`` is
    fitted on the training part and its ``decision_function`` output is
    stored for the held-out items.  Each tweet of the training set is then
    written as one JSON line, with the value stored at its position added
    under ``'decision_function'``, to the file named by ``self.get_output()``.

    Returns:
        list: the stored decision-function values, in training-set order.
    """
    self.data = self.parser.parse_args(args=args)
    assert not self.data.update_klass

    params = load_json(self.data.params_fname)
    if isinstance(params, list):
        params = params[0]
    params = clean_params(params)
    print(self.data.params_fname, self.data.training_set)

    corpus, labels = read_data_labels(self.data.training_set)
    encoder = LabelEncoder()
    encoder.fit(labels)
    y = encoder.transform(labels)

    text_model = TextModel(corpus, **params)
    X = [text_model[doc] for doc in corpus]
    hy = [None] * len(y)

    folds = KFold(n_splits=self.data.kratio,
                  shuffle=True, random_state=self.data.seed)
    for train_idx, test_idx in folds.split(X):
        classifier = SVC(model=text_model)
        classifier.fit([X[k] for k in train_idx], [y[k] for k in train_idx])
        scores = classifier.decision_function([X[k] for k in test_idx])
        # Store each held-out score at its original position.
        for position, score in zip(test_idx, scores):
            hy[position] = score

    with open(self.get_output(), 'w') as fpt:
        for position, tweet in enumerate(tweet_iterator(self.data.training_set)):
            tweet['decision_function'] = hy[position].tolist()
            fpt.write(json.dumps(tweet)+"\n")
    return hy
示例14: encode_dataset
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit [as 别名]
def encode_dataset(train, test, meta, target_model='xgb'):
    """Stack train and test and encode each column as described by ``meta``.

    Args:
        train: Training frame, including the column ``meta['target']``.
        test: Test frame with the same columns as ``train`` minus the target.
        meta: Dict with key ``'target'`` (target column name) and ``'cols'``
            (column name -> ``'CAT'``, ``'NUM'`` or ``'REM'``).
        target_model: For any value other than ``'xgb'`` the label codes of
            ``'CAT'`` columns are additionally cast with ``astype(int)``.

    Returns:
        tuple: ``(all_data, y_train)`` where ``all_data`` holds the train rows
        followed by the test rows.  ``'CAT'`` columns are label-encoded from
        their string form, ``'NUM'`` columns have missing values replaced by
        ``-1`` and ``'REM'`` columns are dropped.

    Raises:
        AssertionError: if the two frames do not share the same columns.
        Exception: if a column has an unknown mapping.
    """
    target_name = meta['target']
    y_train = train[target_name]
    train = train.drop([target_name], axis=1)

    assert train.shape[1] == test.shape[1]
    for train_col, test_col in zip(train.columns, test.columns):
        assert train_col == test_col
    train_obs = len(train)
    #
    all_data = pd.concat([train, test], axis=0)
    for i, (f, kind) in enumerate(meta['cols'].items()):
        print(i, f, kind)
        if kind == 'NUM':
            all_data[f] = all_data[f].fillna(-1)
            continue
        if kind == 'REM':
            all_data = all_data.drop(f, axis=1)
            continue
        if kind != 'CAT':
            raise Exception(str(kind)+":unknown mapping")
        # Categorical column: encode the string form of every value.
        all_data[f] = all_data[f].astype('str')
        encoder = LabelEncoder()
        encoder.fit(np.unique(all_data[f].unique().tolist()))
        codes = encoder.transform(all_data[f])
        if target_model != 'xgb':
            codes = codes.astype(int)
        all_data[f] = codes
    assert train_obs == len(y_train)
    return all_data, y_train
示例15: __init__
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit [as 别名]
def __init__(self, filename='train.json'):
    """Load the training data and build the label and TF-IDF encodings.

    Args:
        filename: Path of a JSON file read with ``pandas.read_json``; the
            code uses its ``cuisine`` and ``ingredients`` columns.

    Sets ``classes``, ``class_encode`` and ``class_decode`` from a label
    encoder fitted on ``cuisine``, and ``vectorizer``, ``x_train`` and
    ``y_train`` from a TF-IDF vectorizer fitted on the cleaned ingredient
    strings.  ``tsdata`` starts as an empty frame.
    """
    self.filename_tr = filename
    # Read JSON data using pandas
    # columns are: id, cuisine, ingredients
    data = pd.read_json(filename)
    # Label Encoders
    labels = LabelEncoder()
    labels.fit(data.cuisine)
    self.classes = labels.classes_
    self.class_encode = labels.transform
    self.class_decode = labels.inverse_transform
    # Get numerical labels for ytrain
    y_train = labels.transform(data.cuisine)
    # Vectorization of ingredients using WordNet lemmatization and TF-IDF.
    # The lemmatizer and the pattern are created once, not once per
    # ingredient.
    lemmatizer = WordNetLemmatizer()
    non_letters = re.compile(r'[^A-Za-z]')
    data['ingredients_string'] = [
        ' '.join(lemmatizer.lemmatize(non_letters.sub(' ', line)) for line in lists).strip()
        for lists in data['ingredients']
    ]
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), max_df=0.57, analyzer='word', token_pattern=r'\w+')
    x_train = vectorizer.fit_transform(data.ingredients_string).todense()
    self.vectorizer = vectorizer
    self.y_train = y_train
    self.x_train = x_train
    self.tsdata = pd.DataFrame()