本文整理汇总了Python中sklearn.preprocessing.LabelEncoder.fit_transform方法的典型用法代码示例。如果您正苦于以下问题:Python LabelEncoder.fit_transform方法的具体用法?Python LabelEncoder.fit_transform怎么用?Python LabelEncoder.fit_transform使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.preprocessing.LabelEncoder
的用法示例。
在下文中一共展示了LabelEncoder.fit_transform方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: label_encode_train_test_sets
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def label_encode_train_test_sets(train, test):
    """Label encode 'supplier' and 'bracket_pricing' for both train and test.

    Encoders are fitted on the union of train and test values so that the
    same category maps to the same integer code in both DataFrames.

    Parameters: train, test -- pandas DataFrames mutated in place.
    Returns the (train, test) pair for convenience.
    """
    test_suppliers = np.sort(pd.unique(test.supplier.ravel()))
    print("Test suppliers shape & elements: ", test_suppliers.shape, test_suppliers)
    train_suppliers = np.sort(pd.unique(train.supplier.ravel()))
    print("Train suppliers shape & elements: ", train_suppliers.shape, train_suppliers)
    # Merge 'supplier' values from both datasets first because we want the
    # encoding to be consistent across both.
    supplier_ids = np.sort(np.unique(np.concatenate([train_suppliers, test_suppliers])))
    print("Merged supplier_ids.shape: ", supplier_ids.shape)
    # Fit once on the merged array, then transform each set individually.
    print("Performing label encoding on supplier column...")
    label_e = LabelEncoder()
    label_e.fit(supplier_ids)
    train['supplier'] = label_e.transform(train['supplier'])
    test['supplier'] = label_e.transform(test['supplier'])
    # Bug fix: 'bracket_pricing' was previously fit_transform'ed separately on
    # train and test, which yields inconsistent codes whenever the two sets do
    # not contain exactly the same categories. Fit on the union, mirroring the
    # 'supplier' handling above.
    print("Performing label encoding on bracket_pricing column...")
    bracket_e = LabelEncoder()
    bracket_e.fit(np.concatenate([train['bracket_pricing'].values,
                                  test['bracket_pricing'].values]))
    train['bracket_pricing'] = bracket_e.transform(train['bracket_pricing'])
    test['bracket_pricing'] = bracket_e.transform(test['bracket_pricing'])
    return train, test
示例2: get_test
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def get_test(dim=128, maxlen=500, name='test.csv', events=None):
    """Build test-set model inputs from the CSV at ``path + name``.

    Returns [padded app-label sequences, phone_brand codes, device_model codes].
    ``events`` maps device_id -> space-separated app-label indices.
    NOTE(review): ``dim`` is unused here — presumably kept for signature
    symmetry with the training generator; confirm before removing.
    """
    X_train = pd.read_csv(path + name, dtype={'device_id': np.str})
    X_train["app_lab"] = X_train["device_id"].map(events)
    X_train.fillna('0 ', inplace=True)
    x_train = X_train["app_lab"].values
    phone_brand_device_model = pd.read_csv(path + 'phone_brand_device_model.csv',
                                           dtype={'device_id': np.str})
    phone_brand_device_model.drop_duplicates('device_id', keep='first', inplace=True)
    phone_brand_le = LabelEncoder()
    phone_brand_device_model['phone_brand'] = phone_brand_le.fit_transform(
        phone_brand_device_model['phone_brand'])
    device_model_le = LabelEncoder()
    # Bug fix: device_model was encoded with phone_brand_le (device_model_le
    # was created but never used), clobbering the brand encoder's classes_.
    phone_brand_device_model['device_model'] = device_model_le.fit_transform(
        phone_brand_device_model['device_model'])
    X_train = pd.merge(X_train, phone_brand_device_model, how='left',
                       on='device_id', left_index=True)
    X_train.fillna(0, inplace=True)
    phone_brand = X_train['phone_brand'].values
    device_model = X_train['device_model'].values
    # Parse the space-separated label strings into int lists, dropping blanks.
    x_train = [x.split(' ') for x in x_train]
    for i in range(len(x_train)):
        x_train[i] = [np.int8(idx) for idx in x_train[i]
                      if (idx != 'nan' and idx != '')]
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_train = [x_train, phone_brand, device_model]
    return x_train
示例3: prepare_items_features
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def prepare_items_features(user_items_csv, out_dir):
    """One-hot encode item feature columns and aggregate them per user.

    Reads a '|'-delimited file whose column 0 is the user id and columns 1-4
    are categorical item features, one-hot encodes the features, sums the
    one-hot rows per user (rows are assumed grouped by user id), and saves
    the resulting matrix to ``out_dir/user_items.npy``.
    """
    array = np.loadtxt(user_items_csv, delimiter='|',
                       dtype=np.dtype(np.uint64))
    # fit_transform refits the encoder each call, so one instance can be
    # reused for every column.
    le = LabelEncoder()
    columns = np.array([le.fit_transform(array[:, k].T)
                        for k in (1, 2, 3, 4)]).T
    enc = OneHotEncoder()
    print(array[:10])
    encoded = np.c_[array[:, 0], enc.fit_transform(columns).toarray()]
    print(encoded[:10])
    print(encoded.shape)
    user_id = encoded[0][0]
    rows = []
    current = np.zeros(encoded.shape[1] - 1)
    for i in range(encoded.shape[0]):
        if encoded[i][0] != user_id:
            rows.append(np.concatenate([[user_id], current]))
            user_id = encoded[i][0]
            # Bug fix: start the new user's accumulator from this row's
            # features instead of zeros — previously the first row of every
            # user after the first was silently dropped.
            current = encoded[i, 1:].copy()
        else:
            current = current + encoded[i, 1:]
    rows.append(np.concatenate([[user_id], current]))
    array = np.array(rows)
    print(array.shape)
    # serialize the per-user feature matrix
    np.save(os.path.join(out_dir, "user_items"), array)
示例4: load_data
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def load_data():
    """Load cleaned train/test JSON-lines files and encode categoricals.

    'major' is reduced to the first jieba-segmented word and label encoded
    with an encoder fitted on the union of train and test; 'gender' likewise.
    Drops the '_id' column and fills remaining NaNs with 0.
    Returns (train, test) DataFrames.
    """
    train_list = []
    for line in open('../data/train_clean.json', 'r'):
        train_list.append(json.loads(line))
    train = pd.DataFrame(train_list)
    test_list = []
    for line in open('../data/test_clean.json', 'r'):
        test_list.append(json.loads(line))
    test = pd.DataFrame(test_list)
    print('--- NLP on major, simply cut the first word')
    le = LabelEncoder()
    print(len(set(train['major'])))
    train['major'] = train['major'].apply(lambda x : " ".join(jieba.cut(x, cut_all = False)).split()[0] if x is not None and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')
    test['major'] = test['major'].apply(lambda x : " ".join(jieba.cut(x, cut_all = False)).split()[0] if x is not None and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')
    print(len(set(train['major'])))
    # Fit on the union so train and test share one encoding.
    le.fit(list(train['major']) + list(test['major']))
    train['major'] = le.transform(train['major'])
    test['major'] = le.transform(test['major'])
    # Bug fix: 'gender' was previously fit_transform'ed with separate encoders
    # on train and test, producing inconsistent codes if either set is missing
    # a category. Fit once on the union, exactly as done for 'major'.
    le = LabelEncoder()
    le.fit(list(train['gender']) + list(test['gender']))
    train['gender'] = le.transform(train['gender'])
    test['gender'] = le.transform(test['gender'])
    del train['_id']
    del test['_id']
    train = train.fillna(0)
    test = test.fillna(0)
    return train, test
示例5: process_raw_label
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def process_raw_label():
    """Demonstrate categorical-feature handling on a toy DataFrame.

    Walks through: ordinal mapping for 'size', a dict-based class-label
    mapping and its inverse, LabelEncoder, OneHotEncoder, and
    pandas.get_dummies. Prints each intermediate result.
    """
    df = pd.DataFrame(
        [
            ['green', 'M', 10.1, 'class1'],
            ['red', 'L', 13.5, 'class2'],
            ['blue', 'XL', 15.3, 'class1'],
        ],
        columns=['color', 'size', 'price', 'classlabel'],
    )
    print(df)
    # ordinal feature: map sizes onto integers by hand
    ordinal_map = {'XL': 3, 'L': 2, 'M': 1}
    df['size'] = df['size'].map(ordinal_map)
    print(df)
    # class labels: build the mapping from the sorted unique labels
    class_mapping = {label: idx
                     for idx, label in enumerate(np.unique(df['classlabel']))}
    print(class_mapping)
    df['classlabel'] = df['classlabel'].map(class_mapping)
    print(df)
    # invert the mapping to recover the original string labels
    inverse_mapping = {code: label for label, code in class_mapping.items()}
    df['classlabel'] = df['classlabel'].map(inverse_mapping)
    print(df)
    # the same encoding via sklearn's LabelEncoder
    y = LabelEncoder().fit_transform(df['classlabel'].values)
    print(y)
    x = df[['color', 'size', 'price']].values
    print(x)
    x[:, 0] = LabelEncoder().fit_transform(x[:, 0])
    print('label encoder\n', x)
    # one-hot expand the encoded color column (column 0)
    x = OneHotEncoder(categorical_features=[0], sparse=False).fit_transform(x)
    print(x)
    print(pd.get_dummies(df[['price', 'color', 'size']]))
开发者ID:ilikesongdandan,项目名称:Introduction-to-Programming-Using-Python,代码行数:37,代码来源:process_raw_data.py
示例6: train_test
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def train_test(self, X, y, X_test):
    """Hierarchical three-network prediction blended with a calibrated model.

    nn0 is a binary net separating classes {1, 2, 3} from the rest; nn1 is a
    3-way net over classes {1, 2, 3}; nn2 is a 6-way net over the remaining
    classes. Their probabilities are recombined into a 9-class prediction via
    the chain rule P(class) = P(group) * P(class | group), then averaged with
    self.cal_clf's prediction.
    """
    le = LabelEncoder()
    # boolean mask of samples belonging to classes 1, 2 or 3
    id_123 = np.logical_or(np.logical_or(y==1, y==2), y==3)
    # binary target: 1 if the sample is in {1, 2, 3}, else 0
    y0 = np.zeros(len(y), dtype=np.int32)
    y0[id_123] = 1
    X0 = np.copy(X)
    y0 = le.fit_transform(y0).astype(np.int32)
    # sub-task 1: only the {1, 2, 3} samples, relabelled to 0..2
    X1 = X[id_123]
    y1 = y[id_123]
    y1 = le.fit_transform(y1).astype(np.int32)
    # sub-task 2: the remaining samples, relabelled to 0..5
    X2 = X[np.logical_not(id_123)]
    y2 = y[np.logical_not(id_123)]
    y2 = le.fit_transform(y2).astype(np.int32)
    print 'working on nn0...'
    # retrain each net for the epoch count found best during early stopping
    self.nn0.max_epochs = self.early_stopping0.best_valid_epoch
    self.nn0.verbose=0
    self.nn0.fit(X0, y0)
    y0_pred = self.nn0.predict_proba(X_test)
    print 'working on nn1...'
    self.nn1.max_epochs = self.early_stopping1.best_valid_epoch
    self.nn1.verbose=0
    self.nn1.fit(X1, y1)
    y1_pred = self.nn1.predict_proba(X_test)
    print 'working on nn2...'
    self.nn2.max_epochs = self.early_stopping2.best_valid_epoch
    self.nn2.verbose=0
    self.nn2.fit(X2, y2)
    y2_pred = self.nn2.predict_proba(X_test)
    # Recombine: y0_pred[:, 1] = P(in {1,2,3}), y0_pred[:, 0] = P(not).
    # Columns 1-3 come from nn1 (classes 1..3); columns 0 and 4-8 come from
    # nn2 (the six remaining classes).
    # NOTE(review): the column layout assumes the original labels are 0..8
    # with nn2's sorted classes being [0, 4, 5, 6, 7, 8] — confirm with the
    # data source.
    y_pred = np.zeros((y0_pred.shape[0], 9))
    y_pred[:,0] = y0_pred[:,0]*y2_pred[:,0]
    y_pred[:,1] = y0_pred[:,1]*y1_pred[:,0]
    y_pred[:,2] = y0_pred[:,1]*y1_pred[:,1]
    y_pred[:,3] = y0_pred[:,1]*y1_pred[:,2]
    y_pred[:,4] = y0_pred[:,0]*y2_pred[:,1]
    y_pred[:,5] = y0_pred[:,0]*y2_pred[:,2]
    y_pred[:,6] = y0_pred[:,0]*y2_pred[:,3]
    y_pred[:,7] = y0_pred[:,0]*y2_pred[:,4]
    y_pred[:,8] = y0_pred[:,0]*y2_pred[:,5]
    yp0 = y_pred
    # blend with an independently calibrated classifier fitted on all classes
    self.cal_clf.fit(X, y)
    yp1 = self.cal_clf.predict_proba(X_test)
    y_pred = (yp0 + yp1)/2.
    return y_pred
示例7: test_label_encoder_fit_transform
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def test_label_encoder_fit_transform():
    """fit_transform assigns codes by sorted order of the unique values."""
    cases = [
        ([1, 1, 4, 5, -1, 0], [2, 2, 3, 4, 0, 1]),
        (["paris", "paris", "tokyo", "amsterdam"], [1, 1, 2, 0]),
    ]
    for raw, expected in cases:
        assert_array_equal(LabelEncoder().fit_transform(raw), expected)
示例8: label_encoding
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def label_encoding(self, x: pd.DataFrame, y: pd.DataFrame, services: list) -> tuple:
    """Label-encode the categorical columns of ``x`` and the target ``y``.

    'service' is encoded with an encoder fitted on the full ``services``
    vocabulary so codes stay stable even for services absent from ``x``;
    'protocol_type' and 'flag' are each fitted on their own values.

    Returns (x, y). Bug fix: the original annotation claimed two DataFrames,
    but ``y`` is a numpy array of integer codes after fit_transform.
    """
    # dedicated encoder per column so none clobbers another's classes_
    service_le = LabelEncoder().fit(services)
    x['service'] = service_le.transform(x['service'])
    for feature in ["protocol_type", "flag"]:
        x[feature] = LabelEncoder().fit_transform(x[feature])
    target_le = LabelEncoder()
    y = target_le.fit_transform(y)
    print(target_le.classes_)
    return x, y
示例9: X_train_generatetor_infinite
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def X_train_generatetor_infinite(dim=128, maxlen=500, batch_size=128, name="X_train.csv", events=None):
    """Infinite batch generator over the training CSV at ``path + name``.

    Yields (x, y) where x = [padded app-label sequences, phone_brand codes,
    device_model codes] and y = [one-hot group, gender (1=M / 0=F), log(age)].
    ``events`` maps device_id -> space-separated app-label indices.
    NOTE(review): ``dim`` is unused here — confirm before removing.
    """
    X_train = pd.read_csv(path + name)
    group_le = LabelEncoder()
    group_lb = LabelBinarizer()
    # Fit the group encoders once on the full file so every batch is encoded
    # consistently; the transformed values themselves are not needed here.
    group_lb.fit(group_le.fit_transform(X_train['group'].values))
    ##################
    # Phone Brand
    ##################
    phone_brand_device_model = pd.read_csv(path + 'phone_brand_device_model.csv',
                                           dtype={'device_id': np.str})
    phone_brand_device_model.drop_duplicates('device_id', keep='first', inplace=True)
    phone_brand_le = LabelEncoder()
    phone_brand_device_model['phone_brand'] = phone_brand_le.fit_transform(
        phone_brand_device_model['phone_brand'])
    device_model_le = LabelEncoder()
    # Bug fix: device_model was encoded with phone_brand_le (device_model_le
    # was created but never used); use the dedicated encoder.
    phone_brand_device_model['device_model'] = device_model_le.fit_transform(
        phone_brand_device_model['device_model'])
    while 1:
        data = pd.read_csv(path + name, iterator=True, chunksize=batch_size,
                           dtype={'device_id': np.str})
        for X_train in data:
            X_train = pd.merge(X_train, phone_brand_device_model, how='left',
                               on='device_id', left_index=True)
            phone_brand = X_train['phone_brand'].values
            device_model = X_train['device_model'].values
            X_train["app_lab"] = X_train["device_id"].map(events)
            y_train = X_train['group'].values
            # Bug fix: chained assignment (X_train['gender'][mask] = v) is not
            # guaranteed to write through in pandas; use .loc instead.
            X_train.loc[X_train['gender'] == 'M', 'gender'] = 1
            X_train.loc[X_train['gender'] == 'F', 'gender'] = 0
            y_train_gender = X_train['gender'].values
            y_train_age = X_train['age'].values
            # log-transform age to compress its range
            y_train_age = np.log(y_train_age)
            X_train.fillna('0 ', inplace=True)
            y_train = group_le.transform(y_train)
            y_train = group_lb.transform(y_train)
            x_train = X_train["app_lab"].values
            x_train = [x.split(' ') for x in x_train]
            for i in range(len(x_train)):
                x_train[i] = [np.int8(idx) for idx in x_train[i]
                              if (idx != 'nan' and idx != '')]
            x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
            x_train = [x_train, phone_brand, device_model]
            y_train = [y_train, y_train_gender, y_train_age]
            yield (x_train, y_train)
示例10: process_data
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def process_data(trainDF, testDF):
    """Feature engineering for the SF-crime style data set.

    Drops set-specific columns, stacks train and test so transformations are
    applied consistently, extracts time features, builds per-address
    log-odds ("counts learning") features, scales X/Y coordinates, and label
    encodes the remaining categoricals.

    Returns (train_matrix, test_matrix, y_codes, class_names).
    """
    # Drop columns that only exist in one of the two sets, then stack them.
    trainDF.drop(['Descript', 'Resolution'], axis=1, inplace=True)
    testDF.drop(['Id'], axis=1, inplace=True)
    y = trainDF['Category'].copy()
    combi = pd.concat([trainDF.drop(['Category'], axis=1), testDF])
    combi['Month'], combi['Day'], combi['Hour'] = zip(*combi['Dates'].apply(extract_time))
    combi.drop(['Dates'], axis=1, inplace=True)
    # '/' in the address means an intersection rather than a block
    combi['intesect'] = combi['Address'].apply(lambda x: 1 if '/' in x else 0)
    combi['Wake'] = combi['Hour'].apply(lambda x: 1 if (int(x) >= 8 and int(x) <= 23) else 0)
    addresses = sorted(combi['Address'].unique())
    categories = sorted(trainDF['Category'].unique())
    addr_counts = combi.groupby('Address').size()
    cat_counts = trainDF.groupby('Category').size()
    addr_cat_counts = trainDF.groupby(['Address', 'Category']).size()
    # Counts-learning features over address/category, cf.
    # https://msdn.microsoft.com/en-us/library/azure/dn913056.aspx
    logoddsPA = {}
    logodds = {}
    PA = cat_counts / float(len(trainDF))
    default_logodds = np.log(PA / (1 - PA))
    for addr in addresses:
        PA = addr_counts[addr] / float(len(combi))
        logoddsPA[addr] = np.log(PA / (1.0 - PA))
        logodds[addr] = deepcopy(default_logodds)
        if addr in addr_cat_counts.keys():
            for cat in addr_cat_counts[addr].keys():
                # only override the prior when the count is meaningful
                if addr_cat_counts[addr][cat] >= 2 and addr_cat_counts[addr][cat] < addr_counts[addr]:
                    PA = addr_cat_counts[addr][cat] / float(addr_counts[addr])
                    logodds[addr][categories.index(cat)] = np.log(PA / (1.0 - PA))
        logodds[addr] = pd.Series(logodds[addr])
        logodds[addr].index = range(len(categories))
    combi['LogoddsPA'] = combi['Address'].apply(lambda x: logoddsPA[x])
    logodds_features = combi['Address'].apply(lambda x: logodds[x])
    # Bug fix: 'colums' typo meant the per-category log-odds columns were
    # never actually renamed.
    logodds_features.columns = ["logodds" + str(x) for x in range(len(categories))]
    combi_full = pd.concat([combi, logodds_features], axis=1)
    xy_scaler = StandardScaler()
    combi_full[['X', 'Y']] = xy_scaler.fit_transform(combi_full[['X', 'Y']])
    # Label encoding (fit_transform refits per column, so one encoder is fine).
    lbe = LabelEncoder()
    combi_full['DayOfWeek'] = lbe.fit_transform(combi_full['DayOfWeek'])
    combi_full['PdDistrict'] = lbe.fit_transform(combi_full['PdDistrict'])
    # Consistency fix: take_last= was removed from pandas; keep= matches the
    # drop_duplicates(keep=...) API already used elsewhere in this file.
    combi_full["IsDup"] = pd.Series(combi_full.duplicated() | combi_full.duplicated(keep='last')).apply(int)
    combi_full.drop(['Address'], axis=1, inplace=True)
    y = lbe.fit_transform(y)
    # Since xgboost is used downstream, dummy/one-hot expansion is skipped.
    # Generalization: split at the actual number of training rows instead of
    # the hard-coded 878049.
    n_train = trainDF.shape[0]
    train = combi_full.values[:n_train, :]
    test = combi_full.values[n_train:, :]
    return train, test, y, lbe.classes_
示例11: execute
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def execute(self, data):
    # Label-encode every column in self.column_list and return a new DataFrame
    # with the encoded columns joined back onto the untouched ones.
    print 'started label encoding step'
    le = LabelEncoder()
    # fit_transform refits the encoder each call, so a single instance can be
    # reused across columns.
    output_array = le.fit_transform(data[self.column_list[0]])
    for i in range(1, len(self.column_list)):
        output_array = np.column_stack([output_array, le.fit_transform(data[self.column_list[i]])])
    # columns not being encoded are carried over unchanged
    otherCols = set(data.columns).difference(set(self.column_list))
    df1 = data[list(otherCols)]
    df2 = pd.DataFrame(output_array, columns=self.column_list)
    # NOTE(review): join aligns on the index; df2 gets a fresh RangeIndex while
    # df1 keeps data's index — these only line up if data uses the default
    # 0..n-1 index. Confirm with callers.
    df1 = df1.join(df2, how='left')
    print 'finished label encoding step'
    return df1
示例12: fit_transform
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def fit_transform(self, dframe):
    """Fit a LabelEncoder per column and return the encoded DataFrame.

    Access individual column classes via indexing `self.all_classes_`
    (tuples of (column, classes array)).
    Access individual column encoders via indexing `self.all_encoders_`.
    Access individual column encoded labels via indexing `self.all_labels_`.

    Bug fixes vs. the original:
    * the no-columns branch assigned into `self.all_encoders_` (and the
      docstring promised `self.all_labels_`) without ever creating those
      arrays, which raised at runtime;
    * `self.all_labels_` stored the encoder object instead of the encoded
      labels the docstring promises.
    """
    df = dframe.copy()
    if self.columns is None:
        # no columns specified; assume all are to be encoded
        self.columns = df.columns
    # one slot per column for classes, encoders and encoded labels;
    # shapes match `self.columns`
    self.all_classes_ = np.ndarray(shape=self.columns.shape, dtype=object)
    self.all_encoders_ = np.ndarray(shape=self.columns.shape, dtype=object)
    self.all_labels_ = np.ndarray(shape=self.columns.shape, dtype=object)
    for idx, column in enumerate(self.columns):
        le = LabelEncoder()
        encoded = le.fit_transform(df.loc[:, column].values)
        df.loc[:, column] = encoded
        self.all_classes_[idx] = (column,
                                  np.array(le.classes_.tolist(),
                                           dtype=object))
        self.all_encoders_[idx] = le
        self.all_labels_[idx] = encoded
    return df
示例13: data_preprocess
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def data_preprocess(df):
    """Prepare the crime DataFrame for modelling.

    Strips numbers from STREET, coerces ZIP to int, label encodes STREET and
    CITY, and binarizes the CrimeCat target.

    Returns (X, y, feature_names): the feature matrix, the one-hot target,
    and the names of the feature columns.
    """
    drop_cols = ['Unnamed: 0', 'STATION_NAME',
                 'STATISTICAL_CODE_DESCRIPTION', 'CrimeCat']
    df['STREET'] = df['STREET'].apply(get_rid_num)
    df['ZIP'] = df['ZIP'].apply(int)
    # encode each categorical column (a fresh fit per column)
    for col in ('STREET', 'CITY'):
        df[col] = LabelEncoder().fit_transform(df[col])
    features = df.drop(drop_cols, axis=1)
    feature_names = features.columns
    X = features.values
    # one-hot encode the target
    y = LabelBinarizer().fit_transform(df['CrimeCat'].values)
    return X, y, feature_names
示例14: __call__
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def __call__(self, X_train, X_test, y_train, y_test):
    """Score how well KMeans clusters line up with the class labels.

    Stacks train and test, clusters with as many centroids as classes,
    weights the label/cluster confusion matrix by a kernel of the
    inter-centroid distances, and returns the weighted purity
    (max over labels, summed over clusters, normalized).
    """
    X = np.vstack([X_train, X_test])
    y = np.hstack([y_train, y_test])
    le = LabelEncoder()
    y = le.fit_transform(y)
    kmeans = KMeans(
        n_clusters=len(np.unique(y)),
        n_init=self.kmeans__n_init,
        random_state=self.random_state,
    )
    kmeans.fit(X)
    # Kernel over pairwise centroid distances.
    # NOTE(review): the distance is not squared, so this is exp(-r / sig^2),
    # not a Gaussian RBF — confirm this is intended.
    r = distance.cdist(kmeans.cluster_centers_, kmeans.cluster_centers_)
    h = np.exp(-r / (self.sig**2))
    N = confusion_matrix(y, kmeans.labels_)
    # Vectorized form of the original O(k^3) triple loop:
    #   wN[l, c] = h[l, c] * sum_j N[l, j]
    # i.e. each kernel entry scaled by the row mass of the confusion matrix
    # (N is square here because n_clusters == number of classes).
    wN = h * N.sum(axis=1, keepdims=True)
    return wN.max(axis=0).sum() / wN.sum()
示例15: test_multiclass_classifier_class_weight
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def test_multiclass_classifier_class_weight():
    """Multiclass SAG with per-class weights matches a reference one-vs-rest
    sag_sparse implementation, for both dense and sparse inputs."""
    alpha = .1
    n_samples = 20
    tol = .00001
    max_iter = 50
    # deliberately non-uniform weights so the class_weight path is exercised
    class_weight = {0: .45, 1: .55, 2: .75}
    fit_intercept = True
    # three tight, well-separated blobs -> three classes
    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)
    # model under test: sklearn's SAG solver, fit on dense and sparse input
    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=max_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              class_weight=class_weight)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)
    # expand the per-class weights into one weight per sample
    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = class_weight_[le.fit_transform(y)]
    coef1 = []
    intercept1 = []
    coef2 = []
    intercept2 = []
    # reference: run sag_sparse one-vs-rest, once per class
    for cl in classes:
        # +1 for the current class, -1 for everything else
        y_encoded = np.ones(n_samples)
        y_encoded[y != cl] = -1
        spweights1, spintercept1 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight)
        spweights2, spintercept2 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight,
                                              sparse=True)
        coef1.append(spweights1)
        intercept1.append(spintercept1)
        coef2.append(spweights2)
        intercept2.append(spintercept2)
    coef1 = np.vstack(coef1)
    intercept1 = np.array(intercept1)
    coef2 = np.vstack(coef2)
    intercept2 = np.array(intercept2)
    # the fitted models must agree with the reference, class by class
    # (loose decimals: SAG is stochastic-order dependent)
    for i, cl in enumerate(classes):
        assert_array_almost_equal(clf1.coef_[i].ravel(),
                                  coef1[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1)
        assert_array_almost_equal(clf2.coef_[i].ravel(),
                                  coef2[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)