当前位置: 首页>>代码示例>>Python>>正文


Python LabelEncoder.fit_transform方法代码示例

本文整理汇总了Python中sklearn.preprocessing.LabelEncoder.fit_transform方法的典型用法代码示例。如果您正苦于以下问题:Python LabelEncoder.fit_transform方法的具体用法?Python LabelEncoder.fit_transform怎么用?Python LabelEncoder.fit_transform使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sklearn.preprocessing.LabelEncoder的用法示例。


在下文中一共展示了LabelEncoder.fit_transform方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: label_encode_train_test_sets

# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def label_encode_train_test_sets (train, test) :
	"""Label encode 'supplier' and 'bracket_pricing' for both train and test.

	Both columns are fitted on the union of train+test values so the integer
	codes are consistent across the two datasets.  Returns the (train, test)
	DataFrames with the two columns replaced in place.
	"""
	test_suppliers = np.sort(pd.unique(test.supplier.ravel()))
	print ("Test suppliers shape & elements: ", test_suppliers.shape, test_suppliers)
	train_suppliers = np.sort(pd.unique(train.supplier.ravel()))
	print ("Train suppliers shape & elements: ", train_suppliers.shape, train_suppliers)

	## Merge 'supplier' for both datasets first because we want encoding to be consistent across both
	supplier_ids = np.sort(np.unique(np.concatenate([train_suppliers, test_suppliers])))
	print ("Merged supplier_ids.shape: ", supplier_ids.shape)

	## Fit on the merged array, then individually transform train and test
	print ("Performing label encoding on supplier column...")
	supplier_le = LabelEncoder()
	supplier_le.fit(supplier_ids)
	train['supplier'] = supplier_le.transform(train['supplier'])
	test['supplier'] = supplier_le.transform(test['supplier'])

	## BUG FIX: 'bracket_pricing' was previously fit_transform'ed separately on
	## train and test, which yields inconsistent codes whenever the two sets
	## contain different value sets.  Fit on the union instead, mirroring the
	## supplier handling above.
	print ("Performing label encoding on bracket_pricing column...")
	bracket_le = LabelEncoder()
	bracket_le.fit(np.unique(np.concatenate([train['bracket_pricing'].values,
	                                         test['bracket_pricing'].values])))
	train['bracket_pricing'] = bracket_le.transform(train['bracket_pricing'])
	test['bracket_pricing'] = bracket_le.transform(test['bracket_pricing'])

	return train, test
开发者ID:sanderjb,项目名称:Kaggle-CAT,代码行数:31,代码来源:regressor_v2.py

示例2: get_test

# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def get_test(dim=128,maxlen=500,name='test.csv',events=None):
    """Build the test-set model inputs: padded app-label sequences plus the
    encoded phone_brand / device_model arrays.

    `events` maps device_id -> space-separated app-label indices.
    NOTE(review): relies on a module-level `path` variable pointing at the
    data directory — confirm against the rest of the file.
    """
    X_train = pd.read_csv(path+name,
                    dtype={'device_id': np.str})
    X_train["app_lab"] = X_train["device_id"].map(events)
    X_train.fillna('0 ',inplace=True)
    x_train = X_train["app_lab"].values

    phone_brand_device_model = pd.read_csv(path+'phone_brand_device_model.csv',
                    dtype={'device_id': np.str})
    phone_brand_device_model.drop_duplicates('device_id', keep='first', inplace=True)

    phone_brand_le = LabelEncoder()
    phone_brand_device_model['phone_brand'] = phone_brand_le.fit_transform(phone_brand_device_model['phone_brand'])

    # BUG FIX: device_model was previously encoded with phone_brand_le
    # (re-fitting it and clobbering the phone-brand mapping) while
    # device_model_le sat unused.  The encoded values are unchanged.
    device_model_le = LabelEncoder()
    phone_brand_device_model['device_model'] = device_model_le.fit_transform(phone_brand_device_model['device_model'])

    X_train = pd.merge(X_train,phone_brand_device_model,how='left',on='device_id', left_index=True)
    X_train.fillna(0,inplace=True)
    phone_brand = X_train['phone_brand'].values
    device_model = X_train['device_model'].values

    # parse the space-separated label strings into int lists, dropping blanks
    x_train = [ x.split(' ') for x in  x_train]
    for i in range(len(x_train)):
        x_train[i] = [ np.int8(idx) for idx in x_train[i] if (idx!='nan' and idx!='')]

    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_train = [x_train,phone_brand,device_model]
    return x_train
开发者ID:chu-NMSU,项目名称:Talking-Data,代码行数:32,代码来源:benchmark_cnn.py

示例3: prepare_items_features

# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def prepare_items_features(user_items_csv, out_dir):
    """One-hot encode item feature columns 1..4 and sum the encoded rows per
    user (column 0), saving the result to <out_dir>/user_items.npy.

    Input file: '|'-delimited unsigned ints; rows are assumed to be grouped
    by user id (column 0).
    """
    array = np.loadtxt(user_items_csv, delimiter='|',
            dtype=np.dtype(np.uint64))

    # label-encode each feature column independently (fit_transform refits
    # the encoder each time, so reusing one instance per column is safe)
    le = LabelEncoder()
    columns = np.array([le.fit_transform(array[:, k].T)
                        for k in (1, 2, 3, 4)]).T

    enc = OneHotEncoder()
    print(array[:10])
    # prepend the raw user id to the dense one-hot feature matrix
    encoded = np.c_[array[:, 0], enc.fit_transform(columns).toarray()]
    print(encoded[:10])
    print(encoded.shape)

    # accumulate the one-hot rows per user
    user_id = encoded[0][0]
    rows = []
    current = np.zeros(encoded.shape[1]-1)
    for i in range(encoded.shape[0]):
        if encoded[i][0] != user_id:
            rows.append(np.concatenate([[user_id], current]))
            user_id = encoded[i][0]
            # BUG FIX: the first row of each new user was previously dropped
            # (the accumulator was reset to zeros without adding row i)
            current = encoded[i, 1:].copy()
        else:
            current = np.sum([current, encoded[i, 1:]], axis=0)
    rows.append(np.concatenate([[user_id], current]))

    array = np.array(rows)
    print(array.shape)

    # let's serialize array
    np.save(os.path.join(out_dir, "user_items"), array)
开发者ID:Patechoc,项目名称:labs-untested,代码行数:36,代码来源:data.py

示例4: load_data

# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def load_data():
    train_list = []
    for line in open('../data/train_clean.json', 'r'):
        train_list.append(json.loads(line))
    train = pd.DataFrame(train_list)
    
    #train_work = train[names[-1]]
    test_list = []
    for line in open('../data/test_clean.json', 'r'):
        test_list.append(json.loads(line))
    test = pd.DataFrame(test_list)
    
    print('--- NLP on major, simply cut the first word')
    le = LabelEncoder()
    print len(set(train['major']))
    train['major'] = train['major'].apply(lambda x :  " ".join(jieba.cut(x, cut_all = False)).split()[0] if x is not None and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')
    test['major']  = test['major'].apply(lambda x :  " ".join(jieba.cut(x,  cut_all = False)).split()[0] if x is not None  and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')

    print len(set(train['major']))
    le.fit(list(train['major']) + list(test['major']))
    train['major'] = le.transform(train['major'])
    test['major'] = le.transform(test['major'])
 
    le = LabelEncoder()
    train['gender'] = le.fit_transform(train['gender'])
    names =  train.columns
    
    le = LabelEncoder()
    test['gender'] = le.fit_transform(test['gender'])
    del train['_id']
    del test['_id']
    train = train.fillna(0)
    test = test.fillna(0)
    #test['age'] = test['age'].apply(lambda x : int(x.replace(u'岁','').encode('ascii')))
    return train, test
开发者ID:XIG-DATA,项目名称:JobTitlePrediction,代码行数:37,代码来源:prepro.py

示例5: process_raw_label

# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def process_raw_label():
    """Demo of common categorical-encoding techniques on a toy frame:
    hand-written ordinal mapping, class-label mapping (and its inverse),
    sklearn's LabelEncoder/OneHotEncoder, and pd.get_dummies."""
    df = pd.DataFrame([
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1']
    ])
    df.columns = ['color', 'size', 'price', 'classlabel']
    print(df)

    # ordinal feature: map sizes onto integers by hand
    size_mapping = {'XL': 3, 'L': 2, 'M': 1}
    df['size'] = df['size'].map(size_mapping)
    print(df)

    # class labels -> integers, then back again via the inverted mapping
    class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}
    print(class_mapping)
    df['classlabel'] = df['classlabel'].map(class_mapping)
    print(df)
    inv_class_mapping = {v: k for k, v in class_mapping.items()}
    df['classlabel'] = df['classlabel'].map(inv_class_mapping)
    print(df)

    # same thing with sklearn's LabelEncoder
    label_enc = LabelEncoder()
    encoded_y = label_enc.fit_transform(df['classlabel'].values)
    print(encoded_y)

    features = df[['color', 'size', 'price']].values
    print(features)
    color_enc = LabelEncoder()
    features[:, 0] = color_enc.fit_transform(features[:, 0])
    print('label encoder\n', features)

    # one-hot encode the (label-encoded) color column at index 0
    onehot = OneHotEncoder(categorical_features=[0], sparse=False)
    features = onehot.fit_transform(features)
    print(features)
    print(pd.get_dummies(df[['price', 'color', 'size']]))
开发者ID:ilikesongdandan,项目名称:Introduction-to-Programming-Using-Python,代码行数:37,代码来源:process_raw_data.py

示例6: train_test

# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
    def train_test(self, X, y, X_test):
        """Fit a two-level network hierarchy on (X, y) and return blended
        class probabilities for X_test.

        Level 0 (nn0) separates classes {1,2,3} from the rest; nn1 refines
        within {1,2,3}; nn2 refines within the complement.  The chained
        probabilities are averaged with a calibrated flat classifier
        (self.cal_clf).  Returns an (n_samples, 9) probability array.
        """
        le = LabelEncoder()
        # boolean mask of the samples belonging to classes 1, 2 or 3
        id_123 = np.logical_or(np.logical_or(y==1, y==2), y==3)  
        # binary level-0 target: 1 iff the sample is in {1,2,3}
        y0 = np.zeros(len(y), dtype=np.int32)
        y0[id_123] = 1
        X0 = np.copy(X) 
        y0 = le.fit_transform(y0).astype(np.int32)
    
        # sub-problem 1: classify inside {1,2,3} (labels re-encoded to 0..2)
        X1 = X[id_123]
        y1 = y[id_123]
        y1 = le.fit_transform(y1).astype(np.int32)
    
        # sub-problem 2: classify the remaining classes (re-encoded to 0..5)
        X2 = X[np.logical_not(id_123)]
        y2 = y[np.logical_not(id_123)]    
        y2 = le.fit_transform(y2).astype(np.int32)
        
        print 'working on nn0...'
        # retrain each net for the epoch count found by early stopping
        self.nn0.max_epochs = self.early_stopping0.best_valid_epoch
        self.nn0.verbose=0
        self.nn0.fit(X0, y0)
        y0_pred = self.nn0.predict_proba(X_test)
        
        print 'working on nn1...'
        self.nn1.max_epochs = self.early_stopping1.best_valid_epoch
        self.nn1.verbose=0
        self.nn1.fit(X1, y1)
        y1_pred = self.nn1.predict_proba(X_test)   
        
        print 'working on nn2...'
        self.nn2.max_epochs = self.early_stopping2.best_valid_epoch
        self.nn2.verbose=0        
        self.nn2.fit(X2, y2)
        y2_pred = self.nn2.predict_proba(X_test)
           
        # chain rule: P(class) = P(branch) * P(class | branch).
        # NOTE(review): the column order below hard-codes which original class
        # maps to which sub-model output (Otto 9-class layout, presumably) —
        # confirm against the training data before changing anything here.
        y_pred = np.zeros((y0_pred.shape[0], 9))
        y_pred[:,0] = y0_pred[:,0]*y2_pred[:,0]
        y_pred[:,1] = y0_pred[:,1]*y1_pred[:,0]
        y_pred[:,2] = y0_pred[:,1]*y1_pred[:,1]
        y_pred[:,3] = y0_pred[:,1]*y1_pred[:,2]
        y_pred[:,4] = y0_pred[:,0]*y2_pred[:,1]
        y_pred[:,5] = y0_pred[:,0]*y2_pred[:,2]
        y_pred[:,6] = y0_pred[:,0]*y2_pred[:,3]
        y_pred[:,7] = y0_pred[:,0]*y2_pred[:,4]
        y_pred[:,8] = y0_pred[:,0]*y2_pred[:,5]
        yp0 = y_pred
        
        # average with the calibrated flat classifier's probabilities
        self.cal_clf.fit(X, y)        
        yp1 = self.cal_clf.predict_proba(X_test)
        y_pred = (yp0 + yp1)/2.        
        
        return y_pred       
        


        

        
开发者ID:chrinide,项目名称:kaggle_otto_group,代码行数:55,代码来源:clf_nolearn_2_levels_cal.py

示例7: test_label_encoder_fit_transform

# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def test_label_encoder_fit_transform():
    """fit_transform should fit and encode in a single call."""
    enc = LabelEncoder()
    assert_array_equal(enc.fit_transform([1, 1, 4, 5, -1, 0]),
                       [2, 2, 3, 4, 0, 1])

    enc = LabelEncoder()
    assert_array_equal(enc.fit_transform(["paris", "paris", "tokyo", "amsterdam"]),
                       [1, 1, 2, 0])
开发者ID:MarkyV,项目名称:scikit-learn,代码行数:11,代码来源:test_preprocessing.py

示例8: label_encoding

# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
 def label_encoding(self, x: pd.DataFrame, y: pd.DataFrame, services: list) -> (pd.DataFrame, pd.DataFrame):
     """Label-encode the categorical columns of x and the target y.

     'service' is encoded with an encoder fitted on the full `services`
     vocabulary; 'protocol_type' and 'flag' are each fitted on their own
     observed values.  Returns the encoded (x, y).
     """
     # BUG FIX: a single LabelEncoder was reused and re-fitted for every
     # column, silently discarding the `services` fit; use one per column.
     service_le = LabelEncoder().fit(services)
     x['service'] = service_le.transform(x['service'])
     for feature in ["protocol_type", "flag"]:
         x[feature] = LabelEncoder().fit_transform(x[feature])
     label_le = LabelEncoder()
     y = label_le.fit_transform(y)
     # same output as before: the last fit was on y, so classes_ are y's
     print(label_le.classes_)
     return x, y
开发者ID:qianFX,项目名称:final_project,代码行数:11,代码来源:kdc.py

示例9: X_train_generatetor_infinite

# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def X_train_generatetor_infinite(dim=128,maxlen=500,batch_size=128,name="X_train.csv",events=None):
    """Infinite batch generator over `name`, yielding
    ([app-label sequences, phone_brand, device_model],
     [one-hot group, gender, log(age)]) batches for Keras fit_generator.

    `events` maps device_id -> space-separated app-label indices.
    NOTE(review): relies on a module-level `path` pointing at the data dir.
    """
    X_train = pd.read_csv(path+name)
    group_le = LabelEncoder()
    group_lb = LabelBinarizer()
    # fit both group encoders once over the whole file; the transformed
    # values themselves are not needed yet
    labels = group_le.fit_transform(X_train['group'].values)
    labels = group_lb.fit_transform(labels)
    del labels

    ##################
    #   Phone Brand
    ##################
    phone_brand_device_model = pd.read_csv(path+'phone_brand_device_model.csv',
                    dtype={'device_id': np.str})
    phone_brand_device_model.drop_duplicates('device_id', keep='first', inplace=True)
    phone_brand_le = LabelEncoder()
    phone_brand_device_model['phone_brand'] = phone_brand_le.fit_transform(phone_brand_device_model['phone_brand'])

    # BUG FIX: device_model was previously encoded with phone_brand_le
    # (re-fitting it) while device_model_le sat unused; encoded values are
    # unchanged.
    device_model_le = LabelEncoder()
    phone_brand_device_model['device_model'] = device_model_le.fit_transform(phone_brand_device_model['device_model'])

    while 1:
        data = pd.read_csv(path+name,iterator=True,chunksize=batch_size,
                    dtype={'device_id': np.str})
        for X_train in data:
            X_train = pd.merge(X_train,phone_brand_device_model,how='left',on='device_id', left_index=True)
            phone_brand = X_train['phone_brand'].values
            device_model = X_train['device_model'].values

            X_train["app_lab"] = X_train["device_id"].map(events)
            y_train = X_train['group'].values

            # BUG FIX: chained assignment (X_train['gender'][mask] = v) can
            # silently write to a copy after the merge; use .loc instead
            X_train.loc[X_train['gender']=='M', 'gender'] = 1
            X_train.loc[X_train['gender']=='F', 'gender'] = 0

            y_train_gender = X_train['gender'].values
            y_train_age = X_train['age'].values
            # take log transformation
            y_train_age = np.log(y_train_age)

            X_train.fillna('0 ',inplace=True)
            y_train = group_le.transform(y_train)
            y_train = group_lb.transform(y_train)
            x_train = X_train["app_lab"].values
            x_train = [ x.split(' ') for x in  x_train]
            for i in range(len(x_train)):
                x_train[i] = [ np.int8(idx) for idx in x_train[i] if (idx!='nan' and idx!='')]

            x_train = sequence.pad_sequences(x_train, maxlen=maxlen)

            x_train = [x_train,phone_brand,device_model]
            y_train = [y_train,y_train_gender,y_train_age]

            yield (x_train,y_train)
开发者ID:chu-NMSU,项目名称:Talking-Data,代码行数:58,代码来源:benchmark_cnn.py

示例10: process_data

# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def process_data(trainDF, testDF):
	"""Feature engineering for the SF-crime frames.

	Drops unused columns, derives time/address features, builds per-address
	log-odds features via counts learning, scales X/Y, and label-encodes the
	categorical columns.  Returns (train, test, y, classes) where y is the
	encoded 'Category' and classes is the encoder's class list.
	Mutates trainDF/testDF in place (drop with inplace=True).
	"""
	# drop the columns not usable for prediction, then merge both frames so
	# feature extraction is consistent across train and test
	trainDF.drop(['Descript', 'Resolution'], axis=1, inplace=True)
	testDF.drop(['Id'], axis=1, inplace=True)
	y = trainDF['Category'].copy()	
	combi = pd.concat([trainDF.drop(['Category'], axis=1),  testDF])
	
	combi['Month'], combi['Day'], combi['Hour'] = zip(*combi['Dates'].apply(extract_time))
	combi.drop(['Dates'], axis=1, inplace=True)
	# intersection addresses contain a '/', e.g. "OAK ST / LAGUNA ST"
	combi['intesect'] = combi['Address'].apply(lambda x: 1 if '/' in x else 0)
	combi['Wake'] = combi['Hour'].apply(lambda x: 1 if (int(x)>=8 and int(x)<=23) else 0)
	addresses = sorted(combi['Address'].unique())
	categories = sorted(trainDF['Category'].unique())
	addr_counts = combi.groupby('Address').size()
	cat_counts = trainDF.groupby('Category').size()
	addr_cat_counts = trainDF.groupby(['Address', 'Category']).size()
	# counts-learning log-odds features for address/category, see
	# https://msdn.microsoft.com/en-us/library/azure/dn913056.aspx
	logoddsPA = {}
	logodds = {}
	PA = cat_counts/float(len(trainDF))
	default_logodds = np.log(PA/(1-PA))
	for addr in addresses:
		PA = addr_counts[addr]/float(len(combi))
		logoddsPA[addr] = np.log(PA/(1.0-PA))
		logodds[addr] = deepcopy(default_logodds)
		if addr in addr_cat_counts.keys():
			for cat in addr_cat_counts[addr].keys():
				# require >=2 observations and not the whole address count
				if addr_cat_counts[addr][cat] >= 2 and addr_cat_counts[addr][cat] < addr_counts[addr]:
					PA = addr_cat_counts[addr][cat] / float(addr_counts[addr])
					logodds[addr][categories.index(cat)] = np.log(PA/(1.0-PA))
		logodds[addr] = pd.Series(logodds[addr])
		logodds[addr].index = range(len(categories))
	combi['LogoddsPA'] = combi['Address'].apply(lambda x: logoddsPA[x])
	logodds_features = combi['Address'].apply(lambda x: logodds[x])
	# BUG FIX: was `colums`, a typo that silently created a dead attribute and
	# left the generated log-odds columns unnamed
	logodds_features.columns = ["logodds"+str(x) for x in range(len(categories))]
	combi_full = pd.concat([combi, logodds_features], axis=1)
	xy_scaler = StandardScaler()
	combi_full[['X', 'Y']] = xy_scaler.fit_transform(combi_full[['X', 'Y']])
	# label-encode the remaining categorical columns
	lbe = LabelEncoder()
	combi_full['DayOfWeek'] = lbe.fit_transform(combi_full['DayOfWeek'])
	combi_full['PdDistrict'] = lbe.fit_transform(combi_full['PdDistrict'])
	combi_full['Wake'] = combi_full['Hour'].apply(lambda x: 1 if (int(x)>=8 and int(x)<=23) else 0)
	# NOTE(review): take_last was removed in pandas>=0.17 (use keep='last');
	# kept as-is for compatibility with the pandas version this file targets
	combi_full["IsDup"]=pd.Series(combi_full.duplicated()|combi_full.duplicated(take_last=True)).apply(int)
	combi_full.drop(['Address'], axis=1, inplace=True)
	y = lbe.fit_transform(y)
	# xgboost handles integer-coded categoricals, so no dummy encoding needed
	train = combi_full.values[:878049, :]
	test = combi_full.values[878049:, :]

	return train, test, y, lbe.classes_
开发者ID:moment0517,项目名称:kaggle,代码行数:56,代码来源:san+francisco+crime+classification.py

示例11: execute

# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
 def execute(self,data):
     print 'started label encoding step'
     le = LabelEncoder()
     output_array = le.fit_transform(data[self.column_list[0]])
     for i in range(1,len(self.column_list)):
         output_array=np.column_stack([output_array,le.fit_transform(data[self.column_list[i]])])
     otherCols = set(data.columns).difference(set(self.column_list))
     df1 = data[list(otherCols)]
     df2 = pd.DataFrame(output_array,columns=self.column_list)
     df1 = df1.join(df2,how='left')
     print 'finished label encoding step'
     return df1
开发者ID:AnantaData,项目名称:AnantaFramework,代码行数:14,代码来源:data_transformation.py

示例12: fit_transform

# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
    def fit_transform(self, dframe):
        """
        Fit one LabelEncoder per column and return the encoded DataFrame.

        Access individual column classes via indexing
        `self.all_classes_`

        Access individual column encoders via indexing
        `self.all_encoders_`

        Access individual column encoded labels via indexing
        `self.all_labels_`
        """
        df = dframe.copy()
        # BUG FIX: the no-columns branch previously never allocated
        # `all_encoders_`/`all_labels_` (AttributeError on assignment), and
        # the columns branch stored the *encoder* in `all_labels_` instead of
        # the encoded labels its own docstring promises.  Both branches now
        # share one code path.
        if self.columns is None:
            # no columns specified; assume all are to be encoded
            self.columns = df.iloc[:, :].columns
        # one slot per column; shapes match `self.columns`
        self.all_classes_ = np.ndarray(shape=self.columns.shape,
                                       dtype=object)
        self.all_encoders_ = np.ndarray(shape=self.columns.shape,
                                        dtype=object)
        self.all_labels_ = np.ndarray(shape=self.columns.shape,
                                      dtype=object)
        for idx, column in enumerate(self.columns):
            # fresh encoder per column so each mapping stays retrievable
            le = LabelEncoder()
            labels = le.fit_transform(df.loc[:, column].values)
            df.loc[:, column] = labels
            self.all_classes_[idx] = (column,
                                      np.array(le.classes_.tolist(),
                                               dtype=object))
            self.all_encoders_[idx] = le
            self.all_labels_[idx] = labels
        return df
开发者ID:jmwoloso,项目名称:avalearn,代码行数:52,代码来源:encoder.py

示例13: data_preprocess

# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def data_preprocess(df):
    """Encode the crime frame into (X, y, feature_names): label-encode
    STREET and CITY, drop the bookkeeping columns, and binarize CrimeCat."""
    drop_cols = ['Unnamed: 0', 'STATION_NAME',
                 'STATISTICAL_CODE_DESCRIPTION', 'CrimeCat']
    df['STREET'] = df['STREET'].apply(get_rid_num)
    df['ZIP'] = df['ZIP'].apply(int)
    encoder = LabelEncoder()
    for col in ('STREET', 'CITY'):
        df[col] = encoder.fit_transform(df[col])
    features = df.drop(drop_cols, axis=1)
    feature_names = features.columns
    X = features.values
    y = LabelBinarizer().fit_transform(df['CrimeCat'].values)
    return X, y, feature_names
开发者ID:livenb,项目名称:crime_prediction,代码行数:16,代码来源:classification.py

示例14: __call__

# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
    def __call__(self, X_train, X_test, y_train, y_test):
        """Cluster the pooled data with k-means and return a weighted purity
        score in [0, 1], where label/cluster affinity is smoothed by a
        Gaussian kernel over pairwise cluster-centre distances."""
        data = np.vstack([X_train, X_test])
        labels = np.hstack([y_train, y_test])
        labels = LabelEncoder().fit_transform(labels)

        kmeans = KMeans(
            n_clusters=len(np.unique(labels)),
            n_init=self.kmeans__n_init,
            random_state=self.random_state,
        )
        kmeans.fit(data)

        centre_dists = distance.cdist(kmeans.cluster_centers_, kmeans.cluster_centers_)
        kernel = np.exp(-centre_dists / (self.sig**2))

        counts = confusion_matrix(labels, kmeans.labels_)

        # loop kept (rather than vectorized) to preserve the original
        # floating-point accumulation order exactly
        weighted = np.zeros(kernel.shape)
        k = weighted.shape[0]
        for label in range(k):
            for cluster in range(k):
                for other in range(k):
                    weighted[label, cluster] += kernel[label, cluster] * counts[label, other]

        return weighted.max(axis=0).sum() / weighted.sum()
开发者ID:svecon,项目名称:metric-learn,代码行数:27,代码来源:wpurity.py

示例15: test_multiclass_classifier_class_weight

# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import fit_transform [as 别名]
def test_multiclass_classifier_class_weight():
    """Multiclass SAG with per-class weights must match a hand-rolled
    one-vs-rest reference implementation (dense and sparse inputs)."""
    alpha = .1
    n_samples = 20
    tol = .00001
    max_iter = 50
    class_weight = {0: .45, 1: .55, 2: .75}
    fit_intercept = True
    # three tight, well-separated Gaussian blobs -> an easy 3-class problem
    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)

    # fit the sklearn SAG solver on dense and on sparse input
    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=max_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              class_weight=class_weight)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    # expand the per-class weights into per-sample weights for the reference
    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = class_weight_[le.fit_transform(y)]

    # reference: one-vs-rest fits computed by the hand-rolled sag_sparse
    coef1 = []
    intercept1 = []
    coef2 = []
    intercept2 = []
    for cl in classes:
        # +1/-1 encoding for the current one-vs-rest split
        y_encoded = np.ones(n_samples)
        y_encoded[y != cl] = -1

        spweights1, spintercept1 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight)
        spweights2, spintercept2 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight,
                                              sparse=True)
        coef1.append(spweights1)
        intercept1.append(spintercept1)
        coef2.append(spweights2)
        intercept2.append(spintercept2)

    coef1 = np.vstack(coef1)
    intercept1 = np.array(intercept1)
    coef2 = np.vstack(coef2)
    intercept2 = np.array(intercept2)

    # sklearn's multiclass SAG must agree with the reference per class,
    # within loose decimal tolerances (stochastic solver)
    for i, cl in enumerate(classes):
        assert_array_almost_equal(clf1.coef_[i].ravel(),
                                  coef1[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1)

        assert_array_almost_equal(clf2.coef_[i].ravel(),
                                  coef2[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)
开发者ID:AlexisMignon,项目名称:scikit-learn,代码行数:62,代码来源:test_sag.py


注:本文中的sklearn.preprocessing.LabelEncoder.fit_transform方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。