本文整理汇总了 Python 中 sklearn.preprocessing.LabelEncoder.classes_ 属性的典型用法代码示例。如果您正苦于以下问题：Python LabelEncoder.classes_ 属性的具体用法？Python LabelEncoder.classes_ 怎么用？Python LabelEncoder.classes_ 使用的例子？那么恭喜您，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在类 sklearn.preprocessing.LabelEncoder 的用法示例。
在下文中一共展示了LabelEncoder.classes_方法的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: data
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import classes_ [as 别名]
def data(fold=False):
    """Download the iccv09Data archive and return its images, label maps and classes.

    The archive is fetched through ``df.zoo.download`` and unpacked next to the
    downloaded file; every image under ``iccv09Data/images/`` is loaded
    together with its ``.regions.txt`` label map.

    :param fold: ``False`` (default) to get the complete dataset. Any other
        value is used as a fold number; note that ``0`` is a valid fold because
        the check is an identity test against ``False``. The held-out slice is
        ``[fold * ntest(), (fold + 1) * ntest())`` (``ntest`` is defined
        elsewhere -- presumably the test-fold size).
    :return: ``(X, y, le)`` when ``fold is False``, otherwise
        ``((Xtr, ytr), (Xte, yte), le)``. ``le`` is a ``LabelEncoder`` whose
        ``classes_`` are the region names when scikit-learn can be imported,
        and the bare array of names otherwise.
    """
    archive = df.zoo.download('http://dags.stanford.edu/data/iccv09Data.tar.gz')
    root = _p.dirname(archive)
    imgdir = 'iccv09Data/images/'

    # Pulling members out one at a time in memory is far too slow for this
    # dataset, so the complete archive is unpacked to disk instead.
    with _taropen(archive, 'r') as tar:
        tar.extractall(root)
        ids = []
        for member in tar.getnames():
            if member.startswith(imgdir):
                # Keep the bare file name without its 4-character extension.
                ids.append(_p.basename(member)[:-4])

    image_paths = [_p.join(root, imgdir, name) + '.jpg' for name in ids]
    label_paths = [_p.join(root, 'iccv09Data/labels', name) + '.regions.txt'
                   for name in ids]
    X = [imread(path) for path in image_paths]
    y = [_np.loadtxt(path, dtype=_np.int32) for path in label_paths]

    # Only the region labels are exposed; the other label types are ignored.
    names = _np.array(['sky', 'tree', 'road', 'grass', 'water', 'building',
                       'mountain', 'foreground', 'object'])
    try:
        from sklearn.preprocessing import LabelEncoder
    except ImportError:
        # scikit-learn is optional: fall back to the plain array of names.
        le = names
    else:
        le = LabelEncoder()
        le.classes_ = names

    if fold is False:
        return X, y, le

    lo = fold * ntest()
    hi = (fold + 1) * ntest()
    train_split = (X[:lo] + X[hi:], y[:lo] + y[hi:])
    test_split = (X[lo:hi], y[lo:hi])
    return train_split, test_split, le
示例2: encode
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import classes_ [as 别名]
def encode(df, dump=fromPickle):
    """
    Label-encode the categorical columns of a cleaned dataframe.

    Takes in: dataframe from clean_col
    Returns: a dataframe that LabelEncodes the categorical variables,
             restricted to ``final_cols`` (in that order) with a fresh
             0..n-1 index.

    Every column in ``lblColumns`` that also appears in ``final_cols`` is
    replaced by its integer codes. The columns are overwritten on the ``df``
    that was passed in, so the caller's dataframe is modified as well.

    When ``dump`` is truthy the fitted classes of each column are cached in
    ``<modelPath>/<col>.npy``: an existing file is loaded instead of fitting,
    otherwise the encoder is fitted and its classes are saved.
    """
    for col in lblColumns:
        if col not in final_cols:
            continue
        le = LabelEncoder()
        if dump:
            fName = "%s/%s.npy" % (modelPath, col)
            if os.path.isfile(fName):
                # Reuse the stored classes so the integer codes stay the same
                # between runs.
                # NOTE(review): if the classes were saved as an object array,
                # recent numpy versions need allow_pickle=True here -- confirm.
                le.classes_ = np.load(fName)
            else:
                le.fit(df[col])
                np.save(fName, le.classes_)
        else:
            le.fit(df[col])
        df[col] = le.transform(df[col])
    # Order columns with logprice as the last column
    df = df[final_cols]
    # drop=True discards the old index directly; the previous
    # reset_index().drop('index') failed for a named index and could drop a
    # real column called 'index'.
    df = df.reset_index(drop=True)
    return df
示例3: restore
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import classes_ [as 别名]
def restore(self, model_path):
    '''
    Restore a saved multiencoder from path using npz file, by reconstructing the LabelEncoders with the classes.
    Restore the X header too.

    Reads ``<model_path>/encoder.npz`` (one array of classes per encoded
    column) and ``<model_path>/header.npz`` (key ``header``), then sets
    ``self.header``, ``self.encoders`` and ``self.columns``.

    Returns ``self`` so the call can be chained.
    '''
    path = model_path + '/encoder.npz'
    h_path = model_path + '/header.npz'
    # np.load on an .npz archive keeps the file open until it is closed, so
    # both archives are read inside a with-block.
    with np.load(path) as npzfile, np.load(h_path) as h_npzfile:
        self.header = h_npzfile['header']
        self.encoders = {}
        for k, v in npzfile.items():
            # A LabelEncoder is fully described by its classes_ array.
            le = LabelEncoder()
            le.classes_ = v
            self.encoders[k] = le
    self.columns = list(self.encoders)
    return self
示例4: learn_sentdist
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import classes_ [as 别名]
def learn_sentdist(clean_pcc,
                   feature_list=None,
                   label_features=None):
    """ Train a classifier for the sentence distance of arguments from a connective

    A random forest is cross validated with 5 folds (the scores are printed)
    and afterwards fitted on the complete data set.

    :param clean_pcc: Cleaned PCC data, no NaNs
    :type clean_pcc: pd.DataFrame
    :param feature_list: list of features that shall be calculated with discourse_connective_text_featurizer
    :param label_features: list of features that have to be encoded as labels
    :return: trained classifier, score array and label encoder
    :rtype: tuple
    """
    print('Calculating features...')

    def featurizer(sents, conn_pos):
        # Bind the requested feature list to the text featurizer.
        return discourse_connective_text_featurizer(sents, conn_pos,
                                                    feature_list=feature_list)

    features = sentdist_feature_dataframe(clean_pcc, featurizer)
    print('Calculated all features')

    # The non-numerical label columns share one encoder. LabelEncoder wants a
    # one-dimensional array, hence the flattening.
    label_values = features[label_features].values.ravel()
    le = LabelEncoder()
    le.fit(label_values)
    # Extra class used for labels that were not seen during fitting.
    le.classes_ = np.append(le.classes_, '<unknown>')
    features = encode_label_features(features, le, label_features)

    target = clean_pcc['sentence_dist']

    print('Cross validating classifier...')
    clf = RandomForestClassifier(min_samples_leaf=5, n_jobs=-1, verbose=0)
    scores = cross_val_score(clf, features, target, cv=5)
    print('Cross validated classifier\nscores: %s\nmean score: %f' % (str(scores), scores.mean()))

    print('Learning classifier on the whole data set...')
    clf.fit(features, target)
    print('Learned classifier on the whole data set')
    return clf, scores, le
示例5: create_kaggle_submission
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import classes_ [as 别名]
def create_kaggle_submission(prob, ids_raw, score=None, threshold=0.0):
    """
    Build a submission dataframe from per-user class probabilities and write it to CSV.

    For every user the classes whose probability is at least ``threshold``
    are ranked by descending probability; the best five are decoded with a
    ``LabelEncoder`` whose classes are ``COUNTRY_CLASSES`` and emitted as one
    ``(id, country)`` row each. Users without any class reaching the
    threshold contribute no rows.

    :param prob: per-user probability vectors; ``prob[i]`` belongs to
        ``ids_raw[i]`` and position ``j`` is decoded as class ``j``.
    :param ids_raw: user ids, indexable by ``0 .. len(ids_raw) - 1``.
    :param score: if truthy, the file is named ``submission_<score>.csv``
        (four decimals); otherwise a timestamp is used.
    :param threshold: minimum probability for a class to be considered.
    :return: the submission dataframe with columns ``id`` and ``country``.
    :raises ValueError: if an id is null.
    """
    le = LabelEncoder()
    le.classes_ = COUNTRY_CLASSES

    ids = []  # list of ids
    cts = []  # list of countries
    for i in range(len(ids_raw)):
        idx = ids_raw[i]
        if pd.isnull(idx):
            # A null id cannot be submitted; fail loudly instead of dropping
            # into a debugger.
            raise ValueError('null id at position %d' % i)
        ranked = sorted(
            [(j, k) for j, k in enumerate(prob[i]) if k >= threshold],
            key=lambda x: x[1], reverse=True)
        # Slicing an empty ranking is fine, so users with no class above the
        # threshold are simply skipped. Only the top five are decoded.
        sub_countries = [le.inverse_transform(j) for j, _ in ranked[:5]]
        ids += [idx] * len(sub_countries)
        cts += sub_countries

    # spot check that the most likely class is not the same for everybody
    irene = [np.argmax(i) for i in prob]
    print(pd.Series(irene).value_counts())

    # Generate submission
    sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
    if score:
        sub.to_csv('submission_%.4f.csv' % score, index=False)
    else:
        # No score given: fall back to a datetime-stamped file name.
        date_str = datetime.datetime.now().strftime('%y%m%d_%H%M')
        sub.to_csv('submission_%s.csv' % date_str, index=False)
    return sub
示例6: LabelEncoder
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import classes_ [as 别名]
# Encode Tag1 as integers, one-hot encode it, evaluate a Naive Bayes model on
# it and write predictions for the test set. Relies on `train`, `test`, `nb`,
# `cross_val_score` and `pd` being defined earlier.
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# convert Tag1 from strings to integers
le = LabelEncoder()
train['Tag1_enc'] = le.fit_transform(train.Tag1)

# confirm that the conversion worked
train.Tag1.value_counts().head()
train.Tag1_enc.value_counts().head()

# create a dummy column for each value of Tag1_enc (returns a sparse matrix)
ohe = OneHotEncoder()
tag1_dummies = ohe.fit_transform(train[['Tag1_enc']])
tag1_dummies

# try a Naive Bayes model with tag1_dummies as the features
cross_val_score(nb, tag1_dummies, train.OpenStatus, scoring='log_loss', cv=10).mean() # 0.650

# adjust Tag1 on testing set since LabelEncoder errors on new values during a transform.
# The known tags are collected in a set once, so each row costs a hash lookup
# instead of a scan over the whole classes_ array.
known_tags = set(le.classes_)
test['Tag1'] = test['Tag1'].map(lambda s: s if s in known_tags else '<unknown>')
le.classes_ = np.append(le.classes_, '<unknown>')

# apply the same encoding to the actual testing data and make predictions
test['Tag1_enc'] = le.transform(test.Tag1)
oos_tag1_dummies = ohe.transform(test[['Tag1_enc']])
nb.fit(tag1_dummies, train.OpenStatus)
oos_pred_prob = nb.predict_proba(oos_tag1_dummies)[:, 1]
sub = pd.DataFrame({'id':test.index, 'OpenStatus':oos_pred_prob}).set_index('id')
sub.to_csv('sub5.csv') # 0.649
示例7: learn_main_arg_node
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import classes_ [as 别名]
def learn_main_arg_node(node_df,
syntax_dict,
node_dict,
precalc_features=None,
feature_list=None,
label_features=None):
""" Learn a classifier for a node being arg0 or arg1
NOTE: the end of this function (including its return statement) is not part
of this listing; the return description below is the original author's.
:param node_df: node data with tree and node ids
:type node_df: pd.DataFrame
:param syntax_dict: to look up the syntax trees by their id
:type syntax_dict: dict
:param node_dict: to look up the nodes by their id
:type node_dict: dict
:param precalc_features: precalculated features to save computation time in development
:type precalc_features: pd.DataFrame
:param feature_list: Names of the features that shall be calculated
:type feature_list: list
:param label_features: Names of features that are discrete
:type label_features: list
:return: All data that is needed to classify new data with the classifiers
LogisticRegression classifiers from scikit learn, the list of features and label
features, as well as encoders for the labels and a binary encoder and a featurizer method
{'logit_arg0_clf': logit_arg0_clf,
'logit_arg1_clf': logit_arg1_clf,
'feature_list': feature_list,
'label_features': label_features,
'label_encoder': le,
'binary_encoder': ohe,
'node_featurizer': featurizer}
:rtype: dict
"""
# Closure that computes the feature dataframe with the feature_list given to
# this call.
def featurizer(node_df, syntax_dict, node_dict):
return node_feature_dataframe(node_df, node_featurizer,
syntax_dict=syntax_dict,
node_dict=node_dict,
feature_list=feature_list)
# Features are only computed when the caller did not pass precalculated ones.
if precalc_features is None:
print 'Calculating features'
features = featurizer(node_df, syntax_dict, node_dict)
print 'done'
else:
features = precalc_features
# We need to encode the non-numerical labels
print 'Encoding labels...'
le = LabelEncoder()
# LabelEncoder only deals with 1 dim np.arrays
le.fit(features[label_features].values.ravel())
# Dealing with unknowns: append an extra '<unknown>' class after fitting
le.classes_ = np.append(le.classes_, '<unknown>')
encoded_features = encode_label_features(features, le, label_features)
print 'Encoded label'
# We need to binarize the data for logistic regression
print 'Binarizing features for logistic regression...'
# sparse=False: the one-hot encoder is asked for dense output
ohe = OneHotEncoder(sparse=False)
ohe.fit(encoded_features[label_features].values)
logit_features = binarize_features(encoded_features, ohe, label_features)
print 'Binarized features.'
print 'Training classifiers for arg0 labeling'
print '======================================'
# float() so that the baseline below is not an integer division
nr_of_nodes = float(len(node_df))
# Share of nodes that are not arg0 (assumes is_arg0_node holds 0/1 or booleans)
baseline = (nr_of_nodes - sum(node_df['is_arg0_node'])) / nr_of_nodes
print 'Majority baseline: %f' % baseline
print 'Cross validating Logistic regression classifier...'
# C is the inverse of the regularization strength
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
logit_arg0_clf = LogisticRegression(C=1.0)
scores = cross_val_score(logit_arg0_clf, logit_features,
node_df['is_arg0_node'], cv=5)
print 'Cross validated Logistic Regression classifier\nscores: %s\nmean score: ' \
'%f' % (str(scores), scores.mean())
print ''
print 'Training classifiers for arg1 labeling'
print '======================================'
# Same baseline computation for the arg1 target
baseline = (nr_of_nodes - sum(node_df['is_arg1_node'])) / nr_of_nodes
print 'Majority baseline: %f' % baseline
print 'Cross validating Logistic regression classifier...'
# C is the inverse of the regularization strength
logit_arg1_clf = LogisticRegression(C=1.0)
scores = cross_val_score(logit_arg1_clf, logit_features,
node_df['is_arg1_node'], cv=5)
print 'Cross validated Logistic Regression classifier\nscores: %s\nmean score: ' \
'%f' % (
str(scores), scores.mean())
# After cross validation both classifiers are fitted on all rows.
print 'Learning classifiers on the whole data set...'
logit_arg0_clf.fit(logit_features, node_df['is_arg0_node'])
logit_arg1_clf.fit(logit_features, node_df['is_arg1_node'])
print 'Learned classifier on the whole data set'
# ToDo: Design features (see Lin et al p. 17, Connective_syntactic!)
# ToDo: Evaluate this method (remember not to count punctuation)
# ToDo: Get baseline by labeling everything after the connective as
# ......... part of the code is omitted here .........
示例8: LabelEncoder
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import classes_ [as 别名]
if __name__ == '__main__':
    # ROS node initialization
    rospy.init_node('clustering', anonymous=True)

    # Subscriber: incoming point clouds are handed to pcl_callback
    pcl_sub = rospy.Subscriber("/sensor_stick/point_cloud", pc2.PointCloud2, pcl_callback, queue_size=1)

    # Publishers
    pcl_objects_pub = rospy.Publisher("/pcl_objects", pc2.PointCloud2, queue_size=1)
    pcl_table_pub = rospy.Publisher("/pcl_table", pc2.PointCloud2, queue_size=1)
    pcl_cluster_pub = rospy.Publisher("/pcl_cluster", pc2.PointCloud2, queue_size=1)
    object_markers_pub = rospy.Publisher("/object_markers", Marker, queue_size=1)
    detected_objects_pub = rospy.Publisher("/detected_objects", DetectedObjectsArray, queue_size=1)

    # Load the trained model from disk. The file is opened in a with-block so
    # the handle is closed as soon as the pickle has been read.
    # NOTE: unpickling executes code from the file -- only load a trusted model.sav.
    with open('model.sav', 'rb') as model_file:
        model = pickle.load(model_file)
    clf = model['classifier']
    # The label encoder is rebuilt from the classes stored with the model.
    encoder = LabelEncoder()
    encoder.classes_ = model['classes']
    scaler = model['scaler']

    # Initialize color_list
    get_color_list.color_list = []

    # Spin while node is not shutdown
    while not rospy.is_shutdown():
        rospy.spin()
示例9: LabelEncoder
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import classes_ [as 别名]
cosine = F.linear(x_norm, w_norm, None)
out = cosine * self.scale
return out
# Command-line arguments. `encoder` is the .npy file holding the label-encoder
# classes and `x_test` the CSV that is evaluated; `resume` is presumably the
# checkpoint to load (it is not used in this part of the script).
resume, encoder, x_test = sys.argv[1], sys.argv[2], sys.argv[3]

# Fixed data locations and data-augmentation options.
train_folder = "/home/blcv/CODE/Kaggle/humpback_short_blażej/data/processed/train_bb_fastai2/"
test_folder = "/home/blcv/CODE/Kaggle/humpback_short_blażej/data/processed/test_bb_fastai2/"
test_df = "/home/blcv/CODE/Kaggle/humpback_whale_identification/data/processed/sample_submission.csv"
option_da = ['gray']# [] #

# Rebuild the label encoder from its saved classes.
label_encoder = LabelEncoder()
label_encoder.classes_ = np.load(encoder)

# Samples to evaluate and their loader (built in 'val' mode on train_folder).
X_test = pd.read_csv(x_test)
val_loader = getDataLoader(X_test, train_folder, 'val',
                           option_da=option_da,
                           image_size=224,
                           batch_size=64)

# model preparation
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model_name = 'se_resnext101_32x4d'
model = getattr(pretrainedmodels, model_name)(num_classes=1000, pretrained='imagenet')
model.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
# Replace the classification head: non-affine layer norm followed by the
# NormLinear layer with 5004 outputs (presumably the number of classes).
model.last_linear = nn.Sequential(
    nn.LayerNorm(model.last_linear.in_features, elementwise_affine=False),
    NormLinear(model.last_linear.in_features, 5004),
)
model = nn.DataParallel(model.to(device))
示例10: process_data
# 需要导入模块: from sklearn.preprocessing import LabelEncoder [as 别名]
# 或者: from sklearn.preprocessing.LabelEncoder import classes_ [as 别名]
def process_data(data_type='train', write_to_csv=False, return_df=True,
                 include_sessions=False):
    """Build the model-ready train and test dataframes from the raw CSV files.

    Train and test are concatenated so that the feature helpers
    (``add_date_cols``, ``parse_age``, ``add_categorical_cols``,
    ``fill_in_na``) see both at once, then split again at the original train
    length. The training target ``country_destination`` is label-encoded.

    :param data_type: passed through to ``add_categorical_cols``.
    :param write_to_csv: write the two dataframes to CSV files
        (``*_sessions.csv`` when ``include_sessions`` is set).
    :param return_df: return the dataframes; when false nothing is returned.
    :param include_sessions: left-join ``sessions_users.csv`` on the user id
        and fill missing values with 0.
    :return: ``(test_df, train_df)`` -- note the order -- or ``None`` when
        ``return_df`` is false.
    """
    train = pd.read_csv(TRAINING_DATA, header=0)
    test = pd.read_csv(TEST_DATA, header=0)

    # Keep target and ids aside; they must not go through feature processing.
    train_countries = train['country_destination']
    train_ids = train['id']
    test_ids = test['id']
    train.drop(['id', 'country_destination'], axis=1, inplace=True)
    test.drop(['id'], axis=1, inplace=True)

    # Number of training rows: where the combined frame is split again later.
    piv_train = train.shape[0]
    data = pd.concat((train, test), axis=0, ignore_index=True)

    # features to output into model training data
    nonnumeric_columns = [
        'gender',
        'signup_method',
        'signup_flow',
        'language',
        'affiliate_channel',
        'affiliate_provider',
        'first_affiliate_tracked',
        'signup_app',
        'first_device_type',
        'first_browser',
    ]

    add_date_cols(data)
    parse_age(data)
    data = add_categorical_cols(data, nonnumeric_columns, data_type)
    data = fill_in_na(data)

    vals = data.values
    X = vals[:piv_train]
    X_kaggle = vals[piv_train:]

    # fit_transform learns the classes from the training labels, so assigning
    # classes_ beforehand had no effect and was removed.
    # NOTE(review): if the fixed order of COUNTRY_CLASSES is required, the
    # encoding should use that order explicitly -- confirm.
    le = LabelEncoder()
    y = le.fit_transform(train_countries)

    train_df = pd.DataFrame(X, columns=data.columns)
    train_df['id'] = train_ids
    train_df['country_destination'] = y

    test_df = pd.DataFrame(X_kaggle, columns=data.columns)
    test_df['id'] = test_ids

    if include_sessions:
        sessions_df = pd.read_csv('sessions_users.csv')
        # Join on the id columns. left_index/right_index are boolean flags and
        # do not accept column names, so the old call joined on the row index
        # (or raised, depending on the pandas version).
        train_df = train_df.merge(sessions_df, how='left',
                                  left_on='id', right_on='user_id')
        train_df.fillna(0, inplace=True)
        test_df = test_df.merge(sessions_df, how='left',
                                left_on='id', right_on='user_id')
        test_df.fillna(0, inplace=True)

    if write_to_csv:
        if include_sessions:
            test_df.to_csv('test_sessions.csv', index=False)
            train_df.to_csv('train_sessions.csv', index=False)
            print('Wrote train_sessions.csv, test_sessions.csv')
        else:
            test_df.to_csv('test.csv', index=False)
            train_df.to_csv('train.csv', index=False)
            print('Wrote train.csv, test.csv')

    if return_df:
        return test_df, train_df