本文整理汇总了Python中sklearn.preprocessing.LabelBinarizer.fit_transform方法的典型用法代码示例。如果您正苦于以下问题:Python LabelBinarizer.fit_transform方法的具体用法?Python LabelBinarizer.fit_transform怎么用?Python LabelBinarizer.fit_transform使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.preprocessing.LabelBinarizer的用法示例。
在下文中一共展示了LabelBinarizer.fit_transform方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def main():
pipeline = Pipeline([
('vect', TfidfVectorizer(stop_words='english')),
('clf', LogisticRegression())
])
parameters = {
'vect__max_df': (0.25, 0.5),
'vect__ngram_range': ((1, 1), (1, 2)),
'vect__use_idf': (True, False),
'clf__C': (0.1, 1, 10),
}
os.chdir('C:\\Users\\Dan\\1) Python Notebooks\\Datasets')
df = pd.read_csv('data/train.tsv', header=0, delimiter='\t')
X, y = df['Phrase'], df['Sentiment'].as_matrix()
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')
lb = LabelBinarizer()
y_train = np.array([number[0] for number in lb.fit_transform(y_train)])
grid_search.fit(X_train, y_train)
print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid_search.predict(X_test)
lb = LabelBinarizer()
y_train = np.array([number[0] for number in lb.fit_transform(y_train)])
print 'Accuracy:', accuracy_score(y_test, predictions)
print 'Precision:', precision_score(y_test, predictions)
print 'Recall:', recall_score(y_test, predictions)
示例2: test_label_binarizer_multilabel
# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def test_label_binarizer_multilabel():
    """LabelBinarizer must round-trip multilabel inputs through fit_transform
    and inverse_transform."""
    # --- input given as sequences of tuples ---
    binarizer = LabelBinarizer()
    tuple_labels = [(2, 3), (1,), (1, 2)]
    indicator = np.array([[0, 1, 1],
                          [1, 0, 0],
                          [1, 1, 0]])
    transformed = binarizer.fit_transform(tuple_labels)
    assert_array_equal(indicator, transformed)
    assert_equal(binarizer.inverse_transform(transformed), tuple_labels)
    # --- input given as a label indicator matrix: inverse is the identity ---
    binarizer.fit(indicator)
    assert_array_equal(indicator,
                       binarizer.inverse_transform(indicator))
    # --- regression test for the two-class multilabel case ---
    binarizer = LabelBinarizer()
    list_labels = [[1, 0], [0], [1], [0, 1]]
    two_class_indicator = np.array([[1, 1],
                                    [1, 0],
                                    [0, 1],
                                    [1, 1]])
    transformed = binarizer.fit_transform(list_labels)
    assert_array_equal(two_class_indicator, transformed)
    # Compare as sets: inverse_transform does not guarantee element order.
    assert_equal([set(row) for row in binarizer.inverse_transform(transformed)],
                 [set(row) for row in list_labels])
示例3: initData
# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def initData(filename):
if not os.path.exists(filename):
print "I can't find this file: %s"%filename
sys.exit(1)
datareader = csv.reader(open(filename,'r'))
ct = 0;
for row in datareader:
ct = ct+1
datareader = csv.reader(open(filename,'r'))
data = np.array(-1*np.ones((ct,7),float),object);
k=0;
for row in datareader:
data[k,:] = np.array(row)
k = k+1;
#To modify
featnames = np.array(ATTRIBUTES,str)
keys = [[]]*np.size(data,1)
numdata = -1*np.ones_like(data);
nfeatures=[0]
featIndex=[]
# convert string objects to integer values for modeling:
for k in range(np.size(data,1)):
keys[k],garbage,numdata[:,k] = np.unique(data[:,k],True,True)
numrows = np.size(numdata,0); # number of instances in car data set
numcols = np.size(numdata,1); # number of columns in car data set
numdata = np.array(numdata,int)
xdata = numdata[:,:-1]; # x-data is all data BUT the last column which are the class labels
ydata = numdata[:,-1]; # y-data is set to class labels in the final column, signified by -1
# ------------------ numdata multilabel -> binary conversion for NB-Model ---------------------
lbin = LabelBinarizer();
for k in range(np.size(xdata,1)): # loop thru number of columns in xdata
if k==0:
xdata_ml = lbin.fit_transform(xdata[:,k]);
featIndex = lbin.classes_
nfeatures.append(len(lbin.classes_))
else:
xdata_ml = np.hstack((xdata_ml,lbin.fit_transform(xdata[:,k])))
featIndex= np.hstack((featIndex,lbin.classes_))
nfeatures.append(nfeatures[-1]+len(lbin.classes_))
if _VERBOSE:
print "nfeatures:"
print nfeatures
print "featIndex"
print featIndex
return xdata_ml,xdata,ydata,data,nfeatures,keys,featIndex
示例4: encode_categorical
# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def encode_categorical(cat, missing_value=False, option="binary"):
    """Encode a column of categorical values. For N unique categories:

    cat : the column of categorical values
    option = 'binary'    : binary (one-hot, orthogonal, thermometer) encoding - N features
             'freq'      : occurring frequency (percentage) - 1 feature
             'mis_float' : all binary encoded except missing values (vector of floats
                           corresponding to occurrence frequencies) - (N-1) features
             'mis_unif'  : all binary encoded except missing values (vector of floats
                           of uniform values) - (N-1) features
             'dummy'     : just like binary but one column is removed - (N-1) features
             'sum'       : sum (deviation) coding; like dummy but the dropped
                           category's rows are all -1 - (N-1) features

    Raises ValueError for an unknown option, when a 'mis_*' option is used
    without a missing_value, or when missing_value never occurs in cat.
    """
    # TODO: Encoding w.r.t. targets
    if option == "binary":
        lb = LabelBinarizer()
        encoded = lb.fit_transform(cat)
    elif option == "freq":
        # Replace each category with its relative frequency in the column.
        freq_count = itemfreq(cat)
        encoded = np.zeros(len(cat))
        for i in range(freq_count.shape[0]):
            encoded[cat == freq_count[i][0]] = float(freq_count[i][1]) / len(encoded)
    elif option == "mis_float":
        # NOTE(review): `missing_value == False` also matches 0; confirm that a
        # 0-valued missing marker is never needed here.
        if missing_value == False:
            raise ValueError("Provide a missing value for the option 'mis_float'.")
        else:
            lb = LabelBinarizer()
            encoded = lb.fit_transform(cat).astype(float)
            missing_bool = cat == missing_value
            if np.sum(missing_bool) == 0:
                raise ValueError("No such missing value!")
            # Drop the indicator column belonging to the missing category...
            encoded = np.delete(encoded, np.argmax(encoded[np.argmax(missing_bool), :]), axis=1)
            # ...and fill missing rows with each remaining category's frequency.
            encoded[missing_bool, :] = np.sum(encoded[~missing_bool], axis=0) / float(encoded[~missing_bool].shape[0])
    elif option == "mis_unif":
        if missing_value == False:
            # BUG FIX: this message used to say 'mis_float' in the 'mis_unif' branch.
            raise ValueError("Provide a missing value for the option 'mis_unif'.")
        else:
            lb = LabelBinarizer()
            encoded = lb.fit_transform(cat).astype(float)
            missing_bool = cat == missing_value
            if np.sum(missing_bool) == 0:
                raise ValueError("No such missing value!")
            # Drop the missing category's column; missing rows get a uniform vector.
            encoded = np.delete(encoded, np.argmax(encoded[np.argmax(missing_bool), :]), axis=1)
            encoded[missing_bool, :] = np.ones(encoded.shape[1]) * 1.0 / encoded.shape[1]
    elif option == "dummy":
        lb = LabelBinarizer()
        encoded = lb.fit_transform(cat)[:, 0:-1]
    elif option == "sum":
        lb = LabelBinarizer()
        encoded = lb.fit_transform(cat)
        last_col = encoded[:, -1].astype(bool)
        encoded = encoded[:, 0:-1]
        encoded[last_col, :] = -1
    else:
        raise ValueError("No such option!")
    print("Number of unique categorical values : %s" % encoded.shape[1])
    return encoded
示例5: test_multinomial_loss
# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def test_multinomial_loss():
    # test if the multinomial loss and gradient computations are consistent
    X, y = iris.data, iris.target.astype(np.float64)
    n_samples, n_features = X.shape
    n_classes = len(np.unique(y))
    rng = check_random_state(42)
    weights = rng.randn(n_features, n_classes)
    intercept = rng.randn(n_classes)
    sample_weights = rng.randn(n_samples)
    # in-place absolute value: sample weights must be non-negative
    np.abs(sample_weights, sample_weights)
    # compute loss and gradient like in multinomial SAG
    dataset, _ = make_dataset(X, y, sample_weights, random_state=42)
    loss_1, grad_1 = _multinomial_grad_loss_all_samples(dataset, weights,
                                                        intercept, n_samples,
                                                        n_features, n_classes)
    # compute loss and gradient like in multinomial LogisticRegression
    lbin = LabelBinarizer()
    Y_bin = lbin.fit_transform(y)  # one-hot encoding of the class labels
    # flatten weights + intercept into the layout _multinomial_loss_grad expects
    weights_intercept = np.vstack((weights, intercept)).T.ravel()
    loss_2, grad_2, _ = _multinomial_loss_grad(weights_intercept, X, Y_bin,
                                               0.0, sample_weights)
    grad_2 = grad_2.reshape(n_classes, -1)
    grad_2 = grad_2[:, :-1].T  # drop the intercept column of the gradient
    # comparison
    assert_array_almost_equal(grad_1, grad_2)
    assert_almost_equal(loss_1, loss_2)
示例6: ElasticNetClassifier
# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
class ElasticNetClassifier(LinearClassifierMixin, ElasticNet):
    """Class to extend elastic-net in case of classification."""

    def fit(self, X, y, check_input=True):
        # Encode labels as +1/-1 regression targets so the elastic-net
        # objective behaves like a (one-vs-rest) classification fit.
        self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
        Y = self._label_binarizer.fit_transform(y)
        if self._label_binarizer.y_type_.startswith('multilabel'):
            # we don't (yet) support multi-label classification in ENet
            raise ValueError(
                "%s doesn't support multi-label classification" % (
                    self.__class__.__name__))
        # Y = column_or_1d(Y, warn=True)
        super(ElasticNetClassifier, self).fit(X, Y)
        # LinearClassifierMixin expects coef_ shaped (n_classes, n_features)
        # for multiclass and (1, n_features) for the binary case.
        if self.classes_.shape[0] > 2:
            ndim = self.classes_.shape[0]
        else:
            ndim = 1
        self.coef_ = self.coef_.reshape(ndim, -1)
        return self

    @property
    def classes_(self):
        # Delegates to the binarizer fitted in fit(); raises AttributeError
        # (via the missing attribute) if called before fitting.
        return self._label_binarizer.classes_
示例7: bio_classification_report
# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def bio_classification_report(y_true, y_pred):
    # Token-level evaluation of BIO-tagged sequences against probabilistic
    # predictions: prints AUC figures, returns a classification report string.
    lb = LabelBinarizer()
    # NOTE(review): the `1 - ...` inversion assumes exactly two classes, so
    # fit_transform yields a single 0/1 column -- confirm with the callers.
    y_true_combined = 1 - lb.fit_transform(list(chain.from_iterable(y_true)))
    # predictions stay as a flat list of per-token scores/probabilities
    y_pred_combined = list(chain.from_iterable(y_pred))
    tagset = set(lb.classes_) - {'O'}  # drop the "outside" tag
    # sort by entity type first, then by the B-/I- prefix
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    print 'True sum %d Pred sum %d Len %d' %(sum(y_true_combined), sum(y_pred_combined), len(y_pred_combined))
    print "AUC\tP-R: %.4f\tROC: %.4f" % (average_precision_score(y_true_combined, y_pred_combined, average=None),
                                         roc_auc_score(y_true_combined, y_pred_combined, average=None))
    #plt.figure()
    #fpr, tpr, thr = roc_curve(y_true_combined, y_pred_combined)
    #area = auc(fpr, tpr)
    #plt.plot(fpr, tpr, label='{area:.3f}'.format( area=area))
    #plt.legend(loc=4)
    #plt.savefig('sub3.jpg')
    # Threshold the scores at 0.1 and compare against the re-inverted truth.
    return classification_report(
        1 - y_true_combined,
        [0 if v > 0.1 else 1 for v in y_pred_combined],
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
示例8: transform
# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def transform(self, data_dict):
    """Pull a decimal-number-plus-unit pattern out of the configured text
    column and one-hot encode the matched unit (empty string for no match)."""
    units = ["kilogram", "kg", "gram", "[GMgmkK]?Hz", "liter", "ml",
             "cup", "cm", "foot", "inch", "meter", "mg", "gallon", "milliliter", "[MGTmgtKk]B"]
    pattern = "[\d]+\.[\d]+(" + "[\b/,-]|".join(units) + ")"
    matched = (data_dict[self.key]
               .str.extract(pattern, flags=re.IGNORECASE, expand=False)
               .str.lower())
    binarizer = LabelBinarizer()
    return binarizer.fit_transform(matched.fillna(""))
示例9: full_matrix
# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def full_matrix(dropped):
    """Assemble the full sparse feature matrix from the `dropped` dataframe
    and persist it with joblib under 'pickle_jar/full_matrix'.

    NOTE(review): the matrix is only dumped to disk; the function returns
    None, so callers must reload it -- confirm this is intentional.
    """
    # create initial matrix
    print('starting with m0')
    lb = LabelBinarizer(sparse_output=True)
    # m = lb.fit_transform(dropped.restaurant_id)
    m = lb.fit_transform(dropped.user_name)
    print(m.shape)
    # build matrix
    # making nan its own category for categorical
    print("adding categorical to matrix")
    m = add_categorical_to_matrix(m, dropped, ['review_stars', 'user_name', 'restaurant_stars', 'restaurant_attributes_ages_allowed', 'restaurant_attributes_alcohol', 'restaurant_attributes_attire', 'restaurant_attributes_byob_corkage', 'restaurant_attributes_noise_level', 'restaurant_attributes_smoking', 'restaurant_attributes_wifi', 'restaurant_city', 'restaurant_hours_friday_close', 'restaurant_hours_friday_open', 'restaurant_hours_monday_close', 'restaurant_hours_monday_open', 'restaurant_hours_saturday_close', 'restaurant_hours_saturday_open', 'restaurant_hours_sunday_close', 'restaurant_hours_sunday_open', 'restaurant_hours_thursday_close', 'restaurant_hours_thursday_open', 'restaurant_hours_tuesday_close', 'restaurant_hours_tuesday_open', 'restaurant_hours_wednesday_close', 'restaurant_hours_wednesday_open', 'restaurant_ambience', 'restaurant_music', 'restaurant_parking', 'restaurant_street', 'restaurant_zipcode', 'inspection_year', 'inspection_month', 'inspection_day', 'inspection_dayofweek', 'inspection_quarter',])
    print(m.shape)
    print("adding bool to matrix")
    m = add_categorical_to_matrix(m, dropped, ['restaurant_attributes_accepts_credit_cards', 'restaurant_attributes_byob', 'restaurant_attributes_caters', 'restaurant_attributes_coat_check', 'restaurant_attributes_corkage', 'restaurant_attributes_delivery', 'restaurant_attributes_dietary_restrictions_dairy_free', 'restaurant_attributes_dietary_restrictions_gluten_free', 'restaurant_attributes_dietary_restrictions_halal', 'restaurant_attributes_dietary_restrictions_kosher', 'restaurant_attributes_dietary_restrictions_soy_free', 'restaurant_attributes_dietary_restrictions_vegan', 'restaurant_attributes_dietary_restrictions_vegetarian', 'restaurant_attributes_dogs_allowed', 'restaurant_attributes_drive_thr', 'restaurant_attributes_good_for_dancing', 'restaurant_attributes_good_for_groups', 'restaurant_attributes_good_for_breakfast', 'restaurant_attributes_good_for_brunch', 'restaurant_attributes_good_for_dessert', 'restaurant_attributes_good_for_dinner', 'restaurant_attributes_good_for_latenight', 'restaurant_attributes_good_for_lunch', 'restaurant_attributes_good_for_kids', 'restaurant_attributes_happy_hour', 'restaurant_attributes_has_tv', 'restaurant_attributes_open_24_hours', 'restaurant_attributes_order_at_counter', 'restaurant_attributes_outdoor_seating', 'restaurant_attributes_payment_types_amex', 'restaurant_attributes_payment_types_cash_only', 'restaurant_attributes_payment_types_discover', 'restaurant_attributes_payment_types_mastercard', 'restaurant_attributes_payment_types_visa', 'restaurant_attributes_take_out', 'restaurant_attributes_takes_reservations', 'restaurant_attributes_waiter_service', 'restaurant_attributes_wheelchair_accessible', ])
    print(m.shape)
    # numerical columns are appended as-is (no binarization)
    m = add_numerical_to_matrix(m, dropped, ['review_votes_cool', 'review_votes_funny', 'review_votes_useful', 'user_average_stars', 'user_compliments_cool', 'user_compliments_cute', 'user_compliments_funny', 'user_compliments_hot', 'user_compliments_list', 'user_compliments_more', 'user_compliments_note', 'user_compliments_photos', 'user_compliments_plain', 'user_compliments_profile', 'user_compliments_writer', 'user_fans', 'user_review_count', 'user_votes_cool', 'user_votes_funny', 'user_votes_useful', 'restaurant_attributes_price_range', 'restaurant_latitude', 'restaurant_longitude', 'restaurant_review_count', 'checkin_counts', 'review_delta', 'previous_inspection_delta', 'polarity', 'subjectivity', 'neg', 'neu', 'pos', 'compound', 'user_yelping_since_delta','manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 'pesticide', 'bugs', 'mold'])
    print(m.shape)
    print("adding restaurant categories to matrix")
    cats = ['restaurant_category_1', 'restaurant_category_2', 'restaurant_category_3', 'restaurant_category_4', 'restaurant_category_5', 'restaurant_category_6', 'restaurant_category_7']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)
    print("adding restaurant neighborhoods to matrix")
    cats = ['restaurant_neighborhood_1', 'restaurant_neighborhood_2', 'restaurant_neighborhood_3']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)
    print("matrix shape of {}".format(m.shape))
    joblib.dump(m, 'pickle_jar/full_matrix')
示例10: get_dataset2
# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def get_dataset2(test_fraction):
    """Load 'labels.csv', vectorize features and labels into categorical
    encodings, and randomly split into train and test sets.

    :param test_fraction: fraction of the data used for the test split
    :return: X_train, X_test, y_train, y_test
    """
    with open('labels.csv', 'r') as datafile:
        csv_reader = csv.reader(datafile, delimiter=',', quotechar='|')
        samples = [row for row in csv_reader]
    samples = numpy.asarray(samples)
    # last column holds the label; everything before it is a feature
    features, labels = samples[:, :-1], samples[:, -1]
    # X,y = get_tabledata()
    vectorizer = DictVectorizer()
    # DictVectorizer wants mappings, so index each row's values by position
    row_dicts = [dict(enumerate(values)) for values in features.tolist()]
    features = vectorizer.fit_transform(row_dicts).toarray()
    joblib.dump(vectorizer, 'vectorizer.pkl')
    binarizer = LabelBinarizer()
    labels = binarizer.fit_transform(labels)
    joblib.dump(binarizer, 'binarizer.pkl')
    return train_test_split(features, labels, test_size=test_fraction)
示例11: scorer_auc
# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def scorer_auc(y_true, y_pred):
    """Dedicated to 2class probabilistic outputs"""
    from sklearn.metrics import roc_auc_score
    from sklearn.preprocessing import LabelBinarizer

    binarizer = LabelBinarizer()
    truth = binarizer.fit_transform(y_true)
    return roc_auc_score(truth, y_pred)
示例12: binarize_label_columns
# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def binarize_label_columns(df, columns, two_classes_as='single'):
    '''
    Binarize the given categorical columns of a dataframe, in place.

    Inputs:
    df: Pandas dataframe object.
    columns: Columns to binarize.
    two_classes_as: How to handle two classes, as 'single' or 'multiple' columns.
    Returns a tuple with the following items:
    df: Pandas dataframe object with new columns.
    binlabel_names: Names of the newly created binary variables.
    lb_objects: a dictionary with columns as keys and sklearn.preprocessing.LabelBinarizer
    objects as values.
    '''
    binlabel_names = []
    lb_objects = {}
    for col in columns:
        # A column with a single unique value carries no information; skip it.
        if len(df[col].unique()) > 1:
            rows_notnull = df[col].notnull() # Use only valid feature observations
            lb = LabelBinarizer()
            binclass = lb.fit_transform(df[col][rows_notnull]) # Fit & transform on valid observations
            if len(lb.classes_) == 2 and two_classes_as == 'multiple':
                # LabelBinarizer emits one column for two classes; prepend its complement.
                binclass = np.hstack((1 - binclass, binclass))
            lb_objects[col] = lb
            if len(lb.classes_) > 2 or two_classes_as == 'multiple':
                col_binlabel_names = [col+'_'+str(c) for c in lb.classes_]
                binlabel_names += col_binlabel_names # Names for the binarized classes
                for n in col_binlabel_names: df[n] = np.NaN # Initialize columns
                df.loc[rows_notnull, col_binlabel_names] = binclass # Merge binarized data
            elif two_classes_as == 'single':
                binlabel_names.append(col+'_bin') # Names for the binarized classes
                df[col+'_bin'] = np.NaN # Initialize columns
                df.loc[rows_notnull, col+'_bin'] = binclass # Merge binarized data
    return df, binlabel_names, lb_objects
示例13: BinaryRelevanceClassifier
# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
class BinaryRelevanceClassifier(BaseEstimator, ClassifierMixin):
    """Binary-relevance multi-label wrapper: fits one clone of the base
    estimator per label column produced by LabelBinarizer."""

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, Y):
        """Binarize Y and fit one estimator per label. Returns self."""
        # binarize labels
        self.bl = LabelBinarizer()
        Y = self.bl.fit_transform(Y)
        self.classes_ = self.bl.classes_
        # create an estimator for each label
        self.estimators_ = []
        for i in xrange(self.bl.classes_.shape[0]):
            estimator = clone(self.estimator)
            estimator.fit(X, Y[:, i])
            self.estimators_.append(estimator)
        # BUG FIX: fit() previously returned None, breaking the scikit-learn
        # convention that estimators return self (and any call chaining).
        return self

    def predict(self, X):
        """Predict one column per label and inverse-transform to label sets."""
        self._check_is_fitted()
        X = np.atleast_2d(X)
        Y = np.empty((X.shape[0], self.classes_.shape[0]))
        for i, estimator in enumerate(self.estimators_):
            Y[:, i] = estimator.predict(X).T
        return self.bl.inverse_transform(Y)

    def _check_is_fitted(self):
        # Fitting creates estimators_; its absence means fit() never ran.
        if not hasattr(self, "estimators_"):
            raise ValueError("The object hasn't been fitted yet!")
示例14: bio_classification_report
# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.

    Metrics are computed per token. Note that it requires scikit-learn 0.15+
    (or a version from github master) to calculate averages properly!

    Adapted from
    http://nbviewer.ipython.org/github/tpeng/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb

    Args:
        y_true: True labels, list of strings
        y_pred: Predicted labels, list of strings
    Returns:
        classification report as string
    """
    binarizer = LabelBinarizer()
    flat_true = list(chain.from_iterable(y_true))
    flat_pred = list(chain.from_iterable(y_pred))
    true_indicator = binarizer.fit_transform(flat_true)
    pred_indicator = binarizer.transform(flat_pred)
    # Order tags by entity type first, then by their B-/I- prefix.
    ordered_tags = sorted(set(binarizer.classes_),
                          key=lambda tag: tag.split('-', 1)[::-1])
    position = {label: idx for idx, label in enumerate(binarizer.classes_)}
    return classification_report(
        true_indicator,
        pred_indicator,
        labels=[position[tag] for tag in ordered_tags],
        target_names=ordered_tags,
    )
示例15: train
# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def train(self, X, y):
n_features = X.shape[1]
# class_prior = self.class_prior
# Binarize Y
labelbin = LabelBinarizer()
Y = labelbin.fit_transform(y)
self.classes = labelbin.classes_
if Y.shape[1] == 1:
Y = np.concatenate((1 - Y, Y), axis=1)
n_effective_classes = Y.shape[1]
self.class_count = np.zeros(n_effective_classes)
self.feature_count = np.zeros((n_effective_classes, n_features))
print "Start counting..."
self.class_count = Y.sum(axis=0)
print "Finished class counting!"
print "Start feature counting..."
self.feature_count = np.dot(Y.T, X)
print "Finished feature counting!"
# Apply add-k-smoothing
print "Start smoothing..."
self.class_count_smooth = self.class_count + self.k * len(self.classes)
self.feature_count_smooth = self.feature_count + self.k
print "Finished smooting!"
# Convert to log probabilities
self.feature_log_prob = (np.log(self.feature_count_smooth) - np.log(self.class_count_smooth.reshape(-1,1)))
self.class_log_prior = np.zeros(len(self.classes)) - np.log(len(self.classes))
return self