当前位置: 首页>>代码示例>>Python>>正文


Python LabelBinarizer.fit_transform方法代码示例

本文整理汇总了Python中sklearn.preprocessing.LabelBinarizer.fit_transform方法的典型用法代码示例。如果您正苦于以下问题:Python LabelBinarizer.fit_transform方法的具体用法?Python LabelBinarizer.fit_transform怎么用?Python LabelBinarizer.fit_transform使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sklearn.preprocessing.LabelBinarizer的用法示例。


在下文中一共展示了LabelBinarizer.fit_transform方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: main

# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression())
    ])
    parameters = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.1, 1, 10),
    }
    os.chdir('C:\\Users\\Dan\\1) Python Notebooks\\Datasets')
    df = pd.read_csv('data/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')
    lb = LabelBinarizer()
    y_train = np.array([number[0] for number in lb.fit_transform(y_train)])    
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])
    predictions = grid_search.predict(X_test)
    lb = LabelBinarizer()
    y_train = np.array([number[0] for number in lb.fit_transform(y_train)])
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Precision:', precision_score(y_test, predictions)
    print 'Recall:', recall_score(y_test, predictions)
开发者ID:danielmcdade,项目名称:1--Python-Notebooks,代码行数:32,代码来源:grid_search_multi_label.py

示例2: test_label_binarizer_multilabel

# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def test_label_binarizer_multilabel():
    lb = LabelBinarizer()

    # test input as lists of tuples
    inp = [(2, 3), (1,), (1, 2)]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 1, 0]])
    got = lb.fit_transform(inp)
    assert_array_equal(indicator_mat, got)
    assert_equal(lb.inverse_transform(got), inp)

    # test input as label indicator matrix
    lb.fit(indicator_mat)
    assert_array_equal(indicator_mat,
                       lb.inverse_transform(indicator_mat))

    # regression test for the two-class multilabel case
    lb = LabelBinarizer()

    inp = [[1, 0], [0], [1], [0, 1]]
    expected = np.array([[1, 1],
                         [1, 0],
                         [0, 1],
                         [1, 1]])
    got = lb.fit_transform(inp)
    assert_array_equal(expected, got)
    assert_equal([set(x) for x in lb.inverse_transform(got)],
                 [set(x) for x in inp])
开发者ID:AlexLerman,项目名称:scikit-learn,代码行数:31,代码来源:test_preprocessing.py

示例3: initData

# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def initData(filename):
    if not os.path.exists(filename):
        print "I can't find this file: %s"%filename
        sys.exit(1)

    datareader = csv.reader(open(filename,'r'))
    ct = 0;
    for row in datareader:
     ct = ct+1

    datareader = csv.reader(open(filename,'r'))
    data = np.array(-1*np.ones((ct,7),float),object);
    k=0;

    for row in datareader:
     data[k,:] = np.array(row)
     k = k+1;

    #To modify
    featnames = np.array(ATTRIBUTES,str)

    keys = [[]]*np.size(data,1)
    numdata = -1*np.ones_like(data);
    nfeatures=[0]
    featIndex=[]

    # convert string objects to integer values for modeling:
    for k in range(np.size(data,1)):
     keys[k],garbage,numdata[:,k] = np.unique(data[:,k],True,True)

    numrows = np.size(numdata,0); # number of instances in car data set
    numcols = np.size(numdata,1); # number of columns in car data set
    numdata = np.array(numdata,int)

    xdata = numdata[:,:-1]; # x-data is all data BUT the last column which are the class labels
    ydata = numdata[:,-1]; # y-data is set to class labels in the final column, signified by -1

    # ------------------ numdata multilabel -> binary conversion for NB-Model ---------------------
    lbin = LabelBinarizer();
    for k in range(np.size(xdata,1)): # loop thru number of columns in xdata
     if k==0:
      xdata_ml = lbin.fit_transform(xdata[:,k]);
      featIndex = lbin.classes_
      nfeatures.append(len(lbin.classes_))
     else:
      xdata_ml = np.hstack((xdata_ml,lbin.fit_transform(xdata[:,k])))
      featIndex= np.hstack((featIndex,lbin.classes_))
      nfeatures.append(nfeatures[-1]+len(lbin.classes_))

    if _VERBOSE:

        print "nfeatures:"
        print nfeatures
        print "featIndex"
        print featIndex

    return xdata_ml,xdata,ydata,data,nfeatures,keys,featIndex
开发者ID:cozyberry,项目名称:nbem,代码行数:59,代码来源:nb_clustering.py

示例4: encode_categorical

# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def encode_categorical(cat, missing_value = False, option = "binary"):
    # Encodes the categorical features. For N unique categories:
    # cat : the column of categorical values
    # option = 'binary'    : binary (one-hot, orthogonal, thermometer) encoding - N features
    #          'freq'      : occuring frequency (percentage) - 1 feature
    #          'mis_float' : all binary encoded except missing values (vector of floats corresponding to occurance frequencies) - (N-1) features
    #          'mis_unif'  : all binary encoded except missing values (vector of floats of uniform values) - (N-1) features
    #          'dummy'     : just like binary but one column is removed - (n-1) features
    #          'sum'       : sum (deviation) coding. just like dummy but zeros row is all -1 - (N-1) features
    ########## TO DO: Encoding w.r.t. targets
    
    if option == "binary":
        lb = LabelBinarizer()
        encoded = lb.fit_transform(cat)
    elif option == "freq":
        freq_count = itemfreq(cat)
        encoded = np.zeros(len(cat))
        for i in range(freq_count.shape[0]):
            encoded[cat == freq_count[i][0]] = float(freq_count[i][1])/len(encoded)
    elif option == "mis_float":
        if missing_value == False:
            raise ValueError("Provide a missing value for the option 'mis_float'.")
        else:
            lb = LabelBinarizer()
            encoded = lb.fit_transform(cat).astype(float)
            missing_bool = cat == missing_value
            if np.sum(missing_bool) == 0:
                raise ValueError("No such missing value!")
            encoded = np.delete(encoded, np.argmax(encoded[np.argmax(missing_bool),:]), axis = 1)
            encoded[missing_bool,:] = np.sum(encoded[~missing_bool], axis = 0)/float(encoded[~missing_bool].shape[0])
    elif option == "mis_unif":
        if missing_value == False:
            raise ValueError("Provide a missing value for the option 'mis_float'.")
        else:
            lb = LabelBinarizer()
            encoded = lb.fit_transform(cat).astype(float)
            missing_bool = cat == missing_value
            if np.sum(missing_bool) == 0:
                raise ValueError("No such missing value!")
            encoded = np.delete(encoded, np.argmax(encoded[np.argmax(missing_bool),:]), axis = 1)
            encoded[missing_bool,:] = np.ones(encoded.shape[1]) * 1.0 / encoded.shape[1]
    elif option == "dummy":
        lb = LabelBinarizer()
        encoded = lb.fit_transform(cat)[:,0:-1]
    elif option == "sum":
        lb = LabelBinarizer()
        encoded = lb.fit_transform(cat)
        last_col = encoded[:,-1].astype(bool)
        encoded = encoded[:,0:-1]
        encoded[last_col,:] = -1
    else:
        raise ValueError("No such option!")
    print("Number of unique categorical values : %s" % encoded.shape[1])
        
    return encoded
开发者ID:ogencoglu,项目名称:Templates,代码行数:57,代码来源:encode_categorical.py

示例5: test_multinomial_loss

# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def test_multinomial_loss():
    # test if the multinomial loss and gradient computations are consistent
    X, y = iris.data, iris.target.astype(np.float64)
    n_samples, n_features = X.shape
    n_classes = len(np.unique(y))

    rng = check_random_state(42)
    weights = rng.randn(n_features, n_classes)
    intercept = rng.randn(n_classes)
    sample_weights = rng.randn(n_samples)
    np.abs(sample_weights, sample_weights)

    # compute loss and gradient like in multinomial SAG
    dataset, _ = make_dataset(X, y, sample_weights, random_state=42)
    loss_1, grad_1 = _multinomial_grad_loss_all_samples(dataset, weights,
                                                        intercept, n_samples,
                                                        n_features, n_classes)
    # compute loss and gradient like in multinomial LogisticRegression
    lbin = LabelBinarizer()
    Y_bin = lbin.fit_transform(y)
    weights_intercept = np.vstack((weights, intercept)).T.ravel()
    loss_2, grad_2, _ = _multinomial_loss_grad(weights_intercept, X, Y_bin,
                                               0.0, sample_weights)
    grad_2 = grad_2.reshape(n_classes, -1)
    grad_2 = grad_2[:, :-1].T

    # comparison
    assert_array_almost_equal(grad_1, grad_2)
    assert_almost_equal(loss_1, loss_2)
开发者ID:AlexisMignon,项目名称:scikit-learn,代码行数:31,代码来源:test_sag.py

示例6: ElasticNetClassifier

# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
class ElasticNetClassifier(LinearClassifierMixin, ElasticNet):
    """Class to extend elastic-net in case of classification."""

    def fit(self, X, y, check_input=True):
        self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
        Y = self._label_binarizer.fit_transform(y)
        if self._label_binarizer.y_type_.startswith('multilabel'):
            # we don't (yet) support multi-label classification in ENet
            raise ValueError(
                "%s doesn't support multi-label classification" % (
                    self.__class__.__name__))

        # Y = column_or_1d(Y, warn=True)
        super(ElasticNetClassifier, self).fit(X, Y)
        if self.classes_.shape[0] > 2:
            ndim = self.classes_.shape[0]
        else:
            ndim = 1
        self.coef_ = self.coef_.reshape(ndim, -1)

        return self

    @property
    def classes_(self):
        return self._label_binarizer.classes_
开发者ID:slipguru,项目名称:palladio,代码行数:27,代码来源:classifier.py

示例7: bio_classification_report

# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def bio_classification_report(y_true, y_pred):

    lb = LabelBinarizer()
    y_true_combined = 1 - lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = list(chain.from_iterable(y_pred))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    print 'True sum %d Pred sum %d Len %d' %(sum(y_true_combined), sum(y_pred_combined), len(y_pred_combined))
    print "AUC\tP-R: %.4f\tROC: %.4f" % (average_precision_score(y_true_combined, y_pred_combined, average=None),
        roc_auc_score(y_true_combined, y_pred_combined, average=None))
    #plt.figure()
    #fpr, tpr, thr = roc_curve(y_true_combined, y_pred_combined)
    #area = auc(fpr, tpr)
    #plt.plot(fpr, tpr, label='{area:.3f}'.format( area=area))
    #plt.legend(loc=4)
    #plt.savefig('sub3.jpg')

    return classification_report(
        1 - y_true_combined,
        [0 if v > 0.1 else 1 for v in y_pred_combined],
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
开发者ID:lmxl,项目名称:hal,代码行数:27,代码来源:base_learner.py

示例8: transform

# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
 def transform(self, data_dict):
     listOfUnits = ["kilogram", "kg", "gram", "[GMgmkK]?Hz", "liter", "ml",
             "cup", "cm", "foot", "inch", "meter", "mg", "gallon", "milliliter", "[MGTmgtKk]B"]
     regex = "[\d]+\.[\d]+(" + "[\b/,-]|".join(listOfUnits) + ")"
     data = data_dict[self.key].str.extract(regex, flags = re.IGNORECASE, expand=False).str.lower()
     lb = LabelBinarizer()
     return lb.fit_transform(data.fillna(""))
开发者ID:zero0nee,项目名称:UnspscClassifier,代码行数:9,代码来源:transformers.py

示例9: full_matrix

# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def full_matrix(dropped):
    # create initial matrix
    print('starting with m0')
    lb = LabelBinarizer(sparse_output=True)
    # m = lb.fit_transform(dropped.restaurant_id)
    m = lb.fit_transform(dropped.user_name)
    print(m.shape)
    # build matrix
    # making nan its own category for categorical
    print("adding categorical to matrix")
    m = add_categorical_to_matrix(m, dropped, ['review_stars', 'user_name', 'restaurant_stars', 'restaurant_attributes_ages_allowed', 'restaurant_attributes_alcohol', 'restaurant_attributes_attire', 'restaurant_attributes_byob_corkage', 'restaurant_attributes_noise_level', 'restaurant_attributes_smoking', 'restaurant_attributes_wifi', 'restaurant_city',  'restaurant_hours_friday_close', 'restaurant_hours_friday_open', 'restaurant_hours_monday_close', 'restaurant_hours_monday_open', 'restaurant_hours_saturday_close', 'restaurant_hours_saturday_open', 'restaurant_hours_sunday_close', 'restaurant_hours_sunday_open', 'restaurant_hours_thursday_close', 'restaurant_hours_thursday_open', 'restaurant_hours_tuesday_close', 'restaurant_hours_tuesday_open', 'restaurant_hours_wednesday_close', 'restaurant_hours_wednesday_open', 'restaurant_ambience', 'restaurant_music', 'restaurant_parking', 'restaurant_street', 'restaurant_zipcode',  'inspection_year', 'inspection_month', 'inspection_day', 'inspection_dayofweek', 'inspection_quarter',])
    print(m.shape)

    print("adding bool to matrix")
    m = add_categorical_to_matrix(m, dropped, ['restaurant_attributes_accepts_credit_cards', 'restaurant_attributes_byob', 'restaurant_attributes_caters', 'restaurant_attributes_coat_check', 'restaurant_attributes_corkage', 'restaurant_attributes_delivery', 'restaurant_attributes_dietary_restrictions_dairy_free', 'restaurant_attributes_dietary_restrictions_gluten_free', 'restaurant_attributes_dietary_restrictions_halal', 'restaurant_attributes_dietary_restrictions_kosher', 'restaurant_attributes_dietary_restrictions_soy_free', 'restaurant_attributes_dietary_restrictions_vegan', 'restaurant_attributes_dietary_restrictions_vegetarian', 'restaurant_attributes_dogs_allowed', 'restaurant_attributes_drive_thr', 'restaurant_attributes_good_for_dancing', 'restaurant_attributes_good_for_groups', 'restaurant_attributes_good_for_breakfast', 'restaurant_attributes_good_for_brunch', 'restaurant_attributes_good_for_dessert', 'restaurant_attributes_good_for_dinner', 'restaurant_attributes_good_for_latenight', 'restaurant_attributes_good_for_lunch', 'restaurant_attributes_good_for_kids', 'restaurant_attributes_happy_hour', 'restaurant_attributes_has_tv', 'restaurant_attributes_open_24_hours', 'restaurant_attributes_order_at_counter', 'restaurant_attributes_outdoor_seating',  'restaurant_attributes_payment_types_amex', 'restaurant_attributes_payment_types_cash_only', 'restaurant_attributes_payment_types_discover', 'restaurant_attributes_payment_types_mastercard', 'restaurant_attributes_payment_types_visa', 'restaurant_attributes_take_out',  'restaurant_attributes_takes_reservations', 'restaurant_attributes_waiter_service', 'restaurant_attributes_wheelchair_accessible', ])
    print(m.shape)

    m = add_numerical_to_matrix(m, dropped, ['review_votes_cool', 'review_votes_funny', 'review_votes_useful', 'user_average_stars', 'user_compliments_cool', 'user_compliments_cute', 'user_compliments_funny', 'user_compliments_hot', 'user_compliments_list', 'user_compliments_more', 'user_compliments_note', 'user_compliments_photos', 'user_compliments_plain', 'user_compliments_profile', 'user_compliments_writer', 'user_fans', 'user_review_count', 'user_votes_cool', 'user_votes_funny', 'user_votes_useful', 'restaurant_attributes_price_range', 'restaurant_latitude', 'restaurant_longitude', 'restaurant_review_count', 'checkin_counts', 'review_delta', 'previous_inspection_delta', 'polarity', 'subjectivity', 'neg', 'neu', 'pos', 'compound', 'user_yelping_since_delta','manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 'pesticide', 'bugs', 'mold'])
    print(m.shape)

    print("adding restaurant categories to matrix")
    cats = ['restaurant_category_1', 'restaurant_category_2', 'restaurant_category_3', 'restaurant_category_4', 'restaurant_category_5', 'restaurant_category_6', 'restaurant_category_7']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)

    print("adding restaurant neighborhoods to matrix")
    cats = ['restaurant_neighborhood_1', 'restaurant_neighborhood_2', 'restaurant_neighborhood_3']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)

    print("matrix shape of {}".format(m.shape))
    joblib.dump(m, 'pickle_jar/full_matrix')
开发者ID:potatochip,项目名称:kojak,代码行数:34,代码来源:blue_pill.py

示例10: get_dataset2

# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def get_dataset2(test_fraction):
    """
   @:param: test_fraction used to split train and test
   Vectorizes the features and labels into categorical values and randomly splits into train and test set
   :return: X_train, X_test, y_train, y_test
   """
    data = []
    with open('labels.csv', 'r') as datafile:
        csv_reader = csv.reader(datafile, delimiter=',', quotechar='|')
        for row in csv_reader:
            data.append(row)

    data = numpy.asarray(data)
    X = data[:, 0:data.shape[1]-1]
    y = data[:, data.shape[1]-1]

    # X,y = get_tabledata()

    vec = DictVectorizer()
    feature_dict = [dict(enumerate(x)) for x in X.tolist()]
    X = vec.fit_transform(feature_dict).toarray()
    joblib.dump(vec, 'vectorizer.pkl')

    lb = LabelBinarizer()
    y = lb.fit_transform(y)
    joblib.dump(lb, 'binarizer.pkl')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction)
    return X_train, X_test, y_train, y_test
开发者ID:spatial-computing,项目名称:strabo-learning-ocr-transformation-hpc,代码行数:30,代码来源:Data.py

示例11: scorer_auc

# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def scorer_auc(y_true, y_pred):
    from sklearn.metrics import roc_auc_score
    from sklearn.preprocessing import LabelBinarizer
    """Dedicated to 2class probabilistic outputs"""
    le = LabelBinarizer()
    y_true = le.fit_transform(y_true)
    return roc_auc_score(y_true, y_pred)
开发者ID:SherazKhan,项目名称:Paris_orientation-decoding,代码行数:9,代码来源:base.py

示例12: binarize_label_columns

# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def binarize_label_columns(df, columns, two_classes_as='single'):
    '''
    Inputs:
        df: Pandas dataframe object.
        columns: Columns to binarize.
        tow_classes_as: How to handle two classes, as 'single' or 'multiple' columns.
    Returns a tuple with the following items:
        df: Pandas dataframe object with new columns.
        binlabel_names: Names of the newly created binary variables.
        lb_objects: a dictionary with columns as keys and sklear.LabelBinarizer 
        objects as values.
    '''
    binlabel_names = []
    lb_objects = {}
    for col in columns:
        if len(df[col].unique()) > 1: 
            rows_notnull = df[col].notnull() # Use only valid feature observations
            lb = LabelBinarizer()
            binclass = lb.fit_transform(df[col][rows_notnull]) # Fit & transform on valid observations
            if len(lb.classes_) == 2 and two_classes_as == 'multiple':
                binclass = np.hstack((1 - binclass, binclass))
            lb_objects[col] = lb
            if len(lb.classes_) > 2 or two_classes_as == 'multiple':
                col_binlabel_names = [col+'_'+str(c) for c in lb.classes_]
                binlabel_names += col_binlabel_names # Names for the binarized classes
                for n in col_binlabel_names: df[n] = np.NaN # Initialize columns
                df.loc[rows_notnull, col_binlabel_names] = binclass # Merge binarized data
            elif two_classes_as == 'single': 
                binlabel_names.append(col+'_bin') # Names for the binarized classes
                df[col+'_bin'] = np.NaN # Initialize columns
                df.loc[rows_notnull, col+'_bin'] = binclass # Merge binarized data
    return df, binlabel_names, lb_objects
开发者ID:jonathan1296,项目名称:Data-Science-Lectures,代码行数:34,代码来源:start_here.py

示例13: BinaryRelevanceClassifier

# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
class BinaryRelevanceClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, Y):
        # binarize labels
        self.bl = LabelBinarizer()
        Y = self.bl.fit_transform(Y)
        self.classes_ = self.bl.classes_

        # create an estimator for each label
        self.estimators_ = []
        for i in xrange(self.bl.classes_.shape[0]):
            estimator = clone(self.estimator)
            estimator.fit(X, Y[:, i])
            self.estimators_.append(estimator)

    def predict(self, X):
        self._check_is_fitted()

        X = np.atleast_2d(X)
        Y = np.empty((X.shape[0], self.classes_.shape[0]))
        for i, estimator in enumerate(self.estimators_):
            Y[:, i] = estimator.predict(X).T

        return self.bl.inverse_transform(Y)

    def _check_is_fitted(self):
        if not hasattr(self, "estimators_"):
            raise ValueError("The object hasn't been fitted yet!")
开发者ID:sunnyrjuneja,项目名称:ai_tidbits,代码行数:32,代码来源:binary_relevance_classifier.py

示例14: bio_classification_report

# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    Note: This function was copied from
    http://nbviewer.ipython.org/github/tpeng/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb

    Args:
        y_true: True labels, list of strings
        y_pred: Predicted labels, list of strings
    Returns:
        classification report as string
    """
    lbin = LabelBinarizer()
    y_true_combined = lbin.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lbin.transform(list(chain.from_iterable(y_pred)))

    #tagset = set(lbin.classes_) - {NO_NE_LABEL}
    tagset = set(lbin.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lbin.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
开发者ID:aleju,项目名称:ner-crf,代码行数:34,代码来源:test.py

示例15: train

# 需要导入模块: from sklearn.preprocessing import LabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.LabelBinarizer import fit_transform [as 别名]
	def train(self, X, y):
		n_features = X.shape[1]

		# class_prior = self.class_prior

		# Binarize Y
		labelbin = LabelBinarizer()
		Y = labelbin.fit_transform(y)
		self.classes = labelbin.classes_
		if Y.shape[1] == 1:
			Y = np.concatenate((1 - Y, Y), axis=1)

		n_effective_classes = Y.shape[1]
		self.class_count = np.zeros(n_effective_classes)
		self.feature_count = np.zeros((n_effective_classes, n_features))

		print "Start counting..."
		self.class_count = Y.sum(axis=0)
		print "Finished class counting!"
		print "Start feature counting..."
		self.feature_count = np.dot(Y.T, X)
		print "Finished feature counting!"

		# Apply add-k-smoothing
		print "Start smoothing..."
		self.class_count_smooth = self.class_count + self.k * len(self.classes)
		self.feature_count_smooth = self.feature_count + self.k
		print "Finished smooting!"

		# Convert to log probabilities
		self.feature_log_prob = (np.log(self.feature_count_smooth) - np.log(self.class_count_smooth.reshape(-1,1)))
		self.class_log_prior = np.zeros(len(self.classes)) - np.log(len(self.classes))

		return self
开发者ID:TobiasMR,项目名称:stat-nlp,代码行数:36,代码来源:naivebayes2.py


注:本文中的sklearn.preprocessing.LabelBinarizer.fit_transform方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。