当前位置: 首页>>代码示例>>Python>>正文


Python Binarizer.fit_transform方法代码示例

本文整理汇总了Python中sklearn.preprocessing.Binarizer.fit_transform方法的典型用法代码示例。如果您正苦于以下问题:Python Binarizer.fit_transform方法的具体用法是什么?Python Binarizer.fit_transform怎么用?有哪些Python Binarizer.fit_transform的使用例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.preprocessing.Binarizer的用法示例。


在下文中一共展示了Binarizer.fit_transform方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: binarizeMatrix

# 需要导入模块: from sklearn.preprocessing import Binarizer [as 别名]
# 或者: from sklearn.preprocessing.Binarizer import fit_transform [as 别名]
def binarizeMatrix(dataMatrix, threshold):
    """
    Binarize ``dataMatrix`` against ``threshold`` and return the 0/1 result.

    The work is delegated to scikit-learn's ``Binarizer``: entries strictly
    greater than ``threshold`` become 1, all remaining entries become 0.
    """
    return Binarizer(threshold=threshold).fit_transform(dataMatrix)
开发者ID:Gliganu,项目名称:DMC_Fashion_2016,代码行数:12,代码来源:DatasetManipulator.py

示例2: wine_quality_white

# 需要导入模块: from sklearn.preprocessing import Binarizer [as 别名]
# 或者: from sklearn.preprocessing.Binarizer import fit_transform [as 别名]
def wine_quality_white():
    """Convert the raw white wine quality CSV into a cleaned ``.npz`` archive.

    The first 11 columns of the semicolon-separated file are stored as
    ``data``. Column 11 is binarized at a threshold of 4 and then inverted,
    so values ``<= 4`` end up labelled 1 and values ``> 4`` end up labelled 0.
    Both arrays are written to ``uci-wine-quality-white.npz``.
    """
    # white wine quality dataset
    filename = '../../data/raw/mldata/winequality-white.csv'

    # The data corresponds to the 11 first column of the csv file
    data = np.loadtxt(filename, usecols=tuple(range(11)), delimiter=';', dtype=float)

    # Read the label column on its own.
    quality = np.loadtxt(filename, usecols=(11,), delimiter=';', dtype=int)

    # We need to binarise the label using a threshold at 4.
    # Binarizer expects a 2-D (n_samples, n_features) input, so the label
    # vector is presented as a single column; 1-D input is rejected by
    # current scikit-learn releases.
    bn = Binarizer(threshold=4)
    label = bn.fit_transform(quality.reshape(-1, 1))

    # We need to inverse the label -> 1=0 and 0=1, then flatten back to 1-D.
    label = np.ravel(np.abs(label - 1))

    np.savez('../../data/clean/uci-wine-quality-white.npz', data=data, label=label)
开发者ID:I2Cvb,项目名称:data_balancing,代码行数:17,代码来源:conversion.py

示例3: do_logreg

# 需要导入模块: from sklearn.preprocessing import Binarizer [as 别名]
# 或者: from sklearn.preprocessing.Binarizer import fit_transform [as 别名]
def do_logreg():
    """Grid-search and cross-validate a logistic regression on ``auto_mpg.csv``.

    The ``mpg`` column is binarized around its mean to form the target
    (above average -> 1, otherwise 0); the remaining columns, minus
    ``car_name``, are standardized and used as features. The data is split
    50/50, ``C`` is tuned with ``GridSearchCV`` on the training half, and the
    best estimator is then cross-validated on the held-out half. Progress and
    scores are printed; nothing is returned.

    NOTE(review): ``nfolds`` is not defined in this function and is expected
    to exist at module level -- confirm against the rest of the file.
    """
    import numpy as np
    from sklearn.preprocessing import Binarizer, scale
    from sklearn.linear_model import LogisticRegression
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score
    from sklearn.grid_search import GridSearchCV
    import pandas

    ### load data
    col_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                 'acceleration', 'model_year', 'origin', 'car_name']
    df = pandas.read_csv('auto_mpg.csv')
    df.columns = col_names
    df = df.drop('car_name', axis=1)

    lr = LogisticRegression()
    bn = Binarizer(threshold=df['mpg'].mean())
    # Single-argument parenthesized prints behave the same on Python 2 and 3.
    print("Performing binarization of the mpg variable into above/below average classes")
    # Binarizer needs 2-D input: select mpg as a one-column frame, then
    # flatten the result so the target is a 1-D vector aligned with `data`.
    target = np.ravel(bn.fit_transform(df[['mpg']]))
    data = df.drop('mpg', axis=1)
    data = scale(data)
    print("Splitting into training and test sets")
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=0.5, random_state=0)

    grid = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    print('Searching for optimal C in {} using {}-fold validation on test set '.format(grid, nfolds))
    tuned_parameters = [{'C': grid}]
    clf = GridSearchCV(lr, tuned_parameters, cv=nfolds, scoring='accuracy')
    clf.fit(data_train, target_train)
    for params, mean_score, _ in clf.grid_scores_:
        print("{}: Mean accuracy {}".format(params, mean_score))

    print("""Cross-validating above/below average mpg prediction
        using {}-fold validation on the test dataset.
        Using the best estimator: {}
        """.format(nfolds, clf.best_estimator_))

    mean_cross = np.mean(cross_val_score(clf.best_estimator_, data_test, target_test, cv=nfolds))

    print("Mean cross-validated accuracy after optimization is: {}".format(mean_cross))
开发者ID:jmccutchan,项目名称:GA_homework,代码行数:44,代码来源:sklearn_logreg.py

示例4: us_crime

# 需要导入模块: from sklearn.preprocessing import Binarizer [as 别名]
# 或者: from sklearn.preprocessing.Binarizer import fit_transform [as 别名]
def us_crime():
    """Convert the raw US crime ``communities.data`` file into an ``.npz`` archive.

    The comma-separated file is parsed with ``np.genfromtxt``, the first five
    columns are dropped, and the remaining values are passed through
    ``Imputer`` to fill missing entries. Every column but the last is saved as
    ``data``; the last column is binarized at 0.65 (values above the threshold
    become 1) and saved as ``label``.
    """
    # US crime dataset
    filename = '../../data/raw/mldata/communities.data'

    # The missing data will be consider as NaN
    # Only use 122 continuous features
    tmp_data = np.genfromtxt(filename, delimiter=',')
    tmp_data = tmp_data[:, 5:]

    # replace missing value by the mean
    imp = Imputer(verbose=1)
    tmp_data = imp.fit_transform(tmp_data)

    # extract the data to be saved
    data = tmp_data[:, :-1]
    bn = Binarizer(threshold=0.65)
    # Slice with -1: (not -1) so the target stays a 2-D column, which is the
    # input shape Binarizer requires; ravel then flattens it to a 1-D label.
    label = np.ravel(bn.fit_transform(tmp_data[:, -1:]))

    np.savez('../../data/clean/uci-us-crime.npz', data=data, label=label)
开发者ID:I2Cvb,项目名称:data_balancing,代码行数:22,代码来源:conversion.py

示例5: ngram

# 需要导入模块: from sklearn.preprocessing import Binarizer [as 别名]
# 或者: from sklearn.preprocessing.Binarizer import fit_transform [as 别名]
#
#	Comment section below out if you already have made pickle files
#
#---------------------------------------------------------------------------------------

# Build the bigram vocabulary from the training set, then derive count,
# binary and tf-idf representations for train/test and pickle each of them.
all_bigr = ngram(X_train, 'bigram') #starting with all features

# Single-argument parenthesized prints behave the same on Python 2 and 3.
print("Starting counting bigrams...")
X_train_bi_counted = count(X_train, all_bigr, 'bigram')
print("Done counting train set")
X_test_bi_counted = count(X_test, all_bigr, 'bigram')
print("Done counting test set")

print("Binarizing and dumping files")
bin = Binarizer()
X_train_bi_binary = bin.fit_transform(X_train_bi_counted)
X_test_bi_binary = bin.transform(X_test_bi_counted)
# Use context managers so every pickle file is flushed and closed
# deterministically instead of relying on garbage collection.
with open("X_train_bi_binary.p", "wb") as pickle_file:
    pickle.dump(X_train_bi_binary, pickle_file)
with open("X_test_bi_binary.p", "wb") as pickle_file:
    pickle.dump(X_test_bi_binary, pickle_file)
print("Done")


print("Starting tfidf vectors...")
X_train_bi_tfidf, X_test_bi_tfidf = tfidf(X_train_bi_counted, X_test_bi_counted)
with open("X_train_bi_tfidf.p", "wb") as pickle_file:
    pickle.dump(X_train_bi_tfidf, pickle_file)
with open("X_test_bi_tfidf.p", "wb") as pickle_file:
    pickle.dump(X_test_bi_tfidf, pickle_file)
print("Done")


print("Starting feature selection using CART random forests on binary files")
indices_important_feats_bi_bin = tree(X_train_bi_binary, y_train, all_bigr, 'Bigram_binary')
开发者ID:MariaBarrett,项目名称:LPIIExam,代码行数:33,代码来源:ngram.py

示例6: binarize

# 需要导入模块: from sklearn.preprocessing import Binarizer [as 别名]
# 或者: from sklearn.preprocessing.Binarizer import fit_transform [as 别名]
def binarize(img, threshold):
    """Binarize ``img`` against ``threshold`` and return the result.

    Entries strictly greater than ``threshold`` become 1 and all others 0.
    ``copy=False`` asks scikit-learn to work in place where it can, so the
    caller's ``img`` may be modified.
    """
    # Binarizer's constructor parameters are keyword-only in current
    # scikit-learn, so the threshold must be passed by name.
    binarizer = Binarizer(threshold=threshold, copy=False)
    return binarizer.fit_transform(img)
开发者ID:Cysu,项目名称:dlearn,代码行数:5,代码来源:imgproc.py

示例7: load

# 需要导入模块: from sklearn.preprocessing import Binarizer [as 别名]
# 或者: from sklearn.preprocessing.Binarizer import fit_transform [as 别名]
def load(opt='custom', x_filename=None, y_filename=None, n_samples=0,
         samples_on='rows', **kwargs):
    """Load a specified dataset.

    This function can be used either to load one of the standard scikit-learn
    datasets or a different dataset saved as X.npy Y.npy in the working
    directory.

    Parameters
    -----------
    opt : {'iris', 'digits', 'diabetes', 'boston', 'circles', 'moons',
          'custom', 'GSEXXXXX'}, default: 'custom'
        Name of a predefined dataset to be loaded. 'iris', 'digits', 'diabetes'
        'boston', 'circles' and 'moons' refer to the correspondent
        `scikit-learn` datasets. 'custom' can be used to load a custom dataset
        which name is specified in `x_filename` and `y_filename` (optional).

    x_filename : string, default : None
        The data matrix file name.

    y_filename : string, default : None
        The label vector file name.

    n_samples : int
        The number of samples to be loaded. This comes handy when dealing with
        large datasets. When n_samples is less than the actual size of the
        dataset this function performs a random subsampling that is stratified
        w.r.t. the labels (if provided).

    samples_on : string
        This can be either in ['row', 'rows'] if the samples lie on the row of
        the input data matrix, or viceversa in ['col', 'cols'] the other way
        around.

    data_sep : string
        The data separator. For instance comma, tab, blank space, etc.

    Returns
    -----------
    X : array of float, shape : n_samples x n_features
        The input data matrix.

    y : array of float, shape : n_samples
        The label vector; np.nan if missing.

    feature_names : array of integers (or strings), shape : n_features
        The feature names; a range of number if missing.

    index : list of integers (or strings)
        This is the samples identifier, if provided as first column (or row) of
        of the input file. Otherwise it is just an incremental range of size
        n_samples.
    """
    data = None
    try:
        if opt.lower() == 'iris':
            data = datasets.load_iris()
        elif opt.lower() == 'digits':
            data = datasets.load_digits()
        elif opt.lower() == 'diabetes':
            data = datasets.load_diabetes()
            b = Binarizer(threshold=np.mean(data.target))
            data.target = b.fit_transform(data.data)
        elif opt.lower() == 'boston':
            data = datasets.load_boston()
            b = Binarizer(threshold=np.mean(data.target))
            data.target = b.fit_transform(data.data)
        elif opt.lower() == 'gauss':
            means = np.array([[-1, 1, 1, 1], [0, -1, 0, 0], [1, 1, -1, -1]])
            sigmas = np.array([0.33, 0.33, 0.33])
            if n_samples <= 1:
                n_samples = 333
            xx, yy = generate_gauss(mu=means, std=sigmas, n_sample=n_samples)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'circles':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_circles(n_samples=n_samples, factor=.3,
                                           noise=.05)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'moons':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_moons(n_samples=n_samples, noise=.01)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'custom':
            data = load_custom(x_filename, y_filename, samples_on, **kwargs)
        elif opt.lower().startswith('gse'):
            raise Exception("Use ade_GEO2csv.py to convert GEO DataSets"
                            "into csv files.")
    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))

    X, y = data.data, data.target
    if n_samples > 0 and X.shape[0] > n_samples:
        if y is not None:
            try:  # Legacy for sklearn
                sss = StratifiedShuffleSplit(y, test_size=n_samples, n_iter=1)
                # idx = np.random.permutation(X.shape[0])[:n_samples]
            except TypeError:
#.........这里部分代码省略.........
开发者ID:slipguru,项目名称:adenine,代码行数:103,代码来源:data_source.py

示例8: fp_vectorizer

# 需要导入模块: from sklearn.preprocessing import Binarizer [as 别名]
# 或者: from sklearn.preprocessing.Binarizer import fit_transform [as 别名]
 def fp_vectorizer(self, processed_data):
     """Return ``processed_data`` binarized at a fixed threshold of 5.

     Delegates to scikit-learn's ``Binarizer``: entries strictly greater
     than 5 become 1, everything else becomes 0.
     """
     return Binarizer(threshold=5).fit_transform(processed_data)
开发者ID:imink,项目名称:UCL_COMPIG15_Project,代码行数:6,代码来源:feature_extraction.py


注:本文中的sklearn.preprocessing.Binarizer.fit_transform方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。