This article collects typical code examples of the Python method sklearn.preprocessing.Binarizer.fit_transform. If you are wondering what exactly Binarizer.fit_transform does and how to use it, the curated examples below may help. You can also read further about its containing class, sklearn.preprocessing.Binarizer.
The following presents 8 code examples of Binarizer.fit_transform, sorted by popularity by default.
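For quick orientation before the examples: Binarizer maps every value strictly greater than its threshold to 1 and everything else to 0, and fit_transform applies that mapping to a 2-D array. A minimal sketch (the array values and threshold are illustrative):

import numpy as np
from sklearn.preprocessing import Binarizer

X = np.array([[0.2, 1.5], [3.0, -0.7]])  # illustrative 2-D input
binarizer = Binarizer(threshold=1.0)     # values > 1.0 become 1, the rest 0
print(binarizer.fit_transform(X))        # [[0. 1.] [1. 0.]]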
Example 1: binarizeMatrix
# Required import: from sklearn.preprocessing import Binarizer [as alias]
# Or: from sklearn.preprocessing.Binarizer import fit_transform [as alias]
def binarizeMatrix(dataMatrix, threshold):
    """
    Maps every entry of dataMatrix to 0 or 1: values greater than
    `threshold` become 1, values less than or equal to it become 0.
    """
    binarizer = Binarizer(threshold=threshold)
    return binarizer.fit_transform(dataMatrix)
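A hypothetical call of the helper above, with made-up values (assumes numpy and the Binarizer import from the header):

import numpy as np

M = np.array([[0.1, 0.6], [0.4, 0.9]])
print(binarizeMatrix(M, threshold=0.5))  # [[0. 1.] [0. 1.]]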
Example 2: wine_quality_white
# Required import: from sklearn.preprocessing import Binarizer [as alias]
# Or: from sklearn.preprocessing.Binarizer import fit_transform [as alias]
def wine_quality_white():
    # White wine quality dataset
    filename = '../../data/raw/mldata/winequality-white.csv'
    # The data corresponds to the first 11 columns of the CSV file
    data = np.loadtxt(filename, usecols=tuple(range(11)), delimiter=';', dtype=float)
    # Read the labels and binarize them with a threshold of 4 (quality > 4 -> 1);
    # Binarizer expects a 2-D array, hence the reshape
    bn = Binarizer(threshold=4)
    raw_label = np.loadtxt(filename, usecols=(11,), delimiter=';', dtype=int)
    label = bn.fit_transform(raw_label.reshape(-1, 1))
    # Invert the labels (0 <-> 1) so that 1 marks the low-quality wines
    label = np.ravel(np.abs(label - 1))
    np.savez('../../data/clean/uci-wine-quality-white.npz', data=data, label=label)
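To make the threshold-and-invert step concrete, here is a standalone sketch with made-up quality scores:

import numpy as np
from sklearn.preprocessing import Binarizer

scores = np.array([[3], [4], [5], [8]])                # illustrative quality scores
binary = Binarizer(threshold=4).fit_transform(scores)  # [[0] [0] [1] [1]]
print(np.ravel(np.abs(binary - 1)))                    # [1 1 0 0]: 1 = low quality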
Example 3: do_logreg
# Required import: from sklearn.preprocessing import Binarizer [as alias]
# Or: from sklearn.preprocessing.Binarizer import fit_transform [as alias]
def do_logreg():
    import numpy as np
    import pandas
    from sklearn.preprocessing import Binarizer, scale
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, classification_report
    # Note: cross_validation and grid_search are the pre-0.20 module names;
    # on modern scikit-learn these live in sklearn.model_selection
    from sklearn.cross_validation import train_test_split, cross_val_score
    from sklearn.grid_search import GridSearchCV
    from scipy.stats import expon  # unused here; kept from the original

    nfolds = 5  # number of CV folds; undefined in the original snippet, 5 is an assumed default

    ### load data
    col_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                 'acceleration', 'model_year', 'origin', 'car_name']
    df = pandas.read_csv('auto_mpg.csv')
    df.columns = col_names
    df = df.drop('car_name', axis=1)

    lr = LogisticRegression()
    bn = Binarizer(threshold=df['mpg'].mean())
    print("Performing binarization of the mpg variable into above/below average classes")
    # Binarizer expects 2-D input, hence the double brackets; ravel back to 1-D
    target = np.ravel(bn.fit_transform(df[['mpg']]))
    data = scale(df.drop('mpg', axis=1))

    print("Splitting into training and test sets")
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=0.5, random_state=0)

    grid = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    print('Searching for optimal C in {} using {}-fold validation on the training set'.format(grid, nfolds))
    tuned_parameters = [{'C': grid}]
    clf = GridSearchCV(lr, tuned_parameters, cv=nfolds, scoring='accuracy')
    clf.fit(data_train, target_train)
    for params, mean_score, _ in clf.grid_scores_:
        print("{}: Mean accuracy {}".format(params, mean_score))

    print("""Cross-validating above/below average mpg prediction
    using {}-fold validation on the test dataset.
    Using the best estimator: {}
    """.format(nfolds, clf.best_estimator_))
    mean_cross = np.mean(cross_val_score(clf.best_estimator_, data_test, target_test, cv=nfolds))
    print("Mean cross-validated accuracy after optimization is: {}".format(mean_cross))
Example 4: us_crime
# Required import: from sklearn.preprocessing import Binarizer [as alias]
# Or: from sklearn.preprocessing.Binarizer import fit_transform [as alias]
def us_crime():
    # US crime dataset
    filename = '../../data/raw/mldata/communities.data'
    # Missing entries are read in as NaN; keep only the 122 continuous features
    tmp_data = np.genfromtxt(filename, delimiter=',')
    tmp_data = tmp_data[:, 5:]
    # Replace missing values by the column mean
    # (Imputer comes from sklearn.preprocessing in scikit-learn < 0.20)
    imp = Imputer(verbose=1)
    tmp_data = imp.fit_transform(tmp_data)
    # Split off the features, and binarize the last column as the label;
    # Binarizer expects a 2-D array, hence the reshape
    data = tmp_data[:, :-1]
    bn = Binarizer(threshold=0.65)
    label = np.ravel(bn.fit_transform(tmp_data[:, -1].reshape(-1, 1)))
    np.savez('../../data/clean/uci-us-crime.npz', data=data, label=label)
Example 5: ngram
# Required import: from sklearn.preprocessing import Binarizer [as alias]
# Or: from sklearn.preprocessing.Binarizer import fit_transform [as alias]
#
# Comment the section below out if you have already made the pickle files
#
# ---------------------------------------------------------------------------------------
all_bigr = ngram(X_train, 'bigram')  # starting with all features
print("Starting counting bigrams...")
X_train_bi_counted = count(X_train, all_bigr, 'bigram')
print("Done counting train set")
X_test_bi_counted = count(X_test, all_bigr, 'bigram')
print("Done counting test set")
print("Binarizing and dumping files")
# Renamed from `bin`, which shadows the Python builtin; fit_transform on the
# training counts, then the same (stateless) transform on the test counts
binarizer = Binarizer()
X_train_bi_binary = binarizer.fit_transform(X_train_bi_counted)
X_test_bi_binary = binarizer.transform(X_test_bi_counted)
pickle.dump(X_train_bi_binary, open("X_train_bi_binary.p", "wb"))
pickle.dump(X_test_bi_binary, open("X_test_bi_binary.p", "wb"))
print("Done")
print("Starting tfidf vectors...")
X_train_bi_tfidf, X_test_bi_tfidf = tfidf(X_train_bi_counted, X_test_bi_counted)
pickle.dump(X_train_bi_tfidf, open("X_train_bi_tfidf.p", "wb"))
pickle.dump(X_test_bi_tfidf, open("X_test_bi_tfidf.p", "wb"))
print("Done")
print("Starting feature selection using CART random forests on binary files")
indices_important_feats_bi_bin = tree(X_train_bi_binary, y_train, all_bigr, 'Bigram_binary')
Example 6: binarize
# Required import: from sklearn.preprocessing import Binarizer [as alias]
# Or: from sklearn.preprocessing.Binarizer import fit_transform [as alias]
def binarize(img, threshold):
    # copy=False binarizes the image array in place where possible
    binarizer = Binarizer(threshold=threshold, copy=False)
    return binarizer.fit_transform(img)
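A hypothetical call on a tiny "image" (Binarizer expects a 2-D array, which a grayscale image already is; the pixel values and threshold below are illustrative):

import numpy as np

img = np.array([[12.0, 200.0], [90.0, 30.0]])  # illustrative pixel intensities
print(binarize(img, threshold=100))            # [[0. 1.] [0. 0.]]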
Example 7: load
# Required import: from sklearn.preprocessing import Binarizer [as alias]
# Or: from sklearn.preprocessing.Binarizer import fit_transform [as alias]
def load(opt='custom', x_filename=None, y_filename=None, n_samples=0,
         samples_on='rows', **kwargs):
    """Load a specified dataset.

    This function can be used either to load one of the standard scikit-learn
    datasets or a different dataset saved as X.npy and Y.npy in the working
    directory.

    Parameters
    -----------
    opt : {'iris', 'digits', 'diabetes', 'boston', 'circles', 'moons',
           'custom', 'GSEXXXXX'}, default: 'custom'
        Name of a predefined dataset to be loaded. 'iris', 'digits',
        'diabetes', 'boston', 'circles' and 'moons' refer to the
        corresponding `scikit-learn` datasets. 'custom' can be used to load
        a custom dataset whose name is specified in `x_filename` and
        `y_filename` (optional).

    x_filename : string, default: None
        The data matrix file name.

    y_filename : string, default: None
        The label vector file name.

    n_samples : int
        The number of samples to be loaded. This comes in handy when dealing
        with large datasets. When n_samples is less than the actual size of
        the dataset, this function performs a random subsampling that is
        stratified w.r.t. the labels (if provided).

    samples_on : string
        Either 'row'/'rows' if the samples lie on the rows of the input data
        matrix, or 'col'/'cols' if they lie on the columns.

    data_sep : string
        The data separator, e.g. comma, tab, blank space.

    Returns
    -----------
    X : array of float, shape : n_samples x n_features
        The input data matrix.

    y : array of float, shape : n_samples
        The label vector; np.nan if missing.

    feature_names : array of integers (or strings), shape : n_features
        The feature names; a range of numbers if missing.

    index : list of integers (or strings)
        The sample identifiers, if provided as the first column (or row) of
        the input file; otherwise an incremental range of size n_samples.
    """
    data = None
    try:
        if opt.lower() == 'iris':
            data = datasets.load_iris()
        elif opt.lower() == 'digits':
            data = datasets.load_digits()
        elif opt.lower() == 'diabetes':
            data = datasets.load_diabetes()
            # Binarize the continuous regression target around its mean
            # (the original passed data.data here, which looks like a bug)
            b = Binarizer(threshold=np.mean(data.target))
            data.target = np.ravel(b.fit_transform(data.target.reshape(-1, 1)))
        elif opt.lower() == 'boston':
            data = datasets.load_boston()
            b = Binarizer(threshold=np.mean(data.target))
            data.target = np.ravel(b.fit_transform(data.target.reshape(-1, 1)))
        elif opt.lower() == 'gauss':
            means = np.array([[-1, 1, 1, 1], [0, -1, 0, 0], [1, 1, -1, -1]])
            sigmas = np.array([0.33, 0.33, 0.33])
            if n_samples <= 1:
                n_samples = 333
            xx, yy = generate_gauss(mu=means, std=sigmas, n_sample=n_samples)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'circles':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_circles(n_samples=n_samples, factor=.3,
                                           noise=.05)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'moons':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_moons(n_samples=n_samples, noise=.01)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'custom':
            data = load_custom(x_filename, y_filename, samples_on, **kwargs)
        elif opt.lower().startswith('gse'):
            raise Exception("Use ade_GEO2csv.py to convert GEO DataSets "
                            "into csv files.")
    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))

    X, y = data.data, data.target
    if n_samples > 0 and X.shape[0] > n_samples:
        if y is not None:
            try:  # Legacy StratifiedShuffleSplit signature (old sklearn)
                sss = StratifiedShuffleSplit(y, test_size=n_samples, n_iter=1)
                # idx = np.random.permutation(X.shape[0])[:n_samples]
            except TypeError:
# ......... (the rest of the code is omitted here) .........
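The tail of the function is omitted on this page, but the docstring documents the return values; a hypothetical call relying on that documented signature would look like:

# Hypothetical usage, assuming load() returns what its docstring describes
X, y, feature_names, index = load(opt='iris')
print(X.shape, y.shape)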
Example 8: fp_vectorizer
# Required import: from sklearn.preprocessing import Binarizer [as alias]
# Or: from sklearn.preprocessing.Binarizer import fit_transform [as alias]
def fp_vectorizer(self, processed_data):
    # Entries greater than 5 (e.g. raw feature counts) become 1, the rest 0
    binarizer = Binarizer(threshold=5)
    vectorized_data = binarizer.fit_transform(processed_data)
    return vectorized_data
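A standalone sketch of the same thresholding with made-up count data (bypassing the class wrapper):

import numpy as np
from sklearn.preprocessing import Binarizer

counts = np.array([[2, 7], [5, 11]])                 # illustrative feature counts
print(Binarizer(threshold=5).fit_transform(counts))  # [[0 1] [0 1]]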