This article collects typical usage examples of the Python class sklearn.preprocessing.Binarizer. If you are wondering what the Binarizer class is for or how to use it, the curated class examples below may help.
The following presents 15 code examples of the Binarizer class, sorted by popularity by default.
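Before diving into the examples, here is a minimal sketch of the core API for orientation (the input array and the threshold value are illustrative): Binarizer maps every value strictly greater than threshold to 1 and everything else to 0.

import numpy as np
from sklearn.preprocessing import Binarizer

X = np.array([[0.5, 1.5], [2.0, 0.1]])  # illustrative 2-D input
binarizer = Binarizer(threshold=1.0)    # values > 1.0 -> 1, all others -> 0
print(binarizer.fit_transform(X))       # [[0. 1.]
                                        #  [1. 0.]]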
Example 1: cv_mean_std_array
def cv_mean_std_array(X, y, alphas, ks, n_a, n_k, cv=20):
    n = n_a * n_k
    cv_mean = np.empty(n)
    cv_std = np.empty(n)
    regressors = pd.DataFrame()
    binarizer = Binarizer(threshold=1400)
    y_binary = binarizer.transform(y).transpose().ravel()
    itt_counter = 0
    print('size n_a: %d n_k: %d' % (n_a, n_k))
    for i in range(0, n_a):
        print('reg. column: %d' % (i * n_k))
        temp_string = 'alpha=%f' % alphas[i * n_k]
        print(temp_string)
        print(regressors.shape)
        df_temp = pd.DataFrame()
        print('computing for alpha = %f' % alphas[i * n_k])
        X_lasso, df_temp[temp_string] = df_Lasso(X, y, alphas[i * n_k])
        regressors = pd.concat([regressors, df_temp], ignore_index=True, axis=1)
        for j in range(0, n_k):
            print('i:%d, j:%d' % (i, j))
            print('computing for alpha = %f and k = %f' % (alphas[n_k * i + j], ks[n_k * i + j]))
            print('X_lasso shape:')
            print(X_lasso.shape)
            cv_mean[n_k * i + j], cv_std[n_k * i + j] = knn_cv_mean_and_std(X_lasso, y_binary, alphas[n_k * i + j], ks[n_k * i + j], cv=cv)
            itt_counter += 1
            print('completed %dth iteration of knn cv mean:%f std:%f, at pos:%d' % (itt_counter, cv_mean[n_k * i + j], cv_std[n_k * i + j], n_k * i + j))
    return cv_mean, cv_std, regressors
Example 2: cv_mean_std_array
def cv_mean_std_array(X, y, alphas, n_a, cv=20):
    binarizer = Binarizer(threshold=1400)
    y_binary = binarizer.transform(y).transpose().ravel()
    cv_ols_means, cv_ols_stds = np.empty(n_a), np.empty(n_a)
    cv_lasso_means, cv_lasso_stds = np.empty(n_a), np.empty(n_a)
    cv_ridge_means, cv_ridge_stds = np.empty(n_a), np.empty(n_a)
    for i in range(0, n_a):
        print('computing for alpha=%f' % alphas[i])
        cv_ols_means[i], cv_ols_stds[i], cv_lasso_means[i], cv_lasso_stds[i], cv_ridge_means[i], cv_ridge_stds[i] = lm_cv_mean_and_std(X, y_binary, alphas[i])
        print('successfully computed iteration %d' % i)
    return cv_ols_means, cv_ols_stds, cv_lasso_means, cv_lasso_stds, cv_ridge_means, cv_ridge_stds
Example 3: initialize
def initialize():
    images, labels = load_mnist_data()
    binarizer = Binarizer().fit(images)
    images_binarized = binarizer.transform(images)
    knn = KNeighborsClassifier(n_neighbors=3, metric='jaccard')
    knn.fit(images_binarized, labels)
    return knn
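Design note: the binarization step matters here because the Jaccard metric is defined on binary vectors, so the pixel images are mapped to 0/1 before the k-NN classifier is fitted.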
Example 4: binarizeMatrix
def binarizeMatrix(dataMatrix, threshold):
    """
    Maps every entry of dataMatrix to 0 or 1: values greater than
    `threshold` become 1, values less than or equal to it become 0.
    """
    binarizer = Binarizer(threshold=threshold)
    dataMatrix = binarizer.fit_transform(dataMatrix)
    return dataMatrix
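A quick usage sketch of the function above (the input matrix is illustrative):

import numpy as np

M = np.array([[0.2, 0.7], [0.5, 0.9]])
print(binarizeMatrix(M, 0.5))  # [[0. 1.]
                               #  [0. 1.]] -- 0.5 itself maps to 0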
Example 5: test_binarizer
def test_binarizer():
    X_ = np.array([[1, 0, 5], [2, 3, 0]])
    for init in (np.array, sp.csr_matrix, sp.csc_matrix):
        X = init(X_.copy())

        binarizer = Binarizer(threshold=2.0, copy=True)
        X_bin = toarray(binarizer.transform(X))
        assert_equal(np.sum(X_bin == 0), 4)
        assert_equal(np.sum(X_bin == 1), 2)
        X_bin = binarizer.transform(X)
        assert_equal(type(X), type(X_bin))

        binarizer = Binarizer(copy=True).fit(X)
        X_bin = toarray(binarizer.transform(X))
        assert_true(X_bin is not X)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        binarizer = Binarizer(copy=True)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is not X)
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        binarizer = Binarizer(copy=False)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is X)
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)
Example 6: test_binarizer_vs_sklearn
def test_binarizer_vs_sklearn():
    # Compare msmbuilder.preprocessing.Binarizer
    # with sklearn.preprocessing.Binarizer
    binarizerr = BinarizerR()
    binarizerr.fit(np.concatenate(trajs))

    binarizer = Binarizer()
    binarizer.fit(trajs)

    y_ref1 = binarizerr.transform(trajs[0])
    y1 = binarizer.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Example 7: wine_quality_white
def wine_quality_white():
    # White wine quality dataset
    filename = '../../data/raw/mldata/winequality-white.csv'
    # The data corresponds to the first 11 columns of the csv file
    data = np.loadtxt(filename, usecols=tuple(range(11)), delimiter=';', dtype=float)
    # Read the label and binarise it using a threshold of 4
    bn = Binarizer(threshold=4)
    label = bn.fit_transform(np.loadtxt(filename, usecols=(11,), delimiter=';', dtype=int))
    # Invert the labels -> 1 becomes 0 and 0 becomes 1
    label = np.ravel(np.abs(label - 1))
    np.savez('../../data/clean/uci-wine-quality-white.npz', data=data, label=label)
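Note that the two steps together label a wine with 1 exactly when its quality score is at most 4: the Binarizer marks quality > 4 with 1, and the inversion then flips that.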
Example 8: fit
def fit(self, X, y=None):
    """
    Fits the binarizer on the data.
    """
    # print("Fitting binarizer...")
    methods = Binarizer._UNSUPERVISED_METHODS + Binarizer._SUPERVISED_METHODS
    if self.method not in methods:
        raise ValueError("Method should be one of {0}".format(", ".join(methods)))
    X = check_array(X, accept_sparse=['csr', 'csc'])
    if issparse(X):
        X = X.tocsc()
    if self.method in Binarizer._UNSUPERVISED_METHODS:
        self._fit_unsupervised(X)
        self.joint_thresholds_ = self.thresholds_
        self.joint_scores_ = self.scores_
    else:
        if y is None:
            raise ValueError("y must not be None for supervised binarizers.")
        # TODO: move this into a separate function
        # y = np.array(y)
        # if len(y.shape) == 1:
        #     self.classes_, y = np.unique(y, return_inverse=True)
        #     nclasses = self.classes_.shape[0]
        #     Y_new = np.zeros(shape=(y.shape[0], nclasses), dtype=int)
        #     Y_new[np.arange(y.shape[0]), y] = 1
        # else:
        #     self.classes_ = np.arange(y.shape[1])
        #     Y_new = y
        label_binarizer = SK_LabelBinarizer()
        Y_new = label_binarizer.fit_transform(y)
        self.classes_ = label_binarizer.classes_
        if X.shape[0] != Y_new.shape[0]:
            raise ValueError("X and y have incompatible shapes.\n"
                             "X has %s samples, but y has %s." %
                             (X.shape[0], Y_new.shape[0]))
        self._fit_supervised(X, Y_new)
        if len(self.classes_) <= 2:
            self.joint_thresholds_ = self.thresholds_[:, 0]
            self.joint_scores_ = self.scores_[:, 0]
        else:
            min_class_scores = np.min(self.scores_, axis=0)
            max_class_scores = np.max(self.scores_, axis=0)
            diffs = max_class_scores - min_class_scores
            diffs[np.where(diffs == 0)] = 1
            normalized_scores = (self.scores_ - min_class_scores) / diffs
            # for each feature, find the class for which it is most useful
            # (this could probably be done differently)
            optimal_indexes = np.argmax(normalized_scores, axis=1)
            nfeat = self.thresholds_.shape[0]
            # as the binarization threshold for each feature, take
            # its threshold for the class where it is most useful
            self.joint_thresholds_ = self.thresholds_[np.arange(nfeat), optimal_indexes]
            self.joint_scores_ = self.scores_[np.arange(nfeat), optimal_indexes]
    # hand the chosen thresholds to sklearn's Binarizer (SK_Binarizer)
    self.binarize_transformer_ = SK_Binarizer(self.joint_thresholds_)
    return self
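Design note: the per-class scores are min-max normalized before the argmax so that classes with larger raw score ranges do not dominate the per-feature choice of class.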
Example 9: do_logreg
def do_logreg():
    import numpy as np
    import pandas
    from sklearn.preprocessing import Binarizer, scale
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score
    from sklearn.grid_search import GridSearchCV
    from scipy.stats import expon

    ### load data
    col_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                 'acceleration', 'model_year', 'origin', 'car_name']
    df = pandas.read_csv('auto_mpg.csv')
    df.columns = col_names
    df = df.drop('car_name', 1)

    lr = LogisticRegression()
    bn = Binarizer(threshold=df['mpg'].mean())
    print("Performing binarization of the mpg variable into above/below average classes")
    target = bn.fit_transform(df['mpg'])
    data = df.drop('mpg', 1)
    data = scale(data)

    print("Splitting into training and test sets")
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=0.5, random_state=0)

    nfolds = 5  # number of CV folds; not defined in the original snippet, assumed here
    grid = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    print('Searching for optimal C in {} using {}-fold cross-validation on the training set'.format(grid, nfolds))
    tuned_parameters = [{'C': grid}]
    clf = GridSearchCV(lr, tuned_parameters, cv=nfolds, scoring='accuracy')
    clf.fit(data_train, target_train)
    for params, mean_score, _ in clf.grid_scores_:
        print("{}: Mean accuracy {}".format(params, mean_score))
    print("""Cross-validating above/below average mpg prediction
    using {}-fold validation on the test dataset.
    Using the best estimator: {}
    """.format(nfolds, clf.best_estimator_))
    mean_cross = np.mean(cross_val_score(clf.best_estimator_, data_test, target_test, cv=nfolds))
    print("Mean cross-validated accuracy after optimization is: {}".format(mean_cross))
Example 10: us_crime
def us_crime():
    # US crime dataset
    filename = '../../data/raw/mldata/communities.data'
    # Missing data will be considered as NaN;
    # only use the 122 continuous features
    tmp_data = np.genfromtxt(filename, delimiter=',')
    tmp_data = tmp_data[:, 5:]
    # Replace missing values by the mean
    imp = Imputer(verbose=1)
    tmp_data = imp.fit_transform(tmp_data)
    # Extract the data to be saved
    data = tmp_data[:, :-1]
    bn = Binarizer(threshold=0.65)
    label = np.ravel(bn.fit_transform(tmp_data[:, -1]))
    np.savez('../../data/clean/uci-us-crime.npz', data=data, label=label)
Example 11: OneHotEncoder
from sklearn.preprocessing import Binarizer, LabelEncoder, OneHotEncoder
onehot_encoder = OneHotEncoder()
label_encoder = LabelEncoder()
x = ['a', 'b', 'c']
label_x = label_encoder.fit_transform(x).reshape([len(x), 1])
print(label_x)
print(onehot_encoder.fit_transform(label_x).toarray())
binarizer = Binarizer(threshold=1.0).fit(label_x)
print(binarizer.transform(label_x))
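For reference, with x = ['a', 'b', 'c'] the label encoder yields [[0], [1], [2]], the one-hot encoder expands that to the 3x3 identity matrix, and the binarizer (threshold 1.0) produces [[0], [0], [1]], since only the value 2 exceeds the threshold.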
Example 12: Binarizer
# In[3]:
# Import csv data
raw_data = pd.read_csv('OnlineNewsPopularity_wLabels_deleteNoise.csv').iloc[:, 1:]  # read in csv, omit the first column of url
raw_data = raw_data.iloc[:, :-1]
news_data = raw_data.iloc[:, :-1]  # Take up to the second last column
news_labels = raw_data.iloc[:, -1]  # Take shares column for labels

# Binarize
print('\nBinary Threshold:')
binary_threshold = np.median(raw_data[' shares'])
news_data = news_data.drop(' n_non_stop_words', 1)
print(binary_threshold)
binarizer = Binarizer(threshold=binary_threshold)
y_binary = binarizer.transform(news_labels).transpose().ravel()

# In[ ]:
# Discretize

# In[25]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score  # assumed import; not shown in the original snippet
tree = DecisionTreeClassifier()
print('Decision Tree Classifier Accuracy Rate')
tree_score = cross_val_score(tree, news_data, y_binary, cv=10)
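Note that transform works here without a prior fit because sklearn's Binarizer is stateless: fit only validates the input and learns nothing from the data.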
Example 13: DictVectorizer
news_data = extracted_data.iloc[:, :-1] # Take up to the second last column
news_labels = extracted_data[' shares'] # Take shares column for labels
# Data Preprocessing
news_data_transpose = news_data.transpose()
data_into_dict = news_data_transpose.to_dict()
list_data = [v for k, v in data_into_dict.items()]
# Encode
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()
transformed_data = dv.fit_transform(list_data).toarray()
# Label Encoder - Binarization
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=1400) # Threshold at 1400 because median of shares is 1400
transformed_labels = binarizer.transform(news_labels)
transformed_labels = transformed_labels.transpose().ravel() # .ravel() is to fix "Too many array indices error"
# Could be a scikit or pandas bug
############## Classification #################
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
# Decision Tree Classifier
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()
gnb = GaussianNB()
Example 14: ngram
#---------------------------------------------------------------------------------------
#
# Comment the section below out if you have already made the pickle files
#
#---------------------------------------------------------------------------------------
all_bigr = ngram(X_train, 'bigram')  # starting with all features

print("Starting counting bigrams...")
X_train_bi_counted = count(X_train, all_bigr, 'bigram')
print("Done counting train set")
X_test_bi_counted = count(X_test, all_bigr, 'bigram')
print("Done counting test set")

print("Binarizing and dumping files")
binarizer = Binarizer()
X_train_bi_binary = binarizer.fit_transform(X_train_bi_counted)
X_test_bi_binary = binarizer.transform(X_test_bi_counted)
pickle.dump(X_train_bi_binary, open("X_train_bi_binary.p", "wb"))
pickle.dump(X_test_bi_binary, open("X_test_bi_binary.p", "wb"))
print("Done")

print("Starting tfidf vectors...")
X_train_bi_tfidf, X_test_bi_tfidf = tfidf(X_train_bi_counted, X_test_bi_counted)
pickle.dump(X_train_bi_tfidf, open("X_train_bi_tfidf.p", "wb"))
pickle.dump(X_test_bi_tfidf, open("X_test_bi_tfidf.p", "wb"))
print("Done")

print("Starting feature selection using CART random forests on binary files")
Example 15: print
_, n_features = X.get_shape()
print('Loading test data...')
with open('data/test-svmlight.dat') as infile:
    lines = infile.readlines()
n_samples = len(lines)
test = lil_matrix((n_samples, n_features))
for n, line in enumerate(lines):
    for word_count in line.split():
        fid, count = word_count.split(':')
        test[n, int(fid)] = int(count)  # store the count for feature `fid`
test = test.tocsr()
if opts.binarize:
    print('Binarizing the data...')
    binar = Binarizer(copy=False)
    X = binar.transform(X)
    test = binar.transform(test)
if opts.tfidf:
    print('Transforming word occurrences into TF-IDF...')
    tranny = TfidfTransformer()
    X = tranny.fit_transform(X)
    test = tranny.transform(test)
if opts.select_features:
    k_features = int(opts.k_features)
    if opts.select_features == 'k-best':
        print('Selecting %i best features...' % k_features)
        ch2 = SelectKBest(chi2, k=k_features)
    if opts.select_features == 'pct':