This article collects typical usage examples of the Python method sklearn.covariance.EmpiricalCovariance.mahalanobis. If you are unsure what EmpiricalCovariance.mahalanobis does or how to use it, the curated examples below should help. You can also read further about the class that defines the method, sklearn.covariance.EmpiricalCovariance.
The following presents 13 code examples of EmpiricalCovariance.mahalanobis, ordered by popularity by default.
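Before the examples, a minimal sketch of the method itself, assuming nothing beyond NumPy and scikit-learn: fit() estimates a location and covariance, and mahalanobis(X) then returns the squared Mahalanobis distance of each row of X. The data and variable names below are illustrative.

import numpy as np
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
X = rng.randn(100, 2)                 # synthetic two-feature sample

cov = EmpiricalCovariance().fit(X)    # estimates location_ and covariance_
sq_dist = cov.mahalanobis(X)          # squared distances, shape (100,)
print(sq_dist.min(), sq_dist.max())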
Example 1: test_covariance
# Required import: from sklearn.covariance import EmpiricalCovariance [as alias]
# Or: from sklearn.covariance.EmpiricalCovariance import mahalanobis [as alias]
def test_covariance():
    """Tests Covariance module on a simple dataset."""
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), squared=False), 0)
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert np.amax(mahal_dist) < 250
    assert np.amin(mahal_dist) > 50
    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)
    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)
Example 2: OneClassMahalanobis
# Required import: from sklearn.covariance import EmpiricalCovariance [as alias]
# Or: from sklearn.covariance.EmpiricalCovariance import mahalanobis [as alias]
class OneClassMahalanobis(BaseClassifier):
    _fit_params = ['perc_keep']
    _predict_params = []

    def __init__(self, *args, **kwargs):
        # BaseClassifier.__init__(self, *args, **kwargs)
        self.perc_keep = kwargs["perc_keep"]

    def fit(self, data):
        nu = 0.01
        n_sample = data.shape[0]
        n_feature = data.shape[1]

        # Trim the most extreme nu/2 tails of every feature before fitting.
        exclude = set()
        for d in range(n_feature):
            feature = data[:, d]
            s_feature = feature.copy()
            s_feature.sort()
            low = s_feature[int(n_sample * nu / 2)]
            upp = s_feature[n_sample - int(n_sample * nu / 2)]
            exld = numpy.nonzero(numpy.logical_or(feature > upp, feature < low))[0]
            exclude.update(exld)
        use = numpy.array([f for f in range(n_sample) if f not in exclude])
        data_ = data[use, :]

        self.cov = EmpiricalCovariance().fit(data_)
        dist = self.cov.mahalanobis(data)
        self.cutoff = numpy.percentile(dist, self.perc_keep)
        print(self.cutoff)

    def predict(self, data):
        mahal_dist = self.cov.mahalanobis(data)
        self.mahal_dist = mahal_dist
        print(mahal_dist.min(), mahal_dist.max(), self.cutoff,
              (mahal_dist > self.cutoff).sum(), "of", len(mahal_dist))
        # Map outliers (distance above cutoff) to -1 and inliers to +1.
        return (mahal_dist > self.cutoff).astype(numpy.uint8) * -2 + 1

    def decision_function(self, data=None):
        return self.mahal_dist
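A hypothetical usage sketch of the class above. BaseClassifier, train_data, and test_data are assumed to exist in the surrounding project; perc_keep=95 is an illustrative value placing the cutoff at the 95th percentile of training distances.

clf = OneClassMahalanobis(perc_keep=95)
clf.fit(train_data)                  # (n_samples, n_features) array
labels = clf.predict(test_data)      # +1 inliers, -1 outliers
scores = clf.decision_function()     # distances cached by predict()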
Example 3: Mahalanobis
# Required import: from sklearn.covariance import EmpiricalCovariance [as alias]
# Or: from sklearn.covariance.EmpiricalCovariance import mahalanobis [as alias]
class Mahalanobis(BaseEstimator):
    """Mahalanobis distance estimator. Uses a covariance estimate
    to compute the Mahalanobis distance of the observations
    from the model.

    Parameters
    ----------
    robust : boolean, determines whether to use a robust estimator
        based on the Minimum Covariance Determinant computation.
    """
    def __init__(self, robust=False):
        if not robust:
            from sklearn.covariance import EmpiricalCovariance as CovarianceEstimator
        else:
            from sklearn.covariance import MinCovDet as CovarianceEstimator
        self.model = CovarianceEstimator()
        self.cov = None

    def fit(self, X, y=None, **params):
        """Fits the covariance model according to the given training
        data and parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        self : object
            Returns self.
        """
        self.cov = self.model.fit(X)
        return self

    def score(self, X, y=None):
        """Computes the Mahalanobis distances of the given observations.

        The provided observations are assumed to be centered. One may want to
        center them using a location estimate first.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The observations whose Mahalanobis distances we compute.

        Returns
        -------
        mahalanobis_distance : array, shape = [n_observations,]
            Mahalanobis distances of the observations.
        """
        # return self.model.score(X, assume_centered=True)
        return -self.model.mahalanobis(X - self.model.location_) ** 0.33
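A brief usage sketch on synthetic data (names illustrative). robust=True swaps EmpiricalCovariance for the outlier-resistant MinCovDet estimator; score returns negated, cube-root-compressed distances, so larger values mean a better fit.

import numpy as np

rng = np.random.RandomState(42)
X = rng.randn(200, 3)

est = Mahalanobis(robust=True).fit(X)   # MinCovDet-based estimate
scores = est.score(X)                   # one nonpositive score per row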
Example 4: test_covariance
# Required import: from sklearn.covariance import EmpiricalCovariance [as alias]
# Or: from sklearn.covariance.EmpiricalCovariance import mahalanobis [as alias]
def test_covariance():
    """Tests Covariance module on a simple dataset."""
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError,
                  cov.error_norm, emp_cov, norm='foo')
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    print(np.amin(mahal_dist), np.amax(mahal_dist))
    assert np.amin(mahal_dist) > 0
    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)
    # test with one sample
    # FIXME I don't know what this test does
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    assert_warns(UserWarning, cov.fit, X_1sample)
    assert_array_almost_equal(cov.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))
    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)
    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))
Example 5: mahalanobis_plot
# Required import: from sklearn.covariance import EmpiricalCovariance [as alias]
# Or: from sklearn.covariance.EmpiricalCovariance import mahalanobis [as alias]
def mahalanobis_plot(ctry=None, df=None, weighted=True, inliers=False):
    """
    See http://scikit-learn.org/0.13/modules/outlier_detection.html#\
fitting-an-elliptic-envelop
    for details.
    """
    if df is None and ctry is None:
        raise ValueError('Either the country or a dataframe must be supplied')
    elif df is None:
        df = load_res(ctry, weighted=weighted)
    if inliers:
        df = get_inliers(df=df)
    X = df.values
    robust_cov = MinCovDet().fit(X)
    # Compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance().fit(X)
    # Display results
    fig = plt.figure()
    fig.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)
    # Show data set
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.scatter(X[:, 0], X[:, 1], alpha=.5, color='k', marker='.')
    ax1.set_title(country_code[ctry])
    # Show contours of the distance functions
    xx, yy = np.meshgrid(np.linspace(ax1.get_xlim()[0], ax1.get_xlim()[1], 100),
                         np.linspace(ax1.get_ylim()[0], ax1.get_ylim()[1], 100))
    zz = np.c_[xx.ravel(), yy.ravel()]
    mahal_emp_cov = emp_cov.mahalanobis(zz)
    mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
    emp_cov_contour = ax1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')
    mahal_robust_cov = robust_cov.mahalanobis(zz)
    mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
    robust_contour = ax1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')
    ax1.legend([emp_cov_contour.collections[1], robust_contour.collections[1]],
               ['MLE dist', 'robust dist'],
               loc="upper right", borderaxespad=0)
    ax1.grid()
    return (fig, ax1, ctry)
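A hypothetical call of the function above, assuming the surrounding project's load_res and country_code are importable and 'US' is a valid country key:

fig, ax, ctry = mahalanobis_plot('US', weighted=True)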
Example 6: test_covariance
# Required import: from sklearn.covariance import EmpiricalCovariance [as alias]
# Or: from sklearn.covariance.EmpiricalCovariance import mahalanobis [as alias]
def test_covariance():
    """Tests Covariance module on a simple dataset."""
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm="spectral"), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm="frobenius"), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError, cov.error_norm, emp_cov, norm="foo")
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    print(np.amin(mahal_dist), np.amax(mahal_dist))
    assert np.amin(mahal_dist) > 0
    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d), norm="spectral"), 0)
    # test with one sample
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    with warnings.catch_warnings(record=True):
        cov.fit(X_1sample)
    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)
    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))
Example 7: OneClassMahalanobis
# Required import: from sklearn.covariance import EmpiricalCovariance [as alias]
# Or: from sklearn.covariance.EmpiricalCovariance import mahalanobis [as alias]
class OneClassMahalanobis(BaseClassifier):
    _fit_params = []

    def __init__(self, *args, **kwargs):
        pass

    def fit(self, data):
        # self.cov = MinCovDet().fit(data)
        self.cov = EmpiricalCovariance().fit(data)

    def predict(self, data):
        mahal_emp_cov = self.cov.mahalanobis(data)
        d = data.shape[1]
        # 95% chi-squared quantile with d degrees of freedom as the cutoff.
        thres = scipy.stats.chi2.ppf(0.95, d)
        self.mahal_emp_cov = mahal_emp_cov
        # Map outliers (distance above cutoff) to -1 and inliers to +1.
        return (mahal_emp_cov > thres).astype(numpy.int32) * -2 + 1

    def decision_function(self, data):
        return self.mahal_emp_cov
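The chi-squared cutoff above is principled: for d-dimensional Gaussian data, squared Mahalanobis distances to the true mean and covariance follow a chi-squared distribution with d degrees of freedom, so the 95% quantile flags roughly 5% of clean data. A quick sanity check on synthetic data (all names illustrative):

import numpy as np
import scipy.stats
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
X = rng.randn(10000, 4)                       # clean Gaussian data, d = 4

d2 = EmpiricalCovariance().fit(X).mahalanobis(X)
thres = scipy.stats.chi2.ppf(0.95, 4)
print((d2 > thres).mean())                    # close to 0.05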
Example 8:
# Required import: from sklearn.covariance import EmpiricalCovariance [as alias]
# Or: from sklearn.covariance.EmpiricalCovariance import mahalanobis [as alias]
# Show data set
subfig1 = plt.subplot(3, 1, 1)
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
                              color='black', label='inliers')
outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                               color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
# Show contours of the distance functions
xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
                     np.linspace(plt.ylim()[0], plt.ylim()[1], 100))
zz = np.c_[xx.ravel(), yy.ravel()]
mahal_emp_cov = emp_cov.mahalanobis(zz)
mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')
mahal_robust_cov = robust_cov.mahalanobis(zz)
mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')
subfig1.legend([emp_cov_contour.collections[1], robust_contour.collections[1],
                inlier_plot, outlier_plot],
               ['MLE dist', 'robust dist', 'inliers', 'outliers'],
               loc="upper right", borderaxespad=0)
plt.xticks(())
Example 9:
# Required import: from sklearn.covariance import EmpiricalCovariance [as alias]
# Or: from sklearn.covariance.EmpiricalCovariance import mahalanobis [as alias]
# Show data set
subfig1 = pl.subplot(3, 1, 1)
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
                              color='black', label='inliers')
outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                               color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
# Show contours of the distance functions
xx, yy = np.meshgrid(np.linspace(pl.xlim()[0], pl.xlim()[1], 100),
                     np.linspace(pl.ylim()[0], pl.ylim()[1], 100))
zz = np.c_[xx.ravel(), yy.ravel()]
mahal_emp_cov = emp_cov.mahalanobis(zz)
mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=pl.cm.PuBu_r,
                                  linestyles='dashed')
mahal_robust_cov = robust_cov.mahalanobis(zz)
mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=pl.cm.YlOrBr_r, linestyles='dotted')
subfig1.legend([emp_cov_contour.collections[1], robust_contour.collections[1],
                inlier_plot, outlier_plot],
               ['MLE dist', 'robust dist', 'inliers', 'outliers'],
               loc="upper right", borderaxespad=0)
pl.xticks(())
Example 10: main
# Required import: from sklearn.covariance import EmpiricalCovariance [as alias]
# Or: from sklearn.covariance.EmpiricalCovariance import mahalanobis [as alias]
def main():
    parser = argparse.ArgumentParser(
        description='Plot outlier-like distances for a 2-dimensional dataset')
    parser.add_argument(
        'dataset', type=argparse.FileType('r'),
        help='a CSV file containing the dataset')
    parser.add_argument(
        '--plot', type=str, choices=['train', 'grid'], default='grid',
        help='plot the dataset or a grid evenly distributed over its span')
    parser.add_argument(
        '--plotdims', type=int, choices=[2, 3], default=2,
        help='the number of dimensions to plot')
    args = parser.parse_args()

    X = np.loadtxt(args.dataset, delimiter=',')
    fig = plt.figure()
    xformer = NullTransformer()
    if X.shape[1] > 2:
        xformer = PCA(n_components=2)
        X = xformer.fit_transform(X)
    if args.plotdims == 2:
        plt.scatter(X[:, 0], X[:, 1], s=60, linewidth='0')
    else:
        plt.scatter(X[:, 0], X[:, 1])
    plt.show(block=False)

    path_to_script = os.path.realpath(__file__)
    dir_of_script = os.path.dirname(path_to_script)
    dataset_path = dir_of_script + '/outliers.npy'
    np.save(dataset_path, X)

    ###########################################################################
    # Train autoencoder with the n samples until convergence. Run
    # evenly distributed samples through the autoencoder and compute
    # their reconstruction error.
    ###########################################################################

    maxseq_orig = np.max(X)
    minseq_orig = np.min(X)
    seqrange = np.abs(maxseq_orig - minseq_orig)
    maxseq = maxseq_orig + 0.5 * seqrange
    minseq = minseq_orig - 0.5 * seqrange
    print("minseq", minseq, "maxseq", maxseq)
    if args.plot == 'grid':
        seq = np.linspace(minseq, maxseq, num=50, endpoint=True)
        Xplot = np.array([_ for _ in product(seq, seq)])
    else:
        Xplot = X

    robust_cov = MinCovDet().fit(X)
    robust_md = robust_cov.mahalanobis(Xplot)
    empirical_cov = EmpiricalCovariance().fit(X)
    empirical_md = empirical_cov.mahalanobis(Xplot)

    # Assume Xplot is at least 2-dimensional.
    if Xplot.shape[1] > 2:
        Xplot2d = bh_sne(Xplot)
    else:
        Xplot2d = Xplot

    robust_md01 = robust_md - np.nanmin(robust_md)
    robust_md01 = robust_md01 / np.nanmax(robust_md01)
    empirical_md01 = empirical_md - np.nanmin(empirical_md)
    empirical_md01 = empirical_md01 / np.nanmax(empirical_md01)

    fig = plt.figure()
    if args.plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1],
                   cmap=plt.cm.jet, c=robust_md01, s=60, linewidth='0')
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], robust_md01,
                        cmap=plt.cm.jet, color=robust_md01)
        ax.set_zlabel('Mahalanobis distance')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Mahalanobis distance (robust covariance)')

    fig = plt.figure()
    if args.plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1],
                   cmap=plt.cm.jet, c=empirical_md01, s=60, linewidth='0')
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], empirical_md01,
                        cmap=plt.cm.jet, color=empirical_md01)
        ax.set_zlabel('Mahalanobis distance')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Mahalanobis distance (empirical covariance)')
    # ... the rest of this code is omitted ...
Example 11:
# Required import: from sklearn.covariance import EmpiricalCovariance [as alias]
# Or: from sklearn.covariance.EmpiricalCovariance import mahalanobis [as alias]
offset_bottom = fig.subplotpars.bottom
width = fig.subplotpars.right - offset_left
subfig1 = pl.subplot(3, 1, 1)
subfig2 = pl.subplot(3, 1, 2)
subfig3 = pl.subplot(3, 1, 3)
# Show data set
subfig1.scatter(X[:, 0], X[:, 1], color='black', label='inliers')
subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
subfig1.legend(loc="upper right")
# Empirical covariance-based Mahalanobis distances
subfig2.scatter(np.arange(n_samples), emp_cov.mahalanobis(X),
                color='black', label='inliers')
subfig2.scatter(np.arange(n_samples)[-n_outliers:],
                emp_cov.mahalanobis(X)[-n_outliers:],
                color='red', label='outliers')
subfig2.set_ylabel("Mahal. dist.")
subfig2.set_title("1. from empirical estimates")
subfig2.axes.set_position(pos=[offset_left, 0.39, width, .2])
# MCD-based Mahalanobis distances
subfig3.scatter(np.arange(n_samples), robust_cov.mahalanobis(X),
                color='black', label='inliers')
subfig3.scatter(np.arange(n_samples)[-n_outliers:],
                robust_cov.mahalanobis(X)[-n_outliers:],
                color='red', label='outliers')
subfig3.set_ylabel("Mahal. dist.")
Example 12: EmpiricalCovariance
# Required import: from sklearn.covariance import EmpiricalCovariance [as alias]
# Or: from sklearn.covariance.EmpiricalCovariance import mahalanobis [as alias]
emp_cov = EmpiricalCovariance().fit(X)
# Display results
fig = pl.figure()
# Show data set
subfig1 = pl.subplot(3, 1, 1)
subfig1.scatter(X[:, 0], X[:, 1], color='black', label='inliers')
subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
subfig1.legend(loc="upper right")
emp_mahal = emp_cov.mahalanobis(X) ** (0.33)
subfig2 = pl.subplot(2, 2, 3)
subfig2.boxplot([emp_mahal[:-n_outliers], emp_mahal[-n_outliers:]], widths=.25)
subfig2.plot(1.26 * np.ones(n_samples - n_outliers),
             emp_mahal[:-n_outliers], '+k', markeredgewidth=1)
subfig2.plot(2.26 * np.ones(n_outliers),
             emp_mahal[-n_outliers:], '+k', markeredgewidth=1)
subfig2.axes.set_xticklabels(('inliers', 'outliers'), size=11)
subfig2.set_ylabel(r"$\sqrt[3]{\rm{(Mahal. dist.)}}$")
subfig2.set_title("1. from non-robust estimates\n(Maximum Likelihood)")
robust_mahal = robust_cov.mahalanobis(X) ** (0.33)
subfig3 = pl.subplot(2, 2, 4)
subfig3.boxplot([robust_mahal[:-n_outliers], robust_mahal[-n_outliers:]],
                widths=.25)
subfig3.plot(1.26 * np.ones(n_samples - n_outliers),
Example 13: ECDF
# Required import: from sklearn.covariance import EmpiricalCovariance [as alias]
# Or: from sklearn.covariance.EmpiricalCovariance import mahalanobis [as alias]
# save for heuristic correction
age = df_test['var15']
age_ecdf = ECDF(df_train['var15'])
df_train['var15'] = age_ecdf(df_train['var15'])
df_test['var15'] = age_ecdf(df_test['var15'])

# feature engineering
df_train.loc[df_train['var3'] == -999999.000000, 'var3'] = 2.0
df_train['num_zeros'] = (df_train == 0).sum(axis=1)
# The original masked df_test on df_train['var3'] here, almost certainly a typo.
df_test.loc[df_test['var3'] == -999999.000000, 'var3'] = 2.0
df_test['num_zeros'] = (df_test == 0).sum(axis=1)

# outliers: drop training rows with extreme squared Mahalanobis distances
ec = EmpiricalCovariance()
ec = ec.fit(df_train)
m2 = ec.mahalanobis(df_train)
df_train = df_train[m2 < 40000]
df_target = df_target[m2 < 40000]

# clip
# df_test = df_test.clip(df_train.min(), df_train.max(), axis=1)

# standard preprocessing
prep = Pipeline([
    ('cd', ColumnDropper(drop=ZERO_VARIANCE_COLUMNS + CORRELATED_COLUMNS)),
    ('std', StandardScaler())
])
X_train = prep.fit_transform(df_train)
X_test = prep.transform(df_test)
y_train = df_target.values
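The hard-coded cutoff of 40000 above is specific to that dataset. A hedged sketch of the same filtering step with a percentile-based cutoff, which transfers more readily to other data (df_train and df_target as in the example):

import numpy as np
from sklearn.covariance import EmpiricalCovariance

m2 = EmpiricalCovariance().fit(df_train).mahalanobis(df_train)
keep = m2 < np.percentile(m2, 99)   # drop the most extreme 1% of rows
df_train = df_train[keep]
df_target = df_target[keep]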