本文整理汇总了Python中sklearn.covariance.MinCovDet.mahalanobis方法的典型用法代码示例。如果您正苦于以下问题：Python MinCovDet.mahalanobis方法的具体用法？Python MinCovDet.mahalanobis怎么用？Python MinCovDet.mahalanobis使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.covariance.MinCovDet的用法示例。
在下文中一共展示了MinCovDet.mahalanobis方法的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: getMahalanobisRobust
# 需要导入模块: from sklearn.covariance import MinCovDet [as 别名]
# 或者: from sklearn.covariance.MinCovDet import mahalanobis [as 别名]
def getMahalanobisRobust(dat, critical_alpha=0.01, good_rows=np.zeros(0)):
    """Compute robust Mahalanobis distances for the rows of `dat` and flag outliers.

    A Minimum Covariance Determinant estimator is fitted on ``dat[good_rows]``
    and every fully finite row of `dat` is scored against it.

    Parameters
    ----------
    dat : 2-D array, one observation per row.
    critical_alpha : significance level used to derive the outlier threshold.
    good_rows : boolean mask or index array selecting the rows used for the
        fit. When empty (the default), all rows without NaN/inf are used.

    Returns
    -------
    mahalanobis_dist : 1-D array with one (non-squared) distance per row.
        Rows containing NaN/inf get NaN. If the robust fit fails, the whole
        array is zeros.
    outliers : boolean array, True where the distance exceeds `maha_lim`.
    maha_lim : threshold on the same (non-squared) scale as the distances.
    """
    # MinCovDet rejects NaN/inf, so a row is usable only if *all* of its
    # values are finite (a row with a single valid value is still unusable).
    finite_rows = np.all(np.isfinite(dat), axis=1)
    if good_rows.size == 0:
        good_rows = finite_rows

    try:
        robust_cov = MinCovDet().fit(dat[good_rows])
        mahalanobis_dist = np.full(dat.shape[0], np.nan)
        if np.any(finite_rows):
            # MinCovDet.mahalanobis returns squared distances; score only the
            # finite rows so that one bad row does not invalidate the others.
            mahalanobis_dist[finite_rows] = np.sqrt(
                robust_cov.mahalanobis(dat[finite_rows]))
    except ValueError:
        # The fit fails when the covariance matrix is singular. This happens
        # if the data is not a unimodal symmetric distribution, for example
        # when there are too many small noisy particles. Take the safe option
        # and return zeros so that nothing is flagged as an outlier.
        mahalanobis_dist = np.zeros(dat.shape[0])

    # Critical value from the chi-square distribution. The *squared*
    # Mahalanobis distance of Gaussian data follows chi2 with one degree of
    # freedom per feature, so the limit for the distance itself is the square
    # root of the chi2 quantile.
    # https://en.wikiversity.org/wiki/Mahalanobis%27_distance
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
    maha_lim = np.sqrt(chi2.ppf(1 - critical_alpha, dat.shape[1]))

    # NaN distances compare as False; silence the warning older numpy emits.
    with np.errstate(invalid='ignore'):
        outliers = mahalanobis_dist > maha_lim

    return mahalanobis_dist, outliers, maha_lim
示例2: launch_mcd_on_dataset
# 需要导入模块: from sklearn.covariance import MinCovDet [as 别名]
# 或者: from sklearn.covariance.MinCovDet import mahalanobis [as 别名]
def launch_mcd_on_dataset(n_samples, n_features, n_outliers, tol_loc, tol_cov,
                          tol_support):
    """Fit MinCovDet on contaminated Gaussian data and check its estimates.

    Standard-normal samples are drawn from a fixed seed, `n_outliers` randomly
    chosen rows are shifted by +/-5 in every coordinate, and the robust
    estimates are compared with those computed from the untouched rows only.

    Raises AssertionError when the location error reaches `tol_loc`, the
    covariance error reaches `tol_cov`, the support holds fewer than
    `tol_support` samples, or ``mahalanobis(data)`` disagrees with ``dist_``.
    """
    rng = np.random.RandomState(0)
    samples = rng.randn(n_samples, n_features)

    # Contaminate a random subset of rows.
    contaminated = rng.permutation(n_samples)[:n_outliers]
    shift = 10. * (rng.randint(2, size=(n_outliers, n_features)) - 0.5)
    samples[contaminated] += shift

    is_inlier = np.ones(n_samples).astype(bool)
    is_inlier[contaminated] = False
    clean_samples = samples[is_inlier]

    # Robust fit on the full (contaminated) data set.
    estimator = MinCovDet(random_state=rng).fit(samples)
    location, covariance, support = (estimator.location_,
                                     estimator.covariance_,
                                     estimator.support_)

    # The robust estimates should match the ones learnt from the inliers.
    location_mse = np.mean((clean_samples.mean(0) - location) ** 2)
    assert location_mse < tol_loc
    covariance_mse = np.mean(
        (empirical_covariance(clean_samples) - covariance) ** 2)
    assert covariance_mse < tol_cov
    assert np.sum(support) >= tol_support
    assert_array_almost_equal(estimator.mahalanobis(samples), estimator.dist_)
示例3: mahalanobis_plot
# 需要导入模块: from sklearn.covariance import MinCovDet [as 别名]
# 或者: from sklearn.covariance.MinCovDet import mahalanobis [as 别名]
def mahalanobis_plot(ctry=None, df=None, weighted=True, inliers=False):
    """
    Scatter a two-column data set with empirical and robust Mahalanobis contours.

    See http://scikit-learn.org/0.13/modules/outlier_detection.html#fitting-an-elliptic-envelop
    for details.

    Parameters
    ----------
    ctry : key passed to ``load_res`` and looked up in ``country_code`` for
        the title. May be None when `df` is supplied; the plot is then left
        untitled.
    df : data frame to plot. Loaded with ``load_res(ctry, weighted=weighted)``
        when None.
    weighted : forwarded to ``load_res``; unused when `df` is supplied.
    inliers : when true, the frame is first passed through ``get_inliers``.

    Returns
    -------
    (fig, ax1, ctry) : the figure, its single axes, and `ctry` unchanged.

    Raises
    ------
    ValueError : if neither `ctry` nor `df` is given.

    The contour grid is two-dimensional and is scored by estimators fitted on
    all columns of the frame, so the frame must have exactly two columns.
    """
    if df is None and ctry is None:
        raise ValueError('Either the country or a dataframe must be supplied')
    elif df is None:
        df = load_res(ctry, weighted=weighted)
    if inliers:
        df = get_inliers(df=df)
    X = df.values

    # Robust (MCD) and maximum-likelihood covariance estimates of the same data.
    robust_cov = MinCovDet().fit(X)
    emp_cov = EmpiricalCovariance().fit(X)

    # Display results
    fig = plt.figure()
    fig.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)

    # Show data set
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.scatter(X[:, 0], X[:, 1], alpha=.5, color='k', marker='.')
    # Only a country key can be looked up for a title; a caller that supplies
    # just a data frame has ctry=None and previously hit a lookup error here.
    if ctry is not None:
        ax1.set_title(country_code[ctry])

    # Show contours of the distance functions on a 100 x 100 grid spanning
    # the limits the scatter plot settled on.
    x_min, x_max = ax1.get_xlim()
    y_min, y_max = ax1.get_ylim()
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    zz = np.c_[xx.ravel(), yy.ravel()]

    # mahalanobis() yields squared distances, hence the square root.
    mahal_emp_cov = emp_cov.mahalanobis(zz).reshape(xx.shape)
    emp_cov_contour = ax1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')

    mahal_robust_cov = robust_cov.mahalanobis(zz).reshape(xx.shape)
    robust_contour = ax1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')

    # One contour line of each family stands in as its legend handle.
    ax1.legend([emp_cov_contour.collections[1], robust_contour.collections[1]],
               ['MLE dist', 'robust dist'],
               loc="upper right", borderaxespad=0)
    ax1.grid()
    return (fig, ax1, ctry)
示例4: estimateGaussian
# 需要导入模块: from sklearn.covariance import MinCovDet [as 别名]
# 或者: from sklearn.covariance.MinCovDet import mahalanobis [as 别名]
def estimateGaussian(nb_objects_init, nb_objects_final, thr, who, genes, siRNA,
                     loadingFolder='../resultData/thrivisions/predictions',
                     threshold=0.05,):
    """Score experiments by robust distance to the controls and pick siRNA hits.

    The three feature vectors (`thr`, `nb_objects_init`, `nb_objects_final`)
    are stacked into one row per experiment. siRNAs that occur only once are
    dropped, a MinCovDet estimator is fitted on the rows whose gene is
    ``'ctrl'``, and every row gets a signed distance: the value returned by
    ``mahalanobis`` times the sign of its first feature minus that feature's
    mean. Non-control rows with a positive signed distance are passed to
    ``empiricalPvalues`` together with the control distances.

    Parameters
    ----------
    nb_objects_init, nb_objects_final, thr : equal-length 1-D sequences.
    who, genes, siRNA : per-experiment labels, aligned with the features.
    loadingFolder : forwarded to ``empiricalPvalues`` as ``folder``.
    threshold : q-value cut-off below which an experiment counts as a hit.

    Returns
    -------
    robdist : signed distances of the experiments that were kept.
    pval, qval : as returned by ``empiricalPvalues``.
    hits : list of siRNAs for which at least half of all their experiments
        have a q-value below `threshold`.
    gene_hits : Counter of the genes associated with those siRNAs.
    """
    arr = np.vstack((thr, nb_objects_init, nb_objects_final)).T

    # deleting siRNAs that have only one experiment
    print(len(siRNA))
    all_ = Counter(siRNA)
    siRNA = np.array(siRNA)
    # A list comprehension instead of filter(): filter() is lazy on Python 3.
    single_use = [si for si in all_ if all_[si] == 1]
    toDelInd = []
    for si in single_use:
        toDelInd.extend(np.where(siRNA == si)[0])
    print(len(toDelInd))

    # Drop the same rows from the features and from every label array.
    arr, who, genes, siRNA = [np.delete(values, toDelInd, 0)
                              for values in (arr, who, genes, siRNA)]
    print(arr.shape)

    # Robust covariance of the control experiments only.
    arr_ctrl = arr[np.where(np.array(genes) == 'ctrl')]
    ctrlcov = MinCovDet().fit(arr_ctrl)
    robdist = ctrlcov.mahalanobis(arr) * np.sign(arr[:, 0] - np.mean(arr[:, 0]))

    tested = np.where((genes != 'ctrl') & (robdist > 0))
    new_siRNA = np.array(siRNA)[tested]
    pval, qval = empiricalPvalues(
        np.absolute(robdist[np.where(genes == 'ctrl')])[:, np.newaxis],
        robdist[tested][:, np.newaxis],
        folder=loadingFolder, name="thrivision", sup=True, also_pval=True)
    assert new_siRNA.shape == qval.shape

    hit_counts = Counter(new_siRNA[np.where(qval < threshold)[0]])
    # Materialise the hits as a list: with a lazy filter() the lookup below
    # would consume the iterator and the caller would receive it exhausted.
    hits = [si for si in hit_counts
            if float(hit_counts[si]) / all_[si] >= 0.5]

    # Build the lookup list once instead of once per hit.
    siRNA_list = list(siRNA)
    gene_hits = Counter([genes[siRNA_list.index(el)] for el in hits])

    return robdist, pval, qval, hits, gene_hits
示例5:
# 需要导入模块: from sklearn.covariance import MinCovDet [as 别名]
# 或者: from sklearn.covariance.MinCovDet import mahalanobis [as 别名]
color='red', label='outliers')
# Keep the current left x-limit and fix the right x-limit at 11.
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")

# Show contours of the distance functions
# A 100 x 100 grid spanning the limits of pyplot's current axes
# (presumably subfig1 -- confirm against the code above this fragment).
xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
                     np.linspace(plt.ylim()[0], plt.ylim()[1], 100))
# One (x, y) row per grid node.
zz = np.c_[xx.ravel(), yy.ravel()]

# Contours from emp_cov: the square root of mahalanobis() is drawn
# (scikit-learn documents the returned values as squared distances).
mahal_emp_cov = emp_cov.mahalanobis(zz)
mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')

# Same contours from robust_cov.
mahal_robust_cov = robust_cov.mahalanobis(zz)
mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')

# One contour line from each family serves as its legend handle, next to
# inlier_plot and outlier_plot, which are defined before this fragment.
subfig1.legend([emp_cov_contour.collections[1], robust_contour.collections[1],
                inlier_plot, outlier_plot],
               ['MLE dist', 'robust dist', 'inliers', 'outliers'],
               loc="upper right", borderaxespad=0)
# Empty tuples remove the tick marks on both axes.
plt.xticks(())
plt.yticks(())
# Plot the scores for each point
# emp_mahal = emp_cov.mahalanobis(X - np.mean(X, 0)) ** (0.33)
# subfig2 = plt.subplot(2, 2, 3)
# subfig2.boxplot([emp_mahal[:-n_outliers], emp_mahal[-n_outliers:]], widths=.25)
示例6:
# 需要导入模块: from sklearn.covariance import MinCovDet [as 别名]
# 或者: from sklearn.covariance.MinCovDet import mahalanobis [as 别名]
color='red', label='outliers')
# Keep the current left x-limit and fix the right x-limit at 11.
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")

# Show contours of the distance functions on a 100 x 100 grid spanning the
# limits of the current axes.
xx, yy = np.meshgrid(np.linspace(pl.xlim()[0], pl.xlim()[1], 100),
                     np.linspace(pl.ylim()[0], pl.ylim()[1], 100))
zz = np.c_[xx.ravel(), yy.ravel()]

# mahalanobis() yields squared distances, hence the square root.
mahal_emp_cov = emp_cov.mahalanobis(zz)
mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=pl.cm.PuBu_r,
                                  linestyles='dashed')

mahal_robust_cov = robust_cov.mahalanobis(zz)
mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=pl.cm.YlOrBr_r, linestyles='dotted')

# One contour line from each family serves as its legend handle.
subfig1.legend([emp_cov_contour.collections[1], robust_contour.collections[1],
                inlier_plot, outlier_plot],
               ['MLE dist', 'robust dist', 'inliers', 'outliers'],
               loc="upper right", borderaxespad=0)
pl.xticks(())
pl.yticks(())

# Plot the scores for each point
# mahalanobis() already measures from the fitted location_, so the raw
# observations are passed, exactly as for the contour grid above.
# Subtracting the sample mean first would centre the data twice.
emp_mahal = emp_cov.mahalanobis(X) ** (0.33)
subfig2 = pl.subplot(2, 2, 3)
# The last n_outliers rows of X are the outliers; the rest are inliers.
subfig2.boxplot([emp_mahal[:-n_outliers], emp_mahal[-n_outliers:]], widths=.25)
示例7: main
# 需要导入模块: from sklearn.covariance import MinCovDet [as 别名]
# 或者: from sklearn.covariance.MinCovDet import mahalanobis [as 别名]
def main():
    """Plot robust and empirical Mahalanobis distances for a CSV data set.

    Command-line arguments:
      dataset     CSV file with one sample per row.
      --plot      'grid' (default) scores a 50 x 50 grid over the padded data
                  range; 'train' scores the data points themselves.
      --plotdims  2 (default) draws coloured scatter plots; 3 draws surfaces.

    Side effect: the (possibly PCA-reduced) data is saved as ``outliers.npy``
    next to this script.
    """
    parser = argparse.ArgumentParser(
        description='Plot outlier-like distances for a 2-dimensional dataset')
    parser.add_argument(
        'dataset', type=argparse.FileType('r'),
        help='a CSV file containing the dataset')
    parser.add_argument(
        '--plot', type=str, choices=['train', 'grid'], default='grid',
        help='plot the dataset or a grid evenly distributed over its span')
    parser.add_argument(
        '--plotdims', type=int, choices=[2, 3], default=2,
        help='the number of dimensions to plot')
    args = parser.parse_args()

    X = np.loadtxt(args.dataset, delimiter=',')
    fig = plt.figure()

    # Data with more than two columns is reduced to two PCA components;
    # otherwise NullTransformer is used (presumably an identity transform --
    # it is defined outside this snippet).
    xformer = NullTransformer()
    if X.shape[1] > 2:
        xformer = PCA(n_components=2)
    X = xformer.fit_transform(X)

    # Preview of the first two columns of the data.
    if args.plotdims == 2:
        plt.scatter(X[:, 0], X[:, 1], s=60, linewidth='0')
    else:
        plt.scatter(X[:, 0], X[:, 1])
    plt.show(block=False)

    # Save the transformed data next to this script.
    path_to_script = os.path.realpath(__file__)
    dir_of_script = os.path.dirname(path_to_script)
    dataset_path = dir_of_script + '/outliers.npy'
    np.save(dataset_path, X)

    ###########################################################################
    # Train autoencoder with the n samples until convergence.  Run
    # evenly distributed samples through the autoencoder and compute
    # their reconstruction error.
    ###########################################################################
    # NOTE(review): no autoencoder appears in the visible code; the lines
    # below score points with covariance estimators instead. The banner looks
    # like a leftover from another script -- confirm.

    # Global min/max over all entries of X, padded by half the range each side.
    maxseq_orig = np.max(X)
    minseq_orig = np.min(X)
    seqrange = np.abs(maxseq_orig - minseq_orig)
    maxseq = maxseq_orig + 0.5 * seqrange
    minseq = minseq_orig - 0.5 * seqrange
    print("minseq", minseq, "maxseq", maxseq)

    # Points to score: a 50 x 50 grid over the padded range, or the data itself.
    if args.plot == 'grid':
        seq = np.linspace(minseq, maxseq, num=50, endpoint=True)
        Xplot = np.array([_ for _ in product(seq, seq)])
    else:
        Xplot = X

    # Both estimators are fitted on the data and evaluated on Xplot.
    robust_cov = MinCovDet().fit(X)
    robust_md = robust_cov.mahalanobis(Xplot)

    empirical_cov = EmpiricalCovariance().fit(X)
    empirical_md = empirical_cov.mahalanobis(Xplot)

    # Assume Xplot is at least 2-dimensional.
    # Anything wider than two columns is embedded with bh_sne for display.
    if Xplot.shape[1] > 2:
        Xplot2d = bh_sne(Xplot)
    else:
        Xplot2d = Xplot

    # Rescale both distance arrays to [0, 1], ignoring NaNs when taking
    # the minimum and maximum.
    robust_md01 = robust_md - np.nanmin(robust_md)
    robust_md01 = robust_md01 / np.nanmax(robust_md01)

    empirical_md01 = empirical_md - np.nanmin(empirical_md)
    empirical_md01 = empirical_md01 / np.nanmax(empirical_md01)

    # Figure for the robust estimator.
    fig = plt.figure()
    if args.plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1],
                   cmap=plt.cm.jet, c=robust_md01, s=60, linewidth='0')
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], robust_md01,
                        cmap=plt.cm.jet, color=robust_md01)
        ax.set_zlabel('Mahalanobis distance')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Mahalanobis distance (robust covariance)')

    # Figure for the empirical estimator, laid out the same way.
    fig = plt.figure()
    if args.plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1],
                   cmap=plt.cm.jet, c=empirical_md01, s=60, linewidth='0')
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], empirical_md01,
                        cmap=plt.cm.jet, color=empirical_md01)
        ax.set_zlabel('Mahalanobis distance')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Mahalanobis distance (empirical covariance)')
#.........这里部分代码省略.........
示例8: ols
# 需要导入模块: from sklearn.covariance import MinCovDet [as 别名]
# 或者: from sklearn.covariance.MinCovDet import mahalanobis [as 别名]
# Fit a formula-based OLS model of word_diff on Age with Centre_ID as a
# categorical term, restricted to `subset`, and print its summary.
lm2 = ols('word_diff ~ Age + C(Centre_ID)',
          data=clean_st,subset=subset).fit()
print(lm2.summary())
# <markdowncell>
# # Snippets. Might come back to this later:
# <codecell>
from scipy.stats import pearsonr
from sklearn.covariance import MinCovDet
# just look at what's interesting for now, and drop the NAs involved
clean = st_v_merged.loc[:,['norm_diff','Interview_Suggested_Ranking_numerical_']]
clean = clean.dropna(axis=0)
# calculate robust covariance estimate, calculate what's too far away
mcd = MinCovDet()
mcd.fit(clean)
# Correlation between the two retained columns; as the last expression of
# the cell, its value is what the notebook displays.
pearsonr(clean.iloc[:,0],clean.iloc[:,1])
# <codecell>
# Score the rows the estimator was fitted on, then sort the scores in place
# (ascending), so they no longer line up with the rows of `clean`.
d = mcd.mahalanobis(clean)
d.sort()
d
示例9: __init__
# 需要导入模块: from sklearn.covariance import MinCovDet [as 别名]
# 或者: from sklearn.covariance.MinCovDet import mahalanobis [as 别名]
class Grid:
    """Noisy 2-D point grid whose orientation and spacing are estimated robustly.

    The points come from ``create_grid``. ``analyze`` links every point to the
    point returned by ``closest_point``, fits a MinCovDet estimator to those
    links, derives the grid angle and step from the fitted location, and maps
    the points that agree with that fit onto integer grid coordinates.
    """

    def __init__(self, dim=10, noise=0.1, outliers=0):
        """Create the point set and pre-allocate the analysis buffers.

        `dim`, `noise` and `outliers` are forwarded unchanged to
        ``create_grid``.
        """
        self.points = create_grid(dim, noise, outliers)
        n_points = len(self.points)
        # Per point: the link to its neighbour as (r*cos(a), r*sin(a)), with
        # the angle a multiplied by 4.
        self.polardata = np.zeros((n_points, 2))
        # Replaced by the fitted MinCovDet estimator in analyze().
        self.polar_cov = 0
        # Accepted points and their indices into self.points; only the leading
        # entries written by analyze() are meaningful.
        self.inlier_points = np.zeros((n_points, 2))
        self.inlier_indicies = np.zeros((n_points, 1))
        # Accepted points in grid units (rotated, scaled, snapped to integers).
        self.normalized_points = np.zeros((n_points, 2))
        # Estimated grid angle and grid step.
        self.theta = 0
        self.step_size = 1
        # Three rows per point: the point, its neighbour, and a separator row.
        self.linedata = np.zeros((3*n_points, 2))
        self.normalized_point_ids = []
        # [min x, min y, max x, max y] of the snapped grid coordinates.
        self.bounds = [0, 0, 0, 0]

    def step(self, rotation=0):
        """Replace the points with ``rotate(self.points, rotation)``."""
        self.points = rotate(self.points, rotation)

    def analyze(self, mahalanobis_tolerance=2):
        """Estimate grid angle and step, then snap inliers to grid coordinates.

        `mahalanobis_tolerance` is the upper limit on ``mahalanobis(...) ** 0.33``
        for a point to count as an inlier.

        Updates polardata, linedata, polar_cov, theta, step_size,
        inlier_points, inlier_indicies, normalized_points and bounds.
        """
        self.inlier_points = np.zeros((len(self.points), 2))

        for id1 in range(len(self.points)):
            id2 = closest_point(self.points, self.points[id1], id1)[0]

            # keep lines for plotting purposes; the [None, None] row is stored
            # as NaN in the float array (presumably a break between segments)
            self.linedata[3*id1] = self.points[id1]
            self.linedata[3*id1+1] = self.points[id2]
            self.linedata[3*id1+2] = [None, None]

            # we are repeating every pi/2, so we compress the angle space by 4x
            a = 4*math.atan2((self.points[id1, 1] - self.points[id2, 1]),
                             (self.points[id1, 0] - self.points[id2, 0]))
            r = np.linalg.norm(self.points[id1] - self.points[id2])
            self.polardata[id1] = [r*math.cos(a), r*math.sin(a)]

        # find the minimal covariance inlier cluster
        self.polar_cov = MinCovDet().fit(self.polardata)

        # extract the grid angle and size. angle is divided by 4 because
        # we previously scaled it up to repeat every 90 deg
        self.theta = math.atan2(-self.polar_cov.location_[1],
                                self.polar_cov.location_[0])/4
        self.step_size = np.linalg.norm(self.polar_cov.location_)

        # extract inlier points
        polar_mahal = self.polar_cov.mahalanobis(self.polardata)**(0.33)
        inlier_count = 0
        for i, score in enumerate(polar_mahal):
            if score < mahalanobis_tolerance:  # stdev tolerance to outliers
                self.inlier_points[inlier_count] = self.points[i]
                self.inlier_indicies[inlier_count] = i
                inlier_count += 1

        # Undo the grid rotation and express the inliers in units of one step.
        self.normalized_points = rotate(self.inlier_points[:inlier_count],
                                        -self.theta)/self.step_size

        # enumerate grid IDs
        # The origin is the inlier nearest to the centroid. The mean is taken
        # per coordinate (axis=0); a mean over all entries would collapse x
        # and y into a single scalar.
        origin_id = closest_point(self.normalized_points,
                                  np.mean(self.normalized_points, axis=0))[0]
        self.normalized_points = self.normalized_points - self.normalized_points[origin_id]

        inlier_count = 0
        # sys.maxsize replaces sys.maxint, which no longer exists on Python 3.
        self.bounds = [sys.maxsize, sys.maxsize, -sys.maxsize, -sys.maxsize]
        # Accepted rows are compacted to the front of the array in place; the
        # write index never runs ahead of the row being read.
        for p in self.normalized_points:
            x = round(p[0])
            y = round(p[1])
            d = np.linalg.norm(p-[x, y])
            if d < 0.4:  # tolerance from unit position
                self.normalized_points[inlier_count] = [x, y]
                if x < self.bounds[0]:
                    self.bounds[0] = x
                if x > self.bounds[2]:
                    self.bounds[2] = x
                if y < self.bounds[1]:
                    self.bounds[1] = y
                if y > self.bounds[3]:
                    self.bounds[3] = y
                inlier_count += 1
        self.normalized_points = self.normalized_points[:inlier_count]
示例10:
# 需要导入模块: from sklearn.covariance import MinCovDet [as 别名]
# 或者: from sklearn.covariance.MinCovDet import mahalanobis [as 别名]
# The last n_outliers rows of X are drawn in red as the outliers.
subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                color='red', label='outliers')
# Keep the current left x-limit and fix the right x-limit at 11.
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
subfig1.legend(loc="upper right")

# Scores from emp_cov, raised to the power 0.33 (the axis label below
# presents this as a cube root).
emp_mahal = emp_cov.mahalanobis(X) ** (0.33)
subfig2 = pl.subplot(2, 2, 3)
# Two boxes: all rows but the last n_outliers, then the last n_outliers.
subfig2.boxplot([emp_mahal[:-n_outliers], emp_mahal[-n_outliers:]], widths=.25)
# Individual scores are drawn as '+' markers at x = 1.26 and x = 2.26,
# just to the right of the boxes.
subfig2.plot(1.26 * np.ones(n_samples - n_outliers),
             emp_mahal[:-n_outliers], '+k', markeredgewidth=1)
subfig2.plot(2.26 * np.ones(n_outliers),
             emp_mahal[-n_outliers:], '+k', markeredgewidth=1)
subfig2.axes.set_xticklabels(('inliers', 'outliers'), size=11)
subfig2.set_ylabel(r"$\sqrt[3]{\rm{(Mahal. dist.)}}$")
subfig2.set_title("1. from non-robust estimates\n(Maximum Likelihood)")

# Same layout for the scores from robust_cov.
robust_mahal = robust_cov.mahalanobis(X) ** (0.33)
subfig3 = pl.subplot(2, 2, 4)
subfig3.boxplot([robust_mahal[:-n_outliers], robust_mahal[-n_outliers:]],
                widths=.25)
subfig3.plot(1.26 * np.ones(n_samples - n_outliers),
             robust_mahal[:-n_outliers], '+k', markeredgewidth=1)
subfig3.plot(2.26 * np.ones(n_outliers),
             robust_mahal[-n_outliers:], '+k', markeredgewidth=1)
subfig3.axes.set_xticklabels(('inliers', 'outliers'), size=11)
subfig3.set_ylabel(r"$\sqrt[3]{\rm{(Mahal. dist.)}}$")
subfig3.set_title("2. from robust estimates\n(Minimum Covariance Determinant)")

pl.show()