本文整理汇总了Python中pyspark.mllib.stat.Statistics.chiSqTest方法的典型用法代码示例。如果您正苦于以下问题:Python Statistics.chiSqTest方法的具体用法?Python Statistics.chiSqTest怎么用?Python Statistics.chiSqTest使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.mllib.stat.Statistics的用法示例。
在下文中一共展示了Statistics.chiSqTest方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_right_number_of_results
# 需要导入模块: from pyspark.mllib.stat import Statistics [as 别名]
# 或者: from pyspark.mllib.stat.Statistics import chiSqTest [as 别名]
def test_right_number_of_results(self):
    """Check that chiSqTest returns one result per feature column.

    Two sparse labeled points with 1001 columns are tested; the result
    must have 1001 entries and its last entry must not be None.
    """
    feature_count = 1001
    points = self.sc.parallelize([
        LabeledPoint(0.0, Vectors.sparse(feature_count, [(100, 2.0)])),
        LabeledPoint(0.1, Vectors.sparse(feature_count, [(200, 1.0)])),
    ])
    results = Statistics.chiSqTest(points)
    self.assertEqual(len(results), feature_count)
    # The last column carries no data in either point but must still be reported.
    self.assertIsNotNone(results[feature_count - 1])
示例2: test_goodness_of_fit
# 需要导入模块: from pyspark.mllib.stat import Statistics [as 别名]
# 或者: from pyspark.mllib.stat.Statistics import chiSqTest [as 别名]
def test_goodness_of_fit(self):
    """Exercise the goodness-of-fit form of Statistics.chiSqTest.

    Checks the call with observed counts only, the call with explicit
    expected counts, mismatched vector sizes, negative counts, and zero
    counts in the expected and/or observed vectors.
    """
    from numpy import inf

    # Only observed counts are supplied.
    # Validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
    observed = Vectors.dense([4, 6, 5])
    observed_only_result = Statistics.chiSqTest(observed)
    self.assertEqual(observed_only_result.statistic, 0.4)
    self.assertEqual(observed_only_result.degreesOfFreedom, 2)
    self.assertAlmostEqual(observed_only_result.pValue, 0.8187, 4)

    # Different expected and observed sum.
    # Results validated against the R command
    # `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
    scaled_expected = Vectors.dense([3, 5, 7, 20])
    scaled_result = Statistics.chiSqTest(
        Vectors.dense([21, 38, 43, 80]), scaled_expected)
    self.assertAlmostEqual(scaled_result.statistic, 14.1429, 4)
    self.assertEqual(scaled_result.degreesOfFreedom, 3)
    self.assertAlmostEqual(scaled_result.pValue, 0.002717, 4)

    # Vectors with different sizes
    short_observed = Vectors.dense([1.0, 2.0, 3.0])
    long_expected = Vectors.dense([1.0, 2.0, 3.0, 4.0])
    with self.assertRaises(ValueError):
        Statistics.chiSqTest(short_observed, long_expected)

    # Negative counts in observed
    negative_observed = Vectors.dense([1.0, 2.0, 3.0, -4.0])
    with self.assertRaises(IllegalArgumentException):
        Statistics.chiSqTest(negative_observed, scaled_expected)

    # Count = 0.0 in expected but not observed
    zero_expected = Vectors.dense([1.0, 0.0, 3.0])
    infinite_result = Statistics.chiSqTest(observed, zero_expected)
    self.assertEqual(infinite_result.statistic, inf)
    self.assertEqual(infinite_result.degreesOfFreedom, 2)
    self.assertEqual(infinite_result.pValue, 0.0)

    # 0.0 in expected and observed simultaneously
    zero_observed = Vectors.dense([2.0, 0.0, 1.0])
    with self.assertRaises(IllegalArgumentException):
        Statistics.chiSqTest(zero_observed, zero_expected)
示例3: test_chi_sq_pearson
# 需要导入模块: from pyspark.mllib.stat import Statistics [as 别名]
# 或者: from pyspark.mllib.stat.Statistics import chiSqTest [as 别名]
def test_chi_sq_pearson(self):
    """Check per-feature chiSqTest results on labeled points.

    The same six labeled points are tested with 2, 4, 6 and 8 partitions;
    the results for both features must match the expected values each time.
    """
    rows = [
        (0.0, [0.5, 10.0]),
        (0.0, [1.5, 20.0]),
        (1.0, [1.5, 30.0]),
        (0.0, [3.5, 30.0]),
        (0.0, [3.5, 40.0]),
        (1.0, [3.5, 40.0]),
    ]
    data = [LabeledPoint(label, Vectors.dense(values)) for label, values in rows]

    # One entry per feature: (statistic, degreesOfFreedom, pValue to 4 places).
    expected_by_feature = [
        (0.75, 2, 0.6873),
        (1.5, 3, 0.6823),
    ]

    for partition_count in (2, 4, 6, 8):
        results = Statistics.chiSqTest(self.sc.parallelize(data, partition_count))
        for index, (statistic, dof, p_value) in enumerate(expected_by_feature):
            feature = results[index]
            self.assertEqual(feature.statistic, statistic)
            self.assertEqual(feature.degreesOfFreedom, dof)
            self.assertAlmostEqual(feature.pValue, p_value, 4)
示例4: test_matrix_independence
# 需要导入模块: from pyspark.mllib.stat import Statistics [as 别名]
# 或者: from pyspark.mllib.stat.Statistics import chiSqTest [as 别名]
def test_matrix_independence(self):
    """Exercise the contingency-matrix form of Statistics.chiSqTest.

    Checks the result for a 3x4 matrix, then checks that matrices with a
    negative count, a zero row sum, or a zero column sum raise Py4JJavaError.
    """
    counts = [
        40.0, 24.0, 29.0,
        56.0, 32.0, 42.0,
        31.0, 10.0, 0.0,
        30.0, 15.0, 12.0,
    ]
    result = Statistics.chiSqTest(Matrices.dense(3, 4, counts))
    # Results validated against R command
    # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
    self.assertAlmostEqual(result.statistic, 21.9958, 4)
    self.assertEqual(result.degreesOfFreedom, 6)
    self.assertAlmostEqual(result.pValue, 0.001213, 4)

    invalid_tables = [
        [4.0, 5.0, 3.0, -3.0],  # Negative counts
        [0.0, 1.0, 0.0, 2.0],   # Row sum = 0.0
        [0.0, 0.0, 2.0, 2.0],   # Column sum = 0.0
    ]
    for values in invalid_tables:
        bad_matrix = Matrices.dense(2, 2, values)
        with self.assertRaises(Py4JJavaError):
            Statistics.chiSqTest(bad_matrix)
示例5: time
# 需要导入模块: from pyspark.mllib.stat import Statistics [as 别名]
# 或者: from pyspark.mllib.stat.Statistics import chiSqTest [as 别名]
# Notebook fragment: vectorize the training bigrams, run a chi-squared test
# per feature, and keep only the bigrams whose p-value is at most 0.3.
# `time`, `partial`, `dfTrain`, `vectorizeBi`, `dict_broad`, `schema`,
# `revDict_broad` and `sc` are expected to be defined by earlier cells.
# print(...) with a single argument behaves the same on Python 2 and is also
# valid on Python 3, where the print statement is a SyntaxError.
print("Converting bigrams to sparse vectors in a dataframe for the train set")
t0 = time()
# NOTE(review): if dfTrain is a DataFrame rather than an RDD, it needs
# `.rdd` before `.map` on Spark >= 2.0 -- confirm against the earlier cells.
features = dfTrain.map(partial(vectorizeBi, dico=dict_broad.value)).toDF(schema)
# take(1) runs a Spark job; presumably here so the timing covers real work.
features.take(1)
tt = time() - t0
print("Done in {} second".format(round(tt, 3)))

# In[323]:

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

print("Computing the chi vector")
t0 = time()
# DataFrame.map only exists in Spark 1.x, where it is shorthand for
# DataFrame.rdd.map; going through .rdd works on 1.x and on later versions.
labeledPoints = features.rdd.map(
    lambda row: LabeledPoint(row.label, row.bigramVectors))
chi = Statistics.chiSqTest(labeledPoints)
tt = time() - t0
print("Done in {} second".format(round(tt, 3)))

# In[324]:

print("Starting bigram selection,broadcasting the newly created bigram dictionary")
t0 = time()
# chi holds one test result per feature index; keep the bigram behind every
# feature whose p-value is at most 0.3.
biSelect = [
    revDict_broad.value[i]
    for i, result in enumerate(chi)
    if result.pValue <= 0.3
]
# Map each selected bigram to its position in the selection.
dictSelect = {bigram: i for i, bigram in enumerate(biSelect)}
dictSel_broad = sc.broadcast(dictSelect)
tt = time() - t0
print("Done in {} second".format(round(tt, 3)))
示例6: SparkContext
# 需要导入模块: from pyspark.mllib.stat import Statistics [as 别名]
# 或者: from pyspark.mllib.stat.Statistics import chiSqTest [as 别名]
from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
# $example off$
if __name__ == "__main__":
    sc = SparkContext(appName="HypothesisTestingExample")

    # $example on$
    # Goodness-of-fit test on a vector of event frequencies. No second vector
    # is supplied, so the test runs against a uniform distribution.
    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
    goodnessOfFitTestResult = Statistics.chiSqTest(vec)

    # The printed summary includes the p-value, degrees of freedom, test
    # statistic, the method used, and the null hypothesis.
    goodness_summary = "%s\n" % goodnessOfFitTestResult
    print(goodness_summary)

    # Pearson's independence test on a 3x2 contingency matrix.
    mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])
    independenceTestResult = Statistics.chiSqTest(mat)

    # Same kind of summary as above, for the independence test.
    independence_summary = "%s\n" % independenceTestResult
    print(independence_summary)
obs = sc.parallelize(
示例7: SparkContext
# 需要导入模块: from pyspark.mllib.stat import Statistics [as 别名]
# 或者: from pyspark.mllib.stat.Statistics import chiSqTest [as 别名]
# Script: run a chi-squared goodness-of-fit test on one vector and Pearson's
# independence test on one contingency matrix, then print both results.
# SparkContext, Vectors and Matrices are used below, so they are imported
# here explicitly; the snippet as given did not import them.
from pyspark import SparkContext
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

sc = SparkContext("local", "Rubbish")

# Disabled sample input (an RDD of Vectors), kept for reference:
# data = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
#                        Vectors.dense([4, 5, 0, 3]),
#                        Vectors.dense([6, 7, 0, 8])])

# Sample vector composed of the frequencies of events
vect = Vectors.dense([4, 5, 0, 3])
# Goodness-of-fit test; no expected vector is supplied.
goodnessOfFitTestResult = Statistics.chiSqTest(vect)

sampleData = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
matrix = Matrices.dense(3, 4, sampleData)
# Conduct Pearson's independence test on the input contingency matrix
independenceTestResult = Statistics.chiSqTest(matrix)

# Print each test result under its heading. print(...) with a single
# argument behaves the same on Python 2 and is also valid on Python 3.
print("SINGLE VECTOR FIT: ")
print(goodnessOfFitTestResult)
print("INDEPENDENCE TEST RESULT: ")
print(independenceTestResult)