Python Statistics.chiSqTest方法代码示例

本文整理汇总了Python中pyspark.mllib.stat.Statistics.chiSqTest方法的典型用法代码示例。如果您正苦于以下问题：Python Statistics.chiSqTest方法的具体用法？Python Statistics.chiSqTest怎么用？Python Statistics.chiSqTest使用的例子？那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.mllib.stat.Statistics的用法示例。

在下文中一共展示了Statistics.chiSqTest方法的7个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_right_number_of_results

# 需要导入模块: from pyspark.mllib.stat import Statistics [as 别名]
# 或者: from pyspark.mllib.stat.Statistics import chiSqTest [as 别名]
 def test_right_number_of_results(self):
     num_cols = 1001
     sparse_data = [
         LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
         LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)]))
     ]
     chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
     self.assertEqual(len(chi), num_cols)
     self.assertIsNotNone(chi[1000])

开发者ID:greatyan，项目名称:spark，代码行数:11，代码来源:tests.py

示例2: test_goodness_of_fit

# 需要导入模块: from pyspark.mllib.stat import Statistics [as 别名]
# 或者: from pyspark.mllib.stat.Statistics import chiSqTest [as 别名]
    def test_goodness_of_fit(self):
        from numpy import inf

        observed = Vectors.dense([4, 6, 5])
        pearson = Statistics.chiSqTest(observed)

        # Validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
        self.assertEqual(pearson.statistic, 0.4)
        self.assertEqual(pearson.degreesOfFreedom, 2)
        self.assertAlmostEqual(pearson.pValue, 0.8187, 4)

        # Different expected and observed sum
        observed1 = Vectors.dense([21, 38, 43, 80])
        expected1 = Vectors.dense([3, 5, 7, 20])
        pearson1 = Statistics.chiSqTest(observed1, expected1)

        # Results validated against the R command
        # `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
        self.assertAlmostEqual(pearson1.statistic, 14.1429, 4)
        self.assertEqual(pearson1.degreesOfFreedom, 3)
        self.assertAlmostEqual(pearson1.pValue, 0.002717, 4)

        # Vectors with different sizes
        observed3 = Vectors.dense([1.0, 2.0, 3.0])
        expected3 = Vectors.dense([1.0, 2.0, 3.0, 4.0])
        self.assertRaises(ValueError, Statistics.chiSqTest, observed3, expected3)

        # Negative counts in observed
        neg_obs = Vectors.dense([1.0, 2.0, 3.0, -4.0])
        self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, neg_obs, expected1)

        # Count = 0.0 in expected but not observed
        zero_expected = Vectors.dense([1.0, 0.0, 3.0])
        pearson_inf = Statistics.chiSqTest(observed, zero_expected)
        self.assertEqual(pearson_inf.statistic, inf)
        self.assertEqual(pearson_inf.degreesOfFreedom, 2)
        self.assertEqual(pearson_inf.pValue, 0.0)

        # 0.0 in expected and observed simultaneously
        zero_observed = Vectors.dense([2.0, 0.0, 1.0])
        self.assertRaises(
            IllegalArgumentException, Statistics.chiSqTest, zero_observed, zero_expected)

开发者ID:drewrobb，项目名称:spark，代码行数:44，代码来源:test_stat.py

示例3: test_chi_sq_pearson

# 需要导入模块: from pyspark.mllib.stat import Statistics [as 别名]
# 或者: from pyspark.mllib.stat.Statistics import chiSqTest [as 别名]
    def test_chi_sq_pearson(self):
        data = [
            LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
            LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
            LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
            LabeledPoint(1.0, Vectors.dense([3.5, 40.0]))
        ]

        for numParts in [2, 4, 6, 8]:
            chi = Statistics.chiSqTest(self.sc.parallelize(data, numParts))
            feature1 = chi[0]
            self.assertEqual(feature1.statistic, 0.75)
            self.assertEqual(feature1.degreesOfFreedom, 2)
            self.assertAlmostEqual(feature1.pValue, 0.6873, 4)

            feature2 = chi[1]
            self.assertEqual(feature2.statistic, 1.5)
            self.assertEqual(feature2.degreesOfFreedom, 3)
            self.assertAlmostEqual(feature2.pValue, 0.6823, 4)

开发者ID:greatyan，项目名称:spark，代码行数:23，代码来源:tests.py

示例4: test_matrix_independence

# 需要导入模块: from pyspark.mllib.stat import Statistics [as 别名]
# 或者: from pyspark.mllib.stat.Statistics import chiSqTest [as 别名]
    def test_matrix_independence(self):
        data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
        chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))

        # Results validated against R command
        # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
        self.assertAlmostEqual(chi.statistic, 21.9958, 4)
        self.assertEqual(chi.degreesOfFreedom, 6)
        self.assertAlmostEqual(chi.pValue, 0.001213, 4)

        # Negative counts
        neg_counts = Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, neg_counts)

        # Row sum = 0.0
        row_zero = Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, row_zero)

        # Column sum = 0.0
        col_zero = Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, col_zero)

开发者ID:greatyan，项目名称:spark，代码行数:23，代码来源:tests.py

示例5: time

# 需要导入模块: from pyspark.mllib.stat import Statistics [as 别名]
# 或者: from pyspark.mllib.stat.Statistics import chiSqTest [as 别名]
print "Converting bigrams to sparse vectors in a dataframe for the train set"
t0 = time()
features=dfTrain.map(partial(vectorizeBi,dico=dict_broad.value)).toDF(schema)
features.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[323]:

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
print "Computing the chi vector"
t0 = time()
labeledPoints = features.map(lambda row : LabeledPoint(row.label, row.bigramVectors))
chi = Statistics.chiSqTest(labeledPoints)
tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[324]:

print "Starting bigram selection,broadcasting the newly created bigram dictionary"
t0 = time()
biSelect = [revDict_broad.value[i] for i,bigram in enumerate(chi) if bigram.pValue <=0.3]
dictSelect = {}
for i,bigram in enumerate(biSelect):
    dictSelect[bigram]=i
dictSel_broad = sc.broadcast(dictSelect)
tt = time() - t0
print "Done in {} second".format(round(tt,3))

开发者ID:pifouuu，项目名称:ProjetBigData，代码行数:33，代码来源:script3_bis.py

示例6: SparkContext

# 需要导入模块: from pyspark.mllib.stat import Statistics [as 别名]
# 或者: from pyspark.mllib.stat.Statistics import chiSqTest [as 别名]
from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="HypothesisTestingExample")

    # $example on$
    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)  # a vector composed of the frequencies of events

    # compute the goodness of fit. If a second vector to test against
    # is not supplied as a parameter, the test runs against a uniform distribution.
    goodnessOfFitTestResult = Statistics.chiSqTest(vec)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % goodnessOfFitTestResult)

    mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # a contingency matrix

    # conduct Pearson's independence test on the input contingency matrix
    independenceTestResult = Statistics.chiSqTest(mat)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % independenceTestResult)

    obs = sc.parallelize(

开发者ID:lhfei，项目名称:spark-in-action，代码行数:33，代码来源:hypothesis_testing_example.py

示例7: SparkContext

# 需要导入模块: from pyspark.mllib.stat import Statistics [as 别名]
# 或者: from pyspark.mllib.stat.Statistics import chiSqTest [as 别名]
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics


sc = SparkContext("local", "Rubbish")

"""
# RDD of Vectors
data = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
                       Vectors.dense([4, 5, 0,  3]),
                       Vectors.dense([6, 7, 0,  8])])
"""

# Sample vector composing of frequency of events
vect = Vectors.dense([4,5,0,3])

# Summary of the test including the p-value, degrees of freedom,
goodnessOfFitTestResult = Statistics.chiSqTest(vect)

sampleData = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
matrix = Matrices.dense(3,4, sampleData)
# Conduct Pearson's independence test on the input contingency matrix
independenceTestResult = Statistics.chiSqTest(matrix)


# Test statistic, the method used, and the null hypothesis.
print "SINGLE VECTOR FIT: "
print goodnessOfFitTestResult 
## Summary of the test including the p-value, degrees of freedom.
print "INDEPENDENCE TEST RESULT: "
print independenceTestResult

开发者ID:jjingrong，项目名称:Spark-MLlib，代码行数:33，代码来源:Hypothesis_testing.py

注：本文中的pyspark.mllib.stat.Statistics.chiSqTest方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。