Python stat.Statistics Class Code Examples

This article collects typical usage examples of the Python Statistics class from pyspark.mllib.stat. If you are wondering what exactly the Statistics class does, how to use it, or what it looks like in real code, the curated class code examples below may help.


Below are 15 code examples of the Statistics class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
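
Before the examples, here is a minimal, self-contained sketch of the three entry points that recur below: colStats for column summaries, corr for correlations, and chiSqTest for chi-squared tests. The local SparkContext and the toy data are assumptions for illustration only.

from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

sc = SparkContext("local", "StatisticsDemo")  # assumed local context for illustration

# Column summary statistics over an RDD of vectors
rows = sc.parallelize([Vectors.dense([1.0, 2.0]), Vectors.dense([3.0, 4.0])])
summary = Statistics.colStats(rows)
print(summary.mean())  # per-column means

# Correlation between two RDDs of doubles
x = sc.parallelize([1.0, 2.0, 3.0])
y = sc.parallelize([2.0, 4.0, 6.1])
print(Statistics.corr(x, y, method="pearson"))

# Chi-squared goodness-of-fit against a uniform expected distribution
print(Statistics.chiSqTest(Vectors.dense([4.0, 6.0, 5.0])).pValue)

sc.stop()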

Example 1: spark_pearson

 def spark_pearson(a, b):
     # `sc` and `func` are captured from the enclosing decorator scope (not shown)
     rdd_a = sc.parallelize(a)
     rdd_b = sc.parallelize(b)
     g = func.func_globals  # Python 2 attribute; on Python 3 this is func.__globals__
     g['pearson'] = Statistics.corr(rdd_a, rdd_b, 'pearson')
     g['rho'] = Statistics.corr(rdd_a, rdd_b, 'spearman')
     func(a, b)
Author: avloss, Project: pyspark_experiments, Lines: 7, Source file: my_annotations.py
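
The snippet above leans on names from its enclosing scope: sc and func come from decorator machinery in my_annotations.py that is not shown. A hedged sketch of what that surrounding decorator might look like; the factory name and structure here are assumptions:

from pyspark.mllib.stat import Statistics

def with_spark_correlations(sc):
    # Hypothetical decorator factory: injects `pearson` and `rho` into the
    # wrapped function's globals before calling it.
    def decorator(func):
        def spark_pearson(a, b):
            rdd_a = sc.parallelize(a)
            rdd_b = sc.parallelize(b)
            g = func.__globals__
            g['pearson'] = Statistics.corr(rdd_a, rdd_b, 'pearson')
            g['rho'] = Statistics.corr(rdd_a, rdd_b, 'spearman')
            return func(a, b)
        return spark_pearson
    return decorator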

Example 2: run4

    def run4(self):
        from my_fun import parse_interaction, parse_interaction_with_key, summary_by_label

        raw_data = self.raw_data
        vector_data = raw_data.map(parse_interaction)
        # Compute column summary statistics.
        summary = Statistics.colStats(vector_data)

        print "Duration Statistics:"
        print " Mean: {}".format(round(summary.mean()[0],3))
        print " St. deviation: {}".format(round(sqrt(summary.variance()[0]),3))
        print " Max value: {}".format(round(summary.max()[0],3))
        print " Min value: {}".format(round(summary.min()[0],3))
        print " Total value count: {}".format(summary.count())
        print " Number of non-zero values: {}".format(summary.numNonzeros()[0])

        label_vector_data = raw_data.map(parse_interaction_with_key)
        normal_label_data = label_vector_data.filter(lambda x: x[0]=="normal.")

        normal_summary = Statistics.colStats(normal_label_data.values())

        print "Duration Statistics for label: {}".format("normal")
        print " Mean: {}".format(normal_summary.mean()[0],3)
        print " St. deviation: {}".format(round(sqrt(normal_summary.variance()[0]),3))
        print " Max value: {}".format(round(normal_summary.max()[0],3))
        print " Min value: {}".format(round(normal_summary.min()[0],3))
        print " Total value count: {}".format(normal_summary.count())
        print " Number of non-zero values: {}".format(normal_summary.numNonzeros()[0])

        normal_sum = summary_by_label(raw_data, "normal.")

        print "Duration Statistics for label: {}".format("normal")
        print " Mean: {}".format(normal_sum.mean()[0],3)
        print " St. deviation: {}".format(round(sqrt(normal_sum.variance()[0]),3))
        print " Max value: {}".format(round(normal_sum.max()[0],3))
        print " Min value: {}".format(round(normal_sum.min()[0],3))
        print " Total value count: {}".format(normal_sum.count())
        print " Number of non-zero values: {}".format(normal_sum.numNonzeros()[0])

        label_list = ["back.","buffer_overflow.","ftp_write.","guess_passwd.",
                      "imap.","ipsweep.","land.","loadmodule.","multihop.",
                      "neptune.","nmap.","normal.","perl.","phf.","pod.","portsweep.",
                      "rootkit.","satan.","smurf.","spy.","teardrop.","warezclient.",
                      "warezmaster."]
        stats_by_label = [(label, summary_by_label(raw_data, label)) for label in label_list]

        duration_by_label = [
            (stat[0], np.array([float(stat[1].mean()[0]), float(sqrt(stat[1].variance()[0])), float(stat[1].min()[0]), float(stat[1].max()[0]), int(stat[1].count())]))
            for stat in stats_by_label]

        pd.set_option('display.max_columns', 50)

        stats_by_label_df = pd.DataFrame.from_items(duration_by_label, columns=["Mean", "Std Dev", "Min", "Max", "Count"], orient='index')  # from_items was removed in pandas 1.0; use pd.DataFrame.from_dict on newer versions

        print "Duration statistics, by label"
        stats_by_label_df
Author: wuzhongdehua, Project: my_pyspark, Lines: 56, Source file: Demo.py
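
run4 imports summary_by_label from my_fun without showing it. A plausible implementation, assuming (as the surrounding code suggests) that parse_interaction_with_key yields (label, feature_vector) pairs:

from pyspark.mllib.stat import Statistics

def summary_by_label(raw_data, label):
    # Hypothetical helper -- my_fun is not shown; this mirrors how run4 uses it.
    label_vector_data = raw_data.map(parse_interaction_with_key)
    return Statistics.colStats(
        label_vector_data.filter(lambda x: x[0] == label).values())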

Example 3: test_col_norms

    def test_col_norms(self):
        data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, len(summary.normL1()))
        self.assertEqual(10, len(summary.normL2()))

        data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
        summary2 = Statistics.colStats(data2)
        self.assertEqual(array([45.0]), summary2.normL1())
        import math
        expectedNormL2 = math.sqrt(sum(map(lambda x: x*x, range(10))))
        self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14)
Author: HodaAlemi, Project: spark, Lines: 12, Source file: tests.py

Example 4: test_col_with_different_rdds

 def test_col_with_different_rdds(self):
     # numpy
     data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
     summary = Statistics.colStats(data)
     self.assertEqual(1000, summary.count())
     # plain Python lists
     data = self.sc.parallelize([range(10)] * 10)
     summary = Statistics.colStats(data)
     self.assertEqual(10, summary.count())
     # array.array values (pyarray is "import array as pyarray" at module level)
     data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
     summary = Statistics.colStats(data)
     self.assertEqual(10, summary.count())
Author: greatyan, Project: spark, Lines: 13, Source file: tests.py

Example 5: test_R_implementation_equivalence

    def test_R_implementation_equivalence(self):
        data = self.sc.parallelize([
            1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501,
            -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555,
            -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063,
            -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691,
            0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942
        ])
        # Kolmogorov-Smirnov test against the standard normal N(0, 1)
        model = Statistics.kolmogorovSmirnovTest(data, "norm")
        self.assertAlmostEqual(model.statistic, 0.189, 3)
        self.assertAlmostEqual(model.pValue, 0.422, 3)

        # Passing the parameters (mean 0, stddev 1) explicitly is equivalent
        model = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
        self.assertAlmostEqual(model.statistic, 0.189, 3)
        self.assertAlmostEqual(model.pValue, 0.422, 3)
Author: drewrobb, Project: spark, Lines: 15, Source file: test_stat.py
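
The optional trailing arguments to kolmogorovSmirnovTest are the parameters of the named distribution: for "norm", the mean followed by the standard deviation. A minimal sketch of testing against a normal fitted to the sample itself; the toy data and the live SparkContext sc are assumptions:

from math import sqrt
from pyspark.mllib.stat import Statistics

data = sc.parallelize([0.2, -0.4, 1.3, -0.7, 0.1, 0.9])  # toy sample
mu, sigma = data.mean(), sqrt(data.variance())
result = Statistics.kolmogorovSmirnovTest(data, "norm", mu, sigma)
print("D = {}, p-value = {}".format(result.statistic, result.pValue))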

Example 6: do_all

def do_all(f_path,out_name):
	sc = SparkContext()
	data = sc.textFile(f_path)

	data = data.map(parseKeepD).filter(lambda p: p[0] is not None)  # parseKeepD is defined elsewhere in the module

	# Scale Features
	features = data.map(lambda x: x[0].features)
	summary = Statistics.colStats(features)
	global means
	global varis
	means = summary.mean()
	varis = summary.variance()

	#scale the points
	data = data.map(lambda y: (conv_label_pt(y[0]),y[1]))

	#train model
	model = LinearRegressionWithSGD().train(data.map(lambda x: x[0]), intercept=True, regType='none')

	#calculate disparity
	disparity = data.map(lambda p: (p[0].label, model.predict(p[0].features), p[1]))  

	#calculate SSR for later
	ssr = disparity.map(lambda x: (x[0] - x[1])**2).sum()

	#keep N
	N = disparity.count()
	MSE = ssr/float(N)
	se = std_errors(data,MSE,N)
	disparity.saveAsTextFile(out_loc + out_name)  # out_loc is a module-level global (not shown)

	#shut down the SparkContext
	sc.stop()
	return model.intercept,model.weights,se,disparity, ssr, N
Author: ssz225, Project: bigdata_final, Lines: 35, Source file: spark_reg_local.py
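
do_all calls parseKeepD, conv_label_pt, and std_errors from the same module without showing them, and writes to a module-level out_loc. Given how means and varis are filled from colStats, conv_label_pt is presumably a feature standardizer; a hedged sketch of such a helper (the original is not shown):

from math import sqrt
from pyspark.mllib.regression import LabeledPoint

def conv_label_pt(lp):
    # Hypothetical standardizer using the module-level `means` and `varis`
    # computed from Statistics.colStats above.
    scaled = [(x - m) / sqrt(v) if v > 0 else 0.0
              for x, m, v in zip(lp.features, means, varis)]
    return LabeledPoint(lp.label, scaled)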

Example 7: summarize

def summarize(dataset):
    print "schema: %s" % dataset.schema().json()
    labels = dataset.map(lambda r: r.label)
    print "label average: %f" % labels.mean()
    features = dataset.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print "features average: %r" % summary.mean()
Author: ArafathC, Project: spark, Lines: 7, Source file: dataset_example.py

Example 8: test_right_number_of_results

 def test_right_number_of_results(self):
     num_cols = 1001
     sparse_data = [
         LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
         LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)]))
     ]
     chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
     self.assertEqual(len(chi), num_cols)
     self.assertIsNotNone(chi[1000])
Author: greatyan, Project: spark, Lines: 9, Source file: tests.py

Example 9: test_goodness_of_fit

    def test_goodness_of_fit(self):
        from numpy import inf

        observed = Vectors.dense([4, 6, 5])
        pearson = Statistics.chiSqTest(observed)

        # Validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
        self.assertEqual(pearson.statistic, 0.4)
        self.assertEqual(pearson.degreesOfFreedom, 2)
        self.assertAlmostEqual(pearson.pValue, 0.8187, 4)

        # Different expected and observed sum
        observed1 = Vectors.dense([21, 38, 43, 80])
        expected1 = Vectors.dense([3, 5, 7, 20])
        pearson1 = Statistics.chiSqTest(observed1, expected1)

        # Results validated against the R command
        # `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
        self.assertAlmostEqual(pearson1.statistic, 14.1429, 4)
        self.assertEqual(pearson1.degreesOfFreedom, 3)
        self.assertAlmostEqual(pearson1.pValue, 0.002717, 4)

        # Vectors with different sizes
        observed3 = Vectors.dense([1.0, 2.0, 3.0])
        expected3 = Vectors.dense([1.0, 2.0, 3.0, 4.0])
        self.assertRaises(ValueError, Statistics.chiSqTest, observed3, expected3)

        # Negative counts in observed
        neg_obs = Vectors.dense([1.0, 2.0, 3.0, -4.0])
        self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, neg_obs, expected1)

        # Count = 0.0 in expected but not observed
        zero_expected = Vectors.dense([1.0, 0.0, 3.0])
        pearson_inf = Statistics.chiSqTest(observed, zero_expected)
        self.assertEqual(pearson_inf.statistic, inf)
        self.assertEqual(pearson_inf.degreesOfFreedom, 2)
        self.assertEqual(pearson_inf.pValue, 0.0)

        # 0.0 in expected and observed simultaneously
        zero_observed = Vectors.dense([2.0, 0.0, 1.0])
        self.assertRaises(
            IllegalArgumentException, Statistics.chiSqTest, zero_observed, zero_expected)
Author: drewrobb, Project: spark, Lines: 42, Source file: test_stat.py

Example 10: recommend2user

	def recommend2user(self,user_id):
		
		query = '''select page_id from cooladata where date_range(last 21 days) and user_id = {:d} and page_id is not null group by page_id;'''.format(user_id)

		def SQLtoURL(query):
			data = query.replace('\n', ' ').replace('\t',' ').replace('   ',' ').replace('  ',' ')
			return data


		def QueryXXXXX(query, file = None):
			session = Session()
			response = session.post(data = {'tq': query,}, url = 'https://app.XXXXXX.com/api/v2/projects/115659/cql/', headers = {'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
			return response.content
		


		table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)),'utf-8'))['table']  # call the helper defined above (name anonymized in the source)
		title_list = [x['c'] for x in table['rows']]
		table_cols = [d['label'] for d in table['cols']]  

		def convert_row(row):
			rowlist = []
			rowlist = [d['v'] for d in row]
			return rowlist

		rd = self.sc.parallelize(title_list).map(convert_row)
		historyTitleData = self.spark.createDataFrame(rd, table_cols)
		historyTitleData = historyTitleData.dropna()
		
		self.model.createOrReplaceTempView("Database")
		historyTitleData.registerTempTable("historyTable")
		
		pageVectorHistory = self.spark.sql('''select d.page_id, d.normTopicDist, case when h.page_id is null then 0 else 1 end as label from Database as d left join historyTable as h on d.page_id = h.page_id''')
		
		mainRdd = pageVectorHistory[pageVectorHistory['label'] == 1][['normTopicDist']].rdd.map(lambda x: x['normTopicDist'].toArray())
		mainVec = Statistics.colStats(mainRdd).mean()

		pageRank = pageVectorHistory[pageVectorHistory['label'] == 0].rdd.map(lambda row: (row['page_id'], float(np.dot(mainVec, row['normTopicDist'].toArray()))))
		pager = pageRank.toDF()
		pager.createOrReplaceTempView("pager")
		sortPageR = self.sqlctx.sql('''select _1 as page_id, _2 as similarity from pager order by similarity desc''')

		return sortPageR.take(10)
Author: NoamRosenberg, Project: Portfolio, Lines: 43, Source file: engine.py
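
Stripped of the analytics-service plumbing, the Statistics usage here boils down to: average the topic vectors of pages the user has read with colStats, then rank unread pages by dot product against that mean vector. A minimal sketch of that core; the page names, vectors, and live SparkContext sc are assumptions:

import numpy as np
from pyspark.mllib.stat import Statistics

read_pages = sc.parallelize([np.array([0.9, 0.1]), np.array([0.7, 0.3])])
unread_pages = sc.parallelize([("page_a", np.array([0.8, 0.2])),
                               ("page_b", np.array([0.1, 0.9]))])

profile = Statistics.colStats(read_pages).mean()  # user's mean topic vector
ranked = (unread_pages
          .map(lambda kv: (kv[0], float(np.dot(profile, kv[1]))))
          .sortBy(lambda kv: -kv[1]))
print(ranked.take(2))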

Example 11: create_or_update_week

def create_or_update_week(influencer_tweets, topic_tweets, week):

    topic_cor = []
    influencer_cor = []
    for t in topic_tweets:
        for i in influencer_tweets:
            if t['time'] == i['time']:
                topic_cor.append(t['count'])
                influencer_cor.append(i['count'])

    if len(topic_cor)<=1:
        corr = 0
    else:

        sc = SparkContext(appName="CorrelationPerWeek")

        topic_tweets = sc.parallelize(topic_cor)
        influencer_tweets = sc.parallelize(influencer_cor)

        corr = Statistics.corr(topic_tweets, influencer_tweets, "pearson")

        sc.stop()

    url = "http://localhost:8000/api/weeks/"

    today = datetime.fromtimestamp(week/1000.0)
    payload = '{    "score": %f,    "start_date": "%s"  }' % (
        float(corr), str(today.year) + "-" + str(today.month) + "-" + str(today.day))
    headers = {
        'authorization': "Basic ZGV2OjEyMzQ=",
        'content-type': "application/json",
        'cache-control': "no-cache",
        'postman-token': "7c8668c0-a4c2-f42d-66a9-95cbfb7806c5"
    }

    try:
        response = requests.request("POST", url, data=payload, headers=headers)
        return  response.json()['id']
    except:
        print "error"

    return 1
Author: epu-ntua, Project: PSYMBIOSYS-Influencial, Lines: 42, Source file: worker.py

Example 12: test_matrix_independence

    def test_matrix_independence(self):
        data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
        chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))

        # Results validated against R command
        # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
        self.assertAlmostEqual(chi.statistic, 21.9958, 4)
        self.assertEqual(chi.degreesOfFreedom, 6)
        self.assertAlmostEqual(chi.pValue, 0.001213, 4)

        # Negative counts
        neg_counts = Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, neg_counts)

        # Row sum = 0.0
        row_zero = Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, row_zero)

        # Column sum = 0.0
        col_zero = Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, col_zero)
Author: greatyan, Project: spark, Lines: 21, Source file: tests.py

Example 13: test_chi_sq_pearson

    def test_chi_sq_pearson(self):
        data = [
            LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
            LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
            LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
            LabeledPoint(1.0, Vectors.dense([3.5, 40.0]))
        ]

        for numParts in [2, 4, 6, 8]:
            chi = Statistics.chiSqTest(self.sc.parallelize(data, numParts))
            feature1 = chi[0]
            self.assertEqual(feature1.statistic, 0.75)
            self.assertEqual(feature1.degreesOfFreedom, 2)
            self.assertAlmostEqual(feature1.pValue, 0.6873, 4)

            feature2 = chi[1]
            self.assertEqual(feature2.statistic, 1.5)
            self.assertEqual(feature2.degreesOfFreedom, 3)
            self.assertAlmostEqual(feature2.pValue, 0.6823, 4)
Author: greatyan, Project: spark, Lines: 21, Source file: tests.py

Example 14: get_language_correlation

def get_language_correlation():
    """
        calculates the correlation between github languages
    """
    #Create Spark Context
    sc = SparkContext(appName="LanguageCorrelations")

    #Create SQL Context
    sqlCtx = SQLContext(sc)

    #Create a schemaRDD from json datasets stored in HDFS
    pushes = sqlCtx.jsonFile('git_14_15/git_results')

    #Register the schemaRDD as a Table
    pushes.registerTempTable('pushes')

    #filter the data to get the pushes for the languages from LANG
    filtered = sqlCtx.sql('select * from pushes where repository_language in ' + str(tuple(LANG)))

    #perform map transformation to get the rdd in the format (actor, {lang : pushes})
    f_pair = filtered.map(lambda s: (s.actor, {s.repository_language:s.pushes}))

    #group the RDDs by actor to get an RDD of the format (actor, [{lang1 : pushes},{lang2 : pushes}...])
    f_group = f_pair.groupByKey()

    #merge lang dictionaries to get a single ordered dict per actor
    f_merged = f_group.map(lambda s: merge_lang_dict(s[1]))

    #create an RDD of dense vectors from the push counts, as required by the correlation routine
    vectors = f_merged.map(lambda s: Vectors.dense(map(float, s.values())))  
    
    #call the correlation function
    matrix = Statistics.corr(vectors)
    print matrix
    plot_graph(matrix)
    sc.stop()
Author: rackerlabs, Project: cloudbigdata-extras, Lines: 36, Source file: github_lang_correlations.py
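
merge_lang_dict is defined elsewhere in the project. For Statistics.corr to be meaningful, every actor's vector must list push counts in the same language order, zero-filling languages the actor never used; a plausible implementation under that assumption:

from collections import OrderedDict

def merge_lang_dict(lang_dicts):
    # Hypothetical merge -- the project's real helper is not shown.
    # Fixes the column order to LANG and zero-fills missing languages.
    merged = OrderedDict((lang, 0.0) for lang in LANG)
    for d in lang_dicts:
        for lang, pushes in d.items():
            merged[lang] += pushes
    return merged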

示例15: print

##### Cheating a little #####
# Use pandas to summarize the data and display the correlation matrix
df = pd.read_csv("file:/C:/spark-1.6.0-bin-hadoop2.4/"+nomF+".csv", sep = ";",header=0)
df.describe()
# Correlation matrix
# print(df.corr())


# ### MLlib Statistics

# In[5]:

from pyspark.mllib.stat import Statistics
# Basic statistics
partsNum = parts.map(lambda line: line[0:8])
summary = Statistics.colStats(partsNum)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())
Statistics.corr(partsNum, method="pearson")


# # Supervised classification

# ## Naive Bayes

# In[6]:

from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
import utils_mesure
nomF_svm = "glass_svm"
Author: noemieSP, Project: ML_Spark, Lines: 31, Source file: ML_Spark.py
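
The cell above assumes an RDD named parts built in an earlier notebook cell that is not shown. A plausible construction from the same semicolon-separated CSV; this is a sketch, assuming a header row and numeric columns in the positions used:

raw = sc.textFile("file:/C:/spark-1.6.0-bin-hadoop2.4/" + nomF + ".csv")
header = raw.first()
parts = (raw.filter(lambda line: line != header)
            .map(lambda line: [float(v) for v in line.split(";")]))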


Note: the pyspark.mllib.stat.Statistics class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright in the source code belongs to the original authors, and distribution and use must follow each project's license. Do not reproduce without permission.