This page collects typical usage examples of the Python pyspark.mllib.stat.Statistics class. If you are wondering what the Statistics class does, how to use it, or what it looks like in real code, the curated examples below should help.
Shown below are 15 code examples of the Statistics class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
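Before the examples, here is a minimal orientation sketch of the two calls that recur throughout this page, Statistics.colStats and Statistics.corr (a self-contained sketch; the app name and variable names are illustrative):

from pyspark import SparkContext
from pyspark.mllib.stat import Statistics

sc = SparkContext(appName="StatisticsOrientation")

# Column summary statistics over an RDD of vectors
rdd = sc.parallelize([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
summary = Statistics.colStats(rdd)
print(summary.mean())      # per-column means
print(summary.variance())  # per-column variances

# Correlation between two RDDs of doubles (Pearson here; Spearman also supported)
x = sc.parallelize([1.0, 2.0, 3.0])
y = sc.parallelize([10.0, 20.0, 30.0])
print(Statistics.corr(x, y, method="pearson"))

sc.stop()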
Example 1: spark_pearson
def spark_pearson(a, b):
    # `sc` (a SparkContext) and `func` (a wrapped function) are assumed to be
    # defined in the enclosing scope; this snippet appears to come from a
    # decorator-style wrapper that injects the results into func's globals.
    rdd_a = sc.parallelize(a)
    rdd_b = sc.parallelize(b)
    g = func.func_globals  # Python 2; use func.__globals__ on Python 3
    g['pearson'] = Statistics.corr(rdd_a, rdd_b, 'pearson')
    g['rho'] = Statistics.corr(rdd_a, rdd_b, 'spearman')
    func(a, b)
Example 2: run4
def run4(self):
    from my_fun import parse_interaction, parse_interaction_with_key, summary_by_label
    # sqrt (math), np (numpy) and pd (pandas) are module-level imports
    # in the source project.
    raw_data = self.raw_data
    vector_data = raw_data.map(parse_interaction)
    # Compute column summary statistics.
    summary = Statistics.colStats(vector_data)
    print "Duration Statistics:"
    print " Mean: {}".format(round(summary.mean()[0], 3))
    print " St. deviation: {}".format(round(sqrt(summary.variance()[0]), 3))
    print " Max value: {}".format(round(summary.max()[0], 3))
    print " Min value: {}".format(round(summary.min()[0], 3))
    print " Total value count: {}".format(summary.count())
    print " Number of non-zero values: {}".format(summary.numNonzeros()[0])
    label_vector_data = raw_data.map(parse_interaction_with_key)
    normal_label_data = label_vector_data.filter(lambda x: x[0] == "normal.")
    normal_summary = Statistics.colStats(normal_label_data.values())
    print "Duration Statistics for label: {}".format("normal")
    print " Mean: {}".format(round(normal_summary.mean()[0], 3))
    print " St. deviation: {}".format(round(sqrt(normal_summary.variance()[0]), 3))
    print " Max value: {}".format(round(normal_summary.max()[0], 3))
    print " Min value: {}".format(round(normal_summary.min()[0], 3))
    print " Total value count: {}".format(normal_summary.count())
    print " Number of non-zero values: {}".format(normal_summary.numNonzeros()[0])
    # Same summary, this time through the summary_by_label helper.
    normal_sum = summary_by_label(raw_data, "normal.")
    print "Duration Statistics for label: {}".format("normal")
    print " Mean: {}".format(round(normal_sum.mean()[0], 3))
    print " St. deviation: {}".format(round(sqrt(normal_sum.variance()[0]), 3))
    print " Max value: {}".format(round(normal_sum.max()[0], 3))
    print " Min value: {}".format(round(normal_sum.min()[0], 3))
    print " Total value count: {}".format(normal_sum.count())
    print " Number of non-zero values: {}".format(normal_sum.numNonzeros()[0])
    label_list = ["back.", "buffer_overflow.", "ftp_write.", "guess_passwd.",
                  "imap.", "ipsweep.", "land.", "loadmodule.", "multihop.",
                  "neptune.", "nmap.", "normal.", "perl.", "phf.", "pod.", "portsweep.",
                  "rootkit.", "satan.", "smurf.", "spy.", "teardrop.", "warezclient.",
                  "warezmaster."]
    stats_by_label = [(label, summary_by_label(raw_data, label)) for label in label_list]
    duration_by_label = [
        (stat[0], np.array([float(stat[1].mean()[0]), float(sqrt(stat[1].variance()[0])),
                            float(stat[1].min()[0]), float(stat[1].max()[0]), int(stat[1].count())]))
        for stat in stats_by_label]
    pd.set_option('display.max_columns', 50)
    stats_by_label_df = pd.DataFrame.from_items(duration_by_label, columns=["Mean", "Std Dev", "Min", "Max", "Count"], orient='index')
    print "Duration statistics, by label"
    print stats_by_label_df
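The helpers imported from my_fun are not shown on this page. Below is a plausible, hypothetical reconstruction of summary_by_label, consistent with how run4 uses it on the KDD Cup 99 data (the field layout and parsing details are assumptions; the real my_fun may differ):

import numpy as np
from pyspark.mllib.stat import Statistics

def parse_interaction_with_key(line):
    # Hypothetical: KDD Cup 99 records are comma-separated with the label
    # in the last (42nd) field and the numeric duration in the first.
    fields = line.split(",")
    return (fields[41], np.array([float(fields[0])]))

def summary_by_label(raw_data, label):
    # Hypothetical: column stats restricted to rows carrying one label.
    keyed = raw_data.map(parse_interaction_with_key)
    return Statistics.colStats(keyed.filter(lambda x: x[0] == label).values())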
Example 3: test_col_norms
def test_col_norms(self):
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, len(summary.normL1()))
    self.assertEqual(10, len(summary.normL2()))

    data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
    summary2 = Statistics.colStats(data2)
    self.assertEqual(array([45.0]), summary2.normL1())

    import math
    expectedNormL2 = math.sqrt(sum(map(lambda x: x * x, range(10))))
    self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14)
Example 4: test_col_with_different_rdds
def test_col_with_different_rdds(self):
    # RDD of numpy-backed vectors
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(1000, summary.count())

    # RDD of plain Python lists
    data = self.sc.parallelize([range(10)] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())

    # RDD of stdlib arrays (the array module imported as pyarray)
    data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())
Example 5: test_R_implementation_equivalence
def test_R_implementation_equivalence(self):
    data = self.sc.parallelize([
        1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501,
        -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555,
        -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063,
        -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691,
        0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942
    ])
    # Statistic and p-value validated against R's ks.test for a standard normal.
    model = Statistics.kolmogorovSmirnovTest(data, "norm")
    self.assertAlmostEqual(model.statistic, 0.189, 3)
    self.assertAlmostEqual(model.pValue, 0.422, 3)
    # Passing the location and scale explicitly (0, 1) must agree with the
    # defaults above.
    model = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
    self.assertAlmostEqual(model.statistic, 0.189, 3)
    self.assertAlmostEqual(model.pValue, 0.422, 3)
Example 6: do_all
def do_all(f_path, out_name):
    sc = SparkContext()
    data = sc.textFile(f_path)
    data = data.map(parseKeepD).filter(lambda p: p[0] is not None)
    # Summarize the features so they can be scaled
    features = data.map(lambda x: x[0].features)
    summary = Statistics.colStats(features)
    global means
    global varis
    means = summary.mean()
    varis = summary.variance()
    # Scale the points
    data = data.map(lambda y: (conv_label_pt(y[0]), y[1]))
    # Train the model
    model = LinearRegressionWithSGD.train(data.map(lambda x: x[0]), intercept=True, regType='none')
    # Calculate disparity
    disparity = data.map(lambda p: (p[0].label, model.predict(p[0].features), p[1]))
    # Calculate SSR for later
    ssr = disparity.map(lambda x: (x[0] - x[1]) ** 2).sum()
    # Keep N
    N = disparity.count()
    MSE = ssr / float(N)
    se = std_errors(data, MSE, N)
    disparity.saveAsTextFile(out_loc + out_name)  # out_loc: module-level output path
    # Shut down the SparkContext
    sc.stop()
    return model.intercept, model.weights, se, disparity, ssr, N
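parseKeepD, conv_label_pt, and std_errors live elsewhere in the source project. To illustrate the scaling step only, here is a hypothetical conv_label_pt that standardizes features with the global means and varis computed above (an assumption, not the project's actual helper):

import numpy as np
from pyspark.mllib.regression import LabeledPoint

def conv_label_pt(lp):
    # Hypothetical: z-score each feature using the module-level
    # statistics produced by Statistics.colStats.
    scaled = (np.array(lp.features) - means) / np.sqrt(varis)
    return LabeledPoint(lp.label, scaled)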
Example 7: summarize
def summarize(dataset):
    print "schema: %s" % dataset.schema().json()
    labels = dataset.map(lambda r: r.label)
    print "label average: %f" % labels.mean()
    features = dataset.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print "features average: %r" % summary.mean()
Example 8: test_right_number_of_results
def test_right_number_of_results(self):
    num_cols = 1001
    sparse_data = [
        LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
        LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)]))
    ]
    chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
    self.assertEqual(len(chi), num_cols)
    self.assertIsNotNone(chi[1000])
Example 9: test_goodness_of_fit
def test_goodness_of_fit(self):
    from numpy import inf

    observed = Vectors.dense([4, 6, 5])
    pearson = Statistics.chiSqTest(observed)
    # Validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
    self.assertEqual(pearson.statistic, 0.4)
    self.assertEqual(pearson.degreesOfFreedom, 2)
    self.assertAlmostEqual(pearson.pValue, 0.8187, 4)

    # Different expected and observed sum
    observed1 = Vectors.dense([21, 38, 43, 80])
    expected1 = Vectors.dense([3, 5, 7, 20])
    pearson1 = Statistics.chiSqTest(observed1, expected1)
    # Results validated against the R command
    # `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
    self.assertAlmostEqual(pearson1.statistic, 14.1429, 4)
    self.assertEqual(pearson1.degreesOfFreedom, 3)
    self.assertAlmostEqual(pearson1.pValue, 0.002717, 4)

    # Vectors with different sizes
    observed3 = Vectors.dense([1.0, 2.0, 3.0])
    expected3 = Vectors.dense([1.0, 2.0, 3.0, 4.0])
    self.assertRaises(ValueError, Statistics.chiSqTest, observed3, expected3)

    # Negative counts in observed
    neg_obs = Vectors.dense([1.0, 2.0, 3.0, -4.0])
    self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, neg_obs, expected1)

    # Count = 0.0 in expected but not observed
    zero_expected = Vectors.dense([1.0, 0.0, 3.0])
    pearson_inf = Statistics.chiSqTest(observed, zero_expected)
    self.assertEqual(pearson_inf.statistic, inf)
    self.assertEqual(pearson_inf.degreesOfFreedom, 2)
    self.assertEqual(pearson_inf.pValue, 0.0)

    # 0.0 in expected and observed simultaneously
    zero_observed = Vectors.dense([2.0, 0.0, 1.0])
    self.assertRaises(
        IllegalArgumentException, Statistics.chiSqTest, zero_observed, zero_expected)
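As a hand-check of the first assertion: with no expected vector, chiSqTest tests against the uniform distribution, so each of the three cells has expected count 15/3 = 5, giving chi² = (4−5)²/5 + (6−5)²/5 + (5−5)²/5 = 0.2 + 0.2 + 0 = 0.4 on 3 − 1 = 2 degrees of freedom, exactly the statistic and degreesOfFreedom asserted above.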
Example 10: recommend2user
def recommend2user(self, user_id):
    query = '''select page_id from cooladata where date_range(last 21 days) and user_id = {:d} and page_id is not null group by page_id;'''.format(user_id)

    def SQLtoURL(query):
        # Normalize newlines, tabs and runs of spaces in the query to
        # single spaces before posting it.
        return ' '.join(query.split())

    def QueryXXXXX(query, file=None):
        # The service name is anonymized ("XXXXX") in the source.
        session = Session()
        response = session.post(data={'tq': query},
                                url='https://app.XXXXXX.com/api/v2/projects/115659/cql/',
                                headers={'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'})
        return response.content

    table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)), 'utf-8'))['table']
    title_list = [x['c'] for x in table['rows']]
    table_cols = [d['label'] for d in table['cols']]

    def convert_row(row):
        return [d['v'] for d in row]

    rd = self.sc.parallelize(title_list).map(convert_row)
    historyTitleData = self.spark.createDataFrame(rd, table_cols)
    historyTitleData = historyTitleData.dropna()
    self.model.createOrReplaceTempView("Database")
    historyTitleData.registerTempTable("historyTable")
    pageVectorHistory = self.spark.sql('''select d.page_id, d.normTopicDist, case when h.page_id is null then 0 else 1 end as label from Database as d left join historyTable as h on d.page_id = h.page_id''')
    mainRdd = pageVectorHistory[pageVectorHistory['label'] == 1][['normTopicDist']].rdd.map(lambda x: x['normTopicDist'].toArray())
    # Mean topic vector over the user's history, via colStats
    mainVec = Statistics.colStats(mainRdd).mean()
    # Rank unseen pages by dot-product similarity with the history mean
    pageRank = pageVectorHistory[pageVectorHistory['label'] == 0].rdd.map(lambda row: (row['page_id'], float(np.dot(mainVec, row['normTopicDist'].toArray()))))
    pager = pageRank.toDF()
    pager.createOrReplaceTempView("pager")
    sortPageR = self.sqlctx.sql('''select _1 as page_id, _2 as similarity from pager order by similarity desc''')
    return sortPageR.take(10)
Example 11: create_or_update_week
def create_or_update_week(influencer_tweets, topic_tweets, week):
    topic_cor = []
    influencer_cor = []
    for t in topic_tweets:
        for i in influencer_tweets:
            if t['time'] == i['time']:
                topic_cor.append(t['count'])
                influencer_cor.append(i['count'])
    if len(topic_cor) <= 1:
        corr = 0
    else:
        sc = SparkContext(appName="CorrelationPerWeek")
        topic_tweets = sc.parallelize(topic_cor)
        influencer_tweets = sc.parallelize(influencer_cor)
        corr = Statistics.corr(topic_tweets, influencer_tweets, "pearson")
        sc.stop()
    url = "http://localhost:8000/api/weeks/"
    today = datetime.fromtimestamp(week / 1000.0)
    payload = '{ "score": %f, "start_date": "%s" }' % (
        float(corr), str(today.year) + "-" + str(today.month) + "-" + str(today.day))
    headers = {
        'authorization': "Basic ZGV2OjEyMzQ=",
        'content-type': "application/json",
        'cache-control': "no-cache",
        'postman-token': "7c8668c0-a4c2-f42d-66a9-95cbfb7806c5"
    }
    try:
        response = requests.request("POST", url, data=payload, headers=headers)
        return response.json()['id']
    except:
        print "error"
        return 1
Example 12: test_matrix_independence
def test_matrix_independence(self):
    data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
    chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
    # Results validated against the R command
    # `chisq.test(rbind(c(40, 56, 31, 30), c(24, 32, 10, 15), c(29, 42, 0, 12)))`
    self.assertAlmostEqual(chi.statistic, 21.9958, 4)
    self.assertEqual(chi.degreesOfFreedom, 6)
    self.assertAlmostEqual(chi.pValue, 0.001213, 4)

    # Negative counts
    neg_counts = Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0])
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest, neg_counts)

    # Row sum = 0.0
    row_zero = Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0])
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest, row_zero)

    # Column sum = 0.0
    col_zero = Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0])
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest, col_zero)
Example 13: test_chi_sq_pearson
def test_chi_sq_pearson(self):
    data = [
        LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
        LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
        LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
        LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
        LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
        LabeledPoint(1.0, Vectors.dense([3.5, 40.0]))
    ]

    for numParts in [2, 4, 6, 8]:
        chi = Statistics.chiSqTest(self.sc.parallelize(data, numParts))
        feature1 = chi[0]
        self.assertEqual(feature1.statistic, 0.75)
        self.assertEqual(feature1.degreesOfFreedom, 2)
        self.assertAlmostEqual(feature1.pValue, 0.6873, 4)

        feature2 = chi[1]
        self.assertEqual(feature2.statistic, 1.5)
        self.assertEqual(feature2.degreesOfFreedom, 3)
        self.assertAlmostEqual(feature2.pValue, 0.6823, 4)
Example 14: get_language_correlation
def get_language_correlation():
    """Calculate the correlation between GitHub languages."""
    # Create Spark context
    sc = SparkContext(appName="LanguageCorrelations")
    # Create SQL context
    sqlCtx = SQLContext(sc)
    # Create a schemaRDD from JSON datasets stored in HDFS
    pushes = sqlCtx.jsonFile('git_14_15/git_results')
    # Register the schemaRDD as a table
    pushes.registerTempTable('pushes')
    # Filter the data to get the pushes for the languages in LANG
    filtered = sqlCtx.sql('select * from pushes where repository_language in ' + str(tuple(LANG)))
    # Map to the format (actor, {lang: pushes})
    f_pair = filtered.map(lambda s: (s.actor, {s.repository_language: s.pushes}))
    # Group by actor to get (actor, [{lang1: pushes}, {lang2: pushes}, ...])
    f_group = f_pair.groupByKey()
    # Merge the language dictionaries into a single ordered dict per actor
    f_merged = f_group.map(lambda s: merge_lang_dict(s[1]))
    # Build an RDD of dense vectors from the push counts, as required by corr
    vectors = f_merged.map(lambda s: Vectors.dense(map(float, s.values())))
    # Compute the pairwise correlation matrix
    matrix = Statistics.corr(vectors)
    print matrix
    plot_graph(matrix)
    sc.stop()
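merge_lang_dict is not shown on this page, but the correlation matrix is only meaningful if every actor's vector lists the same languages in the same order. A hypothetical version consistent with that requirement (the name is from the source; the zero-filling and LANG ordering are assumptions):

from collections import OrderedDict

def merge_lang_dict(lang_dicts):
    # Hypothetical: fold per-language push counts into one OrderedDict
    # keyed in the fixed LANG order, so all actors' vectors are aligned.
    merged = OrderedDict((lang, 0.0) for lang in LANG)
    for d in lang_dicts:
        for lang, pushes in d.items():
            merged[lang] += pushes
    return merged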
Example 15: print
##### Cheating a bit #####
# Use pandas to summarize the data and display the correlation matrix
df = pd.read_csv("file:/C:/spark-1.6.0-bin-hadoop2.4/" + nomF + ".csv", sep=";", header=0)
df.describe()

# Correlation matrix
# print(df.corr())

# ### MLlib Statistics

# In[5]:

from pyspark.mllib.stat import Statistics

# Basic statistics
# `parts` is an RDD of parsed numeric rows built in an earlier cell of the
# source notebook; only the first 8 columns are summarized here.
partsNum = parts.map(lambda line: line[0:8])
summary = Statistics.colStats(partsNum)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())
Statistics.corr(partsNum, method="pearson")

# # Supervised classification

# ## Naive Bayes

# In[6]:

from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
import utils_mesure

nomF_svm = "glass_svm"