This article collects typical usage examples of the Python method pyspark.mllib.stat.Statistics.colStats. If you have been wondering what Statistics.colStats does, how to use it, or what working examples look like, the curated code samples below should help. You can also explore further usage examples of the containing class, pyspark.mllib.stat.Statistics.
The following presents 15 code examples of the Statistics.colStats method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
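Before the examples, a minimal sketch of the API may help orient you: colStats takes an RDD of numeric vectors and returns a MultivariateStatisticalSummary whose methods (mean, variance, count, max, min, numNonzeros, normL1, normL2) operate column-wise. The data values below are illustrative assumptions, not taken from any example on this page.
# A minimal, self-contained colStats sketch (illustrative data).
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

sc = SparkContext(appName="colStatsSketch")
rdd = sc.parallelize([Vectors.dense([1.0, 10.0, 100.0]),
                      Vectors.dense([2.0, 20.0, 200.0]),
                      Vectors.dense([3.0, 30.0, 300.0])])
summary = Statistics.colStats(rdd)   # a MultivariateStatisticalSummary
print(summary.mean())                # per-column means: [2.0, 20.0, 200.0]
print(summary.variance())            # per-column variances
print(summary.count())               # number of rows: 3
sc.stop()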
Example 1: run4
# Required import: from pyspark.mllib.stat import Statistics [as alias]
# Or: from pyspark.mllib.stat.Statistics import colStats [as alias]
def run4(self):
    from math import sqrt
    import numpy as np
    import pandas as pd
    from pyspark.mllib.stat import Statistics
    from my_fun import parse_interaction, parse_interaction_with_key, summary_by_label
    raw_data = self.raw_data
    vector_data = raw_data.map(parse_interaction)
    # Compute column summary statistics.
    summary = Statistics.colStats(vector_data)
    print("Duration Statistics:")
    print(" Mean: {}".format(round(summary.mean()[0], 3)))
    print(" St. deviation: {}".format(round(sqrt(summary.variance()[0]), 3)))
    print(" Max value: {}".format(round(summary.max()[0], 3)))
    print(" Min value: {}".format(round(summary.min()[0], 3)))
    print(" Total value count: {}".format(summary.count()))
    print(" Number of non-zero values: {}".format(summary.numNonzeros()[0]))
    # Summary statistics for the "normal." label only
    label_vector_data = raw_data.map(parse_interaction_with_key)
    normal_label_data = label_vector_data.filter(lambda x: x[0] == "normal.")
    normal_summary = Statistics.colStats(normal_label_data.values())
    print("Duration Statistics for label: {}".format("normal"))
    print(" Mean: {}".format(round(normal_summary.mean()[0], 3)))
    print(" St. deviation: {}".format(round(sqrt(normal_summary.variance()[0]), 3)))
    print(" Max value: {}".format(round(normal_summary.max()[0], 3)))
    print(" Min value: {}".format(round(normal_summary.min()[0], 3)))
    print(" Total value count: {}".format(normal_summary.count()))
    print(" Number of non-zero values: {}".format(normal_summary.numNonzeros()[0]))
    # The same computation via the summary_by_label helper (see Example 10)
    normal_sum = summary_by_label(raw_data, "normal.")
    print("Duration Statistics for label: {}".format("normal"))
    print(" Mean: {}".format(round(normal_sum.mean()[0], 3)))
    print(" St. deviation: {}".format(round(sqrt(normal_sum.variance()[0]), 3)))
    print(" Max value: {}".format(round(normal_sum.max()[0], 3)))
    print(" Min value: {}".format(round(normal_sum.min()[0], 3)))
    print(" Total value count: {}".format(normal_sum.count()))
    print(" Number of non-zero values: {}".format(normal_sum.numNonzeros()[0]))
    label_list = ["back.", "buffer_overflow.", "ftp_write.", "guess_passwd.",
                  "imap.", "ipsweep.", "land.", "loadmodule.", "multihop.",
                  "neptune.", "nmap.", "normal.", "perl.", "phf.", "pod.", "portsweep.",
                  "rootkit.", "satan.", "smurf.", "spy.", "teardrop.", "warezclient.",
                  "warezmaster."]
    stats_by_label = [(label, summary_by_label(raw_data, label)) for label in label_list]
    duration_by_label = [
        (stat[0], np.array([float(stat[1].mean()[0]),
                            float(sqrt(stat[1].variance()[0])),
                            float(stat[1].min()[0]),
                            float(stat[1].max()[0]),
                            int(stat[1].count())]))
        for stat in stats_by_label]
    pd.set_option('display.max_columns', 50)
    # pd.DataFrame.from_items was removed in pandas 1.0; from_dict is the equivalent here
    stats_by_label_df = pd.DataFrame.from_dict(
        dict(duration_by_label), orient='index',
        columns=["Mean", "Std Dev", "Min", "Max", "Count"])
    print("Duration statistics, by label")
    print(stats_by_label_df)
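A note on the design: each colStats call triggers a separate Spark job over the RDD, so the per-label loop above launches one job per entry in label_list. If raw_data is re-read from its source on every pass, calling raw_data.cache() before the loop is a reasonable optimization.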
Example 2: test_col_norms
# Required import: from pyspark.mllib.stat import Statistics [as alias]
# Or: from pyspark.mllib.stat.Statistics import colStats [as alias]
def test_col_norms(self):
    # assumes the test module imports: from numpy import array;
    # from pyspark.mllib.linalg import Vectors; from pyspark.mllib.random import RandomRDDs
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, len(summary.normL1()))
    self.assertEqual(10, len(summary.normL2()))
    data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
    summary2 = Statistics.colStats(data2)
    self.assertEqual(array([45.0]), summary2.normL1())
    import math
    expectedNormL2 = math.sqrt(sum(map(lambda x: x * x, range(10))))
    self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14)
Example 3: test_col_with_different_rdds
# Required import: from pyspark.mllib.stat import Statistics [as alias]
# Or: from pyspark.mllib.stat.Statistics import colStats [as alias]
def test_col_with_different_rdds(self):
    # RDD of numpy-backed vectors
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(1000, summary.count())
    # RDD of plain Python sequences
    data = self.sc.parallelize([range(10)] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())
    # RDD of array.array values (imported as pyarray in the test module)
    data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())
Example 4: do_all
# Required import: from pyspark.mllib.stat import Statistics [as alias]
# Or: from pyspark.mllib.stat.Statistics import colStats [as alias]
def do_all(f_path, out_name):
    # assumes module-level: from pyspark import SparkContext;
    # from pyspark.mllib.regression import LinearRegressionWithSGD;
    # plus the helpers parseKeepD, conv_label_pt, std_errors and the global out_loc
    sc = SparkContext()
    data = sc.textFile(f_path)
    data = data.map(parseKeepD).filter(lambda p: p[0] is not None)
    # Scale features using the column means and variances
    features = data.map(lambda x: x[0].features)
    summary = Statistics.colStats(features)
    global means
    global varis
    means = summary.mean()
    varis = summary.variance()
    # scale the points
    data = data.map(lambda y: (conv_label_pt(y[0]), y[1]))
    # train the model
    model = LinearRegressionWithSGD.train(data.map(lambda x: x[0]), intercept=True, regType='none')
    # calculate disparity
    disparity = data.map(lambda p: (p[0].label, model.predict(p[0].features), p[1]))
    # calculate SSR for later
    ssr = disparity.map(lambda x: (x[0] - x[1]) ** 2).sum()
    # keep N
    N = disparity.count()
    MSE = ssr / float(N)
    se = std_errors(data, MSE, N)
    disparity.saveAsTextFile(out_loc + out_name)
    # shut down the SparkContext
    sc.stop()
    return model.intercept, model.weights, se, disparity, ssr, N
Example 5: summarize
# Required import: from pyspark.mllib.stat import Statistics [as alias]
# Or: from pyspark.mllib.stat.Statistics import colStats [as alias]
def summarize(dataset):
    print("schema: %s" % dataset.schema().json())
    labels = dataset.map(lambda r: r.label)
    print("label average: %f" % labels.mean())
    features = dataset.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print("features average: %r" % summary.mean())
Example 6: recommend2user
# Required import: from pyspark.mllib.stat import Statistics [as alias]
# Or: from pyspark.mllib.stat.Statistics import colStats [as alias]
def recommend2user(self, user_id):
    # assumes module-level: import json, codecs; import numpy as np; from requests import Session
    query = '''select page_id from cooladata where date_range(last 21 days) and user_id = {:d} and page_id is not null group by page_id;'''.format(user_id)
    def SQLtoURL(query):
        # collapse newlines, tabs, and repeated spaces into single spaces
        return ' '.join(query.split())
    def QueryXXXXX(query, file=None):
        session = Session()
        response = session.post(data={'tq': query},
                                url='https://app.XXXXXX.com/api/v2/projects/115659/cql/',
                                headers={'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'})
        return response.content
    table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)), 'utf-8'))['table']
    title_list = [x['c'] for x in table['rows']]
    table_cols = [d['label'] for d in table['cols']]
    def convert_row(row):
        return [d['v'] for d in row]
    rd = self.sc.parallelize(title_list).map(convert_row)
    historyTitleData = self.spark.createDataFrame(rd, table_cols)
    historyTitleData = historyTitleData.dropna()
    self.model.createOrReplaceTempView("Database")
    historyTitleData.registerTempTable("historyTable")
    pageVectorHistory = self.spark.sql('''select d.page_id, d.normTopicDist, case when h.page_id is null then 0 else 1 end as label from Database as d left join historyTable as h on d.page_id = h.page_id''')
    # mean topic vector over the pages the user has already visited
    mainRdd = pageVectorHistory[pageVectorHistory['label'] == 1][['normTopicDist']].rdd.map(lambda x: x['normTopicDist'].toArray())
    mainVec = Statistics.colStats(mainRdd).mean()
    # rank unvisited pages by dot-product similarity with that mean vector
    pageRank = pageVectorHistory[pageVectorHistory['label'] == 0].rdd.map(lambda row: (row['page_id'], float(np.dot(mainVec, row['normTopicDist'].toArray()))))
    pager = pageRank.toDF()
    pager.createOrReplaceTempView("pager")
    sortPageR = self.sqlctx.sql('''select _1 as page_id, _2 as similarity from pager order by similarity desc''')
    return sortPageR.take(10)
Example 7: SparkContext
# Required import: from pyspark.mllib.stat import Statistics [as alias]
# Or: from pyspark.mllib.stat.Statistics import colStats [as alias]
# Summary stats of data
# source: https://github.com/apache/spark/blob/master/examples/src/main/python/mllib/summary_statistics_example.py
from __future__ import print_function
from pyspark import SparkContext
import numpy as np
from pyspark.mllib.stat import Statistics

if __name__ == "__main__":
    sc = SparkContext(appName="SummaryStatisticsExample")  # SparkContext
    taxi = sc.textFile("s3://irm238finalproject/input/yellow*")  # raw CSV lines
    # split each record and cast its fields to floats; colStats expects an RDD of
    # numeric vectors (this assumes all-numeric columns; real trip data would need
    # per-column parsing)
    taxic = taxi.filter(lambda line: line[1:10]) \
                .map(lambda row: [float(x) for x in row.split(",")])
    # Compute column summary statistics.
    summary = Statistics.colStats(taxic)
    print(summary.mean())  # a dense vector containing the mean value for each column
    print(summary.variance())  # column-wise variance
    print(summary.numNonzeros())  # number of nonzeros in each column
    sc.stop()
Example 8: SparkContext
# Required import: from pyspark.mllib.stat import Statistics [as alias]
# Or: from pyspark.mllib.stat.Statistics import colStats [as alias]
# Create a Spark context and set it to work
with SparkContext(conf=conf) as sc:
    # Read the parsed records. This time we are reading the serialized file,
    # so in each record the fields are already split.
    directory = "hdfs:///user/{0}/data/tvlogs/".format(sc.sparkUser())
    logs = sc.pickleFile("{0}{1}".format(directory, name))
    # Group records by user
    byUser = logs.map(lambda x: (x[0], x[1:])).groupByKey()
    # Compute the time difference between consecutive records of each user
    intervals = byUser.flatMap(lambda kv: time_intervals(kv[1]))
    # cache it for reuse
    intervals.cache()
    # Extract statistics from those time differences.
    # Note that colStats needs a Vector (or a Python list), since it computes
    # by column; in our case we have a 1-column list.
    summary = Statistics.colStats(intervals)
    with open('interval-stats.txt', 'w') as out:
        for s in ('count', 'mean', 'variance', 'min', 'max', 'numNonzeros'):
            print(' * {0}: {1}'.format(s, getattr(summary, s)()), file=out)
    # Also save the intervals to disk, flattening the 1-element lists first
    flat = intervals.map(lambda x: x[0])
    flat.saveAsTextFile("hdfs:///user/{0}/data/tvlogs/intervals.txt".format(sc.sparkUser()))
Example 9: SparkContext
# Required import: from pyspark.mllib.stat import Statistics [as alias]
# Or: from pyspark.mllib.stat.Statistics import colStats [as alias]
# Summary stats of data
# source: https://github.com/apache/spark/blob/master/examples/src/main/python/mllib/summary_statistics_example.py
from __future__ import print_function
from pyspark import SparkContext
import numpy as np
from pyspark.mllib.stat import Statistics

if __name__ == "__main__":
    sc = SparkContext(appName="SummaryStatisticsExample")  # SparkContext
    bike = sc.textFile("s3://irm238finalproject/input/*-citibike-tripdata.csv")  # raw CSV lines
    # split each record and cast its fields to floats; colStats expects an RDD of
    # numeric vectors (this assumes all-numeric columns; real trip data would need
    # per-column parsing)
    bike = bike.map(lambda row: [float(x) for x in row.split(",")])
    # Compute column summary statistics.
    summary = Statistics.colStats(bike)
    print(summary.mean())  # a dense vector containing the mean value for each column
    print(summary.variance())  # column-wise variance
    print(summary.numNonzeros())  # number of nonzeros in each column
    sc.stop()
Example 10: summary_by_label
# Required import: from pyspark.mllib.stat import Statistics [as alias]
# Or: from pyspark.mllib.stat.Statistics import colStats [as alias]
def summary_by_label(raw_data, label):
    from pyspark.mllib.stat import Statistics
    label_vector_data = raw_data.map(parse_interaction_with_key).filter(lambda x: x[0] == label)
    return Statistics.colStats(label_vector_data.values())
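As a usage sketch (hedged: raw_data and the "normal." label follow Example 1 and are assumptions here), the helper returns an ordinary MultivariateStatisticalSummary:
# Hypothetical usage, with raw_data as the labeled KDD-style RDD from Example 1:
normal_sum = summary_by_label(raw_data, "normal.")
print("mean duration for 'normal.': {}".format(round(float(normal_sum.mean()[0]), 3)))
print("count for 'normal.': {}".format(normal_sum.count()))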
Example 11: int
# Required import: from pyspark.mllib.stat import Statistics [as alias]
# Or: from pyspark.mllib.stat.Statistics import colStats [as alias]
# getWeek parses a "MM/DD/YYYY" date string and returns its ISO week number;
# the excerpt begins mid-function, so the def line and month slice are assumed
def getWeek(x):
    month = int(x[0:2])
    day = int(x[3:5])
    year = int(x[6:10])
    return datetime.date(year, month, day).isocalendar()[1]

violent = ["ASSAULT", "BATTERY", "CRIM SEXUAL ASSAULT", "DOMESTIC VIOLENCE", "HOMICIDE", "KIDNAPPING"]

def setFlags(x):
    # (non_violent, violent) indicator pair
    if x in violent:
        return (0, 1)
    else:
        return (1, 0)

# parts is an RDD of parsed CSV rows, defined earlier in the original script
beats = parts.map(lambda p: (p[10], p[2][6:10], getWeek(p[2]), 1, setFlags(p[5])))
beats2 = beats.filter(lambda x: x[1] == "2015").map(lambda x: ((x[0], x[2]), (x[3], x[4][0], x[4][1])))
beats3 = beats2.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2]))
# standardize the week numbers and counts using column means and variances
standard_vars = beats3.map(lambda row: Vectors.dense((row[0][1], row[1][0], row[1][1], row[1][2])))
summary = Statistics.colStats(standard_vars)
mean_wn = summary.mean()[0]
sd_wn = math.sqrt(summary.variance()[0])
mean_counts = list(summary.mean()[1:4])
sd_counts = list(np.sqrt(summary.variance()[1:4]))
beats_standard = beats3.map(lambda x: (x[0][0],
                                       (x[0][1] - mean_wn) / sd_wn,
                                       (x[1][0] - mean_counts[0]) / sd_counts[0],
                                       (x[1][1] - mean_counts[1]) / sd_counts[1],
                                       (x[1][2] - mean_counts[2]) / sd_counts[2]))
beats_list = beats_standard.map(lambda x: ((x[0]), 1)).keys().distinct().collect()
beats_list = beats_list[0:50]

def parsePoint(fields):
    values = [float(x) for x in fields]
    return LabeledPoint(values[0], values[1:])

def deNorm(val, mean, sd):
    return val * sd + mean

maxWeek = (21 - mean_wn) / sd_wn
curWeek = (20 - mean_wn) / sd_wn
Example 12: print
# Required import: from pyspark.mllib.stat import Statistics [as alias]
# Or: from pyspark.mllib.stat.Statistics import colStats [as alias]
# assumes module-level: an existing SparkContext sc; from pyspark.mllib.linalg import Vectors
file = sc.textFile("Process_Data/SuperFile/superfile.dat")
row = file.map(lambda line: line.split(' ')[1:]).map(lambda xs: [float(x) for x in xs])
row_list = row.collect()  # transform to a local list
print(row_list)
# build a 38-row matrix of dense vectors
w, h = 1, 38
new_list = [[0 for x in range(w)] for y in range(h)]
for i in range(len(row_list)):
    new_list[i][:] = Vectors.dense(row_list[i])
rows = sc.parallelize(new_list)
print(rows)
summary = Statistics.colStats(rows)
print("mean:", summary.mean())
print("variance:", summary.variance())
print("max:", summary.max())
print("min:", summary.min())
print("non-zeros:", summary.numNonzeros())
print(" ")
print(" ")
print("correlation matrix:")
print(" ")
print(Statistics.corr(rows, method="pearson"))
Example 13: scale
# Required import: from pyspark.mllib.stat import Statistics [as alias]
# Or: from pyspark.mllib.stat.Statistics import colStats [as alias]
# assumes module-level: import time; import pyspark; import numpy as np;
# from math import exp; from scipy.spatial.distance import euclidean;
# from sklearn.preprocessing import scale; plus the reader and chunk helpers.
# The excerpt begins inside a pairwise-similarity helper:
    d1.signal = scale(d1.signal, axis=1)
    reader.seek(j)
    d2 = reader.read()
    d2.signal = scale(d2.signal, axis=1)
    return i, j, exp(-euclidean(d1.signal[7], d2.signal[7]) ** 2)

if __name__ == '__main__':
    counts = []
    sc = pyspark.SparkContext()
    cchunks = list(chunk_combinations(0, 8192, 512))
    start = time.time()
    fit_rdd = sc.parallelize(chunks(0, 8182, 512)).flatMap(chunk_fits).sortByKey()
    fit_rdd.saveAsTextFile('fit_rdd.txt')
    fit_rdd.cache()
    # standardize the fit parameters column-wise
    stats = Statistics.colStats(fit_rdd.values())
    means = stats.mean()
    scales = np.sqrt(stats.variance())
    fit_rdd = fit_rdd.mapValues(lambda x: (x - means) / scales)
    # values = fit_rdd.values()
    # values.cache()
    # km = KMeans().train(values, 3)
    # predictions = km.predict(values)
    # with open('predictions.txt', 'w') as f:
    #     f.write('index,p0,p1,p2,p3,res,category')
    #     for temp, pred in izip(fit_rdd.collect(), predictions.collect()):
    #         key, value = temp
    #         f.write('\n%i,%f,%f,%f,%f,%f,%i' % (key, value[0], value[1], value[2], value[3], value[4], pred))
    #     pass
    # print km.clusterCenters
    # rdd = sc.parallelize(cchunks)
Example 14: compute_mean
# Required import: from pyspark.mllib.stat import Statistics [as alias]
# Or: from pyspark.mllib.stat.Statistics import colStats [as alias]
def compute_mean(tx_data_rdd):
    # column-wise mean over an RDD of numeric vectors
    summary = Statistics.colStats(tx_data_rdd)
    return summary.mean()
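A usage sketch may help (hedged: the RDD contents below are illustrative assumptions; compute_mean simply forwards to colStats):
# Hypothetical usage, with a small RDD of 2-column transaction vectors:
tx_data_rdd = sc.parallelize([[100.0, 1.0], [200.0, 2.0], [300.0, 3.0]])
print(compute_mean(tx_data_rdd))  # -> [200.0, 2.0]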
Example 15: float
# Required import: from pyspark.mllib.stat import Statistics [as alias]
# Or: from pyspark.mllib.stat.Statistics import colStats [as alias]
# assumes module-level: from pyspark.mllib.tree import RandomForest;
# from pyspark.mllib.linalg import Vectors; plus trainingData, testData, featuresDic, maxBins
model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo=featuresDic,
                                    numTrees=10, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=5, maxBins=maxBins)
### Evaluate
# Evaluate the model on test instances and compute the test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1]))\
    .sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest model:')
# print(model.toDebugString())
### Compute R2
SSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1])).sum()
summary = Statistics.colStats(testData.map(lambda x: Vectors.dense(x.label)))
meanY = float(summary.mean()[0])
# Alternative for the mean:
# testData.map(lambda x: Vectors.dense(x.label)).mean()
SST = testData.map(lambda y: (y.label - meanY) ** 2).sum()
n = float(testData.count())
params = 3
Rsqrd = 1 - SSE / SST
RsqrdAdj = 1 - (SSE / (n - params)) / (SST / (n - 1))
print('R-squared: {0}'.format(Rsqrd))
print('R-squared Adj: {0}'.format(RsqrdAdj))