This article collects typical usage examples of the Python method pyspark.SparkContext.stop. If you are wondering what SparkContext.stop does, how to call it, or where to find sample code for it, the curated examples below should help. You can also read more about the enclosing class, pyspark.SparkContext.
A total of 15 code examples of SparkContext.stop are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the site recommend better Python samples.
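Most of the examples that follow share the same lifecycle: create a SparkContext, run one or more jobs on it, then call stop() so the driver releases its cluster resources. As a minimal sketch of that pattern (the app name and the toy job are placeholders, not taken from any of the examples below):

from operator import add
from pyspark import SparkContext

sc = SparkContext(appName="StopExample")  # hypothetical app name
try:
    # A trivial job: sum the integers 0..99 on the cluster.
    total = sc.parallelize(range(100)).reduce(add)
    print(total)
finally:
    # Always stop the context, even if the job above raises,
    # so executors and driver resources are released.
    sc.stop()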
Example 1: wordcount
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def wordcount(input_path, output_path):
    """Computes a histogram of word occurrences from a set of tweets.

    Args:
        input_path: the name of the file to read,
        output_path: the name of the file to output to.
    """
    # Initiate a SparkContext instance.
    sc = SparkContext(appName="PythonWordCount")
    # Load the input file as a Spark collection.
    lines = sc.textFile(input_path, 1)
    # This is a map-reduce job.
    # The map tokenizes the tweets into words and initializes a count for each word.
    # The reduce sums the counts, producing a histogram.
    # The histogram in the Insight example was sorted alphabetically, so we sort here as well.
    counts = lines.flatMap(lambda x: x.split(' ')) \
                  .map(lambda x: (x, 1)) \
                  .reduceByKey(add) \
                  .sortByKey()
    # Convert the output to a Python list.
    output = counts.collect()
    # Write the list of words and their counts to an output file.
    with open(output_path, "w") as f:
        print "Writing to: " + output_path
        for (word, count) in output:
            f.write("{:<50}{:10}\n".format(word, str(count)))
    # Kill the SparkContext instance.
    sc.stop()
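This snippet is excerpted from a larger script; the add passed to reduceByKey is presumably operator.add, imported at module level in the original source. A call along the following lines, with placeholder file names, would exercise it:

# Hypothetical invocation of the wordcount example above.
if __name__ == "__main__":
    wordcount("tweets.txt", "word_histogram.txt")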
Example 2: KMeansModel
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def KMeansModel(dataPath, label, k, character, master):
    sc = SparkContext(master)
    data = sc.textFile(dataPath).map(lambda line: line.replace(character, ','))
    if label == 0:
        label_sum = data.map(lambda line: line.split(',')).map(lambda data: (float(data[0]), 1)).reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')).map(lambda data: float(data[0])).collect()
        train_data = data.map(lambda line: line.split(',')).map(lambda x: map(lambda part: float(part), x[1:len(x)]))
    else:
        label_sum = data.map(lambda line: line.split(',')).map(lambda data: (float(data[-1]), 1)).reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')).map(lambda data: float(data[-1])).collect()
        train_data = data.map(lambda line: line.split(',')).map(lambda x: map(lambda part: float(part) if part is not None else '', x[:len(x) - 1]))
    model = km.train(train_data, k)
    predict_data = train_data.collect()
    train = len(predict_data)
    acc = 0
    for i in range(len(label_sum)):
        ksum = np.zeros(k, dtype=int)
        cur_label = label_sum[i][0]
        for j in range(train):
            if label[j] == cur_label:
                ksum[model.predict(predict_data[j])] += 1
        acc += max(ksum)
    string = "KMeans Result: \n"
    center = model.centers
    for i in range(k):
        cur = str(i) + ":" + str(center[i]) + '\n'
        string += cur
    string = string + "Acc: " + str((float(acc)/train) * 100) + "%"
    sc.stop()
    return string
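This snippet is also excerpted from a larger module; judging from the names it uses, the original presumably relies on imports along these lines (an assumption, not taken from the source):

from operator import add  # used by reduceByKey(add)
import numpy as np  # used for np.zeros
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans as km  # provides km.train(train_data, k)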
Example 3: ZeppelinReporterTest
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
class ZeppelinReporterTest(unittest.TestCase):
    def setUp(self):
        self.sc = SparkContext()
        self.sql = SQLContext(self.sc)
        self.df = self.sql.createDataFrame([(1, "a"), (1, None), (3, "c")])

    def tearDown(self):
        self.sc.stop()

    def test_output(self):
        with patch("pyddq.reporters.get_field") as get_field:
            baos = ByteArrayOutputStream()
            baos.jvm = self.df._sc._jvm
            get_field.return_value = baos.jvm_obj

            check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
            z = Mock()
            reporter = ZeppelinReporter(z)
            check.run([reporter])

            expected_output = """
%html
</p>
<h4>Checking [_1: bigint, _2: string]</h4>
<h5>It has a total number of 2 columns and 3 rows.</h5>
<table>
<tr><td style="padding:3px">❌</td><td style="padding:3px">Column _1 is not a key (1 non-unique tuple).</td></tr>
<tr><td style="padding:3px">✅</td><td style="padding:3px">Columns _1, _2 are a key.</td></tr>
</table>
<p hidden>
""".strip()

            self.assertEqual(baos.get_output(), expected_output)
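This test class is likewise an excerpt; the imports it depends on would presumably look roughly like the following (assumed from the names used, not copied from the source):

import unittest
from mock import patch, Mock  # unittest.mock on Python 3
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyddq.core import Check
from pyddq.reporters import ZeppelinReporter
from pyddq.streams import ByteArrayOutputStream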
Example 4: __init__
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
class SparkContextFactory:
    def __init__(self):
        # not sure why windows environment variable can't be read, I set it
        ##os.environ["SPARK_HOME"] = "C:\Spark"
        # not sure why windows environment variable can't be read, I set it
        ##os.environ["HADOOP_CONF_DIR"] = "C:\hdp\bin"
        ##sys.path.append("C:\Spark\python")
        ##sys.path.append("C:\Spark\bin")

        # specify spark home
        os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark"
        # specify pyspark path so its libraries can be accessed by this application
        sys.path.append("/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark/python")

        from pyspark import SparkContext, SparkConf
        from pyspark.sql import SQLContext

        self.conf = SparkConf().setMaster("yarn-client")
        self.conf.setAppName("MrT")
        self.conf.set("spark.executor.memory", "5g")
        self.conf.set("spark.driver.memory", "10g")

        self.sc = SparkContext(conf=self.conf, pyFiles=[
            "ComputeCovHistory.py", "go.py", "risk_DSconvert.py",
            "ewstats.py", "ewstatsRDD.py", "ewstatswrap.py"])

        """
        toDF method is a monkey patch executed inside SQLContext constructor
        so to be able to use it you have to create a SQLContext first
        """
        self.sqlContextInstance = SQLContext(self.sc)

    def disconnect(self):
        self.sc.stop()
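A caller would presumably use this factory roughly as follows (a sketch with a placeholder path, not part of the original source):

# Hypothetical usage of SparkContextFactory.
factory = SparkContextFactory()
df = factory.sqlContextInstance.read.json("hdfs:///data/example.json")  # placeholder path
df.show()
factory.disconnect()  # internally calls sc.stop()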
Example 5: test
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def test():
    sc = SparkContext(master=spark_addr, appName=app_name)
    # lines = sc.textFile(hdfs_addr + file_path1, 4)
    rdd = sc.textFile(hdfs_addr + file_path1, 2).map(lambda line: format_list(line)).cache()
    # rdd = sc.parallelize(test_list, 4).cache()
    supp = float(rdd.count()) * supp_rate
    sam_rdd = rdd.sample(False, 0.01, 1000)
    count = sam_rdd.count()
    print "count = ", count
    sc.stop()
    return
    # NOTE: the return above short-circuits the function; the code below is
    # unreachable and is kept here as it appears in the original snippet.
    item = create_item(rdd)
    # freq(rdd, item, 3)
    two = item_plus(sc, item)
    out = two.collect()
    freq(rdd, two, 2)
    three = item_plus(sc, two)
    out3 = three.collect()
    output = freq(rdd, three, 0).collect()
    print "##########output#############"
    print "key = ", item.collect()
    print "two = ", out
    print "three = ", output
    print "#############################"
    sc.stop()
Example 6: solve_puzzle
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def solve_puzzle(master, output, height, width, slaves):
    global HEIGHT, WIDTH, level
    HEIGHT = height
    WIDTH = width
    level = 0
    sc = SparkContext(master, "python")

    """ YOUR CODE HERE """
    """ YOUR MAP REDUCE PROCESSING CODE HERE """
    solution = Sliding.solution(WIDTH, HEIGHT)
    sol = Sliding.board_to_hash(WIDTH, HEIGHT, solution)
    data = sc.parallelize([(sol, level), ])
    counter = 0
    curLen = 1
    while counter < curLen:
        level += 1
        data = data.flatMap(bfs_flat_map)
        if level % 12 == 0:
            data = data.partitionBy(PARTITION_COUNT)
        data = data.reduceByKey(bfs_reduce)
        if level % 6 == 0:
            counter = curLen
            curLen = data.count()

    """ YOUR OUTPUT CODE HERE """
    data.coalesce(slaves).saveAsTextFile(output)
    sc.stop()
Example 7: solve_puzzle
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def solve_puzzle(master, output, height, width, slaves):
    global HEIGHT, WIDTH, level
    HEIGHT = height
    WIDTH = width
    level = 0
    sc = SparkContext(master, "python")

    """ YOUR CODE HERE """
    NUM_WORKERS = slaves
    sol = Sliding.solution(WIDTH, HEIGHT)

    """ MAP REDUCE PROCESSING CODE HERE """
    level_pos = sc.parallelize((make_state(level, sol),))
    prev_size, size = 0, 1
    while prev_size != size:
        level += 1
        if level % 10 == 0:
            level_pos = level_pos.partitionBy(PARTITION_COUNT)
        level_pos = level_pos.flatMap(bfs_flat_map).reduceByKey(bfs_reduce)
        prev_size = size
        size = level_pos.count()

    """ OUTPUT CODE HERE """
    level_pos = level_pos.map(unhash_board)
    level_pos.coalesce(NUM_WORKERS).saveAsTextFile(output)
    sc.stop()
Example 8: getSensitiveNews
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def getSensitiveNews(request):
    """7. Query sensitive news."""
    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    sc.stop()
Example 9: predictTextHotDegree
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def predictTextHotDegree(request):
    """8. Predict text hotness (popularity)."""
    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    sc.stop()
Example 10: main
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def main():
    master = 'local[1]'
    app_name = 'reduce_demo1'
    # print(range(0, 3))
    sc = SparkContext(master, app_name)

    # Test 1: single union
    # rdd_list = [sc.parallelize(range(i * 3, (i+1) * 3)) for i in range(0, 3)]
    # rdd_union = sc.union(rdd_list)
    # print(rdd_union.getNumPartitions())
    # result = rdd_union.map(fun_map_print)
    # result.count()

    # Test 2: union twice
    rdd_list_outer = []
    for x in ['a', 'b', 'c']:
        rdd_list_inner = [sc.parallelize(map(lambda j: x + str(j), range(i * 3, (i+1) * 3))) for i in range(0, 3)]
        rdd_union_inner = sc.union(rdd_list_inner)
        rdd_list_outer.append(rdd_union_inner)
    rdd_union_outer = reduce(lambda rddx, rddy: rddx.union(rddy), rdd_list_outer)
    result = rdd_union_outer.map(fun_map_print)
    result.count()
    sc.stop()
Example 11: main
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def main():
    spark_conf = SparkConf().setAppName("Different-Sampling data").setMaster('local[*]')
    spark_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=spark_conf)
    GA.logInConsole(0, "input file read!")
    rdd = sc.textFile("/home/fatemeh/Data/saveData.txt", minPartitions=500, use_unicode=False)
    rdd.unpersist()
    # print('\nNumber of Partitions for this run: ', rdd.getNumPartitions())
    vectorRDD = rdd.map(lambda line: toVector(line, splitter=' '))
    GA.logInConsole(0, "Data Vectorized!")
    ss = list()
    GA.logInConsole(-1, 'Start the ensemble')
    GA.logInConsole(-10, "GA with range 3")
    ss.append(GA.parallel_GA_main(vectorRDD, sc, 5))
    # GA.logInConsole(-10, "GA with range 4")
    # ss.append(GA.parallel_GA_main(norm, sc, 4))
    # GA.logInConsole(-10, "GA with range 5")
    # ss.append(GA.parallel_GA_main(norm, sc, 5))
    # GA.logInConsole(-10, "GA with range 3 and Sampled data set")
    # sampleRDD = norm.sample(False, 0.6, seed=10)
    # ss.append(GA.parallel_GA_main(sampleRDD, sc, 3))
    print(ss)
    # selectedSS = voted_subsapces(ss)
    # SSD.outlierDetection(vectorRDD, ss)
    GA.logInConsole(100, "\nend of program")
    sc.stop()
Example 12: main
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def main(training_file, n):
    epochs = int(n)
    x, y, tags = read_training_data(training_file)
    v = {}
    sc = SparkContext(appName="parameterMixing")
    tags = sc.broadcast(tags)
    time0 = time.time()
    training_data = []
    for i in range(len(x)):
        training_data.append((x[i], y[i]))
    train_data = sc.parallelize(training_data).cache()
    for round in range(0, epochs):
        fv = sc.broadcast(v)
        feat_vec_list = train_data.mapPartitions(lambda t: perc_train(t, tags.value, fv.value))
        feat_vec_list = feat_vec_list.combineByKey((lambda x: (x, 1)),
                                                   (lambda x, y: (x[0] + y, x[1] + 1)),
                                                   (lambda x, y: (x[0] + y[0], x[1] + y[1]))).collect()
        for (feat, (a, b)) in feat_vec_list:
            v[feat] = float(a) / float(b)
    sc.stop()
    # Compute the weight vector using the Perceptron algorithm
    # trainer.perceptron_algorithm(5)
    print "trained %d epochs in %f seconds" % (epochs, time.time() - time0)
    # Write out the final weight vector
    write_weight_vector(v)
Example 13: main
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def main():
    appName = "langPopCount;zl"
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores", "3")
            .set("spark.executor.instance", "3")
            )
    sc = SparkContext(conf=conf)
    hc = HiveContext(sc)
    langTagList = ['<java>', '<javascript>', '<c>', '<c++>', '<c#>', '<python>', '<php>', '<css>', '<html>', '<objective-c>']
    resultrdd = sc.emptyRDD()
    for tag in langTagList:
        postCountdf = hc.sql("select creationdate, 1 as c from questionpost where tags like '%{tag}%' ".format(tag=tag))
        postCountOnYearrdd = postCountdf \
            .filter(postCountdf.creationdate != '__none__') \
            .withColumn('year', postCountdf.creationdate.substr(0, 4)) \
            .drop('creationdate') \
            .groupBy('year').count() \
            .withColumnRenamed('count', 'c') \
            .repartition(1) \
            .sort('year', ascending=True) \
            .map(lambda _: "{tag} {year} {cnt}".format(tag=tag.strip('<>'), year=_.year, cnt=_.c))
        resultrdd = resultrdd.union(postCountOnYearrdd)
    resultrdd = resultrdd.repartition(1)
    resultrdd.saveAsTextFile('/sshomework_zl/popCount')
    sc.stop()
Example 14: main
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def main():
    sc = SparkContext(appName="matrix")
    n_partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
    n = 4000
    v = Vector(sc, n_partitions, [x for x in range(n)])
    # w = Vector(sc, [2*x for x in range(n)])
    # vw = v.v_add_w(w)
    # print(vw.collect())
    m = Matrix(sc, n_partitions, [[x for x in range(n)] for _ in range(n)])
    time_start = time.time()
    for _ in range(5):
        mv = m.m_dot_v(v)
        # print(mv.rdd.take(10))
        mv.rdd.take(1000)
    print(time.time() - time_start)
    sc.stop()
Example 15: bmRun
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def bmRun(self):
    """
    Connect DB from Spark and Run/Profile Query
    """
    # create output file for results
    print "Create benchmark output file for recording..."
    file_out = open("/Users/mira67/Downloads/benchmark_output.txt", "w")

    print "start query evaluation, load tables from DB and register tables in Spark..."
    # load data with Spark
    with Timer() as tm:
        sc = SparkContext("local", "penguin")
        # sc = SparkContext(master=local[2])
        sqlContext = SQLContext(sc)

        # queries test here, depends on queries to load table in memory
        df1 = sqlContext.read.jdbc(url=self.url, table=self.tbName[0], lowerBound=0, upperBound=350, numPartitions=200)  # dbtable is variable
        df1.registerTempTable(self.tbName[0])
        df2 = sqlContext.read.jdbc(url=self.url, table=self.tbName[1], lowerBound=0, upperBound=350, numPartitions=200)  # dbtable is variable
        df2.registerTempTable(self.tbName[1])

        # register helper functions for SQL
        sqlContext.registerFunction("MONTH", lambda x: x[5:7], StringType())  # grab Month
        sqlContext.registerFunction("YEAR", lambda x: x[0:4], StringType())
        sqlContext.registerFunction("DAY", lambda x: x[8:10], StringType())

        rdf1 = sqlContext.sql("SELECT * FROM " + self.tbName[0])
        rdf2 = sqlContext.sql("SELECT * FROM " + self.tbName[1])
        sqlContext.registerDataFrameAsTable(rdf1, self.mtb[0])
        sqlContext.registerDataFrameAsTable(rdf2, self.mtb[1])

    mem_use = self.memory_usage_psutil()
    print "memory_use_load %s" % mem_use
    print "=> elapsed load data: %s ms" % (tm.secs * 1000)

    # Query with Spark
    with Timer() as tm:
        # query
        rdf = sqlContext.sql(self.sqlStm)
        # need to register as table first
        print "Data schema from query:"
        rdf.printSchema()
        # hist of BT values
        # Todo

    mem_use = self.memory_usage_psutil()
    print "memory_use_load %s" % mem_use
    print "=> elapsed: %s ms" % (tm.secs * 1000)
    file_out.write("Query Time %s Memory %s\n" % (str(tm.secs * 1000), str(mem_use)))

    # example enabled
    day1 = sqlContext.sql("SELECT * FROM ssmi t1, map t2 WHERE t1.DATE BETWEEN '1990-01-01' AND '1990-01-01' AND t1.LOCID = t2.ID ORDER BY t1.LOCID")

    # call plot
    demoplt = qplt.queryPlot()
    demoplt.qMapDemo(day1)

    # stop sparkcontext
    sc.stop()