

Python SparkContext.stop Method Code Examples

This article collects typical usage examples of the Python method pyspark.SparkContext.stop. If you are wondering how SparkContext.stop is used in practice, or are looking for concrete examples of calling it, the curated code samples below should help. You can also explore further usage examples of the containing class, pyspark.SparkContext.


The following presents 15 code examples of the SparkContext.stop method, sorted by popularity by default.
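
Before the project-specific examples, here is a minimal, hypothetical sketch (not taken from any of the projects below) of the usual pattern: wrap the Spark work in try/finally so that stop() releases the cluster resources even when the job fails.

from pyspark import SparkContext

sc = SparkContext(appName="StopExample")
try:
    # Any Spark work goes here.
    total = sc.parallelize(range(10)).sum()
    print(total)
finally:
    # Always release the context, even if the job above raised an exception.
    sc.stop()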

Example 1: wordcount

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
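# Note: this snippet also uses `add`, presumably imported at the top of the
# original wordcount.py as `from operator import add` (used by reduceByKey below).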
def wordcount(input_path, output_path):
    """ Computes a histogram of word occurences from a set of tweets.

    Args: 
      input_path: the name of the file to read, 
      output_path: the name of the file to output to.
    """

    # Initiate a SparkContext instance.
    sc = SparkContext(appName="PythonWordCount")
    
    # Load the input file as a Spark collection.
    lines = sc.textFile(input_path, 1)

    # This is a map-reduce.
    # The map tokenizes the tweets into words and initializes counts for each word.
    # The reduce sums the counts, producing a histogram.
    # The histogram in the Insight example was sorted alphabetically, so we're doing a sort here.
    counts = lines.flatMap(lambda x: x.split(' ')) \
                  .map(lambda x: (x, 1)) \
                  .reduceByKey(add) \
                  .sortByKey()
    
    # Convert the output to a Python list.
    output = counts.collect()

    # Write the list of words and their count to an output file
    with open(output_path, "w") as f:
        print "Writing to: " + output_path
        for (word, count) in output:
            f.write("{:<50}{:10}\n".format(word, str(count)))

    # Stop the SparkContext instance.
    sc.stop()
Author: bsyouness, Project: InsightChallenge, Lines: 36, Source: wordcount.py
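
A hypothetical driver for this function (the file names below are invented for illustration):

if __name__ == "__main__":
    # Hypothetical paths; the original project supplies real tweet data here.
    wordcount("tweets.txt", "word_histogram.txt")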

Example 2: KMeansModel

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def KMeansModel(dataPath, label, k, character, master):
    sc = SparkContext(master)
    data = sc.textFile(dataPath).map(lambda line: line.replace(character, ','))

    if label == 0:
        label_sum = data.map(lambda line: line.split(',')).map(lambda data: (float(data[0]), 1)).reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')).map(lambda data: float(data[0])).collect()        
        train_data = data.map(lambda line: line.split(',')).map(lambda x: map(lambda part: float(part), x[1:len(x)]))
    else:
        label_sum = data.map(lambda line: line.split(',')).map(lambda data: (float(data[-1]), 1)).reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')).map(lambda data: float(data[-1])).collect()        
        train_data = data.map(lambda line: line.split(',')).map(lambda x: map(lambda part: float(part) if part is not None else '', x[:len(x) - 1]))
    model = km.train(train_data, k)
    predict_data = train_data.collect()
    train = len(predict_data)
    acc = 0
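    # For each true label, count how its points are spread over the k predicted
    # clusters and credit the largest cluster (a majority-vote purity measure).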
    
    for i in range(len(label_sum)):
        ksum = np.zeros(k, dtype = int)
        cur_label = label_sum[i][0]
        for j in range(train):
            if label[j] == cur_label:
                ksum[model.predict(predict_data[j])] += 1
        acc += max(ksum)

    string = "KMeans Result: \n"
    center = model.centers
    for i in range(k):
        cur = str(i) + ":" + str(center[i]) + '\n'
        string += cur  
    string = string + "Acc: " + str((float(acc)/train) * 100) + "%"    
    sc.stop()
    return string
Author: Tomlong, Project: MLlib-UI, Lines: 35, Source: mlKmeans.py

Example 3: ZeppelinReporterTest

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
class ZeppelinReporterTest(unittest.TestCase):
    def setUp(self):
        self.sc = SparkContext()
        self.sql = SQLContext(self.sc)
        self.df = self.sql.createDataFrame([(1, "a"), (1, None), (3, "c")])

    def tearDown(self):
        self.sc.stop()

    def test_output(self):
        with patch("pyddq.reporters.get_field") as get_field:
            baos = ByteArrayOutputStream()
            baos.jvm = self.df._sc._jvm

            get_field.return_value = baos.jvm_obj
            check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
            z = Mock()
            reporter = ZeppelinReporter(z)
            check.run([reporter])
            expected_output = """
%html
</p>
<h4>Checking [_1: bigint, _2: string]</h4>
<h5>It has a total number of 2 columns and 3 rows.</h5>
<table>
<tr><td style="padding:3px">&#10060;</td><td style="padding:3px">Column _1 is not a key (1 non-unique tuple).</td></tr>
<tr><td style="padding:3px">&#9989;</td><td style="padding:3px">Columns _1, _2 are a key.</td></tr>
</table>
<p hidden>
""".strip()
            self.assertEqual(baos.get_output(), expected_output)
Author: firemonk9, Project: drunken-data-quality, Lines: 33, Source: test_reporters.py

Example 4: __init__

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
class SparkContextFactory:
  def __init__(self):
    # On Windows the environment variables could not be read directly, so they
    # were set here instead (now commented out):
    ##os.environ["SPARK_HOME"] = "C:\Spark"
    ##os.environ["HADOOP_CONF_DIR"] = "C:\hdp\bin"
    ##sys.path.append("C:\Spark\python")
    ##sys.path.append("C:\Spark\bin")

    # specify spark home
    os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark"
    # specify pyspark path so its libraries can be accessed by this application
    sys.path.append("/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark/python")
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SQLContext

    self.conf = SparkConf().setMaster("yarn-client")
    self.conf.setAppName("MrT")
    self.conf.set("spark.executor.memory", "5g")
    self.conf.set("spark.driver.memory", "10g")

    self.sc = SparkContext(conf = self.conf, pyFiles =
    ["ComputeCovHistory.py", "go.py", "risk_DSconvert.py", "ewstats.py", "ewstatsRDD.py", "ewstatswrap.py"])

    """
    toDF method is a monkey patch executed inside SQLContext constructor
    so to be able to use it you have to create a SQLContext first
    """
    self.sqlContextInstance = SQLContext(self.sc)


  def disconnect(self):
    self.sc.stop()
Author: howardx, Project: pyspark, Lines: 35, Source: risk_SparkContextFactory.py
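
Under the assumptions of the snippet above (the CDH paths exist and the listed pyFiles are present), a hypothetical usage sketch might look like this; rdd.toDF only works because the SQLContext was created in __init__:

factory = SparkContextFactory()
rdd = factory.sc.parallelize([(1, "a"), (2, "b")])
df = rdd.toDF(["id", "value"])   # toDF is monkey-patched onto RDD once a SQLContext exists
df.show()
factory.disconnect()             # calls sc.stop()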

Example 5: test

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def test():
    sc = SparkContext(master = spark_addr, appName= app_name)
#    lines = sc.textFile(hdfs_addr + file_path1, 4)
    rdd = sc.textFile(hdfs_addr + file_path1, 2).map(lambda line:format_list(line)).cache()
#    rdd = sc.parallelize(test_list,4).cache()
    supp = float(rdd.count())*supp_rate
    sam_rdd = rdd.sample(False,0.01,1000)
    count = sam_rdd.count()
    print "count = ",count
    sc.stop()
    return
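    # NOTE: because of the early return above, the remaining steps below never execute in this version.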

    item = create_item(rdd)
 #   freq(rdd,item,3)
    two = item_plus(sc,item)
    out = two.collect()
    freq(rdd,two,2)
    three = item_plus(sc,two)
    out3 = three.collect()
    output = freq(rdd,three,0).collect()
    print "##########output#############"
    print "key = ",item.collect()
    print "two = ",out
    print "three = ",output
    print "#############################"
    sc.stop()
Author: optimus2014, Project: pyspark, Lines: 28, Source: ap1.py

Example 6: solve_puzzle

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def solve_puzzle(master, output, height, width, slaves):
    global HEIGHT, WIDTH, level
    HEIGHT=height
    WIDTH=width
    level = 0

    sc = SparkContext(master, "python")

    """ YOUR CODE HERE """
    """ YOUR MAP REDUCE PROCESSING CODE HERE """
    solution=Sliding.solution(WIDTH, HEIGHT)
    sol = Sliding.board_to_hash(WIDTH, HEIGHT, solution)
    data = sc.parallelize([(sol,level),])
    counter = 0
    curLen = 1 
    while(counter < curLen):
        level += 1
        data = data.flatMap(bfs_flat_map)
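        # Every 12 levels the frontier is repartitioned across PARTITION_COUNT
        # partitions, and every 6 levels its size is re-counted to detect when it
        # stops growing.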
        

        if (level% 12 == 0):
            data = data.partitionBy(PARTITION_COUNT)
        data = data.reduceByKey(bfs_reduce)
        if (level% 6 == 0):
            counter = curLen
            curLen = data.count()
        
        
    """ YOUR OUTPUT CODE HERE """
    data.coalesce(slaves).saveAsTextFile(output)
    sc.stop()
Author: VictoriaSnow, Project: CS-Projects, Lines: 33, Source: SlidingBfsSpark.py

Example 7: solve_puzzle

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def solve_puzzle(master, output, height, width, slaves):
    global HEIGHT, WIDTH, level
    HEIGHT=height
    WIDTH=width
    level = 0

    sc = SparkContext(master, "python")

    """ YOUR CODE HERE """
    NUM_WORKERS = slaves

    sol = Sliding.solution(WIDTH, HEIGHT)
    """ MAP REDUCE PROCESSING CODE HERE """
    level_pos = sc.parallelize((make_state(level, sol),))
    prev_size, size = 0, 1

    while prev_size != size:
        level += 1
        if level % 10 == 0:
            level_pos = level_pos.partitionBy(PARTITION_COUNT)
        level_pos = level_pos.flatMap(bfs_flat_map).reduceByKey(bfs_reduce)
        prev_size = size
        size = level_pos.count()

    """ OUTPUT CODE HERE """
    level_pos = level_pos.map(unhash_board)
    level_pos.coalesce(NUM_WORKERS).saveAsTextFile(output)

    sc.stop()
Author: hansongcal, Project: CS61C, Lines: 31, Source: SlidingBfsSpark.py

Example 8: getSensitiveNews

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def getSensitiveNews(request):
    """7.查询敏感信息"""
    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    sc.stop()
Author: JallyHe, Project: networkPublicOpinionAnalysisSystem, Lines: 9, Source: views.py

Example 9: predictTextHotDegree

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def predictTextHotDegree(request):
    """8.文本热度预测"""
    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    sc.stop()
Author: JallyHe, Project: networkPublicOpinionAnalysisSystem, Lines: 9, Source: views.py

Example 10: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def main():

    master = 'local[1]'
    app_name = 'reduce_demo1'

    # print(range(0, 3))

    sc = SparkContext(master, app_name)

    # Test 1: normal case (single union)
    # rdd_list = [sc.parallelize(range(i * 3, (i+1) * 3)) for i in range(0,3)]
    # rdd_union = sc.union(rdd_list)
    # print(rdd_union.getNumPartitions())
    # result = rdd_union.map(fun_map_print)
    # result.count()

    # Test 2: two rounds of union
    rdd_list_outer = []
    for x in ['a', 'b', 'c']:
        rdd_list_inner = [sc.parallelize(map(lambda j: x + str(j),range(i * 3, (i+1) * 3))) for i in range(0,3)]
        rdd_union_inner = sc.union(rdd_list_inner)
        rdd_list_outer.append(rdd_union_inner)

    rdd_union_outer = reduce(lambda rddx, rddy: rddx.union(rddy), rdd_list_outer)
    result = rdd_union_outer.map(fun_map_print)
    result.count()

    sc.stop()
Author: tsingfu, Project: xuetangx-streaming-app, Lines: 30, Source: test_reduce_demo1.py

Example 11: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def main():
    spark_conf = SparkConf().setAppName("Different-Sampling data").setMaster('local[*]')
    spark_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf= spark_conf)
    GA.logInConsole(0, "input file read!")
    rdd = sc.textFile("/home/fatemeh/Data/saveData.txt",  minPartitions= 500, use_unicode=False)
    rdd.unpersist()
#     print('\nNumber of Partitions for this run: ', rdd.getNumPartitions())
    vectorRDD = rdd.map(lambda line: toVector(line, splitter = ' '))
    
    GA.logInConsole(0 , "Data Vectorized!")
    ss = list()
    GA.logInConsole(-1, 'Start the ensemble')
    GA.logInConsole(-10, "GA with range 3")
    ss.append(GA.parallel_GA_main(vectorRDD,sc, 5))
#     GA.logInConsole(-10, "GA with range 4")
#     ss.append(GA.parallel_GA_main(norm,sc, 4))
#     GA.logInConsole(-10, "GA with range 5")
#     ss.append(GA.parallel_GA_main(norm,sc, 5))
#     GA.logInConsole(-10, "GA with range 3 and Sampled data set")
#    sampleRDD = norm.sample(False, 0.6, seed=10)
#    ss.append(GA.parallel_GA_main(sampleRDD,sc, 3))
    print(ss)
    #selectedSS = voted_subsapces(ss)
#     SSD.outlierDetection(vectorRDD, ss)
    GA.logInConsole(100, "\nend of program")
    sc.stop()
Author: fchgithub, Project: OriginPySparkRepository, Lines: 29, Source: ODHD.py

Example 12: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def main(training_file,n):

    epochs = int(n)
    x,y,tags = read_training_data(training_file)
    v = {}
    sc = SparkContext(appName="parameterMixing")
    tags = sc.broadcast(tags)
    time0 = time.time()
    training_data = []
    for i in range(len(x)):
        training_data.append((x[i],y[i]))
    train_data = sc.parallelize(training_data).cache()
    for round in range(0,epochs):
        fv = sc.broadcast(v)
        feat_vec_list = train_data.mapPartitions(lambda t: perc_train(t, tags.value, fv.value))
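        # combineByKey builds (sum, count) pairs per feature so that the partial
        # weights from each partition can be averaged below (iterative parameter mixing).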
        feat_vec_list = feat_vec_list.combineByKey((lambda x: (x,1)),
                             (lambda x, y: (x[0] + y, x[1] + 1)),
                             (lambda x, y: (x[0] + y[0], x[1] + y[1]))).collect()

        for (feat, (a,b)) in feat_vec_list:
            v[feat] = float(a)/float(b)
    sc.stop()
    # Compute the weight vector using the Perceptron algorithm
    #trainer.perceptron_algorithm(5)
    print "iteration %d in %f seconds" %(iterations, time.time()-t0)
    # Write out the final weight vector
    write_weight_vector(v)
Author: aulfster, Project: CMPT-419, Lines: 29, Source: iter.py

Example 13: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def main():
    appName = "langPopCount;zl"
    
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores","3")
            .set("spark.executor.instance", "3")
            )
    sc = SparkContext(conf = conf)
    hc = HiveContext(sc)

    langTagList = ['<java>', '<javascript>', '<c>', '<c++>', '<c#>', '<python>', '<php>', '<css>', '<html>', '<objective-c>']
    resultrdd = sc.emptyRDD()
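    # Start from an empty RDD and union each language tag's per-year counts onto it.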

    for tag in langTagList:
        postCountdf = hc.sql("select creationdate, 1 as c from questionpost where tags like '%{tag}%' ".format(tag=tag))
        postCountOnYearrdd = postCountdf \
                                 .filter(postCountdf.creationdate != '__none__') \
                                 .withColumn('year', postCountdf.creationdate.substr(0,4)) \
                                 .drop('creationdate') \
                                 .groupBy('year').count() \
                                 .withColumnRenamed('count', 'c') \
                                 .repartition(1) \
                                 .sort('year', ascending=True) \
                                 .map(lambda _: "{tag} {year} {cnt}".format(tag=tag.strip('<>'), year=_.year, cnt=_.c))
        resultrdd = resultrdd.union(postCountOnYearrdd)

    resultrdd = resultrdd.repartition(1)
    resultrdd.saveAsTextFile('/sshomework_zl/popCount')

    sc.stop()
Author: retanoj, Project: ss_homework, Lines: 34, Source: langPopCount.py

Example 14: main

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
def main():
    sc = SparkContext(appName="matrix")

    n_partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2

    n = 4000

    v = Vector(sc, n_partitions, [x for x in range(n)])
    # w = Vector(sc, [2*x for x in range(n)])

    # vw = v.v_add_w(w)
    # print(vw.collect())

    m = Matrix(sc, n_partitions, [[x for x in range(n)] for _ in range(n)])

    time_start = time.time()

    for _ in range(5):
        mv = m.m_dot_v(v)
        # print(mv.rdd.take(10))
        mv.rdd.take(1000)

    print(time.time() - time_start)

    sc.stop()
Author: awlange, Project: brainsparks, Lines: 27, Source: mess1.py

Example 15: bmRun

# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import stop [as alias]
    def bmRun(self):
        """
        Connect DB from Spark and Run/Profile Query
        """
        #create output file for results
        print "Create benchmark output file for recoring..."
        file_out = open("/Users/mira67/Downloads/benchmark_output.txt", "w")
        print "start query evaluation, load tables from DB and register tables in Spark..."

        #load data with Spark
        with Timer() as tm:
            sc = SparkContext("local","penguin")
            #sc = SparkContext(master=local[2])
            sqlContext = SQLContext(sc)
             
            #queries test here, depends on queries to load table in memory
            df1 =sqlContext.read.jdbc(url=self.url, table = self.tbName[0],lowerBound = 0, upperBound = 350, numPartitions=200)#dbtable is variable
            df1.registerTempTable(self.tbName[0])

            df2 =sqlContext.read.jdbc(url=self.url, table = self.tbName[1],lowerBound = 0, upperBound = 350, numPartitions=200)#dbtable is variable
            df2.registerTempTable(self.tbName[1])

            #register helper functions for SQL
            sqlContext.registerFunction("MONTH", lambda x: x[5:7], StringType())#grab Month
            sqlContext.registerFunction("YEAR", lambda x: x[0:4], StringType())
            sqlContext.registerFunction("DAY", lambda x: x[8:10], StringType())

            rdf1 = sqlContext.sql("SELECT * FROM "+self.tbName[0])
            rdf2 = sqlContext.sql("SELECT * FROM " + self.tbName[1])
            sqlContext.registerDataFrameAsTable(rdf1, self.mtb[0])
            sqlContext.registerDataFrameAsTable(rdf2, self.mtb[1])

        mem_use = self.memory_usage_psutil()
        print "memory_use_load %s" %mem_use
        print "=> elasped load data: %s ms" % (tm.secs * 1000)

        #Query with Spark
        with Timer() as tm:
            #query
            # The tables queried here must already be registered (done above).
            rdf = sqlContext.sql(self.sqlStm)
            print "Data schema from query:"
            rdf.printSchema()
            #hist of BT values
            #Todo
        mem_use = self.memory_usage_psutil()
        print "memory_use_load %s" %mem_use
        print "=> elasped: %s ms" % (tm.secs * 1000)

        file_out.write("Query Time %s Memory %s\n" % (str(tm.secs * 1000),str(mem_use))) 
                
        #example enabled
        day1 = sqlContext.sql("SELECT * FROM ssmi t1, map t2 WHERE t1.DATE BETWEEN '1990-01-01' AND '1990-01-01' AND t1.LOCID = t2.ID ORDER BY t1.LOCID")
        #call plot
        demoplt = qplt.queryPlot()
        demoplt.qMapDemo(day1)

        
        #stop sparkcontext
        sc.stop()
Author: mira67, Project: condense_engine, Lines: 62, Source: penguin_main.py


Note: The pyspark.SparkContext.stop examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their respective authors, and copyright of the source code remains with the original authors; for distribution and use, please refer to the corresponding project's license. Do not reproduce without permission.