

Python StreamingContext.checkpoint Method Code Examples

This article collects typical usage examples of the pyspark.streaming.StreamingContext.checkpoint method in Python. If you are wondering what StreamingContext.checkpoint does, how to call it, or what real-world code that uses it looks like, the curated examples below should help. You can also browse further usage examples of the enclosing class, pyspark.streaming.StreamingContext.


Fifteen code examples of the StreamingContext.checkpoint method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
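
Most of the examples below share the same basic pattern: build a StreamingContext inside a factory function, register a checkpoint directory with ssc.checkpoint(), and recover the context on restart via StreamingContext.getOrCreate(). The following minimal sketch illustrates that pattern end to end; the checkpoint path, socket source, and word-count pipeline are placeholders and are not taken from any of the projects listed below.

# Minimal, self-contained sketch of checkpoint-based recovery (illustrative only).
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

CHECKPOINT_DIR = "/tmp/spark-checkpoint-demo"    # hypothetical checkpoint directory

def create_context():
    conf = SparkConf().setMaster("local[2]").setAppName("CheckpointDemo")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)               # 10-second batch interval
    ssc.checkpoint(CHECKPOINT_DIR)               # enable checkpointing for this context
    lines = ssc.socketTextStream("localhost", 9999)
    counts = lines.flatMap(lambda l: l.split(" ")) \
        .map(lambda w: (w, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()
    return ssc

# On a clean start this calls create_context(); after a failure it rebuilds the
# context (including the DStream graph) from the checkpoint directory instead.
ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, create_context)
ssc.start()
ssc.awaitTermination()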

Example 1: main

# Required module import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def main():
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)   # Create a streaming context with batch interval of 10 sec
    ssc.checkpoint("checkpoint")
    geolocator = Nominatim()
    stream(ssc, geolocator, 100)
Developer: HackerPack | Project: disasterManagement | Lines: 9 | Source: twitterStream.py

Example 2: start_spark

# Required module import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def start_spark(timeout=None, max_items_per_rdd_sent=None):
    sc = SparkContext("local[4]", "twitter.trending")
    ssc = StreamingContext(sc, 5)

    ssc.checkpoint('hdfs://localhost:9000/user/spark/checkpoint/')

    kafka_params = {
        'zookeeper.connect': config.get('zookeeper', 'host'),
        'group.id': config.get('kafka', 'group_id'),
        'metadata.broker.list': config.get('kafka', 'hosts')
    }

    ksc = KafkaUtils.createDirectStream(ssc,
                                        [config.get('kafka', 'topic')],
                                        kafka_params)

    hashtag_counts = get_word_counts(ksc)
    filtered_tweet_count = filter_tweets(hashtag_counts)
    send_dstream_data(filtered_tweet_count, max_items_per_rdd_sent)
    ssc.start()
    if timeout:
        ssc.awaitTermination(timeout)
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
    else:
        ssc.awaitTermination()
Developer: joychugh | Project: learning-kafka | Lines: 27 | Source: spark_example.py
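
Example 2 depends on helpers such as get_word_counts, filter_tweets, and send_dstream_data that are outside the excerpt. As a rough illustration of what the first one might look like (the hashtag logic here is an assumption, not the project's actual code):

# Hypothetical sketch of get_word_counts: take the (key, value) pairs produced by
# KafkaUtils.createDirectStream, keep the message value, and count hashtags per batch.
def get_word_counts(kafka_stream):
    return kafka_stream.map(lambda kv: kv[1]) \
        .flatMap(lambda text: [w for w in text.split() if w.startswith('#')]) \
        .map(lambda tag: (tag, 1)) \
        .reduceByKey(lambda a, b: a + b)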

Example 3: createStreamingContext

# Required module import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def createStreamingContext():

    # Create a local StreamingContext with two working thread and batch interval of 1 second
    sc = SparkContext("spark://%s:7077" % MASTER_NAME, appName="GlutenTweet", pyFiles=PYFILES)
    ssc = StreamingContext(sc, 2)

    # Create a DStream of raw data
    raw = ssc.socketTextStream(MASTER_IP, 9999)

    # Convert into models
    tweets = raw.map(lambda r: Tweet(raw_json=r))

    # Store models
    tweets.foreachRDD(storeTweetsRDD)

    # Sliding window analysis
    window = tweets.window(20*60, 30)
    hashtagCounts = analysisHahtagCount(window)
    streamTop(hashtagCounts).pprint()

    # Keyword extraction - note tweets is immutable
    tweetsKeyword = tweets.map(lambda t: keywordExtraction(t))

    # Update models
    tweetsKeyword.foreachRDD(updateTweetsRDD)

    # Sliding window analysis
    window2 = tweetsKeyword.window(20*60, 30)
    keywordCounts = analysisKeywordCount(window2)
    streamTop(keywordCounts).pprint()

    ssc.checkpoint(CHECKPOINT_DIR)
    return ssc
Developer: ecesena | Project: spark-tutorial | Lines: 35 | Source: app.py

Example 4: main

# Required module import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def main():
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)   # Create a streaming context with batch interval of 10 sec
    ssc.checkpoint("checkpoint")
    pwords = load_wordlist("positive.txt")
    nwords = load_wordlist("negative.txt")
    counts = stream(ssc, pwords, nwords, 100)
    make_plot(counts)
Developer: prutha28 | Project: Twitter-Sentiment-Analysis | Lines: 11 | Source: twitterStream.py
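
Example 4's load_wordlist helper is not shown in the excerpt; a plausible minimal version (assumed, not the project's code) reads one word per line into a set:

# Hypothetical sketch of load_wordlist: one word per line, returned as a set
# for fast membership tests when classifying tweet words.
def load_wordlist(filename):
    with open(filename) as f:
        return set(line.strip() for line in f if line.strip())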

Example 5: functionToCreateContext

# Required module import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def functionToCreateContext():
    sc = SparkContext(appName="StreamingExampleWithKafka")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
    kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).updateStateByKey(updateFunction)
    counts.pprint()
    return ssc
Developer: bithu30 | Project: myRepo | Lines: 12 | Source: streamingWordCountWithState.py
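
Examples like this one pass an updateFunction into updateStateByKey without including it in the excerpt; checkpointing is mandatory for such stateful transformations, which is why ssc.checkpoint() is called first. A minimal state-update function for integer counts (illustrative, following the Spark documentation rather than this project) looks like:

# Hypothetical sketch of updateFunction: merge the new counts for a key into the
# running total that updateStateByKey keeps in checkpointed state.
def updateFunction(new_values, running_count):
    if running_count is None:
        running_count = 0
    return sum(new_values, running_count)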

Example 6: createContext

# Required module import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def createContext():
    uBATCH_INTERVAL = 10
    sc = SparkContext(SPARK_MASTER, appName="StreamingKafka")
    sc.broadcast(batchUserPostDict)
    sc.broadcast(batchPostUserDict)
    #sc = SparkContext("local[*]", appName="StreamingKafka")
    # streaming batch interval of 5 sec first, and reduce later to 1 sec or lower
    ssc = StreamingContext(sc, uBATCH_INTERVAL)
    ssc.checkpoint(CHECKPOINT_DIR)   # set checkpoint directory in HDFS
    #ssc.checkpoint(10 * uBATCH_INTERVAL)
    return ssc

ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, createContext)
Developer: lingding0 | Project: HottestTopicOnReddit | Lines: 15 | Source: hotred_stream.py

Example 7: main

# Required module import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def main():
    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set("spark.dynamicAllocation.enabled", "true")
    sc = SparkContext(conf = conf)
    ssc = StreamingContext(sc, 1) # Stream every 1 second
    ssc.checkpoint("checkpoint")

    # Clear the cassandra table
    init_cassandra().execute('TRUNCATE {}'.format(top_airports_table))

    stream_kafka(ssc)
Developer: karthikBG | Project: AviationAnalytics | Lines: 15 | Source: 2.2.TopDestinationsByAirport.py
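
The init_cassandra helper used above is outside the excerpt. Assuming the DataStax Python driver, a minimal version might look like the following (host and keyspace are placeholders):

# Hypothetical sketch of init_cassandra using the cassandra-driver package.
from cassandra.cluster import Cluster

def init_cassandra(hosts=('127.0.0.1',), keyspace='aviation'):
    cluster = Cluster(list(hosts))
    return cluster.connect(keyspace)   # returns a Session; .execute() runs CQL such as TRUNCATE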

Example 8: createContext

# Required module import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def createContext():

        conf = SparkConf().setMaster('spark://{}:7077'.format(MASTER_URL)).set('spark.executor.memory', '2g')
        sc = SparkContext(conf=conf)

        ssc = StreamingContext(sc, STREAMING_INTERVAL)
        lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))

        ssc.checkpoint(CHECKPOINT_DIR)

        # main split-combine-apply logic put here
        pairs = lines.map(lambda x: x.split(",")).map(lambda x: (x[8], 1))
        runningCounts = pairs.updateStateByKey(updateFunction)

        sortedCounts = runningCounts.transform(lambda rdd: rdd.sortBy(lambda pair: pair[1], ascending=False))  # sort (airport, count) pairs by count, descending
Developer: paullo0106 | Project: cloud_computing_capstone | Lines: 17 | Source: streaming_consumer.py

Example 9: createStreamingContext

# Required module import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def createStreamingContext():
    conf = SparkConf().setMaster("local[2]").setAppName("amqp_temperature")
    conf.set("spark.streaming.receiver.writeAheadLog.enable", "true")

    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("/tmp/spark-streaming-amqp")

    receiveStream = AMQPUtils.createStream(ssc, "localhost", 5672, "temperature")

    temperature = receiveStream.map(getTemperature)
    max = temperature.reduceByWindow(getMax, None, 5, 5)

    max.pprint()

    return ssc
Developer: eformat | Project: dstream-amqp | Lines: 18 | Source: amqp_temperature.py
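
The getTemperature and getMax helpers are not part of the excerpt. Assuming each AMQP message body arrives as a numeric string, minimal versions could be:

# Hypothetical sketches of the helpers used with the AMQP receiver stream.
def getTemperature(message):
    return float(message)    # assumes the message body is a plain numeric temperature

def getMax(a, b):
    return max(a, b)         # associative reduce function passed to reduceByWindow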

Example 10: functionToCreateContext

# Required module import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def functionToCreateContext():
    sc = SparkContext("local[*]", "streaming_part")
    sc.setLogLevel("ERROR")
    
    ssc = StreamingContext(sc, 5)
    
    data_from_ticket_mechine = ssc.socketTextStream("localhost", 9999)
    data_from_camera_mechine = ssc.socketTextStream("localhost", 9998)
    
    
    # core processing: preprocess each stream, maintain per-key state, and hand each RDD to its handler
    data_from_ticket_mechine.map(ticket_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(ticket_mechine_RDD_handler)
    data_from_camera_mechine.map(camera_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(camera_mechine_RDD_handler)
    
    ssc.checkpoint(checkpointDirectory)   # set checkpoint directory
    return ssc
Developer: sklaw | Project: spark_project | Lines: 18 | Source: stream_handler_v1.py

Example 11: main

# Required module import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def main():
    global ssc

    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set('spark.streaming.stopGracefullyOnShutdown', True)

    sc = SparkContext(conf=conf)

    ssc = StreamingContext(sc, 1)  # Stream every 1 second
    ssc.checkpoint("/tmp/checkpoint")

    signal.signal(signal.SIGINT, stop_streaming)

    stream_kafka()
Developer: karthikBG | Project: AviationAnalytics | Lines: 18 | Source: 1.1.TopAirports.py
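
The stop_streaming handler registered with signal.signal is outside the excerpt; since ssc is declared global in main, a minimal handler (assumed, not the project's code) can simply stop it gracefully:

# Hypothetical sketch of stop_streaming: invoked on SIGINT, shuts the streaming
# context and the underlying SparkContext down gracefully.
def stop_streaming(signum, frame):
    if ssc is not None:
        ssc.stop(stopSparkContext=True, stopGraceFully=True)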

Example 12: functionToCreateContext

# Required module import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def functionToCreateContext():
    # new context
    conf = SparkConf()
    conf = conf.setAppName(APP_NAME)
    sc   = SparkContext(conf=conf)
    
    # http://stackoverflow.com/questions/24686474/shipping-python-modules-in-pyspark-to-other-nodes
    sc.addPyFile("common.py")
    
    # StreamingContext takes the SparkContext and the batch interval (10 seconds here)
    ssc = StreamingContext(sc, 10)
    
    # set checkpoint directory
    ssc.checkpoint(CHECKPOINT_DIR)
    
    # return streaming spark context
    return ssc
Developer: bunop | Project: ccc-capstone | Lines: 19 | Source: top10_airlines.py

Example 13: createContext

# Required module import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def createContext(brokers, topic, checkpointDir):
    # If you do not see this printed, that means the StreamingContext has been loaded
    # from the new checkpoint
    print("Creating new context")

    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 1)

    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    lines = kvs.map(lambda x: x[1])
    wordCounts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a+b)

    #wordCounts.foreachRDD(echo)
    wordCounts.pprint()
    ssc.checkpoint(checkpointDir)
    return ssc
Developer: obaidcuet | Project: spark | Lines: 19 | Source: direct_kafka_wordcount_with_checkpoint.py

Example 14: functionToCreateContext

# Required module import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def functionToCreateContext():
  # spark context config
  sc = SparkContext(appName="StreamingExampleWithKafka")
  ssc = StreamingContext(sc, 10)
  ssc.checkpoint("checkpoint")
  
  # kafka
  opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
  kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
  # processing
  lines = kvs.map(lambda x: x[1])
  counts = lines.flatMap(lambda line: line.split(" ")) \
   .map(lambda word: (word, 1)) \
   .updateStateByKey(updateFunction) \
   .map(toStringList) \
   .foreachRDD(lambda rdd: rdd.saveAsNewAPIHadoopDataset(conf=conf, keyConverter=keyConv, valueConverter=valueConv))
  return ssc
Developer: SimiY | Project: hadoop-ops-course | Lines: 19 | Source: streamingWordCountToHBase.py

Example 15: createContext

# Required module import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def createContext():

    conf = SparkConf().setMaster('spark://{}:7077'.format(MASTER_URL)).set('spark.executor.memory', '2g')
    sc = SparkContext(conf=conf)

    ssc = StreamingContext(sc, STREAMING_INTERVAL)
    lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))

    ssc.checkpoint(CHECKPOINT_DIR)

    # main split-combine-apply logic put here
    # filter out header and other invalid rows
    rdd = lines.map(lambda line: line.split(',')).filter(lambda words: len(words) > 56)

    # extract first field (for filtering header), Carrier, Orig, Dest, and delay fields
    rdd2 = rdd.map(lambda x: (x[0], x[8], x[11], x[18], x[52], x[53], x[54], x[55], x[56])) \
        .map(lambda line: [str(w.replace('"', '')) for w in line]) \
        .filter(lambda row: row[0] != 'Year' and any(row[4:]))
    rdd2.pprint()

    # sum up delay fields for each row
    sum_delay_rdd = rdd2.map(sum_delay)
    sum_delay_rdd.pprint()

    # sum up delay for each (orig, dest, carrier) pair
    combined_rdd = sum_delay_rdd.updateStateByKey(updateFunction)
    combined_rdd.pprint()

    # calculate avg delay per carrier, re-keyed by (orig, dest)
    avg_rdd = combined_rdd.transform(
        lambda rdd: rdd.map(lambda kv: ((kv[0][0], kv[0][1]), (kv[1][0] / float(kv[1][1]), kv[0][2]))))
    avg_rdd.pprint()

    # group by (orig, dest)
    avg_rdd_by_route = avg_rdd.groupByKey()

    # sort by on-time performance for each (orig, dest) route and take top 10 carriers
    route_sorted_carrier = avg_rdd_by_route.mapValues(lambda x: sorted(list(x))[:10])
    aa = route_sorted_carrier.flatMapValues(lambda x: x)

    aa.pprint()
    aa.foreachRDD(process)

    return ssc
Developer: paullo0106 | Project: cloud_computing_capstone | Lines: 42 | Source: streaming2-3.py


Note: The pyspark.streaming.StreamingContext.checkpoint examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are excerpted from open-source projects contributed by their original authors; copyright of the source code remains with those authors, and any distribution or use should follow the license of the corresponding project. Do not reproduce this article without permission.