This article collects typical usage examples of the Python method pyspark.streaming.StreamingContext.checkpoint. If you have been wondering how StreamingContext.checkpoint works, how to call it, or what real-world uses of it look like, the curated code examples below should help. You can also browse further usage examples for the containing class, pyspark.streaming.StreamingContext.
The following shows 15 code examples of StreamingContext.checkpoint, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
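Before the individual examples, a minimal hedged sketch of the common pattern may help: checkpoint() is given a fault-tolerant directory before start(), and it becomes mandatory once stateful operations such as updateStateByKey are used. The directory path, socket source, and update_count helper below are illustrative assumptions, not details taken from any example on this page.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

def update_count(new_values, running_count):
    # Illustrative stateful update: keep a running word count
    return sum(new_values) + (running_count or 0)

def create_context():
    # Build everything inside a setup function so getOrCreate can replay it on recovery
    sc = SparkContext("local[2]", "CheckpointSketch")
    ssc = StreamingContext(sc, 10)  # 10-second batch interval
    ssc.checkpoint("/tmp/checkpoint-sketch")  # hypothetical checkpoint directory
    lines = ssc.socketTextStream("localhost", 9999)  # hypothetical text source
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .updateStateByKey(update_count)  # stateful op, so checkpointing is required
    counts.pprint()
    return ssc

# Recover from the checkpoint directory if it exists, otherwise create a fresh context
ssc = StreamingContext.getOrCreate("/tmp/checkpoint-sketch", create_context)
ssc.start()
ssc.awaitTermination()

Building the context inside a setup function and passing it to StreamingContext.getOrCreate is what lets a restarted driver recover the pipeline from the checkpoint, which is the same pattern Example 6 below uses.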
Example 1: main
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def main():
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)  # Create a streaming context with batch interval of 10 sec
    ssc.checkpoint("checkpoint")
    geolocator = Nominatim()
    stream(ssc, geolocator, 100)
Example 2: start_spark
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def start_spark(timeout=None, max_items_per_rdd_sent=None):
    sc = SparkContext("local[4]", "twitter.trending")
    ssc = StreamingContext(sc, 5)
    ssc.checkpoint('hdfs://localhost:9000/user/spark/checkpoint/')
    kafka_params = {
        'zookeeper.connect': config.get('zookeeper', 'host'),
        'group.id': config.get('kafka', 'group_id'),
        'metadata.broker.list': config.get('kafka', 'hosts')
    }
    ksc = KafkaUtils.createDirectStream(ssc,
                                        [config.get('kafka', 'topic')],
                                        kafka_params)
    hashtag_counts = get_word_counts(ksc)
    filtered_tweet_count = filter_tweets(hashtag_counts)
    send_dstream_data(filtered_tweet_count, max_items_per_rdd_sent)
    ssc.start()
    if timeout:
        ssc.awaitTermination(timeout)
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
    else:
        ssc.awaitTermination()
Example 3: createStreamingContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def createStreamingContext():
    # Create a StreamingContext on the cluster master with a batch interval of 2 seconds
    sc = SparkContext("spark://%s:7077" % MASTER_NAME, appName="GlutenTweet", pyFiles=PYFILES)
    ssc = StreamingContext(sc, 2)
    # Create a DStream of raw data
    raw = ssc.socketTextStream(MASTER_IP, 9999)
    # Convert into models
    tweets = raw.map(lambda r: Tweet(raw_json=r))
    # Store models
    tweets.foreachRDD(storeTweetsRDD)
    # Sliding window analysis
    window = tweets.window(20 * 60, 30)
    hashtagCounts = analysisHahtagCount(window)
    streamTop(hashtagCounts).pprint()
    # Keyword extraction - note tweets is immutable
    tweetsKeyword = tweets.map(lambda t: keywordExtraction(t))
    # Update models
    tweetsKeyword.foreachRDD(updateTweetsRDD)
    # Sliding window analysis
    window2 = tweetsKeyword.window(20 * 60, 30)
    keywordCounts = analysisKeywordCount(window2)
    streamTop(keywordCounts).pprint()
    ssc.checkpoint(CHECKPOINT_DIR)
    return ssc
Example 4: main
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def main():
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)  # Create a streaming context with batch interval of 10 sec
    ssc.checkpoint("checkpoint")
    pwords = load_wordlist("positive.txt")
    nwords = load_wordlist("negative.txt")
    counts = stream(ssc, pwords, nwords, 100)
    make_plot(counts)
Example 5: functionToCreateContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def functionToCreateContext():
    sc = SparkContext(appName="StreamingExampleWithKafka")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
    kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .updateStateByKey(updateFunction)
    counts.pprint()
    return ssc
Example 6: createContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def createContext():
    uBATCH_INTERVAL = 10
    sc = SparkContext(SPARK_MASTER, appName="StreamingKafka")
    sc.broadcast(batchUserPostDict)
    sc.broadcast(batchPostUserDict)
    #sc = SparkContext("local[*]", appName="StreamingKafka")
    # streaming batch interval of 5 sec first, and reduce later to 1 sec or lower
    ssc = StreamingContext(sc, uBATCH_INTERVAL)
    ssc.checkpoint(CHECKPOINT_DIR)  # set checkpoint directory in HDFS
    #ssc.checkpoint(10 * uBATCH_INTERVAL)
    return ssc

ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, createContext)
Example 7: main
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def main():
    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set("spark.dynamicAllocation.enabled", "true")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)  # Stream every 1 second
    ssc.checkpoint("checkpoint")
    # Clear the cassandra table
    init_cassandra().execute('TRUNCATE {}'.format(top_airports_table))
    stream_kafka(ssc)
Example 8: createContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def createContext():
    conf = SparkConf().setMaster('spark://{}:7077'.format(MASTER_URL)).set('spark.executor.memory', '2g')
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, STREAMING_INTERVAL)
    lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))
    ssc.checkpoint(CHECKPOINT_DIR)
    # main split-combine-apply logic put here
    pairs = lines.map(lambda x: x.split(",")).map(lambda x: (x[8], 1))
    runningCounts = pairs.updateStateByKey(updateFunction)
    # sort (airport, count) pairs by count, descending
    sortedCounts = runningCounts.transform(
        lambda rdd: rdd.sortBy(lambda pair: pair[1], ascending=False))
Example 9: createStreamingContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def createStreamingContext():
    conf = SparkConf().setMaster("local[2]").setAppName("amqp_temperature")
    conf.set("spark.streaming.receiver.writeAheadLog.enable", "true")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("/tmp/spark-streaming-amqp")
    receiveStream = AMQPUtils.createStream(ssc, "localhost", 5672, "temperature")
    temperature = receiveStream.map(getTemperature)
    max = temperature.reduceByWindow(getMax, None, 5, 5)
    max.pprint()
    return ssc
Example 10: functionToCreateContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def functionToCreateContext():
    sc = SparkContext("local[*]", "streaming_part")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, 5)
    data_from_ticket_mechine = ssc.socketTextStream("localhost", 9999)
    data_from_camera_mechine = ssc.socketTextStream("localhost", 9998)
    # meat
    data_from_ticket_mechine.map(ticket_mechine_pre_process) \
        .updateStateByKey(updateFunction) \
        .foreachRDD(ticket_mechine_RDD_handler)
    data_from_camera_mechine.map(camera_mechine_pre_process) \
        .updateStateByKey(updateFunction) \
        .foreachRDD(camera_mechine_RDD_handler)
    ssc.checkpoint(checkpointDirectory)  # set checkpoint directory
    return ssc
Example 11: main
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def main():
    global ssc
    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set('spark.streaming.stopGracefullyOnShutdown', True)
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)  # Stream every 1 second
    ssc.checkpoint("/tmp/checkpoint")
    signal.signal(signal.SIGINT, stop_streaming)
    stream_kafka()
Example 12: functionToCreateContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def functionToCreateContext():
    # new context
    conf = SparkConf()
    conf = conf.setAppName(APP_NAME)
    sc = SparkContext(conf=conf)
    # http://stackoverflow.com/questions/24686474/shipping-python-modules-in-pyspark-to-other-nodes
    sc.addPyFile("common.py")
    # StreamingContext takes the SparkContext and the batch interval in seconds
    ssc = StreamingContext(sc, 10)
    # set checkpoint directory
    ssc.checkpoint(CHECKPOINT_DIR)
    # return streaming spark context
    return ssc
Example 13: createContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def createContext(brokers, topic, checkpointDir):
    # If you do not see this printed, that means the StreamingContext has been loaded
    # from the new checkpoint
    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 1)
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    lines = kvs.map(lambda x: x[1])
    wordCounts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    #wordCounts.foreachRDD(echo)
    wordCounts.pprint()
    ssc.checkpoint(checkpointDir)
    return ssc
Example 14: functionToCreateContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def functionToCreateContext():
    # spark context config
    sc = SparkContext(appName="StreamingExampleWithKafka")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    # kafka
    opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
    kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
    # processing
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .updateStateByKey(updateFunction) \
        .map(toStringList) \
        .foreachRDD(lambda rdd: rdd.saveAsNewAPIHadoopDataset(conf=conf, keyConverter=keyConv, valueConverter=valueConv))
    return ssc
Example 15: createContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import checkpoint [as alias]
def createContext():
    conf = SparkConf().setMaster('spark://{}:7077'.format(MASTER_URL)).set('spark.executor.memory', '2g')
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, STREAMING_INTERVAL)
    lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))
    ssc.checkpoint(CHECKPOINT_DIR)
    # main split-combine-apply logic put here
    # filter out header and other invalid rows
    rdd = lines.map(lambda line: line.split(',')).filter(lambda words: len(words) > 56)
    # extract first field (for filtering header), Carrier, Orig, Dest, and delay fields
    rdd2 = rdd.map(lambda x: (x[0], x[8], x[11], x[18], x[52], x[53], x[54], x[55], x[56])) \
              .map(lambda line: [str(w.replace('"', '')) for w in line]) \
              .filter(lambda row: row[0] != 'Year' and any(row[4:]))
    rdd2.pprint()
    # sum up delay fields for each row
    sum_delay_rdd = rdd2.map(sum_delay)
    sum_delay_rdd.pprint()
    # sum up delay for each (orig, dest, carrier) pair
    combined_rdd = sum_delay_rdd.updateStateByKey(updateFunction)
    combined_rdd.pprint()
    # calculate avg delay: ((orig, dest, carrier), (total, count)) -> ((orig, dest), (avg, carrier))
    avg_rdd = combined_rdd.transform(
        lambda rdd: rdd.map(lambda kv: ((kv[0][0], kv[0][1]), (kv[1][0] / float(kv[1][1]), kv[0][2]))))
    avg_rdd.pprint()
    # group by (orig, dest)
    avg_rdd_by_route = avg_rdd.groupByKey()
    # sort by on-time performance for each (orig, dest) route and take top 10
    route_sorted_carrier = avg_rdd_by_route.mapValues(lambda x: sorted(list(x))[:10])
    aa = route_sorted_carrier.flatMapValues(lambda x: x)
    aa.pprint()
    aa.foreachRDD(process)
    return ssc