This article collects typical usage examples of the Python method pyspark.streaming.StreamingContext.awaitTerminationOrTimeout. If you have been wondering what StreamingContext.awaitTerminationOrTimeout does or how to use it, the curated code examples below may help. You can also read further about the class it belongs to, pyspark.streaming.StreamingContext.
Below are 7 code examples of StreamingContext.awaitTerminationOrTimeout, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
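Before turning to the collected examples, here is a minimal, self-contained sketch (not taken from any of the examples below) of the typical polling pattern around awaitTerminationOrTimeout. The local master, the socket source on localhost:9999, the 10-second poll interval, and the 60-second wall-clock limit are all placeholder assumptions chosen for illustration.

import time
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "AwaitTerminationOrTimeoutDemo")  # assumed local setup
ssc = StreamingContext(sc, 1)  # 1-second batch interval

lines = ssc.socketTextStream("localhost", 9999)  # placeholder source
lines.count().pprint()  # register an output operation so the job has work to do

ssc.start()
start_time = time.time()
while True:
    # Block for up to 10 seconds; returns True once the context has stopped
    # (or re-raises an error from the job), and False if the timeout elapsed.
    if ssc.awaitTerminationOrTimeout(10):
        break  # stopped elsewhere
    # Still running: apply our own shutdown condition, e.g. a 60-second limit.
    if time.time() - start_time > 60:
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
        break

The real examples below use the same loop with application-specific stop conditions (idle timeouts, "all data saved" flags, and so on).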
Example 1:
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import awaitTerminationOrTimeout [as alias]
#datamap = tx_fee_rdd.map(lambda x: ("tx_fee",x) )
#( rowkey , [ row key , column family , column name , value ] )
datamap = tx_fee_rdd.map(lambda x: (str(x[0]),
                                    [str(x[0]), "tx_fee_col", "tx_fee", str(x[1])]))
datamap.saveAsNewAPIHadoopDataset(conf=conf,
                                  keyConverter=keyConv,
                                  valueConverter=valueConv)
lines = ssc.socketTextStream("localhost", 8888)
dump_rdd = lines.map(lambda x: json.dumps(x))
load_rdd = dump_rdd.map(lambda x: json.loads(x)).map(lambda x : x.decode('unicode_escape').encode('ascii','ignore'))
#load_rdd.pprint(2)
split_blk_rdd = load_rdd.map(lambda x: x.split(":"))
#split_blk_rdd.pprint()
tx_fee_rdd = split_blk_rdd.map(lambda x : (x[14][1:7],x[15][1:-15])) #this gets transaction fee
#tx_fee_rdd.pprint(200) #works
tx_fee_rdd.foreachRDD(SaveRecord) #function call
ssc.start() # Start the computation
#ssc.awaitTermination() # Wait for the computation to terminate
ssc.awaitTerminationOrTimeout(15000) # time out after 15000 seconds (about 4.2 hours)
#ssc.stop() # Stop the computation
Example 2: SparkConf
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import awaitTerminationOrTimeout [as alias]
dataFilePathOnHdfs = "hdfs://{}/btsdata/aviation/ontime/".format(master)
conf = SparkConf().setAppName(APP_NAME).setMaster('spark://{}:7077'.format(master))
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, STREAMING_INTERVAL)
ssc.checkpoint('/tmp/ccc')
lines = ssc.textFileStream(dataFilePathOnHdfs)
res2_2 = (lines.map(lambda line: line.split(","))
          .filter(lambda line: line[6] == originAirport)   # 2nd argument: 'SRQ', 'CMH', 'JFK', 'SEA', or 'BOS'
          .map(lambda line: (line[7], float(line[12])))    # (Carrier, Departure Delay)
          .combineByKey(lambda x: (x, 1),
                        lambda x, y: (x[0] + y, x[1] + 1),  # (sum, count)
                        lambda x, y: (x[0] + y[0], x[1] + y[1]))
          .map(lambda (key, (valueSum, count)): (key, valueSum / count))
          # DStream has no sortByKey(); sort each batch RDD via transform()
          .transform(lambda rdd: rdd.sortByKey(ascending=True)))
res2_2.pprint(10)  # DStream has no take(); register pprint() as the output operation instead
ssc.start()
while True:
    if ssc.awaitTerminationOrTimeout(10):
        # context was stopped elsewhere
        break
print "Gracefully stopping Spark Streaming Application"
ssc.stop(stopSparkContext=True, stopGraceFully=True)
print "Application stopped"
Example 3:
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import awaitTerminationOrTimeout [as alias]
#print gen_tx_json
return gen_tx_json
#get lines RDD
lines = ssc.socketTextStream("localhost", 9999)
dump_rdd = lines.map(lambda x: json.dumps(x))
#print dump_rdd.take(2)
load_rdd = dump_rdd.map(lambda x: json.loads(x)).map(lambda x : x.decode('unicode_escape').encode('ascii','ignore'))
#print load_rdd.take(2)
#load_rdd.pprint(100)
#tx = load_rdd.flatMap(lambda x: x.split(":")) #this works
split_blk_rdd = load_rdd.map(lambda x: x.split(":"))
#split_blk_rdd.pprint()
gen_tx_rdd = split_blk_rdd.map(lambda x : (x[8][1:7],x[6][4:68]) ) #this gets generation transactions
#gen_tx_rdd.pprint() #works
tx_json_rdd = gen_tx_rdd.map(lambda x: (x[0],get_tx_fee(x[1])) ) #function call
tx_fee_rdd = tx_json_rdd.map(lambda x: (x[0], x[1].items()[3][1][0]["value"] - 25))  #.filter(lambda x: "value" in x)
tx_fee_rdd.foreachRDD(SaveRecord) #function call
ssc.start() # Start the computation
#ssc.awaitTermination() # Wait for the computation to terminate
ssc.awaitTerminationOrTimeout(12000) #time out 3.33 hours
#ssc.stop() # Stop the computation
Example 4: get_cass
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import awaitTerminationOrTimeout [as alias]
    dstream = dstream.flatMap(extract_carr_arr_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_average)
elif args.task == 'q13':
    dstream = dstream.flatMap(extract_weekday_arr_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_average)
elif args.task == 'q21':
    dstream = dstream.flatMap(extract_origin_carrier_dep_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_complex_average)
elif args.task == 'q22':
    dstream = dstream.flatMap(extract_origin_destination_dep_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_complex_average)
elif args.task == 'q23':
    dstream = dstream.flatMap(extract_route_carrier_arr_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_complex_average)
elif args.task == 'q24':
    dstream = dstream.flatMap(extract_route_arr_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_average)
elif args.task == 'q32':
    get_cass().execute('truncate %s' % schema['table'])
    dstream = dstream.flatMap(extract_trip_info).foreachRDD(save_trip)
else:
    print("Unknown task")

# runner
ts_last_data = time.time()
ssc.start()
while True:
    res = ssc.awaitTerminationOrTimeout(args.run_interval)
    if res:
        # stopped elsewhere
        break
    else:
        # still running
        if time.time() - ts_last_data > args.idle_time:
            dump("No data received for %d seconds, stopping..." % args.idle_time)
            ssc.stop(stopSparkContext=True, stopGraceFully=False)
Example 5: BasicOperationTests
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import awaitTerminationOrTimeout [as alias]
#......... some of the code is omitted here .........
            s = []
            s.extend(vs)
            return s

        input = [[('k', i)] for i in range(5)]

        def func(dstream):
            return dstream.updateStateByKey(updater)

        expected = [[0], [0, 1], [0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]]
        expected = [[('k', v)] for v in expected]
        self._test_func(input, func, expected)

    def test_update_state_by_key_initial_rdd(self):

        def updater(vs, s):
            if not s:
                s = []
            s.extend(vs)
            return s

        initial = [('k', [0, 1])]
        initial = self.sc.parallelize(initial, 1)

        input = [[('k', i)] for i in range(2, 5)]

        def func(dstream):
            return dstream.updateStateByKey(updater, initialRDD=initial)

        expected = [[0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]]
        expected = [[('k', v)] for v in expected]
        self._test_func(input, func, expected)

    def test_failed_func(self):
        # Test failure in
        # TransformFunction.apply(rdd: Option[RDD[_]], time: Time)
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream = self.ssc.queueStream(input)

        def failed_func(i):
            raise ValueError("This is a special error")

        input_stream.map(failed_func).pprint()
        self.ssc.start()
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")

    def test_failed_func2(self):
        # Test failure in
        # TransformFunction.apply(rdd: Option[RDD[_]], rdd2: Option[RDD[_]], time: Time)
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream1 = self.ssc.queueStream(input)
        input_stream2 = self.ssc.queueStream(input)

        def failed_func(rdd1, rdd2):
            raise ValueError("This is a special error")

        input_stream1.transformWith(failed_func, input_stream2, True).pprint()
        self.ssc.start()
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")

    def test_failed_func_with_reseting_failure(self):
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream = self.ssc.queueStream(input)

        def failed_func(i):
            if i == 1:
                # Make it fail in the second batch
                raise ValueError("This is a special error")
            else:
                return i

        # We should be able to see the results of the 3rd and 4th batches even if the second batch
        # fails
        expected = [[0], [2], [3]]
        self.assertEqual(expected, self._collect(input_stream.map(failed_func), 3))
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")
Example 6: StreamingContextTests
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import awaitTerminationOrTimeout [as alias]
#......... some of the code is omitted here .........
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))

        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], [list(v[0]) for v in result])

    def test_union(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        dstream2 = self.ssc.queueStream(input)
        dstream3 = self.ssc.union(dstream, dstream2)
        result = self._collect(dstream3, 3)
        expected = [i * 2 for i in input]
        self.assertEqual(expected, result)

    def test_transform(self):
        dstream1 = self.ssc.queueStream([[1]])
        dstream2 = self.ssc.queueStream([[2]])
        dstream3 = self.ssc.queueStream([[3]])

        def func(rdds):
            rdd1, rdd2, rdd3 = rdds
            return rdd2.union(rdd3).union(rdd1)

        dstream = self.ssc.transform([dstream1, dstream2, dstream3], func)
        self.assertEqual([2, 3, 1], self._take(dstream, 3))

    def test_transform_pairrdd(self):
        # This regression test case is for SPARK-17756.
        dstream = self.ssc.queueStream(
            [[1], [2], [3]]).transform(lambda rdd: rdd.cartesian(rdd))
        self.assertEqual([(1, 1), (2, 2), (3, 3)], self._take(dstream, 3))

    def test_get_active(self):
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that getActive() returns the active context
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)

        # Verify that getActive() returns None
        self.ssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

    def test_get_active_or_create(self):
        # Test StreamingContext.getActiveOrCreate() without checkpoint data
        # See CheckpointTests for tests with checkpoint data
        self.ssc = None
        self.assertEqual(StreamingContext.getActive(), None)

        def setupFunc():
            ssc = StreamingContext(self.sc, self.duration)
            ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
            self.setupCalled = True
            return ssc

        # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that getActiveOrCreate() returns active context and does not call the setupFunc
        self.ssc.start()
        self.setupCalled = False
        self.assertEqual(StreamingContext.getActiveOrCreate(None, setupFunc), self.ssc)
        self.assertFalse(self.setupCalled)

        # Verify that getActiveOrCreate() calls setupFunc after active context is stopped
        self.ssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that getActiveOrCreate() calls setupFunc if the Java context is stopped
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

    def test_await_termination_or_timeout(self):
        self._add_input_stream()
        self.ssc.start()
        self.assertFalse(self.ssc.awaitTerminationOrTimeout(0.001))
        self.ssc.stop(False)
        self.assertTrue(self.ssc.awaitTerminationOrTimeout(0.001))
Example 7: print
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import awaitTerminationOrTimeout [as alias]
airportAirports.checkpoint(60)
airportAirports.foreachRDD(outputQ2N2)
carriersA2A.checkpoint(60)
carriersA2A.foreachRDD(outputQ2N3)
topHopFlights.checkpoint(60)
topHopFlights.foreachRDD(outputQ3N2)
print("STARTED!")
ssc.start()
runStatus = 1
while True:
    res = ssc.awaitTerminationOrTimeout(10)  # 10 seconds timeout
    if dataSaved1 and dataSaved2 and dataSaved3 and dataSaved4 and dataSaved5 and dataSaved6:
        runStatus = 0
    if res:
        # stopped elsewhere
        break
    else:
        # still running
        timerCount += 1
        print("still running...%d" % timerCount)
        if runStatus == 0:
            print("Finish saving data. Stopping streaming...")
            ssc.stop(stopSparkContext=True, stopGraceFully=True)
            break