This page collects typical usage examples of the SparkContext.setCheckpointDir method from Python's pyspark.context module. If you have been wondering what SparkContext.setCheckpointDir does, how to call it, or where to find working examples, the curated code samples below should help. You can also browse further usage examples for the containing class, pyspark.context.SparkContext.
Five code examples of SparkContext.setCheckpointDir are shown below, sorted by popularity by default. You can upvote the examples you find useful; your votes help the system surface better Python code samples.
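For orientation before the examples, here is a minimal, self-contained sketch of what setCheckpointDir does: it names a directory (typically on HDFS in cluster mode, or a local path in local mode) where Spark materializes checkpointed RDD data, truncating the lineage graph. The application name and /tmp path below are illustrative assumptions. Note also that the examples on this page additionally assume import unittest, from pyspark import SparkConf, and, for the streaming ones, from pyspark.streaming import StreamingContext; VertexRDD appears to come from an experimental GraphX Python binding rather than the released pyspark API.

# Minimal sketch (assumed local mode; the path and app name are illustrative)
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[2]")
sc = SparkContext(appName="checkpoint_demo", conf=conf)
sc.setCheckpointDir("/tmp/spark-checkpoints")  # must be set before checkpoint() is used

rdd = sc.parallelize(range(10)).map(lambda x: x * x)
rdd.checkpoint()             # mark the RDD for checkpointing
rdd.count()                  # an action triggers the actual checkpoint write
print(rdd.isCheckpointed())  # True once the data has been materialized
sc.stop()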
Example 1: PyVertexRDDTestCase
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import setCheckpointDir [as alias]
class PyVertexRDDTestCase(unittest.TestCase):
"""
Test collect, take, count, mapValues, diff,
filter, mapVertexPartitions, innerJoin and leftJoin
for VertexRDD
"""
def setUp(self):
class_name = self.__class__.__name__
conf = SparkConf().set("spark.default.parallelism", 1)
self.sc = SparkContext(appName=class_name, conf=conf)
self.sc.setCheckpointDir("/tmp")
def tearDown(self):
self.sc.stop()
    def collect(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    def take(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.take(1)
        self.assertEqual(results, [(3, ("rxin", "student"))])
def count(self):
vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
vertices = VertexRDD(vertexData)
results = vertices.count()
self.assertEqual(results, 2)
    def mapValues(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.mapValues(lambda x: (x[0] + ":" + x[0], x[1] + ":" + x[1])).collect()
        self.assertEqual(results, [(3, ("rxin:rxin", "student:student")),
                                   (7, ("jgonzal:jgonzal", "postdoc:postdoc"))])
def innerJoin(self):
vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
vertices0 = VertexRDD(vertexData0)
vertices1 = VertexRDD(vertexData1)
results = vertices0.innerJoin(vertices1).collect()
self.assertEqual(results, [])
    def leftJoin(self):
        # NOTE: despite its name, this test currently exercises diff(), not leftJoin().
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1).count()
        self.assertEqual(results, 2)
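Two practical notes on the setUp/tearDown pattern above (and in the examples that follow). First, these methods do not start with the test_ prefix, so unittest will not discover or run them as written. Second, hard-coding /tmp as the checkpoint directory works on local POSIX systems but is not hermetic across test runs; a variant using a throwaway directory (my own sketch, not taken from the original tests; the class name is hypothetical) looks like this:

import shutil
import tempfile
import unittest

from pyspark import SparkConf, SparkContext

class CheckpointDirTestCase(unittest.TestCase):  # hypothetical name
    def setUp(self):
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=self.__class__.__name__, conf=conf)
        self.checkpoint_dir = tempfile.mkdtemp()       # unique per test run
        self.sc.setCheckpointDir(self.checkpoint_dir)

    def tearDown(self):
        self.sc.stop()
        shutil.rmtree(self.checkpoint_dir, ignore_errors=True)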
Example 2: PyEdgeRDDTestCase
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import setCheckpointDir [as alias]
class PyEdgeRDDTestCase(unittest.TestCase):
"""
Test collect, take, count, mapValues,
filter and innerJoin for EdgeRDD
"""
def setUp(self):
class_name = self.__class__.__name__
conf = SparkConf().set("spark.default.parallelism", 1)
self.sc = SparkContext(appName=class_name, conf=conf)
self.sc.setCheckpointDir("/tmp")
def tearDown(self):
self.sc.stop()
# TODO
def collect(self):
vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
vertices = VertexRDD(vertexData)
results = vertices.collect()
self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
# TODO
    def take(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.take(1)
        self.assertEqual(results, [(3, ("rxin", "student"))])
# TODO
def count(self):
vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
vertices = VertexRDD(vertexData)
        results = vertices.count()
        self.assertEqual(results, 2)
# TODO
    def mapValues(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.mapValues(lambda x: (x[0] + ":" + x[0], x[1] + ":" + x[1])).collect()
        self.assertEqual(len(results), 2)
# TODO
def filter(self):
return
# TODO
def innerJoin(self):
vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
vertices0 = VertexRDD(vertexData0)
vertices1 = VertexRDD(vertexData1)
        # NOTE: despite its name, this stub currently exercises diff(), not innerJoin().
        results = vertices0.diff(vertices1).count()
        self.assertEqual(results, 2)
Example 3: PySparkStreamingTestCase
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import setCheckpointDir [as alias]
class PySparkStreamingTestCase(unittest.TestCase):
timeout = 20 # seconds
duration = 1
def setUp(self):
class_name = self.__class__.__name__
conf = SparkConf().set("spark.default.parallelism", 1)
self.sc = SparkContext(appName=class_name, conf=conf)
self.sc.setCheckpointDir("/tmp")
# TODO: decrease duration to speed up tests
self.ssc = StreamingContext(self.sc, self.duration)
def tearDown(self):
self.ssc.stop()
def wait_for(self, result, n):
start_time = time.time()
while len(result) < n and time.time() - start_time < self.timeout:
time.sleep(0.01)
if len(result) < n:
print("timeout after", self.timeout)
def _take(self, dstream, n):
"""
Return the first `n` elements in the stream (will start and stop).
"""
results = []
def take(_, rdd):
if rdd and len(results) < n:
results.extend(rdd.take(n - len(results)))
dstream.foreachRDD(take)
self.ssc.start()
self.wait_for(results, n)
return results
def _collect(self, dstream, n, block=True):
"""
        Collect each RDD into the returned list.
:return: list, which will have the collected items.
"""
result = []
def get_output(_, rdd):
if rdd and len(result) < n:
r = rdd.collect()
if r:
result.append(r)
dstream.foreachRDD(get_output)
if not block:
return result
self.ssc.start()
self.wait_for(result, n)
return result
def _test_func(self, input, func, expected, sort=False, input2=None):
"""
        @param input: dataset for the test. This should be a list of lists.
@param func: wrapped function. This function should return PythonDStream object.
@param expected: expected output for this testcase.
"""
if not isinstance(input[0], RDD):
input = [self.sc.parallelize(d, 1) for d in input]
input_stream = self.ssc.queueStream(input)
if input2 and not isinstance(input2[0], RDD):
input2 = [self.sc.parallelize(d, 1) for d in input2]
input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None
# Apply test function to stream.
if input2:
stream = func(input_stream, input_stream2)
else:
stream = func(input_stream)
result = self._collect(stream, len(expected))
if sort:
self._sort_result_based_on_key(result)
self._sort_result_based_on_key(expected)
self.assertEqual(expected, result)
def _sort_result_based_on_key(self, outputs):
"""Sort the list based on first value."""
for output in outputs:
output.sort(key=lambda x: x[0])
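The class above is a reusable harness: _take and _collect pump a DStream until enough output has arrived, and _test_func wires input batches through a transformation and compares against the expected output. A sketch of how a subclass would use it (the subclass name, the map operation, and the data are my own illustration):

class MapDStreamTestCase(PySparkStreamingTestCase):
    def test_map(self):
        input = [[1, 2, 3], [4, 5, 6]]        # two input batches
        expected = [[2, 4, 6], [8, 10, 12]]   # one output list per batch

        def func(dstream):
            return dstream.map(lambda x: x * 2)

        self._test_func(input, func, expected)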
Example 4: PyGraphXTestCase
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import setCheckpointDir [as alias]
class PyGraphXTestCase(unittest.TestCase):
"""
Test vertices, edges, partitionBy, numEdges, numVertices,
inDegrees, outDegrees, degrees, triplets, mapVertices,
mapEdges, mapTriplets, reverse, subgraph, groupEdges,
joinVertices, outerJoinVertices, collectNeighborIds,
collectNeighbors, mapReduceTriplets, triangleCount for Graph
"""
def setUp(self):
class_name = self.__class__.__name__
conf = SparkConf().set("spark.default.parallelism", 1)
self.sc = SparkContext(appName=class_name, conf=conf)
self.sc.setCheckpointDir("/tmp")
def tearDown(self):
self.sc.stop()
def collect(self):
vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
vertices = VertexRDD(vertexData)
results = vertices.collect()
self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
    def take(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.take(1)
        self.assertEqual(results, [(3, ("rxin", "student"))])
def count(self):
vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
vertices = VertexRDD(vertexData)
        results = vertices.count()
        self.assertEqual(results, 2)
    def mapValues(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.mapValues(lambda x: (x[0] + ":" + x[0], x[1] + ":" + x[1])).collect()
        self.assertEqual(len(results), 2)
def diff(self):
vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
vertices0 = VertexRDD(vertexData0)
vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1).count()
        self.assertEqual(results, 2)
def innerJoin(self):
vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
vertices0 = VertexRDD(vertexData0)
vertices1 = VertexRDD(vertexData1)
        # NOTE: despite its name, this test currently exercises diff(), not innerJoin().
        results = vertices0.diff(vertices1).count()
        self.assertEqual(results, 2)
def leftJoin(self):
vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
vertices0 = VertexRDD(vertexData0)
vertices1 = VertexRDD(vertexData1)
        # NOTE: despite its name, this test currently exercises diff(), not leftJoin().
        results = vertices0.diff(vertices1).count()
        self.assertEqual(results, 2)
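Note that the diff, innerJoin, and leftJoin tests above all reduce to diff(), so the join semantics never actually get exercised. For intuition, the difference can be sketched with the standard pyspark pair-RDD API (not the prototype VertexRDD; assumes an active SparkContext sc, and result order may vary):

v0 = sc.parallelize([(3, "rxin"), (7, "jgonzal")])
v1 = sc.parallelize([(3, "franklin"), (2, "istoica")])

# Inner join keeps only the IDs present on both sides.
v0.join(v1).collect()           # [(3, ('rxin', 'franklin'))]

# Left outer join keeps every left-side ID, with None for misses.
v0.leftOuterJoin(v1).collect()  # [(3, ('rxin', 'franklin')), (7, ('jgonzal', None))]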
Example 5: len
# Required import: from pyspark.context import SparkContext [as alias]
# Or: from pyspark.context.SparkContext import setCheckpointDir [as alias]
# Print the results ("shifts" and "print_shifts" are defined earlier in the
# original script; only an excerpt is shown here)
shifts.foreachRDD(print_shifts)
if __name__ == "__main__":
if len(sys.argv) >= 2 and sys.argv[1] == "test":
# Run the tests
del sys.argv[1]
conf = SparkConf().set("spark.default.parallelism", 1)
sc = SparkContext(appName='unit_test', conf=conf)
sc.setLogLevel("WARN")
sc.setCheckpointDir("/tmp")
        # unittest.main() exits the interpreter by default; exit=False lets the
        # SparkContext be stopped afterwards.
        unittest.main(exit=False)
        sc.stop()
else:
# Run the main()
sc = SparkContext(appName="BoostWords")
sc.setLogLevel("WARN")
ssc = StreamingContext(sc, 5)
ssc.checkpoint("checkpoint")