This article collects typical usage examples of the Python class pyspark.serializers.BatchedSerializer. If you are wondering what BatchedSerializer is for, how to use it, or where to find real-world examples of it, the curated class examples below should help.
Four code examples of the BatchedSerializer class are shown below, ordered by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
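Before the examples, here is a minimal round-trip sketch of what BatchedSerializer does: it wraps an unbatched serializer and writes/reads elements in fixed-size batches instead of one record per element. It assumes only the classes used in the examples below (BatchedSerializer, PickleSerializer from pyspark.serializers) plus an in-memory stream, and is an illustration rather than part of the examples themselves.

import io

from pyspark.serializers import BatchedSerializer, PickleSerializer

# Batch elements in groups of 3 before pickling them to the stream.
ser = BatchedSerializer(PickleSerializer(), 3)

buf = io.BytesIO()
ser.dump_stream(range(10), buf)        # writes the elements as pickled batches
buf.seek(0)
print(list(ser.load_stream(buf)))      # load_stream unbatches: [0, 1, ..., 9]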
Example 1: parallelize
def parallelize(self, c, numSlices=None):
    """
    Distribute a local Python collection to form an RDD.

    >>> sc.parallelize(range(5), 5).glom().collect()
    [[0], [1], [2], [3], [4]]
    """
    numSlices = numSlices or self.defaultParallelism
    # Calling the Java parallelize() method with an ArrayList is too slow,
    # because it sends O(n) Py4J commands.  As an alternative, serialized
    # objects are written to a file and loaded through textFile().
    tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
    # Make sure we distribute data evenly if it's smaller than self.batchSize
    if "__len__" not in dir(c):
        c = list(c)    # Make it a list so we can compute its length
    batchSize = min(len(c) // numSlices, self._batchSize)
    if batchSize > 1:
        serializer = BatchedSerializer(self._unbatched_serializer,
                                       batchSize)
    else:
        serializer = self._unbatched_serializer
    serializer.dump_stream(c, tempFile)
    tempFile.close()
    readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile
    jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices)
    return RDD(jrdd, self, serializer)
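The interesting part of this example is how the batch size is chosen: the data is split evenly across numSlices partitions, and batching is used only when each partition would hold more than one element. The standalone sketch below restates that decision outside of SparkContext; pick_serializer is a hypothetical helper written for illustration, not part of the PySpark API.

from pyspark.serializers import BatchedSerializer, PickleSerializer

def pick_serializer(items, num_slices, max_batch, unbatched=PickleSerializer()):
    # Same rule as above: never batch more than max_batch elements, and fall
    # back to the unbatched serializer when partitions are tiny.
    batch_size = min(len(items) // num_slices, max_batch)
    if batch_size > 1:
        return BatchedSerializer(unbatched, batch_size)
    return unbatched

print(pick_serializer(list(range(1000)), 4, 1024))  # a BatchedSerializer wrapping PickleSerializer
print(pick_serializer(list(range(3)), 4, 1024))     # the plain PickleSerializer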
Example 2: _open_file
def _open_file(self):
    dirs = _get_local_dirs("objects")
    d = dirs[id(self) % len(dirs)]
    if not os.path.exists(d):
        os.makedirs(d)
    p = os.path.join(d, str(id(self)))
    self._file = open(p, "wb+", 65536)
    self._ser = BatchedSerializer(CompressedSerializer(PickleSerializer()), 1024)
    os.unlink(p)
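A detail worth calling out is the os.unlink(p) right after the file is opened: on POSIX systems the open handle keeps the data alive, so the scratch file disappears from the directory immediately and its storage is reclaimed automatically when the handle is closed (on Windows this pattern would typically fail). Below is a minimal sketch of that trick, independent of PySpark; the path is purely illustrative.

import os
import tempfile

path = os.path.join(tempfile.gettempdir(), "scratch-demo")
f = open(path, "wb+", 65536)
os.unlink(path)                 # directory entry is gone right away (POSIX)
f.write(b"still usable through the open handle")
f.flush()
f.seek(0)
print(f.read())                 # b'still usable through the open handle'
f.close()                       # disk space is reclaimed here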
Example 3: parallelize
def parallelize(self, c, numSlices=None):
    """
    Distribute a local Python collection to form an RDD. Using xrange
    is recommended if the input represents a range for performance.

    >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
    [[0], [2], [3], [4], [6]]
    >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect()
    [[], [0], [], [2], [4]]
    """
    numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism
    if isinstance(c, xrange):
        size = len(c)
        if size == 0:
            return self.parallelize([], numSlices)
        step = c[1] - c[0] if size > 1 else 1
        start0 = c[0]

        def getStart(split):
            return start0 + int((split * size / numSlices)) * step

        def f(split, iterator):
            return xrange(getStart(split), getStart(split + 1), step)

        return self.parallelize([], numSlices).mapPartitionsWithIndex(f)
    # Calling the Java parallelize() method with an ArrayList is too slow,
    # because it sends O(n) Py4J commands.  As an alternative, serialized
    # objects are written to a file and loaded through textFile().
    tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
    try:
        # Make sure we distribute data evenly if it's smaller than self.batchSize
        if "__len__" not in dir(c):
            c = list(c)    # Make it a list so we can compute its length
        batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024))
        serializer = BatchedSerializer(self._unbatched_serializer, batchSize)
        serializer.dump_stream(c, tempFile)
        tempFile.close()
        readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile
        jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices)
    finally:
        # readRDDFromFile eagerly reads the file so we can delete right after.
        os.unlink(tempFile.name)
    return RDD(jrdd, self, serializer)
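The xrange fast path never materializes the range: each partition recomputes its own sub-range from the start, step, and partition index. The sketch below reproduces that arithmetic with a plain Python 3 range so the doctest output above can be checked locally; slice_range is a hypothetical helper used only for illustration.

def slice_range(r, num_slices):
    # Same math as getStart() above: split i starts at index
    # int(i * size / num_slices), scaled by the range's step.
    size, step, start0 = len(r), r.step, r.start

    def get_start(split):
        return start0 + int(split * size / num_slices) * step

    return [range(get_start(i), get_start(i + 1), step)
            for i in range(num_slices)]

print([list(p) for p in slice_range(range(0, 6, 2), 5)])
# [[], [0], [], [2], [4]]  -- matches the doctest above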
Example 4: ExternalList
class ExternalList(object):
    """
    ExternalList can hold many items which cannot all be kept in memory
    at the same time.

    >>> l = ExternalList(list(range(100)))
    >>> len(l)
    100
    >>> l.append(10)
    >>> len(l)
    101
    >>> for i in range(20240):
    ...     l.append(i)
    >>> len(l)
    20341
    >>> import pickle
    >>> l2 = pickle.loads(pickle.dumps(l))
    >>> len(l2)
    20341
    >>> list(l2)[100]
    10
    """
    LIMIT = 10240

    def __init__(self, values):
        self.values = values
        self.count = len(values)
        self._file = None
        self._ser = None

    def __getstate__(self):
        if self._file is not None:
            self._file.flush()
            with os.fdopen(os.dup(self._file.fileno()), "rb") as f:
                f.seek(0)
                serialized = f.read()
        else:
            serialized = b''
        return self.values, self.count, serialized

    def __setstate__(self, item):
        self.values, self.count, serialized = item
        if serialized:
            self._open_file()
            self._file.write(serialized)
        else:
            self._file = None
            self._ser = None

    def __iter__(self):
        if self._file is not None:
            self._file.flush()
            # read all items from disk first
            with os.fdopen(os.dup(self._file.fileno()), 'rb') as f:
                f.seek(0)
                for v in self._ser.load_stream(f):
                    yield v
        for v in self.values:
            yield v

    def __len__(self):
        return self.count

    def append(self, value):
        self.values.append(value)
        self.count += 1
        # dump them into disk if the key is huge
        if len(self.values) >= self.LIMIT:
            self._spill()

    def _open_file(self):
        dirs = _get_local_dirs("objects")
        d = dirs[id(self) % len(dirs)]
        if not os.path.exists(d):
            os.makedirs(d)
        p = os.path.join(d, str(id(self)))
        self._file = open(p, "wb+", 65536)
        self._ser = BatchedSerializer(CompressedSerializer(PickleSerializer()), 1024)
        os.unlink(p)

    def __del__(self):
        if self._file:
            self._file.close()
            self._file = None

    def _spill(self):
        """ dump the values into disk """
        global MemoryBytesSpilled, DiskBytesSpilled
        if self._file is None:
            self._open_file()
        used_memory = get_used_memory()
        pos = self._file.tell()
        self._ser.dump_stream(self.values, self._file)
        self.values = []
        gc.collect()
        DiskBytesSpilled += self._file.tell() - pos
        MemoryBytesSpilled += (used_memory - get_used_memory()) << 20
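To see how the pieces above fit together without the pyspark.shuffle bookkeeping (_get_local_dirs, get_used_memory, and the spill counters), here is a reduced sketch of the spill-then-replay idea using only the serializer classes from the examples; the file handling is simplified and the data is made up.

import tempfile

from pyspark.serializers import (BatchedSerializer, CompressedSerializer,
                                 PickleSerializer)

ser = BatchedSerializer(CompressedSerializer(PickleSerializer()), 1024)
spill_file = tempfile.TemporaryFile()

# "Spill": append the current in-memory buffer to disk and reset it,
# like ExternalList._spill() above.
in_memory = list(range(5))
ser.dump_stream(in_memory, spill_file)
in_memory = [100, 101]            # items appended after the spill

# Iteration replays the spilled items first, then the in-memory tail,
# mirroring ExternalList.__iter__().
spill_file.flush()
spill_file.seek(0)
print(list(ser.load_stream(spill_file)) + in_memory)
# [0, 1, 2, 3, 4, 100, 101]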