This article collects typical code examples of the Python method dpark.accumulator.Accumulator.merge. If you have been wondering what exactly Accumulator.merge does, how to call it, or what real-world usage looks like, the curated examples here may help. You can also explore further usage examples of the class this method belongs to, dpark.accumulator.Accumulator.
Two code examples of Accumulator.merge are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
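Before the examples, here is a minimal sketch of the driver-side pattern they rely on, assuming only what Example 1 shows: a successful task completion event carries the task's accumulator updates, and the scheduler folds them into the driver's accumulators with the class-level Accumulator.merge. The helper name handle_successful_task and the evt argument below are hypothetical, not part of the dpark API.

from dpark.accumulator import Accumulator

def handle_successful_task(evt):
    # evt.accumUpdates is assumed to be the per-task accumulator delta that a
    # worker reports back with a successful completion event (see the while
    # loop in Example 1); merge() folds it into the driver-side accumulators.
    Accumulator.merge(evt.accumUpdates)

Both examples below are runJob methods from dpark's job scheduler; Example 1 shows this call inside its completion-event loop.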
Example 1: runJob
# Required import: from dpark.accumulator import Accumulator [as alias]
# Or: from dpark.accumulator.Accumulator import merge [as alias]
def runJob(self, finalRdd, func, partitions, allowLocal):
    outputParts = list(partitions)
    numOutputParts = len(partitions)
    finalStage = self.newStage(finalRdd, None)
    results = [None] * numOutputParts
    finished = [None] * numOutputParts
    lastFinished = 0
    numFinished = 0

    waiting = set()
    running = set()
    failed = set()
    pendingTasks = {}
    lastFetchFailureTime = 0

    self.updateCacheLocs()
    logger.debug("Final stage: %s, %d", finalStage, numOutputParts)
    logger.debug("Parents of final stage: %s", finalStage.parents)
    logger.debug("Missing parents: %s", self.getMissingParentStages(finalStage))

    if allowLocal and (not finalStage.parents or not self.getMissingParentStages(finalStage)) and numOutputParts == 1:
        split = finalRdd.splits[outputParts[0]]
        yield func(finalRdd.iterator(split))
        return

    def submitStage(stage):
        logger.debug("submit stage %s", stage)
        if stage not in waiting and stage not in running:
            missing = self.getMissingParentStages(stage)
            if not missing:
                submitMissingTasks(stage)
                running.add(stage)
            else:
                for parent in missing:
                    submitStage(parent)
                waiting.add(stage)

    def submitMissingTasks(stage):
        myPending = pendingTasks.setdefault(stage, set())
        tasks = []
        have_prefer = True
        if stage == finalStage:
            for i in range(numOutputParts):
                if not finished[i]:
                    part = outputParts[i]
                    if have_prefer:
                        locs = self.getPreferredLocs(finalRdd, part)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(ResultTask(finalStage.id, finalRdd,
                                            func, part, locs, i))
        else:
            for p in range(stage.numPartitions):
                if not stage.outputLocs[p]:
                    if have_prefer:
                        locs = self.getPreferredLocs(stage.rdd, p)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(ShuffleMapTask(stage.id, stage.rdd,
                                                stage.shuffleDep, p, locs))
        logger.debug("add to pending %s tasks", len(tasks))
        myPending |= set(t.id for t in tasks)
        self.submitTasks(tasks)

    submitStage(finalStage)

    while numFinished != numOutputParts:
        try:
            evt = self.completionEvents.get(False)
        except Queue.Empty:
            self.check()
            if self._shutdown:
                sys.exit(1)
            if failed and time.time() > lastFetchFailureTime + RESUBMIT_TIMEOUT:
                self.updateCacheLocs()
                for stage in failed:
                    logger.info("Resubmitting failed stages: %s", stage)
                    submitStage(stage)
                failed.clear()
            else:
                time.sleep(0.1)
            continue

        task, reason = evt.task, evt.reason
        stage = self.idToStage[task.stageId]
        if stage not in pendingTasks:  # stage from other job
            continue
logger.debug("remove from pedding %s from %s", task, stage)
pendingTasks[stage].remove(task.id)
if isinstance(reason, Success):
Accumulator.merge(evt.accumUpdates)
if isinstance(task, ResultTask):
finished[task.outputId] = True
numFinished += 1
#.........这里部分代码省略.........
Example 2: runJob
# Required import: from dpark.accumulator import Accumulator [as alias]
# Or: from dpark.accumulator.Accumulator import merge [as alias]
def runJob(self, finalRdd, func, partitions, allowLocal):
    outputParts = list(partitions)
    numOutputParts = len(partitions)
    finalStage = self.newStage(finalRdd, None)

    try:
        from dpark.web.ui.views.rddopgraph import StageInfo
        stage_info = StageInfo()
        stage_info.create_stage_info(finalStage)

        def create_stage_info_recur(cur_stage, is_final=False):
            if not cur_stage or cur_stage.id in self.idToRunJob:
                return
            for par_stage in cur_stage.parents:
                create_stage_info_recur(par_stage)
            if cur_stage.id not in self.idToRunJob:
                self.idToRunJob[cur_stage.id] = StageInfo.idToStageInfo[cur_stage.id]
                self.idToRunJob[cur_stage.id].is_final = is_final
        create_stage_info_recur(finalStage, is_final=True)
    except ImportError:
        pass

    results = [None] * numOutputParts
    finished = [None] * numOutputParts
    lastFinished = 0
    numFinished = 0

    waiting = set()
    running = set()
    failed = set()
    pendingTasks = {}
    lastFetchFailureTime = 0

    self.updateCacheLocs()
    logger.debug('Final stage: %s, %d', finalStage, numOutputParts)
    logger.debug('Parents of final stage: %s', finalStage.parents)
    logger.debug(
        'Missing parents: %s',
        self.getMissingParentStages(finalStage))

    def onStageFinished(stage):
        def _(r, dep):
            return r._do_checkpoint()

        MutableDict.merge()
        walk_dependencies(stage.rdd, _)

    if (allowLocal and
            (
                not finalStage.parents or
                not self.getMissingParentStages(finalStage)
            ) and numOutputParts == 1):
        split = finalRdd.splits[outputParts[0]]
        yield func(finalRdd.iterator(split))
        onStageFinished(finalStage)
        return

    def submitStage(stage):
        logger.debug('submit stage %s', stage)
        if stage not in waiting and stage not in running:
            missing = self.getMissingParentStages(stage)
            if not missing:
                submitMissingTasks(stage)
                running.add(stage)
            else:
                for parent in missing:
                    submitStage(parent)
                waiting.add(stage)

    def submitMissingTasks(stage):
        myPending = pendingTasks.setdefault(stage, set())
        tasks = []
        have_prefer = True
        if stage == finalStage:
            for i in range(numOutputParts):
                if not finished[i]:
                    part = outputParts[i]
                    if have_prefer:
                        locs = self.getPreferredLocs(finalRdd, part)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(ResultTask(finalStage.id, finalRdd,
                                            func, part, locs, i))
        else:
            for p in range(stage.numPartitions):
                if not stage.outputLocs[p]:
                    if have_prefer:
                        locs = self.getPreferredLocs(stage.rdd, p)
                        if not locs:
                            have_prefer = False
                    else:
                        locs = []
                    tasks.append(ShuffleMapTask(stage.id, stage.rdd,
                                                stage.shuffleDep, p, locs))
        logger.debug('add to pending %s tasks', len(tasks))
        myPending |= set(t.id for t in tasks)
        self.submitTasks(tasks)
# ......... the rest of this method is omitted here; the Accumulator.merge(evt.accumUpdates)
# call presumably appears, as in Example 1, in the completion-event loop that follows .........