本文整理匯總了Python中dpark.accumulator.Accumulator.merge方法的典型用法代碼示例。如果您正苦於以下問題:Python Accumulator.merge方法的具體用法?Python Accumulator.merge怎麼用?Python Accumulator.merge使用的例子?那麼, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類dpark.accumulator.Accumulator
的用法示例。
在下文中一共展示了Accumulator.merge方法的2個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: runJob
# Required import: from dpark.accumulator import Accumulator  [as alias]
# Or: from dpark.accumulator.Accumulator import merge  [as alias]
def runJob(self, finalRdd, func, partitions, allowLocal):
# Drive a job over `finalRdd` to completion: build the final stage,
# short-circuit to an in-process run for trivial single-partition jobs,
# otherwise submit stages/tasks and consume completion events until every
# requested output partition has finished.
# NOTE(review): this listing was scraped from a web page — the original
# indentation has been stripped and the tail of the function is elided
# below, so the code as shown is not runnable as-is.
outputParts = list(partitions)
numOutputParts = len(partitions)
finalStage = self.newStage(finalRdd, None)
results = [None]*numOutputParts
finished = [None]*numOutputParts
lastFinished = 0
numFinished = 0
waiting = set()
running = set()
failed = set()
pendingTasks = {}
lastFetchFailureTime = 0
self.updateCacheLocs()
logger.debug("Final stage: %s, %d", finalStage, numOutputParts)
logger.debug("Parents of final stage: %s", finalStage.parents)
logger.debug("Missing parents: %s", self.getMissingParentStages(finalStage))
# Fast path: a single-partition job whose final stage has no missing
# parent stages can be computed locally without scheduling any tasks.
if allowLocal and (not finalStage.parents or not self.getMissingParentStages(finalStage)) and numOutputParts == 1:
split = finalRdd.splits[outputParts[0]]
yield func(finalRdd.iterator(split))
return
# Submit a stage once none of its parents are missing output; otherwise
# recurse into the missing parents first and park this stage in `waiting`.
def submitStage(stage):
logger.debug("submit stage %s", stage)
if stage not in waiting and stage not in running:
missing = self.getMissingParentStages(stage)
if not missing:
submitMissingTasks(stage)
running.add(stage)
else:
for parent in missing:
submitStage(parent)
waiting.add(stage)
# Build and submit the still-unfinished tasks of a stage: ResultTasks for
# the final stage, ShuffleMapTasks for intermediate (shuffle) stages.
def submitMissingTasks(stage):
myPending = pendingTasks.setdefault(stage, set())
tasks = []
# Stop querying preferred locations once any partition reports none.
have_prefer = True
if stage == finalStage:
for i in range(numOutputParts):
if not finished[i]:
part = outputParts[i]
if have_prefer:
locs = self.getPreferredLocs(finalRdd, part)
if not locs:
have_prefer = False
else:
locs = []
tasks.append(ResultTask(finalStage.id, finalRdd,
func, part, locs, i))
else:
for p in range(stage.numPartitions):
if not stage.outputLocs[p]:
if have_prefer:
locs = self.getPreferredLocs(stage.rdd, p)
if not locs:
have_prefer = False
else:
locs = []
tasks.append(ShuffleMapTask(stage.id, stage.rdd,
stage.shuffleDep, p, locs))
logger.debug("add to pending %s tasks", len(tasks))
myPending |= set(t.id for t in tasks)
self.submitTasks(tasks)
submitStage(finalStage)
# Event loop: drain task-completion events until every output partition is
# done. On an empty queue, run scheduler health checks, exit on shutdown,
# and resubmit failed stages once RESUBMIT_TIMEOUT has elapsed since the
# last fetch failure.
while numFinished != numOutputParts:
try:
# Non-blocking get; `Queue.Empty` below implies the Python 2 Queue module.
evt = self.completionEvents.get(False)
except Queue.Empty:
self.check()
if self._shutdown:
sys.exit(1)
if failed and time.time() > lastFetchFailureTime + RESUBMIT_TIMEOUT:
self.updateCacheLocs()
for stage in failed:
logger.info("Resubmitting failed stages: %s", stage)
submitStage(stage)
failed.clear()
else:
time.sleep(0.1)
continue
task, reason = evt.task, evt.reason
stage = self.idToStage[task.stageId]
# Ignore completion events for stages this job is not tracking.
if stage not in pendingTasks: # stage from other job
continue
logger.debug("remove from pedding %s from %s", task, stage)
pendingTasks[stage].remove(task.id)
if isinstance(reason, Success):
# Merge the accumulator updates reported by the successful task.
Accumulator.merge(evt.accumUpdates)
if isinstance(task, ResultTask):
finished[task.outputId] = True
numFinished += 1
#......... part of the code is omitted here .........
示例2: runJob
# Required import: from dpark.accumulator import Accumulator  [as alias]
# Or: from dpark.accumulator.Accumulator import merge  [as alias]
def runJob(self, finalRdd, func, partitions, allowLocal):
# Later variant of runJob: additionally registers per-stage info for the
# dpark web UI (best effort — skipped when the web UI package is absent)
# and checkpoints RDDs when a stage finishes.
# NOTE(review): scraped listing — original indentation has been stripped
# and the tail of the function is elided below; not runnable as shown.
outputParts = list(partitions)
numOutputParts = len(partitions)
finalStage = self.newStage(finalRdd, None)
try:
from dpark.web.ui.views.rddopgraph import StageInfo
stage_info = StageInfo()
stage_info.create_stage_info(finalStage)
# Record StageInfo for this stage and (recursively) all of its parents
# into self.idToRunJob; only the final stage gets is_final=True.
def create_stage_info_recur(cur_stage, is_final=False):
if not cur_stage or cur_stage.id in self.idToRunJob:
return
for par_stage in cur_stage.parents:
create_stage_info_recur(par_stage)
if cur_stage.id not in self.idToRunJob:
self.idToRunJob[cur_stage.id] = StageInfo.idToStageInfo[cur_stage.id]
self.idToRunJob[cur_stage.id].is_final = is_final
create_stage_info_recur(finalStage, is_final=True)
except ImportError:
# The web UI is optional; run the job without stage-graph bookkeeping.
pass
results = [None] * numOutputParts
finished = [None] * numOutputParts
lastFinished = 0
numFinished = 0
waiting = set()
running = set()
failed = set()
pendingTasks = {}
lastFetchFailureTime = 0
self.updateCacheLocs()
logger.debug('Final stage: %s, %d', finalStage, numOutputParts)
logger.debug('Parents of final stage: %s', finalStage.parents)
logger.debug(
'Missing parents: %s',
self.getMissingParentStages(finalStage))
# Stage-completion hook: walks the stage's RDD dependency graph and calls
# _do_checkpoint() on each RDD.
# NOTE(review): MutableDict.merge() is invoked with no arguments here —
# its semantics depend on MutableDict, which is not visible in this
# listing; confirm against the dpark source.
def onStageFinished(stage):
def _(r, dep):
return r._do_checkpoint()
MutableDict.merge()
walk_dependencies(stage.rdd, _)
# Fast path: a single-partition job with no missing parent stages runs
# locally; the completion hook still fires for the final stage.
if (allowLocal and
(
not finalStage.parents or
not self.getMissingParentStages(finalStage)
) and numOutputParts == 1):
split = finalRdd.splits[outputParts[0]]
yield func(finalRdd.iterator(split))
onStageFinished(finalStage)
return
# Submit a stage once none of its parents are missing output; otherwise
# recurse into the missing parents first and park this stage in `waiting`.
def submitStage(stage):
logger.debug('submit stage %s', stage)
if stage not in waiting and stage not in running:
missing = self.getMissingParentStages(stage)
if not missing:
submitMissingTasks(stage)
running.add(stage)
else:
for parent in missing:
submitStage(parent)
waiting.add(stage)
# Build and submit the still-unfinished tasks of a stage: ResultTasks for
# the final stage, ShuffleMapTasks for intermediate (shuffle) stages.
def submitMissingTasks(stage):
myPending = pendingTasks.setdefault(stage, set())
tasks = []
# Stop querying preferred locations once any partition reports none.
have_prefer = True
if stage == finalStage:
for i in range(numOutputParts):
if not finished[i]:
part = outputParts[i]
if have_prefer:
locs = self.getPreferredLocs(finalRdd, part)
if not locs:
have_prefer = False
else:
locs = []
tasks.append(ResultTask(finalStage.id, finalRdd,
func, part, locs, i))
else:
for p in range(stage.numPartitions):
if not stage.outputLocs[p]:
if have_prefer:
locs = self.getPreferredLocs(stage.rdd, p)
if not locs:
have_prefer = False
else:
locs = []
tasks.append(ShuffleMapTask(stage.id, stage.rdd,
stage.shuffleDep, p, locs))
logger.debug('add to pending %s tasks', len(tasks))
myPending |= set(t.id for t in tasks)
self.submitTasks(tasks)
#......... part of the code is omitted here .........