This page collects typical usage examples of the Python function mapreduce.util.create_datastore_write_config. If you are wondering what create_datastore_write_config does, how it is called, or what real code using it looks like, the curated examples below should help.
The following 15 code examples of create_datastore_write_config are shown, ordered roughly by popularity.
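Before the examples, here is a minimal sketch of what the helper does. This is not the library source: the handling of the "force_writes" parameter is inferred from Examples 9 and 13 below (testForceWrites / testDefaultConfig), and the datastore_rpc import reflects the usual App Engine API for datastore write configurations.

# Minimal sketch, not mapreduce/util.py itself: turns the optional
# "force_writes" mapreduce parameter into a datastore RPC configuration
# that the state-saving puts and deletes in the examples below share.
from google.appengine.datastore import datastore_rpc


def create_datastore_write_config(mapreduce_spec):
  """Creates a datastore write config from the job's parameters (sketch)."""
  force_writes = str(mapreduce_spec.params.get("force_writes", "false"))
  if force_writes.lower() == "true":
    # Let writes go through even when the datastore is in read-only mode.
    return datastore_rpc.Configuration(force_writes=True)
  return datastore_rpc.Configuration()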
Example 1: handle
def handle(self):
  """Handles kick off request."""
  spec = model.MapreduceSpec.from_json_str(
      self._get_required_param("mapreduce_spec"))
  app_id = self.request.get("app", None)
  queue_name = os.environ.get("HTTP_X_APPENGINE_QUEUENAME", "default")
  mapper_input_reader_class = spec.mapper.input_reader_class()

  # StartJobHandler might have already saved the state, but it's OK
  # to override it because we're using the same mapreduce id.
  state = model.MapreduceState.create_new(spec.mapreduce_id)
  state.mapreduce_spec = spec
  state.active = True
  # TODO(user): Initialize UI fields correctly.
  state.char_url = ""
  state.sparkline_url = ""
  if app_id:
    state.app_id = app_id

  input_readers = mapper_input_reader_class.split_input(spec.mapper)
  if not input_readers:
    # We don't have any data. Finish map.
    logging.warning("Found no mapper input data to process.")
    state.active = False
    state.active_shards = 0
    state.put(config=util.create_datastore_write_config(spec))
    return

  # Update state and spec with actual shard count.
  spec.mapper.shard_count = len(input_readers)
  state.active_shards = len(input_readers)
  state.mapreduce_spec = spec

  output_writer_class = spec.mapper.output_writer_class()
  if output_writer_class:
    output_writer_class.init_job(state)

  output_writers = []
  if output_writer_class:
    for shard_number in range(len(input_readers)):
      writer = output_writer_class.create(state, shard_number)
      assert isinstance(writer, output_writer_class)
      output_writers.append(writer)
  else:
    output_writers = [None for ir in input_readers]

  state.put(config=util.create_datastore_write_config(spec))

  KickOffJobHandler._schedule_shards(
      spec, input_readers, output_writers, queue_name, self.base_path())

  ControllerCallbackHandler.reschedule(
      state, self.base_path(), spec, queue_name=queue_name, serial_id=0)
Example 2: flush
def flush(self):
  """Flush all information recorded in context."""
  for pool in self._pools.values():
    pool.flush()
  if self.shard_state:
    self.shard_state.put(
        config=util.create_datastore_write_config(self.mapreduce_spec))
Example 3: _schedule_shards
def _schedule_shards(cls, spec, input_readers, queue_name, base_path):
  """Prepares shard states and schedules their execution.

  Args:
    spec: mapreduce specification as MapreduceSpec.
    input_readers: list of InputReaders describing shard splits.
    queue_name: The queue to run this job on.
    base_path: The base url path of mapreduce callbacks.
  """
  # Note: it's safe to re-attempt this handler because:
  # - shard state has a deterministic and unique key.
  # - schedule_slice will fall back gracefully if a task already exists.
  shard_states = []
  for shard_number, input_reader in enumerate(input_readers):
    shard = model.ShardState.create_new(spec.mapreduce_id, shard_number)
    shard.shard_description = str(input_reader)
    shard_states.append(shard)

  # Retrieve any already existing shards.
  existing_shard_states = db.get(shard.key() for shard in shard_states)
  existing_shard_keys = set(shard.key() for shard in existing_shard_states
                            if shard is not None)

  # Put only the shards that do not exist yet.
  db.put(
      (shard for shard in shard_states
       if shard.key() not in existing_shard_keys),
      config=util.create_datastore_write_config(spec),
  )

  for shard_number, input_reader in enumerate(input_readers):
    shard_id = model.ShardState.shard_id_from_number(
        spec.mapreduce_id, shard_number)
    MapperWorkerCallbackHandler.schedule_slice(
        base_path, spec, shard_id, 0, input_reader, queue_name=queue_name)
Example 4: _schedule_shards
def _schedule_shards(cls,
                     spec,
                     input_readers,
                     queue_name,
                     base_path,
                     mr_state):
  """Prepares shard states and schedules their execution.

  Args:
    spec: mapreduce specification as MapreduceSpec.
    input_readers: list of InputReaders describing shard splits.
    queue_name: The queue to run this job on.
    base_path: The base url path of mapreduce callbacks.
    mr_state: The MapreduceState of the current job.
  """
  # Note: it's safe to re-attempt this handler because:
  # - shard state has a deterministic and unique key.
  # - _schedule_slice will fall back gracefully if a task already exists.
  shard_states = []
  writer_class = spec.mapper.output_writer_class()
  output_writers = [None] * len(input_readers)
  for shard_number, input_reader in enumerate(input_readers):
    shard_state = model.ShardState.create_new(spec.mapreduce_id, shard_number)
    shard_state.shard_description = str(input_reader)
    if writer_class:
      output_writers[shard_number] = writer_class.create(
          mr_state, shard_state)
    shard_states.append(shard_state)

  # Retrieve any already existing shards.
  existing_shard_states = db.get(shard.key() for shard in shard_states)
  existing_shard_keys = set(shard.key() for shard in existing_shard_states
                            if shard is not None)

  # Put only the shards that do not exist yet.
  db.put((shard for shard in shard_states
          if shard.key() not in existing_shard_keys),
         config=util.create_datastore_write_config(spec))

  # Give each shard some quota to start with.
  processing_rate = int(spec.mapper.params.get(
      "processing_rate") or model._DEFAULT_PROCESSING_RATE_PER_SEC)
  quota_refill = processing_rate / len(shard_states)
  quota_manager = quota.QuotaManager(memcache.Client())
  for shard_state in shard_states:
    quota_manager.put(shard_state.shard_id, quota_refill)

  # Schedule shard tasks.
  for shard_number, (input_reader, output_writer) in enumerate(
      zip(input_readers, output_writers)):
    shard_id = model.ShardState.shard_id_from_number(
        spec.mapreduce_id, shard_number)
    MapperWorkerCallbackHandler._schedule_slice(
        shard_states[shard_number],
        model.TransientShardState(
            base_path, spec, shard_id, 0, input_reader, input_reader,
            output_writer=output_writer),
        queue_name=queue_name)
Example 5: handle
def handle(self):
  mapreduce_id = self.request.get("mapreduce_id")

  mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
  if mapreduce_state:
    config = util.create_datastore_write_config(
        mapreduce_state.mapreduce_spec)
    db.delete(model.MapreduceControl.get_key_by_job_id(mapreduce_id),
              config=config)

    shard_states = model.ShardState.find_by_mapreduce_state(mapreduce_state)
    for shard_state in shard_states:
      db.delete(util._HugeTaskPayload.all().ancestor(shard_state),
                config=config)
    db.delete(shard_states, config=config)
    db.delete(util._HugeTaskPayload.all().ancestor(mapreduce_state),
              config=config)
Example 6: _finalize_job
def _finalize_job(mapreduce_spec, mapreduce_state, base_path):
  """Finalize job execution.

  Finalizes the output writer, invokes the done callback and schedules
  finalize job execution.

  Args:
    mapreduce_spec: an instance of MapreduceSpec.
    mapreduce_state: an instance of MapreduceState.
    base_path: handler base path.
  """
  config = util.create_datastore_write_config(mapreduce_spec)

  # Only finalize the output writers if the job was successful.
  if (mapreduce_spec.mapper.output_writer_class() and
      mapreduce_state.result_status == model.MapreduceState.RESULT_SUCCESS):
    mapreduce_spec.mapper.output_writer_class().finalize_job(mapreduce_state)

  # Enqueue done_callback if needed.
  def put_state(state):
    state.put(config=config)
    done_callback = mapreduce_spec.params.get(
        model.MapreduceSpec.PARAM_DONE_CALLBACK)
    if done_callback:
      done_task = taskqueue.Task(
          url=done_callback,
          headers={"Mapreduce-Id": mapreduce_spec.mapreduce_id},
          method=mapreduce_spec.params.get("done_callback_method", "POST"))
      queue_name = mapreduce_spec.params.get(
          model.MapreduceSpec.PARAM_DONE_CALLBACK_QUEUE,
          "default")
      if not _run_task_hook(mapreduce_spec.get_hooks(),
                            "enqueue_done_task",
                            done_task,
                            queue_name):
        done_task.add(queue_name, transactional=True)
    FinalizeJobHandler.schedule(base_path, mapreduce_spec)

  db.run_in_transaction(put_state, mapreduce_state)
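The done_task built above POSTs to the URL configured as the job's done callback, with the job id carried in a "Mapreduce-Id" header. A hypothetical receiving handler could look like the sketch below; the class name and routing are illustrative and not part of the library, while the header name and model lookups come from the code shown here.

# Hypothetical receiver for the done callback enqueued above (illustrative;
# only the "Mapreduce-Id" header and the model lookups come from the library).
import webapp2

from mapreduce import model


class MapreduceDoneHandler(webapp2.RequestHandler):

  def post(self):
    mapreduce_id = self.request.headers.get("Mapreduce-Id")
    state = model.MapreduceState.get_by_job_id(mapreduce_id)
    if state and state.result_status == model.MapreduceState.RESULT_SUCCESS:
      # The job finished successfully; react here, e.g. start a follow-up job.
      pass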
Example 7: handle
def handle(self):
  """Handle request."""
  tstate = model.TransientShardState.from_request(self.request)
  spec = tstate.mapreduce_spec
  self._start_time = self._time()
  shard_id = tstate.shard_id

  shard_state, control = db.get([
      model.ShardState.get_key_by_shard_id(shard_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not shard_state:
    # We're letting this task die. It's up to the controller code to
    # reinitialize and restart the task.
    logging.error("State not found for shard ID %r; shutting down",
                  shard_id)
    return

  if not shard_state.active:
    logging.error("Shard is not active. Looks like spurious task execution.")
    return

  ctx = context.Context(spec, shard_state,
                        task_retry_count=self.task_retry_count())

  if control and control.command == model.MapreduceControl.ABORT:
    logging.info("Abort command received by shard %d of job '%s'",
                 shard_state.shard_number, shard_state.mapreduce_id)
    if tstate.output_writer:
      tstate.output_writer.finalize(ctx, shard_state.shard_number)
    # We received a command to abort. We don't care if we overwrite
    # some data.
    shard_state.active = False
    shard_state.result_status = model.ShardState.RESULT_ABORTED
    shard_state.put(config=util.create_datastore_write_config(spec))
    model.MapreduceControl.abort(spec.mapreduce_id)
    return

  input_reader = tstate.input_reader

  if spec.mapper.params.get("enable_quota", True):
    quota_consumer = quota.QuotaConsumer(
        quota.QuotaManager(memcache.Client()),
        shard_id,
        _QUOTA_BATCH_SIZE)
  else:
    quota_consumer = None

  context.Context._set(ctx)
  try:
    # Consume quota ahead of time, because we do not want to run a datastore
    # query if there's not enough quota for the shard.
    if not quota_consumer or quota_consumer.check():
      scan_aborted = False
      entity = None

      # We shouldn't fetch an entity from the reader if there's not enough
      # quota to process it. Perform all quota checks proactively.
      if not quota_consumer or quota_consumer.consume():
        for entity in input_reader:
          if isinstance(entity, db.Model):
            shard_state.last_work_item = repr(entity.key())
          else:
            shard_state.last_work_item = repr(entity)[:100]

          scan_aborted = not self.process_data(
              entity, input_reader, ctx, tstate)

          # Check if we've got enough quota for the next entity.
          if (quota_consumer and not scan_aborted and
              not quota_consumer.consume()):
            scan_aborted = True
          if scan_aborted:
            break
      else:
        scan_aborted = True

      if not scan_aborted:
        logging.info("Processing done for shard %d of job '%s'",
                     shard_state.shard_number, shard_state.mapreduce_id)
        # We consumed an extra quota item at the end of the for loop.
        # Just be nice here and give it back :)
        if quota_consumer:
          quota_consumer.put(1)
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_SUCCESS

    operation.counters.Increment(
        context.COUNTER_MAPPER_WALLTIME_MS,
        int((time.time() - self._start_time) * 1000))(ctx)

    # TODO(user): Mike said we don't want this to happen in case of an
    # exception while scanning. Figure out when it's appropriate to skip.
    ctx.flush()

    if not shard_state.active:
      # The shard is going to stop. Finalize the output writer, if any.
      if tstate.output_writer:
        tstate.output_writer.finalize(ctx, shard_state.shard_number)
# ... (the rest of this example is omitted) ...
Example 8: handle
def handle(self):
  """Handle request."""
  spec = model.MapreduceSpec.from_json_str(
      self.request.get("mapreduce_spec"))
  self._start_time = self._time()
  shard_id = self.shard_id()

  # TODO(user): Make this prettier
  logging.debug("post: shard=%s slice=%s headers=%s",
                shard_id, self.slice_id(), self.request.headers)

  shard_state, control = db.get([
      model.ShardState.get_key_by_shard_id(shard_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not shard_state:
    # We're letting this task die. It's up to the controller code to
    # reinitialize and restart the task.
    logging.error("State not found for shard ID %r; shutting down", shard_id)
    return

  if control and control.command == model.MapreduceControl.ABORT:
    logging.info("Abort command received by shard %d of job '%s'",
                 shard_state.shard_number, shard_state.mapreduce_id)
    shard_state.active = False
    shard_state.result_status = model.ShardState.RESULT_ABORTED
    shard_state.put(config=util.create_datastore_write_config(spec))
    model.MapreduceControl.abort(spec.mapreduce_id)
    return

  input_reader = self.input_reader(spec.mapper)

  if spec.mapper.params.get("enable_quota", True):
    quota_consumer = quota.QuotaConsumer(
        quota.QuotaManager(memcache.Client()), shard_id, _QUOTA_BATCH_SIZE)
  else:
    quota_consumer = None

  ctx = context.Context(spec, shard_state,
                        task_retry_count=self.task_retry_count())
  context.Context._set(ctx)

  try:
    # Consume quota ahead of time, because we do not want to run a datastore
    # query if there's not enough quota for the shard.
    if not quota_consumer or quota_consumer.check():
      scan_aborted = False
      entity = None

      # We shouldn't fetch an entity from the reader if there's not enough
      # quota to process it. Perform all quota checks proactively.
      if not quota_consumer or quota_consumer.consume():
        for entity in input_reader:
          if isinstance(entity, db.Model):
            shard_state.last_work_item = repr(entity.key())
          else:
            shard_state.last_work_item = repr(entity)[:100]

          scan_aborted = not self.process_entity(entity, ctx)

          # Check if we've got enough quota for the next entity.
          if (quota_consumer and not scan_aborted and
              not quota_consumer.consume()):
            scan_aborted = True
          if scan_aborted:
            break
      else:
        scan_aborted = True

      if not scan_aborted:
        logging.info("Processing done for shard %d of job '%s'",
                     shard_state.shard_number, shard_state.mapreduce_id)
        # We consumed an extra quota item at the end of the for loop.
        # Just be nice here and give it back :)
        if quota_consumer:
          quota_consumer.put(1)
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_SUCCESS

    # TODO(user): Mike said we don't want this to happen in case of an
    # exception while scanning. Figure out when it's appropriate to skip.
    ctx.flush()
  finally:
    context.Context._set(None)
    if quota_consumer:
      quota_consumer.dispose()

  # Rescheduling work should always be the last statement. It shouldn't
  # happen if there were any exceptions in the code before it.
  if shard_state.active:
    self.reschedule(spec, input_reader)
Example 9: testForceWrites
def testForceWrites(self):
  self.spec.params["force_writes"] = "True"
  config = util.create_datastore_write_config(self.spec)
  self.assertTrue(config)
  self.assertTrue(config.force_writes)
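For context, a hedged sketch of how a spec like the one exercised in this test might be built by hand. The constructor argument order mirrors Example 14 (_start_map) further below; the handler path, reader class, entity kind and job id are placeholders, not part of the library.

# Hedged usage sketch (not from the library's test suite): build a spec by
# hand, enable force_writes via the job parameters, then ask for the config.
from mapreduce import model
from mapreduce import util

mapper_spec = model.MapperSpec(
    "my_app.handlers.process_entity",                # mapper handler (placeholder)
    "mapreduce.input_readers.DatastoreInputReader",  # input reader
    {"entity_kind": "my_app.models.MyEntity"},       # mapper params (placeholder)
    8)                                               # shard_count

spec = model.MapreduceSpec(
    "example job",
    "mapreduce-id-123",                              # placeholder job id
    mapper_spec.to_json(),
    {"force_writes": "True"})                        # mapreduce params

config = util.create_datastore_write_config(spec)
assert config.force_writes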
Example 10: handle
def handle(self):
  """Handle request."""
  tstate = model.TransientShardState.from_request(self.request)
  spec = tstate.mapreduce_spec
  self._start_time = self._time()
  shard_id = tstate.shard_id

  shard_state, control = db.get([
      model.ShardState.get_key_by_shard_id(shard_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not shard_state:
    # We're letting this task die. It's up to the controller code to
    # reinitialize and restart the task.
    logging.error("State not found for shard ID %r; shutting down",
                  shard_id)
    return

  if not shard_state.active:
    logging.error("Shard is not active. Looks like spurious task execution.")
    return

  ctx = context.Context(spec, shard_state,
                        task_retry_count=self.task_retry_count())

  if control and control.command == model.MapreduceControl.ABORT:
    logging.info("Abort command received by shard %d of job '%s'",
                 shard_state.shard_number, shard_state.mapreduce_id)
    # NOTE: When aborting, specifically do not finalize the output writer
    # because it might be in a bad state.
    shard_state.active = False
    shard_state.result_status = model.ShardState.RESULT_ABORTED
    shard_state.put(config=util.create_datastore_write_config(spec))
    model.MapreduceControl.abort(spec.mapreduce_id)
    return

  input_reader = tstate.input_reader

  if spec.mapper.params.get("enable_quota", True):
    quota_consumer = quota.QuotaConsumer(
        quota.QuotaManager(memcache.Client()),
        shard_id,
        _QUOTA_BATCH_SIZE)
  else:
    quota_consumer = None

  # Tell NDB to never cache anything in memcache or in-process. This ensures
  # that entities fetched from Datastore input_readers via NDB will not bloat
  # up the request memory size and Datastore Puts will avoid doing calls
  # to memcache. Without this you get soft memory limit exits, which hurts
  # overall throughput.
  if ndb is not None:
    ndb_ctx = ndb.get_context()
    ndb_ctx.set_cache_policy(lambda key: False)
    ndb_ctx.set_memcache_policy(lambda key: False)

  context.Context._set(ctx)
  try:
    # Consume quota ahead of time, because we do not want to run a datastore
    # query if there's not enough quota for the shard.
    if not quota_consumer or quota_consumer.check():
      scan_aborted = False
      entity = None

      try:
        # We shouldn't fetch an entity from the reader if there's not enough
        # quota to process it. Perform all quota checks proactively.
        if not quota_consumer or quota_consumer.consume(verbose=True):
          for entity in input_reader:
            if isinstance(entity, db.Model):
              shard_state.last_work_item = repr(entity.key())
            else:
              shard_state.last_work_item = repr(entity)[:100]

            scan_aborted = not self.process_data(
                entity, input_reader, ctx, tstate)

            # Check if we've got enough quota for the next entity.
            if (quota_consumer and not scan_aborted and
                not quota_consumer.consume(verbose=True)):
              scan_aborted = True
            if scan_aborted:
              break
        else:
          scan_aborted = True

        if not scan_aborted:
          logging.info("Processing done for shard %d of job '%s'",
                       shard_state.shard_number, shard_state.mapreduce_id)
          # We consumed an extra quota item at the end of the for loop.
          # Just be nice here and give it back :)
          if quota_consumer:
            quota_consumer.put(1)
          shard_state.active = False
          shard_state.result_status = model.ShardState.RESULT_SUCCESS

        operation.counters.Increment(
            context.COUNTER_MAPPER_WALLTIME_MS,
            int((time.time() - self._start_time) * 1000))(ctx)
# ... (the rest of this example is omitted) ...
Example 11: tx
        shard_state.result_status = model.ShardState.RESULT_FAILED
      except errors.FailJobError, e:
        logging.error("Job failed: %s", e)
        scan_aborted = True
        shard_state.active = False
        shard_state.result_status = model.ShardState.RESULT_FAILED

    if not shard_state.active:
      # The shard is going to stop. Don't finalize the output writer unless
      # the job is going to be successful, because the writer might otherwise
      # be stuck in some bad state.
      if (shard_state.result_status == model.ShardState.RESULT_SUCCESS and
          tstate.output_writer):
        tstate.output_writer.finalize(ctx, shard_state.shard_number)

    config = util.create_datastore_write_config(spec)
    # We don't want the shard state to override the active state, since that
    # may get job execution stuck (see issue 116). Do a transactional
    # verification of the status.
    # TODO(user): this might still result in some data inconsistency
    # which can be avoided. It doesn't seem to be worth it now, because
    # various crashes might result in all sorts of data inconsistencies
    # anyway.
    @db.transactional(retries=5)
    def tx():
      fresh_shard_state = db.get(
          model.ShardState.get_key_by_shard_id(shard_id))
      if not fresh_shard_state:
        raise db.Rollback()
      if (not fresh_shard_state.active or
          "worker_active_state_collision" in _TEST_INJECTED_FAULTS):
Example 12: handle
def handle(self):
  """Handle request."""
  tstate = model.TransientShardState.from_request(self.request)
  spec = tstate.mapreduce_spec
  self._start_time = self._time()
  shard_id = tstate.shard_id

  shard_state, control = db.get([
      model.ShardState.get_key_by_shard_id(shard_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not shard_state:
    # We're letting this task die. It's up to the controller code to
    # reinitialize and restart the task.
    logging.error("State not found for shard ID %r; shutting down",
                  shard_id)
    return

  if not shard_state.active:
    logging.error("Shard is not active. Looks like spurious task execution.")
    return

  ctx = context.Context(spec, shard_state,
                        task_retry_count=self.task_retry_count())

  if control and control.command == model.MapreduceControl.ABORT:
    logging.info("Abort command received by shard %d of job '%s'",
                 shard_state.shard_number, shard_state.mapreduce_id)
    # NOTE: When aborting, specifically do not finalize the output writer
    # because it might be in a bad state.
    shard_state.active = False
    shard_state.result_status = model.ShardState.RESULT_ABORTED
    shard_state.put(config=util.create_datastore_write_config(spec))
    model.MapreduceControl.abort(spec.mapreduce_id)
    return

  input_reader = tstate.input_reader

  if spec.mapper.params.get("enable_quota", True):
    quota_consumer = quota.QuotaConsumer(
        quota.QuotaManager(memcache.Client()),
        shard_id,
        _QUOTA_BATCH_SIZE)
  else:
    quota_consumer = None

  # Tell NDB to never cache anything in memcache or in-process. This ensures
  # that entities fetched from Datastore input_readers via NDB will not bloat
  # up the request memory size and Datastore Puts will avoid doing calls
  # to memcache. Without this you get soft memory limit exits, which hurts
  # overall throughput.
  if ndb is not None:
    ndb_ctx = ndb.get_context()
    ndb_ctx.set_cache_policy(lambda key: False)
    ndb_ctx.set_memcache_policy(lambda key: False)

  context.Context._set(ctx)
  try:
    self.process_inputs(
        input_reader, shard_state, tstate, quota_consumer, ctx)

    if not shard_state.active:
      # The shard is going to stop. Finalize the output writer only when the
      # shard was successful, because the writer might be stuck in some bad
      # state otherwise.
      if (shard_state.result_status == model.ShardState.RESULT_SUCCESS and
          tstate.output_writer):
        tstate.output_writer.finalize(ctx, shard_state.shard_number)

    config = util.create_datastore_write_config(spec)
    # We don't want the shard state to override the active state, since that
    # may get job execution stuck (see issue 116). Do a transactional
    # verification of the status.
    # TODO(user): this might still result in some data inconsistency
    # which can be avoided. It doesn't seem to be worth it now, because
    # various crashes might result in all sorts of data inconsistencies
    # anyway.
    @db.transactional(retries=5)
    def tx():
      fresh_shard_state = db.get(
          model.ShardState.get_key_by_shard_id(shard_id))
      if not fresh_shard_state:
        raise db.Rollback()
      if (not fresh_shard_state.active or
          "worker_active_state_collision" in _TEST_INJECTED_FAULTS):
        shard_state.active = False
        logging.error("Spurious task execution. Aborting the shard.")
        return
      fresh_shard_state.copy_from(shard_state)
      fresh_shard_state.put(config=config)
    tx()
  finally:
    context.Context._set(None)
    if quota_consumer:
      quota_consumer.dispose()

  # Rescheduling work should always be the last statement. It shouldn't
  # happen if there were any exceptions in the code before it.
  if shard_state.active:
    self.reschedule(shard_state, tstate)
# ... (the rest of this example is omitted) ...
Example 13: testDefaultConfig
def testDefaultConfig(self):
  config = util.create_datastore_write_config(self.spec)
  self.assertTrue(config)
  self.assertFalse(config.force_writes)
Example 14: _start_map
def _start_map(cls, name, mapper_spec,
               mapreduce_params,
               base_path=None,
               queue_name=None,
               eta=None,
               countdown=None,
               hooks_class_name=None,
               _app=None,
               transactional=False):
  queue_name = queue_name or os.environ.get("HTTP_X_APPENGINE_QUEUENAME",
                                            "default")
  if queue_name[0] == "_":
    # We are currently in some special queue. E.g. __cron.
    queue_name = "default"

  # Check that handler can be instantiated.
  mapper_spec.get_handler()

  # Check that reader can be instantiated and is configured correctly.
  mapper_input_reader_class = mapper_spec.input_reader_class()
  mapper_input_reader_class.validate(mapper_spec)

  mapper_output_writer_class = mapper_spec.output_writer_class()
  if mapper_output_writer_class:
    mapper_output_writer_class.validate(mapper_spec)

  mapreduce_id = model.MapreduceState.new_mapreduce_id()
  mapreduce_spec = model.MapreduceSpec(
      name,
      mapreduce_id,
      mapper_spec.to_json(),
      mapreduce_params,
      hooks_class_name)

  kickoff_params = {"mapreduce_spec": mapreduce_spec.to_json_str()}
  if _app:
    kickoff_params["app"] = _app
  kickoff_worker_task = util.HugeTask(
      url=base_path + "/kickoffjob_callback",
      params=kickoff_params,
      eta=eta,
      countdown=countdown)

  hooks = mapreduce_spec.get_hooks()
  config = util.create_datastore_write_config(mapreduce_spec)

  def start_mapreduce():
    parent = None
    if not transactional:
      # Save state in datastore so that UI can see it.
      # We can't save state in foreign transaction, but conventional UI
      # doesn't ask for transactional starts anyway.
      state = model.MapreduceState.create_new(mapreduce_spec.mapreduce_id)
      state.mapreduce_spec = mapreduce_spec
      state.active = True
      state.active_shards = mapper_spec.shard_count
      if _app:
        state.app_id = _app
      state.put(config=config)
      parent = state

    if hooks is not None:
      try:
        hooks.enqueue_kickoff_task(kickoff_worker_task, queue_name)
      except NotImplementedError:
        # Use the default task addition implementation.
        pass
      else:
        return
    kickoff_worker_task.add(queue_name, transactional=True, parent=parent)

  if transactional:
    start_mapreduce()
  else:
    db.run_in_transaction(start_mapreduce)

  return mapreduce_id
Example 15: handle
def handle(self):
  """Handle request."""
  tstate = model.TransientShardState.from_request(self.request)
  spec = tstate.mapreduce_spec
  self._start_time = self._time()
  shard_id = tstate.shard_id

  shard_state, control = db.get([
      model.ShardState.get_key_by_shard_id(shard_id),
      model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
  ])
  if not shard_state:
    # We're letting this task die. It's up to the controller code to
    # reinitialize and restart the task.
    logging.error("State not found for shard ID %r; shutting down",
                  shard_id)
    return

  if not shard_state.active:
    logging.error("Shard is not active. Looks like spurious task execution.")
    return
  if shard_state.retries > tstate.retries:
    logging.error(
        "Got shard %s from previous shard retry %s. Drop",
        shard_state.shard_id,
        tstate.retries)
    return
  elif shard_state.retries < tstate.retries:
    # This happens when the transaction that updates the shard state and
    # enqueues the task fails after the task has already been added. That
    # transaction will be retried; adding the same task again raises
    # TaskAlreadyExistsError, but that error is ignored.
    raise ValueError(
        "ShardState for %s is behind slice. Waiting for it to catch up." %
        shard_state.shard_id)

  ctx = context.Context(spec, shard_state,
                        task_retry_count=self.task_retry_count())

  if control and control.command == model.MapreduceControl.ABORT:
    logging.info("Abort command received by shard %d of job '%s'",
                 shard_state.shard_number, shard_state.mapreduce_id)
    # NOTE: When aborting, specifically do not finalize the output writer
    # because it might be in a bad state.
    shard_state.active = False
    shard_state.result_status = model.ShardState.RESULT_ABORTED
    shard_state.put(config=util.create_datastore_write_config(spec))
    model.MapreduceControl.abort(spec.mapreduce_id)
    return

  input_reader = tstate.input_reader

  if spec.mapper.params.get("enable_quota", True):
    quota_consumer = quota.QuotaConsumer(
        quota.QuotaManager(memcache.Client()),
        shard_id,
        _QUOTA_BATCH_SIZE)
  else:
    quota_consumer = None

  # Tell NDB to never cache anything in memcache or in-process. This ensures
  # that entities fetched from Datastore input_readers via NDB will not bloat
  # up the request memory size and Datastore Puts will avoid doing calls
  # to memcache. Without this you get soft memory limit exits, which hurts
  # overall throughput.
  if ndb is not None:
    ndb_ctx = ndb.get_context()
    ndb_ctx.set_cache_policy(lambda key: False)
    ndb_ctx.set_memcache_policy(lambda key: False)

  context.Context._set(ctx)
  retry_shard = False

  try:
    self.process_inputs(
        input_reader, shard_state, tstate, quota_consumer, ctx)

    if not shard_state.active:
      # The shard is going to stop. Finalize the output writer only when the
      # shard was successful, because the writer might be stuck in some bad
      # state otherwise.
      if (shard_state.result_status == model.ShardState.RESULT_SUCCESS and
          tstate.output_writer):
        tstate.output_writer.finalize(ctx, shard_state)
  # pylint: disable=broad-except
  except Exception, e:
    retry_shard = self._retry_logic(e, shard_state, tstate, spec.mapreduce_id)