This article collects typical usage examples of the Python method mrjob.emr.EMRJobRunner.make_emr_conn. If you have been wondering what EMRJobRunner.make_emr_conn does and how to use it, the curated examples below should help. You can also explore further usage examples of the containing class, mrjob.emr.EMRJobRunner.
The following shows 15 code examples of EMRJobRunner.make_emr_conn, sorted by popularity by default.
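Before diving in: in the boto-based mrjob releases these snippets come from, make_emr_conn() builds a boto EmrConnection from the runner's AWS credentials, region, and endpoint options. A minimal usage sketch, assuming only the API calls that appear in the examples below:

from mrjob.emr import EMRJobRunner

runner = EMRJobRunner(conf_path=False)
emr_conn = runner.make_emr_conn()  # a boto EmrConnection
# list recent job flows and their states
for flow in emr_conn.describe_jobflows():
    print flow.jobflowid, flow.state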
Example 1: test_create_scratch_uri
# Required import: from mrjob.emr import EMRJobRunner [as alias]
# Or: from mrjob.emr.EMRJobRunner import make_emr_conn [as alias]
def test_create_scratch_uri(self):
    # the "walrus" bucket will be ignored; it doesn't start with "mrjob-"
    self.add_mock_s3_data({'walrus': {}, 'zebra': {}})

    runner = EMRJobRunner(conf_path=False, s3_sync_wait_time=0.01)

    # the bucket name should be "mrjob-" plus 16 random hex digits
    s3_scratch_uri = runner._opts['s3_scratch_uri']
    assert_equal(s3_scratch_uri[:11], 's3://mrjob-')
    assert_equal(s3_scratch_uri[27:], '/tmp/')

    # the bucket shouldn't actually exist yet
    scratch_bucket, _ = parse_s3_uri(s3_scratch_uri)
    assert_not_in(scratch_bucket, self.mock_s3_fs.keys())

    # need to do something to ensure that the bucket actually gets
    # created. let's launch a (mock) job flow
    jfid = runner.make_persistent_job_flow()
    assert_in(scratch_bucket, self.mock_s3_fs.keys())
    runner.make_emr_conn().terminate_jobflow(jfid)

    # once our scratch bucket is created, we should re-use it
    runner2 = EMRJobRunner(conf_path=False)
    assert_equal(runner2._opts['s3_scratch_uri'], s3_scratch_uri)
Example 2: main
# Required import: from mrjob.emr import EMRJobRunner [as alias]
# Or: from mrjob.emr.EMRJobRunner import make_emr_conn [as alias]
def main(cl_args=None):
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args(cl_args)

    if len(args) != 1:
        option_parser.error('This tool takes exactly one argument.')
    emr_job_flow_id = args[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # terminate the job flow
    runner = EMRJobRunner(conf_paths=options.conf_paths)
    log.debug('Terminating job flow %s' % emr_job_flow_id)
    runner.make_emr_conn().terminate_jobflow(emr_job_flow_id)
    log.info('Terminated job flow %s' % emr_job_flow_id)
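Because main() takes an argv-style cl_args list, the tool can also be driven programmatically. A minimal sketch, with a hypothetical job flow ID:

main(['j-EXAMPLEJOBFLOW'])  # same as passing the ID on the command line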
Example 3: test_no_region
# Required import: from mrjob.emr import EMRJobRunner [as alias]
# Or: from mrjob.emr.EMRJobRunner import make_emr_conn [as alias]
def test_no_region(self):
    runner = EMRJobRunner(conf_path=False)
    assert_equal(runner.make_emr_conn().endpoint,
                 'elasticmapreduce.amazonaws.com')
    assert_equal(runner.make_s3_conn().endpoint,
                 's3.amazonaws.com')
    assert_equal(runner._aws_region, '')
Example 4: test_local_bootstrap_action
# Required import: from mrjob.emr import EMRJobRunner [as alias]
# Or: from mrjob.emr.EMRJobRunner import make_emr_conn [as alias]
def test_local_bootstrap_action(self):
    # make sure that local bootstrap action scripts get uploaded to S3
    action_path = os.path.join(self.tmp_dir, 'apt-install.sh')
    with open(action_path, 'w') as f:
        f.write('for $pkg in $@; do sudo apt-get install $pkg; done\n')

    bootstrap_actions = [
        action_path + ' python-scipy mysql-server']

    runner = EMRJobRunner(conf_path=False,
                          bootstrap_actions=bootstrap_actions,
                          s3_sync_wait_time=0.01)
    job_flow_id = runner.make_persistent_job_flow()

    emr_conn = runner.make_emr_conn()
    job_flow = emr_conn.describe_jobflow(job_flow_id)
    actions = job_flow.bootstrapactions

    assert_equal(len(actions), 2)

    assert actions[0].path.startswith('s3://mrjob-')
    assert actions[0].path.endswith('/apt-install.sh')
    assert_equal(actions[0].name, 'apt-install.sh')
    assert_equal(actions[0].args, ['python-scipy', 'mysql-server'])

    # check for the master bootstrap script
    assert actions[1].path.startswith('s3://mrjob-')
    assert actions[1].path.endswith('b.py')
    assert_equal(actions[1].args, [])
    assert_equal(actions[1].name, 'master')

    # make sure the master bootstrap script is on S3
    assert runner.path_exists(actions[1].path)
Example 5: main
# Required import: from mrjob.emr import EMRJobRunner [as alias]
# Or: from mrjob.emr.EMRJobRunner import make_emr_conn [as alias]
def main(cl_args=None):
    # parse command-line args
    option_parser = _make_option_parser()
    options, args = option_parser.parse_args(cl_args)

    if len(args) != 1:
        option_parser.error('This tool takes exactly one argument.')
    cluster_id = args[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # terminate the cluster
    runner = EMRJobRunner(**_runner_kwargs(options))
    log.debug('Terminating cluster %s' % cluster_id)
    runner.make_emr_conn().terminate_jobflow(cluster_id)
    log.info('Terminated cluster %s' % cluster_id)
Example 6: find_waiting_flow
# Required import: from mrjob.emr import EMRJobRunner [as alias]
# Or: from mrjob.emr.EMRJobRunner import make_emr_conn [as alias]
# Also needed: from operator import itemgetter
def find_waiting_flow(aws_access_key_id, aws_secret_access_key, ssh_key_pair_file=''):
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
    emr_conn = JobRunner.make_emr_conn()
    job_flows = emr_conn.describe_jobflows()
    # rank states so that WAITING sorts ahead of STARTING and RUNNING
    d = {'WAITING': 0, 'STARTING': 1, 'RUNNING': 2}
    waiting_flows = []
    for flow in job_flows:
        try:
            if flow.state in d:
                job_id = flow.jobflowid
                ip_address = flow.masterpublicdnsname
                waiting_flows.append([d[flow.state], job_id, ip_address, flow.state])
                if ssh_key_pair_file != '':
                    print 'ssh -i %s hadoop@%s' % (ssh_key_pair_file, ip_address)
        except Exception:
            # some flows don't have a master DNS name yet; skip them
            continue
    waiting_flows = sorted(waiting_flows, key=itemgetter(0))
    # the sort rank was prepended just for ordering; strip it off again
    waiting_flows = [i[1:] for i in waiting_flows]
    # convert the list of lists into a list of dicts
    waiting_flows_dict = [{'flow_id': i[0], 'node': i[1], 'flow_state': i[2]}
                          for i in waiting_flows]
    # print an indexed summary of the flows
    for index, flow_dict in enumerate(waiting_flows_dict):
        print index, flow_dict['flow_id'], flow_dict['node'], flow_dict['flow_state']
    return waiting_flows_dict
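A usage sketch for the function above; the credentials and key file are placeholders:

flows = find_waiting_flow('<access key id>', '<secret access key>',
                          ssh_key_pair_file='EMR.pem')
for f in flows:
    print f['flow_id'], f['node'], f['flow_state']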
Example 7: test_blank_region
# Required import: from mrjob.emr import EMRJobRunner [as alias]
# Or: from mrjob.emr.EMRJobRunner import make_emr_conn [as alias]
def test_blank_region(self):
    # a blank region should be treated the same as no region
    runner = EMRJobRunner(conf_path=False, aws_region='')
    assert_equal(runner.make_emr_conn().endpoint,
                 'elasticmapreduce.amazonaws.com')
    assert_equal(runner.make_s3_conn().endpoint,
                 's3.amazonaws.com')
    assert_equal(runner._aws_region, '')
Example 8: main
# Required import: from mrjob.emr import EMRJobRunner [as alias]
# Or: from mrjob.emr.EMRJobRunner import make_emr_conn [as alias]
def main():
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()

    if len(args) != 1:
        option_parser.error('takes exactly one argument')
    emr_job_flow_id = args[0]

    # set up logging
    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    # terminate the job flow
    runner = EMRJobRunner(conf_path=options.conf_path)
    log.debug('Terminating job flow %s' % emr_job_flow_id)
    runner.make_emr_conn().terminate_jobflow(emr_job_flow_id)
    log.info('Terminated job flow %s' % emr_job_flow_id)
Example 9: find_waiting_flow
# Required import: from mrjob.emr import EMRJobRunner [as alias]
# Or: from mrjob.emr.EMRJobRunner import make_emr_conn [as alias]
def find_waiting_flow(aws_access_key_id, aws_secret_access_key):
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
    emr_conn = JobRunner.make_emr_conn()
    job_flows = emr_conn.describe_jobflows()
    job_id = 'NONE'
    for flow in job_flows:
        if flow.state == 'WAITING':
            print flow, flow.name, flow.jobflowid, flow.state
            job_id = flow.jobflowid
    return job_id
Example 10: find_waiting_flow
# Required import: from mrjob.emr import EMRJobRunner [as alias]
# Or: from mrjob.emr.EMRJobRunner import make_emr_conn [as alias]
def find_waiting_flow(aws_access_key_id, aws_secret_access_key, ssh_key_pair_file=''):
    print (aws_access_key_id, aws_secret_access_key)
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
    emr_conn = JobRunner.make_emr_conn()
    job_flows = emr_conn.describe_jobflows()
    job_id = 'NONE'
    waiting_flows = []
    for flow in job_flows:
        if flow.state == 'WAITING':
            waiting_flows.append(flow)
            print flow.jobflowid, flow.state
            ip_address = flow.masterpublicdnsname
            if ssh_key_pair_file != '':
                print 'ssh -i %s hadoop@%s' % (ssh_key_pair_file, ip_address)
            job_id = flow.jobflowid
    return job_id
Example 11: test_bootstrap_actions_get_added
# Required import: from mrjob.emr import EMRJobRunner [as alias]
# Or: from mrjob.emr.EMRJobRunner import make_emr_conn [as alias]
def test_bootstrap_actions_get_added(self):
    bootstrap_actions = [
        's3://elasticmapreduce/bootstrap-actions/configure-hadoop -m,mapred.tasktracker.map.tasks.maximum=1',
        's3://foo/bar#xyzzy',  # use an alternate name for the script
    ]

    runner = EMRJobRunner(conf_path=False,
                          bootstrap_actions=bootstrap_actions,
                          s3_sync_wait_time=0.01)
    job_flow_id = runner.make_persistent_job_flow()

    emr_conn = runner.make_emr_conn()
    job_flow = emr_conn.describe_jobflow(job_flow_id)
    actions = job_flow.bootstrapactions

    assert_equal(len(actions), 3)

    assert_equal(
        actions[0].path,
        's3://elasticmapreduce/bootstrap-actions/configure-hadoop')
    assert_equal(
        actions[0].args,
        ['-m,mapred.tasktracker.map.tasks.maximum=1'])
    assert_equal(actions[0].name, 'configure-hadoop')

    assert_equal(actions[1].path, 's3://foo/bar')
    assert_equal(actions[1].args, [])
    assert_equal(actions[1].name, 'xyzzy')

    # check for the master bootstrap script
    assert actions[2].path.startswith('s3://mrjob-')
    assert actions[2].path.endswith('b.py')
    assert_equal(actions[2].args, [])
    assert_equal(actions[2].name, 'master')

    # make sure the master bootstrap script is on S3
    assert runner.path_exists(actions[2].path)
Example 12: inspect_and_maybe_terminate_job_flows
# Required import: from mrjob.emr import EMRJobRunner [as alias]
# Or: from mrjob.emr.EMRJobRunner import make_emr_conn [as alias]
def inspect_and_maybe_terminate_job_flows(
        conf_path=None,
        dry_run=False,
        max_hours_idle=None,
        mins_to_end_of_hour=None,
        now=None,
        pool_name=None,
        pooled_only=False,
        unpooled_only=False,
        max_mins_locked=None,
        quiet=False,
        **kwargs):
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_path=conf_path, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of job flow id, name, idle time (as a timedelta)
    to_terminate = []

    for jf in job_flows:
        # check if the job flow is done
        if is_job_flow_done(jf):
            num_done += 1
        # check if the job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1
        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1
        elif is_job_flow_running(jf):
            num_running += 1
        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug(
                'Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                (jf.jobflowid,
                 'pending' if pending else 'idle',
                 strip_microseconds(time_idle),
                 strip_microseconds(time_to_end_of_hour),
                 ('unpooled' if pool is None else 'in %s pool' % pool),
                 jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                    time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None and
                    (pending or
                     time_to_end_of_hour >= timedelta(
                         minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append((jf, pending, time_idle, time_to_end_of_hour))
# ... (rest of the code omitted) ...
Example 13: find_all_flows
# Required import: from mrjob.emr import EMRJobRunner [as alias]
# Or: from mrjob.emr.EMRJobRunner import make_emr_conn [as alias]
def find_all_flows(aws_access_key_id, aws_secret_access_key):
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
    print 'got job runner'
    emr_conn = JobRunner.make_emr_conn()
    print 'made EMR connection'
    return emr_conn.describe_jobflows()
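A quick usage sketch with placeholder credentials; describe_jobflows() returns boto job flow descriptions, so the attributes used elsewhere on this page apply:

for flow in find_all_flows('<access key id>', '<secret access key>'):
    print flow.jobflowid, flow.state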
Example 14: _maybe_terminate_clusters
# Required import: from mrjob.emr import EMRJobRunner [as alias]
# Or: from mrjob.emr.EMRJobRunner import make_emr_conn [as alias]
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_conn = runner.make_emr_conn()

    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_pending = 0
    num_running = 0

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cluster_summary in _yield_all_clusters(emr_conn):
        cluster_id = cluster_summary.id

        # check if the cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if the cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if the cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about the cluster
        steps = _list_all_steps(emr_conn, cluster_id)

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # the cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        time_to_end_of_hour = _est_time_to_hour(cluster_summary, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        bootstrap_actions = list(_yield_all_bootstrap_actions(
            emr_conn, cluster_id))
        _, pool = _pool_hash_and_name(bootstrap_actions)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug(
            'cluster %s %s for %s, %s to end of hour, %s (%s)' %
            (cluster_id,
             'pending' if is_pending else 'idle',
             strip_microseconds(time_idle),
             strip_microseconds(time_to_end_of_hour),
             ('unpooled' if pool is None else 'in %s pool' % pool),
             cluster_summary.name))

        # filter out clusters that don't meet our criteria
        if (max_hours_idle is not None and
                time_idle <= timedelta(hours=max_hours_idle)):
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if (mins_to_end_of_hour is not None and
                (is_pending or
                 time_to_end_of_hour >= timedelta(
                     minutes=mins_to_end_of_hour))):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        # terminate the idle cluster
        _terminate_and_notify(
# ... (rest of the code omitted) ...
Example 15: test_explicit_endpoints
# Required import: from mrjob.emr import EMRJobRunner [as alias]
# Or: from mrjob.emr.EMRJobRunner import make_emr_conn [as alias]
def test_explicit_endpoints(self):
    runner = EMRJobRunner(conf_path=False, aws_region='EU',
                          s3_endpoint='s3-proxy', emr_endpoint='emr-proxy')
    assert_equal(runner.make_emr_conn().endpoint, 'emr-proxy')
    assert_equal(runner.make_s3_conn().endpoint, 's3-proxy')
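For comparison with Examples 3 and 7, a hedged sketch of selecting a region without overriding the endpoints; the exact endpoint strings are illustrative, since they depend on the mrjob/boto version:

runner = EMRJobRunner(conf_path=False, aws_region='us-west-1')
print runner.make_emr_conn().endpoint  # region-specific EMR endpoint
print runner.make_s3_conn().endpoint   # region-specific S3 endpoint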