This article collects typical usage examples of the EMRJobRunner class from Python's mrjob.emr module. If you are wondering what EMRJobRunner is for or how to use it, the hand-picked examples below should help.
Fifteen code examples of the EMRJobRunner class are shown, ordered by popularity by default.
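Most of the examples come from mrjob's own test suite, which constructs EMRJobRunner directly. In everyday use you rarely build the runner yourself; mrjob creates one for you when a job runs with the EMR runner. As orientation, here is a minimal sketch (the word-count job is a hypothetical illustration, not one of the examples below; the `-r emr` switch and `MRJob.run()` are standard mrjob usage):

from mrjob.job import MRJob

class MRWordCount(MRJob):
    """Hypothetical job, shown only to illustrate the runner."""

    def mapper(self, _, line):
        for word in line.split():
            yield word, 1

    def reducer(self, word, counts):
        yield word, sum(counts)

if __name__ == '__main__':
    # invoked as: python mr_word_count.py -r emr input.txt
    # '-r emr' makes mrjob create an EMRJobRunner internally
    MRWordCount.run()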
Example 1: test_with_all_job_flows
def test_with_all_job_flows(self):
    self.mock_emr_job_flows.update(JOB_FLOWS_BY_ID)

    emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()
    emr_conn.run_jobflow('no name', log_uri=None)

    main(['-q', '--no-conf'])

    lines = [line for line in StringIO(self.stdout.getvalue())]
    self.assertEqual(len(lines), len(JOB_FLOWS_BY_ID) - 1)
Example 2: yield_clusters
def yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Get relevant job flow information from EMR.

    :param float max_days_ago: If set, don't fetch job flows created more
                               than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = datetime.utcnow()

    emr_conn = EMRJobRunner(**runner_kwargs).make_emr_conn()

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if max_days_ago is not None:
        created_after = now - timedelta(days=max_days_ago)

    for cluster_summary in _yield_all_clusters(
            emr_conn, created_after=created_after):
        cluster_id = cluster_summary.id

        cluster = emr_conn.describe_cluster(cluster_id)
        cluster.steps = list(_yield_all_steps(emr_conn, cluster_id))
        cluster.bootstrapactions = list(
            _yield_all_bootstrap_actions(emr_conn, cluster_id))

        yield cluster
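A hedged usage sketch for the generator above (keyword arguments pass straight through to EMRJobRunner; the `id` attribute follows boto's describe_cluster response object, as used in the function body):

# list clusters created within the past week
for cluster in yield_clusters(max_days_ago=7, conf_paths=[]):
    print('%s: %d steps' % (cluster.id, len(cluster.steps)))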
Example 3: test_local_bootstrap_action
def test_local_bootstrap_action(self):
    # make sure that local bootstrap action scripts get uploaded to S3
    action_path = os.path.join(self.tmp_dir, 'apt-install.sh')
    with open(action_path, 'w') as f:
        f.write('for $pkg in $@; do sudo apt-get install $pkg; done\n')

    bootstrap_actions = [
        action_path + ' python-scipy mysql-server']

    runner = EMRJobRunner(conf_path=False,
                          bootstrap_actions=bootstrap_actions,
                          s3_sync_wait_time=0.01)
    job_flow_id = runner.make_persistent_job_flow()

    emr_conn = runner.make_emr_conn()
    job_flow = emr_conn.describe_jobflow(job_flow_id)
    actions = job_flow.bootstrapactions

    assert_equal(len(actions), 2)

    assert actions[0].path.startswith('s3://mrjob-')
    assert actions[0].path.endswith('/apt-install.sh')
    assert_equal(actions[0].name, 'apt-install.sh')
    assert_equal(actions[0].args, ['python-scipy', 'mysql-server'])

    # check for master bootstrap script
    assert actions[1].path.startswith('s3://mrjob-')
    assert actions[1].path.endswith('b.py')
    assert_equal(actions[1].args, [])
    assert_equal(actions[1].name, 'master')

    # make sure master bootstrap script is on S3
    assert runner.path_exists(actions[1].path)
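The bootstrap_actions option used in this test takes strings of the form "path arg1 arg2 ...", and the path may also point directly at S3. A hedged sketch using Amazon's stock configure-hadoop action (the S3 path is a historical EMR bootstrap action, cited from memory as an assumption):

runner = EMRJobRunner(
    conf_path=False,
    bootstrap_actions=[
        's3://elasticmapreduce/bootstrap-actions/configure-hadoop'
        ' -m mapred.tasktracker.map.tasks.maximum=4'])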
Example 4: test_create_scratch_uri
def test_create_scratch_uri(self):
    # "walrus" bucket will be ignored; it doesn't start with "mrjob-"
    self.add_mock_s3_data({'walrus': {}, 'zebra': {}})

    runner = EMRJobRunner(conf_path=False, s3_sync_wait_time=0.01)

    # bucket name should be mrjob- plus 16 random hex digits
    s3_scratch_uri = runner._opts['s3_scratch_uri']
    assert_equal(s3_scratch_uri[:11], 's3://mrjob-')
    assert_equal(s3_scratch_uri[27:], '/tmp/')

    # bucket shouldn't actually exist yet
    scratch_bucket, _ = parse_s3_uri(s3_scratch_uri)
    assert_not_in(scratch_bucket, self.mock_s3_fs.keys())

    # need to do something to ensure that the bucket actually gets
    # created. let's launch a (mock) job flow
    jfid = runner.make_persistent_job_flow()
    assert_in(scratch_bucket, self.mock_s3_fs.keys())
    runner.make_emr_conn().terminate_jobflow(jfid)

    # once our scratch bucket is created, we should re-use it
    runner2 = EMRJobRunner(conf_path=False)
    assert_equal(runner2._opts['s3_scratch_uri'], s3_scratch_uri)
Example 5: test_no_region
def test_no_region(self):
    runner = EMRJobRunner(conf_path=False)
    assert_equal(runner.make_emr_conn().endpoint,
                 'elasticmapreduce.amazonaws.com')
    assert_equal(runner.make_s3_conn().endpoint,
                 's3.amazonaws.com')
    assert_equal(runner._aws_region, '')
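For contrast with the no-region case, a sketch of what an explicit region would look like; the regional endpoint names are assumptions based on the endpoint scheme AWS used in this era of boto:

runner = EMRJobRunner(conf_path=False, aws_region='us-west-1')
# expected (assumed) regional endpoints:
#   runner.make_emr_conn().endpoint -> 'us-west-1.elasticmapreduce.amazonaws.com'
#   runner.make_s3_conn().endpoint  -> 's3-us-west-1.amazonaws.com'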
Example 6: test_attach_to_existing_job_flow
def test_attach_to_existing_job_flow(self):
    emr_conn = EMRJobRunner(conf_path=False).make_emr_conn()
    # set log_uri to None, so that when we describe the job flow, it
    # won't have the loguri attribute, to test Issue #112
    emr_job_flow_id = emr_conn.run_jobflow(
        name='Development Job Flow', log_uri=None)

    stdin = StringIO('foo\nbar\n')
    self.mock_emr_output = {(emr_job_flow_id, 1): [
        '1\t"bar"\n1\t"foo"\n2\tnull\n']}

    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path,
                           '--emr-job-flow-id', emr_job_flow_id])
    mr_job.sandbox(stdin=stdin)

    results = []
    with mr_job.make_runner() as runner:
        runner.run()

        # Issue 182: don't create the bootstrap script when
        # attaching to another job flow
        assert_equal(runner._master_bootstrap_script, None)

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

    assert_equal(sorted(results),
                 [(1, 'bar'), (1, 'foo'), (2, None)])
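Attaching by explicit id, as above, is one way to reuse a cluster; the same era of mrjob also supported job flow pooling, where the runner joins a compatible idle flow automatically. A minimal sketch using the option name that Example 15 passes to its test helper:

# joins an idle, compatible job flow if one exists, else starts one
runner = EMRJobRunner(conf_path=False, pool_emr_job_flows=True)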
Example 7: MRBossTestCase
class MRBossTestCase(MockBotoTestCase):

    def setUp(self):
        super(MRBossTestCase, self).setUp()
        self.make_runner()

    def tearDown(self):
        self.cleanup_runner()
        super(MRBossTestCase, self).tearDown()

    def make_runner(self):
        self.add_mock_s3_data({'walrus': {}})
        self.runner = EMRJobRunner(s3_sync_wait_time=0,
                                   s3_tmp_dir='s3://walrus/tmp',
                                   conf_paths=[])
        self.runner._s3_job_log_uri = BUCKET_URI + LOG_DIR
        self.prepare_runner_for_ssh(self.runner)
        self.output_dir = tempfile.mkdtemp(prefix='mrboss_wd')

    def cleanup_runner(self):
        """This method assumes ``prepare_runner_for_ssh()`` was called. That
        method isn't a "proper" setup method because it requires different
        arguments for different tests.
        """
        shutil.rmtree(self.output_dir)
        self.runner.cleanup()

    def test_one_node(self):
        mock_ssh_file('testmaster', 'some_file', b'file contents')

        run_on_all_nodes(self.runner, self.output_dir, ['cat', 'some_file'],
                         print_stderr=False)

        with open(os.path.join(self.output_dir, 'master', 'stdout'), 'r') as f:
            self.assertEqual(f.read().rstrip(), 'file contents')

        self.assertEqual(os.listdir(self.output_dir), ['master'])

    def test_two_nodes(self):
        self.add_slave()
        self.runner._opts['num_ec2_instances'] = 2

        mock_ssh_file('testmaster', 'some_file', b'file contents 1')
        mock_ssh_file('testmaster!testslave0', 'some_file', b'file contents 2')

        self.runner.fs  # force initialization of _ssh_fs

        run_on_all_nodes(self.runner, self.output_dir, ['cat', 'some_file'],
                         print_stderr=False)

        with open(os.path.join(self.output_dir, 'master', 'stdout'), 'r') as f:
            self.assertEqual(f.read().rstrip(), 'file contents 1')

        with open(os.path.join(self.output_dir, 'slave testslave0', 'stdout'),
                  'r') as f:
            self.assertEqual(f.read().strip(), 'file contents 2')

        self.assertEqual(sorted(os.listdir(self.output_dir)),
                         ['master', 'slave testslave0'])
Example 8: main
def main():
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()
    if args:
        option_parser.error('takes no arguments')

    # set up logging
    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    # create the persistent job flow
    runner_kwargs = {
        'conf_path': options.conf_path,
        'ec2_instance_type': options.ec2_instance_type,
        'ec2_master_instance_type': options.ec2_master_instance_type,
        'ec2_slave_instance_type': options.ec2_slave_instance_type,
        'label': options.label,
        'num_ec2_instances': options.num_ec2_instances,
        'owner': options.owner,
    }

    runner = EMRJobRunner(**runner_kwargs)
    emr_job_flow_id = runner.make_persistent_job_flow()
    print(emr_job_flow_id)
Example 9: find_waiting_flow
def find_waiting_flow(aws_access_key_id, aws_secret_access_key,
                      ssh_key_pair_file=''):
    job_runner = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                              aws_secret_access_key=aws_secret_access_key)
    emr_conn = job_runner.make_emr_conn()
    job_flows = emr_conn.describe_jobflows()

    # rank the states we care about so WAITING flows sort first
    d = {'WAITING': 0, 'STARTING': 1, 'RUNNING': 2}
    waiting_flows = []
    for flow in job_flows:
        try:
            if flow.state in d:
                job_id = flow.jobflowid
                ip_address = flow.masterpublicdnsname
                waiting_flows.append(
                    [d[flow.state], job_id, ip_address, flow.state])
                if ssh_key_pair_file != '':
                    print('ssh -i %s hadoop@%s' % (ssh_key_pair_file,
                                                   ip_address))
        except Exception:
            continue

    # the rank added at index 0 was only needed for sorting; drop it,
    # then convert the list of lists to a list of dicts
    waiting_flows = sorted(waiting_flows, key=itemgetter(0))
    waiting_flows = [i[1:] for i in waiting_flows]
    waiting_flows_dict = [{'flow_id': i[0], 'node': i[1], 'flow_state': i[2]}
                          for i in waiting_flows]

    for index, flow_dict in enumerate(waiting_flows_dict):
        print('%d %s %s %s' % (index, flow_dict['flow_id'],
                               flow_dict['node'], flow_dict['flow_state']))

    return waiting_flows_dict
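A hypothetical call site for the helper above (the credential variables and key file path are assumptions; the dict keys are the ones the helper builds):

flows = find_waiting_flow(AWS_ACCESS_KEY, AWS_SECRET_KEY,
                          ssh_key_pair_file='EMR.pem')
if flows:
    # flows are sorted so WAITING clusters come first
    print('%s at %s' % (flows[0]['flow_id'], flows[0]['node']))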
Example 10: test_blank_region
def test_blank_region(self):
    # a blank region should be treated the same as no region
    runner = EMRJobRunner(conf_path=False, aws_region='')
    assert_equal(runner.make_emr_conn().endpoint,
                 'elasticmapreduce.amazonaws.com')
    assert_equal(runner.make_s3_conn().endpoint,
                 's3.amazonaws.com')
    assert_equal(runner._aws_region, '')
Example 11: test_spark_script_step_without_mr_job_script
def test_spark_script_step_without_mr_job_script(self):
    spark_script_path = self.makefile('a_spark_script.py')
    steps = MRSparkScript(['--script', spark_script_path])._steps_desc()

    runner = EMRJobRunner(steps=steps, stdin=BytesIO())
    runner.run()
    runner.cleanup()
Example 12: test_spark_jar_step_without_mr_job_script
def test_spark_jar_step_without_mr_job_script(self):
    spark_jar_path = self.makefile('fireflies.jar')
    steps = MRSparkJar(['--jar', spark_jar_path])._steps_desc()

    runner = EMRJobRunner(steps=steps, stdin=BytesIO())
    runner.run()
    runner.cleanup()
Example 13: test_jar_step_without_mr_job_script
def test_jar_step_without_mr_job_script(self):
    jar_path = self.makefile('dora.jar')
    steps = MRJustAJar(['--jar', jar_path])._steps_desc()

    runner = EMRJobRunner(steps=steps, stdin=BytesIO(b'backpack'))
    runner.run()
    runner.cleanup()
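Examples 11 through 13 rely on small helper jobs from mrjob's test suite. A sketch of how such a jar-only job might be defined; this reconstruction is an assumption, though JarStep and the passthrough-option API are real mrjob features of this era:

from mrjob.job import MRJob
from mrjob.step import JarStep

class MRJustAJar(MRJob):
    """Hypothetical reconstruction: a job whose only step runs a jar."""

    def configure_options(self):
        super(MRJustAJar, self).configure_options()
        self.add_passthrough_option('--jar')

    def steps(self):
        return [JarStep(jar=self.options.jar)]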
Example 14: reducer_init
def reducer_init(self):
    # AWS_ACCESS_KEY and AWS_SECRET_KEY, plus the json and StringIO
    # imports, come from the surrounding module
    emr = EMRJobRunner(aws_access_key_id=AWS_ACCESS_KEY,
                       aws_secret_access_key=AWS_SECRET_KEY)
    idf_parts = emr.get_s3_keys('s3://6885public/jeffchan/term-idfs/')

    self.word_to_idf = dict()
    for part in idf_parts:
        # keep the response body in its own name so it doesn't shadow
        # the json module
        body = part.get_contents_as_string()
        for line in StringIO.StringIO(body):
            pair = json.loads(line)
            self.word_to_idf[pair['term']] = pair['idf']
Example 15: test_terminate_job_flow
def test_terminate_job_flow(self):
    jf_id = self.make_job_flow(pool_emr_job_flows=True)

    self.monkey_patch_argv('--quiet', '--no-conf', 'j-MOCKJOBFLOW0')
    terminate_main()

    emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()
    self.assertEqual(emr_conn.describe_jobflow(jf_id).state,
                     'TERMINATED')