This article collects typical usage examples of the EMRJobRunner class from Python's mrjob.emr module. If you are wondering what EMRJobRunner is for or how to use it, the hand-picked examples below should help.
Fifteen code examples of the EMRJobRunner class are shown, ordered by popularity by default.
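Most of the examples come from mrjob's own test suite, which constructs EMRJobRunner directly. In everyday use you rarely build the runner yourself; mrjob creates one for you when a job runs with the EMR runner. As orientation, here is a minimal sketch (the word-count job is a hypothetical illustration, not one of the examples below; the `-r emr` switch and `MRJob.run()` are standard mrjob usage):

from mrjob.job import MRJob

class MRWordCount(MRJob):
    """Hypothetical job, shown only to illustrate the runner."""

    def mapper(self, _, line):
        for word in line.split():
            yield word, 1

    def reducer(self, word, counts):
        yield word, sum(counts)

if __name__ == '__main__':
    # invoked as: python mr_word_count.py -r emr input.txt
    # '-r emr' makes mrjob create an EMRJobRunner internally
    MRWordCount.run()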
Example 1: test_with_all_job_flows
def test_with_all_job_flows(self):
    self.mock_emr_job_flows.update(JOB_FLOWS_BY_ID)

    emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()
    emr_conn.run_jobflow('no name', log_uri=None)

    main(['-q', '--no-conf'])

    lines = [line for line in StringIO(self.stdout.getvalue())]
    self.assertEqual(len(lines), len(JOB_FLOWS_BY_ID) - 1)
Example 2: yield_clusters
def yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Get relevant job flow information from EMR.

    :param float max_days_ago: If set, don't fetch job flows created more
                               than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = datetime.utcnow()

    emr_conn = EMRJobRunner(**runner_kwargs).make_emr_conn()

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if max_days_ago is not None:
        created_after = now - timedelta(days=max_days_ago)

    for cluster_summary in _yield_all_clusters(
            emr_conn, created_after=created_after):
        cluster_id = cluster_summary.id

        cluster = emr_conn.describe_cluster(cluster_id)
        cluster.steps = list(_yield_all_steps(emr_conn, cluster_id))
        cluster.bootstrapactions = list(
            _yield_all_bootstrap_actions(emr_conn, cluster_id))

        yield cluster
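A hedged usage sketch for the generator above (keyword arguments pass straight through to EMRJobRunner; the `id` attribute follows boto's describe_cluster response object, as used in the function body):

# list clusters created within the past week
for cluster in yield_clusters(max_days_ago=7, conf_paths=[]):
    print('%s: %d steps' % (cluster.id, len(cluster.steps)))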
Example 3: test_local_bootstrap_action
def test_local_bootstrap_action(self):
    # make sure that local bootstrap action scripts get uploaded to S3
    action_path = os.path.join(self.tmp_dir, 'apt-install.sh')
    with open(action_path, 'w') as f:
        f.write('for $pkg in $@; do sudo apt-get install $pkg; done\n')

    bootstrap_actions = [
        action_path + ' python-scipy mysql-server']

    runner = EMRJobRunner(conf_path=False,
                          bootstrap_actions=bootstrap_actions,
                          s3_sync_wait_time=0.01)
    job_flow_id = runner.make_persistent_job_flow()

    emr_conn = runner.make_emr_conn()
    job_flow = emr_conn.describe_jobflow(job_flow_id)
    actions = job_flow.bootstrapactions

    assert_equal(len(actions), 2)

    assert actions[0].path.startswith('s3://mrjob-')
    assert actions[0].path.endswith('/apt-install.sh')
    assert_equal(actions[0].name, 'apt-install.sh')
    assert_equal(actions[0].args, ['python-scipy', 'mysql-server'])

    # check for master bootstrap script
    assert actions[1].path.startswith('s3://mrjob-')
    assert actions[1].path.endswith('b.py')
    assert_equal(actions[1].args, [])
    assert_equal(actions[1].name, 'master')

    # make sure master bootstrap script is on S3
    assert runner.path_exists(actions[1].path)
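The bootstrap_actions option used in this test takes strings of the form "path arg1 arg2 ...", and the path may also point directly at S3. A hedged sketch using Amazon's stock configure-hadoop action (the S3 path is a historical EMR bootstrap action, cited from memory as an assumption):

runner = EMRJobRunner(
    conf_path=False,
    bootstrap_actions=[
        's3://elasticmapreduce/bootstrap-actions/configure-hadoop'
        ' -m mapred.tasktracker.map.tasks.maximum=4'])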
Example 4: test_create_scratch_uri
def test_create_scratch_uri(self):
    # "walrus" bucket will be ignored; it doesn't start with "mrjob-"
    self.add_mock_s3_data({'walrus': {}, 'zebra': {}})

    runner = EMRJobRunner(conf_path=False, s3_sync_wait_time=0.01)

    # bucket name should be mrjob- plus 16 random hex digits
    s3_scratch_uri = runner._opts['s3_scratch_uri']
    assert_equal(s3_scratch_uri[:11], 's3://mrjob-')
    assert_equal(s3_scratch_uri[27:], '/tmp/')

    # bucket shouldn't actually exist yet
    scratch_bucket, _ = parse_s3_uri(s3_scratch_uri)
    assert_not_in(scratch_bucket, self.mock_s3_fs.keys())

    # need to do something to ensure that the bucket actually gets
    # created. let's launch a (mock) job flow
    jfid = runner.make_persistent_job_flow()
    assert_in(scratch_bucket, self.mock_s3_fs.keys())
    runner.make_emr_conn().terminate_jobflow(jfid)

    # once our scratch bucket is created, we should re-use it
    runner2 = EMRJobRunner(conf_path=False)
    assert_equal(runner2._opts['s3_scratch_uri'], s3_scratch_uri)
Example 5: test_no_region
def test_no_region(self):
    runner = EMRJobRunner(conf_path=False)
    assert_equal(runner.make_emr_conn().endpoint,
                 'elasticmapreduce.amazonaws.com')
    assert_equal(runner.make_s3_conn().endpoint,
                 's3.amazonaws.com')
    assert_equal(runner._aws_region, '')
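For contrast with the no-region case, a sketch of what an explicit region would look like; the regional endpoint names are assumptions based on the endpoint scheme AWS used in this era of boto:

runner = EMRJobRunner(conf_path=False, aws_region='us-west-1')
# expected (assumed) regional endpoints:
#   runner.make_emr_conn().endpoint -> 'us-west-1.elasticmapreduce.amazonaws.com'
#   runner.make_s3_conn().endpoint  -> 's3-us-west-1.amazonaws.com'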
Example 6: test_attach_to_existing_job_flow
def test_attach_to_existing_job_flow(self):
    emr_conn = EMRJobRunner(conf_path=False).make_emr_conn()
    # set log_uri to None, so that when we describe the job flow, it
    # won't have the loguri attribute, to test Issue #112
    emr_job_flow_id = emr_conn.run_jobflow(
        name='Development Job Flow', log_uri=None)

    stdin = StringIO('foo\nbar\n')
    self.mock_emr_output = {(emr_job_flow_id, 1): [
        '1\t"bar"\n1\t"foo"\n2\tnull\n']}

    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path,
                           '--emr-job-flow-id', emr_job_flow_id])
    mr_job.sandbox(stdin=stdin)

    results = []
    with mr_job.make_runner() as runner:
        runner.run()

        # Issue 182: don't create the bootstrap script when
        # attaching to another job flow
        assert_equal(runner._master_bootstrap_script, None)

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

    assert_equal(sorted(results),
                 [(1, 'bar'), (1, 'foo'), (2, None)])
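Attaching by explicit id, as above, is one way to reuse a cluster; the same era of mrjob also supported job flow pooling, where the runner joins a compatible idle flow automatically. A minimal sketch using the option name that Example 15 passes to its test helper:

# joins an idle, compatible job flow if one exists, else starts one
runner = EMRJobRunner(conf_path=False, pool_emr_job_flows=True)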
Example 7: MRBossTestCase
class MRBossTestCase(MockBotoTestCase):

    def setUp(self):
        super(MRBossTestCase, self).setUp()
        self.make_runner()

    def tearDown(self):
        self.cleanup_runner()
        super(MRBossTestCase, self).tearDown()

    def make_runner(self):
        self.add_mock_s3_data({'walrus': {}})
        self.runner = EMRJobRunner(s3_sync_wait_time=0,
                                   s3_tmp_dir='s3://walrus/tmp',
                                   conf_paths=[])
        self.runner._s3_job_log_uri = BUCKET_URI + LOG_DIR
        self.prepare_runner_for_ssh(self.runner)
        self.output_dir = tempfile.mkdtemp(prefix='mrboss_wd')

    def cleanup_runner(self):
        """This method assumes ``prepare_runner_for_ssh()`` was called. That
        method isn't a "proper" setup method because it requires different
        arguments for different tests.
        """
        shutil.rmtree(self.output_dir)
        self.runner.cleanup()

    def test_one_node(self):
        mock_ssh_file('testmaster', 'some_file', b'file contents')

        run_on_all_nodes(self.runner, self.output_dir, ['cat', 'some_file'],
                         print_stderr=False)

        with open(os.path.join(self.output_dir, 'master', 'stdout'), 'r') as f:
            self.assertEqual(f.read().rstrip(), 'file contents')

        self.assertEqual(os.listdir(self.output_dir), ['master'])

    def test_two_nodes(self):
        self.add_slave()
        self.runner._opts['num_ec2_instances'] = 2

        mock_ssh_file('testmaster', 'some_file', b'file contents 1')
        mock_ssh_file('testmaster!testslave0', 'some_file', b'file contents 2')

        self.runner.fs  # force initialization of _ssh_fs

        run_on_all_nodes(self.runner, self.output_dir, ['cat', 'some_file'],
                         print_stderr=False)

        with open(os.path.join(self.output_dir, 'master', 'stdout'), 'r') as f:
            self.assertEqual(f.read().rstrip(), 'file contents 1')

        with open(os.path.join(self.output_dir, 'slave testslave0', 'stdout'),
                  'r') as f:
            self.assertEqual(f.read().strip(), 'file contents 2')

        self.assertEqual(sorted(os.listdir(self.output_dir)),
                         ['master', 'slave testslave0'])
Example 8: main
def main():
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()
    if args:
        option_parser.error('takes no arguments')

    # set up logging
    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    # create the persistent job flow
    runner_kwargs = {
        'conf_path': options.conf_path,
        'ec2_instance_type': options.ec2_instance_type,
        'ec2_master_instance_type': options.ec2_master_instance_type,
        'ec2_slave_instance_type': options.ec2_slave_instance_type,
        'label': options.label,
        'num_ec2_instances': options.num_ec2_instances,
        'owner': options.owner,
    }

    runner = EMRJobRunner(**runner_kwargs)
    emr_job_flow_id = runner.make_persistent_job_flow()
    print(emr_job_flow_id)
Example 9: find_waiting_flow
def find_waiting_flow(aws_access_key_id, aws_secret_access_key,
                      ssh_key_pair_file=''):
    job_runner = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                              aws_secret_access_key=aws_secret_access_key)
    emr_conn = job_runner.make_emr_conn()
    job_flows = emr_conn.describe_jobflows()

    # rank the states we care about so WAITING flows sort first
    d = {'WAITING': 0, 'STARTING': 1, 'RUNNING': 2}
    waiting_flows = []
    for flow in job_flows:
        try:
            if flow.state in d:
                job_id = flow.jobflowid
                ip_address = flow.masterpublicdnsname
                waiting_flows.append(
                    [d[flow.state], job_id, ip_address, flow.state])
                if ssh_key_pair_file != '':
                    print('ssh -i %s hadoop@%s' % (ssh_key_pair_file,
                                                   ip_address))
        except Exception:
            continue

    # the rank added at index 0 was only needed for sorting; drop it,
    # then convert the list of lists to a list of dicts
    waiting_flows = sorted(waiting_flows, key=itemgetter(0))
    waiting_flows = [i[1:] for i in waiting_flows]
    waiting_flows_dict = [{'flow_id': i[0], 'node': i[1], 'flow_state': i[2]}
                          for i in waiting_flows]

    for index, flow_dict in enumerate(waiting_flows_dict):
        print('%d %s %s %s' % (index, flow_dict['flow_id'],
                               flow_dict['node'], flow_dict['flow_state']))

    return waiting_flows_dict
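A hypothetical call site for the helper above (the credential variables and key file path are assumptions; the dict keys are the ones the helper builds):

flows = find_waiting_flow(AWS_ACCESS_KEY, AWS_SECRET_KEY,
                          ssh_key_pair_file='EMR.pem')
if flows:
    # flows are sorted so WAITING clusters come first
    print('%s at %s' % (flows[0]['flow_id'], flows[0]['node']))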
Example 10: test_blank_region
def test_blank_region(self):
    # a blank region should be treated the same as no region
    runner = EMRJobRunner(conf_path=False, aws_region='')
    assert_equal(runner.make_emr_conn().endpoint,
                 'elasticmapreduce.amazonaws.com')
    assert_equal(runner.make_s3_conn().endpoint,
                 's3.amazonaws.com')
    assert_equal(runner._aws_region, '')
Example 11: test_spark_script_step_without_mr_job_script
def test_spark_script_step_without_mr_job_script(self):
    spark_script_path = self.makefile('a_spark_script.py')
    steps = MRSparkScript(['--script', spark_script_path])._steps_desc()

    runner = EMRJobRunner(steps=steps, stdin=BytesIO())
    runner.run()
    runner.cleanup()
Example 12: test_spark_jar_step_without_mr_job_script
def test_spark_jar_step_without_mr_job_script(self):
    spark_jar_path = self.makefile('fireflies.jar')
    steps = MRSparkJar(['--jar', spark_jar_path])._steps_desc()

    runner = EMRJobRunner(steps=steps, stdin=BytesIO())
    runner.run()
    runner.cleanup()
Example 13: test_jar_step_without_mr_job_script
def test_jar_step_without_mr_job_script(self):
    jar_path = self.makefile('dora.jar')
    steps = MRJustAJar(['--jar', jar_path])._steps_desc()

    runner = EMRJobRunner(steps=steps, stdin=BytesIO(b'backpack'))
    runner.run()
    runner.cleanup()
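Examples 11 through 13 rely on small helper jobs from mrjob's test suite. A sketch of how such a jar-only job might be defined; this reconstruction is an assumption, though JarStep and the passthrough-option API are real mrjob features of this era:

from mrjob.job import MRJob
from mrjob.step import JarStep

class MRJustAJar(MRJob):
    """Hypothetical reconstruction: a job whose only step runs a jar."""

    def configure_options(self):
        super(MRJustAJar, self).configure_options()
        self.add_passthrough_option('--jar')

    def steps(self):
        return [JarStep(jar=self.options.jar)]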
Example 14: reducer_init
def reducer_init(self):
    # AWS_ACCESS_KEY and AWS_SECRET_KEY, plus the json and StringIO
    # imports, come from the surrounding module
    emr = EMRJobRunner(aws_access_key_id=AWS_ACCESS_KEY,
                       aws_secret_access_key=AWS_SECRET_KEY)
    idf_parts = emr.get_s3_keys('s3://6885public/jeffchan/term-idfs/')

    self.word_to_idf = dict()
    for part in idf_parts:
        # keep the response body in its own name so it doesn't shadow
        # the json module
        body = part.get_contents_as_string()
        for line in StringIO.StringIO(body):
            pair = json.loads(line)
            self.word_to_idf[pair['term']] = pair['idf']
Example 15: test_terminate_job_flow
def test_terminate_job_flow(self):
    jf_id = self.make_job_flow(pool_emr_job_flows=True)

    self.monkey_patch_argv('--quiet', '--no-conf', 'j-MOCKJOBFLOW0')
    terminate_main()

    emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()
    self.assertEqual(emr_conn.describe_jobflow(jf_id).state,
                     'TERMINATED')