本文整理汇总了Python中tests.mr_two_step_job.MRTwoStepJob类的典型用法代码示例。如果您正苦于以下问题:Python MRTwoStepJob类的具体用法?Python MRTwoStepJob怎么用?Python MRTwoStepJob使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了MRTwoStepJob类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_failed_job
def test_failed_job(self):
    """A Dataproc job that ends in ERROR should raise
    StepFailedException, log the state transition, and have its cluster
    deleted during cleanup."""
    job = MRTwoStepJob(['-r', 'dataproc', '-v'])
    job.sandbox()

    with no_handlers_for_logger('mrjob.dataproc'):
        log_buf = StringIO()
        log_to_stream('mrjob.dataproc', log_buf)

        # drive the mock job through setup/running into an error state
        self._dataproc_client.job_get_advances_states = (
            collections.deque(['SETUP_DONE', 'RUNNING', 'ERROR']))

        with job.make_runner() as runner:
            self.assertIsInstance(runner, DataprocJobRunner)
            self.assertRaises(StepFailedException, runner.run)
            self.assertIn(' => ERROR\n', log_buf.getvalue())

            cluster_id = runner.get_cluster_id()

    # leaving the with block runs cleanup, which should start tearing
    # the cluster down
    cluster = (
        self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
    self.assertEqual(self._dataproc_client.get_state(cluster), 'DELETING')
示例2: test_attach_to_existing_job_flow
def test_attach_to_existing_job_flow(self):
    """Attaching via --emr-job-flow-id should skip writing a master
    bootstrap script (Issue 182) and still stream job output."""
    emr_conn = EMRJobRunner(conf_path=False).make_emr_conn()
    # set log_uri to None, so that when we describe the job flow, it
    # won't have the loguri attribute, to test Issue #112
    emr_job_flow_id = emr_conn.run_jobflow(
        name='Development Job Flow', log_uri=None)

    self.mock_emr_output = {(emr_job_flow_id, 1): [
        '1\t"bar"\n1\t"foo"\n2\tnull\n']}

    job = MRTwoStepJob(['-r', 'emr', '-v',
                        '-c', self.mrjob_conf_path,
                        '--emr-job-flow-id', emr_job_flow_id])
    job.sandbox(stdin=StringIO('foo\nbar\n'))

    results = []
    with job.make_runner() as runner:
        runner.run()

        # Issue 182: no bootstrap script when attaching to a job flow
        assert_equal(runner._master_bootstrap_script, None)

        for line in runner.stream_output():
            results.append(job.parse_output_line(line))

    assert_equal(sorted(results),
                 [(1, 'bar'), (1, 'foo'), (2, None)])
示例3: test_owner_and_label_switches
def test_owner_and_label_switches(self):
    """--owner and --label should both appear in the job name."""
    args = ["--no-conf", "--owner=ads", "--label=ads_chain"]
    runner = MRTwoStepJob(args).make_runner()

    match = JOB_NAME_RE.match(runner.get_job_name())
    self.assertEqual(match.group(1), "ads_chain")
    self.assertEqual(match.group(2), "ads")
示例4: test_bootstrap_python_comes_before_bootstrap
def test_bootstrap_python_comes_before_bootstrap(self):
    """User --bootstrap commands should be appended after the default
    Python bootstrap commands."""
    job = MRTwoStepJob(['-r', 'dataproc', '--bootstrap', 'true'])

    with job.make_runner() as runner:
        expected = self.EXPECTED_BOOTSTRAP + [['true']]
        self.assertEqual(runner._bootstrap, expected)
示例5: test_python_dash_v_as_python_bin
def test_python_dash_v_as_python_bin(self):
    """Using 'python -v' as the python bin should dump import chatter
    to the task's stderr without changing job results."""
    python_cmd = cmd_line([sys.executable or 'python', '-v'])
    job = MRTwoStepJob(['--python-bin', python_cmd, '--no-conf',
                        '-r', 'local'])
    job.sandbox(stdin=[b'bar\n'])

    with job.make_runner() as runner:
        runner.run()

        # python -v traces imports to stderr; the message format
        # differs between Python 2 and Python 3
        with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
            self.assertTrue(any(
                'import mrjob' in line or  # Python 2
                "import 'mrjob'" in line
                for line in lines))

        with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
            self.assertTrue(any('#' in line for line in lines))

        # the verbose interpreter shouldn't affect job output
        self.assertEqual(
            sorted(to_lines(runner.cat_output())),
            sorted([b'1\tnull\n', b'1\t"bar"\n']))
示例6: test_dont_take_down_cluster_on_failure
def test_dont_take_down_cluster_on_failure(self):
    """A failed job on a pre-existing cluster must leave that cluster
    RUNNING, both right after the failure and after cleanup."""
    setup_runner = DataprocJobRunner(conf_paths=[])
    cluster_id = setup_runner.api_client.cluster_create()['clusterName']

    job = MRTwoStepJob(['-r', 'dataproc', '-v',
                        '--cluster-id', cluster_id])
    job.sandbox()

    # walk the mock job into an error state
    self._dataproc_client.job_get_advances_states = collections.deque(
        ['SETUP_DONE', 'RUNNING', 'ERROR'])

    with job.make_runner() as runner:
        self.assertIsInstance(runner, DataprocJobRunner)

        with logger_disabled('mrjob.dataproc'):
            self.assertRaises(StepFailedException, runner.run)

        cluster = self.get_cluster_from_runner(runner, cluster_id)
        self.assertEqual(
            self._dataproc_client.get_state(cluster), 'RUNNING')

    # cleanup must not terminate a cluster we merely attached to
    cluster = self._dataproc_client._cache_clusters[
        _TEST_PROJECT][cluster_id]
    self.assertEqual(self._dataproc_client.get_state(cluster), 'RUNNING')
示例7: test_failed_job
def test_failed_job(self):
    """On EMR, a failing job should drive the job flow to FAILED, and
    runner cleanup should then terminate it."""
    job = MRTwoStepJob(['-r', 'emr', '-v',
                        '-c', self.mrjob_conf_path])
    job.sandbox()

    self.add_mock_s3_data({'walrus': {}})
    self.mock_emr_failures = {('j-MOCKJOBFLOW0', 0): None}

    with job.make_runner() as runner:
        assert isinstance(runner, EMRJobRunner)

        with logger_disabled('mrjob.emr'):
            assert_raises(Exception, runner.run)

        emr_conn = botoemr.EmrConnection()
        job_flow_id = runner.get_emr_job_flow_id()
        for _ in range(10):
            emr_conn.simulate_progress(job_flow_id)

        assert_equal(emr_conn.describe_jobflow(job_flow_id).state,
                     'FAILED')

    # leaving the with block runs cleanup, which should terminate the
    # job flow
    emr_conn = runner.make_emr_conn()
    job_flow_id = runner.get_emr_job_flow_id()
    for _ in range(10):
        emr_conn.simulate_progress(job_flow_id)

    assert_equal(emr_conn.describe_jobflow(job_flow_id).state,
                 'TERMINATED')
示例8: test_attach_to_existing_cluster
def test_attach_to_existing_cluster(self):
    """--cluster-id should reuse an existing cluster, skip the
    bootstrap script (Issue 182), and still stream job output."""
    setup_runner = DataprocJobRunner(conf_paths=[])
    cluster_id = setup_runner.api_client.cluster_create()['clusterName']

    job = MRTwoStepJob(['-r', 'dataproc', '-v',
                        '--cluster-id', cluster_id])
    job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

    results = []
    with job.make_runner() as runner:
        runner.run()

        # hand the runner some fake job output to stream back
        self.put_job_output_parts(runner, [
            b'1\t"bar"\n1\t"foo"\n2\tnull\n'
        ])

        # Issue 182: no bootstrap script when attaching to a cluster
        self.assertIsNone(runner._master_bootstrap_script_path)

        for line in runner.stream_output():
            results.append(job.parse_output_line(line))

    self.assertEqual(sorted(results),
                     [(1, 'bar'), (1, 'foo'), (2, None)])
示例9: test_owner_and_label_switches
def test_owner_and_label_switches(self):
    """--owner and --label should both appear in the job key."""
    args = ['--no-conf', '--owner=ads', '--label=ads_chain']
    runner = MRTwoStepJob(args).make_runner()

    match = _JOB_KEY_RE.match(runner.get_job_key())
    self.assertEqual(match.group(1), 'ads_chain')
    self.assertEqual(match.group(2), 'ads')
示例10: test_end_to_end
def test_end_to_end(self):
    """Feed input from STDIN, a plain file, and a .gz through the local
    runner; check the combined output and that the scratch dir is
    removed on cleanup."""
    stdin = StringIO("foo\nbar\n")

    input_path = os.path.join(self.tmp_dir, "input")
    with open(input_path, "w") as input_file:
        input_file.write("bar\nqux\n")

    input_gz_path = os.path.join(self.tmp_dir, "input.gz")
    input_gz = gzip.GzipFile(input_gz_path, "w")
    input_gz.write("foo\n")
    input_gz.close()

    job = MRTwoStepJob(
        ["-c", self.mrjob_conf_path, "-", input_path, input_gz_path])
    job.sandbox(stdin=stdin)

    results = []
    local_tmp_dir = None
    with job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()

        for line in runner.stream_output():
            results.append(job.parse_output_line(line))

        local_tmp_dir = runner._get_local_tmp_dir()
        assert os.path.exists(local_tmp_dir)

    # leaving the with block should clean up the tmp dir
    assert not os.path.exists(local_tmp_dir)

    assert_equal(sorted(results),
                 [(1, "qux"), (2, "bar"), (2, "foo"), (5, None)])
示例11: test_end_to_end
def test_end_to_end(self):
    """Read from STDIN, a regular file, and a .gz with the inline
    runner; verify output and that the tmp dir is removed afterward."""
    input_path = join(self.tmp_dir, 'input')
    with open(input_path, 'w') as plain_file:
        plain_file.write('bar\nqux\n')

    input_gz_path = join(self.tmp_dir, 'input.gz')
    gz_file = gzip.GzipFile(input_gz_path, 'wb')
    gz_file.write(b'foo\n')
    gz_file.close()

    job = MRTwoStepJob(
        ['--runner', 'inline', '-', input_path, input_gz_path])
    job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

    results = []
    local_tmp_dir = None
    with job.make_runner() as runner:
        assert isinstance(runner, InlineMRJobRunner)
        runner.run()

        results.extend(job.parse_output(runner.cat_output()))

        local_tmp_dir = runner._get_local_tmp_dir()
        assert exists(local_tmp_dir)

    # runner cleanup removes its tmp dir on exiting the with block
    assert not exists(local_tmp_dir)

    self.assertEqual(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])
示例12: test_two_step_job
def test_two_step_job(self):
    """Good all-around test: MRTwoStepJob's first step logs counters
    but its second step does not, and both steps' counters should be
    collected even though everything runs as one Spark job."""
    job = MRTwoStepJob(['-r', 'spark'])
    job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

    with job.make_runner() as runner:
        runner.run()

        counters = runner.counters()

        # one counter dict per step, despite a single Spark job
        self.assertEqual(len(counters), 2)

        # step 1's counters look like {'count': {'combiners': <int>}}
        self.assertEqual(sorted(counters[0]), ['count'])
        self.assertEqual(sorted(counters[0]['count']), ['combiners'])
        self.assertIsInstance(counters[0]['count']['combiners'], int)

        # step 2 logs no counters at all
        self.assertEqual(counters[1], {})

        log_output = '\n'.join(
            c[0][0] for c in self.log.info.call_args_list)
        log_lines = log_output.split('\n')

        # only step 1's counters should be logged
        self.assertIn('Counters for step 1: 1', log_lines)
        self.assertIn('\tcount', log_output)
        self.assertNotIn('Counters for step 2', log_output)
示例13: test_bootstrap_python_switch
def test_bootstrap_python_switch(self):
    """--bootstrap-python should set the option and install the
    expected Python bootstrap commands."""
    job = MRTwoStepJob(["-r", "dataproc", "--bootstrap-python"])

    with job.make_runner() as runner:
        self.assertEqual(runner._opts["bootstrap_python"], True)
        self.assertEqual(runner._bootstrap_python(),
                         self.EXPECTED_BOOTSTRAP)
        self.assertEqual(runner._bootstrap, self.EXPECTED_BOOTSTRAP)
示例14: test_owner_and_label_switches
def test_owner_and_label_switches(self):
    """--owner and --label should both appear in the job name."""
    args = ['--no-conf', '--owner=ads', '--label=ads_chain']
    runner = MRTwoStepJob(args).make_runner()

    match = JOB_NAME_RE.match(runner.get_job_name())
    assert_equal(match.group(1), 'ads_chain')
    assert_equal(match.group(2), 'ads')
示例15: test_missing_input
def test_missing_input(self):
    """A nonexistent input path should make run() raise IOError."""
    job = MRTwoStepJob(['-r', 'inline', '/some/bogus/file/path'])
    job.sandbox()

    with job.make_runner() as runner:
        assert isinstance(runner, InlineMRJobRunner)
        self.assertRaises(IOError, runner.run)