This page collects typical usage examples of the Python method tests.mr_two_step_job.MRTwoStepJob.parse_output_line. If you are wondering what MRTwoStepJob.parse_output_line does, how to call it, or what it looks like in real code, the curated examples below should help. You can also read more about the class it belongs to, tests.mr_two_step_job.MRTwoStepJob.
The following shows 11 code examples of MRTwoStepJob.parse_output_line, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python examples.
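All eleven examples follow the same pattern: sandbox the job with fake STDIN, run it through a runner, then feed each line from runner.stream_output() to mr_job.parse_output_line() to recover (key, value) pairs. Here is a minimal, self-contained sketch of that pattern (a sketch only: the inline runner and the input data are illustrative choices, not taken from the examples below):

from io import BytesIO

from tests.mr_two_step_job import MRTwoStepJob

# run in-process with mrjob's inline runner; any runner works the same way
mr_job = MRTwoStepJob(['-r', 'inline'])
mr_job.sandbox(stdin=BytesIO(b'foo\nbar\n'))  # fake STDIN

results = []
with mr_job.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        # decode one line of job output into a (key, value) pair
        # using the job's output protocol
        key, value = mr_job.parse_output_line(line)
        results.append((key, value))

print(sorted(results))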
Example 1: test_attach_to_existing_cluster
# Required import: from tests.mr_two_step_job import MRTwoStepJob [as alias]
# Or: from tests.mr_two_step_job.MRTwoStepJob import parse_output_line [as alias]
def test_attach_to_existing_cluster(self):
    runner = DataprocJobRunner(conf_paths=[])

    cluster_body = runner.api_client.cluster_create()
    cluster_id = cluster_body['clusterName']

    stdin = BytesIO(b'foo\nbar\n')
    mr_job = MRTwoStepJob(['-r', 'dataproc', '-v',
                           '--cluster-id', cluster_id])
    mr_job.sandbox(stdin=stdin)

    results = []
    with mr_job.make_runner() as runner:
        runner.run()

        # Generate fake output
        self.put_job_output_parts(runner, [
            b'1\t"bar"\n1\t"foo"\n2\tnull\n'
        ])

        # Issue 182: don't create the bootstrap script when
        # attaching to another cluster
        self.assertIsNone(runner._master_bootstrap_script_path)

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

    self.assertEqual(sorted(results),
                     [(1, 'bar'), (1, 'foo'), (2, None)])
Example 2: test_attach_to_existing_job_flow
# Required import: from tests.mr_two_step_job import MRTwoStepJob [as alias]
# Or: from tests.mr_two_step_job.MRTwoStepJob import parse_output_line [as alias]
def test_attach_to_existing_job_flow(self):
    emr_conn = EMRJobRunner(conf_path=False).make_emr_conn()
    # set log_uri to None, so that when we describe the job flow, it
    # won't have the loguri attribute, to test Issue #112
    emr_job_flow_id = emr_conn.run_jobflow(
        name='Development Job Flow', log_uri=None)

    stdin = StringIO('foo\nbar\n')
    self.mock_emr_output = {(emr_job_flow_id, 1): [
        '1\t"bar"\n1\t"foo"\n2\tnull\n']}

    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path,
                           '--emr-job-flow-id', emr_job_flow_id])
    mr_job.sandbox(stdin=stdin)

    results = []
    with mr_job.make_runner() as runner:
        runner.run()

        # Issue 182: don't create the bootstrap script when
        # attaching to another job flow
        assert_equal(runner._master_bootstrap_script, None)

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

    assert_equal(sorted(results),
                 [(1, 'bar'), (1, 'foo'), (2, None)])
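The mock output above shows the wire format parse_output_line consumes: one key and one value per line, tab-separated and JSON-encoded under mrjob's default output protocol, which is why '2\tnull' comes back as (2, None). A rough sketch of the decoding (assuming the default JSONProtocol; the real method delegates to whatever output protocol the job configures):

import json

def parse_output_line_sketch(line):
    # split on the first tab, then JSON-decode both halves
    raw_key, _, raw_value = line.rstrip('\r\n').partition('\t')
    return json.loads(raw_key), json.loads(raw_value)

assert parse_output_line_sketch('1\t"bar"') == (1, 'bar')
assert parse_output_line_sketch('2\tnull') == (2, None)  # JSON null -> None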
Example 3: test_end_to_end
# Required import: from tests.mr_two_step_job import MRTwoStepJob [as alias]
# Or: from tests.mr_two_step_job.MRTwoStepJob import parse_output_line [as alias]
def test_end_to_end(self):
    # read from STDIN, a regular file, and a .gz
    stdin = StringIO("foo\nbar\n")

    input_path = os.path.join(self.tmp_dir, "input")
    with open(input_path, "w") as input_file:
        input_file.write("bar\nqux\n")

    input_gz_path = os.path.join(self.tmp_dir, "input.gz")
    input_gz = gzip.GzipFile(input_gz_path, "w")
    input_gz.write("foo\n")
    input_gz.close()

    mr_job = MRTwoStepJob(["-c", self.mrjob_conf_path, "-", input_path, input_gz_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        assert os.path.exists(local_tmp_dir)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)

    assert_equal(sorted(results), [(1, "qux"), (2, "bar"), (2, "foo"), (5, None)])
Example 4: test_end_to_end
# Required import: from tests.mr_two_step_job import MRTwoStepJob [as alias]
# Or: from tests.mr_two_step_job.MRTwoStepJob import parse_output_line [as alias]
def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin,
                'fs', '-put', input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(['-r', 'hadoop', '-v',
                           '--no-conf', '--hadoop-arg', '-libjar',
                           '--hadoop-arg', 'containsJars.jar',
                           '-', local_input_path, remote_input_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        assert_equal(os.listdir(home_dir), ['tmp'])
        assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])

        assert_equal(runner._opts['hadoop_extra_args'],
                     ['-libjar', 'containsJars.jar'])

    assert_equal(sorted(results),
                 [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))
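The hadoop_extra_args assertion above relies on repeated --hadoop-arg flags accumulating in order. mrjob's own option parsing is more involved, but the append semantics look roughly like this sketch (plain argparse stands in for mrjob's parser):

import argparse

parser = argparse.ArgumentParser()
# each --hadoop-arg appends to the same list, preserving order
parser.add_argument('--hadoop-arg', dest='hadoop_extra_args',
                    action='append', default=[])

opts = parser.parse_args(['--hadoop-arg', '-libjar',
                          '--hadoop-arg', 'containsJars.jar'])
assert opts.hadoop_extra_args == ['-libjar', 'containsJars.jar']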
Example 5: test_end_to_end
# Required import: from tests.mr_two_step_job import MRTwoStepJob [as alias]
# Or: from tests.mr_two_step_job.MRTwoStepJob import parse_output_line [as alias]
def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO("foo\nbar\n")

    local_input_path = os.path.join(self.tmp_dir, "input")
    with open(local_input_path, "w") as local_input_file:
        local_input_file.write("bar\nqux\n")

    input_to_upload = os.path.join(self.tmp_dir, "remote_input")
    with open(input_to_upload, "w") as input_to_upload_file:
        input_to_upload_file.write("foo\n")
    remote_input_path = "hdfs:///data/foo"
    check_call([self.hadoop_bin, "fs", "-put", input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([""])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(["-r", "hadoop", "-v", "--no-conf", "-", local_input_path, remote_input_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ["MOCK_HDFS_ROOT"]
        assert_equal(sorted(os.listdir(hdfs_root)), ["data", "user"])
        home_dir = os.path.join(hdfs_root, "user", getpass.getuser())
        assert_equal(os.listdir(home_dir), ["tmp"])
        assert_equal(os.listdir(os.path.join(home_dir, "tmp")), ["mrjob"])

    assert_equal(sorted(results), [(1, "qux"), (2, "bar"), (2, "foo"), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))
Example 6: test_end_to_end
# Required import: from tests.mr_two_step_job import MRTwoStepJob [as alias]
# Or: from tests.mr_two_step_job.MRTwoStepJob import parse_output_line [as alias]
def test_end_to_end(self):
    # read from STDIN, a regular file, and a .gz
    stdin = StringIO('foo\nbar\n')

    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'w') as input_file:
        input_file.write('bar\nqux\n')

    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz_glob = os.path.join(self.tmp_dir, '*.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'w')
    input_gz.write('foo\n')
    input_gz.close()

    mr_job = MRTwoStepJob(['-c', self.mrjob_conf_path,
                           '-r', 'local',
                           '-', input_path, input_gz_glob])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        assert os.path.exists(local_tmp_dir)

        self.assertEqual(runner.counters()[0]['count']['combiners'], 8)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)

    self.assertEqual(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])
Example 7: test_end_to_end_multiple_tasks
# Required import: from tests.mr_two_step_job import MRTwoStepJob [as alias]
# Or: from tests.mr_two_step_job.MRTwoStepJob import parse_output_line [as alias]
def test_end_to_end_multiple_tasks(self):
    # read from STDIN, a regular file, and a .gz
    stdin = BytesIO(b'foo\nbar\n')

    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'wb') as input_file:
        input_file.write(b'bar\nqux\n')

    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'wb')
    input_gz.write(b'foo\n')
    input_gz.close()

    mr_job = MRTwoStepJob(['-r', 'local',
                           '--jobconf=mapred.map.tasks=2',
                           '--jobconf=mapred.reduce.tasks=2',
                           '-', input_path, input_gz_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        assert os.path.exists(local_tmp_dir)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)

    self.assertEqual(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])
Example 8: test_end_to_end
# Required import: from tests.mr_two_step_job import MRTwoStepJob [as alias]
# Or: from tests.mr_two_step_job.MRTwoStepJob import parse_output_line [as alias]
def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    remote_input_path = 's3://walrus/data/foo'
    self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n'}})

    # setup fake output
    self.mock_emr_output = {('j-MOCKJOBFLOW0', 1): [
        '1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n']}

    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path,
                           '-', local_input_path, remote_input_path,
                           '--hadoop-input-format', 'FooFormat',
                           '--hadoop-output-format', 'BarFormat'])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    mock_s3_fs_snapshot = copy.deepcopy(self.mock_s3_fs)

    with mr_job.make_runner() as runner:
        assert isinstance(runner, EMRJobRunner)

        # make sure that initializing the runner doesn't affect S3
        # (Issue #50)
        assert_equal(mock_s3_fs_snapshot, self.mock_s3_fs)

        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        emr_conn = runner.make_emr_conn()
        job_flow = emr_conn.describe_jobflow(runner.get_emr_job_flow_id())
        assert_equal(job_flow.state, 'COMPLETED')
        name_match = JOB_NAME_RE.match(job_flow.name)
        assert_equal(name_match.group(1), 'mr_two_step_job')
        assert_equal(name_match.group(2), getpass.getuser())

        # make sure our input and output formats are attached to
        # the correct steps
        assert_in('-inputformat', job_flow.steps[0].args)
        assert_not_in('-outputformat', job_flow.steps[0].args)
        assert_not_in('-inputformat', job_flow.steps[1].args)
        assert_in('-outputformat', job_flow.steps[1].args)

        # make sure mrjob.tar.gz is created and uploaded as
        # a bootstrap file
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict['path'] == runner._mrjob_tar_gz_path]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict['name']
        assert_equal(mrjob_tar_gz_file_dict.get('bootstrap'), 'file')

        # shouldn't be in PYTHONPATH (we dump it directly in site-packages)
        pythonpath = runner._get_cmdenv().get('PYTHONPATH') or ''
        assert_not_in(mrjob_tar_gz_file_dict['name'],
                      pythonpath.split(':'))

    assert_equal(sorted(results),
                 [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

    # job should get terminated
    emr_conn = runner.make_emr_conn()
    job_flow_id = runner.get_emr_job_flow_id()
    for i in range(10):
        emr_conn.simulate_progress(job_flow_id)

    job_flow = emr_conn.describe_jobflow(job_flow_id)
    assert_equal(job_flow.state, 'TERMINATED')
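The JOB_NAME_RE checks above depend on mrjob's job-name convention, roughly <label>.<owner>.<date>.<time>.<microseconds>. The exact regex lives in the test suite; this sketch and the sample name are illustrative only:

import re

# label and owner first, then numeric date/time/microsecond fields
JOB_NAME_RE_SKETCH = re.compile(r'^(.*?)\.(.*?)\.(\d+)\.(\d+)\.(\d+)$')

m = JOB_NAME_RE_SKETCH.match('mr_two_step_job.dave.20240101.123456.000000')
assert m.group(1) == 'mr_two_step_job'
assert m.group(2) == 'dave'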
Example 9: _test_end_to_end
# Required import: from tests.mr_two_step_job import MRTwoStepJob [as alias]
# Or: from tests.mr_two_step_job.MRTwoStepJob import parse_output_line [as alias]
def _test_end_to_end(self, args=()):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin,
                'fs', '-put', input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(['-r', 'hadoop', '-v',
                           '--no-conf', '--hadoop-arg', '-libjar',
                           '--hadoop-arg', 'containsJars.jar'] + list(args)
                          + ['-', local_input_path, remote_input_path]
                          + ['--hadoop-input-format', 'FooFormat']
                          + ['--hadoop-output-format', 'BarFormat']
                          + ['--jobconf', 'x=y'])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    # don't care that --hadoop-*-format is deprecated
    with logger_disabled('mrjob.job'):
        runner = mr_job.make_runner()

    with runner as runner:  # i.e. call cleanup when we're done
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        assert_equal(os.listdir(home_dir), ['tmp'])
        assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])

        assert_equal(runner._opts['hadoop_extra_args'],
                     ['-libjar', 'containsJars.jar'])

        # make sure mrjob.tar.gz is uploaded and in PYTHONPATH
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict['path'] == runner._mrjob_tar_gz_path]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict['name']
        pythonpath = runner._get_cmdenv()['PYTHONPATH']
        assert_in(mrjob_tar_gz_file_dict['name'],
                  pythonpath.split(':'))

    assert_equal(sorted(results),
                 [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure we called hadoop the way we expected
    with open(os.environ['MOCK_HADOOP_LOG']) as mock_log:
        hadoop_cmd_args = [shlex.split(line) for line in mock_log]

    jar_cmd_args = [args for args in hadoop_cmd_args
                    if args[:1] == ['jar']]
    assert_equal(len(jar_cmd_args), 2)
    step_0_args, step_1_args = jar_cmd_args

    # check input/output format
    assert_in('-inputformat', step_0_args)
    assert_not_in('-outputformat', step_0_args)
    assert_not_in('-inputformat', step_1_args)
    assert_in('-outputformat', step_1_args)

    # make sure -libjar extra arg comes before -mapper
    for args in (step_0_args, step_1_args):
        assert_in('-libjar', args)
        assert_in('-mapper', args)
        assert_lt(args.index('-libjar'), args.index('-mapper'))

    # make sure -jobconf made it through
    assert_in('-D', step_0_args)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
# ... (rest of the code omitted) ...
Example 10: test_end_to_end
# Required import: from tests.mr_two_step_job import MRTwoStepJob [as alias]
# Or: from tests.mr_two_step_job.MRTwoStepJob import parse_output_line [as alias]
def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    remote_input_path = 's3://walrus/data/foo'
    self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n'}})

    # setup fake output
    self.mock_emr_output = {('j-MOCKJOBFLOW0', 1): [
        '1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n']}

    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path,
                           '-', local_input_path, remote_input_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    mock_s3_fs_snapshot = copy.deepcopy(self.mock_s3_fs)

    with mr_job.make_runner() as runner:
        assert isinstance(runner, EMRJobRunner)

        # make sure that initializing the runner doesn't affect S3
        # (Issue #50)
        assert_equal(mock_s3_fs_snapshot, self.mock_s3_fs)

        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        emr_conn = runner.make_emr_conn()
        job_flow = emr_conn.describe_jobflow(runner.get_emr_job_flow_id())
        assert_equal(job_flow.state, 'COMPLETED')
        name_match = JOB_NAME_RE.match(job_flow.name)
        assert_equal(name_match.group(1), 'mr_two_step_job')
        assert_equal(name_match.group(2), getpass.getuser())

    assert_equal(sorted(results),
                 [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

    # job should get terminated
    emr_conn = runner.make_emr_conn()
    job_flow_id = runner.get_emr_job_flow_id()
    for i in range(10):
        emr_conn.simulate_progress(job_flow_id)

    job_flow = emr_conn.describe_jobflow(job_flow_id)
    assert_equal(job_flow.state, 'TERMINATED')
Example 11: test_end_to_end
# Required import: from tests.mr_two_step_job import MRTwoStepJob [as alias]
# Or: from tests.mr_two_step_job.MRTwoStepJob import parse_output_line [as alias]
def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO("foo\nbar\n")

    local_input_path = os.path.join(self.tmp_dir, "input")
    with open(local_input_path, "w") as local_input_file:
        local_input_file.write("bar\nqux\n")

    input_to_upload = os.path.join(self.tmp_dir, "remote_input")
    with open(input_to_upload, "w") as input_to_upload_file:
        input_to_upload_file.write("foo\n")
    remote_input_path = "hdfs:///data/foo"
    check_call([self.hadoop_bin, "fs", "-put", input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([""])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(
        [
            "-r",
            "hadoop",
            "-v",
            "--no-conf",
            "--hadoop-arg",
            "-libjar",
            "--hadoop-arg",
            "containsJars.jar",
            "-",
            local_input_path,
            remote_input_path,
            "--hadoop-input-format",
            "FooFormat",
            "--hadoop-output-format",
            "BarFormat",
        ]
    )
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ["MOCK_HDFS_ROOT"]
        assert_equal(sorted(os.listdir(hdfs_root)), ["data", "user"])
        home_dir = os.path.join(hdfs_root, "user", getpass.getuser())
        assert_equal(os.listdir(home_dir), ["tmp"])
        assert_equal(os.listdir(os.path.join(home_dir, "tmp")), ["mrjob"])

        assert_equal(runner._opts["hadoop_extra_args"], ["-libjar", "containsJars.jar"])

        # make sure mrjob.tar.gz is uploaded and in PYTHONPATH
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files if file_dict["path"] == runner._mrjob_tar_gz_path
        ]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict["name"]
        pythonpath = runner._get_cmdenv()["PYTHONPATH"]
        assert_in(mrjob_tar_gz_file_dict["name"], pythonpath.split(":"))

    assert_equal(sorted(results), [(1, "qux"), (2, "bar"), (2, "foo"), (5, None)])

    # make sure we called hadoop the way we expected
    with open(os.environ["MOCK_HADOOP_LOG"]) as mock_log:
        hadoop_cmd_args = [shlex.split(line) for line in mock_log]

    jar_cmd_args = [args for args in hadoop_cmd_args if args[:1] == ["jar"]]
    assert_equal(len(jar_cmd_args), 2)
    step_0_args, step_1_args = jar_cmd_args

    # check input/output format
    assert_in("-inputformat", step_0_args)
    assert_not_in("-outputformat", step_0_args)
    assert_not_in("-inputformat", step_1_args)
    assert_in("-outputformat", step_1_args)

    # make sure -libjar extra arg comes before -mapper
    for args in (step_0_args, step_1_args):
        assert_in("-libjar", args)
        assert_in("-mapper", args)
        assert_lt(args.index("-libjar"), args.index("-mapper"))

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
# ... (rest of the code omitted) ...