This article collects typical usage examples of the Python method mrjob.hadoop.HadoopJobRunner._args_for_streaming_step. If you are wondering what HadoopJobRunner._args_for_streaming_step does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also read more about the containing class, mrjob.hadoop.HadoopJobRunner.
The following shows 4 code examples of the HadoopJobRunner._args_for_streaming_step method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps surface better Python code examples.
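Before diving into the examples, here is a minimal sketch of the pattern all of the tests below follow: construct a HadoopJobRunner, stub out the private helpers that would otherwise need a real cluster or HDFS, and inspect the streaming command line that _args_for_streaming_step(0) assembles. The calls in the sketch are taken from the setUp code shown in Example 2; the import path for HadoopFilesystem (mrjob.fs.hadoop) and the use of unittest.mock are assumptions rather than something shown in the examples, and since _args_for_streaming_step and the other underscore-prefixed helpers are private mrjob APIs whose names change between releases, treat this as an illustration of the test pattern, not a supported usage.

# Minimal sketch (assumed setup, mirroring Example 2's setUp): build a runner,
# stub the HDFS-dependent helpers, and print the assembled streaming args.
from io import BytesIO
from unittest.mock import patch

from mrjob.fs.hadoop import HadoopFilesystem  # import path assumed (0.5.x-era layout)
from mrjob.hadoop import HadoopJobRunner

runner = HadoopJobRunner(
    hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
    mr_job_script='my_job.py', stdin=BytesIO())
runner._add_job_files_for_upload()
runner._script_path = 'my_job.py'
runner._steps = [{'type': 'streaming', 'mapper': {'type': 'script'}}]

# Pin the Hadoop version and stub the helpers that would hit HDFS, exactly as
# the tests below do, so nothing needs a running cluster.
with patch.object(HadoopFilesystem, 'get_hadoop_version', return_value='2.7.1'), \
     patch.object(runner, '_upload_args', return_value=['<upload args>']), \
     patch.object(runner, '_hdfs_step_input_files',
                  return_value=['<hdfs step input files>']), \
     patch.object(runner, '_hdfs_step_output_dir',
                  return_value='<hdfs step output dir>'):
    print(runner._args_for_streaming_step(0))

On a YARN-era Hadoop (2.x), the printed list should resemble the expectation in Example 2's test_basic_mapper: the hadoop / jar / streaming-jar prefix, the stubbed upload and step args, '-D mapreduce.job.reduces=0', the '-input'/'-output' pair, and a '-mapper' argument that invokes my_job.py --step-num=0 --mapper. Pre-YARN versions emit '-jobconf mapred.reduce.tasks=0' instead, as Examples 1, 3 and 4 show.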
Example 1: StreamingArgsTestCase
# Required module import: from mrjob.hadoop import HadoopJobRunner [as alias]
# Or alternatively: from mrjob.hadoop.HadoopJobRunner import _args_for_streaming_step [as alias]
class StreamingArgsTestCase(EmptyMrjobConfTestCase):
MRJOB_CONF_CONTENTS = {'runners': {'hadoop': {
'hadoop_home': 'kansas',
'hadoop_streaming_jar': 'binks.jar.jar',
}}}
def setUp(self):
super(StreamingArgsTestCase, self).setUp()
self.runner = HadoopJobRunner(
hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
mr_job_script='my_job.py', stdin=BytesIO())
self.runner._add_job_files_for_upload()
self.start(patch.object(self.runner, '_upload_args',
return_value=['new_upload_args']))
self.start(patch.object(self.runner, '_pre_0_20_upload_args',
return_value=['old_upload_args']))
self.start(patch.object(self.runner, '_hadoop_args_for_step',
return_value=['hadoop_args_for_step']))
self.start(patch.object(self.runner, '_hdfs_step_input_files',
return_value=['hdfs_step_input_files']))
self.start(patch.object(self.runner, '_hdfs_step_output_dir',
return_value='hdfs_step_output_dir'))
self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
return_value='1.2.0'))
self.runner._script_path = 'my_job.py'
self._new_basic_args = [
'hadoop', 'jar', 'streaming.jar',
'new_upload_args', 'hadoop_args_for_step',
'-input', 'hdfs_step_input_files',
'-output', 'hdfs_step_output_dir']
self._old_basic_args = [
'hadoop', 'jar', 'streaming.jar',
'hadoop_args_for_step',
'-input', 'hdfs_step_input_files',
'-output', 'hdfs_step_output_dir',
'old_upload_args']
def _assert_streaming_step(self, step, args):
self.runner._steps = [step]
self.assertEqual(
self.runner._args_for_streaming_step(0),
self._new_basic_args + args)
def _assert_streaming_step_old(self, step, args):
HadoopFilesystem.get_hadoop_version.return_value = '0.18'
self.runner._steps = [step]
self.assertEqual(
self.runner._args_for_streaming_step(0),
self._old_basic_args + args)
def test_basic_mapper(self):
self._assert_streaming_step(
{
'type': 'streaming',
'mapper': {
'type': 'script',
},
},
['-mapper',
PYTHON_BIN + ' my_job.py --step-num=0 --mapper',
'-jobconf',
'mapred.reduce.tasks=0'])
def test_basic_reducer(self):
self._assert_streaming_step(
{
'type': 'streaming',
'reducer': {
'type': 'script',
},
},
['-mapper',
'cat',
'-reducer',
PYTHON_BIN + ' my_job.py --step-num=0 --reducer'])
def test_pre_filters(self):
self._assert_streaming_step(
{
'type': 'streaming',
'mapper': {
'type': 'script',
'pre_filter': 'grep anything',
},
'combiner': {
'type': 'script',
'pre_filter': 'grep nothing',
},
'reducer': {
'type': 'script',
'pre_filter': 'grep something',
},
},
["-mapper",
"bash -c 'grep anything | " + PYTHON_BIN +
" my_job.py --step-num=0 --mapper'",
#.........part of the code is omitted here.........
Example 2: StreamingArgsTestCase
# Required module import: from mrjob.hadoop import HadoopJobRunner [as alias]
# Or alternatively: from mrjob.hadoop.HadoopJobRunner import _args_for_streaming_step [as alias]
class StreamingArgsTestCase(EmptyMrjobConfTestCase):
MRJOB_CONF_CONTENTS = {'runners': {'hadoop': {
'hadoop_home': 'kansas',
'hadoop_streaming_jar': 'binks.jar.jar',
}}}
BASIC_HADOOP_ARGS = [
'hadoop',
'jar', '<streaming jar>',
'<upload args>',
'<hadoop args for step>',
]
BASIC_JOB_ARGS = [
'-input', '<hdfs step input files>',
'-output', '<hdfs step output dir>',
]
def setUp(self):
super(StreamingArgsTestCase, self).setUp()
self.runner = HadoopJobRunner(
hadoop_bin='hadoop', hadoop_streaming_jar='<streaming jar>',
mr_job_script='my_job.py', stdin=BytesIO())
self.runner._add_job_files_for_upload()
self.start(patch.object(self.runner, '_upload_args',
return_value=['<upload args>']))
self.start(patch.object(self.runner, '_hadoop_args_for_step',
return_value=['<hadoop args for step>']))
self.start(patch.object(self.runner, '_hdfs_step_input_files',
return_value=['<hdfs step input files>']))
self.start(patch.object(self.runner, '_hdfs_step_output_dir',
return_value='<hdfs step output dir>'))
self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
return_value='2.7.1'))
self.runner._script_path = 'my_job.py'
def _assert_streaming_step(self, step, args):
self.runner._steps = [step]
self.assertEqual(
self.runner._args_for_streaming_step(0),
self.BASIC_HADOOP_ARGS + self.BASIC_JOB_ARGS + args)
def test_basic_mapper(self):
self.runner._steps = [
{
'type': 'streaming',
'mapper': {
'type': 'script',
},
},
]
self.assertEqual(
self.runner._args_for_streaming_step(0),
(self.BASIC_HADOOP_ARGS + ['-D', 'mapreduce.job.reduces=0'] +
self.BASIC_JOB_ARGS + [
'-mapper',
PYTHON_BIN + ' my_job.py --step-num=0 --mapper']))
def test_basic_mapper_pre_yarn(self):
# use a different jobconf (-D) on pre-YARN
self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
return_value='1.0.3'))
self.runner._steps = [
{
'type': 'streaming',
'mapper': {
'type': 'script',
},
},
]
self.assertEqual(
self.runner._args_for_streaming_step(0),
(self.BASIC_HADOOP_ARGS + ['-D', 'mapred.reduce.tasks=0'] +
self.BASIC_JOB_ARGS + [
'-mapper',
PYTHON_BIN + ' my_job.py --step-num=0 --mapper']))
def test_basic_reducer(self):
self.runner._steps = [
{
'type': 'streaming',
'reducer': {
'type': 'script',
},
},
]
self.assertEqual(
self.runner._args_for_streaming_step(0),
(self.BASIC_HADOOP_ARGS + self.BASIC_JOB_ARGS + [
'-mapper',
'cat',
'-reducer',
PYTHON_BIN + ' my_job.py --step-num=0 --reducer']))
#.........part of the code is omitted here.........
Example 3: StreamingArgsTestCase
# Required module import: from mrjob.hadoop import HadoopJobRunner [as alias]
# Or alternatively: from mrjob.hadoop.HadoopJobRunner import _args_for_streaming_step [as alias]
class StreamingArgsTestCase(EmptyMrjobConfTestCase):
MRJOB_CONF_CONTENTS = {'runners': {'hadoop': {
'hadoop_home': 'kansas',
'hadoop_streaming_jar': 'binks.jar.jar',
}}}
def setUp(self):
super(StreamingArgsTestCase, self).setUp()
self.runner = HadoopJobRunner(
hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
mr_job_script='my_job.py', stdin=StringIO())
self.runner._add_job_files_for_upload()
self.runner._hadoop_version = '0.20.204'
self.simple_patch(self.runner, '_new_upload_args',
return_value=['new_upload_args'])
self.simple_patch(self.runner, '_old_upload_args',
return_value=['old_upload_args'])
self.simple_patch(self.runner, '_hadoop_args_for_step',
return_value=['hadoop_args_for_step'])
self.simple_patch(self.runner, '_hdfs_step_input_files',
return_value=['hdfs_step_input_files'])
self.simple_patch(self.runner, '_hdfs_step_output_dir',
return_value='hdfs_step_output_dir')
self.runner._script_path = 'my_job.py'
self._new_basic_args = [
'hadoop', 'jar', 'streaming.jar',
'new_upload_args', 'hadoop_args_for_step',
'-input', 'hdfs_step_input_files',
'-output', 'hdfs_step_output_dir']
self._old_basic_args = [
'hadoop', 'jar', 'streaming.jar',
'hadoop_args_for_step',
'-input', 'hdfs_step_input_files',
'-output', 'hdfs_step_output_dir',
'old_upload_args']
def simple_patch(self, obj, attr, side_effect=None, return_value=None):
patcher = patch.object(obj, attr, side_effect=side_effect,
return_value=return_value)
patcher.start()
self.addCleanup(patcher.stop)
def _assert_streaming_step(self, step, args):
self.runner._steps = [step]
self.assertEqual(
self.runner._args_for_streaming_step(0),
self._new_basic_args + args)
def _assert_streaming_step_old(self, step, args):
self.runner._hadoop_version = '0.18'
self.runner._steps = [step]
self.assertEqual(
self.runner._args_for_streaming_step(0),
self._old_basic_args + args)
def test_basic_mapper(self):
self._assert_streaming_step(
{
'type': 'streaming',
'mapper': {
'type': 'script',
},
},
['-mapper', 'python my_job.py --step-num=0 --mapper',
'-jobconf', 'mapred.reduce.tasks=0'])
def test_basic_reducer(self):
self._assert_streaming_step(
{
'type': 'streaming',
'reducer': {
'type': 'script',
},
},
['-mapper', 'cat',
'-reducer', 'python my_job.py --step-num=0 --reducer'])
def test_pre_filters(self):
self._assert_streaming_step(
{
'type': 'streaming',
'mapper': {
'type': 'script',
'pre_filter': 'grep anything',
},
'combiner': {
'type': 'script',
'pre_filter': 'grep nothing',
},
'reducer': {
'type': 'script',
'pre_filter': 'grep something',
},
},
["-mapper",
"bash -c 'grep anything | python my_job.py --step-num=0"
#.........part of the code is omitted here.........
Example 4: StreamingArgsTestCase
# Required module import: from mrjob.hadoop import HadoopJobRunner [as alias]
# Or alternatively: from mrjob.hadoop.HadoopJobRunner import _args_for_streaming_step [as alias]
class StreamingArgsTestCase(EmptyMrjobConfTestCase):
MRJOB_CONF_CONTENTS = {"runners": {"hadoop": {"hadoop_home": "kansas", "hadoop_streaming_jar": "binks.jar.jar"}}}
def setUp(self):
super(StreamingArgsTestCase, self).setUp()
self.runner = HadoopJobRunner(
hadoop_bin="hadoop", hadoop_streaming_jar="streaming.jar", mr_job_script="my_job.py", stdin=StringIO()
)
self.runner._add_job_files_for_upload()
self.runner._hadoop_version = "0.20.204"
self.simple_patch(self.runner, "_new_upload_args", return_value=["new_upload_args"])
self.simple_patch(self.runner, "_old_upload_args", return_value=["old_upload_args"])
self.simple_patch(self.runner, "_hadoop_args_for_step", return_value=["hadoop_args_for_step"])
self.simple_patch(self.runner, "_hdfs_step_input_files", return_value=["hdfs_step_input_files"])
self.simple_patch(self.runner, "_hdfs_step_output_dir", return_value="hdfs_step_output_dir")
self.runner._script_path = "my_job.py"
self._new_basic_args = [
"hadoop",
"jar",
"streaming.jar",
"new_upload_args",
"hadoop_args_for_step",
"-input",
"hdfs_step_input_files",
"-output",
"hdfs_step_output_dir",
]
self._old_basic_args = [
"hadoop",
"jar",
"streaming.jar",
"hadoop_args_for_step",
"-input",
"hdfs_step_input_files",
"-output",
"hdfs_step_output_dir",
"old_upload_args",
]
def simple_patch(self, obj, attr, side_effect=None, return_value=None):
patcher = patch.object(obj, attr, side_effect=side_effect, return_value=return_value)
patcher.start()
self.addCleanup(patcher.stop)
def _assert_streaming_step(self, step, args):
self.runner._steps = [step]
self.assertEqual(self.runner._args_for_streaming_step(0), self._new_basic_args + args)
def _assert_streaming_step_old(self, step, args):
self.runner._hadoop_version = "0.18"
self.runner._steps = [step]
self.assertEqual(self.runner._args_for_streaming_step(0), self._old_basic_args + args)
def test_basic_mapper(self):
self._assert_streaming_step(
{"type": "streaming", "mapper": {"type": "script"}},
["-mapper", "python my_job.py --step-num=0 --mapper", "-jobconf", "mapred.reduce.tasks=0"],
)
def test_basic_reducer(self):
self._assert_streaming_step(
{"type": "streaming", "reducer": {"type": "script"}},
["-mapper", "cat", "-reducer", "python my_job.py --step-num=0 --reducer"],
)
def test_pre_filters(self):
self._assert_streaming_step(
{
"type": "streaming",
"mapper": {"type": "script", "pre_filter": "grep anything"},
"combiner": {"type": "script", "pre_filter": "grep nothing"},
"reducer": {"type": "script", "pre_filter": "grep something"},
},
[
"-mapper",
"bash -c 'grep anything | python my_job.py --step-num=0" " --mapper'",
"-combiner",
"bash -c 'grep nothing | python my_job.py --step-num=0" " --combiner'",
"-reducer",
"bash -c 'grep something | python my_job.py --step-num=0" " --reducer'",
],
)
def test_combiner_018(self):
self._assert_streaming_step_old(
{"type": "streaming", "mapper": {"type": "command", "command": "cat"}, "combiner": {"type": "script"}},
[
"-mapper",
"bash -c 'cat | sort | python my_job.py --step-num=0" " --combiner'",
"-jobconf",
"mapred.reduce.tasks=0",
],
)
def test_pre_filters_018(self):
self._assert_streaming_step_old(
#.........part of the code is omitted here.........