This article collects typical usage examples of the Python class mrjob.hadoop.HadoopJobRunner. If you have been wondering what HadoopJobRunner is for and how to use it, the hand-picked class examples below should help.
The sections that follow show 15 code examples of the HadoopJobRunner class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
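Before the examples, here is a minimal orientation sketch of constructing a HadoopJobRunner directly. It uses only constructor options and methods that appear in the examples below (hadoop_bin, hadoop_streaming_jar, mr_job_script, stdin, ls(), du(), get_hadoop_version()); the script name and HDFS path are illustrative assumptions, and in everyday use a runner is usually obtained via MRJob.make_runner() rather than built by hand.

from io import BytesIO

from mrjob.hadoop import HadoopJobRunner

# Illustrative sketch only: option values are placeholders, not a verified setup.
runner = HadoopJobRunner(
    hadoop_bin='hadoop',                    # path to the hadoop binary
    hadoop_streaming_jar='streaming.jar',   # Hadoop Streaming jar to launch steps with
    mr_job_script='my_job.py',              # the MRJob script to run
    stdin=BytesIO())                        # job input passed on stdin

# filesystem-style helpers demonstrated in the examples below
print(runner.get_hadoop_version())
for uri in runner.ls('hdfs:///user/me/data'):   # hypothetical HDFS path
    print(uri, runner.du(uri))                  # du() returns a size in bytes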
Example 1: FindProbableCauseOfFailureTestCase
class FindProbableCauseOfFailureTestCase(MockHadoopTestCase):

    # integration tests for _find_probable_cause_of_failure()

    def setUp(self):
        super(FindProbableCauseOfFailureTestCase, self).setUp()
        os.environ['MOCK_HADOOP_VERSION'] = '2.7.0'
        self.runner = HadoopJobRunner()

    def test_empty(self):
        self.assertEqual(self.runner._find_probable_cause_of_failure(), None)

    def test_yarn_python_exception(self):
        APPLICATION_ID = 'application_1450486922681_0004'
        CONTAINER_ID = 'container_1450486922681_0005_01_000003'

        log_subdir = os.path.join(
            os.environ['HADOOP_HOME'], 'logs',
            'userlogs', APPLICATION_ID, CONTAINER_ID)
        os.makedirs(log_subdir)

        syslog_path = os.path.join(log_subdir, 'syslog')
        with open(syslog_path, 'w') as syslog:
            syslog.write(
                '2015-12-21 14:06:17,707 INFO [main]'
                ' org.apache.hadoop.mapred.MapTask: Processing split:'
                ' hdfs://e4270474c8ee:9000/user/root/tmp/mrjob'
                '/mr_boom.root.20151221.190511.059097/files'
                '/bootstrap.sh:0+335\n')
            syslog.write(
                '2015-12-21 14:06:18,538 WARN [main]'
                ' org.apache.hadoop.mapred.YarnChild: Exception running child'
                ' : java.lang.RuntimeException:'
                ' PipeMapRed.waitOutputThreads(): subprocess failed with'
                ' code 1\n')
            syslog.write(
                ' at org.apache.hadoop.streaming.PipeMapRed'
                '.waitOutputThreads(PipeMapRed.java:322)\n')

        stderr_path = os.path.join(log_subdir, 'stderr')
        with open(stderr_path, 'w') as stderr:
            stderr.write('Traceback (most recent call last):\n')
            stderr.write(' File "mr_boom.py", line 10, in <module>\n')
            stderr.write(' MRBoom.run()\n')
            stderr.write('Exception: BOOM\n')

        # need application_id
        self.assertIsNone(self.runner._find_probable_cause_of_failure())

        cause = self.runner._find_probable_cause_of_failure(
            application_id=APPLICATION_ID)
        self.assertTrue(cause)

        self.assertEqual(cause['syslog']['path'], syslog_path)
        self.assertTrue(cause['syslog']['error'])
        self.assertEqual(cause['stderr']['path'], stderr_path)
        self.assertTrue(cause['stderr']['error'])
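Based on the assertions above, the probable-cause result is a dictionary with 'syslog' and 'stderr' entries, each carrying a 'path' and an 'error'. A minimal sketch of how it might be inspected after a failed step; the runner setup and the application id are assumed context, and only the keys checked by the test are read:

# Hypothetical follow-up to a failed run; `runner` and the application id are assumed.
cause = runner._find_probable_cause_of_failure(
    application_id='application_1450486922681_0004')

if cause:
    # the test above asserts these keys exist, so they should be safe to read
    print('syslog:', cause['syslog']['path'])
    print('  error:', cause['syslog']['error'])
    print('stderr:', cause['stderr']['path'])
    print('  error:', cause['stderr']['error'])
else:
    print('no probable cause found')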
Example 2: test_du
def test_du(self):
    root = os.environ['MOCK_HDFS_ROOT']

    data_path_1 = os.path.join(root, 'data1')
    with open(data_path_1, 'w') as f:
        f.write("abcd")
    remote_data_1 = 'hdfs:///data1'

    data_dir = os.path.join(root, 'more')
    os.mkdir(data_dir)
    remote_dir = 'hdfs:///more'

    data_path_2 = os.path.join(data_dir, 'data2')
    with open(data_path_2, 'w') as f:
        f.write("defg")
    remote_data_2 = 'hdfs:///more/data2'

    data_path_3 = os.path.join(data_dir, 'data3')
    with open(data_path_3, 'w') as f:
        f.write("hijk")
    remote_data_3 = 'hdfs:///more/data3'

    runner = HadoopJobRunner(conf_path=False)

    self.assertEqual(runner.du(root), 12)
    self.assertEqual(runner.du(remote_dir), 8)
    self.assertEqual(runner.du(remote_dir + '/*'), 8)
    self.assertEqual(runner.du(remote_data_1), 4)
    self.assertEqual(runner.du(remote_data_2), 4)
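In plain terms, the test shows that du() totals file sizes in bytes and accepts local paths, hdfs:// URIs, directories, and globs alike. A tiny sketch, assuming a configured runner and the hdfs:///more layout created above:

# Assumes `runner` is configured and hdfs:///more contains data2 and data3 as in the test.
total = runner.du('hdfs:///more')         # 8: data2 (4 bytes) + data3 (4 bytes)
globbed = runner.du('hdfs:///more/*')     # 8: globs are expanded before summing
single = runner.du('hdfs:///more/data2')  # 4: a single file's size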
Example 3: test_uris
def test_uris(self):
    runner = HadoopJobRunner()
    list(runner.ls('hdfs://tmp/waffles'))
    list(runner.ls('lego://my/ego'))
    list(runner.ls('/tmp'))

    with open(os.environ['MOCK_HADOOP_LOG']) as mock_log:
        hadoop_cmd_args = [shlex.split(line) for line in mock_log]

    # only the two non-local URIs are passed to `hadoop fs -lsr`
    assert_equal(hadoop_cmd_args, [
        ['fs', '-lsr', 'hdfs://tmp/waffles'],
        ['fs', '-lsr', 'lego://my/ego'],
    ])
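The point of this test is the split between remote and local paths: hdfs:// and unknown-scheme URIs are handed off to `hadoop fs -lsr`, while `/tmp` is listed locally and never reaches the mock Hadoop log. A small, hypothetical usage sketch (the directory name is made up):

# ls() yields paths/URIs lazily, so wrap it in list() if you need them all at once.
for uri in runner.ls('hdfs:///user/me/output'):  # hypothetical HDFS directory
    print(uri)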
Example 4: test_hadoop_mapred_home_beats_infer_from_hadoop_bin
def test_hadoop_mapred_home_beats_infer_from_hadoop_bin(self):
    self.runner = HadoopJobRunner(
        hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])
    self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')

    self.test_hadoop_mapred_home()
Example 5: test_infer_from_hadoop_bin_parent_dir
def test_infer_from_hadoop_bin_parent_dir(self):
    self.runner = HadoopJobRunner(
        hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])
    self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')

    self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                     '/ha/do/op/bin-parent/hadoop-streaming.jar')
Example 6: setUp
def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar')
    self.runner._hadoop_version = '0.20.204'
    self.simple_patch(self.runner, '_new_upload_args',
                      return_value=['new_upload_args'])
    self.simple_patch(self.runner, '_old_upload_args',
                      return_value=['old_upload_args'])
    self.simple_patch(self.runner, '_hadoop_conf_args',
                      return_value=['hadoop_conf_args'])
    self.simple_patch(self.runner, '_hdfs_step_input_files',
                      return_value=['hdfs_step_input_files'])
    self.simple_patch(self.runner, '_hdfs_step_output_dir',
                      return_value='hdfs_step_output_dir')
    self.runner._script = {'name': 'my_job.py'}

    self._new_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'new_upload_args', 'hadoop_conf_args',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir']

    self._old_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'hadoop_conf_args',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir',
        'old_upload_args']
Example 7: test_infer_from_hadoop_bin_realpath
def test_infer_from_hadoop_bin_realpath(self):
    with patch('posixpath.realpath', return_value='/ha/do/op/bin'):
        self.runner = HadoopJobRunner(hadoop_bin=['/usr/bin/hadoop'])
        self.mock_paths.append('/ha/do/op/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/hadoop-streaming.jar')
Example 8: setUp
def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
        mr_job_script='my_job.py', stdin=StringIO())
    self.runner._add_job_files_for_upload()
    self.runner._hadoop_version = '0.20.204'
    self.simple_patch(self.runner, '_new_upload_args',
                      return_value=['new_upload_args'])
    self.simple_patch(self.runner, '_old_upload_args',
                      return_value=['old_upload_args'])
    self.simple_patch(self.runner, '_hadoop_args_for_step',
                      return_value=['hadoop_args_for_step'])
    self.simple_patch(self.runner, '_hdfs_step_input_files',
                      return_value=['hdfs_step_input_files'])
    self.simple_patch(self.runner, '_hdfs_step_output_dir',
                      return_value='hdfs_step_output_dir')
    self.runner._script_path = 'my_job.py'

    self._new_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'new_upload_args', 'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir']

    self._old_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir',
        'old_upload_args']
Example 9: setUp
def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
        mr_job_script='my_job.py', stdin=BytesIO())
    self.runner._add_job_files_for_upload()

    self.start(patch.object(self.runner, '_upload_args',
                            return_value=['new_upload_args']))
    self.start(patch.object(self.runner, '_pre_0_20_upload_args',
                            return_value=['old_upload_args']))
    self.start(patch.object(self.runner, '_hadoop_args_for_step',
                            return_value=['hadoop_args_for_step']))
    self.start(patch.object(self.runner, '_hdfs_step_input_files',
                            return_value=['hdfs_step_input_files']))
    self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                            return_value='hdfs_step_output_dir'))
    self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                            return_value='1.2.0'))

    self.runner._script_path = 'my_job.py'

    self._new_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'new_upload_args', 'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir']

    self._old_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir',
        'old_upload_args']
Example 10: test_hadoop_log_dirs_opt
def test_hadoop_log_dirs_opt(self):
    self.runner = HadoopJobRunner(hadoop_log_dirs=['/logs1', '/logs2'])

    os.environ['HADOOP_LOG_DIR'] = '/path/to/hadoop-log-dir'

    # setting hadoop_log_dirs short-circuits automatic discovery of logs
    self.assertEqual(
        list(self.runner._hadoop_log_dirs()),
        ['/logs1', '/logs2'])
Example 11: setUp
def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin='hadoop', hadoop_streaming_jar='<streaming jar>',
        mr_job_script='my_job.py', stdin=BytesIO())
    self.runner._add_job_files_for_upload()

    self.start(patch.object(self.runner, '_upload_args',
                            return_value=['<upload args>']))
    self.start(patch.object(self.runner, '_hadoop_args_for_step',
                            return_value=['<hadoop args for step>']))
    self.start(patch.object(self.runner, '_hdfs_step_input_files',
                            return_value=['<hdfs step input files>']))
    self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                            return_value='<hdfs step output dir>'))
    self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                            return_value='2.7.1'))

    self.runner._script_path = 'my_job.py'
Example 12: setUp
def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin="hadoop", hadoop_streaming_jar="streaming.jar", mr_job_script="my_job.py", stdin=StringIO()
    )
    self.runner._add_job_files_for_upload()
    self.runner._hadoop_version = "0.20.204"
    self.simple_patch(self.runner, "_new_upload_args", return_value=["new_upload_args"])
    self.simple_patch(self.runner, "_old_upload_args", return_value=["old_upload_args"])
    self.simple_patch(self.runner, "_hadoop_args_for_step", return_value=["hadoop_args_for_step"])
    self.simple_patch(self.runner, "_hdfs_step_input_files", return_value=["hdfs_step_input_files"])
    self.simple_patch(self.runner, "_hdfs_step_output_dir", return_value="hdfs_step_output_dir")
    self.runner._script_path = "my_job.py"

    self._new_basic_args = [
        "hadoop",
        "jar",
        "streaming.jar",
        "new_upload_args",
        "hadoop_args_for_step",
        "-input",
        "hdfs_step_input_files",
        "-output",
        "hdfs_step_output_dir",
    ]
    self._old_basic_args = [
        "hadoop",
        "jar",
        "streaming.jar",
        "hadoop_args_for_step",
        "-input",
        "hdfs_step_input_files",
        "-output",
        "hdfs_step_output_dir",
        "old_upload_args",
    ]
Example 13: StreamingArgsTestCase
class StreamingArgsTestCase(EmptyMrjobConfTestCase):

    MRJOB_CONF_CONTENTS = {'runners': {'hadoop': {
        'hadoop_home': 'kansas',
        'hadoop_streaming_jar': 'binks.jar.jar',
    }}}

    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
            mr_job_script='my_job.py', stdin=BytesIO())
        self.runner._add_job_files_for_upload()

        self.start(patch.object(self.runner, '_upload_args',
                                return_value=['new_upload_args']))
        self.start(patch.object(self.runner, '_pre_0_20_upload_args',
                                return_value=['old_upload_args']))
        self.start(patch.object(self.runner, '_hadoop_args_for_step',
                                return_value=['hadoop_args_for_step']))
        self.start(patch.object(self.runner, '_hdfs_step_input_files',
                                return_value=['hdfs_step_input_files']))
        self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                                return_value='hdfs_step_output_dir'))
        self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                                return_value='1.2.0'))

        self.runner._script_path = 'my_job.py'

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'new_upload_args', 'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir']

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir',
            'old_upload_args']

    def _assert_streaming_step(self, step, args):
        self.runner._steps = [step]
        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            self._new_basic_args + args)

    def _assert_streaming_step_old(self, step, args):
        HadoopFilesystem.get_hadoop_version.return_value = '0.18'
        self.runner._steps = [step]
        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            self._old_basic_args + args)

    def test_basic_mapper(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            },
            ['-mapper',
             PYTHON_BIN + ' my_job.py --step-num=0 --mapper',
             '-jobconf',
             'mapred.reduce.tasks=0'])

    def test_basic_reducer(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'reducer': {
                    'type': 'script',
                },
            },
            ['-mapper',
             'cat',
             '-reducer',
             PYTHON_BIN + ' my_job.py --step-num=0 --reducer'])

    def test_pre_filters(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ["-mapper",
             "bash -c 'grep anything | " + PYTHON_BIN +
             " my_job.py --step-num=0 --mapper'",
             # ......... part of the code is omitted here .........
Example 14: test_get_hadoop_version
def test_get_hadoop_version(self):
    runner = HadoopJobRunner()
    self.assertEqual(runner.get_hadoop_version(), '1.2.0')
Example 15: setUp
def setUp(self):
    super(FindProbableCauseOfFailureTestCase, self).setUp()
    os.environ['MOCK_HADOOP_VERSION'] = '2.7.0'
    self.runner = HadoopJobRunner()