This article collects typical usage examples of the shlex_split function from Python's mrjob.util module. If you have been wondering what exactly shlex_split does or how to use it, the hand-picked code examples below should help.
The following shows 12 code examples of shlex_split, sorted by popularity by default.
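Before diving into the examples: shlex_split tokenizes a shell-quoted command string into an argv-style list, much like the standard library's shlex.split(). The short sketch below is illustrative only (the command string is made up), but it shows the behavior the examples rely on:

    from mrjob.util import shlex_split

    # a shell-quoted command string becomes an argv-style list
    cmd = 'hive -e "SELECT COUNT(*) FROM logs" --verbose'
    print(shlex_split(cmd))
    # expected output: ['hive', '-e', 'SELECT COUNT(*) FROM logs', '--verbose']
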
Example 1: _substep_args
def _substep_args(self, step_num, mrc):
    step = self._get_step(step_num)

    if step[mrc]['type'] == 'command':
        cmd = step[mrc]['command']

        # never wrap custom hadoop streaming commands in bash
        if isinstance(cmd, string_types):
            return shlex_split(cmd)
        else:
            return cmd

    elif step[mrc]['type'] == 'script':
        script_args = self._script_args_for_step(
            step_num, mrc, input_manifest=step.get('input_manifest'))

        if 'pre_filter' in step[mrc]:
            return self._sh_wrap(
                '%s | %s' % (step[mrc]['pre_filter'],
                             cmd_line(script_args)))
        else:
            return script_args
    else:
        raise ValueError("Invalid %s step %d: %r" % (
            mrc, step_num, step[mrc]))

Example 2: _substep_args
def _substep_args(self, step_dict, step_num, mrc, input_path=None):
    if step_dict['type'] != 'streaming':
        raise Exception("LocalMRJobRunner cannot run %s steps." %
                        step_dict['type'])

    if step_dict[mrc]['type'] == 'command':
        if input_path is None:
            return [shlex_split(step_dict[mrc]['command'])]
        else:
            return [
                ['cat', input_path],
                shlex_split(step_dict[mrc]['command'])]

    if step_dict[mrc]['type'] == 'script':
        args = self._script_args_for_step(step_num, mrc)
        if input_path is None:
            return [args]
        else:
            return [args + [input_path]]

Example 3: get_mock_hadoop_cmd_args
def get_mock_hadoop_cmd_args():
    """Get a list for each invocation of hadoop, each containing a list of
    arguments (not including the hadoop binary's path)."""
    cmd_log = os.path.join(get_mock_dir(), 'cmd.log')

    if not os.path.exists(cmd_log):
        return []

    with open(cmd_log) as f:
        return [shlex_split(cmd) for cmd in f]

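To make the pattern above concrete, here is a minimal sketch of parsing logged command lines with shlex_split; the log contents are hypothetical, not taken from mrjob's test fixtures:

    from mrjob.util import shlex_split

    # hypothetical contents of cmd.log, one hadoop invocation per line
    fake_log_lines = [
        'fs -put /tmp/input hdfs:///data/foo\n',
        'jar streaming.jar -mapper "python job.py --mapper"\n',
    ]
    parsed = [shlex_split(cmd) for cmd in fake_log_lines]
    # parsed[1] == ['jar', 'streaming.jar', '-mapper', 'python job.py --mapper']
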
Example 4: combine_cmds
def combine_cmds(*cmds):
    """Take zero or more commands to run on the command line, and return
    the last one that is not ``None``. Each command should either be a list
    containing the command plus switches, or a string, which will be parsed
    with :py:func:`shlex.split`. The string must either be a byte string or a
    unicode string containing no non-ASCII characters.

    Returns either ``None`` or a list containing the command plus arguments.
    """
    cmd = combine_values(*cmds)

    if cmd is None:
        return None
    elif isinstance(cmd, basestring):
        return shlex_split(cmd)
    else:
        return list(cmd)

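A brief usage sketch of the precedence described in the docstring; it assumes this excerpt comes from mrjob.conf (where mrjob's option-combining helpers live):

    from mrjob.conf import combine_cmds  # assumed location of this excerpt

    # the last non-None command wins; strings are tokenized, lists pass through
    print(combine_cmds('python', ['python', '-v'], None))
    # expected output: ['python', '-v']
    print(combine_cmds(None, 'ssh -i mykey.pem'))
    # expected output: ['ssh', '-i', 'mykey.pem']
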
Example 5: main
def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    def make_job(*args):
        j = job_class(job_args + list(args))
        j.sandbox()  # so Spark doesn't try to serialize stdin
        return j

    # get job steps. don't pass --steps, which is deprecated
    steps = make_job().steps()

    # pick steps
    start = args.first_step_num
    end = None if args.last_step_num is None else args.last_step_num + 1
    steps_to_run = list(enumerate(steps))[start:end]

    # load initial data
    from pyspark import SparkContext
    sc = SparkContext()
    rdd = sc.textFile(args.input_path, use_unicode=False)

    # run steps
    for step_num, step in steps_to_run:
        rdd = _run_step(step, step_num, rdd, make_job)

    # write the results
    rdd.saveAsTextFile(
        args.output_path, compressionCodecClass=args.compression_codec)

Example 6: main
def main(cl_args=None):
    usage = 'usage: %(prog)s CLUSTER_ID [options] "command string"'
    description = ('Run a command on the master and all worker nodes of an EMR'
                   ' cluster. Store stdout/stderr for results in OUTPUT_DIR.')

    arg_parser = ArgumentParser(usage=usage, description=description)
    arg_parser.add_argument('-o', '--output-dir', dest='output_dir',
                            default=None,
                            help="Specify an output directory (default:"
                                 " CLUSTER_ID)")
    arg_parser.add_argument(dest='cluster_id',
                            help='ID of cluster to run command on')
    arg_parser.add_argument(dest='cmd_string',
                            help='command to run, as a single string')

    _add_basic_args(arg_parser)
    _add_runner_args(
        arg_parser,
        {'ec2_key_pair_file', 'ssh_bin'} | _filter_by_role(
            EMRJobRunner.OPT_NAMES, 'connect')
    )

    _alphabetize_actions(arg_parser)

    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('cluster_id', 'cmd_string', 'output_dir',
                       'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    cmd_args = shlex_split(options.cmd_string)

    output_dir = os.path.abspath(options.output_dir or options.cluster_id)

    with EMRJobRunner(
            cluster_id=options.cluster_id, **runner_kwargs) as runner:
        _run_on_all_nodes(runner, output_dir, cmd_args)

Example 7: main
def main():
    usage = 'usage: %prog JOB_FLOW_ID OUTPUT_DIR [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR job'
                   ' flow. Store stdout and stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)

    assignments = {
        option_parser: ('conf_paths', 'quiet', 'verbose',
                        'ec2_key_pair_file')
    }

    option_parser.add_option('-o', '--output-dir', dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                                  " JOB_FLOW_ID)")

    mr_job = MRJob()
    scrape_options_into_new_groups(mr_job.all_option_groups(), assignments)

    options, args = option_parser.parse_args()

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    job_flow_id, cmd_string = args[:2]
    cmd_args = shlex_split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or job_flow_id)

    with EMRJobRunner(emr_job_flow_id=job_flow_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, output_dir, cmd_args)

Example 8: main
def main(cl_args=None):
    usage = 'usage: %prog CLUSTER_ID [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR'
                   ' cluster. Store stdout/stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option('-o', '--output-dir', dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                                  " CLUSTER_ID)")

    _add_basic_options(option_parser)
    _add_runner_options(
        option_parser,
        _pick_runner_opts('emr', 'connect') | set(
            ['ssh_bin', 'ec2_key_pair_file'])
    )

    _alphabetize_options(option_parser)

    options, args = option_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    cluster_id, cmd_string = args[:2]
    cmd_args = shlex_split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or cluster_id)

    with EMRJobRunner(cluster_id=cluster_id, **runner_kwargs) as runner:
        _run_on_all_nodes(runner, output_dir, cmd_args)

Example 9: _test_end_to_end
def _test_end_to_end(self, args=()):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin,
                'fs', '-put', input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(['-r', 'hadoop', '-v',
                           '--no-conf', '--hadoop-arg', '-libjar',
                           '--hadoop-arg', 'containsJars.jar'] + list(args)
                          + ['-', local_input_path, remote_input_path]
                          + ['--jobconf', 'x=y'])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        self.assertEqual(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        self.assertEqual(os.listdir(home_dir), ['tmp'])
        self.assertEqual(os.listdir(os.path.join(home_dir, 'tmp')),
                         ['mrjob'])
        self.assertEqual(runner._opts['hadoop_extra_args'],
                         ['-libjar', 'containsJars.jar'])

        # make sure mrjob.tar.gz was uploaded
        self.assertTrue(os.path.exists(runner._mrjob_tar_gz_path))
        self.assertIn(runner._mrjob_tar_gz_path,
                      runner._upload_mgr.path_to_uri())

        # make sure setup script exists, and mrjob.tar.gz is added
        # to PYTHONPATH in it
        self.assertTrue(os.path.exists(runner._setup_wrapper_script_path))
        self.assertIn(runner._setup_wrapper_script_path,
                      runner._upload_mgr.path_to_uri())
        mrjob_tar_gz_name = runner._working_dir_mgr.name(
            'archive', runner._mrjob_tar_gz_path)
        with open(runner._setup_wrapper_script_path) as wrapper:
            self.assertTrue(any(
                ('export PYTHONPATH' in line and mrjob_tar_gz_name in line)
                for line in wrapper))

        self.assertEqual(sorted(results),
                         [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

        # make sure we called hadoop the way we expected
        with open(os.environ['MOCK_HADOOP_LOG']) as mock_log:
            hadoop_cmd_args = [shlex_split(cmd) for cmd in mock_log]

        jar_cmd_args = [cmd_args for cmd_args in hadoop_cmd_args
                        if cmd_args[:1] == ['jar']]
        self.assertEqual(len(jar_cmd_args), 2)
        step_0_args, step_1_args = jar_cmd_args

        # check input/output format
        self.assertIn('-inputformat', step_0_args)
        self.assertNotIn('-outputformat', step_0_args)
        self.assertNotIn('-inputformat', step_1_args)
        self.assertIn('-outputformat', step_1_args)

        # make sure -libjar extra arg comes before -mapper
        for args in (step_0_args, step_1_args):
            self.assertIn('-libjar', args)
            self.assertIn('-mapper', args)
            self.assertLess(args.index('-libjar'), args.index('-mapper'))

        # make sure -jobconf made it through
        self.assertIn('-D', step_0_args)
        self.assertIn('x=y', step_0_args)
        self.assertIn('-D', step_1_args)
        # job overrides jobconf in step 1
        self.assertIn('x=z', step_1_args)
#......... part of the code omitted here .........
Example 10: _filter_if_any
def _filter_if_any(self, substep_dict):
    if substep_dict['type'] == 'script':
        if 'pre_filter' in substep_dict:
            return shlex_split(substep_dict['pre_filter'])
    return None

Example 11: _test_end_to_end
def _test_end_to_end(self, args=()):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO("foo\nbar\n")

    local_input_path = os.path.join(self.tmp_dir, "input")
    with open(local_input_path, "w") as local_input_file:
        local_input_file.write("bar\nqux\n")

    input_to_upload = os.path.join(self.tmp_dir, "remote_input")
    with open(input_to_upload, "w") as input_to_upload_file:
        input_to_upload_file.write("foo\n")
    remote_input_path = "hdfs:///data/foo"
    check_call([self.hadoop_bin, "fs", "-put", input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([""])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(
        ["-r", "hadoop", "-v", "--no-conf", "--hadoop-arg", "-libjar", "--hadoop-arg", "containsJars.jar"]
        + list(args)
        + ["-", local_input_path, remote_input_path]
        + ["--jobconf", "x=y"]
    )
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ["MOCK_HDFS_ROOT"]
        self.assertEqual(sorted(os.listdir(hdfs_root)), ["data", "user"])
        home_dir = os.path.join(hdfs_root, "user", getpass.getuser())
        self.assertEqual(os.listdir(home_dir), ["tmp"])
        self.assertEqual(os.listdir(os.path.join(home_dir, "tmp")), ["mrjob"])
        self.assertEqual(runner._opts["hadoop_extra_args"], ["-libjar", "containsJars.jar"])

        # make sure mrjob.tar.gz was uploaded
        self.assertTrue(os.path.exists(runner._mrjob_tar_gz_path))
        self.assertIn(runner._mrjob_tar_gz_path, runner._upload_mgr.path_to_uri())

        # make sure setup script exists, and mrjob.tar.gz is added
        # to PYTHONPATH in it
        self.assertTrue(os.path.exists(runner._setup_wrapper_script_path))
        self.assertIn(runner._setup_wrapper_script_path, runner._upload_mgr.path_to_uri())
        mrjob_tar_gz_name = runner._working_dir_mgr.name("archive", runner._mrjob_tar_gz_path)
        with open(runner._setup_wrapper_script_path) as wrapper:
            self.assertTrue(any(("export PYTHONPATH" in line and mrjob_tar_gz_name in line) for line in wrapper))

        self.assertEqual(sorted(results), [(1, "qux"), (2, "bar"), (2, "foo"), (5, None)])

        # make sure we called hadoop the way we expected
        with open(os.environ["MOCK_HADOOP_LOG"]) as mock_log:
            hadoop_cmd_args = [shlex_split(cmd) for cmd in mock_log]

        jar_cmd_args = [cmd_args for cmd_args in hadoop_cmd_args if cmd_args[:1] == ["jar"]]
        self.assertEqual(len(jar_cmd_args), 2)
        step_0_args, step_1_args = jar_cmd_args

        # check input/output format
        self.assertIn("-inputformat", step_0_args)
        self.assertNotIn("-outputformat", step_0_args)
        self.assertNotIn("-inputformat", step_1_args)
        self.assertIn("-outputformat", step_1_args)

        # make sure -libjar extra arg comes before -mapper
        for args in (step_0_args, step_1_args):
            self.assertIn("-libjar", args)
            self.assertIn("-mapper", args)
            self.assertLess(args.index("-libjar"), args.index("-mapper"))

        # make sure -jobconf made it through
        self.assertIn("-D", step_0_args)
        self.assertIn("x=y", step_0_args)
        self.assertIn("-D", step_1_args)
        # job overrides jobconf in step 1
        self.assertIn("x=z", step_1_args)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

Example 12: main
def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    if args.num_reducers is not None and args.num_reducers <= 0:
        raise ValueError(
            'You can only configure num_reducers to positive number.')

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    # load initial data
    from pyspark import SparkContext

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    # determine hadoop_*_format, steps
    # try to avoid instantiating a job in the driver; see #2044
    job = None

    if args.hadoop_input_format is None:
        job = job or job_class(job_args)
        hadoop_input_format = job.hadoop_input_format()
    else:
        hadoop_input_format = args.hadoop_input_format or None

    if args.hadoop_output_format is None:
        job = job or job_class(job_args)
        hadoop_output_format = job.hadoop_output_format()
    else:
        hadoop_output_format = args.hadoop_output_format or None

    if args.sort_values is None:
        job = job or job_class(job_args)
        sort_values = job.sort_values()
    else:
        sort_values = args.sort_values

    if args.steps_desc is None:
        job = job or job_class(job_args)
        steps = [step.description(step_num)
                 for step_num, step in enumerate(job.steps())]
    else:
        steps = json.loads(args.steps_desc)

    # pick steps
    start = args.first_step_num or 0
    end = None if args.last_step_num is None else args.last_step_num + 1
    steps_to_run = list(enumerate(steps))[start:end]

    sc = SparkContext()

    # keep track of one set of counters per job step
    counter_accumulators = [
        sc.accumulator(defaultdict(dict), CounterAccumulator())
        for _ in steps_to_run
    ]

    def make_increment_counter(step_num):
        counter_accumulator = counter_accumulators[step_num - start]

        def increment_counter(group, name, amount=1):
            counter_accumulator.add({group: {name: amount}})

        return increment_counter

    def make_mrc_job(mrc, step_num):
        j = job_class(job_args + [
            '--%s' % mrc, '--step-num=%d' % step_num
        ])

        # patch increment_counter() to update the accumulator for this step
        j.increment_counter = make_increment_counter(step_num)

        return j

    try:
        if hadoop_input_format:
            rdd = sc.hadoopFile(
                args.input_path,
                inputFormatClass=hadoop_input_format,
                keyClass='org.apache.hadoop.io.Text',
                valueClass='org.apache.hadoop.io.Text')

            # hadoopFile loads each line as a key-value pair in which the
            # contents of the line are the key and the value is an empty
            # string. Convert to an rdd of just lines, encoded as bytes.
            rdd = rdd.map(lambda kv: kv[0].encode('utf-8'))
        else:
            rdd = sc.textFile(args.input_path, use_unicode=False)

        # run steps
#......... part of the code omitted here .........