This article collects typical usage examples of the Python function mrjob.util.scrape_options_into_new_groups. If you are unsure what scrape_options_into_new_groups does or how to call it, the examples curated below should help.
The following 13 code examples of scrape_options_into_new_groups are shown, sorted by popularity by default.
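Before the examples, a quick orientation: scrape_options_into_new_groups(source_groups, assignments) copies every optparse Option whose dest appears in assignments out of the source parsers/groups and into the corresponding new parser or group, so command-line tools can reuse a subset of MRJob's options. Below is a minimal, self-contained sketch of the call pattern; the --alpha and --beta options and their dests are placeholders invented for illustration, not part of mrjob:

from optparse import OptionParser

from mrjob.util import scrape_options_into_new_groups

# A source parser with two options (placeholder dests 'alpha' and 'beta').
source_parser = OptionParser()
source_parser.add_option('--alpha', dest='alpha')
source_parser.add_option('--beta', dest='beta')

# Copy only the option whose dest is 'alpha' into a fresh parser.
new_parser = OptionParser()
scrape_options_into_new_groups([source_parser], {new_parser: ('alpha',)})

options, args = new_parser.parse_args(['--alpha', 'happy'])
print(options.alpha)  # -> 'happy'

The assignments dict may also map OptionGroup objects to tuples of dest names, which is how several of the examples below sort scraped options into labeled groups.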
Example 1: _make_option_parser
def _make_option_parser():
    usage = '%prog [options]'
    description = (
        'Create a persistent EMR cluster to run jobs in, and print its ID to'
        ' stdout. WARNING: Do not run'
        ' this without mrjob terminate-idle-clusters in your'
        ' crontab; clusters left idle can quickly become expensive!')
    option_parser = OptionParser(usage=usage, description=description)

    _add_basic_opts(option_parser)
    # these aren't nicely broken down, just scrape specific options
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: (
            'bootstrap_mrjob',
            'label',
            'owner',
        ),
    })
    _add_emr_connect_opts(option_parser)
    _add_emr_launch_opts(option_parser)
    _add_dataproc_emr_opts(option_parser)

    _alphabetize_options(option_parser)
    return option_parser
Example 2: make_option_parser
def make_option_parser():
    usage = '%prog [options] <time-untouched> <URIs>'
    description = (
        'Delete all files in a given URI that are older than a specified'
        ' time.\n\nThe time parameter defines the threshold for removing'
        ' files. If the file has not been accessed for *time*, the file is'
        ' removed. The time argument is a number with an optional'
        ' single-character suffix specifying the units: m for minutes, h for'
        ' hours, d for days. If no suffix is specified, time is in hours.')

    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option(
        '-t', '--test', dest='test', default=False,
        action='store_true',
        help="Don't actually delete any files; just log that we would")

    add_basic_opts(option_parser)
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: ('aws_region', 's3_endpoint'),
    })

    alphabetize_options(option_parser)
    return option_parser
Example 3: make_option_parser
def make_option_parser():
    usage = '%prog [options]'
    description = (
        'Create a persistent EMR job flow to run jobs in. WARNING: do not'
        ' run this without mrjob.tools.emr.terminate.idle_job_flows in your'
        ' crontab; job flows left idle can quickly become expensive!')
    option_parser = OptionParser(usage=usage, description=description)

    def make_option_group(halp):
        g = OptionGroup(option_parser, halp)
        option_parser.add_option_group(g)
        return g

    runner_group = make_option_group('Running the entire job')
    hadoop_emr_opt_group = make_option_group(
        'Running on Hadoop or EMR (these apply when you set -r hadoop or'
        ' -r emr)')
    emr_opt_group = make_option_group(
        'Running on Amazon Elastic MapReduce (these apply when you set'
        ' -r emr)')

    assignments = {
        runner_group: (
            'bootstrap_mrjob',
            'conf_path',
            'quiet',
            'verbose'
        ),
        hadoop_emr_opt_group: (
            'label',
            'owner',
        ),
        emr_opt_group: (
            'additional_emr_info',
            'aws_availability_zone',
            'aws_region',
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_python_packages',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_master_instance_type',
            'ec2_slave_instance_type',
            'emr_endpoint',
            'enable_emr_debugging',
            'hadoop_version',
            'num_ec2_instances',
            's3_endpoint',
            's3_log_uri',
            's3_scratch_uri',
            's3_sync_wait_time',
        ),
    }

    # Scrape options from MRJob and index them by dest
    mr_job = MRJob()
    job_option_groups = (mr_job.option_parser, mr_job.mux_opt_group,
                         mr_job.proto_opt_group, mr_job.runner_opt_group,
                         mr_job.hadoop_emr_opt_group, mr_job.emr_opt_group)
    scrape_options_into_new_groups(job_option_groups, assignments)

    return option_parser
Example 4: test_scrape_all
def test_scrape_all(self):
    assignments = {
        self.new_parser: ('a',),
        self.new_group_1: ('x', 'y'),
    }
    old_groups = (self.original_parser, self.original_group)
    scrape_options_into_new_groups(old_groups, assignments)

    self.assertEqual(self.original_parser.option_list[1:],
                     self.new_parser.option_list[1:])
    self.assertEqual(self.original_group.option_list,
                     self.new_group_1.option_list)
Example 5: test_scrape_different
def test_scrape_different(self):
    assignments = {self.new_parser: ("x",), self.new_group_1: ("y",),
                   self.new_group_2: ("a",)}
    old_groups = (self.original_parser, self.original_group)
    scrape_options_into_new_groups(old_groups, assignments)

    target_1 = self.original_group.option_list[:1]
    target_2 = self.original_group.option_list[1:]
    target_3 = self.original_parser.option_list[1:]

    self.assertEqual(target_1, self.new_parser.option_list[1:])
    self.assertEqual(target_2, self.new_group_1.option_list)
    self.assertEqual(target_3, self.new_group_2.option_list)

    options, args = self.new_parser.parse_args(["-x", "happy"])
    self.assertEqual(options.x, "happy")
Example 6: make_option_parser
def make_option_parser():
    usage = 'usage: %prog [options] JOB_FLOW_ID'
    description = (
        'List, display, and parse Hadoop logs associated with EMR job flows.'
        ' Useful for debugging failed jobs for which mrjob did not display a'
        ' useful error message or for inspecting jobs whose output has been'
        ' lost.')
    option_parser = OptionParser(usage=usage, description=description)

    option_parser.add_option('-f', '--find-failure', dest='find_failure',
                             action='store_true', default=False,
                             help=('Search the logs for information about why'
                                   ' the job failed'))
    option_parser.add_option('-l', '--list', dest='list_relevant',
                             action="store_true", default=False,
                             help='List log files MRJob finds relevant')
    option_parser.add_option('-L', '--list-all', dest='list_all',
                             action="store_true", default=False,
                             help='List all log files')
    option_parser.add_option('-a', '--cat', dest='cat_relevant',
                             action="store_true", default=False,
                             help='Cat log files MRJob finds relevant')
    option_parser.add_option('-A', '--cat-all', dest='cat_all',
                             action="store_true", default=False,
                             help='Cat all log files to JOB_FLOW_ID/')
    option_parser.add_option('-s', '--step-num', dest='step_num',
                             action='store', type='int', default=None,
                             help=('Limit results to a single step. To be used'
                                   ' with --list and --cat.'))
    option_parser.add_option('--counters', dest='get_counters',
                             action='store_true', default=False,
                             help='Show counters from the job flow')

    assignments = {
        option_parser: ('conf_paths', 'quiet', 'verbose',
                        'ec2_key_pair_file', 's3_sync_wait_time')
    }

    mr_job = MRJob()
    job_option_groups = (mr_job.option_parser, mr_job.mux_opt_group,
                         mr_job.proto_opt_group, mr_job.runner_opt_group,
                         mr_job.hadoop_emr_opt_group, mr_job.emr_opt_group,
                         mr_job.hadoop_opts_opt_group)
    scrape_options_into_new_groups(job_option_groups, assignments)
    return option_parser
Example 7: make_option_parser
def make_option_parser():
    usage = 'usage: %prog [options] JOB_FLOW_ID'
    description = (
        'List, display, and parse Hadoop logs associated with EMR job flows.'
        ' Useful for debugging failed jobs for which mrjob did not display a'
        ' useful error message or for inspecting jobs whose output has been'
        ' lost.')
    option_parser = OptionParser(usage=usage, description=description)

    add_basic_opts(option_parser)

    option_parser.add_option('-f', '--find-failure', dest='find_failure',
                             action='store_true', default=False,
                             help=('Search the logs for information about why'
                                   ' the job failed'))
    option_parser.add_option('-l', '--list', dest='list_relevant',
                             action="store_true", default=False,
                             help='List log files MRJob finds relevant')
    option_parser.add_option('-L', '--list-all', dest='list_all',
                             action="store_true", default=False,
                             help='List all log files')
    option_parser.add_option('-a', '--cat', dest='cat_relevant',
                             action="store_true", default=False,
                             help='Cat log files MRJob finds relevant')
    option_parser.add_option('-A', '--cat-all', dest='cat_all',
                             action="store_true", default=False,
                             help='Cat all log files to JOB_FLOW_ID/')
    option_parser.add_option('-s', '--step-num', dest='step_num',
                             action='store', type='int', default=None,
                             help=('Limit results to a single step. To be used'
                                   ' with --list and --cat.'))
    option_parser.add_option('--counters', dest='get_counters',
                             action='store_true', default=False,
                             help='Show counters from the job flow')

    add_emr_connect_opts(option_parser)

    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: ('ec2_key_pair_file', 's3_sync_wait_time', 'ssh_bin')
    })

    alphabetize_options(option_parser)
    return option_parser
Example 8: test_scrape_different
def test_scrape_different(self):
    assignments = {
        self.new_parser: ('x',),
        self.new_group_1: ('y',),
        self.new_group_2: ('a',),
    }
    old_groups = (self.original_parser, self.original_group)
    scrape_options_into_new_groups(old_groups, assignments)

    target_1 = self.original_group.option_list[:1]
    target_2 = self.original_group.option_list[1:]
    target_3 = self.original_parser.option_list[1:]

    assert_equal(target_1, self.new_parser.option_list[1:])
    assert_equal(target_2, self.new_group_1.option_list)
    assert_equal(target_3, self.new_group_2.option_list)

    options, args = self.new_parser.parse_args(['-x', 'happy'])
    assert_equal(options.x, 'happy')
Example 9: main
def main():
    usage = 'usage: %prog JOB_FLOW_ID OUTPUT_DIR [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR job'
                   ' flow. Store stdout and stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)

    assignments = {
        option_parser: ('conf_path', 'quiet', 'verbose',
                        'ec2_key_pair_file')
    }

    option_parser.add_option('-o', '--output-dir', dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                             " JOB_FLOW_ID)")

    mr_job = MRJob()
    scrape_options_into_new_groups(mr_job.all_option_groups(), assignments)

    options, args = option_parser.parse_args()

    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    job_flow_id, cmd_string = args[:2]
    cmd_args = shlex.split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or job_flow_id)

    with EMRJobRunner(emr_job_flow_id=job_flow_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, output_dir, cmd_args)
Example 10: main
def main(cl_args=None):
    usage = 'usage: %prog JOB_FLOW_ID OUTPUT_DIR [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR job'
                   ' flow. Store stdout and stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option('-o', '--output-dir', dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                             " JOB_FLOW_ID)")

    add_basic_opts(option_parser)
    add_emr_connect_opts(option_parser)

    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: ('ec2_key_pair_file', 'ssh_bin'),
    })

    alphabetize_options(option_parser)

    options, args = option_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    job_flow_id, cmd_string = args[:2]
    cmd_args = shlex_split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or job_flow_id)

    with EMRJobRunner(emr_job_flow_id=job_flow_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, output_dir, cmd_args)
Example 11: main
def main():
    usage = 'usage: %prog [options] JOB_FLOW_ID'
    description = (
        'List, display, and parse Hadoop logs associated with EMR job flows.'
        ' Useful for debugging failed jobs for which mrjob did not display a'
        ' useful error message or for inspecting jobs whose output has been'
        ' lost.')
    option_parser = OptionParser(usage=usage, description=description)

    option_parser.add_option('-f', '--find-failure', dest='find_failure',
                             action='store_true', default=False,
                             help=('Search the logs for information about why'
                                   ' the job failed'))
    option_parser.add_option('-l', '--list', dest='list_relevant',
                             action="store_true", default=False,
                             help='List log files MRJob finds relevant')
    option_parser.add_option('-L', '--list-all', dest='list_all',
                             action="store_true", default=False,
                             help='List all log files')
    option_parser.add_option('-a', '--cat', dest='cat_relevant',
                             action="store_true", default=False,
                             help='Cat log files MRJob finds relevant')
    option_parser.add_option('-A', '--cat-all', dest='cat_all',
                             action="store_true", default=False,
                             help='Cat all log files to JOB_FLOW_ID/')
    option_parser.add_option('-s', '--step-num', dest='step_num',
                             action='store', type='int', default=None,
                             help=('Limit results to a single step. To be used'
                                   ' with --list and --cat.'))
    option_parser.add_option('--counters', dest='get_counters',
                             action='store_true', default=False,
                             help='Show counters from the job flow')

    assignments = {
        option_parser: ('conf_path', 'quiet', 'verbose',
                        'ec2_key_pair_file')
    }

    mr_job = MRJob()
    job_option_groups = (mr_job.option_parser, mr_job.mux_opt_group,
                         mr_job.proto_opt_group, mr_job.runner_opt_group,
                         mr_job.hadoop_emr_opt_group, mr_job.emr_opt_group,
                         mr_job.hadoop_opts_opt_group)
    scrape_options_into_new_groups(job_option_groups, assignments)

    options, args = option_parser.parse_args()
    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    if options.step_num:
        step_nums = [options.step_num]
    else:
        step_nums = None

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('quiet', 'verbose', 'list_relevant', 'list_all',
                       'cat_relevant', 'cat_all', 'get_counters', 'step_num',
                       'find_failure'):
        del runner_kwargs[unused_arg]

    with EMRJobRunner(emr_job_flow_id=args[0], **runner_kwargs) as runner:
        if options.list_relevant:
            list_relevant(runner, step_nums)

        if options.list_all:
            list_all(runner)

        if options.cat_relevant:
            cat_relevant(runner, step_nums)

        if options.cat_all:
            cat_all(runner)

        if options.get_counters:
            desc = runner._describe_jobflow()
            runner._set_s3_job_log_uri(desc)
            runner._fetch_counters(
                xrange(1, len(desc.steps) + 1), skip_s3_wait=True)
            runner.print_counters()

        if options.find_failure:
            find_failure(runner, options.step_num)
Example 12: main
def main():
    usage = '%prog [options]'
    description = (
        'Inspect available job flow pools or identify job flows suitable for'
        ' running a job with the specified options.')
    option_parser = OptionParser(usage=usage, description=description)

    import boto.emr.connection
    # make boto parse the HadoopVersion field out of job flow descriptions
    boto.emr.connection.JobFlow.Fields.add('HadoopVersion')

    def make_option_group(halp):
        g = OptionGroup(option_parser, halp)
        option_parser.add_option_group(g)
        return g

    ec2_opt_group = make_option_group('EC2 instance configuration')
    hadoop_opt_group = make_option_group('Hadoop configuration')
    job_opt_group = make_option_group('Job flow configuration')

    assignments = {
        option_parser: (
            'conf_path',
            'emr_job_flow_pool_name',
            'quiet',
            'verbose',
        ),
        ec2_opt_group: (
            'aws_availability_zone',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_key_pair_file',
            'ec2_master_instance_type',
            'ec2_slave_instance_type',
            'emr_endpoint',
            'num_ec2_instances',
        ),
        hadoop_opt_group: (
            'hadoop_version',
            'label',
            'owner',
        ),
        job_opt_group: (
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_mrjob',
            'bootstrap_python_packages',
        ),
    }

    option_parser.add_option('-a', '--all', action='store_true',
                             default=False, dest='list_all',
                             help=('List all available job flows without'
                                   ' filtering by configuration'))
    option_parser.add_option('-f', '--find', action='store_true',
                             default=False, dest='find',
                             help=('Find a job flow matching the pool name,'
                                   ' bootstrap configuration, and instance'
                                   ' number/type as specified on the command'
                                   ' line and in the configuration files'))
    option_parser.add_option('-t', '--terminate', action='store',
                             default=None, dest='terminate',
                             metavar='JOB_FLOW_ID',
                             help=('Terminate all job flows in the given pool'
                                   ' (defaults to pool "default")'))

    # Scrape options from MRJob and index them by dest
    mr_job = MRJob()
    scrape_options_into_new_groups(mr_job.all_option_groups(), assignments)

    options, args = option_parser.parse_args()
    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for non_runner_kwarg in ('quiet', 'verbose', 'list_all', 'find',
                             'terminate'):
        del runner_kwargs[non_runner_kwarg]

    runner = EMRJobRunner(**runner_kwargs)

    if options.list_all:
        pprint_pools(runner)

    if options.find:
        sorted_job_flows = runner.usable_job_flows()
        if sorted_job_flows:
            jf = sorted_job_flows[-1]
            print 'You should use this one:'
            pprint_job_flow(jf)
        else:
            print 'No idle job flows match criteria'

    if options.terminate:
        terminate(runner, options.terminate)
Example 13: make_option_parser
def make_option_parser():
    usage = '%prog [options]'
    description = (
        'Inspect available job flow pools or identify job flows suitable for'
        ' running a job with the specified options.')
    option_parser = OptionParser(usage=usage, description=description)

    def make_option_group(halp):
        g = OptionGroup(option_parser, halp)
        option_parser.add_option_group(g)
        return g

    ec2_opt_group = make_option_group('EC2 instance configuration')
    hadoop_opt_group = make_option_group('Hadoop configuration')
    job_opt_group = make_option_group('Job flow configuration')

    assignments = {
        option_parser: (
            'conf_paths',
            'emr_job_flow_pool_name',
            'quiet',
            'verbose',
        ),
        ec2_opt_group: (
            'aws_availability_zone',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_key_pair_file',
            'ec2_master_instance_type',
            'ec2_core_instance_type',
            'emr_endpoint',
            'num_ec2_instances',
        ),
        hadoop_opt_group: (
            'hadoop_version',
            'label',
            'owner',
        ),
        job_opt_group: (
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_mrjob',
            'bootstrap_python_packages',
        ),
    }

    option_parser.add_option('-a', '--all', action='store_true',
                             default=False, dest='list_all',
                             help=('List all available job flows without'
                                   ' filtering by configuration'))
    option_parser.add_option('-f', '--find', action='store_true',
                             default=False, dest='find',
                             help=('Find a job flow matching the pool name,'
                                   ' bootstrap configuration, and instance'
                                   ' number/type as specified on the command'
                                   ' line and in the configuration files'))
    option_parser.add_option('-t', '--terminate', action='store',
                             default=None, dest='terminate',
                             metavar='JOB_FLOW_ID',
                             help=('Terminate all job flows in the given pool'
                                   ' (defaults to pool "default")'))

    # Scrape options from MRJob and index them by dest
    mr_job = MRJob()
    scrape_options_into_new_groups(mr_job.all_option_groups(), assignments)

    return option_parser