This article collects typical usage examples of the MRJob class from Python's mrjob.job module. If you have been wondering what MRJob is for and how to use it, the curated examples below should help. The following presents 15 code examples of the MRJob class, sorted by popularity by default.
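Before the excerpts, here is a minimal, self-contained word-count job in the canonical mrjob style (the class name MRWordCount is illustrative), showing the structure most of the examples below plug into:

from mrjob.job import MRJob


class MRWordCount(MRJob):

    # for raw text input, the mapper receives (None, line) pairs
    def mapper(self, _, line):
        for word in line.split():
            yield word, 1

    # the reducer receives each word with an iterator over its counts
    def reducer(self, word, counts):
        yield word, sum(counts)


if __name__ == '__main__':
    MRWordCount.run()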
Example 1: reducer
def reducer(self, n, vars):
    MRJob.set_status(self, "=============> reducer called")
    samples_from_mappers = []
    counts_from_mappers = []
    # First read all the counts from the different mappers so we know the
    # total number of items and can give each of the sample sets coming
    # from different mappers its appropriate weight
    total_counts_from_mappers = 0
    for x in vars:
        input = json.loads(x)
        total_counts_from_mappers += input[0]
        counts_from_mappers.append(input[0])
        samples_from_mappers.append(input[1])
    # Now, based on the number of samples in each mapper, select the
    # appropriate number of samples from samples_from_mappers
    i = 0
    for sample_set in samples_from_mappers:
        weight = counts_from_mappers[i] * 1.0 / total_counts_from_mappers
        number_of_needed_samples = int(round(weight * self.options.sample_size))
        for j in range(number_of_needed_samples):
            yield 1, sample_set.pop()
        i += 1
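The reducer above reads self.options.sample_size, so the job must declare a passthrough option. A minimal sketch of how that could look in mrjob 0.6+ (the class name MRStratifiedSample is hypothetical; older mrjob versions use configure_options() and add_passthrough_option() instead):

def configure_args(self):
    # MRStratifiedSample is a hypothetical name for the job class this
    # reducer belongs to
    super(MRStratifiedSample, self).configure_args()
    self.add_passthru_arg(
        '--sample-size', dest='sample_size', type=int, default=100,
        help='total number of samples the reducer should emit')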
Example 2: main
def main(cl_args=None):
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(cl_args)
    MRJob.set_up_logging(quiet=options.quiet,
                         verbose=options.verbose)
    # max_hours_idle -> max_mins_idle
    max_mins_idle = options.max_mins_idle
    if max_mins_idle is None and options.max_hours_idle is not None:
        log.warning('--max-hours-idle is deprecated and will be removed'
                    ' in v0.7.0. Please use --max-mins-idle instead.')
        max_mins_idle = options.max_hours_idle * 60
    if options.mins_to_end_of_hour is not None:
        log.warning('--mins-to-end-of-hour is deprecated as of v0.6.0'
                    ' and does nothing')
    _maybe_terminate_clusters(
        dry_run=options.dry_run,
        max_mins_idle=max_mins_idle,
        unpooled_only=options.unpooled_only,
        now=_boto3_now(),
        pool_name=options.pool_name,
        pooled_only=options.pooled_only,
        max_mins_locked=options.max_mins_locked,
        quiet=options.quiet,
        **_runner_kwargs(options)
    )
Example 3: test_cmd_line_options
def test_cmd_line_options(self):
    mr_job = MRJob(
        ["--partitioner", "java.lang.Object",
         "--partitioner", "org.apache.hadoop.mapreduce.Partitioner"]
    )
    # the second option takes priority
    self.assertEqual(mr_job.job_runner_kwargs()["partitioner"],
                     "org.apache.hadoop.mapreduce.Partitioner")
Example 4: reducer_final
def reducer_final(self):
    MRJob.set_status(self, "=============> reducer final called")
    for label in self.output:
        stratum_samples = self.output[label]
        yield label, (len(stratum_samples), stratum_samples)
Example 5: test_spark
def test_spark(self):
    job = MRJob(["--spark", "input_dir", "output_dir"])
    job.spark = MagicMock()
    job.execute()
    job.spark.assert_called_once_with("input_dir", "output_dir")
Example 6: mapper_final
def mapper_final(self):
    MRJob.set_status(self, "=============> mapper final called")
    out = [self.count, self.samples]
    jOut = json.dumps(out)
    yield 1, jOut
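This mapper_final pairs with the reducer in Example 1: each mapper emits a single JSON-encoded [count, samples] record under the shared key 1, so every mapper's record arrives at the same reducer, which then weights each sample set by its mapper's count.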
Example 7: test_bytes_value_protocol
def test_bytes_value_protocol(self):
    job = MRJob()
    job.OUTPUT_PROTOCOL = BytesValueProtocol
    self.assertEqual(
        job.parse_output_line(b'one two\n'),
        (None, b'one two\n'))
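BytesValueProtocol treats each output line as a raw bytestring value with no key, which is why parse_output_line() returns None as the key and the unmodified line, trailing newline included, as the value.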
Example 8: test_spark_method
def test_spark_method(self):
    j = MRJob(["--no-conf"])
    j.spark = MagicMock()
    self.assertEqual(j.steps(), [SparkStep(j.spark)])
    self.assertEqual(j._steps_desc(), [dict(type="spark", spark_args=[])])
Example 9: test_default_protocol
def test_default_protocol(self):
    job = MRJob()
    data = iter([b'1\t2', b'\n{"3": ', b'4}\t"fi', b've"\n'])
    self.assertEqual(
        list(job.parse_output(data)),
        [(1, 2), ({'3': 4}, 'five')])
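The default output protocol encodes the key and value as JSON separated by a tab; parse_output() above reassembles the chunked byte stream and decodes each line with it. A quick sketch of the same round trip done directly with mrjob.protocol.JSONProtocol (values mirror the test data):

from mrjob.protocol import JSONProtocol

p = JSONProtocol()
p.read(b'1\t2')            # -> (1, 2)
p.write({'3': 4}, 'five')  # -> b'{"3": 4}\t"five"'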
Example 10: main
def main(args):
    # parse command-line args
    usage = '%prog [options]'
    description = "Collect EMR stats from active jobflows. "
    description += "Active jobflows are those in states of: "
    description += "BOOTSTRAPPING, RUNNING, STARTING, and WAITING. "
    description += "Collected stats include the total number of active jobflows "
    description += "and the total number of Amazon EC2 instances used to execute "
    description += "these jobflows. The instance counts are not separated by "
    description += "instance type."
    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option(
        "-p", "--pretty-print",
        action="store_true", dest="pretty_print", default=False,
        help=('Pretty print the collected stats'))
    add_basic_opts(option_parser)
    options, args = option_parser.parse_args(args)
    if args:
        option_parser.error('takes no arguments')
    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)
    log.info('collecting EMR active jobflows...')
    job_flows = collect_active_job_flows(options.conf_paths)
    log.info('compiling stats from collected jobflows...')
    stats = job_flows_to_stats(job_flows)
    if options.pretty_print:
        pretty_print(stats)
    else:
        print(json.dumps(stats))
Example 11: main
def main(args=None):
    now = _boto3_now()
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(args)
    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)
    log.info('getting information about running jobs')
    min_time = timedelta(hours=options.min_hours)
    emr_client = EMRJobRunner(**_runner_kwargs(options)).make_emr_client()
    cluster_summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['STARTING', 'BOOTSTRAPPING', 'RUNNING'])
    if not options.exclude:
        filtered_cluster_summaries = cluster_summaries
    else:
        filtered_cluster_summaries = _filter_clusters(
            cluster_summaries, emr_client, options.exclude)
    job_info = _find_long_running_jobs(
        emr_client, filtered_cluster_summaries, min_time, now=now)
    _print_report(job_info)
Example 12: __init__
def __init__(self, *args, **kwargs):
    MRJob.__init__(self, *args, **kwargs)
    # load entities from a JSON file
    log("loading entity list")
    entities = json.load(urllib.urlopen(
        "https://s3.amazonaws.com/trec-kba-2012/entity-urlnames.json"))
    self.entity_representations = toy_kba_algorithm.prepare_entities(entities)
Example 13: reducer
def reducer(self, n, vars):
    MRJob.set_status(self, "=============> reducer called")
    print("reducer:", vars)
    samples_from_mappers = []
    counts_from_mappers = []
    # First read all the counts from the different mappers so we know the
    # total number of items and can give each of the sample sets coming
    # from different mappers its appropriate weight
    total_counts_from_mappers = 0
    for x in vars:
        input = json.loads(x)
        total_counts_from_mappers += input[0]
        counts_from_mappers.append(input[0])
        samples_from_mappers.append(input[1])
    # Now, based on the number of samples in each mapper, select the
    # appropriate number of samples from samples_from_mappers
    i = 0
    fileOut = open(os.path.join(PROJECT_ROOT, 'output.txt'), "w")
    for sample_set in samples_from_mappers:
        weight = counts_from_mappers[i] * 1.0 / total_counts_from_mappers
        number_of_needed_samples = int(round(weight * self.options.sample_size))
        for j in range(number_of_needed_samples):
            fileOut.write(str(sample_set.pop()) + '\n')
        i += 1
    fileOut.close()
    # unreachable yield keeps this reducer a generator even though results
    # are written to a file rather than emitted
    if False:
        yield 1, 2
Example 14: test_wrong_type_of_step
def test_wrong_type_of_step(self):
    mr_job = MRJob()
    mr_job.spark = MagicMock()
    self.assertRaises(TypeError, mr_job.run_mapper)
    self.assertRaises(TypeError, mr_job.run_combiner)
    self.assertRaises(TypeError, mr_job.run_reducer)
Example 15: test_deprecated_mapper_final_positional_arg
def test_deprecated_mapper_final_positional_arg(self):
    def mapper(k, v):
        pass

    def reducer(k, v):
        pass

    def mapper_final():
        pass

    stderr = StringIO()
    with no_handlers_for_logger():
        log_to_stream('mrjob.job', stderr)
        step = MRJob.mr(mapper, reducer, mapper_final)
    # should be allowed to specify mapper_final as a positional arg,
    # but we log a warning
    self.assertEqual(
        step,
        MRJob.mr(
            mapper=mapper, reducer=reducer, mapper_final=mapper_final))
    self.assertIn('mapper_final should be specified', stderr.getvalue())
    # can't specify mapper_final as both a positional and a keyword arg
    self.assertRaises(
        TypeError,
        MRJob.mr,
        mapper,
        reducer,
        mapper_final,
        mapper_final=mapper_final)