当前位置: 首页>>代码示例>>Python>>正文


Python job.MRJob类代码示例

本文整理汇总了Python中mrjob.job.MRJob的典型用法代码示例。如果您正苦于以下问题:Python MRJob类的具体用法?Python MRJob怎么用?Python MRJob使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了MRJob类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: reducer

    def reducer(self, n, vars):
        """Combine the per-mapper samples into one sample weighted by mapper size.

        Every value in ``vars`` is a JSON document whose item 0 is a count and
        whose item 1 is a list of samples.  Each entry contributes a share of
        ``self.options.sample_size`` proportional to its count, and the chosen
        samples are yielded under the constant key ``1``.

        :param n: key the values were grouped under (not used here).
        :param vars: iterable of JSON strings, each decoding to
                     ``[count, samples]``.
        """
        MRJob.set_status(self, "=============>  reducer called")

        # Decode everything up front: the weights below need the grand total,
        # so the values have to be read once before any sample is emitted.
        payloads = [json.loads(raw) for raw in vars]
        total_count = sum(payload[0] for payload in payloads)

        # No counted items means there is nothing to weight (and dividing by
        # the total would fail), so emit nothing.
        if not total_count:
            return

        sample_size = self.options.sample_size
        for payload in payloads:
            count, samples = payload[0], payload[1]
            weight = count * 1.0 / total_count
            wanted = int(round(weight * sample_size))

            # Rounding can ask for more samples than this entry holds;
            # cap the request so pop() never runs on an empty list.
            for _ in range(min(wanted, len(samples))):
                yield 1, samples.pop()
开发者ID:ddehghan,项目名称:machine_learning_class,代码行数:28,代码来源:mrSample.py

示例2: main

def main(cl_args=None):
    """Command-line entry point for the idle-cluster termination tool.

    Parses the options, sets up logging, warns about deprecated switches and
    hands the resulting settings to ``_maybe_terminate_clusters``.

    :param cl_args: argument list passed to the parser's ``parse_args``
                    (``None`` by default).
    """
    options = _make_arg_parser().parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet,
                         verbose=options.verbose)

    # max_hours_idle -> max_mins_idle: the deprecated hours value is only
    # consulted when no minutes value was supplied.
    idle_limit_mins = options.max_mins_idle
    if idle_limit_mins is None:
        idle_limit_hours = options.max_hours_idle
        if idle_limit_hours is not None:
            log.warning('--max-hours-idle is deprecated and will be removed'
                        ' in v0.7.0. Please use --max-mins-idle instead.')
            idle_limit_mins = idle_limit_hours * 60

    if options.mins_to_end_of_hour is not None:
        log.warning('--mins-to-end-of-hour is deprecated as of v0.6.0'
                    ' and does nothing')

    _maybe_terminate_clusters(
        dry_run=options.dry_run,
        max_mins_idle=idle_limit_mins,
        unpooled_only=options.unpooled_only,
        now=_boto3_now(),
        pool_name=options.pool_name,
        pooled_only=options.pooled_only,
        max_mins_locked=options.max_mins_locked,
        quiet=options.quiet,
        **_runner_kwargs(options)
    )
开发者ID:okomestudio,项目名称:mrjob,代码行数:29,代码来源:terminate_idle_clusters.py

示例3: test_cmd_line_options

    def test_cmd_line_options(self):
        """When ``--partitioner`` is passed twice, the later value is used."""
        argv = [
            "--partitioner", "java.lang.Object",
            "--partitioner", "org.apache.hadoop.mapreduce.Partitioner",
        ]
        runner_kwargs = MRJob(argv).job_runner_kwargs()

        # second option takes priority
        self.assertEqual(runner_kwargs["partitioner"], "org.apache.hadoop.mapreduce.Partitioner")
开发者ID:ndimiduk,项目名称:mrjob,代码行数:7,代码来源:test_job.py

示例4: reducer_final

    def reducer_final(self):
        """Emit every entry of ``self.output`` together with its size.

        For each key in ``self.output`` this yields
        ``(key, (number_of_samples, samples))``.
        """
        MRJob.set_status(self, "=============>  reducer final called")

        for stratum in self.output:
            members = self.output[stratum]
            size = len(members)
            yield stratum, (size, members)
开发者ID:marionleborgne,项目名称:machine_learning,代码行数:7,代码来源:mrStratify.py

示例5: test_spark

    def test_spark(self):
        """``--spark`` makes ``execute()`` call ``spark()`` with both paths."""
        argv = ["--spark", "input_dir", "output_dir"]
        mr_job = MRJob(argv)
        # replace spark() with a mock so the call can be inspected
        mr_job.spark = MagicMock()

        mr_job.execute()

        mr_job.spark.assert_called_once_with("input_dir", "output_dir")
开发者ID:davidmarin,项目名称:mrjob,代码行数:7,代码来源:test_job.py

示例6: mapper_final

    def mapper_final(self):
        """Emit ``self.count`` and ``self.samples`` as one JSON list.

        The pair ``[self.count, self.samples]`` is serialized with
        ``json.dumps`` and yielded under the constant key ``1``.
        """
        MRJob.set_status(self, "=============>  mapper final called")

        payload = json.dumps([self.count, self.samples])
        yield 1, payload
开发者ID:AshKash,项目名称:kit-sink,代码行数:7,代码来源:mrSample.py

示例7: test_bytes_value_protocol

    def test_bytes_value_protocol(self):
        """With ``BytesValueProtocol`` a line parses to ``(None, raw_line)``."""
        mr_job = MRJob()
        mr_job.OUTPUT_PROTOCOL = BytesValueProtocol

        parsed = mr_job.parse_output_line(b'one two\n')

        self.assertEqual(parsed, (None, b'one two\n'))
开发者ID:okomestudio,项目名称:mrjob,代码行数:7,代码来源:test_job.py

示例8: test_spark_method

    def test_spark_method(self):
        """A job whose ``spark`` attribute is set reports one spark step."""
        mr_job = MRJob(["--no-conf"])
        mr_job.spark = MagicMock()

        expected_steps = [SparkStep(mr_job.spark)]
        self.assertEqual(mr_job.steps(), expected_steps)

        expected_desc = [dict(type="spark", spark_args=[])]
        self.assertEqual(mr_job._steps_desc(), expected_desc)
开发者ID:davidmarin,项目名称:mrjob,代码行数:7,代码来源:test_job.py

示例9: test_default_protocol

    def test_default_protocol(self):
        """``parse_output`` reassembles records split across byte chunks."""
        mr_job = MRJob()

        # two output lines, deliberately broken at awkward chunk boundaries
        chunks = iter([b'1\t2', b'\n{"3": ', b'4}\t"fi', b've"\n'])
        parsed = list(mr_job.parse_output(chunks))

        self.assertEqual(parsed, [(1, 2), ({'3': 4}, 'five')])
开发者ID:okomestudio,项目名称:mrjob,代码行数:7,代码来源:test_job.py

示例10: main

def main(args):
    """Collect stats about active EMR jobflows and print them.

    Parses the command line, sets up logging, gathers the active jobflows via
    ``collect_active_job_flows`` and converts them with ``job_flows_to_stats``.
    The result is printed either through ``pretty_print`` or as a JSON string.

    :param args: command-line arguments handed to the option parser; any
                 leftover positional argument is reported as a usage error.
    """
    # parse command-line args
    usage = '%prog [options]'
    # Adjacent literals are joined verbatim, so every fragment that continues
    # a sentence ends with a space; otherwise words run together in --help.
    description = (
        "Collect EMR stats from active jobflows. "
        "Active jobflows are those in states of: "
        "BOOTSTRAPPING, RUNNING, STARTING, and WAITING. "
        "Collected stats include total number of active jobflows "
        "and total number of Amazon EC2 instances used to execute "
        "these jobflows. The instance counts are not separated by "
        "instance type."
    )
    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option(
        "-p", "--pretty-print",
        action="store_true", dest="pretty_print", default=False,
        help=('Pretty print the collected stats'))
    add_basic_opts(option_parser)

    options, args = option_parser.parse_args(args)
    if args:
        option_parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)
    log.info('collecting EMR active jobflows...')
    job_flows = collect_active_job_flows(options.conf_paths)
    log.info('compiling stats from collected jobflows...')
    stats = job_flows_to_stats(job_flows)

    if options.pretty_print:
        pretty_print(stats)
    else:
        print(json.dumps(stats))
开发者ID:tempcyc,项目名称:mrjob,代码行数:31,代码来源:collect_emr_stats.py

示例11: main

def main(args=None):
    """Report jobs that have been running for at least ``--min-hours``.

    Lists clusters in the STARTING, BOOTSTRAPPING and RUNNING states,
    optionally filters them with ``options.exclude``, and prints what
    ``_find_long_running_jobs`` returns for them.

    :param args: argument list passed to the parser's ``parse_args``
                 (``None`` by default).
    """
    # the reference time is taken first, before any parsing or API calls
    now = _boto3_now()

    options = _make_arg_parser().parse_args(args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('getting information about running jobs')

    min_time = timedelta(hours=options.min_hours)

    emr_client = EMRJobRunner(**_runner_kwargs(options)).make_emr_client()
    all_summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['STARTING', 'BOOTSTRAPPING', 'RUNNING'])

    # only filter when exclusions were actually requested
    if options.exclude:
        summaries_to_check = _filter_clusters(
            all_summaries, emr_client, options.exclude)
    else:
        summaries_to_check = all_summaries

    _print_report(_find_long_running_jobs(
        emr_client, summaries_to_check, min_time, now=now))
开发者ID:Affirm,项目名称:mrjob,代码行数:27,代码来源:report_long_jobs.py

示例12: __init__

    def __init__(self, *args, **kwargs):
        """Initialize the job and load the entity representations.

        After the base ``MRJob`` initializer runs, the entity list is
        downloaded as JSON and passed to
        ``toy_kba_algorithm.prepare_entities``; the result is stored on
        ``self.entity_representations``.

        All positional and keyword arguments are forwarded unchanged to
        ``MRJob.__init__``.
        """
        MRJob.__init__(self, *args, **kwargs)

        # imported here so the block does not rely on a file-level import
        from contextlib import closing

        ## load entities from json file
        log("loading entity list")
        entity_url = "https://s3.amazonaws.com/trec-kba-2012/entity-urlnames.json"
        # closing() releases the HTTP response even if JSON decoding fails;
        # the original left the handle open.
        with closing(urllib.urlopen(entity_url)) as response:
            entities = json.load(response)
        self.entity_representations = toy_kba_algorithm.prepare_entities(entities)
开发者ID:SHENbeyond,项目名称:kba-tools,代码行数:7,代码来源:toy_kba_mrjob.py

示例13: reducer

    def reducer(self, n, vars):
        """Write a proportionally weighted sample to ``output.txt``.

        Every value in ``vars`` is a JSON document whose item 0 is a count and
        whose item 1 is a list of samples.  Each entry contributes a share of
        ``self.options.sample_size`` proportional to its count.  The selected
        samples are written, one per line, to ``output.txt`` under
        ``PROJECT_ROOT``; nothing is yielded.

        :param n: key the values were grouped under (not used here).
        :param vars: iterable of JSON strings, each decoding to
                     ``[count, samples]``.
        """
        MRJob.set_status(self, "=============>  reducer called")

        # %-formatting prints the same text as the old ``print a, b`` statement
        # and is valid syntax on both Python 2 and Python 3.
        print("reducer: %s" % (vars,))

        # Decode everything up front: the weights below need the grand total,
        # so the values have to be read once before any sample is written.
        payloads = [json.loads(raw) for raw in vars]
        total_count = sum(payload[0] for payload in payloads)

        # ``with`` closes the file even if a write or a lookup fails part-way.
        with open(os.path.join(PROJECT_ROOT, 'output.txt'), "w") as out_file:
            # A zero total gives nothing to weight (and would divide by zero),
            # so the file is simply left empty in that case.
            if total_count:
                sample_size = self.options.sample_size
                for payload in payloads:
                    count, samples = payload[0], payload[1]
                    weight = count * 1.0 / total_count
                    wanted = int(round(weight * sample_size))

                    # Rounding can ask for more samples than this entry holds;
                    # cap the request so pop() never runs on an empty list.
                    for _ in range(min(wanted, len(samples))):
                        out_file.write(str(samples.pop()) + '\n')

        # Unreachable on purpose: the presence of ``yield`` is what makes this
        # function a generator, while no value is ever emitted.
        if False:
            yield 1, 2
开发者ID:AshKash,项目名称:kit-sink,代码行数:35,代码来源:mrSample.py

示例14: test_wrong_type_of_step

    def test_wrong_type_of_step(self):
        """Mapper, combiner and reducer runs raise ``TypeError`` once ``spark`` is set."""
        job = MRJob()
        job.spark = MagicMock()

        # each run_* entry point is called with no arguments and must raise
        with self.assertRaises(TypeError):
            job.run_mapper()
        with self.assertRaises(TypeError):
            job.run_combiner()
        with self.assertRaises(TypeError):
            job.run_reducer()
开发者ID:okomestudio,项目名称:mrjob,代码行数:7,代码来源:test_job.py

示例15: test_deprecated_mapper_final_positional_arg

    def test_deprecated_mapper_final_positional_arg(self):
        """``MRJob.mr`` accepts ``mapper_final`` positionally but logs a warning.

        Passing it both positionally and by keyword must raise ``TypeError``.
        """
        def mapper(k, v):
            pass

        def reducer(k, v):
            pass

        def mapper_final():
            pass

        # capture what gets logged to 'mrjob.job' while the step is built
        captured_log = StringIO()
        with no_handlers_for_logger():
            log_to_stream('mrjob.job', captured_log)
            positional_step = MRJob.mr(mapper, reducer, mapper_final)

        # should be allowed to specify mapper_final as a positional arg,
        # but we log a warning
        keyword_step = MRJob.mr(
            mapper=mapper, reducer=reducer, mapper_final=mapper_final)
        self.assertEqual(positional_step, keyword_step)
        self.assertIn('mapper_final should be specified',
                      captured_log.getvalue())

        # can't specify mapper_final as a positional and keyword arg
        with self.assertRaises(TypeError):
            MRJob.mr(mapper, reducer, mapper_final, mapper_final=mapper_final)
开发者ID:bchess,项目名称:mrjob,代码行数:31,代码来源:test_job.py


注:本文中的mrjob.job.MRJob类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。