当前位置: 首页>>代码示例>>Python>>正文


Python EmrConnection.add_jobflow_steps方法代码示例

本文整理汇总了Python中boto.emr.connection.EmrConnection.add_jobflow_steps方法的典型用法代码示例。如果您正苦于以下问题:Python EmrConnection.add_jobflow_steps方法的具体用法?Python EmrConnection.add_jobflow_steps怎么用?Python EmrConnection.add_jobflow_steps使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在boto.emr.connection.EmrConnection的用法示例。


在下文中一共展示了EmrConnection.add_jobflow_steps方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: add_steps

# 需要导入模块: from boto.emr.connection import EmrConnection [as 别名]
# 或者: from boto.emr.connection.EmrConnection import add_jobflow_steps [as 别名]
def add_steps(cluster_id, key):
	"""Queue the steps built from ``key`` on an existing EMR cluster.

	An ``EmrConnection`` is created without explicit credentials and the
	result of ``get_steps(key, key)`` is submitted to the job flow
	identified by ``cluster_id``.

	Returns:
		True when the steps were submitted, False when anything raised.
	"""
	# Local import: this function cannot rely on a file-level logging import.
	import logging

	try:
		emr_connection = EmrConnection()
		emr_connection.add_jobflow_steps(cluster_id, get_steps(key, key))
	except Exception:
		# Callers only receive a boolean, so record the cause here instead of
		# discarding it.
		logging.exception("Failed to add steps to cluster %s", cluster_id)
		return False
	return True
开发者ID:valeter,项目名称:nlp-site,代码行数:9,代码来源:process_file.py

示例2: run

# 需要导入模块: from boto.emr.connection import EmrConnection [as 别名]
# 或者: from boto.emr.connection.EmrConnection import add_jobflow_steps [as 别名]
    def run(self):
        """Execute the Hive job on a newly started EMR job flow.

        Copies the input, uploads the generated Hive script, starts a job
        flow, queues the Hive install and run steps on it and then waits
        for the job to complete.
        """
        # Work on a copy of the data source (Hive deletes/moves the original).
        copy_s3_file(self.input_path, self.data_path)

        # Build the Hive script and push it to S3.
        self._generate_and_upload_hive_script()

        wait_message = "Waiting {} seconds for S3 eventual consistency".format(
            self.s3_sync_wait_time)
        logger.info(wait_message)
        time.sleep(self.s3_sync_wait_time)

        # TODO more options like setting aws region
        emr = EmrConnection(self.aws_access_key_id,
                            self.aws_secret_access_key)

        hive_steps = [InstallHiveStep(self.hive_version),
                      HiveStep(self.job_name, self.script_path)]

        flow_id = emr.run_jobflow(
            self.job_name,
            self.log_path,
            action_on_failure='CANCEL_AND_WAIT',
            master_instance_type=self.master_instance_type,
            slave_instance_type=self.slave_instance_type,
            ami_version=self.ami_version,
            num_instances=self.num_instances)

        emr.add_jobflow_steps(flow_id, hive_steps)
        self._wait_for_job_to_complete(emr, flow_id)

        done_message = "Output file is in: {0}".format(self.output_path)
        logger.info(done_message)
开发者ID:JonathanBatten,项目名称:apiarist,代码行数:37,代码来源:emr.py

示例3: main

# 需要导入模块: from boto.emr.connection import EmrConnection [as 别名]
# 或者: from boto.emr.connection.EmrConnection import add_jobflow_steps [as 别名]
def main(argv):
  """Queue the Foursquare de-duplication streaming step on an EMR job flow.

  Args:
    argv: command-line arguments. ``argv[0]`` is the script path (its
      directory is where ``config.ini`` is looked up) and ``argv[1]`` is
      the id of the job flow the step is added to.
  """

  # load the config that sits next to the script; dirname() gives '' for a
  # bare script name, so the lookup then falls back to the working directory
  config = ConfigParser()
  config.read(os.path.join(os.path.dirname(argv[0]), 'config.ini'))

  # load AWS config
  awsConfig = ConfigParser()
  awsConfig.read(config.get('Common','aws'))

  aws_access_key = awsConfig.get('AWS','aws_access_key')
  aws_secret_key = awsConfig.get('AWS','aws_secret_key')
  event_bucket = awsConfig.get('AWS','event_bucket')
  output_bucket = awsConfig.get('AWS','emr_output_bucket')
  script_bucket = awsConfig.get('AWS','script_bucket')

  jobId = argv[1]

  emrConnection = EmrConnection(aws_access_key, aws_secret_key)

  s3Connection = S3Connection(aws_access_key, aws_secret_key)

  # clean s3 output; list() pages through every matching key, whereas one
  # get_all_keys() call only returns a single page of results
  bucket = s3Connection.get_bucket(output_bucket)
  for key in bucket.list(prefix=BUCKET_KEY):
    # delete_key() is documented to take the key name, not the Key object
    bucket.delete_key(key.name)

  step = StreamingStep(name='Foursquare event deduper',
                      mapper='s3://%s/dedup_mapper.py foursquare' % script_bucket,
                      reducer='s3://%s/dedup_reducer.py' % script_bucket,
                      input='s3://%s/normalized' % event_bucket,
                      output='s3://%s/%s' % (output_bucket,BUCKET_KEY),
                      action_on_failure='CONTINUE')

  emrConnection.add_jobflow_steps(jobId, [step])

  # parenthesised single argument prints the same on Python 2 and 3
  print('Successfully started streaming steps')
开发者ID:Thisisdotme,项目名称:thisis.me,代码行数:39,代码来源:foursquare_dedup.py

示例4: EmrJarRuntime

# 需要导入模块: from boto.emr.connection import EmrConnection [as 别名]
# 或者: from boto.emr.connection.EmrConnection import add_jobflow_steps [as 别名]
class EmrJarRuntime(ZetRuntime):
    """Runtime that runs a jar as a step on an existing EMR job flow.

    The spec file supplies the AWS credentials, the S3 bucket, the region
    and the id of the job flow the steps are added to.
    """

    def __init__(self, spec_filename="spec.json"):
        """Load the settings and open the S3 and EMR connections.

        Args:
            spec_filename: path of the spec file handed to
                ``get_settings_from_file``.
        """
        import boto
        from boto.emr.connection import EmrConnection, RegionInfo

        # TODO: the base-class initializer call is still disabled:
        # super(ZetRuntime, self).__init__()
        self.settings = get_settings_from_file(spec_filename)

        p = self.settings.Param
        self.s3_conn = boto.connect_s3(p.AWS_ACCESS_KEY_ID, p.AWS_ACCESS_KEY_SECRET)
        self.s3_bucket = self.s3_conn.get_bucket(p.S3_BUCKET)
        self.region = p.AWS_Region
        self.emr_conn = EmrConnection(p.AWS_ACCESS_KEY_ID, p.AWS_ACCESS_KEY_SECRET,
                region = RegionInfo(name = self.region,
                    endpoint = self.region + '.elasticmapreduce.amazonaws.com'))
        self.job_flow_id = p.EMR_jobFlowId

    def get_s3_working_dir(self, path=""):
        """Return the S3 key for ``path`` inside this job block's working dir.

        The key has the form ``zetjob/<userName>/job<jobId>/blk<blockId>/<path>``.
        """
        # S3 keys always use '/', so join with posixpath rather than os.path,
        # whose separator depends on the platform the client runs on.
        import posixpath

        glb_vars = self.settings.GlobalParam
        return posixpath.join('zetjob', glb_vars['userName'],
                              "job%s" % glb_vars['jobId'],
                              "blk%s" % glb_vars['blockId'], path)

    def execute(self, jar_path, args):
        """Upload ``jar_path`` to S3, add it as a jar step and wait for it.

        Args:
            jar_path: local path of the jar; also used as the relative key
                below the S3 working directory.
            args: arguments passed to the step as ``step_args``.
        """
        from boto.emr.step import JarStep

        s3_jar_path = s3_upload(self.s3_bucket, self.get_s3_working_dir(jar_path), jar_path)
        # s3_jar_path = "s3://run-jars/jar/mahout-core-1.0-SNAPSHOT-job.jar"
        print("Uploading jar to s3 : %s -> %s" % (jar_path, s3_jar_path))

        print("Add jobflow step")
        step = JarStep(name='cl_filter', jar=s3_jar_path, step_args=args)
        self.emr_conn.add_jobflow_steps(self.job_flow_id, steps=[step])

        print("Waiting jobflow step done")
        emr_wait_job(self.emr_conn, self.job_flow_id)
开发者ID:Haizhi,项目名称:screwjack,代码行数:38,代码来源:specparser.py

示例5: __init__

# 需要导入模块: from boto.emr.connection import EmrConnection [as 别名]
# 或者: from boto.emr.connection.EmrConnection import add_jobflow_steps [as 别名]

#.........这里部分代码省略.........
        """

        num_process_mappers = 1
        num_process_reducers = 1

        if self._iter_no == 0:
            pagerank_input = self._infile
        elif self._iter_no > 0:
            pagerank_input = self._last_outdir

        if pagerank_output is None:
            pagerank_output = self._get_default_outdir('pagerank')

        # Output from the pagerank step becomes input to process step
        process_input = pagerank_output

        if process_output is None:
            process_output = self._get_default_outdir('process')

        pagerank_step = self._make_step(pagerank_mapper, pagerank_reducer,
                                        pagerank_input, pagerank_output,
                                        num_pagerank_mappers,
                                        num_pagerank_reducers)

        process_step = self._make_step(process_mapper, process_reducer,
                                       process_input, process_output,
                                       num_process_mappers,
                                       num_process_reducers)

        steps = [pagerank_step, process_step]
        if self.job_id is None:
            self._submit_new_job(steps)
        else:
            self._emr_conn.add_jobflow_steps(self.job_id, steps)

        # Store `process_output` directory so it can be used in
        # subsequent iteration
        self._last_outdir = process_output
        self._iter_no += 1

    def is_done(self):
        """
        Returns `True` if the map-reduce job is done, and `False`
        otherwise.

        For all process-step output files that have not been fetched,
        gets the first part of the output file, and checks whether its
        contents begins with the string 'FinalRank'.

        Special notes:
            WARNING! The usage of this method in your code requires that
            that you used the default output directories in all calls
            to do_iter().
        """

        # Cache the result so we can return immediately without hitting
        # any of the Amazon APIs
        if self._is_done:
            return True

        iter_no = self._get_last_process_step_iter_no()
        if iter_no < 0:
            return False

        while self._last_process_step_iter_no < iter_no:
            self._last_process_step_iter_no += 1
开发者ID:aagarwal1990,项目名称:CS144_Rankmaniac,代码行数:70,代码来源:rankmaniac.py

示例6: EmrHiveRuntime

# 需要导入模块: from boto.emr.connection import EmrConnection [as 别名]
# 或者: from boto.emr.connection.EmrConnection import add_jobflow_steps [as 别名]
class EmrHiveRuntime(HiveRuntime):
    """Hive runtime that executes the generated script on an EMR job flow.

    The spec file supplies the AWS credentials, the S3 bucket, the region
    and the id of the job flow the Hive steps are added to.
    """

    def __init__(self, spec_filename="spec.json"):
        """Initialise the base runtime and open the S3 and EMR connections."""
        import boto
        from boto.emr.connection import EmrConnection, RegionInfo

        # NOTE(review): super(HiveRuntime, ...) starts the lookup *after*
        # HiveRuntime, so HiveRuntime.__init__ itself is skipped. Confirm
        # whether that is intended or should be super(EmrHiveRuntime, self).
        super(HiveRuntime, self).__init__(spec_filename)
        p = self.settings.Param
        self.s3_conn = boto.connect_s3(p.AWS_ACCESS_KEY_ID, p.AWS_ACCESS_KEY_SECRET)
        self.s3_bucket = self.s3_conn.get_bucket(p.S3_BUCKET)
        self.region = p.AWS_Region
        self.emr_conn = EmrConnection(p.AWS_ACCESS_KEY_ID, p.AWS_ACCESS_KEY_SECRET,
                region = RegionInfo(name = self.region,
                    endpoint = self.region + '.elasticmapreduce.amazonaws.com'))
        self.job_flow_id = p.EMR_jobFlowId

    def get_s3_working_dir(self, path=""):
        """Return the S3 key for ``path`` inside this job block's working dir.

        The key has the form ``zetjob/<userName>/job<jobId>/blk<blockId>/<path>``.
        """
        # S3 keys always use '/', so join with posixpath rather than os.path,
        # whose separator depends on the platform the client runs on.
        import posixpath

        glb_vars = self.settings.GlobalParam
        return posixpath.join('zetjob', glb_vars['userName'],
                              "job%s" % glb_vars['jobId'],
                              "blk%s" % glb_vars['blockId'], path)

    def get_emr_job_name(self):
        """Return the step name ``zetjob/<userName>/job<jobId>/blk<blockId>``."""
        import posixpath

        glb_vars = self.settings.GlobalParam
        return posixpath.join('zetjob', glb_vars['userName'],
                              "job%s" % glb_vars['jobId'],
                              "blk%s" % glb_vars['blockId'])

    def s3_upload_dir(self, local_dir):
        """Upload every regular file directly inside ``local_dir`` to S3.

        This is a generator: nothing is uploaded until it is iterated. It
        yields the result of ``s3_upload`` for each file, in sorted file
        name order.
        """
        import posixpath

        print("EmrHiveRuntime.s3_uploader()")
        print("s3_upload_dir :::: %s" % local_dir)
        s3_upload_dir = self.get_s3_working_dir(local_dir)
        ext_files = [f for f in sorted(os.listdir(local_dir)) if os.path.isfile(os.path.join(local_dir,f))]
        for f in ext_files:
            # Local paths keep os.path; remote keys and URIs use '/'.
            f_local = os.path.join(local_dir, f)
            # NOTE(review): local_dir is already part of s3_upload_dir, so it
            # appears twice in the remote key - confirm this is intended.
            f_remote = posixpath.join(s3_upload_dir, local_dir, f)
            f_remote_full = posixpath.join("s3://", self.s3_bucket.name, f_remote)

            print("S3 Upload      :: %s ====> %s" % (f_local, s3_upload_dir))
            print("S3 remote_full :: %s" % f_remote_full)
            yield s3_upload(self.s3_bucket, f_remote, f_local)

    def files_uploader(self, local_dir):
        """Return the generator that uploads the files of ``local_dir``."""
        return self.s3_upload_dir(local_dir)

    def clean_s3_working_dir(self):
        """Delete the S3 working directory.

        Raises:
            Exception: when ``s3_delete`` reports failure.
        """
        s3_working_dir = self.get_s3_working_dir()
        if not s3_delete(self.s3_bucket, s3_working_dir):
            # TODO : refactor to 'HiveException'
            raise Exception("Can not clean s3 path : %s" % s3_working_dir)

    def clean_working_dir(self):
        """Clean the working directory, which lives on S3 for this runtime."""
        self.clean_s3_working_dir()

    def emr_execute_hive(self, s3_hive_script):
        """Add a Hive step running ``s3_hive_script`` and wait for the job."""
        from boto.emr.step import HiveStep
        hive_step = HiveStep(name=self.get_emr_job_name(), hive_file=s3_hive_script)
        self.emr_conn.add_jobflow_steps(self.job_flow_id, steps=[hive_step])
        emr_wait_job(self.emr_conn, self.job_flow_id)

    def execute(self, main_hive_script, generated_hive_script=None):
        """Generate the Hive script, upload it to S3 and run it on EMR.

        The S3 working directory is cleaned first; the script produced by
        ``generate_script`` is uploaded under its base name.
        """
        import posixpath

        self.clean_working_dir()
        hive_script_local = self.generate_script(main_hive_script, generated_hive_script)

        s3_working_dir = self.get_s3_working_dir()
        hive_script_remote = posixpath.join(s3_working_dir, os.path.basename(hive_script_local))
        hive_script_remote_full = s3_upload(self.s3_bucket, hive_script_remote, hive_script_local)
        print(hive_script_remote_full)
        print("EmrHiveRuntime.execute()")
        self.emr_execute_hive(hive_script_remote_full)
开发者ID:Haizhi,项目名称:screwjack,代码行数:69,代码来源:specparser.py

示例7: main

# 需要导入模块: from boto.emr.connection import EmrConnection [as 别名]
# 或者: from boto.emr.connection.EmrConnection import add_jobflow_steps [as 别名]
def main(argv):
  """Queue the three Twitter streaming steps on an existing EMR job flow.

  Args:
    argv: command-line arguments. ``argv[0]`` is the script path (its
      directory is where ``config.ini`` is looked up) and ``argv[1]`` is
      the id of the job flow the steps are added to.
  """

  # load the config that sits next to the script; dirname() gives '' for a
  # bare script name, so the lookup then falls back to the working directory
  config = ConfigParser()
  config.read(os.path.join(os.path.dirname(argv[0]), 'config.ini'))

  # load AWS config
  awsConfig = ConfigParser()
  awsConfig.read(config.get('Common','aws'))

  aws_access_key = awsConfig.get('AWS','aws_access_key')
  aws_secret_key = awsConfig.get('AWS','aws_secret_key')
  event_bucket = awsConfig.get('AWS','event_bucket')
  output_bucket = awsConfig.get('AWS','emr_output_bucket')
  script_bucket = awsConfig.get('AWS','script_bucket')

  jobId = argv[1]

  emrConnection = EmrConnection(aws_access_key, aws_secret_key)

  s3Connection = S3Connection(aws_access_key, aws_secret_key)

  bucket = s3Connection.get_bucket(output_bucket)

  def queue_step(name, mapper, reducer, input_uri, output_key):
    # clean s3 output; list() pages through every matching key, whereas one
    # get_all_keys() call only returns a single page of results
    for key in bucket.list(prefix=output_key):
      # delete_key() is documented to take the key name, not the Key object
      bucket.delete_key(key.name)

    step = StreamingStep(name=name,
                        mapper='s3://%s/%s' % (script_bucket, mapper),
                        reducer='s3://%s/%s' % (script_bucket, reducer),
                        input=input_uri,
                        output='s3://%s/%s' % (output_bucket, output_key),
                        action_on_failure='CONTINUE')

    emrConnection.add_jobflow_steps(jobId, [step])

  # each step consumes the output of the one before it
  queue_step('Twitter event deduper',
             'dedup_mapper.py twitter',
             'dedup_reducer.py',
             's3://%s/normalized' % event_bucket,
             DEDUPED_EVENTS_KEY)

  queue_step('Twitter 30-day Retweet Min/Max Analyzer',
             '30d_age_mapper.py',
             'twitter_minmax_reducer.py',
             's3://%s/%s' % (output_bucket, DEDUPED_EVENTS_KEY),
             EVENTS_KEY)

  queue_step('Twitter Highlighter',
             'twitter_highlight_mapper.py',
             'twitter_highlight_reducer.py',
             's3://%s/%s' % (output_bucket, EVENTS_KEY),
             SUMMARY_KEY)

  # parenthesised single argument prints the same on Python 2 and 3
  print('Successfully started twitter streaming steps')
开发者ID:Thisisdotme,项目名称:thisis.me,代码行数:67,代码来源:twitter_start.py

示例8: EmrManager

# 需要导入模块: from boto.emr.connection import EmrConnection [as 别名]
# 或者: from boto.emr.connection.EmrConnection import add_jobflow_steps [as 别名]

#.........这里部分代码省略.........
                return cluster_id
        except:
            logging.error("Launching EMR cluster failed")
            return "FAILED"

    # run scripting step in cluster
    def run_scripting_step(self, cluster_id, name, script_path):
        """Run a script-runner step on the cluster and report its outcome.

        Args:
            cluster_id: id of the cluster the step is added to.
            name: name given to the step.
            script_path: passed to the step as its only argument.

        Returns:
            Whatever ``_run_step`` returns for the submitted step, or the
            string "FAILED" when building or running the step raised.
        """
        try:
            step = ScriptRunnerStep(name=name,
                                    step_args=[script_path],
                                    action_on_failure="CONTINUE")
            return self._run_step(cluster_id, step)
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate. Lazy %s formatting cannot raise on a
            # non-string cluster id the way '+' concatenation would.
            logging.exception("Running scripting step in cluster %s failed.", cluster_id)
            return "FAILED"

    # run streaming step in cluster
    def run_streaming_step(self, cluster_id, name, mapper_path, reducer_path, input_path, output_path):
        """Run a Hadoop streaming step on the cluster and report its outcome.

        A mapper or reducer path other than the literal "NONE" is shipped
        with the job through ``-files`` and then referred to by its base
        name (the part after the last '/').

        Returns:
            Whatever ``_run_step`` returns for the submitted step, or the
            string "FAILED" when building or running the step raised.
        """
        try:
            # bundle files with the job
            files = []
            if mapper_path != "NONE":
                files.append(mapper_path)
                mapper_path = mapper_path.split("/")[-1]
            if reducer_path != "NONE":
                files.append(reducer_path)
                reducer_path = reducer_path.split("/")[-1]
            # build streaming step
            logging.debug("Launching streaming step with mapper: %s reducer: %s and files: %s",
                          mapper_path, reducer_path, files)
            step = StreamingStep(name=name,
                                 step_args=["-files"] + files,
                                 mapper=mapper_path,
                                 reducer=reducer_path,
                                 input=input_path,
                                 output=output_path,
                                 action_on_failure="CONTINUE")
            return self._run_step(cluster_id, step)
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate; the traceback is kept in the log.
            logging.exception("Running streaming step in cluster %s failed.", cluster_id)
            return "FAILED"

    # run mapreduce jar step in cluster
    def run_jar_step(self, cluster_id, name, jar_path, class_name, input_path, output_path):
        """Run a MapReduce jar step on the cluster and report its outcome.

        The step receives ``class_name``, ``input_path`` and ``output_path``
        (in that order) as its arguments.

        Returns:
            Whatever ``_run_step`` returns for the submitted step, or the
            string "FAILED" when building or running the step raised.
        """
        try:
            # build jar step
            logging.debug("Launching jar step with jar: %s class name: %s input: %s and output: %s",
                          jar_path, class_name, input_path, output_path)
            step = JarStep(name=name,
                           jar=jar_path,
                           step_args=[class_name,
                                      input_path,
                                      output_path])
            return self._run_step(cluster_id, step)
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate; the traceback is kept in the log.
            logging.exception("Running jar step in cluster %s failed.", cluster_id)
            return "FAILED"

    def _run_step(self, cluster_id, step):
        """Submit one step and block until it reaches a final state.

        The step state is polled every ``self.step_status_wait`` seconds.

        Returns:
            The step id when the step COMPLETED; otherwise one of the
            strings "FAILED" (also used for CANCELLED and INTERRUPTED
            steps), "NOT_FOUND" or "ERROR".
        """
        step_list = self.connection.add_jobflow_steps(cluster_id, [step])
        step_id = step_list.stepids[0].value

        logging.info("Starting step %s in cluster %s. Please be patient. Check the progress of the job in your AWS Console",
                     step_id, cluster_id)

        # CANCELLED and INTERRUPTED are final EMR step states too; without
        # them the loop below would poll forever once a step is cancelled.
        final_states = (u'COMPLETED', u'FAILED', u'CANCELLED', u'INTERRUPTED',
                        u'NOT_FOUND', u'ERROR')

        # Checking the state of the step
        state = self._find_step_state(cluster_id, step_id)
        while state not in final_states:
            # sleeping to recheck for status.
            time.sleep(int(self.step_status_wait))
            state = self._find_step_state(cluster_id, step_id)
            logging.info("Starting step %s in cluster %s. Status: %s", step_id, cluster_id, state)

        if state == u'COMPLETED':
            logging.info("Step %s succesfully completed in cluster: %s", step_id, cluster_id)
            return step_id
        if state == u'NOT_FOUND':
            logging.error("Step %s could not be found in cluster: %s", step_id, cluster_id)
            return "NOT_FOUND"
        if state == u'ERROR':
            logging.error("Step %s produced an error in _find_step_state in cluster: %s", step_id, cluster_id)
            return "ERROR"
        if state == u'FAILED':
            logging.error("Step %s failed in cluster: %s", step_id, cluster_id)
            return "FAILED"

        # CANCELLED / INTERRUPTED: reported as "FAILED" so callers that only
        # know the existing return values do not mistake it for a step id.
        logging.error("Step %s ended in state %s in cluster: %s", step_id, state, cluster_id)
        return "FAILED"


    def _find_step_state(self, cluster_id, step_id):
        try:
            step_summary_list = self.connection.list_steps(cluster_id)
            for step_summary in step_summary_list.steps:
                if step_summary.id == step_id:
                    return step_summary.status.state
            return "NOT_FOUND"
        except:
            return "ERROR"

    #Method for terminating the EMR cluster
    def terminate_cluster(self, cluster_id):
        """Ask EMR to terminate the job flow identified by ``cluster_id``."""
        emr = self.connection
        emr.terminate_jobflow(cluster_id)
开发者ID:DiegoTUI,项目名称:emr-orchestrator,代码行数:104,代码来源:emr_manager.py

示例9: main

# 需要导入模块: from boto.emr.connection import EmrConnection [as 别名]
# 或者: from boto.emr.connection.EmrConnection import add_jobflow_steps [as 别名]
def main(argv):
  """Queue the five Foursquare streaming steps on an existing EMR job flow.

  Args:
    argv: command-line arguments. ``argv[0]`` is the script path (its
      directory is where ``config.ini`` is looked up) and ``argv[1]`` is
      the id of the job flow the steps are added to.
  """

  # load the config that sits next to the script; dirname() gives '' for a
  # bare script name, so the lookup then falls back to the working directory
  config = ConfigParser()
  config.read(os.path.join(os.path.dirname(argv[0]), 'config.ini'))

  # load AWS config
  awsConfig = ConfigParser()
  awsConfig.read(config.get('Common','aws'))

  aws_access_key = awsConfig.get('AWS','aws_access_key')
  aws_secret_key = awsConfig.get('AWS','aws_secret_key')
  event_bucket = awsConfig.get('AWS','event_bucket')
  output_bucket = awsConfig.get('AWS','emr_output_bucket')
  script_bucket = awsConfig.get('AWS','script_bucket')

  jobId = argv[1]

  emrConnection = EmrConnection(aws_access_key, aws_secret_key)

  s3Connection = S3Connection(aws_access_key, aws_secret_key)

  bucket = s3Connection.get_bucket(output_bucket)

  def queue_step(name, mapper, reducer, input_uri, output_key):
    # clean the prefix this very step writes to (the 30-day Collector and the
    # Venue Aggregator used to clean each other's prefix); list() pages
    # through every matching key, unlike a single get_all_keys() call
    for key in bucket.list(prefix=output_key):
      # delete_key() is documented to take the key name, not the Key object
      bucket.delete_key(key.name)

    step = StreamingStep(name=name,
                        mapper='s3://%s/%s' % (script_bucket, mapper),
                        reducer='s3://%s/%s' % (script_bucket, reducer),
                        input=input_uri,
                        output='s3://%s/%s' % (output_bucket, output_key),
                        action_on_failure='CONTINUE')

    emrConnection.add_jobflow_steps(jobId, [step])

  # each step consumes the output of the one before it
  queue_step('Foursquare event deduper',
             'dedup_mapper.py foursquare',
             'dedup_reducer.py',
             's3://%s/normalized' % event_bucket,
             DEDUPED_EVENTS_KEY)

  queue_step('Foursquare 30-day Collector',
             'foursquare_30d_mapper.py',
             'identity_reducer.py',
             's3://%s/%s' % (output_bucket, DEDUPED_EVENTS_KEY),
             EVENTS_KEY)

  queue_step('Foursquare Venue Aggregator',
             'identity_mapper.py',
             'foursquare_checkin_aggregator_reducer.py',
             's3://%s/%s' % (output_bucket, EVENTS_KEY),
             AGGREGATED_KEY)

  queue_step('Foursquare Min/Max Analyzer',
             'identity_mapper.py',
             'foursquare_minmax_reducer.py',
             's3://%s/%s' % (output_bucket, AGGREGATED_KEY),
             MINMAX_KEY)

  queue_step('Foursquare Highlighter',
             'identity_mapper.py',
             'foursquare_highlight_reducer.py',
             's3://%s/%s' % (output_bucket, MINMAX_KEY),
             SUMMARY_KEY)

  # parenthesised single argument prints the same on Python 2 and 3
  print('Successfully started foursquare streaming steps')
开发者ID:Thisisdotme,项目名称:thisis.me,代码行数:95,代码来源:foursquare_start.py

示例10: EmrClient

# 需要导入模块: from boto.emr.connection import EmrConnection [as 别名]
# 或者: from boto.emr.connection.EmrConnection import add_jobflow_steps [as 别名]
class EmrClient(object):


    # The Hadoop version to use
    HADOOP_VERSION = '1.0.3'

    # The AMI version to use
    AMI_VERSION = '2.4.7'
 
    # Interval to wait between polls to EMR cluster in seconds
    CLUSTER_OPERATION_RESULTS_POLLING_SECONDS = 10
 
    # Timeout for EMR creation and ramp up in seconds
    CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS = 60 * 30
 
    def __init__(self, region_name='us-east-1', aws_access_key_id=None, aws_secret_access_key=None):
        """Open the EMR connection for ``region_name``.

        A credential that is not supplied (or is otherwise falsy) is read
        from the ``aws`` section of the luigi configuration instead.
        """
        key_id = aws_access_key_id
        if not key_id:
            key_id = luigi.configuration.get_config().get('aws', 'aws_access_key_id')

        secret_key = aws_secret_access_key
        if not secret_key:
            secret_key = luigi.configuration.get_config().get('aws', 'aws_secret_access_key')

        # Region whose endpoint the connection talks to
        endpoint = u'elasticmapreduce.%s.amazonaws.com' % (region_name)
        target_region = RegionInfo(name=region_name, endpoint=endpoint)

        self.emr_connection = EmrConnection(
            aws_access_key_id=key_id,
            aws_secret_access_key=secret_key,
            region=target_region)
 
    def launch_emr_cluster(self, cluster_name, log_uri, ec2_keyname=None, master_type='m1.small', core_type='m1.small', num_instances=2, hadoop_version='1.0.3', ami_version='2.4.7', ):
        """Start a keep-alive EMR cluster with debugging enabled and wait for it.

        Args:
            cluster_name: name of the new job flow.
            log_uri: location passed to EMR as the log URI.
            ec2_keyname: EC2 key pair name passed to EMR, if any.
            master_type: instance type of the master node.
            core_type: instance type of the slave nodes.
            num_instances: total number of instances.
            hadoop_version: Hadoop version to request; the default equals
                ``EmrClient.HADOOP_VERSION``.
            ami_version: AMI version to request; the default equals
                ``EmrClient.AMI_VERSION``.

        Returns:
            The result of ``_poll_until_cluster_ready`` for the new job flow.
        """
        # hadoop_version / ami_version used to be accepted but ignored in
        # favour of the class constants; the caller's values are honoured now.
        jobflow_id = self.emr_connection.run_jobflow(name=cluster_name,
                              log_uri=log_uri,
                              ec2_keyname=ec2_keyname,
                              master_instance_type=master_type,
                              slave_instance_type=core_type,
                              num_instances=num_instances,
                              keep_alive=True,
                              enable_debugging=True,
                              hadoop_version=hadoop_version,
                              steps=[],
                              ami_version=ami_version)

        # Log important information
        status = self.emr_connection.describe_jobflow(jobflow_id)

        logger.info('Creating new cluster %s with following details', status.name)
        logger.info('jobflow ID:\t%s', status.jobflowid)
        logger.info('Log URI:\t%s', status.loguri)
        logger.info('Master Instance Type:\t%s', status.masterinstancetype)

        # A cluster of size 1 does not have any slave instances
        if hasattr(status, 'slaveinstancetype'):
            logger.info('Slave Instance Type:\t%s', status.slaveinstancetype)

        logger.info('Number of Instances:\t%s', status.instancecount)
        logger.info('Hadoop Version:\t%s', status.hadoopversion)
        logger.info('AMI Version:\t%s', status.amiversion)
        logger.info('Keep Alive:\t%s', status.keepjobflowalivewhennosteps)

        return self._poll_until_cluster_ready(jobflow_id)
 
 
    def add_pig_step(self, jobflow_id, pig_file, name='Pig Script', pig_versions='latest', pig_args=None):
        """Add a Pig step to the job flow and wait until the cluster is ready again.

        Args:
            jobflow_id: id of the job flow the step is added to.
            pig_file: Pig script handed to ``PigStep``.
            name: name given to the step.
            pig_versions: Pig version selector handed to ``PigStep``.
            pig_args: extra arguments for the script; ``None`` (the default)
                means no extra arguments.

        Returns:
            The result of ``_poll_until_cluster_ready`` for the job flow.
        """
        # A fresh list per call: a shared mutable default would leak any
        # modification into every later call.
        if pig_args is None:
            pig_args = []

        pig_step = PigStep(name=name,
                           pig_file=pig_file,
                           pig_versions=pig_versions,
                           pig_args=pig_args,
                           # action_on_failure='CONTINUE',
                       )

        self.emr_connection.add_jobflow_steps(jobflow_id, [pig_step])

        # Poll until the cluster is done working
        return self._poll_until_cluster_ready(jobflow_id)


    def shutdown_emr_cluster(self, jobflow_id):
        """Terminate the job flow and return the result of polling for its shutdown."""
        emr = self.emr_connection
        emr.terminate_jobflow(jobflow_id)
        shutdown_result = self._poll_until_cluster_shutdown(jobflow_id)
        return shutdown_result
 
    def get_jobflow_id(self):
        """Return the id of the first cluster listed in the WAITING state."""
        waiting = self.emr_connection.list_clusters(cluster_states=['WAITING'])
        first_cluster = waiting.clusters[0]
        return first_cluster.id
 
    def get_master_dns(self):
        """
        Get the master node's public address
        """
        # Get the jobflow ID
        jobflow_id = self.get_master_dns()
#.........这里部分代码省略.........
开发者ID:mbrio,项目名称:Luigi,代码行数:103,代码来源:emr_client.py

示例11: __init__

# 需要导入模块: from boto.emr.connection import EmrConnection [as 别名]
# 或者: from boto.emr.connection.EmrConnection import add_jobflow_steps [as 别名]

#.........这里部分代码省略.........
        
        if not self.job_id:
            raise Exception('No job is running.')

        return self.emr_conn.describe_jobflow(self.job_id)

    def add_step(self, mapper, reducer, input, output, num_map=1, 
                 num_reduce=1):
        '''Append one more step to the job flow that is already running

        Note: a single job flow accepts at most 256 steps. When that limit
              gets in the way, submit a new job after the current one has
              completed.

        Arguments:
            mapper          string      mapper path, relative to your data
                                        directory.
            reducer         string      reducer path, relative to your data
                                        directory.
            input           string      input data path, relative to your
                                        data directory. End the path with a
                                        trailing / to use a directory as
                                        input.
            output          string      path of the desired output
                                        directory.

        Raises an Exception when no job is running.
        '''

        if not self.job_id:
            raise Exception('No job is running.')

        queued = [self._make_step(mapper, reducer, input, output, num_map,
                                  num_reduce)]
        self.emr_conn.add_jobflow_steps(self.job_id, queued)
    
    def upload(self, in_dir='data'):
        '''Replace the team's data on S3 with the local files

        Every key below the ``<team_id>/`` prefix is deleted from the bucket
        first; afterwards each regular file found directly in in_dir is
        stored as ``<team_id>/<file name>``.

        Note: only files in the root of in_dir are uploaded;
              subdirectories are NOT scanned.

        Arguments:
            in_dir          string      optional, defaults to 'data'. The
                                        base directory whose files are
                                        uploaded.
        '''

        bucket = self.s3_conn.get_bucket(self.s3_bucket)
        stale_keys = bucket.list(prefix='%s/' % self.team_id)
        bucket.delete_keys([stale.name for stale in stale_keys])

        # Collect (local path, remote key) pairs before uploading anything.
        pending = []
        for file_name in os.listdir(in_dir):
            local_path = os.path.join(in_dir, file_name)
            if os.path.isfile(local_path):
                pending.append(
                    (local_path, os.path.join(self.team_id, file_name)))

        for local_path, remote_name in pending:
            remote = Key(bucket)
            remote.key = remote_name
            remote.set_contents_from_filename(local_path)

    def download(self, out_dir='data'):
开发者ID:arjunc12,项目名称:rankmaniac,代码行数:70,代码来源:rankmaniac.py


注:本文中的boto.emr.connection.EmrConnection.add_jobflow_steps方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。