

Python EmrConnection.describe_jobflow method code examples

This article collects typical usage examples of the Python method boto.emr.connection.EmrConnection.describe_jobflow. If you are wondering what EmrConnection.describe_jobflow does, how to call it, or what real-world usage looks like, the hand-picked examples below should help. You can also explore further usage of its containing class, boto.emr.connection.EmrConnection.


The sections below show 12 code examples of EmrConnection.describe_jobflow, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
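Before the examples, here is a minimal sketch of the pattern most of these snippets share: create a connection, look up a job flow by id, and poll its state. The credentials and job flow id below are placeholders, not values taken from any of the examples.

# Minimal sketch (placeholders only): poll a job flow's state with describe_jobflow.
import time
from boto.emr.connection import EmrConnection

conn = EmrConnection('<aws_access_key_id>', '<aws_secret_access_key>')
jobflow_id = 'j-XXXXXXXXXXXXX'   # as returned earlier by conn.run_jobflow(...)

state = conn.describe_jobflow(jobflow_id).state
while state not in (u'COMPLETED', u'FAILED', u'TERMINATED'):
    time.sleep(10)               # Amazon throttles this call; poll sparingly
    state = conn.describe_jobflow(jobflow_id).state
print state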

Example 1: get_cluster_status

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import describe_jobflow [as alias]
def get_cluster_status(cluster_id):
	try:
		emr_connection = EmrConnection()
		flow = emr_connection.describe_jobflow(cluster_id)
		if flow is None:
			return "none"
		return flow.state
	except Exception:
		return "none"
Developer: valeter, Project: nlp-site, Lines of code: 11, Source file: process_file.py

Example 2: creating_a_connection

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import describe_jobflow [as alias]
class EMR:
    def creating_a_connection(self):
        #Creating a connection
        from boto.emr.connection import EmrConnection
        self.conn = EmrConnection('', '')

    def creating_streaming_job(self):
        #Creating Streaming JobFlow Steps
        from boto.emr.step import StreamingStep
        self.step = StreamingStep(name='my bigdata task',
            mapper='s3n://eth-src/raw_to_stations.py',
            #mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
            reducer='s3n://eth-src/stations_to_features.py',
            #reducer='aggregate',
            input='s3n://eth-input/2007.csv',
            #input='s3n://elasticmapreduce/samples/wordcount/input',
            output='s3n://eth-middle/2007')

    def creating_jobflows(self):
        #Creating JobFlows
        #import boto.emr
        #self.conn = boto.emr.connect_to_region('eu-west-1')
        job_id = self.conn.run_jobflow(name='My jobflow',
                log_uri='s3://eth-log/jobflow_logs',
                master_instance_type='m3.xlarge',
                slave_instance_type='m1.large',
                num_instances=2,
                steps=[self.step],
                ami_version='3.3.1'
                )

        status = self.conn.describe_jobflow(job_id)
        status.state

    def terminating_jobflows(self, job_id):
        #Terminating JobFlows
        #self.conn = boto.emr.connect_to_region('eu-west-1')
        self.conn.terminate_jobflow(job_id)
Developer: raynald, Project: ETH_BBBigData, Lines of code: 40, Source file: EMR.py

Example 3: __init__

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import describe_jobflow [as alias]

#......... part of the code omitted here .........

        Returns a boto.emr.emrobject.JobFlow object.

        Special notes:
            The JobFlow object has the following relevant fields.
                state       <str>           the state of the job flow,
                                            either COMPLETED
                                                 | FAILED
                                                 | TERMINATED
                                                 | RUNNING
                                                 | SHUTTING_DOWN
                                                 | STARTING
                                                 | WAITING

                steps       <list(boto.emr.emrobject.Step)>
                            a list of the step details in the workflow.

            The Step object has the following relevant fields.
                state               <str>       the state of the step.

                startdatetime       <str>       the start time of the
                                                job.

                enddatetime         <str>       the end time of the job.

            WARNING! Amazon has an upper-limit on the frequency with
            which you can call this method; we have had success with
            calling it at most once every 10 seconds.
        """

        if not self.job_id:
            raise RankmaniacError('No job is running.')

        return self._emr_conn.describe_jobflow(self.job_id)

    def _get_last_process_step_iter_no(self):
        """
        Returns the iteration number of the most recent process-step of
        the job flow that has been completed.
        """

        steps = self.describe().steps
        i = 1

        while i < len(steps):
            step = steps[i]
            if step.state != 'COMPLETED':
                break

            i += 2

        return i / 2 - 1

    def _get_default_outdir(self, name, iter_no=None):
        """
        Returns the default output directory, which is 'iter_no/name/'.
        """

        if iter_no is None:
            iter_no = self._iter_no

        # Return iter_no/name/ **with** the trailing slash
        return '%s/%s/' % (iter_no, name)

    def _submit_new_job(self, steps):
        """
Developer: aagarwal1990, Project: CS144_Rankmaniac, Lines of code: 70, Source file: rankmaniac.py
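
The JobFlow fields documented in the docstring above can be read directly off the object returned by describe_jobflow. A small illustrative sketch, assuming conn is an existing EmrConnection and jobflow_id identifies a running job flow (both placeholders):

# Illustrative sketch of reading the fields described in the docstring above.
flow = conn.describe_jobflow(jobflow_id)
print 'job flow state:', flow.state
for step in flow.steps:
    # each step carries its own state plus start/end timestamps (strings)
    print step.state, getattr(step, 'startdatetime', '-'), getattr(step, 'enddatetime', '-')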

Example 4: StreamingStep

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import describe_jobflow [as alias]
files_short="""split_A.txt
split_B.txt
split_C.txt
split_D.txt
split_E.txt
split_F.txt
split_G.txt
split_H.txt
split_I.txt
split_J.txt
split_K.txt
split_L.txt
split_M.txt""".split('\n')

input_files=['s3n://smalldata/'+f for f in files_short]


step = StreamingStep(name='Inverted Index ',
                      mapper='s3n://css739/invIndex/inv-index-mapper.py',
                      reducer='s3n://css739/invIndex/inv-index-mapper.py',
                      input=input_files,
                      #input='s3n://smalldata/wikipedia_titles.txt',
                      output='s3n://css739/invIndex/invindex_output2')
                      #cache_files=['s3n://css739/invindex/english_stoplist.py'])
                      
                      
jobid = conn.run_jobflow(name='Inverted Index', log_uri='s3n://css739/invIndex/jobflow_logs',steps=[step])

conn.describe_jobflow(jobid).state
Developer: 2042Labs, Project: S12-BigData, Lines of code: 31, Source file: emr_inv_index.py

Example 5:

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import describe_jobflow [as alias]
                           'OUTPUT=s3://dphiveoutput'])

jobname = 'MM Logs Jobflow %s' %dt.datetime.now()

jobid = conne.run_jobflow(name=jobname,
                          log_uri='s3://dphive/debug/',
                          ec2_keyname='dpaws',
                          master_instance_type='c1.medium',
                          slave_instance_type='c1.medium',
                          num_instances=3,
                          steps=[step1, step2])



while True:
	status = conne.describe_jobflow(jobid)
	if status.state == 'STARTING':
		time.sleep(10)
	elif status.state == 'RUNNING':
		time.sleep(10)
	elif status.state == 'WAITING':
		time.sleep(10)
	elif status.state == 'TERMINATED':
		send('Hadoop Job Runner Update %s'%(dt.datetime.now()), 'The Hadoop Job: %s currently has the status: %s' %(jobid, status.state), '[email protected]')
		break
	elif status.state == 'FAILED':
		send('Hadoop Job Runner Update %s'%(dt.datetime.now()), 'The Hadoop Job: %s currently has the status: %s' %(jobid, status.state), ['[email protected]', '[email protected]'])
		break
	elif status.state == 'SHUTTING_DOWN':
		time.sleep(10)
	elif status.state == 'COMPLETED':
Developer: mobtownlabs, Project: ec2-user, Lines of code: 33, Source file: hadoop_job_runner.py

Example 6: str

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import describe_jobflow [as alias]
for word in b.list():
    keystring = str(word.key)
    if re.match(keystring,'part-00000'):
        word.get_contents_to_filename('/Users/winteram/Documents/Teaching/wordcount_output.txt')

# <codecell>

# Doing our own word counts
step = StreamingStep(name='Alcohol Step',
	mapper='s3n://bia660-winter/mapper.py',
	reducer='s3n://bia660-winter/reducer.py', 
	input='s3://datasets.elasticmapreduce/ngrams/books/20090715/eng-us-all/3gram/data',
	output='s3n://bia660-winter/output')

# <codecell>

jobid = emrcon.run_jobflow(name='Alcohol Religion 6', log_uri='s3://bia660-winter/logfiles',steps=[step],num_instances=4)

# <codecell>

jobid

# <codecell>

status = emrcon.describe_jobflow(jobid)
print status.state

# <codecell>


Developer: todatamining, Project: db1, Lines of code: 30, Source file: boto.py

Example 7: EmrLauncher

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import describe_jobflow [as alias]
class EmrLauncher(object):

    # Default constructor of the class.
    def __init__(self):
        try:
            self.zone_name = "ap-southeast-1"
            self.access_key = "xxxxxx"
            self.private_key = "xxxxxxx"
            self.ec2_keyname = "xxxxxxxx"
            self.base_bucket = "s3://emr-bucket/"
            self.bootstrap_script = "custom-bootstrap.sh"
            self.log_dir = "Logs"
            self.emr_status_wait = 20
            self.conn = ""
            self.cluster_name = "MyFirstEmrCluster"

            # Establishing EmrConnection
            self.conn = EmrConnection(self.access_key, self.private_key,
                                 region=RegionInfo(name=self.zone_name,
                                 endpoint=self.zone_name + '.elasticmapreduce.amazonaws.com'))


            self.log_bucket_name = self.base_bucket + self.log_dir
            self.bootstrap_script_name = self.base_bucket + self.bootstrap_script
        except:
            # except clause restored so the constructor parses; the logging call is
            # an assumption modeled on the error handling used later in this class
            logging.error("Initializing EmrLauncher failed")

    def launch_emr_cluster(self, master_type, slave_type, num_instance, ami_version):
        try:
            #Custom Bootstrap step
            bootstrap_step = BootstrapAction("CustomBootStrap", self.bootstrap_script_name, None)

            #Modifying block size to 256 MB
            block_size_conf = 'dfs.block.size=256'
            hadoop_config_params = ['-h', block_size_conf, '-h']
            hadoop_config_bootstrapper = BootstrapAction('hadoop-config',
                                               's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
                                               hadoop_config_params)
            #Bootstrapping Ganglia
            hadoop_monitor_bootstrapper = BootstrapAction('ganglia-config',
                                                's3://elasticmapreduce/bootstrap-actions/install-ganglia', '')

            #Bootstrapping Impala
            impala_install_params = ['--install-impala','--base-path', 's3://elasticmapreduce', '--impala-version', 'latest']
            bootstrap_impala_install_step = BootstrapAction("ImpalaInstall", "s3://elasticmapreduce/libs/impala/setup-impala",
                                                                                                impala_install_params)
            #Hive installation
            hive_install_step = InstallHiveStep();

            #Pig Installation
            pig_install_step = InstallPigStep();

            #Launching the cluster
            jobid = self.conn.run_jobflow(
                         self.cluster_name,
                         self.log_bucket_name,
                         bootstrap_actions=[hadoop_config_bootstrapper, hadoop_monitor_bootstrapper, bootstrap_step,
                                            bootstrap_impala_install_step],
                         ec2_keyname=self.ec2_keyname,
                         steps=[hive_install_step, pig_install_step],
                         keep_alive=True,
                         action_on_failure = 'CANCEL_AND_WAIT',
                         master_instance_type=master_type,
                         slave_instance_type=slave_type,
                         num_instances=num_instance,
                         ami_version=ami_version)

            #Enabling the termination protection
            self.conn.set_termination_protection(jobid, True)

            #Checking the state of EMR cluster
            state = self.conn.describe_jobflow(jobid).state
            while state != u'COMPLETED' and state != u'SHUTTING_DOWN' and state != u'FAILED' and state != u'WAITING':
                #sleeping to recheck for status.
                time.sleep(int(self.emr_status_wait))
                state = self.conn.describe_jobflow(jobid).state

            if state == u'SHUTTING_DOWN' or state == u'FAILED':
                logging.error("Launching EMR cluster failed")
                return "ERROR"

            #Check if the state is WAITING. Then launch the next steps
            if state == u'WAITING':
                #Finding the master node dns of EMR cluster
                master_dns = self.conn.describe_jobflow(jobid).masterpublicdnsname
                logging.info("Launched EMR Cluster Successfully")
                logging.info("Master node DNS of EMR " + master_dns)
                return "SUCCESS"
        except:
            logging.error("Launching EMR cluster failed")
            return "FAILED"

    def main(self):
        try:
            master_type = 'm3.xlarge'
            slave_type = 'm3.xlarge'
            num_instance = 3
            ami_version = '2.4.8'

            emr_status = self.launch_emr_cluster(master_type, slave_type, num_instance, ami_version)
            if emr_status == 'SUCCESS':
                logging.info("Emr cluster launched successfully")
#......... part of the code omitted here .........
Developer: amalgjose, Project: MyExperiments, Lines of code: 103, Source file: EmrLauncher.py

Example 8: main

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import describe_jobflow [as alias]
def main():
    aws_access = sys.argv[1]
    aws_secert = sys.argv[2]
    jar_path = sys.argv[3]
    input_filename = sys.argv[4]
    output_filename = sys.argv[5]
    nodes = int(sys.argv[6])
    slots = 7 * nodes
    s3_in = sys.argv[7] + "_" + str(os.getpid()) + "_in"
    s3_out= sys.argv[7] + "_" + str(os.getpid()) + "_out"
    s3_asm= sys.argv[7] + "_" + str(os.getpid()) + "_asm"
    readlen = int(sys.argv[8])
    kmer= int(sys.argv[9])


    # connect to S3
    s3_conn = S3Connection(aws_access, aws_secert)
    mybucket = s3_conn.create_bucket(aws_access.lower())
    mybucket = s3_conn.get_bucket(aws_access.lower(), validate=False)
    print "\nConnection created"
    # upload  data
    k = Key(mybucket)
    k.key = 'ReadStackCorrector.jar'
    k.set_contents_from_filename(jar_path + 'ReadStackCorrector.jar')
    #k.key = 'CloudBrush.jar'
    k.key = 'CloudbrushGPU.jar'
    k.set_contents_from_filename(jar_path + 'CloudbrushGPU-GPU.jar')

    # uploading file parallel
    #k.key = s3_in
    #k.set_contents_from_filename(input_filename)
    print "\nStarting Upload"
    s3_path = 's3://%s/%s' % (aws_access.lower(), s3_in)
    upload_cmd = 'python %s/s3-mp-upload.py %s %s %s %s -f 2>&1' % (jar_path, input_filename, s3_path, aws_access, aws_secert)
    proc = subprocess.call( args=upload_cmd, shell=True )

    #k.key = s3_out
    #k.delete()
    # connect to EMR    InstanceGroup(nodes, 'CORE', 'c1.xlarge', 'ON_DEMAND', '[email protected]', '0.4')
    emr_conn = EmrConnection(aws_access, aws_secert)
    instance_groups = [
        InstanceGroup(1, 'MASTER', 'm1.medium', 'ON_DEMAND', '[email protected]', '0.4'),
        InstanceGroup(nodes, 'CORE', 'g2.2xlarge', 'ON_DEMAND', '[email protected]', '0.4')
        ]

    # perform CloudRS
    step1 = JarStep(name='CloudRS',
                   jar='s3n://%s/ReadStackCorrector.jar' % (aws_access.lower()),
                   step_args = ['-in', 's3n://%s/%s' % (aws_access.lower(), s3_in), '-out', s3_out, '-slots', slots, '-javaopts', '-Xmx960m'])

    # perform CloudBrush
    step2 = JarStep(name='CloudBrush',
                   jar='s3n://%s/CloudbrushGPU-GPU.jar' % (aws_access.lower()),
                   step_args = ['-reads', s3_out, '-asm', s3_asm, '-readlen', readlen, '-k', kmer, '-slots', slots, '-javaopts', '-Xmx960m'])

    # copy from hdfs to S3
    k.key = s3_asm
    step3 = JarStep(name='S3DistCp',
                    jar='/home/hadoop/lib/emr-s3distcp-1.0.jar', #'s3://elasticmapreduce/libs/s3distcp/role/s3distcp.jar',
                    step_args = ['--src', 'hdfs:///user/hadoop/%s' % s3_asm , '--dest', 's3://%s/%s' % (aws_access.lower(), s3_asm), '--groupBy', '.*(part).*'])
    jobid = emr_conn.run_jobflow(name='CloudBrush',
                             log_uri='s3://%s/jobflow_logs' % aws_access.lower(),
                             ami_version='latest',
                             hadoop_version='2.4.0', #'0.20.205'
                             keep_alive=False,
                             visible_to_all_users=True,
                             steps=[step1,step2,step3],
                             instance_groups = instance_groups)

    state = emr_conn.describe_jobflow(jobid).state
    print "job state = ", state
    print "job id = ", jobid
    while state != u'COMPLETED':
        print time.asctime(time.localtime())
        time.sleep(30)
        state = emr_conn.describe_jobflow(jobid).state
        print "job state = ", state
        print "job id = ", jobid
        if state == u'FAILED':
            print 'FAILED!!!!'
            break

    # download file parallel
    #k.key = "%s/part0" % (s3_asm)
    #k.get_contents_to_filename(output_filename)
    if state == u'COMPLETED':
        s3_path  = 's3://%s/%s/part0' % (aws_access.lower(), s3_asm)
        download_cmd = 'python %s/s3-mp-download.py %s %s %s %s -f 2>&1' % (jar_path, s3_path, output_filename, aws_access, aws_secert)
        proc = subprocess.call( args=download_cmd, shell=True )


        # delete file in S3
        k.key = s3_in
        k.delete()
        k.key = "%s/part0" % (s3_asm)
        k.delete()
Developer: zenlc2000, Project: CloudbrushGPU, Lines of code: 98, Source file: CloudBrush_EMR.py

Example 9: EmrManager

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import describe_jobflow [as alias]
class EmrManager(object):
 
    # Default constructor of the class. Uses default parameters if not provided.
    def __init__(self, parameters):
        try: 
            self.region_name = parameters["region_name"]
            self.access_key = parameters["access_key"]
            self.secret_key = parameters["secret_key"]
            self.ec2_keypair_name = parameters["ec2_keypair_name"]
            self.base_bucket = parameters["base_bucket"]
            self.log_dir = parameters["log_dir"]
            self.emr_status_wait = parameters["emr_status_wait"]
            self.step_status_wait = parameters["step_status_wait"]
            self.emr_cluster_name = parameters["emr_cluster_name"]
        except:
            logging.error("Something went wrong initializing EmrManager")
            sys.exit()

        # Establishing EmrConnection
        self.connection = EmrConnection(self.access_key, self.secret_key,
                             region=RegionInfo(name=self.region_name,
                             endpoint=self.region_name + '.elasticmapreduce.amazonaws.com'))

        self.log_bucket_name = self.base_bucket + self.log_dir
 
    #Method for launching the EMR cluster
    def launch_cluster(self, master_type, slave_type, num_instances, ami_version):
        try:
            #Launching the cluster
            cluster_id = self.connection.run_jobflow(
                             self.emr_cluster_name,
                             self.log_bucket_name,
                             ec2_keyname=self.ec2_keypair_name,
                             keep_alive=True,
                             action_on_failure = 'CANCEL_AND_WAIT',
                             master_instance_type=master_type,
                             slave_instance_type=slave_type,
                             num_instances=num_instances,
                             ami_version=ami_version)

            logging.info("Launching cluster: " + cluster_id + ". Please be patient. Check the status of your cluster in your AWS Console")

            # Checking the state of EMR cluster
            state = self.connection.describe_jobflow(cluster_id).state
            while state != u'COMPLETED' and state != u'SHUTTING_DOWN' and state != u'FAILED' and state != u'WAITING':
                #sleeping to recheck for status.
                time.sleep(int(self.emr_status_wait))
                state = self.connection.describe_jobflow(cluster_id).state
                logging.info("Creating cluster " + cluster_id + ". Status: " + state)
 
            if state == u'SHUTTING_DOWN' or state == u'FAILED':
                logging.error("Launching EMR cluster failed")
                return "ERROR"
 
            #Check if the state is WAITING. Then launch the next steps
            if state == u'WAITING':
                #Finding the master node dns of EMR cluster
                master_dns = self.connection.describe_jobflow(cluster_id).masterpublicdnsname
                logging.info("Launched EMR Cluster Successfully with cluster id:" + cluster_id)
                logging.info("Master node DNS of EMR " + master_dns)
                return cluster_id
        except:
            logging.error("Launching EMR cluster failed")
            return "FAILED"

    # run scripting step in cluster
    def run_scripting_step(self, cluster_id, name, script_path):
        try:
            step = ScriptRunnerStep(name=name, 
                                    step_args=[script_path],
                                    action_on_failure="CONTINUE")
            return self._run_step(cluster_id, step)
        except:
            logging.error("Running scripting step in cluster " + cluster_id + " failed.")
            return "FAILED"

    # run streaming step in cluster
    def run_streaming_step(self, cluster_id, name, mapper_path, reducer_path, input_path, output_path):
        try:
            # bundle files with the job
            files = []
            if mapper_path != "NONE":
                files.append(mapper_path)
                mapper_path = mapper_path.split("/")[-1]
            if reducer_path != "NONE":
                files.append(reducer_path)
                reducer_path = reducer_path.split("/")[-1]
            # build streaming step
            logging.debug("Launching streaming step with mapper: " + mapper_path + " reducer: " + reducer_path + " and files: " + str(files))
            step = StreamingStep(name=name,
                                    step_args=["-files"] + files, 
                                    mapper=mapper_path, 
                                    reducer=reducer_path, 
                                    input=input_path, 
                                    output=output_path, 
                                    action_on_failure="CONTINUE")
            return self._run_step(cluster_id, step)            
        except:
            logging.error("Running streaming step in cluster " + cluster_id + " failed.")
            return "FAILED"
#......... part of the code omitted here .........
Developer: DiegoTUI, Project: emr-orchestrator, Lines of code: 103, Source file: emr_manager.py

Example 10: EmrClient

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import describe_jobflow [as alias]
class EmrClient(object):


    # The Hadoop version to use
    HADOOP_VERSION = '1.0.3'

    # The AMI version to use
    AMI_VERSION = '2.4.7'
 
    # Interval to wait between polls to EMR cluster in seconds
    CLUSTER_OPERATION_RESULTS_POLLING_SECONDS = 10
 
    # Timeout for EMR creation and ramp up in seconds
    CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS = 60 * 30
 
    def __init__(self, region_name='us-east-1', aws_access_key_id=None, aws_secret_access_key=None):
 
        # If the access key is not specified, get it from the luigi config.cfg file
        if not aws_access_key_id:
            aws_access_key_id = luigi.configuration.get_config().get('aws', 'aws_access_key_id')
 
        if not aws_secret_access_key:
            aws_secret_access_key = luigi.configuration.get_config().get('aws', 'aws_secret_access_key')
 
 
        # Create the region in which to run
        region_endpoint = u'elasticmapreduce.%s.amazonaws.com' % (region_name)
        region = RegionInfo(name=region_name, endpoint=region_endpoint)
 
        self.emr_connection = EmrConnection(aws_access_key_id=aws_access_key_id,
                                            aws_secret_access_key=aws_secret_access_key,
                                            region=region)
 
    def launch_emr_cluster(self, cluster_name, log_uri, ec2_keyname=None, master_type='m1.small', core_type='m1.small', num_instances=2, hadoop_version='1.0.3', ami_version='2.4.7', ):
 
        # TODO Remove
        # install_pig_step = InstallPigStep()
 
        jobflow_id = self.emr_connection.run_jobflow(name=cluster_name,
                              log_uri=log_uri,
                              ec2_keyname=ec2_keyname,
                              master_instance_type=master_type,
                              slave_instance_type=core_type,
                              num_instances=num_instances,
                              keep_alive=True,
                              enable_debugging=True,
                              hadoop_version=EmrClient.HADOOP_VERSION,
                              steps=[], 
                              ami_version=EmrClient.AMI_VERSION)
 
        # Log important information
        status = self.emr_connection.describe_jobflow(jobflow_id)

        logger.info('Creating new cluster %s with following details' % status.name)
        logger.info('jobflow ID:\t%s' % status.jobflowid)
        logger.info('Log URI:\t%s' % status.loguri)
        logger.info('Master Instance Type:\t%s' % status.masterinstancetype)
        
        # A cluster of size 1 does not have any slave instances
        if hasattr(status, 'slaveinstancetype'):
            logger.info('Slave Instance Type:\t%s' % status.slaveinstancetype)
        
        logger.info('Number of Instances:\t%s' % status.instancecount)
        logger.info('Hadoop Version:\t%s' % status.hadoopversion)
        logger.info('AMI Version:\t%s' % status.amiversion)
        logger.info('Keep Alive:\t%s' % status.keepjobflowalivewhennosteps)
 
        return self._poll_until_cluster_ready(jobflow_id)
 
 
    def add_pig_step(self, jobflow_id, pig_file, name='Pig Script', pig_versions='latest', pig_args=[]): 

        pig_step = PigStep(name=name,
                           pig_file=pig_file,
                           pig_versions=pig_versions,
                           pig_args=pig_args,
                           # action_on_failure='CONTINUE',
                       )

        self.emr_connection.add_jobflow_steps(jobflow_id, [pig_step])

        # Poll until the cluster is done working        
        return self._poll_until_cluster_ready(jobflow_id)


    def shutdown_emr_cluster(self, jobflow_id):
 
        self.emr_connection.terminate_jobflow(jobflow_id)
        return self._poll_until_cluster_shutdown(jobflow_id)
 
    def get_jobflow_id(self):
        # Get the id of the cluster that is WAITING for work
        return self.emr_connection.list_clusters(cluster_states=['WAITING']).clusters[0].id
 
    def get_master_dns(self):
        """
        Get the master node's public address
        """
        # Get the jobflow ID
        jobflow_id = self.get_jobflow_id()
#......... part of the code omitted here .........
Developer: mbrio, Project: Luigi, Lines of code: 103, Source file: emr_client.py
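
The excerpt above stops before the _poll_until_cluster_ready helper that launch_emr_cluster and add_pig_step rely on. A minimal sketch of such a poller, built only on describe_jobflow and the class constants shown, might look like the following; it is an illustration under those assumptions, not the project's actual code, and it assumes time is imported at module level.

    # Illustrative only: a poller shaped by the class constants defined above.
    def _poll_until_cluster_ready(self, jobflow_id):
        elapsed = 0
        while elapsed < EmrClient.CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS:
            state = self.emr_connection.describe_jobflow(jobflow_id).state
            if state == u'WAITING':
                return jobflow_id    # idle cluster, ready to accept new steps
            if state in (u'FAILED', u'TERMINATED', u'SHUTTING_DOWN'):
                raise Exception('Cluster %s entered state %s' % (jobflow_id, state))
            time.sleep(EmrClient.CLUSTER_OPERATION_RESULTS_POLLING_SECONDS)
            elapsed += EmrClient.CLUSTER_OPERATION_RESULTS_POLLING_SECONDS
        raise Exception('Timed out waiting for cluster %s' % jobflow_id)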

Example 11: main

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import describe_jobflow [as alias]
def main(args):
	script_name = args
	for i in range(2, 3, 2):
		start_time = time.time()


		bucket_name = 'nlp-' + str(i).strip()

		emr_connection = EmrConnection()

		preprocessing_steps = []
		for j in xrange(12, 13, 12):
			preprocessing_steps.append(JarStep(name='prerocessing-' + str(i).strip(),
				jar='s3n://nlp-' + str(i).strip() + '/init/behemoth-core.jar',
				step_args=['com.digitalpebble.behemoth.util.CorpusGenerator',
					'-i', 's3n://nlp-' + str(i).strip() + '/' + str(j).strip() + '/texts',
					'-o', 's3n://nlp-' + str(i).strip() + '/' + str(j).strip() + '/bcorpus']))

		tika_steps = []
		for j in xrange(12, 13, 12):
			tika_steps.append(JarStep(name='tika-' + str(i).strip(),
				jar='s3n://nlp-' + str(i).strip() + '/init/behemoth-tika.jar',
				step_args=['com.digitalpebble.behemoth.tika.TikaDriver',
					'-i', 's3n://nlp-' + str(i).strip() + '/' + str(j).strip() + '/bcorpus',
					'-o', 's3n://nlp-' + str(i).strip() + '/' + str(j).strip() + '/tcorpus']))

		copy_jar_steps = []
		for j in xrange(12, 13, 12):
			copy_jar_steps.append(JarStep(name='copy-jar-' + str(i).strip(),
				jar='s3n://nlp-' + str(i).strip() + '/init/copy-to-hdfs.jar',
				step_args=['s3n://nlp-' + str(i).strip() + '/init/pipeline.pear',
					'/mnt/pipeline.pear']))

		uima_steps = []
		for j in xrange(12, 13, 12):
			uima_steps.append(JarStep(name='uima-' + str(i).strip(),
				jar='s3n://nlp-' + str(i).strip() + '/init/behemoth-uima.jar',
				step_args=['com.digitalpebble.behemoth.uima.UIMADriver',
					's3n://nlp-' + str(i).strip() + '/' + str(j).strip() + '/tcorpus',
					'/mnt/ucorpus',
					'/mnt/pipeline.pear']))

		steps = []
		steps.extend(preprocessing_steps)
		steps.extend(tika_steps)
		steps.extend(copy_jar_steps)
		steps.extend(uima_steps)
		steps.extend(extract_result_steps)

		hadoop_params = ['-m','mapred.tasktracker.map.tasks.maximum=1',
		          '-m', 'mapred.child.java.opts=-Xmx10g']
		configure_hadoop_action = BootstrapAction('configure_hadoop', 's3://elasticmapreduce/bootstrap-actions/configure-hadoop', hadoop_params)

		jobid = emr_connection.run_jobflow(name='nlp-cloud-' + str(i).strip(),
			log_uri='s3://nlp-' + str(i).strip() + '/jobflow_logs',
			master_instance_type='m2.xlarge',
			slave_instance_type='m2.xlarge',
			num_instances=i,
			keep_alive=False,
			enable_debugging=False,
			bootstrap_actions=[configure_hadoop_action],
			hadoop_version='1.0.3',
			steps=steps)

		termination_statuses = [u'COMPLETED', u'FAILED', u'TERMINATED']
		while True:
			time.sleep(5)
			status = emr_connection.describe_jobflow(jobid) 
			if status.state in termination_statuses:
				print 'Job finished for %s nodes' % i
				break


		print time.time() - start_time, ' seconds elapsed'



	return True

if (__name__ == '__main__'):
	args = sys.argv
	if (check_args(args)):
		if (main(args)):
			print 'Work successfully finished'
			sys.exit()
		else:
			print 'Could not finish work'
			sys.exit(1)
	else:
		print USAGE_MESSAGE
	sys.exit(2)
Developer: valeter, Project: nlp-site, Lines of code: 93, Source file: run_cluster.py

Example 12: __init__

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import describe_jobflow [as alias]

#......... part of the code omitted here .........
        Stops the current running job.
        '''

        if not self.job_id:
            raise Exception('No job is running.')

        self.emr_conn.terminate_jobflow(self.job_id)
        self.job_id = None

    def get_job(self):
        '''Gets the running job details

        Returns:
            JobFlow object with relevant fields:
                state           string      the state of the job flow, either
                                            COMPLETED | FAILED | TERMINATED
                                            RUNNING | SHUTTING_DOWN | STARTING
                                            WAITING | BOOTSTRAPPING
                steps           list(Step)  a list of the step details in the
                                            workflow. A Step has the relevant
                                            fields:
                                                status              string
                                                startdatetime       string
                                                enddatetime         string

        Note: Amazon has an upper-limit on the frequency with which you can
              call this function; we have had success with calling it once
              every 10 seconds.
        '''
        
        if not self.job_id:
            raise Exception('No job is running.')

        return self.emr_conn.describe_jobflow(self.job_id)

    def add_step(self, mapper, reducer, input, output, num_map=1, 
                 num_reduce=1):
        '''Add a step to an existing job

        Adds a new step to an already running job flow.

        Note: any given job flow can support up to 256 steps. To workaround
              this limitation, you can always choose to submit a new job
              once the current job completes.
        
        Arguments:
            mapper          string      path to the mapper, relative to
                                        your data directory.
            reducer         string      path to the reducer, relative to
                                        your data directory.
            input           string      path to the input data, relative to
                                        your data directory. To specify a
                                        directory as input, ensure your path
                                        contains a trailing /.
            output          string      path to the desired output directory.
        '''

        if not self.job_id:
            raise Exception('No job is running.')

        step = self._make_step(mapper, reducer, input, output, num_map,
                               num_reduce)
        self.emr_conn.add_jobflow_steps(self.job_id, [step])
    
    def upload(self, in_dir='data'):
        '''Upload local data to S3
开发者ID:arjunc12,项目名称:rankmaniac,代码行数:70,代码来源:rankmaniac.py


Note: The boto.emr.connection.EmrConnection.describe_jobflow method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. Please refer to the corresponding project's License before distributing or using the code. Do not reproduce without permission.