

Python EmrConnection.run_jobflow Method Code Examples

This article collects typical usage examples of the Python method boto.emr.connection.EmrConnection.run_jobflow. If you are wondering what EmrConnection.run_jobflow does, how to call it, or what real-world uses look like, the curated examples below should help. You can also browse further usage examples of the containing class, boto.emr.connection.EmrConnection.


Fifteen code examples of the EmrConnection.run_jobflow method are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code samples.
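Before diving in, here is a minimal sketch of the pattern all fifteen examples share: open an EmrConnection, describe the work as one or more steps, and launch it with run_jobflow, which returns a job flow id used for later polling or termination. The credentials, bucket names, and paths in this sketch are placeholders, not values taken from any of the examples.

from boto.emr.connection import EmrConnection
from boto.emr.step import StreamingStep

# Placeholder credentials and bucket -- substitute your own values.
conn = EmrConnection('<aws-access-key-id>', '<aws-secret-access-key>')

step = StreamingStep(name='Example step',
                     mapper='s3n://<my-bucket>/mapper.py',
                     reducer='aggregate',
                     input='s3n://<my-bucket>/input',
                     output='s3n://<my-bucket>/output')

# run_jobflow returns the job flow id; pass it to describe_jobflow to poll
# state, add_jobflow_steps to queue more work, or terminate_jobflow to stop.
jobflow_id = conn.run_jobflow(name='Example jobflow',
                              log_uri='s3://<my-bucket>/logs',
                              steps=[step])
print conn.describe_jobflow(jobflow_id).state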

Example 1: start_hadoop_cluster

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
from boto.emr.connection import EmrConnection
from boto.emr.step import JarStep
from boto.emr.bootstrap_action import BootstrapAction

def start_hadoop_cluster(nodenum):
	try:
		hadoop_params = ['-m', 'mapred.tasktracker.map.tasks.maximum=1',
		                 '-m', 'mapred.child.java.opts=-Xmx10g']
		configure_hadoop_action = BootstrapAction('configure_hadoop', 's3://elasticmapreduce/bootstrap-actions/configure-hadoop', hadoop_params)

		emr_connection = EmrConnection()
		bucket_name = "udk-bucket"
		steps = []
		copy_jar_step = JarStep(name='copy-jar',
			jar='s3n://' + bucket_name + '/copy-to-hdfs.jar',
			step_args=['s3n://' + bucket_name + '/pipeline.pear',
				'/mnt/pipeline.pear'])
		steps.append(copy_jar_step)

		jobflow_id = emr_connection.run_jobflow(name='udk',
			log_uri='s3://udk-bucket/jobflow_logs',
			master_instance_type='m2.xlarge',
			slave_instance_type='m2.xlarge',
			num_instances=nodenum,
			keep_alive=True,
			enable_debugging=False,
			bootstrap_actions=[configure_hadoop_action],
			hadoop_version='1.0.3',
			steps=steps)
		emr_connection.set_termination_protection(jobflow_id, True)
		
		return jobflow_id
	except Exception:
		return "none"
Developer: valeter | Project: nlp-site | Lines: 32 | Source: run_cluster.py

Example 2: run

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
    def run(self):
        """Run the Hive job on EMR cluster
        """
        #  copy the data source to a new object
        #  (Hive deletes/moves the original)
        copy_s3_file(self.input_path, self.data_path)

        # and create the hive script
        self._generate_and_upload_hive_script()

        logger.info("Waiting {} seconds for S3 eventual consistency".format(
                    self.s3_sync_wait_time))
        time.sleep(self.s3_sync_wait_time)

        # TODO more options like setting aws region
        conn = EmrConnection(self.aws_access_key_id,
                             self.aws_secret_access_key)

        setup_step = InstallHiveStep(self.hive_version)
        run_step = HiveStep(self.job_name, self.script_path)

        jobid = conn.run_jobflow(
            self.job_name,
            self.log_path,
            action_on_failure='CANCEL_AND_WAIT',
            master_instance_type=self.master_instance_type,
            slave_instance_type=self.slave_instance_type,
            ami_version=self.ami_version,
            num_instances=self.num_instances)

        conn.add_jobflow_steps(jobid, [setup_step, run_step])

        self._wait_for_job_to_complete(conn, jobid)

        logger.info("Output file is in: {0}".format(self.output_path))
Developer: JonathanBatten | Project: apiarist | Lines: 37 | Source: emr.py

Example 3: creating_a_connection

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
class EMR:
    def creating_a_connection(self):
        #Creating a connection
        from boto.emr.connection import EmrConnection
        self.conn = EmrConnection('', '')

    def creating_streaming_job(self):
        #Creating Streaming JobFlow Steps
        from boto.emr.step import StreamingStep
        self.step = StreamingStep(name='my bigdata task',
            mapper='s3n://eth-src/raw_to_stations.py',
            #mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
            reducer='s3n://eth-src/stations_to_features.py',
            #reducer='aggregate',
            input='s3n://eth-input/2007.csv',
            #input='s3n://elasticmapreduce/samples/wordcount/input',
            output='s3n://eth-middle/2007')

    def creating_jobflows(self):
        #Creating JobFlows
        #import boto.emr
        #self.conn = boto.emr.connect_to_region('eu-west-1')
        job_id = self.conn.run_jobflow(name='My jobflow',
                log_uri='s3://eth-log/jobflow_logs',
                master_instance_type='m3.xlarge',
                slave_instance_type='m1.large',
                num_instances=2,
                steps=[self.step],
                ami_version='3.3.1'
                )

        status = self.conn.describe_jobflow(job_id)
        return status.state

    def terminating_jobflows(self, job_id):
        #Terminating JobFlows
        #self.conn = boto.emr.connect_to_region('eu-west-1')
        self.conn.terminate_jobflow(job_id)
Developer: raynald | Project: ETH_BBBigData | Lines: 40 | Source: EMR.py

Example 4: EmrConnection

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
from boto.emr.connection import EmrConnection
from boto.emr.step import StreamingStep
import boto

AWS_KEY = '<aws-access-key-id>'
AWS_SECRET = '<aws-secret-access-key>'

conn = EmrConnection(AWS_KEY, AWS_SECRET)

step = StreamingStep(name='My wordcount example',
                      mapper='s3n://css739/wordcount/bigramSplitter.py',
                      reducer='aggregate',
                      input='s3n://smalldata/wikipedia_titles.txt',
                      output='s3n://css739/wordcount/bigram_count_output2',
                      cache_files=['s3n://css739/wordcount/english_stoplist.py'])
                      
                      
jobid = conn.run_jobflow(name='My jobflow', log_uri='s3n://css739/wordcount/jobflow_logs',steps=[step])

conn.describe_jobflow(jobid).state
Developer: 2042Labs | Project: S12-BigData | Lines: 22 | Source: emr_test.py

Example 5: EmrLauncher

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
class EmrLauncher(object):

    # Default constructor of the class.
    def __init__(self):
        try:
            self.zone_name = "ap-southeast-1"
            self.access_key = "xxxxxx"
            self.private_key = "xxxxxxx"
            self.ec2_keyname = "xxxxxxxx"
            self.base_bucket = "s3://emr-bucket/"
            self.bootstrap_script = "custom-bootstrap.sh"
            self.log_dir = "Logs"
            self.emr_status_wait = 20
            self.conn = ""
            self.cluster_name = "MyFirstEmrCluster"

            # Establishing EmrConnection
            self.conn = EmrConnection(self.access_key, self.private_key,
                                 region=RegionInfo(name=self.zone_name,
                                 endpoint=self.zone_name + '.elasticmapreduce.amazonaws.com'))


            self.log_bucket_name = self.base_bucket + self.log_dir
            self.bootstrap_script_name = self.base_bucket + self.bootstrap_script
        except Exception:
            logging.error("Establishing the EMR connection failed")

    def launch_emr_cluster(self, master_type, slave_type, num_instance, ami_version):
        try:
            #Custom Bootstrap step
            bootstrap_step = BootstrapAction("CustomBootStrap", self.bootstrap_script_name, None)

            #Modifying the HDFS block size to 256 MB (dfs.block.size takes bytes)
            block_size_conf = 'dfs.block.size=268435456'
            hadoop_config_params = ['-h', block_size_conf]
            hadoop_config_bootstrapper = BootstrapAction('hadoop-config',
                                               's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
                                               hadoop_config_params)
            #Bootstrapping Ganglia
            hadoop_monitor_bootstrapper = BootstrapAction('ganglia-config',
                                                's3://elasticmapreduce/bootstrap-actions/install-ganglia', '')

            #Bootstrapping Impala
            impala_install_params = ['--install-impala','--base-path', 's3://elasticmapreduce', '--impala-version', 'latest']
            bootstrap_impala_install_step = BootstrapAction("ImpalaInstall", "s3://elasticmapreduce/libs/impala/setup-impala",
                                                                                                impala_install_params)
            #Hive installation
            hive_install_step = InstallHiveStep()

            #Pig Installation
            pig_install_step = InstallPigStep()

            #Launching the cluster
            jobid = self.conn.run_jobflow(
                         self.cluster_name,
                         self.log_bucket_name,
                         bootstrap_actions=[hadoop_config_bootstrapper, hadoop_monitor_bootstrapper, bootstrap_step,
                                            bootstrap_impala_install_step],
                         ec2_keyname=self.ec2_keyname,
                         steps=[hive_install_step, pig_install_step],
                         keep_alive=True,
                         action_on_failure = 'CANCEL_AND_WAIT',
                         master_instance_type=master_type,
                         slave_instance_type=slave_type,
                         num_instances=num_instance,
                         ami_version=ami_version)

            #Enabling the termination protection
            self.conn.set_termination_protection(jobid, True)

            #Checking the state of EMR cluster
            state = self.conn.describe_jobflow(jobid).state
            while state not in (u'COMPLETED', u'SHUTTING_DOWN', u'FAILED', u'WAITING'):
                #sleeping to recheck for status.
                time.sleep(int(self.emr_status_wait))
                state = self.conn.describe_jobflow(jobid).state

            if state == u'SHUTTING_DOWN' or state == u'FAILED':
                logging.error("Launching EMR cluster failed")
                return "ERROR"

            #Check if the state is WAITING. Then launch the next steps
            if state == u'WAITING':
                #Finding the master node dns of EMR cluster
                master_dns = self.conn.describe_jobflow(jobid).masterpublicdnsname
                logging.info("Launched EMR Cluster Successfully")
                logging.info("Master node DNS of EMR " + master_dns)
                return "SUCCESS"
        except Exception:
            logging.error("Launching EMR cluster failed")
            return "FAILED"

    def main(self):
        try:
            master_type = 'm3.xlarge'
            slave_type = 'm3.xlarge'
            num_instance = 3
            ami_version = '2.4.8'

            emr_status = self.launch_emr_cluster(master_type, slave_type, num_instance, ami_version)
            if emr_status == 'SUCCESS':
                logging.info("Emr cluster launched successfully")
#......... part of the code omitted here .........
Developer: amalgjose | Project: MyExperiments | Lines: 103 | Source: EmrLauncher.py

Example 6

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
install_hive_step = step.InstallHiveStep(hive_versions='0.11.0.1')

# <codecell>

names1gram = step.HiveStep("fullNgramNamesBoto",
                           's3://wambia660fall2013/fullNgramNamesBoto.hql',
                           hive_args=['-d INPUT=s3://datasets.elasticmapreduce/ngrams/books/20090715/eng-us-all/1gram/', 
                                      '-d OUTPUT=s3://wambia660fall2013/output/'])

# <codecell>

jobid = emrcon.run_jobflow(name='Names 1gram boto v3', 
                           log_uri='s3://wambia660fall2013/logs/',
                           steps=[install_hive_step,
                                  names1gram], 
                           enable_debugging=True,
                           master_instance_type='m1.medium', 
                           slave_instance_type='m1.medium',
                           num_instances=4,
                           hadoop_version='1.0.3')

# <codecell>

print jobid

# <codecell>

status = emrcon.describe_jobflow(jobid)
print status.state

# <codecell>
Developer: winterProf | Project: BIA_Fall2013 | Lines: 33 | Source: Hive.py

Example 7: Reduce

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
from boto.emr.bootstrap_action import BootstrapAction
from boto.emr.connection import EmrConnection

# Description:
# BootstrapAction is an object representing a bootstrap action in Elastic Map
# Reduce (EMR), a script that gets run before the EMR job executes.

# initialize a bootstrap action
bootstrapSetup = BootstrapAction("Bootstrap Name",
                                 "s3://<my-bucket>/<my-bootstrap-action>",
                                 ["arg1=hello", "arg2=world"])

# initialize emr connection
emr_job = EmrConnection("<aws-access-key-id>", "<aws-secret-access-key>")

# run an EMR job flow with the defined bootstrap action; run_jobflow's
# first argument (the job flow name) is required
emr_job.run_jobflow(name="My Job Flow",
                    bootstrap_actions=[bootstrapSetup])
Developer: rushton | Project: botoexamples | Lines: 19 | Source: bootstrap_action.py

Example 8: main

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
def main(args):
	script_name = args[0]  # argv[0] is the script name
	for i in range(2, 3, 2):
		start_time = time.time()


		bucket_name = 'nlp-' + str(i).strip()

		emr_connection = EmrConnection()

		preprocessing_steps = []
		for j in xrange(12, 13, 12):
			preprocessing_steps.append(JarStep(name='prerocessing-' + str(i).strip(),
				jar='s3n://nlp-' + str(i).strip() + '/init/behemoth-core.jar',
				step_args=['com.digitalpebble.behemoth.util.CorpusGenerator',
					'-i', 's3n://nlp-' + str(i).strip() + '/' + str(j).strip() + '/texts',
					'-o', 's3n://nlp-' + str(i).strip() + '/' + str(j).strip() + '/bcorpus']))

		tika_steps = []
		for j in xrange(12, 13, 12):
			tika_steps.append(JarStep(name='tika-' + str(i).strip(),
				jar='s3n://nlp-' + str(i).strip() + '/init/behemoth-tika.jar',
				step_args=['com.digitalpebble.behemoth.tika.TikaDriver',
					'-i', 's3n://nlp-' + str(i).strip() + '/' + str(j).strip() + '/bcorpus',
					'-o', 's3n://nlp-' + str(i).strip() + '/' + str(j).strip() + '/tcorpus']))

		copy_jar_steps = []
		for j in xrange(12, 13, 12):
			copy_jar_steps.append(JarStep(name='copy-jar-' + str(i).strip(),
				jar='s3n://nlp-' + str(i).strip() + '/init/copy-to-hdfs.jar',
				step_args=['s3n://nlp-' + str(i).strip() + '/init/pipeline.pear',
					'/mnt/pipeline.pear']))

		uima_steps = []
		for j in xrange(12, 13, 12):
			uima_steps.append(JarStep(name='uima-' + str(i).strip(),
				jar='s3n://nlp-' + str(i).strip() + '/init/behemoth-uima.jar',
				step_args=['com.digitalpebble.behemoth.uima.UIMADriver',
					's3n://nlp-' + str(i).strip() + '/' + str(j).strip() + '/tcorpus',
					'/mnt/ucorpus',
					'/mnt/pipeline.pear']))

		# extract_result_steps is built in a part of the original script
		# omitted from this excerpt; an empty list keeps the snippet runnable
		extract_result_steps = []

		steps = []
		steps.extend(preprocessing_steps)
		steps.extend(tika_steps)
		steps.extend(copy_jar_steps)
		steps.extend(uima_steps)
		steps.extend(extract_result_steps)

		hadoop_params = ['-m','mapred.tasktracker.map.tasks.maximum=1',
		          '-m', 'mapred.child.java.opts=-Xmx10g']
		configure_hadoop_action = BootstrapAction('configure_hadoop', 's3://elasticmapreduce/bootstrap-actions/configure-hadoop', hadoop_params)

		jobid = emr_connection.run_jobflow(name='nlp-cloud-' + str(i).strip(),
			log_uri='s3://nlp-' + str(i).strip() + '/jobflow_logs',
			master_instance_type='m2.xlarge',
			slave_instance_type='m2.xlarge',
			num_instances=i,
			keep_alive=False,
			enable_debugging=False,
			bootstrap_actions=[configure_hadoop_action],
			hadoop_version='1.0.3',
			steps=steps)

		termination_statuses = [u'COMPLETED', u'FAILED', u'TERMINATED']
		while True:
			time.sleep(5)
			status = emr_connection.describe_jobflow(jobid) 
			if status.state in termination_statuses:
				print 'Job finished for %s nodes' % i
				break


		print time.time() - start_time, ' seconds elapsed'



	return True

if __name__ == '__main__':
	args = sys.argv
	if check_args(args):
		if main(args):
			print 'Work successfully finished'
			sys.exit()
		else:
			print 'Could not finish work'
			sys.exit(1)
	else:
		print USAGE_MESSAGE
	sys.exit(2)
Developer: valeter | Project: nlp-site | Lines: 93 | Source: run_cluster.py

Example 9: EmrConnection

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
emrcon = EmrConnection('<aws-access-key-id>', '<aws-secret-access-key>')

# <codecell>

# Using EMR's wordcount example
step = StreamingStep(name='My wordcount example',
	mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
	reducer='aggregate', 
	input='s3n://elasticmapreduce/samples/wordcount/input',
	output='s3n://wambia660fall2013/output/wordcount_output')

# <codecell>

jobid = emrcon.run_jobflow(name='Word Count Example', 
                           log_uri='s3://wambia660fall2013/logs',
                           steps=[step])

# <codecell>

print jobid

# <codecell>

import re

# <codecell>

# b is an S3 Bucket object created earlier in the original notebook
for word in b.list():
    keystring = str(word.key)
    if re.match('part-00000', keystring):  # the pattern comes first in re.match
Developer: todatamining | Project: db1 | Lines: 33 | Source: boto.py

Example 10: EMR

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
from boto.emr.connection import EmrConnection
from boto.emr.instance_group import InstanceGroup

# Description:
# InstanceGroup represents a group of instances (master, core, or task)
# that make up the nodes of an EMR (Elastic Map Reduce) job.

# build up our instance groups
namenode_instance_group = InstanceGroup(num_instances=1,
                                        role="MASTER",
                                        type="c1.xlarge",
                                        market="ON_DEMAND",
                                        name="MASTER_GROUP")

core_nodes = InstanceGroup(num_instances=20,
                           role="CORE",
                           type="c1.xlarge",
                           market="SPOT",
                           name="CORE_GROUP")

task_nodes = InstanceGroup(num_instances=10,
                           role="TASK",
                           type="c1.xlarge",
                           market="ON_DEMAND",
                           name="INITIAL_TASK_GROUP")

instance_groups = [namenode_instance_group, core_nodes, task_nodes]


# run the job
conn = EmrConnection("<aws-access-key-id>", "<aws-secret-access-key>")
conn.run_jobflow(name="My Job Flow",
                 instance_groups=instance_groups)


Developer: jayzeng | Project: botoexamples | Lines: 30 | Source: instance_group.py

Example 11: EmrConn

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
# EmrConnection() args: aws_access_key_id=None, aws_secret_access_key=None, ...
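# Note: credentials (a dict holding the AWS keys) and job_ts (a timestamp
# string used below to build unique S3 paths) are defined earlier in the
# original script.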
emr = EmrConnection(aws_access_key_id=credentials['aws_access_key_id'],
    aws_secret_access_key=credentials['aws_secret_access_key'])


print "logged in / made new emr?"
raw_input()

# Python files must be hosted on s3 and linked to for execution.
## [ ] TODO(emmagras): Check the docs for StreamingStep and understand
##     the arguments below.

## args for StreamingStep: name, mapper uri, reducer uri=None,
## combiner uri=None, action_on_failure='TERMINATE_JOB_FLOW', 
## cache_files=None, cache_archives=None, step_args=None,
## input=None, output=None, 
## jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar'
wc_step = StreamingStep('wc text', \
  's3://elasticmapreduce/samples/wordcount/wordSplitter.py', \
  'aggregate', input='s3://elasticmapreduce/samples/wordcount/input', \
  output='s3://wc-test-bucket/output/%s' % job_ts)
jf_id = emr.run_jobflow('wc jobflow', 's3n://emr-debug/%s' % job_ts, \
  steps=[wc_step])

while True:
  jf = emr.describe_jobflow(jf_id)
  print "[%s] %s" % (datetime.now().strftime("%Y-%m-%d %T"), jf.state)
  if jf.state == 'COMPLETED':
    break
  time.sleep(10)
Developer: emmagras | Project: emc-aws-data-project | Lines: 32 | Source: helloworld_emr.py

Example 12

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
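# Note: conn (an EmrConnection), step (the job step), and holder (an S3
# bucket name) are defined in the part of the original script elided above;
# the fragment below opens mid-way through the first instance_groups.append()
# call, whose opening lines are also elided.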
    market="ON_DEMAND",
    name="Main node"))
instance_groups.append(InstanceGroup(
    num_instances=6,
    role="CORE",
    type="m1.large",
    market="ON_DEMAND",
    name="node"))
instance_groups.append(InstanceGroup(
    num_instances=6,
    role="TASK",
	type="m1.large",
	    market="SPOT",
	    name="spot node",
	    bidprice="0.004"))

	job_id = conn.run_jobflow(
	'MyCluster',
	instance_groups=instance_groups,
	action_on_failure='TERMINATE_JOB_FLOW',
	keep_alive=False,
	enable_debugging=True,
	log_uri='s3://'+holder+'/log',
	hadoop_version=None,
	ami_version="2.4.9",
	steps=[step],
	bootstrap_actions=[],
	ec2_keyname='euireland1kp',
	visible_to_all_users=True,
	job_flow_role="EMR_EC2_DefaultRole",
	service_role="EMR_DefaultRole")
Developer: austineinstein | Project: Plagiarism-Detection-System | Lines: 33 | Source: runboto.py

Example 13: StreamingStep

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
# <codecell>

# Using EMR's wordcount example
step = StreamingStep(
    name="hwu9 wordcount example",
    # mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
    mapper="s3n://" + BUCKET + "/mapper.py",
    reducer="s3n://" + BUCKET + "/reducer.py",
    # input='s3n://elasticmapreduce/samples/wordcount/input',
    input="s3n://" + BUCKET + "/test.txt",
    output="s3n://" + BUCKET + "/output/wordcount_output",
)

# <codecell>

jobid = emrcon.run_jobflow(name="hwu9 Word Count Example", log_uri="s3://" + BUCKET + "/logs", steps=[step])

# <codecell>

print jobid
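
# Note: multiprocessing is imported, and check_status defined, earlier in
# the original script; check_status presumably polls describe_jobflow and
# puts the final job state on result_queue.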

result_queue = multiprocessing.Queue()
process = multiprocessing.Process(target=check_status, args=[emrcon, jobid, result_queue])
process.start()
result = result_queue.get()
# thread = threading.Thread(target=check_status)
# thread.start()
# thread.join()


# <codecell>
Developer: todatamining | Project: db1 | Lines: 33 | Source: myboto.py

Example 14: EmrClient

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
class EmrClient(object):


    # The Hadoop version to use
    HADOOP_VERSION = '1.0.3'

    # The AMI version to use
    AMI_VERSION = '2.4.7'
 
    # Interval to wait between polls to EMR cluster in seconds
    CLUSTER_OPERATION_RESULTS_POLLING_SECONDS = 10
 
    # Timeout for EMR creation and ramp up in seconds
    CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS = 60 * 30
 
    def __init__(self, region_name='us-east-1', aws_access_key_id=None, aws_secret_access_key=None):
 
        # If the access key is not specified, get it from the luigi config.cfg file
        if not aws_access_key_id:
            aws_access_key_id = luigi.configuration.get_config().get('aws', 'aws_access_key_id')
 
        if not aws_secret_access_key:
            aws_secret_access_key = luigi.configuration.get_config().get('aws', 'aws_secret_access_key')
 
 
        # Create the region in which to run
        region_endpoint = u'elasticmapreduce.%s.amazonaws.com' % (region_name)
        region = RegionInfo(name=region_name, endpoint=region_endpoint)
 
        self.emr_connection = EmrConnection(aws_access_key_id=aws_access_key_id,
                                            aws_secret_access_key=aws_secret_access_key,
                                            region=region)
 
    def launch_emr_cluster(self, cluster_name, log_uri, ec2_keyname=None, master_type='m1.small', core_type='m1.small', num_instances=2, hadoop_version='1.0.3', ami_version='2.4.7'):
 
        # TODO Remove
        # install_pig_step = InstallPigStep()
 
        jobflow_id = self.emr_connection.run_jobflow(name=cluster_name,
                              log_uri=log_uri,
                              ec2_keyname=ec2_keyname,
                              master_instance_type=master_type,
                              slave_instance_type=core_type,
                              num_instances=num_instances,
                              keep_alive=True,
                              enable_debugging=True,
                              hadoop_version=EmrClient.HADOOP_VERSION,
                              steps=[], 
                              ami_version=EmrClient.AMI_VERSION)
 
        # Log important information
        status = self.emr_connection.describe_jobflow(jobflow_id)

        logger.info('Creating new cluster %s with following details' % status.name)
        logger.info('jobflow ID:\t%s' % status.jobflowid)
        logger.info('Log URI:\t%s' % status.loguri)
        logger.info('Master Instance Type:\t%s' % status.masterinstancetype)
        
        # A cluster of size 1 does not have any slave instances
        if hasattr(status, 'slaveinstancetype'):
            logger.info('Slave Instance Type:\t%s' % status.slaveinstancetype)
        
        logger.info('Number of Instances:\t%s' % status.instancecount)
        logger.info('Hadoop Version:\t%s' % status.hadoopversion)
        logger.info('AMI Version:\t%s' % status.amiversion)
        logger.info('Keep Alive:\t%s' % status.keepjobflowalivewhennosteps)
 
        return self._poll_until_cluster_ready(jobflow_id)
 
 
    def add_pig_step(self, jobflow_id, pig_file, name='Pig Script', pig_versions='latest', pig_args=[]): 

        pig_step = PigStep(name=name,
                           pig_file=pig_file,
                           pig_versions=pig_versions,
                           pig_args=pig_args,
                           # action_on_failure='CONTINUE',
                       )

        self.emr_connection.add_jobflow_steps(jobflow_id, [pig_step])

        # Poll until the cluster is done working        
        return self._poll_until_cluster_ready(jobflow_id)


    def shutdown_emr_cluster(self, jobflow_id):
 
        self.emr_connection.terminate_jobflow(jobflow_id)
        return self._poll_until_cluster_shutdown(jobflow_id)
 
    def get_jobflow_id(self):
        # Get the id of the cluster that is WAITING for work
        return self.emr_connection.list_clusters(cluster_states=['WAITING']).clusters[0].id
 
    def get_master_dns(self):
        """
        Get the master node's public address
        """
        # Get the jobflow ID of the WAITING cluster
        jobflow_id = self.get_jobflow_id()
#......... part of the code omitted here .........
Developer: mbrio | Project: Luigi | Lines: 103 | Source: emr_client.py

Example 15: JarStep

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
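# Note: conne (the EmrConnection), step1 (an earlier step), and the dt and
# time module imports are defined in a part of the original script elided
# from this excerpt.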
step2 = JarStep(name='Run Hive Script',
                jar='s3://elasticmapreduce/libs/script-runner/script-runner.jar',
                step_args=['s3://elasticmapreduce/libs/hive/hive-script',
                           '--run-hive-script',
                           '--args',
                           '-f',
                           's3://dphive/mmhadooprollup.hql', '-d',
                           'INPUT=s3://mmlogs', '-d',
                           'OUTPUT=s3://dphiveoutput'])

jobname = 'MM Logs Jobflow %s' % dt.datetime.now()

jobid = conne.run_jobflow(name=jobname,
                          log_uri='s3://dphive/debug/',
                          ec2_keyname='dpaws',
                          master_instance_type='c1.medium',
                          slave_instance_type='c1.medium',
                          num_instances=3,
                          steps=[step1, step2])



while True:
	status = conne.describe_jobflow(jobid)
	if status.state == 'STARTING':
		time.sleep(10)
	elif status.state == 'RUNNING':
		time.sleep(10)
	elif status.state == 'WAITING':
		time.sleep(10)
	elif status.state == 'TERMINATED':
Developer: mobtownlabs | Project: ec2-user | Lines: 33 | Source: hadoop_job_runner.py


Note: the boto.emr.connection.EmrConnection.run_jobflow method examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are drawn from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors; consult each project's License before distributing or using the code. Do not reproduce this article without permission.