This article collects and summarizes typical usage examples of the Python method boto.emr.connection.EmrConnection.run_jobflow. If you are unsure what EmrConnection.run_jobflow does or how to use it, the curated code samples below should help. You can also read more about the class it belongs to, boto.emr.connection.EmrConnection.
The following shows 15 code examples of EmrConnection.run_jobflow, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
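As a quick orientation before the examples, here is a minimal sketch of the usual call pattern: create a connection, define a step, launch the job flow, then poll its state. The credentials and bucket name are placeholders, not values taken from any example below.

from boto.emr.connection import EmrConnection
from boto.emr.step import StreamingStep
import time

# Placeholder credentials and bucket; substitute your own values.
conn = EmrConnection('<aws-access-key-id>', '<aws-secret-access-key>')

step = StreamingStep(name='Word count',
                     mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
                     reducer='aggregate',
                     input='s3n://elasticmapreduce/samples/wordcount/input',
                     output='s3n://<my-bucket>/output/wordcount')

# run_jobflow returns the job flow id; every other EMR call takes it as a handle.
jobflow_id = conn.run_jobflow(name='Example job flow',
                              log_uri='s3://<my-bucket>/logs',
                              steps=[step])

# Poll until the job flow reaches a terminal state.
while conn.describe_jobflow(jobflow_id).state not in ('COMPLETED', 'FAILED', 'TERMINATED'):
    time.sleep(30)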
Example 1: start_hadoop_cluster

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
def start_hadoop_cluster(nodenum):
    try:
        hadoop_params = ['-m', 'mapred.tasktracker.map.tasks.maximum=1',
                         '-m', 'mapred.child.java.opts=-Xmx10g']
        configure_hadoop_action = BootstrapAction('configure_hadoop',
                                                  's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
                                                  hadoop_params)
        emr_connection = EmrConnection()
        bucket_name = "udk-bucket"

        steps = []
        copy_jar_step = JarStep(name='copy-jar',
                                jar='s3n://' + bucket_name + '/copy-to-hdfs.jar',
                                step_args=['s3n://' + bucket_name + '/pipeline.pear',
                                           '/mnt/pipeline.pear'])
        steps.append(copy_jar_step)

        jobflow_id = emr_connection.run_jobflow(name='udk',
                                                log_uri='s3://udk-bucket/jobflow_logs',
                                                master_instance_type='m2.xlarge',
                                                slave_instance_type='m2.xlarge',
                                                num_instances=nodenum,
                                                keep_alive=True,
                                                enable_debugging=False,
                                                bootstrap_actions=[configure_hadoop_action],
                                                hadoop_version='1.0.3',
                                                steps=steps)
        emr_connection.set_termination_protection(jobflow_id, True)
        return jobflow_id
    except Exception, e:
        return "none"
Example 2: run

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
def run(self):
    """Run the Hive job on the EMR cluster."""
    # copy the data source to a new object
    # (Hive deletes/moves the original)
    copy_s3_file(self.input_path, self.data_path)

    # and create the hive script
    self._generate_and_upload_hive_script()

    logger.info("Waiting {} seconds for S3 eventual consistency".format(
        self.s3_sync_wait_time))
    time.sleep(self.s3_sync_wait_time)

    # TODO more options like setting aws region
    conn = EmrConnection(self.aws_access_key_id,
                         self.aws_secret_access_key)

    setup_step = InstallHiveStep(self.hive_version)
    run_step = HiveStep(self.job_name, self.script_path)

    jobid = conn.run_jobflow(
        self.job_name,
        self.log_path,
        action_on_failure='CANCEL_AND_WAIT',
        master_instance_type=self.master_instance_type,
        slave_instance_type=self.slave_instance_type,
        ami_version=self.ami_version,
        num_instances=self.num_instances)

    conn.add_jobflow_steps(jobid, [setup_step, run_step])
    self._wait_for_job_to_complete(conn, jobid)

    logger.info("Output file is in: {0}".format(self.output_path))
Example 3: creating_a_connection

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
class EMR:
    def creating_a_connection(self):
        # Creating a connection
        from boto.emr.connection import EmrConnection
        self.conn = EmrConnection('', '')

    def creating_streaming_job(self):
        # Creating Streaming JobFlow Steps
        from boto.emr.step import StreamingStep
        self.step = StreamingStep(name='my bigdata task',
                                  mapper='s3n://eth-src/raw_to_stations.py',
                                  # mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
                                  reducer='s3n://eth-src/stations_to_features.py',
                                  # reducer='aggregate',
                                  input='s3n://eth-input/2007.csv',
                                  # input='s3n://elasticmapreduce/samples/wordcount/input',
                                  output='s3n://eth-middle/2007')

    def creating_jobflows(self):
        # Creating JobFlows
        # import boto.emr
        # self.conn = boto.emr.connect_to_region('eu-west-1')
        job_id = self.conn.run_jobflow(name='My jobflow',
                                       log_uri='s3://eth-log/jobflow_logs',
                                       master_instance_type='m3.xlarge',
                                       slave_instance_type='m1.large',
                                       num_instances=2,
                                       steps=[self.step],
                                       ami_version='3.3.1')
        status = self.conn.describe_jobflow(job_id)
        status.state

    def terminating_jobflows(self, job_id):
        # Terminating JobFlows
        # self.conn = boto.emr.connect_to_region('eu-west-1')
        self.conn.terminate_jobflow(job_id)
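A possible call sequence for the class above; note that creating_jobflows does not return the job flow id, so the id passed to terminating_jobflows (shown here as a placeholder) would have to be recorded separately:

emr = EMR()
emr.creating_a_connection()
emr.creating_streaming_job()
emr.creating_jobflows()
# Later, once the job flow id is known:
# emr.terminating_jobflows('j-XXXXXXXXXXXXX')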
Example 4: EmrConnection

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
from boto.emr.connection import EmrConnection
from boto.emr.step import StreamingStep
import boto
AWS_KEY = '<aws-access-key-id>'
AWS_SECRET = '<aws-secret-access-key>'
conn = EmrConnection(AWS_KEY, AWS_SECRET)
step = StreamingStep(name='My wordcount example',
mapper='s3n://css739/wordcount/bigramSplitter.py',
reducer='aggregate',
input='s3n://smalldata/wikipedia_titles.txt',
output='s3n://css739/wordcount/bigram_count_output2',
cache_files=['s3n://css739/wordcount/english_stoplist.py'])
jobid = conn.run_jobflow(name='My jobflow', log_uri='s3n://css739/wordcount/jobflow_logs',steps=[step])
conn.describe_jobflow(jobid).state
Example 5: EmrLauncher

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
class EmrLauncher(object):

    # Default constructor of the class.
    def __init__(self):
        try:
            self.zone_name = "ap-southeast-1"
            self.access_key = "xxxxxx"
            self.private_key = "xxxxxxx"
            self.ec2_keyname = "xxxxxxxx"
            self.base_bucket = "s3://emr-bucket/"
            self.bootstrap_script = "custom-bootstrap.sh"
            self.log_dir = "Logs"
            self.emr_status_wait = 20
            self.conn = ""
            self.cluster_name = "MyFirstEmrCluster"

            # Establishing EmrConnection
            self.conn = EmrConnection(self.access_key, self.private_key,
                                      region=RegionInfo(name=self.zone_name,
                                                        endpoint=self.zone_name + '.elasticmapreduce.amazonaws.com'))

            self.log_bucket_name = self.base_bucket + self.log_dir
            self.bootstrap_script_name = self.base_bucket + self.bootstrap_script
        except Exception:
            # The original snippet does not show an except clause for this try
            # block; this one is added as an assumption so the constructor is complete.
            logging.error("Failed to establish EmrConnection")

    def launch_emr_cluster(self, master_type, slave_type, num_instance, ami_version):
        try:
            # Custom bootstrap step
            bootstrap_step = BootstrapAction("CustomBootStrap", self.bootstrap_script_name, None)

            # Modifying the HDFS block size to 256 MB (dfs.block.size is given in bytes)
            block_size_conf = 'dfs.block.size=268435456'
            hadoop_config_params = ['-h', block_size_conf]
            hadoop_config_bootstrapper = BootstrapAction('hadoop-config',
                                                         's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
                                                         hadoop_config_params)

            # Bootstrapping Ganglia
            hadoop_monitor_bootstrapper = BootstrapAction('ganglia-config',
                                                          's3://elasticmapreduce/bootstrap-actions/install-ganglia', '')

            # Bootstrapping Impala
            impala_install_params = ['--install-impala', '--base-path', 's3://elasticmapreduce',
                                     '--impala-version', 'latest']
            bootstrap_impala_install_step = BootstrapAction("ImpalaInstall",
                                                            "s3://elasticmapreduce/libs/impala/setup-impala",
                                                            impala_install_params)

            # Hive installation
            hive_install_step = InstallHiveStep()

            # Pig installation
            pig_install_step = InstallPigStep()

            # Launching the cluster
            jobid = self.conn.run_jobflow(
                self.cluster_name,
                self.log_bucket_name,
                bootstrap_actions=[hadoop_config_bootstrapper, hadoop_monitor_bootstrapper,
                                   bootstrap_step, bootstrap_impala_install_step],
                ec2_keyname=self.ec2_keyname,
                steps=[hive_install_step, pig_install_step],
                keep_alive=True,
                action_on_failure='CANCEL_AND_WAIT',
                master_instance_type=master_type,
                slave_instance_type=slave_type,
                num_instances=num_instance,
                ami_version=ami_version)

            # Enabling termination protection
            self.conn.set_termination_protection(jobid, True)

            # Checking the state of the EMR cluster
            state = self.conn.describe_jobflow(jobid).state
            while state not in (u'COMPLETED', u'SHUTTING_DOWN', u'FAILED', u'WAITING'):
                # sleeping before rechecking the status
                time.sleep(int(self.emr_status_wait))
                state = self.conn.describe_jobflow(jobid).state

            if state in (u'SHUTTING_DOWN', u'FAILED'):
                logging.error("Launching EMR cluster failed")
                return "ERROR"

            # Check if the state is WAITING. Then launch the next steps
            if state == u'WAITING':
                # Finding the master node DNS of the EMR cluster
                master_dns = self.conn.describe_jobflow(jobid).masterpublicdnsname
                logging.info("Launched EMR Cluster Successfully")
                logging.info("Master node DNS of EMR " + master_dns)
                return "SUCCESS"
        except:
            logging.error("Launching EMR cluster failed")
            return "FAILED"

    def main(self):
        try:
            master_type = 'm3.xlarge'
            slave_type = 'm3.xlarge'
            num_instance = 3
            ami_version = '2.4.8'

            emr_status = self.launch_emr_cluster(master_type, slave_type, num_instance, ami_version)
            if emr_status == 'SUCCESS':
                logging.info("Emr cluster launched successfully")
# ......... remainder of this example omitted .........
Example 6:

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
install_hive_step = step.InstallHiveStep(hive_versions='0.11.0.1')
# <codecell>
names1gram = step.HiveStep("fullNgramNamesBoto",
's3://wambia660fall2013/fullNgramNamesBoto.hql',
hive_args=['-d INPUT=s3://datasets.elasticmapreduce/ngrams/books/20090715/eng-us-all/1gram/',
'-d OUTPUT=s3://wambia660fall2013/output/'])
# <codecell>
jobid = emrcon.run_jobflow(name='Names 1gram boto v3',
log_uri='s3://wambia660fall2013/logs/',
steps=[install_hive_step,
names1gram],
enable_debugging=True,
master_instance_type='m1.medium',
slave_instance_type='m1.medium',
num_instances=4,
hadoop_version='1.0.3')
# <codecell>
print jobid
# <codecell>
status = emrcon.describe_jobflow(jobid)
print status.state
# <codecell>
Example 7: Reduce

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
from boto.emr.bootstrap_action import BootstrapAction
from boto.emr.connection import EmrConnection
# Description:
# BootstrapAction is an object representing a bootstrap action in Elastic
# MapReduce (EMR), a script that gets run before the EMR job executes.
# initialize a bootstrap action
bootstrapSetup = BootstrapAction("Bootstrap Name",
"s3://<my-bucket>/<my-bootstrap-action>",
["arg1=hello", "arg2=world"])
# initialize emr connection
emr_job = EmrConnection("<aws-access-key-id>", "<aws-secret-access-key>")
# run emr job flow with defined bootstrap action
# (run_jobflow also requires a job flow name as its first argument)
emr_job.run_jobflow(name="<job-flow-name>", bootstrap_actions=[bootstrapSetup])
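For a more concrete variant, the configure-hadoop bootstrap action used in Examples 1 and 8 passes Hadoop settings through repeated -m flags. A short sketch along the same lines; the credentials, bucket and job flow name are placeholders:

from boto.emr.bootstrap_action import BootstrapAction
from boto.emr.connection import EmrConnection

# Each '-m' pair sets one mapred-site property before the cluster starts.
hadoop_params = ['-m', 'mapred.tasktracker.map.tasks.maximum=1',
                 '-m', 'mapred.child.java.opts=-Xmx10g']
configure_hadoop = BootstrapAction('configure_hadoop',
                                   's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
                                   hadoop_params)

conn = EmrConnection('<aws-access-key-id>', '<aws-secret-access-key>')
jobflow_id = conn.run_jobflow(name='Bootstrap example',
                              log_uri='s3://<my-bucket>/logs',
                              bootstrap_actions=[configure_hadoop],
                              steps=[])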
Example 8: main

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
def main(args):
    script_name = args
    for i in range(2, 3, 2):
        start_time = time.time()
        bucket_name = 'nlp-' + str(i).strip()
        emr_connection = EmrConnection()

        preprocessing_steps = []
        for j in xrange(12, 13, 12):
            preprocessing_steps.append(JarStep(name='preprocessing-' + str(i).strip(),
                                               jar='s3n://nlp-' + str(i).strip() + '/init/behemoth-core.jar',
                                               step_args=['com.digitalpebble.behemoth.util.CorpusGenerator',
                                                          '-i', 's3n://nlp-' + str(i).strip() + '/' + str(j).strip() + '/texts',
                                                          '-o', 's3n://nlp-' + str(i).strip() + '/' + str(j).strip() + '/bcorpus']))

        tika_steps = []
        for j in xrange(12, 13, 12):
            tika_steps.append(JarStep(name='tika-' + str(i).strip(),
                                      jar='s3n://nlp-' + str(i).strip() + '/init/behemoth-tika.jar',
                                      step_args=['com.digitalpebble.behemoth.tika.TikaDriver',
                                                 '-i', 's3n://nlp-' + str(i).strip() + '/' + str(j).strip() + '/bcorpus',
                                                 '-o', 's3n://nlp-' + str(i).strip() + '/' + str(j).strip() + '/tcorpus']))

        copy_jar_steps = []
        for j in xrange(12, 13, 12):
            copy_jar_steps.append(JarStep(name='copy-jar-' + str(i).strip(),
                                          jar='s3n://nlp-' + str(i).strip() + '/init/copy-to-hdfs.jar',
                                          step_args=['s3n://nlp-' + str(i).strip() + '/init/pipeline.pear',
                                                     '/mnt/pipeline.pear']))

        uima_steps = []
        for j in xrange(12, 13, 12):
            uima_steps.append(JarStep(name='uima-' + str(i).strip(),
                                      jar='s3n://nlp-' + str(i).strip() + '/init/behemoth-uima.jar',
                                      step_args=['com.digitalpebble.behemoth.uima.UIMADriver',
                                                 's3n://nlp-' + str(i).strip() + '/' + str(j).strip() + '/tcorpus',
                                                 '/mnt/ucorpus',
                                                 '/mnt/pipeline.pear']))

        steps = []
        steps.extend(preprocessing_steps)
        steps.extend(tika_steps)
        steps.extend(copy_jar_steps)
        steps.extend(uima_steps)
        steps.extend(extract_result_steps)  # extract_result_steps is not defined in this snippet

        hadoop_params = ['-m', 'mapred.tasktracker.map.tasks.maximum=1',
                         '-m', 'mapred.child.java.opts=-Xmx10g']
        configure_hadoop_action = BootstrapAction('configure_hadoop',
                                                  's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
                                                  hadoop_params)

        jobid = emr_connection.run_jobflow(name='nlp-cloud-' + str(i).strip(),
                                           log_uri='s3://nlp-' + str(i).strip() + '/jobflow_logs',
                                           master_instance_type='m2.xlarge',
                                           slave_instance_type='m2.xlarge',
                                           num_instances=i,
                                           keep_alive=False,
                                           enable_debugging=False,
                                           bootstrap_actions=[configure_hadoop_action],
                                           hadoop_version='1.0.3',
                                           steps=steps)

        termination_statuses = [u'COMPLETED', u'FAILED', u'TERMINATED']
        while True:
            time.sleep(5)
            status = emr_connection.describe_jobflow(jobid)
            if status.state in termination_statuses:
                print 'Job finished for %s nodes' % i
                break
        print time.time() - start_time, ' seconds elapsed'
    return True

if (__name__ == '__main__'):
    args = sys.argv
    if (check_args(args)):
        if (main(args)):
            print 'Work successfully finished'
            sys.exit()
        else:
            print 'Could not finish work'
            sys.exit(1)
    else:
        print USAGE_MESSAGE
        sys.exit(2)
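check_args and USAGE_MESSAGE are referenced above but defined elsewhere in the original script. A minimal sketch of what they might look like, purely as an assumption about the omitted code:

USAGE_MESSAGE = 'Usage: python <script.py> <script-name>'

def check_args(args):
    # Expect exactly one positional argument besides the program name.
    return len(args) == 2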
Example 9: EmrConnection

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
#emrcon = EmrConnection('<aws access key>', '<aws secret key>')
emrcon = EmrConnection('<aws-access-key-id>', '<aws-secret-access-key>')
# <codecell>
# Using EMR's wordcount example
step = StreamingStep(name='My wordcount example',
mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
reducer='aggregate',
input='s3n://elasticmapreduce/samples/wordcount/input',
output='s3n://wambia660fall2013/output/wordcount_output')
# <codecell>
jobid = emrcon.run_jobflow(name='Word Count Example',
log_uri='s3://wambia660fall2013/logs',
steps=[step])
# <codecell>
print jobid
# <codecell>
import re
# <codecell>
# b is an S3 bucket handle created outside this snippet
for word in b.list():
    keystring = str(word.key)
    if re.match(keystring, 'part-00000'):
        pass  # loop body truncated in the original example
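The bucket handle b used in the loop above comes from code outside this snippet. Assuming it is a boto S3 bucket, it would be obtained roughly like this (the bucket name is taken from the output path above):

from boto.s3.connection import S3Connection

s3 = S3Connection('<aws-access-key-id>', '<aws-secret-access-key>')
b = s3.get_bucket('wambia660fall2013')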
Example 10: EMR

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
# InstanceGroup objects describe the nodes of an EMR (Elastic MapReduce) job.
# build up our instance groups
namenode_instance_group = InstanceGroup(num_instances=1,
role="MASTER",
type="c1.xlarge",
market="ON_DEMAND",
name="MASTER_GROUP")
core_nodes = InstanceGroup(num_instances=20,
role="CORE",
type="c1.xlarge",
market="SPOT",
name="CORE_GROUP")
task_nodes = InstanceGroup(num_instances=10,
role="TASK",
type="c1.xlarge",
market="ON_DEMAND",
name="INITIAL_TASK_GROUP")
instance_groups = [namenode_instance_group, core_nodes, task_nodes]
# run the job
conn = EmrConnection("<aws-access-key-id>", "<aws-secret-access-key>")
conn.run_jobflow(name="My Job Flow",
instance_groups=instance_groups)
Example 11: EmrConn

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
# EmrConnection() args: aws_access_key_id=None, aws_secret_access_key=None, ...
emr = EmrConnection(aws_access_key_id= credentials['aws_access_key_id'],\
aws_secret_access_key = credentials['aws_secret_access_key'])
print "logged in / made new emr?"
raw_input()
# Python files must be hosted on s3 and linked to for execution.
## [ ] TODO(emmagras): Check the docs for StreamingStep and understand
## the arguments below.
## args for StreamingStep: name, mapper uri, reducer uri=None,
## combiner uri=None, action_on_failure='TERMINATE_JOB_FLOW',
## cache_files=None, cache_archives=None, step_args=None,
## input=None, output=None,
## jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar'
wc_step = StreamingStep('wc text', \
's3://elasticmapreduce/samples/wordcount/wordSplitter.py', \
'aggregate', input='s3://elasticmapreduce/samples/wordcount/input', \
output='s3://wc-test-bucket/output/%s' % job_ts)
jf_id = emr.run_jobflow('wc jobflow', 's3n://emr-debug/%s' % job_ts, \
steps=[wc_step])
while True:
    jf = emr.describe_jobflow(jf_id)
    print "[%s] %s" % (datetime.now().strftime("%Y-%m-%d %T"), jf.state)
    if jf.state == 'COMPLETED':
        break
    time.sleep(10)
Example 12:

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
market="ON_DEMAND",
name="Main node"))
instance_groups.append(InstanceGroup(
num_instances=6,
role="CORE",
type="m1.large",
market="ON_DEMAND",
name="node"))
instance_groups.append(InstanceGroup(
num_instances=6,
role="TASK",
type="m1.large",
market="SPOT",
name="spot node",
bidprice="0.004"))
job_id = conn.run_jobflow(
'MyCluster',
instance_groups=instance_groups,
action_on_failure='TERMINATE_JOB_FLOW',
keep_alive=False,
enable_debugging=True,
log_uri='s3://'+holder+'/log',
hadoop_version=None,
ami_version="2.4.9",
steps=[step],
bootstrap_actions=[],
ec2_keyname='euireland1kp',
visible_to_all_users=True,
job_flow_role="EMR_EC2_DefaultRole",
service_role="EMR_DefaultRole")
Example 13: StreamingStep

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
# <codecell>
# Using EMR's wordcount example
step = StreamingStep(
name="hwu9 wordcount example",
# mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
mapper="s3n://" + BUCKET + "/mapper.py",
reducer="s3n://" + BUCKET + "/reducer.py",
# input='s3n://elasticmapreduce/samples/wordcount/input',
input="s3n://" + BUCKET + "/test.txt",
output="s3n://" + BUCKET + "/output/wordcount_output",
)
# <codecell>
jobid = emrcon.run_jobflow(name="hwu9 Word Count Example", log_uri="s3://" + BUCKET + "/logs", steps=[step])
# <codecell>
print jobid
result_queue = multiprocessing.Queue()
process = multiprocessing.Process(target=check_status, args=[emrcon, jobid, result_queue])
process.start()
result = result_queue.get()
# thread = threading.Thread(target=check_status)
# thread.start()
# thread.join()
# <codecell>
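check_status is passed to multiprocessing.Process above but is not defined in this snippet. A sketch of a compatible poller, assuming it reports the final job flow state back through the queue:

import time

def check_status(conn, jobid, result_queue, poll_seconds=30):
    # Poll the job flow until it reaches a terminal state, then report it.
    while True:
        state = conn.describe_jobflow(jobid).state
        if state in ('COMPLETED', 'FAILED', 'TERMINATED'):
            result_queue.put(state)
            return
        time.sleep(poll_seconds)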
Example 14: EmrClient

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
class EmrClient(object):

    # The Hadoop version to use
    HADOOP_VERSION = '1.0.3'

    # The AMI version to use
    AMI_VERSION = '2.4.7'

    # Interval to wait between polls to EMR cluster in seconds
    CLUSTER_OPERATION_RESULTS_POLLING_SECONDS = 10

    # Timeout for EMR creation and ramp up in seconds
    CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS = 60 * 30

    def __init__(self, region_name='us-east-1', aws_access_key_id=None, aws_secret_access_key=None):
        # If the access key is not specified, get it from the luigi config.cfg file
        if not aws_access_key_id:
            aws_access_key_id = luigi.configuration.get_config().get('aws', 'aws_access_key_id')
        if not aws_secret_access_key:
            aws_secret_access_key = luigi.configuration.get_config().get('aws', 'aws_secret_access_key')

        # Create the region in which to run
        region_endpoint = u'elasticmapreduce.%s.amazonaws.com' % (region_name)
        region = RegionInfo(name=region_name, endpoint=region_endpoint)

        self.emr_connection = EmrConnection(aws_access_key_id=aws_access_key_id,
                                            aws_secret_access_key=aws_secret_access_key,
                                            region=region)

    def launch_emr_cluster(self, cluster_name, log_uri, ec2_keyname=None, master_type='m1.small',
                           core_type='m1.small', num_instances=2, hadoop_version='1.0.3', ami_version='2.4.7'):

        # TODO Remove
        # install_pig_step = InstallPigStep()

        jobflow_id = self.emr_connection.run_jobflow(name=cluster_name,
                                                     log_uri=log_uri,
                                                     ec2_keyname=ec2_keyname,
                                                     master_instance_type=master_type,
                                                     slave_instance_type=core_type,
                                                     num_instances=num_instances,
                                                     keep_alive=True,
                                                     enable_debugging=True,
                                                     hadoop_version=EmrClient.HADOOP_VERSION,
                                                     steps=[],
                                                     ami_version=EmrClient.AMI_VERSION)

        # Log important information
        status = self.emr_connection.describe_jobflow(jobflow_id)

        logger.info('Creating new cluster %s with following details' % status.name)
        logger.info('jobflow ID:\t%s' % status.jobflowid)
        logger.info('Log URI:\t%s' % status.loguri)
        logger.info('Master Instance Type:\t%s' % status.masterinstancetype)

        # A cluster of size 1 does not have any slave instances
        if hasattr(status, 'slaveinstancetype'):
            logger.info('Slave Instance Type:\t%s' % status.slaveinstancetype)

        logger.info('Number of Instances:\t%s' % status.instancecount)
        logger.info('Hadoop Version:\t%s' % status.hadoopversion)
        logger.info('AMI Version:\t%s' % status.amiversion)
        logger.info('Keep Alive:\t%s' % status.keepjobflowalivewhennosteps)

        return self._poll_until_cluster_ready(jobflow_id)

    def add_pig_step(self, jobflow_id, pig_file, name='Pig Script', pig_versions='latest', pig_args=[]):
        pig_step = PigStep(name=name,
                           pig_file=pig_file,
                           pig_versions=pig_versions,
                           pig_args=pig_args,
                           # action_on_failure='CONTINUE',
                           )
        self.emr_connection.add_jobflow_steps(jobflow_id, [pig_step])

        # Poll until the cluster is done working
        return self._poll_until_cluster_ready(jobflow_id)

    def shutdown_emr_cluster(self, jobflow_id):
        self.emr_connection.terminate_jobflow(jobflow_id)
        return self._poll_until_cluster_shutdown(jobflow_id)

    def get_jobflow_id(self):
        # Get the id of the cluster that is WAITING for work
        return self.emr_connection.list_clusters(cluster_states=['WAITING']).clusters[0].id

    def get_master_dns(self):
        """
        Get the master node's public address
        """
        # Get the jobflow ID
        jobflow_id = self.get_jobflow_id()
# ......... remainder of this example omitted .........
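_poll_until_cluster_ready and _poll_until_cluster_shutdown live in the omitted part of the class. A sketch of what the first one plausibly does, reusing the polling and timeout constants defined above; the exact behaviour is an assumption, not taken from the original source:

    def _poll_until_cluster_ready(self, jobflow_id):
        # Wait until the job flow is idle (WAITING), or give up after the timeout.
        deadline = time.time() + EmrClient.CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS
        while time.time() < deadline:
            state = self.emr_connection.describe_jobflow(jobflow_id).state
            if state == u'WAITING':
                return True
            if state in (u'FAILED', u'TERMINATED', u'SHUTTING_DOWN'):
                return False
            time.sleep(EmrClient.CLUSTER_OPERATION_RESULTS_POLLING_SECONDS)
        return False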
Example 15: JarStep

# Required import: from boto.emr.connection import EmrConnection [as alias]
# Or: from boto.emr.connection.EmrConnection import run_jobflow [as alias]
step2 = JarStep(name='Run Hive Script',
jar='s3://elasticmapreduce/libs/script-runner/script-runner.jar',
step_args=['s3://elasticmapreduce/libs/hive/hive-script',
'--run-hive-script',
'--args',
'-f',
's3://dphive/mmhadooprollup.hql', '-d',
'INPUT=s3://mmlogs', '-d',
'OUTPUT=s3://dphiveoutput'])
jobname = 'MM Logs Jobflow %s' %dt.datetime.now()
jobid = conne.run_jobflow(name=jobname,
log_uri='s3://dphive/debug/',
ec2_keyname='dpaws',
master_instance_type='c1.medium',
slave_instance_type='c1.medium',
num_instances=3,
steps=[step1, step2])
while True:
    status = conne.describe_jobflow(jobid)
    if status.state == 'STARTING':
        time.sleep(10)
    elif status.state == 'RUNNING':
        time.sleep(10)
    elif status.state == 'WAITING':
        time.sleep(10)
    elif status.state == 'TERMINATED':