本文整理匯總了Python中boto.emr.connection.EmrConnection類的典型用法代碼示例。如果您正苦於以下問題：Python EmrConnection類的具體用法？Python EmrConnection怎麼用？Python EmrConnection使用的例子？那麼，這裡精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了EmrConnection類的15個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: add_steps
def add_steps(cluster_id, key):
    """Append the steps built from ``key`` to an existing EMR job flow.

    Args:
        cluster_id: Identifier of the job flow that receives the steps.
        key: Value passed to ``get_steps`` as both of its arguments.

    Returns:
        True when the steps were submitted, False when anything raised.
    """
    try:
        emr_connection = EmrConnection()
        emr_connection.add_jobflow_steps(cluster_id, get_steps(key, key))
        return True
    except Exception:
        # Best-effort contract: callers only receive a success flag, so
        # every failure is reported as False instead of propagating.
        return False
示例2: start_hadoop_cluster
def start_hadoop_cluster(nodenum):
    """Launch a keep-alive EMR job flow named 'udk' with ``nodenum`` instances.

    The job flow is started with a ``configure-hadoop`` bootstrap action and
    a single 'copy-jar' step, then termination protection is switched on.

    Args:
        nodenum: Value passed to ``run_jobflow`` as ``num_instances``.

    Returns:
        The job flow id returned by ``run_jobflow``, or the string "none"
        when any part of the launch raised.
    """
    bucket_name = "udk-bucket"
    s3n_prefix = 's3n://' + bucket_name
    try:
        hadoop_params = ['-m', 'mapred.tasktracker.map.tasks.maximum=1',
                         '-m', 'mapred.child.java.opts=-Xmx10g']
        configure_hadoop_action = BootstrapAction(
            'configure_hadoop',
            's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
            hadoop_params)
        emr_connection = EmrConnection()
        copy_jar_step = JarStep(name='copy-jar',
                                jar=s3n_prefix + '/copy-to-hdfs.jar',
                                step_args=[s3n_prefix + '/pipeline.pear',
                                           '/mnt/pipeline.pear'])
        jobflow_id = emr_connection.run_jobflow(
            name='udk',
            # Derived from bucket_name so the log location follows the
            # same bucket as the jar and the pipeline archive.
            log_uri='s3://' + bucket_name + '/jobflow_logs',
            master_instance_type='m2.xlarge',
            slave_instance_type='m2.xlarge',
            num_instances=nodenum,
            keep_alive=True,
            enable_debugging=False,
            bootstrap_actions=[configure_hadoop_action],
            hadoop_version='1.0.3',
            steps=[copy_jar_step])
        emr_connection.set_termination_protection(jobflow_id, True)
        return jobflow_id
    except Exception:
        # Callers receive the sentinel string "none" for any failure.
        return "none"
示例3: run
def run(self):
    """Execute the Hive job on an EMR cluster and wait for it to finish.

    Copies the input to ``self.data_path``, uploads the Hive script, pauses
    for ``self.s3_sync_wait_time`` seconds, starts a job flow, adds the
    Hive install and run steps, and then waits for the job to complete.
    """
    # Hive deletes/moves its input, so it is given a copy of the data source.
    copy_s3_file(self.input_path, self.data_path)
    # Generate the Hive script and upload it.
    self._generate_and_upload_hive_script()

    wait_seconds = self.s3_sync_wait_time
    logger.info(
        "Waiting {} seconds for S3 eventual consistency".format(wait_seconds))
    time.sleep(wait_seconds)

    # TODO more options like setting aws region
    emr = EmrConnection(self.aws_access_key_id, self.aws_secret_access_key)

    hive_steps = [
        InstallHiveStep(self.hive_version),
        HiveStep(self.job_name, self.script_path),
    ]

    launch_options = dict(
        action_on_failure='CANCEL_AND_WAIT',
        master_instance_type=self.master_instance_type,
        slave_instance_type=self.slave_instance_type,
        ami_version=self.ami_version,
        num_instances=self.num_instances,
    )
    jobflow_id = emr.run_jobflow(self.job_name, self.log_path, **launch_options)

    emr.add_jobflow_steps(jobflow_id, hive_steps)
    self._wait_for_job_to_complete(emr, jobflow_id)

    logger.info("Output file is in: {0}".format(self.output_path))
示例4: get_cluster_status
def get_cluster_status(cluster_id):
    """Return the state of an EMR job flow.

    Args:
        cluster_id: Identifier passed to ``describe_jobflow``.

    Returns:
        The ``state`` attribute of the described job flow, or the string
        "none" when no job flow is returned or any error is raised.
    """
    try:
        emr_connection = EmrConnection()
        flow = emr_connection.describe_jobflow(cluster_id)
        if flow is None:
            return "none"
        return flow.state
    except Exception:
        # Any lookup failure is reported with the same sentinel as a
        # missing job flow.
        return "none"
示例5: terminate
def terminate(cluster_id):
    """Disable termination protection on a job flow and terminate it.

    Args:
        cluster_id: Identifier of the job flow to shut down.

    Returns:
        True when both calls succeeded; False when anything raised, in
        which case the exception is printed.
    """
    try:
        emr_connection = EmrConnection()
        # Protection is switched off before the terminate request is sent.
        emr_connection.set_termination_protection(cluster_id, False)
        emr_connection.terminate_jobflow(cluster_id)
        return True
    except Exception as e:
        print(e)
        return False
示例6: create_data_source_variable
def create_data_source_variable(cluster_id, cr):
    """Write a data-source configuration file for an EMR cluster.

    Reads the cluster's master public DNS name and writes
    ``emr_default.conf`` in the current working directory, one assignment
    per line, for example::

        HADOOP_DATA_SOURCE_NAME="emr_data_source"
        HADOOP_DATA_SOURCE_DISTRO="Amazon EMR5"
        HADOOP_DATA_SOURCE_HOST="<master dns>"
        HADOOP_DATA_SOURCE_PORT="8020"
        HADOOP_DATA_SOURCE_USER="hdfs"
        HADOOP_DATA_SOURCE_GROUP="hadoop"
        HADOOP_DATA_SOURCE_JT_HOST="<master dns>"
        HADOOP_DATA_SOURCE_JT_PORT="8032"
        EMAIL="avalanche_<n>.alpinenow.com"
        CONNECTION_PARAMETERS='[{"key": "...", "value": "..."}, ...]'

    Args:
        cluster_id: ID of an EMR cluster.
        cr: Object exposing ``get_config(name)``; the settings
            ``aws_access_key``, ``aws_secret_key``, ``aws_region`` and
            ``emr_cluster_name`` are read from it.

    Returns:
        True once the file has been written.
    """
    # Local import: json is only needed to serialise the parameters below.
    import json

    aws_region = cr.get_config("aws_region")
    conn = EmrConnection(
        cr.get_config("aws_access_key"),
        cr.get_config("aws_secret_key"),
        region=RegionInfo(
            name=aws_region,
            endpoint=aws_region + ".elasticmapreduce.amazonaws.com"))
    emr_cluster = conn.describe_cluster(cluster_id)
    master_dns_hostname = emr_cluster.masterpublicdnsname

    # Build up connection parameters
    conn_params = [
        {"key": "mapreduce.jobhistory.address",
         "value": "{0}:10020".format(master_dns_hostname)},
        {"key": "mapreduce.jobhistory.webapp.address",
         "value": "{0}:19888".format(master_dns_hostname)},
        {"key": "yarn.app.mapreduce.am.staging-dir",
         "value": "/user"},
        {"key": "yarn.resourcemanager.admin.address",
         "value": "{0}:8033".format(master_dns_hostname)},
        {"key": "yarn.resourcemanager.scheduler.address",
         "value": "{0}:8030".format(master_dns_hostname)},
    ]

    # NOTE(review): the generated EMAIL value contains no "@"; confirm
    # whether "avalanche_<n>@alpinenow.com" was intended.
    email_line = 'EMAIL="avalanche_{0}.alpinenow.com"'.format(
        random.randint(1, 99999))

    lines = [
        'HADOOP_DATA_SOURCE_NAME="{0}"'.format(
            cr.get_config("emr_cluster_name")),
        'HADOOP_DATA_SOURCE_DISTRO="{0}"'.format("Amazon EMR5"),
        'HADOOP_DATA_SOURCE_HOST="{0}"'.format(master_dns_hostname),
        'HADOOP_DATA_SOURCE_PORT="8020"',
        'HADOOP_DATA_SOURCE_USER="hdfs"',
        'HADOOP_DATA_SOURCE_GROUP="hadoop"',
        'HADOOP_DATA_SOURCE_JT_HOST="{0}"'.format(master_dns_hostname),
        'HADOOP_DATA_SOURCE_JT_PORT="8032"',
        email_line,
        # JSON uses double quotes internally, so the value is wrapped in
        # single quotes to keep the assignment well-formed.
        "CONNECTION_PARAMETERS='{0}'".format(json.dumps(conn_params)),
    ]

    with open("emr_default.conf", "w") as f:
        # Every assignment gets its own line, including the last two.
        f.write("\n".join(lines) + "\n")
    return True
示例7: __init__
def __init__(self, team_id, access_key, secret_key,
             bucket='cs144students'):
    """
    (constructor)

    Builds a Rankmaniac instance for one team, opening the S3 and EMR
    connections with the supplied credentials.

    Arguments:
        team_id     <str>   the team identifier, which may differ
                            slightly from the actual team name.
        access_key  <str>   the AWS access key identifier.
        secret_key  <str>   the AWS secret access key.

    Keyword arguments:
        bucket      <str>   the S3 bucket name.
    """

    self._s3_bucket = bucket
    self._s3_conn = S3Connection(access_key, secret_key)

    # The EMR connection is pinned to the class-level default region.
    emr_region = RegionInfo(connection=None,
                            name=self.DefaultRegionName,
                            endpoint=self.DefaultRegionEndpoint)
    self._emr_conn = EmrConnection(access_key, secret_key,
                                   region=emr_region)

    self.team_id = team_id
    self.job_id = None

    self._reset()
    # Assigned after _reset(), matching the original statement order.
    self._num_instances = 1
示例8: __init__
def __init__(self, prop):
    '''Store the properties, open the EMR connection and set initial state.

    The connection is created from ``prop.ec2.key`` and ``prop.ec2.secret``.
    '''
    self.prop = prop

    ec2_settings = self.prop.ec2
    self.conn = EmrConnection(ec2_settings.key, ec2_settings.secret)

    # Initial bookkeeping values.
    self.jobid = None
    self.retry = self.level = 0
    self.last_update = -1
示例9: get_internal_ips_from_emr
def get_internal_ips_from_emr(cluster_id, cr):
    """
    Collect the private IP addresses of the instances that the EMR
    ``list_instances`` call reports for ``cluster_id``.

    ``cr`` supplies ``aws_access_key``, ``aws_secret_key`` and
    ``aws_region`` through ``get_config``.
    """
    access_key = cr.get_config("aws_access_key")
    secret_key = cr.get_config("aws_secret_key")
    region_name = cr.get_config("aws_region")

    # Open connection to EMR in the configured region
    emr_region = RegionInfo(
        name=region_name,
        endpoint=region_name + ".elasticmapreduce.amazonaws.com")
    emr = EmrConnection(access_key, secret_key, region=emr_region)

    # One private address per instance in the list_instances response
    listed = emr.list_instances(cluster_id).instances
    return [node.privateipaddress for node in listed]
示例10: __init__
def __init__(self, spec_filename="spec.json"):
    """Initialise the runtime and open its S3 and EMR connections.

    After the base-class initialiser has run with ``spec_filename``, the
    AWS credentials, bucket, region and job flow id are read from
    ``self.settings.Param``.
    """
    import boto
    from boto.emr.connection import EmrConnection, RegionInfo

    super(HiveRuntime, self).__init__(spec_filename)

    params = self.settings.Param
    key_id = params.AWS_ACCESS_KEY_ID
    key_secret = params.AWS_ACCESS_KEY_SECRET

    # S3 side: connection plus the configured bucket.
    self.s3_conn = boto.connect_s3(key_id, key_secret)
    self.s3_bucket = self.s3_conn.get_bucket(params.S3_BUCKET)

    # EMR side: connection bound to the configured region's endpoint.
    self.region = params.AWS_Region
    emr_region = RegionInfo(
        name=self.region,
        endpoint=self.region + '.elasticmapreduce.amazonaws.com')
    self.emr_conn = EmrConnection(key_id, key_secret, region=emr_region)

    self.job_flow_id = params.EMR_jobFlowId
示例11: creating_a_connection
class EMR:
    """Walk-through of a basic boto EMR workflow.

    The methods are meant to be called in order: open a connection, define
    a streaming step, launch a job flow with that step, and terminate it.
    """

    def creating_a_connection(self):
        """Open an EMR connection and keep it on ``self.conn``."""
        from boto.emr.connection import EmrConnection
        # NOTE(review): both credentials are passed as empty strings here.
        self.conn = EmrConnection('', '')

    def creating_streaming_job(self):
        """Define the streaming step and keep it on ``self.step``."""
        from boto.emr.step import StreamingStep
        self.step = StreamingStep(
            name='my bigdata task',
            mapper='s3n://eth-src/raw_to_stations.py',
            reducer='s3n://eth-src/stations_to_features.py',
            input='s3n://eth-input/2007.csv',
            output='s3n://eth-middle/2007')

    def creating_jobflows(self):
        """Launch a job flow that runs ``self.step``.

        Requires ``self.conn`` and ``self.step`` to have been set by the
        two methods above.

        Returns:
            The job flow id returned by ``run_jobflow`` so it can later be
            passed to ``terminating_jobflows``. The state reported right
            after launch is stored on ``self.state``.
        """
        job_id = self.conn.run_jobflow(
            name='My jobflow',
            log_uri='s3://eth-log/jobflow_logs',
            master_instance_type='m3.xlarge',
            slave_instance_type='m1.large',
            num_instances=2,
            steps=[self.step],
            ami_version='3.3.1')
        # Keep the queried state instead of discarding it.
        self.state = self.conn.describe_jobflow(job_id).state
        return job_id

    def terminating_jobflows(self, job_id):
        """Terminate the job flow identified by ``job_id``."""
        self.conn.terminate_jobflow(job_id)
示例12: main
def main(argv):
    """Queue the Foursquare de-duplication streaming step on a job flow.

    Reads ``config.ini`` from the directory part of ``argv[0]``, loads the
    AWS settings file it points to, deletes the existing keys under
    ``BUCKET_KEY`` in the output bucket, and adds the streaming step to
    the job flow.

    Args:
        argv: Command-line vector. ``argv[0]`` is the script path and
            ``argv[1]`` is the id of the job flow that receives the step.
    """
    # load the config from the directory part of the script path
    # (os.path.join ignores an empty directory component)
    config = ConfigParser()
    config.read(os.path.join(os.path.dirname(argv[0]), 'config.ini'))

    # load AWS config
    awsConfig = ConfigParser()
    awsConfig.read(config.get('Common', 'aws'))

    aws_access_key = awsConfig.get('AWS', 'aws_access_key')
    aws_secret_key = awsConfig.get('AWS', 'aws_secret_key')
    event_bucket = awsConfig.get('AWS', 'event_bucket')
    output_bucket = awsConfig.get('AWS', 'emr_output_bucket')
    script_bucket = awsConfig.get('AWS', 'script_bucket')

    jobId = argv[1]

    emrConnection = EmrConnection(aws_access_key, aws_secret_key)
    s3Connection = S3Connection(aws_access_key, aws_secret_key)

    # clean s3 output: remove existing keys under the output prefix
    bucket = s3Connection.get_bucket(output_bucket)
    for key in bucket.get_all_keys(prefix=BUCKET_KEY):
        bucket.delete_key(key)

    step = StreamingStep(
        name='Foursquare event deduper',
        mapper='s3://%s/dedup_mapper.py foursquare' % script_bucket,
        reducer='s3://%s/dedup_reducer.py' % script_bucket,
        input='s3://%s/normalized' % event_bucket,
        output='s3://%s/%s' % (output_bucket, BUCKET_KEY),
        action_on_failure='CONTINUE')
    emrConnection.add_jobflow_steps(jobId, step)
    print('Successfully started streaming steps')
示例13: EMRInventory
class EMRInventory(object):
    """Prints the EMR job flows visible through a region-specific connection."""

    def __init__(self, region='eu-west-1'):
        """Open an EMR connection against the endpoint of ``region``."""
        regionEMR = self.get_emr_region(region)
        self.emrConnection = EmrConnection(region=regionEMR)

    def list_current_resources(self, region='eu-west-1'):
        """Print the id of each job flow returned by ``describe_jobflows``.

        ``region`` is accepted but not used: the connection created in
        ``__init__`` is the one that is queried.
        """
        for jobFlow in self.emrConnection.describe_jobflows():
            print(jobFlow.jobflowid)

    def get_emr_region(self, region='eu-west-1'):
        """Return a ``RegionInfo`` for ``<region>.elasticmapreduce.amazonaws.com``."""
        regionEndpoint = '%s.elasticmapreduce.amazonaws.com' % region
        return RegionInfo(name=region, endpoint=regionEndpoint)
示例14: EmrJarRuntime
class EmrJarRuntime(ZetRuntime):
    """Runtime that uploads a jar to S3 and runs it as a step on an EMR job flow."""

    def __init__(self, spec_filename="spec.json"):
        """Load settings from ``spec_filename`` and open S3 and EMR connections.

        Credentials, bucket, region and job flow id are read from
        ``self.settings.Param``.
        """
        import boto
        from boto.emr.connection import EmrConnection, RegionInfo

        # TODO: the base-class initialiser is not called; settings are
        # loaded directly from the spec file instead.
        self.settings = get_settings_from_file(spec_filename)

        p = self.settings.Param
        self.s3_conn = boto.connect_s3(p.AWS_ACCESS_KEY_ID, p.AWS_ACCESS_KEY_SECRET)
        self.s3_bucket = self.s3_conn.get_bucket(p.S3_BUCKET)
        self.region = p.AWS_Region
        self.emr_conn = EmrConnection(
            p.AWS_ACCESS_KEY_ID, p.AWS_ACCESS_KEY_SECRET,
            region=RegionInfo(
                name=self.region,
                endpoint=self.region + '.elasticmapreduce.amazonaws.com'))
        self.job_flow_id = p.EMR_jobFlowId

    def get_s3_working_dir(self, path=""):
        """Return ``zetjob/<userName>/job<jobId>/blk<blockId>/<path>``.

        The user name, job id and block id come from
        ``self.settings.GlobalParam``.
        """
        # S3 keys always use "/" separators, so join with posixpath rather
        # than the platform-dependent os.path.
        import posixpath

        glb_vars = self.settings.GlobalParam
        return posixpath.join(
            'zetjob',
            glb_vars['userName'],
            "job%s" % glb_vars['jobId'],
            "blk%s" % glb_vars['blockId'],
            path)

    def execute(self, jar_path, args):
        """Upload ``jar_path`` to S3, add it as a jar step and wait for it.

        Args:
            jar_path: Local path of the jar; also used as the trailing part
                of the S3 working-directory key.
            args: Passed to the step as ``step_args``.
        """
        from boto.emr.step import JarStep

        s3_jar_path = s3_upload(self.s3_bucket, self.get_s3_working_dir(jar_path), jar_path)
        print("Uploading jar to s3 : %s -> %s" % (jar_path, s3_jar_path))

        print("Add jobflow step")
        step = JarStep(name='cl_filter', jar=s3_jar_path, step_args=args)
        self.emr_conn.add_jobflow_steps(self.job_flow_id, steps=[step])

        print("Waiting jobflow step done")
        emr_wait_job(self.emr_conn, self.job_flow_id)
示例15: __init__
def __init__(self, region_name='us-east-1', aws_access_key_id=None, aws_secret_access_key=None):
    """Open an EMR connection for ``region_name``.

    A credential that is missing (or otherwise falsy) is read from the
    ``aws`` section of the luigi configuration.
    """
    # Fall back to the luigi configuration for any credential not supplied.
    aws_access_key_id = (
        aws_access_key_id
        or luigi.configuration.get_config().get('aws', 'aws_access_key_id'))
    aws_secret_access_key = (
        aws_secret_access_key
        or luigi.configuration.get_config().get('aws', 'aws_secret_access_key'))

    # Region in which to run
    endpoint = u'elasticmapreduce.%s.amazonaws.com' % region_name
    target_region = RegionInfo(name=region_name, endpoint=endpoint)

    self.emr_connection = EmrConnection(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        region=target_region)