This article collects and summarizes typical usage examples of the Python class apache.aurora.client.api.AuroraClientAPI. If you are unsure what AuroraClientAPI is for or how to use it, the curated class code examples below may help.
15 code examples of the AuroraClientAPI class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
Example 1: increase_quota
def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
    """usage: increase_quota cluster role cpu ram[unit] disk[unit]
    Increases the amount of production quota allocated to a user.
    """
    cpu = float(cpu_str)
    ram = parse_data(ram_str)
    disk = parse_data(disk_str)
    options = app.get_options()
    client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == "verbose")
    resp = client.get_quota(role)
    quota = resp.result.getQuotaResult.quota
    log.info(
        "Current quota for %s:\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB"
        % (role, quota.numCpus, quota.ramMb, quota.diskMb)
    )
    new_cpu = float(cpu + quota.numCpus)
    new_ram = int((ram + Amount(quota.ramMb, Data.MB)).as_(Data.MB))
    new_disk = int((disk + Amount(quota.diskMb, Data.MB)).as_(Data.MB))
    log.info(
        "Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB"
        % (role, new_cpu, new_ram, new_disk)
    )
    resp = client.set_quota(role, new_cpu, new_ram, new_disk)
    check_and_log_response(resp)
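Per the usage string above, an invocation of this admin command might look like the following (cluster name, role, and resource amounts are hypothetical):

increase_quota devcluster www-data 4.0 2048mb 4096mb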
Example 2: sla_probe_hosts
def sla_probe_hosts(cluster, percentage, duration):
    """usage: sla_probe_hosts
    [--filename=filename]
    [--hosts=hosts]
    cluster percentage duration
    Probes individual hosts with respect to their job SLA.
    Specifically, given a host, outputs all affected jobs with their projected SLAs
    if the host goes down. In addition, if a job's projected SLA does not clear
    the specified limits, suggests the approximate time when that job reaches its SLA.
    Output format:
    HOST JOB PREDICTED_SLA SAFE? PREDICTED_SAFE_IN
    where:
    HOST - host being probed.
    JOB - job that has tasks running on the host being probed.
    PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
    SAFE? - PREDICTED_SLA >= percentage
    PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
    """
    options = app.get_options()
    sla_percentage = parse_sla_percentage(percentage)
    sla_duration = parse_time(duration)
    hosts = parse_hosts(options.filename, options.hosts)
    vector = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).sla_get_safe_domain_vector(hosts)
    probed_hosts = vector.probe_hosts(sla_percentage, sla_duration.as_(Time.SECONDS), hosts)
    results = []
    for host, job_details in sorted(probed_hosts.items()):
        results.append(
            "\n".join(
                [
                    "%s\t%s\t%.2f\t%s\t%s"
                    % (
                        host,
                        d.job.to_path(),
                        d.predicted_percentage,
                        d.safe,
                        "n/a" if d.safe_in_secs is None else d.safe_in_secs,
                    )
                    for d in sorted(job_details)
                ]
            )
        )
    print_results(results)
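Matching the usage string, an illustrative invocation (host list, SLA percentage, and duration are hypothetical):

sla_probe_hosts --hosts=host-1.example.com,host-2.example.com devcluster 95 30m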
Example 3: __init__
def __init__(self, cluster, role, env, jobs, ssh_user=None):
    self._cluster = cluster
    self._api = AuroraClientAPI(cluster=cluster)
    self._role = role
    self._env = env
    self._jobs = jobs
    self._ssh_user = ssh_user if ssh_user else self._role
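This constructor belongs to the DistributedCommandRunner helper shown in full in Examples 9 and 11. A minimal construction sketch, assuming a Cluster object is already in scope (the role, env, and job names are hypothetical):

runner = DistributedCommandRunner(cluster, 'www-data', 'prod', ['hello_world'])
# ssh_user defaults to the role, so remote commands run as www-data@<host>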
Example 4: test_handles_api_auth_error
def test_handles_api_auth_error():
    context = AuroraCommandContext()
    mock_scheduler_proxy = mock.create_autospec(spec=SchedulerProxyApiSpec, instance=True)
    mock_scheduler_proxy.killTasks.side_effect = SchedulerProxy.AuthError()
    mock_api = AuroraClientAPI(TEST_CLUSTER, "user-agent")
    mock_api._scheduler_proxy = mock_scheduler_proxy
    context.apis = {TEST_CLUSTER.name: mock_api}
    api = context.get_api(TEST_CLUSTER.name, clusters={TEST_CLUSTER.name: TEST_CLUSTER})
    with pytest.raises(Context.CommandError) as e:
        api.kill_job(AuroraJobKey(TEST_CLUSTER.name, "role", "env", "job"))
    assert e.value.code == EXIT_AUTH_ERROR
    assert mock_scheduler_proxy.killTasks.call_count == 1
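Because the scheduler proxy is replaced by an autospec mock, the test needs no live scheduler and can be selected directly:

pytest -k test_handles_api_auth_error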
Example 5: __init__
def __init__(self, cluster, role, env, jobs, ssh_user=None, ssh_options=None, log_fn=log.log):
    self._cluster = cluster
    self._api = AuroraClientAPI(cluster=cluster, user_agent=AURORA_V2_USER_AGENT_NAME)
    self._role = role
    self._env = env
    self._jobs = jobs
    self._ssh_user = ssh_user if ssh_user else self._role
    self._ssh_options = ssh_options if ssh_options else []
    self._log = log_fn
Example 6: sla_probe_hosts
def sla_probe_hosts(cluster, percentage, duration):
    """usage: sla_probe_hosts
    [--filename=FILENAME]
    [--grouping=GROUPING]
    [--hosts=HOSTS]
    [--min_job_instance_count=COUNT]
    cluster percentage duration
    Probes individual hosts with respect to their job SLA.
    Specifically, given a host, outputs all affected jobs with their projected SLAs
    if the host goes down. In addition, if a job's projected SLA does not clear
    the specified limits, suggests the approximate time when that job reaches its SLA.
    Output format:
    HOST JOB PREDICTED_SLA SAFE? PREDICTED_SAFE_IN
    where:
    HOST - host being probed.
    JOB - job that has tasks running on the host being probed.
    PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
    SAFE? - PREDICTED_SLA >= percentage
    PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
    """
    options = app.get_options()
    sla_percentage = parse_sla_percentage(percentage)
    sla_duration = parse_time(duration)
    hosts = parse_hostnames(options.filename, options.hosts)
    get_grouping_or_die(options.grouping)
    vector = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).sla_get_safe_domain_vector(
        options.min_instance_count, hosts
    )
    groups = vector.probe_hosts(sla_percentage, sla_duration.as_(Time.SECONDS), options.grouping)
    output, _ = format_sla_results(groups)
    print_results(output)
Example 7: mock_api
@classmethod
def mock_api(cls):
    api = AuroraClientAPI(Cluster(name="foo"), 'test-client')
    mock_proxy = create_autospec(spec=SchedulerProxyApiSpec, spec_set=True, instance=True)
    api._scheduler_proxy = mock_proxy
    return api, mock_proxy
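A sketch of how such a fixture is typically consumed in a test; the owning test class and the stubbed reply below are hypothetical, while kill_job delegating to killTasks is grounded in Example 4:

api, mock_proxy = SomeClientTest.mock_api()  # hypothetical test class owning this fixture
mock_proxy.killTasks.return_value = Response(responseCode=ResponseCode.OK)  # hypothetical stubbed reply
api.kill_job(AuroraJobKey("foo", "role", "env", "job"))  # exercises the API against the mock
assert mock_proxy.killTasks.call_count == 1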
Example 8: __init__
def __init__(self, cluster, verbosity, wait_event=None):
    self._client = AuroraClientAPI(cluster, verbosity == "verbose")
    self._wait_event = wait_event or Event()
Example 9: DistributedCommandRunner
class DistributedCommandRunner(object):
    @classmethod
    def make_executor_path(cls, cluster, executor_name):
        parameters = cls.sandbox_args(cluster)
        parameters.update(executor_name=executor_name)
        return (
            posixpath.join(
                "%(slave_root)s", "slaves/*/frameworks/*/executors/%(executor_name)s/runs", "%(slave_run_directory)s"
            )
            % parameters
        )

    @classmethod
    def thermos_sandbox(cls, cluster, executor_sandbox=False):
        sandbox = cls.make_executor_path(cluster, "thermos-{{thermos.task_id}}")
        return sandbox if executor_sandbox else posixpath.join(sandbox, "sandbox")

    @classmethod
    def sandbox_args(cls, cluster):
        cluster = cluster.with_trait(CommandRunnerTrait)
        return {"slave_root": cluster.slave_root, "slave_run_directory": cluster.slave_run_directory}

    @classmethod
    def substitute_thermos(cls, command, task, cluster, **kw):
        prefix_command = "cd %s;" % cls.thermos_sandbox(cluster, **kw)
        thermos_namespace = ThermosContext(task_id=task.assignedTask.taskId, ports=task.assignedTask.assignedPorts)
        mesos_namespace = MesosContext(instance=task.assignedTask.instanceId)
        command = String(prefix_command + command) % Environment(thermos=thermos_namespace, mesos=mesos_namespace)
        return command.get()

    @classmethod
    def aurora_sandbox(cls, cluster, executor_sandbox=False):
        if executor_sandbox:
            return cls.make_executor_path(cluster, "twitter")
        else:
            return "/var/run/nexus/%task_id%/sandbox"

    @classmethod
    def substitute_aurora(cls, command, task, cluster, **kw):
        command = ("cd %s;" % cls.aurora_sandbox(cluster, **kw)) + command
        command = command.replace("%shard_id%", str(task.assignedTask.instanceId))
        command = command.replace("%task_id%", task.assignedTask.taskId)
        for name, port in task.assignedTask.assignedPorts.items():
            command = command.replace("%port:" + name + "%", str(port))
        return command

    @classmethod
    def substitute(cls, command, task, cluster, **kw):
        if task.assignedTask.task.executorConfig:
            return cls.substitute_thermos(command, task, cluster, **kw)
        else:
            return cls.substitute_aurora(command, task, cluster, **kw)

    @classmethod
    def query_from(cls, role, env, job):
        return TaskQuery(statuses=LIVE_STATES, jobKeys=[JobKey(role=role, environment=env, name=job)])

    def __init__(self, cluster, role, env, jobs, ssh_user=None, log_fn=log.log):
        self._cluster = cluster
        self._api = AuroraClientAPI(cluster=cluster)
        self._role = role
        self._env = env
        self._jobs = jobs
        self._ssh_user = ssh_user if ssh_user else self._role
        self._log = log_fn

    def execute(self, args):
        hostname, role, command = args
        ssh_command = ["ssh", "-n", "-q", "%s@%s" % (role, hostname), command]
        self._log(logging.DEBUG, "Running command: %s" % ssh_command)
        po = subprocess.Popen(ssh_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = po.communicate()
        return "\n".join("%s: %s" % (hostname, line) for line in output[0].splitlines())

    def resolve(self):
        for job in self._jobs:
            resp = self._api.query(self.query_from(self._role, self._env, job))
            if resp.responseCode != ResponseCode.OK:
                self._log(logging.ERROR, "Failed to query job: %s" % job)
                continue
            for task in resp.result.scheduleStatusResult.tasks:
                yield task

    def process_arguments(self, command, **kw):
        for task in self.resolve():
            host = task.assignedTask.slaveHost
            yield (host, self._ssh_user, self.substitute(command, task, self._cluster, **kw))

    def run(self, command, parallelism=1, **kw):
        threadpool = ThreadPool(processes=parallelism)
        for result in threadpool.imap_unordered(self.execute, self.process_arguments(command, **kw)):
            print(result)
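A usage sketch for the runner above, assuming a configured Cluster object is available; the role, env, job list, and command are hypothetical:

runner = DistributedCommandRunner(cluster, 'www-data', 'prod', ['hello_world'])
runner.run('ls -la', parallelism=4)  # fans the command out over ssh, one output line per host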
Example 10: HostMaintenance
class HostMaintenance(object):
    """Submit requests to the scheduler to put hosts into and out of maintenance
    mode so they can be operated upon without causing LOST tasks.
    """

    DEFAULT_GROUPING = 'by_host'
    GROUPING_FUNCTIONS = {
        'by_host': group_by_host,
    }
    START_MAINTENANCE_DELAY = Amount(30, Time.SECONDS)

    @classmethod
    def group_hosts(cls, hostnames, grouping_function=DEFAULT_GROUPING):
        try:
            grouping_function = cls.GROUPING_FUNCTIONS[grouping_function]
        except KeyError:
            raise ValueError('Unknown grouping function %s!' % grouping_function)
        groups = defaultdict(set)
        for hostname in hostnames:
            groups[grouping_function(hostname)].add(hostname)
        return groups

    @classmethod
    def iter_batches(cls, hostnames, groups_per_batch, grouping_function=DEFAULT_GROUPING):
        if groups_per_batch <= 0:
            raise ValueError('Batch size must be > 0!')
        groups = cls.group_hosts(hostnames, grouping_function)
        groups = sorted(groups.items(), key=lambda v: v[0])
        for k in range(0, len(groups), groups_per_batch):
            yield Hosts(set.union(*(hostset for (key, hostset) in groups[k:k + groups_per_batch])))

    def __init__(self, cluster, verbosity):
        self._client = AuroraClientAPI(cluster, verbosity == 'verbose')

    def _drain_hosts(self, drainable_hosts, clock=time):
        """This will actively turn down tasks running on hosts."""
        check_and_log_response(self._client.drain_hosts(drainable_hosts))
        not_ready_hosts = [hostname for hostname in drainable_hosts.hostNames]
        while not_ready_hosts:
            log.info("Sleeping for %s." % self.START_MAINTENANCE_DELAY)
            clock.sleep(self.START_MAINTENANCE_DELAY.as_(Time.SECONDS))
            resp = self._client.maintenance_status(Hosts(not_ready_hosts))
            if not resp.result.maintenanceStatusResult.statuses:
                not_ready_hosts = None
            for host_status in resp.result.maintenanceStatusResult.statuses:
                if host_status.mode != MaintenanceMode.DRAINED:
                    log.warning('%s is currently in status %s' %
                        (host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
                else:
                    not_ready_hosts.remove(host_status.host)

    def _complete_maintenance(self, drained_hosts):
        """End the maintenance status for a given set of hosts."""
        check_and_log_response(self._client.end_maintenance(drained_hosts))
        resp = self._client.maintenance_status(drained_hosts)
        for host_status in resp.result.maintenanceStatusResult.statuses:
            if host_status.mode != MaintenanceMode.NONE:
                log.warning('%s is DRAINING or in DRAINED' % host_status.host)

    def _operate_on_hosts(self, drained_hosts, callback):
        """Perform a given operation on a list of hosts that are ready for maintenance."""
        for host in drained_hosts.hostNames:
            callback(host)

    def end_maintenance(self, hosts):
        """Pull a list of hosts out of maintenance mode."""
        self._complete_maintenance(Hosts(set(hosts)))

    def start_maintenance(self, hosts):
        """Put a list of hosts into maintenance mode, to de-prioritize scheduling."""
        check_and_log_response(self._client.start_maintenance(Hosts(set(hosts))))

    def perform_maintenance(self, hosts, groups_per_batch=1, grouping_function=DEFAULT_GROUPING,
                            callback=None):
        """Wrap a callback between sending hosts into maintenance mode and back.
        Walk through the process of putting hosts into maintenance, draining them of tasks,
        performing an action on them once drained, then removing them from maintenance mode
        so tasks can schedule.
        """
        self._complete_maintenance(Hosts(set(hosts)))
        self.start_maintenance(hosts)
        for hosts in self.iter_batches(hosts, groups_per_batch, grouping_function):
            self._drain_hosts(hosts)
            if callback:
                self._operate_on_hosts(hosts, callback)
            self._complete_maintenance(hosts)

    def check_status(self, hosts):
        resp = self._client.maintenance_status(Hosts(set(hosts)))
        check_and_log_response(resp)
        statuses = []
        for host_status in resp.result.maintenanceStatusResult.statuses:
            statuses.append((host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
        return statuses
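A driving sketch for this class, assuming a CLUSTERS map as in the earlier examples; the hostnames and the callback are hypothetical:

maintenance = HostMaintenance(CLUSTERS['devcluster'], 'verbose')
statuses = maintenance.check_status(['host-1.example.com', 'host-2.example.com'])
maintenance.perform_maintenance(
    ['host-1.example.com', 'host-2.example.com'],
    callback=lambda host: log.info('operating on %s' % host))  # runs once per drained host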
Example 11: DistributedCommandRunner
class DistributedCommandRunner(object):
    @staticmethod
    def execute(args):
        hostname, role, command = args
        ssh_command = ['ssh', '-n', '-q', '%s@%s' % (role, hostname), command]
        po = subprocess.Popen(ssh_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = po.communicate()
        return '\n'.join('%s: %s' % (hostname, line) for line in output[0].splitlines())

    @classmethod
    def make_executor_path(cls, cluster, executor_name):
        parameters = cls.sandbox_args(cluster)
        parameters.update(executor_name=executor_name)
        return posixpath.join(
            '%(slave_root)s',
            'slaves/*/frameworks/*/executors/%(executor_name)s/runs',
            '%(slave_run_directory)s'
        ) % parameters

    @classmethod
    def thermos_sandbox(cls, cluster, executor_sandbox=False):
        sandbox = cls.make_executor_path(cluster, 'thermos-{{thermos.task_id}}')
        return sandbox if executor_sandbox else posixpath.join(sandbox, 'sandbox')

    @classmethod
    def sandbox_args(cls, cluster):
        cluster = cluster.with_trait(CommandRunnerTrait)
        return {'slave_root': cluster.slave_root, 'slave_run_directory': cluster.slave_run_directory}

    @classmethod
    def substitute_thermos(cls, command, task, cluster, **kw):
        prefix_command = 'cd %s;' % cls.thermos_sandbox(cluster, **kw)
        thermos_namespace = ThermosContext(
            task_id=task.assignedTask.taskId,
            ports=task.assignedTask.assignedPorts)
        mesos_namespace = MesosContext(instance=task.assignedTask.instanceId)
        command = String(prefix_command + command) % Environment(
            thermos=thermos_namespace,
            mesos=mesos_namespace)
        return command.get()

    @classmethod
    def aurora_sandbox(cls, cluster, executor_sandbox=False):
        if executor_sandbox:
            return cls.make_executor_path(cluster, 'twitter')
        else:
            return '/var/run/nexus/%task_id%/sandbox'

    @classmethod
    def substitute_aurora(cls, command, task, cluster, **kw):
        command = ('cd %s;' % cls.aurora_sandbox(cluster, **kw)) + command
        command = command.replace('%shard_id%', str(task.assignedTask.instanceId))
        command = command.replace('%task_id%', task.assignedTask.taskId)
        for name, port in task.assignedTask.assignedPorts.items():
            command = command.replace('%port:' + name + '%', str(port))
        return command

    @classmethod
    def substitute(cls, command, task, cluster, **kw):
        if task.assignedTask.task.executorConfig:
            return cls.substitute_thermos(command, task, cluster, **kw)
        else:
            return cls.substitute_aurora(command, task, cluster, **kw)

    @classmethod
    def query_from(cls, role, env, job):
        return TaskQuery(statuses=LIVE_STATES, owner=Identity(role), jobName=job, environment=env)

    def __init__(self, cluster, role, env, jobs, ssh_user=None):
        self._cluster = cluster
        self._api = AuroraClientAPI(cluster=cluster)
        self._role = role
        self._env = env
        self._jobs = jobs
        self._ssh_user = ssh_user if ssh_user else self._role

    def resolve(self):
        for job in self._jobs:
            resp = self._api.query(self.query_from(self._role, self._env, job))
            if resp.responseCode != ResponseCode.OK:
                log.error('Failed to query job: %s' % job)
                continue
            for task in resp.result.scheduleStatusResult.tasks:
                yield task

    def process_arguments(self, command, **kw):
        for task in self.resolve():
            host = task.assignedTask.slaveHost
            role = task.assignedTask.task.owner.role  # resolved but unused; ssh runs as self._ssh_user
            yield (host, self._ssh_user, self.substitute(command, task, self._cluster, **kw))

    def run(self, command, parallelism=1, **kw):
        threadpool = ThreadPool(processes=parallelism)
        for result in threadpool.imap_unordered(self.execute, self.process_arguments(command, **kw)):
            print(result)
Example 12: sla_list_safe_domain
def sla_list_safe_domain(cluster, percentage, duration):
    """usage: sla_list_safe_domain
    [--exclude_file=FILENAME]
    [--exclude_hosts=HOSTS]
    [--grouping=GROUPING]
    [--include_file=FILENAME]
    [--include_hosts=HOSTS]
    [--list_jobs]
    [--min_job_instance_count=COUNT]
    [--override_jobs=FILENAME]
    cluster percentage duration
    Returns a list of relevant hosts where it would be safe to kill
    tasks without violating their job SLA. The SLA is defined as a pair of
    percentage and duration, where:
    percentage - Percentage of tasks required to be up within the duration.
    Applied to all jobs except those listed in the --override_jobs file;
    duration - Time interval (now - value) for the percentage of up tasks.
    Applied to all jobs except those listed in the --override_jobs file.
    Format: XdYhZmWs (each field is optional but must be in that order.)
    Examples: 5m, 1d3h45m.
    NOTE: if the --grouping option is specified and set to anything other than
    the default (by_host), the results are processed and filtered based
    on the grouping function on an all-or-nothing basis. In other words,
    the group is 'safe' IFF it is safe to kill tasks on all hosts in the
    group at the same time.
    """

    def parse_jobs_file(filename):
        result = {}
        with open(filename, "r") as overrides:
            for line in overrides:
                if not line.strip():
                    continue
                tokens = line.split()
                if len(tokens) != 3:
                    die("Invalid line in %s:%s" % (filename, line))
                job_key = AuroraJobKey.from_path(tokens[0])
                result[job_key] = JobUpTimeLimit(
                    job=job_key,
                    percentage=parse_sla_percentage(tokens[1]),
                    duration_secs=parse_time(tokens[2]).as_(Time.SECONDS),
                )
        return result

    options = app.get_options()
    sla_percentage = parse_sla_percentage(percentage)
    sla_duration = parse_time(duration)
    exclude_hosts = parse_hostnames_optional(options.exclude_hosts, options.exclude_filename)
    include_hosts = parse_hostnames_optional(options.include_hosts, options.include_filename)
    override_jobs = parse_jobs_file(options.override_filename) if options.override_filename else {}
    get_grouping_or_die(options.grouping)
    vector = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).sla_get_safe_domain_vector(
        options.min_instance_count, include_hosts
    )
    groups = vector.get_safe_hosts(sla_percentage, sla_duration.as_(Time.SECONDS), override_jobs, options.grouping)
    results = []
    for group in groups:
        for host in sorted(group.keys()):
            if exclude_hosts and host in exclude_hosts:
                continue
            if options.list_jobs:
                results.append(
                    "\n".join(
                        [
                            "%s\t%s\t%.2f\t%d" % (host, d.job.to_path(), d.percentage, d.duration_secs)
                            for d in sorted(group[host])
                        ]
                    )
                )
            else:
                results.append("%s" % host)
    print_results(results)
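Following the usage string, an illustrative invocation (cluster name and SLA thresholds are hypothetical):

sla_list_safe_domain --list_jobs --min_job_instance_count=20 devcluster 95 30m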
Example 13: sla_list_safe_domain
def sla_list_safe_domain(cluster, percentage, duration):
    """usage: sla_list_safe_domain
    [--exclude_hosts=filename]
    [--include_hosts=filename]
    [--list_jobs]
    [--override_jobs=filename]
    cluster percentage duration
    Returns a list of relevant hosts where it would be safe to kill
    tasks without violating their job SLA. The SLA is defined as a pair of
    percentage and duration, where:
    percentage - Percentage of tasks required to be up within the duration.
    Applied to all jobs except those listed in --override_jobs file;
    duration - Time interval (now - value) for the percentage of up tasks.
    Applied to all jobs except those listed in --override_jobs file.
    Format: XdYhZmWs (each field is optional but must be in that order.)
    Examples: 5m, 1d3h45m.
    """

    def parse_jobs_file(filename):
        result = {}
        with open(filename, "r") as overrides:
            for line in overrides:
                if not line.strip():
                    continue
                tokens = line.split()
                if len(tokens) != 3:
                    die("Invalid line in %s:%s" % (filename, line))
                job_key = AuroraJobKey.from_path(tokens[0])
                result[job_key] = DomainUpTimeSlaVector.JobUpTimeLimit(
                    job=job_key,
                    percentage=parse_sla_percentage(tokens[1]),
                    duration_secs=parse_time(tokens[2]).as_(Time.SECONDS),
                )
        return result

    options = app.get_options()
    sla_percentage = parse_sla_percentage(percentage)
    sla_duration = parse_time(duration)
    exclude_hosts = parse_hosts_optional(options.exclude_hosts, options.exclude_filename)
    include_hosts = parse_hosts_optional(options.include_hosts, options.include_filename)
    override_jobs = parse_jobs_file(options.override_filename) if options.override_filename else {}
    vector = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).sla_get_safe_domain_vector(include_hosts)
    hosts = vector.get_safe_hosts(sla_percentage, sla_duration.as_(Time.SECONDS), override_jobs)
    results = []
    for host in sorted(hosts.keys()):
        if exclude_hosts and host in exclude_hosts:
            continue
        if options.list_jobs:
            results.append(
                "\n".join(
                    [
                        "%s\t%s\t%.2f\t%d" % (host, d.job.to_path(), d.percentage, d.duration_secs)
                        for d in sorted(hosts[host])
                    ]
                )
            )
        else:
            results.append("%s" % host)
    print_results(results)
Example 14: mock_api
@classmethod
def mock_api(cls):
    api = AuroraClientAPI(Cluster(name="foo"))
    mock_proxy = Mock()
    api._scheduler_proxy = mock_proxy
    return api, mock_proxy
Example 15: HostMaintenance
class HostMaintenance(object):
    """Submit requests to the scheduler to put hosts into and out of maintenance
    mode so they can be operated upon without causing LOST tasks.
    Aurora provides a two-tiered concept of Maintenance. The first step is to initiate maintenance,
    which will ask the Aurora scheduler to de-prioritize scheduling on a large set of hosts (the ones
    that will be operated upon during this maintenance window). Once all hosts have been tagged in
    this manner, the operator can begin draining individual machines, which will have all user-tasks
    killed and rescheduled. When the tasks get placed onto a new machine, the scheduler will first
    look for hosts that do not have the maintenance tag, which will help decrease churn and prevent a
    task from being constantly killed as its hosts go down from underneath it.
    """

    START_MAINTENANCE_DELAY = Amount(30, Time.SECONDS)
    SLA_MIN_JOB_INSTANCE_COUNT = 20

    @classmethod
    def iter_batches(cls, hostnames, grouping_function=DEFAULT_GROUPING):
        groups = group_hosts(hostnames, grouping_function)
        groups = sorted(groups.items(), key=lambda v: v[0])
        for group in groups:
            yield Hosts(group[1])

    def __init__(self, cluster, verbosity):
        self._client = AuroraClientAPI(cluster, verbosity == 'verbose')

    def _drain_hosts(self, drainable_hosts, clock=time):
        """Drains tasks from the specified hosts.
        This will move active tasks on these hosts to the DRAINING state, causing them to be
        rescheduled elsewhere.

        :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
        :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
        :param clock: time module for testing
        :type clock: time
        """
        check_and_log_response(self._client.drain_hosts(drainable_hosts))
        not_ready_hostnames = [hostname for hostname in drainable_hosts.hostNames]
        while not_ready_hostnames:
            log.info("Sleeping for %s." % self.START_MAINTENANCE_DELAY)
            clock.sleep(self.START_MAINTENANCE_DELAY.as_(Time.SECONDS))
            resp = self._client.maintenance_status(Hosts(set(not_ready_hostnames)))
            if not resp.result.maintenanceStatusResult.statuses:
                not_ready_hostnames = None
            for host_status in resp.result.maintenanceStatusResult.statuses:
                if host_status.mode != MaintenanceMode.DRAINED:
                    log.warning('%s is currently in status %s' %
                        (host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
                else:
                    not_ready_hostnames.remove(host_status.host)

    def _complete_maintenance(self, drained_hosts):
        """End the maintenance status for a given set of hosts.

        :param drained_hosts: Hosts that are drained and finished being operated upon
        :type drained_hosts: gen.apache.aurora.ttypes.Hosts
        """
        check_and_log_response(self._client.end_maintenance(drained_hosts))
        resp = self._client.maintenance_status(drained_hosts)
        for host_status in resp.result.maintenanceStatusResult.statuses:
            if host_status.mode != MaintenanceMode.NONE:
                log.warning('%s is DRAINING or in DRAINED' % host_status.host)

    def _check_sla(self, hostnames, grouping_function, percentage, duration):
        """Check if the provided list of hosts passes the job uptime SLA check.
        This is an all-or-nothing check, meaning that all provided hosts must pass their job
        SLA check for the maintenance to proceed.

        :param hostnames: list of host names to check SLA for
        :type hostnames: list of strings
        :param grouping_function: grouping function to apply to the given hosts
        :type grouping_function: function
        :param percentage: SLA uptime percentage override
        :type percentage: float
        :param duration: SLA uptime duration override
        :type duration: twitter.common.quantity.Amount
        :rtype: set of unsafe hosts
        """
        vector = self._client.sla_get_safe_domain_vector(self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
        host_groups = vector.probe_hosts(
            percentage,
            duration.as_(Time.SECONDS),
            grouping_function)
        unsafe_hostnames = set()
        # Given that maintenance is performed 1 group at a time, any result longer than 1 group
        # should be considered a batch failure.
        if host_groups:
            if len(host_groups) > 1:
                log.error('Illegal multiple groups detected in SLA results. Skipping hosts: %s' % hostnames)
                return set(hostnames)
            results, unsafe_hostnames = format_sla_results(host_groups, unsafe_only=True)
            if results:
                print_results(results)
        return unsafe_hostnames
#.........the rest of this example is omitted here.........