This article collects typical usage examples of the orte_get_attribute function from the C code base of Open MPI's ORTE (Open Run-Time Environment) layer. If you have been asking what exactly orte_get_attribute does, how to call it, or where to find working uses of it, the hand-picked examples below should help.
In the following, 15 code examples of orte_get_attribute are shown, sorted by popularity by default.
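All of the examples below share one calling convention, declared in orte/util/attr.h: orte_get_attribute() scans an attribute list for a key and returns true on a hit, copying the value out through the data pointer according to the opal_data_type_t tag; passing NULL for data turns the call into a pure existence test. The sketch below condenses the three retrieval styles that recur throughout this page. It is an illustration only: the helper itself is hypothetical (not part of ORTE), and it assumes the ORTE tree's headers for the orte_proc_t/orte_job_t types and the attribute keys.
#include <stdlib.h>
#include "orte/util/attr.h"              /* orte_get_attribute() and the key constants */
#include "orte/runtime/orte_globals.h"   /* orte_proc_t, orte_job_t */

/* Hypothetical helper condensing the retrieval styles used on this page. */
static void show_attr_patterns(orte_job_t *jdata, orte_proc_t *proc)
{
    hwloc_obj_t locale = NULL;
    char *bitmap = NULL;

    /* pointer attribute (OPAL_PTR): the stored pointer is handed back
     * as-is, so the caller must not free it */
    if (orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE,
                           (void**)&locale, OPAL_PTR)) {
        /* locale now references the hwloc object recorded by the mapper */
    }

    /* string attribute (OPAL_STRING): a copy comes back and the caller
     * owns it - note that every example on this page frees it after use */
    if (orte_get_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP,
                           (void**)&bitmap, OPAL_STRING)) {
        free(bitmap);
    }

    /* boolean flag (OPAL_BOOL): pass NULL for data to simply test
     * whether the flag is present and true */
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED,
                           NULL, OPAL_BOOL)) {
        /* the termination notice was already sent */
    }
}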
Example 1: bind_upwards
static int bind_upwards(orte_job_t *jdata,
orte_node_t *node,
hwloc_obj_type_t target,
unsigned cache_level)
{
/* traverse the hwloc topology tree on each node upwards
* until we find an object of type target - and then bind
* the process to that target
*/
int j;
orte_job_map_t *map;
orte_proc_t *proc;
hwloc_obj_t obj;
hwloc_cpuset_t cpus;
unsigned int idx, ncpus;
opal_hwloc_obj_data_t *data;
hwloc_obj_t locale;
char *cpu_bitmap;
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: bind upwards for job %s with bindings %s",
ORTE_JOBID_PRINT(jdata->jobid),
opal_hwloc_base_print_binding(jdata->map->binding));
/* initialize */
map = jdata->map;
/* cycle thru the procs */
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
continue;
}
/* bozo check */
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
return ORTE_ERR_SILENT;
}
/* starting at the locale, move up thru the parents
* to find the target object type
*/
cpu_bitmap = NULL;
for (obj = locale->parent; NULL != obj; obj = obj->parent) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"%s bind:upward target %s type %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
hwloc_obj_type_string(target),
hwloc_obj_type_string(obj->type));
if (target == obj->type) {
if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
continue;
}
/* get its index */
if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj, OPAL_HWLOC_AVAILABLE))) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_SILENT;
}
/* track the number bound */
data = (opal_hwloc_obj_data_t*)obj->userdata;
data->num_bound++;
/* get the number of cpus under this location */
if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
return ORTE_ERR_SILENT;
}
/* error out if adding a proc would cause overload and that wasn't allowed,
* and it wasn't a default binding policy (i.e., the user requested it)
*/
if (ncpus < data->num_bound &&
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
/* if the user specified a binding policy, then we cannot meet
* it since overload isn't allowed, so error out - have the
* message indicate that setting overload allowed will remove
* this restriction */
orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
opal_hwloc_base_print_binding(map->binding), node->name,
data->num_bound, ncpus);
return ORTE_ERR_SILENT;
} else {
/* if we have the default binding policy, then just don't bind */
OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
unbind_procs(jdata);
return ORTE_SUCCESS;
}
}
/* bind it here */
cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
hwloc_bitmap_list_asprintf(&cpu_bitmap, cpus);
orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
/* record the location */
orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"%s BOUND PROC %s TO %s[%s:%u] on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name),
cpu_bitmap,
//......... some code omitted here .........
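Worth noting in Example 1 is the counterpart call, orte_set_attribute(), and its scope flag: the CPU bitmap string is stored with ORTE_ATTR_GLOBAL so it travels with the proc when the object is packed for other daemons, while the raw hwloc pointer in ORTE_PROC_HWLOC_BOUND is ORTE_ATTR_LOCAL, since a pointer into this daemon's topology is meaningless anywhere else. A minimal sketch of the store-and-reload round trip (a hypothetical wrapper, assuming the same ORTE headers; the frees rely on the attribute code copying string values, which the free() calls throughout these examples imply):
/* Hypothetical round trip: record a binding string, then read it back. */
static int record_and_check_binding(orte_proc_t *proc, hwloc_cpuset_t cpus)
{
    char *cpu_bitmap = NULL, *check = NULL;

    hwloc_bitmap_list_asprintf(&cpu_bitmap, cpus);
    /* OPAL_STRING values are copied in, so we keep ownership of ours */
    orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP,
                       ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
    free(cpu_bitmap);

    /* reading it back yields a fresh copy that we must free as well */
    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP,
                            (void**)&check, OPAL_STRING)) {
        return ORTE_ERR_NOT_FOUND;
    }
    free(check);
    return ORTE_SUCCESS;
}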
Example 2: prune
/* recursively climb the topology, pruning procs beyond that allowed
* by the given ppr
*/
static void prune(orte_jobid_t jobid,
orte_app_idx_t app_idx,
orte_node_t *node,
opal_hwloc_level_t *level,
orte_vpid_t *nmapped)
{
hwloc_obj_t obj, top;
unsigned int i, nobjs;
hwloc_obj_type_t lvl;
unsigned cache_level = 0, k;
int nprocs;
hwloc_cpuset_t avail, cpus, childcpus;
int n, limit, nmax, nunder, idx, idxmax = 0;
orte_proc_t *proc, *pptr, *procmax;
opal_hwloc_level_t ll;
char dang[64];
hwloc_obj_t locale;
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:ppr: pruning level %d",
*level);
/* convenience */
ll = *level;
/* convenience */
lvl = opal_hwloc_levels[ll];
limit = ppr[ll];
if (0 == limit) {
/* no limit at this level, so move up if necessary */
if (0 == ll) {
/* done */
return;
}
--(*level);
prune(jobid, app_idx, node, level, nmapped);
return;
}
/* handle the darn cache thing again */
if (OPAL_HWLOC_L3CACHE_LEVEL == ll) {
cache_level = 3;
} else if (OPAL_HWLOC_L2CACHE_LEVEL == ll) {
cache_level = 2;
} else if (OPAL_HWLOC_L1CACHE_LEVEL == ll) {
cache_level = 1;
}
/* get the number of resources at this level on this node */
nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
lvl, cache_level,
OPAL_HWLOC_AVAILABLE);
/* for each resource, compute the number of procs sitting
* underneath it and check against the limit
*/
for (i=0; i < nobjs; i++) {
obj = opal_hwloc_base_get_obj_by_type(node->topology,
lvl, cache_level,
i, OPAL_HWLOC_AVAILABLE);
/* get the available cpuset */
avail = opal_hwloc_base_get_available_cpus(node->topology, obj);
/* look at the intersection of this object's cpuset and that
* of each proc in the job/app - if they intersect, then count this proc
* against the limit
*/
nprocs = 0;
for (n=0; n < node->procs->size; n++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
continue;
}
if (proc->name.jobid != jobid ||
proc->app_idx != app_idx) {
continue;
}
locale = NULL;
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return;
}
cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
if (hwloc_bitmap_intersects(avail, cpus)) {
nprocs++;
}
}
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:ppr: found %d procs limit %d",
nprocs, limit);
/* check against the limit */
while (limit < nprocs) {
/* need to remove procs - do this in a semi-intelligent
* manner to provide a little load balancing by cycling
* across the objects beneath this one, removing procs
* in a round-robin fashion until the limit is satisfied
//......... some code omitted here .........
Example 3: launch_daemons
//......... some code omitted here .........
/* Add basic orted command line options, including debug flags */
orte_plm_base_orted_append_basic_args(&argc, &argv,
"slurm", &proc_vpid_index,
nodelist_flat);
free(nodelist_flat);
/* tell the new daemons the base of the name list so they can compute
* their own name on the other end
*/
rc = orte_util_convert_vpid_to_string(&name_string, map->daemon_vpid_start);
if (ORTE_SUCCESS != rc) {
opal_output(0, "plm_slurm: unable to get daemon vpid as string");
goto cleanup;
}
free(argv[proc_vpid_index]);
argv[proc_vpid_index] = strdup(name_string);
free(name_string);
/* Copy the prefix directory specified in the
corresponding app_context. If there are multiple,
different prefixes across the app contexts, complain (i.e., only
allow one --prefix option for the entire slurm run -- we
don't support different --prefix values for different nodes in
the SLURM plm) */
cur_prefix = NULL;
for (n=0; n < state->jdata->apps->size; n++) {
char * app_prefix_dir;
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, n))) {
continue;
}
app_prefix_dir = NULL;
orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING);
/* Check for an already-set cur_prefix -- if it differs,
complain */
if (NULL != app_prefix_dir) {
if (NULL != cur_prefix &&
0 != strcmp (cur_prefix, app_prefix_dir)) {
orte_show_help("help-plm-slurm.txt", "multiple-prefixes",
true, cur_prefix, app_prefix_dir);
goto cleanup;
}
/* If not yet set, copy it; iff set, then it's the
* same anyway
*/
if (NULL == cur_prefix) {
cur_prefix = strdup(app_prefix_dir);
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
"%s plm:slurm: Set prefix:%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
cur_prefix));
}
free(app_prefix_dir);
}
}
/* protect the args in case someone has a script wrapper around srun */
mca_base_cmd_line_wrap_args(argv);
/* setup environment */
env = opal_argv_copy(orte_launch_environ);
if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
param = opal_argv_join(argv, ' ');
Example 4: track_procs
//......... some code omitted here .........
/* do NOT update the proc state as this can hit
* while we are still trying to notify the HNP of
* successful launch for short-lived procs
*/
ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE);
/* Release the stdin IOF file descriptor for this child, if one
* was defined. File descriptors for the other IOF channels - stdout,
* stderr, and stddiag - were released when their associated pipes
* were cleared and closed due to termination of the process
* Do this after we handle termination in case the IOF needs
* to check to see if all procs from the job are actually terminated
*/
if (NULL != orte_iof.close) {
orte_iof.close(proc, ORTE_IOF_STDIN);
}
if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID) &&
!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
}
} else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
/* do NOT update the proc state as this can hit
* while we are still trying to notify the HNP of
* successful launch for short-lived procs
*/
ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID);
if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE) &&
!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
}
} else if (ORTE_PROC_STATE_TERMINATED == state) {
/* if this proc has not already recorded as terminated, then
* update the accounting here */
if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
jdata->num_terminated++;
}
/* update the proc state */
ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_RECORDED);
ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
pdata->state = state;
/* Clean up the session directory as if we were the process
* itself. This covers the case where the process died abnormally
* and didn't cleanup its own session directory.
*/
orte_session_dir_finalize(proc);
/* if we are trying to terminate and our routes are
* gone, then terminate ourselves IF no local procs
* remain (might be some from another job)
*/
if (orte_orteds_term_ordered &&
0 == orte_routed.num_routes()) {
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) {
/* at least one is still alive */
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
"%s state:orted all routes gone but proc %s still alive",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&pdata->name)));
goto cleanup;
}
}
/* call our appropriate exit procedure */
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
"%s state:orted all routes and children gone - exiting",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
goto cleanup;
}
/* track job status */
if (jdata->num_terminated == jdata->num_local_procs &&
!orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, NULL, OPAL_BOOL)) {
/* pack update state command */
cmd = ORTE_PLM_UPDATE_PROC_STATE;
alert = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* pack the job info */
if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
ORTE_ERROR_LOG(rc);
}
/* send it */
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
"%s state:orted: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
ORTE_RML_TAG_PLM,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
}
/* mark that we sent it so we ensure we don't do it again */
orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
}
}
cleanup:
OBJ_RELEASE(caddy);
}
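Example 4 uses the attribute list as a set of one-shot guards: a NULL data argument makes orte_get_attribute() a pure membership test, and setting ORTE_JOB_TERM_NOTIFIED afterwards makes the termination update idempotent. The guard idiom in isolation (hypothetical function, same API):
/* Hypothetical guard idiom: perform an action at most once per job. */
static void notify_terminated_once(orte_job_t *jdata)
{
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED,
                           NULL, OPAL_BOOL)) {
        return;   /* flag already present - the update was sent */
    }
    /* ... pack and send the termination update here ... */
    orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED,
                       ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
}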
Example 5: notify_job_errors
/*****************
* Local Functions
*****************/
static void notify_job_errors(orte_job_t *jdata, orte_proc_t *proc)
{
int command = ORTE_DAEMON_ABORT_PROCS_CALLED;
int rc;
opal_buffer_t *buf;
if (ORTE_JOB_STATE_ANY == jdata->state) {
goto CLEANUP;
}
buf = OBJ_NEW(opal_buffer_t);
/* pack the complete command flag */
if (OPAL_SUCCESS !=
(rc = opal_dss.pack(buf, &command, 1, OPAL_INT))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
goto CLEANUP;
}
/* pack the job */
if (OPAL_SUCCESS !=
(rc = opal_dss.pack(buf, &jdata, 1, ORTE_JOB))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
goto CLEANUP;
}
/* pack the proc name */
if (OPAL_SUCCESS !=
(rc = opal_dss.pack(buf, &proc, 1, ORTE_PROC))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
goto CLEANUP;
}
/* pack the proc state */
if (OPAL_SUCCESS !=
(rc = opal_dss.pack(buf, &proc->state, 1, ORTE_PROC_STATE))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
goto CLEANUP;
}
/* pack the proc exit code */
if (OPAL_SUCCESS !=
(rc = opal_dss.pack(buf, &proc->exit_code, 1, ORTE_EXIT_CODE))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
goto CLEANUP;
}
/* pack the node name */
if (OPAL_SUCCESS !=
(rc = opal_dss.pack(buf, &proc->node->name, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
goto CLEANUP;
}
OBJ_RETAIN(jdata);
if (ORTE_SUCCESS !=
(rc = orte_rml.send_buffer_nb(&jdata->originator, buf,
ORTE_RML_TAG_TOOL,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
goto CLEANUP;
}
OPAL_OUTPUT_VERBOSE((0, orte_state_base_framework.framework_output,
"%s state:orcmsd:notify job error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid)));
/* if this job is a continuously operating one, then don't do
* anything further - just return here
*/
if (NULL != jdata &&
(orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE))) {
/* set state to ANY */
jdata->state = ORTE_JOB_STATE_ANY;
} else {
/* give us a chance to stop the orcmsds */
orte_plm.terminate_orteds();
}
CLEANUP:
return;
}
Example 6: orte_dt_unpack_job
//......... some code omitted here .........
}
/* if there are apps, unpack them */
if (0 < jobs[i]->num_apps) {
orte_app_context_t *app;
for (j=0; j < jobs[i]->num_apps; j++) {
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&app, &n, ORTE_APP_CONTEXT))) {
ORTE_ERROR_LOG(rc);
return rc;
}
opal_pointer_array_add(jobs[i]->apps, app);
}
}
/* unpack num procs and offset */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->num_procs)), &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->offset)), &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 < jobs[i]->num_procs) {
/* check attributes to see if this job was fully
* described in the launch msg */
if (orte_get_attribute(&jobs[i]->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
orte_proc_t *proc;
for (j=0; j < jobs[i]->num_procs; j++) {
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&proc, &n, ORTE_PROC))) {
ORTE_ERROR_LOG(rc);
return rc;
}
opal_pointer_array_add(jobs[i]->procs, proc);
}
}
}
/* unpack stdin target */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->stdin_target)), &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the total slots allocated to the job */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->total_slots_alloc)), &n, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if the map is NULL, then we didn't pack it as there was
* nothing to pack. Instead, we packed a flag to indicate whether or not
* the map is included */
Example 7: rank_fill
static int rank_fill(orte_job_t *jdata,
hwloc_obj_type_t target,
unsigned cache_level)
{
orte_app_context_t *app;
hwloc_obj_t obj;
int num_objs, i, j, m, n, rc;
orte_vpid_t num_ranked=0;
orte_node_t *node;
orte_proc_t *proc, *pptr;
orte_vpid_t vpid;
int cnt;
hwloc_obj_t locale;
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_fill: for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
/* if the ranking is fill, then we rank all the procs
* within a given object before moving on to the next
*
* Node 0 Node 1
* Obj 0 Obj 1 Obj 0 Obj 1
* 0 1 4 5 8 9 12 13
* 2 3 6 7 10 11 14 15
*/
vpid = 0;
for (n=0; n < jdata->apps->size; n++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
continue;
}
cnt = 0;
for (m=0; m < jdata->map->nodes->size; m++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) {
continue;
}
/* get the number of objects - only consider those we can actually use */
num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target,
cache_level, OPAL_HWLOC_AVAILABLE);
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_fill: found %d objects on node %s with %d procs",
num_objs, node->name, (int)node->num_procs);
if (0 == num_objs) {
return ORTE_ERR_NOT_SUPPORTED;
}
/* for each object */
for (i=0; i < num_objs && cnt < app->num_procs; i++) {
obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target,
cache_level, i, OPAL_HWLOC_AVAILABLE);
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_fill: working object %d", i);
/* cycle thru the procs on this node */
for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_fill skipping proc %s - from another job, num_ranked %d",
ORTE_NAME_PRINT(&proc->name), num_ranked);
continue;
}
/* ignore procs that are already assigned */
if (ORTE_VPID_INVALID != proc->name.vpid) {
continue;
}
/* ignore procs from other apps */
if (proc->app_idx != app->idx) {
continue;
}
/* protect against bozo case */
locale = NULL;
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
ORTE_ERROR_LOG(ORTE_ERROR);
return ORTE_ERROR;
}
/* ignore procs not on this object */
if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_fill: proc at position %d is not on object %d",
j, i);
continue;
}
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_fill: assigning vpid %s", ORTE_VPID_PRINT(vpid));
proc->name.vpid = vpid++;
if (0 == cnt) {
app->first_rank = proc->name.vpid;
}
cnt++;
/* insert the proc into the jdata array */
if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) {
OBJ_RELEASE(pptr);
//......... some code omitted here .........
Example 8: dyn_allocate
/* we cannot use the RML to communicate with SLURM as it doesn't
* understand our internal protocol, so we have to do a bare-bones
* exchange based on sockets
*/
static int dyn_allocate(orte_job_t *jdata)
{
char *cmd_str, **cmd=NULL, *tmp, *jstring;
char *node_list;
orte_app_context_t *app;
int i;
struct timeval tv;
local_jobtracker_t *jtrk;
int64_t i64, *i64ptr;
if (NULL == mca_ras_slurm_component.config_file) {
opal_output(0, "Cannot perform dynamic allocation as no Slurm configuration file provided");
return ORTE_ERR_NOT_FOUND;
}
/* track this request */
jtrk = OBJ_NEW(local_jobtracker_t);
jtrk->jobid = jdata->jobid;
opal_list_append(&jobs, &jtrk->super);
/* construct the command - note that the jdata structure contains
* a field for the minimum number of nodes required for the job.
* The node list can be constructed from the union of all the nodes
* contained in the dash_host field of the app_contexts. So you'll
* need to do a little work to build the command. We don't currently
* have a field in the jdata structure for "mandatory" vs "optional"
* allocations, so we'll have to add that someday. Likewise, you may
* want to provide a param to adjust the timeout value
*/
/* construct the cmd string */
opal_argv_append_nosize(&cmd, "allocate");
/* add the jobid */
orte_util_convert_jobid_to_string(&jstring, jdata->jobid);
asprintf(&tmp, "jobid=%s", jstring);
opal_argv_append_nosize(&cmd, tmp);
free(tmp);
free(jstring);
/* if we want the allocation for all apps in one shot,
* then tell slurm
*
* RHC: we don't currently have the ability to handle
* rolling allocations in the rest of the code base
*/
#if 0
if (!mca_ras_slurm_component.rolling_alloc) {
opal_argv_append_nosize(&cmd, "return=all");
}
#else
opal_argv_append_nosize(&cmd, "return=all");
#endif
/* pass the timeout */
asprintf(&tmp, "timeout=%d", mca_ras_slurm_component.timeout);
opal_argv_append_nosize(&cmd, tmp);
free(tmp);
/* for each app, add its allocation request info */
i64ptr = &i64;
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
/* add the app id, preceded by a colon separator */
asprintf(&tmp, ": app=%d", (int)app->idx);
opal_argv_append_nosize(&cmd, tmp);
free(tmp);
/* add the number of process "slots" we need */
asprintf(&tmp, "np=%d", app->num_procs);
opal_argv_append_nosize(&cmd, tmp);
free(tmp);
/* if we were given a minimum number of nodes, pass it along */
if (orte_get_attribute(&app->attributes, ORTE_APP_MIN_NODES, (void**)&i64ptr, OPAL_INT64)) {
asprintf(&tmp, "N=%ld", (long int)i64);
opal_argv_append_nosize(&cmd, tmp);
free(tmp);
}
/* add the list of nodes, if one was given, ensuring
* that each node only appears once
*/
node_list = get_node_list(app);
if (NULL != node_list) {
asprintf(&tmp, "node_list=%s", node_list);
opal_argv_append_nosize(&cmd, tmp);
free(node_list);
free(tmp);
}
/* add the mandatory/optional flag */
if (orte_get_attribute(&app->attributes, ORTE_APP_MANDATORY, NULL, OPAL_BOOL)) {
opal_argv_append_nosize(&cmd, "flag=mandatory");
} else {
opal_argv_append_nosize(&cmd, "flag=optional");
}
}
/* assemble it into the final cmd to be sent */
cmd_str = opal_argv_join(cmd, ' ');
//......... some code omitted here .........
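The ORTE_APP_MIN_NODES lookup in Example 8 shows the scalar-retrieval idiom: because the interface traffics in void**, the caller aims a local int64_t* at its own storage and passes the address of that pointer; on success the value is copied into the local variable instead of exposing the attribute's internal storage. Isolated as a hypothetical helper:
/* Hypothetical sketch of scalar retrieval through the void** interface. */
static int64_t min_nodes_or_default(orte_app_context_t *app, int64_t dflt)
{
    int64_t i64, *i64ptr = &i64;   /* point at local storage */

    if (orte_get_attribute(&app->attributes, ORTE_APP_MIN_NODES,
                           (void**)&i64ptr, OPAL_INT64)) {
        return i64;                /* the value was copied into i64 */
    }
    return dflt;
}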
Example 9: orte_rmaps_base_filter_nodes
int orte_rmaps_base_filter_nodes(orte_app_context_t *app,
opal_list_t *nodes, bool remove)
{
int rc=ORTE_ERR_TAKE_NEXT_OPTION;
char *hosts;
/* did the app_context contain a hostfile? */
if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
/* yes - filter the node list through the file, removing
* any nodes not found in the file
*/
if (ORTE_SUCCESS != (rc = orte_util_filter_hostfile_nodes(nodes, hosts, remove))) {
ORTE_ERROR_LOG(rc);
free(hosts);
return rc;
}
/** check that anything is left */
if (0 == opal_list_get_size(nodes)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, "-hostfile", hosts);
free(hosts);
return ORTE_ERR_SILENT;
}
free(hosts);
}
/* did the app_context contain an add-hostfile? */
if (orte_get_attribute(&app->attributes, ORTE_APP_ADD_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
/* yes - filter the node list through the file, removing
* any nodes not found in the file
*/
if (ORTE_SUCCESS != (rc = orte_util_filter_hostfile_nodes(nodes, hosts, remove))) {
free(hosts);
ORTE_ERROR_LOG(rc);
return rc;
}
/** check that anything is left */
if (0 == opal_list_get_size(nodes)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, "-add-hostfile", hosts);
free(hosts);
return ORTE_ERR_SILENT;
}
free(hosts);
}
/* now filter the list through any -host specification */
if (!orte_soft_locations &&
orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) {
if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(nodes, hosts, remove))) {
ORTE_ERROR_LOG(rc);
free(hosts);
return rc;
}
/** check that anything is left! */
if (0 == opal_list_get_size(nodes)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, "-host", hosts);
free(hosts);
return ORTE_ERR_SILENT;
}
free(hosts);
}
/* now filter the list through any add-host specification */
if (orte_get_attribute(&app->attributes, ORTE_APP_ADD_HOST, (void**)&hosts, OPAL_STRING)) {
if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(nodes, hosts, remove))) {
ORTE_ERROR_LOG(rc);
free(hosts);
return rc;
}
/** check that anything is left! */
if (0 == opal_list_get_size(nodes)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, "-add-host", hosts);
free(hosts);
return ORTE_ERR_SILENT;
}
free(hosts);
}
return rc;
}
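Every block in Example 9 has the same shape: fetch a string attribute, filter the node list, verify something survived, free the string. The free(hosts) on each exit path is mandatory because OPAL_STRING retrievals return a caller-owned copy. For the two hostfile variants, the repetition could be factored into a helper along these lines (hypothetical, not in ORTE; error logging omitted for brevity):
/* Hypothetical helper factoring the repeated hostfile filter block. */
static int filter_by_hostfile_attr(orte_app_context_t *app, opal_list_t *nodes,
                                   orte_attribute_key_t key, const char *optname,
                                   bool remove)
{
    int rc;
    char *hosts;

    if (!orte_get_attribute(&app->attributes, key, (void**)&hosts, OPAL_STRING)) {
        return ORTE_ERR_TAKE_NEXT_OPTION;   /* attribute was not given */
    }
    rc = orte_util_filter_hostfile_nodes(nodes, hosts, remove);
    if (ORTE_SUCCESS == rc && 0 == opal_list_get_size(nodes)) {
        orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
                       true, app->app, optname, hosts);
        rc = ORTE_ERR_SILENT;
    }
    free(hosts);   /* the returned string is a copy we own */
    return rc;
}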
Example 10: orte_plm_base_recv
/* process incoming messages in order of receipt */
void orte_plm_base_recv(int status, orte_process_name_t* sender,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata)
{
orte_plm_cmd_flag_t command;
orte_std_cntr_t count;
orte_jobid_t job;
orte_job_t *jdata, *parent;
opal_buffer_t *answer;
orte_vpid_t vpid;
orte_proc_t *proc;
orte_proc_state_t state;
orte_exit_code_t exit_code;
int32_t rc=ORTE_SUCCESS, ret;
orte_app_context_t *app, *child_app;
orte_process_name_t name;
pid_t pid;
bool running;
int i;
char **env;
char *prefix_dir;
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:receive processing msg",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_PLM_CMD))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
switch (command) {
case ORTE_PLM_LAUNCH_JOB_CMD:
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:receive job launch command from %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender)));
/* unpack the job object */
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jdata, &count, ORTE_JOB))) {
ORTE_ERROR_LOG(rc);
goto ANSWER_LAUNCH;
}
/* record the sender so we know who to respond to */
jdata->originator.jobid = sender->jobid;
jdata->originator.vpid = sender->vpid;
/* get the parent's job object */
if (NULL != (parent = orte_get_job_data_object(sender->jobid))) {
/* if the prefix was set in the parent's job, we need to transfer
* that prefix to the child's app_context so any further launch of
* orteds can find the correct binary. There always has to be at
* least one app_context in both parent and child, so we don't
* need to check that here. However, be sure not to overwrite
* the prefix if the user already provided it!
*/
app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0);
child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
prefix_dir = NULL;
if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&prefix_dir, OPAL_STRING) &&
!orte_get_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, NULL, OPAL_STRING)) {
orte_set_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, prefix_dir, OPAL_STRING);
}
if (NULL != prefix_dir) {
free(prefix_dir);
}
}
/* if the user asked to forward any envars, cycle through the app contexts
* in the comm_spawn request and add them
*/
if (NULL != orte_forwarded_envars) {
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
env = opal_environ_merge(orte_forwarded_envars, app->env);
opal_argv_free(app->env);
app->env = env;
}
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:receive adding hosts",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* process any add-hostfile and add-host options that were provided */
if (ORTE_SUCCESS != (rc = orte_ras_base_add_hosts(jdata))) {
ORTE_ERROR_LOG(rc);
goto ANSWER_LAUNCH;
}
if (NULL != parent) {
if (NULL == parent->bookmark) {
/* find the sender's node in the job map */
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, sender->vpid))) {
//......... some code omitted here .........
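The prefix handoff at the top of Example 10 is a conditional copy: fetch the value from the parent's app context, probe the child with a NULL data pointer to see whether the user already supplied one, and only then store it. The probe-before-set shape in isolation (hypothetical helper, same API):
/* Hypothetical sketch: copy a string attribute only if the target lacks one. */
static void inherit_string_attr(orte_app_context_t *from, orte_app_context_t *to,
                                orte_attribute_key_t key)
{
    char *val = NULL;

    if (orte_get_attribute(&from->attributes, key, (void**)&val, OPAL_STRING) &&
        !orte_get_attribute(&to->attributes, key, NULL, OPAL_STRING)) {
        orte_set_attribute(&to->attributes, key, ORTE_ATTR_GLOBAL, val, OPAL_STRING);
    }
    if (NULL != val) {
        free(val);   /* our copy; orte_set_attribute stored its own */
    }
}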
Example 11: orte_rmaps_base_get_target_nodes
/*
* Query the registry for all nodes allocated to a specified app_context
*/
int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr_t *total_num_slots,
orte_app_context_t *app, orte_mapping_policy_t policy,
bool initial_map, bool silent)
{
opal_list_item_t *item, *next;
orte_node_t *node, *nd, *nptr;
orte_std_cntr_t num_slots;
orte_std_cntr_t i;
int rc;
orte_job_t *daemons;
bool novm;
opal_list_t nodes;
char *hosts;
/** set default answer */
*total_num_slots = 0;
/* get the daemon job object */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
/* see if we have a vm or not */
novm = orte_get_attribute(&daemons->attributes, ORTE_JOB_NO_VM, NULL, OPAL_BOOL);
/* if this is NOT a managed allocation, then we use the nodes
* that were specified for this app - there is no need to collect
* all available nodes and "filter" them
*/
if (!orte_managed_allocation) {
OBJ_CONSTRUCT(&nodes, opal_list_t);
/* if the app provided a dash-host, and we are not treating
* them as requested or "soft" locations, then use those nodes
*/
if (!orte_soft_locations &&
orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) {
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
"%s using dash_host %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts));
if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, hosts))) {
ORTE_ERROR_LOG(rc);
free(hosts);
return rc;
}
free(hosts);
} else if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
/* otherwise, if the app provided a hostfile, then use that */
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
"%s using hostfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts));
if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, hosts))) {
free(hosts);
ORTE_ERROR_LOG(rc);
return rc;
}
free(hosts);
} else if (NULL != orte_rankfile) {
/* use the rankfile, if provided */
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
"%s using rankfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_rankfile));
if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
orte_rankfile))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 == opal_list_get_size(&nodes)) {
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
"%s nothing found in given rankfile",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
OBJ_DESTRUCT(&nodes);
return ORTE_ERR_BAD_PARAM;
}
} else if (NULL != orte_default_hostfile) {
/* fall back to the default hostfile, if provided */
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
"%s using default hostfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_default_hostfile));
if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
orte_default_hostfile))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* this is a special case - we always install a default
* hostfile, but it is empty. If the user didn't remove it
* or put something into it, then we will have pursued that
* option and found nothing. This isn't an error, we just need
* to add all the known nodes
*/
if (0 == opal_list_get_size(&nodes)) {
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
"%s nothing in default hostfile - using known nodes",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
goto addknown;
}
} else {
/* if nothing else was available, then use all known nodes, which
* will include ourselves
//......... some code omitted here .........
Example 12: bind_in_place
//......... some code omitted here .........
}
}
}
/* some systems do not report cores, and so we can get a situation where our
* default binding policy will fail for no necessary reason. So if we are
* computing a binding due to our default policy, and no cores are found
* on this node, just silently skip it - we will not bind
*/
if (!OPAL_BINDING_POLICY_IS_SET(map->binding) &&
HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"Unable to bind-to core by default on node %s as no cores detected",
node->name);
continue;
}
/* we share topologies in order
* to save space, so we need to reset the usage info to reflect
* our own current state
*/
reset_usage(node, jdata->jobid);
/* cycle thru the procs */
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
continue;
}
/* bozo check */
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
return ORTE_ERR_SILENT;
}
/* get the index of this location */
if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, locale, OPAL_HWLOC_AVAILABLE))) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_SILENT;
}
data = (opal_hwloc_obj_data_t*)locale->userdata;
/* get the number of cpus under this location */
if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, locale))) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
return ORTE_ERR_SILENT;
}
/* if we don't have enough cpus to support this additional proc, try
* shifting the location to a cousin that can support it - the important
* thing is that we maintain the same level in the topology */
if (ncpus < (data->num_bound+1)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"%s bind_in_place: searching right",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
sib = locale;
found = false;
while (NULL != (sib = sib->next_cousin)) {
data = (opal_hwloc_obj_data_t*)sib->userdata;
ncpus = opal_hwloc_base_get_npus(node->topology, sib);
if (data->num_bound < ncpus) {
found = true;
locale = sib;
break;
}
}
Example 13: bind_downwards
static int bind_downwards(orte_job_t *jdata,
orte_node_t *node,
hwloc_obj_type_t target,
unsigned cache_level)
{
int j;
orte_job_map_t *map;
orte_proc_t *proc;
hwloc_obj_t trg_obj, nxt_obj;
hwloc_cpuset_t cpus;
unsigned int ncpus;
opal_hwloc_obj_data_t *data;
int total_cpus;
hwloc_cpuset_t totalcpuset;
hwloc_obj_t locale;
char *cpu_bitmap;
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: bind downward for job %s with bindings %s",
ORTE_JOBID_PRINT(jdata->jobid),
opal_hwloc_base_print_binding(jdata->map->binding));
/* initialize */
map = jdata->map;
totalcpuset = hwloc_bitmap_alloc();
/* cycle thru the procs */
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
continue;
}
/* bozo check */
locale = NULL;
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
hwloc_bitmap_free(totalcpuset);
return ORTE_ERR_SILENT;
}
/* we don't know if the target is a direct child of this locale,
* or if it is some depth below it, so we have to conduct a bit
* of a search. Let hwloc find the min usage one for us.
*/
trg_obj = opal_hwloc_base_find_min_bound_target_under_obj(node->topology, locale,
target, cache_level);
if (NULL == trg_obj) {
/* there aren't any such targets under this object */
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
hwloc_bitmap_free(totalcpuset);
return ORTE_ERR_SILENT;
}
/* record the location */
orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, trg_obj, OPAL_PTR);
/* start with a clean slate */
hwloc_bitmap_zero(totalcpuset);
total_cpus = 0;
nxt_obj = trg_obj;
do {
if (NULL == nxt_obj) {
/* could not find enough cpus to meet request */
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
hwloc_bitmap_free(totalcpuset);
return ORTE_ERR_SILENT;
}
trg_obj = nxt_obj;
/* get the number of cpus under this location */
ncpus = opal_hwloc_base_get_npus(node->topology, trg_obj);
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"%s GOT %d CPUS",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ncpus);
/* track the number bound */
if (NULL == (data = (opal_hwloc_obj_data_t*)trg_obj->userdata)) {
data = OBJ_NEW(opal_hwloc_obj_data_t);
trg_obj->userdata = data;
}
data->num_bound++;
/* error out if adding a proc would cause overload and that wasn't allowed,
* and it wasn't a default binding policy (i.e., the user requested it)
*/
if (ncpus < data->num_bound &&
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
/* if the user specified a binding policy, then we cannot meet
* it since overload isn't allowed, so error out - have the
* message indicate that setting overload allowed will remove
* this restriction */
orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
opal_hwloc_base_print_binding(map->binding), node->name,
data->num_bound, ncpus);
hwloc_bitmap_free(totalcpuset);
return ORTE_ERR_SILENT;
} else {
/* if we have the default binding policy, then just don't bind */
OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
unbind_procs(jdata);
hwloc_bitmap_free(totalcpuset);  /* release the working set before returning */
return ORTE_SUCCESS;
}
//......... some code omitted here .........
Example 14: job_errors
static void job_errors(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
orte_job_t *jdata;
orte_job_state_t jobstate;
orte_exit_code_t sts;
orte_proc_t *aborted_proc;
opal_buffer_t *answer;
int32_t rc, ret;
/*
* if orte is trying to shutdown, just let it
*/
if (orte_finalizing) {
return;
}
/* if the jdata is NULL, then we abort as this
* is reporting an unrecoverable error
*/
if (NULL == caddy->jdata) {
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);
OBJ_RELEASE(caddy);
return;
}
/* update the state */
jdata = caddy->jdata;
jobstate = caddy->job_state;
jdata->state = jobstate;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
"%s errmgr:orcmsd_hnp: job %s reported state %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid),
orte_job_state_to_str(jobstate)));
if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
ORTE_JOB_STATE_MAP_FAILED == jobstate ||
ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
if (1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
/* this is the primary job */
orte_never_launched = true;
}
/* disable routing as we may not have performed the daemon
* wireup - e.g., in a managed environment, all the daemons
* "phone home", but don't actually wireup into the routed
* network until they receive the launch message
*/
orte_routing_is_enabled = false;
jdata->num_terminated = jdata->num_procs;
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
/* if it was a dynamic spawn, then we better tell them this didn't work */
if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
rc = jobstate;
answer = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
OBJ_RELEASE(caddy);
return;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(ret);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
OBJ_RELEASE(caddy);
return;
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s errmgr:hnp sending dyn error release of job %s to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid),
ORTE_NAME_PRINT(&jdata->originator)));
if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
ORTE_RML_TAG_PLM_PROXY,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
}
}
OBJ_RELEASE(caddy);
return;
}
if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
* NULL, then we need to tell everyone else to die
*/
aborted_proc = NULL;
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) {
sts = aborted_proc->exit_code;
if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
if (WIFSIGNALED(sts)) { /* died on signal */
#ifdef WCOREDUMP
if (WCOREDUMP(sts)) {
orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
//......... some code omitted here .........
Example 15: dump_aborted_procs
static void dump_aborted_procs(void)
{
orte_std_cntr_t i, n;
orte_proc_t *proc, *pptr;
orte_app_context_t *approc;
orte_job_t *job;
orte_node_t *node;
/* find the job that caused the problem - be sure to start the loop
* at 1 as the daemons are in 0 and will clearly be "running", so no
* point in checking them
*/
for (n=1; n < orte_job_data->size; n++) {
if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) {
/* the array is no longer left-justified, so we have to continue */
continue;
}
if (ORTE_JOB_STATE_UNDEF != job->state &&
ORTE_JOB_STATE_INIT != job->state &&
ORTE_JOB_STATE_RUNNING != job->state &&
ORTE_JOB_STATE_TERMINATED != job->state &&
ORTE_JOB_STATE_ABORT_ORDERED != job->state) {
/* cycle through and count the number that were killed or aborted */
for (i=0; i < job->procs->size; i++) {
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(job->procs, i))) {
/* the array may have holes - just skip this slot */
continue;
}
if (ORTE_PROC_STATE_FAILED_TO_START == pptr->state ||
ORTE_PROC_STATE_FAILED_TO_LAUNCH == pptr->state) {
++num_failed_start;
} else if (ORTE_PROC_STATE_ABORTED == pptr->state) {
++num_aborted;
} else if (ORTE_PROC_STATE_ABORTED_BY_SIG == pptr->state) {
++num_killed;
} else if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == pptr->state) {
++num_killed;
}
}
/* this is a guilty party */
proc = NULL;
if (!orte_get_attribute(&job->attributes, ORTE_JOB_ABORTED_PROC, (void**)&proc, OPAL_PTR) ||
NULL == proc) {
continue;
}
approc = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, proc->app_idx);
node = proc->node;
if (ORTE_JOB_STATE_FAILED_TO_START == job->state ||
ORTE_JOB_STATE_FAILED_TO_LAUNCH == job->state) {
switch (proc->exit_code) {
case ORTE_ERR_SILENT:
/* say nothing - it was already reported */
break;
case ORTE_ERR_SYS_LIMITS_PIPES:
orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true,
orte_basename, proc->node->name,
(unsigned long)proc->name.vpid);
break;
case ORTE_ERR_PIPE_SETUP_FAILURE:
orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true,
orte_basename, proc->node->name,
(unsigned long)proc->name.vpid);
break;
case ORTE_ERR_SYS_LIMITS_CHILDREN:
orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true,
orte_basename, proc->node->name,
(unsigned long)proc->name.vpid);
break;
case ORTE_ERR_FAILED_GET_TERM_ATTRS:
orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true,
orte_basename, proc->node->name,
(unsigned long)proc->name.vpid);
break;
case ORTE_ERR_WDIR_NOT_FOUND:
orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true,
orte_basename, approc->cwd,
proc->node->name, (unsigned long)proc->name.vpid);
break;
case ORTE_ERR_EXE_NOT_FOUND:
orte_show_help("help-orterun.txt", "orterun:exe-not-found", true,
orte_basename,
(unsigned long)proc->name.vpid,
orte_basename,
orte_basename,
proc->node->name,
approc->app);
break;
case ORTE_ERR_EXE_NOT_ACCESSIBLE:
orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true,
orte_basename, approc->app, proc->node->name,
(unsigned long)proc->name.vpid);
break;
case ORTE_ERR_MULTIPLE_AFFINITIES:
orte_show_help("help-orterun.txt",
"orterun:multiple-paffinity-schemes", true, NULL);
break;
case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
//......... some code omitted here .........
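A final detail worth imitating from Example 15: even when orte_get_attribute() reports success for an OPAL_PTR attribute, the code still guards against a NULL stored pointer before dereferencing, since a pointer attribute can legitimately hold NULL. As a hypothetical accessor:
/* Hypothetical accessor: resolve the aborted proc, tolerating an absent
 * attribute as well as one whose stored pointer is NULL. */
static orte_proc_t* get_aborted_proc(orte_job_t *job)
{
    orte_proc_t *proc = NULL;

    if (!orte_get_attribute(&job->attributes, ORTE_JOB_ABORTED_PROC,
                            (void**)&proc, OPAL_PTR) || NULL == proc) {
        return NULL;   /* nothing recorded, or recorded as NULL */
    }
    return proc;
}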