本文整理汇总了C++中ORTE_ERROR_LOG函数的典型用法代码示例。如果您正苦于以下问题：C++ ORTE_ERROR_LOG函数的具体用法？C++ ORTE_ERROR_LOG怎么用？C++ ORTE_ERROR_LOG使用的例子？那么，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了ORTE_ERROR_LOG函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: orte_session_dir_finalize
int
orte_session_dir_finalize(orte_process_name_t *proc)
{
int rc;
char *tmp;
char *job, *job_session_dir, *vpid, *proc_session_dir;
/* need to setup the top_session_dir with the prefix */
tmp = opal_os_path(false,
orte_process_info.tmpdir_base,
orte_process_info.top_session_dir, NULL);
/* define the proc and job session directories for this process */
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid_string(&job, proc))) {
ORTE_ERROR_LOG(rc);
free(tmp);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_ns.get_vpid_string(&vpid, proc))) {
ORTE_ERROR_LOG(rc);
free(tmp);
free(job);
return rc;
}
job_session_dir = opal_os_path( false, orte_process_info.universe_session_dir,
job, NULL );
if( NULL == job_session_dir ) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
free(tmp);
free(job);
free(vpid);
return ORTE_ERR_OUT_OF_RESOURCE;
}
proc_session_dir = opal_os_path( false, job_session_dir, vpid, NULL );
if( NULL == proc_session_dir ) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
free(tmp);
free(job);
free(vpid);
free(job_session_dir);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_os_dirpath_destroy(proc_session_dir,
false, orte_dir_check_file);
opal_os_dirpath_destroy(job_session_dir,
false, orte_dir_check_file);
opal_os_dirpath_destroy(orte_process_info.universe_session_dir,
false, orte_dir_check_file);
opal_os_dirpath_destroy(tmp,
false, orte_dir_check_file);
if (opal_os_dirpath_is_empty(proc_session_dir)) {
if (orte_debug_flag) {
opal_output(0, "sess_dir_finalize: found proc session dir empty - deleting");
}
rmdir(proc_session_dir);
} else {
if (orte_debug_flag) {
opal_output(0, "sess_dir_finalize: proc session dir not empty - leaving");
}
goto CLEANUP;
}
if (opal_os_dirpath_is_empty(job_session_dir)) {
if (orte_debug_flag) {
opal_output(0, "sess_dir_finalize: found job session dir empty - deleting");
}
rmdir(job_session_dir);
} else {
if (orte_debug_flag) {
opal_output(0, "sess_dir_finalize: job session dir not empty - leaving");
}
goto CLEANUP;
}
if (opal_os_dirpath_is_empty(orte_process_info.universe_session_dir)) {
if (orte_debug_flag) {
opal_output(0, "sess_dir_finalize: found univ session dir empty - deleting");
}
rmdir(orte_process_info.universe_session_dir);
} else {
if (orte_debug_flag) {
opal_output(0, "sess_dir_finalize: univ session dir not empty - leaving");
}
goto CLEANUP;
}
if (opal_os_dirpath_is_empty(tmp)) {
if (orte_debug_flag) {
opal_output(0, "sess_dir_finalize: found top session dir empty - deleting");
}
rmdir(tmp);
} else {
if (orte_debug_flag) {
opal_output(0, "sess_dir_finalize: top session dir not empty - leaving");
}
}
CLEANUP:
//.........这里部分代码省略.........
示例2: mca_oob_ud_send_self
/*
 * Deliver a message addressed to this process without going over the wire.
 *
 * A receive request matching the message tag is obtained and its storage
 * allocated. The payload is then copied into the request: iovec-to-iovec
 * when msg->iov is set, otherwise via a copy of msg->buffer. Finally the
 * request is marked complete and queued so the receive callback runs.
 *
 * @param msg  the send descriptor to loop back
 * @return the payload size computed by MCA_OOB_UD_IOV_SIZE on success,
 *         otherwise the error code of the step that failed
 */
static int mca_oob_ud_send_self (orte_rml_send_t *msg)
{
    unsigned int srco, dsto;
    mca_oob_ud_req_t *req;
    int srci, dsti;
    int rc, size;

    MCA_OOB_UD_IOV_SIZE(msg, size);

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s mca_oob_ud_send_self: sending %d bytes to myself",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), size);

    rc = mca_oob_ud_get_recv_req (*ORTE_PROC_MY_NAME, msg->tag, &req, (msg->iov != NULL) ? true : false);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    req->req_rem_data_len = size;
    req->req_is_eager = true;

    rc = mca_oob_ud_recv_alloc (req);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
            free (req->req_data.iov.uiov);
        }
        OBJ_RELEASE(req);
        return rc;
    }

    srci = dsti = 0;
    srco = dsto = 0;

    if (msg->iov != NULL) {
        req->req_data_type = MCA_OOB_UD_REQ_IOV;

        do {
            /* copy as much as fits in both the current source and
             * destination segments */
            size_t copy = min(msg->iov[srci].iov_len - srco,
                              req->req_data.iov.uiov[dsti].iov_len - dsto);

            memmove ((unsigned char *) req->req_data.iov.uiov[dsti].iov_base + dsto,
                     (unsigned char *) msg->iov[srci].iov_base + srco, copy);

            /* advance to the next source segment once this one is drained */
            srco += copy;
            if (srco == msg->iov[srci].iov_len) {
                srci++;
                srco = 0;
            }

            /* advance to the next destination segment once this one is full */
            dsto += copy;
            if (dsto == req->req_data.iov.uiov[dsti].iov_len) {
                dsti++;
                dsto = 0;
            }
            /* srci walks msg->iov (msg->count entries) and dsti walks the
             * request's iovec (req_data.iov.count entries); each index must
             * be bounded by the length of the array it actually indexes */
        } while (srci < msg->count && dsti < req->req_data.iov.count);
    } else {
        req->req_data_type = MCA_OOB_UD_REQ_BUF;

        opal_buffer_t *buffer;
        buffer = OBJ_NEW(opal_buffer_t);

        if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(buffer, msg->buffer))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buffer);
            /* drop the request as the recv_alloc failure path above does */
            OBJ_RELEASE(req);
            return rc;
        }

        if (OPAL_SUCCESS != (rc = opal_dss.unload(buffer, (void **)&req->req_data.buf.p, &req->req_data.buf.size)))
        {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buffer);
            free(req->req_data.buf.p);
            /* clear the pointer so nothing can free it a second time */
            req->req_data.buf.p = NULL;
            OBJ_RELEASE(req);
            return rc;
        }

        OBJ_RELEASE(buffer);
    }

    req->state = MCA_OOB_UD_REQ_COMPLETE;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s mca_oob_ud_send_self: complete. calling callbacks",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* queue up recv callback */
    mca_oob_ud_event_queue_completed (req);

    req->rml_msg->status = ORTE_SUCCESS;

    return size;
}
示例3: orte_sstore_base_get_all_snapshots
/*
 * Collect every global snapshot found directly under a base directory.
 *
 * A subdirectory counts as a snapshot when it contains a regular file named
 * orte_sstore_base_global_metadata_filename. For each one, a new
 * orte_sstore_base_global_snapshot_info_t is appended to all_snapshots.
 *
 * @param all_snapshots  list that receives the discovered snapshots
 * @param basedir        directory to scan; when NULL,
 *                       orte_sstore_base_global_snapshot_dir is used
 * @return ORTE_SUCCESS on success; ORTE_ERR_NOT_SUPPORTED when built without
 *         HAVE_DIRENT_H; ORTE_ERROR on bad arguments or when the directory
 *         cannot be opened; ORTE_ERR_OUT_OF_RESOURCE on allocation failure
 */
int orte_sstore_base_get_all_snapshots(opal_list_t *all_snapshots, char *basedir)
{
#ifndef HAVE_DIRENT_H
    return ORTE_ERR_NOT_SUPPORTED;
#else
    int ret, exit_status = ORTE_SUCCESS;
    char *loc_basedir = NULL;
    char *tmp_str = NULL, *metadata_file = NULL;
    DIR *dirp = NULL;
    struct dirent *dir_entp = NULL;
    struct stat file_status;
    orte_sstore_base_global_snapshot_info_t *global_snapshot = NULL;

    /* Sanity check */
    if (NULL == all_snapshots ||
        (NULL == orte_sstore_base_global_snapshot_dir && NULL == basedir)) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    loc_basedir = strdup(NULL == basedir ? orte_sstore_base_global_snapshot_dir
                                         : basedir);
    if (NULL == loc_basedir) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        exit_status = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

    /*
     * Get all subdirectories under the base directory
     */
    dirp = opendir(loc_basedir);
    if (NULL == dirp) {
        /* readdir() on a NULL stream is undefined - bail out instead */
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    while (NULL != (dir_entp = readdir(dirp))) {
        /* Skip "." and "..". NOTE: the original prefix comparison (kept
         * here) matches every name beginning with '.', so all other
         * dot-entries are skipped as well. */
        if ('.' == dir_entp->d_name[0]) {
            continue;
        }

        /* Build the full path of the entry */
        ret = asprintf(&tmp_str, "%s/%s", loc_basedir, dir_entp->d_name);
        if (0 > ret) {
            tmp_str = NULL;  /* contents are undefined after a failure */
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            exit_status = ORTE_ERR_OUT_OF_RESOURCE;
            break;
        }

        /* Only directories can hold a snapshot */
        if (0 != stat(tmp_str, &file_status) ||
            !S_ISDIR(file_status.st_mode)) {
            free(tmp_str);
            tmp_str = NULL;
            continue;
        }

        ret = asprintf(&metadata_file, "%s/%s",
                       tmp_str,
                       orte_sstore_base_global_metadata_filename);
        if (0 > ret) {
            metadata_file = NULL;
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            exit_status = ORTE_ERR_OUT_OF_RESOURCE;
            break;
        }

        /* A regular metadata file marks the directory as a snapshot */
        if (0 == stat(metadata_file, &file_status) &&
            S_ISREG(file_status.st_mode)) {
            global_snapshot = OBJ_NEW(orte_sstore_base_global_snapshot_info_t);
            if (NULL == global_snapshot) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                break;
            }

            global_snapshot->ss_handle = 1;
            global_snapshot->basedir   = strdup(loc_basedir);
            global_snapshot->reference = strdup(dir_entp->d_name);
            /* metadata_file already holds basedir/reference/metadata name,
             * so hand it to the snapshot instead of formatting it again */
            global_snapshot->metadata_filename = metadata_file;
            metadata_file = NULL;

            if (NULL == global_snapshot->basedir ||
                NULL == global_snapshot->reference) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                OBJ_RELEASE(global_snapshot);
                break;
            }

            opal_list_append(all_snapshots, &(global_snapshot->super));
        }

        free(metadata_file);
        metadata_file = NULL;
        free(tmp_str);
        tmp_str = NULL;
    }

    closedir(dirp);

 cleanup:
    /* free(NULL) is a no-op, so no guards are needed */
    free(loc_basedir);
    free(tmp_str);
    free(metadata_file);

    return exit_status;
#endif /* HAVE_DIRENT_H */
}
示例4: orte_dt_unpack_proc
/*
 * PROC
 *
 * Unpack *num_vals orte_proc_t objects from the buffer into the pointer
 * array passed as dest. Each object is created here; the fields are read
 * in the order name, parent, local rank, node rank, state, app index and
 * finally a counted list of attributes. The first failure is logged and
 * returned immediately.
 */
int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
                        int32_t *num_vals, opal_data_type_t type)
{
    orte_proc_t **procs = (orte_proc_t**) dest;
    orte_proc_t *proc;
    orte_attribute_t *attr;
    int32_t idx, a, cnt, nattrs;
    int rc;

    for (idx = 0; idx < *num_vals; idx++) {
        /* create the object and store it in the caller's array */
        proc = OBJ_NEW(orte_proc_t);
        procs[idx] = proc;
        if (NULL == proc) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        /* process name */
        cnt = 1;
        rc = opal_dss_unpack_buffer(buffer, &(proc->name), &cnt, ORTE_NAME);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* parent (packed as an ORTE_VPID) */
        cnt = 1;
        rc = opal_dss_unpack_buffer(buffer, (&(proc->parent)), &cnt, ORTE_VPID);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* local rank */
        cnt = 1;
        rc = opal_dss_unpack_buffer(buffer, (&(proc->local_rank)), &cnt,
                                    ORTE_LOCAL_RANK);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* node rank */
        cnt = 1;
        rc = opal_dss_unpack_buffer(buffer, (&(proc->node_rank)), &cnt,
                                    ORTE_NODE_RANK);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* process state */
        cnt = 1;
        rc = opal_dss_unpack_buffer(buffer, (&(proc->state)), &cnt,
                                    ORTE_PROC_STATE);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* app context index */
        cnt = 1;
        rc = opal_dss_unpack_buffer(buffer, (&(proc->app_idx)), &cnt,
                                    ORTE_STD_CNTR);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* attributes: a count followed by that many entries */
        cnt = 1;
        rc = opal_dss_unpack_buffer(buffer, &nattrs, &cnt, ORTE_STD_CNTR);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        for (a = 0; a < nattrs; a++) {
            cnt = 1;
            rc = opal_dss_unpack_buffer(buffer, &attr, &cnt, ORTE_ATTRIBUTE);
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            /* anything that arrived in a buffer is not a local value */
            attr->local = ORTE_ATTR_GLOBAL;
            opal_list_append(&proc->attributes, &attr->super);
        }
    }

    return ORTE_SUCCESS;
}
示例5: orte_dt_unpack_map
/*
 * JOB_MAP
 *
 * Unpack *num_vals orte_job_map_t objects from the buffer into the pointer
 * array passed as dest. Node information is never packed with a map, so
 * none is read here. Fields are read in the order requested mapper,
 * mapping policy, ranking policy, binding policy (only when built with
 * OPAL_HAVE_HWLOC), ppr, display-map flag and node count. The first
 * failure is logged and returned immediately.
 */
int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
                       int32_t *num_vals, opal_data_type_t type)
{
    orte_job_map_t **maps = (orte_job_map_t**) dest;
    orte_job_map_t *map;
    int32_t idx, cnt;
    int rc;

    for (idx = 0; idx < *num_vals; idx++) {
        /* create the object and store it in the caller's array */
        map = OBJ_NEW(orte_job_map_t);
        maps[idx] = map;
        if (NULL == map) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        /* requested mapper */
        cnt = 1;
        rc = opal_dss_unpack_buffer(buffer, &(map->req_mapper), &cnt,
                                    OPAL_STRING);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* mapping policy */
        cnt = 1;
        rc = opal_dss_unpack_buffer(buffer, &(map->mapping), &cnt,
                                    ORTE_MAPPING_POLICY);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* ranking policy */
        cnt = 1;
        rc = opal_dss_unpack_buffer(buffer, &(map->ranking), &cnt,
                                    ORTE_RANKING_POLICY);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

#if OPAL_HAVE_HWLOC
        /* binding policy */
        cnt = 1;
        rc = opal_dss_unpack_buffer(buffer, &(map->binding), &cnt,
                                    OPAL_BINDING_POLICY);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
#endif

        /* ppr */
        cnt = 1;
        rc = opal_dss_unpack_buffer(buffer, &(map->ppr), &cnt, OPAL_STRING);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* display map flag */
        cnt = 1;
        rc = opal_dss_unpack_buffer(buffer, &(map->display_map), &cnt,
                                    OPAL_BOOL);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* number of nodes involved in the job */
        cnt = 1;
        rc = opal_dss_unpack_buffer(buffer, &(map->num_nodes), &cnt,
                                    OPAL_UINT32);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    return ORTE_SUCCESS;
}
示例6: main
//.........这里部分代码省略.........
}
/* report out our URI, if we were requested to do so, using syntax
* proposed in an email thread by Jeff Squyres
*/
if (NULL != report_uri) {
rml_uri = orte_rml.get_contact_info();
if (0 == strcmp(report_uri, "-")) {
/* if '-', then output to stdout */
printf("%s\n", rml_uri);
} else if (0 == strcmp(report_uri, "+")) {
/* if '+', output to stderr */
fprintf(stderr, "%s\n", rml_uri);
} else {
/* treat it as a filename and output into it */
FILE *fp;
fp = fopen(report_uri, "w");
if (NULL == fp) {
fprintf(stderr, "ompi-server: failed to open designated file %s -- aborting\n", report_uri);
orte_finalize();
exit(1);
}
fprintf(fp, "%s\n", rml_uri);
fclose(fp);
}
free(rml_uri);
}
/* setup the data server to listen for commands */
if (ORTE_SUCCESS != (ret = orte_data_server_init())) {
fprintf(stderr, "ompi-server: failed to start data server -- aborting\n");
orte_finalize();
exit(1);
}
/* setup to listen for commands sent specifically to me */
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);
if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(ret);
orte_finalize();
exit(1);
}
/* Set signal handlers to catch kill signals so we can properly clean up
* after ourselves.
*/
opal_event_set(opal_event_base, &term_handler, SIGTERM, OPAL_EV_SIGNAL,
shutdown_callback, NULL);
opal_event_add(&term_handler, NULL);
opal_event_set(opal_event_base, &int_handler, SIGINT, OPAL_EV_SIGNAL,
shutdown_callback, NULL);
opal_event_add(&int_handler, NULL);
/* We actually do *not* want the server to voluntarily yield() the
processor more than necessary. The server already blocks when
it is doing nothing, so it doesn't use any more CPU cycles than
it should; but when it *is* doing something, we do not want it
to be unnecessarily delayed because it voluntarily yielded the
processor in the middle of its work.
For example: when a message arrives at the server, we want the
OS to wake up the server in a timely fashion (which most OS's
seem good about doing) and then we want the server to process
the message as fast as possible. If the server yields and lets
aggressive MPI applications get the processor back, it may be a
long time before the OS schedules the server to run again
(particularly if there is no IO event to wake it up). Hence,
publish and lookup (for example) may be significantly delayed
before being delivered to MPI processes, which can be
problematic in some scenarios (e.g., COMM_SPAWN). */
opal_progress_set_yield_when_idle(false);
/* Change the default behavior of libevent such that we want to
continually block rather than blocking for the default timeout
and then looping around the progress engine again. There
should be nothing in the server that cannot block in libevent
until "something" happens (i.e., there's no need to keep
cycling through progress because the only things that should
happen will happen in libevent). This is a minor optimization,
but what the heck... :-) */
opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);
if (debug) {
opal_output(0, "%s ompi-server: up and running!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
/* wait to hear we are done */
while (orte_event_base_active) {
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
}
/* should never get here, but if we do... */
/* Finalize and clean up ourselves */
if (ORTE_SUCCESS != (ret = orte_finalize())) {
ORTE_ERROR_LOG(ret);
}
return ret;
}
示例7: orte_dt_unpack_job
/*
* JOB
* NOTE: We do not pack all of the job object's fields as many of them have no
* value in sending them to another location. The only purpose in packing and
* sending a job object is to communicate the data required to dynamically
* spawn another job - so we only pack that limited set of required data.
* Therefore, only unpack what was packed
*/
int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type)
{
int rc;
int32_t i, k, n, count;
orte_job_t **jobs;
orte_app_idx_t j;
orte_attribute_t *kv;
/* unpack into array of orte_job_t objects */
jobs = (orte_job_t**) dest;
for (i=0; i < *num_vals; i++) {
/* create the orte_job_t object */
jobs[i] = OBJ_NEW(orte_job_t);
if (NULL == jobs[i]) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* unpack the jobid */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&(jobs[i]->jobid), &n, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the num apps */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->num_apps)), &n, ORTE_APP_IDX))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if there are apps, unpack them */
if (0 < jobs[i]->num_apps) {
orte_app_context_t *app;
for (j=0; j < jobs[i]->num_apps; j++) {
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&app, &n, ORTE_APP_CONTEXT))) {
ORTE_ERROR_LOG(rc);
return rc;
}
opal_pointer_array_add(jobs[i]->apps, app);
}
}
/* unpack num procs and offset */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->num_procs)), &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->offset)), &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* and the procs, if provided */
if (0 < jobs[i]->num_procs) {
orte_proc_t *proc;
for (j=0; j < jobs[i]->num_procs; j++) {
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&proc, &n, ORTE_PROC))) {
ORTE_ERROR_LOG(rc);
return rc;
}
opal_pointer_array_add(jobs[i]->procs, proc);
}
}
/* unpack stdin target */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->stdin_target)), &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the total slots allocated to the job */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(jobs[i]->total_slots_alloc)), &n, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}
//.........这里部分代码省略.........
示例8: recv_data
//.........这里部分代码省略.........
opal_event_del(&jtrk->timeout_ev);
/* cycle across all the remaining parts - each is the allocation for
* an app in this job
*/
OBJ_CONSTRUCT(&nds, opal_list_t);
OBJ_CONSTRUCT(&ndtmp, opal_list_t);
idx = -1;
sjob = -1;
nodelist = NULL;
for (i=1; NULL != alloc[i]; i++) {
if (ORTE_SUCCESS != parse_alloc_msg(alloc[i], &idx, &sjob, &nodelist, &tpn)) {
orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
return;
}
if (idx < 0 || NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
opal_argv_free(alloc);
return;
}
/* track the Slurm jobid */
if (NULL == (aptrk = (local_apptracker_t*)opal_pointer_array_get_item(&jtrk->apps, idx))) {
aptrk = OBJ_NEW(local_apptracker_t);
opal_pointer_array_set_item(&jtrk->apps, idx, aptrk);
}
aptrk->sjob = sjob;
/* release the current dash_host as that contained the *desired* allocation */
opal_argv_free(app->dash_host);
app->dash_host = NULL;
/* since the nodelist/tpn may contain regular expressions, parse them */
if (ORTE_SUCCESS != (rc = orte_ras_slurm_discover(nodelist, tpn, &ndtmp))) {
ORTE_ERROR_LOG(rc);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
opal_argv_free(alloc);
return;
}
/* transfer the discovered nodes to our node list, and construct
* the new dash_host entry to match what was allocated
*/
while (NULL != (item = opal_list_remove_first(&ndtmp))) {
nd = (orte_node_t*)item;
opal_argv_append_nosize(&app->dash_host, nd->name);
/* check for duplicates */
found = false;
for (itm = opal_list_get_first(&nds);
itm != opal_list_get_end(&nds);
itm = opal_list_get_next(itm)) {
nd2 = (orte_node_t*)itm;
if (0 == strcmp(nd->name, nd2->name)) {
found = true;
nd2->slots += nd->slots;
OBJ_RELEASE(item);
break;
}
}
if (!found) {
/* append the new node to our list */
opal_list_append(&nds, item);
}
}
/* cleanup */
free(nodelist);
free(tpn);
}
示例9: dyn_allocate
//.........这里部分代码省略.........
* need to do a little work to build the command. We don't currently
* have a field in the jdata structure for "mandatory" vs "optional"
* allocations, so we'll have to add that someday. Likewise, you may
* want to provide a param to adjust the timeout value
*/
/* construct the cmd string */
opal_argv_append_nosize(&cmd, "allocate");
/* add the jobid */
orte_util_convert_jobid_to_string(&jstring, jdata->jobid);
asprintf(&tmp, "jobid=%s", jstring);
opal_argv_append_nosize(&cmd, tmp);
free(tmp);
free(jstring);
/* if we want the allocation for all apps in one shot,
* then tell slurm
*
* RHC: we don't currently have the ability to handle
* rolling allocations in the rest of the code base
*/
#if 0
if (!mca_ras_slurm_component.rolling_alloc) {
opal_argv_append_nosize(&cmd, "return=all");
}
#else
opal_argv_append_nosize(&cmd, "return=all");
#endif
/* pass the timeout */
asprintf(&tmp, "timeout=%d", mca_ras_slurm_component.timeout);
opal_argv_append_nosize(&cmd, tmp);
free(tmp);
/* for each app, add its allocation request info */
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
/* add the app id, preceded by a colon separator */
asprintf(&tmp, ": app=%d", (int)app->idx);
opal_argv_append_nosize(&cmd, tmp);
free(tmp);
/* add the number of process "slots" we need */
asprintf(&tmp, "np=%d", app->num_procs);
opal_argv_append_nosize(&cmd, tmp);
free(tmp);
/* if we were given a minimum number of nodes, pass it along */
if (0 < app->min_number_of_nodes) {
asprintf(&tmp, "N=%ld", (long int)app->min_number_of_nodes);
opal_argv_append_nosize(&cmd, tmp);
free(tmp);
}
/* add the list of nodes, if one was given, ensuring
* that each node only appears once
*/
node_list = get_node_list(app);
if (NULL != node_list) {
asprintf(&tmp, "node_list=%s", node_list);
opal_argv_append_nosize(&cmd, tmp);
free(node_list);
free(tmp);
}
/* add the mandatory/optional flag */
if (app->mandatory) {
opal_argv_append_nosize(&cmd, "flag=mandatory");
} else {
opal_argv_append_nosize(&cmd, "flag=optional");
}
}
/* assemble it into the final cmd to be sent */
cmd_str = opal_argv_join(cmd, ' ');
opal_argv_free(cmd);
/* start a timer - if the response to our request doesn't appear
* in the defined time, then we will error out as Slurm isn't
* responding to us
*/
opal_event_evtimer_set(orte_event_base, &jtrk->timeout_ev, timeout, jtrk);
tv.tv_sec = mca_ras_slurm_component.timeout * 2;
tv.tv_usec = 0;
opal_event_evtimer_add(&jtrk->timeout_ev, &tv);
opal_output_verbose(2, orte_ras_base_framework.framework_output,
"%s slurm:dynalloc cmd_str = %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
cmd_str);
if (send(socket_fd, cmd_str, strlen(cmd_str)+1, 0) < 0) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
}
free(cmd_str);
/* we cannot wait here for a response as we
* are already in an event. So return a value
* that indicates we are waiting for an
* allocation so the base functions know
* that they shouldn't progress the job
*/
return ORTE_ERR_ALLOCATION_PENDING;
}
示例10: orte_ras_slurm_discover
/**
* Discover the available resources.
*
* In order to fully support slurm, we need to be able to handle
* node regexp/task_per_node strings such as:
* foo,bar 5,3
* foo 5
* foo[2-10,12,99-105],bar,foobar[3-11] 2(x10),5,100(x16)
*
* @param *regexp A node regular expression from SLURM (i.e. SLURM_NODELIST)
* @param *tasks_per_node A tasks per node expression from SLURM
* (i.e. SLURM_TASKS_PER_NODE)
* @param *nodelist A list which has already been constructed to return
* the found nodes in
*/
static int orte_ras_slurm_discover(char *regexp, char *tasks_per_node,
opal_list_t* nodelist)
{
int i, j, len, ret, count, reps, num_nodes;
char *base, **names = NULL;
char *begptr, *endptr, *orig;
int *slots;
bool found_range = false;
bool more_to_come = false;
orig = base = strdup(regexp);
if (NULL == base) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
"%s ras:slurm:allocate:discover: checking nodelist: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
regexp));
do {
/* Find the base */
len = strlen(base);
for (i = 0; i <= len; ++i) {
if (base[i] == '[') {
/* we found a range. this gets dealt with below */
base[i] = '\0';
found_range = true;
break;
}
if (base[i] == ',') {
/* we found a singleton node, and there are more to come */
base[i] = '\0';
found_range = false;
more_to_come = true;
break;
}
if (base[i] == '\0') {
/* we found a singleton node */
found_range = false;
more_to_come = false;
break;
}
}
if(i == 0) {
/* we found a special character at the beginning of the string */
orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value",
1, regexp, tasks_per_node, "SLURM_NODELIST");
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
free(orig);
return ORTE_ERR_BAD_PARAM;
}
if (found_range) {
/* If we found a range, now find the end of the range */
for (j = i; j < len; ++j) {
if (base[j] == ']') {
base[j] = '\0';
break;
}
}
if (j >= len) {
/* we didn't find the end of the range */
orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value",
1, regexp, tasks_per_node, "SLURM_NODELIST");
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
free(orig);
return ORTE_ERR_BAD_PARAM;
}
ret = orte_ras_slurm_parse_ranges(base, base + i + 1, &names);
if(ORTE_SUCCESS != ret) {
orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value",
1, regexp, tasks_per_node, "SLURM_NODELIST");
ORTE_ERROR_LOG(ret);
free(orig);
return ret;
}
if(base[j + 1] == ',') {
more_to_come = true;
base = &base[j + 2];
} else {
more_to_come = false;
}
//.........这里部分代码省略.........
示例11: orte_ras_slurm_parse_range
/*
* Parse a single range in a set and add the full names of the nodes
* found to the names argv
*
* @param base The base text of the node name
* @param *ranges A pointer to a single range. (i.e. "1-3" or "5")
* @param ***names An argv array to add the newly discovered nodes to
*/
static int orte_ras_slurm_parse_range(char *base, char *range, char ***names)
{
char *str, temp1[BUFSIZ];
size_t i, j, start, end;
size_t base_len, len, num_len;
size_t num_str_len;
bool found;
int ret;
len = strlen(range);
base_len = strlen(base);
/* Silence compiler warnings; start and end are always assigned
properly, below */
start = end = 0;
/* Look for the beginning of the first number */
for (found = false, i = 0; i < len; ++i) {
if (isdigit((int) range[i])) {
if (!found) {
start = atoi(range + i);
found = true;
break;
}
}
}
if (!found) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* Look for the end of the first number */
for (found = false, num_str_len = 0; i < len; ++i, ++num_str_len) {
if (!isdigit((int) range[i])) {
break;
}
}
/* Was there no range, just a single number? */
if (i >= len) {
end = start;
found = true;
}
/* Nope, there was a range. Look for the beginning of the second
number */
else {
for (; i < len; ++i) {
if (isdigit((int) range[i])) {
end = atoi(range + i);
found = true;
break;
}
}
}
if (!found) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* Make strings for all values in the range */
len = base_len + num_str_len + 32;
str = malloc(len);
if (NULL == str) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
strcpy(str, base);
for (i = start; i <= end; ++i) {
str[base_len] = '\0';
snprintf(temp1, BUFSIZ - 1, "%lu", (long) i);
/* Do we need zero pading? */
if ((num_len = strlen(temp1)) < num_str_len) {
for (j = base_len; j < base_len + (num_str_len - num_len); ++j) {
str[j] = '0';
}
str[j] = '\0';
}
strcat(str, temp1);
ret = opal_argv_append_nosize(names, str);
if(ORTE_SUCCESS != ret) {
ORTE_ERROR_LOG(ret);
free(str);
return ret;
}
}
//.........这里部分代码省略.........
示例12: orte_ras_slurm_allocate
/**
* Discover available (pre-allocated) nodes. Allocate the
* requested number of nodes/process slots to the job.
*
*/
static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes)
{
int ret, cpus_per_task;
char *slurm_node_str, *regexp;
char *tasks_per_node, *node_tasks;
char *tmp;
char *slurm_jobid;
if (NULL == (slurm_jobid = getenv("SLURM_JOBID"))) {
/* we are not in a slurm allocation - see if dyn alloc
* is enabled
*/
if (!mca_ras_slurm_component.dyn_alloc_enabled) {
/* nope - nothing we can do */
opal_output_verbose(2, orte_ras_base_framework.framework_output,
"%s ras:slurm: no prior allocation and dynamic alloc disabled",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
} else {
/* save this value in the global job ident string for
* later use in any error reporting
*/
orte_job_ident = strdup(slurm_jobid);
}
slurm_node_str = getenv("SLURM_NODELIST");
if (NULL == slurm_node_str) {
/* see if dynamic allocation is enabled */
if (mca_ras_slurm_component.dyn_alloc_enabled) {
/* attempt to get the allocation - the function
* dyn_allocate will return as ORTE_ERR_ALLOCATION_PENDING
* if it succeeds in sending the allocation request
*/
ret = dyn_allocate(jdata);
/* return to the above layer in ras/base/ras_base_allocate.c
* to wait for event (libevent) happening
*/
return ret;
}
orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
"SLURM_NODELIST");
return ORTE_ERR_NOT_FOUND;
}
regexp = strdup(slurm_node_str);
tasks_per_node = getenv("SLURM_JOB_CPUS_PER_NODE");
if (NULL == tasks_per_node) {
/* try an older variation */
tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
if (NULL == tasks_per_node) {
/* couldn't find any version - abort */
orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
"SLURM_TASKS_PER_NODE");
return ORTE_ERR_NOT_FOUND;
}
}
node_tasks = strdup(tasks_per_node);
if(NULL == regexp || NULL == node_tasks) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* get the number of CPUs per task that the user provided to slurm */
tmp = getenv("SLURM_CPUS_PER_TASK");
if(NULL != tmp) {
cpus_per_task = atoi(tmp);
if(0 >= cpus_per_task) {
opal_output(0, "ras:slurm:allocate: Got bad value from SLURM_CPUS_PER_TASK. "
"Variable was: %s\n", tmp);
ORTE_ERROR_LOG(ORTE_ERROR);
return ORTE_ERROR;
}
} else {
cpus_per_task = 1;
}
ret = orte_ras_slurm_discover(regexp, node_tasks, nodes);
free(regexp);
free(node_tasks);
if (ORTE_SUCCESS != ret) {
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
"%s ras:slurm:allocate: discover failed!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
return ret;
}
/* record the number of allocated nodes */
orte_num_allocated_nodes = opal_list_get_size(nodes);
/* All done */
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
"%s ras:slurm:allocate: success",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
//.........这里部分代码省略.........
示例13: init
/*
 * Initialize the module.
 *
 * Nothing is done unless dynamic allocation is enabled. When it is, the
 * Slurm host and port are read from the configured file, a TCP socket is
 * connected to that endpoint and switched to non-blocking mode, a read
 * event is registered on it with recv_data as the callback, and the list
 * used to track dynamic-allocation jobs is constructed.
 *
 * @return ORTE_SUCCESS on success (or when dynamic allocation is disabled);
 *         ORTE_ERR_SILENT when the configuration, name resolution or
 *         connection fails (a help message is shown where one exists);
 *         ORTE_ERR_OUT_OF_RESOURCE when no socket or memory is available;
 *         ORTE_ERROR when fcntl() fails.
 *
 * NOTE(review): socket_fd is not declared in this block and is left open on
 * the error returns that follow socket(); closing it needs close() from
 * <unistd.h>, whose inclusion is not visible from here - confirm and fix at
 * file level.
 */
static int init(void)
{
    char *slurm_host = NULL;
    uint16_t port = 0;
    struct sockaddr_in address;
    int flags;
    struct hostent *h;

    if (!mca_ras_slurm_component.dyn_alloc_enabled) {
        return ORTE_SUCCESS;
    }

    if (NULL == mca_ras_slurm_component.config_file) {
        orte_show_help("help-ras-slurm.txt", "dyn-alloc-no-config", true);
        return ORTE_ERR_SILENT;
    }

    /* setup the socket */
    if (ORTE_SUCCESS != read_ip_port(mca_ras_slurm_component.config_file,
                                     &slurm_host, &port) ||
        NULL == slurm_host || 0 == port) {
        /* the host may have been found even though the port was not */
        free(slurm_host);
        return ORTE_ERR_SILENT;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_ras_base_framework.framework_output,
                         "ras:slurm got [ ip = %s, port = %u ] from %s\n",
                         slurm_host, port, mca_ras_slurm_component.config_file));

    /* obtain a socket for our use */
    if ((socket_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(slurm_host);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* connect to the Slurm dynamic allocation port */
    memset(&address, 0, sizeof(address));
    address.sin_family = AF_INET;
    if (!opal_net_isaddr(slurm_host)) {
        /* if the ControlMachine was not specified as an IP address,
         * we need to resolve it here
         */
        if (NULL == (h = gethostbyname(slurm_host))) {
            /* could not resolve it */
            orte_show_help("help-ras-slurm.txt", "host-not-resolved",
                           true, slurm_host);
            free(slurm_host);
            return ORTE_ERR_SILENT;
        }
        free(slurm_host);
        slurm_host = strdup(inet_ntoa(*(struct in_addr*)h->h_addr_list[0]));
        if (NULL == slurm_host) {
            /* inet_addr() must not be handed a NULL string */
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
    }
    address.sin_addr.s_addr = inet_addr(slurm_host);
    address.sin_port = htons(port);
    if (connect(socket_fd, (struct sockaddr*)&address, sizeof(address)) < 0) {
        orte_show_help("help-ras-slurm.txt", "connection-failed",
                       true, slurm_host, (int)port);
        free(slurm_host);
        return ORTE_ERR_SILENT;
    }

    /* the host string is not needed once the connection is up */
    free(slurm_host);
    slurm_host = NULL;

    /* set socket up to be non-blocking */
    if ((flags = fcntl(socket_fd, F_GETFL, 0)) < 0) {
        opal_output(0, "ras:slurm:dyn: fcntl(F_GETFL) failed: %s (%d)",
                    strerror(opal_socket_errno), opal_socket_errno);
        return ORTE_ERROR;
    }
    flags |= O_NONBLOCK;
    if (fcntl(socket_fd, F_SETFL, flags) < 0) {
        opal_output(0, "ras:slurm:dyn: fcntl(F_SETFL) failed: %s (%d)",
                    strerror(opal_socket_errno), opal_socket_errno);
        return ORTE_ERROR;
    }

    /* setup to recv data */
    opal_event_set(orte_event_base, &recv_ev, socket_fd,
                   OPAL_EV_READ, recv_data, NULL);
    opal_event_add(&recv_ev, 0);

    /* initialize the list of jobs for tracking dynamic allocations */
    OBJ_CONSTRUCT(&jobs, opal_list_t);

    return ORTE_SUCCESS;
}
示例14: orte_ras_base_open
/**
 * Open the RAS (resource allocation) framework.
 *
 * Sets up the framework's verbose output stream, resets the base state
 * flags, registers the ORTE_RAS_NODE type with the DSS, and then either
 *   - installs the no-op module when the "ras" MCA parameter is "null"
 *     (no components are opened in that case), or
 *   - opens all available components and, when this process is not the
 *     seed (HNP), installs and initializes the proxy module.
 *
 * @return ORTE_SUCCESS on success; the DSS error code if type
 *         registration fails; ORTE_ERROR if the "ras" parameter lookup
 *         or the component open fails.
 */
int orte_ras_base_open(void)
{
    int value, rc, param;
    orte_data_type_t tmp;
    char *requested = NULL;
    bool use_null;

    /* Debugging / verbose output.  The stream is opened only once the
     * verbosity parameter is known: opening it up front and then
     * overwriting the handle below would leak the first stream.
     */
    mca_base_param_reg_int_name("ras", "base_verbose",
                                "Enable debugging for the RAS framework (nonzero = enabled)",
                                false, false, 0, &value);
    if (value != 0) {
        orte_ras_base.ras_output = opal_output_open(NULL);
    } else {
        orte_ras_base.ras_output = -1;
    }

    /* Defaults */
    orte_ras_base.ras_opened_valid = false;
    orte_ras_base.ras_using_proxy = false;
    orte_ras_base.ras_available_valid = false;

    /** register the base system types with the DSS */
    tmp = ORTE_RAS_NODE;
    if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_ras_base_pack_node,
                                                     orte_ras_base_unpack_node,
                                                     (orte_dss_copy_fn_t)orte_ras_base_copy_node,
                                                     (orte_dss_compare_fn_t)orte_ras_base_compare_node,
                                                     (orte_dss_size_fn_t)orte_ras_base_size_node,
                                                     (orte_dss_print_fn_t)orte_ras_base_print_node,
                                                     (orte_dss_release_fn_t)orte_ras_base_std_obj_release,
                                                     ORTE_DSS_STRUCTURED,
                                                     "ORTE_RAS_NODE", &tmp))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* Some systems do not want any RAS support. In those cases,
     * memory consumption is also an issue. For those systems, we
     * avoid opening the RAS components by checking for a directive
     * to use the "null" component.
     */
    param = mca_base_param_reg_string_name("ras", NULL, NULL,
                                           false, false, NULL, NULL);
    if (ORTE_ERROR == mca_base_param_lookup_string(param, &requested)) {
        return ORTE_ERROR;
    }

    /* The looked-up value is only needed for this one comparison, so
     * release it immediately instead of leaking it on every return path.
     * NOTE(review): assumes the lookup hands back a heap copy owned by
     * the caller - confirm against the MCA parameter API.
     */
    use_null = (NULL != requested && 0 == strcmp(requested, "null"));
    free(requested);
    requested = NULL;

    if (use_null) {
        /* the user has specifically requested that we use the "null"
         * component. In this case, that means we do NOT open any
         * components, and we simply use the no_op module
         */
        orte_ras_base.ras_opened_valid = false;
        orte_ras = orte_ras_no_op;  /* use the no_op module */
        return ORTE_SUCCESS;
    }

    /* check for timing tests (the returned parameter index is not needed) */
    mca_base_param_reg_int_name("orte", "timing",
                                "Request that critical timing loops be measured",
                                false, false, 0, &value);
    orte_ras_base.timing = (value != 0) ? true : false;

    /* Open up all available components */
    if (ORTE_SUCCESS !=
        mca_base_components_open("ras", orte_ras_base.ras_output,
                                 mca_ras_base_static_components,
                                 &orte_ras_base.ras_opened, true)) {
        return ORTE_ERROR;
    }

    /* if we are not on a HNP, select the proxy 'module' */
    if (!orte_process_info.seed) {
        orte_ras = orte_ras_base_proxy_module;
        /* initialize the module; the value written through &rc is not
         * examined here
         */
        orte_ras_base_proxy_init(&rc);
        orte_ras_base.ras_using_proxy = true;
        return ORTE_SUCCESS;
    }

    /* All done */
    orte_ras_base.ras_opened_valid = true;
    return ORTE_SUCCESS;
}
示例15: update_state
static int update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code)
{
opal_list_item_t *item, *next;
orte_odls_job_t *jobdat = NULL;
orte_odls_child_t *child;
opal_buffer_t *alert;
orte_plm_cmd_flag_t cmd;
int rc=ORTE_SUCCESS;
orte_vpid_t null=ORTE_VPID_INVALID;
orte_ns_cmp_bitmask_t mask;
/*
* if orte is trying to shutdown, just let it
*/
if (orte_finalizing) {
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:default_orted:update_state() %s) "
"------- %s state updated for process %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
((NULL == proc) ? "App. Process" :
(proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")),
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));
/* if this is a heartbeat failure, let the HNP handle it */
if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
return ORTE_SUCCESS;
}
/*** UPDATE COMMAND FOR A JOB ***/
if (NULL == proc) {
/* this is an update for an entire job */
if (ORTE_JOBID_INVALID == job) {
/* whatever happened, we don't know what job
* it happened to
*/
orte_show_help("help-orte-errmgr.txt", "errmgr:unknown-job-error",
true, orte_job_state_to_str(jobstate));
alert = OBJ_NEW(opal_buffer_t);
/* pack update state command */
cmd = ORTE_PLM_UPDATE_PROC_STATE;
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the "invalid" jobid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
ORTE_ERROR_LOG(rc);
} else {
rc = ORTE_SUCCESS;
}
return rc;
}
/* lookup the local jobdat for this job */
jobdat = NULL;
for (item = opal_list_get_first(&orte_local_jobdata);
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jobdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jobdat->jobid == job) {
break;
}
}
if (NULL == jobdat) {
return ORTE_ERR_NOT_FOUND;
}
switch (jobstate) {
case ORTE_JOB_STATE_FAILED_TO_START:
failed_start(jobdat, exit_code);
break;
case ORTE_JOB_STATE_RUNNING:
/* update all local child states */
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING);
break;
case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
/* update all procs in job */
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
/* order all local procs for this job to be killed */
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
case ORTE_JOB_STATE_COMM_FAILED:
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* tell the caller we can't recover */
return ORTE_ERR_UNRECOVERABLE;
//.........这里部分代码省略.........