"Fossies" - the Fresh Open Source Software Archive

Member "hydra-3.3.2/tools/ckpoint/blcr/ckpoint_blcr.c" (12 Nov 2019, 10394 Bytes) of package /linux/misc/hydra-3.3.2.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "ckpoint_blcr.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 3.3.1_vs_3.3.2.

    1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
    2 /*
    3  *  (C) 2008 by Argonne National Laboratory.
    4  *      See COPYRIGHT in top-level directory.
    5  */
    6 
    7 #include "hydra.h"
    8 #include "ckpoint.h"
    9 #include "ckpoint_blcr.h"
   10 #include <libcr.h>
   11 
   12 static int my_callback(void *arg)
   13 {
   14     int rc;
   15 
   16     rc = cr_checkpoint(CR_CHECKPOINT_OMIT);
   17 
   18     switch (rc) {
   19         case -CR_ETEMPFAIL:
   20             /* One of the processes indicated that it couldn't take the checkpoint now.  Try again later. */
   21             return -1;
   22             break;
   23         case -CR_EPERMFAIL:
   24             /* One of the processes indicated a permanent failure */
   25             return -1;
   26             break;
   27         case -CR_EOMITTED:
   28             /* This is the expected return */
   29             break;
   30         default:
   31             /* Something bad happened */
   32             return -1;
   33     }
   34 
   35 
   36     return 0;
   37 }
   38 
   39 static HYD_status create_env_file(const struct HYD_env *envlist, int num_ranks, int *ranks)
   40 {
   41     HYD_status status = HYD_SUCCESS;
   42     char filename[256];
   43     FILE *f;
   44     const struct HYD_env *e;
   45     int ret;
   46     int r;
   47 
   48     HYDU_FUNC_ENTER();
   49 
   50     for (r = 0; r < num_ranks; ++r) {
   51         MPL_snprintf(filename, sizeof(filename), "/tmp/hydra-env-file-%d:%d", (int) getpid(),
   52                      ranks[r]);
   53 
   54         f = fopen(filename, "w");
   55         HYDU_ERR_CHKANDJUMP(status, f == NULL, HYD_INTERNAL_ERROR, "fopen failed: %s\n",
   56                             strerror(errno));
   57 
   58         for (e = envlist; e; e = e->next) {
   59             fprintf(f, "%s=%s\n", e->env_name, e->env_value);
   60         }
   61 
   62         ret = fclose(f);
   63         HYDU_ERR_CHKANDJUMP(status, ret, HYD_INTERNAL_ERROR, "fclose failed: %s\n",
   64                             strerror(errno));
   65     }
   66 
   67   fn_exit:
   68     HYDU_FUNC_EXIT();
   69     return status;
   70 
   71   fn_fail:
   72     goto fn_exit;
   73 }
   74 
   75 static int listen_fd;
   76 
   77 static HYD_status create_stdinouterr_sock(int *port)
   78 {
   79     HYD_status status = HYD_SUCCESS;
   80     int ret;
   81     unsigned short t_port;
   82     MPL_sockaddr_t addr;
   83     HYDU_FUNC_ENTER();
   84 
   85     listen_fd = MPL_socket();
   86     HYDU_ERR_CHKANDJUMP(status, listen_fd < 0, HYD_INTERNAL_ERROR, "socket() failed, %s\n",
   87                         strerror(errno));
   88 
   89     MPL_LISTEN_PUSH(1, SOMAXCONN);
   90     ret = MPL_listen_anyport(listen_fd, &t_port);
   91     *port = t_port;
   92     MPL_LISTEN_POP;
   93     HYDU_ERR_CHKANDJUMP(status, ret, HYD_INTERNAL_ERROR, "listen() failed, %s\n", strerror(errno));
   94 
   95   fn_exit:
   96     HYDU_FUNC_EXIT();
   97     return status;
   98   fn_fail:
   99     goto fn_exit;
  100 }
  101 
  102 typedef struct sock_ident {
  103     int rank;
  104     enum { IN_SOCK, OUT_SOCK, ERR_SOCK } socktype;
  105     int pid;
  106 } sock_ident_t;
  107 
  108 /* This waits for the restarted processes to reconnect their
  109    stdin/out/err sockets, then sets the appropriate entries in the in
  110    out and err arrays.  This also gets the pids of the restarted
  111    processes. */
  112 static HYD_status wait_for_stdinouterr_sockets(int num_ranks, int *ranks, int *in, int *out,
  113                                                int *err, int *pid)
  114 {
  115     HYD_status status = HYD_SUCCESS;
  116     int ret;
  117     int fd;
  118     int i, c;
  119     sock_ident_t id;
  120     int num_expected_connections = num_ranks * 2;       /* wait for connections for stdout and err */
  121     HYDU_FUNC_ENTER();
  122 
  123     /* if one of the processes is rank 0, we should wait for an
  124      * additional connection for stdin */
  125     for (i = 0; i < num_ranks; ++i)
  126         if (ranks[i] == 0) {
  127             ++num_expected_connections;
  128             break;
  129         }
  130 
  131     for (c = 0; c < num_expected_connections; ++c) {
  132         size_t len;
  133         char *id_p;
  134         /* wait for a connection */
  135         do {
  136             MPL_sockaddr_t rmt_addr;
  137             socklen_t sa_len = sizeof(rmt_addr);;
  138             fd = accept(listen_fd, (struct sockaddr *) &rmt_addr, &sa_len);
  139         } while (fd && errno == EINTR);
  140         HYDU_ERR_CHKANDJUMP(status, fd == -1, HYD_INTERNAL_ERROR, "accept failed, %s\n",
  141                             strerror(errno));
  142 
  143         /* read the socket identifier */
  144         len = sizeof(id);
  145         id_p = (char *) &id;
  146         do {
  147             do {
  148                 ret = read(fd, id_p, len);
  149             } while (ret == 0 || (ret == -1 && errno == EINTR));
  150             HYDU_ERR_CHKANDJUMP(status, ret == -1, HYD_INTERNAL_ERROR, "read failed, %s\n",
  151                                 strerror(errno));
  152             len -= ret;
  153             id_p += ret;
  154         } while (len);
  155 
  156         /* determine the index for this process in the stdout/err
  157          * arrays */
  158         for (i = 0; i < num_ranks; ++i)
  159             if (ranks[i] == id.rank)
  160                 break;
  161         HYDU_ASSERT(i < num_ranks, status);
  162 
  163         /* assign the fd */
  164         switch (id.socktype) {
  165             case IN_SOCK:
  166                 HYDU_ASSERT(id.rank == 0, status);
  167                 *in = fd;
  168                 break;
  169             case OUT_SOCK:
  170                 out[i] = fd;
  171                 break;
  172             case ERR_SOCK:
  173                 err[i] = fd;
  174                 break;
  175             default:
  176                 HYDU_ASSERT(0, status);
  177                 break;
  178         }
  179 
  180         /* assign the pid */
  181         pid[i] = id.pid;
  182     }
  183 
  184     ret = close(listen_fd);
  185     HYDU_ERR_CHKANDJUMP(status, ret, HYD_INTERNAL_ERROR, "close of listener port failed, %s\n",
  186                         strerror(errno));
  187 
  188 
  189   fn_exit:
  190     HYDU_FUNC_EXIT();
  191     return status;
  192   fn_fail:
  193     goto fn_exit;
  194 }
  195 
  196 
  197 
  198 
  199 HYD_status HYDT_ckpoint_blcr_checkpoint(const char *prefix, int pgid, int id, int ckpt_num)
  200 {
  201     HYD_status status = HYD_SUCCESS;
  202     int ret;
  203     int fd;
  204     cr_checkpoint_args_t my_args;
  205     cr_checkpoint_handle_t my_handle;
  206     char filename[256];
  207 
  208     HYDU_FUNC_ENTER();
  209 
  210     /* build the checkpoint filename */
  211     MPL_snprintf(filename, sizeof(filename), "%s/context-num%d-%d-%d", prefix, ckpt_num, pgid, id);
  212 
  213     /* remove existing checkpoint file, if any */
  214     (void) unlink(filename);
  215 
  216     /* open the checkpoint file */
  217     fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC /* | O_LARGEFILE */ , 0600);
  218     HYDU_ERR_CHKANDJUMP(status, fd < 0, HYD_INTERNAL_ERROR, "open failed: %s\n", strerror(errno));
  219 
  220     cr_initialize_checkpoint_args_t(&my_args);
  221     my_args.cr_fd = fd;
  222     my_args.cr_scope = CR_SCOPE_TREE;
  223 
  224     /* issue the request */
  225     ret = cr_request_checkpoint(&my_args, &my_handle);
  226     if (ret < 0) {
  227         HYDU_ERR_CHKANDJUMP(status, errno == CR_ENOSUPPORT, HYD_INTERNAL_ERROR,
  228                             "Checkpointing failed.  Make sure BLCR kernel module is loaded. %s\n",
  229                             strerror(errno));
  230         HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "cr_request_checkpoint failed, %s\n",
  231                             strerror(errno));
  232     }
  233     /* wait for the request to complete */
  234     while (1) {
  235         ret = cr_poll_checkpoint(&my_handle, NULL);
  236         if (ret < 0) {
  237             if ((ret == CR_POLL_CHKPT_ERR_POST) && (errno == CR_ERESTARTED)) {
  238                 HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
  239                                     "trying to restart in a checkpoint\n");
  240             } else if (errno == EINTR) {
  241                 /* poll was interrupted by a signal -- retry */
  242             } else {
  243                 HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
  244                                     "cr_poll_checkpoint failed: %s\n", strerror(errno));
  245             }
  246         } else if (ret == 0) {
  247             HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
  248                                 "cr_poll_checkpoint returned 0 unexpectedly\n");
  249         } else {
  250             break;
  251         }
  252     }
  253 
  254     ret = close(my_args.cr_fd);
  255     HYDU_ERR_CHKANDJUMP(status, ret, HYD_INTERNAL_ERROR, "close failed, %s\n", strerror(errno));
  256 
  257   fn_exit:
  258     HYDU_FUNC_EXIT();
  259     return status;
  260 
  261   fn_fail:
  262     goto fn_exit;
  263 }
  264 
  265 #define STDINOUTERR_PORT_NAME "CKPOINT_STDINOUTERR_PORT"
  266 
  267 HYD_status HYDT_ckpoint_blcr_restart(const char *prefix, int pgid, int id, int ckpt_num,
  268                                      struct HYD_env *envlist, int num_ranks, int ranks[],
  269                                      int *in, int *out, int *err, int *pid)
  270 {
  271     HYD_status status = HYD_SUCCESS;
  272     int ret;
  273     int context_fd;
  274     cr_restart_handle_t cr_handle;
  275     cr_restart_args_t args;
  276     char filename[256];
  277     char port_str[64];
  278     int port;
  279 
  280     HYDU_FUNC_ENTER();
  281 
  282     /* create listener socket for stdin/out/err */
  283     status = create_stdinouterr_sock(&port);
  284     HYDU_ERR_POP(status, "failed to create stdin/out/err socket\n");
  285     MPL_snprintf(port_str, sizeof(port_str), "%d", port);
  286     status = HYDU_append_env_to_list(STDINOUTERR_PORT_NAME, port_str, &envlist);
  287     HYDU_ERR_POP(status, "failed to add to env list\n");
  288 
  289     status = create_env_file(envlist, num_ranks, ranks);
  290     if (status)
  291         HYDU_ERR_POP(status, "blcr restart\n");
  292 
  293     /* open the checkpoint file */
  294     MPL_snprintf(filename, sizeof(filename), "%s/context-num%d-%d-%d", prefix, ckpt_num, pgid, id);
  295     context_fd = open(filename, O_RDONLY /* | O_LARGEFILE */);
  296     HYDU_ERR_CHKANDJUMP(status, context_fd < 0, HYD_INTERNAL_ERROR, "open failed, %s\n",
  297                         strerror(errno));
  298 
  299     /* ... initialize the request structure */
  300     cr_initialize_restart_args_t(&args);
  301     args.cr_fd = context_fd;
  302     args.cr_flags = CR_RSTRT_RESTORE_PID;
  303 
  304     /* ... issue the request */
  305     ret = cr_request_restart(&args, &cr_handle);
  306     HYDU_ERR_CHKANDJUMP(status, ret, HYD_INTERNAL_ERROR, "cr_request_restart failed, %s\n",
  307                         strerror(errno));
  308 
  309     ret = close(context_fd);
  310     HYDU_ERR_CHKANDJUMP(status, ret, HYD_INTERNAL_ERROR, "close failed, %s\n", strerror(errno));
  311 
  312     /* get fds for stdin/out/err sockets, and get pids of restarted processes */
  313     status = wait_for_stdinouterr_sockets(num_ranks, ranks, in, out, err, pid);
  314     if (status)
  315         HYDU_ERR_POP(status, "blcr restart\n");
  316 
  317   fn_exit:
  318     HYDU_FUNC_EXIT();
  319     return status;
  320 
  321   fn_fail:
  322     goto fn_exit;
  323 }
  324 
  325 HYD_status HYDT_ckpoint_blcr_init(void)
  326 {
  327     HYD_status status = HYD_SUCCESS;
  328     int rc;
  329     cr_client_id_t client_id;
  330     cr_callback_id_t callback_id;
  331 
  332     HYDU_FUNC_ENTER();
  333 
  334     client_id = (int) cr_init();
  335     if (client_id < 0)
  336         goto fn_fail;
  337 
  338     callback_id = cr_register_callback(my_callback, &rc, CR_SIGNAL_CONTEXT);
  339     if (callback_id < 0)
  340         goto fn_fail;
  341 
  342   fn_exit:
  343     HYDU_FUNC_EXIT();
  344     return status;
  345 
  346   fn_fail:
  347     goto fn_exit;
  348 }