"Fossies" - the Fresh Open Source Software Archive

Member "lxc-4.0.10/src/lxc/start.c" (16 Jul 2021, 61044 Bytes) of package /linux/misc/lxc-4.0.10.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "start.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 4.0.9_vs_4.0.10.

    1 /* SPDX-License-Identifier: LGPL-2.1+ */
    2 
    3 #ifndef _GNU_SOURCE
    4 #define _GNU_SOURCE 1
    5 #endif
    6 #include <dirent.h>
    7 #include <errno.h>
    8 #include <fcntl.h>
    9 #include <grp.h>
   10 #include <poll.h>
   11 #include <pthread.h>
   12 #include <signal.h>
   13 #include <stdio.h>
   14 #include <stdlib.h>
   15 #include <string.h>
   16 #include <sys/file.h>
   17 #include <sys/mount.h>
   18 #include <sys/param.h>
   19 #include <sys/prctl.h>
   20 #include <sys/socket.h>
   21 #include <sys/stat.h>
   22 #include <sys/syscall.h>
   23 #include <sys/types.h>
   24 #include <sys/un.h>
   25 #include <sys/wait.h>
   26 #include <unistd.h>
   27 
   28 #include "af_unix.h"
   29 #include "caps.h"
   30 #include "cgroups/cgroup.h"
   31 #include "cgroups/cgroup_utils.h"
   32 #include "commands.h"
   33 #include "commands_utils.h"
   34 #include "compiler.h"
   35 #include "conf.h"
   36 #include "config.h"
   37 #include "confile_utils.h"
   38 #include "error.h"
   39 #include "file_utils.h"
   40 #include "list.h"
   41 #include "log.h"
   42 #include "lsm/lsm.h"
   43 #include "lxccontainer.h"
   44 #include "lxclock.h"
   45 #include "lxcseccomp.h"
   46 #include "macro.h"
   47 #include "mainloop.h"
   48 #include "memory_utils.h"
   49 #include "monitor.h"
   50 #include "namespace.h"
   51 #include "network.h"
   52 #include "process_utils.h"
   53 #include "start.h"
   54 #include "storage/storage.h"
   55 #include "storage/storage_utils.h"
   56 #include "sync.h"
   57 #include "syscall_wrappers.h"
   58 #include "terminal.h"
   59 #include "utils.h"
   60 
   61 #if HAVE_LIBCAP
   62 #include <sys/capability.h>
   63 #endif
   64 
   65 #ifndef HAVE_STRLCPY
   66 #include "include/strlcpy.h"
   67 #endif
   68 
   69 lxc_log_define(start, lxc);
   70 
   71 extern void mod_all_rdeps(struct lxc_container *c, bool inc);
   72 static bool do_destroy_container(struct lxc_handler *handler);
   73 static int lxc_rmdir_onedev_wrapper(void *data);
   74 static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
   75                         const char *name);
   76 
   77 static void print_top_failing_dir(const char *path)
   78 {
   79     __do_free char *copy = NULL;
   80     int ret;
   81     char *e, *p, saved;
   82 
   83     copy = must_copy_string(path);
   84     p = copy;
   85     e = copy + strlen(path);
   86 
   87     while (p < e) {
   88         while (p < e && *p == '/')
   89             p++;
   90 
   91         while (p < e && *p != '/')
   92             p++;
   93 
   94         saved = *p;
   95         *p = '\0';
   96 
   97         ret = access(copy, X_OK);
   98         if (ret != 0) {
   99             SYSERROR("Could not access %s. Please grant it x access, or add an ACL for the container " "root", copy);
  100             return;
  101         }
  102         *p = saved;
  103     }
  104 }
  105 
  106 static void lxc_put_nsfds(struct lxc_handler *handler)
  107 {
  108     for (int i = 0; i < LXC_NS_MAX; i++) {
  109         if (handler->nsfd[i] < 0)
  110             continue;
  111 
  112         close_prot_errno_disarm(handler->nsfd[i]);
  113     }
  114 }
  115 
  116 static int lxc_try_preserve_namespace(struct lxc_handler *handler,
  117                       lxc_namespace_t idx, const char *ns)
  118 {
  119     __do_close int fd = -EBADF;
  120     int ret;
  121 
  122     fd = lxc_preserve_ns(handler->pid, ns);
  123     if (fd < 0)
  124         return -errno;
  125 
  126     ret = strnprintf(handler->nsfd_paths[idx],
  127              sizeof(handler->nsfd_paths[idx]), "%s:/proc/%d/fd/%d",
  128              ns_info[idx].proc_name, handler->monitor_pid, fd);
  129     if (ret < 0)
  130         return ret_errno(EIO);
  131 
  132     /*
  133      * In case LXC is configured for exposing information to hooks as
  134      * argv-style arguments prepare an argv array we can use.
  135      */
  136     handler->hook_argv[handler->hook_argc] = handler->nsfd_paths[idx];
  137     handler->hook_argc++;
  138 
  139     DEBUG("Preserved %s namespace via fd %d and stashed path as %s",
  140           ns_info[idx].proc_name, fd, handler->nsfd_paths[idx]);
  141 
  142     handler->nsfd[idx] = move_fd(fd);
  143     return 0;
  144 }
  145 
  146 /* lxc_try_preserve_namespaces: open /proc/@pid/ns/@ns for each namespace
  147  * specified in ns_clone_flags.
  148  * Return true on success, false on failure.
  149  */
  150 static bool lxc_try_preserve_namespaces(struct lxc_handler *handler,
  151                     int ns_clone_flags)
  152 {
  153     for (lxc_namespace_t ns_idx = 0; ns_idx < LXC_NS_MAX; ns_idx++)
  154         handler->nsfd[ns_idx] = -EBADF;
  155 
  156     for (lxc_namespace_t ns_idx = 0; ns_idx < LXC_NS_MAX; ns_idx++) {
  157         int ret;
  158         const char *ns = ns_info[ns_idx].proc_name;
  159 
  160         if ((ns_clone_flags & ns_info[ns_idx].clone_flag) == 0)
  161             continue;
  162 
  163         ret = lxc_try_preserve_namespace(handler, ns_idx,
  164                          ns_info[ns_idx].proc_name);
  165         if (ret < 0) {
  166             if (ret == -ENOENT) {
  167                 SYSERROR("Kernel does not support preserving %s namespaces", ns);
  168                 continue;
  169             }
  170 
  171             /*
  172              * Handle kernels that do not support interacting with
  173              * namespaces through procfs.
  174              */
  175             lxc_put_nsfds(handler);
  176             return log_error_errno(false, errno, "Failed to preserve %s namespace", ns);
  177         }
  178     }
  179 
  180     return true;
  181 }
  182 
  183 static inline bool match_stdfds(int fd)
  184 {
  185     return (fd == STDIN_FILENO || fd == STDOUT_FILENO || fd == STDERR_FILENO);
  186 }
  187 
  188 #ifdef HAVE_DLOG
  189 static bool match_dlog_fds(struct dirent *direntp)
  190 {
  191     char path[PATH_MAX] = {0};
  192     char link[PATH_MAX] = {0};
  193     ssize_t linklen;
  194     int ret;
  195 
  196     ret = strnprintf(path, sizeof(path), "/proc/self/fd/%s", direntp->d_name);
  197     if (ret < 0)
  198         return log_error(false, "Failed to create file descriptor name");
  199 
  200     linklen = readlink(path, link, PATH_MAX);
  201     if (linklen < 0)
  202         return log_error(false, "Failed to read link path - \"%s\"", path);
  203     else if (linklen >= PATH_MAX)
  204         return log_error(false, "The name of link path is too long - \"%s\"", path);
  205 
  206     if (strequal(link, "/dev/log_main") ||
  207         strequal(link, "/dev/log_system") ||
  208         strequal(link, "/dev/log_radio"))
  209         return true;
  210 
  211     return false;
  212 }
  213 #endif
  214 
  215 /* Parses the LISTEN_FDS environment variable value.
  216  * The returned value is the highest fd number up to which the
  217  * file descriptors must be passed to the container process.
  218  *
  219  * For example, if LISTEN_FDS=2 then 4 is returned and file descriptors 3 and 4
  220  * MUST be passed to the container process (in addition to the standard streams)
  221  * to support [socket activation][systemd-listen-fds].
  222  */
  223 static unsigned int get_listen_fds_max(void)
  224 {
  225     int ret;
  226     unsigned int num_fds;
  227     const char *val;
  228 
  229     val = getenv("LISTEN_FDS");
  230     if (!val)
  231         return 0;
  232 
  233     ret = lxc_safe_uint(val, &num_fds);
  234     if (ret < 0)
  235         return syserror_ret(0, "Failed to parse \"LISTEN_FDS=%s\" environment variable", val);
  236 
  237     return log_trace(num_fds, "Parsed \"LISTEN_FDS=%s\" environment variable", val);
  238 }
  239 
  240 int lxc_check_inherited(struct lxc_conf *conf, bool closeall,
  241             int *fds_to_ignore, size_t len_fds)
  242 {
  243     int fd, fddir;
  244     size_t i;
  245     DIR *dir;
  246     struct dirent *direntp;
  247     unsigned int listen_fds_max;
  248 
  249     if (conf && conf->close_all_fds)
  250         closeall = true;
  251 
  252     listen_fds_max = get_listen_fds_max();
  253 
  254     /*
  255      * Disable syslog at this point to avoid the above logging
  256      * function to open a new fd and make the check_inherited function
  257      * enter an infinite loop.
  258      */
  259     lxc_log_syslog_disable();
  260 
  261 restart:
  262     dir = opendir("/proc/self/fd");
  263     if (!dir)
  264         return log_warn(-1, "Failed to open directory");
  265 
  266     fddir = dirfd(dir);
  267 
  268     while ((direntp = readdir(dir))) {
  269         int ret;
  270         struct lxc_list *cur;
  271         bool matched = false;
  272 
  273         if (strequal(direntp->d_name, "."))
  274             continue;
  275 
  276         if (strequal(direntp->d_name, ".."))
  277             continue;
  278 
  279         ret = lxc_safe_int(direntp->d_name, &fd);
  280         if (ret < 0) {
  281             INFO("Could not parse file descriptor for \"%s\"", direntp->d_name);
  282             continue;
  283         }
  284 
  285         for (i = 0; i < len_fds; i++)
  286             if (fds_to_ignore[i] == fd)
  287                 break;
  288 
  289         if (fd == fddir || fd == lxc_log_fd ||
  290             (i < len_fds && fd == fds_to_ignore[i]))
  291             continue;
  292 
  293         /* Keep state clients that wait on reboots. */
  294         if (conf) {
  295             lxc_list_for_each(cur, &conf->state_clients) {
  296                 struct lxc_state_client *client = cur->elem;
  297 
  298                 if (client->clientfd != fd)
  299                     continue;
  300 
  301                 matched = true;
  302                 break;
  303             }
  304         }
  305 
  306         if (matched)
  307             continue;
  308 
  309         if (current_config && fd == current_config->logfd)
  310             continue;
  311 
  312         if (match_stdfds(fd))
  313             continue;
  314 
  315 #ifdef HAVE_DLOG
  316         if (match_dlog_fds(direntp))
  317             continue;
  318 
  319 #endif
  320 
  321         if (fd <= listen_fds_max) {
  322             INFO("Inheriting fd %d (using the LISTEN_FDS environment variable)", fd);
  323             continue;
  324         }
  325 
  326         if (closeall) {
  327             if (close(fd))
  328                 SYSINFO("Closed inherited fd %d", fd);
  329             else
  330                 INFO("Closed inherited fd %d", fd);
  331             closedir(dir);
  332             goto restart;
  333         }
  334         WARN("Inherited fd %d", fd);
  335     }
  336     closedir(dir);
  337 
  338     /*
  339      * Only enable syslog at this point to avoid the above logging
  340      * function to open a new fd and make the check_inherited function
  341      * enter an infinite loop.
  342      */
  343     lxc_log_syslog_enable();
  344 
  345     return 0;
  346 }
  347 
  348 static int setup_signal_fd(sigset_t *oldmask)
  349 {
  350     int ret;
  351     sigset_t mask;
  352     const int signals[] = {SIGBUS, SIGILL, SIGSEGV, SIGWINCH};
  353 
  354     /* Block everything except serious error signals. */
  355     ret = sigfillset(&mask);
  356     if (ret < 0)
  357         return -EBADF;
  358 
  359     for (int sig = 0; sig < (sizeof(signals) / sizeof(signals[0])); sig++) {
  360         ret = sigdelset(&mask, signals[sig]);
  361         if (ret < 0)
  362             return -EBADF;
  363     }
  364 
  365     ret = pthread_sigmask(SIG_BLOCK, &mask, oldmask);
  366     if (ret < 0)
  367         return log_error_errno(-EBADF, errno,
  368                        "Failed to set signal mask");
  369 
  370     ret = signalfd(-1, &mask, SFD_CLOEXEC);
  371     if (ret < 0)
  372         return log_error_errno(-EBADF,
  373                        errno, "Failed to create signal file descriptor");
  374 
  375     TRACE("Created signal file descriptor %d", ret);
  376 
  377     return ret;
  378 }
  379 
  380 static int signal_handler(int fd, uint32_t events, void *data,
  381               struct lxc_epoll_descr *descr)
  382 {
  383     int ret;
  384     siginfo_t info;
  385     struct signalfd_siginfo siginfo;
  386     struct lxc_handler *hdlr = data;
  387 
  388     ret = lxc_read_nointr(fd, &siginfo, sizeof(siginfo));
  389     if (ret < 0)
  390         return log_error(LXC_MAINLOOP_ERROR, "Failed to read signal info from signal file descriptor %d", fd);
  391 
  392     if (ret != sizeof(siginfo))
  393         return log_error(LXC_MAINLOOP_ERROR, "Unexpected size for struct signalfd_siginfo");
  394 
  395     /* Check whether init is running. */
  396     info.si_pid = 0;
  397     ret = waitid(P_PID, hdlr->pid, &info, WEXITED | WNOWAIT | WNOHANG);
  398     if (ret == 0 && info.si_pid == hdlr->pid)
  399         hdlr->init_died = true;
  400 
  401     /* Try to figure out a reasonable exit status to report. */
  402     if (hdlr->init_died) {
  403         switch (info.si_code) {
  404         case CLD_EXITED:
  405             hdlr->exit_status = info.si_status << 8;
  406             break;
  407         case CLD_KILLED:
  408         case CLD_DUMPED:
  409         case CLD_STOPPED:
  410             hdlr->exit_status = info.si_status << 8 | 0x7f;
  411             break;
  412         case CLD_CONTINUED:
  413             /* Huh? The waitid() told us it's dead *and* continued? */
  414             WARN("Init %d dead and continued?", hdlr->pid);
  415             hdlr->exit_status = 1;
  416             break;
  417         default:
  418             ERROR("Unknown si_code: %d", info.si_code);
  419             hdlr->exit_status = 1;
  420         }
  421     }
  422 
  423     if (siginfo.ssi_signo == SIGHUP) {
  424         if (hdlr->pidfd >= 0)
  425             lxc_raw_pidfd_send_signal(hdlr->pidfd, SIGTERM, NULL, 0);
  426         else
  427             kill(hdlr->pid, SIGTERM);
  428         INFO("Killing %d since terminal hung up", hdlr->pid);
  429         return hdlr->init_died ? LXC_MAINLOOP_CLOSE
  430                        : LXC_MAINLOOP_CONTINUE;
  431     }
  432 
  433     if (siginfo.ssi_signo != SIGCHLD) {
  434         if (hdlr->pidfd >= 0)
  435             lxc_raw_pidfd_send_signal(hdlr->pidfd,
  436                           siginfo.ssi_signo, NULL, 0);
  437         else
  438             kill(hdlr->pid, siginfo.ssi_signo);
  439         INFO("Forwarded signal %d to pid %d", siginfo.ssi_signo, hdlr->pid);
  440         return hdlr->init_died ? LXC_MAINLOOP_CLOSE
  441                        : LXC_MAINLOOP_CONTINUE;
  442     }
  443 
  444     /* More robustness, protect ourself from a SIGCHLD sent
  445      * by a process different from the container init.
  446      */
  447     if (siginfo.ssi_pid != hdlr->pid) {
  448         NOTICE("Received %d from pid %d instead of container init %d",
  449                siginfo.ssi_signo, siginfo.ssi_pid, hdlr->pid);
  450         return hdlr->init_died ? LXC_MAINLOOP_CLOSE
  451                        : LXC_MAINLOOP_CONTINUE;
  452     }
  453 
  454     if (siginfo.ssi_code == CLD_STOPPED) {
  455         INFO("Container init process was stopped");
  456         return hdlr->init_died ? LXC_MAINLOOP_CLOSE
  457                        : LXC_MAINLOOP_CONTINUE;
  458     }
  459 
  460     if (siginfo.ssi_code == CLD_CONTINUED) {
  461         INFO("Container init process was continued");
  462         return hdlr->init_died ? LXC_MAINLOOP_CLOSE
  463                        : LXC_MAINLOOP_CONTINUE;
  464     }
  465 
  466     return log_debug(LXC_MAINLOOP_CLOSE, "Container init process %d exited", hdlr->pid);
  467 }
  468 
  469 int lxc_serve_state_clients(const char *name, struct lxc_handler *handler,
  470                 lxc_state_t state)
  471 {
  472     size_t retlen;
  473     ssize_t ret;
  474     struct lxc_list *cur, *next;
  475     struct lxc_msg msg = {.type = lxc_msg_state, .value = state};
  476 
  477     if (state == THAWED)
  478         handler->state = RUNNING;
  479     else
  480         handler->state = state;
  481 
  482     TRACE("Set container state to %s", lxc_state2str(state));
  483 
  484     if (lxc_list_empty(&handler->conf->state_clients))
  485         return log_trace(0, "No state clients registered");
  486 
  487     retlen = strlcpy(msg.name, name, sizeof(msg.name));
  488     if (retlen >= sizeof(msg.name))
  489         return -E2BIG;
  490 
  491     lxc_list_for_each_safe(cur, &handler->conf->state_clients, next) {
  492         struct lxc_state_client *client = cur->elem;
  493 
  494         if (client->states[state] == 0) {
  495             TRACE("State %s not registered for state client %d",
  496                   lxc_state2str(state), client->clientfd);
  497             continue;
  498         }
  499 
  500         TRACE("Sending state %s to state client %d",
  501               lxc_state2str(state), client->clientfd);
  502 
  503         ret = lxc_send_nointr(client->clientfd, &msg, sizeof(msg), MSG_NOSIGNAL);
  504         if (ret <= 0)
  505             SYSERROR("Failed to send message to client");
  506 
  507         /* kick client from list */
  508         lxc_list_del(cur);
  509         close(client->clientfd);
  510         free(cur->elem);
  511         free(cur);
  512     }
  513 
  514     return 0;
  515 }
  516 
  517 static int lxc_serve_state_socket_pair(const char *name,
  518                        struct lxc_handler *handler,
  519                        lxc_state_t state)
  520 {
  521     ssize_t ret;
  522 
  523     if (!handler->daemonize ||
  524             handler->state_socket_pair[1] < 0 ||
  525         state == STARTING)
  526         return 0;
  527 
  528     /* Close read end of the socket pair. */
  529     close_prot_errno_disarm(handler->state_socket_pair[0]);
  530 
  531 again:
  532     ret = lxc_abstract_unix_send_credential(handler->state_socket_pair[1],
  533                         &(int){state}, sizeof(int));
  534     if (ret < 0) {
  535         SYSERROR("Failed to send state to %d", handler->state_socket_pair[1]);
  536 
  537         if (errno == EINTR)
  538             goto again;
  539 
  540         return -1;
  541     }
  542 
  543     if (ret != sizeof(int))
  544         return log_error(-1, "Message too long : %d", handler->state_socket_pair[1]);
  545 
  546     TRACE("Sent container state \"%s\" to %d", lxc_state2str(state),
  547           handler->state_socket_pair[1]);
  548 
  549     /* Close write end of the socket pair. */
  550     close_prot_errno_disarm(handler->state_socket_pair[1]);
  551 
  552     return 0;
  553 }
  554 
  555 int lxc_set_state(const char *name, struct lxc_handler *handler,
  556           lxc_state_t state)
  557 {
  558     int ret;
  559 
  560     ret = lxc_serve_state_socket_pair(name, handler, state);
  561     if (ret < 0)
  562         return log_error(-1, "Failed to synchronize via anonymous pair of unix sockets");
  563 
  564     ret = lxc_serve_state_clients(name, handler, state);
  565     if (ret < 0)
  566         return -1;
  567 
  568     /* This function will try to connect to the legacy lxc-monitord state
  569      * server and only exists for backwards compatibility.
  570      */
  571     lxc_monitor_send_state(name, state, handler->lxcpath);
  572 
  573     return 0;
  574 }
  575 
  576 int lxc_poll(const char *name, struct lxc_handler *handler)
  577 {
  578     int ret;
  579     bool has_console = true;
  580     struct lxc_epoll_descr descr, descr_console;
  581 
  582     if (handler->conf->console.path &&
  583         strequal(handler->conf->console.path, "none"))
  584         has_console = false;
  585 
  586     ret = lxc_mainloop_open(&descr);
  587     if (ret < 0) {
  588         ERROR("Failed to create mainloop");
  589         goto out_sigfd;
  590     }
  591 
  592     if (has_console) {
  593         ret = lxc_mainloop_open(&descr_console);
  594         if (ret < 0) {
  595             ERROR("Failed to create console mainloop");
  596             goto out_mainloop;
  597         }
  598     }
  599 
  600     ret = lxc_mainloop_add_handler(&descr, handler->sigfd, signal_handler, handler);
  601     if (ret < 0) {
  602         ERROR("Failed to add signal handler for %d to mainloop", handler->sigfd);
  603         goto out_mainloop_console;
  604     }
  605 
  606     ret = lxc_seccomp_setup_proxy(&handler->conf->seccomp, &descr, handler);
  607     if (ret < 0) {
  608         ERROR("Failed to setup seccomp proxy");
  609         goto out_mainloop_console;
  610     }
  611 
  612     if (has_console) {
  613         struct lxc_terminal *console = &handler->conf->console;
  614 
  615         ret = lxc_terminal_mainloop_add(&descr, console);
  616         if (ret < 0) {
  617             ERROR("Failed to add console handlers to mainloop");
  618             goto out_mainloop_console;
  619         }
  620 
  621         ret = lxc_terminal_mainloop_add(&descr_console, console);
  622         if (ret < 0) {
  623             ERROR("Failed to add console handlers to console mainloop");
  624             goto out_mainloop_console;
  625         }
  626 
  627         handler->conf->console.descr = &descr;
  628     }
  629 
  630     ret = lxc_cmd_mainloop_add(name, &descr, handler);
  631     if (ret < 0) {
  632         ERROR("Failed to add command handler to mainloop");
  633         goto out_mainloop_console;
  634     }
  635 
  636     TRACE("Mainloop is ready");
  637 
  638     ret = lxc_mainloop(&descr, -1);
  639     close_prot_errno_disarm(descr.epfd);
  640     if (ret < 0 || !handler->init_died)
  641         goto out_mainloop_console;
  642 
  643     if (has_console)
  644         ret = lxc_mainloop(&descr_console, 0);
  645 
  646 out_mainloop_console:
  647     if (has_console) {
  648         lxc_mainloop_close(&descr_console);
  649         TRACE("Closed console mainloop");
  650     }
  651 
  652 out_mainloop:
  653     lxc_mainloop_close(&descr);
  654     TRACE("Closed mainloop");
  655 
  656 out_sigfd:
  657     TRACE("Closed signal file descriptor %d", handler->sigfd);
  658     close_prot_errno_disarm(handler->sigfd);
  659 
  660     return ret;
  661 }
  662 
  663 void lxc_put_handler(struct lxc_handler *handler)
  664 {
  665     close_prot_errno_disarm(handler->pidfd);
  666     close_prot_errno_disarm(handler->sigfd);
  667     lxc_put_nsfds(handler);
  668     if (handler->conf && handler->conf->reboot == REBOOT_NONE)
  669         close_prot_errno_disarm(handler->conf->maincmd_fd);
  670     close_prot_errno_disarm(handler->monitor_status_fd);
  671     close_prot_errno_disarm(handler->state_socket_pair[0]);
  672     close_prot_errno_disarm(handler->state_socket_pair[1]);
  673     cgroup_exit(handler->cgroup_ops);
  674     if (handler->conf && handler->conf->reboot == REBOOT_NONE)
  675         free_disarm(handler);
  676     else
  677         handler->conf = NULL;
  678 }
  679 
  680 struct lxc_handler *lxc_init_handler(struct lxc_handler *old,
  681                      const char *name, struct lxc_conf *conf,
  682                      const char *lxcpath, bool daemonize)
  683 {
  684     int nr_keep_fds = 0;
  685     int ret;
  686     struct lxc_handler *handler;
  687 
  688     if (!old)
  689         handler = zalloc(sizeof(*handler));
  690     else
  691         handler = old;
  692     if (!handler)
  693         return NULL;
  694 
  695     /* Note that am_guest_unpriv() checks the effective uid. We
  696      * probably don't care if we are real root only if we are running
  697      * as root so this should be fine.
  698      */
  699     handler->am_root = !am_guest_unpriv();
  700     handler->conf = conf;
  701     handler->lxcpath = lxcpath;
  702     handler->init_died = false;
  703     handler->data_sock[0] = -EBADF;
  704     handler->data_sock[1] = -EBADF;
  705     handler->monitor_status_fd = -EBADF;
  706     handler->pidfd = -EBADF;
  707     handler->sigfd = -EBADF;
  708     handler->state_socket_pair[0] = -EBADF;
  709     handler->state_socket_pair[1] = -EBADF;
  710     if (handler->conf->reboot == REBOOT_NONE)
  711         lxc_list_init(&handler->conf->state_clients);
  712 
  713     for (lxc_namespace_t idx = 0; idx < LXC_NS_MAX; idx++) {
  714         handler->nsfd[idx] = -EBADF;
  715 
  716         if (handler->conf->reboot == REBOOT_NONE)
  717             continue;
  718 
  719         handler->nsfd_paths[idx][0] = '\0';
  720         handler->hook_argv[idx] = NULL;
  721 
  722         if (handler->hook_argc != 0)
  723             handler->hook_argc = 0;
  724     }
  725 
  726     handler->name = name;
  727     if (daemonize)
  728         handler->transient_pid = lxc_raw_getpid();
  729     else
  730         handler->transient_pid = -1;
  731 
  732     if (daemonize && handler->conf->reboot == REBOOT_NONE) {
  733         /* Create socketpair() to synchronize on daemonized startup.
  734          * When the container reboots we don't need to synchronize
  735          * again currently so don't open another socketpair().
  736          */
  737         ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0,
  738                  handler->state_socket_pair);
  739         if (ret < 0) {
  740             ERROR("Failed to create anonymous pair of unix sockets");
  741             goto on_error;
  742         }
  743 
  744         TRACE("Created anonymous pair {%d,%d} of unix sockets",
  745               handler->state_socket_pair[0],
  746               handler->state_socket_pair[1]);
  747         handler->keep_fds[nr_keep_fds++] = handler->state_socket_pair[0];
  748         handler->keep_fds[nr_keep_fds++] = handler->state_socket_pair[1];
  749     }
  750 
  751     if (handler->conf->reboot == REBOOT_NONE) {
  752         handler->conf->maincmd_fd = lxc_server_init(name, lxcpath, "command");
  753         if (handler->conf->maincmd_fd < 0) {
  754             ERROR("Failed to set up command socket");
  755             goto on_error;
  756         }
  757         handler->keep_fds[nr_keep_fds++] = handler->conf->maincmd_fd;
  758     }
  759 
  760     TRACE("Unix domain socket %d for command server is ready",
  761           handler->conf->maincmd_fd);
  762 
  763     return handler;
  764 
  765 on_error:
  766     lxc_put_handler(handler);
  767 
  768     return NULL;
  769 }
  770 
  771 int lxc_init(const char *name, struct lxc_handler *handler)
  772 {
  773     __do_close int status_fd = -EBADF;
  774     int ret;
  775     const char *loglevel;
  776     struct lxc_conf *conf = handler->conf;
  777 
  778     handler->monitor_pid = lxc_raw_getpid();
  779     status_fd = open("/proc/self/status", O_RDONLY | O_CLOEXEC);
  780     if (status_fd < 0)
  781         return log_error_errno(-1, errno, "Failed to open monitor status fd");
  782 
  783     handler->lsm_ops = lsm_init_static();
  784     TRACE("Initialized LSM");
  785 
  786     /* Begin by setting the state to STARTING. */
  787     ret = lxc_set_state(name, handler, STARTING);
  788     if (ret < 0)
  789         return log_error(-1, "Failed to set state to \"%s\"", lxc_state2str(STARTING));
  790     TRACE("Set container state to \"STARTING\"");
  791 
  792     /* Start of environment variable setup for hooks. */
  793     ret = setenv("LXC_NAME", name, 1);
  794     if (ret < 0)
  795         SYSERROR("Failed to set environment variable: LXC_NAME=%s", name);
  796 
  797     if (conf->rcfile) {
  798         ret = setenv("LXC_CONFIG_FILE", conf->rcfile, 1);
  799         if (ret < 0)
  800             SYSERROR("Failed to set environment variable: LXC_CONFIG_FILE=%s", conf->rcfile);
  801     }
  802 
  803     if (conf->rootfs.mount) {
  804         ret = setenv("LXC_ROOTFS_MOUNT", conf->rootfs.mount, 1);
  805         if (ret < 0)
  806             SYSERROR("Failed to set environment variable: LXC_ROOTFS_MOUNT=%s", conf->rootfs.mount);
  807     }
  808 
  809     if (conf->rootfs.path) {
  810         ret = setenv("LXC_ROOTFS_PATH", conf->rootfs.path, 1);
  811         if (ret < 0)
  812             SYSERROR("Failed to set environment variable: LXC_ROOTFS_PATH=%s", conf->rootfs.path);
  813     }
  814 
  815     if (conf->console.path) {
  816         ret = setenv("LXC_CONSOLE", conf->console.path, 1);
  817         if (ret < 0)
  818             SYSERROR("Failed to set environment variable: LXC_CONSOLE=%s", conf->console.path);
  819     }
  820 
  821     if (conf->console.log_path) {
  822         ret = setenv("LXC_CONSOLE_LOGPATH", conf->console.log_path, 1);
  823         if (ret < 0)
  824             SYSERROR("Failed to set environment variable: LXC_CONSOLE_LOGPATH=%s", conf->console.log_path);
  825     }
  826 
  827     if (cgns_supported()) {
  828         ret = setenv("LXC_CGNS_AWARE", "1", 1);
  829         if (ret < 0)
  830             SYSERROR("Failed to set environment variable LXC_CGNS_AWARE=1");
  831     }
  832 
  833     loglevel = lxc_log_priority_to_string(lxc_log_get_level());
  834     ret = setenv("LXC_LOG_LEVEL", loglevel, 1);
  835     if (ret < 0)
  836         SYSERROR("Set environment variable LXC_LOG_LEVEL=%s", loglevel);
  837 
  838     if (conf->hooks_version == 0)
  839         ret = setenv("LXC_HOOK_VERSION", "0", 1);
  840     else
  841         ret = setenv("LXC_HOOK_VERSION", "1", 1);
  842     if (ret < 0)
  843         SYSERROR("Failed to set environment variable LXC_HOOK_VERSION=%u", conf->hooks_version);
  844     /* End of environment variable setup for hooks. */
  845 
  846     TRACE("Set environment variables");
  847 
  848     ret = run_lxc_hooks(name, "pre-start", conf, NULL);
  849     if (ret < 0)
  850         return log_error(-1, "Failed to run lxc.hook.pre-start for container \"%s\"", name);
  851     TRACE("Ran pre-start hooks");
  852 
  853     /* The signal fd has to be created before forking otherwise if the child
  854      * process exits before we setup the signal fd, the event will be lost
  855      * and the command will be stuck.
  856      */
  857     handler->sigfd = setup_signal_fd(&handler->oldmask);
  858     if (handler->sigfd < 0)
  859         return log_error(-1, "Failed to setup SIGCHLD fd handler.");
  860     TRACE("Set up signal fd");
  861 
  862     /* Do this after setting up signals since it might unblock SIGWINCH. */
  863     ret = lxc_terminal_setup(conf);
  864     if (ret < 0) {
  865         ERROR("Failed to create console");
  866         goto out_restore_sigmask;
  867     }
  868     TRACE("Created console");
  869 
  870     handler->cgroup_ops = cgroup_init(handler->conf);
  871     if (!handler->cgroup_ops) {
  872         ERROR("Failed to initialize cgroup driver");
  873         goto out_delete_terminal;
  874     }
  875     TRACE("Initialized cgroup driver");
  876 
  877     ret = lxc_read_seccomp_config(conf);
  878     if (ret < 0)
  879         return log_error(-1, "Failed loading seccomp policy");
  880     TRACE("Read seccomp policy");
  881 
  882     ret = handler->lsm_ops->prepare(handler->lsm_ops, conf, handler->lxcpath);
  883     if (ret < 0) {
  884         ERROR("Failed to initialize LSM");
  885         goto out_delete_terminal;
  886     }
  887     TRACE("Initialized LSM");
  888 
  889     INFO("Container \"%s\" is initialized", name);
  890     handler->monitor_status_fd = move_fd(status_fd);
  891     return 0;
  892 
  893 out_delete_terminal:
  894     lxc_terminal_delete(&handler->conf->console);
  895 
  896 out_restore_sigmask:
  897     (void)pthread_sigmask(SIG_SETMASK, &handler->oldmask, NULL);
  898 
  899     return -1;
  900 }
  901 
  902 void lxc_expose_namespace_environment(const struct lxc_handler *handler)
  903 {
  904     for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
  905         int ret;
  906         const char *fd_path;
  907 
  908         if (handler->nsfd[i] < 0)
  909             continue;
  910 
  911         fd_path = handler->nsfd_paths[i] + strcspn(handler->nsfd_paths[i], "/");
  912         ret = setenv(ns_info[i].env_name, fd_path, 1);
  913         if (ret < 0)
  914             SYSERROR("Failed to set environment variable %s=%s",
  915                  ns_info[i].env_name, fd_path);
  916         else
  917             TRACE("Set environment variable %s=%s",
  918                   ns_info[i].env_name, fd_path);
  919     }
  920 }
  921 
  922 void lxc_end(struct lxc_handler *handler)
  923 {
  924     int ret;
  925     struct lxc_list *cur, *next;
  926     const char *name = handler->name;
  927     struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
  928 
  929     /* The STOPPING state is there for future cleanup code which can take
  930      * awhile.
  931      */
  932     lxc_set_state(name, handler, STOPPING);
  933 
  934     /* Passing information to hooks via environment variables. */
  935     if (handler->conf->hooks_version > 0)
  936         lxc_expose_namespace_environment(handler);
  937 
  938     if (handler->conf->reboot > REBOOT_NONE) {
  939         ret = setenv("LXC_TARGET", "reboot", 1);
  940         if (ret < 0)
  941             SYSERROR("Failed to set environment variable: LXC_TARGET=reboot");
  942     }
  943 
  944     if (handler->conf->reboot == REBOOT_NONE) {
  945         ret = setenv("LXC_TARGET", "stop", 1);
  946         if (ret < 0)
  947             SYSERROR("Failed to set environment variable: LXC_TARGET=stop");
  948     }
  949 
  950     if (handler->conf->hooks_version == 0)
  951         ret = run_lxc_hooks(name, "stop", handler->conf, handler->hook_argv);
  952     else
  953         ret = run_lxc_hooks(name, "stop", handler->conf, NULL);
  954     if (ret < 0)
  955         ERROR("Failed to run \"lxc.hook.stop\" hook");
  956 
  957     handler->lsm_ops->cleanup(handler->lsm_ops, handler->conf, handler->lxcpath);
  958 
  959     if (cgroup_ops) {
  960         cgroup_ops->payload_destroy(cgroup_ops, handler);
  961         cgroup_ops->monitor_destroy(cgroup_ops, handler);
  962     }
  963 
  964     put_lxc_rootfs(&handler->conf->rootfs, true);
  965 
  966     if (handler->conf->reboot == REBOOT_NONE) {
  967         /* For all new state clients simply close the command socket.
  968          * This will inform all state clients that the container is
  969          * STOPPED and also prevents a race between a open()/close() on
  970          * the command socket causing a new process to get ECONNREFUSED
  971          * because we haven't yet closed the command socket.
  972          */
  973         close_prot_errno_disarm(handler->conf->maincmd_fd);
  974         TRACE("Closed command socket");
  975 
  976         /* This function will try to connect to the legacy lxc-monitord
  977          * state server and only exists for backwards compatibility.
  978          */
  979         lxc_monitor_send_state(name, STOPPED, handler->lxcpath);
  980 
  981         /* The command socket is closed so no one can acces the command
  982          * socket anymore so there's no need to lock it.
  983          */
  984         handler->state = STOPPED;
  985         TRACE("Set container state to \"STOPPED\"");
  986     } else {
  987         lxc_set_state(name, handler, STOPPED);
  988         TRACE("Set container state to \"STOPPED\"");
  989     }
  990 
  991     /* Avoid lingering namespace references. */
  992     lxc_put_nsfds(handler);
  993 
  994     ret = run_lxc_hooks(name, "post-stop", handler->conf, NULL);
  995     if (ret < 0) {
  996         ERROR("Failed to run lxc.hook.post-stop for container \"%s\"", name);
  997         if (handler->conf->reboot > REBOOT_NONE) {
  998             WARN("Container will be stopped instead of rebooted");
  999             handler->conf->reboot = REBOOT_NONE;
 1000 
 1001             ret = setenv("LXC_TARGET", "stop", 1);
 1002             if (ret < 0)
 1003                 WARN("Failed to set environment variable: LXC_TARGET=stop");
 1004         }
 1005     }
 1006 
 1007     /* Reset mask set by setup_signal_fd. */
 1008     ret = pthread_sigmask(SIG_SETMASK, &handler->oldmask, NULL);
 1009     if (ret < 0)
 1010         SYSWARN("Failed to restore signal mask");
 1011 
 1012     lxc_terminal_delete(&handler->conf->console);
 1013     lxc_delete_tty(&handler->conf->ttys);
 1014     close_prot_errno_disarm(handler->conf->devpts_fd);
 1015 
 1016     /* The command socket is now closed, no more state clients can register
 1017      * themselves from now on. So free the list of state clients.
 1018      */
 1019     lxc_list_for_each_safe(cur, &handler->conf->state_clients, next) {
 1020         struct lxc_state_client *client = cur->elem;
 1021 
 1022         /* Keep state clients that want to be notified about reboots. */
 1023         if ((handler->conf->reboot > REBOOT_NONE) &&
 1024             (client->states[RUNNING] == 2))
 1025             continue;
 1026 
 1027         /* close state client socket */
 1028         lxc_list_del(cur);
 1029         close(client->clientfd);
 1030         free(cur->elem);
 1031         free(cur);
 1032     }
 1033 
 1034     if (handler->conf->ephemeral == 1 && handler->conf->reboot != REBOOT_REQ)
 1035         lxc_destroy_container_on_signal(handler, name);
 1036 
 1037     lxc_put_handler(handler);
 1038 }
 1039 
 1040 void lxc_abort(struct lxc_handler *handler)
 1041 {
 1042     int ret = 0;
 1043     int status;
 1044 
 1045     lxc_set_state(handler->name, handler, ABORTING);
 1046 
 1047     if (handler->pidfd >= 0) {
 1048         ret = lxc_raw_pidfd_send_signal(handler->pidfd, SIGKILL, NULL, 0);
 1049         if (ret)
 1050             SYSWARN("Failed to send SIGKILL via pidfd %d for process %d",
 1051                 handler->pidfd, handler->pid);
 1052     }
 1053 
 1054     if ((!ret || errno != ESRCH) && handler->pid > 0)
 1055         if (kill(handler->pid, SIGKILL))
 1056             SYSWARN("Failed to send SIGKILL to %d", handler->pid);
 1057 
 1058     do {
 1059         ret = waitpid(-1, &status, 0);
 1060     } while (ret > 0);
 1061 }
 1062 
 1063 static int do_start(void *data)
 1064 {
 1065     struct lxc_handler *handler = data;
 1066     __lxc_unused __do_close int data_sock0 = handler->data_sock[0],
 1067                        data_sock1 = handler->data_sock[1];
 1068     __do_close int devnull_fd = -EBADF, status_fd = -EBADF;
 1069     int ret;
 1070     uid_t new_uid;
 1071     gid_t new_gid;
 1072     struct lxc_list *iterator;
 1073     uid_t nsuid = 0;
 1074     gid_t nsgid = 0;
 1075 
 1076     lxc_sync_fini_parent(handler);
 1077 
 1078     if (lxc_abstract_unix_recv_one_fd(data_sock1, &status_fd, NULL, 0) < 0) {
 1079         ERROR("Failed to receive status file descriptor from parent process");
 1080         goto out_warn_father;
 1081     }
 1082 
 1083     /* This prctl must be before the synchro, so if the parent dies before
 1084      * we set the parent death signal, we will detect its death with the
 1085      * synchro right after, otherwise we have a window where the parent can
 1086      * exit before we set the pdeath signal leading to a unsupervized
 1087      * container.
 1088      */
 1089     ret = lxc_set_death_signal(SIGKILL, handler->monitor_pid, status_fd);
 1090     if (ret < 0) {
 1091         SYSERROR("Failed to set PR_SET_PDEATHSIG to SIGKILL");
 1092         goto out_warn_father;
 1093     }
 1094 
 1095     ret = lxc_ambient_caps_up();
 1096     if (ret < 0) {
 1097         ERROR("Failed to raise ambient capabilities");
 1098         goto out_warn_father;
 1099     }
 1100 
 1101     ret = pthread_sigmask(SIG_SETMASK, &handler->oldmask, NULL);
 1102     if (ret < 0) {
 1103         SYSERROR("Failed to set signal mask");
 1104         goto out_warn_father;
 1105     }
 1106 
 1107     if (!lxc_sync_wait_parent(handler, START_SYNC_STARTUP))
 1108         goto out_warn_father;
 1109 
 1110     /* Unshare CLONE_NEWNET after CLONE_NEWUSER. See
 1111      * https://github.com/lxc/lxd/issues/1978.
 1112      */
 1113     if (handler->ns_unshare_flags & CLONE_NEWNET) {
 1114         ret = unshare(CLONE_NEWNET);
 1115         if (ret < 0) {
 1116             SYSERROR("Failed to unshare CLONE_NEWNET");
 1117             goto out_warn_father;
 1118         }
 1119         INFO("Unshared CLONE_NEWNET");
 1120     }
 1121 
 1122     /* If we are in a new user namespace, become root there to have
 1123      * privilege over our namespace.
 1124      */
 1125     if (!lxc_list_empty(&handler->conf->id_map)) {
 1126         if (!handler->conf->root_nsuid_map)
 1127             nsuid = handler->conf->init_uid;
 1128 
 1129         if (!handler->conf->root_nsgid_map)
 1130             nsgid = handler->conf->init_gid;
 1131 
 1132         /* Drop groups only after we switched to a valid gid in the new
 1133          * user namespace.
 1134          */
 1135         if (!lxc_drop_groups() &&
 1136             (handler->am_root || errno != EPERM))
 1137             goto out_warn_father;
 1138 
 1139         if (!lxc_switch_uid_gid(nsuid, nsgid))
 1140             goto out_warn_father;
 1141 
 1142         ret = prctl(PR_SET_DUMPABLE, prctl_arg(1), prctl_arg(0),
 1143                 prctl_arg(0), prctl_arg(0));
 1144         if (ret < 0)
 1145             goto out_warn_father;
 1146 
 1147         /* set{g,u}id() clears deathsignal */
 1148         ret = lxc_set_death_signal(SIGKILL, handler->monitor_pid, status_fd);
 1149         if (ret < 0) {
 1150             SYSERROR("Failed to set PR_SET_PDEATHSIG to SIGKILL");
 1151             goto out_warn_father;
 1152         }
 1153     }
 1154 
 1155     ret = access(handler->lxcpath, X_OK);
 1156     if (ret != 0) {
 1157         print_top_failing_dir(handler->lxcpath);
 1158         goto out_warn_father;
 1159     }
 1160 
 1161     /* In order to checkpoint restore, we need to have everything in the
 1162      * same mount namespace. However, some containers may not have a
 1163      * reasonable /dev (in particular, they may not have /dev/null), so we
 1164      * can't set init's std fds to /dev/null by opening it from inside the
 1165      * container.
 1166      *
 1167      * If that's the case, fall back to using the host's /dev/null. This
 1168      * means that migration won't work, but at least we won't spew output
 1169      * where it isn't wanted.
 1170      */
 1171     if (handler->daemonize && !handler->conf->autodev) {
 1172         char path[PATH_MAX];
 1173 
 1174         ret = strnprintf(path, sizeof(path), "%s/dev/null",
 1175                  handler->conf->rootfs.mount);
 1176         if (ret < 0)
 1177             goto out_warn_father;
 1178 
 1179         ret = access(path, F_OK);
 1180         if (ret != 0) {
 1181             devnull_fd = open_devnull();
 1182 
 1183             if (devnull_fd < 0)
 1184                 goto out_warn_father;
 1185             WARN("Using /dev/null from the host for container init's standard file descriptors. Migration will not work");
 1186         }
 1187     }
 1188 
 1189     /*
 1190      * Tell the parent task it can begin to configure the container and wait
 1191      * for it to finish.
 1192      */
 1193     if (!lxc_sync_wake_parent(handler, START_SYNC_CONFIGURE))
 1194         goto out_error;
 1195 
 1196     /* Unshare cgroup namespace after we have setup our cgroups. If we do it
 1197      * earlier we end up with a wrong view of /proc/self/cgroup. For
 1198      * example, assume we unshare(CLONE_NEWCGROUP) first, and then create
 1199      * the cgroup for the container, say /sys/fs/cgroup/cpuset/lxc/c, then
 1200      * /proc/self/cgroup would show us:
 1201      *
 1202      *  8:cpuset:/lxc/c
 1203      *
 1204      * whereas it should actually show
 1205      *
 1206      *  8:cpuset:/
 1207      */
 1208     if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
 1209         ret = unshare(CLONE_NEWCGROUP);
 1210         if (ret < 0) {
 1211             if (errno != EINVAL) {
 1212                 SYSERROR("Failed to unshare CLONE_NEWCGROUP");
 1213                 goto out_warn_father;
 1214             }
 1215 
 1216             handler->ns_clone_flags &= ~CLONE_NEWCGROUP;
 1217             SYSINFO("Kernel does not support CLONE_NEWCGROUP");
 1218         } else {
 1219             INFO("Unshared CLONE_NEWCGROUP");
 1220         }
 1221     }
 1222 
 1223     if (handler->ns_unshare_flags & CLONE_NEWTIME) {
 1224         ret = unshare(CLONE_NEWTIME);
 1225         if (ret < 0) {
 1226             if (errno != EINVAL) {
 1227                 SYSERROR("Failed to unshare CLONE_NEWTIME");
 1228                 goto out_warn_father;
 1229             }
 1230 
 1231             handler->ns_clone_flags &= ~CLONE_NEWTIME;
 1232             SYSINFO("Kernel does not support CLONE_NEWTIME");
 1233         } else {
 1234             __do_close int timens_fd = -EBADF;
 1235 
 1236             INFO("Unshared CLONE_NEWTIME");
 1237 
 1238             if (handler->conf->timens.s_boot)
 1239                 ret = timens_offset_write(CLOCK_BOOTTIME, handler->conf->timens.s_boot, 0);
 1240             else if (handler->conf->timens.ns_boot)
 1241                 ret = timens_offset_write(CLOCK_BOOTTIME, 0, handler->conf->timens.ns_boot);
 1242             if (ret) {
 1243                 SYSERROR("Failed to write CLONE_BOOTTIME offset");
 1244                 goto out_warn_father;
 1245             }
 1246             TRACE("Wrote CLOCK_BOOTTIME offset");
 1247 
 1248             if (handler->conf->timens.s_monotonic)
 1249                 ret = timens_offset_write(CLOCK_MONOTONIC, handler->conf->timens.s_monotonic, 0);
 1250             else if (handler->conf->timens.ns_monotonic)
 1251                 ret = timens_offset_write(CLOCK_MONOTONIC, 0, handler->conf->timens.ns_monotonic);
 1252             if (ret) {
 1253                 SYSERROR("Failed to write CLONE_MONOTONIC offset");
 1254                 goto out_warn_father;
 1255             }
 1256             TRACE("Wrote CLOCK_MONOTONIC offset");
 1257 
 1258             timens_fd = open("/proc/self/ns/time_for_children", O_RDONLY | O_CLOEXEC);
 1259             if (timens_fd < 0) {
 1260                 SYSERROR("Failed to open \"/proc/self/ns/time_for_children\"");
 1261                 goto out_warn_father;
 1262             }
 1263 
 1264             ret = setns(timens_fd, CLONE_NEWTIME);
 1265             if (ret) {
 1266                 SYSERROR("Failed to setns(%d(\"/proc/self/ns/time_for_children\"))", timens_fd);
 1267                 goto out_warn_father;
 1268             }
 1269         }
 1270     }
 1271 
 1272     /* Add the requested environment variables to the current environment to
 1273      * allow them to be used by the various hooks, such as the start hook
 1274      * below.
 1275      */
 1276     lxc_list_for_each(iterator, &handler->conf->environment) {
 1277         ret = putenv((char *)iterator->elem);
 1278         if (ret < 0) {
 1279             SYSERROR("Failed to set environment variable: %s",
 1280                  (char *)iterator->elem);
 1281             goto out_warn_father;
 1282         }
 1283     }
 1284 
 1285     if (!lxc_sync_wait_parent(handler, START_SYNC_POST_CONFIGURE))
 1286         goto out_warn_father;
 1287 
 1288     /* Setup the container, ip, names, utsname, ... */
 1289     ret = lxc_setup(handler);
 1290     if (ret < 0) {
 1291         ERROR("Failed to setup container \"%s\"", handler->name);
 1292         goto out_warn_father;
 1293     }
 1294 
 1295     /* Set the label to change to when we exec(2) the container's init. */
 1296     ret = handler->lsm_ops->process_label_set(handler->lsm_ops, NULL, handler->conf, true);
 1297     if (ret < 0)
 1298         goto out_warn_father;
 1299 
 1300     /* Set PR_SET_NO_NEW_PRIVS after we changed the lsm label. If we do it
 1301      * before we aren't allowed anymore.
 1302      */
 1303     if (handler->conf->no_new_privs) {
 1304         ret = prctl(PR_SET_NO_NEW_PRIVS, prctl_arg(1), prctl_arg(0),
 1305                 prctl_arg(0), prctl_arg(0));
 1306         if (ret < 0) {
 1307             SYSERROR("Could not set PR_SET_NO_NEW_PRIVS to block execve() gainable privileges");
 1308             goto out_warn_father;
 1309         }
 1310         DEBUG("Set PR_SET_NO_NEW_PRIVS to block execve() gainable privileges");
 1311     }
 1312 
 1313     /* If we mounted a temporary proc, then unmount it now. */
 1314     tmp_proc_unmount(handler->conf);
 1315 
 1316     ret = lxc_seccomp_load(handler->conf);
 1317     if (ret < 0)
 1318         goto out_warn_father;
 1319 
 1320     ret = run_lxc_hooks(handler->name, "start", handler->conf, NULL);
 1321     if (ret < 0) {
 1322         ERROR("Failed to run lxc.hook.start for container \"%s\"",
 1323               handler->name);
 1324         goto out_warn_father;
 1325     }
 1326 
 1327     close_prot_errno_disarm(handler->sigfd);
 1328 
 1329     if (handler->conf->console.pty < 0 && handler->daemonize) {
 1330         if (devnull_fd < 0) {
 1331             devnull_fd = open_devnull();
 1332             if (devnull_fd < 0)
 1333                 goto out_warn_father;
 1334         }
 1335 
 1336         ret = set_stdfds(devnull_fd);
 1337         if (ret < 0) {
 1338             ERROR("Failed to redirect std{in,out,err} to \"/dev/null\"");
 1339             goto out_warn_father;
 1340         }
 1341     }
 1342 
 1343     close_prot_errno_disarm(devnull_fd);
 1344 
 1345     setsid();
 1346 
 1347     if (handler->conf->init_cwd) {
 1348         ret = chdir(handler->conf->init_cwd);
 1349         if (ret < 0) {
 1350             SYSERROR("Could not change directory to \"%s\"",
 1351                  handler->conf->init_cwd);
 1352             goto out_warn_father;
 1353         }
 1354     }
 1355 
 1356     if (!lxc_sync_barrier_parent(handler, START_SYNC_CGROUP_LIMITS))
 1357         goto out_warn_father;
 1358 
 1359     ret = lxc_sync_fds_child(handler);
 1360     if (ret < 0) {
 1361         SYSERROR("Failed to sync file descriptors with parent");
 1362         goto out_warn_father;
 1363     }
 1364 
 1365     if (!lxc_sync_wait_parent(handler, START_SYNC_READY_START))
 1366         goto out_warn_father;
 1367 
 1368     /* Reset the environment variables the user requested in a clear
 1369      * environment.
 1370      */
 1371     ret = clearenv();
 1372     /* Don't error out though. */
 1373     if (ret < 0)
 1374         SYSERROR("Failed to clear environment.");
 1375 
 1376     lxc_list_for_each(iterator, &handler->conf->environment) {
 1377         ret = putenv((char *)iterator->elem);
 1378         if (ret < 0) {
 1379             SYSERROR("Failed to set environment variable: %s",
 1380                  (char *)iterator->elem);
 1381             goto out_warn_father;
 1382         }
 1383     }
 1384 
 1385     ret = putenv("container=lxc");
 1386     if (ret < 0) {
 1387         SYSERROR("Failed to set environment variable: container=lxc");
 1388         goto out_warn_father;
 1389     }
 1390 
 1391     if (handler->conf->ttys.tty_names) {
 1392         ret = putenv(handler->conf->ttys.tty_names);
 1393         if (ret < 0) {
 1394             SYSERROR("Failed to set environment variable for container ptys");
 1395             goto out_warn_father;
 1396         }
 1397     }
 1398 
 1399     /* The container has been setup. We can now switch to an unprivileged
 1400      * uid/gid.
 1401      */
 1402     new_uid = handler->conf->init_uid;
 1403     new_gid = handler->conf->init_gid;
 1404 
 1405     /* Avoid unnecessary syscalls. */
 1406     if (new_uid == nsuid)
 1407         new_uid = LXC_INVALID_UID;
 1408 
 1409     if (new_gid == nsgid)
 1410         new_gid = LXC_INVALID_GID;
 1411 
 1412     /* Make sure that the processes STDIO is correctly owned by the user that we are switching to */
 1413     ret = fix_stdio_permissions(new_uid);
 1414     if (ret)
 1415         WARN("Failed to ajust stdio permissions");
 1416 
 1417     /* If we are in a new user namespace we already dropped all groups when
 1418      * we switched to root in the new user namespace further above. Only
 1419      * drop groups if we can, so ensure that we have necessary privilege.
 1420      */
 1421     if (lxc_list_empty(&handler->conf->id_map)) {
 1422         #if HAVE_LIBCAP
 1423         if (lxc_proc_cap_is_set(CAP_SETGID, CAP_EFFECTIVE))
 1424         #endif
 1425         {
 1426             if (handler->conf->init_groups.size > 0) {
 1427                 if (!lxc_setgroups(handler->conf->init_groups.list,
 1428                            handler->conf->init_groups.size))
 1429                     goto out_warn_father;
 1430             } else {
 1431                 if (!lxc_drop_groups())
 1432                     goto out_warn_father;
 1433             }
 1434         }
 1435     }
 1436 
 1437     if (!lxc_switch_uid_gid(new_uid, new_gid))
 1438         goto out_warn_father;
 1439 
 1440     ret = lxc_ambient_caps_down();
 1441     if (ret < 0) {
 1442         ERROR("Failed to clear ambient capabilities");
 1443         goto out_warn_father;
 1444     }
 1445 
 1446     if (handler->conf->monitor_signal_pdeath != SIGKILL) {
 1447         ret = lxc_set_death_signal(handler->conf->monitor_signal_pdeath,
 1448                        handler->monitor_pid, status_fd);
 1449         if (ret < 0) {
 1450             SYSERROR("Failed to set PR_SET_PDEATHSIG to %d",
 1451                  handler->conf->monitor_signal_pdeath);
 1452             goto out_warn_father;
 1453         }
 1454     }
 1455 
 1456     /*
 1457      * After this call, we are in error because this ops should not return
 1458      * as it execs.
 1459      */
 1460     handler->ops->start(handler, handler->data);
 1461 
 1462 out_warn_father:
 1463     /*
 1464      * We want the parent to know something went wrong, so we return a
 1465      * special error code.
 1466      */
 1467     lxc_sync_wake_parent(handler, SYNC_ERROR);
 1468 
 1469 out_error:
 1470     return -1;
 1471 }
 1472 
 1473 int resolve_clone_flags(struct lxc_handler *handler)
 1474 {
 1475     int i;
 1476     struct lxc_conf *conf = handler->conf;
 1477     bool wants_timens = conf->timens.s_boot || conf->timens.ns_boot ||
 1478                 conf->timens.s_monotonic || conf->timens.ns_monotonic;
 1479 
 1480     for (i = 0; i < LXC_NS_MAX; i++) {
 1481         if (conf->ns_keep) {
 1482             if (!(conf->ns_keep & ns_info[i].clone_flag))
 1483                 handler->ns_clone_flags |= ns_info[i].clone_flag;
 1484         } else if (conf->ns_clone) {
 1485             if ((conf->ns_clone & ns_info[i].clone_flag))
 1486                 handler->ns_clone_flags |= ns_info[i].clone_flag;
 1487         } else {
 1488             if (i == LXC_NS_USER && lxc_list_empty(&handler->conf->id_map))
 1489                 continue;
 1490 
 1491             if (i == LXC_NS_NET && lxc_requests_empty_network(handler))
 1492                 continue;
 1493 
 1494             if (i == LXC_NS_CGROUP && !cgns_supported())
 1495                 continue;
 1496 
 1497             if (i == LXC_NS_TIME && !wants_timens)
 1498                 continue;
 1499 
 1500             handler->ns_clone_flags |= ns_info[i].clone_flag;
 1501         }
 1502 
 1503         if (!conf->ns_share[i])
 1504             continue;
 1505 
 1506         handler->ns_clone_flags &= ~ns_info[i].clone_flag;
 1507         TRACE("Sharing %s namespace", ns_info[i].proc_name);
 1508     }
 1509 
 1510     if (wants_timens && (conf->ns_keep & ns_info[LXC_NS_TIME].clone_flag))
 1511         return log_trace_errno(-1, EINVAL, "Requested to keep time namespace while also specifying offsets");
 1512 
 1513     /* Deal with namespaces that are unshared. */
 1514     if (handler->ns_clone_flags & CLONE_NEWTIME)
 1515         handler->ns_unshare_flags |= CLONE_NEWTIME;
 1516 
 1517     if (!pure_unified_layout(handler->cgroup_ops) && handler->ns_clone_flags & CLONE_NEWCGROUP)
 1518         handler->ns_unshare_flags |= CLONE_NEWCGROUP;
 1519 
 1520     if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
 1521         (CLONE_NEWNET | CLONE_NEWUSER))
 1522         handler->ns_unshare_flags |= CLONE_NEWNET;
 1523 
 1524     /* Deal with namespaces that are spawned. */
 1525     handler->ns_on_clone_flags = handler->ns_clone_flags & ~handler->ns_unshare_flags;
 1526 
 1527     handler->clone_flags = handler->ns_on_clone_flags | CLONE_PIDFD;
 1528 
 1529     return 0;
 1530 }
 1531 
 1532 /* Note that this function is used with clone(CLONE_VM). Some glibc versions
 1533  * used to reset the pid/tid to -1 when CLONE_VM was used without CLONE_THREAD.
 1534  * But since the memory between parent and child is shared on CLONE_VM this
 1535  * would invalidate the getpid() cache that glibc used to maintain and so
 1536  * getpid() in the child would return the parent's pid. This is all fixed in
 1537  * newer glibc versions where the getpid() cache is removed and the pid/tid is
 1538  * not reset anymore.
 1539  * However, if for whatever reason you - dear committer - somehow need to get the
 1540  * pid of the placeholder intermediate process for do_share_ns() you need to
 1541  * call lxc_raw_getpid(). The next lxc_raw_clone() call does not employ
 1542  * CLONE_VM and will be fine.
 1543  */
 1544 static inline int do_share_ns(void *arg)
 1545 {
 1546     int i, flags, ret;
 1547     struct lxc_handler *handler = arg;
 1548 
 1549     for (i = 0; i < LXC_NS_MAX; i++) {
 1550         if (handler->nsfd[i] < 0)
 1551             continue;
 1552 
 1553         ret = setns(handler->nsfd[i], 0);
 1554         if (ret < 0) {
 1555             /*
 1556              * Note that joining a user and/or mount namespace
 1557              * requires the process is not multithreaded otherwise
 1558              * setns() will fail here.
 1559              */
 1560             SYSERROR("Failed to inherit %s namespace",
 1561                  ns_info[i].proc_name);
 1562             return -1;
 1563         }
 1564 
 1565         DEBUG("Inherited %s namespace", ns_info[i].proc_name);
 1566     }
 1567 
 1568     flags = handler->ns_on_clone_flags;
 1569     flags |= CLONE_PARENT;
 1570     handler->pid = lxc_raw_clone_cb(do_start, handler, CLONE_PIDFD | flags,
 1571                     &handler->pidfd);
 1572     if (handler->pid < 0)
 1573         return -1;
 1574 
 1575     return 0;
 1576 }
 1577 
 1578 /* lxc_spawn() performs crucial setup tasks and clone()s the new process which
 1579  * exec()s the requested container binary.
 1580  * Note that lxc_spawn() runs in the parent namespaces. Any operations performed
 1581  * right here should be double checked if they'd pose a security risk. (For
 1582  * example, any {u}mount() operations performed here will be reflected on the
 1583  * host!)
 1584  */
 1585 static int lxc_spawn(struct lxc_handler *handler)
 1586 {
 1587     __do_close int data_sock0 = -EBADF, data_sock1 = -EBADF;
 1588     int i, ret;
 1589     char pidstr[20];
 1590     bool wants_to_map_ids;
 1591     struct lxc_list *id_map;
 1592     const char *name = handler->name;
 1593     const char *lxcpath = handler->lxcpath;
 1594     bool share_ns = false;
 1595     struct lxc_conf *conf = handler->conf;
 1596     struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
 1597 
 1598     id_map = &conf->id_map;
 1599     wants_to_map_ids = !lxc_list_empty(id_map);
 1600 
 1601     for (i = 0; i < LXC_NS_MAX; i++) {
 1602         if (!conf->ns_share[i])
 1603             continue;
 1604 
 1605         handler->nsfd[i] = lxc_inherit_namespace(conf->ns_share[i], lxcpath, ns_info[i].proc_name);
 1606         if (handler->nsfd[i] < 0)
 1607             return -1;
 1608 
 1609         share_ns = true;
 1610     }
 1611 
 1612     if (!lxc_sync_init(handler))
 1613         return -1;
 1614 
 1615     ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0,
 1616              handler->data_sock);
 1617     if (ret < 0)
 1618         goto out_sync_fini;
 1619     data_sock0 = handler->data_sock[0];
 1620     data_sock1 = handler->data_sock[1];
 1621 
 1622     ret = resolve_clone_flags(handler);
 1623     if (ret < 0)
 1624         goto out_sync_fini;
 1625 
 1626     if (handler->ns_clone_flags & CLONE_NEWNET) {
 1627         ret = lxc_find_gateway_addresses(handler);
 1628         if (ret) {
 1629             ERROR("Failed to find gateway addresses");
 1630             goto out_sync_fini;
 1631         }
 1632     }
 1633 
 1634     if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
 1635         ERROR("Failed creating cgroups");
 1636         goto out_delete_net;
 1637     }
 1638 
 1639     /* Create a process in a new set of namespaces. */
 1640     if (share_ns) {
 1641         pid_t attacher_pid;
 1642 
 1643         attacher_pid = lxc_clone(do_share_ns, handler,
 1644                      CLONE_VFORK | CLONE_VM | CLONE_FILES, NULL);
 1645         if (attacher_pid < 0) {
 1646             SYSERROR(LXC_CLONE_ERROR);
 1647             goto out_delete_net;
 1648         }
 1649 
 1650         ret = wait_for_pid(attacher_pid);
 1651         if (ret < 0) {
 1652             SYSERROR("Intermediate process failed");
 1653             goto out_delete_net;
 1654         }
 1655 
 1656         if (handler->pid < 0) {
 1657             SYSERROR(LXC_CLONE_ERROR);
 1658             goto out_delete_net;
 1659         }
 1660     } else {
 1661         int cgroup_fd = -EBADF;
 1662 
 1663         struct lxc_clone_args clone_args = {
 1664             .flags = handler->clone_flags,
 1665             .pidfd = ptr_to_u64(&handler->pidfd),
 1666             .exit_signal = SIGCHLD,
 1667         };
 1668 
 1669         if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
 1670             cgroup_fd = cgroup_unified_fd(cgroup_ops);
 1671             if (cgroup_fd >= 0) {
 1672                 handler->clone_flags    |= CLONE_INTO_CGROUP;
 1673                 clone_args.flags    |= CLONE_INTO_CGROUP;
 1674                 clone_args.cgroup   = cgroup_fd;
 1675             }
 1676         }
 1677 
 1678         /* Try to spawn directly into target cgroup. */
 1679         handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER2);
 1680         if (handler->pid < 0) {
 1681             SYSTRACE("Failed to spawn container directly into target cgroup");
 1682 
 1683             /* Kernel might simply be too old for CLONE_INTO_CGROUP. */
 1684             handler->clone_flags        &= ~(CLONE_INTO_CGROUP | CLONE_NEWCGROUP);
 1685             handler->ns_on_clone_flags  &= ~CLONE_NEWCGROUP;
 1686             handler->ns_unshare_flags   |= CLONE_NEWCGROUP;
 1687 
 1688             clone_args.flags        = handler->clone_flags;
 1689 
 1690             handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER0);
 1691         } else if (cgroup_fd >= 0) {
 1692             TRACE("Spawned container directly into target cgroup via cgroup2 fd %d", cgroup_fd);
 1693         }
 1694 
 1695         /* Kernel might be too old for clone3(). */
 1696         if (handler->pid < 0) {
 1697             SYSTRACE("Failed to spawn container via clone3()");
 1698 
 1699         /*
 1700          * In contrast to all other architectures arm64 verifies that
 1701          * the argument we use to retrieve the pidfd with is
 1702          * initialized to 0. But we need to be able to initialize it to
 1703          * a negative value such as our customary -EBADF so we can
 1704          * detect whether this kernel supports pidfds. If the syscall
 1705          * returns and the pidfd variable is set to something >= 0 then
 1706          * we know this is a kernel supporting pidfds. But if we can't
 1707          * set it to -EBADF then this won't work since 0 is a valid
 1708          * file descriptor too. And since legacy clone silently ignores
 1709          * unknown flags we are left without any way to detect support
 1710          * for pidfds. So let's special-case arm64 to not fail starting
 1711          * containers.
 1712          */
 1713         #if defined(__aarch64__)
 1714             handler->pid = lxc_raw_legacy_clone(handler->clone_flags & ~CLONE_PIDFD, NULL);
 1715         #else
 1716             handler->pid = lxc_raw_legacy_clone(handler->clone_flags, &handler->pidfd);
 1717         #endif
 1718         }
 1719 
 1720         if (handler->pid < 0) {
 1721             SYSERROR(LXC_CLONE_ERROR);
 1722             goto out_delete_net;
 1723         }
 1724 
 1725         if (handler->pid == 0) {
 1726             (void)do_start(handler);
 1727             _exit(EXIT_FAILURE);
 1728         }
 1729     }
 1730     if (handler->pidfd < 0)
 1731         handler->clone_flags &= ~CLONE_PIDFD;
 1732     TRACE("Cloned child process %d", handler->pid);
 1733 
 1734     /* Verify that we can actually make use of pidfds. */
 1735     if (!lxc_can_use_pidfd(handler->pidfd))
 1736         close_prot_errno_disarm(handler->pidfd);
 1737 
 1738     ret = strnprintf(pidstr, 20, "%d", handler->pid);
 1739     if (ret < 0)
 1740         goto out_delete_net;
 1741 
 1742     ret = setenv("LXC_PID", pidstr, 1);
 1743     if (ret < 0)
 1744         SYSERROR("Failed to set environment variable: LXC_PID=%s", pidstr);
 1745 
 1746     for (i = 0; i < LXC_NS_MAX; i++)
 1747         if (handler->ns_on_clone_flags & ns_info[i].clone_flag)
 1748             INFO("Cloned %s", ns_info[i].flag_name);
 1749 
 1750     if (!lxc_try_preserve_namespaces(handler, handler->ns_on_clone_flags)) {
 1751         ERROR("Failed to preserve cloned namespaces for lxc.hook.stop");
 1752         goto out_delete_net;
 1753     }
 1754 
 1755     lxc_sync_fini_child(handler);
 1756 
 1757     if (lxc_abstract_unix_send_fds(handler->data_sock[0], &handler->monitor_status_fd, 1, NULL, 0) < 0) {
 1758         ERROR("Failed to send status file descriptor to child process");
 1759         goto out_delete_net;
 1760     }
 1761     close_prot_errno_disarm(handler->monitor_status_fd);
 1762 
 1763     /* Map the container uids. The container became an invalid userid the
 1764      * moment it was cloned with CLONE_NEWUSER. This call doesn't change
 1765      * anything immediately, but allows the container to setuid(0) (0 being
 1766      * mapped to something else on the host.) later to become a valid uid
 1767      * again.
 1768      */
 1769     if (wants_to_map_ids) {
 1770         if (!handler->conf->ns_share[LXC_NS_USER] &&
 1771             (handler->conf->ns_keep & CLONE_NEWUSER) == 0) {
 1772             ret = lxc_map_ids(id_map, handler->pid);
 1773             if (ret < 0) {
 1774                 ERROR("Failed to set up id mapping.");
 1775                 goto out_delete_net;
 1776             }
 1777         }
 1778     }
 1779 
 1780     if (!cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, false)) {
 1781         ERROR("Failed to setup cgroup limits for container \"%s\"", name);
 1782         goto out_delete_net;
 1783     }
 1784 
 1785     if (!cgroup_ops->payload_delegate_controllers(cgroup_ops)) {
 1786         ERROR("Failed to delegate controllers to payload cgroup");
 1787         goto out_delete_net;
 1788     }
 1789 
 1790     if (!cgroup_ops->payload_enter(cgroup_ops, handler)) {
 1791         ERROR("Failed to enter cgroups");
 1792         goto out_delete_net;
 1793     }
 1794 
 1795     if (!cgroup_ops->setup_limits(cgroup_ops, handler)) {
 1796         ERROR("Failed to setup cgroup limits for container \"%s\"", name);
 1797         goto out_delete_net;
 1798     }
 1799 
 1800     if (!cgroup_ops->chown(cgroup_ops, handler->conf))
 1801         goto out_delete_net;
 1802 
 1803     if (!lxc_sync_barrier_child(handler, START_SYNC_STARTUP))
 1804         goto out_delete_net;
 1805 
 1806     /* If not done yet, we're now ready to preserve the network namespace */
 1807     if (handler->nsfd[LXC_NS_NET] < 0) {
 1808         ret = lxc_try_preserve_namespace(handler, LXC_NS_NET, "net");
 1809         if (ret < 0) {
 1810             if (ret != -ENOENT) {
 1811                 SYSERROR("Failed to preserve net namespace");
 1812                 goto out_delete_net;
 1813             }
 1814         }
 1815     }
 1816     ret = lxc_netns_set_nsid(handler->nsfd[LXC_NS_NET]);
 1817     if (ret < 0)
 1818         SYSWARN("Failed to allocate new network namespace id");
 1819     else
 1820         TRACE("Allocated new network namespace id");
 1821 
 1822     /* Create the network configuration. */
 1823     if (handler->ns_clone_flags & CLONE_NEWNET) {
 1824         ret = lxc_create_network(handler);
 1825         if (ret < 0) {
 1826             ERROR("Failed to create the network");
 1827             goto out_delete_net;
 1828         }
 1829     }
 1830 
 1831     if (!lxc_list_empty(&conf->procs)) {
 1832         ret = setup_proc_filesystem(&conf->procs, handler->pid);
 1833         if (ret < 0)
 1834             goto out_delete_net;
 1835     }
 1836 
 1837     if (!lxc_list_empty(&conf->limits)) {
 1838         ret = setup_resource_limits(&conf->limits, handler->pid);
 1839         if (ret < 0) {
 1840             ERROR("Failed to setup resource limits");
 1841             goto out_delete_net;
 1842         }
 1843     }
 1844 
 1845     /* Tell the child to continue its initialization. */
 1846     if (!lxc_sync_wake_child(handler, START_SYNC_POST_CONFIGURE))
 1847         goto out_delete_net;
 1848 
 1849     ret = lxc_rootfs_prepare_parent(handler);
 1850     if (ret) {
 1851         ERROR("Failed to prepare rootfs");
 1852         goto out_delete_net;
 1853     }
 1854 
 1855     if (handler->ns_clone_flags & CLONE_NEWNET) {
 1856         ret = lxc_network_send_to_child(handler);
 1857         if (ret < 0) {
 1858             SYSERROR("Failed to send veth names to child");
 1859             goto out_delete_net;
 1860         }
 1861     }
 1862 
 1863     if (!lxc_sync_wait_child(handler, START_SYNC_IDMAPPED_MOUNTS))
 1864         goto out_delete_net;
 1865 
 1866     ret = lxc_idmapped_mounts_parent(handler);
 1867     if (ret) {
 1868         ERROR("Failed to setup mount entries");
 1869         goto out_delete_net;
 1870     }
 1871 
 1872     if (!lxc_sync_wait_child(handler, START_SYNC_CGROUP_LIMITS))
 1873         goto out_delete_net;
 1874 
 1875     /*
 1876      * With isolation the limiting devices cgroup was already setup, so
 1877      * only setup devices here if we have no namespace directory.
 1878      */
 1879     if (!handler->conf->cgroup_meta.namespace_dir &&
 1880         !cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, true)) {
 1881         ERROR("Failed to setup legacy device cgroup controller limits");
 1882         goto out_delete_net;
 1883     }
 1884     TRACE("Set up legacy device cgroup controller limits");
 1885 
 1886     if (!cgroup_ops->devices_activate(cgroup_ops, handler)) {
 1887         ERROR("Failed to setup cgroup2 device controller limits");
 1888         goto out_delete_net;
 1889     }
 1890     TRACE("Set up cgroup2 device controller limits");
 1891 
 1892     cgroup_ops->finalize(cgroup_ops);
 1893     TRACE("Finished setting up cgroups");
 1894 
 1895     /* Run any host-side start hooks */
 1896     ret = run_lxc_hooks(name, "start-host", conf, NULL);
 1897     if (ret < 0) {
 1898         ERROR("Failed to run lxc.hook.start-host");
 1899         goto out_delete_net;
 1900     }
 1901 
 1902     if (!lxc_sync_wake_child(handler, START_SYNC_FDS))
 1903         goto out_delete_net;
 1904 
 1905     if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
 1906         /* Now we're ready to preserve the cgroup namespace */
 1907         ret = lxc_try_preserve_namespace(handler, LXC_NS_CGROUP, "cgroup");
 1908         if (ret < 0) {
 1909             if (ret != -ENOENT) {
 1910                 SYSERROR("Failed to preserve cgroup namespace");
 1911                 goto out_delete_net;
 1912             }
 1913         }
 1914     }
 1915 
 1916     if (handler->ns_unshare_flags & CLONE_NEWTIME) {
 1917         /* Now we're ready to preserve the time namespace */
 1918         ret = lxc_try_preserve_namespace(handler, LXC_NS_TIME, "time");
 1919         if (ret < 0) {
 1920             if (ret != -ENOENT) {
 1921                 SYSERROR("Failed to preserve time namespace");
 1922                 goto out_delete_net;
 1923             }
 1924         }
 1925     }
 1926 
 1927     ret = lxc_sync_fds_parent(handler);
 1928     if (ret < 0) {
 1929         SYSERROR("Failed to sync file descriptors with child");
 1930         goto out_delete_net;
 1931     }
 1932 
 1933     /*
 1934      * Tell the child to complete its initialization and wait for it to
 1935      * exec or return an error. (The child will never return
 1936      * START_SYNC_READY_START+1. It will either close the sync pipe,
 1937      * causing lxc_sync_barrier_child to return success, or return a
 1938      * different value, causing us to error out).
 1939      */
 1940     if (!lxc_sync_barrier_child(handler, START_SYNC_READY_START))
 1941         goto out_delete_net;
 1942 
 1943     /* Now all networks are created, network devices are moved into place,
 1944      * and the correct names and ifindices in the respective namespaces have
 1945      * been recorded. The corresponding structs have now all been filled. So
 1946      * log them for debugging purposes.
 1947      */
 1948     lxc_log_configured_netdevs(conf);
 1949 
 1950     ret = handler->ops->post_start(handler, handler->data);
 1951     if (ret < 0)
 1952         goto out_abort;
 1953 
 1954     ret = lxc_set_state(name, handler, RUNNING);
 1955     if (ret < 0) {
 1956         ERROR("Failed to set state to \"%s\"", lxc_state2str(RUNNING));
 1957         goto out_abort;
 1958     }
 1959 
 1960     lxc_sync_fini(handler);
 1961 
 1962     return 0;
 1963 
 1964 out_delete_net:
 1965     if (handler->ns_clone_flags & CLONE_NEWNET)
 1966         lxc_delete_network(handler);
 1967 
 1968 out_abort:
 1969     lxc_abort(handler);
 1970 
 1971 out_sync_fini:
 1972     lxc_sync_fini(handler);
 1973 
 1974     return -1;
 1975 }
 1976 
 1977 int __lxc_start(struct lxc_handler *handler, struct lxc_operations *ops,
 1978         void *data, const char *lxcpath, bool daemonize, int *error_num)
 1979 {
 1980     int ret, status;
 1981     const char *name = handler->name;
 1982     struct lxc_conf *conf = handler->conf;
 1983     struct cgroup_ops *cgroup_ops;
 1984 
 1985     ret = lxc_init(name, handler);
 1986     if (ret < 0) {
 1987         ERROR("Failed to initialize container \"%s\"", name);
 1988         goto out_abort;
 1989     }
 1990     handler->ops = ops;
 1991     handler->data = data;
 1992     handler->daemonize = daemonize;
 1993     cgroup_ops = handler->cgroup_ops;
 1994 
 1995     if (!attach_block_device(handler->conf)) {
 1996         ERROR("Failed to attach block device");
 1997         ret = -1;
 1998         goto out_abort;
 1999     }
 2000 
 2001     if (!cgroup_ops->monitor_create(cgroup_ops, handler)) {
 2002         ERROR("Failed to create monitor cgroup");
 2003         ret = -1;
 2004         goto out_abort;
 2005     }
 2006 
 2007     if (!cgroup_ops->monitor_delegate_controllers(cgroup_ops)) {
 2008         ERROR("Failed to delegate controllers to monitor cgroup");
 2009         ret = -1;
 2010         goto out_abort;
 2011     }
 2012 
 2013     if (!cgroup_ops->monitor_enter(cgroup_ops, handler)) {
 2014         ERROR("Failed to enter monitor cgroup");
 2015         ret = -1;
 2016         goto out_abort;
 2017     }
 2018 
 2019     /* If the rootfs is not a blockdev, prevent the container from marking
 2020      * it readonly.
 2021      * If the container is unprivileged then skip rootfs pinning.
 2022      */
 2023     ret = lxc_rootfs_init(conf, !lxc_list_empty(&conf->id_map));
 2024     if (ret) {
 2025         ERROR("Failed to handle rootfs pinning for container \"%s\"", handler->name);
 2026         ret = -1;
 2027         goto out_abort;
 2028     }
 2029 
 2030     if (geteuid() == 0 && !lxc_list_empty(&conf->id_map)) {
 2031         /*
 2032          * Most filesystems can't be mounted inside a userns so handle them here.
 2033          */
 2034         if (rootfs_is_blockdev(conf)) {
 2035             ret = unshare(CLONE_NEWNS);
 2036             if (ret < 0) {
 2037                 ERROR("Failed to unshare CLONE_NEWNS");
 2038                 goto out_abort;
 2039             }
 2040             INFO("Unshared CLONE_NEWNS");
 2041 
 2042             ret = lxc_setup_rootfs_prepare_root(conf, name, lxcpath);
 2043             if (ret < 0) {
 2044                 ERROR("Error setting up rootfs mount as root before spawn");
 2045                 goto out_abort;
 2046             }
 2047             INFO("Set up container rootfs as host root");
 2048         }
 2049     }
 2050 
 2051     ret = lxc_spawn(handler);
 2052     if (ret < 0) {
 2053         ERROR("Failed to spawn container \"%s\"", name);
 2054         goto out_detach_blockdev;
 2055     }
 2056 
 2057     handler->conf->reboot = REBOOT_NONE;
 2058 
 2059     ret = lxc_poll(name, handler);
 2060     if (ret) {
 2061         ERROR("LXC mainloop exited with error: %d", ret);
 2062         goto out_delete_network;
 2063     }
 2064 
 2065     if (!handler->init_died && handler->pid > 0) {
 2066         ERROR("Child process is not killed");
 2067         ret = -1;
 2068         goto out_delete_network;
 2069     }
 2070 
 2071     status = lxc_wait_for_pid_status(handler->pid);
 2072     if (status < 0)
 2073         SYSERROR("Failed to retrieve status for %d", handler->pid);
 2074 
 2075     /* If the child process exited but was not signaled, it didn't call
 2076      * reboot. This should mean it was an lxc-execute which simply exited.
 2077      * In any case, treat it as a 'halt'.
 2078      */
 2079     if (WIFSIGNALED(status)) {
 2080         switch(WTERMSIG(status)) {
 2081         case SIGINT: /* halt */
 2082             DEBUG("Container \"%s\" is halting", name);
 2083             break;
 2084         case SIGHUP: /* reboot */
 2085             DEBUG("Container \"%s\" is rebooting", name);
 2086             handler->conf->reboot = REBOOT_REQ;
 2087             break;
 2088         case SIGSYS: /* seccomp */
 2089             DEBUG("Container \"%s\" violated its seccomp policy", name);
 2090             break;
 2091         default:
 2092             DEBUG("Unknown exit status for container \"%s\" init %d", name, WTERMSIG(status));
 2093             break;
 2094         }
 2095     }
 2096 
 2097     ret = lxc_restore_phys_nics_to_netns(handler);
 2098     if (ret < 0)
 2099         ERROR("Failed to move physical network devices back to parent network namespace");
 2100 
 2101     lxc_monitor_send_exit_code(name, status, handler->lxcpath);
 2102     lxc_error_set_and_log(handler->pid, status);
 2103     if (error_num)
 2104         *error_num = handler->exit_status;
 2105 
 2106     lxc_delete_network(handler);
 2107     detach_block_device(handler->conf);
 2108     lxc_end(handler);
 2109     return ret;
 2110 
 2111 out_abort:
 2112     lxc_abort(handler);
 2113     lxc_end(handler);
 2114     return ret;
 2115 
 2116 out_detach_blockdev:
 2117     lxc_abort(handler);
 2118     detach_block_device(handler->conf);
 2119     lxc_end(handler);
 2120     return ret;
 2121 
 2122 out_delete_network:
 2123     lxc_abort(handler);
 2124     lxc_restore_phys_nics_to_netns(handler);
 2125     lxc_delete_network(handler);
 2126     detach_block_device(handler->conf);
 2127     lxc_end(handler);
 2128     return ret;
 2129 }
 2130 
 2131 struct start_args {
 2132     char *const *argv;
 2133 };
 2134 
 2135 static int start(struct lxc_handler *handler, void* data)
 2136 {
 2137     struct start_args *arg = data;
 2138 
 2139     NOTICE("Exec'ing \"%s\"", arg->argv[0]);
 2140 
 2141     execvp(arg->argv[0], arg->argv);
 2142     SYSERROR("Failed to exec \"%s\"", arg->argv[0]);
 2143     return 0;
 2144 }
 2145 
 2146 static int post_start(struct lxc_handler *handler, void* data)
 2147 {
 2148     struct start_args *arg = data;
 2149 
 2150     NOTICE("Started \"%s\" with pid \"%d\"", arg->argv[0], handler->pid);
 2151     return 0;
 2152 }
 2153 
 2154 static struct lxc_operations start_ops = {
 2155     .start = start,
 2156     .post_start = post_start
 2157 };
 2158 
 2159 int lxc_start(char *const argv[], struct lxc_handler *handler,
 2160           const char *lxcpath, bool daemonize, int *error_num)
 2161 {
 2162     struct start_args start_arg = {
 2163         .argv = argv,
 2164     };
 2165 
 2166     TRACE("Doing lxc_start");
 2167     return __lxc_start(handler, &start_ops, &start_arg, lxcpath, daemonize, error_num);
 2168 }
 2169 
 2170 static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
 2171                         const char *name)
 2172 {
 2173     char destroy[PATH_MAX];
 2174     struct lxc_container *c;
 2175     int ret = 0;
 2176     bool bret = true;
 2177 
 2178     if (handler->conf->rootfs.path && handler->conf->rootfs.mount) {
 2179         bret = do_destroy_container(handler);
 2180         if (!bret) {
 2181             ERROR("Error destroying rootfs for container \"%s\"", name);
 2182             return;
 2183         }
 2184     }
 2185     INFO("Destroyed rootfs for container \"%s\"", name);
 2186 
 2187     ret = strnprintf(destroy, sizeof(destroy), "%s/%s", handler->lxcpath, name);
 2188     if (ret < 0) {
 2189         ERROR("Error destroying directory for container \"%s\"", name);
 2190         return;
 2191     }
 2192 
 2193     c = lxc_container_new(name, handler->lxcpath);
 2194     if (c) {
 2195         if (container_disk_lock(c)) {
 2196             INFO("Could not update lxc_snapshots file");
 2197             lxc_container_put(c);
 2198         } else {
 2199             mod_all_rdeps(c, false);
 2200             container_disk_unlock(c);
 2201             lxc_container_put(c);
 2202         }
 2203     }
 2204 
 2205     if (!handler->am_root)
 2206         ret = userns_exec_full(handler->conf, lxc_rmdir_onedev_wrapper,
 2207                        destroy, "lxc_rmdir_onedev_wrapper");
 2208     else
 2209         ret = lxc_rmdir_onedev(destroy, NULL);
 2210 
 2211     if (ret < 0) {
 2212         ERROR("Error destroying directory for container \"%s\"", name);
 2213         return;
 2214     }
 2215     INFO("Destroyed directory for container \"%s\"", name);
 2216 }
 2217 
 2218 static int lxc_rmdir_onedev_wrapper(void *data)
 2219 {
 2220     char *arg = (char *) data;
 2221     return lxc_rmdir_onedev(arg, NULL);
 2222 }
 2223 
 2224 static bool do_destroy_container(struct lxc_handler *handler)
 2225 {
 2226     int ret;
 2227 
 2228     if (!handler->am_root) {
 2229         ret = userns_exec_full(handler->conf, storage_destroy_wrapper,
 2230                        handler->conf, "storage_destroy_wrapper");
 2231         if (ret < 0)
 2232             return false;
 2233 
 2234         return true;
 2235     }
 2236 
 2237     return storage_destroy(handler->conf);
 2238 }