"Fossies" - the Fresh Open Source Software Archive

Member "lxc-4.0.10/src/lxc/seccomp.c" (16 Jul 2021, 42533 Bytes) of package /linux/misc/lxc-4.0.10.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "seccomp.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 4.0.9_vs_4.0.10.

    1 /* SPDX-License-Identifier: LGPL-2.1+ */
    2 
    3 #ifndef _GNU_SOURCE
    4 #define _GNU_SOURCE 1
    5 #endif
    6 #include <errno.h>
    7 #include <seccomp.h>
    8 #include <stdio.h>
    9 #include <stdlib.h>
   10 #include <sys/epoll.h>
   11 #include <sys/mount.h>
   12 #include <sys/utsname.h>
   13 
   14 #include "af_unix.h"
   15 #include "commands.h"
   16 #include "config.h"
   17 #include "log.h"
   18 #include "lxccontainer.h"
   19 #include "lxcseccomp.h"
   20 #include "mainloop.h"
   21 #include "memory_utils.h"
   22 #include "utils.h"
   23 
   24 #ifdef __MIPSEL__
   25 #define MIPS_ARCH_O32 lxc_seccomp_arch_mipsel
   26 #define MIPS_ARCH_N64 lxc_seccomp_arch_mipsel64
   27 #else
   28 #define MIPS_ARCH_O32 lxc_seccomp_arch_mips
   29 #define MIPS_ARCH_N64 lxc_seccomp_arch_mips64
   30 #endif
   31 
   32 #ifndef SECCOMP_GET_NOTIF_SIZES
   33 #define SECCOMP_GET_NOTIF_SIZES 3
   34 #endif
   35 
   36 lxc_log_define(seccomp, lxc);
   37 
   38 #if HAVE_DECL_SECCOMP_NOTIFY_FD
   39 static inline int __seccomp(unsigned int operation, unsigned int flags,
   40               void *args)
   41 {
   42     return syscall(__NR_seccomp, operation, flags, args);
   43 }
   44 #endif
   45 
   46 static int parse_config_v1(FILE *f, char *line, size_t *line_bufsz, struct lxc_conf *conf)
   47 {
   48     int ret = 0;
   49 
   50     while (getline(&line, line_bufsz, f) != -1) {
   51         int nr;
   52 
   53         ret = sscanf(line, "%d", &nr);
   54         if (ret != 1) {
   55             ret = -1;
   56             break;
   57         }
   58 
   59 #if HAVE_SCMP_FILTER_CTX
   60         ret = seccomp_rule_add(conf->seccomp.seccomp_ctx, SCMP_ACT_ALLOW, nr, 0);
   61 #else
   62         ret = seccomp_rule_add(SCMP_ACT_ALLOW, nr, 0);
   63 #endif
   64         if (ret < 0) {
   65             ERROR("Failed loading allow rule for %d", nr);
   66             break;
   67         }
   68     }
   69     free(line);
   70 
   71     return ret;
   72 }
   73 
   74 #if HAVE_DECL_SECCOMP_SYSCALL_RESOLVE_NAME_ARCH
   75 static const char *get_action_name(uint32_t action)
   76 {
   77     /* The upper 16 bits indicate the type of the seccomp action. */
   78     switch (action & 0xffff0000) {
   79     case SCMP_ACT_KILL:
   80         return "kill";
   81     case SCMP_ACT_ALLOW:
   82         return "allow";
   83     case SCMP_ACT_TRAP:
   84         return "trap";
   85     case SCMP_ACT_ERRNO(0):
   86         return "errno";
   87 #if HAVE_DECL_SECCOMP_NOTIFY_FD
   88     case SCMP_ACT_NOTIFY:
   89         return "notify";
   90 #endif
   91     }
   92 
   93     return "invalid action";
   94 }
   95 
   96 static uint32_t get_v2_default_action(char *line)
   97 {
   98     uint32_t ret_action = -1;
   99 
  100     while (*line == ' ')
  101         line++;
  102 
  103     /* After 'allowlist' or 'denylist' comes default behavior. */
  104     if (strnequal(line, "kill", 4)) {
  105         ret_action = SCMP_ACT_KILL;
  106     } else if (strnequal(line, "errno", 5)) {
  107         int e, ret;
  108 
  109         ret = sscanf(line + 5, "%d", &e);
  110         if (ret != 1) {
  111             ERROR("Failed to parse errno value from %s", line);
  112             return -2;
  113         }
  114 
  115         ret_action = SCMP_ACT_ERRNO(e);
  116     } else if (strnequal(line, "allow", 5)) {
  117         ret_action = SCMP_ACT_ALLOW;
  118     } else if (strnequal(line, "trap", 4)) {
  119         ret_action = SCMP_ACT_TRAP;
  120 #if HAVE_DECL_SECCOMP_NOTIFY_FD
  121     } else if (strnequal(line, "notify", 6)) {
  122         ret_action = SCMP_ACT_NOTIFY;
  123 #endif
  124     } else if (line[0]) {
  125         ERROR("Unrecognized seccomp action \"%s\"", line);
  126         return -2;
  127     }
  128 
  129     return ret_action;
  130 }
  131 
  132 static uint32_t get_v2_action(char *line, uint32_t def_action)
  133 {
  134     char *p;
  135     uint32_t ret;
  136 
  137     p = strchr(line, ' ');
  138     if (!p)
  139         return def_action;
  140     p++;
  141 
  142     while (*p == ' ')
  143         p++;
  144 
  145     if (!*p || *p == '#')
  146         return def_action;
  147 
  148     ret = get_v2_default_action(p);
  149     switch (ret) {
  150     case -2:
  151         return -1;
  152     case -1:
  153         return def_action;
  154     }
  155 
  156     return ret;
  157 }
  158 
  159 struct seccomp_v2_rule_args {
  160     uint32_t index;
  161     uint64_t value;
  162     uint64_t mask;
  163     enum scmp_compare op;
  164 };
  165 
  166 struct seccomp_v2_rule {
  167     uint32_t action;
  168     uint32_t args_num;
  169     struct seccomp_v2_rule_args args_value[6];
  170 };
  171 
  172 static enum scmp_compare parse_v2_rule_op(char *s)
  173 {
  174     if (strequal(s, "SCMP_CMP_NE") || strequal(s, "!="))
  175         return SCMP_CMP_NE;
  176     else if (strequal(s, "SCMP_CMP_LT") || strequal(s, "<"))
  177         return SCMP_CMP_LT;
  178     else if (strequal(s, "SCMP_CMP_LE") || strequal(s, "<="))
  179         return SCMP_CMP_LE;
  180     else if (strequal(s, "SCMP_CMP_EQ") || strequal(s, "=="))
  181         return SCMP_CMP_EQ;
  182     else if (strequal(s, "SCMP_CMP_GE") || strequal(s, ">="))
  183         return SCMP_CMP_GE;
  184     else if (strequal(s, "SCMP_CMP_GT") || strequal(s, ">"))
  185         return SCMP_CMP_GT;
  186     else if (strequal(s, "SCMP_CMP_MASKED_EQ") || strequal(s, "&="))
  187         return SCMP_CMP_MASKED_EQ;
  188 
  189     return _SCMP_CMP_MAX;
  190 }
  191 
  192 /*
  193  * This function is used to parse the args string into the structure.
  194  * args string format:[index,value,op,mask] or [index,value,op]
  195  * index: the index for syscall arguments (type uint)
  196  * value: the value for syscall arguments (type uint64)
  197  * op: the operator for syscall arguments(string),
  198      a valid list of constants as of libseccomp v2.3.2 is
  199      SCMP_CMP_NE,SCMP_CMP_LE,SCMP_CMP_LE, SCMP_CMP_EQ, SCMP_CMP_GE,
  200      SCMP_CMP_GT, SCMP_CMP_MASKED_EQ, or !=,<=,==,>=,>,&=
  201  * mask: the mask to apply on "value" for SCMP_CMP_MASKED_EQ (type uint64, optional)
  202  * Returns 0 on success, < 0 otherwise.
  203  */
  204 static int get_seccomp_arg_value(char *key, struct seccomp_v2_rule_args *rule_args)
  205 {
  206     int ret = 0;
  207     uint32_t index = 0;
  208     uint64_t mask = 0, value = 0;
  209     enum scmp_compare op = 0;
  210     char *tmp = NULL;
  211     char s[31] = {0}, v[24] = {0}, m[24] = {'0'};
  212 
  213     tmp = strchr(key, '[');
  214     if (!tmp) {
  215         ERROR("Failed to interpret args");
  216         return -1;
  217     }
  218 
  219     ret = sscanf(tmp, "[%i,%23[^,],%30[^0-9^,],%23[^,]", &index, v, s, m);
  220     if ((ret != 3 && ret != 4) || index >= 6) {
  221         ERROR("Failed to interpret args value");
  222         return -1;
  223     }
  224 
  225     ret = lxc_safe_uint64(v, &value, 0);
  226     if (ret < 0) {
  227         ERROR("Invalid argument value");
  228         return -1;
  229     }
  230 
  231     ret = lxc_safe_uint64(m, &mask, 0);
  232     if (ret < 0) {
  233         ERROR("Invalid argument mask");
  234         return -1;
  235     }
  236 
  237     op = parse_v2_rule_op(s);
  238     if (op == _SCMP_CMP_MAX) {
  239         ERROR("Failed to interpret args operator value");
  240         return -1;
  241     }
  242 
  243     rule_args->index = index;
  244     rule_args->value = value;
  245     rule_args->mask = mask;
  246     rule_args->op = op;
  247     return 0;
  248 }
  249 
  250 /* This function is used to parse the seccomp rule entry.
  251  * @line    : seccomp rule entry string.
  252  * @def_action  : default action used in the case if the 'line' contain non valid action.
  253  * @rules   : output struct.
  254  * Returns 0 on success, < 0 otherwise.
  255  */
  256 static int parse_v2_rules(char *line, uint32_t def_action,
  257               struct seccomp_v2_rule *rules)
  258 {
  259     int i = 0, ret = -1;
  260     char *key = NULL, *saveptr = NULL, *tmp = NULL;
  261 
  262     tmp = strdup(line);
  263     if (!tmp)
  264         return -1;
  265 
  266     /* read optional action which follows the syscall */
  267     rules->action = get_v2_action(tmp, def_action);
  268     if (rules->action == -1) {
  269         ERROR("Failed to interpret action");
  270         ret = -1;
  271         goto on_error;
  272     }
  273 
  274     ret = 0;
  275     rules->args_num = 0;
  276     if (!strchr(tmp, '['))
  277         goto on_error;
  278 
  279     ret = -1;
  280     for ((key = strtok_r(tmp, "]", &saveptr)), i = 0; key && i < 6;
  281          (key = strtok_r(NULL, "]", &saveptr)), i++) {
  282         ret = get_seccomp_arg_value(key, &rules->args_value[i]);
  283         if (ret < 0)
  284             goto on_error;
  285 
  286         rules->args_num++;
  287     }
  288 
  289     ret = 0;
  290 
  291 on_error:
  292     free(tmp);
  293 
  294     return ret;
  295 }
  296 #endif
  297 
  298 #if HAVE_DECL_SECCOMP_SYSCALL_RESOLVE_NAME_ARCH
  299 enum lxc_hostarch_t {
  300     lxc_seccomp_arch_all = 0,
  301     lxc_seccomp_arch_native,
  302     lxc_seccomp_arch_i386,
  303     lxc_seccomp_arch_x32,
  304     lxc_seccomp_arch_amd64,
  305     lxc_seccomp_arch_arm,
  306     lxc_seccomp_arch_arm64,
  307     lxc_seccomp_arch_ppc64,
  308     lxc_seccomp_arch_ppc64le,
  309     lxc_seccomp_arch_ppc,
  310     lxc_seccomp_arch_mips,
  311     lxc_seccomp_arch_mips64,
  312     lxc_seccomp_arch_mips64n32,
  313     lxc_seccomp_arch_mipsel,
  314     lxc_seccomp_arch_mipsel64,
  315     lxc_seccomp_arch_mipsel64n32,
  316     lxc_seccomp_arch_s390x,
  317     lxc_seccomp_arch_s390,
  318     lxc_seccomp_arch_unknown = 999,
  319 };
  320 
  321 static int get_hostarch(void)
  322 {
  323     struct utsname uts;
  324     if (uname(&uts) < 0) {
  325         SYSERROR("Failed to read host arch");
  326         return -1;
  327     }
  328 
  329     if (strequal(uts.machine, "i686"))
  330         return lxc_seccomp_arch_i386;
  331     /* no x32 kernels */
  332     else if (strequal(uts.machine, "x86_64"))
  333         return lxc_seccomp_arch_amd64;
  334     else if (strnequal(uts.machine, "armv7", 5))
  335         return lxc_seccomp_arch_arm;
  336     else if (strnequal(uts.machine, "aarch64", 7))
  337         return lxc_seccomp_arch_arm64;
  338     else if (strnequal(uts.machine, "ppc64le", 7))
  339         return lxc_seccomp_arch_ppc64le;
  340     else if (strnequal(uts.machine, "ppc64", 5))
  341         return lxc_seccomp_arch_ppc64;
  342     else if (strnequal(uts.machine, "ppc", 3))
  343         return lxc_seccomp_arch_ppc;
  344     else if (strnequal(uts.machine, "mips64", 6))
  345         return MIPS_ARCH_N64;
  346     else if (strnequal(uts.machine, "mips", 4))
  347         return MIPS_ARCH_O32;
  348     else if (strnequal(uts.machine, "s390x", 5))
  349         return lxc_seccomp_arch_s390x;
  350     else if (strnequal(uts.machine, "s390", 4))
  351         return lxc_seccomp_arch_s390;
  352     return lxc_seccomp_arch_unknown;
  353 }
  354 
  355 static scmp_filter_ctx get_new_ctx(enum lxc_hostarch_t n_arch, uint32_t default_policy_action,
  356                    bool *needs_merge)
  357 {
  358     int ret;
  359     uint32_t arch;
  360     scmp_filter_ctx ctx;
  361 
  362     switch (n_arch) {
  363     case lxc_seccomp_arch_i386:
  364         arch = SCMP_ARCH_X86;
  365         break;
  366     case lxc_seccomp_arch_x32:
  367         arch = SCMP_ARCH_X32;
  368         break;
  369     case lxc_seccomp_arch_amd64:
  370         arch = SCMP_ARCH_X86_64;
  371         break;
  372     case lxc_seccomp_arch_arm:
  373         arch = SCMP_ARCH_ARM;
  374         break;
  375 #ifdef SCMP_ARCH_AARCH64
  376     case lxc_seccomp_arch_arm64:
  377         arch = SCMP_ARCH_AARCH64;
  378         break;
  379 #endif
  380 #ifdef SCMP_ARCH_PPC64LE
  381     case lxc_seccomp_arch_ppc64le:
  382         arch = SCMP_ARCH_PPC64LE;
  383         break;
  384 #endif
  385 #ifdef SCMP_ARCH_PPC64
  386     case lxc_seccomp_arch_ppc64:
  387         arch = SCMP_ARCH_PPC64;
  388         break;
  389 #endif
  390 #ifdef SCMP_ARCH_PPC
  391     case lxc_seccomp_arch_ppc:
  392         arch = SCMP_ARCH_PPC;
  393         break;
  394 #endif
  395 #ifdef SCMP_ARCH_MIPS
  396     case lxc_seccomp_arch_mips:
  397         arch = SCMP_ARCH_MIPS;
  398         break;
  399     case lxc_seccomp_arch_mips64:
  400         arch = SCMP_ARCH_MIPS64;
  401         break;
  402     case lxc_seccomp_arch_mips64n32:
  403         arch = SCMP_ARCH_MIPS64N32;
  404         break;
  405     case lxc_seccomp_arch_mipsel:
  406         arch = SCMP_ARCH_MIPSEL;
  407         break;
  408     case lxc_seccomp_arch_mipsel64:
  409         arch = SCMP_ARCH_MIPSEL64;
  410         break;
  411     case lxc_seccomp_arch_mipsel64n32:
  412         arch = SCMP_ARCH_MIPSEL64N32;
  413         break;
  414 #endif
  415 #ifdef SCMP_ARCH_S390X
  416     case lxc_seccomp_arch_s390x:
  417         arch = SCMP_ARCH_S390X;
  418         break;
  419 #endif
  420 #ifdef SCMP_ARCH_S390
  421     case lxc_seccomp_arch_s390:
  422         arch = SCMP_ARCH_S390;
  423         break;
  424 #endif
  425     default:
  426         return NULL;
  427     }
  428 
  429     ctx = seccomp_init(default_policy_action);
  430     if (!ctx) {
  431         ERROR("Error initializing seccomp context");
  432         return NULL;
  433     }
  434 
  435     ret = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
  436     if (ret < 0) {
  437         errno = -ret;
  438         SYSERROR("Failed to turn off no-new-privs");
  439         seccomp_release(ctx);
  440         return NULL;
  441     }
  442 
  443 #ifdef SCMP_FLTATR_ATL_TSKIP
  444     ret = seccomp_attr_set(ctx, SCMP_FLTATR_ATL_TSKIP, 1);
  445     if (ret < 0) {
  446         errno = -ret;
  447         SYSWARN("Failed to turn on seccomp nop-skip, continuing");
  448     }
  449 #endif
  450 
  451     ret = seccomp_arch_exist(ctx, arch);
  452     if (ret < 0) {
  453         if (ret != -EEXIST) {
  454             errno = -ret;
  455             SYSERROR("Failed to determine whether arch %d is "
  456                      "already present in the main seccomp context",
  457                      (int)n_arch);
  458             seccomp_release(ctx);
  459             return NULL;
  460         }
  461 
  462         ret = seccomp_arch_add(ctx, arch);
  463         if (ret != 0) {
  464             errno = -ret;
  465             SYSERROR("Failed to add arch %d to main seccomp context",
  466                      (int)n_arch);
  467             seccomp_release(ctx);
  468             return NULL;
  469         }
  470         TRACE("Added arch %d to main seccomp context", (int)n_arch);
  471 
  472         ret = seccomp_arch_remove(ctx, SCMP_ARCH_NATIVE);
  473         if (ret != 0) {
  474             ERROR("Failed to remove native arch from main seccomp context");
  475             seccomp_release(ctx);
  476             return NULL;
  477         }
  478         TRACE("Removed native arch from main seccomp context");
  479 
  480         *needs_merge = true;
  481     } else {
  482         *needs_merge = false;
  483         TRACE("Arch %d already present in main seccomp context", (int)n_arch);
  484     }
  485 
  486     return ctx;
  487 }
  488 
  489 enum lxc_seccomp_rule_status_t {
  490   lxc_seccomp_rule_added = 0,
  491   lxc_seccomp_rule_err,
  492   lxc_seccomp_rule_undefined_syscall,
  493   lxc_seccomp_rule_unsupported_arch,
  494 };
  495 
  496 static enum lxc_seccomp_rule_status_t do_resolve_add_rule(uint32_t arch, char *line, scmp_filter_ctx ctx,
  497                 struct seccomp_v2_rule *rule)
  498 {
  499     int i, nr, ret;
  500     struct scmp_arg_cmp arg_cmp[6];
  501 
  502     ret = seccomp_arch_exist(ctx, arch);
  503     if (arch && ret != 0) {
  504         errno = -ret;
  505         SYSERROR("Seccomp: rule and context arch do not match (arch %d)", arch);
  506         return lxc_seccomp_rule_err;
  507     }
  508 
  509     /*get the syscall name*/
  510     char *p = strchr(line, ' ');
  511     if (p)
  512         *p = '\0';
  513 
  514     if (strnequal(line, "reject_force_umount", 19)) {
  515         ret = seccomp_rule_add_exact(ctx, SCMP_ACT_ERRNO(EACCES),
  516                          SCMP_SYS(umount2), 1,
  517                          SCMP_A1(SCMP_CMP_MASKED_EQ, MNT_FORCE, MNT_FORCE));
  518         if (ret < 0) {
  519             errno = -ret;
  520             SYSERROR("Failed loading rule to reject force umount");
  521             return lxc_seccomp_rule_err;
  522         }
  523 
  524         INFO("Set seccomp rule to reject force umounts");
  525         return lxc_seccomp_rule_added;
  526     }
  527 
  528     nr = seccomp_syscall_resolve_name(line);
  529     if (nr == __NR_SCMP_ERROR) {
  530         INFO("The syscall[%s] is is undefined on host native arch", line);
  531         return lxc_seccomp_rule_undefined_syscall;
  532     }
  533 
  534     // The syscall resolves to a pseudo syscall and may be available on compat archs.
  535     if (nr < 0 && arch == SCMP_ARCH_NATIVE) {
  536         DEBUG("The syscall[%d:%s] is a pseudo syscall and not available on host native arch.", nr, line);
  537         return lxc_seccomp_rule_unsupported_arch;
  538     }
  539 
  540     if (arch != SCMP_ARCH_NATIVE && seccomp_syscall_resolve_name_arch(arch, line) < 0) {
  541         DEBUG("The syscall[%d:%s] is not supported on compat arch[%u]", nr, line, arch);
  542         return lxc_seccomp_rule_unsupported_arch;
  543     }
  544 
  545     memset(&arg_cmp, 0, sizeof(arg_cmp));
  546     for (i = 0; i < rule->args_num; i++) {
  547         INFO("arg_cmp[%d]: SCMP_CMP(%u, %llu, %llu, %llu)", i,
  548              rule->args_value[i].index,
  549              (long long unsigned int)rule->args_value[i].op,
  550              (long long unsigned int)rule->args_value[i].mask,
  551              (long long unsigned int)rule->args_value[i].value);
  552 
  553         if (SCMP_CMP_MASKED_EQ == rule->args_value[i].op)
  554             arg_cmp[i] = SCMP_CMP(rule->args_value[i].index,
  555                           rule->args_value[i].op,
  556                           rule->args_value[i].mask,
  557                           rule->args_value[i].value);
  558         else
  559             arg_cmp[i] = SCMP_CMP(rule->args_value[i].index,
  560                           rule->args_value[i].op,
  561                           rule->args_value[i].value);
  562     }
  563 
  564     INFO("Adding %s rule for syscall[%d:%s] action[%d:%s] arch[%u]",
  565               (arch == SCMP_ARCH_NATIVE) ? "native" : "compat",
  566               nr, line, rule->action, get_action_name(rule->action), arch);
  567 
  568     ret = seccomp_rule_add_exact_array(ctx, rule->action, nr,
  569                        rule->args_num, arg_cmp);
  570     if (ret < 0) {
  571         errno = -ret;
  572         SYSERROR("Failed to add rule for syscall[%d:%s] action[%d:%s] arch[%u]",
  573                  nr, line, rule->action, get_action_name(rule->action), arch);
  574         return lxc_seccomp_rule_err;
  575     }
  576 
  577     return lxc_seccomp_rule_added;
  578 }
  579 
  580 /*
  581  * It is unfortunate, but we can't simply remove those terms since this would
  582  * break way too many users.
  583  */
  584 #define BACKWARDCOMPAT_TERMINOLOGY_DENYLIST "blacklist"
  585 #define BACKWARDCOMPAT_TERMINOLOGY_ALLOWLIST "whitelist"
  586 
  587 static inline bool is_denylist(const char *type)
  588 {
  589     return strnequal(type, "denylist", STRLITERALLEN("denylist")) ||
  590            strnequal(type, BACKWARDCOMPAT_TERMINOLOGY_DENYLIST,
  591              STRLITERALLEN(BACKWARDCOMPAT_TERMINOLOGY_DENYLIST));
  592 }
  593 
  594 static inline bool is_allowlist(const char *type)
  595 {
  596     return strnequal(type, "allowlist", STRLITERALLEN("allowlist")) ||
  597            strnequal(type, BACKWARDCOMPAT_TERMINOLOGY_ALLOWLIST,
  598              STRLITERALLEN(BACKWARDCOMPAT_TERMINOLOGY_ALLOWLIST));
  599 }
  600 
  601 /*
  602  * v2 consists of
  603  * [x86]
  604  * open
  605  * read
  606  * write
  607  * close
  608  * # a comment
  609  * [x86_64]
  610  * open
  611  * read
  612  * write
  613  * close
  614  */
  615 static int parse_config_v2(FILE *f, char *line, size_t *line_bufsz, struct lxc_conf *conf)
  616 {
  617     int ret;
  618     char *p;
  619     enum lxc_hostarch_t cur_rule_arch, native_arch;
  620     bool denylist = false;
  621     uint32_t default_policy_action = -1, default_rule_action = -1;
  622     struct seccomp_v2_rule rule;
  623     struct scmp_ctx_info {
  624         uint32_t architectures[3];
  625         scmp_filter_ctx contexts[3];
  626         bool needs_merge[3];
  627     } ctx;
  628 
  629     if (is_denylist(line))
  630         denylist = true;
  631     else if (!is_allowlist(line))
  632         return log_error(-EINVAL, "Bad seccomp policy style \"%s\"", line);
  633 
  634     p = strchr(line, ' ');
  635     if (p) {
  636         default_policy_action = get_v2_default_action(p + 1);
  637         if (default_policy_action == -2)
  638             return -1;
  639     }
  640 
  641     /* for denylist, allow any syscall which has no rule */
  642     if (denylist) {
  643         if (default_policy_action == -1)
  644             default_policy_action = SCMP_ACT_ALLOW;
  645 
  646         if (default_rule_action == -1)
  647             default_rule_action = SCMP_ACT_KILL;
  648     } else {
  649         if (default_policy_action == -1)
  650             default_policy_action = SCMP_ACT_KILL;
  651 
  652         if (default_rule_action == -1)
  653             default_rule_action = SCMP_ACT_ALLOW;
  654     }
  655 
  656     DEBUG("Host native arch is [%u]", seccomp_arch_native());
  657 
  658     memset(&ctx, 0, sizeof(ctx));
  659     ctx.architectures[0] = SCMP_ARCH_NATIVE;
  660     ctx.architectures[1] = SCMP_ARCH_NATIVE;
  661     ctx.architectures[2] = SCMP_ARCH_NATIVE;
  662     native_arch = get_hostarch();
  663     cur_rule_arch = native_arch;
  664     if (native_arch == lxc_seccomp_arch_amd64) {
  665         cur_rule_arch = lxc_seccomp_arch_all;
  666 
  667         ctx.architectures[0] = SCMP_ARCH_X86;
  668         ctx.contexts[0] = get_new_ctx(lxc_seccomp_arch_i386,
  669                           default_policy_action,
  670                           &ctx.needs_merge[0]);
  671         if (!ctx.contexts[0])
  672             goto bad;
  673 
  674         ctx.architectures[1] = SCMP_ARCH_X32;
  675         ctx.contexts[1] = get_new_ctx(lxc_seccomp_arch_x32,
  676                           default_policy_action,
  677                           &ctx.needs_merge[1]);
  678         if (!ctx.contexts[1])
  679             goto bad;
  680 
  681         ctx.architectures[2] = SCMP_ARCH_X86_64;
  682         ctx.contexts[2] = get_new_ctx(lxc_seccomp_arch_amd64,
  683                           default_policy_action,
  684                           &ctx.needs_merge[2]);
  685         if (!ctx.contexts[2])
  686             goto bad;
  687 #ifdef SCMP_ARCH_PPC
  688     } else if (native_arch == lxc_seccomp_arch_ppc64) {
  689         cur_rule_arch = lxc_seccomp_arch_all;
  690 
  691         ctx.architectures[0] = SCMP_ARCH_PPC;
  692         ctx.contexts[0] = get_new_ctx(lxc_seccomp_arch_ppc,
  693                           default_policy_action,
  694                           &ctx.needs_merge[0]);
  695         if (!ctx.contexts[0])
  696             goto bad;
  697 
  698         ctx.architectures[2] = SCMP_ARCH_PPC64;
  699         ctx.contexts[2] = get_new_ctx(lxc_seccomp_arch_ppc64,
  700                           default_policy_action,
  701                           &ctx.needs_merge[2]);
  702         if (!ctx.contexts[2])
  703             goto bad;
  704 #endif
  705 #ifdef SCMP_ARCH_ARM
  706     } else if (native_arch == lxc_seccomp_arch_arm64) {
  707         cur_rule_arch = lxc_seccomp_arch_all;
  708 
  709         ctx.architectures[0] = SCMP_ARCH_ARM;
  710         ctx.contexts[0] = get_new_ctx(lxc_seccomp_arch_arm,
  711                           default_policy_action,
  712                           &ctx.needs_merge[0]);
  713         if (!ctx.contexts[0])
  714             goto bad;
  715 
  716 #ifdef SCMP_ARCH_AARCH64
  717         ctx.architectures[2] = SCMP_ARCH_AARCH64;
  718         ctx.contexts[2] = get_new_ctx(lxc_seccomp_arch_arm64,
  719                           default_policy_action,
  720                           &ctx.needs_merge[2]);
  721         if (!ctx.contexts[2])
  722             goto bad;
  723 #endif
  724 #endif
  725 #ifdef SCMP_ARCH_MIPS
  726     } else if (native_arch == lxc_seccomp_arch_mips64) {
  727         cur_rule_arch = lxc_seccomp_arch_all;
  728 
  729         ctx.architectures[0] = SCMP_ARCH_MIPS;
  730         ctx.contexts[0] = get_new_ctx(lxc_seccomp_arch_mips,
  731                           default_policy_action,
  732                           &ctx.needs_merge[0]);
  733         if (!ctx.contexts[0])
  734             goto bad;
  735 
  736         ctx.architectures[1] = SCMP_ARCH_MIPS64N32;
  737         ctx.contexts[1] = get_new_ctx(lxc_seccomp_arch_mips64n32,
  738                           default_policy_action,
  739                           &ctx.needs_merge[1]);
  740         if (!ctx.contexts[1])
  741             goto bad;
  742 
  743         ctx.architectures[2] = SCMP_ARCH_MIPS64;
  744         ctx.contexts[2] = get_new_ctx(lxc_seccomp_arch_mips64,
  745                           default_policy_action,
  746                           &ctx.needs_merge[2]);
  747         if (!ctx.contexts[2])
  748             goto bad;
  749     } else if (native_arch == lxc_seccomp_arch_mipsel64) {
  750         cur_rule_arch = lxc_seccomp_arch_all;
  751 
  752         ctx.architectures[0] = SCMP_ARCH_MIPSEL;
  753         ctx.contexts[0] = get_new_ctx(lxc_seccomp_arch_mipsel,
  754                           default_policy_action,
  755                           &ctx.needs_merge[0]);
  756         if (!ctx.contexts[0])
  757             goto bad;
  758 
  759         ctx.architectures[1] = SCMP_ARCH_MIPSEL64N32;
  760         ctx.contexts[1] = get_new_ctx(lxc_seccomp_arch_mipsel64n32,
  761                           default_policy_action,
  762                           &ctx.needs_merge[1]);
  763         if (!ctx.contexts[1])
  764             goto bad;
  765 
  766         ctx.architectures[2] = SCMP_ARCH_MIPSEL64;
  767         ctx.contexts[2] = get_new_ctx(lxc_seccomp_arch_mipsel64,
  768                           default_policy_action,
  769                           &ctx.needs_merge[2]);
  770         if (!ctx.contexts[2])
  771             goto bad;
  772 #endif
  773     }
  774 
  775     if (default_policy_action != SCMP_ACT_KILL) {
  776         ret = seccomp_reset(conf->seccomp.seccomp_ctx, default_policy_action);
  777         if (ret != 0) {
  778             ERROR("Error re-initializing Seccomp");
  779             return -1;
  780         }
  781 
  782         ret = seccomp_attr_set(conf->seccomp.seccomp_ctx, SCMP_FLTATR_CTL_NNP, 0);
  783         if (ret < 0) {
  784             errno = -ret;
  785             SYSERROR("Failed to turn off no-new-privs");
  786             return -1;
  787         }
  788 
  789 #ifdef SCMP_FLTATR_ATL_TSKIP
  790         ret = seccomp_attr_set(conf->seccomp.seccomp_ctx, SCMP_FLTATR_ATL_TSKIP, 1);
  791         if (ret < 0) {
  792             errno = -ret;
  793             SYSWARN("Failed to turn on seccomp nop-skip, continuing");
  794         }
  795 #endif
  796     }
  797 
  798     while (getline(&line, line_bufsz, f) != -1) {
  799         if (line[0] == '#')
  800             continue;
  801 
  802         if (line[0] == '\0')
  803             continue;
  804 
  805         remove_trailing_newlines(line);
  806 
  807         INFO("Processing \"%s\"", line);
  808         if (line[0] == '[') {
  809             /* Read the architecture for next set of rules. */
  810             if (strequal(line, "[x86]") ||
  811                 strequal(line, "[X86]")) {
  812                 if (native_arch != lxc_seccomp_arch_i386 &&
  813                     native_arch != lxc_seccomp_arch_amd64) {
  814                     cur_rule_arch = lxc_seccomp_arch_unknown;
  815                     continue;
  816                 }
  817 
  818                 cur_rule_arch = lxc_seccomp_arch_i386;
  819             } else if (strequal(line, "[x32]") ||
  820                    strequal(line, "[X32]")) {
  821                 if (native_arch != lxc_seccomp_arch_amd64) {
  822                     cur_rule_arch = lxc_seccomp_arch_unknown;
  823                     continue;
  824                 }
  825 
  826                 cur_rule_arch = lxc_seccomp_arch_x32;
  827             } else if (strequal(line, "[X86_64]") ||
  828                    strequal(line, "[x86_64]")) {
  829                 if (native_arch != lxc_seccomp_arch_amd64) {
  830                     cur_rule_arch = lxc_seccomp_arch_unknown;
  831                     continue;
  832                 }
  833 
  834                 cur_rule_arch = lxc_seccomp_arch_amd64;
  835             } else if (strequal(line, "[all]") ||
  836                    strequal(line, "[ALL]")) {
  837                 cur_rule_arch = lxc_seccomp_arch_all;
  838             }
  839 #ifdef SCMP_ARCH_ARM
  840             else if (strequal(line, "[arm]") ||
  841                  strequal(line, "[ARM]")) {
  842                 if (native_arch != lxc_seccomp_arch_arm &&
  843                     native_arch != lxc_seccomp_arch_arm64) {
  844                     cur_rule_arch = lxc_seccomp_arch_unknown;
  845                     continue;
  846                 }
  847 
  848                 cur_rule_arch = lxc_seccomp_arch_arm;
  849             }
  850 #endif
  851 #ifdef SCMP_ARCH_AARCH64
  852             else if (strequal(line, "[arm64]") ||
  853                  strequal(line, "[ARM64]")) {
  854                 if (native_arch != lxc_seccomp_arch_arm64) {
  855                     cur_rule_arch = lxc_seccomp_arch_unknown;
  856                     continue;
  857                 }
  858 
  859                 cur_rule_arch = lxc_seccomp_arch_arm64;
  860             }
  861 #endif
  862 #ifdef SCMP_ARCH_PPC64LE
  863             else if (strequal(line, "[ppc64le]") ||
  864                  strequal(line, "[PPC64LE]")) {
  865                 if (native_arch != lxc_seccomp_arch_ppc64le) {
  866                     cur_rule_arch = lxc_seccomp_arch_unknown;
  867                     continue;
  868                 }
  869 
  870                 cur_rule_arch = lxc_seccomp_arch_ppc64le;
  871             }
  872 #endif
  873 #ifdef SCMP_ARCH_PPC64
  874             else if (strequal(line, "[ppc64]") ||
  875                  strequal(line, "[PPC64]")) {
  876                 if (native_arch != lxc_seccomp_arch_ppc64) {
  877                     cur_rule_arch = lxc_seccomp_arch_unknown;
  878                     continue;
  879                 }
  880 
  881                 cur_rule_arch = lxc_seccomp_arch_ppc64;
  882             }
  883 #endif
  884 #ifdef SCMP_ARCH_PPC
  885             else if (strequal(line, "[ppc]") ||
  886                  strequal(line, "[PPC]")) {
  887                 if (native_arch != lxc_seccomp_arch_ppc &&
  888                     native_arch != lxc_seccomp_arch_ppc64) {
  889                     cur_rule_arch = lxc_seccomp_arch_unknown;
  890                     continue;
  891                 }
  892 
  893                 cur_rule_arch = lxc_seccomp_arch_ppc;
  894             }
  895 #endif
  896 #ifdef SCMP_ARCH_MIPS
  897             else if (strequal(line, "[mips64]") ||
  898                  strequal(line, "[MIPS64]")) {
  899                 if (native_arch != lxc_seccomp_arch_mips64) {
  900                     cur_rule_arch = lxc_seccomp_arch_unknown;
  901                     continue;
  902                 }
  903 
  904                 cur_rule_arch = lxc_seccomp_arch_mips64;
  905             } else if (strequal(line, "[mips64n32]") ||
  906                    strequal(line, "[MIPS64N32]")) {
  907                 if (native_arch != lxc_seccomp_arch_mips64) {
  908                     cur_rule_arch = lxc_seccomp_arch_unknown;
  909                     continue;
  910                 }
  911 
  912                 cur_rule_arch = lxc_seccomp_arch_mips64n32;
  913             } else if (strequal(line, "[mips]") ||
  914                    strequal(line, "[MIPS]")) {
  915                 if (native_arch != lxc_seccomp_arch_mips &&
  916                     native_arch != lxc_seccomp_arch_mips64) {
  917                     cur_rule_arch = lxc_seccomp_arch_unknown;
  918                     continue;
  919                 }
  920 
  921                 cur_rule_arch = lxc_seccomp_arch_mips;
  922             } else if (strequal(line, "[mipsel64]") ||
  923                    strequal(line, "[MIPSEL64]")) {
  924                 if (native_arch != lxc_seccomp_arch_mipsel64) {
  925                     cur_rule_arch = lxc_seccomp_arch_unknown;
  926                     continue;
  927                 }
  928 
  929                 cur_rule_arch = lxc_seccomp_arch_mipsel64;
  930             } else if (strequal(line, "[mipsel64n32]") ||
  931                    strequal(line, "[MIPSEL64N32]")) {
  932                 if (native_arch != lxc_seccomp_arch_mipsel64) {
  933                     cur_rule_arch = lxc_seccomp_arch_unknown;
  934                     continue;
  935                 }
  936 
  937                 cur_rule_arch = lxc_seccomp_arch_mipsel64n32;
  938             } else if (strequal(line, "[mipsel]") ||
  939                    strequal(line, "[MIPSEL]")) {
  940                 if (native_arch != lxc_seccomp_arch_mipsel &&
  941                     native_arch != lxc_seccomp_arch_mipsel64) {
  942                     cur_rule_arch = lxc_seccomp_arch_unknown;
  943                     continue;
  944                 }
  945 
  946                 cur_rule_arch = lxc_seccomp_arch_mipsel;
  947             }
  948 #endif
  949 #ifdef SCMP_ARCH_S390X
  950             else if (strequal(line, "[s390x]") ||
  951                  strequal(line, "[S390X]")) {
  952                 if (native_arch != lxc_seccomp_arch_s390x) {
  953                     cur_rule_arch = lxc_seccomp_arch_unknown;
  954                     continue;
  955                 }
  956 
  957                 cur_rule_arch = lxc_seccomp_arch_s390x;
  958             }
  959 #endif
  960 #ifdef SCMP_ARCH_S390
  961             else if (strequal(line, "[s390]") ||
  962                  strequal(line, "[S390]")) {
  963                 if (native_arch != lxc_seccomp_arch_s390) {
  964                     cur_rule_arch = lxc_seccomp_arch_unknown;
  965                     continue;
  966                 }
  967 
  968                 cur_rule_arch = lxc_seccomp_arch_s390;
  969             }
  970 #endif
  971             else {
  972                 goto bad_arch;
  973             }
  974 
  975             continue;
  976         }
  977 
  978         /* irrelevant arch - i.e. arm on i386 */
  979         if (cur_rule_arch == lxc_seccomp_arch_unknown)
  980             continue;
  981 
  982         memset(&rule, 0, sizeof(rule));
  983         /* read optional action which follows the syscall */
  984         ret = parse_v2_rules(line, default_rule_action, &rule);
  985         if (ret != 0) {
  986             ERROR("Failed to interpret seccomp rule");
  987             goto bad_rule;
  988         }
  989 
  990 #if HAVE_DECL_SECCOMP_NOTIFY_FD
  991         if ((rule.action == SCMP_ACT_NOTIFY) &&
  992             !conf->seccomp.notifier.wants_supervision) {
  993             conf->seccomp.notifier.wants_supervision = true;
  994             TRACE("Set SECCOMP_FILTER_FLAG_NEW_LISTENER attribute");
  995         }
  996 #endif
  997 
  998 
  999         ret = do_resolve_add_rule(SCMP_ARCH_NATIVE, line,
 1000                      conf->seccomp.seccomp_ctx, &rule);
 1001         if (ret == lxc_seccomp_rule_err)
 1002             goto bad_rule;
 1003         if (ret == lxc_seccomp_rule_undefined_syscall)
 1004             continue;
 1005 
 1006         for (int i = 0; i < 3; i++ ) {
 1007             uint32_t arch = ctx.architectures[i];
 1008             if (arch != SCMP_ARCH_NATIVE && arch != seccomp_arch_native()) {
 1009                 if (lxc_seccomp_rule_err == do_resolve_add_rule(arch, line,
 1010                             ctx.contexts[i], &rule))
 1011                     goto bad_rule;
 1012             }
 1013         }
 1014 
 1015     }
 1016 
 1017     INFO("Merging compat seccomp contexts into main context");
 1018     if (ctx.contexts[0]) {
 1019         if (ctx.needs_merge[0]) {
 1020             ret = seccomp_merge(conf->seccomp.seccomp_ctx, ctx.contexts[0]);
 1021             if (ret < 0) {
 1022                 ERROR("Failed to merge first compat seccomp "
 1023                       "context into main context");
 1024                 goto bad;
 1025             }
 1026 
 1027             TRACE("Merged first compat seccomp context into main context");
 1028         } else {
 1029             seccomp_release(ctx.contexts[0]);
 1030             ctx.contexts[0] = NULL;
 1031         }
 1032     }
 1033 
 1034     if (ctx.contexts[1]) {
 1035         if (ctx.needs_merge[1]) {
 1036             ret = seccomp_merge(conf->seccomp.seccomp_ctx, ctx.contexts[1]);
 1037             if (ret < 0) {
 1038                 ERROR("Failed to merge first compat seccomp "
 1039                       "context into main context");
 1040                 goto bad;
 1041             }
 1042 
 1043             TRACE("Merged second compat seccomp context into main context");
 1044         } else {
 1045             seccomp_release(ctx.contexts[1]);
 1046             ctx.contexts[1] = NULL;
 1047         }
 1048     }
 1049 
 1050     if (ctx.contexts[2]) {
 1051         if (ctx.needs_merge[2]) {
 1052             ret = seccomp_merge(conf->seccomp.seccomp_ctx, ctx.contexts[2]);
 1053             if (ret < 0) {
 1054                 ERROR("Failed to merge third compat seccomp "
 1055                       "context into main context");
 1056                 goto bad;
 1057             }
 1058 
 1059             TRACE("Merged third compat seccomp context into main context");
 1060         } else {
 1061             seccomp_release(ctx.contexts[2]);
 1062             ctx.contexts[2] = NULL;
 1063         }
 1064     }
 1065 
 1066     free(line);
 1067     return 0;
 1068 
 1069 bad_arch:
 1070     ERROR("Unsupported architecture \"%s\"", line);
 1071 
 1072 bad_rule:
 1073 bad:
 1074     if (ctx.contexts[0])
 1075         seccomp_release(ctx.contexts[0]);
 1076 
 1077     if (ctx.contexts[1])
 1078         seccomp_release(ctx.contexts[1]);
 1079 
 1080     if (ctx.contexts[2])
 1081         seccomp_release(ctx.contexts[2]);
 1082 
 1083     free(line);
 1084 
 1085     return -1;
 1086 }
 1087 #else /* HAVE_DECL_SECCOMP_SYSCALL_RESOLVE_NAME_ARCH */
 1088 static int parse_config_v2(FILE *f, char *line, struct lxc_conf *conf)
 1089 {
 1090     return -1;
 1091 }
 1092 #endif /* HAVE_DECL_SECCOMP_SYSCALL_RESOLVE_NAME_ARCH */
 1093 
 1094 /*
 1095  * The first line of the config file has a policy language version
 1096  * the second line has some directives
 1097  * then comes policy subject to the directives
 1098  * right now version must be '1' or '2'
 1099  * the directives must include 'allowlist'(version == 1 or 2) or 'denylist'
 1100  * (version == 2) and can include 'debug' (though debug is not yet supported).
 1101  */
 1102 static int parse_config(FILE *f, struct lxc_conf *conf)
 1103 {
 1104     char *line = NULL;
 1105     size_t line_bufsz = 0;
 1106     int ret, version;
 1107 
 1108     ret = fscanf(f, "%d\n", &version);
 1109     if (ret != 1 || (version != 1 && version != 2)) {
 1110         ERROR("Invalid version");
 1111         return -1;
 1112     }
 1113 
 1114     if (getline(&line, &line_bufsz, f) == -1) {
 1115         ERROR("Invalid config file");
 1116         goto bad_line;
 1117     }
 1118 
 1119     if (version == 1 && !strstr(line, "allowlist")) {
 1120         ERROR("Only allowlist policy is supported");
 1121         goto bad_line;
 1122     }
 1123 
 1124     if (strstr(line, "debug")) {
 1125         ERROR("Debug not yet implemented");
 1126         goto bad_line;
 1127     }
 1128 
 1129     if (version == 1)
 1130         return parse_config_v1(f, line, &line_bufsz, conf);
 1131 
 1132     return parse_config_v2(f, line, &line_bufsz, conf);
 1133 
 1134 bad_line:
 1135     free(line);
 1136     return -1;
 1137 }
 1138 
 1139 /*
 1140  * use_seccomp: return true if we should try and apply a seccomp policy
 1141  * if defined for the container.
 1142  * This will return false if
 1143  *   1. seccomp is not enabled in the kernel
 1144  *   2. a seccomp policy is already enabled for this task
 1145  */
 1146 static bool use_seccomp(const struct lxc_conf *conf)
 1147 {
 1148     __do_free char *line = NULL;
 1149     __do_fclose FILE *f = NULL;
 1150     int ret, v;
 1151     size_t line_bufsz = 0;
 1152     bool already_enabled = false, found = false;
 1153 
 1154     if (conf->seccomp.allow_nesting > 0)
 1155         return true;
 1156 
 1157     f = fopen("/proc/self/status", "re");
 1158     if (!f)
 1159         return true;
 1160 
 1161     while (getline(&line, &line_bufsz, f) != -1) {
 1162         if (strnequal(line, "Seccomp:", 8)) {
 1163             found = true;
 1164 
 1165             ret = sscanf(line + 8, "%d", &v);
 1166             if (ret == 1 && v != 0)
 1167                 already_enabled = true;
 1168 
 1169             break;
 1170         }
 1171     }
 1172 
 1173     if (!found) {
 1174         INFO("Seccomp is not enabled in the kernel");
 1175         return false;
 1176     }
 1177 
 1178     if (already_enabled) {
 1179         INFO("Already seccomp-confined, not loading new policy");
 1180         return false;
 1181     }
 1182 
 1183     return true;
 1184 }
 1185 
 1186 int lxc_read_seccomp_config(struct lxc_conf *conf)
 1187 {
 1188     __do_fclose FILE *f = NULL;
 1189     int ret;
 1190 
 1191     if (!conf->seccomp.seccomp)
 1192         return 0;
 1193 
 1194     if (!use_seccomp(conf))
 1195         return 0;
 1196 
 1197 #if HAVE_SCMP_FILTER_CTX
 1198     /* XXX for debug, pass in SCMP_ACT_TRAP */
 1199     conf->seccomp.seccomp_ctx = seccomp_init(SCMP_ACT_KILL);
 1200     ret = !conf->seccomp.seccomp_ctx;
 1201 #else
 1202     ret = seccomp_init(SCMP_ACT_KILL) < 0;
 1203 #endif
 1204     if (ret) {
 1205         ERROR("Failed initializing seccomp");
 1206         return -1;
 1207     }
 1208 
 1209 /* turn off no-new-privs. We don't want it in lxc, and it breaks
 1210  * with apparmor */
 1211 #if HAVE_SCMP_FILTER_CTX
 1212     ret = seccomp_attr_set(conf->seccomp.seccomp_ctx, SCMP_FLTATR_CTL_NNP, 0);
 1213 #else
 1214     ret = seccomp_attr_set(SCMP_FLTATR_CTL_NNP, 0);
 1215 #endif
 1216     if (ret < 0) {
 1217         errno = -ret;
 1218         SYSERROR("Failed to turn off no-new-privs");
 1219         return -1;
 1220     }
 1221 
 1222 #ifdef SCMP_FLTATR_ATL_TSKIP
 1223     ret = seccomp_attr_set(conf->seccomp.seccomp_ctx, SCMP_FLTATR_ATL_TSKIP, 1);
 1224     if (ret < 0) {
 1225         errno = -ret;
 1226         SYSWARN("Failed to turn on seccomp nop-skip, continuing");
 1227     }
 1228 #endif
 1229 
 1230     f = fopen(conf->seccomp.seccomp, "re");
 1231     if (!f) {
 1232         SYSERROR("Failed to open seccomp policy file %s", conf->seccomp.seccomp);
 1233         return -1;
 1234     }
 1235 
 1236     return parse_config(f, conf);
 1237 }
 1238 
 1239 int lxc_seccomp_load(struct lxc_conf *conf)
 1240 {
 1241     int ret;
 1242 
 1243     if (!conf->seccomp.seccomp)
 1244         return 0;
 1245 
 1246     if (!use_seccomp(conf))
 1247         return 0;
 1248 
 1249 #if HAVE_SCMP_FILTER_CTX
 1250     ret = seccomp_load(conf->seccomp.seccomp_ctx);
 1251 #else
 1252     ret = seccomp_load();
 1253 #endif
 1254     if (ret < 0) {
 1255         errno = -ret;
 1256         SYSERROR("Error loading the seccomp policy");
 1257         return -1;
 1258     }
 1259 
 1260 /* After load seccomp filter into the kernel successfully, export the current seccomp
 1261  * filter to log file */
 1262 #if HAVE_SCMP_FILTER_CTX
 1263     if (lxc_log_trace()) {
 1264         int fd_log;
 1265 
 1266         fd_log = lxc_log_get_fd();
 1267         if (fd_log >= 0) {
 1268             ret = seccomp_export_pfc(conf->seccomp.seccomp_ctx, fd_log);
 1269             if (ret < 0) {
 1270                 errno = -ret;
 1271                 SYSWARN("Failed to export seccomp filter to log file");
 1272             }
 1273         }
 1274     }
 1275 #endif
 1276 
 1277 #if HAVE_DECL_SECCOMP_NOTIFY_FD
 1278     if (conf->seccomp.notifier.wants_supervision) {
 1279         ret = seccomp_notify_fd(conf->seccomp.seccomp_ctx);
 1280         if (ret < 0) {
 1281             errno = -ret;
 1282             return -1;
 1283         }
 1284 
 1285         if (fd_make_nonblocking(ret))
 1286             return log_error_errno(-1, errno, "Failed to make seccomp listener fd non-blocking");;
 1287 
 1288         conf->seccomp.notifier.notify_fd = ret;
 1289         TRACE("Retrieved new seccomp listener fd %d", ret);
 1290     }
 1291 #endif
 1292 
 1293     return 0;
 1294 }
 1295 
 1296 void lxc_seccomp_free(struct lxc_seccomp *seccomp)
 1297 {
 1298     free_disarm(seccomp->seccomp);
 1299 
 1300 #if HAVE_SCMP_FILTER_CTX
 1301     if (seccomp->seccomp_ctx) {
 1302         seccomp_release(seccomp->seccomp_ctx);
 1303         seccomp->seccomp_ctx = NULL;
 1304     }
 1305 #endif
 1306 
 1307 #if HAVE_DECL_SECCOMP_NOTIFY_FD
 1308     close_prot_errno_disarm(seccomp->notifier.notify_fd);
 1309     close_prot_errno_disarm(seccomp->notifier.proxy_fd);
 1310     seccomp_notify_free(seccomp->notifier.req_buf, seccomp->notifier.rsp_buf);
 1311     seccomp->notifier.req_buf = NULL;
 1312     seccomp->notifier.rsp_buf = NULL;
 1313     free_disarm(seccomp->notifier.cookie);
 1314 #endif
 1315 }
 1316 
 1317 #if HAVE_DECL_SECCOMP_NOTIFY_FD
 1318 static int seccomp_notify_reconnect(struct lxc_handler *handler)
 1319 {
 1320     __do_close int notify_fd = -EBADF;
 1321 
 1322     close_prot_errno_disarm(handler->conf->seccomp.notifier.proxy_fd);
 1323 
 1324     notify_fd = lxc_unix_connect_type(
 1325         &handler->conf->seccomp.notifier.proxy_addr, SOCK_SEQPACKET);
 1326     if (notify_fd < 0) {
 1327         SYSERROR("Failed to reconnect to seccomp proxy");
 1328         return -1;
 1329     }
 1330 
 1331     /* 30 second timeout */
 1332     if (lxc_socket_set_timeout(notify_fd, 30, 30)) {
 1333         SYSERROR("Failed to set socket timeout");
 1334         return -1;
 1335     }
 1336     handler->conf->seccomp.notifier.proxy_fd = move_fd(notify_fd);
 1337     return 0;
 1338 }
 1339 #endif
 1340 
 1341 #if HAVE_DECL_SECCOMP_NOTIFY_FD
 1342 static void seccomp_notify_default_answer(int fd, struct seccomp_notif *req,
 1343                       struct seccomp_notif_resp *resp,
 1344                       struct lxc_handler *handler)
 1345 {
 1346     resp->id = req->id;
 1347     resp->error = -ENOSYS;
 1348     resp->val = 0;
 1349     resp->flags = 0;
 1350 
 1351     if (seccomp_notify_respond(fd, resp))
 1352         SYSERROR("Failed to send default message to seccomp notification with id(%llu)",
 1353              (long long unsigned int)resp->id);
 1354     else
 1355         TRACE("Sent default response for seccomp notification with id(%llu)",
 1356               (long long unsigned int)resp->id);
 1357     memset(resp, 0, handler->conf->seccomp.notifier.sizes.seccomp_notif_resp);
 1358 }
 1359 #endif
 1360 
 1361 int seccomp_notify_handler(int fd, uint32_t events, void *data,
 1362                struct lxc_epoll_descr *descr)
 1363 {
 1364 
 1365 #if HAVE_DECL_SECCOMP_NOTIFY_FD
 1366     __do_close int fd_pid = -EBADF;
 1367     __do_close int fd_mem = -EBADF;
 1368     int ret;
 1369     ssize_t bytes;
 1370     int send_fd_list[3];
 1371     struct iovec iov[4];
 1372     size_t iov_len, msg_base_size, msg_full_size;
 1373     char mem_path[6 /* /proc/ */
 1374               + INTTYPE_TO_STRLEN(int64_t)
 1375               + 3 /* mem */
 1376               + 1 /* \0 */];
 1377     bool reconnected = false;
 1378     struct lxc_handler *hdlr = data;
 1379     struct lxc_conf *conf = hdlr->conf;
 1380     struct seccomp_notif *req = conf->seccomp.notifier.req_buf;
 1381     struct seccomp_notif_resp *resp = conf->seccomp.notifier.rsp_buf;
 1382     int listener_proxy_fd = conf->seccomp.notifier.proxy_fd;
 1383     struct seccomp_notify_proxy_msg msg = {0};
 1384     char *cookie = conf->seccomp.notifier.cookie;
 1385     __u64 req_id;
 1386 
 1387     if (events & EPOLLHUP) {
 1388         lxc_mainloop_del_handler(descr, fd);
 1389         close(fd);
 1390         return log_trace(0, "Removing seccomp notifier fd %d", fd);
 1391     }
 1392 
 1393     memset(req, 0, conf->seccomp.notifier.sizes.seccomp_notif);
 1394     ret = seccomp_notify_receive(fd, req);
 1395     if (ret) {
 1396         if (errno == ENOENT)
 1397             TRACE("Intercepted system call aborted");
 1398         else
 1399             SYSERROR("Failed to read seccomp notification");
 1400         goto out;
 1401     }
 1402 
 1403     if (listener_proxy_fd < 0) {
 1404         ret = -1;
 1405         /* Same condition as for the initial setup_proxy() */
 1406         if (conf->seccomp.notifier.wants_supervision &&
 1407             conf->seccomp.notifier.proxy_addr.sun_path[1] != '\0') {
 1408             ret = seccomp_notify_reconnect(hdlr);
 1409         }
 1410         if (ret) {
 1411             ERROR("No seccomp proxy registered");
 1412             seccomp_notify_default_answer(fd, req, resp, hdlr);
 1413             goto out;
 1414         }
 1415         listener_proxy_fd = conf->seccomp.notifier.proxy_fd;
 1416     }
 1417 
 1418     /* remember the ID in case we receive garbage from the proxy */
 1419     resp->id = req_id = req->id;
 1420     TRACE("Received seccomp notification with id(%llu)", (long long unsigned int)req_id);
 1421 
 1422     ret = strnprintf(mem_path, sizeof(mem_path), "/proc/%d", req->pid);
 1423     if (ret < 0) {
 1424         seccomp_notify_default_answer(fd, req, resp, hdlr);
 1425         SYSERROR("Failed to create path to process's proc directory");
 1426         goto out;
 1427     }
 1428 
 1429     fd_pid = open(mem_path, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
 1430     if (fd_pid < 0) {
 1431         seccomp_notify_default_answer(fd, req, resp, hdlr);
 1432         SYSERROR("Failed to open process pidfd for seccomp notify request");
 1433         goto out;
 1434     }
 1435 
 1436     ret = strnprintf(mem_path, sizeof(mem_path), "/proc/%d/mem", req->pid);
 1437     if (ret < 0) {
 1438         seccomp_notify_default_answer(fd, req, resp, hdlr);
 1439         SYSERROR("Failed to create path to process's virtual memory");
 1440         goto out;
 1441     }
 1442 
 1443     fd_mem = open(mem_path, O_RDWR | O_CLOEXEC);
 1444     if (fd_mem < 0) {
 1445         seccomp_notify_default_answer(fd, req, resp, hdlr);
 1446         SYSERROR("Failed to open process memory for seccomp notify request");
 1447         goto out;
 1448     }
 1449 
 1450     /*
 1451      * Make sure that the fd for /proc/<pid>/mem we just opened still
 1452      * refers to the correct process's memory.
 1453      */
 1454     ret = seccomp_notify_id_valid(fd, req->id);
 1455     if (ret < 0) {
 1456         seccomp_notify_default_answer(fd, req, resp, hdlr);
 1457         SYSERROR("Invalid seccomp notify request id(%llu)", (long long unsigned int)req->id);
 1458         goto out;
 1459     }
 1460 
 1461     msg.monitor_pid = hdlr->monitor_pid;
 1462     msg.init_pid = hdlr->pid;
 1463     memcpy(&msg.sizes, &conf->seccomp.notifier.sizes, sizeof(msg.sizes));
 1464 
 1465     msg_base_size = 0;
 1466     iov[0].iov_base = &msg;
 1467     msg_base_size += (iov[0].iov_len = sizeof(msg));
 1468     iov[1].iov_base = req;
 1469     msg_base_size += (iov[1].iov_len = msg.sizes.seccomp_notif);
 1470     iov[2].iov_base = resp;
 1471     msg_base_size += (iov[2].iov_len = msg.sizes.seccomp_notif_resp);
 1472     msg_full_size = msg_base_size;
 1473 
 1474     if (cookie) {
 1475         size_t len = strlen(cookie);
 1476 
 1477         msg.cookie_len = (uint64_t)len;
 1478 
 1479         iov[3].iov_base = cookie;
 1480         msg_full_size += (iov[3].iov_len = len);
 1481 
 1482         iov_len = 4;
 1483     } else {
 1484         iov_len = 3;
 1485     }
 1486 
 1487     send_fd_list[0] = fd_pid;
 1488     send_fd_list[1] = fd_mem;
 1489     send_fd_list[2] = fd;
 1490 
 1491 retry:
 1492     bytes = lxc_abstract_unix_send_fds_iov(listener_proxy_fd, send_fd_list, 3, iov, iov_len);
 1493     if (bytes != (ssize_t)msg_full_size) {
 1494         SYSERROR("Failed to forward message to seccomp proxy");
 1495         if (!reconnected) {
 1496             ret = seccomp_notify_reconnect(hdlr);
 1497             if (ret == 0) {
 1498                 reconnected = true;
 1499                 goto retry;
 1500             }
 1501         }
 1502 
 1503         seccomp_notify_default_answer(fd, req, resp, hdlr);
 1504         goto out;
 1505     }
 1506 
 1507     close_prot_errno_disarm(fd_mem);
 1508 
 1509     if (msg.__reserved != 0) {
 1510         ERROR("Proxy filled reserved data in response");
 1511         seccomp_notify_default_answer(fd, req, resp, hdlr);
 1512         goto out;
 1513     }
 1514 
 1515     if (resp->id != req_id) {
 1516         ERROR("Proxy returned response with invalid id(%llu) != id(%llu)",
 1517               (long long unsigned int)resp->id, (long long unsigned int)req_id);
 1518         resp->id = req_id;
 1519         seccomp_notify_default_answer(fd, req, resp, hdlr);
 1520         goto out;
 1521     }
 1522 
 1523     bytes = lxc_recvmsg_nointr_iov(listener_proxy_fd, iov, iov_len, MSG_TRUNC);
 1524     if (bytes != (ssize_t)msg_base_size) {
 1525         SYSERROR("Failed to receive message from seccomp proxy");
 1526         seccomp_notify_default_answer(fd, req, resp, hdlr);
 1527         goto out;
 1528     }
 1529 
 1530     if (resp->id != req_id) {
 1531         ERROR("Proxy returned response with invalid id(%llu) != id(%llu)",
 1532               (long long unsigned int)resp->id, (long long unsigned int)req_id);
 1533         resp->id = req_id;
 1534     }
 1535 
 1536     ret = seccomp_notify_respond(fd, resp);
 1537     if (ret)
 1538         SYSERROR("Failed to send seccomp notification");
 1539     else
 1540         TRACE("Sent response for seccomp notification with id(%llu)",
 1541               (long long unsigned int)resp->id);
 1542     memset(resp, 0, conf->seccomp.notifier.sizes.seccomp_notif_resp);
 1543 
 1544 out:
 1545 #endif
 1546     return LXC_MAINLOOP_CONTINUE;
 1547 }
 1548 
 1549 void seccomp_conf_init(struct lxc_conf *conf)
 1550 {
 1551     conf->seccomp.seccomp = NULL;
 1552 #if HAVE_SCMP_FILTER_CTX
 1553     conf->seccomp.allow_nesting = 0;
 1554     memset(&conf->seccomp.seccomp_ctx, 0, sizeof(conf->seccomp.seccomp_ctx));
 1555 #endif /* HAVE_SCMP_FILTER_CTX */
 1556 #if HAVE_DECL_SECCOMP_NOTIFY_FD
 1557     conf->seccomp.notifier.wants_supervision = false;
 1558     conf->seccomp.notifier.notify_fd = -EBADF;
 1559     conf->seccomp.notifier.proxy_fd = -EBADF;
 1560     memset(&conf->seccomp.notifier.proxy_addr, 0,
 1561            sizeof(conf->seccomp.notifier.proxy_addr));
 1562     conf->seccomp.notifier.req_buf = NULL;
 1563     conf->seccomp.notifier.rsp_buf = NULL;
 1564     conf->seccomp.notifier.cookie = NULL;
 1565 #endif
 1566 }
 1567 
 1568 int lxc_seccomp_setup_proxy(struct lxc_seccomp *seccomp,
 1569                 struct lxc_epoll_descr *descr,
 1570                 struct lxc_handler *handler)
 1571 {
 1572 #if HAVE_DECL_SECCOMP_NOTIFY_FD
 1573     if (seccomp->notifier.wants_supervision &&
 1574         seccomp->notifier.proxy_addr.sun_path[1] != '\0') {
 1575         __do_close int notify_fd = -EBADF;
 1576         int ret;
 1577 
 1578         notify_fd = lxc_unix_connect_type(&seccomp->notifier.proxy_addr,
 1579                          SOCK_SEQPACKET);
 1580         if (notify_fd < 0) {
 1581             SYSERROR("Failed to connect to seccomp proxy");
 1582             return -1;
 1583         }
 1584 
 1585         /* 30 second timeout */
 1586         ret = lxc_socket_set_timeout(notify_fd, 30, 30);
 1587         if (ret) {
 1588             SYSERROR("Failed to set timeouts for seccomp proxy");
 1589             return -1;
 1590         }
 1591 
 1592         ret = __seccomp(SECCOMP_GET_NOTIF_SIZES, 0,
 1593                 &seccomp->notifier.sizes);
 1594         if (ret) {
 1595             SYSERROR("Failed to query seccomp notify struct sizes");
 1596             return -1;
 1597         }
 1598 
 1599         ret = seccomp_notify_alloc(&seccomp->notifier.req_buf,
 1600                       &seccomp->notifier.rsp_buf);
 1601         if (ret) {
 1602             ERROR("Failed to allocate seccomp notify request and response buffers");
 1603             errno = ret;
 1604             return -1;
 1605         }
 1606 
 1607         ret = lxc_mainloop_add_handler(descr,
 1608                            seccomp->notifier.notify_fd,
 1609                            seccomp_notify_handler, handler);
 1610         if (ret < 0) {
 1611             ERROR("Failed to add seccomp notify handler for %d to mainloop",
 1612                   notify_fd);
 1613             return -1;
 1614         }
 1615 
 1616         seccomp->notifier.proxy_fd = move_fd(notify_fd);
 1617     }
 1618 #endif
 1619     return 0;
 1620 }
 1621 
 1622 int lxc_seccomp_send_notifier_fd(struct lxc_seccomp *seccomp, int socket_fd)
 1623 {
 1624 #if HAVE_DECL_SECCOMP_NOTIFY_FD
 1625     if (seccomp->notifier.wants_supervision) {
 1626         if (lxc_abstract_unix_send_fds(socket_fd,
 1627                            &seccomp->notifier.notify_fd, 1,
 1628                            NULL, 0) < 0)
 1629             return -1;
 1630         close_prot_errno_disarm(seccomp->notifier.notify_fd);
 1631     }
 1632 #endif
 1633     return 0;
 1634 }
 1635 
 1636 int lxc_seccomp_recv_notifier_fd(struct lxc_seccomp *seccomp, int socket_fd)
 1637 {
 1638 #if HAVE_DECL_SECCOMP_NOTIFY_FD
 1639     if (seccomp->notifier.wants_supervision) {
 1640         int ret;
 1641 
 1642         ret = lxc_abstract_unix_recv_one_fd(socket_fd,
 1643                             &seccomp->notifier.notify_fd,
 1644                             NULL, 0);
 1645         if (ret < 0)
 1646             return -1;
 1647     }
 1648 #endif
 1649     return 0;
 1650 }
 1651 
 1652 int lxc_seccomp_add_notifier(const char *name, const char *lxcpath,
 1653                  struct lxc_seccomp *seccomp)
 1654 {
 1655 #if HAVE_DECL_SECCOMP_NOTIFY_FD
 1656     if (seccomp->notifier.wants_supervision) {
 1657         int ret;
 1658 
 1659         ret = lxc_cmd_seccomp_notify_add_listener(name, lxcpath,
 1660                               seccomp->notifier.notify_fd,
 1661                               -1, 0);
 1662         close_prot_errno_disarm(seccomp->notifier.notify_fd);
 1663         if (ret < 0)
 1664             return -1;
 1665     }
 1666 #endif
 1667     return 0;
 1668 }