"Fossies" - the Fresh Open Source Software Archive

Member "openbgpd-6.5p0/src/bgpd/rde_rib.c" (12 Mar 2019, 37007 Bytes) of package /linux/privat/openbgpd-6.5p0.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "rde_rib.c" see the Fossies "Dox" file reference documentation.

/*  $OpenBSD: rde_rib.c,v 1.190 2019/03/07 07:42:36 claudio Exp $ */

/*
 * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/queue.h>

#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <siphash.h>
#include <time.h>

#include "bgpd.h"
#include "rde.h"
#include "log.h"

/*
 * BGP RIB -- Routing Information Base
 *
 * The RIB is built with one aspect in mind: speed -- update speed, to be
 * precise. Therefore one thing must be absolutely avoided: long table
 * walks. This is achieved by heavily linking the different parts together.
 */
u_int16_t rib_size;
struct rib_desc *ribs;

struct rib_entry *rib_add(struct rib *, struct bgpd_addr *, int);
static inline int rib_compare(const struct rib_entry *,
            const struct rib_entry *);
void rib_remove(struct rib_entry *);
int rib_empty(struct rib_entry *);
static void rib_dump_abort(u_int16_t);

RB_PROTOTYPE(rib_tree, rib_entry, rib_e, rib_compare);
RB_GENERATE(rib_tree, rib_entry, rib_e, rib_compare);

struct rib_context {
    LIST_ENTRY(rib_context)      entry;
    struct rib_entry        *ctx_re;
    u_int16_t            ctx_rib_id;
    void        (*ctx_upcall)(struct rib_entry *, void *);
    void        (*ctx_done)(void *, u_int8_t);
    int     (*ctx_throttle)(void *);
    void                *ctx_arg;
    unsigned int             ctx_count;
    u_int8_t             ctx_aid;
};
LIST_HEAD(, rib_context) rib_dumps = LIST_HEAD_INITIALIZER(rib_dumps);

static int   prefix_add(struct bgpd_addr *, int, struct rib *,
            struct rde_peer *, struct rde_aspath *,
            struct filterstate *, u_int8_t);
static int   prefix_move(struct prefix *, struct rde_peer *,
            struct rde_aspath *, struct filterstate *, u_int8_t);

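/*
 * A rib_entry gets locked while a suspended dump (rib_dump_r) keeps it
 * as its resume point.  rib_remove() refuses to free a locked entry;
 * rib_restart() unlocks it when the dump resumes and disposes of it if
 * it became empty in the meantime.
 */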
static inline struct rib_entry *
re_lock(struct rib_entry *re)
{
    if (re->lock != 0)
        log_warnx("%s: entry already locked", __func__);
    re->lock = 1;
    return re;
}

static inline struct rib_entry *
re_unlock(struct rib_entry *re)
{
    if (re->lock == 0)
        log_warnx("%s: entry already unlocked", __func__);
    re->lock = 0;
    return re;
}

static inline int
re_is_locked(struct rib_entry *re)
{
    return (re->lock != 0);
}

static inline struct rib_tree *
rib_tree(struct rib *rib)
{
    return (&rib->tree);
}

static inline int
rib_compare(const struct rib_entry *a, const struct rib_entry *b)
{
    /* need to handle NULL entries because of EoR marker */
    if (a == NULL && b == NULL)
        return (0);
    else if (b == NULL)
        return (1);
    else if (a == NULL)
        return (-1);
    return (pt_prefix_cmp(a->prefix, b->prefix));
}

/* RIB specific functions */
struct rib *
rib_new(char *name, u_int rtableid, u_int16_t flags)
{
    struct rib_desc *xribs;
    u_int16_t   id;

    for (id = 0; id < rib_size; id++) {
        if (!rib_valid(id))
            break;
    }

    if (id >= rib_size) {
        if ((xribs = reallocarray(ribs, id + 1,
            sizeof(struct rib_desc))) == NULL) {
            /* XXX this is not clever */
            fatal(NULL);
        }
        ribs = xribs;
        rib_size = id + 1;
    }

    bzero(&ribs[id], sizeof(struct rib_desc));
    strlcpy(ribs[id].name, name, sizeof(ribs[id].name));
    RB_INIT(rib_tree(&ribs[id].rib));
    ribs[id].state = RECONF_REINIT;
    ribs[id].rib.id = id;
    ribs[id].rib.flags = flags;
    ribs[id].rib.rtableid = rtableid;

    ribs[id].in_rules = calloc(1, sizeof(struct filter_head));
    if (ribs[id].in_rules == NULL)
        fatal(NULL);
    TAILQ_INIT(ribs[id].in_rules);

    log_debug("%s: %s -> %u", __func__, name, id);
    return (&ribs[id].rib);
}

struct rib *
rib_byid(u_int16_t rid)
{
    if (rib_valid(rid))
        return &ribs[rid].rib;
    return NULL;
}

u_int16_t
rib_find(char *name)
{
    u_int16_t id;

    /* no name returns the first Loc-RIB */
    if (name == NULL || *name == '\0')
        return RIB_LOC_START;

    for (id = 0; id < rib_size; id++) {
        if (!strcmp(ribs[id].name, name))
            return id;
    }

    return RIB_NOTFOUND;
}

struct rib_desc *
rib_desc(struct rib *rib)
{
    return (&ribs[rib->id]);
}

void
rib_free(struct rib *rib)
{
    struct rib_desc *rd;
    struct rib_entry *re, *xre;
    struct prefix *p, *np;

    rib_dump_abort(rib->id);

    for (re = RB_MIN(rib_tree, rib_tree(rib)); re != NULL; re = xre) {
        xre = RB_NEXT(rib_tree, rib_tree(rib), re);

        /*
         * Removing the prefixes is tricky: the last prefix will
         * remove the rib_entry as well, and because prefix_destroy()
         * does an empty check, the usual for loop cannot be used
         * here.
         */
        while ((p = LIST_FIRST(&re->prefix_h))) {
            struct rde_aspath *asp = prefix_aspath(p);
            np = LIST_NEXT(p, rib_l);
            if (asp && asp->pftableid) {
                struct bgpd_addr addr;

                pt_getaddr(p->re->prefix, &addr);
                /* Commit is done in peer_down() */
                rde_send_pftable(asp->pftableid, &addr,
                    p->re->prefix->prefixlen, 1);
            }
            prefix_destroy(p);
            if (np == NULL)
                break;
        }
    }
    if (rib->id <= RIB_LOC_START)
        return; /* never remove the default ribs */
    rd = &ribs[rib->id];
    filterlist_free(rd->in_rules_tmp);
    filterlist_free(rd->in_rules);
    bzero(rd, sizeof(struct rib_desc));
}

void
rib_shutdown(void)
{
    u_int16_t id;

    for (id = 0; id < rib_size; id++) {
        if (!rib_valid(id))
            continue;
        if (!RB_EMPTY(rib_tree(&ribs[id].rib)))
            log_warnx("%s: rib %s is not empty", __func__,
                ribs[id].name);
        rib_free(&ribs[id].rib);
    }
    for (id = 0; id <= RIB_LOC_START; id++) {
        struct rib_desc *rd = &ribs[id];
        filterlist_free(rd->in_rules_tmp);
        filterlist_free(rd->in_rules);
        bzero(rd, sizeof(struct rib_desc));
    }
    free(ribs);
}

struct rib_entry *
rib_get(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
    struct rib_entry xre, *re;
    struct pt_entry *pte;

    pte = pt_fill(prefix, prefixlen);
    bzero(&xre, sizeof(xre));
    xre.prefix = pte;

    re = RB_FIND(rib_tree, rib_tree(rib), &xre);
    if (re && re->rib_id != rib->id)
        fatalx("%s: Unexpected RIB %u != %u.", __func__,
            re->rib_id, rib->id);
    return re;
}

struct rib_entry *
rib_lookup(struct rib *rib, struct bgpd_addr *addr)
{
    struct rib_entry *re;
    int      i;

    switch (addr->aid) {
    case AID_INET:
    case AID_VPN_IPv4:
        for (i = 32; i >= 0; i--) {
            re = rib_get(rib, addr, i);
            if (re != NULL)
                return (re);
        }
        break;
    case AID_INET6:
    case AID_VPN_IPv6:
        for (i = 128; i >= 0; i--) {
            re = rib_get(rib, addr, i);
            if (re != NULL)
                return (re);
        }
        break;
    default:
        fatalx("rib_lookup: unknown af");
    }
    return (NULL);
}


struct rib_entry *
rib_add(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
    struct pt_entry *pte;
    struct rib_entry *re;

    pte = pt_get(prefix, prefixlen);
    if (pte == NULL)
        pte = pt_add(prefix, prefixlen);

    if ((re = calloc(1, sizeof(*re))) == NULL)
        fatal("rib_add");

    LIST_INIT(&re->prefix_h);
    re->prefix = pte;
    re->rib_id = rib->id;

    if (RB_INSERT(rib_tree, rib_tree(rib), re) != NULL) {
        log_warnx("rib_add: insert failed");
        free(re);
        return (NULL);
    }

    pt_ref(pte);

    rdemem.rib_cnt++;

    return (re);
}

void
rib_remove(struct rib_entry *re)
{
    if (!rib_empty(re))
        fatalx("rib_remove: entry not empty");

    if (re_is_locked(re))
        /* entry is locked, don't free it. */
        return;

    pt_unref(re->prefix);
    if (pt_empty(re->prefix))
        pt_remove(re->prefix);

    if (RB_REMOVE(rib_tree, rib_tree(re_rib(re)), re) == NULL)
        log_warnx("rib_remove: remove failed.");

    free(re);
    rdemem.rib_cnt--;
}

int
rib_empty(struct rib_entry *re)
{
    return LIST_EMPTY(&re->prefix_h);
}

static struct rib_entry *
rib_restart(struct rib_context *ctx)
{
    struct rib_entry *re;

    re = ctx->ctx_re;
    re_unlock(re);

    /* find the first non-empty element */
    while (re && rib_empty(re))
        re = RB_NEXT(rib_tree, unused, re);

    /* free the previously locked rib element if empty */
    if (rib_empty(ctx->ctx_re))
        rib_remove(ctx->ctx_re);
    ctx->ctx_re = NULL;
    return (re);
}

static void
rib_dump_r(struct rib_context *ctx)
{
    struct rib_entry    *re, *next;
    struct rib      *rib;
    unsigned int         i;

    rib = rib_byid(ctx->ctx_rib_id);
    if (rib == NULL)
        fatalx("%s: rib id %u gone", __func__, ctx->ctx_rib_id);

    if (ctx->ctx_re == NULL)
        re = RB_MIN(rib_tree, rib_tree(rib));
    else
        re = rib_restart(ctx);

    for (i = 0; re != NULL; re = next) {
        next = RB_NEXT(rib_tree, unused, re);
        if (re->rib_id != ctx->ctx_rib_id)
            fatalx("%s: Unexpected RIB %u != %u.", __func__,
                re->rib_id, ctx->ctx_rib_id);
        if (ctx->ctx_aid != AID_UNSPEC &&
            ctx->ctx_aid != re->prefix->aid)
            continue;
        if (ctx->ctx_count && i++ >= ctx->ctx_count &&
            !re_is_locked(re)) {
            /* store and lock last element */
            ctx->ctx_re = re;
            re_lock(re);
            return;
        }
        ctx->ctx_upcall(re, ctx->ctx_arg);
    }

    if (ctx->ctx_done)
        ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
    LIST_REMOVE(ctx, entry);
    free(ctx);
}

int
rib_dump_pending(void)
{
    struct rib_context *ctx;

    /* return true if at least one context is not throttled */
    LIST_FOREACH(ctx, &rib_dumps, entry) {
        if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
            continue;
        return 1;
    }
    return 0;
}

void
rib_dump_runner(void)
{
    struct rib_context *ctx, *next;

    LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
        if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
            continue;
        rib_dump_r(ctx);
    }
}

static void
rib_dump_abort(u_int16_t id)
{
    struct rib_context *ctx, *next;

    LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
        if (id != ctx->ctx_rib_id)
            continue;
        if (ctx->ctx_done)
            ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
        LIST_REMOVE(ctx, entry);
        free(ctx);
    }
}

int
rib_dump_new(u_int16_t id, u_int8_t aid, unsigned int count, void *arg,
    void (*upcall)(struct rib_entry *, void *), void (*done)(void *, u_int8_t),
    int (*throttle)(void *))
{
    struct rib_context *ctx;

    if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
        return -1;
    ctx->ctx_rib_id = id;
    ctx->ctx_aid = aid;
    ctx->ctx_count = count;
    ctx->ctx_arg = arg;
    ctx->ctx_upcall = upcall;
    ctx->ctx_done = done;
    ctx->ctx_throttle = throttle;

    LIST_INSERT_HEAD(&rib_dumps, ctx, entry);

    /* requested a sync traversal */
    if (count == 0)
        rib_dump_r(ctx);

    return 0;
}
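
#if 0
/*
 * Hedged usage sketch (not part of the original file): how a caller
 * might start a chunked dump with rib_dump_new().  The handler names
 * are hypothetical; the RDE's real callers live elsewhere.
 */
static void
example_upcall(struct rib_entry *re, void *arg)
{
    /* examine one rib_entry per invocation */
}

static void
example_done(void *arg, u_int8_t aid)
{
    /* the dump finished or was aborted */
}

static void
example_start_dump(void)
{
    /* visit at most 100 entries per rib_dump_runner() pass */
    if (rib_dump_new(RIB_LOC_START, AID_UNSPEC, 100, NULL,
        example_upcall, example_done, NULL) == -1)
        log_warnx("rib_dump_new failed");
}
#endif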

void
rib_dump_terminate(u_int16_t id, void *arg,
    void (*upcall)(struct rib_entry *, void *))
{
    struct rib_context *ctx, *next;

    LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
        if (id != ctx->ctx_rib_id || ctx->ctx_arg != arg ||
            ctx->ctx_upcall != upcall)
            continue;
        if (ctx->ctx_done)
            ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
        LIST_REMOVE(ctx, entry);
        free(ctx);
    }
}

/* path specific functions */

static struct rde_aspath *path_lookup(struct rde_aspath *);
static u_int64_t path_hash(struct rde_aspath *);
static void path_link(struct rde_aspath *);
static void path_unlink(struct rde_aspath *);

struct path_table {
    struct aspath_head  *path_hashtbl;
    u_int64_t        path_hashmask;
} pathtable;

SIPHASH_KEY pathtablekey;

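/*
 * path_init() rounds the table size up to a power of two, so masking
 * with path_hashmask always selects a valid bucket.
 */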
#define PATH_HASH(x)    &pathtable.path_hashtbl[x & pathtable.path_hashmask]


static inline int
path_empty(struct rde_aspath *asp)
{
    return (asp == NULL || asp->refcnt <= 0);
}

static inline void
path_ref(struct rde_aspath *asp)
{
    if ((asp->flags & F_ATTR_LINKED) == 0)
        fatalx("%s: unlinked object", __func__);
    asp->refcnt++;
    rdemem.path_refs++;
}

static inline void
path_unref(struct rde_aspath *asp)
{
    if ((asp->flags & F_ATTR_LINKED) == 0)
        fatalx("%s: unlinked object", __func__);
    asp->refcnt--;
    rdemem.path_refs--;
}

void
path_init(u_int32_t hashsize)
{
    u_int32_t   hs, i;

    for (hs = 1; hs < hashsize; hs <<= 1)
        ;
    pathtable.path_hashtbl = calloc(hs, sizeof(*pathtable.path_hashtbl));
    if (pathtable.path_hashtbl == NULL)
        fatal("path_init");

    for (i = 0; i < hs; i++)
        LIST_INIT(&pathtable.path_hashtbl[i]);

    pathtable.path_hashmask = hs - 1;
    arc4random_buf(&pathtablekey, sizeof(pathtablekey));
}

void
path_shutdown(void)
{
    u_int32_t   i;

    for (i = 0; i <= pathtable.path_hashmask; i++)
        if (!LIST_EMPTY(&pathtable.path_hashtbl[i]))
            log_warnx("path_free: free non-free table");

    free(pathtable.path_hashtbl);
}

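/*
 * Collect the per-bucket chain lengths of the path hash table; sum and
 * sumq (sum of squares) let the caller derive mean and variance of the
 * chain lengths.
 */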
void
path_hash_stats(struct rde_hashstats *hs)
{
    struct rde_aspath   *a;
    u_int32_t       i;
    int64_t         n;

    memset(hs, 0, sizeof(*hs));
    strlcpy(hs->name, "path hash", sizeof(hs->name));
    hs->min = LLONG_MAX;
    hs->num = pathtable.path_hashmask + 1;

    for (i = 0; i <= pathtable.path_hashmask; i++) {
        n = 0;
        LIST_FOREACH(a, &pathtable.path_hashtbl[i], path_l)
            n++;
        if (n < hs->min)
            hs->min = n;
        if (n > hs->max)
            hs->max = n;
        hs->sum += n;
        hs->sumq += n * n;
    }
}

/*
 * Update a prefix belonging to a possibly new aspath.
 * Return 1 if the prefix was newly added, 0 if it was just changed, or
 * 2 if no change happened at all.
 */
int
path_update(struct rib *rib, struct rde_peer *peer, struct filterstate *state,
    struct bgpd_addr *prefix, int prefixlen, u_int8_t vstate)
{
    struct rde_aspath   *asp, *nasp = &state->aspath;
    struct prefix       *p;

    if (nasp->pftableid) {
        rde_send_pftable(nasp->pftableid, prefix, prefixlen, 0);
        rde_send_pftable_commit();
    }

    /*
     * First try to find a prefix in the specified RIB.
     */
    if ((p = prefix_get(rib, peer, prefix, prefixlen)) != NULL) {
        if (path_compare(nasp, prefix_aspath(p)) == 0 &&
            prefix_nexthop(p) == state->nexthop &&
            prefix_nhflags(p) == state->nhflags) {
            /* no change, update last change */
            p->lastchange = time(NULL);
            p->validation_state = vstate;
            return (2);
        }
        if (p->flags) {
            struct prefix_tree *prefix_head;
            /* prefix is a pending update */
            prefix_head = p->flags & PREFIX_FLAG_UPDATE ?
                &peer->updates[prefix->aid] :
                &peer->withdraws[prefix->aid];
            RB_REMOVE(prefix_tree, prefix_head, p);
            p->flags = 0;
        }
    }

    /*
     * Either the prefix does not exist or the path changed.
     * In both cases lookup the new aspath to make sure it is not
     * already in the RIB.
     */
    if ((asp = path_lookup(nasp)) == NULL) {
        /* Path not available, create and link a new one. */
        asp = path_copy(path_get(), nasp);
        path_link(asp);
    }

    /* If the prefix was found, move it; otherwise add it to the aspath. */
    if (p != NULL)
        return (prefix_move(p, peer, asp, state, vstate));
    else
        return (prefix_add(prefix, prefixlen, rib, peer, asp,
            state, vstate));
}
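
#if 0
/*
 * Hedged usage sketch (not part of the original file): the shape of a
 * path_update() call.  The wrapper and its variables are hypothetical;
 * struct filterstate carries the aspath, nexthop and nexthop flags
 * produced by the filters.
 */
static void
example_path_update(struct rib *rib, struct rde_peer *peer,
    struct filterstate *state, struct bgpd_addr *prefix, int prefixlen,
    u_int8_t vstate)
{
    switch (path_update(rib, peer, state, prefix, prefixlen, vstate)) {
    case 1:     /* prefix was newly added */
        break;
    case 0:     /* existing prefix changed */
        break;
    case 2:     /* no change at all */
        break;
    }
}
#endif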

int
path_compare(struct rde_aspath *a, struct rde_aspath *b)
{
    int      r;

    if (a == NULL && b == NULL)
        return (0);
    else if (b == NULL)
        return (1);
    else if (a == NULL)
        return (-1);
    if ((a->flags & ~F_ATTR_LINKED) > (b->flags & ~F_ATTR_LINKED))
        return (1);
    if ((a->flags & ~F_ATTR_LINKED) < (b->flags & ~F_ATTR_LINKED))
        return (-1);
    if (a->origin > b->origin)
        return (1);
    if (a->origin < b->origin)
        return (-1);
    if (a->med > b->med)
        return (1);
    if (a->med < b->med)
        return (-1);
    if (a->lpref > b->lpref)
        return (1);
    if (a->lpref < b->lpref)
        return (-1);
    if (a->weight > b->weight)
        return (1);
    if (a->weight < b->weight)
        return (-1);
    if (a->rtlabelid > b->rtlabelid)
        return (1);
    if (a->rtlabelid < b->rtlabelid)
        return (-1);
    if (a->pftableid > b->pftableid)
        return (1);
    if (a->pftableid < b->pftableid)
        return (-1);

    r = aspath_compare(a->aspath, b->aspath);
    if (r > 0)
        return (1);
    if (r < 0)
        return (-1);

    return (attr_compare(a, b));
}

static u_int64_t
path_hash(struct rde_aspath *asp)
{
    SIPHASH_CTX ctx;
    u_int64_t   hash;

    SipHash24_Init(&ctx, &pathtablekey);
    SipHash24_Update(&ctx, &asp->origin, sizeof(asp->origin));
    SipHash24_Update(&ctx, &asp->med, sizeof(asp->med));
    SipHash24_Update(&ctx, &asp->lpref, sizeof(asp->lpref));
    SipHash24_Update(&ctx, &asp->weight, sizeof(asp->weight));
    SipHash24_Update(&ctx, &asp->rtlabelid, sizeof(asp->rtlabelid));
    SipHash24_Update(&ctx, &asp->pftableid, sizeof(asp->pftableid));

    if (asp->aspath)
        SipHash24_Update(&ctx, asp->aspath->data, asp->aspath->len);

    hash = attr_hash(asp);
    SipHash24_Update(&ctx, &hash, sizeof(hash));

    return (SipHash24_End(&ctx));
}

static struct rde_aspath *
path_lookup(struct rde_aspath *aspath)
{
    struct aspath_head  *head;
    struct rde_aspath   *asp;
    u_int64_t        hash;

    hash = path_hash(aspath);
    head = PATH_HASH(hash);

    LIST_FOREACH(asp, head, path_l) {
        if (asp->hash == hash && path_compare(aspath, asp) == 0)
            return (asp);
    }
    return (NULL);
}

/*
 * Link this aspath into the global hash table.
 * The asp must have been allocated with path_get().
 */
static void
path_link(struct rde_aspath *asp)
{
    struct aspath_head  *head;

    asp->hash = path_hash(asp);
    head = PATH_HASH(asp->hash);

    LIST_INSERT_HEAD(head, asp, path_l);
    asp->flags |= F_ATTR_LINKED;
}

/*
 * This function can only be called when all prefixes have been removed
 * first. Normally this happens directly out of the prefix removal
 * functions.
 */
static void
path_unlink(struct rde_aspath *asp)
{
    if (asp == NULL)
        return;

    /* make sure no reference is held for this rde_aspath */
    if (!path_empty(asp))
        fatalx("%s: still has prefixes", __func__);

    LIST_REMOVE(asp, path_l);
    asp->flags &= ~F_ATTR_LINKED;

    path_put(asp);
}

/*
 * Copy asp to a new UNLINKED aspath.
 * Either path_get() or path_prep() must have been called on dst before.
 */
struct rde_aspath *
path_copy(struct rde_aspath *dst, const struct rde_aspath *src)
{
    dst->aspath = src->aspath;
    if (dst->aspath != NULL) {
        dst->aspath->refcnt++;
        rdemem.aspath_refs++;
    }
    dst->hash = 0;
    dst->med = src->med;
    dst->lpref = src->lpref;
    dst->weight = src->weight;
    dst->origin = src->origin;
    dst->rtlabelid = rtlabel_ref(src->rtlabelid);
    dst->pftableid = pftable_ref(src->pftableid);

    dst->flags = src->flags & ~F_ATTR_LINKED;
    dst->refcnt = 0;    /* not linked so no refcnt */

    attr_copy(dst, src);

    return (dst);
}

/* initialize or prepare an aspath for use */
struct rde_aspath *
path_prep(struct rde_aspath *asp)
{
    memset(asp, 0, sizeof(*asp));
    asp->origin = ORIGIN_INCOMPLETE;
    asp->lpref = DEFAULT_LPREF;

    return (asp);
}

/* alloc and initialize new entry. May not fail. */
struct rde_aspath *
path_get(void)
{
    struct rde_aspath *asp;

    asp = malloc(sizeof(*asp));
    if (asp == NULL)
        fatal("path_get");
    rdemem.path_cnt++;

    return (path_prep(asp));
}

/* clean up an asp after use (frees all references to sub-objects) */
void
path_clean(struct rde_aspath *asp)
{
    if (asp->flags & F_ATTR_LINKED)
        fatalx("%s: linked object", __func__);

    rtlabel_unref(asp->rtlabelid);
    pftable_unref(asp->pftableid);
    aspath_put(asp->aspath);
    attr_freeall(asp);
}

/* free an unlinked element */
void
path_put(struct rde_aspath *asp)
{
    if (asp == NULL)
        return;

    path_clean(asp);

    rdemem.path_cnt--;
    free(asp);
}

/* prefix specific functions */

static void      prefix_link(struct prefix *, struct rib_entry *,
                 struct rde_peer *, struct rde_aspath *,
                 struct filterstate *, u_int8_t);
static void      prefix_unlink(struct prefix *);
static struct prefix    *prefix_alloc(void);
static void      prefix_free(struct prefix *);

/* RB tree comparison function */
static inline int
prefix_cmp(struct prefix *a, struct prefix *b)
{
    if (a->eor != b->eor)
        return a->eor - b->eor;
    if (a->aspath != b->aspath)
        return (a->aspath > b->aspath ? 1 : -1);
    if (a->nexthop != b->nexthop)
        return (a->nexthop > b->nexthop ? 1 : -1);
    if (a->nhflags != b->nhflags)
        return (a->nhflags > b->nhflags ? 1 : -1);
    return rib_compare(a->re, b->re);
}
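
/*
 * Note: aspath and nexthop objects are interned (shared between
 * prefixes), so comparing their pointers above yields an arbitrary but
 * stable ordering, which is all the RB tree needs.
 */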

RB_GENERATE(prefix_tree, prefix, entry, prefix_cmp)

/*
 * Search for the specified prefix of a peer. Returns NULL if not found.
 */
struct prefix *
prefix_get(struct rib *rib, struct rde_peer *peer, struct bgpd_addr *prefix,
    int prefixlen)
{
    struct rib_entry    *re;

    re = rib_get(rib, prefix, prefixlen);
    if (re == NULL)
        return (NULL);
    return (prefix_bypeer(re, peer));
}

/*
 * Adds or updates a prefix.
 */
static int
prefix_add(struct bgpd_addr *prefix, int prefixlen, struct rib *rib,
    struct rde_peer *peer, struct rde_aspath *asp, struct filterstate *state,
    u_int8_t vstate)
{
    struct prefix       *p;
    struct rib_entry    *re;

    re = rib_get(rib, prefix, prefixlen);
    if (re == NULL)
        re = rib_add(rib, prefix, prefixlen);

    p = prefix_bypeer(re, peer);
    if (p == NULL) {
        p = prefix_alloc();
        prefix_link(p, re, peer, asp, state, vstate);
        return (1);
    } else {
        if (prefix_aspath(p) != asp ||
            prefix_nexthop(p) != state->nexthop ||
            prefix_nhflags(p) != state->nhflags) {
            /* prefix metadata changed, therefore move it */
            return (prefix_move(p, peer, asp, state, vstate));
        }
        p->lastchange = time(NULL);
        p->validation_state = vstate;
        return (0);
    }
}

/*
 * Move the prefix to the specified aspath, removing the old asp if needed.
 */
static int
prefix_move(struct prefix *p, struct rde_peer *peer,
    struct rde_aspath *asp, struct filterstate *state, u_int8_t vstate)
{
    struct prefix       *np;
    struct rde_aspath   *oasp;

    if (peer != prefix_peer(p))
        fatalx("prefix_move: cross peer move");

    np = prefix_alloc();
    np->aspath = asp;
    np->nexthop = nexthop_ref(state->nexthop);
    nexthop_link(np);
    np->peer = peer;
    np->re = p->re;
    np->lastchange = time(NULL);
    np->nhflags = state->nhflags;
    np->validation_state = vstate;

    /* add reference to new as path */
    path_ref(asp);

    /*
     * no need to update the peer prefix count because we are only moving
     * the prefix without changing the peer.
     */

    /*
     * First kick the old prefix node out of the prefix list,
     * afterwards run the route decision for the new prefix node.
     * Because of this only one update is generated if the prefix
     * was active.
     * This is safe because we create a new prefix and so the change
     * is noticed by prefix_evaluate().
     */
    LIST_REMOVE(p, rib_l);
    prefix_evaluate(np, np->re);

    /* remove old prefix node */
    /* as before, the peer count needs no update because of the move */
    oasp = p->aspath;
    if (oasp)
        path_unref(oasp);

    /* destroy all references to other objects and free the old prefix */
    p->aspath = NULL;
    p->peer = NULL;
    p->re = NULL;
    nexthop_unlink(p);
    nexthop_put(p->nexthop);
    p->nexthop = NULL;
    prefix_free(p);

    /* destroy old path if empty */
    if (path_empty(oasp))
        path_unlink(oasp);

    return (0);
}

/*
 * Removes a prefix from all lists. If the parent objects -- path or
 * pt_entry -- become empty, remove them too.
 */
int
prefix_remove(struct rib *rib, struct rde_peer *peer, struct bgpd_addr *prefix,
    int prefixlen)
{
    struct prefix       *p;
    struct rde_aspath   *asp;

    p = prefix_get(rib, peer, prefix, prefixlen);
    if (p == NULL)      /* Got a dummy withdraw request. */
        return (0);

    asp = prefix_aspath(p);
    if (asp && asp->pftableid) {
        /* only prefixes in the local RIB were pushed into pf */
        rde_send_pftable(asp->pftableid, prefix, prefixlen, 1);
        rde_send_pftable_commit();
    }

    prefix_destroy(p);

    return (1);
}

/*
 * Insert an End-of-RIB marker into the update queue.
 */
void
prefix_add_eor(struct rde_peer *peer, u_int8_t aid)
{
    struct prefix *p;

    p = prefix_alloc();
    p->eor = 1;
    p->flags = PREFIX_FLAG_UPDATE;
    if (RB_INSERT(prefix_tree, &peer->updates[aid], p) != NULL)
        /* no need to add if an EoR marker is already present */
        prefix_free(p);
}

/*
 * Put a prefix from the Adj-RIB-Out onto the update queue.
 */
void
prefix_update(struct rib *rib, struct rde_peer *peer,
    struct bgpd_addr *prefix, int prefixlen)
{
    struct prefix *p;

    p = prefix_get(rib, peer, prefix, prefixlen);
    if (p == NULL)      /* Got a dummy withdraw request. */
        return;

    if (p->flags != 0)
        fatalx("%s: bad flags %x", __func__, p->flags);
    p->flags = PREFIX_FLAG_UPDATE;
    if (RB_INSERT(prefix_tree, &peer->updates[prefix->aid], p) != NULL)
        fatalx("%s: RB tree invariant violated", __func__);
}

/*
 * Withdraw a prefix from the Adj-RIB-Out; this unlinks the aspath but
 * leaves the prefix in the RIB, linked to the peer's withdraw list.
 */
int
prefix_withdraw(struct rib *rib, struct rde_peer *peer,
    struct bgpd_addr *prefix, int prefixlen)
{
    struct prefix       *p;
    struct rde_aspath   *asp;

    p = prefix_get(rib, peer, prefix, prefixlen);
    if (p == NULL)      /* Got a dummy withdraw request. */
        return (0);

    /* unlink from aspath ... */
    asp = p->aspath;
    if (asp != NULL) {
        path_unref(asp);
        p->aspath = NULL;
        if (path_empty(asp))
            path_unlink(asp);
    }

    /* ... and nexthop but keep the re link */
    nexthop_unlink(p);
    nexthop_put(p->nexthop);
    p->nexthop = NULL;
    p->nhflags = 0;
    /* re link still exists */

    if (p->flags) {
        struct prefix_tree *prefix_head;
        /* p is a pending update or withdraw, remove it first */
        prefix_head = p->flags & PREFIX_FLAG_UPDATE ?
            &peer->updates[prefix->aid] :
            &peer->withdraws[prefix->aid];
        RB_REMOVE(prefix_tree, prefix_head, p);
        p->flags = 0;
    }
    p->flags = PREFIX_FLAG_WITHDRAW;
    if (RB_INSERT(prefix_tree, &peer->withdraws[prefix->aid], p) != NULL)
        fatalx("%s: RB tree invariant violated", __func__);
    return (1);
}


/* dump a prefix into the specified buffer */
int
prefix_write(u_char *buf, int len, struct bgpd_addr *prefix, u_int8_t plen,
    int withdraw)
{
    int totlen;

    switch (prefix->aid) {
    case AID_INET:
    case AID_INET6:
        totlen = PREFIX_SIZE(plen);

        if (totlen > len)
            return (-1);
        *buf++ = plen;
        memcpy(buf, &prefix->ba, totlen - 1);
        return (totlen);
    case AID_VPN_IPv4:
        totlen = PREFIX_SIZE(plen) + sizeof(prefix->vpn4.rd);
        plen += sizeof(prefix->vpn4.rd) * 8;
        if (withdraw) {
            /* withdraws have one compat label as a placeholder */
            totlen += 3;
            plen += 3 * 8;
        } else {
            totlen += prefix->vpn4.labellen;
            plen += prefix->vpn4.labellen * 8;
        }

        if (totlen > len)
            return (-1);
        *buf++ = plen;
        if (withdraw) {
            /* magic compatibility label as per RFC 8277 */
            *buf++ = 0x80;
            *buf++ = 0x0;
            *buf++ = 0x0;
        } else  {
            memcpy(buf, &prefix->vpn4.labelstack,
                prefix->vpn4.labellen);
            buf += prefix->vpn4.labellen;
        }
        memcpy(buf, &prefix->vpn4.rd, sizeof(prefix->vpn4.rd));
        buf += sizeof(prefix->vpn4.rd);
        memcpy(buf, &prefix->vpn4.addr, PREFIX_SIZE(plen) - 1);
        return (totlen);
    case AID_VPN_IPv6:
        totlen = PREFIX_SIZE(plen) + sizeof(prefix->vpn6.rd);
        plen += sizeof(prefix->vpn6.rd) * 8;
        if (withdraw) {
            /* withdraws have one compat label as a placeholder */
            totlen += 3;
            plen += 3 * 8;
        } else {
            totlen += prefix->vpn6.labellen;
            plen += prefix->vpn6.labellen * 8;
        }
        if (totlen > len)
            return (-1);
        *buf++ = plen;
        if (withdraw) {
            /* magic compatibility label as per RFC 8277 */
            *buf++ = 0x80;
            *buf++ = 0x0;
            *buf++ = 0x0;
        } else  {
            memcpy(buf, &prefix->vpn6.labelstack,
                prefix->vpn6.labellen);
            buf += prefix->vpn6.labellen;
        }
        memcpy(buf, &prefix->vpn6.rd, sizeof(prefix->vpn6.rd));
        buf += sizeof(prefix->vpn6.rd);
        memcpy(buf, &prefix->vpn6.addr, PREFIX_SIZE(plen) - 1);
        return (totlen);
    default:
        return (-1);
    }
}
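
/*
 * The encodings above follow the usual NLRI layout: a one-byte prefix
 * length (which for the VPN AFs also counts the label stack and route
 * distinguisher bits), then the label stack (or the RFC 8277
 * compatibility label 0x800000 on withdraw), the 8-byte route
 * distinguisher, and finally the prefix bytes themselves.
 */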

int
prefix_writebuf(struct ibuf *buf, struct bgpd_addr *prefix, u_int8_t plen)
{
    int  totlen;
    void    *bptr;

    switch (prefix->aid) {
    case AID_INET:
    case AID_INET6:
        totlen = PREFIX_SIZE(plen);
        break;
    case AID_VPN_IPv4:
        totlen = PREFIX_SIZE(plen) + sizeof(prefix->vpn4.rd) +
            prefix->vpn4.labellen;
        break;
    case AID_VPN_IPv6:
        totlen = PREFIX_SIZE(plen) + sizeof(prefix->vpn6.rd) +
            prefix->vpn6.labellen;
        break;
    default:
        return (-1);
    }

    if ((bptr = ibuf_reserve(buf, totlen)) == NULL)
        return (-1);
    if (prefix_write(bptr, totlen, prefix, plen, 0) == -1)
        return (-1);
    return (0);
}

/*
 * Searches the prefix list of the specified rib_entry for a prefix entry
 * belonging to the given peer. Returns NULL if no match is found.
 */
struct prefix *
prefix_bypeer(struct rib_entry *re, struct rde_peer *peer)
{
    struct prefix   *p;

    LIST_FOREACH(p, &re->prefix_h, rib_l)
        if (prefix_peer(p) == peer)
            return (p);
    return (NULL);
}

void
prefix_updateall(struct prefix *p, enum nexthop_state state,
    enum nexthop_state oldstate)
{
    /* Skip non local-RIBs or RIBs that are flagged as noeval. */
    if (re_rib(p->re)->flags & F_RIB_NOEVALUATE)
        return;

    if (oldstate == state && state == NEXTHOP_REACH) {
        /*
         * The state of the nexthop did not change. The only
         * thing that may have changed is the true_nexthop
         * or other internal info. This will not change
         * the routing decision so shortcut here.
         */
        if ((re_rib(p->re)->flags & F_RIB_NOFIB) == 0 &&
            p == p->re->active)
            rde_send_kroute(re_rib(p->re), p, NULL);
        return;
    }

    /* redo the route decision */
    LIST_REMOVE(p, rib_l);
    /*
     * If the prefix is the active one, remove it first; this has
     * to be done because we cannot detect when the active prefix
     * changes its state. In this case we know that this is a
     * withdrawal and so the second prefix_evaluate() will generate
     * no update because the nexthop is unreachable or ineligible.
     */
    if (p == p->re->active)
        prefix_evaluate(NULL, p->re);
    prefix_evaluate(p, p->re);
}

/* kill a prefix. */
void
prefix_destroy(struct prefix *p)
{
    struct rde_aspath   *asp;

    asp = prefix_aspath(p);
    prefix_unlink(p);
    prefix_free(p);

    if (path_empty(asp))
        path_unlink(asp);
}

/*
 * Link a prefix into the different parent objects.
 */
static void
prefix_link(struct prefix *p, struct rib_entry *re, struct rde_peer *peer,
    struct rde_aspath *asp, struct filterstate *state, u_int8_t vstate)
{
    path_ref(asp);

    p->aspath = asp;
    p->peer = peer;
    p->nexthop = nexthop_ref(state->nexthop);
    nexthop_link(p);
    p->re = re;
    p->lastchange = time(NULL);
    p->nhflags = state->nhflags;
    p->validation_state = vstate;

    /* make route decision */
    prefix_evaluate(p, re);
}

/*
 * Unlink a prefix from the different parent objects.
 */
static void
prefix_unlink(struct prefix *p)
{
    struct rib_entry    *re = p->re;

    if (p->eor) /* nothing to unlink for EoR markers */
        return;

    /* make route decision */
    LIST_REMOVE(p, rib_l);
    prefix_evaluate(NULL, re);

    if (p->aspath)
        path_unref(p->aspath);

    if (rib_empty(re))
        rib_remove(re);

    /* destroy all references to other objects */
    nexthop_unlink(p);
    nexthop_put(p->nexthop);
    p->nexthop = NULL;
    p->aspath = NULL;
    p->peer = NULL;
    p->re = NULL;

    /*
     * It's the caller's duty to do accounting and remove empty aspath
     * structures. Also freeing the unlinked prefix is the caller's duty.
     */
}

/* alloc and bzero new entry. May not fail. */
static struct prefix *
prefix_alloc(void)
{
    struct prefix *p;

    p = calloc(1, sizeof(*p));
    if (p == NULL)
        fatal("prefix_alloc");
    rdemem.prefix_cnt++;
    return p;
}

/* free an unlinked entry */
static void
prefix_free(struct prefix *p)
{
    rdemem.prefix_cnt--;
    free(p);
}

/*
 * nexthop functions
 */
struct nexthop_head *nexthop_hash(struct bgpd_addr *);
struct nexthop      *nexthop_lookup(struct bgpd_addr *);

/*
 * In BGP there exist two nexthops: the exit nexthop, which was announced
 * via BGP, and the true nexthop, which is used in the FIB -- the
 * forwarding information base, a.k.a. the kernel routing table. When
 * sending updates it is even more confusing. In IBGP we pass the
 * unmodified exit nexthop to the neighbors while in EBGP normally the
 * address of the router is sent. The exit nexthop may be passed to the
 * external neighbor if the neighbor and the exit nexthop reside in the
 * same subnet -- directly connected.
 */
struct nexthop_table {
    LIST_HEAD(nexthop_head, nexthop)    *nexthop_hashtbl;
    u_int32_t                nexthop_hashmask;
} nexthoptable;

SIPHASH_KEY nexthoptablekey;

void
nexthop_init(u_int32_t hashsize)
{
    u_int32_t    hs, i;

    for (hs = 1; hs < hashsize; hs <<= 1)
        ;
    nexthoptable.nexthop_hashtbl = calloc(hs, sizeof(struct nexthop_head));
    if (nexthoptable.nexthop_hashtbl == NULL)
        fatal("nexthop_init");

    for (i = 0; i < hs; i++)
        LIST_INIT(&nexthoptable.nexthop_hashtbl[i]);
    arc4random_buf(&nexthoptablekey, sizeof(nexthoptablekey));

    nexthoptable.nexthop_hashmask = hs - 1;
}

void
nexthop_shutdown(void)
{
    u_int32_t        i;
    struct nexthop      *nh, *nnh;

    for (i = 0; i <= nexthoptable.nexthop_hashmask; i++) {
        for (nh = LIST_FIRST(&nexthoptable.nexthop_hashtbl[i]);
            nh != NULL; nh = nnh) {
            nnh = LIST_NEXT(nh, nexthop_l);
            nh->state = NEXTHOP_UNREACH;
            (void)nexthop_put(nh);
        }
        if (!LIST_EMPTY(&nexthoptable.nexthop_hashtbl[i])) {
            nh = LIST_FIRST(&nexthoptable.nexthop_hashtbl[i]);
            log_warnx("nexthop_shutdown: non-free table, "
                "nexthop %s refcnt %d",
                log_addr(&nh->exit_nexthop), nh->refcnt);
        }
    }

    free(nexthoptable.nexthop_hashtbl);
}

void
nexthop_update(struct kroute_nexthop *msg)
{
    struct nexthop      *nh;
    struct prefix       *p;
    enum nexthop_state   oldstate;

    nh = nexthop_lookup(&msg->nexthop);
    if (nh == NULL) {
        log_warnx("nexthop_update: non-existent nexthop %s",
            log_addr(&msg->nexthop));
        return;
    }

    oldstate = nh->state;

    if (msg->valid)
        nh->state = NEXTHOP_REACH;
    else
        nh->state = NEXTHOP_UNREACH;

    if (msg->connected) {
        nh->flags |= NEXTHOP_CONNECTED;
        memcpy(&nh->true_nexthop, &nh->exit_nexthop,
            sizeof(nh->true_nexthop));
    } else
        memcpy(&nh->true_nexthop, &msg->gateway,
            sizeof(nh->true_nexthop));

    memcpy(&nh->nexthop_net, &msg->net,
        sizeof(nh->nexthop_net));
    nh->nexthop_netlen = msg->netlen;

    if (oldstate == NEXTHOP_LOOKUP)
        /* drop the reference which was held during the lookup */
        if (nexthop_put(nh))
            return;

    if (rde_noevaluate())
        /*
         * if the decision process is turned off there is no need
         * for the aspath list walk.
         */
        return;

    LIST_FOREACH(p, &nh->prefix_h, nexthop_l) {
        prefix_updateall(p, nh->state, oldstate);
    }
}

void
nexthop_modify(struct nexthop *setnh, enum action_types type, u_int8_t aid,
    struct nexthop **nexthop, u_int8_t *flags)
{
    switch (type) {
    case ACTION_SET_NEXTHOP_REJECT:
        *flags = NEXTHOP_REJECT;
        break;
    case ACTION_SET_NEXTHOP_BLACKHOLE:
        *flags = NEXTHOP_BLACKHOLE;
        break;
    case ACTION_SET_NEXTHOP_NOMODIFY:
        *flags = NEXTHOP_NOMODIFY;
        break;
    case ACTION_SET_NEXTHOP_SELF:
        *flags = NEXTHOP_SELF;
        break;
    case ACTION_SET_NEXTHOP:
        /*
         * it is possible that a prefix matches but has the wrong
         * address family for the set nexthop. In this case ignore it.
         */
        if (aid != setnh->exit_nexthop.aid)
            break;
        nexthop_put(*nexthop);
        *nexthop = nexthop_ref(setnh);
        *flags = 0;
        break;
    default:
        break;
    }
}

void
nexthop_link(struct prefix *p)
{
    if (p->nexthop == NULL)
        return;

    LIST_INSERT_HEAD(&p->nexthop->prefix_h, p, nexthop_l);
}

void
nexthop_unlink(struct prefix *p)
{
    if (p->nexthop == NULL)
        return;

    LIST_REMOVE(p, nexthop_l);
}

struct nexthop *
nexthop_get(struct bgpd_addr *nexthop)
{
    struct nexthop  *nh;

    nh = nexthop_lookup(nexthop);
    if (nh == NULL) {
        nh = calloc(1, sizeof(*nh));
        if (nh == NULL)
            fatal("nexthop_alloc");
        rdemem.nexthop_cnt++;

        LIST_INIT(&nh->prefix_h);
        nh->state = NEXTHOP_LOOKUP;
        nexthop_ref(nh);    /* take reference for lookup */
        nh->exit_nexthop = *nexthop;
        LIST_INSERT_HEAD(nexthop_hash(nexthop), nh,
            nexthop_l);

        rde_send_nexthop(&nh->exit_nexthop, 1);
    }

    return nexthop_ref(nh);
}

struct nexthop *
nexthop_ref(struct nexthop *nexthop)
{
    if (nexthop)
        nexthop->refcnt++;
    return (nexthop);
}

int
nexthop_put(struct nexthop *nh)
{
    if (nh == NULL)
        return (0);

    if (--nh->refcnt > 0)
        return (0);

    /* sanity check */
    if (!LIST_EMPTY(&nh->prefix_h) || nh->state == NEXTHOP_LOOKUP)
        fatalx("nexthop_put: refcnt error");

    LIST_REMOVE(nh, nexthop_l);
    rde_send_nexthop(&nh->exit_nexthop, 0);

    rdemem.nexthop_cnt--;
    free(nh);
    return (1);
}
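
#if 0
/*
 * Hedged usage sketch (not part of the original file): the nexthop
 * reference-counting protocol.  The wrapper is hypothetical.
 */
static void
example_nexthop_use(struct bgpd_addr *addr)
{
    struct nexthop *nh;

    nh = nexthop_get(addr); /* looked up or created, returned referenced */
    /* ... use nh ... */
    (void)nexthop_put(nh);  /* returns 1 if the nexthop was freed */
}
#endif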

int
nexthop_compare(struct nexthop *na, struct nexthop *nb)
{
    struct bgpd_addr    *a, *b;

    if (na == nb)
        return (0);
    if (na == NULL)
        return (-1);
    if (nb == NULL)
        return (1);

    a = &na->exit_nexthop;
    b = &nb->exit_nexthop;

    if (a->aid != b->aid)
        return (a->aid - b->aid);

    switch (a->aid) {
    case AID_INET:
        if (ntohl(a->v4.s_addr) > ntohl(b->v4.s_addr))
            return (1);
        if (ntohl(a->v4.s_addr) < ntohl(b->v4.s_addr))
            return (-1);
        return (0);
    case AID_INET6:
        return (memcmp(&a->v6, &b->v6, sizeof(struct in6_addr)));
    default:
        fatalx("nexthop_cmp: unknown af");
    }
    return (-1);
}

struct nexthop *
nexthop_lookup(struct bgpd_addr *nexthop)
{
    struct nexthop  *nh;

    LIST_FOREACH(nh, nexthop_hash(nexthop), nexthop_l) {
        if (memcmp(&nh->exit_nexthop, nexthop,
            sizeof(struct bgpd_addr)) == 0)
            return (nh);
    }
    return (NULL);
}

struct nexthop_head *
nexthop_hash(struct bgpd_addr *nexthop)
{
    u_int32_t    h = 0;

    switch (nexthop->aid) {
    case AID_INET:
        h = SipHash24(&nexthoptablekey, &nexthop->v4.s_addr,
            sizeof(nexthop->v4.s_addr));
        break;
    case AID_INET6:
        h = SipHash24(&nexthoptablekey, &nexthop->v6,
            sizeof(struct in6_addr));
        break;
    default:
        fatalx("nexthop_hash: unsupported AF");
    }
    return (&nexthoptable.nexthop_hashtbl[h & nexthoptable.nexthop_hashmask]);
}