"Fossies" - the Fresh Open Source Software Archive

Member "memcached-1.6.9/memcached.c" (21 Nov 2020, 215933 Bytes) of package /linux/www/memcached-1.6.9.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source-code syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "memcached.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code-changes report: 1.6.8_vs_1.6.9.

    1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
    2 /*
    3  *  memcached - memory caching daemon
    4  *
    5  *       https://www.memcached.org/
    6  *
    7  *  Copyright 2003 Danga Interactive, Inc.  All rights reserved.
    8  *
    9  *  Use and distribution licensed under the BSD license.  See
   10  *  the LICENSE file for full text.
   11  *
   12  *  Authors:
   13  *      Anatoly Vorobey <mellon@pobox.com>
   14  *      Brad Fitzpatrick <brad@danga.com>
   15  */
   16 #include "memcached.h"
   17 #include "storage.h"
   18 #include "authfile.h"
   19 #include "restart.h"
   20 #include <sys/stat.h>
   21 #include <sys/socket.h>
   22 #include <sys/un.h>
   23 #include <signal.h>
   24 #include <sys/param.h>
   25 #include <sys/resource.h>
   26 #include <sys/uio.h>
   27 #include <ctype.h>
   28 #include <stdarg.h>
   29 
   30 /* some POSIX systems need the following definition
   31  * to get mlockall flags out of sys/mman.h.  */
   32 #ifndef _P1003_1B_VISIBLE
   33 #define _P1003_1B_VISIBLE
   34 #endif
   35 #include <pwd.h>
   36 #include <sys/mman.h>
   37 #include <fcntl.h>
   38 #include <netinet/tcp.h>
   39 #include <arpa/inet.h>
   40 #include <errno.h>
   41 #include <stdlib.h>
   42 #include <stdio.h>
   43 #include <string.h>
   44 #include <time.h>
   45 #include <assert.h>
   46 #include <sysexits.h>
   47 #include <stddef.h>
   48 
   49 #ifdef HAVE_GETOPT_LONG
   50 #include <getopt.h>
   51 #endif
   52 
   53 #ifdef TLS
   54 #include "tls.h"
   55 #endif
   56 
   57 #include "proto_text.h"
   58 #include "proto_bin.h"
   59 
   60 #if defined(__FreeBSD__)
   61 #include <sys/sysctl.h>
   62 #endif
   63 
   64 /*
   65  * forward declarations
   66  */
   67 static void drive_machine(conn *c);
   68 static int new_socket(struct addrinfo *ai);
   69 static ssize_t tcp_read(conn *arg, void *buf, size_t count);
   70 static ssize_t tcp_sendmsg(conn *arg, struct msghdr *msg, int flags);
   71 static ssize_t tcp_write(conn *arg, void *buf, size_t count);
   72 
/* Result codes returned by the try_read_* helpers below. */
enum try_read_result {
    READ_DATA_RECEIVED,    /**< data was received into the read buffer */
    READ_NO_DATA_RECEIVED, /**< nothing available to read right now */
    READ_ERROR,            /**< an error occurred (on the socket) (or client closed connection) */
    READ_MEMORY_ERROR      /**< failed to allocate more memory */
};
   79 
   80 static int try_read_command_negotiate(conn *c);
   81 static int try_read_command_udp(conn *c);
   82 
   83 static enum try_read_result try_read_network(conn *c);
   84 static enum try_read_result try_read_udp(conn *c);
   85 
   86 static int start_conn_timeout_thread();
   87 
   88 
   89 /* stats */
   90 static void stats_init(void);
   91 static void conn_to_str(const conn *c, char *addr, char *svr_addr);
   92 
   93 /* defaults */
   94 static void settings_init(void);
   95 
   96 /* event handling, network IO */
   97 static void event_handler(const evutil_socket_t fd, const short which, void *arg);
   98 static void conn_close(conn *c);
   99 static void conn_init(void);
  100 static bool update_event(conn *c, const int new_flags);
  101 static void complete_nread(conn *c);
  102 
  103 static void conn_free(conn *c);
  104 
  105 /** exported globals **/
  106 struct stats stats;
  107 struct stats_state stats_state;
  108 struct settings settings;
  109 time_t process_started;     /* when the process was started */
  110 conn **conns;
  111 
  112 struct slab_rebalance slab_rebal;
  113 volatile int slab_rebalance_signal;
  114 #ifdef EXTSTORE
  115 /* hoping this is temporary; I'd prefer to cut globals, but will complete this
  116  * battle another day.
  117  */
  118 void *ext_storage = NULL;
  119 #endif
  120 /** file scope variables **/
  121 static conn *listen_conn = NULL;
  122 static int max_fds;
  123 static struct event_base *main_base;
  124 
/* Result codes returned by transmit(). */
enum transmit_result {
    TRANSMIT_COMPLETE,   /**< All done writing. */
    TRANSMIT_INCOMPLETE, /**< More data remaining to write. */
    TRANSMIT_SOFT_ERROR, /**< Can't write any more right now. */
    TRANSMIT_HARD_ERROR  /**< Can't write (c->state is set to conn_closing) */
};
  131 
  132 /* Default methods to read from/ write to a socket */
  133 ssize_t tcp_read(conn *c, void *buf, size_t count) {
  134     assert (c != NULL);
  135     return read(c->sfd, buf, count);
  136 }
  137 
  138 ssize_t tcp_sendmsg(conn *c, struct msghdr *msg, int flags) {
  139     assert (c != NULL);
  140     return sendmsg(c->sfd, msg, flags);
  141 }
  142 
  143 ssize_t tcp_write(conn *c, void *buf, size_t count) {
  144     assert (c != NULL);
  145     return write(c->sfd, buf, count);
  146 }
  147 
  148 static enum transmit_result transmit(conn *c);
  149 
  150 /* This reduces the latency without adding lots of extra wiring to be able to
  151  * notify the listener thread of when to listen again.
  152  * Also, the clock timer could be broken out into its own thread and we
  153  * can block the listener via a condition.
  154  */
  155 static volatile bool allow_new_conns = true;
  156 static int stop_main_loop = NOT_STOP;
  157 static struct event maxconnsevent;
/* Timer callback armed when we hit maxconns and stop accepting.
 * Polls every 10ms until allow_new_conns goes true, then re-enables
 * accepting via accept_new_conns().
 * NOTE(review): fd == -42 appears to be a sentinel forcing an initial
 * (re)schedule regardless of allow_new_conns — confirm with callers. */
static void maxconns_handler(const evutil_socket_t fd, const short which, void *arg) {
    struct timeval t = {.tv_sec = 0, .tv_usec = 10000};

    if (fd == -42 || allow_new_conns == false) {
        /* reschedule in 10ms if we need to keep polling */
        evtimer_set(&maxconnsevent, maxconns_handler, 0);
        event_base_set(main_base, &maxconnsevent);
        evtimer_add(&maxconnsevent, &t);
    } else {
        evtimer_del(&maxconnsevent);
        accept_new_conns(true);
    }
}
  171 
  172 /*
  173  * given time value that's either unix time or delta from current unix time, return
  174  * unix time. Use the fact that delta can't exceed one month (and real time value can't
  175  * be that low).
  176  */
  177 rel_time_t realtime(const time_t exptime) {
  178     /* no. of seconds in 30 days - largest possible delta exptime */
  179 
  180     if (exptime == 0) return 0; /* 0 means never expire */
  181 
  182     if (exptime > REALTIME_MAXDELTA) {
  183         /* if item expiration is at/before the server started, give it an
  184            expiration time of 1 second after the server started.
  185            (because 0 means don't expire).  without this, we'd
  186            underflow and wrap around to some large value way in the
  187            future, effectively making items expiring in the past
  188            really expiring never */
  189         if (exptime <= process_started)
  190             return (rel_time_t)1;
  191         return (rel_time_t)(exptime - process_started);
  192     } else {
  193         return (rel_time_t)(exptime + current_time);
  194     }
  195 }
  196 
/* Zero the global stats structures, record an adjusted process start
 * time, and initialize the stats-prefix subsystem. */
static void stats_init(void) {
    memset(&stats, 0, sizeof(struct stats));
    memset(&stats_state, 0, sizeof(struct stats_state));
    stats_state.accepting_conns = true; /* assuming we start in this state. */

    /* make the time we started always be 2 seconds before we really
       did, so time(0) - time.started is never zero.  if so, things
       like 'settings.oldest_live' which act as booleans as well as
       values are now false in boolean context... */
    process_started = time(0) - ITEM_UPDATE_INTERVAL - 2;
    stats_prefix_init(settings.prefix_delimiter);
}
  209 
/* Reset all cumulative stats counters: the global stats struct, prefix
 * stats, per-thread stats, and item stats.  stats_state is untouched. */
void stats_reset(void) {
    STATS_LOCK();
    memset(&stats, 0, sizeof(struct stats));
    stats_prefix_clear();
    STATS_UNLOCK();
    /* per-thread and item stats have their own locking */
    threadlocal_stats_reset();
    item_stats_reset();
}
  218 
/* Populate the global settings struct with compile-time defaults.
 * Called once at startup before command-line parsing overrides them. */
static void settings_init(void) {
    settings.use_cas = true;
    settings.access = 0700;
    settings.port = 11211;
    settings.udpport = 0;
#ifdef TLS
    settings.ssl_enabled = false;
    settings.ssl_ctx = NULL;
    settings.ssl_chain_cert = NULL;
    settings.ssl_key = NULL;
    settings.ssl_verify_mode = SSL_VERIFY_NONE;
    settings.ssl_keyformat = SSL_FILETYPE_PEM;
    settings.ssl_ciphers = NULL;
    settings.ssl_ca_cert = NULL;
    settings.ssl_last_cert_refresh_time = current_time;
    settings.ssl_wbuf_size = 16 * 1024; // default is 16KB (SSL max frame size is 17KB)
    settings.ssl_session_cache = false;
#endif
    /* By default this string should be NULL for getaddrinfo() */
    settings.inter = NULL;
    settings.maxbytes = 64 * 1024 * 1024; /* default is 64MB */
    settings.maxconns = 1024;         /* to limit connections-related memory to about 5MB */
    settings.verbose = 0;
    settings.oldest_live = 0;
    settings.oldest_cas = 0;          /* supplements accuracy of oldest_live */
    settings.evict_to_free = 1;       /* push old items out of cache when memory runs out */
    settings.socketpath = NULL;       /* by default, not using a unix socket */
    settings.auth_file = NULL;        /* by default, not using ASCII authentication tokens */
    settings.factor = 1.25;           /* slab class growth factor */
    settings.chunk_size = 48;         /* space for a modest key and value */
    settings.num_threads = 4;         /* N workers */
    settings.num_threads_per_udp = 0;
    settings.prefix_delimiter = ':';
    settings.detail_enabled = 0;
    settings.reqs_per_event = 20;
    settings.backlog = 1024;
    settings.binding_protocol = negotiating_prot;
    settings.item_size_max = 1024 * 1024; /* The famous 1MB upper limit. */
    settings.slab_page_size = 1024 * 1024; /* chunks are split from 1MB pages. */
    settings.slab_chunk_size_max = settings.slab_page_size / 2;
    settings.sasl = false;
    settings.maxconns_fast = true;
    settings.lru_crawler = false;
    settings.lru_crawler_sleep = 100;
    settings.lru_crawler_tocrawl = 0;
    settings.lru_maintainer_thread = false;
    settings.lru_segmented = true;
    settings.hot_lru_pct = 20;
    settings.warm_lru_pct = 40;
    settings.hot_max_factor = 0.2;
    settings.warm_max_factor = 2.0;
    settings.temp_lru = false;
    settings.temporary_ttl = 61;
    settings.idle_timeout = 0; /* disabled */
    settings.hashpower_init = 0;
    settings.slab_reassign = true;
    settings.slab_automove = 1;
    settings.slab_automove_ratio = 0.8;
    settings.slab_automove_window = 30;
    settings.shutdown_command = false;
    settings.tail_repair_time = TAIL_REPAIR_TIME_DEFAULT;
    settings.flush_enabled = true;
    settings.dump_enabled = true;
    settings.crawls_persleep = 1000;
    settings.logger_watcher_buf_size = LOGGER_WATCHER_BUF_SIZE;
    settings.logger_buf_size = LOGGER_BUF_SIZE;
    settings.drop_privileges = false;
    settings.watch_enabled = true;
    settings.read_buf_mem_limit = 0; /* 0 = unlimited */
#ifdef MEMCACHED_DEBUG
    settings.relaxed_privileges = false;
#endif
    settings.num_napi_ids = 0;
    settings.memory_file = NULL;
}
  294 
  295 extern pthread_mutex_t conn_lock;
  296 
  297 /* Connection timeout thread bits */
  298 static pthread_t conn_timeout_tid;
  299 static int do_run_conn_timeout_thread;
  300 static pthread_cond_t conn_timeout_cond = PTHREAD_COND_INITIALIZER;
  301 static pthread_mutex_t conn_timeout_lock = PTHREAD_MUTEX_INITIALIZER;
  302 
  303 #define CONNS_PER_SLICE 100
  304 #define TIMEOUT_MSG_SIZE (1 + sizeof(int))
  305 static void *conn_timeout_thread(void *arg) {
  306     int i;
  307     conn *c;
  308     char buf[TIMEOUT_MSG_SIZE];
  309     rel_time_t oldest_last_cmd;
  310     int sleep_time;
  311     int sleep_slice = max_fds / CONNS_PER_SLICE;
  312     if (sleep_slice == 0)
  313         sleep_slice = CONNS_PER_SLICE;
  314 
  315     useconds_t timeslice = 1000000 / sleep_slice;
  316 
  317     mutex_lock(&conn_timeout_lock);
  318     while(do_run_conn_timeout_thread) {
  319         if (settings.verbose > 2)
  320             fprintf(stderr, "idle timeout thread at top of connection list\n");
  321 
  322         oldest_last_cmd = current_time;
  323 
  324         for (i = 0; i < max_fds; i++) {
  325             if ((i % CONNS_PER_SLICE) == 0) {
  326                 if (settings.verbose > 2)
  327                     fprintf(stderr, "idle timeout thread sleeping for %ulus\n",
  328                         (unsigned int)timeslice);
  329                 usleep(timeslice);
  330             }
  331 
  332             if (!conns[i])
  333                 continue;
  334 
  335             c = conns[i];
  336 
  337             if (!IS_TCP(c->transport))
  338                 continue;
  339 
  340             if (c->state != conn_new_cmd && c->state != conn_read)
  341                 continue;
  342 
  343             if ((current_time - c->last_cmd_time) > settings.idle_timeout) {
  344                 buf[0] = 't';
  345                 memcpy(&buf[1], &i, sizeof(int));
  346                 if (write(c->thread->notify_send_fd, buf, TIMEOUT_MSG_SIZE)
  347                     != TIMEOUT_MSG_SIZE)
  348                     perror("Failed to write timeout to notify pipe");
  349             } else {
  350                 if (c->last_cmd_time < oldest_last_cmd)
  351                     oldest_last_cmd = c->last_cmd_time;
  352             }
  353         }
  354 
  355         /* This is the soonest we could have another connection time out */
  356         sleep_time = settings.idle_timeout - (current_time - oldest_last_cmd) + 1;
  357         if (sleep_time <= 0)
  358             sleep_time = 1;
  359 
  360         if (settings.verbose > 2)
  361             fprintf(stderr,
  362                     "idle timeout thread finished pass, sleeping for %ds\n",
  363                     sleep_time);
  364 
  365         struct timeval now;
  366         struct timespec to_sleep;
  367         gettimeofday(&now, NULL);
  368         to_sleep.tv_sec = now.tv_sec + sleep_time;
  369         to_sleep.tv_nsec = 0;
  370 
  371         pthread_cond_timedwait(&conn_timeout_cond, &conn_timeout_lock, &to_sleep);
  372     }
  373 
  374     mutex_unlock(&conn_timeout_lock);
  375     return NULL;
  376 }
  377 
  378 static int start_conn_timeout_thread() {
  379     int ret;
  380 
  381     if (settings.idle_timeout == 0)
  382         return -1;
  383 
  384     do_run_conn_timeout_thread = 1;
  385     if ((ret = pthread_create(&conn_timeout_tid, NULL,
  386         conn_timeout_thread, NULL)) != 0) {
  387         fprintf(stderr, "Can't create idle connection timeout thread: %s\n",
  388             strerror(ret));
  389         return -1;
  390     }
  391 
  392     return 0;
  393 }
  394 
/* Signal the idle-timeout thread to exit and join it.
 * Returns -1 if the thread was never started, 0 on success. */
int stop_conn_timeout_thread(void) {
    if (!do_run_conn_timeout_thread)
        return -1;
    mutex_lock(&conn_timeout_lock);
    do_run_conn_timeout_thread = 0;
    /* wake the thread out of pthread_cond_timedwait so it sees the flag */
    pthread_cond_signal(&conn_timeout_cond);
    mutex_unlock(&conn_timeout_lock);
    pthread_join(conn_timeout_tid, NULL);
    return 0;
}
  405 
  406 /*
  407  * read buffer cache helper functions
  408  */
/* Return c's read buffer when it is fully drained (rbytes == 0).
 * A malloc'ed oversize buffer (see rbuf_switch_to_malloc) is freed
 * outright; a normal one goes back to the per-thread rbuf cache.
 * UDP connections keep their persistent buffer. */
static void rbuf_release(conn *c) {
    if (c->rbuf != NULL && c->rbytes == 0 && !IS_UDP(c->transport)) {
        if (c->rbuf_malloced) {
            free(c->rbuf);
            c->rbuf_malloced = false;
        } else {
            do_cache_free(c->thread->rbuf_cache, c->rbuf);
        }
        c->rsize = 0;
        c->rbuf = NULL;
        c->rcurr = NULL;
    }
}
  422 
/* Ensure c has a read buffer, pulling one from the per-thread cache
 * if needed.  Returns false (and bumps the read_buf_oom counter) if
 * the cache cannot supply one.  A no-op if c->rbuf is already set. */
static bool rbuf_alloc(conn *c) {
    if (c->rbuf == NULL) {
        c->rbuf = do_cache_alloc(c->thread->rbuf_cache);
        if (!c->rbuf) {
            THR_STATS_LOCK(c);
            c->thread->stats.read_buf_oom++;
            THR_STATS_UNLOCK(c);
            return false;
        }
        c->rsize = READ_BUFFER_SIZE;
        c->rcurr = c->rbuf;
    }
    return true;
}
  437 
  438 // Just for handling huge ASCII multigets.
  439 // The previous system was essentially the same; realloc'ing until big enough,
  440 // then realloc'ing back down after the request finished.
  441 bool rbuf_switch_to_malloc(conn *c) {
  442     // Might as well start with x2 and work from there.
  443     size_t size = c->rsize * 2;
  444     char *tmp = malloc(size);
  445     if (!tmp)
  446         return false;
  447 
  448     do_cache_free(c->thread->rbuf_cache, c->rbuf);
  449     memcpy(tmp, c->rcurr, c->rbytes);
  450 
  451     c->rcurr = c->rbuf = tmp;
  452     c->rsize = size;
  453     c->rbuf_malloced = true;
  454     return true;
  455 }
  456 
  457 /*
  458  * Initializes the connections array. We don't actually allocate connection
  459  * structures until they're needed, so as to avoid wasting memory when the
  460  * maximum connection count is much higher than the actual number of
  461  * connections.
  462  *
  463  * This does end up wasting a few pointers' worth of memory for FDs that are
  464  * used for things other than connections, but that's worth it in exchange for
  465  * being able to directly index the conns array by FD.
  466  */
/* Allocate the global conns[] pointer array, sized to the highest fd
 * the process could ever see (RLIMIT_NOFILE hard limit when available,
 * otherwise maxconns plus a probe of the current fd range). */
static void conn_init(void) {
    /* We're unlikely to see an FD much higher than maxconns. */
    int next_fd = dup(1);  /* probe: roughly the next fd the kernel will hand out */
    if (next_fd < 0) {
        perror("Failed to duplicate file descriptor\n");
        exit(1);
    }
    int headroom = 10;      /* account for extra unexpected open FDs */
    struct rlimit rl;

    max_fds = settings.maxconns + headroom + next_fd;

    /* But if possible, get the actual highest FD we can possibly ever see. */
    if (getrlimit(RLIMIT_NOFILE, &rl) == 0) {
        max_fds = rl.rlim_max;
    } else {
        fprintf(stderr, "Failed to query maximum file descriptor; "
                        "falling back to maxconns\n");
    }

    close(next_fd);

    /* conns is indexed directly by fd, so it must span every possible fd */
    if ((conns = calloc(max_fds, sizeof(conn *))) == NULL) {
        fprintf(stderr, "Failed to allocate connection structures\n");
        /* This is unrecoverable so bail out early. */
        exit(1);
    }
}
  495 
  496 static const char *prot_text(enum protocol prot) {
  497     char *rv = "unknown";
  498     switch(prot) {
  499         case ascii_prot:
  500             rv = "ascii";
  501             break;
  502         case binary_prot:
  503             rv = "binary";
  504             break;
  505         case negotiating_prot:
  506             rv = "auto-negotiate";
  507             break;
  508     }
  509     return rv;
  510 }
  511 
/* Close c if it has exceeded settings.idle_timeout.  Only connections
 * sitting in conn_new_cmd or conn_read are eligible; anything mid-
 * request is left alone.  Runs on the worker thread that owns c. */
void conn_close_idle(conn *c) {
    if (settings.idle_timeout > 0 &&
        (current_time - c->last_cmd_time) > settings.idle_timeout) {
        if (c->state != conn_new_cmd && c->state != conn_read) {
            if (settings.verbose > 1)
                /* note: message intentionally matches upstream (no newline) */
                fprintf(stderr,
                    "fd %d wants to timeout, but isn't in read state", c->sfd);
            return;
        }

        if (settings.verbose > 1)
            fprintf(stderr, "Closing idle fd %d\n", c->sfd);

        pthread_mutex_lock(&c->thread->stats.mutex);
        c->thread->stats.idle_kicks++;
        pthread_mutex_unlock(&c->thread->stats.mutex);

        /* transition to closing and run the state machine to completion */
        conn_set_state(c, conn_closing);
        drive_machine(c);
    }
}
  533 
  534 /* bring conn back from a sidethread. could have had its event base moved. */
/* Re-attach c to its worker thread's event base after a side thread
 * finished with it, then resume the state machine.  For conn_io_queue,
 * waits until every submitted IO queue has returned before re-adding. */
void conn_worker_readd(conn *c) {
    if (c->state == conn_io_queue) {
        c->io_queues_submitted--;
        // If we're still waiting for other queues to return, don't re-add the
        // connection yet.
        if (c->io_queues_submitted != 0) {
            return;
        }
    }
    /* re-register the read event on this worker's event base */
    c->ev_flags = EV_READ | EV_PERSIST;
    event_set(&c->event, c->sfd, c->ev_flags, event_handler, (void *)c);
    event_base_set(c->thread->base, &c->event);

    // TODO: call conn_cleanup/fail/etc
    if (event_add(&c->event, 0) == -1) {
        perror("event_add");
    }

    // side thread wanted us to close immediately.
    if (c->state == conn_closing) {
        drive_machine(c);
        return;
    } else if (c->state == conn_io_queue) {
        // machine will know how to return based on secondary state.
        drive_machine(c);
    } else {
        conn_set_state(c, conn_new_cmd);
    }
}
  564 
  565 void conn_io_queue_add(conn *c, int type, void *ctx, io_queue_stack_cb cb, io_queue_stack_cb com_cb, io_queue_cb fin_cb) {
  566     io_queue_t *q = c->io_queues;
  567     while (q->type != IO_QUEUE_NONE) {
  568         q++;
  569     }
  570     q->type = type;
  571     q->ctx = ctx;
  572     q->stack_ctx = NULL;
  573     q->submit_cb = cb;
  574     q->complete_cb = com_cb;
  575     q->finalize_cb = fin_cb;
  576     return;
  577 }
  578 
  579 io_queue_t *conn_io_queue_get(conn *c, int type) {
  580     io_queue_t *q = c->io_queues;
  581     while (q->type != IO_QUEUE_NONE) {
  582         if (q->type == type) {
  583             return q;
  584         }
  585         q++;
  586     }
  587     return NULL;
  588 }
  589 
  590 // called after returning to the main worker thread.
  591 // users of the queue need to distinguish if the IO was actually consumed or
  592 // not and handle appropriately.
  593 static void conn_io_queue_complete(conn *c) {
  594     io_queue_t *q = c->io_queues;
  595     while (q->type != IO_QUEUE_NONE) {
  596         // Reuse the same submit stack. We zero it out first so callbacks can
  597         // queue new IO's if necessary.
  598         if (q->stack_ctx) {
  599             void *tmp = q->stack_ctx;
  600             q->stack_ctx = NULL;
  601             q->complete_cb(q->ctx, tmp);
  602         }
  603         q++;
  604     }
  605 }
  606 
/* Fetch (or lazily allocate) the conn object for fd `sfd`, initialize
 * its per-connection state, install read/write function pointers (TLS
 * or plain TCP), pick the protocol parser, and register the libevent
 * handler on `base`.  conn structs are cached in conns[] and reused
 * across connections on the same fd.
 * Returns the conn, or NULL on allocation/event-registration failure. */
conn *conn_new(const int sfd, enum conn_states init_state,
                const int event_flags,
                const int read_buffer_size, enum network_transport transport,
                struct event_base *base, void *ssl) {
    conn *c;

    assert(sfd >= 0 && sfd < max_fds);
    c = conns[sfd];

    /* first use of this fd slot: allocate and do one-time setup */
    if (NULL == c) {
        if (!(c = (conn *)calloc(1, sizeof(conn)))) {
            STATS_LOCK();
            stats.malloc_fails++;
            STATS_UNLOCK();
            fprintf(stderr, "Failed to allocate connection object\n");
            return NULL;
        }
        MEMCACHED_CONN_CREATE(c);
        c->read = NULL;
        c->sendmsg = NULL;
        c->write = NULL;
        c->rbuf = NULL;

        c->rsize = read_buffer_size;

        // UDP connections use a persistent static buffer.
        if (c->rsize) {
            c->rbuf = (char *)malloc((size_t)c->rsize);
        }

        if (c->rsize && c->rbuf == NULL) {
            conn_free(c);
            STATS_LOCK();
            stats.malloc_fails++;
            STATS_UNLOCK();
            fprintf(stderr, "Failed to allocate buffers for connection\n");
            return NULL;
        }


        STATS_LOCK();
        stats_state.conn_structs++;
        STATS_UNLOCK();

        c->sfd = sfd;
        conns[sfd] = c;
    }

    /* everything below re-runs each time the fd slot is reused */
    c->transport = transport;
    c->protocol = settings.binding_protocol;

    /* unix socket mode doesn't need this, so zeroed out.  but why
     * is this done for every command?  presumably for UDP
     * mode.  */
    if (!settings.socketpath) {
        c->request_addr_size = sizeof(c->request_addr);
    } else {
        c->request_addr_size = 0;
    }

    if (transport == tcp_transport && init_state == conn_new_cmd) {
        /* best-effort: a failed getpeername just blanks the address */
        if (getpeername(sfd, (struct sockaddr *) &c->request_addr,
                        &c->request_addr_size)) {
            perror("getpeername");
            memset(&c->request_addr, 0, sizeof(c->request_addr));
        }
    }

    if (settings.verbose > 1) {
        if (init_state == conn_listening) {
            fprintf(stderr, "<%d server listening (%s)\n", sfd,
                prot_text(c->protocol));
        } else if (IS_UDP(transport)) {
            fprintf(stderr, "<%d server listening (udp)\n", sfd);
        } else if (c->protocol == negotiating_prot) {
            fprintf(stderr, "<%d new auto-negotiating client connection\n",
                    sfd);
        } else if (c->protocol == ascii_prot) {
            fprintf(stderr, "<%d new ascii client connection.\n", sfd);
        } else if (c->protocol == binary_prot) {
            fprintf(stderr, "<%d new binary client connection.\n", sfd);
        } else {
            fprintf(stderr, "<%d new unknown (%d) client connection\n",
                sfd, c->protocol);
            assert(false);
        }
    }

#ifdef TLS
    c->ssl = NULL;
    c->ssl_wbuf = NULL;
    c->ssl_enabled = false;
#endif
    /* reset per-request state for the (possibly reused) conn */
    c->state = init_state;
    c->rlbytes = 0;
    c->cmd = -1;
    c->rbytes = 0;
    c->rcurr = c->rbuf;
    c->ritem = 0;
    c->rbuf_malloced = false;
    c->sasl_started = false;
    c->set_stale = false;
    c->mset_res = false;
    c->close_after_write = false;
    c->last_cmd_time = current_time; /* initialize for idle kicker */
    // wipe all queues.
    memset(c->io_queues, 0, sizeof(c->io_queues));
    c->io_queues_submitted = 0;

    c->item = 0;

    c->noreply = false;

#ifdef TLS
    if (ssl) {
        /* TLS connection: route IO through the ssl_* wrappers */
        c->ssl = (SSL*)ssl;
        c->read = ssl_read;
        c->sendmsg = ssl_sendmsg;
        c->write = ssl_write;
        c->ssl_enabled = true;
        SSL_set_info_callback(c->ssl, ssl_callback);
    } else
#else
    // This must be NULL if TLS is not enabled.
    assert(ssl == NULL);
#endif
    {
        c->read = tcp_read;
        c->sendmsg = tcp_sendmsg;
        c->write = tcp_write;
    }

    /* select the command parser based on transport and protocol */
    if (IS_UDP(transport)) {
        c->try_read_command = try_read_command_udp;
    } else {
        switch (c->protocol) {
            case ascii_prot:
                if (settings.auth_file == NULL) {
                    c->authenticated = true;
                    c->try_read_command = try_read_command_ascii;
                } else {
                    c->authenticated = false;
                    c->try_read_command = try_read_command_asciiauth;
                }
                break;
            case binary_prot:
                // binprot handles its own authentication via SASL parsing.
                c->authenticated = false;
                c->try_read_command = try_read_command_binary;
                break;
            case negotiating_prot:
                c->try_read_command = try_read_command_negotiate;
                break;
        }
    }

    event_set(&c->event, sfd, event_flags, event_handler, (void *)c);
    event_base_set(base, &c->event);
    c->ev_flags = event_flags;

    if (event_add(&c->event, 0) == -1) {
        perror("event_add");
        return NULL;
    }

    STATS_LOCK();
    stats_state.curr_conns++;
    stats.total_conns++;
    STATS_UNLOCK();

    MEMCACHED_CONN_ALLOCATE(c->sfd);

    return c;
}
  781 
/* Drop any item reference held by c and finish (free) every unsent
 * response in its response chain.  Called during cleanup/close. */
void conn_release_items(conn *c) {
    assert(c != NULL);

    if (c->item) {
        item_remove(c->item);
        c->item = 0;
    }

    // Cull any unsent responses.
    if (c->resp_head) {
        mc_resp *resp = c->resp_head;
        // r_f() handles the chain maintenance.
        while (resp) {
            // temporary by default. hide behind a debug flag in the future:
            // double free detection. Transmit loops can drop out early, but
            // here we could infinite loop.
            if (resp->free) {
                fprintf(stderr, "ERROR: double free detected during conn_release_items(): [%d] [%s]\n",
                        c->sfd, c->protocol == binary_prot ? "binary" : "ascii");
                // Since this is a critical failure, just leak the memory.
                // If these errors are seen, an abort() can be used instead.
                c->resp_head = NULL;
                c->resp = NULL;
                break;
            }
            resp = resp_finish(c, resp);
        }
    }
}
  811 
/* Release per-request resources held by c: outstanding items and
 * responses, and any SASL connection state.  A UDP conn is put back
 * into the conn_read state rather than being torn down. */
static void conn_cleanup(conn *c) {
    assert(c != NULL);

    conn_release_items(c);

    if (c->sasl_conn) {
        assert(settings.sasl);
        sasl_dispose(&c->sasl_conn);
        c->sasl_conn = NULL;
    }

    if (IS_UDP(c->transport)) {
        conn_set_state(c, conn_read);
    }
}
  827 
  828 /*
  829  * Frees a connection.
  830  */
  831 void conn_free(conn *c) {
  832     if (c) {
  833         assert(c != NULL);
  834         assert(c->sfd >= 0 && c->sfd < max_fds);
  835 
  836         MEMCACHED_CONN_DESTROY(c);
  837         conns[c->sfd] = NULL;
  838         if (c->rbuf)
  839             free(c->rbuf);
  840 #ifdef TLS
  841         if (c->ssl_wbuf)
  842             c->ssl_wbuf = NULL;
  843 #endif
  844 
  845         free(c);
  846     }
  847 }
  848 
/* Fully close a connection: unregister its event, release request
 * resources and the read buffer, shut down TLS if active, close the
 * socket, and update connection-accept/stat bookkeeping.  The conn
 * struct itself stays cached in conns[] for reuse. */
static void conn_close(conn *c) {
    assert(c != NULL);

    /* delete the event, the socket and the conn */
    event_del(&c->event);

    if (settings.verbose > 1)
        fprintf(stderr, "<%d connection closed.\n", c->sfd);

    conn_cleanup(c);

    // force release of read buffer.
    if (c->thread) {
        c->rbytes = 0;  /* rbuf_release only releases a drained buffer */
        rbuf_release(c);
    }

    MEMCACHED_CONN_RELEASE(c->sfd);
    conn_set_state(c, conn_closed);
#ifdef TLS
    if (c->ssl) {
        SSL_shutdown(c->ssl);
        SSL_free(c->ssl);
    }
#endif
    close(c->sfd);
    /* a slot freed up; allow the listener to accept again */
    pthread_mutex_lock(&conn_lock);
    allow_new_conns = true;
    pthread_mutex_unlock(&conn_lock);

    STATS_LOCK();
    stats_state.curr_conns--;
    STATS_UNLOCK();

    return;
}
  885 
  886 // Since some connections might be off on side threads and some are managed as
  887 // listeners we need to walk through them all from a central point.
  888 // Must be called with all worker threads hung or in the process of closing.
  889 void conn_close_all(void) {
  890     int i;
  891     for (i = 0; i < max_fds; i++) {
  892         if (conns[i] && conns[i]->state != conn_closed) {
  893             conn_close(conns[i]);
  894         }
  895     }
  896 }
  897 
  898 /**
  899  * Convert a state name to a human readable form.
  900  */
  901 static const char *state_text(enum conn_states state) {
  902     const char* const statenames[] = { "conn_listening",
  903                                        "conn_new_cmd",
  904                                        "conn_waiting",
  905                                        "conn_read",
  906                                        "conn_parse_cmd",
  907                                        "conn_write",
  908                                        "conn_nread",
  909                                        "conn_swallow",
  910                                        "conn_closing",
  911                                        "conn_mwrite",
  912                                        "conn_closed",
  913                                        "conn_watch",
  914                                        "conn_io_queue" };
  915     return statenames[state];
  916 }
  917 
  918 /*
  919  * Sets a connection's current state in the state machine. Any special
  920  * processing that needs to happen on certain state transitions can
  921  * happen here.
  922  */
void conn_set_state(conn *c, enum conn_states state) {
    assert(c != NULL);
    assert(state >= conn_listening && state < conn_max_state);

    if (state != c->state) {
        if (settings.verbose > 2) {
            fprintf(stderr, "%d: going from %s to %s\n",
                    c->sfd, state_text(c->state),
                    state_text(state));
        }

        /* Entering a write state ends command processing for the DTrace
         * probe. NOTE(review): this dereferences c->resp, so callers are
         * presumed to have a current response when transitioning to
         * conn_write/conn_mwrite -- confirm with call sites. */
        if (state == conn_write || state == conn_mwrite) {
            MEMCACHED_PROCESS_COMMAND_END(c->sfd, c->resp->wbuf, c->resp->wbytes);
        }
        c->state = state;
    }
}
  940 
  941 /*
  942  * response object helper functions
  943  */
  944 void resp_reset(mc_resp *resp) {
  945     if (resp->item) {
  946         item_remove(resp->item);
  947         resp->item = NULL;
  948     }
  949     if (resp->write_and_free) {
  950         free(resp->write_and_free);
  951         resp->write_and_free = NULL;
  952     }
  953     resp->wbytes = 0;
  954     resp->tosend = 0;
  955     resp->iovcnt = 0;
  956     resp->chunked_data_iov = 0;
  957     resp->chunked_total = 0;
  958     resp->skip = false;
  959 }
  960 
  961 void resp_add_iov(mc_resp *resp, const void *buf, int len) {
  962     assert(resp->iovcnt < MC_RESP_IOVCOUNT);
  963     int x = resp->iovcnt;
  964     resp->iov[x].iov_base = (void *)buf;
  965     resp->iov[x].iov_len = len;
  966     resp->iovcnt++;
  967     resp->tosend += len;
  968 }
  969 
  970 // Notes that an IOV should be handled as a chunked item header.
  971 // TODO: I'm hoping this isn't a permanent abstraction while I learn what the
  972 // API should be.
  973 void resp_add_chunked_iov(mc_resp *resp, const void *buf, int len) {
  974     resp->chunked_data_iov = resp->iovcnt;
  975     resp->chunked_total = len;
  976     resp_add_iov(resp, buf, len);
  977 }
  978 
  979 // resp_allocate and resp_free are a wrapper around read buffers which makes
  980 // read buffers the only network memory to track.
  981 // Normally this would be too excessive. In this case it allows end users to
  982 // track a single memory limit for ephemeral connection buffers.
  983 // Fancy bit twiddling tricks are avoided to help keep this straightforward.
/* Allocates one mc_resp from the worker thread's bundle freelist.
 * Bundles are carved out of the thread's rbuf_cache; a bundle leaves the
 * open list once all of its slots are in use. Returns NULL on OOM. */
static mc_resp* resp_allocate(conn *c) {
    LIBEVENT_THREAD *th = c->thread;
    mc_resp *resp = NULL;
    mc_resp_bundle *b = th->open_bundle;

    if (b != NULL) {
        for (int i = 0; i < MAX_RESP_PER_BUNDLE; i++) {
            // loop around starting from the most likely to be free
            int x = (i + b->next_check) % MAX_RESP_PER_BUNDLE;
            if (b->r[x].free) {
                resp = &b->r[x];
                b->next_check = x+1;
                break;
            }
        }

        if (resp != NULL) {
            b->refcount++;
            resp->free = false;
            /* Bundle is now completely full: unlink it from the open list
             * so allocation doesn't keep scanning it. */
            if (b->refcount == MAX_RESP_PER_BUNDLE) {
                assert(b->prev == NULL);
                // We only allocate off the head. Assign new head.
                th->open_bundle = b->next;
                // Remove ourselves from the list.
                if (b->next) {
                    b->next->prev = 0;
                    b->next = 0;
                }
            }
        }
    }

    /* No open bundle (or the head was full): allocate a fresh bundle from
     * the shared read-buffer cache and take its first slot. */
    if (resp == NULL) {
        assert(th->open_bundle == NULL);
        b = do_cache_alloc(th->rbuf_cache);
        if (b) {
            THR_STATS_LOCK(c);
            c->thread->stats.response_obj_bytes += READ_BUFFER_SIZE;
            THR_STATS_UNLOCK(c);
            /* Slot 0 is handed out below, so start future scans at 1. */
            b->next_check = 1;
            b->refcount = 1;
            for (int i = 0; i < MAX_RESP_PER_BUNDLE; i++) {
                b->r[i].bundle = b;
                b->r[i].free = true;
            }
            b->next = 0;
            b->prev = 0;
            th->open_bundle = b;
            resp = &b->r[0];
            resp->free = false;
        } else {
            return NULL;
        }
    }

    return resp;
}
 1041 
/* Returns one mc_resp slot to its bundle. A fully-empty bundle is either
 * retained as the open head (to skip re-init) or given back to the
 * rbuf_cache; a partially-used bundle is relinked onto the open list so its
 * free slots can be found again. */
static void resp_free(conn *c, mc_resp *resp) {
    LIBEVENT_THREAD *th = c->thread;
    mc_resp_bundle *b = resp->bundle;

    resp->free = true;
    b->refcount--;
    if (b->refcount == 0) {
        if (b == th->open_bundle && b->next == 0) {
            // This is the final bundle. Just hold and reuse to skip init loop
            assert(b->prev == 0);
            b->next_check = 0;
        } else {
            // Assert that we're either in the list or at the head.
            assert((b->next || b->prev) || b == th->open_bundle);

            // unlink from list.
            mc_resp_bundle **head = &th->open_bundle;
            if (*head == b) *head = b->next;
            // Not tracking the tail.
            assert(b->next != b && b->prev != b);

            if (b->next) b->next->prev = b->prev;
            if (b->prev) b->prev->next = b->next;

            // Now completely done with this buffer.
            do_cache_free(th->rbuf_cache, b);
            THR_STATS_LOCK(c);
            c->thread->stats.response_obj_bytes -= READ_BUFFER_SIZE;
            THR_STATS_UNLOCK(c);
        }
    } else {
        mc_resp_bundle **head = &th->open_bundle;
        // NOTE: since we're not tracking tail, latest free ends up in head.
        if (b == th->open_bundle || (b->prev || b->next)) {
            // If we're already linked, leave it in place to save CPU.
        } else {
            // Non-zero refcount, need to link into the freelist.
            b->prev = 0;
            b->next = *head;
            if (b->next) b->next->prev = b;
            *head = b;
        }

    }
}
 1087 
/* Allocates and initializes a new response object, appending it to the
 * connection's response chain. Returns false on OOM (counted in thread
 * stats), true on success. */
bool resp_start(conn *c) {
    mc_resp *resp = resp_allocate(c);
    if (!resp) {
        THR_STATS_LOCK(c);
        c->thread->stats.response_obj_oom++;
        THR_STATS_UNLOCK(c);
        return false;
    }
    // handling the stats counters here to simplify testing
    THR_STATS_LOCK(c);
    c->thread->stats.response_obj_count++;
    THR_STATS_UNLOCK(c);
    // Skip zeroing the bundle pointer at the start.
    // TODO: this line is here temporarily to make the code easy to disable.
    // when it's more mature, move the memset into resp_allocate() and have it
    // set the bundle pointer on allocate so this line isn't as complex.
    memset((char *)resp + sizeof(mc_resp_bundle*), 0, sizeof(*resp) - sizeof(mc_resp_bundle*));
    // TODO: this next line works. memset _does_ show up significantly under
    // perf reports due to zeroing out the entire resp->wbuf. before swapping
    // the lines more validation work should be done to ensure wbuf's aren't
    // accidentally reused without being written to.
    //memset((char *)resp + sizeof(mc_resp_bundle*), 0, offsetof(mc_resp, wbuf));
    /* Splice onto the response chain: set the head if empty, else append
     * after the current tail and advance it. */
    if (!c->resp_head) {
        c->resp_head = resp;
    }
    if (!c->resp) {
        c->resp = resp;
    } else {
        c->resp->next = resp;
        c->resp = resp;
    }
    if (IS_UDP(c->transport)) {
        // need to hold on to some data for async responses.
        c->resp->request_id = c->request_id;
        c->resp->request_addr = c->request_addr;
        c->resp->request_addr_size = c->request_addr_size;
    }
    return true;
}
 1127 
 1128 // returns next response in chain.
 1129 mc_resp* resp_finish(conn *c, mc_resp *resp) {
 1130     mc_resp *next = resp->next;
 1131     if (resp->item) {
 1132         // TODO: cache hash value in resp obj?
 1133         item_remove(resp->item);
 1134         resp->item = NULL;
 1135     }
 1136     if (resp->write_and_free) {
 1137         free(resp->write_and_free);
 1138     }
 1139     if (resp->io_pending) {
 1140         // If we had a pending IO, tell it to internally clean up then return
 1141         // the main object back to our thread cache.
 1142         resp->io_pending->q->finalize_cb(resp->io_pending);
 1143         do_cache_free(c->thread->io_cache, resp->io_pending);
 1144         resp->io_pending = NULL;
 1145     }
 1146     if (c->resp_head == resp) {
 1147         c->resp_head = next;
 1148     }
 1149     if (c->resp == resp) {
 1150         c->resp = NULL;
 1151     }
 1152     resp_free(c, resp);
 1153     THR_STATS_LOCK(c);
 1154     c->thread->stats.response_obj_count--;
 1155     THR_STATS_UNLOCK(c);
 1156     return next;
 1157 }
 1158 
 1159 // tells if connection has a depth of response objects to process.
 1160 bool resp_has_stack(conn *c) {
 1161     return c->resp_head->next != NULL ? true : false;
 1162 }
 1163 
 1164 void out_string(conn *c, const char *str) {
 1165     size_t len;
 1166     assert(c != NULL);
 1167     mc_resp *resp = c->resp;
 1168 
 1169     // if response was original filled with something, but we're now writing
 1170     // out an error or similar, have to reset the object first.
 1171     // TODO: since this is often redundant with allocation, how many callers
 1172     // are actually requiring it be reset? Can we fast test by just looking at
 1173     // tosend and reset if nonzero?
 1174     resp_reset(resp);
 1175 
 1176     if (c->noreply) {
 1177         // TODO: just invalidate the response since nothing's been attempted
 1178         // to send yet?
 1179         resp->skip = true;
 1180         if (settings.verbose > 1)
 1181             fprintf(stderr, ">%d NOREPLY %s\n", c->sfd, str);
 1182         conn_set_state(c, conn_new_cmd);
 1183         return;
 1184     }
 1185 
 1186     if (settings.verbose > 1)
 1187         fprintf(stderr, ">%d %s\n", c->sfd, str);
 1188 
 1189     // Fill response object with static string.
 1190 
 1191     len = strlen(str);
 1192     if ((len + 2) > WRITE_BUFFER_SIZE) {
 1193         /* ought to be always enough. just fail for simplicity */
 1194         str = "SERVER_ERROR output line too long";
 1195         len = strlen(str);
 1196     }
 1197 
 1198     memcpy(resp->wbuf, str, len);
 1199     memcpy(resp->wbuf + len, "\r\n", 2);
 1200     resp_add_iov(resp, resp->wbuf, len + 2);
 1201 
 1202     conn_set_state(c, conn_new_cmd);
 1203     return;
 1204 }
 1205 
 1206 // For metaget-style ASCII commands. Ignores noreply, ensuring clients see
 1207 // protocol level errors.
 1208 void out_errstring(conn *c, const char *str) {
 1209     c->noreply = false;
 1210     out_string(c, str);
 1211 }
 1212 
 1213 /*
 1214  * Outputs a protocol-specific "out of memory" error. For ASCII clients,
 1215  * this is equivalent to out_string().
 1216  */
 1217 void out_of_memory(conn *c, char *ascii_error) {
 1218     const static char error_prefix[] = "SERVER_ERROR ";
 1219     const static int error_prefix_len = sizeof(error_prefix) - 1;
 1220 
 1221     if (c->protocol == binary_prot) {
 1222         /* Strip off the generic error prefix; it's irrelevant in binary */
 1223         if (!strncmp(ascii_error, error_prefix, error_prefix_len)) {
 1224             ascii_error += error_prefix_len;
 1225         }
 1226         write_bin_error(c, PROTOCOL_BINARY_RESPONSE_ENOMEM, ascii_error, 0);
 1227     } else {
 1228         out_string(c, ascii_error);
 1229     }
 1230 }
 1231 
 1232 static void append_bin_stats(const char *key, const uint16_t klen,
 1233                              const char *val, const uint32_t vlen,
 1234                              conn *c) {
 1235     char *buf = c->stats.buffer + c->stats.offset;
 1236     uint32_t bodylen = klen + vlen;
 1237     protocol_binary_response_header header = {
 1238         .response.magic = (uint8_t)PROTOCOL_BINARY_RES,
 1239         .response.opcode = PROTOCOL_BINARY_CMD_STAT,
 1240         .response.keylen = (uint16_t)htons(klen),
 1241         .response.datatype = (uint8_t)PROTOCOL_BINARY_RAW_BYTES,
 1242         .response.bodylen = htonl(bodylen),
 1243         .response.opaque = c->opaque
 1244     };
 1245 
 1246     memcpy(buf, header.bytes, sizeof(header.response));
 1247     buf += sizeof(header.response);
 1248 
 1249     if (klen > 0) {
 1250         memcpy(buf, key, klen);
 1251         buf += klen;
 1252 
 1253         if (vlen > 0) {
 1254             memcpy(buf, val, vlen);
 1255         }
 1256     }
 1257 
 1258     c->stats.offset += sizeof(header.response) + bodylen;
 1259 }
 1260 
 1261 static void append_ascii_stats(const char *key, const uint16_t klen,
 1262                                const char *val, const uint32_t vlen,
 1263                                conn *c) {
 1264     char *pos = c->stats.buffer + c->stats.offset;
 1265     uint32_t nbytes = 0;
 1266     int remaining = c->stats.size - c->stats.offset;
 1267     int room = remaining - 1;
 1268 
 1269     if (klen == 0 && vlen == 0) {
 1270         nbytes = snprintf(pos, room, "END\r\n");
 1271     } else if (vlen == 0) {
 1272         nbytes = snprintf(pos, room, "STAT %s\r\n", key);
 1273     } else {
 1274         nbytes = snprintf(pos, room, "STAT %s %s\r\n", key, val);
 1275     }
 1276 
 1277     c->stats.offset += nbytes;
 1278 }
 1279 
 1280 static bool grow_stats_buf(conn *c, size_t needed) {
 1281     size_t nsize = c->stats.size;
 1282     size_t available = nsize - c->stats.offset;
 1283     bool rv = true;
 1284 
 1285     /* Special case: No buffer -- need to allocate fresh */
 1286     if (c->stats.buffer == NULL) {
 1287         nsize = 1024;
 1288         available = c->stats.size = c->stats.offset = 0;
 1289     }
 1290 
 1291     while (needed > available) {
 1292         assert(nsize > 0);
 1293         nsize = nsize << 1;
 1294         available = nsize - c->stats.offset;
 1295     }
 1296 
 1297     if (nsize != c->stats.size) {
 1298         char *ptr = realloc(c->stats.buffer, nsize);
 1299         if (ptr) {
 1300             c->stats.buffer = ptr;
 1301             c->stats.size = nsize;
 1302         } else {
 1303             STATS_LOCK();
 1304             stats.malloc_fails++;
 1305             STATS_UNLOCK();
 1306             rv = false;
 1307         }
 1308     }
 1309 
 1310     return rv;
 1311 }
 1312 
 1313 void append_stats(const char *key, const uint16_t klen,
 1314                   const char *val, const uint32_t vlen,
 1315                   const void *cookie)
 1316 {
 1317     /* value without a key is invalid */
 1318     if (klen == 0 && vlen > 0) {
 1319         return;
 1320     }
 1321 
 1322     conn *c = (conn*)cookie;
 1323 
 1324     if (c->protocol == binary_prot) {
 1325         size_t needed = vlen + klen + sizeof(protocol_binary_response_header);
 1326         if (!grow_stats_buf(c, needed)) {
 1327             return;
 1328         }
 1329         append_bin_stats(key, klen, val, vlen, c);
 1330     } else {
 1331         size_t needed = vlen + klen + 10; // 10 == "STAT = \r\n"
 1332         if (!grow_stats_buf(c, needed)) {
 1333             return;
 1334         }
 1335         append_ascii_stats(key, klen, val, vlen, c);
 1336     }
 1337 
 1338     assert(c->stats.offset <= c->stats.size);
 1339 }
 1340 
 1341 static void reset_cmd_handler(conn *c) {
 1342     c->cmd = -1;
 1343     c->substate = bin_no_state;
 1344     if (c->item != NULL) {
 1345         // TODO: Any other way to get here?
 1346         // SASL auth was mistakenly using it. Nothing else should?
 1347         item_remove(c->item);
 1348         c->item = NULL;
 1349     }
 1350     if (c->rbytes > 0) {
 1351         conn_set_state(c, conn_parse_cmd);
 1352     } else if (c->resp_head) {
 1353         conn_set_state(c, conn_mwrite);
 1354     } else {
 1355         conn_set_state(c, conn_waiting);
 1356     }
 1357 }
 1358 
 1359 static void complete_nread(conn *c) {
 1360     assert(c != NULL);
 1361     assert(c->protocol == ascii_prot
 1362            || c->protocol == binary_prot);
 1363 
 1364     if (c->protocol == ascii_prot) {
 1365         complete_nread_ascii(c);
 1366     } else if (c->protocol == binary_prot) {
 1367         complete_nread_binary(c);
 1368     }
 1369 }
 1370 
 1371 /* Destination must always be chunked */
 1372 /* This should be part of item.c */
/* Copies up to `len` bytes of s_it's value data into the chunked item d_it,
 * appending after d_it's existing data and growing d_it's chunk list on
 * demand. Returns 0 on success, -1 if a new chunk could not be allocated. */
static int _store_item_copy_chunks(item *d_it, item *s_it, const int len) {
    item_chunk *dch = (item_chunk *) ITEM_schunk(d_it);
    /* Advance dch until we find free space */
    while (dch->size == dch->used) {
        if (dch->next) {
            dch = dch->next;
        } else {
            break;
        }
    }

    if (s_it->it_flags & ITEM_CHUNKED) {
        int remain = len;
        item_chunk *sch = (item_chunk *) ITEM_schunk(s_it);
        int copied = 0;
        /* Fills dch's to capacity, not straight copy sch in case data is
         * being added or removed (ie append/prepend)
         */
        while (sch && dch && remain) {
            assert(dch->used <= dch->size);
            /* Copy the smaller of: destination space left, source data left,
             * total bytes remaining. */
            int todo = (dch->size - dch->used < sch->used - copied)
                ? dch->size - dch->used : sch->used - copied;
            if (remain < todo)
                todo = remain;
            memcpy(dch->data + dch->used, sch->data + copied, todo);
            dch->used += todo;
            copied += todo;
            remain -= todo;
            assert(dch->used <= dch->size);
            /* Destination chunk filled: extend the chain for the rest. */
            if (dch->size == dch->used) {
                item_chunk *tch = do_item_alloc_chunk(dch, remain);
                if (tch) {
                    dch = tch;
                } else {
                    return -1;
                }
            }
            assert(copied <= sch->used);
            /* Source chunk drained: move to the next one. */
            if (copied == sch->used) {
                copied = 0;
                sch = sch->next;
            }
        }
        /* assert that the destination had enough space for the source */
        assert(remain == 0);
    } else {
        int done = 0;
        /* Fill dch's via a non-chunked item. */
        while (len > done && dch) {
            int todo = (dch->size - dch->used < len - done)
                ? dch->size - dch->used : len - done;
            //assert(dch->size - dch->used != 0);
            memcpy(dch->data + dch->used, ITEM_data(s_it) + done, todo);
            done += todo;
            dch->used += todo;
            assert(dch->used <= dch->size);
            if (dch->size == dch->used) {
                item_chunk *tch = do_item_alloc_chunk(dch, len - done);
                if (tch) {
                    dch = tch;
                } else {
                    return -1;
                }
            }
        }
        assert(len == done);
    }
    return 0;
}
 1442 
 1443 static int _store_item_copy_data(int comm, item *old_it, item *new_it, item *add_it) {
 1444     if (comm == NREAD_APPEND) {
 1445         if (new_it->it_flags & ITEM_CHUNKED) {
 1446             if (_store_item_copy_chunks(new_it, old_it, old_it->nbytes - 2) == -1 ||
 1447                 _store_item_copy_chunks(new_it, add_it, add_it->nbytes) == -1) {
 1448                 return -1;
 1449             }
 1450         } else {
 1451             memcpy(ITEM_data(new_it), ITEM_data(old_it), old_it->nbytes);
 1452             memcpy(ITEM_data(new_it) + old_it->nbytes - 2 /* CRLF */, ITEM_data(add_it), add_it->nbytes);
 1453         }
 1454     } else {
 1455         /* NREAD_PREPEND */
 1456         if (new_it->it_flags & ITEM_CHUNKED) {
 1457             if (_store_item_copy_chunks(new_it, add_it, add_it->nbytes - 2) == -1 ||
 1458                 _store_item_copy_chunks(new_it, old_it, old_it->nbytes) == -1) {
 1459                 return -1;
 1460             }
 1461         } else {
 1462             memcpy(ITEM_data(new_it), ITEM_data(add_it), add_it->nbytes);
 1463             memcpy(ITEM_data(new_it) + add_it->nbytes - 2 /* CRLF */, ITEM_data(old_it), old_it->nbytes);
 1464         }
 1465     }
 1466     return 0;
 1467 }
 1468 
 1469 /*
 1470  * Stores an item in the cache according to the semantics of one of the set
 1471  * commands. Protected by the item lock.
 1472  *
 1473  * Returns the state of storage.
 1474  */
enum store_item_type do_store_item(item *it, int comm, conn *c, const uint32_t hv) {
    char *key = ITEM_key(it);
    item *old_it = do_item_get(key, it->nkey, hv, c, DONT_UPDATE);
    enum store_item_type stored = NOT_STORED;

    enum cas_result { CAS_NONE, CAS_MATCH, CAS_BADVAL, CAS_STALE, CAS_MISS };

    item *new_it = NULL;
    uint32_t flags;

    /* Do the CAS test up front so we can apply to all store modes */
    enum cas_result cas_res = CAS_NONE;

    bool do_store = false;
    if (old_it != NULL) {
        // Most of the CAS work requires something to compare to.
        uint64_t it_cas = ITEM_get_cas(it);
        uint64_t old_cas = ITEM_get_cas(old_it);
        if (it_cas == 0) {
            cas_res = CAS_NONE;
        } else if (it_cas == old_cas) {
            cas_res = CAS_MATCH;
        } else if (c->set_stale && it_cas < old_cas) {
            /* An older CAS may still replace the value when stale-sets are
             * enabled (c->set_stale). */
            cas_res = CAS_STALE;
        } else {
            cas_res = CAS_BADVAL;
        }

        switch (comm) {
            case NREAD_ADD:
                /* add only adds a nonexistent item, but promote to head of LRU */
                do_item_update(old_it);
                break;
            case NREAD_CAS:
                if (cas_res == CAS_MATCH) {
                    // cas validates
                    // it and old_it may belong to different classes.
                    // I'm updating the stats for the one that's getting pushed out
                    pthread_mutex_lock(&c->thread->stats.mutex);
                    c->thread->stats.slab_stats[ITEM_clsid(old_it)].cas_hits++;
                    pthread_mutex_unlock(&c->thread->stats.mutex);
                    do_store = true;
                } else if (cas_res == CAS_STALE) {
                    // if we're allowed to set a stale value, CAS must be lower than
                    // the current item's CAS.
                    // This replaces the value, but should preserve TTL, and stale
                    // item marker bit + token sent if exists.
                    it->exptime = old_it->exptime;
                    it->it_flags |= ITEM_STALE;
                    if (old_it->it_flags & ITEM_TOKEN_SENT) {
                        it->it_flags |= ITEM_TOKEN_SENT;
                    }

                    pthread_mutex_lock(&c->thread->stats.mutex);
                    c->thread->stats.slab_stats[ITEM_clsid(old_it)].cas_hits++;
                    pthread_mutex_unlock(&c->thread->stats.mutex);
                    do_store = true;
                } else {
                    // NONE or BADVAL are the same for CAS cmd
                    pthread_mutex_lock(&c->thread->stats.mutex);
                    c->thread->stats.slab_stats[ITEM_clsid(old_it)].cas_badval++;
                    pthread_mutex_unlock(&c->thread->stats.mutex);

                    if (settings.verbose > 1) {
                        fprintf(stderr, "CAS:  failure: expected %llu, got %llu\n",
                                (unsigned long long)ITEM_get_cas(old_it),
                                (unsigned long long)ITEM_get_cas(it));
                    }
                    stored = EXISTS;
                }
                break;
            case NREAD_APPEND:
            case NREAD_PREPEND:
                /* append/prepend accept an optional CAS which must match. */
                if (cas_res != CAS_NONE && cas_res != CAS_MATCH) {
                    stored = EXISTS;
                    break;
                }
#ifdef EXTSTORE
                if ((old_it->it_flags & ITEM_HDR) != 0) {
                    /* block append/prepend from working with extstore-d items.
                     * leave response code to NOT_STORED default */
                    break;
                }
#endif
                /* we have it and old_it here - alloc memory to hold both */
                FLAGS_CONV(old_it, flags);
                new_it = do_item_alloc(key, it->nkey, flags, old_it->exptime, it->nbytes + old_it->nbytes - 2 /* CRLF */);

                // OOM trying to copy.
                if (new_it == NULL)
                    break;
                /* copy data from it and old_it to new_it */
                if (_store_item_copy_data(comm, old_it, new_it, it) == -1) {
                    // failed data copy
                    break;
                } else {
                    // refcount of new_it is 1 here. will end up 2 after link.
                    // it's original ref is managed outside of this function
                    it = new_it;
                    do_store = true;
                }
                break;
            case NREAD_REPLACE:
            case NREAD_SET:
                do_store = true;
                break;
        }

        if (do_store) {
            /* Swap the new item in for the old one in the hash table/LRU. */
            STORAGE_delete(c->thread->storage, old_it);
            item_replace(old_it, it, hv);
            stored = STORED;
        }

        do_item_remove(old_it);         /* release our reference */
        if (new_it != NULL) {
            // append/prepend end up with an extra reference for new_it.
            do_item_remove(new_it);
        }
    } else {
        /* No pre-existing item to replace or compare to. */
        if (ITEM_get_cas(it) != 0) {
            /* Asked for a CAS match but nothing to compare it to. */
            cas_res = CAS_MISS;
        }

        switch (comm) {
            case NREAD_ADD:
            case NREAD_SET:
                do_store = true;
                break;
            case NREAD_CAS:
                // LRU expired
                stored = NOT_FOUND;
                pthread_mutex_lock(&c->thread->stats.mutex);
                c->thread->stats.cas_misses++;
                pthread_mutex_unlock(&c->thread->stats.mutex);
                break;
            case NREAD_REPLACE:
            case NREAD_APPEND:
            case NREAD_PREPEND:
                /* Requires an existing item. */
                break;
        }

        if (do_store) {
            do_item_link(it, hv);
            stored = STORED;
        }
    }

    /* On success report the stored item's CAS back to the client. */
    if (stored == STORED) {
        c->cas = ITEM_get_cas(it);
    }
    LOGGER_LOG(c->thread->l, LOG_MUTATIONS, LOGGER_ITEM_STORE, NULL,
            stored, comm, ITEM_key(it), it->nkey, it->exptime, ITEM_clsid(it), c->sfd);

    return stored;
}
 1634 
 1635 /* set up a connection to write a buffer then free it, used for stats */
 1636 void write_and_free(conn *c, char *buf, int bytes) {
 1637     if (buf) {
 1638         mc_resp *resp = c->resp;
 1639         resp->write_and_free = buf;
 1640         resp_add_iov(resp, buf, bytes);
 1641         conn_set_state(c, conn_new_cmd);
 1642     } else {
 1643         out_of_memory(c, "SERVER_ERROR out of memory writing stats");
 1644     }
 1645 }
 1646 
 1647 void append_stat(const char *name, ADD_STAT add_stats, conn *c,
 1648                  const char *fmt, ...) {
 1649     char val_str[STAT_VAL_LEN];
 1650     int vlen;
 1651     va_list ap;
 1652 
 1653     assert(name);
 1654     assert(add_stats);
 1655     assert(c);
 1656     assert(fmt);
 1657 
 1658     va_start(ap, fmt);
 1659     vlen = vsnprintf(val_str, sizeof(val_str) - 1, fmt, ap);
 1660     va_end(ap);
 1661 
 1662     add_stats(name, strlen(name), val_str, vlen, c);
 1663 }
 1664 
/* Emit server-wide statistics (plain "stats" command) through add_stats.
 * Per-thread and per-slab counters are aggregated into local snapshots
 * first, then every stat is streamed while holding STATS_LOCK. */
void server_stats(ADD_STAT add_stats, conn *c) {
    pid_t pid = getpid();
    rel_time_t now = current_time;

    /* Fold all worker threads' counters into one consistent snapshot. */
    struct thread_stats thread_stats;
    threadlocal_stats_aggregate(&thread_stats);
    struct slab_stats slab_stats;
    slab_stats_aggregate(&thread_stats, &slab_stats);
#ifndef WIN32
    struct rusage usage;
    getrusage(RUSAGE_SELF, &usage);
#endif /* !WIN32 */

    STATS_LOCK();

    APPEND_STAT("pid", "%lu", (long)pid);
    /* NOTE(review): uptime is current_time minus ITEM_UPDATE_INTERVAL --
     * the clock bias is established where current_time is maintained;
     * confirm there before changing. */
    APPEND_STAT("uptime", "%u", now - ITEM_UPDATE_INTERVAL);
    APPEND_STAT("time", "%ld", now + (long)process_started);
    APPEND_STAT("version", "%s", VERSION);
    APPEND_STAT("libevent", "%s", event_get_version());
    APPEND_STAT("pointer_size", "%d", (int)(8 * sizeof(void *)));

#ifndef WIN32
    /* CPU usage as seconds.microseconds, split into user and system time. */
    append_stat("rusage_user", add_stats, c, "%ld.%06ld",
                (long)usage.ru_utime.tv_sec,
                (long)usage.ru_utime.tv_usec);
    append_stat("rusage_system", add_stats, c, "%ld.%06ld",
                (long)usage.ru_stime.tv_sec,
                (long)usage.ru_stime.tv_usec);
#endif /* !WIN32 */

    APPEND_STAT("max_connections", "%d", settings.maxconns);
    /* NOTE(review): -1 presumably excludes the connection issuing this
     * stats request; confirm against where curr_conns is incremented. */
    APPEND_STAT("curr_connections", "%llu", (unsigned long long)stats_state.curr_conns - 1);
    APPEND_STAT("total_connections", "%llu", (unsigned long long)stats.total_conns);
    if (settings.maxconns_fast) {
        APPEND_STAT("rejected_connections", "%llu", (unsigned long long)stats.rejected_conns);
    }
    APPEND_STAT("connection_structures", "%u", stats_state.conn_structs);
    APPEND_STAT("response_obj_oom", "%llu", (unsigned long long)thread_stats.response_obj_oom);
    APPEND_STAT("response_obj_count", "%llu", (unsigned long long)thread_stats.response_obj_count);
    APPEND_STAT("response_obj_bytes", "%llu", (unsigned long long)thread_stats.response_obj_bytes);
    APPEND_STAT("read_buf_count", "%llu", (unsigned long long)thread_stats.read_buf_count);
    APPEND_STAT("read_buf_bytes", "%llu", (unsigned long long)thread_stats.read_buf_bytes);
    APPEND_STAT("read_buf_bytes_free", "%llu", (unsigned long long)thread_stats.read_buf_bytes_free);
    APPEND_STAT("read_buf_oom", "%llu", (unsigned long long)thread_stats.read_buf_oom);
    APPEND_STAT("reserved_fds", "%u", stats_state.reserved_fds);
    /* Command/hit/miss counters; hit counters come from the slab snapshot,
     * miss counters from the per-thread snapshot. */
    APPEND_STAT("cmd_get", "%llu", (unsigned long long)thread_stats.get_cmds);
    APPEND_STAT("cmd_set", "%llu", (unsigned long long)slab_stats.set_cmds);
    APPEND_STAT("cmd_flush", "%llu", (unsigned long long)thread_stats.flush_cmds);
    APPEND_STAT("cmd_touch", "%llu", (unsigned long long)thread_stats.touch_cmds);
    APPEND_STAT("cmd_meta", "%llu", (unsigned long long)thread_stats.meta_cmds);
    APPEND_STAT("get_hits", "%llu", (unsigned long long)slab_stats.get_hits);
    APPEND_STAT("get_misses", "%llu", (unsigned long long)thread_stats.get_misses);
    APPEND_STAT("get_expired", "%llu", (unsigned long long)thread_stats.get_expired);
    APPEND_STAT("get_flushed", "%llu", (unsigned long long)thread_stats.get_flushed);
#ifdef EXTSTORE
    /* External-storage counters only apply when a storage engine is
     * attached to the worker threads. */
    if (c->thread->storage) {
        APPEND_STAT("get_extstore", "%llu", (unsigned long long)thread_stats.get_extstore);
        APPEND_STAT("get_aborted_extstore", "%llu", (unsigned long long)thread_stats.get_aborted_extstore);
        APPEND_STAT("get_oom_extstore", "%llu", (unsigned long long)thread_stats.get_oom_extstore);
        APPEND_STAT("recache_from_extstore", "%llu", (unsigned long long)thread_stats.recache_from_extstore);
        APPEND_STAT("miss_from_extstore", "%llu", (unsigned long long)thread_stats.miss_from_extstore);
        APPEND_STAT("badcrc_from_extstore", "%llu", (unsigned long long)thread_stats.badcrc_from_extstore);
    }
#endif
    APPEND_STAT("delete_misses", "%llu", (unsigned long long)thread_stats.delete_misses);
    APPEND_STAT("delete_hits", "%llu", (unsigned long long)slab_stats.delete_hits);
    APPEND_STAT("incr_misses", "%llu", (unsigned long long)thread_stats.incr_misses);
    APPEND_STAT("incr_hits", "%llu", (unsigned long long)slab_stats.incr_hits);
    APPEND_STAT("decr_misses", "%llu", (unsigned long long)thread_stats.decr_misses);
    APPEND_STAT("decr_hits", "%llu", (unsigned long long)slab_stats.decr_hits);
    APPEND_STAT("cas_misses", "%llu", (unsigned long long)thread_stats.cas_misses);
    APPEND_STAT("cas_hits", "%llu", (unsigned long long)slab_stats.cas_hits);
    APPEND_STAT("cas_badval", "%llu", (unsigned long long)slab_stats.cas_badval);
    APPEND_STAT("touch_hits", "%llu", (unsigned long long)slab_stats.touch_hits);
    APPEND_STAT("touch_misses", "%llu", (unsigned long long)thread_stats.touch_misses);
    APPEND_STAT("auth_cmds", "%llu", (unsigned long long)thread_stats.auth_cmds);
    APPEND_STAT("auth_errors", "%llu", (unsigned long long)thread_stats.auth_errors);
    if (settings.idle_timeout) {
        APPEND_STAT("idle_kicks", "%llu", (unsigned long long)thread_stats.idle_kicks);
    }
    APPEND_STAT("bytes_read", "%llu", (unsigned long long)thread_stats.bytes_read);
    APPEND_STAT("bytes_written", "%llu", (unsigned long long)thread_stats.bytes_written);
    APPEND_STAT("limit_maxbytes", "%llu", (unsigned long long)settings.maxbytes);
    APPEND_STAT("accepting_conns", "%u", stats_state.accepting_conns);
    APPEND_STAT("listen_disabled_num", "%llu", (unsigned long long)stats.listen_disabled_num);
    APPEND_STAT("time_in_listen_disabled_us", "%llu", stats.time_in_listen_disabled_us);
    APPEND_STAT("threads", "%d", settings.num_threads);
    APPEND_STAT("conn_yields", "%llu", (unsigned long long)thread_stats.conn_yields);
    APPEND_STAT("hash_power_level", "%u", stats_state.hash_power_level);
    APPEND_STAT("hash_bytes", "%llu", (unsigned long long)stats_state.hash_bytes);
    APPEND_STAT("hash_is_expanding", "%u", stats_state.hash_is_expanding);
    /* Slab page mover / LRU maintenance counters, only when enabled. */
    if (settings.slab_reassign) {
        APPEND_STAT("slab_reassign_rescues", "%llu", stats.slab_reassign_rescues);
        APPEND_STAT("slab_reassign_chunk_rescues", "%llu", stats.slab_reassign_chunk_rescues);
        APPEND_STAT("slab_reassign_evictions_nomem", "%llu", stats.slab_reassign_evictions_nomem);
        APPEND_STAT("slab_reassign_inline_reclaim", "%llu", stats.slab_reassign_inline_reclaim);
        APPEND_STAT("slab_reassign_busy_items", "%llu", stats.slab_reassign_busy_items);
        APPEND_STAT("slab_reassign_busy_deletes", "%llu", stats.slab_reassign_busy_deletes);
        APPEND_STAT("slab_reassign_running", "%u", stats_state.slab_reassign_running);
        APPEND_STAT("slabs_moved", "%llu", stats.slabs_moved);
    }
    if (settings.lru_crawler) {
        APPEND_STAT("lru_crawler_running", "%u", stats_state.lru_crawler_running);
        APPEND_STAT("lru_crawler_starts", "%u", stats.lru_crawler_starts);
    }
    if (settings.lru_maintainer_thread) {
        APPEND_STAT("lru_maintainer_juggles", "%llu", (unsigned long long)stats.lru_maintainer_juggles);
    }
    APPEND_STAT("malloc_fails", "%llu",
                (unsigned long long)stats.malloc_fails);
    APPEND_STAT("log_worker_dropped", "%llu", (unsigned long long)stats.log_worker_dropped);
    APPEND_STAT("log_worker_written", "%llu", (unsigned long long)stats.log_worker_written);
    APPEND_STAT("log_watcher_skipped", "%llu", (unsigned long long)stats.log_watcher_skipped);
    APPEND_STAT("log_watcher_sent", "%llu", (unsigned long long)stats.log_watcher_sent);
    STATS_UNLOCK();
#ifdef EXTSTORE
    /* storage_stats takes its own locks; must run outside STATS_LOCK. */
    storage_stats(add_stats, c);
#endif
#ifdef TLS
    if (settings.ssl_enabled) {
        if (settings.ssl_session_cache) {
            APPEND_STAT("ssl_new_sessions", "%llu", (unsigned long long)stats.ssl_new_sessions);
        }
        APPEND_STAT("ssl_handshake_errors", "%llu", (unsigned long long)stats.ssl_handshake_errors);
        APPEND_STAT("time_since_server_cert_refresh", "%u", now - settings.ssl_last_cert_refresh_time);
    }
#endif
    APPEND_STAT("unexpected_napi_ids", "%llu", (unsigned long long)stats.unexpected_napi_ids);
    APPEND_STAT("round_robin_fallback", "%llu", (unsigned long long)stats.round_robin_fallback);
}
 1797 
 1798 void process_stat_settings(ADD_STAT add_stats, void *c) {
 1799     assert(add_stats);
 1800     APPEND_STAT("maxbytes", "%llu", (unsigned long long)settings.maxbytes);
 1801     APPEND_STAT("maxconns", "%d", settings.maxconns);
 1802     APPEND_STAT("tcpport", "%d", settings.port);
 1803     APPEND_STAT("udpport", "%d", settings.udpport);
 1804     APPEND_STAT("inter", "%s", settings.inter ? settings.inter : "NULL");
 1805     APPEND_STAT("verbosity", "%d", settings.verbose);
 1806     APPEND_STAT("oldest", "%lu", (unsigned long)settings.oldest_live);
 1807     APPEND_STAT("evictions", "%s", settings.evict_to_free ? "on" : "off");
 1808     APPEND_STAT("domain_socket", "%s",
 1809                 settings.socketpath ? settings.socketpath : "NULL");
 1810     APPEND_STAT("umask", "%o", settings.access);
 1811     APPEND_STAT("growth_factor", "%.2f", settings.factor);
 1812     APPEND_STAT("chunk_size", "%d", settings.chunk_size);
 1813     APPEND_STAT("num_threads", "%d", settings.num_threads);
 1814     APPEND_STAT("num_threads_per_udp", "%d", settings.num_threads_per_udp);
 1815     APPEND_STAT("stat_key_prefix", "%c", settings.prefix_delimiter);
 1816     APPEND_STAT("detail_enabled", "%s",
 1817                 settings.detail_enabled ? "yes" : "no");
 1818     APPEND_STAT("reqs_per_event", "%d", settings.reqs_per_event);
 1819     APPEND_STAT("cas_enabled", "%s", settings.use_cas ? "yes" : "no");
 1820     APPEND_STAT("tcp_backlog", "%d", settings.backlog);
 1821     APPEND_STAT("binding_protocol", "%s",
 1822                 prot_text(settings.binding_protocol));
 1823     APPEND_STAT("auth_enabled_sasl", "%s", settings.sasl ? "yes" : "no");
 1824     APPEND_STAT("auth_enabled_ascii", "%s", settings.auth_file ? settings.auth_file : "no");
 1825     APPEND_STAT("item_size_max", "%d", settings.item_size_max);
 1826     APPEND_STAT("maxconns_fast", "%s", settings.maxconns_fast ? "yes" : "no");
 1827     APPEND_STAT("hashpower_init", "%d", settings.hashpower_init);
 1828     APPEND_STAT("slab_reassign", "%s", settings.slab_reassign ? "yes" : "no");
 1829     APPEND_STAT("slab_automove", "%d", settings.slab_automove);
 1830     APPEND_STAT("slab_automove_ratio", "%.2f", settings.slab_automove_ratio);
 1831     APPEND_STAT("slab_automove_window", "%u", settings.slab_automove_window);
 1832     APPEND_STAT("slab_chunk_max", "%d", settings.slab_chunk_size_max);
 1833     APPEND_STAT("lru_crawler", "%s", settings.lru_crawler ? "yes" : "no");
 1834     APPEND_STAT("lru_crawler_sleep", "%d", settings.lru_crawler_sleep);
 1835     APPEND_STAT("lru_crawler_tocrawl", "%lu", (unsigned long)settings.lru_crawler_tocrawl);
 1836     APPEND_STAT("tail_repair_time", "%d", settings.tail_repair_time);
 1837     APPEND_STAT("flush_enabled", "%s", settings.flush_enabled ? "yes" : "no");
 1838     APPEND_STAT("dump_enabled", "%s", settings.dump_enabled ? "yes" : "no");
 1839     APPEND_STAT("hash_algorithm", "%s", settings.hash_algorithm);
 1840     APPEND_STAT("lru_maintainer_thread", "%s", settings.lru_maintainer_thread ? "yes" : "no");
 1841     APPEND_STAT("lru_segmented", "%s", settings.lru_segmented ? "yes" : "no");
 1842     APPEND_STAT("hot_lru_pct", "%d", settings.hot_lru_pct);
 1843     APPEND_STAT("warm_lru_pct", "%d", settings.warm_lru_pct);
 1844     APPEND_STAT("hot_max_factor", "%.2f", settings.hot_max_factor);
 1845     APPEND_STAT("warm_max_factor", "%.2f", settings.warm_max_factor);
 1846     APPEND_STAT("temp_lru", "%s", settings.temp_lru ? "yes" : "no");
 1847     APPEND_STAT("temporary_ttl", "%u", settings.temporary_ttl);
 1848     APPEND_STAT("idle_timeout", "%d", settings.idle_timeout);
 1849     APPEND_STAT("watcher_logbuf_size", "%u", settings.logger_watcher_buf_size);
 1850     APPEND_STAT("worker_logbuf_size", "%u", settings.logger_buf_size);
 1851     APPEND_STAT("read_buf_mem_limit", "%u", settings.read_buf_mem_limit);
 1852     APPEND_STAT("track_sizes", "%s", item_stats_sizes_status() ? "yes" : "no");
 1853     APPEND_STAT("inline_ascii_response", "%s", "no"); // setting is dead, cannot be yes.
 1854 #ifdef HAVE_DROP_PRIVILEGES
 1855     APPEND_STAT("drop_privileges", "%s", settings.drop_privileges ? "yes" : "no");
 1856 #endif
 1857 #ifdef EXTSTORE
 1858     APPEND_STAT("ext_item_size", "%u", settings.ext_item_size);
 1859     APPEND_STAT("ext_item_age", "%u", settings.ext_item_age);
 1860     APPEND_STAT("ext_low_ttl", "%u", settings.ext_low_ttl);
 1861     APPEND_STAT("ext_recache_rate", "%u", settings.ext_recache_rate);
 1862     APPEND_STAT("ext_wbuf_size", "%u", settings.ext_wbuf_size);
 1863     APPEND_STAT("ext_compact_under", "%u", settings.ext_compact_under);
 1864     APPEND_STAT("ext_drop_under", "%u", settings.ext_drop_under);
 1865     APPEND_STAT("ext_max_frag", "%.2f", settings.ext_max_frag);
 1866     APPEND_STAT("slab_automove_freeratio", "%.3f", settings.slab_automove_freeratio);
 1867     APPEND_STAT("ext_drop_unread", "%s", settings.ext_drop_unread ? "yes" : "no");
 1868 #endif
 1869 #ifdef TLS
 1870     APPEND_STAT("ssl_enabled", "%s", settings.ssl_enabled ? "yes" : "no");
 1871     APPEND_STAT("ssl_chain_cert", "%s", settings.ssl_chain_cert);
 1872     APPEND_STAT("ssl_key", "%s", settings.ssl_key);
 1873     APPEND_STAT("ssl_verify_mode", "%d", settings.ssl_verify_mode);
 1874     APPEND_STAT("ssl_keyformat", "%d", settings.ssl_keyformat);
 1875     APPEND_STAT("ssl_ciphers", "%s", settings.ssl_ciphers ? settings.ssl_ciphers : "NULL");
 1876     APPEND_STAT("ssl_ca_cert", "%s", settings.ssl_ca_cert ? settings.ssl_ca_cert : "NULL");
 1877     APPEND_STAT("ssl_wbuf_size", "%u", settings.ssl_wbuf_size);
 1878     APPEND_STAT("ssl_session_cache", "%s", settings.ssl_session_cache ? "yes" : "no");
 1879 #endif
 1880     APPEND_STAT("num_napi_ids", "%s", settings.num_napi_ids);
 1881     APPEND_STAT("memory_file", "%s", settings.memory_file);
 1882 }
 1883 
 1884 static int nz_strcmp(int nzlength, const char *nz, const char *z) {
 1885     int zlength=strlen(z);
 1886     return (zlength == nzlength) && (strncmp(nz, z, zlength) == 0) ? 0 : -1;
 1887 }
 1888 
 1889 bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c) {
 1890     bool ret = true;
 1891 
 1892     if (add_stats != NULL) {
 1893         if (!stat_type) {
 1894             /* prepare general statistics for the engine */
 1895             STATS_LOCK();
 1896             APPEND_STAT("bytes", "%llu", (unsigned long long)stats_state.curr_bytes);
 1897             APPEND_STAT("curr_items", "%llu", (unsigned long long)stats_state.curr_items);
 1898             APPEND_STAT("total_items", "%llu", (unsigned long long)stats.total_items);
 1899             STATS_UNLOCK();
 1900             APPEND_STAT("slab_global_page_pool", "%u", global_page_pool_size(NULL));
 1901             item_stats_totals(add_stats, c);
 1902         } else if (nz_strcmp(nkey, stat_type, "items") == 0) {
 1903             item_stats(add_stats, c);
 1904         } else if (nz_strcmp(nkey, stat_type, "slabs") == 0) {
 1905             slabs_stats(add_stats, c);
 1906         } else if (nz_strcmp(nkey, stat_type, "sizes") == 0) {
 1907             item_stats_sizes(add_stats, c);
 1908         } else if (nz_strcmp(nkey, stat_type, "sizes_enable") == 0) {
 1909             item_stats_sizes_enable(add_stats, c);
 1910         } else if (nz_strcmp(nkey, stat_type, "sizes_disable") == 0) {
 1911             item_stats_sizes_disable(add_stats, c);
 1912         } else {
 1913             ret = false;
 1914         }
 1915     } else {
 1916         ret = false;
 1917     }
 1918 
 1919     return ret;
 1920 }
 1921 
/* Format a connection endpoint into 'addr' as "proto:address" or
 * "proto:address:port", based on the address family 'af' of sock_addr.
 * 'addr' must be large enough for MAXPATHLEN plus the protocol prefix and
 * port suffix (see the buffers sized in process_stats_conns). Unknown
 * families produce an "<AF n>" placeholder. */
static inline void get_conn_text(const conn *c, const int af,
                char* addr, struct sockaddr *sock_addr) {
    char addr_text[MAXPATHLEN];
    addr_text[0] = '\0';
    const char *protoname = "?";
    unsigned short port = 0;

    switch (af) {
        case AF_INET:
            /* Truncation of oversized output is tolerated; the buffer is
             * left NUL-terminated either way. */
            (void) inet_ntop(af,
                    &((struct sockaddr_in *)sock_addr)->sin_addr,
                    addr_text,
                    sizeof(addr_text) - 1);
            port = ntohs(((struct sockaddr_in *)sock_addr)->sin_port);
            protoname = IS_UDP(c->transport) ? "udp" : "tcp";
            break;

        case AF_INET6:
            /* IPv6 addresses are wrapped in brackets: [addr]:port. The
             * address is written after the '[', and ']' is appended only
             * if inet_ntop succeeded. */
            addr_text[0] = '[';
            addr_text[1] = '\0';
            if (inet_ntop(af,
                    &((struct sockaddr_in6 *)sock_addr)->sin6_addr,
                    addr_text + 1,
                    sizeof(addr_text) - 2)) {
                strcat(addr_text, "]");
            }
            port = ntohs(((struct sockaddr_in6 *)sock_addr)->sin6_port);
            protoname = IS_UDP(c->transport) ? "udp6" : "tcp6";
            break;

#ifndef DISABLE_UNIX_SOCKET
        case AF_UNIX:
        {
            size_t pathlen = 0;
            // this strncpy call originally could piss off an address
            // sanitizer; we supplied the size of the dest buf as a limiter,
            // but optimized versions of strncpy could read past the end of
            // *src while looking for a null terminator. Since buf and
            // sun_path here are both on the stack they could even overlap,
            // which is "undefined". In all OSS versions of strncpy I could
            // find this has no effect; it'll still only copy until the first null
            // terminator is found. Thus it's possible to get the OS to
            // examine past the end of sun_path but it's unclear to me if this
            // can cause any actual problem.
            //
            // We need a safe_strncpy util function but I'll punt on figuring
            // that out for now.
            /* Clamp the copy to the destination buffer, then force a
             * terminator since strncpy doesn't guarantee one. */
            pathlen = sizeof(((struct sockaddr_un *)sock_addr)->sun_path);
            if (MAXPATHLEN <= pathlen) {
                pathlen = MAXPATHLEN - 1;
            }
            strncpy(addr_text,
                    ((struct sockaddr_un *)sock_addr)->sun_path,
                    pathlen);
            addr_text[pathlen] = '\0';
            protoname = "unix";
        }
            break;
#endif /* #ifndef DISABLE_UNIX_SOCKET */
    }

    if (strlen(addr_text) < 2) {
        /* Most likely this is a connected UNIX-domain client which
         * has no peer socket address, but there's no portable way
         * to tell for sure.
         */
        sprintf(addr_text, "<AF %d>", af);
    }

    if (port) {
        sprintf(addr, "%s:%s:%u", protoname, addr_text, port);
    } else {
        sprintf(addr, "%s:%s", protoname, addr_text);
    }
}
 1997 
 1998 static void conn_to_str(const conn *c, char *addr, char *svr_addr) {
 1999     if (!c) {
 2000         strcpy(addr, "<null>");
 2001     } else if (c->state == conn_closed) {
 2002         strcpy(addr, "<closed>");
 2003     } else {
 2004         struct sockaddr_in6 local_addr;
 2005         struct sockaddr *sock_addr = (void *)&c->request_addr;
 2006 
 2007         /* For listen ports and idle UDP ports, show listen address */
 2008         if (c->state == conn_listening ||
 2009                 (IS_UDP(c->transport) &&
 2010                  c->state == conn_read)) {
 2011             socklen_t local_addr_len = sizeof(local_addr);
 2012 
 2013             if (getsockname(c->sfd,
 2014                         (struct sockaddr *)&local_addr,
 2015                         &local_addr_len) == 0) {
 2016                 sock_addr = (struct sockaddr *)&local_addr;
 2017             }
 2018         }
 2019         get_conn_text(c, sock_addr->sa_family, addr, sock_addr);
 2020 
 2021         if (c->state != conn_listening && !(IS_UDP(c->transport) &&
 2022                  c->state == conn_read)) {
 2023             struct sockaddr_storage svr_sock_addr;
 2024             socklen_t svr_addr_len = sizeof(svr_sock_addr);
 2025             getsockname(c->sfd, (struct sockaddr *)&svr_sock_addr, &svr_addr_len);
 2026             get_conn_text(c, svr_sock_addr.ss_family, svr_addr, (struct sockaddr *)&svr_sock_addr);
 2027         }
 2028     }
 2029 }
 2030 
/* "stats conns": emit per-connection info (address, state, idle time) for
 * every allocated connection structure.
 * NOTE(review): the APPEND_NUM_STAT macro presumably expands to use the
 * otherwise-unreferenced locals key_str/val_str/klen/vlen below -- confirm
 * in memcached.h before renaming any of them. */
void process_stats_conns(ADD_STAT add_stats, void *c) {
    int i;
    char key_str[STAT_KEY_LEN];
    char val_str[STAT_VAL_LEN];
    /* Extra room around MAXPATHLEN for the "proto:" prefix and ":port"
     * suffix that get_conn_text adds. */
    size_t extras_len = sizeof("unix:") + sizeof("65535");
    char addr[MAXPATHLEN + extras_len];
    char svr_addr[MAXPATHLEN + extras_len];
    int klen = 0, vlen = 0;

    assert(add_stats);

    for (i = 0; i < max_fds; i++) {
        if (conns[i]) {
            /* This is safe to do unlocked because conns are never freed; the
             * worst that'll happen will be a minor inconsistency in the
             * output -- not worth the complexity of the locking that'd be
             * required to prevent it.
             */
            if (IS_UDP(conns[i]->transport)) {
                APPEND_NUM_STAT(i, "UDP", "%s", "UDP");
            }
            if (conns[i]->state != conn_closed) {
                conn_to_str(conns[i], addr, svr_addr);

                APPEND_NUM_STAT(i, "addr", "%s", addr);
                /* Listening / idle-UDP conns already report the listen
                 * address as "addr"; don't duplicate it (conn_to_str
                 * leaves svr_addr untouched in that case). */
                if (conns[i]->state != conn_listening &&
                    !(IS_UDP(conns[i]->transport) && conns[i]->state == conn_read)) {
                    APPEND_NUM_STAT(i, "listen_addr", "%s", svr_addr);
                }
                APPEND_NUM_STAT(i, "state", "%s",
                        state_text(conns[i]->state));
                APPEND_NUM_STAT(i, "secs_since_last_cmd", "%d",
                        current_time - conns[i]->last_cmd_time);
            }
        }
    }
}
 2068 
 2069 #define IT_REFCOUNT_LIMIT 60000
 2070 item* limited_get(char *key, size_t nkey, conn *c, uint32_t exptime, bool should_touch, bool do_update, bool *overflow) {
 2071     item *it;
 2072     if (should_touch) {
 2073         it = item_touch(key, nkey, exptime, c);
 2074     } else {
 2075         it = item_get(key, nkey, c, do_update);
 2076     }
 2077     if (it && it->refcount > IT_REFCOUNT_LIMIT) {
 2078         item_remove(it);
 2079         it = NULL;
 2080         *overflow = true;
 2081     } else {
 2082         *overflow = false;
 2083     }
 2084     return it;
 2085 }
 2086 
 2087 // Semantics are different than limited_get; since the item is returned
 2088 // locked, caller can directly change what it needs.
 2089 // though it might eventually be a better interface to sink it all into
 2090 // items.c.
 2091 item* limited_get_locked(char *key, size_t nkey, conn *c, bool do_update, uint32_t *hv, bool *overflow) {
 2092     item *it;
 2093     it = item_get_locked(key, nkey, c, do_update, hv);
 2094     if (it && it->refcount > IT_REFCOUNT_LIMIT) {
 2095         do_item_remove(it);
 2096         it = NULL;
 2097         item_unlock(*hv);
 2098         *overflow = true;
 2099     } else {
 2100         *overflow = false;
 2101     }
 2102     return it;
 2103 }
 2104 
 2105 /*
 2106  * adds a delta value to a numeric item.
 2107  *
 2108  * c     connection requesting the operation
 2109  * it    item to adjust
 2110  * incr  true to increment value, false to decrement
 2111  * delta amount to adjust value by
 2112  * buf   buffer for response string
 2113  *
 2114  * returns a response string to send back to the client.
 2115  */
 2116 enum delta_result_type do_add_delta(conn *c, const char *key, const size_t nkey,
 2117                                     const bool incr, const int64_t delta,
 2118                                     char *buf, uint64_t *cas,
 2119                                     const uint32_t hv,
 2120                                     item **it_ret) {
 2121     char *ptr;
 2122     uint64_t value;
 2123     int res;
 2124     item *it;
 2125 
 2126     it = do_item_get(key, nkey, hv, c, DONT_UPDATE);
 2127     if (!it) {
 2128         return DELTA_ITEM_NOT_FOUND;
 2129     }
 2130 
 2131     /* Can't delta zero byte values. 2-byte are the "\r\n" */
 2132     /* Also can't delta for chunked items. Too large to be a number */
 2133 #ifdef EXTSTORE
 2134     if (it->nbytes <= 2 || (it->it_flags & (ITEM_CHUNKED|ITEM_HDR)) != 0) {
 2135 #else
 2136     if (it->nbytes <= 2 || (it->it_flags & (ITEM_CHUNKED)) != 0) {
 2137 #endif
 2138         do_item_remove(it);
 2139         return NON_NUMERIC;
 2140     }
 2141 
 2142     if (cas != NULL && *cas != 0 && ITEM_get_cas(it) != *cas) {
 2143         do_item_remove(it);
 2144         return DELTA_ITEM_CAS_MISMATCH;
 2145     }
 2146 
 2147     ptr = ITEM_data(it);
 2148 
 2149     if (!safe_strtoull(ptr, &value)) {
 2150         do_item_remove(it);
 2151         return NON_NUMERIC;
 2152     }
 2153 
 2154     if (incr) {
 2155         value += delta;
 2156         MEMCACHED_COMMAND_INCR(c->sfd, ITEM_key(it), it->nkey, value);
 2157     } else {
 2158         if(delta > value) {
 2159             value = 0;
 2160         } else {
 2161             value -= delta;
 2162         }
 2163         MEMCACHED_COMMAND_DECR(c->sfd, ITEM_key(it), it->nkey, value);
 2164     }
 2165 
 2166     pthread_mutex_lock(&c->thread->stats.mutex);
 2167     if (incr) {
 2168         c->thread->stats.slab_stats[ITEM_clsid(it)].incr_hits++;
 2169     } else {
 2170         c->thread->stats.slab_stats[ITEM_clsid(it)].decr_hits++;
 2171     }
 2172     pthread_mutex_unlock(&c->thread->stats.mutex);
 2173 
 2174     itoa_u64(value, buf);
 2175     res = strlen(buf);
 2176     /* refcount == 2 means we are the only ones holding the item, and it is
 2177      * linked. We hold the item's lock in this function, so refcount cannot
 2178      * increase. */
 2179     if (res + 2 <= it->nbytes && it->refcount == 2) { /* replace in-place */
 2180         /* When changing the value without replacing the item, we
 2181            need to update the CAS on the existing item. */
 2182         /* We also need to fiddle it in the sizes tracker in case the tracking
 2183          * was enabled at runtime, since it relies on the CAS value to know
 2184          * whether to remove an item or not. */
 2185         item_stats_sizes_remove(it);
 2186         ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0);
 2187         item_stats_sizes_add(it);
 2188         memcpy(ITEM_data(it), buf, res);
 2189         memset(ITEM_data(it) + res, ' ', it->nbytes - res - 2);
 2190         do_item_update(it);
 2191     } else if (it->refcount > 1) {
 2192         item *new_it;
 2193         uint32_t flags;
 2194         FLAGS_CONV(it, flags);
 2195         new_it = do_item_alloc(ITEM_key(it), it->nkey, flags, it->exptime, res + 2);
 2196         if (new_it == 0) {
 2197             do_item_remove(it);
 2198             return EOM;
 2199         }
 2200         memcpy(ITEM_data(new_it), buf, res);
 2201         memcpy(ITEM_data(new_it) + res, "\r\n", 2);
 2202         item_replace(it, new_it, hv);
 2203         // Overwrite the older item's CAS with our new CAS since we're
 2204         // returning the CAS of the old item below.
 2205         ITEM_set_cas(it, (settings.use_cas) ? ITEM_get_cas(new_it) : 0);
 2206         do_item_remove(new_it);       /* release our reference */
 2207     } else {
 2208         /* Should never get here. This means we somehow fetched an unlinked
 2209          * item. TODO: Add a counter? */
 2210         if (settings.verbose) {
 2211             fprintf(stderr, "Tried to do incr/decr on invalid item\n");
 2212         }
 2213         if (it->refcount == 1)
 2214             do_item_remove(it);
 2215         return DELTA_ITEM_NOT_FOUND;
 2216     }
 2217 
 2218     if (cas) {
 2219         *cas = ITEM_get_cas(it);    /* swap the incoming CAS value */
 2220     }
 2221     if (it_ret != NULL) {
 2222         *it_ret = it;
 2223     } else {
 2224         do_item_remove(it);         /* release our reference */
 2225     }
 2226     return OK;
 2227 }
 2228 
 2229 static int try_read_command_negotiate(conn *c) {
 2230     assert(c->protocol == negotiating_prot);
 2231     assert(c != NULL);
 2232     assert(c->rcurr <= (c->rbuf + c->rsize));
 2233     assert(c->rbytes > 0);
 2234 
 2235     if ((unsigned char)c->rbuf[0] == (unsigned char)PROTOCOL_BINARY_REQ) {
 2236         c->protocol = binary_prot;
 2237         c->try_read_command = try_read_command_binary;
 2238     } else {
 2239         // authentication doesn't work with negotiated protocol.
 2240         c->protocol = ascii_prot;
 2241         c->try_read_command = try_read_command_ascii;
 2242     }
 2243 
 2244     if (settings.verbose > 1) {
 2245         fprintf(stderr, "%d: Client using the %s protocol\n", c->sfd,
 2246                 prot_text(c->protocol));
 2247     }
 2248 
 2249     return c->try_read_command(c);
 2250 }
 2251 
 2252 static int try_read_command_udp(conn *c) {
 2253     assert(c != NULL);
 2254     assert(c->rcurr <= (c->rbuf + c->rsize));
 2255     assert(c->rbytes > 0);
 2256 
 2257     if ((unsigned char)c->rbuf[0] == (unsigned char)PROTOCOL_BINARY_REQ) {
 2258         c->protocol = binary_prot;
 2259         return try_read_command_binary(c);
 2260     } else {
 2261         c->protocol = ascii_prot;
 2262         return try_read_command_ascii(c);
 2263     }
 2264 }
 2265 
 2266 /*
 2267  * read a UDP request.
 2268  */
 2269 static enum try_read_result try_read_udp(conn *c) {
 2270     int res;
 2271 
 2272     assert(c != NULL);
 2273 
 2274     c->request_addr_size = sizeof(c->request_addr);
 2275     res = recvfrom(c->sfd, c->rbuf, c->rsize,
 2276                    0, (struct sockaddr *)&c->request_addr,
 2277                    &c->request_addr_size);
 2278     if (res > 8) {
 2279         unsigned char *buf = (unsigned char *)c->rbuf;
 2280         pthread_mutex_lock(&c->thread->stats.mutex);
 2281         c->thread->stats.bytes_read += res;
 2282         pthread_mutex_unlock(&c->thread->stats.mutex);
 2283 
 2284         /* Beginning of UDP packet is the request ID; save it. */
 2285         c->request_id = buf[0] * 256 + buf[1];
 2286 
 2287         /* If this is a multi-packet request, drop it. */
 2288         if (buf[4] != 0 || buf[5] != 1) {
 2289             return READ_NO_DATA_RECEIVED;
 2290         }
 2291 
 2292         /* Don't care about any of the rest of the header. */
 2293         res -= 8;
 2294         memmove(c->rbuf, c->rbuf + 8, res);
 2295 
 2296         c->rbytes = res;
 2297         c->rcurr = c->rbuf;
 2298         return READ_DATA_RECEIVED;
 2299     }
 2300     return READ_NO_DATA_RECEIVED;
 2301 }
 2302 
 2303 /*
 2304  * read from network as much as we can, handle buffer overflow and connection
 2305  * close.
 2306  * before reading, move the remaining incomplete fragment of a command
 2307  * (if any) to the beginning of the buffer.
 2308  *
 2309  * To protect us from someone flooding a connection with bogus data causing
 2310  * the connection to eat up all available memory, break out and start looking
 2311  * at the data I've got after a number of reallocs...
 2312  *
 2313  * @return enum try_read_result
 2314  */
 2315 static enum try_read_result try_read_network(conn *c) {
 2316     enum try_read_result gotdata = READ_NO_DATA_RECEIVED;
 2317     int res;
 2318     int num_allocs = 0;
 2319     assert(c != NULL);
 2320 
 2321     if (c->rcurr != c->rbuf) {
 2322         if (c->rbytes != 0) /* otherwise there's nothing to copy */
 2323             memmove(c->rbuf, c->rcurr, c->rbytes);
 2324         c->rcurr = c->rbuf;
 2325     }
 2326 
 2327     while (1) {
 2328         // TODO: move to rbuf_* func?
 2329         if (c->rbytes >= c->rsize && c->rbuf_malloced) {
 2330             if (num_allocs == 4) {
 2331                 return gotdata;
 2332             }
 2333             ++num_allocs;
 2334             char *new_rbuf = realloc(c->rbuf, c->rsize * 2);
 2335             if (!new_rbuf) {
 2336                 STATS_LOCK();
 2337                 stats.malloc_fails++;
 2338                 STATS_UNLOCK();
 2339                 if (settings.verbose > 0) {
 2340                     fprintf(stderr, "Couldn't realloc input buffer\n");
 2341                 }
 2342                 c->rbytes = 0; /* ignore what we read */
 2343                 out_of_memory(c, "SERVER_ERROR out of memory reading request");
 2344                 c->close_after_write = true;
 2345                 return READ_MEMORY_ERROR;
 2346             }
 2347             c->rcurr = c->rbuf = new_rbuf;
 2348             c->rsize *= 2;
 2349         }
 2350 
 2351         int avail = c->rsize - c->rbytes;
 2352         res = c->read(c, c->rbuf + c->rbytes, avail);
 2353         if (res > 0) {
 2354             pthread_mutex_lock(&c->thread->stats.mutex);
 2355             c->thread->stats.bytes_read += res;
 2356             pthread_mutex_unlock(&c->thread->stats.mutex);
 2357             gotdata = READ_DATA_RECEIVED;
 2358             c->rbytes += res;
 2359             if (res == avail && c->rbuf_malloced) {
 2360                 // Resize rbuf and try a few times if huge ascii multiget.
 2361                 continue;
 2362             } else {
 2363                 break;
 2364             }
 2365         }
 2366         if (res == 0) {
 2367             return READ_ERROR;
 2368         }
 2369         if (res == -1) {
 2370             if (errno == EAGAIN || errno == EWOULDBLOCK) {
 2371                 break;
 2372             }
 2373             return READ_ERROR;
 2374         }
 2375     }
 2376     return gotdata;
 2377 }
 2378 
 2379 static bool update_event(conn *c, const int new_flags) {
 2380     assert(c != NULL);
 2381 
 2382     struct event_base *base = c->event.ev_base;
 2383     if (c->ev_flags == new_flags)
 2384         return true;
 2385     if (event_del(&c->event) == -1) return false;
 2386     event_set(&c->event, c->sfd, new_flags, event_handler, (void *)c);
 2387     event_base_set(base, &c->event);
 2388     c->ev_flags = new_flags;
 2389     if (event_add(&c->event, 0) == -1) return false;
 2390     return true;
 2391 }
 2392 
 2393 /*
 2394  * Sets whether we are listening for new connections or not.
 2395  */
 2396 void do_accept_new_conns(const bool do_accept) {
 2397     conn *next;
 2398 
 2399     for (next = listen_conn; next; next = next->next) {
 2400         if (do_accept) {
 2401             update_event(next, EV_READ | EV_PERSIST);
 2402             if (listen(next->sfd, settings.backlog) != 0) {
 2403                 perror("listen");
 2404             }
 2405         }
 2406         else {
 2407             update_event(next, 0);
 2408             if (listen(next->sfd, 0) != 0) {
 2409                 perror("listen");
 2410             }
 2411         }
 2412     }
 2413 
 2414     if (do_accept) {
 2415         struct timeval maxconns_exited;
 2416         uint64_t elapsed_us;
 2417         gettimeofday(&maxconns_exited,NULL);
 2418         STATS_LOCK();
 2419         elapsed_us =
 2420             (maxconns_exited.tv_sec - stats.maxconns_entered.tv_sec) * 1000000
 2421             + (maxconns_exited.tv_usec - stats.maxconns_entered.tv_usec);
 2422         stats.time_in_listen_disabled_us += elapsed_us;
 2423         stats_state.accepting_conns = true;
 2424         STATS_UNLOCK();
 2425     } else {
 2426         STATS_LOCK();
 2427         stats_state.accepting_conns = false;
 2428         gettimeofday(&stats.maxconns_entered,NULL);
 2429         stats.listen_disabled_num++;
 2430         STATS_UNLOCK();
 2431         allow_new_conns = false;
 2432         maxconns_handler(-42, 0, 0);
 2433     }
 2434 }
 2435 
#define TRANSMIT_ONE_RESP true
#define TRANSMIT_ALL_RESP false
/*
 * Flattens the connection's pending response chain into the caller-supplied
 * iovec array, starting at index iovused. Stops when the chain is
 * exhausted, the iovec array is (nearly) full, or after a single response
 * when one_resp is true (UDP mode). Returns the new iovec count.
 * Does not modify the response chain itself; _transmit_post() retires
 * entries after the write completes.
 */
static int _transmit_pre(conn *c, struct iovec *iovs, int iovused, bool one_resp) {
    mc_resp *resp = c->resp_head;
    while (resp && iovused + resp->iovcnt < IOV_MAX-1) {
        if (resp->skip) {
            // Don't actually unchain the resp obj here since it's singly-linked.
            // Just let the post function handle it linearly.
            resp = resp->next;
            continue;
        }
        if (resp->chunked_data_iov) {
            // Handle chunked items specially.
            // They spend much more time in send so we can be a bit wasteful
            // in rebuilding iovecs for them.
            item_chunk *ch = (item_chunk *)ITEM_schunk((item *)resp->iov[resp->chunked_data_iov].iov_base);
            int x;
            for (x = 0; x < resp->iovcnt; x++) {
                // This iov is tracking how far we've copied so far.
                if (x == resp->chunked_data_iov) {
                    /* Bytes of the chunked item already transmitted. */
                    int done = resp->chunked_total - resp->iov[x].iov_len;
                    // Start from the len to allow binprot to cut the \r\n
                    int todo = resp->iov[x].iov_len;
                    while (ch && todo > 0 && iovused < IOV_MAX-1) {
                        int skip = 0;
                        if (!ch->used) {
                            /* Empty chunk; nothing to send. */
                            ch = ch->next;
                            continue;
                        }
                        // Skip parts we've already sent.
                        if (done >= ch->used) {
                            done -= ch->used;
                            ch = ch->next;
                            continue;
                        } else if (done) {
                            /* Partially-sent chunk: resume mid-chunk. */
                            skip = done;
                            done = 0;
                        }
                        iovs[iovused].iov_base = ch->data + skip;
                        // Stupid binary protocol makes this go negative.
                        iovs[iovused].iov_len = ch->used - skip > todo ? todo : ch->used - skip;
                        iovused++;
                        todo -= ch->used - skip;
                        ch = ch->next;
                    }
                } else {
                    /* Ordinary (non-chunked) iov within this response. */
                    iovs[iovused].iov_base = resp->iov[x].iov_base;
                    iovs[iovused].iov_len = resp->iov[x].iov_len;
                    iovused++;
                }
                if (iovused >= IOV_MAX-1)
                    break;
            }
        } else {
            /* Common case: bulk-copy the response's prebuilt iovecs. */
            memcpy(&iovs[iovused], resp->iov, sizeof(struct iovec)*resp->iovcnt);
            iovused += resp->iovcnt;
        }

        // done looking at first response, walk down the chain.
        resp = resp->next;
        // used for UDP mode: UDP cannot send multiple responses per packet.
        if (one_resp)
            break;
    }
    return iovused;
}
 2502 
/*
 * Decrements and completes responses based on how much data was transmitted.
 * Takes the connection and current result bytes (res, the return of the
 * write); walks the response chain consuming res, retiring fully-sent
 * responses and trimming the first partially-sent one, which becomes the
 * new chain head.
 */
static void _transmit_post(conn *c, ssize_t res) {
    // We've written some of the data. Remove the completed
    // responses from the list of pending writes.
    mc_resp *resp = c->resp_head;
    while (resp) {
        int x;
        if (resp->skip) {
            /* Skipped (e.g. noreply) response: nothing was sent for it,
             * just return it. */
            resp = resp_finish(c, resp);
            continue;
        }

        // fastpath check. all small responses should cut here.
        if (res >= resp->tosend) {
            res -= resp->tosend;
            resp = resp_finish(c, resp);
            continue;
        }

        // it's fine to re-check iov's that were zeroed out before.
        for (x = 0; x < resp->iovcnt; x++) {
            struct iovec *iov = &resp->iov[x];
            if (res >= iov->iov_len) {
                /* This iov fully sent; zero its length so later passes
                 * skip it cheaply. */
                resp->tosend -= iov->iov_len;
                res -= iov->iov_len;
                iov->iov_len = 0;
            } else {
                // Dumb special case for chunked items. Currently tracking
                // where to inject the chunked item via iov_base.
                // Extra not-great since chunked items can't be the first
                // index, so we have to check for non-zero c_d_iov first.
                if (!resp->chunked_data_iov || x != resp->chunked_data_iov) {
                    iov->iov_base = (char *)iov->iov_base + res;
                }
                iov->iov_len -= res;
                resp->tosend -= res;
                res = 0;
                break;
            }
        }

        // are we done with this response object?
        if (resp->tosend == 0) {
            resp = resp_finish(c, resp);
        } else {
            // Jammed up here. This is the new head.
            break;
        }
    }
}
 2556 
 2557 /*
 2558  * Transmit the next chunk of data from our list of msgbuf structures.
 2559  *
 2560  * Returns:
 2561  *   TRANSMIT_COMPLETE   All done writing.
 2562  *   TRANSMIT_INCOMPLETE More data remaining to write.
 2563  *   TRANSMIT_SOFT_ERROR Can't write any more right now.
 2564  *   TRANSMIT_HARD_ERROR Can't write (c->state is set to conn_closing)
 2565  */
 2566 static enum transmit_result transmit(conn *c) {
 2567     assert(c != NULL);
 2568     struct iovec iovs[IOV_MAX];
 2569     struct msghdr msg;
 2570     int iovused = 0;
 2571 
 2572     // init the msg.
 2573     memset(&msg, 0, sizeof(struct msghdr));
 2574     msg.msg_iov = iovs;
 2575 
 2576     iovused = _transmit_pre(c, iovs, iovused, TRANSMIT_ALL_RESP);
 2577     if (iovused == 0) {
 2578         // Avoid the syscall if we're only handling a noreply.
 2579         // Return the response object.
 2580         _transmit_post(c, 0);
 2581         return TRANSMIT_COMPLETE;
 2582     }
 2583 
 2584     // Alright, send.
 2585     ssize_t res;
 2586     msg.msg_iovlen = iovused;
 2587     res = c->sendmsg(c, &msg, 0);
 2588     if (res >= 0) {
 2589         pthread_mutex_lock(&c->thread->stats.mutex);
 2590         c->thread->stats.bytes_written += res;
 2591         pthread_mutex_unlock(&c->thread->stats.mutex);
 2592 
 2593         // Decrement any partial IOV's and complete any finished resp's.
 2594         _transmit_post(c, res);
 2595 
 2596         if (c->resp_head) {
 2597             return TRANSMIT_INCOMPLETE;
 2598         } else {
 2599             return TRANSMIT_COMPLETE;
 2600         }
 2601     }
 2602 
 2603     if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
 2604         if (!update_event(c, EV_WRITE | EV_PERSIST)) {
 2605             if (settings.verbose > 0)
 2606                 fprintf(stderr, "Couldn't update event\n");
 2607             conn_set_state(c, conn_closing);
 2608             return TRANSMIT_HARD_ERROR;
 2609         }
 2610         return TRANSMIT_SOFT_ERROR;
 2611     }
 2612     /* if res == -1 and error is not EAGAIN or EWOULDBLOCK,
 2613        we have a real error, on which we close the connection */
 2614     if (settings.verbose > 0)
 2615         perror("Failed to write, and not due to blocking");
 2616 
 2617     conn_set_state(c, conn_closing);
 2618     return TRANSMIT_HARD_ERROR;
 2619 }
 2620 
 2621 static void build_udp_header(unsigned char *hdr, mc_resp *resp) {
 2622     // We need to communicate the total number of packets
 2623     // If this isn't set, it's the first time this response is building a udp
 2624     // header, so "tosend" must be static.
 2625     if (!resp->udp_total) {
 2626         uint32_t total;
 2627         total = resp->tosend / UDP_MAX_PAYLOAD_SIZE;
 2628         if (resp->tosend % UDP_MAX_PAYLOAD_SIZE)
 2629             total++;
 2630         // The spec doesn't really say what we should do here. It's _probably_
 2631         // better to bail out?
 2632         if (total > USHRT_MAX) {
 2633             total = USHRT_MAX;
 2634         }
 2635         resp->udp_total = total;
 2636     }
 2637 
 2638     // TODO: why wasn't this hto*'s and casts?
 2639     // this ends up sending UDP hdr data specifically in host byte order.
 2640     *hdr++ = resp->request_id / 256;
 2641     *hdr++ = resp->request_id % 256;
 2642     *hdr++ = resp->udp_sequence / 256;
 2643     *hdr++ = resp->udp_sequence % 256;
 2644     *hdr++ = resp->udp_total / 256;
 2645     *hdr++ = resp->udp_total % 256;
 2646     *hdr++ = 0;
 2647     *hdr++ = 0;
 2648     resp->udp_sequence++;
 2649 }
 2650 
/*
 * UDP specific transmit function. Uses its own function rather than check
 * IS_UDP() five times. If we ever implement sendmmsg or similar support they
 * will diverge even more.
 * Does not use TLS.
 *
 * Returns:
 *   TRANSMIT_COMPLETE   All done writing.
 *   TRANSMIT_INCOMPLETE More data remaining to write.
 *   TRANSMIT_SOFT_ERROR Can't write any more right now.
 *   TRANSMIT_HARD_ERROR Can't write. Note: unlike transmit(), c->state is
 *                       set to conn_read rather than conn_closing, since
 *                       the UDP socket is shared and must stay open.
 */
static enum transmit_result transmit_udp(conn *c) {
    assert(c != NULL);
    struct iovec iovs[IOV_MAX];
    struct msghdr msg;
    mc_resp *resp;
    int iovused = 0;
    unsigned char udp_hdr[UDP_HEADER_SIZE];

    // We only send one UDP packet per call (ugh), so we can only operate on a
    // single response at a time.
    resp = c->resp_head;

    if (!resp) {
        return TRANSMIT_COMPLETE;
    }

    if (resp->skip) {
        /* Skipped (e.g. noreply) response: return it and come back for
         * the next one. */
        resp = resp_finish(c, resp);
        return TRANSMIT_INCOMPLETE;
    }

    // clear the message and initialize it.
    memset(&msg, 0, sizeof(struct msghdr));
    msg.msg_iov = iovs;

    // the UDP source to return to.
    msg.msg_name = &resp->request_addr;
    msg.msg_namelen = resp->request_addr_size;

    // First IOV is the custom UDP header.
    iovs[0].iov_base = (void *)udp_hdr;
    iovs[0].iov_len = UDP_HEADER_SIZE;
    build_udp_header(udp_hdr, resp);
    iovused++;

    // Fill the IOV's the standard way.
    // TODO: might get a small speedup if we let it break early with a length
    // limit.
    iovused = _transmit_pre(c, iovs, iovused, TRANSMIT_ONE_RESP);

    // Clip the IOV's to the max UDP packet size.
    // If we add support for send_mmsg, this can be where we split msg's.
    {
        int x = 0;
        int len = 0;
        for (x = 0; x < iovused; x++) {
            if (len + iovs[x].iov_len >= UDP_MAX_PAYLOAD_SIZE) {
                /* Truncate this iov at the payload limit and stop; the
                 * remainder goes out in the next packet. */
                iovs[x].iov_len = UDP_MAX_PAYLOAD_SIZE - len;
                x++;
                break;
            } else {
                len += iovs[x].iov_len;
            }
        }
        iovused = x;
    }

    ssize_t res;
    msg.msg_iovlen = iovused;
    // NOTE: uses system sendmsg since we have no support for indirect UDP.
    res = sendmsg(c->sfd, &msg, 0);
    if (res >= 0) {
        pthread_mutex_lock(&c->thread->stats.mutex);
        c->thread->stats.bytes_written += res;
        pthread_mutex_unlock(&c->thread->stats.mutex);

        // Ignore the header size from forwarding the IOV's
        res -= UDP_HEADER_SIZE;

        // Decrement any partial IOV's and complete any finished resp's.
        _transmit_post(c, res);

        if (c->resp_head) {
            return TRANSMIT_INCOMPLETE;
        } else {
            return TRANSMIT_COMPLETE;
        }
    }

    if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
        if (!update_event(c, EV_WRITE | EV_PERSIST)) {
            if (settings.verbose > 0)
                fprintf(stderr, "Couldn't update event\n");
            conn_set_state(c, conn_closing);
            return TRANSMIT_HARD_ERROR;
        }
        return TRANSMIT_SOFT_ERROR;
    }
    /* if res == -1 and error is not EAGAIN or EWOULDBLOCK,
       we have a real error. Unlike the TCP path we go back to conn_read
       instead of closing: this is a shared UDP socket. */
    if (settings.verbose > 0)
        perror("Failed to write, and not due to blocking");

    conn_set_state(c, conn_read);
    return TRANSMIT_HARD_ERROR;
}
 2759 
 2760 
/* Does a looped read to fill data chunks of a chunked item (c->ritem),
 * consuming leftover bytes in the read buffer first, then reading from the
 * socket. Allocates additional chunks as needed.
 * Returns > 0 on progress, 0 on EOF, -1 on read error (see errno),
 * -2 on chunk allocation failure.
 */
/* TODO: restrict number of times this can loop.
 * Also, benchmark using readv's.
 */
static int read_into_chunked_item(conn *c) {
    int total = 0;
    int res;
    assert(c->rcurr != c->ritem);

    while (c->rlbytes > 0) {
        item_chunk *ch = (item_chunk *)c->ritem;
        /* Current chunk is full; move to (or allocate) the next one. */
        if (ch->size == ch->used) {
            // FIXME: ch->next is currently always 0. remove this?
            if (ch->next) {
                c->ritem = (char *) ch->next;
            } else {
                /* Allocate next chunk. Binary protocol needs 2b for \r\n */
                c->ritem = (char *) do_item_alloc_chunk(ch, c->rlbytes +
                       ((c->protocol == binary_prot) ? 2 : 0));
                if (!c->ritem) {
                    // We failed an allocation. Let caller handle cleanup.
                    total = -2;
                    break;
                }
                // ritem has new chunk, restart the loop.
                continue;
                //assert(c->rlbytes == 0);
            }
        }

        int unused = ch->size - ch->used;
        /* first check if we have leftovers in the conn_read buffer */
        if (c->rbytes > 0) {
            /* NOTE: total is reset here each pass; the caller only
             * classifies the return by sign / -2, so the running count is
             * not preserved across buffer-drain iterations. */
            total = 0;
            /* Copy at most: what's buffered, what's still owed, and what
             * fits in this chunk. */
            int tocopy = c->rbytes > c->rlbytes ? c->rlbytes : c->rbytes;
            tocopy = tocopy > unused ? unused : tocopy;
            if (c->ritem != c->rcurr) {
                memmove(ch->data + ch->used, c->rcurr, tocopy);
            }
            total += tocopy;
            c->rlbytes -= tocopy;
            c->rcurr += tocopy;
            c->rbytes -= tocopy;
            ch->used += tocopy;
            if (c->rlbytes == 0) {
                break;
            }
        } else {
            /*  now try reading from the socket */
            res = c->read(c, ch->data + ch->used,
                    (unused > c->rlbytes ? c->rlbytes : unused));
            if (res > 0) {
                pthread_mutex_lock(&c->thread->stats.mutex);
                c->thread->stats.bytes_read += res;
                pthread_mutex_unlock(&c->thread->stats.mutex);
                ch->used += res;
                total += res;
                c->rlbytes -= res;
            } else {
                /* Reset total to the latest result so caller can handle it */
                total = res;
                break;
            }
        }
    }

    /* At some point I will be able to ditch the \r\n from item storage and
       remove all of these kludges.
       The above binprot check ensures inline space for \r\n, but if we do
       exactly enough allocs there will be no additional chunk for \r\n.
     */
    if (c->rlbytes == 0 && c->protocol == binary_prot && total >= 0) {
        item_chunk *ch = (item_chunk *)c->ritem;
        if (ch->size - ch->used < 2) {
            /* Ensure 2 bytes of tail room for the binprot \r\n. */
            c->ritem = (char *) do_item_alloc_chunk(ch, 2);
            if (!c->ritem) {
                total = -2;
            }
        }
    }
    return total;
}
 2843 
 2844 static void drive_machine(conn *c) {
 2845     bool stop = false;
 2846     int sfd;
 2847     socklen_t addrlen;
 2848     struct sockaddr_storage addr;
 2849     int nreqs = settings.reqs_per_event;
 2850     int res;
 2851     const char *str;
 2852 #ifdef HAVE_ACCEPT4
 2853     static int  use_accept4 = 1;
 2854 #else
 2855     static int  use_accept4 = 0;
 2856 #endif
 2857 
 2858     assert(c != NULL);
 2859 
 2860     while (!stop) {
 2861 
 2862         switch(c->state) {
 2863         case conn_listening:
 2864             addrlen = sizeof(addr);
 2865 #ifdef HAVE_ACCEPT4
 2866             if (use_accept4) {
 2867                 sfd = accept4(c->sfd, (struct sockaddr *)&addr, &addrlen, SOCK_NONBLOCK);
 2868             } else {
 2869                 sfd = accept(c->sfd, (struct sockaddr *)&addr, &addrlen);
 2870             }
 2871 #else
 2872             sfd = accept(c->sfd, (struct sockaddr *)&addr, &addrlen);
 2873 #endif
 2874             if (sfd == -1) {
 2875                 if (use_accept4 && errno == ENOSYS) {
 2876                     use_accept4 = 0;
 2877                     continue;
 2878                 }
 2879                 perror(use_accept4 ? "accept4()" : "accept()");
 2880                 if (errno == EAGAIN || errno == EWOULDBLOCK) {
 2881                     /* these are transient, so don't log anything */
 2882                     stop = true;
 2883                 } else if (errno == EMFILE) {
 2884                     if (settings.verbose > 0)
 2885                         fprintf(stderr, "Too many open connections\n");
 2886                     accept_new_conns(false);
 2887                     stop = true;
 2888                 } else {
 2889                     perror("accept()");
 2890                     stop = true;
 2891                 }
 2892                 break;
 2893             }
 2894             if (!use_accept4) {
 2895                 if (fcntl(sfd, F_SETFL, fcntl(sfd, F_GETFL) | O_NONBLOCK) < 0) {
 2896                     perror("setting O_NONBLOCK");
 2897                     close(sfd);
 2898                     break;
 2899                 }
 2900             }
 2901 
 2902             bool reject;
 2903             if (settings.maxconns_fast) {
 2904                 reject = sfd >= settings.maxconns - 1;
 2905                 if (reject) {
 2906                     STATS_LOCK();
 2907                     stats.rejected_conns++;
 2908                     STATS_UNLOCK();
 2909                 }
 2910             } else {
 2911                 reject = false;
 2912             }
 2913 
 2914             if (reject) {
 2915                 str = "ERROR Too many open connections\r\n";
 2916                 res = write(sfd, str, strlen(str));
 2917                 close(sfd);
 2918             } else {
 2919                 void *ssl_v = NULL;
 2920 #ifdef TLS
 2921                 SSL *ssl = NULL;
 2922                 if (c->ssl_enabled) {
 2923                     assert(IS_TCP(c->transport) && settings.ssl_enabled);
 2924 
 2925                     if (settings.ssl_ctx == NULL) {
 2926                         if (settings.verbose) {
 2927                             fprintf(stderr, "SSL context is not initialized\n");
 2928                         }
 2929                         close(sfd);
 2930                         break;
 2931                     }
 2932                     SSL_LOCK();
 2933                     ssl = SSL_new(settings.ssl_ctx);
 2934                     SSL_UNLOCK();
 2935                     if (ssl == NULL) {
 2936                         if (settings.verbose) {
 2937                             fprintf(stderr, "Failed to created the SSL object\n");
 2938                         }
 2939                         close(sfd);
 2940                         break;
 2941                     }
 2942                     SSL_set_fd(ssl, sfd);
 2943                     int ret = SSL_accept(ssl);
 2944                     if (ret <= 0) {
 2945                         int err = SSL_get_error(ssl, ret);
 2946                         if (err == SSL_ERROR_SYSCALL || err == SSL_ERROR_SSL) {
 2947                             if (settings.verbose) {
 2948                                 fprintf(stderr, "SSL connection failed with error code : %d : %s\n", err, strerror(errno));
 2949                             }
 2950                             SSL_free(ssl);
 2951                             close(sfd);
 2952                             STATS_LOCK();
 2953                             stats.ssl_handshake_errors++;
 2954                             STATS_UNLOCK();
 2955                             break;
 2956                         }
 2957                     }
 2958                 }
 2959                 ssl_v = (void*) ssl;
 2960 #endif
 2961 
 2962                 dispatch_conn_new(sfd, conn_new_cmd, EV_READ | EV_PERSIST,
 2963                                      READ_BUFFER_CACHED, c->transport, ssl_v);
 2964             }
 2965 
 2966             stop = true;
 2967             break;
 2968 
 2969         case conn_waiting:
 2970             rbuf_release(c);
 2971             if (!update_event(c, EV_READ | EV_PERSIST)) {
 2972                 if (settings.verbose > 0)
 2973                     fprintf(stderr, "Couldn't update event\n");
 2974                 conn_set_state(c, conn_closing);
 2975                 break;
 2976             }
 2977 
 2978             conn_set_state(c, conn_read);
 2979             stop = true;
 2980             break;
 2981 
 2982         case conn_read:
 2983             if (!IS_UDP(c->transport)) {
 2984                 // Assign a read buffer if necessary.
 2985                 if (!rbuf_alloc(c)) {
 2986                     // TODO: Some way to allow for temporary failures.
 2987                     conn_set_state(c, conn_closing);
 2988                     break;
 2989                 }
 2990                 res = try_read_network(c);
 2991             } else {
 2992                 // UDP connections always have a static buffer.
 2993                 res = try_read_udp(c);
 2994             }
 2995 
 2996             switch (res) {
 2997             case READ_NO_DATA_RECEIVED:
 2998                 conn_set_state(c, conn_waiting);
 2999                 break;
 3000             case READ_DATA_RECEIVED:
 3001                 conn_set_state(c, conn_parse_cmd);
 3002                 break;
 3003             case READ_ERROR:
 3004                 conn_set_state(c, conn_closing);
 3005                 break;
 3006             case READ_MEMORY_ERROR: /* Failed to allocate more memory */
 3007                 /* State already set by try_read_network */
 3008                 break;
 3009             }
 3010             break;
 3011 
 3012         case conn_parse_cmd:
 3013             c->noreply = false;
 3014             if (c->try_read_command(c) == 0) {
 3015                 /* wee need more data! */
 3016                 if (c->resp_head) {
 3017                     // Buffered responses waiting, flush in the meantime.
 3018                     conn_set_state(c, conn_mwrite);
 3019                 } else {
 3020                     conn_set_state(c, conn_waiting);
 3021                 }
 3022             }
 3023 
 3024             break;
 3025 
 3026         case conn_new_cmd:
 3027             /* Only process nreqs at a time to avoid starving other
 3028                connections */
 3029 
 3030             --nreqs;
 3031             if (nreqs >= 0) {
 3032                 reset_cmd_handler(c);
 3033             } else if (c->resp_head) {
 3034                 // flush response pipe on yield.
 3035                 conn_set_state(c, conn_mwrite);
 3036             } else {
 3037                 pthread_mutex_lock(&c->thread->stats.mutex);
 3038                 c->thread->stats.conn_yields++;
 3039                 pthread_mutex_unlock(&c->thread->stats.mutex);
 3040                 if (c->rbytes > 0) {
 3041                     /* We have already read in data into the input buffer,
 3042                        so libevent will most likely not signal read events
 3043                        on the socket (unless more data is available. As a
 3044                        hack we should just put in a request to write data,
 3045                        because that should be possible ;-)
 3046                     */
 3047                     if (!update_event(c, EV_WRITE | EV_PERSIST)) {
 3048                         if (settings.verbose > 0)
 3049                             fprintf(stderr, "Couldn't update event\n");
 3050                         conn_set_state(c, conn_closing);
 3051                         break;
 3052                     }
 3053                 }
 3054                 stop = true;
 3055             }
 3056             break;
 3057 
 3058         case conn_nread:
 3059             if (c->rlbytes == 0) {
 3060                 complete_nread(c);
 3061                 break;
 3062             }
 3063 
 3064             /* Check if rbytes < 0, to prevent crash */
 3065             if (c->rlbytes < 0) {
 3066                 if (settings.verbose) {
 3067                     fprintf(stderr, "Invalid rlbytes to read: len %d\n", c->rlbytes);
 3068                 }
 3069                 conn_set_state(c, conn_closing);
 3070                 break;
 3071             }
 3072 
 3073             if ((((item *)c->item)->it_flags & ITEM_CHUNKED) == 0) {
 3074                 /* first check if we have leftovers in the conn_read buffer */
 3075                 if (c->rbytes > 0) {
 3076                     int tocopy = c->rbytes > c->rlbytes ? c->rlbytes : c->rbytes;
 3077                     memmove(c->ritem, c->rcurr, tocopy);
 3078                     c->ritem += tocopy;
 3079                     c->rlbytes -= tocopy;
 3080                     c->rcurr += tocopy;
 3081                     c->rbytes -= tocopy;
 3082                     if (c->rlbytes == 0) {
 3083                         break;
 3084                     }
 3085                 }
 3086 
 3087                 /*  now try reading from the socket */
 3088                 res = c->read(c, c->ritem, c->rlbytes);
 3089                 if (res > 0) {
 3090                     pthread_mutex_lock(&c->thread->stats.mutex);
 3091                     c->thread->stats.bytes_read += res;
 3092                     pthread_mutex_unlock(&c->thread->stats.mutex);
 3093                     if (c->rcurr == c->ritem) {
 3094                         c->rcurr += res;
 3095                     }
 3096                     c->ritem += res;
 3097                     c->rlbytes -= res;
 3098                     break;
 3099                 }
 3100             } else {
 3101                 res = read_into_chunked_item(c);
 3102                 if (res > 0)
 3103                     break;
 3104             }
 3105 
 3106             if (res == 0) { /* end of stream */
 3107                 conn_set_state(c, conn_closing);
 3108                 break;
 3109             }
 3110 
 3111             if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
 3112                 if (!update_event(c, EV_READ | EV_PERSIST)) {
 3113                     if (settings.verbose > 0)
 3114                         fprintf(stderr, "Couldn't update event\n");
 3115                     conn_set_state(c, conn_closing);
 3116                     break;
 3117                 }
 3118                 stop = true;
 3119                 break;
 3120             }
 3121 
 3122             /* Memory allocation failure */
 3123             if (res == -2) {
 3124                 out_of_memory(c, "SERVER_ERROR Out of memory during read");
 3125                 c->sbytes = c->rlbytes;
 3126                 conn_set_state(c, conn_swallow);
 3127                 // Ensure this flag gets cleared. It gets killed on conn_new()
 3128                 // so any conn_closing is fine, calling complete_nread is
 3129                 // fine. This swallow semms to be the only other case.
 3130                 c->set_stale = false;
 3131                 c->mset_res = false;
 3132                 break;
 3133             }
 3134             /* otherwise we have a real error, on which we close the connection */
 3135             if (settings.verbose > 0) {
 3136                 fprintf(stderr, "Failed to read, and not due to blocking:\n"
 3137                         "errno: %d %s \n"
 3138                         "rcurr=%p ritem=%p rbuf=%p rlbytes=%d rsize=%d\n",
 3139                         errno, strerror(errno),
 3140                         (void *)c->rcurr, (void *)c->ritem, (void *)c->rbuf,
 3141                         (int)c->rlbytes, (int)c->rsize);
 3142             }
 3143             conn_set_state(c, conn_closing);
 3144             break;
 3145 
 3146         case conn_swallow:
 3147             /* we are reading sbytes and throwing them away */
 3148             if (c->sbytes <= 0) {
 3149                 conn_set_state(c, conn_new_cmd);
 3150                 break;
 3151             }
 3152 
 3153             /* first check if we have leftovers in the conn_read buffer */
 3154             if (c->rbytes > 0) {
 3155                 int tocopy = c->rbytes > c->sbytes ? c->sbytes : c->rbytes;
 3156                 c->sbytes -= tocopy;
 3157                 c->rcurr += tocopy;
 3158                 c->rbytes -= tocopy;
 3159                 break;
 3160             }
 3161 
 3162             /*  now try reading from the socket */
 3163             res = c->read(c, c->rbuf, c->rsize > c->sbytes ? c->sbytes : c->rsize);
 3164             if (res > 0) {
 3165                 pthread_mutex_lock(&c->thread->stats.mutex);
 3166                 c->thread->stats.bytes_read += res;
 3167                 pthread_mutex_unlock(&c->thread->stats.mutex);
 3168                 c->sbytes -= res;
 3169                 break;
 3170             }
 3171             if (res == 0) { /* end of stream */
 3172                 conn_set_state(c, conn_closing);
 3173                 break;
 3174             }
 3175             if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
 3176                 if (!update_event(c, EV_READ | EV_PERSIST)) {
 3177                     if (settings.verbose > 0)
 3178                         fprintf(stderr, "Couldn't update event\n");
 3179                     conn_set_state(c, conn_closing);
 3180                     break;
 3181                 }
 3182                 stop = true;
 3183                 break;
 3184             }
 3185             /* otherwise we have a real error, on which we close the connection */
 3186             if (settings.verbose > 0)
 3187                 fprintf(stderr, "Failed to read, and not due to blocking\n");
 3188             conn_set_state(c, conn_closing);
 3189             break;
 3190 
 3191         case conn_write:
 3192         case conn_mwrite:
 3193             /* have side IO's that must process before transmit() can run.
 3194              * remove the connection from the worker thread and dispatch the
 3195              * IO queue
 3196              */
 3197             if (c->io_queues[0].type != IO_QUEUE_NONE) {
 3198                 assert(c->io_queues_submitted == 0);
 3199                 bool hit = false;
 3200 
 3201                 for (io_queue_t *q = c->io_queues; q->type != IO_QUEUE_NONE; q++) {
 3202                     if (q->count != 0) {
 3203                         assert(q->stack_ctx != NULL);
 3204                         hit = true;
 3205                         q->submit_cb(q->ctx, q->stack_ctx);
 3206                         c->io_queues_submitted++;
 3207                     }
 3208                 }
 3209                 if (hit) {
 3210                     conn_set_state(c, conn_io_queue);
 3211                     event_del(&c->event);
 3212 
 3213                     stop = true;
 3214                     break;
 3215                 }
 3216             }
 3217 
 3218             switch (!IS_UDP(c->transport) ? transmit(c) : transmit_udp(c)) {
 3219             case TRANSMIT_COMPLETE:
 3220                 if (c->state == conn_mwrite) {
 3221                     // Free up IO wraps and any half-uploaded items.
 3222                     conn_release_items(c);
 3223                     conn_set_state(c, conn_new_cmd);
 3224                     if (c->close_after_write) {
 3225                         conn_set_state(c, conn_closing);
 3226                     }
 3227                 } else {
 3228                     if (settings.verbose > 0)
 3229                         fprintf(stderr, "Unexpected state %d\n", c->state);
 3230                     conn_set_state(c, conn_closing);
 3231                 }
 3232                 break;
 3233 
 3234             case TRANSMIT_INCOMPLETE:
 3235             case TRANSMIT_HARD_ERROR:
 3236                 break;                   /* Continue in state machine. */
 3237 
 3238             case TRANSMIT_SOFT_ERROR:
 3239                 stop = true;
 3240                 break;
 3241             }
 3242             break;
 3243 
 3244         case conn_closing:
 3245             if (IS_UDP(c->transport))
 3246                 conn_cleanup(c);
 3247             else
 3248                 conn_close(c);
 3249             stop = true;
 3250             break;
 3251 
 3252         case conn_closed:
 3253             /* This only happens if dormando is an idiot. */
 3254             abort();
 3255             break;
 3256 
 3257         case conn_watch:
 3258             /* We handed off our connection to the logger thread. */
 3259             stop = true;
 3260             break;
 3261         case conn_io_queue:
 3262             /* Complete our queued IO's from within the worker thread. */
 3263             conn_io_queue_complete(c);
 3264             conn_set_state(c, conn_mwrite);
 3265             break;
 3266         case conn_max_state:
 3267             assert(false);
 3268             break;
 3269         }
 3270     }
 3271 
 3272     return;
 3273 }
 3274 
 3275 void event_handler(const evutil_socket_t fd, const short which, void *arg) {
 3276     conn *c;
 3277 
 3278     c = (conn *)arg;
 3279     assert(c != NULL);
 3280 
 3281     c->which = which;
 3282 
 3283     /* sanity */
 3284     if (fd != c->sfd) {
 3285         if (settings.verbose > 0)
 3286             fprintf(stderr, "Catastrophic: event fd doesn't match conn fd!\n");
 3287         conn_close(c);
 3288         return;
 3289     }
 3290 
 3291     drive_machine(c);
 3292 
 3293     /* wait for next event */
 3294     return;
 3295 }
 3296 
 3297 static int new_socket(struct addrinfo *ai) {
 3298     int sfd;
 3299     int flags;
 3300 
 3301     if ((sfd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)) == -1) {
 3302         return -1;
 3303     }
 3304 
 3305     if ((flags = fcntl(sfd, F_GETFL, 0)) < 0 ||
 3306         fcntl(sfd, F_SETFL, flags | O_NONBLOCK) < 0) {
 3307         perror("setting O_NONBLOCK");
 3308         close(sfd);
 3309         return -1;
 3310     }
 3311     return sfd;
 3312 }
 3313 
 3314 
 3315 /*
 3316  * Sets a socket's send buffer size to the maximum allowed by the system.
 3317  */
 3318 static void maximize_sndbuf(const int sfd) {
 3319     socklen_t intsize = sizeof(int);
 3320     int last_good = 0;
 3321     int min, max, avg;
 3322     int old_size;
 3323 
 3324     /* Start with the default size. */
 3325 #ifdef _WIN32
 3326     if (getsockopt((SOCKET)sfd, SOL_SOCKET, SO_SNDBUF, (char *)&old_size, &intsize) != 0) {
 3327 #else
 3328     if (getsockopt(sfd, SOL_SOCKET, SO_SNDBUF, &old_size, &intsize) != 0) {
 3329 #endif /* #ifdef _WIN32 */
 3330         if (settings.verbose > 0)
 3331             perror("getsockopt(SO_SNDBUF)");
 3332         return;
 3333     }
 3334 
 3335     /* Binary-search for the real maximum. */
 3336     min = old_size;
 3337     max = MAX_SENDBUF_SIZE;
 3338 
 3339     while (min <= max) {
 3340         avg = ((unsigned int)(min + max)) / 2;
 3341         if (setsockopt(sfd, SOL_SOCKET, SO_SNDBUF, (void *)&avg, intsize) == 0) {
 3342             last_good = avg;
 3343             min = avg + 1;
 3344         } else {
 3345             max = avg - 1;
 3346         }
 3347     }
 3348 
 3349     if (settings.verbose > 1)
 3350         fprintf(stderr, "<%d send buffer was %d, now %d\n", sfd, old_size, last_good);
 3351 }
 3352 
 3353 /**
 3354  * Create a socket and bind it to a specific port number
 3355  * @param interface the interface to bind to
 3356  * @param port the port number to bind to
 3357  * @param transport the transport protocol (TCP / UDP)
 3358  * @param portnumber_file A filepointer to write the port numbers to
 3359  *        when they are successfully added to the list of ports we
 3360  *        listen on.
 3361  */
 3362 static int server_socket(const char *interface,
 3363                          int port,
 3364                          enum network_transport transport,
 3365                          FILE *portnumber_file, bool ssl_enabled) {
 3366     int sfd;
 3367     struct linger ling = {0, 0};
 3368     struct addrinfo *ai;
 3369     struct addrinfo *next;
 3370     struct addrinfo hints = { .ai_flags = AI_PASSIVE,
 3371                               .ai_family = AF_UNSPEC };
 3372     char port_buf[NI_MAXSERV];
 3373     int error;
 3374     int success = 0;
 3375     int flags =1;
 3376 
 3377     hints.ai_socktype = IS_UDP(transport) ? SOCK_DGRAM : SOCK_STREAM;
 3378 
 3379     if (port == -1) {
 3380         port = 0;
 3381     }
 3382     snprintf(port_buf, sizeof(port_buf), "%d", port);
 3383     error= getaddrinfo(interface, port_buf, &hints, &ai);
 3384     if (error != 0) {
 3385         if (error != EAI_SYSTEM)
 3386           fprintf(stderr, "getaddrinfo(): %s\n", gai_strerror(error));
 3387         else
 3388           perror("getaddrinfo()");
 3389         return 1;
 3390     }
 3391 
 3392     for (next= ai; next; next= next->ai_next) {
 3393         conn *listen_conn_add;
 3394         if ((sfd = new_socket(next)) == -1) {
 3395             /* getaddrinfo can return "junk" addresses,
 3396              * we make sure at least one works before erroring.
 3397              */
 3398             if (errno == EMFILE) {
 3399                 /* ...unless we're out of fds */
 3400                 perror("server_socket");
 3401                 exit(EX_OSERR);
 3402             }
 3403             continue;
 3404         }
 3405 
 3406         if (settings.num_napi_ids) {
 3407             socklen_t len = sizeof(socklen_t);
 3408             int napi_id;
 3409             error = getsockopt(sfd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len);
 3410             if (error != 0) {
 3411                 fprintf(stderr, "-N <num_napi_ids> option not supported\n");
 3412                 exit(EXIT_FAILURE);
 3413             }
 3414         }
 3415 
 3416 #ifdef IPV6_V6ONLY
 3417         if (next->ai_family == AF_INET6) {
 3418             error = setsockopt(sfd, IPPROTO_IPV6, IPV6_V6ONLY, (char *) &flags, sizeof(flags));
 3419             if (error != 0) {
 3420                 perror("setsockopt");
 3421                 close(sfd);
 3422                 continue;
 3423             }
 3424         }
 3425 #endif
 3426 
 3427         setsockopt(sfd, SOL_SOCKET, SO_REUSEADDR, (void *)&flags, sizeof(flags));
 3428         if (IS_UDP(transport)) {
 3429             maximize_sndbuf(sfd);
 3430         } else {
 3431             error = setsockopt(sfd, SOL_SOCKET, SO_KEEPALIVE, (void *)&flags, sizeof(flags));
 3432             if (error != 0)
 3433                 perror("setsockopt");
 3434 
 3435             error = setsockopt(sfd, SOL_SOCKET, SO_LINGER, (void *)&ling, sizeof(ling));
 3436             if (error != 0)
 3437                 perror("setsockopt");
 3438 
 3439             error = setsockopt(sfd, IPPROTO_TCP, TCP_NODELAY, (void *)&flags, sizeof(flags));
 3440             if (error != 0)
 3441                 perror("setsockopt");
 3442         }
 3443 
 3444         if (bind(sfd, next->ai_addr, next->ai_addrlen) == -1) {
 3445             if (errno != EADDRINUSE) {
 3446                 perror("bind()");
 3447                 close(sfd);
 3448                 freeaddrinfo(ai);
 3449                 return 1;
 3450             }
 3451             close(sfd);
 3452             continue;
 3453         } else {
 3454             success++;
 3455             if (!IS_UDP(transport) && listen(sfd, settings.backlog) == -1) {
 3456                 perror("listen()");
 3457                 close(sfd);
 3458                 freeaddrinfo(ai);
 3459                 return 1;
 3460             }
 3461             if (portnumber_file != NULL &&
 3462                 (next->ai_addr->sa_family == AF_INET ||
 3463                  next->ai_addr->sa_family == AF_INET6)) {
 3464                 union {
 3465                     struct sockaddr_in in;
 3466                     struct sockaddr_in6 in6;
 3467                 } my_sockaddr;
 3468                 socklen_t len = sizeof(my_sockaddr);
 3469                 if (getsockname(sfd, (struct sockaddr*)&my_sockaddr, &len)==0) {
 3470                     if (next->ai_addr->sa_family == AF_INET) {
 3471                         fprintf(portnumber_file, "%s INET: %u\n",
 3472                                 IS_UDP(transport) ? "UDP" : "TCP",
 3473                                 ntohs(my_sockaddr.in.sin_port));
 3474                     } else {
 3475                         fprintf(portnumber_file, "%s INET6: %u\n",
 3476                                 IS_UDP(transport) ? "UDP" : "TCP",
 3477                                 ntohs(my_sockaddr.in6.sin6_port));
 3478                     }
 3479                 }
 3480             }
 3481         }
 3482 
 3483         if (IS_UDP(transport)) {
 3484             int c;
 3485 
 3486             for (c = 0; c < settings.num_threads_per_udp; c++) {
 3487                 /* Allocate one UDP file descriptor per worker thread;
 3488                  * this allows "stats conns" to separately list multiple
 3489                  * parallel UDP requests in progress.
 3490                  *
 3491                  * The dispatch code round-robins new connection requests
 3492                  * among threads, so this is guaranteed to assign one
 3493                  * FD to each thread.
 3494                  */
 3495                 int per_thread_fd;
 3496                 if (c == 0) {
 3497                     per_thread_fd = sfd;
 3498                 } else {
 3499                     per_thread_fd = dup(sfd);
 3500                     if (per_thread_fd < 0) {
 3501                         perror("Failed to duplicate file descriptor");
 3502                         exit(EXIT_FAILURE);
 3503                     }
 3504                 }
 3505                 dispatch_conn_new(per_thread_fd, conn_read,
 3506                                   EV_READ | EV_PERSIST,
 3507                                   UDP_READ_BUFFER_SIZE, transport, NULL);
 3508             }
 3509         } else {
 3510             if (!(listen_conn_add = conn_new(sfd, conn_listening,
 3511                                              EV_READ | EV_PERSIST, 1,
 3512                                              transport, main_base, NULL))) {
 3513                 fprintf(stderr, "failed to create listening connection\n");
 3514                 exit(EXIT_FAILURE);
 3515             }
 3516 #ifdef TLS
 3517             listen_conn_add->ssl_enabled = ssl_enabled;
 3518 #else
 3519             assert(ssl_enabled == false);
 3520 #endif
 3521             listen_conn_add->next = listen_conn;
 3522             listen_conn = listen_conn_add;
 3523         }
 3524     }
 3525 
 3526     freeaddrinfo(ai);
 3527 
 3528     /* Return zero iff we detected no errors in starting up connections */
 3529     return success == 0;
 3530 }
 3531 
/* Bind listeners for every interface listed in settings.inter (or the
 * wildcard address when it is NULL). Entries are separated by ';' or ','
 * and may carry an optional per-entry port, a bracketed IPv6 address,
 * and (when built with TLS) a "notls" prefix.
 * Returns 0 when all binds succeed, non-zero otherwise. */
static int server_sockets(int port, enum network_transport transport,
                          FILE *portnumber_file) {
    bool ssl_enabled = false;

#ifdef TLS
    const char *notls = "notls";
    ssl_enabled = settings.ssl_enabled;
#endif

    if (settings.inter == NULL) {
        /* No interface list: a single wildcard bind. */
        return server_socket(settings.inter, port, transport, portnumber_file, ssl_enabled);
    } else {
        // tokenize them and bind to each one of them..
        char *b;
        int ret = 0;
        char *list = strdup(settings.inter);   /* strtok_r mutates; parse a copy */

        if (list == NULL) {
            fprintf(stderr, "Failed to allocate memory for parsing server interface string\n");
            return 1;
        }
        for (char *p = strtok_r(list, ";,", &b);
            p != NULL;
            p = strtok_r(NULL, ";,", &b)) {
            int the_port = port;
#ifdef TLS
            /* Reset per entry; "notls" applies only to its own listener. */
            ssl_enabled = settings.ssl_enabled;
            // "notls" option is valid only when memcached is run with SSL enabled.
            if (strncmp(p, notls, strlen(notls)) == 0) {
                if (!settings.ssl_enabled) {
                    fprintf(stderr, "'notls' option is valid only when SSL is enabled\n");
                    free(list);
                    return 1;
                }
                ssl_enabled = false;
                /* Skip past "notls" plus its ':' separator. */
                p += strlen(notls) + 1;
            }
#endif

            char *h = NULL;
            if (*p == '[') {
                // expecting it to be an IPv6 address enclosed in []
                // i.e. RFC3986 style recommended by RFC5952
                char *e = strchr(p, ']');
                if (e == NULL) {
                    fprintf(stderr, "Invalid IPV6 address: \"%s\"", p);
                    free(list);
                    return 1;
                }
                h = ++p; // skip the opening '['
                *e = '\0';
                p = ++e; // skip the closing ']'
            }

            char *s = strchr(p, ':');
            if (s != NULL) {
                // If no more semicolons - attempt to treat as port number.
                // Otherwise the only valid option is an unenclosed IPv6 without port, until
                // of course there was an RFC3986 IPv6 address previously specified -
                // in such a case there is no good option, will just send it to fail as port number.
                if (strchr(s + 1, ':') == NULL || h != NULL) {
                    *s = '\0';
                    ++s;
                    if (!safe_strtol(s, &the_port)) {
                        fprintf(stderr, "Invalid port number: \"%s\"", s);
                        free(list);
                        return 1;
                    }
                }
            }

            /* If a bracketed IPv6 host was captured, bind to that. */
            if (h != NULL)
                p = h;

            /* "*" is an explicit wildcard: pass NULL to getaddrinfo. */
            if (strcmp(p, "*") == 0) {
                p = NULL;
            }
            ret |= server_socket(p, the_port, transport, portnumber_file, ssl_enabled);
        }
        free(list);
        return ret;
    }
}
 3615 
 3616 #ifndef DISABLE_UNIX_SOCKET
 3617 static int new_socket_unix(void) {
 3618     int sfd;
 3619     int flags;
 3620 
 3621     if ((sfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
 3622         perror("socket()");
 3623         return -1;
 3624     }
 3625 
 3626     if ((flags = fcntl(sfd, F_GETFL, 0)) < 0 ||
 3627         fcntl(sfd, F_SETFL, flags | O_NONBLOCK) < 0) {
 3628         perror("setting O_NONBLOCK");
 3629         close(sfd);
 3630         return -1;
 3631     }
 3632     return sfd;
 3633 }
 3634 
 3635 static int server_socket_unix(const char *path, int access_mask) {
 3636     int sfd;
 3637     struct linger ling = {0, 0};
 3638     struct sockaddr_un addr;
 3639     struct stat tstat;
 3640     int flags =1;
 3641     int old_umask;
 3642 
 3643     if (!path) {
 3644         return 1;
 3645     }
 3646 
 3647     if ((sfd = new_socket_unix()) == -1) {
 3648         return 1;
 3649     }
 3650 
 3651     /*
 3652      * Clean up a previous socket file if we left it around
 3653      */
 3654     if (lstat(path, &tstat) == 0) {
 3655         if (S_ISSOCK(tstat.st_mode))
 3656             unlink(path);
 3657     }
 3658 
 3659     setsockopt(sfd, SOL_SOCKET, SO_REUSEADDR, (void *)&flags, sizeof(flags));
 3660     setsockopt(sfd, SOL_SOCKET, SO_KEEPALIVE, (void *)&flags, sizeof(flags));
 3661     setsockopt(sfd, SOL_SOCKET, SO_LINGER, (void *)&ling, sizeof(ling));
 3662 
 3663     /*
 3664      * the memset call clears nonstandard fields in some implementations
 3665      * that otherwise mess things up.
 3666      */
 3667     memset(&addr, 0, sizeof(addr));
 3668 
 3669     addr.sun_family = AF_UNIX;
 3670     strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
 3671     assert(strcmp(addr.sun_path, path) == 0);
 3672     old_umask = umask( ~(access_mask&0777));
 3673     if (bind(sfd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
 3674         perror("bind()");
 3675         close(sfd);
 3676         umask(old_umask);
 3677         return 1;
 3678     }
 3679     umask(old_umask);
 3680     if (listen(sfd, settings.backlog) == -1) {
 3681         perror("listen()");
 3682         close(sfd);
 3683         return 1;
 3684     }
 3685     if (!(listen_conn = conn_new(sfd, conn_listening,
 3686                                  EV_READ | EV_PERSIST, 1,
 3687                                  local_transport, main_base, NULL))) {
 3688         fprintf(stderr, "failed to create listening connection\n");
 3689         exit(EXIT_FAILURE);
 3690     }
 3691 
 3692     return 0;
 3693 }
 3694 #else
 3695 #define server_socket_unix(path, access_mask)   -1
 3696 #endif /* #ifndef DISABLE_UNIX_SOCKET */
 3697 
 3698 /*
 3699  * We keep the current time of day in a global variable that's updated by a
 3700  * timer event. This saves us a bunch of time() system calls (we really only
 3701  * need to get the time once a second, whereas there can be tens of thousands
 3702  * of requests a second) and allows us to use server-start-relative timestamps
 3703  * rather than absolute UNIX timestamps, a space savings on systems where
 3704  * sizeof(time_t) > sizeof(unsigned int).
 3705  */
 3706 volatile rel_time_t current_time;
 3707 static struct event clockevent;
 3708 #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_MONOTONIC)
 3709 static bool monotonic = false;
 3710 static int64_t monotonic_start;
 3711 #endif
 3712 
 3713 /* libevent uses a monotonic clock when available for event scheduling. Aside
 3714  * from jitter, simply ticking our internal timer here is accurate enough.
 3715  * Note that users who are setting explicit dates for expiration times *must*
 3716  * ensure their clocks are correct before starting memcached. */
static void clock_handler(const evutil_socket_t fd, const short which, void *arg) {
    /* Once-per-second timer tick: re-arms itself, runs light periodic
     * maintenance, and refreshes the cached current_time. The fd/which/arg
     * parameters are part of the libevent callback signature and unused. */
    struct timeval t = {.tv_sec = 1, .tv_usec = 0};
    static bool initialized = false;

    if (initialized) {
        /* only delete the event if it's actually there. */
        evtimer_del(&clockevent);
    } else {
        initialized = true;
    }

    // While we're here, check for hash table expansion.
    // This function should be quick to avoid delaying the timer.
    assoc_start_expand(stats_state.curr_items);
    // also, if HUP'ed we need to do some maintenance.
    // for now that's just the authfile reload.
    if (settings.sig_hup) {
        settings.sig_hup = false;

        authfile_load(settings.auth_file);
    }

    /* Re-arm this handler to fire again in one second. */
    evtimer_set(&clockevent, clock_handler, 0);
    event_base_set(main_base, &clockevent);
    evtimer_add(&clockevent, &t);

#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_MONOTONIC)
    if (monotonic) {
        struct timespec ts;
        if (clock_gettime(CLOCK_MONOTONIC, &ts) == -1)
            return;
        /* Seconds since start, immune to wall-clock adjustments. */
        current_time = (rel_time_t) (ts.tv_sec - monotonic_start);
        return;
    }
#endif
    {
        /* Fallback: wall-clock seconds relative to process start. */
        struct timeval tv;
        gettimeofday(&tv, NULL);
        current_time = (rel_time_t) (tv.tv_sec - process_started);
    }
}
 3758 
 3759 static const char* flag_enabled_disabled(bool flag) {
 3760     return (flag ? "enabled" : "disabled");
 3761 }
 3762 
 3763 static void verify_default(const char* param, bool condition) {
 3764     if (!condition) {
 3765         printf("Default value of [%s] has changed."
 3766             " Modify the help text and default value check.\n", param);
 3767         exit(EXIT_FAILURE);
 3768     }
 3769 }
 3770 
 3771 static void usage(void) {
 3772     printf(PACKAGE " " VERSION "\n");
 3773     printf("-p, --port=<num>          TCP port to listen on (default: %d)\n"
 3774            "-U, --udp-port=<num>      UDP port to listen on (default: %d, off)\n",
 3775            settings.port, settings.udpport);
 3776 #ifndef DISABLE_UNIX_SOCKET
 3777     printf("-s, --unix-socket=<file>  UNIX socket to listen on (disables network support)\n");
 3778     printf("-a, --unix-mask=<mask>    access mask for UNIX socket, in octal (default: %o)\n",
 3779             settings.access);
 3780 #endif /* #ifndef DISABLE_UNIX_SOCKET */
 3781     printf("-A, --enable-shutdown     enable ascii \"shutdown\" command\n");
 3782     printf("-l, --listen=<addr>       interface to listen on (default: INADDR_ANY)\n");
 3783 #ifdef TLS
 3784     printf("                          if TLS/SSL is enabled, 'notls' prefix can be used to\n"
 3785            "                          disable for specific listeners (-l notls:<ip>:<port>) \n");
 3786 #endif
 3787     printf("-d, --daemon              run as a daemon\n"
 3788            "-r, --enable-coredumps    maximize core file limit\n"
 3789            "-u, --user=<user>         assume identity of <username> (only when run as root)\n"
 3790            "-m, --memory-limit=<num>  item memory in megabytes (default: %lu)\n"
 3791            "-M, --disable-evictions   return error on memory exhausted instead of evicting\n"
 3792            "-c, --conn-limit=<num>    max simultaneous connections (default: %d)\n"
 3793            "-k, --lock-memory         lock down all paged memory\n"
 3794            "-v, --verbose             verbose (print errors/warnings while in event loop)\n"
 3795            "-vv                       very verbose (also print client commands/responses)\n"
 3796            "-vvv                      extremely verbose (internal state transitions)\n"
 3797            "-h, --help                print this help and exit\n"
 3798            "-i, --license             print memcached and libevent license\n"
 3799            "-V, --version             print version and exit\n"
 3800            "-P, --pidfile=<file>      save PID in <file>, only used with -d option\n"
 3801            "-f, --slab-growth-factor=<num> chunk size growth factor (default: %2.2f)\n"
 3802            "-n, --slab-min-size=<bytes> min space used for key+value+flags (default: %d)\n",
 3803            (unsigned long) settings.maxbytes / (1 << 20),
 3804            settings.maxconns, settings.factor, settings.chunk_size);
 3805     verify_default("udp-port",settings.udpport == 0);
 3806     printf("-L, --enable-largepages  try to use large memory pages (if available)\n");
 3807     printf("-D <char>     Use <char> as the delimiter between key prefixes and IDs.\n"
 3808            "              This is used for per-prefix stats reporting. The default is\n"
 3809            "              \"%c\" (colon). If this option is specified, stats collection\n"
 3810            "              is turned on automatically; if not, then it may be turned on\n"
 3811            "              by sending the \"stats detail on\" command to the server.\n",
 3812            settings.prefix_delimiter);
 3813     printf("-t, --threads=<num>       number of threads to use (default: %d)\n", settings.num_threads);
 3814     printf("-R, --max-reqs-per-event  maximum number of requests per event, limits the\n"
 3815            "                          requests processed per connection to prevent \n"
 3816            "                          starvation (default: %d)\n", settings.reqs_per_event);
 3817     printf("-C, --disable-cas         disable use of CAS\n");
 3818     printf("-b, --listen-backlog=<num> set the backlog queue limit (default: %d)\n", settings.backlog);
 3819     printf("-B, --protocol=<name>     protocol - one of ascii, binary, or auto (default: %s)\n",
 3820            prot_text(settings.binding_protocol));
 3821     printf("-I, --max-item-size=<num> adjusts max item size\n"
 3822            "                          (default: %dm, min: %dk, max: %dm)\n",
 3823            settings.item_size_max/ (1 << 20), ITEM_SIZE_MAX_LOWER_LIMIT / (1 << 10),  ITEM_SIZE_MAX_UPPER_LIMIT / (1 << 20));
 3824 #ifdef ENABLE_SASL
 3825     printf("-S, --enable-sasl         turn on Sasl authentication\n");
 3826 #endif
 3827     printf("-F, --disable-flush-all   disable flush_all command\n");
 3828     printf("-X, --disable-dumping     disable stats cachedump and lru_crawler metadump\n");
 3829     printf("-W  --disable-watch       disable watch commands (live logging)\n");
 3830     printf("-Y, --auth-file=<file>    (EXPERIMENTAL) enable ASCII protocol authentication. format:\n"
 3831            "                          user:pass\\nuser2:pass2\\n\n");
 3832     printf("-e, --memory-file=<file>  (EXPERIMENTAL) mmap a file for item memory.\n"
 3833            "                          use only in ram disks or persistent memory mounts!\n"
 3834            "                          enables restartable cache (stop with SIGUSR1)\n");
 3835 #ifdef TLS
 3836     printf("-Z, --enable-ssl          enable TLS/SSL\n");
 3837 #endif
 3838     printf("-o, --extended            comma separated list of extended options\n"
 3839            "                          most options have a 'no_' prefix to disable\n"
 3840            "   - maxconns_fast:       immediately close new connections after limit (default: %s)\n"
 3841            "   - hashpower:           an integer multiplier for how large the hash\n"
 3842            "                          table should be. normally grows at runtime. (default starts at: %d)\n"
 3843            "                          set based on \"STAT hash_power_level\"\n"
 3844            "   - tail_repair_time:    time in seconds for how long to wait before\n"
 3845            "                          forcefully killing LRU tail item.\n"
 3846            "                          disabled by default; very dangerous option.\n"
 3847            "   - hash_algorithm:      the hash table algorithm\n"
 3848            "                          default is murmur3 hash. options: jenkins, murmur3\n"
 3849            "   - no_lru_crawler:      disable LRU Crawler background thread.\n"
 3850            "   - lru_crawler_sleep:   microseconds to sleep between items\n"
 3851            "                          default is %d.\n"
 3852            "   - lru_crawler_tocrawl: max items to crawl per slab per run\n"
 3853            "                          default is %u (unlimited)\n",
 3854            flag_enabled_disabled(settings.maxconns_fast), settings.hashpower_init,
 3855            settings.lru_crawler_sleep, settings.lru_crawler_tocrawl);
 3856     printf("   - read_buf_mem_limit:  limit in megabytes for connection read/response buffers.\n"
 3857            "                          do not adjust unless you have high (20k+) conn. limits.\n"
 3858            "                          0 means unlimited (default: %u)\n",
 3859            settings.read_buf_mem_limit);
 3860     verify_default("read_buf_mem_limit", settings.read_buf_mem_limit == 0);
 3861     printf("   - no_lru_maintainer:   disable new LRU system + background thread.\n"
 3862            "   - hot_lru_pct:         pct of slab memory to reserve for hot lru.\n"
 3863            "                          (requires lru_maintainer, default pct: %d)\n"
 3864            "   - warm_lru_pct:        pct of slab memory to reserve for warm lru.\n"
 3865            "                          (requires lru_maintainer, default pct: %d)\n"
 3866            "   - hot_max_factor:      items idle > cold lru age * drop from hot lru. (default: %.2f)\n"
 3867            "   - warm_max_factor:     items idle > cold lru age * this drop from warm. (default: %.2f)\n"
 3868            "   - temporary_ttl:       TTL's below get separate LRU, can't be evicted.\n"
 3869            "                          (requires lru_maintainer, default: %d)\n"
 3870            "   - idle_timeout:        timeout for idle connections. (default: %d, no timeout)\n",
 3871            settings.hot_lru_pct, settings.warm_lru_pct, settings.hot_max_factor, settings.warm_max_factor,
 3872            settings.temporary_ttl, settings.idle_timeout);
 3873     printf("   - slab_chunk_max:      (EXPERIMENTAL) maximum slab size in kilobytes. use extreme care. (default: %d)\n"
 3874            "   - watcher_logbuf_size: size in kilobytes of per-watcher write buffer. (default: %u)\n"
 3875            "   - worker_logbuf_size:  size in kilobytes of per-worker-thread buffer\n"
 3876            "                          read by background thread, then written to watchers. (default: %u)\n"
 3877            "   - track_sizes:         enable dynamic reports for 'stats sizes' command.\n"
 3878            "   - no_hashexpand:       disables hash table expansion (dangerous)\n"
 3879            "   - modern:              enables options which will be default in future.\n"
 3880            "                          currently: nothing\n"
 3881            "   - no_modern:           uses defaults of previous major version (1.4.x)\n",
 3882            settings.slab_chunk_size_max / (1 << 10), settings.logger_watcher_buf_size / (1 << 10),
 3883            settings.logger_buf_size / (1 << 10));
 3884     verify_default("tail_repair_time", settings.tail_repair_time == TAIL_REPAIR_TIME_DEFAULT);
 3885     verify_default("lru_crawler_tocrawl", settings.lru_crawler_tocrawl == 0);
 3886     verify_default("idle_timeout", settings.idle_timeout == 0);
 3887 #ifdef HAVE_DROP_PRIVILEGES
 3888     printf("   - drop_privileges:     enable dropping extra syscall privileges\n"
 3889            "   - no_drop_privileges:  disable drop_privileges in case it causes issues with\n"
 3890            "                          some customisation.\n"
 3891            "                          (default is no_drop_privileges)\n");
 3892     verify_default("drop_privileges", !settings.drop_privileges);
 3893 #ifdef MEMCACHED_DEBUG
 3894     printf("   - relaxed_privileges:  running tests requires extra privileges. (default: %s)\n",
 3895            flag_enabled_disabled(settings.relaxed_privileges));
 3896 #endif
 3897 #endif
 3898 #ifdef EXTSTORE
 3899     printf("\n   - External storage (ext_*) related options (see: https://memcached.org/extstore)\n");
 3900     printf("   - ext_path:            file to write to for external storage.\n"
 3901            "                          ie: ext_path=/mnt/d1/extstore:1G\n"
 3902            "   - ext_page_size:       size in megabytes of storage pages. (default: %u)\n"
 3903            "   - ext_wbuf_size:       size in megabytes of page write buffers. (default: %u)\n"
 3904            "   - ext_threads:         number of IO threads to run. (default: %u)\n"
 3905            "   - ext_item_size:       store items larger than this (bytes, default %u)\n"
 3906            "   - ext_item_age:        store items idle at least this long (seconds, default: no age limit)\n"
 3907            "   - ext_low_ttl:         consider TTLs lower than this specially (default: %u)\n"
 3908            "   - ext_drop_unread:     don't re-write unread values during compaction (default: %s)\n"
 3909            "   - ext_recache_rate:    recache an item every N accesses (default: %u)\n"
 3910            "   - ext_compact_under:   compact when fewer than this many free pages\n"
 3911            "                          (default: 1/4th of the assigned storage)\n"
 3912            "   - ext_drop_under:      drop COLD items when fewer than this many free pages\n"
 3913            "                          (default: 1/4th of the assigned storage)\n"
 3914            "   - ext_max_frag:        max page fragmentation to tolerate (default: %.2f)\n"
 3915            "   - slab_automove_freeratio: ratio of memory to hold free as buffer.\n"
 3916            "                          (see doc/storage.txt for more info, default: %.3f)\n",
 3917            settings.ext_page_size / (1 << 20), settings.ext_wbuf_size / (1 << 20), settings.ext_io_threadcount,
 3918            settings.ext_item_size, settings.ext_low_ttl,
 3919            flag_enabled_disabled(settings.ext_drop_unread), settings.ext_recache_rate,
 3920            settings.ext_max_frag, settings.slab_automove_freeratio);
 3921     verify_default("ext_item_age", settings.ext_item_age == UINT_MAX);
 3922 #endif
 3923 #ifdef TLS
 3924     printf("   - ssl_chain_cert:      certificate chain file in PEM format\n"
 3925            "   - ssl_key:             private key, if not part of the -ssl_chain_cert\n"
 3926            "   - ssl_keyformat:       private key format (PEM, DER or ENGINE) (default: PEM)\n");
 3927     printf("   - ssl_verify_mode:     peer certificate verification mode, default is 0(None).\n"
 3928            "                          valid values are 0(None), 1(Request), 2(Require)\n"
 3929            "                          or 3(Once)\n");
 3930     printf("   - ssl_ciphers:         specify cipher list to be used\n"
 3931            "   - ssl_ca_cert:         PEM format file of acceptable client CA's\n"
 3932            "   - ssl_wbuf_size:       size in kilobytes of per-connection SSL output buffer\n"
 3933            "                          (default: %u)\n", settings.ssl_wbuf_size / (1 << 10));
 3934     printf("   - ssl_session_cache:   enable server-side SSL session cache, to support session\n"
 3935            "                          resumption\n");
 3936     verify_default("ssl_keyformat", settings.ssl_keyformat == SSL_FILETYPE_PEM);
 3937     verify_default("ssl_verify_mode", settings.ssl_verify_mode == SSL_VERIFY_NONE);
 3938 #endif
 3939     printf("-N, --napi_ids            number of napi ids. see doc/napi_ids.txt for more details\n");
 3940     return;
 3941 }
 3942 
/* Print the package name/version followed by the full license texts
 * (memcached's BSD license and the bundled libevent BSD license) to
 * stdout.  Used by the command-line license option; output text must
 * not be altered. */
static void usage_license(void) {
    printf(PACKAGE " " VERSION "\n\n");
    printf(
    "Copyright (c) 2003, Danga Interactive, Inc. <http://www.danga.com/>\n"
    "All rights reserved.\n"
    "\n"
    "Redistribution and use in source and binary forms, with or without\n"
    "modification, are permitted provided that the following conditions are\n"
    "met:\n"
    "\n"
    "    * Redistributions of source code must retain the above copyright\n"
    "notice, this list of conditions and the following disclaimer.\n"
    "\n"
    "    * Redistributions in binary form must reproduce the above\n"
    "copyright notice, this list of conditions and the following disclaimer\n"
    "in the documentation and/or other materials provided with the\n"
    "distribution.\n"
    "\n"
    "    * Neither the name of the Danga Interactive nor the names of its\n"
    "contributors may be used to endorse or promote products derived from\n"
    "this software without specific prior written permission.\n"
    "\n"
    "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n"
    "\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n"
    "LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n"
    "A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\n"
    "OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n"
    "SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n"
    "LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n"
    "DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n"
    "THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n"
    "(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n"
    "OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
    "\n"
    "\n"
    "This product includes software developed by Niels Provos.\n"
    "\n"
    "[ libevent ]\n"
    "\n"
    "Copyright 2000-2003 Niels Provos <provos@citi.umich.edu>\n"
    "All rights reserved.\n"
    "\n"
    "Redistribution and use in source and binary forms, with or without\n"
    "modification, are permitted provided that the following conditions\n"
    "are met:\n"
    "1. Redistributions of source code must retain the above copyright\n"
    "   notice, this list of conditions and the following disclaimer.\n"
    "2. Redistributions in binary form must reproduce the above copyright\n"
    "   notice, this list of conditions and the following disclaimer in the\n"
    "   documentation and/or other materials provided with the distribution.\n"
    "3. All advertising materials mentioning features or use of this software\n"
    "   must display the following acknowledgement:\n"
    "      This product includes software developed by Niels Provos.\n"
    "4. The name of the author may not be used to endorse or promote products\n"
    "   derived from this software without specific prior written permission.\n"
    "\n"
    "THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR\n"
    "IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES\n"
    "OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\n"
    "IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,\n"
    "INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT\n"
    "NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n"
    "DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n"
    "THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n"
    "(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF\n"
    "THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
    );

    return;
}
 4013 
 4014 static void save_pid(const char *pid_file) {
 4015     FILE *fp;
 4016     if (access(pid_file, F_OK) == 0) {
 4017         if ((fp = fopen(pid_file, "r")) != NULL) {
 4018             char buffer[1024];
 4019             if (fgets(buffer, sizeof(buffer), fp) != NULL) {
 4020                 unsigned int pid;
 4021                 if (safe_strtoul(buffer, &pid) && kill((pid_t)pid, 0) == 0) {
 4022                     fprintf(stderr, "WARNING: The pid file contained the following (running) pid: %u\n", pid);
 4023                 }
 4024             }
 4025             fclose(fp);
 4026         }
 4027     }
 4028 
 4029     /* Create the pid file first with a temporary name, then
 4030      * atomically move the file to the real name to avoid a race with
 4031      * another process opening the file to read the pid, but finding
 4032      * it empty.
 4033      */
 4034     char tmp_pid_file[1024];
 4035     snprintf(tmp_pid_file, sizeof(tmp_pid_file), "%s.tmp", pid_file);
 4036 
 4037     if ((fp = fopen(tmp_pid_file, "w")) == NULL) {
 4038         vperror("Could not open the pid file %s for writing", tmp_pid_file);
 4039         return;
 4040     }
 4041 
 4042     fprintf(fp,"%ld\n", (long)getpid());
 4043     if (fclose(fp) == -1) {
 4044         vperror("Could not close the pid file %s", tmp_pid_file);
 4045     }
 4046 
 4047     if (rename(tmp_pid_file, pid_file) != 0) {
 4048         vperror("Could not rename the pid file from %s to %s",
 4049                 tmp_pid_file, pid_file);
 4050     }
 4051 }
 4052 
 4053 static void remove_pidfile(const char *pid_file) {
 4054   if (pid_file == NULL)
 4055       return;
 4056 
 4057   if (unlink(pid_file) != 0) {
 4058       vperror("Could not remove the pid file %s", pid_file);
 4059   }
 4060 
 4061 }
 4062 
/* Termination signal handler: ask the main event loop to stop normally.
 * NOTE(review): printf()/strsignal() are not on POSIX's async-signal-safe
 * list; calling them here is technically unsafe if the signal interrupts
 * stdio — consider moving the message out of signal context. */
static void sig_handler(const int sig) {
    stop_main_loop = EXIT_NORMALLY;
    printf("Signal handled: %s.\n", strsignal(sig));
}
 4067 
/* SIGHUP handler: only records that a SIGHUP arrived; the actual reload
 * work is done later by code that checks settings.sig_hup, keeping the
 * handler itself async-signal-safe. */
static void sighup_handler(const int sig) {
    settings.sig_hup = true;
}
 4071 
/* Graceful-stop signal handler (SIGUSR1, per the -e/--memory-file help
 * text): request a GRACE_STOP so the restartable cache can be saved.
 * NOTE(review): printf()/strsignal() are not async-signal-safe per POSIX. */
static void sig_usrhandler(const int sig) {
    printf("Graceful shutdown signal handled: %s.\n", strsignal(sig));
    stop_main_loop = GRACE_STOP;
}
 4076 
 4077 /*
 4078  * On systems that supports multiple page sizes we may reduce the
 4079  * number of TLB-misses by using the biggest available page size
 4080  */
 4081 static int enable_large_pages(void) {
 4082 #if defined(HAVE_GETPAGESIZES) && defined(HAVE_MEMCNTL)
 4083     int ret = -1;
 4084     size_t sizes[32];
 4085     int avail = getpagesizes(sizes, 32);
 4086     if (avail != -1) {
 4087         size_t max = sizes[0];
 4088         struct memcntl_mha arg = {0};
 4089         int ii;
 4090 
 4091         for (ii = 1; ii < avail; ++ii) {
 4092             if (max < sizes[ii]) {
 4093                 max = sizes[ii];
 4094             }
 4095         }
 4096 
 4097         arg.mha_flags   = 0;
 4098         arg.mha_pagesize = max;
 4099         arg.mha_cmd = MHA_MAPSIZE_BSSBRK;
 4100 
 4101         if (memcntl(0, 0, MC_HAT_ADVISE, (caddr_t)&arg, 0, 0) == -1) {
 4102             fprintf(stderr, "Failed to set large pages: %s\n",
 4103                     strerror(errno));
 4104             fprintf(stderr, "Will use default page size\n");
 4105         } else {
 4106             ret = 0;
 4107         }
 4108     } else {
 4109         fprintf(stderr, "Failed to get supported pagesizes: %s\n",
 4110                 strerror(errno));
 4111         fprintf(stderr, "Will use default page size\n");
 4112     }
 4113 
 4114     return ret;
 4115 #elif defined(__linux__) && defined(MADV_HUGEPAGE)
 4116     /* check if transparent hugepages is compiled into the kernel */
 4117     struct stat st;
 4118     int ret = stat("/sys/kernel/mm/transparent_hugepage/enabled", &st);
 4119     if (ret || !(st.st_mode & S_IFREG)) {
 4120         fprintf(stderr, "Transparent huge pages support not detected.\n");
 4121         fprintf(stderr, "Will use default page size.\n");
 4122         return -1;
 4123     }
 4124     return 0;
 4125 #elif defined(__FreeBSD__)
 4126     int spages;
 4127     size_t spagesl = sizeof(spages);
 4128 
 4129     if (sysctlbyname("vm.pmap.pg_ps_enabled", &spages,
 4130     &spagesl, NULL, 0) != 0) {
 4131         fprintf(stderr, "Could not evaluate the presence of superpages features.");
 4132         return -1;
 4133     }
 4134     if (spages != 1) {
 4135         fprintf(stderr, "Superpages support not detected.\n");
 4136         fprintf(stderr, "Will use default page size.\n");
 4137         return -1;
 4138     }
 4139     return 0;
 4140 #else
 4141     return -1;
 4142 #endif
 4143 }
 4144 
 4145 /**
 4146  * Do basic sanity check of the runtime environment
 4147  * @return true if no errors found, false if we can't use this env
 4148  */
 4149 static bool sanitycheck(void) {
 4150     /* One of our biggest problems is old and bogus libevents */
 4151     const char *ever = event_get_version();
 4152     if (ever != NULL) {
 4153         if (strncmp(ever, "1.", 2) == 0) {
 4154             fprintf(stderr, "You are using libevent %s.\nPlease upgrade to 2.x"
 4155                         " or newer\n", event_get_version());
 4156             return false;
 4157         }
 4158     }
 4159 
 4160     return true;
 4161 }
 4162 
 4163 static bool _parse_slab_sizes(char *s, uint32_t *slab_sizes) {
 4164     char *b = NULL;
 4165     uint32_t size = 0;
 4166     int i = 0;
 4167     uint32_t last_size = 0;
 4168 
 4169     if (strlen(s) < 1)
 4170         return false;
 4171 
 4172     for (char *p = strtok_r(s, "-", &b);
 4173          p != NULL;
 4174          p = strtok_r(NULL, "-", &b)) {
 4175         if (!safe_strtoul(p, &size) || size < settings.chunk_size
 4176              || size > settings.slab_chunk_size_max) {
 4177             fprintf(stderr, "slab size %u is out of valid range\n", size);
 4178             return false;
 4179         }
 4180         if (last_size >= size) {
 4181             fprintf(stderr, "slab size %u cannot be lower than or equal to a previous class size\n", size);
 4182             return false;
 4183         }
 4184         if (size <= last_size + CHUNK_ALIGN_BYTES) {
 4185             fprintf(stderr, "slab size %u must be at least %d bytes larger than previous class\n",
 4186                     size, CHUNK_ALIGN_BYTES);
 4187             return false;
 4188         }
 4189         slab_sizes[i++] = size;
 4190         last_size = size;
 4191         if (i >= MAX_NUMBER_OF_SLAB_CLASSES-1) {
 4192             fprintf(stderr, "too many slab classes specified\n");
 4193             return false;
 4194         }
 4195     }
 4196 
 4197     slab_sizes[i] = 0;
 4198     return true;
 4199 }
 4200 
// State shared between the restart metadata save (_mc_meta_save_cb) and
// load (_mc_meta_load_cb) callbacks.
struct _mc_meta_data {
    void *mmap_base;   // base address of the current item-memory mmap (saved as "mmap_oldbase")
    uint64_t old_base; // mmap base recorded by the previous run, parsed from "mmap_oldbase"
    char *slab_config; // string containing either factor or custom slab list.
    int64_t time_delta;       // seconds between the old run's stop_time and now
    uint64_t process_started; // "process_started" value saved by the previous run
    uint32_t current_time;    // rel_time_t clock value saved by the previous run
};
 4209 
 4210 // We need to remember a combination of configuration settings and global
 4211 // state for restart viability and resumption of internal services.
 4212 // Compared to the number of tunables and state values, relatively little
 4213 // does need to be remembered.
 4214 // Time is the hardest; we have to assume the sys clock is correct and re-sync for
 4215 // the lost time after restart.
 4216 static int _mc_meta_save_cb(const char *tag, void *ctx, void *data) {
 4217     struct _mc_meta_data *meta = (struct _mc_meta_data *)data;
 4218 
 4219     // Settings to remember.
 4220     // TODO: should get a version of version which is numeric, else
 4221     // comparisons for compat reasons are difficult.
 4222     // it may be possible to punt on this for now; since we can test for the
 4223     // absense of another key... such as the new numeric version.
 4224     //restart_set_kv(ctx, "version", "%s", VERSION);
 4225     // We hold the original factor or subopts _string_
 4226     // it can be directly compared without roundtripping through floats or
 4227     // serializing/deserializing the long options list.
 4228     restart_set_kv(ctx, "slab_config", "%s", meta->slab_config);
 4229     restart_set_kv(ctx, "maxbytes", "%llu", (unsigned long long) settings.maxbytes);
 4230     restart_set_kv(ctx, "chunk_size", "%d", settings.chunk_size);
 4231     restart_set_kv(ctx, "item_size_max", "%d", settings.item_size_max);
 4232     restart_set_kv(ctx, "slab_chunk_size_max", "%d", settings.slab_chunk_size_max);
 4233     restart_set_kv(ctx, "slab_page_size", "%d", settings.slab_page_size);
 4234     restart_set_kv(ctx, "use_cas", "%s", settings.use_cas ? "true" : "false");
 4235     restart_set_kv(ctx, "slab_reassign", "%s", settings.slab_reassign ? "true" : "false");
 4236 
 4237     // Online state to remember.
 4238 
 4239     // current time is tough. we need to rely on the clock being correct to
 4240     // pull the delta between stop and start times. we also need to know the
 4241     // delta between start time and now to restore monotonic clocks.
 4242     // for non-monotonic clocks (some OS?), process_started is the only
 4243     // important one.
 4244     restart_set_kv(ctx, "current_time", "%u", current_time);
 4245     // types are great until... this. some systems time_t could be big, but
 4246     // I'm assuming never negative.
 4247     restart_set_kv(ctx, "process_started", "%llu", (unsigned long long) process_started);
 4248     {
 4249         struct timeval tv;
 4250         gettimeofday(&tv, NULL);
 4251         restart_set_kv(ctx, "stop_time", "%lu", tv.tv_sec);
 4252     }
 4253 
 4254     // Might as well just fetch the next CAS value to use than tightly
 4255     // coupling the internal variable into the restart system.
 4256     restart_set_kv(ctx, "current_cas", "%llu", (unsigned long long) get_cas_id());
 4257     restart_set_kv(ctx, "oldest_cas", "%llu", (unsigned long long) settings.oldest_cas);
 4258     restart_set_kv(ctx, "logger_gid", "%llu", logger_get_gid());
 4259     restart_set_kv(ctx, "hashpower", "%u", stats_state.hash_power_level);
 4260     // NOTE: oldest_live is a rel_time_t, which aliases for unsigned int.
 4261     // should future proof this with a 64bit upcast, or fetch value from a
 4262     // converter function/macro?
 4263     restart_set_kv(ctx, "oldest_live", "%u", settings.oldest_live);
 4264     // TODO: use uintptr_t etc? is it portable enough?
 4265     restart_set_kv(ctx, "mmap_oldbase", "%p", meta->mmap_base);
 4266 
 4267     return 0;
 4268 }
 4269 
// We must see at least this number of checked lines. Else empty/missing lines
// could cause a false-positive.
// TODO: Once crc32'ing of the metadata file is done this could be ensured better by
// the restart module itself (crc32 + count of lines must match on the
// backend)
#define RESTART_REQUIRED_META 17

// With this callback we make a decision on if the current configuration
// matches up enough to allow reusing the cache.
// We also re-load important runtime information.
// Returns 0 when the saved mmap is compatible and may be reused, -1 when
// the restart must be rejected (mismatched setting or unparsable value).
static int _mc_meta_load_cb(const char *tag, void *ctx, void *data) {
    struct _mc_meta_data *meta = (struct _mc_meta_data *)data;
    char *key;
    char *val;
    int reuse_mmap = 0; // stays 0 while every checked line is compatible.
    meta->process_started = 0;
    meta->time_delta = 0;
    meta->current_time = 0;
    int lines_seen = 0; // count of recognized keys; compared to RESTART_REQUIRED_META below.

    // TODO: not sure this is any better than just doing an if/else tree with
    // strcmp's...
    enum {
        R_MMAP_OLDBASE = 0,
        R_MAXBYTES,
        R_CHUNK_SIZE,
        R_ITEM_SIZE_MAX,
        R_SLAB_CHUNK_SIZE_MAX,
        R_SLAB_PAGE_SIZE,
        R_SLAB_CONFIG,
        R_USE_CAS,
        R_SLAB_REASSIGN,
        R_CURRENT_CAS,
        R_OLDEST_CAS,
        R_OLDEST_LIVE,
        R_LOGGER_GID,
        R_CURRENT_TIME,
        R_STOP_TIME,
        R_PROCESS_STARTED,
        R_HASHPOWER,
    };

    // Key strings as written by _mc_meta_save_cb(), indexed by the enum
    // above; the NULL sentinel terminates the lookup loop.
    const char *opts[] = {
        [R_MMAP_OLDBASE] = "mmap_oldbase",
        [R_MAXBYTES] = "maxbytes",
        [R_CHUNK_SIZE] = "chunk_size",
        [R_ITEM_SIZE_MAX] = "item_size_max",
        [R_SLAB_CHUNK_SIZE_MAX] = "slab_chunk_size_max",
        [R_SLAB_PAGE_SIZE] = "slab_page_size",
        [R_SLAB_CONFIG] = "slab_config",
        [R_USE_CAS] = "use_cas",
        [R_SLAB_REASSIGN] = "slab_reassign",
        [R_CURRENT_CAS] = "current_cas",
        [R_OLDEST_CAS] = "oldest_cas",
        [R_OLDEST_LIVE] = "oldest_live",
        [R_LOGGER_GID] = "logger_gid",
        [R_CURRENT_TIME] = "current_time",
        [R_STOP_TIME] = "stop_time",
        [R_PROCESS_STARTED] = "process_started",
        [R_HASHPOWER] = "hashpower",
        NULL
    };

    while (restart_get_kv(ctx, &key, &val) == RESTART_OK) {
        int type = 0;
        int32_t val_int = 0;
        uint32_t val_uint = 0;
        int64_t bigval_int = 0;
        uint64_t bigval_uint = 0;

        // Linear scan of opts[] to translate the key string into an enum.
        while (opts[type] != NULL && strcmp(key, opts[type]) != 0) {
            type++;
        }
        if (opts[type] == NULL) {
            // Unknown keys are skipped, not fatal; lines_seen enforces
            // that all required keys were present.
            fprintf(stderr, "[restart] unknown/unhandled key: %s\n", key);
            continue;
        }
        lines_seen++;

        // helper for any boolean checkers.
        bool val_bool = false;
        bool is_bool = true;
        if (strcmp(val, "false") == 0) {
            val_bool = false;
        } else if (strcmp(val, "true") == 0) {
            val_bool = true;
        } else {
            is_bool = false;
        }

        // Each case either validates the saved value against the current
        // configuration (mismatch => reuse_mmap = -1) or restores saved
        // runtime state into globals/meta.
        switch (type) {
        case R_MMAP_OLDBASE:
            if (!safe_strtoull_hex(val, &meta->old_base)) {
                fprintf(stderr, "[restart] failed to parse %s: %s\n", key, val);
                reuse_mmap = -1;
            }
            break;
        case R_MAXBYTES:
            if (!safe_strtoll(val, &bigval_int) || settings.maxbytes != bigval_int) {
                reuse_mmap = -1;
            }
            break;
        case R_CHUNK_SIZE:
            if (!safe_strtol(val, &val_int) || settings.chunk_size != val_int) {
                reuse_mmap = -1;
            }
            break;
        case R_ITEM_SIZE_MAX:
            if (!safe_strtol(val, &val_int) || settings.item_size_max != val_int) {
                reuse_mmap = -1;
            }
            break;
        case R_SLAB_CHUNK_SIZE_MAX:
            if (!safe_strtol(val, &val_int) || settings.slab_chunk_size_max != val_int) {
                reuse_mmap = -1;
            }
            break;
        case R_SLAB_PAGE_SIZE:
            if (!safe_strtol(val, &val_int) || settings.slab_page_size != val_int) {
                reuse_mmap = -1;
            }
            break;
        case R_SLAB_CONFIG:
            // Compared as the original option _string_; see save callback.
            if (strcmp(val, meta->slab_config) != 0) {
                reuse_mmap = -1;
            }
            break;
        case R_USE_CAS:
            if (!is_bool || settings.use_cas != val_bool) {
                reuse_mmap = -1;
            }
            break;
        case R_SLAB_REASSIGN:
            if (!is_bool || settings.slab_reassign != val_bool) {
                reuse_mmap = -1;
            }
            break;
        case R_CURRENT_CAS:
            // FIXME: do we need to fail if these values _aren't_ found?
            if (!safe_strtoull(val, &bigval_uint)) {
                reuse_mmap = -1;
            } else {
                set_cas_id(bigval_uint);
            }
            break;
        case R_OLDEST_CAS:
            if (!safe_strtoull(val, &bigval_uint)) {
                reuse_mmap = -1;
            } else {
                settings.oldest_cas = bigval_uint;
            }
            break;
        case R_OLDEST_LIVE:
            if (!safe_strtoul(val, &val_uint)) {
                reuse_mmap = -1;
            } else {
                settings.oldest_live = val_uint;
            }
            break;
        case R_LOGGER_GID:
            if (!safe_strtoull(val, &bigval_uint)) {
                reuse_mmap = -1;
            } else {
                logger_set_gid(bigval_uint);
            }
            break;
        case R_PROCESS_STARTED:
            if (!safe_strtoull(val, &bigval_uint)) {
                reuse_mmap = -1;
            } else {
                meta->process_started = bigval_uint;
            }
            break;
        case R_CURRENT_TIME:
            if (!safe_strtoul(val, &val_uint)) {
                reuse_mmap = -1;
            } else {
                meta->current_time = val_uint;
            }
            break;
        case R_STOP_TIME:
            if (!safe_strtoll(val, &bigval_int)) {
                reuse_mmap = -1;
            } else {
                struct timeval t;
                gettimeofday(&t, NULL);
                meta->time_delta = t.tv_sec - bigval_int;
                // clock has done something crazy.
                // there are _lots_ of ways the clock can go wrong here, but
                // this is a safe sanity check since there's nothing else we
                // can realistically do.
                if (meta->time_delta <= 0) {
                    reuse_mmap = -1;
                }
            }
            break;
        case R_HASHPOWER:
            if (!safe_strtoul(val, &val_uint)) {
                reuse_mmap = -1;
            } else {
                settings.hashpower_init = val_uint;
            }
            break;
        default:
            // Unreachable while every enum value has a case above; kept as
            // a safety net for future keys.
            fprintf(stderr, "[restart] unhandled key: %s\n", key);
        }

        if (reuse_mmap != 0) {
            // First incompatibility wins; stop parsing further lines.
            fprintf(stderr, "[restart] restart incompatible due to setting for [%s] [old value: %s]\n", key, val);
            break;
        }
    }

    if (lines_seen < RESTART_REQUIRED_META) {
        fprintf(stderr, "[restart] missing some metadata lines\n");
        reuse_mmap = -1;
    }

    return reuse_mmap;
}
 4490 
 4491 int main (int argc, char **argv) {
 4492     int c;
 4493     bool lock_memory = false;
 4494     bool do_daemonize = false;
 4495     bool preallocate = false;
 4496     int maxcore = 0;
 4497     char *username = NULL;
 4498     char *pid_file = NULL;
 4499     struct passwd *pw;
 4500     struct rlimit rlim;
 4501     char *buf;
 4502     char unit = '\0';
 4503     int size_max = 0;
 4504     int retval = EXIT_SUCCESS;
 4505     bool protocol_specified = false;
 4506     bool tcp_specified = false;
 4507     bool udp_specified = false;
 4508     bool start_lru_maintainer = true;
 4509     bool start_lru_crawler = true;
 4510     bool start_assoc_maint = true;
 4511     enum hashfunc_type hash_type = MURMUR3_HASH;
 4512     uint32_t tocrawl;
 4513     uint32_t slab_sizes[MAX_NUMBER_OF_SLAB_CLASSES];
 4514     bool use_slab_sizes = false;
 4515     char *slab_sizes_unparsed = NULL;
 4516     bool slab_chunk_size_changed = false;
 4517     // struct for restart code. Initialized up here so we can curry
 4518     // important settings to save or validate.
 4519     struct _mc_meta_data *meta = malloc(sizeof(struct _mc_meta_data));
 4520     meta->slab_config = NULL;
 4521     char *subopts, *subopts_orig;
 4522     char *subopts_value;
 4523     enum {
 4524         MAXCONNS_FAST = 0,
 4525         HASHPOWER_INIT,
 4526         NO_HASHEXPAND,
 4527         SLAB_REASSIGN,
 4528         SLAB_AUTOMOVE,
 4529         SLAB_AUTOMOVE_RATIO,
 4530         SLAB_AUTOMOVE_WINDOW,
 4531         TAIL_REPAIR_TIME,
 4532         HASH_ALGORITHM,
 4533         LRU_CRAWLER,
 4534         LRU_CRAWLER_SLEEP,
 4535         LRU_CRAWLER_TOCRAWL,
 4536         LRU_MAINTAINER,
 4537         HOT_LRU_PCT,
 4538         WARM_LRU_PCT,
 4539         HOT_MAX_FACTOR,
 4540         WARM_MAX_FACTOR,
 4541         TEMPORARY_TTL,
 4542         IDLE_TIMEOUT,
 4543         WATCHER_LOGBUF_SIZE,
 4544         WORKER_LOGBUF_SIZE,
 4545         SLAB_SIZES,
 4546         SLAB_CHUNK_MAX,
 4547         TRACK_SIZES,
 4548         NO_INLINE_ASCII_RESP,
 4549         MODERN,
 4550         NO_MODERN,
 4551         NO_CHUNKED_ITEMS,
 4552         NO_SLAB_REASSIGN,
 4553         NO_SLAB_AUTOMOVE,
 4554         NO_MAXCONNS_FAST,
 4555         INLINE_ASCII_RESP,
 4556         NO_LRU_CRAWLER,
 4557         NO_LRU_MAINTAINER,
 4558         NO_DROP_PRIVILEGES,
 4559         DROP_PRIVILEGES,
 4560         RESP_OBJ_MEM_LIMIT,
 4561         READ_BUF_MEM_LIMIT,
 4562 #ifdef TLS
 4563         SSL_CERT,
 4564         SSL_KEY,
 4565         SSL_VERIFY_MODE,
 4566         SSL_KEYFORM,
 4567         SSL_CIPHERS,
 4568         SSL_CA_CERT,
 4569         SSL_WBUF_SIZE,
 4570         SSL_SESSION_CACHE,
 4571 #endif
 4572 #ifdef MEMCACHED_DEBUG
 4573         RELAXED_PRIVILEGES,
 4574 #endif
 4575     };
 4576     char *const subopts_tokens[] = {
 4577         [MAXCONNS_FAST] = "maxconns_fast",
 4578         [HASHPOWER_INIT] = "hashpower",
 4579         [NO_HASHEXPAND] = "no_hashexpand",
 4580         [SLAB_REASSIGN] = "slab_reassign",
 4581         [SLAB_AUTOMOVE] = "slab_automove",
 4582         [SLAB_AUTOMOVE_RATIO] = "slab_automove_ratio",
 4583         [SLAB_AUTOMOVE_WINDOW] = "slab_automove_window",
 4584         [TAIL_REPAIR_TIME] = "tail_repair_time",
 4585         [HASH_ALGORITHM] = "hash_algorithm",
 4586         [LRU_CRAWLER] = "lru_crawler",
 4587         [LRU_CRAWLER_SLEEP] = "lru_crawler_sleep",
 4588         [LRU_CRAWLER_TOCRAWL] = "lru_crawler_tocrawl",
 4589         [LRU_MAINTAINER] = "lru_maintainer",
 4590         [HOT_LRU_PCT] = "hot_lru_pct",
 4591         [WARM_LRU_PCT] = "warm_lru_pct",
 4592         [HOT_MAX_FACTOR] = "hot_max_factor",
 4593         [WARM_MAX_FACTOR] = "warm_max_factor",
 4594         [TEMPORARY_TTL] = "temporary_ttl",
 4595         [IDLE_TIMEOUT] = "idle_timeout",
 4596         [WATCHER_LOGBUF_SIZE] = "watcher_logbuf_size",
 4597         [WORKER_LOGBUF_SIZE] = "worker_logbuf_size",
 4598         [SLAB_SIZES] = "slab_sizes",
 4599         [SLAB_CHUNK_MAX] = "slab_chunk_max",
 4600         [TRACK_SIZES] = "track_sizes",
 4601         [NO_INLINE_ASCII_RESP] = "no_inline_ascii_resp",
 4602         [MODERN] = "modern",
 4603         [NO_MODERN] = "no_modern",
 4604         [NO_CHUNKED_ITEMS] = "no_chunked_items",
 4605         [NO_SLAB_REASSIGN] = "no_slab_reassign",
 4606         [NO_SLAB_AUTOMOVE] = "no_slab_automove",
 4607         [NO_MAXCONNS_FAST] = "no_maxconns_fast",
 4608         [INLINE_ASCII_RESP] = "inline_ascii_resp",
 4609         [NO_LRU_CRAWLER] = "no_lru_crawler",
 4610         [NO_LRU_MAINTAINER] = "no_lru_maintainer",
 4611         [NO_DROP_PRIVILEGES] = "no_drop_privileges",
 4612         [DROP_PRIVILEGES] = "drop_privileges",
 4613         [RESP_OBJ_MEM_LIMIT] = "resp_obj_mem_limit",
 4614         [READ_BUF_MEM_LIMIT] = "read_buf_mem_limit",
 4615 #ifdef TLS
 4616         [SSL_CERT] = "ssl_chain_cert",
 4617         [SSL_KEY] = "ssl_key",
 4618         [SSL_VERIFY_MODE] = "ssl_verify_mode",
 4619         [SSL_KEYFORM] = "ssl_keyformat",
 4620         [SSL_CIPHERS] = "ssl_ciphers",
 4621         [SSL_CA_CERT] = "ssl_ca_cert",
 4622         [SSL_WBUF_SIZE] = "ssl_wbuf_size",
 4623         [SSL_SESSION_CACHE] = "ssl_session_cache",
 4624 #endif
 4625 #ifdef MEMCACHED_DEBUG
 4626         [RELAXED_PRIVILEGES] = "relaxed_privileges",
 4627 #endif
 4628         NULL
 4629     };
 4630 
 4631     if (!sanitycheck()) {
 4632         free(meta);
 4633         return EX_OSERR;
 4634     }
 4635 
 4636     /* handle SIGINT, SIGTERM */
 4637     signal(SIGINT, sig_handler);
 4638     signal(SIGTERM, sig_handler);
 4639     signal(SIGHUP, sighup_handler);
 4640     signal(SIGUSR1, sig_usrhandler);
 4641 
 4642     /* init settings */
 4643     settings_init();
 4644     verify_default("hash_algorithm", hash_type == MURMUR3_HASH);
 4645 #ifdef EXTSTORE
 4646     void *storage = NULL;
 4647     void *storage_cf = storage_init_config(&settings);
 4648     bool storage_enabled = false;
 4649     if (storage_cf == NULL) {
 4650         fprintf(stderr, "failed to allocate extstore config\n");
 4651         return 1;
 4652     }
 4653 #endif
 4654 
 4655     /* Run regardless of initializing it later */
 4656     init_lru_maintainer();
 4657 
 4658     /* set stderr non-buffering (for running under, say, daemontools) */
 4659     setbuf(stderr, NULL);
 4660 
 4661     char *shortopts =
 4662           "a:"  /* access mask for unix socket */
 4663           "A"   /* enable admin shutdown command */
 4664           "Z"   /* enable SSL */
 4665           "p:"  /* TCP port number to listen on */
 4666           "s:"  /* unix socket path to listen on */
 4667           "U:"  /* UDP port number to listen on */
 4668           "m:"  /* max memory to use for items in megabytes */
 4669           "M"   /* return error on memory exhausted */
 4670           "c:"  /* max simultaneous connections */
 4671           "k"   /* lock down all paged memory */
 4672           "hiV" /* help, licence info, version */
 4673           "r"   /* maximize core file limit */
 4674           "v"   /* verbose */
 4675           "d"   /* daemon mode */
 4676           "l:"  /* interface to listen on */
 4677           "u:"  /* user identity to run as */
 4678           "P:"  /* save PID in file */
 4679           "f:"  /* factor? */
 4680           "n:"  /* minimum space allocated for key+value+flags */
 4681           "t:"  /* threads */
 4682           "D:"  /* prefix delimiter? */
 4683           "L"   /* Large memory pages */
 4684           "R:"  /* max requests per event */
 4685           "C"   /* Disable use of CAS */
 4686           "b:"  /* backlog queue limit */
 4687           "B:"  /* Binding protocol */
 4688           "I:"  /* Max item size */
 4689           "S"   /* Sasl ON */
 4690           "F"   /* Disable flush_all */
 4691           "X"   /* Disable dump commands */
 4692           "W"   /* Disable watch commands */
 4693           "Y:"   /* Enable token auth */
 4694           "e:"  /* mmap path for external item memory */
 4695           "o:"  /* Extended generic options */
 4696           "N:"  /* NAPI ID based thread selection */
 4697           ;
 4698 
 4699     /* process arguments */
 4700 #ifdef HAVE_GETOPT_LONG
 4701     const struct option longopts[] = {
 4702         {"unix-mask", required_argument, 0, 'a'},
 4703         {"enable-shutdown", no_argument, 0, 'A'},
 4704         {"enable-ssl", no_argument, 0, 'Z'},
 4705         {"port", required_argument, 0, 'p'},
 4706         {"unix-socket", required_argument, 0, 's'},
 4707         {"udp-port", required_argument, 0, 'U'},
 4708         {"memory-limit", required_argument, 0, 'm'},
 4709         {"disable-evictions", no_argument, 0, 'M'},
 4710         {"conn-limit", required_argument, 0, 'c'},
 4711         {"lock-memory", no_argument, 0, 'k'},
 4712         {"help", no_argument, 0, 'h'},
 4713         {"license", no_argument, 0, 'i'},
 4714         {"version", no_argument, 0, 'V'},
 4715         {"enable-coredumps", no_argument, 0, 'r'},
 4716         {"verbose", optional_argument, 0, 'v'},
 4717         {"daemon", no_argument, 0, 'd'},
 4718         {"listen", required_argument, 0, 'l'},
 4719         {"user", required_argument, 0, 'u'},
 4720         {"pidfile", required_argument, 0, 'P'},
 4721         {"slab-growth-factor", required_argument, 0, 'f'},
 4722         {"slab-min-size", required_argument, 0, 'n'},
 4723         {"threads", required_argument, 0, 't'},
 4724         {"enable-largepages", no_argument, 0, 'L'},
 4725         {"max-reqs-per-event", required_argument, 0, 'R'},
 4726         {"disable-cas", no_argument, 0, 'C'},
 4727         {"listen-backlog", required_argument, 0, 'b'},
 4728         {"protocol", required_argument, 0, 'B'},
 4729         {"max-item-size", required_argument, 0, 'I'},
 4730         {"enable-sasl", no_argument, 0, 'S'},
 4731         {"disable-flush-all", no_argument, 0, 'F'},
 4732         {"disable-dumping", no_argument, 0, 'X'},
 4733         {"disable-watch", no_argument, 0, 'W'},
 4734         {"auth-file", required_argument, 0, 'Y'},
 4735         {"memory-file", required_argument, 0, 'e'},
 4736         {"extended", required_argument, 0, 'o'},
 4737         {"napi-ids", required_argument, 0, 'N'},
 4738         {0, 0, 0, 0}
 4739     };
 4740     int optindex;
 4741     while (-1 != (c = getopt_long(argc, argv, shortopts,
 4742                     longopts, &optindex))) {
 4743 #else
 4744     while (-1 != (c = getopt(argc, argv, shortopts))) {
 4745 #endif
 4746         switch (c) {
 4747         case 'A':
 4748             /* enables "shutdown" command */
 4749             settings.shutdown_command = true;
 4750             break;
 4751         case 'Z':
 4752             /* enable secure communication*/
 4753 #ifdef TLS
 4754             settings.ssl_enabled = true;
 4755 #else
 4756             fprintf(stderr, "This server is not built with TLS support.\n");
 4757             exit(EX_USAGE);
 4758 #endif
 4759             break;
 4760         case 'a':
 4761 #ifndef DISABLE_UNIX_SOCKET
 4762             /* access for unix domain socket, as octal mask (like chmod)*/
 4763