"Fossies" - the Fresh Open Source Software Archive

Member "memcached-1.6.9/memcached.h" (21 Nov 2020, 36317 Bytes) of package /linux/www/memcached-1.6.9.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "memcached.h" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.6.8_vs_1.6.9.

    1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
    2 
    3 /** \file
    4  * The main memcached header holding commonly used data
    5  * structures and function prototypes.
    6  */
    7 
    8 #ifdef HAVE_CONFIG_H
    9 #include "config.h"
   10 #endif
   11 
   12 #include <sys/types.h>
   13 #include <sys/socket.h>
   14 #include <sys/time.h>
   15 #include <netinet/in.h>
   16 #include <event.h>
   17 #include <netdb.h>
   18 #include <pthread.h>
   19 #include <unistd.h>
   20 #include <assert.h>
   21 #include <grp.h>
   22 #include <signal.h>
   23 /* need this to get IOV_MAX on some platforms. */
   24 #ifndef __need_IOV_MAX
   25 #define __need_IOV_MAX
   26 #endif
   27 #include <limits.h>
   28 /* FreeBSD 4.x doesn't have IOV_MAX exposed. */
   29 #ifndef IOV_MAX
   30 #if defined(__FreeBSD__) || defined(__APPLE__) || defined(__GNU__)
   31 # define IOV_MAX 1024
   32 /* GNU/Hurd doesn't set MAXPATHLEN
   33  * http://www.gnu.org/software/hurd/hurd/porting/guidelines.html#PATH_MAX_tt_MAX_PATH_tt_MAXPATHL */
   34 #ifndef MAXPATHLEN
   35 #define MAXPATHLEN 4096
   36 #endif
   37 #endif
   38 #endif
   39 
   40 #include "itoa_ljust.h"
   41 #include "protocol_binary.h"
   42 #include "cache.h"
   43 #include "logger.h"
   44 
   45 #ifdef EXTSTORE
   46 #include "crc32c.h"
   47 #endif
   48 
   49 #include "sasl_defs.h"
   50 #ifdef TLS
   51 #include <openssl/ssl.h>
   52 #endif
   53 
   54 /* for NAPI pinning feature */
   55 #ifndef SO_INCOMING_NAPI_ID
   56 #define SO_INCOMING_NAPI_ID 56
   57 #endif
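/* Illustrative sketch (not part of the upstream header) of how the NAPI ID of
 * an accepted socket can be queried on Linux so the connection can be pinned
 * to the worker thread that handles that NAPI context (see doc/napi_ids.txt):
 *
 *   unsigned int napi_id = 0;
 *   socklen_t len = sizeof(napi_id);
 *   if (getsockopt(sfd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len) == 0) {
 *       // route or pin the connection based on napi_id
 *   }
 */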
   58 
   59 /** Maximum length of a key. */
   60 #define KEY_MAX_LENGTH 250
   61 
   62 /** Maximum length of a uri encoded key. */
   63 #define KEY_MAX_URI_ENCODED_LENGTH (KEY_MAX_LENGTH  * 3 + 1)
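/* Worked example for the bound above: in the worst case every one of the 250
 * key bytes is URI-escaped into a three-character "%XX" sequence, i.e.
 * 250 * 3 = 750 characters, plus one byte for a terminating NUL = 751. */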
   64 
   65 /** Size of an incr buf. */
   66 #define INCR_MAX_STORAGE_LEN 24
   67 
   68 #define WRITE_BUFFER_SIZE 1024
   69 #define READ_BUFFER_SIZE 16384
   70 #define READ_BUFFER_CACHED 0
   71 #define UDP_READ_BUFFER_SIZE 65536
   72 #define UDP_MAX_PAYLOAD_SIZE 1400
   73 #define UDP_HEADER_SIZE 8
   74 #define MAX_SENDBUF_SIZE (256 * 1024 * 1024)
   75 
   76 /* Binary protocol stuff */
   77 #define BIN_MAX_EXTLEN 20 // the extlen of the _incr command is currently the longest.
   78 
   79 /* Initial power multiplier for the hash table */
   80 #define HASHPOWER_DEFAULT 16
   81 #define HASHPOWER_MAX 32
   82 
   83 /*
   84  * We only reposition items in the LRU queue if they haven't been repositioned
   85  * in this many seconds. That saves us from churning on frequently-accessed
   86  * items.
   87  */
   88 #define ITEM_UPDATE_INTERVAL 60
   89 
   90 /*
   91  * Valid range of the maximum size of an item, in bytes.
   92  */
   93 #define ITEM_SIZE_MAX_LOWER_LIMIT 1024
   94 #define ITEM_SIZE_MAX_UPPER_LIMIT (1024 * 1024 * 1024)
   95 
   96 
   97 /* unistd.h is here */
   98 #if HAVE_UNISTD_H
   99 # include <unistd.h>
  100 #endif
  101 
  102 /* Slab sizing definitions. */
  103 #define POWER_SMALLEST 1
  104 #define POWER_LARGEST  256 /* actual cap is 255 */
  105 #define SLAB_GLOBAL_PAGE_POOL 0 /* magic slab class for storing pages for reassignment */
  106 #define CHUNK_ALIGN_BYTES 8
  107 /* slab class max is a 6-bit number, -1. */
  108 #define MAX_NUMBER_OF_SLAB_CLASSES (63 + 1)
  109 
  110 /** How long an object can reasonably be assumed to be locked before
  111     harvesting it on a low memory condition. Default: disabled. */
  112 #define TAIL_REPAIR_TIME_DEFAULT 0
  113 
  114 /* warning: don't pass a function call to these macros, as the argument is evaluated twice */
  115 #define ITEM_get_cas(i) (((i)->it_flags & ITEM_CAS) ? \
  116         (i)->data->cas : (uint64_t)0)
  117 
  118 #define ITEM_set_cas(i,v) { \
  119     if ((i)->it_flags & ITEM_CAS) { \
  120         (i)->data->cas = v; \
  121     } \
  122 }
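/* Illustrative example of the double-evaluation hazard warned about above
 * (lookup_item() is a hypothetical helper, not a memcached function):
 *
 *   uint64_t cas = ITEM_get_cas(lookup_item(key));  // BAD: lookup_item() may run twice
 *
 *   item *it = lookup_item(key);                    // OK: evaluate once,
 *   uint64_t cas = ITEM_get_cas(it);                // then pass the plain pointer
 */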
  123 
  124 #define ITEM_key(item) (((char*)&((item)->data)) \
  125          + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
  126 
  127 #define ITEM_suffix(item) ((char*) &((item)->data) + (item)->nkey + 1 \
  128          + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
  129 
  130 #define ITEM_data(item) ((char*) &((item)->data) + (item)->nkey + 1 \
  131          + (((item)->it_flags & ITEM_CFLAGS) ? sizeof(uint32_t) : 0) \
  132          + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
  133 
  134 #define ITEM_ntotal(item) (sizeof(struct _stritem) + (item)->nkey + 1 \
  135          + (item)->nbytes \
  136          + (((item)->it_flags & ITEM_CFLAGS) ? sizeof(uint32_t) : 0) \
  137          + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
  138 
  139 #define ITEM_clsid(item) ((item)->slabs_clsid & ~(3<<6))
  140 #define ITEM_lruid(item) ((item)->slabs_clsid & (3<<6))
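/* Worked example for the two macros above: slabs_clsid is an 8-bit field whose
 * two high bits carry the LRU id and whose low six bits carry the slab class
 * (hence the 6-bit limit behind MAX_NUMBER_OF_SLAB_CLASSES below). E.g. for
 * slabs_clsid = 0xC5 (binary 11000101):
 *
 *   ITEM_clsid(it) == 0x05   // 0xC5 & ~(3<<6): the slab class
 *   ITEM_lruid(it) == 0xC0   // 0xC5 &  (3<<6): which LRU the item sits on
 */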
  141 
  142 #define STAT_KEY_LEN 128
  143 #define STAT_VAL_LEN 128
  144 
  145 /** Append a simple stat with a stat name, value format and value */
  146 #define APPEND_STAT(name, fmt, val) \
  147     append_stat(name, add_stats, c, fmt, val);
  148 
  149 /** Append an indexed stat with a stat name (with format), value format
  150     and value */
  151 #define APPEND_NUM_FMT_STAT(name_fmt, num, name, fmt, val)          \
  152     klen = snprintf(key_str, STAT_KEY_LEN, name_fmt, num, name);    \
  153     vlen = snprintf(val_str, STAT_VAL_LEN, fmt, val);               \
  154     add_stats(key_str, klen, val_str, vlen, c);
  155 
  156 /** Common APPEND_NUM_FMT_STAT format. */
  157 #define APPEND_NUM_STAT(num, name, fmt, val) \
  158     APPEND_NUM_FMT_STAT("%d:%s", num, name, fmt, val)
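/* Illustrative usage of the stat macros above (they expect add_stats, c and,
 * for the indexed variants, key_str/val_str/klen/vlen to already be in scope
 * at the call site); the stat names shown are examples only:
 *
 *   APPEND_STAT("threads", "%d", settings.num_threads);
 *   APPEND_NUM_STAT(clsid, "chunk_size", "%u", chunk_size);
 */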
  159 
  160 /** Item client flag conversion */
  161 #define FLAGS_CONV(it, flag) { \
  162     if ((it)->it_flags & ITEM_CFLAGS) { \
  163         flag = *((uint32_t *)ITEM_suffix((it))); \
  164     } else { \
  165         flag = 0; \
  166     } \
  167 }
  168 
  169 #define FLAGS_SIZE(item) (((item)->it_flags & ITEM_CFLAGS) ? sizeof(uint32_t) : 0)
  170 
  171 /**
  172  * Callback for any function producing stats.
  173  *
  174  * @param key the stat's key
  175  * @param klen length of the key
  176  * @param val the stat's value in an ascii form (e.g. text form of a number)
  177  * @param vlen length of the value
  178  * @param cookie magic callback cookie
  179  */
  180 typedef void (*ADD_STAT)(const char *key, const uint16_t klen,
  181                          const char *val, const uint32_t vlen,
  182                          const void *cookie);
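/* Illustrative sketch (not part of the upstream header): a minimal ADD_STAT
 * callback that just prints each stat; the real callbacks (e.g. append_stats,
 * declared later in this header) serialize into the connection's response
 * buffers instead.
 *
 *   static void print_stat(const char *key, const uint16_t klen,
 *                          const char *val, const uint32_t vlen,
 *                          const void *cookie) {
 *       (void)cookie;
 *       printf("%.*s = %.*s\n", (int)klen, key, (int)vlen, val);
 *   }
 *
 *   // e.g.: get_stats("settings", strlen("settings"), print_stat, c);
 */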
  183 
  184 /*
  185  * NOTE: If you modify this table you _MUST_ update the function state_text
  186  */
  187 /**
  188  * Possible states of a connection.
  189  */
  190 enum conn_states {
  191     conn_listening,  /**< the socket which listens for connections */
  192     conn_new_cmd,    /**< Prepare connection for next command */
  193     conn_waiting,    /**< waiting for a readable socket */
  194     conn_read,       /**< reading in a command line */
  195     conn_parse_cmd,  /**< try to parse a command from the input buffer */
  196     conn_write,      /**< writing out a simple response */
  197     conn_nread,      /**< reading in a fixed number of bytes */
  198     conn_swallow,    /**< swallowing unnecessary bytes w/o storing */
  199     conn_closing,    /**< closing this connection */
  200     conn_mwrite,     /**< writing out many items sequentially */
  201     conn_closed,     /**< connection is closed */
  202     conn_watch,      /**< held by the logger thread as a watcher */
  203     conn_io_queue,   /**< wait on async. process to get response object */
  204     conn_max_state   /**< Max state value (used for assertion) */
  205 };
  206 
  207 enum bin_substates {
  208     bin_no_state,
  209     bin_reading_set_header,
  210     bin_reading_cas_header,
  211     bin_read_set_value,
  212     bin_reading_get_key,
  213     bin_reading_stat,
  214     bin_reading_del_header,
  215     bin_reading_incr_header,
  216     bin_read_flush_exptime,
  217     bin_reading_sasl_auth,
  218     bin_reading_sasl_auth_data,
  219     bin_reading_touch_key,
  220 };
  221 
  222 enum protocol {
  223     ascii_prot = 3, /* arbitrary value. */
  224     binary_prot,
  225     negotiating_prot /* Discovering the protocol */
  226 };
  227 
  228 enum network_transport {
  229     local_transport, /* Unix sockets */
  230     tcp_transport,
  231     udp_transport
  232 };
  233 
  234 enum pause_thread_types {
  235     PAUSE_WORKER_THREADS = 0,
  236     PAUSE_ALL_THREADS,
  237     RESUME_ALL_THREADS,
  238     RESUME_WORKER_THREADS
  239 };
  240 
  241 enum stop_reasons {
  242     NOT_STOP,
  243     GRACE_STOP,
  244     EXIT_NORMALLY
  245 };
  246 
  247 #define IS_TCP(x) (x == tcp_transport)
  248 #define IS_UDP(x) (x == udp_transport)
  249 
  250 #define NREAD_ADD 1
  251 #define NREAD_SET 2
  252 #define NREAD_REPLACE 3
  253 #define NREAD_APPEND 4
  254 #define NREAD_PREPEND 5
  255 #define NREAD_CAS 6
  256 
  257 enum store_item_type {
  258     NOT_STORED=0, STORED, EXISTS, NOT_FOUND, TOO_LARGE, NO_MEMORY
  259 };
  260 
  261 enum delta_result_type {
  262     OK, NON_NUMERIC, EOM, DELTA_ITEM_NOT_FOUND, DELTA_ITEM_CAS_MISMATCH
  263 };
  264 
  265 /** Time relative to server start. Smaller than time_t on 64-bit systems. */
  266 // TODO: Move to sub-header. needed in logger.h
  267 //typedef unsigned int rel_time_t;
  268 
  269 /** Use X macros to avoid iterating over the stats fields during reset and
  270  * aggregation. No longer have to add new stats in 3+ places.
  271  */
  272 
  273 #define SLAB_STATS_FIELDS \
  274     X(set_cmds) \
  275     X(get_hits) \
  276     X(touch_hits) \
  277     X(delete_hits) \
  278     X(cas_hits) \
  279     X(cas_badval) \
  280     X(incr_hits) \
  281     X(decr_hits)
  282 
  283 /** Stats stored per slab (and per thread). */
  284 struct slab_stats {
  285 #define X(name) uint64_t    name;
  286     SLAB_STATS_FIELDS
  287 #undef X
  288 };
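/* Illustrative sketch (hypothetical helper, not part of upstream) of how the
 * X-macro lists above and below expand: once into struct members, and again
 * wherever the fields must be iterated, in the spirit of the
 * threadlocal_stats_reset()/slab_stats_aggregate() helpers declared later in
 * this header:
 *
 *   static void slab_stats_reset(struct slab_stats *s) {
 *   #define X(name) s->name = 0;
 *       SLAB_STATS_FIELDS
 *   #undef X
 *   }
 */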
  289 
  290 #define THREAD_STATS_FIELDS \
  291     X(get_cmds) \
  292     X(get_misses) \
  293     X(get_expired) \
  294     X(get_flushed) \
  295     X(touch_cmds) \
  296     X(touch_misses) \
  297     X(delete_misses) \
  298     X(incr_misses) \
  299     X(decr_misses) \
  300     X(cas_misses) \
  301     X(meta_cmds) \
  302     X(bytes_read) \
  303     X(bytes_written) \
  304     X(flush_cmds) \
  305     X(conn_yields) /* # of yields for connections (-R option)*/ \
  306     X(auth_cmds) \
  307     X(auth_errors) \
  308     X(idle_kicks) /* idle connections killed */ \
  309     X(response_obj_oom) \
  310     X(response_obj_count) \
  311     X(response_obj_bytes) \
  312     X(read_buf_oom)
  313 
  314 #ifdef EXTSTORE
  315 #define EXTSTORE_THREAD_STATS_FIELDS \
  316     X(get_extstore) \
  317     X(get_aborted_extstore) \
  318     X(get_oom_extstore) \
  319     X(recache_from_extstore) \
  320     X(miss_from_extstore) \
  321     X(badcrc_from_extstore)
  322 #endif
  323 
  324 /**
  325  * Stats stored per-thread.
  326  */
  327 struct thread_stats {
  328     pthread_mutex_t   mutex;
  329 #define X(name) uint64_t    name;
  330     THREAD_STATS_FIELDS
  331 #ifdef EXTSTORE
  332     EXTSTORE_THREAD_STATS_FIELDS
  333 #endif
  334 #undef X
  335     struct slab_stats slab_stats[MAX_NUMBER_OF_SLAB_CLASSES];
  336     uint64_t lru_hits[POWER_LARGEST];
  337     uint64_t read_buf_count;
  338     uint64_t read_buf_bytes;
  339     uint64_t read_buf_bytes_free;
  340 };
  341 
  342 /**
  343  * Global stats. Only resettable stats should go into this structure.
  344  */
  345 struct stats {
  346     uint64_t      total_items;
  347     uint64_t      total_conns;
  348     uint64_t      rejected_conns;
  349     uint64_t      malloc_fails;
  350     uint64_t      listen_disabled_num;
  351     uint64_t      slabs_moved;       /* times slabs were moved around */
  352     uint64_t      slab_reassign_rescues; /* items rescued during slab move */
  353     uint64_t      slab_reassign_evictions_nomem; /* valid items lost during slab move */
  354     uint64_t      slab_reassign_inline_reclaim; /* valid items lost during slab move */
  355     uint64_t      slab_reassign_chunk_rescues; /* chunked-item chunks recovered */
  356     uint64_t      slab_reassign_busy_items; /* valid temporarily unmovable */
  357     uint64_t      slab_reassign_busy_deletes; /* refcounted items killed */
  358     uint64_t      lru_crawler_starts; /* Number of item crawlers kicked off */
  359     uint64_t      lru_maintainer_juggles; /* number of LRU bg pokes */
  360     uint64_t      time_in_listen_disabled_us;  /* elapsed time in microseconds while server unable to process new connections */
  361     uint64_t      log_worker_dropped; /* logs dropped by worker threads */
  362     uint64_t      log_worker_written; /* logs written by worker threads */
  363     uint64_t      log_watcher_skipped; /* logs watchers missed */
  364     uint64_t      log_watcher_sent; /* logs sent to watcher buffers */
  365 #ifdef EXTSTORE
  366     uint64_t      extstore_compact_lost; /* items lost because they were locked */
  367     uint64_t      extstore_compact_rescues; /* items re-written during compaction */
  368     uint64_t      extstore_compact_skipped; /* unhit items skipped during compaction */
  369 #endif
  370 #ifdef TLS
  371     uint64_t      ssl_handshake_errors; /* TLS failures at accept/handshake time */
  372     uint64_t      ssl_new_sessions; /* successfully negotiated new (non-reused) TLS sessions */
  373 #endif
  374     struct timeval maxconns_entered;  /* last time maxconns entered */
  375     uint64_t      unexpected_napi_ids;  /* see doc/napi_ids.txt */
  376     uint64_t      round_robin_fallback; /* see doc/napi_ids.txt */
  377 };
  378 
  379 /**
  380  * Global "state" stats. Reflects state that shouldn't be wiped ever.
  381  * Ordered for some cache line locality for commonly updated counters.
  382  */
  383 struct stats_state {
  384     uint64_t      curr_items;
  385     uint64_t      curr_bytes;
  386     uint64_t      curr_conns;
  387     uint64_t      hash_bytes;       /* size used for hash tables */
  388     unsigned int  conn_structs;
  389     unsigned int  reserved_fds;
  390     unsigned int  hash_power_level; /* Better hope it's not over 9000 */
  391     bool          hash_is_expanding; /* If the hash table is being expanded */
  392     bool          accepting_conns;  /* whether we are currently accepting */
  393     bool          slab_reassign_running; /* slab reassign in progress */
  394     bool          lru_crawler_running; /* crawl in progress */
  395 };
  396 
  397 #define MAX_VERBOSITY_LEVEL 2
  398 
  399 /* When adding a setting, be sure to update process_stat_settings */
  400 /**
  401  * Globally accessible settings as derived from the commandline.
  402  */
  403 struct settings {
  404     size_t maxbytes;
  405     int maxconns;
  406     int port;
  407     int udpport;
  408     char *inter;
  409     int verbose;
  410     rel_time_t oldest_live; /* ignore existing items older than this */
  411     uint64_t oldest_cas; /* ignore existing items with CAS values lower than this */
  412     int evict_to_free;
  413     char *socketpath;   /* path to unix socket if using local socket */
  414     char *auth_file;    /* path to user authentication file */
  415     int access;  /* access mask (a la chmod) for unix domain socket */
  416     double factor;          /* chunk size growth factor */
  417     int chunk_size;
  418     int num_threads;        /* number of worker (without dispatcher) libevent threads to run */
  419     int num_threads_per_udp; /* number of worker threads serving each udp socket */
  420     char prefix_delimiter;  /* character that marks a key prefix (for stats) */
  421     int detail_enabled;     /* nonzero if we're collecting detailed stats */
  422     int reqs_per_event;     /* Maximum number of IOs to process on each
  423                                IO event. */
  424     bool use_cas;
  425     enum protocol binding_protocol;
  426     int backlog;
  427     int item_size_max;        /* Maximum item size */
  428     int slab_chunk_size_max;  /* Upper end for chunks within slab pages. */
  429     int slab_page_size;     /* Slab's page units. */
  430     volatile sig_atomic_t sig_hup;  /* a HUP signal was received but not yet handled */
  431     bool sasl;              /* SASL on/off */
  432     bool maxconns_fast;     /* Whether or not to close new connections early when maxconns is reached */
  433     bool lru_crawler;        /* Whether or not to enable the autocrawler thread */
  434     bool lru_maintainer_thread; /* LRU maintainer background thread */
  435     bool lru_segmented;     /* Use split or flat LRU's */
  436     bool slab_reassign;     /* Whether or not slab reassignment is allowed */
  437     int slab_automove;     /* Whether or not to automatically move slabs */
  438     double slab_automove_ratio; /* youngest must be within pct of oldest */
  439     unsigned int slab_automove_window; /* window mover for algorithm */
  440     int hashpower_init;     /* Starting hash power level */
  441     bool shutdown_command; /* allow shutdown command */
  442     int tail_repair_time;   /* LRU tail refcount leak repair time */
  443     bool flush_enabled;     /* flush_all enabled */
  444     bool dump_enabled;      /* whether cachedump/metadump commands work */
  445     char *hash_algorithm;     /* Hash algorithm in use */
  446     int lru_crawler_sleep;  /* Microsecond sleep between items */
  447     uint32_t lru_crawler_tocrawl; /* Number of items to crawl per run */
  448     int hot_lru_pct; /* percentage of slab space for HOT_LRU */
  449     int warm_lru_pct; /* percentage of slab space for WARM_LRU */
  450     double hot_max_factor; /* HOT tail age relative to COLD tail */
  451     double warm_max_factor; /* WARM tail age relative to COLD tail */
  452     int crawls_persleep; /* Number of LRU crawls to run before sleeping */
  453     bool temp_lru; /* TTL < temporary_ttl uses TEMP_LRU */
  454     uint32_t temporary_ttl; /* temporary LRU threshold */
  455     int idle_timeout;       /* Number of seconds to let connections idle */
  456     unsigned int logger_watcher_buf_size; /* size of logger's per-watcher buffer */
  457     unsigned int logger_buf_size; /* size of per-thread logger buffer */
  458     unsigned int read_buf_mem_limit; /* total megabytes allowable for net buffers */
  459     bool drop_privileges;   /* Whether or not to drop unnecessary process privileges */
  460     bool watch_enabled; /* allows watch commands to be dropped */
  461     bool relaxed_privileges;   /* Relax process restrictions when running testapp */
  462 #ifdef EXTSTORE
  463     unsigned int ext_io_threadcount; /* number of IO threads to run. */
  464     unsigned int ext_page_size; /* size in megabytes of storage pages. */
  465     unsigned int ext_item_size; /* minimum size of items to store externally */
  466     unsigned int ext_item_age; /* max age of tail item before storing ext. */
  467     unsigned int ext_low_ttl; /* remaining TTL below this uses own pages */
  468     unsigned int ext_recache_rate; /* counter++ % recache_rate == 0 > recache */
  469     unsigned int ext_wbuf_size; /* read only note for the engine */
  470     unsigned int ext_compact_under; /* when fewer than this many pages, compact */
  471     unsigned int ext_drop_under; /* when fewer than this many pages, drop COLD items */
  472     double ext_max_frag; /* ideal maximum page fragmentation */
  473     double slab_automove_freeratio; /* % of memory to hold free as buffer */
  474     bool ext_drop_unread; /* skip unread items during compaction */
  475     /* per-slab-class free chunk limit */
  476     unsigned int ext_free_memchunks[MAX_NUMBER_OF_SLAB_CLASSES];
  477 #endif
  478 #ifdef TLS
  479     bool ssl_enabled; /* indicates whether SSL is enabled */
  480     SSL_CTX *ssl_ctx; /* holds the SSL server context which has the server certificate */
  481     char *ssl_chain_cert; /* path to the server SSL chain certificate */
  482     char *ssl_key; /* path to the server key */
  483     int ssl_verify_mode; /* client certificate verify mode */
  484     int ssl_keyformat; /* key format, default is PEM */
  485     char *ssl_ciphers; /* list of SSL ciphers */
  486     char *ssl_ca_cert; /* certificate with CAs. */
  487     rel_time_t ssl_last_cert_refresh_time; /* time of the last server certificate refresh */
  488     unsigned int ssl_wbuf_size; /* size of the write buffer used by ssl_sendmsg method */
  489     bool ssl_session_cache; /* enable SSL server session caching */
  490 #endif
  491     int num_napi_ids;   /* maximum number of NAPI IDs */
  492     char *memory_file;  /* warm restart memory file path */
  493 };
  494 
  495 extern struct stats stats;
  496 extern struct stats_state stats_state;
  497 extern time_t process_started;
  498 extern struct settings settings;
  499 
  500 #define ITEM_LINKED 1
  501 #define ITEM_CAS 2
  502 
  503 /* temp */
  504 #define ITEM_SLABBED 4
  505 
  506 /* Item was fetched at least once in its lifetime */
  507 #define ITEM_FETCHED 8
  508 /* Appended on fetch, removed on LRU shuffling */
  509 #define ITEM_ACTIVE 16
  510 /* If an item's storage is chained chunks. */
  511 #define ITEM_CHUNKED 32
  512 #define ITEM_CHUNK 64
  513 /* ITEM_data bulk is external to item */
  514 #define ITEM_HDR 128
  515 /* additional 4 bytes for item client flags */
  516 #define ITEM_CFLAGS 256
  517 /* item has sent out a token already */
  518 #define ITEM_TOKEN_SENT 512
  519 /* reserved, in case tokens should be a 2-bit count in future */
  520 #define ITEM_TOKEN_RESERVED 1024
  521 /* if item has been marked as a stale value */
  522 #define ITEM_STALE 2048
  523 
  524 /**
  525  * Structure for storing items within memcached.
  526  */
  527 typedef struct _stritem {
  528     /* Protected by LRU locks */
  529     struct _stritem *next;
  530     struct _stritem *prev;
  531     /* Rest are protected by an item lock */
  532     struct _stritem *h_next;    /* hash chain next */
  533     rel_time_t      time;       /* least recent access */
  534     rel_time_t      exptime;    /* expire time */
  535     int             nbytes;     /* size of data */
  536     unsigned short  refcount;
  537     uint16_t        it_flags;   /* ITEM_* above */
  538     uint8_t         slabs_clsid;/* which slab class we're in */
  539     uint8_t         nkey;       /* key length, w/terminating null and padding */
  540     /* this odd type prevents type-punning issues when we do
  541      * the little shuffle to save space when not using CAS. */
  542     union {
  543         uint64_t cas;
  544         char end;
  545     } data[];
  546     /* if it_flags & ITEM_CAS we have 8 bytes CAS */
  547     /* then null-terminated key */
  548     /* then " flags length\r\n" (no terminating null) */
  549     /* then data with terminating \r\n (no terminating null; it's binary!) */
  550 } item;
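/* Illustrative layout of an item that has both ITEM_CAS and ITEM_CFLAGS set,
 * as implied by the ITEM_key/ITEM_suffix/ITEM_data macros above, which walk
 * the variable-length tail starting at data[]:
 *
 *   [struct _stritem header]
 *   [8-byte cas]            <- present only when it_flags & ITEM_CAS
 *   [key bytes][NUL]        <- nkey + 1 bytes, ITEM_key() points here
 *   [4-byte client flags]   <- present only when it_flags & ITEM_CFLAGS
 *   [value bytes ending \r\n] <- nbytes total, ITEM_data() points here
 */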
  551 
  552 // TODO: If we eventually want user loaded modules, we can't use an enum :(
  553 enum crawler_run_type {
  554     CRAWLER_AUTOEXPIRE=0, CRAWLER_EXPIRED, CRAWLER_METADUMP
  555 };
  556 
  557 typedef struct {
  558     struct _stritem *next;
  559     struct _stritem *prev;
  560     struct _stritem *h_next;    /* hash chain next */
  561     rel_time_t      time;       /* least recent access */
  562     rel_time_t      exptime;    /* expire time */
  563     int             nbytes;     /* size of data */
  564     unsigned short  refcount;
  565     uint16_t        it_flags;   /* ITEM_* above */
  566     uint8_t         slabs_clsid;/* which slab class we're in */
  567     uint8_t         nkey;       /* key length, w/terminating null and padding */
  568     uint32_t        remaining;  /* Max keys to crawl per slab per invocation */
  569     uint64_t        reclaimed;  /* items reclaimed during this crawl. */
  570     uint64_t        unfetched;  /* items reclaimed unfetched during this crawl. */
  571     uint64_t        checked;    /* items examined during this crawl. */
  572 } crawler;
  573 
  574 /* Header when an item is actually a chunk of another item. */
  575 typedef struct _strchunk {
  576     struct _strchunk *next;     /* points within its own chain. */
  577     struct _strchunk *prev;     /* can potentially point to the head. */
  578     struct _stritem  *head;     /* always points to the owner chunk */
  579     int              size;      /* available chunk space in bytes */
  580     int              used;      /* chunk space used */
  581     int              nbytes;    /* used. */
  582     unsigned short   refcount;  /* used? */
  583     uint16_t         it_flags;  /* ITEM_* above. */
  584     uint8_t          slabs_clsid; /* Same as above. */
  585     uint8_t          orig_clsid; /* For obj hdr chunks slabs_clsid is fake. */
  586     char data[];
  587 } item_chunk;
  588 
  589 #ifdef NEED_ALIGN
  590 static inline char *ITEM_schunk(item *it) {
  591     int offset = it->nkey + 1
  592         + ((it->it_flags & ITEM_CFLAGS) ? sizeof(uint32_t) : 0)
  593         + ((it->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0);
  594     int remain = offset % 8;
  595     if (remain != 0) {
  596         offset += 8 - remain;
  597     }
  598     return ((char *) &(it->data)) + offset;
  599 }
  600 #else
  601 #define ITEM_schunk(item) ((char*) &((item)->data) + (item)->nkey + 1 \
  602          + (((item)->it_flags & ITEM_CFLAGS) ? sizeof(uint32_t) : 0) \
  603          + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
  604 #endif
  605 
  606 #ifdef EXTSTORE
  607 typedef struct {
  608     unsigned int page_version; /* from IO header */
  609     unsigned int offset; /* from IO header */
  610     unsigned short page_id; /* from IO header */
  611 } item_hdr;
  612 #endif
  613 typedef struct _mc_resp_bundle mc_resp_bundle;
  614 typedef struct {
  615     pthread_t thread_id;        /* unique ID of this thread */
  616     struct event_base *base;    /* libevent handle this thread uses */
  617     struct event notify_event;  /* listen event for notify pipe */
  618     int notify_receive_fd;      /* receiving end of notify pipe */
  619     int notify_send_fd;         /* sending end of notify pipe */
  620     struct thread_stats stats;  /* Stats generated by this thread */
  621     struct conn_queue *new_conn_queue; /* queue of new connections to handle */
  622     cache_t *rbuf_cache;        /* static-sized read buffers */
  623     mc_resp_bundle *open_bundle;
  624     cache_t *io_cache;          /* IO objects */
  625 #ifdef EXTSTORE
  626     void *storage;              /* data object for storage system */
  627 #endif
  628     logger *l;                  /* logger buffer */
  629     void *lru_bump_buf;         /* async LRU bump buffer */
  630 #ifdef TLS
  631     char   *ssl_wbuf;
  632 #endif
  633     int napi_id;                /* napi id associated with this thread */
  634 
  635 } LIBEVENT_THREAD;
  636 
  637 /**
  638  * Response objects
  639  */
  640 typedef struct _io_pending_t io_pending_t;
  641 #define MC_RESP_IOVCOUNT 4
  642 typedef struct _mc_resp {
  643     mc_resp_bundle *bundle; // ptr back to bundle
  644     struct _mc_resp *next; // choo choo.
  645     int wbytes; // bytes to write out of wbuf: might be able to nuke this.
  646     int tosend; // total bytes to send for this response
  647     void *write_and_free; /** free this memory after finishing writing */
  648     io_pending_t *io_pending; /* pending IO descriptor for this response */
  649 
  650     item *item; /* item associated with this response object, with reference held */
  651     struct iovec iov[MC_RESP_IOVCOUNT]; /* built-in iovecs to simplify network code */
  652     int chunked_total; /* total amount of chunked item data to send. */
  653     uint8_t iovcnt;
  654     uint8_t chunked_data_iov; /* this iov is a pointer to chunked data header */
  655 
  656     /* instruct transmit to skip this response object. used by storage engines
  657      * to asynchronously kill an object that was queued to write
  658      */
  659     bool skip;
  660     bool free; // double free detection.
  661     // UDP bits. Copied in from the client.
  662     uint16_t    request_id; /* Incoming UDP request ID, if this is a UDP "connection" */
  663     uint16_t    udp_sequence; /* packet counter when transmitting result */
  664     uint16_t    udp_total; /* total number of packets in sequence */
  665     struct sockaddr_in6 request_addr; /* udp: Who sent this request */
  666     socklen_t request_addr_size;
  667 
  668     char wbuf[WRITE_BUFFER_SIZE];
  669 } mc_resp;
  670 
  671 #define MAX_RESP_PER_BUNDLE ((READ_BUFFER_SIZE - sizeof(mc_resp_bundle)) / sizeof(mc_resp))
  672 struct _mc_resp_bundle {
  673     uint8_t refcount;
  674     uint8_t next_check; // next object to check on assignment.
  675     struct _mc_resp_bundle *next;
  676     struct _mc_resp_bundle *prev;
  677     mc_resp r[];
  678 };
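/* Worked example for MAX_RESP_PER_BUNDLE above: a bundle is sized to fit in a
 * READ_BUFFER_SIZE (16384 byte) allocation, so the number of mc_resp slots is
 * (16384 - sizeof(struct _mc_resp_bundle)) / sizeof(mc_resp). Since each
 * mc_resp embeds a WRITE_BUFFER_SIZE (1024 byte) wbuf plus bookkeeping, a
 * bundle holds roughly a dozen response objects; the exact count depends on
 * struct padding on the target platform. */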
  679 
  680 typedef struct conn conn;
  681 
  682 #define IO_QUEUE_NONE 0
  683 #define IO_QUEUE_EXTSTORE 1
  684 
  685 typedef void (*io_queue_stack_cb)(void *ctx, void *stack);
  686 typedef void (*io_queue_cb)(io_pending_t *pending);
  687 // this structure's ownership gets passed between threads:
  688 // - owned normally by the worker thread.
  689 // - multiple queues can be submitted at the same time.
  690 // - each queue can be sent to different background threads.
  691 // - each submitted queue needs to know when to return to the worker.
  692 // - the worker needs to know when all queues have returned so it can process.
  693 //
  694 // io_queue_t's count field is owned by worker until submitted. Then owned by
  695 // side thread until returned.
  696 // conn->io_queues_submitted is always owned by the worker thread. it is
  697 // incremented as the worker submits queues, and decremented as it gets pinged
  698 // for returned queues.
  699 //
  700 // All of this is to avoid having to hit a mutex owned by the connection
  701 // thread that gets pinged for each thread (or an equivalent atomic).
  702 typedef struct {
  703     void *ctx; // untouched ptr for specific context
  704     void *stack_ctx; // module-specific context to be batch-submitted
  705     io_queue_stack_cb submit_cb; // callback given a full stack of pending IO's at once.
  706     io_queue_stack_cb complete_cb;
  707     io_queue_cb finalize_cb; // called back on the worker thread.
  708     int type;
  709     int count; // ios to process before returning. only accessed by queue processor once submitted
  710 } io_queue_t;
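/* Illustrative sketch (not upstream code; storage_ctx and the callback names
 * are placeholders) of the ownership flow described above, seen from the
 * worker thread's side:
 *
 *   // during connection setup, register a queue type with its callbacks
 *   conn_io_queue_add(c, IO_QUEUE_EXTSTORE, storage_ctx,
 *                     submit_cb, complete_cb, finalize_cb);
 *
 *   // while parsing commands, stack up deferred IOs for that queue
 *   io_queue_t *q = conn_io_queue_get(c, IO_QUEUE_EXTSTORE);
 *   // ... push io_pending_t objects onto q->stack_ctx and bump q->count ...
 *
 *   // submit: ownership of the stack (and q->count) moves to a side thread
 *   c->io_queues_submitted++;
 *   q->submit_cb(q->ctx, q->stack_ctx);
 *
 *   // when the side thread finishes it notifies the worker, which decrements
 *   // io_queues_submitted; once it reaches zero the connection resumes and
 *   // the complete/finalize callbacks run back on the worker.
 */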
  711 
  712 struct _io_pending_t {
  713     io_queue_t *q;
  714     conn *c;
  715     mc_resp *resp; // associated response object
  716     char data[120];
  717 };
  718 
  719 /**
  720  * The structure representing a connection into memcached.
  721  */
  722 struct conn {
  723     sasl_conn_t *sasl_conn;
  724     int    sfd;
  725     bool sasl_started;
  726     bool authenticated;
  727     bool set_stale;
  728     bool mset_res; /** uses mset format for return code */
  729     bool close_after_write; /** flush write then move to close connection */
  730     bool rbuf_malloced; /** read buffer was malloc'ed for ascii mget, needs free() */
  731 #ifdef TLS
  732     SSL    *ssl;
  733     char   *ssl_wbuf;
  734     bool ssl_enabled;
  735 #endif
  736     enum conn_states  state;
  737     enum bin_substates substate;
  738     rel_time_t last_cmd_time;
  739     struct event event;
  740     short  ev_flags;
  741     short  which;   /** which events were just triggered */
  742 
  743     char   *rbuf;   /** buffer to read commands into */
  744     char   *rcurr;  /** but if we parsed some already, this is where we stopped */
  745     int    rsize;   /** total allocated size of rbuf */
  746     int    rbytes;  /** how much data, starting from rcurr, do we have unparsed */
  747 
  748     mc_resp *resp; // tail response.
  749     mc_resp *resp_head; // first response in current stack.
  750     char   *ritem;  /** when we read in an item's value, it goes here */
  751     int    rlbytes;
  752 
  753     /**
  754      * item is used to hold an item structure created after reading the command
  755      * line of set/add/replace commands, but before we finished reading the actual
  756      * data. The data is read into ITEM_data(item) to avoid extra copying.
  757      */
  758 
  759     void   *item;     /* for commands set/add/replace  */
  760 
  761     /* data for the swallow state */
  762     int    sbytes;    /* how many bytes to swallow */
  763 
  764     int io_queues_submitted; /* see notes on io_queue_t */
  765     io_queue_t io_queues[3]; /* set of deferred IO queues. */
  766 #ifdef EXTSTORE
  767     unsigned int recache_counter;
  768 #endif
  769     enum protocol protocol;   /* which protocol this connection speaks */
  770     enum network_transport transport; /* what transport is used by this connection */
  771 
  772     /* data for UDP clients */
  773     int    request_id; /* Incoming UDP request ID, if this is a UDP "connection" */
  774     struct sockaddr_in6 request_addr; /* udp: Who sent the most recent request */
  775     socklen_t request_addr_size;
  776 
  777     bool   noreply;   /* True if the reply should not be sent. */
  778     /* current stats command */
  779     struct {
  780         char *buffer;
  781         size_t size;
  782         size_t offset;
  783     } stats;
  784 
  785     /* Binary protocol stuff */
  786     /* This is where the binary header goes */
  787     protocol_binary_request_header binary_header;
  788     uint64_t cas; /* the cas to return */
  789     short cmd; /* current command being processed */
  790     int opaque;
  791     int keylen;
  792     conn   *next;     /* Used for generating a list of conn structures */
  793     LIBEVENT_THREAD *thread; /* Pointer to the thread object serving this connection */
  794     int (*try_read_command)(conn *c); /* pointer for top level input parser */
  795     ssize_t (*read)(conn  *c, void *buf, size_t count);
  796     ssize_t (*sendmsg)(conn *c, struct msghdr *msg, int flags);
  797     ssize_t (*write)(conn *c, void *buf, size_t count);
  798 };
  799 
  800 /* array of conn structures, indexed by file descriptor */
  801 extern conn **conns;
  802 
  803 /* current time of day (updated periodically) */
  804 extern volatile rel_time_t current_time;
  805 
  806 /* TODO: Move to slabs.h? */
  807 extern volatile int slab_rebalance_signal;
  808 
  809 struct slab_rebalance {
  810     void *slab_start;
  811     void *slab_end;
  812     void *slab_pos;
  813     int s_clsid;
  814     int d_clsid;
  815     uint32_t busy_items;
  816     uint32_t rescues;
  817     uint32_t evictions_nomem;
  818     uint32_t inline_reclaim;
  819     uint32_t chunk_rescues;
  820     uint32_t busy_deletes;
  821     uint32_t busy_loops;
  822     uint8_t done;
  823     uint8_t *completed;
  824 };
  825 
  826 extern struct slab_rebalance slab_rebal;
  827 #ifdef EXTSTORE
  828 extern void *ext_storage;
  829 #endif
  830 /*
  831  * Functions
  832  */
  833 void do_accept_new_conns(const bool do_accept);
  834 enum delta_result_type do_add_delta(conn *c, const char *key,
  835                                     const size_t nkey, const bool incr,
  836                                     const int64_t delta, char *buf,
  837                                     uint64_t *cas, const uint32_t hv,
  838                                     item **it_ret);
  839 enum store_item_type do_store_item(item *item, int comm, conn* c, const uint32_t hv);
  840 void conn_io_queue_add(conn *c, int type, void *ctx, io_queue_stack_cb cb, io_queue_stack_cb com_cb, io_queue_cb fin_cb);
  841 io_queue_t *conn_io_queue_get(conn *c, int type);
  842 conn *conn_new(const int sfd, const enum conn_states init_state, const int event_flags, const int read_buffer_size,
  843     enum network_transport transport, struct event_base *base, void *ssl);
  844 
  845 void conn_worker_readd(conn *c);
  846 extern int daemonize(int nochdir, int noclose);
  847 
  848 #define mutex_lock(x) pthread_mutex_lock(x)
  849 #define mutex_unlock(x) pthread_mutex_unlock(x)
  850 
  851 #include "stats_prefix.h"
  852 #include "slabs.h"
  853 #include "assoc.h"
  854 #include "items.h"
  855 #include "crawler.h"
  856 #include "trace.h"
  857 #include "hash.h"
  858 #include "util.h"
  859 
  860 /*
  861  * Functions such as the libevent-related calls that need to do cross-thread
  862  * communication in multithreaded mode (rather than actually doing the work
  863  * in the current thread) are called via "dispatch_" frontends, which are
  864  * also #define-d to directly call the underlying code in singlethreaded mode.
  865  */
  866 void memcached_thread_init(int nthreads, void *arg);
  867 void redispatch_conn(conn *c);
  868 void dispatch_conn_new(int sfd, enum conn_states init_state, int event_flags, int read_buffer_size,
  869     enum network_transport transport, void *ssl);
  870 void sidethread_conn_close(conn *c);
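/* Illustrative sketch (not upstream code) of the cross-thread dispatch
 * described above: the listening thread accepts a socket and hands it off with
 * dispatch_conn_new(), which queues it on a worker's new_conn_queue and wakes
 * that worker through its notify pipe; the worker then builds the conn with
 * conn_new() on its own event_base.
 *
 *   int sfd = accept(listen_fd, NULL, NULL);
 *   if (sfd >= 0) {
 *       dispatch_conn_new(sfd, conn_new_cmd, EV_READ | EV_PERSIST,
 *                         READ_BUFFER_CACHED, tcp_transport, NULL);
 *   }
 */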
  871 
  872 /* Lock wrappers for cache functions that are called from main loop. */
  873 enum delta_result_type add_delta(conn *c, const char *key,
  874                                  const size_t nkey, bool incr,
  875                                  const int64_t delta, char *buf,
  876                                  uint64_t *cas);
  877 void accept_new_conns(const bool do_accept);
  878 void  conn_close_idle(conn *c);
  879 void  conn_close_all(void);
  880 item *item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes);
  881 #define DO_UPDATE true
  882 #define DONT_UPDATE false
  883 item *item_get(const char *key, const size_t nkey, conn *c, const bool do_update);
  884 item *item_get_locked(const char *key, const size_t nkey, conn *c, const bool do_update, uint32_t *hv);
  885 item *item_touch(const char *key, const size_t nkey, uint32_t exptime, conn *c);
  886 int   item_link(item *it);
  887 void  item_remove(item *it);
  888 int   item_replace(item *it, item *new_it, const uint32_t hv);
  889 void  item_unlink(item *it);
  890 
  891 void item_lock(uint32_t hv);
  892 void *item_trylock(uint32_t hv);
  893 void item_trylock_unlock(void *arg);
  894 void item_unlock(uint32_t hv);
  895 void pause_threads(enum pause_thread_types type);
  896 void stop_threads(void);
  897 int stop_conn_timeout_thread(void);
  898 #define refcount_incr(it) ++(it->refcount)
  899 #define refcount_decr(it) --(it->refcount)
  900 void STATS_LOCK(void);
  901 void STATS_UNLOCK(void);
  902 #define THR_STATS_LOCK(c) pthread_mutex_lock(&c->thread->stats.mutex)
  903 #define THR_STATS_UNLOCK(c) pthread_mutex_unlock(&c->thread->stats.mutex)
  904 void threadlocal_stats_reset(void);
  905 void threadlocal_stats_aggregate(struct thread_stats *stats);
  906 void slab_stats_aggregate(struct thread_stats *stats, struct slab_stats *out);
  907 
  908 /* Stat processing functions */
  909 void append_stat(const char *name, ADD_STAT add_stats, conn *c,
  910                  const char *fmt, ...);
  911 
  912 enum store_item_type store_item(item *item, int comm, conn *c);
  913 
  914 /* Protocol related code */
  915 void out_string(conn *c, const char *str);
  916 #define REALTIME_MAXDELTA 60*60*24*30
  917 /* Negative exptimes can underflow and end up immortal. realtime() will
  918    immediately expire values that are greater than REALTIME_MAXDELTA, but less
  919    than process_started, so let's aim for that. */
  920 #define EXPTIME_TO_POSITIVE_TIME(exptime) (((exptime) < 0) ? \
  921         (REALTIME_MAXDELTA + 1) : (exptime))
  922 rel_time_t realtime(const time_t exptime);
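/* Worked example for the expiration handling above: client exptimes up to
 * REALTIME_MAXDELTA (30 days) are treated as offsets from the current time,
 * while larger values are treated as absolute unix timestamps; a negative
 * exptime is mapped by EXPTIME_TO_POSITIVE_TIME() to REALTIME_MAXDELTA + 1,
 * which realtime() then interprets as an absolute time before process_started,
 * i.e. the item is already expired. A typical call site looks like:
 *
 *   rel_time_t when = realtime(EXPTIME_TO_POSITIVE_TIME(exptime));
 */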
  923 item* limited_get(char *key, size_t nkey, conn *c, uint32_t exptime, bool should_touch, bool do_update, bool *overflow);
  924 item* limited_get_locked(char *key, size_t nkey, conn *c, bool do_update, uint32_t *hv, bool *overflow);
  925 // Read/Response object handlers.
  926 void resp_reset(mc_resp *resp);
  927 void resp_add_iov(mc_resp *resp, const void *buf, int len);
  928 void resp_add_chunked_iov(mc_resp *resp, const void *buf, int len);
  929 bool resp_start(conn *c);
  930 mc_resp* resp_finish(conn *c, mc_resp *resp);
  931 bool resp_has_stack(conn *c);
  932 bool rbuf_switch_to_malloc(conn *c);
  933 void conn_release_items(conn *c);
  934 void conn_set_state(conn *c, enum conn_states state);
  935 void out_of_memory(conn *c, char *ascii_error);
  936 void out_errstring(conn *c, const char *str);
  937 void write_and_free(conn *c, char *buf, int bytes);
  938 void server_stats(ADD_STAT add_stats, conn *c);
  939 void append_stats(const char *key, const uint16_t klen,
  940                   const char *val, const uint32_t vlen,
  941                   const void *cookie);
  942 /** Return a datum for stats in binary protocol */
  943 bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c);
  944 void stats_reset(void);
  945 void process_stat_settings(ADD_STAT add_stats, void *c);
  946 void process_stats_conns(ADD_STAT add_stats, void *c);
  947 
  948 #if HAVE_DROP_PRIVILEGES
  949 extern void setup_privilege_violations_handler(void);
  950 extern void drop_privileges(void);
  951 #else
  952 #define setup_privilege_violations_handler()
  953 #define drop_privileges()
  954 #endif
  955 
  956 #if HAVE_DROP_WORKER_PRIVILEGES
  957 extern void drop_worker_privileges(void);
  958 #else
  959 #define drop_worker_privileges()
  960 #endif
  961 
  962 /* If supported, give compiler hints for branch prediction. */
  963 #if !defined(__GNUC__) || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
  964 #define __builtin_expect(x, expected_value) (x)
  965 #endif
  966 
  967 #define likely(x)       __builtin_expect((x),1)
  968 #define unlikely(x)     __builtin_expect((x),0)
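/* Illustrative usage of the hints above: annotate branches that are almost
 * never taken (e.g. error paths) so the compiler lays out the hot path first.
 *
 *   item *it = item_alloc(key, nkey, flags, exptime, vlen);
 *   if (unlikely(it == NULL)) {
 *       out_of_memory(c, "SERVER_ERROR out of memory storing object");
 *   }
 */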