"Fossies" - the Fresh Open Source Software Archive

Member "memcached-1.6.15/memcached.h" (30 Mar 2022, 38733 Bytes) of package /linux/www/memcached-1.6.15.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "memcached.h" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.6.14_vs_1.6.15.

    1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
    2 
    3 /** \file
    4  * The main memcached header holding commonly used data
    5  * structures and function prototypes.
    6  */
    7 
    8 #ifdef HAVE_CONFIG_H
    9 #include "config.h"
   10 #endif
   11 
   12 #include <sys/types.h>
   13 #include <sys/socket.h>
   14 #include <sys/time.h>
   15 #include <netinet/in.h>
   16 #include <event.h>
   17 #include <netdb.h>
   18 #include <pthread.h>
   19 #include <unistd.h>
   20 #include <assert.h>
   21 #include <grp.h>
   22 #include <signal.h>
   23 /* need this to get IOV_MAX on some platforms. */
   24 #ifndef __need_IOV_MAX
   25 #define __need_IOV_MAX
   26 #endif
   27 #include <limits.h>
   28 /* FreeBSD 4.x doesn't have IOV_MAX exposed. */
   29 #ifndef IOV_MAX
   30 #if defined(__FreeBSD__) || defined(__APPLE__) || defined(__GNU__)
   31 # define IOV_MAX 1024
   32 /* GNU/Hurd don't set MAXPATHLEN
   33  * http://www.gnu.org/software/hurd/hurd/porting/guidelines.html#PATH_MAX_tt_MAX_PATH_tt_MAXPATHL */
   34 #ifndef MAXPATHLEN
   35 #define MAXPATHLEN 4096
   36 #endif
   37 #endif
   38 #endif
   39 
   40 #include "itoa_ljust.h"
   41 #include "protocol_binary.h"
   42 #include "cache.h"
   43 #include "logger.h"
   44 
   45 #ifdef EXTSTORE
   46 #include "crc32c.h"
   47 #endif
   48 
   49 #include "sasl_defs.h"
   50 #ifdef TLS
   51 #include <openssl/ssl.h>
   52 #endif
   53 
/* for NAPI pinning feature */
#ifndef SO_INCOMING_NAPI_ID
/* Fallback when libc headers are too old to define it.
 * NOTE(review): assumed to match the kernel's value — confirm against
 * the target platform's <asm/socket.h>. */
#define SO_INCOMING_NAPI_ID 56
#endif

/** Maximum length of a key. */
#define KEY_MAX_LENGTH 250

/** Maximum length of a uri encoded key.
 *  Worst case each key byte expands to "%XX" (3 chars), plus NUL. */
#define KEY_MAX_URI_ENCODED_LENGTH (KEY_MAX_LENGTH  * 3 + 1)

/** Size of an incr buf. */
#define INCR_MAX_STORAGE_LEN 24

/* Also the size of mc_resp's built-in wbuf (see struct _mc_resp below). */
#define WRITE_BUFFER_SIZE 1024
/* Also sizes a response-bundle allocation (see MAX_RESP_PER_BUNDLE below). */
#define READ_BUFFER_SIZE 16384
#define READ_BUFFER_CACHED 0
#define UDP_READ_BUFFER_SIZE 65536
#define UDP_MAX_PAYLOAD_SIZE 1400
#define UDP_HEADER_SIZE 8
#define UDP_DATA_SIZE 1392 // UDP_MAX_PAYLOAD_SIZE - UDP_HEADER_SIZE
#define MAX_SENDBUF_SIZE (256 * 1024 * 1024)

/* Binary protocol stuff */
#define BIN_MAX_EXTLEN 20 // length of the _incr command is currently the longest.

/* Initial power multiplier for the hash table */
#define HASHPOWER_DEFAULT 16
#define HASHPOWER_MAX 32

/*
 * We only reposition items in the LRU queue if they haven't been repositioned
 * in this many seconds. That saves us from churning on frequently-accessed
 * items.
 */
#define ITEM_UPDATE_INTERVAL 60
   90 
   91 /*
   92  * Valid range of the maximum size of an item, in bytes.
   93  */
   94 #define ITEM_SIZE_MAX_LOWER_LIMIT 1024
   95 #define ITEM_SIZE_MAX_UPPER_LIMIT 1024 * 1024 * 1024
   96 
   97 
/* unistd.h is here */
#if HAVE_UNISTD_H
# include <unistd.h>
#endif

/* Slab sizing definitions. */
#define POWER_SMALLEST 1
/* Also sizes the per-thread lru_hits[] array (see struct thread_stats). */
#define POWER_LARGEST  256 /* actual cap is 255 */
#define SLAB_GLOBAL_PAGE_POOL 0 /* magic slab class for storing pages for reassignment */
#define CHUNK_ALIGN_BYTES 8
/* slab class max is a 6-bit number, -1. */
#define MAX_NUMBER_OF_SLAB_CLASSES (63 + 1)

/** How long an object can reasonably be assumed to be locked before
    harvesting it on a low memory condition. Default: disabled. */
#define TAIL_REPAIR_TIME_DEFAULT 0
  114 
/* warning: don't use these macros with a function, as it evals its arg twice */
#define ITEM_get_cas(i) (((i)->it_flags & ITEM_CAS) ? \
        (i)->data->cas : (uint64_t)0)

/* Stores v into the CAS slot only when the item carries one; silent no-op
 * otherwise. Expands to a brace block, not do/while — call with a
 * trailing semicolon inside braces. */
#define ITEM_set_cas(i,v) { \
    if ((i)->it_flags & ITEM_CAS) { \
        (i)->data->cas = v; \
    } \
}

/* The accessors below decode the packed layout that follows item.data:
 * [optional 8-byte CAS][key + '\0'][optional 4-byte client flags][data].
 * See the layout comments at the end of struct _stritem. */
#define ITEM_key(item) (((char*)&((item)->data)) \
         + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))

#define ITEM_suffix(item) ((char*) &((item)->data) + (item)->nkey + 1 \
         + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))

#define ITEM_data(item) ((char*) &((item)->data) + (item)->nkey + 1 \
         + (((item)->it_flags & ITEM_CFLAGS) ? sizeof(uint32_t) : 0) \
         + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))

/* Total bytes this item occupies: header plus the packed payload above. */
#define ITEM_ntotal(item) (sizeof(struct _stritem) + (item)->nkey + 1 \
         + (item)->nbytes \
         + (((item)->it_flags & ITEM_CFLAGS) ? sizeof(uint32_t) : 0) \
         + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))

/* slabs_clsid packs the slab class id in the low 6 bits and an LRU id
 * in the top 2 bits. */
#define ITEM_clsid(item) ((item)->slabs_clsid & ~(3<<6))
#define ITEM_lruid(item) ((item)->slabs_clsid & (3<<6))
  142 
#define STAT_KEY_LEN 128
#define STAT_VAL_LEN 128

/** Append a simple stat with a stat name, value format and value.
    NOTE: expands to a full statement (trailing semicolon included) and
    assumes `add_stats` and `c` are in scope at the call site. */
#define APPEND_STAT(name, fmt, val) \
    append_stat(name, add_stats, c, fmt, val);

/** Append an indexed stat with a stat name (with format), value format
    and value.
    NOTE: expands to multiple statements and is NOT do/while-wrapped;
    never use it as the unbraced body of an if/else. Assumes key_str,
    val_str, klen, vlen, add_stats and c exist at the call site. */
#define APPEND_NUM_FMT_STAT(name_fmt, num, name, fmt, val)          \
    klen = snprintf(key_str, STAT_KEY_LEN, name_fmt, num, name);    \
    vlen = snprintf(val_str, STAT_VAL_LEN, fmt, val);               \
    add_stats(key_str, klen, val_str, vlen, c);

/** Common APPEND_NUM_FMT_STAT format. */
#define APPEND_NUM_STAT(num, name, fmt, val) \
    APPEND_NUM_FMT_STAT("%d:%s", num, name, fmt, val)

/** Item client flag conversion: assign the 4-byte client flags stored
    after the key into `flag` when ITEM_CFLAGS is set, else 0.
    Expands to a brace block (not do/while) — keep it braced at call sites. */
#define FLAGS_CONV(it, flag) { \
    if ((it)->it_flags & ITEM_CFLAGS) { \
        flag = *((uint32_t *)ITEM_suffix((it))); \
    } else { \
        flag = 0; \
    } \
}

/* Bytes the optional client-flags field occupies for this item (4 or 0). */
#define FLAGS_SIZE(item) (((item)->it_flags & ITEM_CFLAGS) ? sizeof(uint32_t) : 0)
  171 
/**
 * Callback for any function producing stats.
 *
 * @param key the stat's key
 * @param klen length of the key
 * @param val the stat's value in an ascii form (e.g. text form of a number)
 * @param vlen length of the value
 * @param cookie magic callback cookie
 */
typedef void (*ADD_STAT)(const char *key, const uint16_t klen,
                         const char *val, const uint32_t vlen,
                         const void *cookie);
  184 
/*
 * NOTE: If you modify this table you _MUST_ update the function state_text
 */
/**
 * Possible states of a connection.
 */
enum conn_states {
    conn_listening,  /**< the socket which listens for connections */
    conn_new_cmd,    /**< Prepare connection for next command */
    conn_waiting,    /**< waiting for a readable socket */
    conn_read,       /**< reading in a command line */
    conn_parse_cmd,  /**< try to parse a command from the input buffer */
    conn_write,      /**< writing out a simple response */
    conn_nread,      /**< reading in a fixed number of bytes */
    conn_swallow,    /**< swallowing unnecessary bytes w/o storing */
    conn_closing,    /**< closing this connection */
    conn_mwrite,     /**< writing out many items sequentially */
    conn_closed,     /**< connection is closed */
    conn_watch,      /**< held by the logger thread as a watcher */
    conn_io_queue,   /**< wait on async. process to get response object */
    conn_max_state   /**< Max state value (used for assertion) */
};

/**
 * Sub-states while handling binary-protocol commands. Names indicate
 * which portion of a command is currently being read (presumably driven
 * by the binary protocol parser — confirm in the .c implementation).
 */
enum bin_substates {
    bin_no_state,
    bin_reading_set_header,
    bin_reading_cas_header,
    bin_read_set_value,
    bin_reading_get_key,
    bin_reading_stat,
    bin_reading_del_header,
    bin_reading_incr_header,
    bin_read_flush_exptime,
    bin_reading_sasl_auth,
    bin_reading_sasl_auth_data,
    bin_reading_touch_key,
};
  222 
  223 enum protocol {
  224     ascii_prot = 3, /* arbitrary value. */
  225     binary_prot,
  226     negotiating_prot, /* Discovering the protocol */
  227 #ifdef PROXY
  228     proxy_prot,
  229 #endif
  230 };
  231 
  232 enum network_transport {
  233     local_transport, /* Unix sockets*/
  234     tcp_transport,
  235     udp_transport
  236 };
  237 
  238 enum pause_thread_types {
  239     PAUSE_WORKER_THREADS = 0,
  240     PAUSE_ALL_THREADS,
  241     RESUME_ALL_THREADS,
  242     RESUME_WORKER_THREADS
  243 };
  244 
  245 enum stop_reasons {
  246     NOT_STOP,
  247     GRACE_STOP,
  248     EXIT_NORMALLY
  249 };
  250 
  251 enum close_reasons {
  252     ERROR_CLOSE,
  253     NORMAL_CLOSE,
  254     IDLE_TIMEOUT_CLOSE,
  255     SHUTDOWN_CLOSE,
  256 };
  257 
  258 #define IS_TCP(x) (x == tcp_transport)
  259 #define IS_UDP(x) (x == udp_transport)
  260 
  261 #define NREAD_ADD 1
  262 #define NREAD_SET 2
  263 #define NREAD_REPLACE 3
  264 #define NREAD_APPEND 4
  265 #define NREAD_PREPEND 5
  266 #define NREAD_CAS 6
  267 
  268 enum store_item_type {
  269     NOT_STORED=0, STORED, EXISTS, NOT_FOUND, TOO_LARGE, NO_MEMORY
  270 };
  271 
  272 enum delta_result_type {
  273     OK, NON_NUMERIC, EOM, DELTA_ITEM_NOT_FOUND, DELTA_ITEM_CAS_MISMATCH
  274 };
  275 
/** Time relative to server start. Smaller than time_t on 64-bit systems. */
// TODO: Move to sub-header. needed in logger.h
//typedef unsigned int rel_time_t;

/** Use X macros to avoid iterating over the stats fields during reset and
 * aggregation. No longer have to add new stats in 3+ places.
 */

/* Each X(name) below expands via a locally-defined X macro; here it
 * becomes a uint64_t field. Other sites redefine X to reset/aggregate. */
#define SLAB_STATS_FIELDS \
    X(set_cmds) \
    X(get_hits) \
    X(touch_hits) \
    X(delete_hits) \
    X(cas_hits) \
    X(cas_badval) \
    X(incr_hits) \
    X(decr_hits)

/** Stats stored per slab (and per thread). */
struct slab_stats {
#define X(name) uint64_t    name;
    SLAB_STATS_FIELDS
#undef X
};
  300 
/* Per-worker-thread counters, expanded through the X macro like
 * SLAB_STATS_FIELDS above. */
#define THREAD_STATS_FIELDS \
    X(get_cmds) \
    X(get_misses) \
    X(get_expired) \
    X(get_flushed) \
    X(touch_cmds) \
    X(touch_misses) \
    X(delete_misses) \
    X(incr_misses) \
    X(decr_misses) \
    X(cas_misses) \
    X(meta_cmds) \
    X(bytes_read) \
    X(bytes_written) \
    X(flush_cmds) \
    X(conn_yields) /* # of yields for connections (-R option)*/ \
    X(auth_cmds) \
    X(auth_errors) \
    X(idle_kicks) /* idle connections killed */ \
    X(response_obj_oom) \
    X(response_obj_count) \
    X(response_obj_bytes) \
    X(read_buf_oom) \
    X(store_too_large) \
    X(store_no_memory)

#ifdef EXTSTORE
#define EXTSTORE_THREAD_STATS_FIELDS \
    X(get_extstore) \
    X(get_aborted_extstore) \
    X(get_oom_extstore) \
    X(recache_from_extstore) \
    X(miss_from_extstore) \
    X(badcrc_from_extstore)
#endif

#ifdef PROXY
#define PROXY_THREAD_STATS_FIELDS \
    X(proxy_conn_requests) \
    X(proxy_conn_errors) \
    X(proxy_conn_oom) \
    X(proxy_req_active)
#endif

/**
 * Stats stored per-thread.
 */
struct thread_stats {
    pthread_mutex_t   mutex;
#define X(name) uint64_t    name;
    THREAD_STATS_FIELDS
#ifdef EXTSTORE
    EXTSTORE_THREAD_STATS_FIELDS
#endif
#ifdef PROXY
    PROXY_THREAD_STATS_FIELDS
#endif
#undef X
    struct slab_stats slab_stats[MAX_NUMBER_OF_SLAB_CLASSES];
    uint64_t lru_hits[POWER_LARGEST];
    uint64_t read_buf_count;
    uint64_t read_buf_bytes;
    uint64_t read_buf_bytes_free;
};
  365 
/**
 * Global stats. Only resettable stats should go into this structure.
 */
struct stats {
    uint64_t      total_items;      /* items stored since startup (resettable) */
    uint64_t      total_conns;      /* connections accepted since startup */
    uint64_t      rejected_conns;   /* connections refused (e.g. at maxconns) */
    uint64_t      malloc_fails;     /* failed memory allocations */
    uint64_t      listen_disabled_num;
    uint64_t      slabs_moved;       /* times slabs were moved around */
    uint64_t      slab_reassign_rescues; /* items rescued during slab move */
    uint64_t      slab_reassign_evictions_nomem; /* valid items lost during slab move */
    uint64_t      slab_reassign_inline_reclaim; /* valid items lost during slab move */
    uint64_t      slab_reassign_chunk_rescues; /* chunked-item chunks recovered */
    uint64_t      slab_reassign_busy_items; /* valid temporarily unmovable */
    uint64_t      slab_reassign_busy_deletes; /* refcounted items killed */
    uint64_t      lru_crawler_starts; /* Number of item crawlers kicked off */
    uint64_t      lru_maintainer_juggles; /* number of LRU bg pokes */
    uint64_t      time_in_listen_disabled_us;  /* elapsed time in microseconds while server unable to process new connections */
    uint64_t      log_worker_dropped; /* logs dropped by worker threads */
    uint64_t      log_worker_written; /* logs written by worker threads */
    uint64_t      log_watcher_skipped; /* logs watchers missed */
    uint64_t      log_watcher_sent; /* logs sent to watcher buffers */
#ifdef EXTSTORE
    uint64_t      extstore_compact_lost; /* items lost because they were locked */
    uint64_t      extstore_compact_rescues; /* items re-written during compaction */
    uint64_t      extstore_compact_skipped; /* unhit items skipped during compaction */
#endif
#ifdef TLS
    uint64_t      ssl_handshake_errors; /* TLS failures at accept/handshake time */
    uint64_t      ssl_new_sessions; /* successfully negotiated new (non-reused) TLS sessions */
#endif
    struct timeval maxconns_entered;  /* last time maxconns entered */
    uint64_t      unexpected_napi_ids;  /* see doc/napi_ids.txt */
    uint64_t      round_robin_fallback; /* see doc/napi_ids.txt */
};
  402 
/**
 * Global "state" stats. Reflects state that shouldn't be wiped ever.
 * Ordered for some cache line locality for commonly updated counters.
 */
struct stats_state {
    uint64_t      curr_items;       /* items currently stored */
    uint64_t      curr_bytes;       /* bytes currently used for item storage */
    uint64_t      curr_conns;       /* currently open connections */
    uint64_t      hash_bytes;       /* size used for hash tables */
    unsigned int  conn_structs;
    unsigned int  reserved_fds;
    unsigned int  hash_power_level; /* Better hope it's not over 9000 */
    unsigned int  log_watchers; /* number of currently active watchers */
    bool          hash_is_expanding; /* If the hash table is being expanded */
    bool          accepting_conns;  /* whether we are currently accepting */
    bool          slab_reassign_running; /* slab reassign in progress */
    bool          lru_crawler_running; /* crawl in progress */
};
  421 
#define MAX_VERBOSITY_LEVEL 2

/* When adding a setting, be sure to update process_stat_settings */
/**
 * Globally accessible settings as derived from the commandline.
 */
struct settings {
    size_t maxbytes;        /* maximum bytes to use for item storage */
    int maxconns;           /* maximum simultaneous client connections */
    int port;               /* TCP listen port */
    int udpport;            /* UDP listen port */
    char *inter;            /* interface(s)/address(es) to listen on */
    int verbose;            /* verbosity level, up to MAX_VERBOSITY_LEVEL */
    rel_time_t oldest_live; /* ignore existing items older than this */
    uint64_t oldest_cas; /* ignore existing items with CAS values lower than this */
    int evict_to_free;
    char *socketpath;   /* path to unix socket if using local socket */
    char *auth_file;    /* path to user authentication file */
    int access;  /* access mask (a la chmod) for unix domain socket */
    double factor;          /* chunk size growth factor */
    int chunk_size;
    int num_threads;        /* number of worker (without dispatcher) libevent threads to run */
    int num_threads_per_udp; /* number of worker threads serving each udp socket */
    char prefix_delimiter;  /* character that marks a key prefix (for stats) */
    int detail_enabled;     /* nonzero if we're collecting detailed stats */
    int reqs_per_event;     /* Maximum number of io to process on each
                               io-event. */
    bool use_cas;           /* whether items carry CAS values (ITEM_CAS) */
    enum protocol binding_protocol;
    int backlog;            /* listen() backlog */
    int item_size_max;        /* Maximum item size */
    int slab_chunk_size_max;  /* Upper end for chunks within slab pages. */
    int slab_page_size;     /* Slab's page units. */
    volatile sig_atomic_t sig_hup;  /* a HUP signal was received but not yet handled */
    bool sasl;              /* SASL on/off */
    bool maxconns_fast;     /* Whether or not to early close connections */
    bool lru_crawler;        /* Whether or not to enable the autocrawler thread */
    bool lru_maintainer_thread; /* LRU maintainer background thread */
    bool lru_segmented;     /* Use split or flat LRU's */
    bool slab_reassign;     /* Whether or not slab reassignment is allowed */
    int slab_automove;     /* Whether or not to automatically move slabs */
    double slab_automove_ratio; /* youngest must be within pct of oldest */
    unsigned int slab_automove_window; /* window mover for algorithm */
    int hashpower_init;     /* Starting hash power level */
    bool shutdown_command; /* allow shutdown command */
    int tail_repair_time;   /* LRU tail refcount leak repair time */
    bool flush_enabled;     /* flush_all enabled */
    bool dump_enabled;      /* whether cachedump/metadump commands work */
    char *hash_algorithm;     /* Hash algorithm in use */
    int lru_crawler_sleep;  /* Microsecond sleep between items */
    uint32_t lru_crawler_tocrawl; /* Number of items to crawl per run */
    int hot_lru_pct; /* percentage of slab space for HOT_LRU */
    int warm_lru_pct; /* percentage of slab space for WARM_LRU */
    double hot_max_factor; /* HOT tail age relative to COLD tail */
    double warm_max_factor; /* WARM tail age relative to COLD tail */
    int crawls_persleep; /* Number of LRU crawls to run before sleeping */
    bool temp_lru; /* TTL < temporary_ttl uses TEMP_LRU */
    uint32_t temporary_ttl; /* temporary LRU threshold */
    int idle_timeout;       /* Number of seconds to let connections idle */
    unsigned int logger_watcher_buf_size; /* size of logger's per-watcher buffer */
    unsigned int logger_buf_size; /* size of per-thread logger buffer */
    unsigned int read_buf_mem_limit; /* total megabytes allowable for net buffers */
    bool drop_privileges;   /* Whether or not to drop unnecessary process privileges */
    bool watch_enabled; /* allows watch commands to be dropped */
    bool relaxed_privileges;   /* Relax process restrictions when running testapp */
    bool meta_response_old; /* use "OK" instead of "HD". for response code TEMPORARY! */
#ifdef EXTSTORE
    unsigned int ext_io_threadcount; /* number of IO threads to run. */
    unsigned int ext_page_size; /* size in megabytes of storage pages. */
    unsigned int ext_item_size; /* minimum size of items to store externally */
    unsigned int ext_item_age; /* max age of tail item before storing ext. */
    unsigned int ext_low_ttl; /* remaining TTL below this uses own pages */
    unsigned int ext_recache_rate; /* counter++ % recache_rate == 0 > recache */
    unsigned int ext_wbuf_size; /* read only note for the engine */
    unsigned int ext_compact_under; /* when fewer than this many pages, compact */
    unsigned int ext_drop_under; /* when fewer than this many pages, drop COLD items */
    unsigned int ext_max_sleep; /* maximum sleep time for extstore bg threads, in us */
    double ext_max_frag; /* ideal maximum page fragmentation */
    double slab_automove_freeratio; /* % of memory to hold free as buffer */
    bool ext_drop_unread; /* skip unread items during compaction */
    /* per-slab-class free chunk limit */
    unsigned int ext_free_memchunks[MAX_NUMBER_OF_SLAB_CLASSES];
#endif
#ifdef TLS
    bool ssl_enabled; /* indicates whether SSL is enabled */
    SSL_CTX *ssl_ctx; /* holds the SSL server context which has the server certificate */
    char *ssl_chain_cert; /* path to the server SSL chain certificate */
    char *ssl_key; /* path to the server key */
    int ssl_verify_mode; /* client certificate verify mode */
    int ssl_keyformat; /* key format , default is PEM */
    char *ssl_ciphers; /* list of SSL ciphers */
    char *ssl_ca_cert; /* certificate with CAs. */
    rel_time_t ssl_last_cert_refresh_time; /* time of the last server certificate refresh */
    unsigned int ssl_wbuf_size; /* size of the write buffer used by ssl_sendmsg method */
    bool ssl_session_cache; /* enable SSL server session caching */
    int ssl_min_version; /* minimum SSL protocol version to accept */
#endif
    int num_napi_ids;   /* maximum number of NAPI IDs */
    char *memory_file;  /* warm restart memory file path */
#ifdef PROXY
    bool proxy_enabled;
    bool proxy_uring; /* if the proxy should use io_uring */
    char *proxy_startfile; /* lua file to run when workers start */
    void *proxy_ctx; /* proxy's state context */
#endif
};
  528 
/* Globals defined in a .c translation unit elsewhere in the project. */
extern struct stats stats;
extern struct stats_state stats_state;
extern time_t process_started;
extern struct settings settings;

/* item.it_flags bit definitions (the "ITEM_*" referenced by struct _stritem). */
#define ITEM_LINKED 1
#define ITEM_CAS 2

/* temp */
#define ITEM_SLABBED 4

/* Item was fetched at least once in its lifetime */
#define ITEM_FETCHED 8
/* Appended on fetch, removed on LRU shuffling */
#define ITEM_ACTIVE 16
/* If an item's storage are chained chunks. */
#define ITEM_CHUNKED 32
#define ITEM_CHUNK 64
/* ITEM_data bulk is external to item */
#define ITEM_HDR 128
/* additional 4 bytes for item client flags */
#define ITEM_CFLAGS 256
/* item has sent out a token already */
#define ITEM_TOKEN_SENT 512
/* reserved, in case tokens should be a 2-bit count in future */
#define ITEM_TOKEN_RESERVED 1024
/* if item has been marked as a stale value */
#define ITEM_STALE 2048
/* if item key was sent in binary */
#define ITEM_KEY_BINARY 4096
  559 
/**
 * Structure for storing items within memcached.
 * The packed region following `data` is decoded by the ITEM_key /
 * ITEM_suffix / ITEM_data macros above — keep them in sync.
 */
typedef struct _stritem {
    /* Protected by LRU locks */
    struct _stritem *next;
    struct _stritem *prev;
    /* Rest are protected by an item lock */
    struct _stritem *h_next;    /* hash chain next */
    rel_time_t      time;       /* least recent access */
    rel_time_t      exptime;    /* expire time */
    int             nbytes;     /* size of data */
    unsigned short  refcount;
    uint16_t        it_flags;   /* ITEM_* above */
    uint8_t         slabs_clsid;/* which slab class we're in */
    uint8_t         nkey;       /* key length, w/terminating null and padding */
    /* this odd type prevents type-punning issues when we do
     * the little shuffle to save space when not using CAS. */
    union {
        uint64_t cas;
        char end;
    } data[];
    /* if it_flags & ITEM_CAS we have 8 bytes CAS */
    /* then null-terminated key */
    /* then " flags length\r\n" (no terminating null) */
    /* then data with terminating \r\n (no terminating null; it's binary!) */
} item;

// TODO: If we eventually want user loaded modules, we can't use an enum :(
enum crawler_run_type {
    CRAWLER_AUTOEXPIRE=0, CRAWLER_EXPIRED, CRAWLER_METADUMP
};
  592 
/* LRU crawler record. The leading fields mirror struct _stritem exactly,
 * presumably so a crawler can be linked into item chains — confirm
 * against the crawler implementation before relying on it. */
typedef struct {
    struct _stritem *next;
    struct _stritem *prev;
    struct _stritem *h_next;    /* hash chain next */
    rel_time_t      time;       /* least recent access */
    rel_time_t      exptime;    /* expire time */
    int             nbytes;     /* size of data */
    unsigned short  refcount;
    uint16_t        it_flags;   /* ITEM_* above */
    uint8_t         slabs_clsid;/* which slab class we're in */
    uint8_t         nkey;       /* key length, w/terminating null and padding */
    uint32_t        remaining;  /* Max keys to crawl per slab per invocation */
    uint64_t        reclaimed;  /* items reclaimed during this crawl. */
    uint64_t        unfetched;  /* items reclaimed unfetched during this crawl. */
    uint64_t        checked;    /* items examined during this crawl. */
} crawler;

/* Header when an item is actually a chunk of another item. */
typedef struct _strchunk {
    struct _strchunk *next;     /* points within its own chain. */
    struct _strchunk *prev;     /* can potentially point to the head. */
    struct _stritem  *head;     /* always points to the owner chunk */
    int              size;      /* available chunk space in bytes */
    int              used;      /* chunk space used */
    int              nbytes;    /* used. */
    unsigned short   refcount;  /* used? */
    uint16_t         it_flags;  /* ITEM_* above. */
    uint8_t          slabs_clsid; /* Same as above. */
    uint8_t          orig_clsid; /* For obj hdr chunks slabs_clsid is fake. */
    char data[];
} item_chunk;
  624 
  625 #ifdef NEED_ALIGN
  626 static inline char *ITEM_schunk(item *it) {
  627     int offset = it->nkey + 1
  628         + ((it->it_flags & ITEM_CFLAGS) ? sizeof(uint32_t) : 0)
  629         + ((it->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0);
  630     int remain = offset % 8;
  631     if (remain != 0) {
  632         offset += 8 - remain;
  633     }
  634     return ((char *) &(it->data)) + offset;
  635 }
  636 #else
  637 #define ITEM_schunk(item) ((char*) &((item)->data) + (item)->nkey + 1 \
  638          + (((item)->it_flags & ITEM_CFLAGS) ? sizeof(uint32_t) : 0) \
  639          + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
  640 #endif
  641 
#ifdef EXTSTORE
/* Locator for item data held in external storage; presumably stored in
 * the item body when ITEM_HDR is set ("ITEM_data bulk is external to
 * item") — confirm in the extstore integration code. */
typedef struct {
    unsigned int page_version; /* from IO header */
    unsigned int offset; /* from IO header */
    unsigned short page_id; /* from IO header */
} item_hdr;
#endif

/* Number of per-thread IO queue slots (see LIBEVENT_THREAD.io_queues). */
#define IO_QUEUE_COUNT 3

/* IO queue type identifiers. */
#define IO_QUEUE_NONE 0
#define IO_QUEUE_EXTSTORE 1
#define IO_QUEUE_PROXY 2

typedef struct _io_pending_t io_pending_t;
typedef struct io_queue_s io_queue_t;
typedef void (*io_queue_stack_cb)(io_queue_t *q);
typedef void (*io_queue_cb)(io_pending_t *pending);
// this structure's ownership gets passed between threads:
// - owned normally by the worker thread.
// - multiple queues can be submitted at the same time.
// - each queue can be sent to different background threads.
// - each submitted queue needs to know when to return to the worker.
// - the worker needs to know when all queues have returned so it can process.
//
// io_queue_t's count field is owned by worker until submitted. Then owned by
// side thread until returned.
// conn->io_queues_submitted is always owned by the worker thread. it is
// incremented as the worker submits queues, and decremented as it gets pinged
// for returned threads.
//
// All of this is to avoid having to hit a mutex owned by the connection
// thread that gets pinged for each thread (or an equivalent atomic).
struct io_queue_s {
    void *ctx; // duplicated from io_queue_cb_t
    void *stack_ctx; // module-specific context to be batch-submitted
    int count; // ios to process before returning. only accessed by queue processor once submitted
    int type; // duplicated from io_queue_cb_t
};

/* Per-queue-type callback table (registered per worker thread). */
typedef struct io_queue_cb_s {
    void *ctx; // untouched ptr for specific context
    io_queue_stack_cb submit_cb; // callback given a full stack of pending IO's at once.
    io_queue_stack_cb complete_cb;
    io_queue_cb return_cb; // called on worker thread.
    io_queue_cb finalize_cb; // called back on the worker thread.
    int type;
} io_queue_cb_t;
  690 
typedef struct _mc_resp_bundle mc_resp_bundle;
/**
 * Per-worker-thread state: libevent loop, notify channel, stats, and
 * per-thread object caches.
 */
typedef struct {
    pthread_t thread_id;        /* unique ID of this thread */
    struct event_base *base;    /* libevent handle this thread uses */
    struct event notify_event;  /* listen event for notify pipe */
#ifdef HAVE_EVENTFD
    int notify_event_fd;        /* notify counter */
#else
    int notify_receive_fd;      /* receiving end of notify pipe */
    int notify_send_fd;         /* sending end of notify pipe */
#endif
    struct thread_stats stats;  /* Stats generated by this thread */
    io_queue_cb_t io_queues[IO_QUEUE_COUNT]; /* registered IO queue callbacks */
    struct conn_queue *ev_queue; /* Worker/conn event queue */
    cache_t *rbuf_cache;        /* static-sized read buffers */
    mc_resp_bundle *open_bundle; /* active response-object bundle — see _mc_resp_bundle */
    cache_t *io_cache;          /* IO objects */
#ifdef EXTSTORE
    void *storage;              /* data object for storage system */
#endif
    logger *l;                  /* logger buffer */
    void *lru_bump_buf;         /* async LRU bump buffer */
#ifdef TLS
    char   *ssl_wbuf;
#endif
    int napi_id;                /* napi id associated with this thread */
#ifdef PROXY
    void *L;                    /* proxy interpreter state (opaque here) */
    void *proxy_hooks;
    void *proxy_user_stats;
    void *proxy_int_stats;
    // TODO: add ctx object so we can attach to queue.
#endif
} LIBEVENT_THREAD;
  725 
  726 /**
  727  * Response objects
  728  */
  729 #define MC_RESP_IOVCOUNT 4
  730 typedef struct _mc_resp {
  731     mc_resp_bundle *bundle; // ptr back to bundle
  732     struct _mc_resp *next; // choo choo.
  733     int wbytes; // bytes to write out of wbuf: might be able to nuke this.
  734     int tosend; // total bytes to send for this response
  735     void *write_and_free; /** free this memory after finishing writing */
  736     io_pending_t *io_pending; /* pending IO descriptor for this response */
  737 
  738     item *item; /* item associated with this response object, with reference held */
  739     struct iovec iov[MC_RESP_IOVCOUNT]; /* built-in iovecs to simplify network code */
  740     int chunked_total; /* total amount of chunked item data to send. */
  741     uint8_t iovcnt;
  742     uint8_t chunked_data_iov; /* this iov is a pointer to chunked data header */
  743 
  744     /* instruct transmit to skip this response object. used by storage engines
  745      * to asynchronously kill an object that was queued to write
  746      */
  747     bool skip;
  748     bool free; // double free detection.
  749     // UDP bits. Copied in from the client.
  750     uint16_t    request_id; /* Incoming UDP request ID, if this is a UDP "connection" */
  751     uint16_t    udp_sequence; /* packet counter when transmitting result */
  752     uint16_t    udp_total; /* total number of packets in sequence */
  753     struct sockaddr_in6 request_addr; /* udp: Who sent this request */
  754     socklen_t request_addr_size;
  755 
  756     char wbuf[WRITE_BUFFER_SIZE];
  757 } mc_resp;
  758 
/* Bundle: a single allocation holding multiple mc_resp objects, sized so a
 * bundle plus its responses fits in READ_BUFFER_SIZE. Bundles are kept on a
 * doubly linked list (next/prev) per worker thread (see open_bundle). */
#define MAX_RESP_PER_BUNDLE ((READ_BUFFER_SIZE - sizeof(mc_resp_bundle)) / sizeof(mc_resp))
struct _mc_resp_bundle {
    uint8_t refcount;   /* count of mc_resp objects from this bundle in use */
    uint8_t next_check; // next object to check on assignment.
    struct _mc_resp_bundle *next;
    struct _mc_resp_bundle *prev;
    mc_resp r[];        /* flexible array of response objects */
};
  767 
typedef struct conn conn;

/* Descriptor for one deferred/asynchronous IO attached to a response.
 * The leading members are a common header; each IO queue implementation
 * keeps its own private state in data[]. */
struct _io_pending_t {
    int io_queue_type; // matches one of IO_QUEUE_*
    LIBEVENT_THREAD *thread; /* worker thread owning this IO */
    conn *c;                 /* connection the IO belongs to */
    mc_resp *resp; // associated response object
    char data[120]; /* opaque scratch space for the owning IO queue */
};
  777 
  778 /**
  779  * The structure representing a connection into memcached.
  780  */
  781 struct conn {
  782     sasl_conn_t *sasl_conn;
  783     int    sfd;
  784     bool sasl_started;
  785     bool authenticated;
  786     bool set_stale;
  787     bool mset_res; /** uses mset format for return code */
  788     bool close_after_write; /** flush write then move to close connection */
  789     bool rbuf_malloced; /** read buffer was malloc'ed for ascii mget, needs free() */
  790     bool item_malloced; /** item for conn_nread state is a temporary malloc */
  791 #ifdef TLS
  792     SSL    *ssl;
  793     char   *ssl_wbuf;
  794     bool ssl_enabled;
  795 #endif
  796     enum conn_states  state;
  797     enum bin_substates substate;
  798     rel_time_t last_cmd_time;
  799     struct event event;
  800     short  ev_flags;
  801     short  which;   /** which events were just triggered */
  802 
  803     char   *rbuf;   /** buffer to read commands into */
  804     char   *rcurr;  /** but if we parsed some already, this is where we stopped */
  805     int    rsize;   /** total allocated size of rbuf */
  806     int    rbytes;  /** how much data, starting from rcur, do we have unparsed */
  807 
  808     mc_resp *resp; // tail response.
  809     mc_resp *resp_head; // first response in current stack.
  810     char   *ritem;  /** when we read in an item's value, it goes here */
  811     int    rlbytes;
  812 
  813     /**
  814      * item is used to hold an item structure created after reading the command
  815      * line of set/add/replace commands, but before we finished reading the actual
  816      * data. The data is read into ITEM_data(item) to avoid extra copying.
  817      */
  818 
  819     void   *item;     /* for commands set/add/replace  */
  820 
  821     /* data for the swallow state */
  822     int    sbytes;    /* how many bytes to swallow */
  823 
  824     int io_queues_submitted; /* see notes on io_queue_t */
  825     io_queue_t io_queues[IO_QUEUE_COUNT]; /* set of deferred IO queues. */
  826 #ifdef PROXY
  827     unsigned int proxy_coro_ref; /* lua reference for active coroutine */
  828 #endif
  829 #ifdef EXTSTORE
  830     unsigned int recache_counter;
  831 #endif
  832     enum protocol protocol;   /* which protocol this connection speaks */
  833     enum network_transport transport; /* what transport is used by this connection */
  834     enum close_reasons close_reason; /* reason for transition into conn_closing */
  835 
  836     /* data for UDP clients */
  837     int    request_id; /* Incoming UDP request ID, if this is a UDP "connection" */
  838     struct sockaddr_in6 request_addr; /* udp: Who sent the most recent request */
  839     socklen_t request_addr_size;
  840 
  841     bool   noreply;   /* True if the reply should not be sent. */
  842     /* current stats command */
  843     struct {
  844         char *buffer;
  845         size_t size;
  846         size_t offset;
  847     } stats;
  848 
  849     /* Binary protocol stuff */
  850     /* This is where the binary header goes */
  851     protocol_binary_request_header binary_header;
  852     uint64_t cas; /* the cas to return */
  853     short cmd; /* current command being processed */
  854     int opaque;
  855     int keylen;
  856     conn   *next;     /* Used for generating a list of conn structures */
  857     LIBEVENT_THREAD *thread; /* Pointer to the thread object serving this connection */
  858     int (*try_read_command)(conn *c); /* pointer for top level input parser */
  859     ssize_t (*read)(conn  *c, void *buf, size_t count);
  860     ssize_t (*sendmsg)(conn *c, struct msghdr *msg, int flags);
  861     ssize_t (*write)(conn *c, void *buf, size_t count);
  862 };
  863 
/* array of conn structures, indexed by file descriptor */
extern conn **conns;

/* current time of day (updated periodically) */
extern volatile rel_time_t current_time;

#ifdef MEMCACHED_DEBUG
/* Debug-build clock manipulation: is_paused halts time, delta offsets it.
 * NOTE(review): semantics inferred from names — confirm in memcached.c. */
extern volatile bool is_paused;
extern volatile int64_t delta;
#endif

/* TODO: Move to slabs.h? */
extern volatile int slab_rebalance_signal;
  877 
/* State for the slab page mover: describes the memory page currently being
 * migrated between slab classes, plus counters accumulated during the run. */
struct slab_rebalance {
    void *slab_start;  /* first byte of the page being moved */
    void *slab_end;    /* end boundary of the page */
    void *slab_pos;    /* current scan position within the page */
    int s_clsid;       /* "s" side slab class id of the move */
    int d_clsid;       /* "d" side slab class id of the move */
    /* Counters for events observed while moving the page. */
    uint32_t busy_items;
    uint32_t rescues;
    uint32_t evictions_nomem;
    uint32_t inline_reclaim;
    uint32_t chunk_rescues;
    uint32_t busy_deletes;
    uint32_t busy_loops;
    uint8_t done;       /* nonzero once the page move has finished */
    uint8_t *completed; /* per-chunk completion map for the page */
};

extern struct slab_rebalance slab_rebal;
#ifdef EXTSTORE
/* Global handle for the external storage engine instance. */
extern void *ext_storage;
#endif
  899 /*
  900  * Functions
  901  */
  902 void do_accept_new_conns(const bool do_accept);
  903 enum delta_result_type do_add_delta(conn *c, const char *key,
  904                                     const size_t nkey, const bool incr,
  905                                     const int64_t delta, char *buf,
  906                                     uint64_t *cas, const uint32_t hv,
  907                                     item **it_ret);
  908 enum store_item_type do_store_item(item *item, int comm, conn* c, const uint32_t hv);
  909 void thread_io_queue_add(LIBEVENT_THREAD *t, int type, void *ctx, io_queue_stack_cb cb, io_queue_stack_cb com_cb, io_queue_cb ret_cb, io_queue_cb fin_cb);
  910 void conn_io_queue_setup(conn *c);
  911 io_queue_t *conn_io_queue_get(conn *c, int type);
  912 io_queue_cb_t *thread_io_queue_get(LIBEVENT_THREAD *t, int type);
  913 void conn_io_queue_return(io_pending_t *io);
  914 conn *conn_new(const int sfd, const enum conn_states init_state, const int event_flags, const int read_buffer_size,
  915     enum network_transport transport, struct event_base *base, void *ssl);
  916 
  917 void conn_worker_readd(conn *c);
  918 extern int daemonize(int nochdir, int noclose);
  919 
  920 #define mutex_lock(x) pthread_mutex_lock(x)
  921 #define mutex_unlock(x) pthread_mutex_unlock(x)
  922 
  923 #include "stats_prefix.h"
  924 #include "slabs.h"
  925 #include "assoc.h"
  926 #include "items.h"
  927 #include "crawler.h"
  928 #include "trace.h"
  929 #include "hash.h"
  930 #include "util.h"
  931 
  932 /*
  933  * Functions such as the libevent-related calls that need to do cross-thread
  934  * communication in multithreaded mode (rather than actually doing the work
  935  * in the current thread) are called via "dispatch_" frontends, which are
  936  * also #define-d to directly call the underlying code in singlethreaded mode.
  937  */
  938 void memcached_thread_init(int nthreads, void *arg);
  939 void redispatch_conn(conn *c);
  940 void timeout_conn(conn *c);
  941 #ifdef PROXY
  942 void proxy_reload_notify(LIBEVENT_THREAD *t);
  943 #endif
  944 void return_io_pending(io_pending_t *io);
  945 void dispatch_conn_new(int sfd, enum conn_states init_state, int event_flags, int read_buffer_size,
  946     enum network_transport transport, void *ssl);
  947 void sidethread_conn_close(conn *c);
  948 
/* Lock wrappers for cache functions that are called from main loop. */
/* Locking wrapper around do_add_delta(). */
enum delta_result_type add_delta(conn *c, const char *key,
                                 const size_t nkey, bool incr,
                                 const int64_t delta, char *buf,
                                 uint64_t *cas);
/* Locking wrapper around do_accept_new_conns(). */
void accept_new_conns(const bool do_accept);
void  conn_close_idle(conn *c);
void  conn_close_all(void);
/* Allocate a new item of the right size; does not link it into the hash. */
item *item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes);
/* Values for the do_update argument of item_get()/item_get_locked(). */
#define DO_UPDATE true
#define DONT_UPDATE false
/* Fetch an item by key; do_update controls whether the fetch counts as an
 * access (see DO_UPDATE/DONT_UPDATE). */
item *item_get(const char *key, const size_t nkey, conn *c, const bool do_update);
/* As item_get, additionally reporting the key's hash value via *hv. */
item *item_get_locked(const char *key, const size_t nkey, conn *c, const bool do_update, uint32_t *hv);
/* Fetch an item and update its expiration time. */
item *item_touch(const char *key, const size_t nkey, uint32_t exptime, conn *c);
int   item_link(item *it);
void  item_remove(item *it);
int   item_replace(item *it, item *new_it, const uint32_t hv);
void  item_unlink(item *it);

/* Item lock primitives, keyed by the item's hash value. */
void item_lock(uint32_t hv);
void *item_trylock(uint32_t hv);
void item_trylock_unlock(void *arg);
void item_unlock(uint32_t hv);
/* Pause, stop, or resume worker and background threads. */
void pause_threads(enum pause_thread_types type);
void stop_threads(void);
int stop_conn_timeout_thread(void);
  975 #define refcount_incr(it) ++(it->refcount)
  976 #define refcount_decr(it) --(it->refcount)
/* Lock/unlock the global stats structure. */
void STATS_LOCK(void);
void STATS_UNLOCK(void);
  979 #define THR_STATS_LOCK(c) pthread_mutex_lock(&c->thread->stats.mutex)
  980 #define THR_STATS_UNLOCK(c) pthread_mutex_unlock(&c->thread->stats.mutex)
/* Reset every worker thread's stats counters. */
void threadlocal_stats_reset(void);
/* Sum all worker threads' stats counters into *stats. */
void threadlocal_stats_aggregate(struct thread_stats *stats);
/* Fold per-slab-class stats from *stats into *out. */
void slab_stats_aggregate(struct thread_stats *stats, struct slab_stats *out);
/* Look up a worker thread object by index. */
LIBEVENT_THREAD *get_worker_thread(int id);
  985 
/* Stat processing functions */
/* Emit one formatted name/value stat through the add_stats callback. */
void append_stat(const char *name, ADD_STAT add_stats, conn *c,
                 const char *fmt, ...);

/* Locking wrapper around do_store_item(). */
enum store_item_type store_item(item *item, int comm, conn *c);

/* Protocol related code */
/* Queue a null-terminated string as the response for an ascii command. */
void out_string(conn *c, const char *str);
  994 #define REALTIME_MAXDELTA 60*60*24*30
  995 /* Negative exptimes can underflow and end up immortal. realtime() will
  996    immediately expire values that are greater than REALTIME_MAXDELTA, but less
  997    than process_started, so lets aim for that. */
  998 #define EXPTIME_TO_POSITIVE_TIME(exptime) (exptime < 0) ? \
  999         REALTIME_MAXDELTA + 1 : exptime
/* Convert a client-supplied exptime into the server-relative rel_time_t. */
rel_time_t realtime(const time_t exptime);
/* item_get variants taking extra touch/update parameters; *overflow is an
 * out-flag reported to the caller. */
item* limited_get(char *key, size_t nkey, conn *c, uint32_t exptime, bool should_touch, bool do_update, bool *overflow);
item* limited_get_locked(char *key, size_t nkey, conn *c, bool do_update, uint32_t *hv, bool *overflow);
// Read/Response object handlers.
/* Reset a response object for reuse without releasing it. */
void resp_reset(mc_resp *resp);
/* Append a plain iovec to the response. */
void resp_add_iov(mc_resp *resp, const void *buf, int len);
/* Append an iovec referencing chunked item data (see mc_resp.chunked_data_iov). */
void resp_add_chunked_iov(mc_resp *resp, const void *buf, int len);
/* Attach a new response object to the connection; false on failure. */
bool resp_start(conn *c);
/* Finish a response; returns the next response in the stack, if any. */
mc_resp* resp_finish(conn *c, mc_resp *resp);
/* True if the connection still has responses stacked up. */
bool resp_has_stack(conn *c);
/* Swap the connection's read buffer for a malloc'ed one (ascii mget; see
 * conn.rbuf_malloced). */
bool rbuf_switch_to_malloc(conn *c);
/* Release item references held by the connection's responses. */
void conn_release_items(conn *c);
/* Transition the connection's state machine to the given state. */
void conn_set_state(conn *c, enum conn_states state);
/* Report an out-of-memory condition to the client. */
void out_of_memory(conn *c, char *ascii_error);
/* Queue an error string as the response. */
void out_errstring(conn *c, const char *str);
/* Write buf to the client, then free() it once written (see
 * mc_resp.write_and_free). */
void write_and_free(conn *c, char *buf, int bytes);
/* Append general server statistics via the add_stats callback. */
void server_stats(ADD_STAT add_stats, conn *c);
/* ADD_STAT-compatible callback that appends one stat to the conn's
 * stats buffer (cookie is the conn). */
void append_stats(const char *key, const uint16_t klen,
                  const char *val, const uint32_t vlen,
                  const void *cookie);
/** Return a datum for stats in binary protocol */
bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c);
/* Reset the global statistics counters. */
void stats_reset(void);
/* Emit the 'stats settings' / 'stats conns' listings. */
void process_stat_settings(ADD_STAT add_stats, void *c);
void process_stats_conns(ADD_STAT add_stats, void *c);
 1025 
/* Privilege-dropping hooks; compiled to no-op macros on platforms without
 * support so call sites need no #ifdefs. */
#if HAVE_DROP_PRIVILEGES
extern void setup_privilege_violations_handler(void);
extern void drop_privileges(void);
#else
#define setup_privilege_violations_handler()
#define drop_privileges()
#endif

#if HAVE_DROP_WORKER_PRIVILEGES
extern void drop_worker_privileges(void);
#else
#define drop_worker_privileges()
#endif
 1039 
/* If supported, give compiler hints for branch prediction. */
/* On compilers without __builtin_expect (non-GCC, or GCC older than 2.96),
 * define it away so likely()/unlikely() degrade to the bare expression. */
#if !defined(__GNUC__) || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
#define __builtin_expect(x, expected_value) (x)
#endif

/* Wrap a condition that is expected to be almost always true / false. */
#define likely(x)       __builtin_expect((x),1)
#define unlikely(x)     __builtin_expect((x),0)