"Fossies" - the Fresh Open Source Software Archive

Member "memcached-1.6.15/items.c" (21 Feb 2022, 63931 Bytes) of package /linux/www/memcached-1.6.15.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "items.c" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 1.6.10_vs_1.6.11.

    1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
    2 #include "memcached.h"
    3 #include "bipbuffer.h"
    4 #include "slab_automove.h"
    5 #include "storage.h"
    6 #ifdef EXTSTORE
    7 #include "slab_automove_extstore.h"
    8 #endif
    9 #include <sys/stat.h>
   10 #include <sys/socket.h>
   11 #include <sys/resource.h>
   12 #include <fcntl.h>
   13 #include <netinet/in.h>
   14 #include <errno.h>
   15 #include <stdlib.h>
   16 #include <stdio.h>
   17 #include <signal.h>
   18 #include <string.h>
   19 #include <time.h>
   20 #include <assert.h>
   21 #include <unistd.h>
   22 #include <poll.h>
   23 
   24 /* Forward Declarations */
   25 static void item_link_q(item *it);
   26 static void item_unlink_q(item *it);
   27 
   28 static unsigned int lru_type_map[4] = {HOT_LRU, WARM_LRU, COLD_LRU, TEMP_LRU};
   29 
   30 #define LARGEST_ID POWER_LARGEST
   31 typedef struct {
   32     uint64_t evicted;
   33     uint64_t evicted_nonzero;
   34     uint64_t reclaimed;
   35     uint64_t outofmemory;
   36     uint64_t tailrepairs;
   37     uint64_t expired_unfetched; /* items reclaimed but never touched */
   38     uint64_t evicted_unfetched; /* items evicted but never touched */
   39     uint64_t evicted_active; /* items evicted that should have been shuffled */
   40     uint64_t crawler_reclaimed;
   41     uint64_t crawler_items_checked;
   42     uint64_t lrutail_reflocked;
   43     uint64_t moves_to_cold;
   44     uint64_t moves_to_warm;
   45     uint64_t moves_within_lru;
   46     uint64_t direct_reclaims;
   47     uint64_t hits_to_hot;
   48     uint64_t hits_to_warm;
   49     uint64_t hits_to_cold;
   50     uint64_t hits_to_temp;
   51     uint64_t mem_requested;
   52     rel_time_t evicted_time;
   53 } itemstats_t;
   54 
   55 static item *heads[LARGEST_ID];
   56 static item *tails[LARGEST_ID];
   57 static itemstats_t itemstats[LARGEST_ID];
   58 static unsigned int sizes[LARGEST_ID];
   59 static uint64_t sizes_bytes[LARGEST_ID];
   60 static unsigned int *stats_sizes_hist = NULL;
   61 static uint64_t stats_sizes_cas_min = 0;
   62 static int stats_sizes_buckets = 0;
   63 static uint64_t cas_id = 0;
   64 
   65 static volatile int do_run_lru_maintainer_thread = 0;
   66 static int lru_maintainer_initialized = 0;
   67 static pthread_mutex_t lru_maintainer_lock = PTHREAD_MUTEX_INITIALIZER;
   68 static pthread_mutex_t cas_id_lock = PTHREAD_MUTEX_INITIALIZER;
   69 static pthread_mutex_t stats_sizes_lock = PTHREAD_MUTEX_INITIALIZER;
   70 
   71 void item_stats_reset(void) {
   72     int i;
   73     for (i = 0; i < LARGEST_ID; i++) {
   74         pthread_mutex_lock(&lru_locks[i]);
   75         memset(&itemstats[i], 0, sizeof(itemstats_t));
   76         pthread_mutex_unlock(&lru_locks[i]);
   77     }
   78 }
   79 
   80 /* called with class lru lock held */
   81 void do_item_stats_add_crawl(const int i, const uint64_t reclaimed,
   82         const uint64_t unfetched, const uint64_t checked) {
   83     itemstats[i].crawler_reclaimed += reclaimed;
   84     itemstats[i].expired_unfetched += unfetched;
   85     itemstats[i].crawler_items_checked += checked;
   86 }
   87 
   88 typedef struct _lru_bump_buf {
   89     struct _lru_bump_buf *prev;
   90     struct _lru_bump_buf *next;
   91     pthread_mutex_t mutex;
   92     bipbuf_t *buf;
   93     uint64_t dropped;
   94 } lru_bump_buf;
   95 
   96 typedef struct {
   97     item *it;
   98     uint32_t hv;
   99 } lru_bump_entry;
  100 
  101 static lru_bump_buf *bump_buf_head = NULL;
  102 static lru_bump_buf *bump_buf_tail = NULL;
  103 static pthread_mutex_t bump_buf_lock = PTHREAD_MUTEX_INITIALIZER;
  104 /* TODO: tunable? Need bench results */
  105 #define LRU_BUMP_BUF_SIZE 8192
  106 
  107 static bool lru_bump_async(lru_bump_buf *b, item *it, uint32_t hv);
  108 static uint64_t lru_total_bumps_dropped(void);
  109 
  110 /* Get the next CAS id for a new item. */
  111 /* TODO: refactor some atomics for this. */
  112 uint64_t get_cas_id(void) {
  113     pthread_mutex_lock(&cas_id_lock);
  114     uint64_t next_id = ++cas_id;
  115     pthread_mutex_unlock(&cas_id_lock);
  116     return next_id;
  117 }
  118 
  119 void set_cas_id(uint64_t new_cas) {
  120     pthread_mutex_lock(&cas_id_lock);
  121     cas_id = new_cas;
  122     pthread_mutex_unlock(&cas_id_lock);
  123 }
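      /* Editorial sketch: the TODO above suggests replacing the mutex with an
       * atomic counter. A minimal C11 version (an assumption, not the
       * project's implementation) could look like this:
       */
      #if 0
      #include <stdatomic.h>

      static _Atomic uint64_t cas_id_atomic = 0;

      uint64_t get_cas_id(void) {
          /* relaxed ordering is enough: callers only need unique ids */
          return atomic_fetch_add_explicit(&cas_id_atomic, 1, memory_order_relaxed) + 1;
      }

      void set_cas_id(uint64_t new_cas) {
          atomic_store_explicit(&cas_id_atomic, new_cas, memory_order_relaxed);
      }
      #endif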
  124 
  125 int item_is_flushed(item *it) {
  126     rel_time_t oldest_live = settings.oldest_live;
  127     uint64_t cas = ITEM_get_cas(it);
  128     uint64_t oldest_cas = settings.oldest_cas;
  129     if (oldest_live == 0 || oldest_live > current_time)
  130         return 0;
  131     if ((it->time <= oldest_live)
  132             || (oldest_cas != 0 && cas != 0 && cas < oldest_cas)) {
  133         return 1;
  134     }
  135     return 0;
  136 }
  137 
  138 /* must be locked before call */
  139 unsigned int do_get_lru_size(uint32_t id) {
  140     return sizes[id];
  141 }
  142 
  143 /* Enable this for reference-count debugging. */
  144 #if 0
  145 # define DEBUG_REFCNT(it,op) \
  146                         fprintf(stderr, "item %p refcnt(%c) %d %c%c\n", \
  147                         (void *)it, op, it->refcount, \
  148                         (it->it_flags & ITEM_LINKED) ? 'L' : ' ', \
  149                         (it->it_flags & ITEM_SLABBED) ? 'S' : ' ')
  150 #else
  151 # define DEBUG_REFCNT(it,op) while(0)
  152 #endif
  153 
  154 /**
  155  * Generates the variable-sized part of the header for an object.
  156  *
  157  * nkey    - The length of the key
  158  * flags   - key flags
  159  * nbytes  - Number of bytes to hold value and additional CRLF terminator
  160  * suffix  - Buffer for the "VALUE" line suffix (flags, size).
  161  * nsuffix - The length of the suffix is stored here.
  162  *
  163  * Returns the total size of the header.
  164  */
  165 static size_t item_make_header(const uint8_t nkey, const unsigned int flags, const int nbytes,
  166                      char *suffix, uint8_t *nsuffix) {
  167     if (flags == 0) {
  168         *nsuffix = 0;
  169     } else {
  170         *nsuffix = sizeof(flags);
  171     }
  172     return sizeof(item) + nkey + *nsuffix + nbytes;
  173 }
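      /* Editorial worked example: for a 3-byte key (do_item_alloc passes
       * nkey + 1 = 4), flags == 0 (so *nsuffix = 0) and the 7-byte value
       * "hello\r\n" (nbytes = 7):
       *
       *     ntotal = sizeof(item) + 4 + 0 + 7
       *
       * Nonzero flags add sizeof(flags) (4 on typical platforms) suffix
       * bytes, and the caller adds sizeof(uint64_t) when CAS is enabled.
       */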
  174 
  175 item *do_item_alloc_pull(const size_t ntotal, const unsigned int id) {
  176     item *it = NULL;
  177     int i;
  178     /* If no memory is available, attempt a direct LRU juggle/eviction */
  179     /* This is a race in order to simplify lru_pull_tail; in cases where
  180      * locked items are on the tail, you want them to fall out and cause
  181      * occasional OOM's, rather than internally work around them.
  182      * This also gives one fewer code path for slab alloc/free
  183      */
  184     for (i = 0; i < 10; i++) {
  185         /* Try to reclaim memory first */
  186         if (!settings.lru_segmented) {
  187             lru_pull_tail(id, COLD_LRU, 0, 0, 0, NULL);
  188         }
  189         it = slabs_alloc(ntotal, id, 0);
  190 
  191         if (it == NULL) {
  192             // We send '0' in for "total_bytes" as this routine is always
  193             // pulling to evict, or forcing HOT -> COLD migration.
  194             // As of this writing, total_bytes isn't at all used with COLD_LRU.
  195             if (lru_pull_tail(id, COLD_LRU, 0, LRU_PULL_EVICT, 0, NULL) <= 0) {
  196                 if (settings.lru_segmented) {
  197                     lru_pull_tail(id, HOT_LRU, 0, 0, 0, NULL);
  198                 } else {
  199                     break;
  200                 }
  201             }
  202         } else {
  203             break;
  204         }
  205     }
  206 
  207     if (i > 0) {
  208         pthread_mutex_lock(&lru_locks[id]);
  209         itemstats[id].direct_reclaims += i;
  210         pthread_mutex_unlock(&lru_locks[id]);
  211     }
  212 
  213     return it;
  214 }
  215 
  216 /* Chain another chunk onto this chunk. */
  217 /* slab mover: if it finds a chunk without ITEM_CHUNK flag, and no ITEM_LINKED
  218  * flag, it counts as busy and skips.
  219  * I think it might still not be safe to do linking outside of the slab lock
  220  */
  221 item_chunk *do_item_alloc_chunk(item_chunk *ch, const size_t bytes_remain) {
  222     // TODO: Should be a cleaner way of finding real size with slabber calls
  223     size_t size = bytes_remain + sizeof(item_chunk);
  224     if (size > settings.slab_chunk_size_max)
  225         size = settings.slab_chunk_size_max;
  226     unsigned int id = slabs_clsid(size);
  227 
  228     item_chunk *nch = (item_chunk *) do_item_alloc_pull(size, id);
  229     if (nch == NULL)
  230         return NULL;
  231 
  232     // link in.
  233     // ITEM_CHUNK[ED] bits need to be protected by the slabs lock.
  234     slabs_mlock();
  235     nch->head = ch->head;
  236     ch->next = nch;
  237     nch->prev = ch;
  238     nch->next = 0;
  239     nch->used = 0;
  240     nch->slabs_clsid = id;
  241     nch->size = size - sizeof(item_chunk);
  242     nch->it_flags |= ITEM_CHUNK;
  243     slabs_munlock();
  244     return nch;
  245 }
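      /* Editorial sketch of the resulting layout: the header item carries the
       * first (internal) chunk, and each chunk allocated here is appended to
       * the doubly-linked chain with head pointing back at the header:
       *
       *     [item hdr | key | chunk0] <-> [chunk1] <-> [chunk2] -> 0
       *          ^           every chunk->head points at the header item
       */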
  246 
  247 item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
  248                     const rel_time_t exptime, const int nbytes) {
  249     uint8_t nsuffix;
  250     item *it = NULL;
  251     char suffix[40];
  252     // Avoid potential underflows.
  253     if (nbytes < 2)
  254         return 0;
  255 
  256     size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
  257     if (settings.use_cas) {
  258         ntotal += sizeof(uint64_t);
  259     }
  260 
  261     unsigned int id = slabs_clsid(ntotal);
  262     unsigned int hdr_id = 0;
  263     if (id == 0)
  264         return 0;
  265 
  266     /* This is a large item. Allocate a header object now, lazily allocate
  267      *  chunks while reading the upload.
  268      */
  269     if (ntotal > settings.slab_chunk_size_max) {
  270         /* We still link this item into the LRU for the larger slab class, but
  271          * we're pulling a header from an entirely different slab class. The
  272          * free routines handle large items specifically.
  273          */
  274         int htotal = nkey + 1 + nsuffix + sizeof(item) + sizeof(item_chunk);
  275         if (settings.use_cas) {
  276             htotal += sizeof(uint64_t);
  277         }
  278 #ifdef NEED_ALIGN
  279         // header chunk needs to be padded on some systems
  280         int remain = htotal % 8;
  281         if (remain != 0) {
  282             htotal += 8 - remain;
  283         }
  284 #endif
  285         hdr_id = slabs_clsid(htotal);
  286         it = do_item_alloc_pull(htotal, hdr_id);
  287         /* setting ITEM_CHUNKED is fine here because we aren't LINKED yet. */
  288         if (it != NULL)
  289             it->it_flags |= ITEM_CHUNKED;
  290     } else {
  291         it = do_item_alloc_pull(ntotal, id);
  292     }
  293 
  294     if (it == NULL) {
  295         pthread_mutex_lock(&lru_locks[id]);
  296         itemstats[id].outofmemory++;
  297         pthread_mutex_unlock(&lru_locks[id]);
  298         return NULL;
  299     }
  300 
  301     assert(it->it_flags == 0 || it->it_flags == ITEM_CHUNKED);
  302     //assert(it != heads[id]);
  303 
  304     /* Refcount is seeded to 1 by slabs_alloc() */
  305     it->next = it->prev = 0;
  306 
  307     /* Items are initially loaded into the HOT_LRU. This is '0' but I want at
  308      * least a note here. Compiler (hopefully?) optimizes this out.
  309      */
  310     if (settings.temp_lru &&
  311             exptime - current_time <= settings.temporary_ttl) {
  312         id |= TEMP_LRU;
  313     } else if (settings.lru_segmented) {
  314         id |= HOT_LRU;
  315     } else {
  316         /* There is only COLD in compat-mode */
  317         id |= COLD_LRU;
  318     }
  319     it->slabs_clsid = id;
  320 
  321     DEBUG_REFCNT(it, '*');
  322     it->it_flags |= settings.use_cas ? ITEM_CAS : 0;
  323     it->it_flags |= nsuffix != 0 ? ITEM_CFLAGS : 0;
  324     it->nkey = nkey;
  325     it->nbytes = nbytes;
  326     memcpy(ITEM_key(it), key, nkey);
  327     it->exptime = exptime;
  328     if (nsuffix > 0) {
  329         memcpy(ITEM_suffix(it), &flags, sizeof(flags));
  330     }
  331 
  332     /* Initialize internal chunk. */
  333     if (it->it_flags & ITEM_CHUNKED) {
  334         item_chunk *chunk = (item_chunk *) ITEM_schunk(it);
  335 
  336         chunk->next = 0;
  337         chunk->prev = 0;
  338         chunk->used = 0;
  339         chunk->size = 0;
  340         chunk->head = it;
  341         chunk->orig_clsid = hdr_id;
  342     }
  343     it->h_next = 0;
  344 
  345     return it;
  346 }
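      /* Editorial usage sketch (assumes the caller holds the item lock for hv,
       * as the do_* functions in this file require; the names below are
       * illustrative, not part of the original source):
       */
      #if 0
      static item *example_store(const char *key, const size_t nkey,
                                 const char *val, const int vlen) {
          /* nbytes must include the trailing "\r\n" */
          item *it = do_item_alloc((char *)key, nkey, 0, 0, vlen + 2);
          if (it == NULL)
              return NULL;
          memcpy(ITEM_data(it), val, vlen);
          memcpy(ITEM_data(it) + vlen, "\r\n", 2);
          do_item_link(it, hash(ITEM_key(it), it->nkey));
          return it; /* caller still owns a reference; drop with do_item_remove() */
      }
      #endif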
  347 
  348 void item_free(item *it) {
  349     size_t ntotal = ITEM_ntotal(it);
  350     unsigned int clsid;
  351     assert((it->it_flags & ITEM_LINKED) == 0);
  352     assert(it != heads[it->slabs_clsid]);
  353     assert(it != tails[it->slabs_clsid]);
  354     assert(it->refcount == 0);
  355 
  356     /* so slab size changer can tell later if item is already free or not */
  357     clsid = ITEM_clsid(it);
  358     DEBUG_REFCNT(it, 'F');
  359     slabs_free(it, ntotal, clsid);
  360 }
  361 
  362 /**
  363  * Returns true if an item will fit in the cache (its size does not exceed
  364  * the maximum for a cache entry).
  365  */
  366 bool item_size_ok(const size_t nkey, const int flags, const int nbytes) {
  367     char prefix[40];
  368     uint8_t nsuffix;
  369     if (nbytes < 2)
  370         return false;
  371 
  372     size_t ntotal = item_make_header(nkey + 1, flags, nbytes,
  373                                      prefix, &nsuffix);
  374     if (settings.use_cas) {
  375         ntotal += sizeof(uint64_t);
  376     }
  377 
  378     return slabs_clsid(ntotal) != 0;
  379 }
  380 
  381 /* fixing stats/references during warm start */
  382 void do_item_link_fixup(item *it) {
  383     item **head, **tail;
  384     int ntotal = ITEM_ntotal(it);
  385     uint32_t hv = hash(ITEM_key(it), it->nkey);
  386     assoc_insert(it, hv);
  387 
  388     head = &heads[it->slabs_clsid];
  389     tail = &tails[it->slabs_clsid];
  390     if (it->prev == 0 && *head == 0) *head = it;
  391     if (it->next == 0 && *tail == 0) *tail = it;
  392     sizes[it->slabs_clsid]++;
  393     sizes_bytes[it->slabs_clsid] += ntotal;
  394 
  395     STATS_LOCK();
  396     stats_state.curr_bytes += ntotal;
  397     stats_state.curr_items += 1;
  398     stats.total_items += 1;
  399     STATS_UNLOCK();
  400 
  401     item_stats_sizes_add(it);
  402 
  403     return;
  404 }
  405 
  406 static void do_item_link_q(item *it) { /* item is the new head */
  407     item **head, **tail;
  408     assert((it->it_flags & ITEM_SLABBED) == 0);
  409 
  410     head = &heads[it->slabs_clsid];
  411     tail = &tails[it->slabs_clsid];
  412     assert(it != *head);
  413     assert((*head && *tail) || (*head == 0 && *tail == 0));
  414     it->prev = 0;
  415     it->next = *head;
  416     if (it->next) it->next->prev = it;
  417     *head = it;
  418     if (*tail == 0) *tail = it;
  419     sizes[it->slabs_clsid]++;
  420 #ifdef EXTSTORE
  421     if (it->it_flags & ITEM_HDR) {
  422         sizes_bytes[it->slabs_clsid] += (ITEM_ntotal(it) - it->nbytes) + sizeof(item_hdr);
  423     } else {
  424         sizes_bytes[it->slabs_clsid] += ITEM_ntotal(it);
  425     }
  426 #else
  427     sizes_bytes[it->slabs_clsid] += ITEM_ntotal(it);
  428 #endif
  429 
  430     return;
  431 }
  432 
  433 static void item_link_q(item *it) {
  434     pthread_mutex_lock(&lru_locks[it->slabs_clsid]);
  435     do_item_link_q(it);
  436     pthread_mutex_unlock(&lru_locks[it->slabs_clsid]);
  437 }
  438 
  439 static void item_link_q_warm(item *it) {
  440     pthread_mutex_lock(&lru_locks[it->slabs_clsid]);
  441     do_item_link_q(it);
  442     itemstats[it->slabs_clsid].moves_to_warm++;
  443     pthread_mutex_unlock(&lru_locks[it->slabs_clsid]);
  444 }
  445 
  446 static void do_item_unlink_q(item *it) {
  447     item **head, **tail;
  448     head = &heads[it->slabs_clsid];
  449     tail = &tails[it->slabs_clsid];
  450 
  451     if (*head == it) {
  452         assert(it->prev == 0);
  453         *head = it->next;
  454     }
  455     if (*tail == it) {
  456         assert(it->next == 0);
  457         *tail = it->prev;
  458     }
  459     assert(it->next != it);
  460     assert(it->prev != it);
  461 
  462     if (it->next) it->next->prev = it->prev;
  463     if (it->prev) it->prev->next = it->next;
  464     sizes[it->slabs_clsid]--;
  465 #ifdef EXTSTORE
  466     if (it->it_flags & ITEM_HDR) {
  467         sizes_bytes[it->slabs_clsid] -= (ITEM_ntotal(it) - it->nbytes) + sizeof(item_hdr);
  468     } else {
  469         sizes_bytes[it->slabs_clsid] -= ITEM_ntotal(it);
  470     }
  471 #else
  472     sizes_bytes[it->slabs_clsid] -= ITEM_ntotal(it);
  473 #endif
  474 
  475     return;
  476 }
  477 
  478 static void item_unlink_q(item *it) {
  479     pthread_mutex_lock(&lru_locks[it->slabs_clsid]);
  480     do_item_unlink_q(it);
  481     pthread_mutex_unlock(&lru_locks[it->slabs_clsid]);
  482 }
  483 
  484 int do_item_link(item *it, const uint32_t hv) {
  485     MEMCACHED_ITEM_LINK(ITEM_key(it), it->nkey, it->nbytes);
  486     assert((it->it_flags & (ITEM_LINKED|ITEM_SLABBED)) == 0);
  487     it->it_flags |= ITEM_LINKED;
  488     it->time = current_time;
  489 
  490     STATS_LOCK();
  491     stats_state.curr_bytes += ITEM_ntotal(it);
  492     stats_state.curr_items += 1;
  493     stats.total_items += 1;
  494     STATS_UNLOCK();
  495 
  496     /* Allocate a new CAS ID on link. */
  497     ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0);
  498     assoc_insert(it, hv);
  499     item_link_q(it);
  500     refcount_incr(it);
  501     item_stats_sizes_add(it);
  502 
  503     return 1;
  504 }
  505 
  506 void do_item_unlink(item *it, const uint32_t hv) {
  507     MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
  508     if ((it->it_flags & ITEM_LINKED) != 0) {
  509         it->it_flags &= ~ITEM_LINKED;
  510         STATS_LOCK();
  511         stats_state.curr_bytes -= ITEM_ntotal(it);
  512         stats_state.curr_items -= 1;
  513         STATS_UNLOCK();
  514         item_stats_sizes_remove(it);
  515         assoc_delete(ITEM_key(it), it->nkey, hv);
  516         item_unlink_q(it);
  517         do_item_remove(it);
  518     }
  519 }
  520 
  521 /* FIXME: Is it necessary to keep this copy/pasted code? */
  522 void do_item_unlink_nolock(item *it, const uint32_t hv) {
  523     MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
  524     if ((it->it_flags & ITEM_LINKED) != 0) {
  525         it->it_flags &= ~ITEM_LINKED;
  526         STATS_LOCK();
  527         stats_state.curr_bytes -= ITEM_ntotal(it);
  528         stats_state.curr_items -= 1;
  529         STATS_UNLOCK();
  530         item_stats_sizes_remove(it);
  531         assoc_delete(ITEM_key(it), it->nkey, hv);
  532         do_item_unlink_q(it);
  533         do_item_remove(it);
  534     }
  535 }
  536 
  537 void do_item_remove(item *it) {
  538     MEMCACHED_ITEM_REMOVE(ITEM_key(it), it->nkey, it->nbytes);
  539     assert((it->it_flags & ITEM_SLABBED) == 0);
  540     assert(it->refcount > 0);
  541 
  542     if (refcount_decr(it) == 0) {
  543         item_free(it);
  544     }
  545 }
  546 
  547 /* Bump the last accessed time, or relink if we're in compat mode */
  548 void do_item_update(item *it) {
  549     MEMCACHED_ITEM_UPDATE(ITEM_key(it), it->nkey, it->nbytes);
  550 
  551     /* Hits to COLD_LRU immediately move to WARM. */
  552     if (settings.lru_segmented) {
  553         assert((it->it_flags & ITEM_SLABBED) == 0);
  554         if ((it->it_flags & ITEM_LINKED) != 0) {
  555             if (ITEM_lruid(it) == COLD_LRU && (it->it_flags & ITEM_ACTIVE)) {
  556                 it->time = current_time;
  557                 item_unlink_q(it);
  558                 it->slabs_clsid = ITEM_clsid(it);
  559                 it->slabs_clsid |= WARM_LRU;
  560                 it->it_flags &= ~ITEM_ACTIVE;
  561                 item_link_q_warm(it);
  562             } else {
  563                 it->time = current_time;
  564             }
  565         }
  566     } else if (it->time < current_time - ITEM_UPDATE_INTERVAL) {
  567         assert((it->it_flags & ITEM_SLABBED) == 0);
  568 
  569         if ((it->it_flags & ITEM_LINKED) != 0) {
  570             it->time = current_time;
  571             item_unlink_q(it);
  572             item_link_q(it);
  573         }
  574     }
  575 }
  576 
  577 int do_item_replace(item *it, item *new_it, const uint32_t hv) {
  578     MEMCACHED_ITEM_REPLACE(ITEM_key(it), it->nkey, it->nbytes,
  579                            ITEM_key(new_it), new_it->nkey, new_it->nbytes);
  580     assert((it->it_flags & ITEM_SLABBED) == 0);
  581 
  582     do_item_unlink(it, hv);
  583     return do_item_link(new_it, hv);
  584 }
  585 
  586 /*@null@*/
  587 /* This is walking the line of violating lock order, but I think it's safe.
  588  * If the LRU lock is held, an item in the LRU cannot be wiped and freed.
  589  * The data could possibly be overwritten, but this is only accessing the
  590  * headers.
  591  * It may not be the best idea to leave it like this, but for now it's safe.
  592  */
  593 char *item_cachedump(const unsigned int slabs_clsid, const unsigned int limit, unsigned int *bytes) {
  594     unsigned int memlimit = 2 * 1024 * 1024;   /* 2MB max response size */
  595     char *buffer;
  596     unsigned int bufcurr;
  597     item *it;
  598     unsigned int len;
  599     unsigned int shown = 0;
  600     char key_temp[KEY_MAX_LENGTH + 1];
  601     char temp[512];
  602     unsigned int id = slabs_clsid;
  603     id |= COLD_LRU;
  604 
  605     pthread_mutex_lock(&lru_locks[id]);
  606     it = heads[id];
  607 
  608     buffer = malloc((size_t)memlimit);
  609     if (buffer == 0) {
  610         pthread_mutex_unlock(&lru_locks[id]);
  611         return NULL;
  612     }
  613     bufcurr = 0;
  614 
  615     while (it != NULL && (limit == 0 || shown < limit)) {
  616         assert(it->nkey <= KEY_MAX_LENGTH);
  617         // protect from printing binary keys.
  618         if ((it->nbytes == 0 && it->nkey == 0) || (it->it_flags & ITEM_KEY_BINARY)) {
  619             it = it->next;
  620             continue;
  621         }
  622         /* Copy the key since it may not be null-terminated in the struct */
  623         strncpy(key_temp, ITEM_key(it), it->nkey);
  624         key_temp[it->nkey] = 0x00; /* terminate */
  625         len = snprintf(temp, sizeof(temp), "ITEM %s [%d b; %llu s]\r\n",
  626                        key_temp, it->nbytes - 2,
  627                        it->exptime == 0 ? 0 :
  628                        (unsigned long long)it->exptime + process_started);
  629         if (bufcurr + len + 6 > memlimit)  /* 6 is END\r\n\0 */
  630             break;
  631         memcpy(buffer + bufcurr, temp, len);
  632         bufcurr += len;
  633         shown++;
  634         it = it->next;
  635     }
  636 
  637     memcpy(buffer + bufcurr, "END\r\n", 6);
  638     bufcurr += 5;
  639 
  640     *bytes = bufcurr;
  641     pthread_mutex_unlock(&lru_locks[id]);
  642     return buffer;
  643 }
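      /* Editorial example of the response this builds (key and values
       * illustrative):
       *
       *     ITEM somekey [5 b; 1645480000 s]
       *     END
       *
       * nbytes - 2 hides the stored "\r\n"; the timestamp is the absolute
       * expiry (exptime + process_started), or 0 for items that never expire.
       */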
  644 
  645 /* With refactoring of the various stats code the automover won't need a
  646  * custom function here.
  647  */
  648 void fill_item_stats_automove(item_stats_automove *am) {
  649     int n;
  650     for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
  651         item_stats_automove *cur = &am[n];
  652 
  653         // outofmemory records into HOT
  654         int i = n | HOT_LRU;
  655         pthread_mutex_lock(&lru_locks[i]);
  656         cur->outofmemory = itemstats[i].outofmemory;
  657         pthread_mutex_unlock(&lru_locks[i]);
  658 
  659         // evictions and tail age are from COLD
  660         i = n | COLD_LRU;
  661         pthread_mutex_lock(&lru_locks[i]);
  662         cur->evicted = itemstats[i].evicted;
  663         if (tails[i]) {
  664             cur->age = current_time - tails[i]->time;
  665         } else {
  666             cur->age = 0;
  667         }
  668         pthread_mutex_unlock(&lru_locks[i]);
  669      }
  670 }
  671 
  672 void item_stats_totals(ADD_STAT add_stats, void *c) {
  673     itemstats_t totals;
  674     memset(&totals, 0, sizeof(itemstats_t));
  675     int n;
  676     for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
  677         int x;
  678         int i;
  679         for (x = 0; x < 4; x++) {
  680             i = n | lru_type_map[x];
  681             pthread_mutex_lock(&lru_locks[i]);
  682             totals.expired_unfetched += itemstats[i].expired_unfetched;
  683             totals.evicted_unfetched += itemstats[i].evicted_unfetched;
  684             totals.evicted_active += itemstats[i].evicted_active;
  685             totals.evicted += itemstats[i].evicted;
  686             totals.reclaimed += itemstats[i].reclaimed;
  687             totals.crawler_reclaimed += itemstats[i].crawler_reclaimed;
  688             totals.crawler_items_checked += itemstats[i].crawler_items_checked;
  689             totals.lrutail_reflocked += itemstats[i].lrutail_reflocked;
  690             totals.moves_to_cold += itemstats[i].moves_to_cold;
  691             totals.moves_to_warm += itemstats[i].moves_to_warm;
  692             totals.moves_within_lru += itemstats[i].moves_within_lru;
  693             totals.direct_reclaims += itemstats[i].direct_reclaims;
  694             pthread_mutex_unlock(&lru_locks[i]);
  695         }
  696     }
  697     APPEND_STAT("expired_unfetched", "%llu",
  698                 (unsigned long long)totals.expired_unfetched);
  699     APPEND_STAT("evicted_unfetched", "%llu",
  700                 (unsigned long long)totals.evicted_unfetched);
  701     if (settings.lru_maintainer_thread) {
  702         APPEND_STAT("evicted_active", "%llu",
  703                     (unsigned long long)totals.evicted_active);
  704     }
  705     APPEND_STAT("evictions", "%llu",
  706                 (unsigned long long)totals.evicted);
  707     APPEND_STAT("reclaimed", "%llu",
  708                 (unsigned long long)totals.reclaimed);
  709     APPEND_STAT("crawler_reclaimed", "%llu",
  710                 (unsigned long long)totals.crawler_reclaimed);
  711     APPEND_STAT("crawler_items_checked", "%llu",
  712                 (unsigned long long)totals.crawler_items_checked);
  713     APPEND_STAT("lrutail_reflocked", "%llu",
  714                 (unsigned long long)totals.lrutail_reflocked);
  715     if (settings.lru_maintainer_thread) {
  716         APPEND_STAT("moves_to_cold", "%llu",
  717                     (unsigned long long)totals.moves_to_cold);
  718         APPEND_STAT("moves_to_warm", "%llu",
  719                     (unsigned long long)totals.moves_to_warm);
  720         APPEND_STAT("moves_within_lru", "%llu",
  721                     (unsigned long long)totals.moves_within_lru);
  722         APPEND_STAT("direct_reclaims", "%llu",
  723                     (unsigned long long)totals.direct_reclaims);
  724         APPEND_STAT("lru_bumps_dropped", "%llu",
  725                     (unsigned long long)lru_total_bumps_dropped());
  726     }
  727 }
  728 
  729 void item_stats(ADD_STAT add_stats, void *c) {
  730     struct thread_stats thread_stats;
  731     threadlocal_stats_aggregate(&thread_stats);
  732     itemstats_t totals;
  733     int n;
  734     for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
  735         memset(&totals, 0, sizeof(itemstats_t));
  736         int x;
  737         int i;
  738         unsigned int size = 0;
  739         unsigned int age  = 0;
  740         unsigned int age_hot = 0;
  741         unsigned int age_warm = 0;
  742         unsigned int lru_size_map[4];
  743         const char *fmt = "items:%d:%s";
  744         char key_str[STAT_KEY_LEN];
  745         char val_str[STAT_VAL_LEN];
  746         int klen = 0, vlen = 0;
  747         for (x = 0; x < 4; x++) {
  748             i = n | lru_type_map[x];
  749             pthread_mutex_lock(&lru_locks[i]);
  750             totals.evicted += itemstats[i].evicted;
  751             totals.evicted_nonzero += itemstats[i].evicted_nonzero;
  752             totals.outofmemory += itemstats[i].outofmemory;
  753             totals.tailrepairs += itemstats[i].tailrepairs;
  754             totals.reclaimed += itemstats[i].reclaimed;
  755             totals.expired_unfetched += itemstats[i].expired_unfetched;
  756             totals.evicted_unfetched += itemstats[i].evicted_unfetched;
  757             totals.evicted_active += itemstats[i].evicted_active;
  758             totals.crawler_reclaimed += itemstats[i].crawler_reclaimed;
  759             totals.crawler_items_checked += itemstats[i].crawler_items_checked;
  760             totals.lrutail_reflocked += itemstats[i].lrutail_reflocked;
  761             totals.moves_to_cold += itemstats[i].moves_to_cold;
  762             totals.moves_to_warm += itemstats[i].moves_to_warm;
  763             totals.moves_within_lru += itemstats[i].moves_within_lru;
  764             totals.direct_reclaims += itemstats[i].direct_reclaims;
  765             totals.mem_requested += sizes_bytes[i];
  766             size += sizes[i];
  767             lru_size_map[x] = sizes[i];
  768             if (lru_type_map[x] == COLD_LRU && tails[i] != NULL) {
  769                 age = current_time - tails[i]->time;
  770             } else if (lru_type_map[x] == HOT_LRU && tails[i] != NULL) {
  771                 age_hot = current_time - tails[i]->time;
  772             } else if (lru_type_map[x] == WARM_LRU && tails[i] != NULL) {
  773                 age_warm = current_time - tails[i]->time;
  774             }
  775             if (lru_type_map[x] == COLD_LRU)
  776                 totals.evicted_time = itemstats[i].evicted_time;
  777             switch (lru_type_map[x]) {
  778                 case HOT_LRU:
  779                     totals.hits_to_hot = thread_stats.lru_hits[i];
  780                     break;
  781                 case WARM_LRU:
  782                     totals.hits_to_warm = thread_stats.lru_hits[i];
  783                     break;
  784                 case COLD_LRU:
  785                     totals.hits_to_cold = thread_stats.lru_hits[i];
  786                     break;
  787                 case TEMP_LRU:
  788                     totals.hits_to_temp = thread_stats.lru_hits[i];
  789                     break;
  790             }
  791             pthread_mutex_unlock(&lru_locks[i]);
  792         }
  793         if (size == 0)
  794             continue;
  795         APPEND_NUM_FMT_STAT(fmt, n, "number", "%u", size);
  796         if (settings.lru_maintainer_thread) {
  797             APPEND_NUM_FMT_STAT(fmt, n, "number_hot", "%u", lru_size_map[0]);
  798             APPEND_NUM_FMT_STAT(fmt, n, "number_warm", "%u", lru_size_map[1]);
  799             APPEND_NUM_FMT_STAT(fmt, n, "number_cold", "%u", lru_size_map[2]);
  800             if (settings.temp_lru) {
  801                 APPEND_NUM_FMT_STAT(fmt, n, "number_temp", "%u", lru_size_map[3]);
  802             }
  803             APPEND_NUM_FMT_STAT(fmt, n, "age_hot", "%u", age_hot);
  804             APPEND_NUM_FMT_STAT(fmt, n, "age_warm", "%u", age_warm);
  805         }
  806         APPEND_NUM_FMT_STAT(fmt, n, "age", "%u", age);
  807         APPEND_NUM_FMT_STAT(fmt, n, "mem_requested", "%llu", (unsigned long long)totals.mem_requested);
  808         APPEND_NUM_FMT_STAT(fmt, n, "evicted",
  809                             "%llu", (unsigned long long)totals.evicted);
  810         APPEND_NUM_FMT_STAT(fmt, n, "evicted_nonzero",
  811                             "%llu", (unsigned long long)totals.evicted_nonzero);
  812         APPEND_NUM_FMT_STAT(fmt, n, "evicted_time",
  813                             "%u", totals.evicted_time);
  814         APPEND_NUM_FMT_STAT(fmt, n, "outofmemory",
  815                             "%llu", (unsigned long long)totals.outofmemory);
  816         APPEND_NUM_FMT_STAT(fmt, n, "tailrepairs",
  817                             "%llu", (unsigned long long)totals.tailrepairs);
  818         APPEND_NUM_FMT_STAT(fmt, n, "reclaimed",
  819                             "%llu", (unsigned long long)totals.reclaimed);
  820         APPEND_NUM_FMT_STAT(fmt, n, "expired_unfetched",
  821                             "%llu", (unsigned long long)totals.expired_unfetched);
  822         APPEND_NUM_FMT_STAT(fmt, n, "evicted_unfetched",
  823                             "%llu", (unsigned long long)totals.evicted_unfetched);
  824         if (settings.lru_maintainer_thread) {
  825             APPEND_NUM_FMT_STAT(fmt, n, "evicted_active",
  826                                 "%llu", (unsigned long long)totals.evicted_active);
  827         }
  828         APPEND_NUM_FMT_STAT(fmt, n, "crawler_reclaimed",
  829                             "%llu", (unsigned long long)totals.crawler_reclaimed);
  830         APPEND_NUM_FMT_STAT(fmt, n, "crawler_items_checked",
  831                             "%llu", (unsigned long long)totals.crawler_items_checked);
  832         APPEND_NUM_FMT_STAT(fmt, n, "lrutail_reflocked",
  833                             "%llu", (unsigned long long)totals.lrutail_reflocked);
  834         if (settings.lru_maintainer_thread) {
  835             APPEND_NUM_FMT_STAT(fmt, n, "moves_to_cold",
  836                                 "%llu", (unsigned long long)totals.moves_to_cold);
  837             APPEND_NUM_FMT_STAT(fmt, n, "moves_to_warm",
  838                                 "%llu", (unsigned long long)totals.moves_to_warm);
  839             APPEND_NUM_FMT_STAT(fmt, n, "moves_within_lru",
  840                                 "%llu", (unsigned long long)totals.moves_within_lru);
  841             APPEND_NUM_FMT_STAT(fmt, n, "direct_reclaims",
  842                                 "%llu", (unsigned long long)totals.direct_reclaims);
  843             APPEND_NUM_FMT_STAT(fmt, n, "hits_to_hot",
  844                                 "%llu", (unsigned long long)totals.hits_to_hot);
  845 
  846             APPEND_NUM_FMT_STAT(fmt, n, "hits_to_warm",
  847                                 "%llu", (unsigned long long)totals.hits_to_warm);
  848 
  849             APPEND_NUM_FMT_STAT(fmt, n, "hits_to_cold",
  850                                 "%llu", (unsigned long long)totals.hits_to_cold);
  851 
  852             APPEND_NUM_FMT_STAT(fmt, n, "hits_to_temp",
  853                                 "%llu", (unsigned long long)totals.hits_to_temp);
  854 
  855         }
  856     }
  857 
  858     /* getting here means both ascii and binary terminators fit */
  859     add_stats(NULL, 0, NULL, 0, c);
  860 }
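      /* Editorial example of the "stats items" lines emitted per class
       * (class id and values illustrative, segmented LRU enabled):
       *
       *     STAT items:5:number 1024
       *     STAT items:5:number_hot 90
       *     STAT items:5:number_warm 300
       *     STAT items:5:number_cold 634
       *     STAT items:5:age_hot 120
       *     STAT items:5:age_warm 600
       *     STAT items:5:age 3600
       *     STAT items:5:mem_requested 262144
       *     STAT items:5:evicted 17
       *     ...
       */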
  861 
  862 bool item_stats_sizes_status(void) {
  863     bool ret = false;
  864     mutex_lock(&stats_sizes_lock);
  865     if (stats_sizes_hist != NULL)
  866         ret = true;
  867     mutex_unlock(&stats_sizes_lock);
  868     return ret;
  869 }
  870 
  871 void item_stats_sizes_init(void) {
  872     if (stats_sizes_hist != NULL)
  873         return;
  874     stats_sizes_buckets = settings.item_size_max / 32 + 1;
  875     stats_sizes_hist = calloc(stats_sizes_buckets, sizeof(int));
  876     stats_sizes_cas_min = (settings.use_cas) ? get_cas_id() : 0;
  877 }
  878 
  879 void item_stats_sizes_enable(ADD_STAT add_stats, void *c) {
  880     mutex_lock(&stats_sizes_lock);
  881     if (!settings.use_cas) {
  882         APPEND_STAT("sizes_status", "error", "");
  883         APPEND_STAT("sizes_error", "cas_support_disabled", "");
  884     } else if (stats_sizes_hist == NULL) {
  885         item_stats_sizes_init();
  886         if (stats_sizes_hist != NULL) {
  887             APPEND_STAT("sizes_status", "enabled", "");
  888         } else {
  889             APPEND_STAT("sizes_status", "error", "");
  890             APPEND_STAT("sizes_error", "no_memory", "");
  891         }
  892     } else {
  893         APPEND_STAT("sizes_status", "enabled", "");
  894     }
  895     mutex_unlock(&stats_sizes_lock);
  896 }
  897 
  898 void item_stats_sizes_disable(ADD_STAT add_stats, void *c) {
  899     mutex_lock(&stats_sizes_lock);
  900     if (stats_sizes_hist != NULL) {
  901         free(stats_sizes_hist);
  902         stats_sizes_hist = NULL;
  903     }
  904     APPEND_STAT("sizes_status", "disabled", "");
  905     mutex_unlock(&stats_sizes_lock);
  906 }
  907 
  908 void item_stats_sizes_add(item *it) {
  909     if (stats_sizes_hist == NULL || stats_sizes_cas_min > ITEM_get_cas(it))
  910         return;
  911     int ntotal = ITEM_ntotal(it);
  912     int bucket = ntotal / 32;
  913     if ((ntotal % 32) != 0) bucket++;
  914     if (bucket < stats_sizes_buckets) stats_sizes_hist[bucket]++;
  915 }
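      /* Editorial worked example: ntotal = 70 gives bucket = 70 / 32 = 2, then
       * the 6-byte remainder rounds it up to bucket 3, which item_stats_sizes()
       * reports under the "96" key (it prints i * 32). An exact multiple such
       * as ntotal = 64 stays in bucket 2 ("64").
       */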
  916 
  917 /* I think there's no way for this to be accurate without using the CAS value,
  918  * since items that get their time value bumped will still pass this validation.
  919  */
  920 void item_stats_sizes_remove(item *it) {
  921     if (stats_sizes_hist == NULL || stats_sizes_cas_min > ITEM_get_cas(it))
  922         return;
  923     int ntotal = ITEM_ntotal(it);
  924     int bucket = ntotal / 32;
  925     if ((ntotal % 32) != 0) bucket++;
  926     if (bucket < stats_sizes_buckets) stats_sizes_hist[bucket]--;
  927 }
  928 
  929 /** dumps out a list of objects of each size, with granularity of 32 bytes */
  930 /*@null@*/
  931 /* Locks are correct based on a technicality. Holds LRU lock while doing the
  932  * work, so items can't go invalid, and it's only looking at header sizes
  933  * which don't change.
  934  */
  935 void item_stats_sizes(ADD_STAT add_stats, void *c) {
  936     mutex_lock(&stats_sizes_lock);
  937 
  938     if (stats_sizes_hist != NULL) {
  939         int i;
  940         for (i = 0; i < stats_sizes_buckets; i++) {
  941             if (stats_sizes_hist[i] != 0) {
  942                 char key[12];
  943                 snprintf(key, sizeof(key), "%d", i * 32);
  944                 APPEND_STAT(key, "%u", stats_sizes_hist[i]);
  945             }
  946         }
  947     } else {
  948         APPEND_STAT("sizes_status", "disabled", "");
  949     }
  950 
  951     add_stats(NULL, 0, NULL, 0, c);
  952     mutex_unlock(&stats_sizes_lock);
  953 }
  954 
  955 /** wrapper around assoc_find which does the lazy expiration logic */
  956 item *do_item_get(const char *key, const size_t nkey, const uint32_t hv, conn *c, const bool do_update) {
  957     item *it = assoc_find(key, nkey, hv);
  958     if (it != NULL) {
  959         refcount_incr(it);
  960         /* Optimization for slab reassignment. prevents popular items from
  961          * jamming in busy wait. Can only do this here to satisfy lock order
  962          * of item_lock, slabs_lock. */
  963         /* This was made unsafe by removal of the cache_lock:
  964          * slab_rebalance_signal and slab_rebal.* are modified in a separate
  965          * thread under slabs_lock. If slab_rebalance_signal = 1, slab_start =
  966          * NULL (0), but slab_end is still equal to some value, this would end
  967          * up unlinking every item fetched.
  968          * This is either an acceptable loss, or if slab_rebalance_signal is
  969          * true, slab_start/slab_end should be put behind the slabs_lock.
  970          * Which would cause a huge potential slowdown.
  971          * Could also use a specific lock for slab_rebal.* and
  972          * slab_rebalance_signal (shorter lock?)
  973          */
  974         /*if (slab_rebalance_signal &&
  975             ((void *)it >= slab_rebal.slab_start && (void *)it < slab_rebal.slab_end)) {
  976             do_item_unlink(it, hv);
  977             do_item_remove(it);
  978             it = NULL;
  979         }*/
  980     }
  981     int was_found = 0;
  982 
  983     if (settings.verbose > 2) {
  984         int ii;
  985         if (it == NULL) {
  986             fprintf(stderr, "> NOT FOUND ");
  987         } else {
  988             fprintf(stderr, "> FOUND KEY ");
  989         }
  990         for (ii = 0; ii < nkey; ++ii) {
  991             fprintf(stderr, "%c", key[ii]);
  992         }
  993     }
  994 
  995     if (it != NULL) {
  996         was_found = 1;
  997         if (item_is_flushed(it)) {
  998             do_item_unlink(it, hv);
  999             STORAGE_delete(c->thread->storage, it);
 1000             do_item_remove(it);
 1001             it = NULL;
 1002             pthread_mutex_lock(&c->thread->stats.mutex);
 1003             c->thread->stats.get_flushed++;
 1004             pthread_mutex_unlock(&c->thread->stats.mutex);
 1005             if (settings.verbose > 2) {
 1006                 fprintf(stderr, " -nuked by flush");
 1007             }
 1008             was_found = 2;
 1009         } else if (it->exptime != 0 && it->exptime <= current_time) {
 1010             do_item_unlink(it, hv);
 1011             STORAGE_delete(c->thread->storage, it);
 1012             do_item_remove(it);
 1013             it = NULL;
 1014             pthread_mutex_lock(&c->thread->stats.mutex);
 1015             c->thread->stats.get_expired++;
 1016             pthread_mutex_unlock(&c->thread->stats.mutex);
 1017             if (settings.verbose > 2) {
 1018                 fprintf(stderr, " -nuked by expire");
 1019             }
 1020             was_found = 3;
 1021         } else {
 1022             if (do_update) {
 1023                 do_item_bump(c, it, hv);
 1024             }
 1025             DEBUG_REFCNT(it, '+');
 1026         }
 1027     }
 1028 
 1029     if (settings.verbose > 2)
 1030         fprintf(stderr, "\n");
 1031     /* For now this is in addition to the above verbose logging. */
 1032     LOGGER_LOG(c->thread->l, LOG_FETCHERS, LOGGER_ITEM_GET, NULL, was_found, key,
 1033                nkey, (it) ? it->nbytes : 0, (it) ? ITEM_clsid(it) : 0, c->sfd);
 1034 
 1035     return it;
 1036 }
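      /* Editorial sketch of the reference contract: every non-NULL return from
       * do_item_get() carries a reference the caller must drop (names below
       * are illustrative; the item lock for hv is already held):
       */
      #if 0
      item *it = do_item_get(key, nkey, hv, c, DONT_UPDATE);
      if (it != NULL) {
          /* ... inspect ITEM_data(it) / ITEM_ntotal(it) ... */
          do_item_remove(it); /* drops the reference taken by the get */
      }
      #endif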
 1037 
 1038 // Requires lock held for item.
 1039 // Split out of do_item_get() to allow mget functions to look through header
 1040 // data before losing state modified via the bump function.
 1041 void do_item_bump(conn *c, item *it, const uint32_t hv) {
 1042     /* We update the hit markers only during fetches.
 1043      * An item needs to be hit twice overall to be considered
 1044      * ACTIVE, but only needs a single hit to maintain activity
 1045      * afterward.
 1046      * FETCHED tells if an item has ever been fetched.
 1047      */
 1048     if (settings.lru_segmented) {
 1049         if ((it->it_flags & ITEM_ACTIVE) == 0) {
 1050             if ((it->it_flags & ITEM_FETCHED) == 0) {
 1051                 it->it_flags |= ITEM_FETCHED;
 1052             } else {
 1053                 it->it_flags |= ITEM_ACTIVE;
 1054                 if (ITEM_lruid(it) != COLD_LRU) {
 1055                     it->time = current_time; // only need to bump time.
 1056                 } else if (!lru_bump_async(c->thread->lru_bump_buf, it, hv)) {
 1057                     // add flag before async bump to avoid race.
 1058                     it->it_flags &= ~ITEM_ACTIVE;
 1059                 }
 1060             }
 1061         }
 1062     } else {
 1063         it->it_flags |= ITEM_FETCHED;
 1064         do_item_update(it);
 1065     }
 1066 }
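      /* Editorial summary of the markers above (segmented mode):
       *   first fetch   -> sets ITEM_FETCHED
       *   second fetch  -> adds ITEM_ACTIVE; non-COLD items only bump
       *                    it->time, COLD items queue an async bump to WARM
       *   while ACTIVE  -> fetches are no-ops here; the flag is cleared when
       *                    the LRU maintainer rescues or demotes the item, or
       *                    when the queued COLD bump is processed
       */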
 1067 
 1068 item *do_item_touch(const char *key, size_t nkey, uint32_t exptime,
 1069                     const uint32_t hv, conn *c) {
 1070     item *it = do_item_get(key, nkey, hv, c, DO_UPDATE);
 1071     if (it != NULL) {
 1072         it->exptime = exptime;
 1073     }
 1074     return it;
 1075 }
 1076 
 1077 /*** LRU MAINTENANCE THREAD ***/
 1078 
 1079 /* Returns the number of items removed, expired, or evicted.
 1080  * Callable from worker threads or the LRU maintainer thread */
 1081 int lru_pull_tail(const int orig_id, const int cur_lru,
 1082         const uint64_t total_bytes, const uint8_t flags, const rel_time_t max_age,
 1083         struct lru_pull_tail_return *ret_it) {
 1084     item *it = NULL;
 1085     int id = orig_id;
 1086     int removed = 0;
 1087     if (id == 0)
 1088         return 0;
 1089 
 1090     int tries = 5;
 1091     item *search;
 1092     item *next_it;
 1093     void *hold_lock = NULL;
 1094     unsigned int move_to_lru = 0;
 1095     uint64_t limit = 0;
 1096 
 1097     id |= cur_lru;
 1098     pthread_mutex_lock(&lru_locks[id]);
 1099     search = tails[id];
 1100     /* We walk up *only* for locked items, and if bottom is expired. */
 1101     for (; tries > 0 && search != NULL; tries--, search=next_it) {
 1102         /* we might relink search mid-loop, so search->prev isn't reliable */
 1103         next_it = search->prev;
 1104         if (search->nbytes == 0 && search->nkey == 0 && search->it_flags == 1) {
 1105             /* We are a crawler, ignore it. */
 1106             if (flags & LRU_PULL_CRAWL_BLOCKS) {
 1107                 pthread_mutex_unlock(&lru_locks[id]);
 1108                 return 0;
 1109             }
 1110             tries++;
 1111             continue;
 1112         }
 1113         uint32_t hv = hash(ITEM_key(search), search->nkey);
 1114         /* Attempt to hash item lock the "search" item. If locked, no
 1115          * other callers can incr the refcount. Also skip ourselves. */
 1116         if ((hold_lock = item_trylock(hv)) == NULL)
 1117             continue;
 1118         /* Now see if the item is refcount locked */
 1119         if (refcount_incr(search) != 2) {
 1120             /* Note pathological case with ref'ed items in tail.
 1121              * Can still unlink the item, but it won't be reusable yet */
 1122             itemstats[id].lrutail_reflocked++;
 1123             /* In case of refcount leaks, enable for quick workaround. */
 1124             /* WARNING: This can cause terrible corruption */
 1125             if (settings.tail_repair_time &&
 1126                     search->time + settings.tail_repair_time < current_time) {
 1127                 itemstats[id].tailrepairs++;
 1128                 search->refcount = 1;
 1129                 /* This will call item_remove -> item_free since refcnt is 1 */
 1130                 STORAGE_delete(ext_storage, search);
 1131                 do_item_unlink_nolock(search, hv);
 1132                 item_trylock_unlock(hold_lock);
 1133                 continue;
 1134             }
 1135         }
 1136 
 1137         /* Expired or flushed */
 1138         if ((search->exptime != 0 && search->exptime < current_time)
 1139             || item_is_flushed(search)) {
 1140             itemstats[id].reclaimed++;
 1141             if ((search->it_flags & ITEM_FETCHED) == 0) {
 1142                 itemstats[id].expired_unfetched++;
 1143             }
 1144             /* refcnt 2 -> 1 */
 1145             do_item_unlink_nolock(search, hv);
 1146             STORAGE_delete(ext_storage, search);
 1147             /* refcnt 1 -> 0 -> item_free */
 1148             do_item_remove(search);
 1149             item_trylock_unlock(hold_lock);
 1150             removed++;
 1151 
 1152             /* If all we're finding are expired, can keep going */
 1153             continue;
 1154         }
 1155 
 1156         /* If we're HOT_LRU or WARM_LRU and over size limit, send to COLD_LRU.
 1157          * If we're COLD_LRU, send to WARM_LRU unless we need to evict
 1158          */
 1159         switch (cur_lru) {
 1160             case HOT_LRU:
 1161                 limit = total_bytes * settings.hot_lru_pct / 100; /* fall through */
 1162             case WARM_LRU:
 1163                 if (limit == 0)
 1164                     limit = total_bytes * settings.warm_lru_pct / 100;
 1165                 /* Rescue ACTIVE items aggressively */
 1166                 if ((search->it_flags & ITEM_ACTIVE) != 0) {
 1167                     search->it_flags &= ~ITEM_ACTIVE;
 1168                     removed++;
 1169                     if (cur_lru == WARM_LRU) {
 1170                         itemstats[id].moves_within_lru++;
 1171                         do_item_unlink_q(search);
 1172                         do_item_link_q(search);
 1173                         do_item_remove(search);
 1174                         item_trylock_unlock(hold_lock);
 1175                     } else {
 1176                         /* Active HOT_LRU items flow to WARM */
 1177                         itemstats[id].moves_to_warm++;
 1178                         move_to_lru = WARM_LRU;
 1179                         do_item_unlink_q(search);
 1180                         it = search;
 1181                     }
 1182                 } else if (sizes_bytes[id] > limit ||
 1183                            current_time - search->time > max_age) {
 1184                     itemstats[id].moves_to_cold++;
 1185                     move_to_lru = COLD_LRU;
 1186                     do_item_unlink_q(search);
 1187                     it = search;
 1188                     removed++;
 1189                     break;
 1190                 } else {
 1191                     /* Don't want to move to COLD, not active, bail out */
 1192                     it = search;
 1193                 }
 1194                 break;
 1195             case COLD_LRU:
 1196                 it = search; /* No matter what, we're stopping */
 1197                 if (flags & LRU_PULL_EVICT) {
 1198                     if (settings.evict_to_free == 0) {
 1199                         /* Don't think we need a counter for this. It'll OOM.  */
 1200                         break;
 1201                     }
 1202                     itemstats[id].evicted++;
 1203                     itemstats[id].evicted_time = current_time - search->time;
 1204                     if (search->exptime != 0)
 1205                         itemstats[id].evicted_nonzero++;
 1206                     if ((search->it_flags & ITEM_FETCHED) == 0) {
 1207                         itemstats[id].evicted_unfetched++;
 1208                     }
 1209                     if ((search->it_flags & ITEM_ACTIVE)) {
 1210                         itemstats[id].evicted_active++;
 1211                     }
 1212                     LOGGER_LOG(NULL, LOG_EVICTIONS, LOGGER_EVICTION, search);
 1213                     STORAGE_delete(ext_storage, search);
 1214                     do_item_unlink_nolock(search, hv);
 1215                     removed++;
 1216                     if (settings.slab_automove == 2) {
 1217                         slabs_reassign(-1, orig_id);
 1218                     }
 1219                 } else if (flags & LRU_PULL_RETURN_ITEM) {
 1220                     /* Keep a reference to this item and return it. */
 1221                     ret_it->it = it;
 1222                     ret_it->hv = hv;
 1223                 } else if ((search->it_flags & ITEM_ACTIVE) != 0
 1224                         && settings.lru_segmented) {
 1225                     itemstats[id].moves_to_warm++;
 1226                     search->it_flags &= ~ITEM_ACTIVE;
 1227                     move_to_lru = WARM_LRU;
 1228                     do_item_unlink_q(search);
 1229                     removed++;
 1230                 }
 1231                 break;
 1232             case TEMP_LRU:
 1233                 it = search; /* Kill the loop. Parent only interested in reclaims */
 1234                 break;
 1235         }
 1236         if (it != NULL)
 1237             break;
 1238     }
 1239 
 1240     pthread_mutex_unlock(&lru_locks[id]);
 1241 
 1242     if (it != NULL) {
 1243         if (move_to_lru) {
 1244             it->slabs_clsid = ITEM_clsid(it);
 1245             it->slabs_clsid |= move_to_lru;
 1246             item_link_q(it);
 1247         }
 1248         if ((flags & LRU_PULL_RETURN_ITEM) == 0) {
 1249             do_item_remove(it);
 1250             item_trylock_unlock(hold_lock);
 1251         }
 1252     }
 1253 
 1254     return removed;
 1255 }
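      /* Editorial summary of the flag bits accepted above (values defined in
       * memcached.h):
       *   LRU_PULL_EVICT        - evict the COLD tail if nothing reclaimable
       *   LRU_PULL_CRAWL_BLOCKS - stop instead of skipping a crawler marker
       *   LRU_PULL_RETURN_ITEM  - keep the reference and hash lock, handing
       *                           the item back via lru_pull_tail_return
       */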
 1256 
 1257 
 1258 /* TODO: Third place this code needs to be deduped */
 1259 static void lru_bump_buf_link_q(lru_bump_buf *b) {
 1260     pthread_mutex_lock(&bump_buf_lock);
 1261     assert(b != bump_buf_head);
 1262 
 1263     b->prev = 0;
 1264     b->next = bump_buf_head;
 1265     if (b->next) b->next->prev = b;
 1266     bump_buf_head = b;
 1267     if (bump_buf_tail == 0) bump_buf_tail = b;
 1268     pthread_mutex_unlock(&bump_buf_lock);
 1269     return;
 1270 }
 1271 
 1272 void *item_lru_bump_buf_create(void) {
 1273     lru_bump_buf *b = calloc(1, sizeof(lru_bump_buf));
 1274     if (b == NULL) {
 1275         return NULL;
 1276     }
 1277 
 1278     b->buf = bipbuf_new(sizeof(lru_bump_entry) * LRU_BUMP_BUF_SIZE);
 1279     if (b->buf == NULL) {
 1280         free(b);
 1281         return NULL;
 1282     }
 1283 
 1284     pthread_mutex_init(&b->mutex, NULL);
 1285 
 1286     lru_bump_buf_link_q(b);
 1287     return b;
 1288 }
 1289 
 1290 static bool lru_bump_async(lru_bump_buf *b, item *it, uint32_t hv) {
 1291     bool ret = true;
 1292     refcount_incr(it);
 1293     pthread_mutex_lock(&b->mutex);
 1294     lru_bump_entry *be = (lru_bump_entry *) bipbuf_request(b->buf, sizeof(lru_bump_entry));
 1295     if (be != NULL) {
 1296         be->it = it;
 1297         be->hv = hv;
 1298         if (bipbuf_push(b->buf, sizeof(lru_bump_entry)) == 0) {
 1299             ret = false;
 1300             b->dropped++;
 1301         }
 1302     } else {
 1303         ret = false;
 1304         b->dropped++;
 1305     }
 1306     if (!ret) {
 1307         refcount_decr(it);
 1308     }
 1309     pthread_mutex_unlock(&b->mutex);
 1310     return ret;
 1311 }
 1312 
 1313 /* TODO: Might be worth a micro-optimization of having bump buffers link
 1314  * themselves back into the central queue when queue goes from zero to
 1315  * non-zero, then remove from list if zero more than N times.
 1316  * If very few hits on cold this would avoid extra memory barriers from LRU
 1317  * maintainer thread. If many hits, they'll just stay in the list.
 1318  */
 1319 static bool lru_maintainer_bumps(void) {
 1320     lru_bump_buf *b;
 1321     lru_bump_entry *be;
 1322     unsigned int size;
 1323     unsigned int todo;
 1324     bool bumped = false;
 1325     pthread_mutex_lock(&bump_buf_lock);
 1326     for (b = bump_buf_head; b != NULL; b=b->next) {
 1327         pthread_mutex_lock(&b->mutex);
 1328         be = (lru_bump_entry *) bipbuf_peek_all(b->buf, &size);
 1329         pthread_mutex_unlock(&b->mutex);
 1330 
 1331         if (be == NULL) {
 1332             continue;
 1333         }
 1334         todo = size;
 1335         bumped = true;
 1336 
 1337         while (todo) {
 1338             item_lock(be->hv);
 1339             do_item_update(be->it);
 1340             do_item_remove(be->it);
 1341             item_unlock(be->hv);
 1342             be++;
 1343             todo -= sizeof(lru_bump_entry);
 1344         }
 1345 
 1346         pthread_mutex_lock(&b->mutex);
 1347         be = (lru_bump_entry *) bipbuf_poll(b->buf, size);
 1348         pthread_mutex_unlock(&b->mutex);
 1349     }
 1350     pthread_mutex_unlock(&bump_buf_lock);
 1351     return bumped;
 1352 }
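      /* Editorial note on the bump-buffer handoff: worker threads are the
       * producers (lru_bump_async: bipbuf_request + bipbuf_push under
       * b->mutex) and this maintainer is the sole consumer (bipbuf_peek_all,
       * bump with only the per-item locks held, then bipbuf_poll to release
       * the region). The buffer mutex is dropped while bumping so workers can
       * keep appending. */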
 1353 
 1354 static uint64_t lru_total_bumps_dropped(void) {
 1355     uint64_t total = 0;
 1356     lru_bump_buf *b;
 1357     pthread_mutex_lock(&bump_buf_lock);
 1358     for (b = bump_buf_head; b != NULL; b=b->next) {
 1359         pthread_mutex_lock(&b->mutex);
 1360         total += b->dropped;
 1361         pthread_mutex_unlock(&b->mutex);
 1362     }
 1363     pthread_mutex_unlock(&bump_buf_lock);
 1364     return total;
 1365 }
 1366 
 1367 /* Loop up to N times:
 1368  * If too many items are in HOT_LRU, push to COLD_LRU
 1369  * If too many items are in WARM_LRU, push to COLD_LRU
 1370  * If too many items are in COLD_LRU, poke COLD_LRU tail
 1371  * 1000 loops with 1ms min sleep gives us under 1m items shifted/sec. The
 1372  * locks can't handle much more than that. Leaving a TODO for how to
 1373  * auto-adjust in the future.
 1374  */
 1375 static int lru_maintainer_juggle(const int slabs_clsid) {
 1376     int i;
 1377     int did_moves = 0;
 1378     uint64_t total_bytes = 0;
 1379     unsigned int chunks_perslab = 0;
 1380     //unsigned int chunks_free = 0;
 1381     /* TODO: if free_chunks below high watermark, increase aggressiveness */
 1382     slabs_available_chunks(slabs_clsid, NULL,
 1383             &chunks_perslab);
 1384     if (settings.temp_lru) {
 1385         /* Only looking for reclaims. Run before we size the LRU. */
 1386         for (i = 0; i < 500; i++) {
 1387             if (lru_pull_tail(slabs_clsid, TEMP_LRU, 0, 0, 0, NULL) <= 0) {
 1388                 break;
 1389             } else {
 1390                 did_moves++;
 1391             }
 1392         }
 1393     }
 1394 
 1395     rel_time_t cold_age = 0;
 1396     rel_time_t hot_age = 0;
 1397     rel_time_t warm_age = 0;
 1398     /* If LRU is in flat mode, force items to drain into COLD via max age of 0 */
 1399     if (settings.lru_segmented) {
 1400         pthread_mutex_lock(&lru_locks[slabs_clsid|COLD_LRU]);
 1401         if (tails[slabs_clsid|COLD_LRU]) {
 1402             cold_age = current_time - tails[slabs_clsid|COLD_LRU]->time;
 1403         }
 1404         // Also build up total_bytes for the classes.
 1405         total_bytes += sizes_bytes[slabs_clsid|COLD_LRU];
 1406         pthread_mutex_unlock(&lru_locks[slabs_clsid|COLD_LRU]);
 1407 
 1408         hot_age = cold_age * settings.hot_max_factor;
 1409         warm_age = cold_age * settings.warm_max_factor;
 1410 
 1411         // total_bytes doesn't have to be exact. cache it for the juggles.
 1412         pthread_mutex_lock(&lru_locks[slabs_clsid|HOT_LRU]);
 1413         total_bytes += sizes_bytes[slabs_clsid|HOT_LRU];
 1414         pthread_mutex_unlock(&lru_locks[slabs_clsid|HOT_LRU]);
 1415 
 1416         pthread_mutex_lock(&lru_locks[slabs_clsid|WARM_LRU]);
 1417         total_bytes += sizes_bytes[slabs_clsid|WARM_LRU];
 1418         pthread_mutex_unlock(&lru_locks[slabs_clsid|WARM_LRU]);
 1419     }
 1420 
 1421     /* Juggle HOT/WARM up to N times */
 1422     for (i = 0; i < 500; i++) {
 1423         int do_more = 0;
 1424         if (lru_pull_tail(slabs_clsid, HOT_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS, hot_age, NULL) ||
 1425             lru_pull_tail(slabs_clsid, WARM_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS, warm_age, NULL)) {
 1426             do_more++;
 1427         }
 1428         if (settings.lru_segmented) {
 1429             do_more += lru_pull_tail(slabs_clsid, COLD_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS, 0, NULL);
 1430         }
 1431         if (do_more == 0)
 1432             break;
 1433         did_moves++;
 1434     }
 1435     return did_moves;
 1436 }
 1437 
 1438 /* Will crawl all slab classes a minimum of once per hour */
 1439 #define MAX_MAINTCRAWL_WAIT (60 * 60)
 1440 
 1441 /* Hoping user input will improve this function. This is all a wild guess.
 1442  * Operation: Kicks the crawler for each slab id. Crawlers gather statistics
 1443  * on items with nonzero expirations, bucketing how many items will expire
 1444  * per minute for the next hour.
 1445  * This function checks the results of a run, and if it thinks more than 1%
 1446  * of expirable objects are ready to go, kicks the crawler again to reap.
 1447  * It will also kick the crawler once per minute regardless, waiting a
 1448  * minute longer each time it finds no work, capped at an hour between
 1449  * runs. The cap keeps an idle or newly started daemon from backing off
 1450  * so far that it stops retrying crawls.
 1451  */
 1452 static void lru_maintainer_crawler_check(struct crawler_expired_data *cdata, logger *l) {
 1453     int i;
 1454     static rel_time_t next_crawls[POWER_LARGEST];
 1455     static rel_time_t next_crawl_wait[POWER_LARGEST];
 1456     uint8_t todo[POWER_LARGEST];
 1457     memset(todo, 0, sizeof(uint8_t) * POWER_LARGEST);
 1458     bool do_run = false;
 1459     unsigned int tocrawl_limit = 0;
 1460 
 1461     // TODO: If not segmented LRU, skip non-cold
 1462     for (i = POWER_SMALLEST; i < POWER_LARGEST; i++) {
 1463         crawlerstats_t *s = &cdata->crawlerstats[i];
 1464         /* A completed crawl has results we haven't processed yet. */
 1465         if (s->run_complete) {
 1466             char *lru_name = "na";
 1467             pthread_mutex_lock(&cdata->lock);
 1468             int x;
 1469             /* Should we crawl again? */
 1470             uint64_t possible_reclaims = s->seen - s->noexp;
 1471             uint64_t available_reclaims = 0;
 1472             /* Need to think we can free at least 1% of the items before
 1473              * crawling. */
 1474             /* FIXME: Configurable? */
 1475             uint64_t low_watermark = (possible_reclaims / 100) + 1;
 1476             rel_time_t since_run = current_time - s->end_time;
 1477             /* Don't bother if the payoff is too low. */
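                  /* histo[x] counts items due to expire in minute x of the
                   * next hour; walk forward until the reclaimable count
                   * passes the watermark, then nudge the crawl wait toward
                   * that minute in 60-second steps. */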
 1478             for (x = 0; x < 60; x++) {
 1479                 available_reclaims += s->histo[x];
 1480                 if (available_reclaims > low_watermark) {
 1481                     if (next_crawl_wait[i] < (x * 60)) {
 1482                         next_crawl_wait[i] += 60;
 1483                     } else if (next_crawl_wait[i] >= 60) {
 1484                         next_crawl_wait[i] -= 60;
 1485                     }
 1486                     break;
 1487                 }
 1488             }
 1489 
 1490             if (available_reclaims == 0) {
 1491                 next_crawl_wait[i] += 60;
 1492             }
 1493 
 1494             if (next_crawl_wait[i] > MAX_MAINTCRAWL_WAIT) {
 1495                 next_crawl_wait[i] = MAX_MAINTCRAWL_WAIT;
 1496             }
 1497 
 1498             next_crawls[i] = current_time + next_crawl_wait[i] + 5;
 1499             switch (GET_LRU(i)) {
 1500                 case HOT_LRU:
 1501                     lru_name = "hot";
 1502                     break;
 1503                 case WARM_LRU:
 1504                     lru_name = "warm";
 1505                     break;
 1506                 case COLD_LRU:
 1507                     lru_name = "cold";
 1508                     break;
 1509                 case TEMP_LRU:
 1510                     lru_name = "temp";
 1511                     break;
 1512             }
 1513             LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_CRAWLER_STATUS, NULL,
 1514                     CLEAR_LRU(i),
 1515                     lru_name,
 1516                     (unsigned long long)low_watermark,
 1517                     (unsigned long long)available_reclaims,
 1518                     (unsigned int)since_run,
 1519                     next_crawls[i] - current_time,
 1520                     s->end_time - s->start_time,
 1521                     s->seen,
 1522                     s->reclaimed);
 1523             // Got our calculation; skip further processing until the next completed run.
 1524             s->run_complete = false;
 1525             pthread_mutex_unlock(&cdata->lock);
 1526         }
 1527         if (current_time > next_crawls[i]) {
 1528             pthread_mutex_lock(&lru_locks[i]);
 1529             if (sizes[i] > tocrawl_limit) {
 1530                 tocrawl_limit = sizes[i];
 1531             }
 1532             pthread_mutex_unlock(&lru_locks[i]);
 1533             todo[i] = 1;
 1534             do_run = true;
 1535             next_crawls[i] = current_time + 5; // minimum retry wait.
 1536         }
 1537     }
 1538     if (do_run) {
 1539         if (settings.lru_crawler_tocrawl && settings.lru_crawler_tocrawl < tocrawl_limit) {
 1540             tocrawl_limit = settings.lru_crawler_tocrawl;
 1541         }
 1542         lru_crawler_start(todo, tocrawl_limit, CRAWLER_AUTOEXPIRE, cdata, NULL, 0);
 1543     }
 1544 }
 1545 
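      /* Slab automove function tables; lru_maintainer_thread selects the
       * extstore variant when external storage is configured. */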
 1546 slab_automove_reg_t slab_automove_default = {
 1547     .init = slab_automove_init,
 1548     .free = slab_automove_free,
 1549     .run = slab_automove_run
 1550 };
 1551 #ifdef EXTSTORE
 1552 slab_automove_reg_t slab_automove_extstore = {
 1553     .init = slab_automove_extstore_init,
 1554     .free = slab_automove_extstore_free,
 1555     .run = slab_automove_extstore_run
 1556 };
 1557 #endif
 1558 static pthread_t lru_maintainer_tid;
 1559 
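      /* Maintainer sleep bounds, in microseconds: 1s max, 1ms min. */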
 1560 #define MAX_LRU_MAINTAINER_SLEEP 1000000
 1561 #define MIN_LRU_MAINTAINER_SLEEP 1000
 1562 
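      /* Main loop: juggle each slab class with a per-class adaptive backoff,
       * replay queued LRU bumps, run the expiry-crawler check at most once a
       * second, and drive the slab automover when enabled. */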
 1563 static void *lru_maintainer_thread(void *arg) {
 1564     slab_automove_reg_t *sam = &slab_automove_default;
 1565 #ifdef EXTSTORE
 1566     void *storage = arg;
 1567     if (storage != NULL)
 1568         sam = &slab_automove_extstore;
 1569 #endif
 1570     int i;
 1571     useconds_t to_sleep = MIN_LRU_MAINTAINER_SLEEP;
 1572     useconds_t last_sleep = MIN_LRU_MAINTAINER_SLEEP;
 1573     rel_time_t last_crawler_check = 0;
 1574     rel_time_t last_automove_check = 0;
 1575     useconds_t next_juggles[MAX_NUMBER_OF_SLAB_CLASSES] = {0};
 1576     useconds_t backoff_juggles[MAX_NUMBER_OF_SLAB_CLASSES] = {0};
 1577     struct crawler_expired_data *cdata =
 1578         calloc(1, sizeof(struct crawler_expired_data));
 1579     if (cdata == NULL) {
 1580         fprintf(stderr, "Failed to allocate crawler data for LRU maintainer thread\n");
 1581         abort();
 1582     }
 1583     pthread_mutex_init(&cdata->lock, NULL);
 1584     cdata->crawl_complete = true; // kick off the crawler.
 1585     logger *l = logger_create();
 1586     if (l == NULL) {
 1587         fprintf(stderr, "Failed to allocate logger for LRU maintainer thread\n");
 1588         abort();
 1589     }
 1590 
 1591     double last_ratio = settings.slab_automove_ratio;
 1592     void *am = sam->init(&settings);
 1593 
 1594     pthread_mutex_lock(&lru_maintainer_lock);
 1595     if (settings.verbose > 2)
 1596         fprintf(stderr, "Starting LRU maintainer background thread\n");
 1597     while (do_run_lru_maintainer_thread) {
 1598         pthread_mutex_unlock(&lru_maintainer_lock);
 1599         if (to_sleep)
 1600             usleep(to_sleep);
 1601         pthread_mutex_lock(&lru_maintainer_lock);
 1602         /* A sleep of zero counts as a minimum of a 1ms wait */
 1603         last_sleep = to_sleep > 1000 ? to_sleep : 1000;
 1604         to_sleep = MAX_LRU_MAINTAINER_SLEEP;
 1605 
 1606         STATS_LOCK();
 1607         stats.lru_maintainer_juggles++;
 1608         STATS_UNLOCK();
 1609 
 1610         /* Each slab class gets its own sleep to avoid hammering locks */
 1611         for (i = POWER_SMALLEST; i < MAX_NUMBER_OF_SLAB_CLASSES; i++) {
 1612             next_juggles[i] = next_juggles[i] > last_sleep ? next_juggles[i] - last_sleep : 0;
 1613 
 1614             if (next_juggles[i] > 0) {
 1615                 // Sleep the thread just for the minimum amount (or not at all)
 1616                 if (next_juggles[i] < to_sleep)
 1617                     to_sleep = next_juggles[i];
 1618                 continue;
 1619             }
 1620 
 1621             int did_moves = lru_maintainer_juggle(i);
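            /* No moves: grow this class's backoff by 1/8th, starting at the
             * 1ms floor and capped at the 1s max. Any moves: halve the
             * backoff, clearing it once it drops below the floor. */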
 1622             if (did_moves == 0) {
 1623                 if (backoff_juggles[i] != 0) {
 1624                     backoff_juggles[i] += backoff_juggles[i] / 8;
 1625                 } else {
 1626                     backoff_juggles[i] = MIN_LRU_MAINTAINER_SLEEP;
 1627                 }
 1628                 if (backoff_juggles[i] > MAX_LRU_MAINTAINER_SLEEP)
 1629                     backoff_juggles[i] = MAX_LRU_MAINTAINER_SLEEP;
 1630             } else if (backoff_juggles[i] > 0) {
 1631                 backoff_juggles[i] /= 2;
 1632                 if (backoff_juggles[i] < MIN_LRU_MAINTAINER_SLEEP) {
 1633                     backoff_juggles[i] = 0;
 1634                 }
 1635             }
 1636             next_juggles[i] = backoff_juggles[i];
 1637             if (next_juggles[i] < to_sleep)
 1638                 to_sleep = next_juggles[i];
 1639         }
 1640 
 1641         /* Minimize the sleep if we had async LRU bumps to process */
 1642         if (settings.lru_segmented && lru_maintainer_bumps() && to_sleep > 1000) {
 1643             to_sleep = 1000;
 1644         }
 1645 
 1646         /* Once per second at most */
 1647         if (settings.lru_crawler && last_crawler_check != current_time) {
 1648             lru_maintainer_crawler_check(cdata, l);
 1649             last_crawler_check = current_time;
 1650         }
 1651 
 1652         if (settings.slab_automove == 1 && last_automove_check != current_time) {
 1653             if (last_ratio != settings.slab_automove_ratio) {
 1654                 sam->free(am);
 1655                 am = sam->init(&settings);
 1656                 last_ratio = settings.slab_automove_ratio;
 1657             }
 1658             int src, dst;
 1659             sam->run(am, &src, &dst);
 1660             if (src != -1 && dst != -1) {
 1661                 slabs_reassign(src, dst);
 1662                 LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_SLAB_MOVE, NULL,
 1663                         src, dst);
 1664             }
 1665             // dst == 0 means reclaim to global pool, be more aggressive
 1666             if (dst != 0) {
 1667                 last_automove_check = current_time;
 1668             } else {
 1669                 // also ensure we minimize the thread sleep
 1670                 to_sleep = 1000;
 1671             }
 1672         }
 1673     }
 1674     pthread_mutex_unlock(&lru_maintainer_lock);
 1675     sam->free(am);
 1676     // The LRU crawler *must* already be stopped before cdata is freed.
 1677     free(cdata);
 1678     if (settings.verbose > 2)
 1679         fprintf(stderr, "LRU maintainer thread stopping\n");
 1680 
 1681     return NULL;
 1682 }
 1683 
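      /* Flag the maintainer loop to exit, then join the thread. */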
 1684 int stop_lru_maintainer_thread(void) {
 1685     int ret;
 1686     pthread_mutex_lock(&lru_maintainer_lock);
 1687     /* LRU thread is a sleep loop, will die on its own */
 1688     do_run_lru_maintainer_thread = 0;
 1689     pthread_mutex_unlock(&lru_maintainer_lock);
 1690     if ((ret = pthread_join(lru_maintainer_tid, NULL)) != 0) {
 1691         fprintf(stderr, "Failed to stop LRU maintainer thread: %s\n", strerror(ret));
 1692         return -1;
 1693     }
 1694     settings.lru_maintainer_thread = false;
 1695     return 0;
 1696 }
 1697 
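      /* Spawn the maintainer thread; arg is passed through as the optional
       * extstore handle (may be NULL). */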
 1698 int start_lru_maintainer_thread(void *arg) {
 1699     int ret;
 1700 
 1701     pthread_mutex_lock(&lru_maintainer_lock);
 1702     do_run_lru_maintainer_thread = 1;
 1703     settings.lru_maintainer_thread = true;
 1704     if ((ret = pthread_create(&lru_maintainer_tid, NULL,
 1705         lru_maintainer_thread, arg)) != 0) {
 1706         fprintf(stderr, "Can't create LRU maintainer thread: %s\n",
 1707             strerror(ret));
 1708         pthread_mutex_unlock(&lru_maintainer_lock);
 1709         return -1;
 1710     }
 1711     pthread_mutex_unlock(&lru_maintainer_lock);
 1712 
 1713     return 0;
 1714 }
 1715 
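      /* Pause/resume bracket the maintainer from other threads. Illustrative
       * use (the surrounding code is hypothetical):
       *     lru_maintainer_pause();
       *     ... mutate state the maintainer reads ...
       *     lru_maintainer_resume();
       */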
 1716 /* If we hold this lock, the maintainer thread can't wake up or run */
 1717 void lru_maintainer_pause(void) {
 1718     pthread_mutex_lock(&lru_maintainer_lock);
 1719 }
 1720 
 1721 void lru_maintainer_resume(void) {
 1722     pthread_mutex_unlock(&lru_maintainer_lock);
 1723 }
 1724 
 1725 int init_lru_maintainer(void) {
 1726     lru_maintainer_initialized = 1;
 1727     return 0;
 1728 }
 1729 
 1730 /* Tail linkers and list shuffling for the LRU crawler's pseudo-items. */
 1731 void do_item_linktail_q(item *it) { /* item is the new tail */
 1732     item **head, **tail;
 1733     assert(it->it_flags == 1);
 1734     assert(it->nbytes == 0);
 1735 
 1736     head = &heads[it->slabs_clsid];
 1737     tail = &tails[it->slabs_clsid];
 1738     //assert(*tail != 0);
 1739     assert(it != *tail);
 1740     assert((*head && *tail) || (*head == 0 && *tail == 0));
 1741     it->prev = *tail;
 1742     it->next = 0;
 1743     if (it->prev) {
 1744         assert(it->prev->next == 0);
 1745         it->prev->next = it;
 1746     }
 1747     *tail = it;
 1748     if (*head == 0) *head = it;
 1749     return;
 1750 }
 1751 
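      /* Unlink the crawler pseudo-item from its class's list, fixing up
       * head/tail pointers as needed. */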
 1752 void do_item_unlinktail_q(item *it) {
 1753     item **head, **tail;
 1754     head = &heads[it->slabs_clsid];
 1755     tail = &tails[it->slabs_clsid];
 1756 
 1757     if (*head == it) {
 1758         assert(it->prev == 0);
 1759         *head = it->next;
 1760     }
 1761     if (*tail == it) {
 1762         assert(it->next == 0);
 1763         *tail = it->prev;
 1764     }
 1765     assert(it->next != it);
 1766     assert(it->prev != it);
 1767 
 1768     if (it->next) it->next->prev = it->prev;
 1769     if (it->prev) it->prev->next = it->next;
 1770     return;
 1771 }
 1772 
 1773 /* Step the crawler one spot toward the head, returning the item it just
 1774  * passed, or NULL at the head. Too convoluted; try to rewrite it more clearly. */
 1775 item *do_item_crawl_q(item *it) {
 1776     item **head, **tail;
 1777     assert(it->it_flags == 1);
 1778     assert(it->nbytes == 0);
 1779     head = &heads[it->slabs_clsid];
 1780     tail = &tails[it->slabs_clsid];
 1781 
 1782     /* We've hit the head, pop off */
 1783     if (it->prev == 0) {
 1784         assert(*head == it);
 1785         if (it->next) {
 1786             *head = it->next;
 1787             assert(it->next->prev == it);
 1788             it->next->prev = 0;
 1789         }
 1790         return NULL; /* Done */
 1791     }
 1792 
 1793     /* Swap places with our prev, moving one step toward the head */
 1794     /* NB: If there is a prev, we can't be the head */
 1795     assert(it->prev != it);
 1796     if (it->prev) {
 1797         if (*head == it->prev) {
 1798             /* Prev was the head, now we're the head */
 1799             *head = it;
 1800         }
 1801         if (*tail == it) {
 1802             /* We are the tail, now they are the tail */
 1803             *tail = it->prev;
 1804         }
 1805         assert(it->next != it);
 1806         if (it->next) {
 1807             assert(it->prev->next == it);
 1808             it->prev->next = it->next;
 1809             it->next->prev = it->prev;
 1810         } else {
 1811             /* Tail. Move this above? */
 1812             it->prev->next = 0;
 1813         }
 1814         /* prev->prev's next is it->prev */
 1815         it->next = it->prev;
 1816         it->prev = it->next->prev;
 1817         it->next->prev = it;
 1818         /* New it->prev now, if we're not at the head. */
 1819         if (it->prev) {
 1820             it->prev->next = it;
 1821         }
 1822     }
 1823     assert(it->next != it);
 1824     assert(it->prev != it);
 1825 
 1826     return it->next; /* success */
 1827 }