"Fossies" - the Fresh Open Source Software Archive 
Member "memcached-1.6.15/items.c" (21 Feb 2022, 63931 Bytes) of package /linux/www/memcached-1.6.15.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
For more information about "items.c" see the
Fossies "Dox" file reference documentation and the last
Fossies "Diffs" side-by-side code changes report:
1.6.10_vs_1.6.11.
1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 #include "memcached.h"
3 #include "bipbuffer.h"
4 #include "slab_automove.h"
5 #include "storage.h"
6 #ifdef EXTSTORE
7 #include "slab_automove_extstore.h"
8 #endif
9 #include <sys/stat.h>
10 #include <sys/socket.h>
11 #include <sys/resource.h>
12 #include <fcntl.h>
13 #include <netinet/in.h>
14 #include <errno.h>
15 #include <stdlib.h>
16 #include <stdio.h>
17 #include <signal.h>
18 #include <string.h>
19 #include <time.h>
20 #include <assert.h>
21 #include <unistd.h>
22 #include <poll.h>
23
24 /* Forward Declarations */
25 static void item_link_q(item *it);
26 static void item_unlink_q(item *it);
27
28 static unsigned int lru_type_map[4] = {HOT_LRU, WARM_LRU, COLD_LRU, TEMP_LRU};
29
30 #define LARGEST_ID POWER_LARGEST
31 typedef struct {
32 uint64_t evicted;
33 uint64_t evicted_nonzero;
34 uint64_t reclaimed;
35 uint64_t outofmemory;
36 uint64_t tailrepairs;
37 uint64_t expired_unfetched; /* items reclaimed but never touched */
38 uint64_t evicted_unfetched; /* items evicted but never touched */
39 uint64_t evicted_active; /* items evicted that should have been shuffled */
40 uint64_t crawler_reclaimed;
41 uint64_t crawler_items_checked;
42 uint64_t lrutail_reflocked;
43 uint64_t moves_to_cold;
44 uint64_t moves_to_warm;
45 uint64_t moves_within_lru;
46 uint64_t direct_reclaims;
47 uint64_t hits_to_hot;
48 uint64_t hits_to_warm;
49 uint64_t hits_to_cold;
50 uint64_t hits_to_temp;
51 uint64_t mem_requested;
52 rel_time_t evicted_time;
53 } itemstats_t;
54
55 static item *heads[LARGEST_ID];
56 static item *tails[LARGEST_ID];
57 static itemstats_t itemstats[LARGEST_ID];
58 static unsigned int sizes[LARGEST_ID];
59 static uint64_t sizes_bytes[LARGEST_ID];
60 static unsigned int *stats_sizes_hist = NULL;
61 static uint64_t stats_sizes_cas_min = 0;
62 static int stats_sizes_buckets = 0;
63 static uint64_t cas_id = 0;
64
65 static volatile int do_run_lru_maintainer_thread = 0;
66 static int lru_maintainer_initialized = 0;
67 static pthread_mutex_t lru_maintainer_lock = PTHREAD_MUTEX_INITIALIZER;
68 static pthread_mutex_t cas_id_lock = PTHREAD_MUTEX_INITIALIZER;
69 static pthread_mutex_t stats_sizes_lock = PTHREAD_MUTEX_INITIALIZER;
70
71 void item_stats_reset(void) {
72 int i;
73 for (i = 0; i < LARGEST_ID; i++) {
74 pthread_mutex_lock(&lru_locks[i]);
75 memset(&itemstats[i], 0, sizeof(itemstats_t));
76 pthread_mutex_unlock(&lru_locks[i]);
77 }
78 }
79
80 /* called with class lru lock held */
81 void do_item_stats_add_crawl(const int i, const uint64_t reclaimed,
82 const uint64_t unfetched, const uint64_t checked) {
83 itemstats[i].crawler_reclaimed += reclaimed;
84 itemstats[i].expired_unfetched += unfetched;
85 itemstats[i].crawler_items_checked += checked;
86 }
87
88 typedef struct _lru_bump_buf {
89 struct _lru_bump_buf *prev;
90 struct _lru_bump_buf *next;
91 pthread_mutex_t mutex;
92 bipbuf_t *buf;
93 uint64_t dropped;
94 } lru_bump_buf;
95
96 typedef struct {
97 item *it;
98 uint32_t hv;
99 } lru_bump_entry;
100
101 static lru_bump_buf *bump_buf_head = NULL;
102 static lru_bump_buf *bump_buf_tail = NULL;
103 static pthread_mutex_t bump_buf_lock = PTHREAD_MUTEX_INITIALIZER;
104 /* TODO: tunable? Need bench results */
105 #define LRU_BUMP_BUF_SIZE 8192
106
107 static bool lru_bump_async(lru_bump_buf *b, item *it, uint32_t hv);
108 static uint64_t lru_total_bumps_dropped(void);
109
110 /* Get the next CAS id for a new item. */
111 /* TODO: refactor some atomics for this. */
112 uint64_t get_cas_id(void) {
113 pthread_mutex_lock(&cas_id_lock);
114 uint64_t next_id = ++cas_id;
115 pthread_mutex_unlock(&cas_id_lock);
116 return next_id;
117 }
118
119 void set_cas_id(uint64_t new_cas) {
120 pthread_mutex_lock(&cas_id_lock);
121 cas_id = new_cas;
122 pthread_mutex_unlock(&cas_id_lock);
123 }
124
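
/* A minimal sketch of the atomics refactor hinted at by the TODO above,
 * assuming C11 <stdatomic.h>; illustrative only -- the mutex-based versions
 * above are the actual implementation, and these names are hypothetical. */
#if 0
#include <stdatomic.h>

static _Atomic uint64_t cas_id_atomic = 0;

/* get_cas_id() without the lock: one atomic read-modify-write still yields
 * unique, monotonically increasing ids. */
uint64_t get_cas_id_atomic(void) {
    return atomic_fetch_add_explicit(&cas_id_atomic, 1, memory_order_relaxed) + 1;
}

/* set_cas_id() without the lock: a plain atomic store. */
void set_cas_id_atomic(uint64_t new_cas) {
    atomic_store_explicit(&cas_id_atomic, new_cas, memory_order_relaxed);
}
#endif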
125 int item_is_flushed(item *it) {
126 rel_time_t oldest_live = settings.oldest_live;
127 uint64_t cas = ITEM_get_cas(it);
128 uint64_t oldest_cas = settings.oldest_cas;
129 if (oldest_live == 0 || oldest_live > current_time)
130 return 0;
131 if ((it->time <= oldest_live)
132 || (oldest_cas != 0 && cas != 0 && cas < oldest_cas)) {
133 return 1;
134 }
135 return 0;
136 }
137
138 /* must be locked before call */
139 unsigned int do_get_lru_size(uint32_t id) {
140 return sizes[id];
141 }
142
143 /* Enable this for reference-count debugging. */
144 #if 0
145 # define DEBUG_REFCNT(it,op) \
146 fprintf(stderr, "item %p refcnt(%c) %d %c%c\n", \
147 (void *)it, op, it->refcount, \
148 (it->it_flags & ITEM_LINKED) ? 'L' : ' ', \
149 (it->it_flags & ITEM_SLABBED) ? 'S' : ' ')
150 #else
151 # define DEBUG_REFCNT(it,op) while(0)
152 #endif
153
154 /**
155 * Generates the variable-sized part of the header for an object.
156 *
157 * nkey - The length of the key
158 * flags - key flags
159 * nbytes - Number of bytes to hold the value and the trailing CRLF terminator
160 * suffix - Buffer for the "VALUE" line suffix (flags, size).
161 * nsuffix - The length of the suffix is stored here.
162 *
163 * Returns the total size of the header.
164 */
165 static size_t item_make_header(const uint8_t nkey, const unsigned int flags, const int nbytes,
166 char *suffix, uint8_t *nsuffix) {
167 if (flags == 0) {
168 *nsuffix = 0;
169 } else {
170 *nsuffix = sizeof(flags);
171 }
172 return sizeof(item) + nkey + *nsuffix + nbytes;
173 }
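
/* Worked example of the arithmetic above: storing key "foo" with nonzero
 * client flags and a 5-byte value. do_item_alloc() below passes nkey + 1 (4)
 * and an nbytes that already includes the trailing CRLF (5 + 2 = 7), so this
 * returns sizeof(item) + 4 + sizeof(flags) + 7; with CAS enabled the caller
 * then adds another sizeof(uint64_t) on top. */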
174
175 item *do_item_alloc_pull(const size_t ntotal, const unsigned int id) {
176 item *it = NULL;
177 int i;
178 /* If no memory is available, attempt a direct LRU juggle/eviction */
179 /* This is a race in order to simplify lru_pull_tail; in cases where
180 * locked items are on the tail, you want them to fall out and cause
181 * occasional OOMs, rather than internally work around them.
182 * This also gives one fewer code path for slab alloc/free
183 */
184 for (i = 0; i < 10; i++) {
185 /* Try to reclaim memory first */
186 if (!settings.lru_segmented) {
187 lru_pull_tail(id, COLD_LRU, 0, 0, 0, NULL);
188 }
189 it = slabs_alloc(ntotal, id, 0);
190
191 if (it == NULL) {
192 // We send '0' in for "total_bytes" as this routine is always
193 // pulling to evict, or forcing HOT -> COLD migration.
194 // As of this writing, total_bytes isn't at all used with COLD_LRU.
195 if (lru_pull_tail(id, COLD_LRU, 0, LRU_PULL_EVICT, 0, NULL) <= 0) {
196 if (settings.lru_segmented) {
197 lru_pull_tail(id, HOT_LRU, 0, 0, 0, NULL);
198 } else {
199 break;
200 }
201 }
202 } else {
203 break;
204 }
205 }
206
207 if (i > 0) {
208 pthread_mutex_lock(&lru_locks[id]);
209 itemstats[id].direct_reclaims += i;
210 pthread_mutex_unlock(&lru_locks[id]);
211 }
212
213 return it;
214 }
215
216 /* Chain another chunk onto this chunk. */
217 /* slab mover: if it finds a chunk without ITEM_CHUNK flag, and no ITEM_LINKED
218 * flag, it counts as busy and skips.
219 * I think it might still not be safe to do linking outside of the slab lock
220 */
221 item_chunk *do_item_alloc_chunk(item_chunk *ch, const size_t bytes_remain) {
222 // TODO: Should be a cleaner way of finding real size with slabber calls
223 size_t size = bytes_remain + sizeof(item_chunk);
224 if (size > settings.slab_chunk_size_max)
225 size = settings.slab_chunk_size_max;
226 unsigned int id = slabs_clsid(size);
227
228 item_chunk *nch = (item_chunk *) do_item_alloc_pull(size, id);
229 if (nch == NULL)
230 return NULL;
231
232 // link in.
233 // ITEM_CHUNK[ED] bits need to be protected by the slabs lock.
234 slabs_mlock();
235 nch->head = ch->head;
236 ch->next = nch;
237 nch->prev = ch;
238 nch->next = 0;
239 nch->used = 0;
240 nch->slabs_clsid = id;
241 nch->size = size - sizeof(item_chunk);
242 nch->it_flags |= ITEM_CHUNK;
243 slabs_munlock();
244 return nch;
245 }
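
/* A rough sketch of how a caller might stream a large value into a chunk
 * chain built with do_item_alloc_chunk() above. copy_into_chunks() is a
 * hypothetical helper (the real upload path lives in memcached.c); shown
 * only to illustrate the lazy, chunk-at-a-time allocation. */
#if 0
static int copy_into_chunks(item *it, const char *src, int len) {
    item_chunk *ch = (item_chunk *) ITEM_schunk(it);
    while (len > 0) {
        if (ch->used == ch->size) {
            /* Current chunk full (or the zero-sized internal chunk):
             * lazily allocate and link the next one. */
            item_chunk *nch = do_item_alloc_chunk(ch, len);
            if (nch == NULL)
                return -1;
            ch = nch;
        }
        int avail = ch->size - ch->used;
        int todo = len < avail ? len : avail;
        memcpy(ch->data + ch->used, src, todo);
        ch->used += todo;
        src += todo;
        len -= todo;
    }
    return 0;
}
#endif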
246
247 item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
248 const rel_time_t exptime, const int nbytes) {
249 uint8_t nsuffix;
250 item *it = NULL;
251 char suffix[40];
252 // Avoid potential underflows.
253 if (nbytes < 2)
254 return 0;
255
256 size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
257 if (settings.use_cas) {
258 ntotal += sizeof(uint64_t);
259 }
260
261 unsigned int id = slabs_clsid(ntotal);
262 unsigned int hdr_id = 0;
263 if (id == 0)
264 return 0;
265
266 /* This is a large item. Allocate a header object now, lazily allocate
267 * chunks while reading the upload.
268 */
269 if (ntotal > settings.slab_chunk_size_max) {
270 /* We still link this item into the LRU for the larger slab class, but
271 * we're pulling a header from an entirely different slab class. The
272 * free routines handle large items specifically.
273 */
274 int htotal = nkey + 1 + nsuffix + sizeof(item) + sizeof(item_chunk);
275 if (settings.use_cas) {
276 htotal += sizeof(uint64_t);
277 }
278 #ifdef NEED_ALIGN
279 // header chunk needs to be padded on some systems
280 int remain = htotal % 8;
281 if (remain != 0) {
282 htotal += 8 - remain;
283 }
284 #endif
285 hdr_id = slabs_clsid(htotal);
286 it = do_item_alloc_pull(htotal, hdr_id);
287 /* setting ITEM_CHUNKED is fine here because we aren't LINKED yet. */
288 if (it != NULL)
289 it->it_flags |= ITEM_CHUNKED;
290 } else {
291 it = do_item_alloc_pull(ntotal, id);
292 }
293
294 if (it == NULL) {
295 pthread_mutex_lock(&lru_locks[id]);
296 itemstats[id].outofmemory++;
297 pthread_mutex_unlock(&lru_locks[id]);
298 return NULL;
299 }
300
301 assert(it->it_flags == 0 || it->it_flags == ITEM_CHUNKED);
302 //assert(it != heads[id]);
303
304 /* Refcount is seeded to 1 by slabs_alloc() */
305 it->next = it->prev = 0;
306
307 /* Items are initially loaded into the HOT_LRU. This is '0' but I want at
308 * least a note here. Compiler (hopefully?) optimizes this out.
309 */
310 if (settings.temp_lru &&
311 exptime - current_time <= settings.temporary_ttl) {
312 id |= TEMP_LRU;
313 } else if (settings.lru_segmented) {
314 id |= HOT_LRU;
315 } else {
316 /* There is only COLD in compat-mode */
317 id |= COLD_LRU;
318 }
319 it->slabs_clsid = id;
320
321 DEBUG_REFCNT(it, '*');
322 it->it_flags |= settings.use_cas ? ITEM_CAS : 0;
323 it->it_flags |= nsuffix != 0 ? ITEM_CFLAGS : 0;
324 it->nkey = nkey;
325 it->nbytes = nbytes;
326 memcpy(ITEM_key(it), key, nkey);
327 it->exptime = exptime;
328 if (nsuffix > 0) {
329 memcpy(ITEM_suffix(it), &flags, sizeof(flags));
330 }
331
332 /* Initialize internal chunk. */
333 if (it->it_flags & ITEM_CHUNKED) {
334 item_chunk *chunk = (item_chunk *) ITEM_schunk(it);
335
336 chunk->next = 0;
337 chunk->prev = 0;
338 chunk->used = 0;
339 chunk->size = 0;
340 chunk->head = it;
341 chunk->orig_clsid = hdr_id;
342 }
343 it->h_next = 0;
344
345 return it;
346 }
347
348 void item_free(item *it) {
349 size_t ntotal = ITEM_ntotal(it);
350 unsigned int clsid;
351 assert((it->it_flags & ITEM_LINKED) == 0);
352 assert(it != heads[it->slabs_clsid]);
353 assert(it != tails[it->slabs_clsid]);
354 assert(it->refcount == 0);
355
356 /* so slab size changer can tell later if item is already free or not */
357 clsid = ITEM_clsid(it);
358 DEBUG_REFCNT(it, 'F');
359 slabs_free(it, ntotal, clsid);
360 }
361
362 /**
363 * Returns true if an item will fit in the cache (its size does not exceed
364 * the maximum for a cache entry).
365 */
366 bool item_size_ok(const size_t nkey, const int flags, const int nbytes) {
367 char prefix[40];
368 uint8_t nsuffix;
369 if (nbytes < 2)
370 return false;
371
372 size_t ntotal = item_make_header(nkey + 1, flags, nbytes,
373 prefix, &nsuffix);
374 if (settings.use_cas) {
375 ntotal += sizeof(uint64_t);
376 }
377
378 return slabs_clsid(ntotal) != 0;
379 }
380
381 /* fixing stats/references during warm start */
382 void do_item_link_fixup(item *it) {
383 item **head, **tail;
384 int ntotal = ITEM_ntotal(it);
385 uint32_t hv = hash(ITEM_key(it), it->nkey);
386 assoc_insert(it, hv);
387
388 head = &heads[it->slabs_clsid];
389 tail = &tails[it->slabs_clsid];
390 if (it->prev == 0 && *head == 0) *head = it;
391 if (it->next == 0 && *tail == 0) *tail = it;
392 sizes[it->slabs_clsid]++;
393 sizes_bytes[it->slabs_clsid] += ntotal;
394
395 STATS_LOCK();
396 stats_state.curr_bytes += ntotal;
397 stats_state.curr_items += 1;
398 stats.total_items += 1;
399 STATS_UNLOCK();
400
401 item_stats_sizes_add(it);
402
403 return;
404 }
405
406 static void do_item_link_q(item *it) { /* item is the new head */
407 item **head, **tail;
408 assert((it->it_flags & ITEM_SLABBED) == 0);
409
410 head = &heads[it->slabs_clsid];
411 tail = &tails[it->slabs_clsid];
412 assert(it != *head);
413 assert((*head && *tail) || (*head == 0 && *tail == 0));
414 it->prev = 0;
415 it->next = *head;
416 if (it->next) it->next->prev = it;
417 *head = it;
418 if (*tail == 0) *tail = it;
419 sizes[it->slabs_clsid]++;
420 #ifdef EXTSTORE
421 if (it->it_flags & ITEM_HDR) {
422 sizes_bytes[it->slabs_clsid] += (ITEM_ntotal(it) - it->nbytes) + sizeof(item_hdr);
423 } else {
424 sizes_bytes[it->slabs_clsid] += ITEM_ntotal(it);
425 }
426 #else
427 sizes_bytes[it->slabs_clsid] += ITEM_ntotal(it);
428 #endif
429
430 return;
431 }
432
433 static void item_link_q(item *it) {
434 pthread_mutex_lock(&lru_locks[it->slabs_clsid]);
435 do_item_link_q(it);
436 pthread_mutex_unlock(&lru_locks[it->slabs_clsid]);
437 }
438
439 static void item_link_q_warm(item *it) {
440 pthread_mutex_lock(&lru_locks[it->slabs_clsid]);
441 do_item_link_q(it);
442 itemstats[it->slabs_clsid].moves_to_warm++;
443 pthread_mutex_unlock(&lru_locks[it->slabs_clsid]);
444 }
445
446 static void do_item_unlink_q(item *it) {
447 item **head, **tail;
448 head = &heads[it->slabs_clsid];
449 tail = &tails[it->slabs_clsid];
450
451 if (*head == it) {
452 assert(it->prev == 0);
453 *head = it->next;
454 }
455 if (*tail == it) {
456 assert(it->next == 0);
457 *tail = it->prev;
458 }
459 assert(it->next != it);
460 assert(it->prev != it);
461
462 if (it->next) it->next->prev = it->prev;
463 if (it->prev) it->prev->next = it->next;
464 sizes[it->slabs_clsid]--;
465 #ifdef EXTSTORE
466 if (it->it_flags & ITEM_HDR) {
467 sizes_bytes[it->slabs_clsid] -= (ITEM_ntotal(it) - it->nbytes) + sizeof(item_hdr);
468 } else {
469 sizes_bytes[it->slabs_clsid] -= ITEM_ntotal(it);
470 }
471 #else
472 sizes_bytes[it->slabs_clsid] -= ITEM_ntotal(it);
473 #endif
474
475 return;
476 }
477
478 static void item_unlink_q(item *it) {
479 pthread_mutex_lock(&lru_locks[it->slabs_clsid]);
480 do_item_unlink_q(it);
481 pthread_mutex_unlock(&lru_locks[it->slabs_clsid]);
482 }
483
484 int do_item_link(item *it, const uint32_t hv) {
485 MEMCACHED_ITEM_LINK(ITEM_key(it), it->nkey, it->nbytes);
486 assert((it->it_flags & (ITEM_LINKED|ITEM_SLABBED)) == 0);
487 it->it_flags |= ITEM_LINKED;
488 it->time = current_time;
489
490 STATS_LOCK();
491 stats_state.curr_bytes += ITEM_ntotal(it);
492 stats_state.curr_items += 1;
493 stats.total_items += 1;
494 STATS_UNLOCK();
495
496 /* Allocate a new CAS ID on link. */
497 ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0);
498 assoc_insert(it, hv);
499 item_link_q(it);
500 refcount_incr(it);
501 item_stats_sizes_add(it);
502
503 return 1;
504 }
505
506 void do_item_unlink(item *it, const uint32_t hv) {
507 MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
508 if ((it->it_flags & ITEM_LINKED) != 0) {
509 it->it_flags &= ~ITEM_LINKED;
510 STATS_LOCK();
511 stats_state.curr_bytes -= ITEM_ntotal(it);
512 stats_state.curr_items -= 1;
513 STATS_UNLOCK();
514 item_stats_sizes_remove(it);
515 assoc_delete(ITEM_key(it), it->nkey, hv);
516 item_unlink_q(it);
517 do_item_remove(it);
518 }
519 }
520
521 /* FIXME: Is it necessary to keep this copy/pasted code? */
522 void do_item_unlink_nolock(item *it, const uint32_t hv) {
523 MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
524 if ((it->it_flags & ITEM_LINKED) != 0) {
525 it->it_flags &= ~ITEM_LINKED;
526 STATS_LOCK();
527 stats_state.curr_bytes -= ITEM_ntotal(it);
528 stats_state.curr_items -= 1;
529 STATS_UNLOCK();
530 item_stats_sizes_remove(it);
531 assoc_delete(ITEM_key(it), it->nkey, hv);
532 do_item_unlink_q(it);
533 do_item_remove(it);
534 }
535 }
536
537 void do_item_remove(item *it) {
538 MEMCACHED_ITEM_REMOVE(ITEM_key(it), it->nkey, it->nbytes);
539 assert((it->it_flags & ITEM_SLABBED) == 0);
540 assert(it->refcount > 0);
541
542 if (refcount_decr(it) == 0) {
543 item_free(it);
544 }
545 }
546
547 /* Bump the last accessed time, or relink if we're in compat mode */
548 void do_item_update(item *it) {
549 MEMCACHED_ITEM_UPDATE(ITEM_key(it), it->nkey, it->nbytes);
550
551 /* Hits to COLD_LRU immediately move to WARM. */
552 if (settings.lru_segmented) {
553 assert((it->it_flags & ITEM_SLABBED) == 0);
554 if ((it->it_flags & ITEM_LINKED) != 0) {
555 if (ITEM_lruid(it) == COLD_LRU && (it->it_flags & ITEM_ACTIVE)) {
556 it->time = current_time;
557 item_unlink_q(it);
558 it->slabs_clsid = ITEM_clsid(it);
559 it->slabs_clsid |= WARM_LRU;
560 it->it_flags &= ~ITEM_ACTIVE;
561 item_link_q_warm(it);
562 } else {
563 it->time = current_time;
564 }
565 }
566 } else if (it->time < current_time - ITEM_UPDATE_INTERVAL) {
567 assert((it->it_flags & ITEM_SLABBED) == 0);
568
569 if ((it->it_flags & ITEM_LINKED) != 0) {
570 it->time = current_time;
571 item_unlink_q(it);
572 item_link_q(it);
573 }
574 }
575 }
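
/* Note on the compat-mode branch above: ITEM_UPDATE_INTERVAL (60 seconds in
 * the stock source) rate-limits relinking, so a key read thousands of times a
 * second is moved to the LRU head at most once a minute instead of taking the
 * LRU lock on every hit. */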
576
577 int do_item_replace(item *it, item *new_it, const uint32_t hv) {
578 MEMCACHED_ITEM_REPLACE(ITEM_key(it), it->nkey, it->nbytes,
579 ITEM_key(new_it), new_it->nkey, new_it->nbytes);
580 assert((it->it_flags & ITEM_SLABBED) == 0);
581
582 do_item_unlink(it, hv);
583 return do_item_link(new_it, hv);
584 }
585
586 /*@null@*/
587 /* This is walking the line of violating lock order, but I think it's safe.
588 * If the LRU lock is held, an item in the LRU cannot be wiped and freed.
589 * The data could possibly be overwritten, but this is only accessing the
590 * headers.
591 * It may not be the best idea to leave it like this, but for now it's safe.
592 */
593 char *item_cachedump(const unsigned int slabs_clsid, const unsigned int limit, unsigned int *bytes) {
594 unsigned int memlimit = 2 * 1024 * 1024; /* 2MB max response size */
595 char *buffer;
596 unsigned int bufcurr;
597 item *it;
598 unsigned int len;
599 unsigned int shown = 0;
600 char key_temp[KEY_MAX_LENGTH + 1];
601 char temp[512];
602 unsigned int id = slabs_clsid;
603 id |= COLD_LRU;
604
605 pthread_mutex_lock(&lru_locks[id]);
606 it = heads[id];
607
608 buffer = malloc((size_t)memlimit);
609 if (buffer == 0) {
610 pthread_mutex_unlock(&lru_locks[id]);
611 return NULL;
612 }
613 bufcurr = 0;
614
615 while (it != NULL && (limit == 0 || shown < limit)) {
616 assert(it->nkey <= KEY_MAX_LENGTH);
617 // protect from printing binary keys.
618 if ((it->nbytes == 0 && it->nkey == 0) || (it->it_flags & ITEM_KEY_BINARY)) {
619 it = it->next;
620 continue;
621 }
622 /* Copy the key since it may not be null-terminated in the struct */
623 strncpy(key_temp, ITEM_key(it), it->nkey);
624 key_temp[it->nkey] = 0x00; /* terminate */
625 len = snprintf(temp, sizeof(temp), "ITEM %s [%d b; %llu s]\r\n",
626 key_temp, it->nbytes - 2,
627 it->exptime == 0 ? 0 :
628 (unsigned long long)it->exptime + process_started);
629 if (bufcurr + len + 6 > memlimit) /* 6 is END\r\n\0 */
630 break;
631 memcpy(buffer + bufcurr, temp, len);
632 bufcurr += len;
633 shown++;
634 it = it->next;
635 }
636
637 memcpy(buffer + bufcurr, "END\r\n", 6);
638 bufcurr += 5;
639
640 *bytes = bufcurr;
641 pthread_mutex_unlock(&lru_locks[id]);
642 return buffer;
643 }
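
/* For context (not part of this file): item_cachedump() backs the
 * debug-oriented "stats cachedump <class> <limit>" ASCII command. A session
 * might look roughly like this, with illustrative keys and values:
 *
 *   stats cachedump 5 2
 *   ITEM somekey [12 b; 1645480000 s]
 *   ITEM otherkey [40 b; 0 s]
 *   END
 */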
644
645 /* With refactoring of the various stats code the automover won't need a
646 * custom function here.
647 */
648 void fill_item_stats_automove(item_stats_automove *am) {
649 int n;
650 for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
651 item_stats_automove *cur = &am[n];
652
653 // outofmemory records into HOT
654 int i = n | HOT_LRU;
655 pthread_mutex_lock(&lru_locks[i]);
656 cur->outofmemory = itemstats[i].outofmemory;
657 pthread_mutex_unlock(&lru_locks[i]);
658
659 // evictions and tail age are from COLD
660 i = n | COLD_LRU;
661 pthread_mutex_lock(&lru_locks[i]);
662 cur->evicted = itemstats[i].evicted;
663 if (tails[i]) {
664 cur->age = current_time - tails[i]->time;
665 } else {
666 cur->age = 0;
667 }
668 pthread_mutex_unlock(&lru_locks[i]);
669 }
670 }
671
672 void item_stats_totals(ADD_STAT add_stats, void *c) {
673 itemstats_t totals;
674 memset(&totals, 0, sizeof(itemstats_t));
675 int n;
676 for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
677 int x;
678 int i;
679 for (x = 0; x < 4; x++) {
680 i = n | lru_type_map[x];
681 pthread_mutex_lock(&lru_locks[i]);
682 totals.expired_unfetched += itemstats[i].expired_unfetched;
683 totals.evicted_unfetched += itemstats[i].evicted_unfetched;
684 totals.evicted_active += itemstats[i].evicted_active;
685 totals.evicted += itemstats[i].evicted;
686 totals.reclaimed += itemstats[i].reclaimed;
687 totals.crawler_reclaimed += itemstats[i].crawler_reclaimed;
688 totals.crawler_items_checked += itemstats[i].crawler_items_checked;
689 totals.lrutail_reflocked += itemstats[i].lrutail_reflocked;
690 totals.moves_to_cold += itemstats[i].moves_to_cold;
691 totals.moves_to_warm += itemstats[i].moves_to_warm;
692 totals.moves_within_lru += itemstats[i].moves_within_lru;
693 totals.direct_reclaims += itemstats[i].direct_reclaims;
694 pthread_mutex_unlock(&lru_locks[i]);
695 }
696 }
697 APPEND_STAT("expired_unfetched", "%llu",
698 (unsigned long long)totals.expired_unfetched);
699 APPEND_STAT("evicted_unfetched", "%llu",
700 (unsigned long long)totals.evicted_unfetched);
701 if (settings.lru_maintainer_thread) {
702 APPEND_STAT("evicted_active", "%llu",
703 (unsigned long long)totals.evicted_active);
704 }
705 APPEND_STAT("evictions", "%llu",
706 (unsigned long long)totals.evicted);
707 APPEND_STAT("reclaimed", "%llu",
708 (unsigned long long)totals.reclaimed);
709 APPEND_STAT("crawler_reclaimed", "%llu",
710 (unsigned long long)totals.crawler_reclaimed);
711 APPEND_STAT("crawler_items_checked", "%llu",
712 (unsigned long long)totals.crawler_items_checked);
713 APPEND_STAT("lrutail_reflocked", "%llu",
714 (unsigned long long)totals.lrutail_reflocked);
715 if (settings.lru_maintainer_thread) {
716 APPEND_STAT("moves_to_cold", "%llu",
717 (unsigned long long)totals.moves_to_cold);
718 APPEND_STAT("moves_to_warm", "%llu",
719 (unsigned long long)totals.moves_to_warm);
720 APPEND_STAT("moves_within_lru", "%llu",
721 (unsigned long long)totals.moves_within_lru);
722 APPEND_STAT("direct_reclaims", "%llu",
723 (unsigned long long)totals.direct_reclaims);
724 APPEND_STAT("lru_bumps_dropped", "%llu",
725 (unsigned long long)lru_total_bumps_dropped());
726 }
727 }
728
729 void item_stats(ADD_STAT add_stats, void *c) {
730 struct thread_stats thread_stats;
731 threadlocal_stats_aggregate(&thread_stats);
732 itemstats_t totals;
733 int n;
734 for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
735 memset(&totals, 0, sizeof(itemstats_t));
736 int x;
737 int i;
738 unsigned int size = 0;
739 unsigned int age = 0;
740 unsigned int age_hot = 0;
741 unsigned int age_warm = 0;
742 unsigned int lru_size_map[4];
743 const char *fmt = "items:%d:%s";
744 char key_str[STAT_KEY_LEN];
745 char val_str[STAT_VAL_LEN];
746 int klen = 0, vlen = 0;
747 for (x = 0; x < 4; x++) {
748 i = n | lru_type_map[x];
749 pthread_mutex_lock(&lru_locks[i]);
750 totals.evicted += itemstats[i].evicted;
751 totals.evicted_nonzero += itemstats[i].evicted_nonzero;
752 totals.outofmemory += itemstats[i].outofmemory;
753 totals.tailrepairs += itemstats[i].tailrepairs;
754 totals.reclaimed += itemstats[i].reclaimed;
755 totals.expired_unfetched += itemstats[i].expired_unfetched;
756 totals.evicted_unfetched += itemstats[i].evicted_unfetched;
757 totals.evicted_active += itemstats[i].evicted_active;
758 totals.crawler_reclaimed += itemstats[i].crawler_reclaimed;
759 totals.crawler_items_checked += itemstats[i].crawler_items_checked;
760 totals.lrutail_reflocked += itemstats[i].lrutail_reflocked;
761 totals.moves_to_cold += itemstats[i].moves_to_cold;
762 totals.moves_to_warm += itemstats[i].moves_to_warm;
763 totals.moves_within_lru += itemstats[i].moves_within_lru;
764 totals.direct_reclaims += itemstats[i].direct_reclaims;
765 totals.mem_requested += sizes_bytes[i];
766 size += sizes[i];
767 lru_size_map[x] = sizes[i];
768 if (lru_type_map[x] == COLD_LRU && tails[i] != NULL) {
769 age = current_time - tails[i]->time;
770 } else if (lru_type_map[x] == HOT_LRU && tails[i] != NULL) {
771 age_hot = current_time - tails[i]->time;
772 } else if (lru_type_map[x] == WARM_LRU && tails[i] != NULL) {
773 age_warm = current_time - tails[i]->time;
774 }
775 if (lru_type_map[x] == COLD_LRU)
776 totals.evicted_time = itemstats[i].evicted_time;
777 switch (lru_type_map[x]) {
778 case HOT_LRU:
779 totals.hits_to_hot = thread_stats.lru_hits[i];
780 break;
781 case WARM_LRU:
782 totals.hits_to_warm = thread_stats.lru_hits[i];
783 break;
784 case COLD_LRU:
785 totals.hits_to_cold = thread_stats.lru_hits[i];
786 break;
787 case TEMP_LRU:
788 totals.hits_to_temp = thread_stats.lru_hits[i];
789 break;
790 }
791 pthread_mutex_unlock(&lru_locks[i]);
792 }
793 if (size == 0)
794 continue;
795 APPEND_NUM_FMT_STAT(fmt, n, "number", "%u", size);
796 if (settings.lru_maintainer_thread) {
797 APPEND_NUM_FMT_STAT(fmt, n, "number_hot", "%u", lru_size_map[0]);
798 APPEND_NUM_FMT_STAT(fmt, n, "number_warm", "%u", lru_size_map[1]);
799 APPEND_NUM_FMT_STAT(fmt, n, "number_cold", "%u", lru_size_map[2]);
800 if (settings.temp_lru) {
801 APPEND_NUM_FMT_STAT(fmt, n, "number_temp", "%u", lru_size_map[3]);
802 }
803 APPEND_NUM_FMT_STAT(fmt, n, "age_hot", "%u", age_hot);
804 APPEND_NUM_FMT_STAT(fmt, n, "age_warm", "%u", age_warm);
805 }
806 APPEND_NUM_FMT_STAT(fmt, n, "age", "%u", age);
807 APPEND_NUM_FMT_STAT(fmt, n, "mem_requested", "%llu", (unsigned long long)totals.mem_requested);
808 APPEND_NUM_FMT_STAT(fmt, n, "evicted",
809 "%llu", (unsigned long long)totals.evicted);
810 APPEND_NUM_FMT_STAT(fmt, n, "evicted_nonzero",
811 "%llu", (unsigned long long)totals.evicted_nonzero);
812 APPEND_NUM_FMT_STAT(fmt, n, "evicted_time",
813 "%u", totals.evicted_time);
814 APPEND_NUM_FMT_STAT(fmt, n, "outofmemory",
815 "%llu", (unsigned long long)totals.outofmemory);
816 APPEND_NUM_FMT_STAT(fmt, n, "tailrepairs",
817 "%llu", (unsigned long long)totals.tailrepairs);
818 APPEND_NUM_FMT_STAT(fmt, n, "reclaimed",
819 "%llu", (unsigned long long)totals.reclaimed);
820 APPEND_NUM_FMT_STAT(fmt, n, "expired_unfetched",
821 "%llu", (unsigned long long)totals.expired_unfetched);
822 APPEND_NUM_FMT_STAT(fmt, n, "evicted_unfetched",
823 "%llu", (unsigned long long)totals.evicted_unfetched);
824 if (settings.lru_maintainer_thread) {
825 APPEND_NUM_FMT_STAT(fmt, n, "evicted_active",
826 "%llu", (unsigned long long)totals.evicted_active);
827 }
828 APPEND_NUM_FMT_STAT(fmt, n, "crawler_reclaimed",
829 "%llu", (unsigned long long)totals.crawler_reclaimed);
830 APPEND_NUM_FMT_STAT(fmt, n, "crawler_items_checked",
831 "%llu", (unsigned long long)totals.crawler_items_checked);
832 APPEND_NUM_FMT_STAT(fmt, n, "lrutail_reflocked",
833 "%llu", (unsigned long long)totals.lrutail_reflocked);
834 if (settings.lru_maintainer_thread) {
835 APPEND_NUM_FMT_STAT(fmt, n, "moves_to_cold",
836 "%llu", (unsigned long long)totals.moves_to_cold);
837 APPEND_NUM_FMT_STAT(fmt, n, "moves_to_warm",
838 "%llu", (unsigned long long)totals.moves_to_warm);
839 APPEND_NUM_FMT_STAT(fmt, n, "moves_within_lru",
840 "%llu", (unsigned long long)totals.moves_within_lru);
841 APPEND_NUM_FMT_STAT(fmt, n, "direct_reclaims",
842 "%llu", (unsigned long long)totals.direct_reclaims);
843 APPEND_NUM_FMT_STAT(fmt, n, "hits_to_hot",
844 "%llu", (unsigned long long)totals.hits_to_hot);
845
846 APPEND_NUM_FMT_STAT(fmt, n, "hits_to_warm",
847 "%llu", (unsigned long long)totals.hits_to_warm);
848
849 APPEND_NUM_FMT_STAT(fmt, n, "hits_to_cold",
850 "%llu", (unsigned long long)totals.hits_to_cold);
851
852 APPEND_NUM_FMT_STAT(fmt, n, "hits_to_temp",
853 "%llu", (unsigned long long)totals.hits_to_temp);
854
855 }
856 }
857
858 /* getting here means both ascii and binary terminators fit */
859 add_stats(NULL, 0, NULL, 0, c);
860 }
861
862 bool item_stats_sizes_status(void) {
863 bool ret = false;
864 mutex_lock(&stats_sizes_lock);
865 if (stats_sizes_hist != NULL)
866 ret = true;
867 mutex_unlock(&stats_sizes_lock);
868 return ret;
869 }
870
871 void item_stats_sizes_init(void) {
872 if (stats_sizes_hist != NULL)
873 return;
874 stats_sizes_buckets = settings.item_size_max / 32 + 1;
875 stats_sizes_hist = calloc(stats_sizes_buckets, sizeof(int));
876 stats_sizes_cas_min = (settings.use_cas) ? get_cas_id() : 0;
877 }
878
879 void item_stats_sizes_enable(ADD_STAT add_stats, void *c) {
880 mutex_lock(&stats_sizes_lock);
881 if (!settings.use_cas) {
882 APPEND_STAT("sizes_status", "error", "");
883 APPEND_STAT("sizes_error", "cas_support_disabled", "");
884 } else if (stats_sizes_hist == NULL) {
885 item_stats_sizes_init();
886 if (stats_sizes_hist != NULL) {
887 APPEND_STAT("sizes_status", "enabled", "");
888 } else {
889 APPEND_STAT("sizes_status", "error", "");
890 APPEND_STAT("sizes_error", "no_memory", "");
891 }
892 } else {
893 APPEND_STAT("sizes_status", "enabled", "");
894 }
895 mutex_unlock(&stats_sizes_lock);
896 }
897
898 void item_stats_sizes_disable(ADD_STAT add_stats, void *c) {
899 mutex_lock(&stats_sizes_lock);
900 if (stats_sizes_hist != NULL) {
901 free(stats_sizes_hist);
902 stats_sizes_hist = NULL;
903 }
904 APPEND_STAT("sizes_status", "disabled", "");
905 mutex_unlock(&stats_sizes_lock);
906 }
907
908 void item_stats_sizes_add(item *it) {
909 if (stats_sizes_hist == NULL || stats_sizes_cas_min > ITEM_get_cas(it))
910 return;
911 int ntotal = ITEM_ntotal(it);
912 int bucket = ntotal / 32;
913 if ((ntotal % 32) != 0) bucket++;
914 if (bucket < stats_sizes_buckets) stats_sizes_hist[bucket]++;
915 }
916
917 /* I think there's no way for this to be accurate without using the CAS value,
918 * since items getting their time value bumped will still pass this validation.
919 */
920 void item_stats_sizes_remove(item *it) {
921 if (stats_sizes_hist == NULL || stats_sizes_cas_min > ITEM_get_cas(it))
922 return;
923 int ntotal = ITEM_ntotal(it);
924 int bucket = ntotal / 32;
925 if ((ntotal % 32) != 0) bucket++;
926 if (bucket < stats_sizes_buckets) stats_sizes_hist[bucket]--;
927 }
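
/* Worked example of the bucketing above: an item with ITEM_ntotal() == 70
 * lands in bucket 70/32 = 2, bumped to 3 because of the remainder, and is
 * reported under the "96" key (3 * 32) by item_stats_sizes() below. Exact
 * multiples stay put: ntotal == 64 stays in bucket 2 and reports as "64". */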
928
929 /** dumps out a list of objects of each size, with granularity of 32 bytes */
930 /*@null@*/
931 /* Locks are correct based on a technicality. Holds LRU lock while doing the
932 * work, so items can't go invalid, and it's only looking at header sizes
933 * which don't change.
934 */
935 void item_stats_sizes(ADD_STAT add_stats, void *c) {
936 mutex_lock(&stats_sizes_lock);
937
938 if (stats_sizes_hist != NULL) {
939 int i;
940 for (i = 0; i < stats_sizes_buckets; i++) {
941 if (stats_sizes_hist[i] != 0) {
942 char key[12];
943 snprintf(key, sizeof(key), "%d", i * 32);
944 APPEND_STAT(key, "%u", stats_sizes_hist[i]);
945 }
946 }
947 } else {
948 APPEND_STAT("sizes_status", "disabled", "");
949 }
950
951 add_stats(NULL, 0, NULL, 0, c);
952 mutex_unlock(&stats_sizes_lock);
953 }
954
955 /** wrapper around assoc_find which does the lazy expiration logic */
956 item *do_item_get(const char *key, const size_t nkey, const uint32_t hv, conn *c, const bool do_update) {
957 item *it = assoc_find(key, nkey, hv);
958 if (it != NULL) {
959 refcount_incr(it);
960 /* Optimization for slab reassignment. Prevents popular items from
961 * jamming in busy wait. Can only do this here to satisfy lock order
962 * of item_lock, slabs_lock. */
963 /* This was made unsafe by removal of the cache_lock:
964 * slab_rebalance_signal and slab_rebal.* are modified in a separate
965 * thread under slabs_lock. If slab_rebalance_signal = 1, slab_start =
966 * NULL (0), but slab_end is still equal to some value, this would end
967 * up unlinking every item fetched.
968 * This is either an acceptable loss, or if slab_rebalance_signal is
969 * true, slab_start/slab_end should be put behind the slabs_lock.
970 * Which would cause a huge potential slowdown.
971 * Could also use a specific lock for slab_rebal.* and
972 * slab_rebalance_signal (shorter lock?)
973 */
974 /*if (slab_rebalance_signal &&
975 ((void *)it >= slab_rebal.slab_start && (void *)it < slab_rebal.slab_end)) {
976 do_item_unlink(it, hv);
977 do_item_remove(it);
978 it = NULL;
979 }*/
980 }
981 int was_found = 0;
982
983 if (settings.verbose > 2) {
984 int ii;
985 if (it == NULL) {
986 fprintf(stderr, "> NOT FOUND ");
987 } else {
988 fprintf(stderr, "> FOUND KEY ");
989 }
990 for (ii = 0; ii < nkey; ++ii) {
991 fprintf(stderr, "%c", key[ii]);
992 }
993 }
994
995 if (it != NULL) {
996 was_found = 1;
997 if (item_is_flushed(it)) {
998 do_item_unlink(it, hv);
999 STORAGE_delete(c->thread->storage, it);
1000 do_item_remove(it);
1001 it = NULL;
1002 pthread_mutex_lock(&c->thread->stats.mutex);
1003 c->thread->stats.get_flushed++;
1004 pthread_mutex_unlock(&c->thread->stats.mutex);
1005 if (settings.verbose > 2) {
1006 fprintf(stderr, " -nuked by flush");
1007 }
1008 was_found = 2;
1009 } else if (it->exptime != 0 && it->exptime <= current_time) {
1010 do_item_unlink(it, hv);
1011 STORAGE_delete(c->thread->storage, it);
1012 do_item_remove(it);
1013 it = NULL;
1014 pthread_mutex_lock(&c->thread->stats.mutex);
1015 c->thread->stats.get_expired++;
1016 pthread_mutex_unlock(&c->thread->stats.mutex);
1017 if (settings.verbose > 2) {
1018 fprintf(stderr, " -nuked by expire");
1019 }
1020 was_found = 3;
1021 } else {
1022 if (do_update) {
1023 do_item_bump(c, it, hv);
1024 }
1025 DEBUG_REFCNT(it, '+');
1026 }
1027 }
1028
1029 if (settings.verbose > 2)
1030 fprintf(stderr, "\n");
1031 /* For now this is in addition to the above verbose logging. */
1032 LOGGER_LOG(c->thread->l, LOG_FETCHERS, LOGGER_ITEM_GET, NULL, was_found, key,
1033 nkey, (it) ? it->nbytes : 0, (it) ? ITEM_clsid(it) : 0, c->sfd);
1034
1035 return it;
1036 }
1037
1038 // Requires lock held for item.
1039 // Split out of do_item_get() to allow mget functions to look through header
1040 // data before losing state modified via the bump function.
1041 void do_item_bump(conn *c, item *it, const uint32_t hv) {
1042 /* We update the hit markers only during fetches.
1043 * An item needs to be hit twice overall to be considered
1044 * ACTIVE, but only needs a single hit to maintain activity
1045 * afterward.
1046 * FETCHED tells if an item has ever been active.
1047 */
1048 if (settings.lru_segmented) {
1049 if ((it->it_flags & ITEM_ACTIVE) == 0) {
1050 if ((it->it_flags & ITEM_FETCHED) == 0) {
1051 it->it_flags |= ITEM_FETCHED;
1052 } else {
1053 it->it_flags |= ITEM_ACTIVE;
1054 if (ITEM_lruid(it) != COLD_LRU) {
1055 it->time = current_time; // only need to bump time.
1056 } else if (!lru_bump_async(c->thread->lru_bump_buf, it, hv)) {
1057 // add flag before async bump to avoid race.
1058 it->it_flags &= ~ITEM_ACTIVE;
1059 }
1060 }
1061 }
1062 } else {
1063 it->it_flags |= ITEM_FETCHED;
1064 do_item_update(it);
1065 }
1066 }
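
/* Concrete illustration of the policy above, assuming segmented LRU:
 *   1st GET of an item: ITEM_FETCHED is set; nothing moves.
 *   2nd GET: ITEM_ACTIVE is set. Outside COLD only it->time is bumped; in
 *            COLD an async bump is queued so the maintainer thread can move
 *            the item to WARM later (or ACTIVE is dropped if the queue is full).
 *   Later GETs: ITEM_ACTIVE is already set, so nothing further happens here
 *            until the flag is cleared again by a move or a tail rescue. */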
1067
1068 item *do_item_touch(const char *key, size_t nkey, uint32_t exptime,
1069 const uint32_t hv, conn *c) {
1070 item *it = do_item_get(key, nkey, hv, c, DO_UPDATE);
1071 if (it != NULL) {
1072 it->exptime = exptime;
1073 }
1074 return it;
1075 }
1076
1077 /*** LRU MAINTENANCE THREAD ***/
1078
1079 /* Returns the number of items removed, expired, or evicted.
1080 * Callable from worker threads or the LRU maintainer thread */
1081 int lru_pull_tail(const int orig_id, const int cur_lru,
1082 const uint64_t total_bytes, const uint8_t flags, const rel_time_t max_age,
1083 struct lru_pull_tail_return *ret_it) {
1084 item *it = NULL;
1085 int id = orig_id;
1086 int removed = 0;
1087 if (id == 0)
1088 return 0;
1089
1090 int tries = 5;
1091 item *search;
1092 item *next_it;
1093 void *hold_lock = NULL;
1094 unsigned int move_to_lru = 0;
1095 uint64_t limit = 0;
1096
1097 id |= cur_lru;
1098 pthread_mutex_lock(&lru_locks[id]);
1099 search = tails[id];
1100 /* We walk up *only* for locked items, and if bottom is expired. */
1101 for (; tries > 0 && search != NULL; tries--, search=next_it) {
1102 /* we might relink search mid-loop, so search->prev isn't reliable */
1103 next_it = search->prev;
1104 if (search->nbytes == 0 && search->nkey == 0 && search->it_flags == 1) {
1105 /* We are a crawler, ignore it. */
1106 if (flags & LRU_PULL_CRAWL_BLOCKS) {
1107 pthread_mutex_unlock(&lru_locks[id]);
1108 return 0;
1109 }
1110 tries++;
1111 continue;
1112 }
1113 uint32_t hv = hash(ITEM_key(search), search->nkey);
1114 /* Attempt to hash item lock the "search" item. If locked, no
1115 * other callers can incr the refcount. Also skip ourselves. */
1116 if ((hold_lock = item_trylock(hv)) == NULL)
1117 continue;
1118 /* Now see if the item is refcount locked */
1119 if (refcount_incr(search) != 2) {
1120 /* Note pathological case with ref'ed items in tail.
1121 * Can still unlink the item, but it won't be reusable yet */
1122 itemstats[id].lrutail_reflocked++;
1123 /* In case of refcount leaks, enable for quick workaround. */
1124 /* WARNING: This can cause terrible corruption */
1125 if (settings.tail_repair_time &&
1126 search->time + settings.tail_repair_time < current_time) {
1127 itemstats[id].tailrepairs++;
1128 search->refcount = 1;
1129 /* This will call item_remove -> item_free since refcnt is 1 */
1130 STORAGE_delete(ext_storage, search);
1131 do_item_unlink_nolock(search, hv);
1132 item_trylock_unlock(hold_lock);
1133 continue;
1134 }
1135 }
1136
1137 /* Expired or flushed */
1138 if ((search->exptime != 0 && search->exptime < current_time)
1139 || item_is_flushed(search)) {
1140 itemstats[id].reclaimed++;
1141 if ((search->it_flags & ITEM_FETCHED) == 0) {
1142 itemstats[id].expired_unfetched++;
1143 }
1144 /* refcnt 2 -> 1 */
1145 do_item_unlink_nolock(search, hv);
1146 STORAGE_delete(ext_storage, search);
1147 /* refcnt 1 -> 0 -> item_free */
1148 do_item_remove(search);
1149 item_trylock_unlock(hold_lock);
1150 removed++;
1151
1152 /* If all we're finding are expired, can keep going */
1153 continue;
1154 }
1155
1156 /* If we're HOT_LRU or WARM_LRU and over size limit, send to COLD_LRU.
1157 * If we're COLD_LRU, send to WARM_LRU unless we need to evict
1158 */
1159 switch (cur_lru) {
1160 case HOT_LRU:
1161 limit = total_bytes * settings.hot_lru_pct / 100;
1162 case WARM_LRU:
1163 if (limit == 0)
1164 limit = total_bytes * settings.warm_lru_pct / 100;
1165 /* Rescue ACTIVE items aggressively */
1166 if ((search->it_flags & ITEM_ACTIVE) != 0) {
1167 search->it_flags &= ~ITEM_ACTIVE;
1168 removed++;
1169 if (cur_lru == WARM_LRU) {
1170 itemstats[id].moves_within_lru++;
1171 do_item_unlink_q(search);
1172 do_item_link_q(search);
1173 do_item_remove(search);
1174 item_trylock_unlock(hold_lock);
1175 } else {
1176 /* Active HOT_LRU items flow to WARM */
1177 itemstats[id].moves_to_warm++;
1178 move_to_lru = WARM_LRU;
1179 do_item_unlink_q(search);
1180 it = search;
1181 }
1182 } else if (sizes_bytes[id] > limit ||
1183 current_time - search->time > max_age) {
1184 itemstats[id].moves_to_cold++;
1185 move_to_lru = COLD_LRU;
1186 do_item_unlink_q(search);
1187 it = search;
1188 removed++;
1189 break;
1190 } else {
1191 /* Don't want to move to COLD, not active, bail out */
1192 it = search;
1193 }
1194 break;
1195 case COLD_LRU:
1196 it = search; /* No matter what, we're stopping */
1197 if (flags & LRU_PULL_EVICT) {
1198 if (settings.evict_to_free == 0) {
1199 /* Don't think we need a counter for this. It'll OOM. */
1200 break;
1201 }
1202 itemstats[id].evicted++;
1203 itemstats[id].evicted_time = current_time - search->time;
1204 if (search->exptime != 0)
1205 itemstats[id].evicted_nonzero++;
1206 if ((search->it_flags & ITEM_FETCHED) == 0) {
1207 itemstats[id].evicted_unfetched++;
1208 }
1209 if ((search->it_flags & ITEM_ACTIVE)) {
1210 itemstats[id].evicted_active++;
1211 }
1212 LOGGER_LOG(NULL, LOG_EVICTIONS, LOGGER_EVICTION, search);
1213 STORAGE_delete(ext_storage, search);
1214 do_item_unlink_nolock(search, hv);
1215 removed++;
1216 if (settings.slab_automove == 2) {
1217 slabs_reassign(-1, orig_id);
1218 }
1219 } else if (flags & LRU_PULL_RETURN_ITEM) {
1220 /* Keep a reference to this item and return it. */
1221 ret_it->it = it;
1222 ret_it->hv = hv;
1223 } else if ((search->it_flags & ITEM_ACTIVE) != 0
1224 && settings.lru_segmented) {
1225 itemstats[id].moves_to_warm++;
1226 search->it_flags &= ~ITEM_ACTIVE;
1227 move_to_lru = WARM_LRU;
1228 do_item_unlink_q(search);
1229 removed++;
1230 }
1231 break;
1232 case TEMP_LRU:
1233 it = search; /* Kill the loop. Parent only interested in reclaims */
1234 break;
1235 }
1236 if (it != NULL)
1237 break;
1238 }
1239
1240 pthread_mutex_unlock(&lru_locks[id]);
1241
1242 if (it != NULL) {
1243 if (move_to_lru) {
1244 it->slabs_clsid = ITEM_clsid(it);
1245 it->slabs_clsid |= move_to_lru;
1246 item_link_q(it);
1247 }
1248 if ((flags & LRU_PULL_RETURN_ITEM) == 0) {
1249 do_item_remove(it);
1250 item_trylock_unlock(hold_lock);
1251 }
1252 }
1253
1254 return removed;
1255 }
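
/* Worked example of the HOT/WARM sizing above: if a class holds
 * total_bytes = 100MB with hot_lru_pct = 20 and warm_lru_pct = 40, HOT tails
 * start flowing to COLD once HOT exceeds 20MB and WARM tails once WARM
 * exceeds 40MB (or once the tail item is older than max_age). ACTIVE tail
 * items are rescued instead: HOT ones move to WARM, WARM ones are relinked
 * at the head of WARM. */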
1256
1257
1258 /* TODO: Third place this code needs to be deduped */
1259 static void lru_bump_buf_link_q(lru_bump_buf *b) {
1260 pthread_mutex_lock(&bump_buf_lock);
1261 assert(b != bump_buf_head);
1262
1263 b->prev = 0;
1264 b->next = bump_buf_head;
1265 if (b->next) b->next->prev = b;
1266 bump_buf_head = b;
1267 if (bump_buf_tail == 0) bump_buf_tail = b;
1268 pthread_mutex_unlock(&bump_buf_lock);
1269 return;
1270 }
1271
1272 void *item_lru_bump_buf_create(void) {
1273 lru_bump_buf *b = calloc(1, sizeof(lru_bump_buf));
1274 if (b == NULL) {
1275 return NULL;
1276 }
1277
1278 b->buf = bipbuf_new(sizeof(lru_bump_entry) * LRU_BUMP_BUF_SIZE);
1279 if (b->buf == NULL) {
1280 free(b);
1281 return NULL;
1282 }
1283
1284 pthread_mutex_init(&b->mutex, NULL);
1285
1286 lru_bump_buf_link_q(b);
1287 return b;
1288 }
1289
1290 static bool lru_bump_async(lru_bump_buf *b, item *it, uint32_t hv) {
1291 bool ret = true;
1292 refcount_incr(it);
1293 pthread_mutex_lock(&b->mutex);
1294 lru_bump_entry *be = (lru_bump_entry *) bipbuf_request(b->buf, sizeof(lru_bump_entry));
1295 if (be != NULL) {
1296 be->it = it;
1297 be->hv = hv;
1298 if (bipbuf_push(b->buf, sizeof(lru_bump_entry)) == 0) {
1299 ret = false;
1300 b->dropped++;
1301 }
1302 } else {
1303 ret = false;
1304 b->dropped++;
1305 }
1306 if (!ret) {
1307 refcount_decr(it);
1308 }
1309 pthread_mutex_unlock(&b->mutex);
1310 return ret;
1311 }
1312
1313 /* TODO: Might be worth a micro-optimization of having bump buffers link
1314 * themselves back into the central queue when queue goes from zero to
1315 * non-zero, then remove from list if zero more than N times.
1316 * If very few hits on cold this would avoid extra memory barriers from LRU
1317 * maintainer thread. If many hits, they'll just stay in the list.
1318 */
1319 static bool lru_maintainer_bumps(void) {
1320 lru_bump_buf *b;
1321 lru_bump_entry *be;
1322 unsigned int size;
1323 unsigned int todo;
1324 bool bumped = false;
1325 pthread_mutex_lock(&bump_buf_lock);
1326 for (b = bump_buf_head; b != NULL; b=b->next) {
1327 pthread_mutex_lock(&b->mutex);
1328 be = (lru_bump_entry *) bipbuf_peek_all(b->buf, &size);
1329 pthread_mutex_unlock(&b->mutex);
1330
1331 if (be == NULL) {
1332 continue;
1333 }
1334 todo = size;
1335 bumped = true;
1336
1337 while (todo) {
1338 item_lock(be->hv);
1339 do_item_update(be->it);
1340 do_item_remove(be->it);
1341 item_unlock(be->hv);
1342 be++;
1343 todo -= sizeof(lru_bump_entry);
1344 }
1345
1346 pthread_mutex_lock(&b->mutex);
1347 be = (lru_bump_entry *) bipbuf_poll(b->buf, size);
1348 pthread_mutex_unlock(&b->mutex);
1349 }
1350 pthread_mutex_unlock(&bump_buf_lock);
1351 return bumped;
1352 }
1353
1354 static uint64_t lru_total_bumps_dropped(void) {
1355 uint64_t total = 0;
1356 lru_bump_buf *b;
1357 pthread_mutex_lock(&bump_buf_lock);
1358 for (b = bump_buf_head; b != NULL; b=b->next) {
1359 pthread_mutex_lock(&b->mutex);
1360 total += b->dropped;
1361 pthread_mutex_unlock(&b->mutex);
1362 }
1363 pthread_mutex_unlock(&bump_buf_lock);
1364 return total;
1365 }
1366
1367 /* Loop up to N times:
1368 * If too many items are in HOT_LRU, push to COLD_LRU
1369 * If too many items are in WARM_LRU, push to COLD_LRU
1370 * If too many items are in COLD_LRU, poke COLD_LRU tail
1371 * 1000 loops with 1ms min sleep gives us under 1m items shifted/sec. The
1372 * locks can't handle much more than that. Leaving a TODO for how to
1373 * autoadjust in the future.
1374 */
1375 static int lru_maintainer_juggle(const int slabs_clsid) {
1376 int i;
1377 int did_moves = 0;
1378 uint64_t total_bytes = 0;
1379 unsigned int chunks_perslab = 0;
1380 //unsigned int chunks_free = 0;
1381 /* TODO: if free_chunks below high watermark, increase aggressiveness */
1382 slabs_available_chunks(slabs_clsid, NULL,
1383 &chunks_perslab);
1384 if (settings.temp_lru) {
1385 /* Only looking for reclaims. Run before we size the LRU. */
1386 for (i = 0; i < 500; i++) {
1387 if (lru_pull_tail(slabs_clsid, TEMP_LRU, 0, 0, 0, NULL) <= 0) {
1388 break;
1389 } else {
1390 did_moves++;
1391 }
1392 }
1393 }
1394
1395 rel_time_t cold_age = 0;
1396 rel_time_t hot_age = 0;
1397 rel_time_t warm_age = 0;
1398 /* If LRU is in flat mode, force items to drain into COLD via max age of 0 */
1399 if (settings.lru_segmented) {
1400 pthread_mutex_lock(&lru_locks[slabs_clsid|COLD_LRU]);
1401 if (tails[slabs_clsid|COLD_LRU]) {
1402 cold_age = current_time - tails[slabs_clsid|COLD_LRU]->time;
1403 }
1404 // Also build up total_bytes for the classes.
1405 total_bytes += sizes_bytes[slabs_clsid|COLD_LRU];
1406 pthread_mutex_unlock(&lru_locks[slabs_clsid|COLD_LRU]);
1407
1408 hot_age = cold_age * settings.hot_max_factor;
1409 warm_age = cold_age * settings.warm_max_factor;
1410
1411 // total_bytes doesn't have to be exact. cache it for the juggles.
1412 pthread_mutex_lock(&lru_locks[slabs_clsid|HOT_LRU]);
1413 total_bytes += sizes_bytes[slabs_clsid|HOT_LRU];
1414 pthread_mutex_unlock(&lru_locks[slabs_clsid|HOT_LRU]);
1415
1416 pthread_mutex_lock(&lru_locks[slabs_clsid|WARM_LRU]);
1417 total_bytes += sizes_bytes[slabs_clsid|WARM_LRU];
1418 pthread_mutex_unlock(&lru_locks[slabs_clsid|WARM_LRU]);
1419 }
1420
1421 /* Juggle HOT/WARM up to N times */
1422 for (i = 0; i < 500; i++) {
1423 int do_more = 0;
1424 if (lru_pull_tail(slabs_clsid, HOT_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS, hot_age, NULL) ||
1425 lru_pull_tail(slabs_clsid, WARM_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS, warm_age, NULL)) {
1426 do_more++;
1427 }
1428 if (settings.lru_segmented) {
1429 do_more += lru_pull_tail(slabs_clsid, COLD_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS, 0, NULL);
1430 }
1431 if (do_more == 0)
1432 break;
1433 did_moves++;
1434 }
1435 return did_moves;
1436 }
1437
1438 /* Will crawl all slab classes a minimum of once per hour */
1439 #define MAX_MAINTCRAWL_WAIT 60 * 60
1440
1441 /* Hoping user input will improve this function. This is all a wild guess.
1442 * Operation: Kicks crawler for each slab id. Crawlers take some statistics as
1443 * to items with nonzero expirations. It then buckets how many items will
1444 * expire per minute for the next hour.
1445 * This function checks the results of a run, and if it thinks more than 1% of
1446 * expirable objects are ready to go, it kicks the crawler again to reap them.
1447 * It will also kick the crawler once per minute regardless, waiting a minute
1448 * longer each time it has no work to do, up to an hour of wait time.
1449 * The short initial wait keeps newly started daemons from waiting too long
1450 * before retrying a crawl.
1451 */
1452 static void lru_maintainer_crawler_check(struct crawler_expired_data *cdata, logger *l) {
1453 int i;
1454 static rel_time_t next_crawls[POWER_LARGEST];
1455 static rel_time_t next_crawl_wait[POWER_LARGEST];
1456 uint8_t todo[POWER_LARGEST];
1457 memset(todo, 0, sizeof(uint8_t) * POWER_LARGEST);
1458 bool do_run = false;
1459 unsigned int tocrawl_limit = 0;
1460
1461 // TODO: If not segmented LRU, skip non-cold
1462 for (i = POWER_SMALLEST; i < POWER_LARGEST; i++) {
1463 crawlerstats_t *s = &cdata->crawlerstats[i];
1464 /* We've not successfully kicked off a crawl yet. */
1465 if (s->run_complete) {
1466 char *lru_name = "na";
1467 pthread_mutex_lock(&cdata->lock);
1468 int x;
1469 /* Should we crawl again? */
1470 uint64_t possible_reclaims = s->seen - s->noexp;
1471 uint64_t available_reclaims = 0;
1472 /* Need to think we can free at least 1% of the items before
1473 * crawling. */
1474 /* FIXME: Configurable? */
1475 uint64_t low_watermark = (possible_reclaims / 100) + 1;
1476 rel_time_t since_run = current_time - s->end_time;
1477 /* Don't bother if the payoff is too low. */
1478 for (x = 0; x < 60; x++) {
1479 available_reclaims += s->histo[x];
1480 if (available_reclaims > low_watermark) {
1481 if (next_crawl_wait[i] < (x * 60)) {
1482 next_crawl_wait[i] += 60;
1483 } else if (next_crawl_wait[i] >= 60) {
1484 next_crawl_wait[i] -= 60;
1485 }
1486 break;
1487 }
1488 }
1489
1490 if (available_reclaims == 0) {
1491 next_crawl_wait[i] += 60;
1492 }
1493
1494 if (next_crawl_wait[i] > MAX_MAINTCRAWL_WAIT) {
1495 next_crawl_wait[i] = MAX_MAINTCRAWL_WAIT;
1496 }
1497
1498 next_crawls[i] = current_time + next_crawl_wait[i] + 5;
1499 switch (GET_LRU(i)) {
1500 case HOT_LRU:
1501 lru_name = "hot";
1502 break;
1503 case WARM_LRU:
1504 lru_name = "warm";
1505 break;
1506 case COLD_LRU:
1507 lru_name = "cold";
1508 break;
1509 case TEMP_LRU:
1510 lru_name = "temp";
1511 break;
1512 }
1513 LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_CRAWLER_STATUS, NULL,
1514 CLEAR_LRU(i),
1515 lru_name,
1516 (unsigned long long)low_watermark,
1517 (unsigned long long)available_reclaims,
1518 (unsigned int)since_run,
1519 next_crawls[i] - current_time,
1520 s->end_time - s->start_time,
1521 s->seen,
1522 s->reclaimed);
1523 // Got our calculation, avoid running until next actual run.
1524 s->run_complete = false;
1525 pthread_mutex_unlock(&cdata->lock);
1526 }
1527 if (current_time > next_crawls[i]) {
1528 pthread_mutex_lock(&lru_locks[i]);
1529 if (sizes[i] > tocrawl_limit) {
1530 tocrawl_limit = sizes[i];
1531 }
1532 pthread_mutex_unlock(&lru_locks[i]);
1533 todo[i] = 1;
1534 do_run = true;
1535 next_crawls[i] = current_time + 5; // minimum retry wait.
1536 }
1537 }
1538 if (do_run) {
1539 if (settings.lru_crawler_tocrawl && settings.lru_crawler_tocrawl < tocrawl_limit) {
1540 tocrawl_limit = settings.lru_crawler_tocrawl;
1541 }
1542 lru_crawler_start(todo, tocrawl_limit, CRAWLER_AUTOEXPIRE, cdata, NULL, 0);
1543 }
1544 }
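
/* Worked example of the scheduling above: say a crawl saw 100,000 items,
 * 20,000 of them with no expiration. possible_reclaims = 80,000, so
 * low_watermark = 801. If the per-minute histogram accumulates past 801
 * within the first 10 minutes (x = 10), the wait for this LRU shrinks by 60s
 * when it was already 10 minutes or longer, or grows by 60s when it was still
 * shorter than that; either way it is capped at MAX_MAINTCRAWL_WAIT. */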
1545
1546 slab_automove_reg_t slab_automove_default = {
1547 .init = slab_automove_init,
1548 .free = slab_automove_free,
1549 .run = slab_automove_run
1550 };
1551 #ifdef EXTSTORE
1552 slab_automove_reg_t slab_automove_extstore = {
1553 .init = slab_automove_extstore_init,
1554 .free = slab_automove_extstore_free,
1555 .run = slab_automove_extstore_run
1556 };
1557 #endif
1558 static pthread_t lru_maintainer_tid;
1559
1560 #define MAX_LRU_MAINTAINER_SLEEP 1000000
1561 #define MIN_LRU_MAINTAINER_SLEEP 1000
1562
1563 static void *lru_maintainer_thread(void *arg) {
1564 slab_automove_reg_t *sam = &slab_automove_default;
1565 #ifdef EXTSTORE
1566 void *storage = arg;
1567 if (storage != NULL)
1568 sam = &slab_automove_extstore;
1569 #endif
1570 int i;
1571 useconds_t to_sleep = MIN_LRU_MAINTAINER_SLEEP;
1572 useconds_t last_sleep = MIN_LRU_MAINTAINER_SLEEP;
1573 rel_time_t last_crawler_check = 0;
1574 rel_time_t last_automove_check = 0;
1575 useconds_t next_juggles[MAX_NUMBER_OF_SLAB_CLASSES] = {0};
1576 useconds_t backoff_juggles[MAX_NUMBER_OF_SLAB_CLASSES] = {0};
1577 struct crawler_expired_data *cdata =
1578 calloc(1, sizeof(struct crawler_expired_data));
1579 if (cdata == NULL) {
1580 fprintf(stderr, "Failed to allocate crawler data for LRU maintainer thread\n");
1581 abort();
1582 }
1583 pthread_mutex_init(&cdata->lock, NULL);
1584 cdata->crawl_complete = true; // kick off the crawler.
1585 logger *l = logger_create();
1586 if (l == NULL) {
1587 fprintf(stderr, "Failed to allocate logger for LRU maintainer thread\n");
1588 abort();
1589 }
1590
1591 double last_ratio = settings.slab_automove_ratio;
1592 void *am = sam->init(&settings);
1593
1594 pthread_mutex_lock(&lru_maintainer_lock);
1595 if (settings.verbose > 2)
1596 fprintf(stderr, "Starting LRU maintainer background thread\n");
1597 while (do_run_lru_maintainer_thread) {
1598 pthread_mutex_unlock(&lru_maintainer_lock);
1599 if (to_sleep)
1600 usleep(to_sleep);
1601 pthread_mutex_lock(&lru_maintainer_lock);
1602 /* A sleep of zero counts as a minimum of a 1ms wait */
1603 last_sleep = to_sleep > 1000 ? to_sleep : 1000;
1604 to_sleep = MAX_LRU_MAINTAINER_SLEEP;
1605
1606 STATS_LOCK();
1607 stats.lru_maintainer_juggles++;
1608 STATS_UNLOCK();
1609
1610 /* Each slab class gets its own sleep to avoid hammering locks */
1611 for (i = POWER_SMALLEST; i < MAX_NUMBER_OF_SLAB_CLASSES; i++) {
1612 next_juggles[i] = next_juggles[i] > last_sleep ? next_juggles[i] - last_sleep : 0;
1613
1614 if (next_juggles[i] > 0) {
1615 // Sleep the thread just for the minimum amount (or not at all)
1616 if (next_juggles[i] < to_sleep)
1617 to_sleep = next_juggles[i];
1618 continue;
1619 }
1620
1621 int did_moves = lru_maintainer_juggle(i);
1622 if (did_moves == 0) {
1623 if (backoff_juggles[i] != 0) {
1624 backoff_juggles[i] += backoff_juggles[i] / 8;
1625 } else {
1626 backoff_juggles[i] = MIN_LRU_MAINTAINER_SLEEP;
1627 }
1628 if (backoff_juggles[i] > MAX_LRU_MAINTAINER_SLEEP)
1629 backoff_juggles[i] = MAX_LRU_MAINTAINER_SLEEP;
1630 } else if (backoff_juggles[i] > 0) {
1631 backoff_juggles[i] /= 2;
1632 if (backoff_juggles[i] < MIN_LRU_MAINTAINER_SLEEP) {
1633 backoff_juggles[i] = 0;
1634 }
1635 }
1636 next_juggles[i] = backoff_juggles[i];
1637 if (next_juggles[i] < to_sleep)
1638 to_sleep = next_juggles[i];
1639 }
1640
1641 /* Minimize the sleep if we had async LRU bumps to process */
1642 if (settings.lru_segmented && lru_maintainer_bumps() && to_sleep > 1000) {
1643 to_sleep = 1000;
1644 }
1645
1646 /* Once per second at most */
1647 if (settings.lru_crawler && last_crawler_check != current_time) {
1648 lru_maintainer_crawler_check(cdata, l);
1649 last_crawler_check = current_time;
1650 }
1651
1652 if (settings.slab_automove == 1 && last_automove_check != current_time) {
1653 if (last_ratio != settings.slab_automove_ratio) {
1654 sam->free(am);
1655 am = sam->init(&settings);
1656 last_ratio = settings.slab_automove_ratio;
1657 }
1658 int src, dst;
1659 sam->run(am, &src, &dst);
1660 if (src != -1 && dst != -1) {
1661 slabs_reassign(src, dst);
1662 LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_SLAB_MOVE, NULL,
1663 src, dst);
1664 }
1665 // dst == 0 means reclaim to global pool, be more aggressive
1666 if (dst != 0) {
1667 last_automove_check = current_time;
1668 } else if (dst == 0) {
1669 // also ensure we minimize the thread sleep
1670 to_sleep = 1000;
1671 }
1672 }
1673 }
1674 pthread_mutex_unlock(&lru_maintainer_lock);
1675 sam->free(am);
1676 // LRU crawler *must* be stopped.
1677 free(cdata);
1678 if (settings.verbose > 2)
1679 fprintf(stderr, "LRU maintainer thread stopping\n");
1680
1681 return NULL;
1682 }
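
/* Rough numbers for the per-class backoff above: an idle class starts at
 * MIN_LRU_MAINTAINER_SLEEP (1ms) and grows by one eighth per idle pass, so it
 * takes on the order of 60 idle passes to hit the 1-second cap; each
 * productive pass halves it again, so roughly ten busy passes bring it from
 * the cap back down to the floor, after which it snaps to zero. */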
1683
1684 int stop_lru_maintainer_thread(void) {
1685 int ret;
1686 pthread_mutex_lock(&lru_maintainer_lock);
1687 /* LRU thread is a sleep loop, will die on its own */
1688 do_run_lru_maintainer_thread = 0;
1689 pthread_mutex_unlock(&lru_maintainer_lock);
1690 if ((ret = pthread_join(lru_maintainer_tid, NULL)) != 0) {
1691 fprintf(stderr, "Failed to stop LRU maintainer thread: %s\n", strerror(ret));
1692 return -1;
1693 }
1694 settings.lru_maintainer_thread = false;
1695 return 0;
1696 }
1697
1698 int start_lru_maintainer_thread(void *arg) {
1699 int ret;
1700
1701 pthread_mutex_lock(&lru_maintainer_lock);
1702 do_run_lru_maintainer_thread = 1;
1703 settings.lru_maintainer_thread = true;
1704 if ((ret = pthread_create(&lru_maintainer_tid, NULL,
1705 lru_maintainer_thread, arg)) != 0) {
1706 fprintf(stderr, "Can't create LRU maintainer thread: %s\n",
1707 strerror(ret));
1708 pthread_mutex_unlock(&lru_maintainer_lock);
1709 return -1;
1710 }
1711 pthread_mutex_unlock(&lru_maintainer_lock);
1712
1713 return 0;
1714 }
1715
1716 /* If we hold this lock, crawler can't wake up or move */
1717 void lru_maintainer_pause(void) {
1718 pthread_mutex_lock(&lru_maintainer_lock);
1719 }
1720
1721 void lru_maintainer_resume(void) {
1722 pthread_mutex_unlock(&lru_maintainer_lock);
1723 }
1724
1725 int init_lru_maintainer(void) {
1726 lru_maintainer_initialized = 1;
1727 return 0;
1728 }
1729
1730 /* Tail linkers and crawler for the LRU crawler. */
1731 void do_item_linktail_q(item *it) { /* item is the new tail */
1732 item **head, **tail;
1733 assert(it->it_flags == 1);
1734 assert(it->nbytes == 0);
1735
1736 head = &heads[it->slabs_clsid];
1737 tail = &tails[it->slabs_clsid];
1738 //assert(*tail != 0);
1739 assert(it != *tail);
1740 assert((*head && *tail) || (*head == 0 && *tail == 0));
1741 it->prev = *tail;
1742 it->next = 0;
1743 if (it->prev) {
1744 assert(it->prev->next == 0);
1745 it->prev->next = it;
1746 }
1747 *tail = it;
1748 if (*head == 0) *head = it;
1749 return;
1750 }
1751
1752 void do_item_unlinktail_q(item *it) {
1753 item **head, **tail;
1754 head = &heads[it->slabs_clsid];
1755 tail = &tails[it->slabs_clsid];
1756
1757 if (*head == it) {
1758 assert(it->prev == 0);
1759 *head = it->next;
1760 }
1761 if (*tail == it) {
1762 assert(it->next == 0);
1763 *tail = it->prev;
1764 }
1765 assert(it->next != it);
1766 assert(it->prev != it);
1767
1768 if (it->next) it->next->prev = it->prev;
1769 if (it->prev) it->prev->next = it->next;
1770 return;
1771 }
1772
1773 /* This is too convoluted, but it's a difficult shuffle. Try to rewrite it
1774 * more clearly. */
1775 item *do_item_crawl_q(item *it) {
1776 item **head, **tail;
1777 assert(it->it_flags == 1);
1778 assert(it->nbytes == 0);
1779 head = &heads[it->slabs_clsid];
1780 tail = &tails[it->slabs_clsid];
1781
1782 /* We've hit the head, pop off */
1783 if (it->prev == 0) {
1784 assert(*head == it);
1785 if (it->next) {
1786 *head = it->next;
1787 assert(it->next->prev == it);
1788 it->next->prev = 0;
1789 }
1790 return NULL; /* Done */
1791 }
1792
1793 /* Swing ourselves in front of the next item */
1794 /* NB: If there is a prev, we can't be the head */
1795 assert(it->prev != it);
1796 if (it->prev) {
1797 if (*head == it->prev) {
1798 /* Prev was the head, now we're the head */
1799 *head = it;
1800 }
1801 if (*tail == it) {
1802 /* We are the tail, now they are the tail */
1803 *tail = it->prev;
1804 }
1805 assert(it->next != it);
1806 if (it->next) {
1807 assert(it->prev->next == it);
1808 it->prev->next = it->next;
1809 it->next->prev = it->prev;
1810 } else {
1811 /* Tail. Move this above? */
1812 it->prev->next = 0;
1813 }
1814 /* prev->prev's next is it->prev */
1815 it->next = it->prev;
1816 it->prev = it->next->prev;
1817 it->next->prev = it;
1818 /* New it->prev now, if we're not at the head. */
1819 if (it->prev) {
1820 it->prev->next = it;
1821 }
1822 }
1823 assert(it->next != it);
1824 assert(it->prev != it);
1825
1826 return it->next; /* success */
1827 }
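
/* Illustration of the shuffle above: with the crawler item C sitting between
 * B and D in a list head -> A <-> B <-> C <-> D <- tail, one call swaps C in
 * front of B and returns B, the next item for the crawler to inspect:
 *
 *   before: head -> A <-> B <-> C <-> D <- tail
 *   after:  head -> A <-> C <-> B <-> D <- tail
 *
 * Repeated calls walk C toward the head; once C->prev is NULL it is popped
 * off the head and NULL is returned, ending the crawl of this LRU. */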