"Fossies" - the Fresh Open Source Software Archive 
Member "memcached-1.6.15/slabs.c" (21 Feb 2022, 46171 Bytes) of package /linux/www/memcached-1.6.15.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
For more information about "slabs.c" see the
Fossies "Dox" file reference documentation.
1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /*
3 * Slabs memory allocation, based on powers-of-N. Slabs are up to 1MB in size
4 * and are divided into chunks. The chunk sizes start off at the size of the
5 * "item" structure plus space for a small key and value. They increase by
6 * a multiplier factor from there, up to half the maximum slab size. The last
7 * slab size is always 1MB, since that's the maximum item size allowed by the
8 * memcached protocol.
9 */
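/* Illustrative only, not part of the allocator logic: assuming memcached's
 * default settings (growth factor 1.25, default chunk_size), the class sizes
 * typically come out as roughly 96, 120, 152, 192, 240, ... bytes, each
 * rounded up to CHUNK_ALIGN_BYTES. The exact progression depends on
 * sizeof(item), settings.chunk_size and settings.factor, as computed in
 * slabs_init() below. */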
10 #include "memcached.h"
11 #include "storage.h"
12 #include <sys/mman.h>
13 #include <sys/stat.h>
14 #include <sys/socket.h>
15 #include <sys/resource.h>
16 #include <fcntl.h>
17 #include <netinet/in.h>
18 #include <errno.h>
19 #include <stdlib.h>
20 #include <stdio.h>
21 #include <string.h>
22 #include <signal.h>
23 #include <assert.h>
24 #include <pthread.h>
25
26 //#define DEBUG_SLAB_MOVER
27 /* powers-of-N allocation structures */
28
29 typedef struct {
30 unsigned int size; /* sizes of items */
31 unsigned int perslab; /* how many items per slab */
32
33 void *slots; /* list of item ptrs */
34 unsigned int sl_curr; /* total free items in list */
35
36 unsigned int slabs; /* how many slabs were allocated for this class */
37
38 void **slab_list; /* array of slab pointers */
39 unsigned int list_size; /* size of prev array */
40 } slabclass_t;
41
42 static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];
43 static size_t mem_limit = 0;
44 static size_t mem_malloced = 0;
45 /* If the memory limit has been hit once. Used as a hint to decide when to
46 * early-wake the LRU maintenance thread */
47 static bool mem_limit_reached = false;
48 static int power_largest;
49
50 static void *mem_base = NULL;
51 static void *mem_current = NULL;
52 static size_t mem_avail = 0;
53 #ifdef EXTSTORE
54 static void *storage = NULL;
55 #endif
56 /**
57 * Access to the slab allocator is protected by this lock
58 */
59 static pthread_mutex_t slabs_lock = PTHREAD_MUTEX_INITIALIZER;
60 static pthread_mutex_t slabs_rebalance_lock = PTHREAD_MUTEX_INITIALIZER;
61
62 /*
63 * Forward Declarations
64 */
65 static int grow_slab_list (const unsigned int id);
66 static int do_slabs_newslab(const unsigned int id);
67 static void *memory_allocate(size_t size);
68 static void do_slabs_free(void *ptr, const size_t size, unsigned int id);
69
70 /* Preallocate as many slab pages as possible (called from slabs_init)
71 on start-up, so users don't get confusing out-of-memory errors when
72 they do have free (in-slab) space, but no space to make new slabs.
73 If maxslabs is 18 (POWER_LARGEST - POWER_SMALLEST + 1), then all
74 slab types can be made. If max memory is less than 18 MB, only the
75 smaller ones will be made. */
76 static void slabs_preallocate (const unsigned int maxslabs);
77 #ifdef EXTSTORE
78 void slabs_set_storage(void *arg) {
79 storage = arg;
80 }
81 #endif
82 /*
83 * Figures out which slab class (chunk size) is required to store an item of
84 * a given size.
85 *
86 * Given object size, return id to use when allocating/freeing memory for object
87 * 0 means error: can't store such a large object
88 */
89
90 unsigned int slabs_clsid(const size_t size) {
91 int res = POWER_SMALLEST;
92
93 if (size == 0 || size > settings.item_size_max)
94 return 0;
95 while (size > slabclass[res].size)
96 if (res++ == power_largest) /* won't fit in the biggest slab */
97 return power_largest;
98 return res;
99 }
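/* Usage sketch (hypothetical sizes, assuming the default class layout noted
 * near the top of this file): slabs_clsid(100) returns the first class whose
 * chunk size is >= 100 bytes (e.g. a 120-byte class); a size of 0 or anything
 * above settings.item_size_max returns 0 (error); a size too big for the
 * largest class but still within item_size_max returns power_largest so the
 * object can be stored as a chunked item. */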
100
101 unsigned int slabs_size(const int clsid) {
102 return slabclass[clsid].size;
103 }
104
105 // TODO: could this work with the restartable memory?
106 // Docs say hugepages only work with private shm allocs.
107 /* Function split out for better error path handling */
108 static void * alloc_large_chunk(const size_t limit)
109 {
110 void *ptr = NULL;
111 #if defined(__linux__) && defined(MADV_HUGEPAGE)
112 size_t pagesize = 0;
113 FILE *fp;
114 int ret;
115
116 /* Get the size of huge pages */
117 fp = fopen("/proc/meminfo", "r");
118 if (fp != NULL) {
119 char buf[64];
120
121 while ((fgets(buf, sizeof(buf), fp)))
122 if (!strncmp(buf, "Hugepagesize:", 13)) {
123 ret = sscanf(buf + 13, "%zu\n", &pagesize);
124
125 /* meminfo huge page size is in KiBs */
126 pagesize <<= 10;
127 }
128 fclose(fp);
129 }
130
131 if (!pagesize) {
132 fprintf(stderr, "Failed to get supported huge page size\n");
133 return NULL;
134 }
135
136 if (settings.verbose > 1)
137 fprintf(stderr, "huge page size: %zu\n", pagesize);
138
139 /* This works because glibc simply uses mmap when the alignment is
140 * above a certain limit. */
141 ret = posix_memalign(&ptr, pagesize, limit);
142 if (ret != 0) {
143 fprintf(stderr, "Failed to get aligned memory chunk: %d\n", ret);
144 return NULL;
145 }
146
147 ret = madvise(ptr, limit, MADV_HUGEPAGE);
148 if (ret < 0) {
149 fprintf(stderr, "Failed to set transparent hugepage hint: %d\n", ret);
150 free(ptr);
151 ptr = NULL;
152 }
153 #elif defined(__FreeBSD__)
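/* FreeBSD note: MAP_ALIGNED() encodes the log2 of the requested alignment
 * into the mmap flags; the expression below evaluates to 12 on LP64
 * (i.e. 4 KiB alignment), while MAP_ALIGNED_SUPER asks the kernel for
 * superpage ("huge page") alignment. */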
154 size_t align = (sizeof(size_t) * 8 - (__builtin_clzl(4095)));
155 ptr = mmap(NULL, limit, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON | MAP_ALIGNED(align) | MAP_ALIGNED_SUPER, -1, 0);
156 if (ptr == MAP_FAILED) {
157 fprintf(stderr, "Failed to set super pages\n");
158 ptr = NULL;
159 }
160 #else
161 ptr = malloc(limit);
162 #endif
163 return ptr;
164 }
165
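/* Used by the restartable-cache startup path: walks previously mapped memory
 * chunk by chunk, re-registering each page with its slab class (or with the
 * global pool when the page is unused) and re-stacking free chunks onto the
 * class freelist. Returns the class chunk size so the caller knows the stride
 * to the next chunk, or (unsigned)-1 for an unused page. */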
166 unsigned int slabs_fixup(char *chunk, const int border) {
167 slabclass_t *p;
168 item *it = (item *)chunk;
169 int id = ITEM_clsid(it);
170
171 // memory isn't used yet. shunt to global pool.
172 // (which must be 0)
173 if (id == 0) {
174 //assert(border == 0);
175 p = &slabclass[0];
176 grow_slab_list(0);
177 p->slab_list[p->slabs++] = (char*)chunk;
178 return -1;
179 }
180 p = &slabclass[id];
181
182 // if we're on a page border, add the slab to slab class
183 if (border == 0) {
184 grow_slab_list(id);
185 p->slab_list[p->slabs++] = chunk;
186 }
187
188 // increase free count if ITEM_SLABBED
189 if (it->it_flags == ITEM_SLABBED) {
190 // if ITEM_SLABBED re-stack on freelist.
191 // don't have to run pointer fixups.
192 it->prev = 0;
193 it->next = p->slots;
194 if (it->next) it->next->prev = it;
195 p->slots = it;
196
197 p->sl_curr++;
198 //fprintf(stderr, "replacing into freelist\n");
199 }
200
201 return p->size;
202 }
203
204 /**
205 * Determines the chunk sizes and initializes the slab class descriptors
206 * accordingly.
207 */
208 void slabs_init(const size_t limit, const double factor, const bool prealloc, const uint32_t *slab_sizes, void *mem_base_external, bool reuse_mem) {
209 int i = POWER_SMALLEST - 1;
210 unsigned int size = sizeof(item) + settings.chunk_size;
211
212 /* Some platforms use runtime transparent hugepages. If for any reason
213 * the initial allocation fails, the required settings do not persist
214 * for remaining allocations. As such it makes little sense to do slab
215 * preallocation. */
216 bool __attribute__ ((unused)) do_slab_prealloc = false;
217
218 mem_limit = limit;
219
220 if (prealloc && mem_base_external == NULL) {
221 mem_base = alloc_large_chunk(mem_limit);
222 if (mem_base) {
223 do_slab_prealloc = true;
224 mem_current = mem_base;
225 mem_avail = mem_limit;
226 } else {
227 fprintf(stderr, "Warning: Failed to allocate requested memory in"
228 " one large chunk.\nWill allocate in smaller chunks\n");
229 }
230 } else if (prealloc && mem_base_external != NULL) {
231 // Can't (yet) mix hugepages with mmap allocations, so separate the
232 // logic from above. Reusable memory also force-preallocates memory
233 // pages into the global pool, which requires adjusting the mem_* variables.
234 do_slab_prealloc = true;
235 mem_base = mem_base_external;
236 // _current shouldn't be used in this case, but we set it to where it
237 // should be anyway.
238 if (reuse_mem) {
239 mem_current = ((char*)mem_base) + mem_limit;
240 mem_avail = 0;
241 } else {
242 mem_current = mem_base;
243 mem_avail = mem_limit;
244 }
245 }
246
247 memset(slabclass, 0, sizeof(slabclass));
248
249 while (++i < MAX_NUMBER_OF_SLAB_CLASSES-1) {
250 if (slab_sizes != NULL) {
251 if (slab_sizes[i-1] == 0)
252 break;
253 size = slab_sizes[i-1];
254 } else if (size >= settings.slab_chunk_size_max / factor) {
255 break;
256 }
257 /* Make sure items are always n-byte aligned */
258 if (size % CHUNK_ALIGN_BYTES)
259 size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
260
261 slabclass[i].size = size;
262 slabclass[i].perslab = settings.slab_page_size / slabclass[i].size;
263 if (slab_sizes == NULL)
264 size *= factor;
265 if (settings.verbose > 1) {
266 fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
267 i, slabclass[i].size, slabclass[i].perslab);
268 }
269 }
270
271 power_largest = i;
272 slabclass[power_largest].size = settings.slab_chunk_size_max;
273 slabclass[power_largest].perslab = settings.slab_page_size / settings.slab_chunk_size_max;
274 if (settings.verbose > 1) {
275 fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
276 i, slabclass[i].size, slabclass[i].perslab);
277 }
278
279 /* for the test suite: faking of how much we've already malloc'd */
280 {
281 char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
282 if (t_initial_malloc) {
283 int64_t env_malloced;
284 if (safe_strtoll((const char *)t_initial_malloc, &env_malloced)) {
285 mem_malloced = (size_t)env_malloced;
286 }
287 }
288
289 }
290
291 if (do_slab_prealloc) {
292 if (!reuse_mem) {
293 slabs_preallocate(power_largest);
294 }
295 }
296 }
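/* Illustrative startup call, with hypothetical variable names (see memcached.c
 * for the real invocation and argument sources):
 *
 *   slabs_init(settings.maxbytes, settings.factor, preallocate,
 *              use_custom_sizes ? custom_slab_sizes : NULL,
 *              restart_mem_base, reuse_mem);
 *
 * Passing a non-NULL slab_sizes array overrides the factor-based growth. */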
297
298 void slabs_prefill_global(void) {
299 void *ptr;
300 slabclass_t *p = &slabclass[0];
301 int len = settings.slab_page_size;
302
303 while (mem_malloced < mem_limit
304 && (ptr = memory_allocate(len)) != NULL) {
305 grow_slab_list(0);
306 // Ensure the front header is zero'd to avoid confusing restart code.
307 // It's probably good enough to cast it and just zero slabs_clsid, but
308 // this is extra paranoid.
309 memset(ptr, 0, sizeof(item));
310 p->slab_list[p->slabs++] = ptr;
311 }
312 mem_limit_reached = true;
313 }
314
315 static void slabs_preallocate (const unsigned int maxslabs) {
316 int i;
317 unsigned int prealloc = 0;
318
319 /* pre-allocate a 1MB slab in every size class so people don't get
320 confused by non-intuitive "SERVER_ERROR out of memory"
321 messages. this is the most common question on the mailing
322 list. if you really don't want this, you can rebuild without
323 these three lines. */
324
325 for (i = POWER_SMALLEST; i < MAX_NUMBER_OF_SLAB_CLASSES; i++) {
326 if (++prealloc > maxslabs)
327 break;
328 if (do_slabs_newslab(i) == 0) {
329 fprintf(stderr, "Error while preallocating slab memory!\n"
330 "If using -L or other prealloc options, max memory must be "
331 "at least %d megabytes.\n", power_largest);
332 exit(1);
333 }
334 }
335 }
336
337 static int grow_slab_list (const unsigned int id) {
338 slabclass_t *p = &slabclass[id];
339 if (p->slabs == p->list_size) {
340 size_t new_size = (p->list_size != 0) ? p->list_size * 2 : 16;
341 void *new_list = realloc(p->slab_list, new_size * sizeof(void *));
342 if (new_list == 0) return 0;
343 p->list_size = new_size;
344 p->slab_list = new_list;
345 }
346 return 1;
347 }
348
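/* Carves a freshly allocated page into p->perslab free chunks by repeatedly
 * pushing chunk-sized slices onto the class freelist via do_slabs_free(). */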
349 static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
350 slabclass_t *p = &slabclass[id];
351 int x;
352 for (x = 0; x < p->perslab; x++) {
353 do_slabs_free(ptr, 0, id);
354 ptr += p->size;
355 }
356 }
357
358 /* Fast LIFO: pops the most recently added page from the global pool */
359 static void *get_page_from_global_pool(void) {
360 slabclass_t *p = &slabclass[SLAB_GLOBAL_PAGE_POOL];
361 if (p->slabs < 1) {
362 return NULL;
363 }
364 char *ret = p->slab_list[p->slabs - 1];
365 p->slabs--;
366 return ret;
367 }
368
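/* Adds one page to class id: prefers a page recycled from the global page
 * pool, otherwise allocates fresh memory via memory_allocate(). The page is
 * zeroed and split into free chunks before being appended to slab_list.
 * Returns 1 on success, 0 when the memory limit is reached or allocation
 * fails. */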
369 static int do_slabs_newslab(const unsigned int id) {
370 slabclass_t *p = &slabclass[id];
371 slabclass_t *g = &slabclass[SLAB_GLOBAL_PAGE_POOL];
372 int len = (settings.slab_reassign || settings.slab_chunk_size_max != settings.slab_page_size)
373 ? settings.slab_page_size
374 : p->size * p->perslab;
375 char *ptr;
376
377 if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0
378 && g->slabs == 0)) {
379 mem_limit_reached = true;
380 MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
381 return 0;
382 }
383
384 if ((grow_slab_list(id) == 0) ||
385 (((ptr = get_page_from_global_pool()) == NULL) &&
386 ((ptr = memory_allocate((size_t)len)) == 0))) {
387
388 MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
389 return 0;
390 }
391
392 // Always wipe the memory at this stage: in restart mode the mmap memory
393 // could be unused, yet still full of data. Better for usability if we're
394 // wiping memory as it's being pulled out of the global pool instead of
395 // blocking startup all at once.
396 memset(ptr, 0, (size_t)len);
397 split_slab_page_into_freelist(ptr, id);
398
399 p->slab_list[p->slabs++] = ptr;
400 MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);
401
402 return 1;
403 }
404
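/* Pops one chunk of class id off its freelist, first growing the class with a
 * new page when the freelist is empty (unless SLABS_ALLOC_NO_NEWPAGE is set in
 * flags). Returns NULL if no chunk could be produced. */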
405 /*@null@*/
406 static void *do_slabs_alloc(const size_t size, unsigned int id,
407 unsigned int flags) {
408 slabclass_t *p;
409 void *ret = NULL;
410 item *it = NULL;
411
412 if (id < POWER_SMALLEST || id > power_largest) {
413 MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
414 return NULL;
415 }
416 p = &slabclass[id];
417 assert(p->sl_curr == 0 || (((item *)p->slots)->it_flags & ITEM_SLABBED));
418
419 assert(size <= p->size);
420 /* fail unless we have space at the end of a recently allocated page,
421 we have something on our freelist, or we could allocate a new page */
422 if (p->sl_curr == 0 && flags != SLABS_ALLOC_NO_NEWPAGE) {
423 do_slabs_newslab(id);
424 }
425
426 if (p->sl_curr != 0) {
427 /* return off our freelist */
428 it = (item *)p->slots;
429 p->slots = it->next;
430 if (it->next) it->next->prev = 0;
431 /* Kill flag and initialize refcount here for lock safety in slab
432 * mover's freeness detection. */
433 it->it_flags &= ~ITEM_SLABBED;
434 it->refcount = 1;
435 p->sl_curr--;
436 ret = (void *)it;
437 } else {
438 ret = NULL;
439 }
440
441 if (ret) {
442 MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
443 } else {
444 MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
445 }
446
447 return ret;
448 }
449
450 static void do_slabs_free_chunked(item *it, const size_t size) {
451 item_chunk *chunk = (item_chunk *) ITEM_schunk(it);
452 slabclass_t *p;
453
454 it->it_flags = ITEM_SLABBED;
455 // FIXME: refresh on how this works?
456 //it->slabs_clsid = 0;
457 it->prev = 0;
458 // header object's original classid is stored in chunk.
459 p = &slabclass[chunk->orig_clsid];
460 // original class id needs to be set on free memory.
461 it->slabs_clsid = chunk->orig_clsid;
462 if (chunk->next) {
463 chunk = chunk->next;
464 chunk->prev = 0;
465 } else {
466 // header with no attached chunk
467 chunk = NULL;
468 }
469
470 // return the header object.
471 // TODO: This is in three places, here and in do_slabs_free().
472 it->prev = 0;
473 it->next = p->slots;
474 if (it->next) it->next->prev = it;
475 p->slots = it;
476 p->sl_curr++;
477
478 item_chunk *next_chunk;
479 while (chunk) {
480 assert(chunk->it_flags == ITEM_CHUNK);
481 chunk->it_flags = ITEM_SLABBED;
482 p = &slabclass[chunk->slabs_clsid];
483 next_chunk = chunk->next;
484
485 chunk->prev = 0;
486 chunk->next = p->slots;
487 if (chunk->next) chunk->next->prev = chunk;
488 p->slots = chunk;
489 p->sl_curr++;
490
491 chunk = next_chunk;
492 }
493
494 return;
495 }
496
497
498 static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
499 slabclass_t *p;
500 item *it;
501
502 assert(id >= POWER_SMALLEST && id <= power_largest);
503 if (id < POWER_SMALLEST || id > power_largest)
504 return;
505
506 MEMCACHED_SLABS_FREE(size, id, ptr);
507 p = &slabclass[id];
508
509 it = (item *)ptr;
510 if ((it->it_flags & ITEM_CHUNKED) == 0) {
511 it->it_flags = ITEM_SLABBED;
512 it->slabs_clsid = id;
513 it->prev = 0;
514 it->next = p->slots;
515 if (it->next) it->next->prev = it;
516 p->slots = it;
517
518 p->sl_curr++;
519 } else {
520 do_slabs_free_chunked(it, size);
521 }
522 return;
523 }
524
525 /* With refactoring of the various stats code the automover won't need a
526 * custom function here.
527 */
528 void fill_slab_stats_automove(slab_stats_automove *am) {
529 int n;
530 pthread_mutex_lock(&slabs_lock);
531 for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
532 slabclass_t *p = &slabclass[n];
533 slab_stats_automove *cur = &am[n];
534 cur->chunks_per_page = p->perslab;
535 cur->free_chunks = p->sl_curr;
536 cur->total_pages = p->slabs;
537 cur->chunk_size = p->size;
538 }
539 pthread_mutex_unlock(&slabs_lock);
540 }
541
542 /* TODO: slabs_available_chunks should grow up to encompass this.
543 * mem_flag is redundant with the other function.
544 */
545 unsigned int global_page_pool_size(bool *mem_flag) {
546 unsigned int ret = 0;
547 pthread_mutex_lock(&slabs_lock);
548 if (mem_flag != NULL)
549 *mem_flag = mem_malloced >= mem_limit ? true : false;
550 ret = slabclass[SLAB_GLOBAL_PAGE_POOL].slabs;
551 pthread_mutex_unlock(&slabs_lock);
552 return ret;
553 }
554
555 /*@null@*/
556 static void do_slabs_stats(ADD_STAT add_stats, void *c) {
557 int i, total;
558 /* Get the per-thread stats which contain some interesting aggregates */
559 struct thread_stats thread_stats;
560 threadlocal_stats_aggregate(&thread_stats);
561
562 total = 0;
563 for(i = POWER_SMALLEST; i <= power_largest; i++) {
564 slabclass_t *p = &slabclass[i];
565 if (p->slabs != 0) {
566 uint32_t perslab, slabs;
567 slabs = p->slabs;
568 perslab = p->perslab;
569
570 char key_str[STAT_KEY_LEN];
571 char val_str[STAT_VAL_LEN];
572 int klen = 0, vlen = 0;
573
574 APPEND_NUM_STAT(i, "chunk_size", "%u", p->size);
575 APPEND_NUM_STAT(i, "chunks_per_page", "%u", perslab);
576 APPEND_NUM_STAT(i, "total_pages", "%u", slabs);
577 APPEND_NUM_STAT(i, "total_chunks", "%u", slabs * perslab);
578 APPEND_NUM_STAT(i, "used_chunks", "%u",
579 slabs*perslab - p->sl_curr);
580 APPEND_NUM_STAT(i, "free_chunks", "%u", p->sl_curr);
581 /* Stat is dead, but displaying zero instead of removing it. */
582 APPEND_NUM_STAT(i, "free_chunks_end", "%u", 0);
583 APPEND_NUM_STAT(i, "get_hits", "%llu",
584 (unsigned long long)thread_stats.slab_stats[i].get_hits);
585 APPEND_NUM_STAT(i, "cmd_set", "%llu",
586 (unsigned long long)thread_stats.slab_stats[i].set_cmds);
587 APPEND_NUM_STAT(i, "delete_hits", "%llu",
588 (unsigned long long)thread_stats.slab_stats[i].delete_hits);
589 APPEND_NUM_STAT(i, "incr_hits", "%llu",
590 (unsigned long long)thread_stats.slab_stats[i].incr_hits);
591 APPEND_NUM_STAT(i, "decr_hits", "%llu",
592 (unsigned long long)thread_stats.slab_stats[i].decr_hits);
593 APPEND_NUM_STAT(i, "cas_hits", "%llu",
594 (unsigned long long)thread_stats.slab_stats[i].cas_hits);
595 APPEND_NUM_STAT(i, "cas_badval", "%llu",
596 (unsigned long long)thread_stats.slab_stats[i].cas_badval);
597 APPEND_NUM_STAT(i, "touch_hits", "%llu",
598 (unsigned long long)thread_stats.slab_stats[i].touch_hits);
599 total++;
600 }
601 }
602
603 /* add overall slab stats and append terminator */
604
605 APPEND_STAT("active_slabs", "%d", total);
606 APPEND_STAT("total_malloced", "%llu", (unsigned long long)mem_malloced);
607 add_stats(NULL, 0, NULL, 0, c);
608 }
609
610 static void *memory_allocate(size_t size) {
611 void *ret;
612
613 if (mem_base == NULL) {
614 /* We are not using a preallocated large memory chunk */
615 ret = malloc(size);
616 } else {
617 ret = mem_current;
618
619 if (size > mem_avail) {
620 return NULL;
621 }
622
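/* Note: the availability check above uses the caller's unrounded size;
 * after the alignment rounding below, the effective size can exceed
 * mem_avail by up to CHUNK_ALIGN_BYTES - 1, which is why the final
 * branch clamps mem_avail to zero instead of letting it underflow. */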
623 /* mem_current pointer _must_ be aligned!!! */
624 if (size % CHUNK_ALIGN_BYTES) {
625 size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
626 }
627
628 mem_current = ((char*)mem_current) + size;
629 if (size < mem_avail) {
630 mem_avail -= size;
631 } else {
632 mem_avail = 0;
633 }
634 }
635 mem_malloced += size;
636
637 return ret;
638 }
639
640 /* Must only be used if all pages are item_size_max */
641 static void memory_release() {
642 void *p = NULL;
643 if (mem_base != NULL)
644 return;
645
646 if (!settings.slab_reassign)
647 return;
648
649 while (mem_malloced > mem_limit &&
650 (p = get_page_from_global_pool()) != NULL) {
651 free(p);
652 mem_malloced -= settings.slab_page_size;
653 }
654 }
655
656 void *slabs_alloc(size_t size, unsigned int id,
657 unsigned int flags) {
658 void *ret;
659
660 pthread_mutex_lock(&slabs_lock);
661 ret = do_slabs_alloc(size, id, flags);
662 pthread_mutex_unlock(&slabs_lock);
663 return ret;
664 }
665
666 void slabs_free(void *ptr, size_t size, unsigned int id) {
667 pthread_mutex_lock(&slabs_lock);
668 do_slabs_free(ptr, size, id);
669 pthread_mutex_unlock(&slabs_lock);
670 }
671
672 void slabs_stats(ADD_STAT add_stats, void *c) {
673 pthread_mutex_lock(&slabs_lock);
674 do_slabs_stats(add_stats, c);
675 pthread_mutex_unlock(&slabs_lock);
676 }
677
678 static bool do_slabs_adjust_mem_limit(size_t new_mem_limit) {
679 /* Cannot adjust memory limit at runtime if prealloc'ed */
680 if (mem_base != NULL)
681 return false;
682 settings.maxbytes = new_mem_limit;
683 mem_limit = new_mem_limit;
684 mem_limit_reached = false; /* Will reset on next alloc */
685 memory_release(); /* free what might already be in the global pool */
686 return true;
687 }
688
689 bool slabs_adjust_mem_limit(size_t new_mem_limit) {
690 bool ret;
691 pthread_mutex_lock(&slabs_lock);
692 ret = do_slabs_adjust_mem_limit(new_mem_limit);
693 pthread_mutex_unlock(&slabs_lock);
694 return ret;
695 }
696
697 unsigned int slabs_available_chunks(const unsigned int id, bool *mem_flag,
698 unsigned int *chunks_perslab) {
699 unsigned int ret;
700 slabclass_t *p;
701
702 pthread_mutex_lock(&slabs_lock);
703 p = &slabclass[id];
704 ret = p->sl_curr;
705 if (mem_flag != NULL)
706 *mem_flag = mem_malloced >= mem_limit ? true : false;
707 if (chunks_perslab != NULL)
708 *chunks_perslab = p->perslab;
709 pthread_mutex_unlock(&slabs_lock);
710 return ret;
711 }
712
713 /* The slabber system could avoid needing to understand much, if anything,
714 * about items if callbacks were strategically used. Due to how the slab mover
715 * works, certain flag bits can only be adjusted while holding the slabs lock.
716 * Using these functions, isolate sections of code needing this and turn them
717 * into callbacks when an interface becomes more obvious.
718 */
719 void slabs_mlock(void) {
720 pthread_mutex_lock(&slabs_lock);
721 }
722
723 void slabs_munlock(void) {
724 pthread_mutex_unlock(&slabs_lock);
725 }
726
727 static pthread_cond_t slab_rebalance_cond = PTHREAD_COND_INITIALIZER;
728 static volatile int do_run_slab_rebalance_thread = 1;
729
730 static int slab_rebalance_start(void) {
731 slabclass_t *s_cls;
732 int no_go = 0;
733
734 pthread_mutex_lock(&slabs_lock);
735
736 if (slab_rebal.s_clsid < SLAB_GLOBAL_PAGE_POOL ||
737 slab_rebal.s_clsid > power_largest ||
738 slab_rebal.d_clsid < SLAB_GLOBAL_PAGE_POOL ||
739 slab_rebal.d_clsid > power_largest ||
740 slab_rebal.s_clsid == slab_rebal.d_clsid)
741 no_go = -2;
742
743 s_cls = &slabclass[slab_rebal.s_clsid];
744
745 if (!grow_slab_list(slab_rebal.d_clsid)) {
746 no_go = -1;
747 }
748
749 if (s_cls->slabs < 2)
750 no_go = -3;
751
752 if (no_go != 0) {
753 pthread_mutex_unlock(&slabs_lock);
754 return no_go; /* Should use a wrapper function... */
755 }
756
757 /* Always kill the first available slab page as it is most likely to
758 * contain the oldest items
759 */
760 slab_rebal.slab_start = s_cls->slab_list[0];
761 slab_rebal.slab_end = (char *)slab_rebal.slab_start +
762 (s_cls->size * s_cls->perslab);
763 slab_rebal.slab_pos = slab_rebal.slab_start;
764 slab_rebal.done = 0;
765 // Don't need to do chunk move work if page is in global pool.
766 if (slab_rebal.s_clsid == SLAB_GLOBAL_PAGE_POOL) {
767 slab_rebal.done = 1;
768 }
769
770 // Bit-vector to keep track of completed chunks
771 slab_rebal.completed = (uint8_t*)calloc(s_cls->perslab,sizeof(uint8_t));
772
773 slab_rebalance_signal = 2;
774
775 if (settings.verbose > 1) {
776 fprintf(stderr, "Started a slab rebalance\n");
777 }
778
779 pthread_mutex_unlock(&slabs_lock);
780
781 STATS_LOCK();
782 stats_state.slab_reassign_running = true;
783 STATS_UNLOCK();
784
785 return 0;
786 }
787
788 /* CALLED WITH slabs_lock HELD */
789 static void *slab_rebalance_alloc(const size_t size, unsigned int id) {
790 slabclass_t *s_cls;
791 s_cls = &slabclass[slab_rebal.s_clsid];
792 int x;
793 item *new_it = NULL;
794
795 for (x = 0; x < s_cls->perslab; x++) {
796 new_it = do_slabs_alloc(size, id, SLABS_ALLOC_NO_NEWPAGE);
797 /* check that memory isn't within the range to clear */
798 if (new_it == NULL) {
799 break;
800 }
801 if ((void *)new_it >= slab_rebal.slab_start
802 && (void *)new_it < slab_rebal.slab_end) {
803 /* Pulled something we intend to free. Mark it as freed since
804 * we've already done the work of unlinking it from the freelist.
805 */
806 new_it->refcount = 0;
807 new_it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
808 #ifdef DEBUG_SLAB_MOVER
809 memcpy(ITEM_key(new_it), "deadbeef", 8);
810 #endif
811 new_it = NULL;
812 slab_rebal.inline_reclaim++;
813 } else {
814 break;
815 }
816 }
817 return new_it;
818 }
819
820 /* CALLED WITH slabs_lock HELD */
821 /* detaches item/chunk from freelist. */
822 static void slab_rebalance_cut_free(slabclass_t *s_cls, item *it) {
823 /* Ensure this was on the freelist and nothing else. */
824 assert(it->it_flags == ITEM_SLABBED);
825 if (s_cls->slots == it) {
826 s_cls->slots = it->next;
827 }
828 if (it->next) it->next->prev = it->prev;
829 if (it->prev) it->prev->next = it->next;
830 s_cls->sl_curr--;
831 }
832
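/* Outcome of inspecting one chunk during a move pass: MOVE_PASS = nothing to
 * do (chunk already cleared), MOVE_FROM_SLAB = chunk was free on the class
 * freelist, MOVE_FROM_LRU = live linked item to rescue or evict, MOVE_BUSY =
 * item is referenced or in transit so retry on a later pass, MOVE_LOCKED =
 * the item hash lock could not be taken this pass. */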
833 enum move_status {
834 MOVE_PASS=0, MOVE_FROM_SLAB, MOVE_FROM_LRU, MOVE_BUSY, MOVE_LOCKED
835 };
836
837 #define SLAB_MOVE_MAX_LOOPS 1000
838
839 /* refcount == 0 is safe since nobody can incr while item_lock is held.
840 * refcount != 0 is impossible since flags/etc can be modified in other
841 * threads. instead, note we found a busy one and bail. logic in do_item_get
842 * will prevent busy items from continuing to be busy
843 * NOTE: This is checking it_flags outside of an item lock. I believe this
844 * works since it_flags is 8 bits, and we're only ever comparing a single bit
845 * regardless. ITEM_SLABBED bit will always be correct since we're holding the
846 * lock which modifies that bit. ITEM_LINKED won't exist if we're between an
847 * item having ITEM_SLABBED removed, and the key hasn't been added to the item
848 * yet. The memory barrier from the slabs lock should order the key write and the
849 * flags to the item?
850 * If ITEM_LINKED did exist and was just removed, but we still see it, that's
851 * still safe since it will have a valid key, which we then lock, and then
852 * recheck everything.
853 * This may not be safe on all platforms; If not, slabs_alloc() will need to
854 * seed the item key while holding slabs_lock.
855 */
856 static int slab_rebalance_move(void) {
857 slabclass_t *s_cls;
858 int was_busy = 0;
859 int refcount = 0;
860 uint32_t hv;
861 void *hold_lock;
862 enum move_status status = MOVE_PASS;
863
864 s_cls = &slabclass[slab_rebal.s_clsid];
865 // the offset to check if completed or not
866 int offset = ((char*)slab_rebal.slab_pos-(char*)slab_rebal.slab_start)/(s_cls->size);
867
868 // skip acquiring the slabs lock for items we've already fully processed.
869 if (slab_rebal.completed[offset] == 0) {
870 pthread_mutex_lock(&slabs_lock);
871 hv = 0;
872 hold_lock = NULL;
873 item *it = slab_rebal.slab_pos;
874
875 item_chunk *ch = NULL;
876 status = MOVE_PASS;
877
878 if (it->it_flags & ITEM_CHUNK) {
879 /* This chunk is a chained part of a larger item. */
880 ch = (item_chunk *) it;
881 /* Instead, we use the head chunk to find the item and effectively
882 * lock the entire structure. If a chunk has ITEM_CHUNK flag, its
883 * head cannot be slabbed, so the normal routine is safe. */
884 it = ch->head;
885 assert(it->it_flags & ITEM_CHUNKED);
886 }
887
888 /* ITEM_FETCHED when ITEM_SLABBED is overloaded to mean we've cleared
889 * the chunk for move. Only these two flags should exist.
890 */
891 if (it->it_flags != (ITEM_SLABBED|ITEM_FETCHED)) {
892 /* ITEM_SLABBED can only be added/removed under the slabs_lock */
893 if (it->it_flags & ITEM_SLABBED) {
894 assert(ch == NULL);
895 slab_rebalance_cut_free(s_cls, it);
896 status = MOVE_FROM_SLAB;
897 } else if ((it->it_flags & ITEM_LINKED) != 0) {
898 /* If it doesn't have ITEM_SLABBED, the item could be in any
899 * state on its way to being freed or written to. If no
900 * ITEM_SLABBED, but it's had ITEM_LINKED, it must be active
901 * and have the key written to it already.
902 */
903 hv = hash(ITEM_key(it), it->nkey);
904 if ((hold_lock = item_trylock(hv)) == NULL) {
905 status = MOVE_LOCKED;
906 } else {
907 bool is_linked = (it->it_flags & ITEM_LINKED);
908 refcount = refcount_incr(it);
909 if (refcount == 2) { /* item is linked but not busy */
910 /* Double check ITEM_LINKED flag here, since we're
911 * past a memory barrier from the mutex. */
912 if (is_linked) {
913 status = MOVE_FROM_LRU;
914 } else {
915 /* refcount == 1 + !ITEM_LINKED means the item is being
916 * uploaded to, or was just unlinked but hasn't been freed
917 * yet. Let it bleed off on its own and try again later */
918 status = MOVE_BUSY;
919 }
920 } else if (refcount > 2 && is_linked) {
921 // TODO: Mark items for delete/rescue and process
922 // outside of the main loop.
923 if (slab_rebal.busy_loops > SLAB_MOVE_MAX_LOOPS) {
924 slab_rebal.busy_deletes++;
925 // Only safe to hold slabs lock because refcount
926 // can't drop to 0 until we release item lock.
927 STORAGE_delete(storage, it);
928 pthread_mutex_unlock(&slabs_lock);
929 do_item_unlink(it, hv);
930 pthread_mutex_lock(&slabs_lock);
931 }
932 status = MOVE_BUSY;
933 } else {
934 if (settings.verbose > 2) {
935 fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
936 it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
937 }
938 status = MOVE_BUSY;
939 }
940 /* Item lock must be held while modifying refcount */
941 if (status == MOVE_BUSY) {
942 refcount_decr(it);
943 item_trylock_unlock(hold_lock);
944 }
945 }
946 } else {
947 /* See above comment. No ITEM_SLABBED or ITEM_LINKED. Mark
948 * busy and wait for item to complete its upload. */
949 status = MOVE_BUSY;
950 }
951 }
952
953 int save_item = 0;
954 item *new_it = NULL;
955 size_t ntotal = 0;
956 switch (status) {
957 case MOVE_FROM_LRU:
958 /* Lock order is LRU locks -> slabs_lock. unlink uses LRU lock.
959 * We only need to hold the slabs_lock while initially looking
960 * at an item, and at this point we have an exclusive refcount
961 * (2) + the item is locked. Drop slabs lock, drop item to
962 * refcount 1 (just our own), then fall through and wipe it.
963 */
964 /* Check if expired or flushed */
965 ntotal = ITEM_ntotal(it);
966 #ifdef EXTSTORE
967 if (it->it_flags & ITEM_HDR) {
968 ntotal = (ntotal - it->nbytes) + sizeof(item_hdr);
969 }
970 #endif
971 /* REQUIRES slabs_lock: CHECK FOR cls->sl_curr > 0 */
972 if (ch == NULL && (it->it_flags & ITEM_CHUNKED)) {
973 /* Chunked should be identical to non-chunked, except we need
974 * to swap out ntotal for the head-chunk-total. */
975 ntotal = s_cls->size;
976 }
977 if ((it->exptime != 0 && it->exptime < current_time)
978 || item_is_flushed(it)) {
979 /* Expired, don't save. */
980 save_item = 0;
981 } else if (ch == NULL &&
982 (new_it = slab_rebalance_alloc(ntotal, slab_rebal.s_clsid)) == NULL) {
983 /* Not a chunk of an item, and nomem. */
984 save_item = 0;
985 slab_rebal.evictions_nomem++;
986 } else if (ch != NULL &&
987 (new_it = slab_rebalance_alloc(s_cls->size, slab_rebal.s_clsid)) == NULL) {
988 /* Is a chunk of an item, and nomem. */
989 save_item = 0;
990 slab_rebal.evictions_nomem++;
991 } else {
992 /* Was whatever it was, and we have memory for it. */
993 save_item = 1;
994 }
995 pthread_mutex_unlock(&slabs_lock);
996 if (save_item) {
997 if (ch == NULL) {
998 assert((new_it->it_flags & ITEM_CHUNKED) == 0);
999 /* if free memory, memcpy. clear prev/next/h_bucket */
1000 memcpy(new_it, it, ntotal);
1001 new_it->prev = 0;
1002 new_it->next = 0;
1003 new_it->h_next = 0;
1004 /* These are definitely required. else fails assert */
1005 new_it->it_flags &= ~ITEM_LINKED;
1006 new_it->refcount = 0;
1007 do_item_replace(it, new_it, hv);
1008 /* Need to walk the chunks and repoint head */
1009 if (new_it->it_flags & ITEM_CHUNKED) {
1010 item_chunk *fch = (item_chunk *) ITEM_schunk(new_it);
1011 fch->next->prev = fch;
1012 while (fch) {
1013 fch->head = new_it;
1014 fch = fch->next;
1015 }
1016 }
1017 it->refcount = 0;
1018 it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
1019 #ifdef DEBUG_SLAB_MOVER
1020 memcpy(ITEM_key(it), "deadbeef", 8);
1021 #endif
1022 slab_rebal.rescues++;
1023 } else {
1024 item_chunk *nch = (item_chunk *) new_it;
1025 /* Chunks always have head chunk (the main it) */
1026 ch->prev->next = nch;
1027 if (ch->next)
1028 ch->next->prev = nch;
1029 memcpy(nch, ch, ch->used + sizeof(item_chunk));
1030 ch->refcount = 0;
1031 ch->it_flags = ITEM_SLABBED|ITEM_FETCHED;
1032 slab_rebal.chunk_rescues++;
1033 #ifdef DEBUG_SLAB_MOVER
1034 memcpy(ITEM_key((item *)ch), "deadbeef", 8);
1035 #endif
1036 refcount_decr(it);
1037 }
1038 slab_rebal.completed[offset] = 1;
1039 } else {
1040 /* unlink and mark as done if it's not a chunked item
1041 * (chunked items require more book-keeping) */
1042 STORAGE_delete(storage, it);
1043 if (!ch && (it->it_flags & ITEM_CHUNKED) == 0) {
1044 do_item_unlink(it, hv);
1045 it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
1046 it->refcount = 0;
1047 #ifdef DEBUG_SLAB_MOVER
1048 memcpy(ITEM_key(it), "deadbeef", 8);
1049 #endif
1050 slab_rebal.completed[offset] = 1;
1051 } else {
1052 ntotal = ITEM_ntotal(it);
1053 do_item_unlink(it, hv);
1054 slabs_free(it, ntotal, slab_rebal.s_clsid);
1055 /* Swing around again later to remove it from the freelist. */
1056 slab_rebal.busy_items++;
1057 was_busy++;
1058 }
1059
1060 }
1061 item_trylock_unlock(hold_lock);
1062 pthread_mutex_lock(&slabs_lock);
1063 /* Always remove the ntotal, as we added it in during
1064 * do_slabs_alloc() when copying the item.
1065 */
1066 break;
1067 case MOVE_FROM_SLAB:
1068 slab_rebal.completed[offset] = 1;
1069 it->refcount = 0;
1070 it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
1071 #ifdef DEBUG_SLAB_MOVER
1072 memcpy(ITEM_key(it), "deadbeef", 8);
1073 #endif
1074 break;
1075 case MOVE_BUSY:
1076 case MOVE_LOCKED:
1077 slab_rebal.busy_items++;
1078 was_busy++;
1079 break;
1080 case MOVE_PASS:
1081 break;
1082 }
1083
1084 pthread_mutex_unlock(&slabs_lock);
1085 }
1086
1087 // Note: slab_rebal.* is occasionally protected under slabs_lock, but
1088 // the mover thread is the only user while active: so it's only necessary
1089 // for start/stop synchronization.
1090 slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
1091
1092 if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
1093 /* Some items were busy, start again from the top */
1094 if (slab_rebal.busy_items) {
1095 slab_rebal.slab_pos = slab_rebal.slab_start;
1096 STATS_LOCK();
1097 stats.slab_reassign_busy_items += slab_rebal.busy_items;
1098 STATS_UNLOCK();
1099 slab_rebal.busy_items = 0;
1100 slab_rebal.busy_loops++;
1101 } else {
1102 slab_rebal.done++;
1103 }
1104 }
1105
1106 return was_busy;
1107 }
1108
1109 static void slab_rebalance_finish(void) {
1110 slabclass_t *s_cls;
1111 slabclass_t *d_cls;
1112 int x;
1113 uint32_t rescues;
1114 uint32_t evictions_nomem;
1115 uint32_t inline_reclaim;
1116 uint32_t chunk_rescues;
1117 uint32_t busy_deletes;
1118
1119 pthread_mutex_lock(&slabs_lock);
1120
1121 s_cls = &slabclass[slab_rebal.s_clsid];
1122 d_cls = &slabclass[slab_rebal.d_clsid];
1123
1124 #ifdef DEBUG_SLAB_MOVER
1125 /* If the algorithm is broken, live items can sneak in. */
1126 slab_rebal.slab_pos = slab_rebal.slab_start;
1127 while (1) {
1128 item *it = slab_rebal.slab_pos;
1129 assert(it->it_flags == (ITEM_SLABBED|ITEM_FETCHED));
1130 assert(memcmp(ITEM_key(it), "deadbeef", 8) == 0);
1131 it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
1132 slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
1133 if (slab_rebal.slab_pos >= slab_rebal.slab_end)
1134 break;
1135 }
1136 #endif
1137
1138 /* At this point the stolen slab is completely clear.
1139 * We always kill the "first"/"oldest" slab page in the slab_list, so
1140 * shuffle the page list backwards and decrement.
1141 */
1142 s_cls->slabs--;
1143 for (x = 0; x < s_cls->slabs; x++) {
1144 s_cls->slab_list[x] = s_cls->slab_list[x+1];
1145 }
1146
1147 d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
1148 /* Don't need to split the page into chunks if we're just storing it */
1149 if (slab_rebal.d_clsid > SLAB_GLOBAL_PAGE_POOL) {
1150 memset(slab_rebal.slab_start, 0, (size_t)settings.slab_page_size);
1151 split_slab_page_into_freelist(slab_rebal.slab_start,
1152 slab_rebal.d_clsid);
1153 } else if (slab_rebal.d_clsid == SLAB_GLOBAL_PAGE_POOL) {
1154 /* memset just enough to signal restart handler to skip */
1155 memset(slab_rebal.slab_start, 0, sizeof(item));
1156 /* mem_malloc'ed might be higher than mem_limit. */
1157 mem_limit_reached = false;
1158 memory_release();
1159 }
1160
1161 slab_rebal.busy_loops = 0;
1162 slab_rebal.done = 0;
1163 slab_rebal.s_clsid = 0;
1164 slab_rebal.d_clsid = 0;
1165 slab_rebal.slab_start = NULL;
1166 slab_rebal.slab_end = NULL;
1167 slab_rebal.slab_pos = NULL;
1168 evictions_nomem = slab_rebal.evictions_nomem;
1169 inline_reclaim = slab_rebal.inline_reclaim;
1170 rescues = slab_rebal.rescues;
1171 chunk_rescues = slab_rebal.chunk_rescues;
1172 busy_deletes = slab_rebal.busy_deletes;
1173 slab_rebal.evictions_nomem = 0;
1174 slab_rebal.inline_reclaim = 0;
1175 slab_rebal.rescues = 0;
1176 slab_rebal.chunk_rescues = 0;
1177 slab_rebal.busy_deletes = 0;
1178
1179 slab_rebalance_signal = 0;
1180
1181 free(slab_rebal.completed);
1182 pthread_mutex_unlock(&slabs_lock);
1183
1184 STATS_LOCK();
1185 stats.slabs_moved++;
1186 stats.slab_reassign_rescues += rescues;
1187 stats.slab_reassign_evictions_nomem += evictions_nomem;
1188 stats.slab_reassign_inline_reclaim += inline_reclaim;
1189 stats.slab_reassign_chunk_rescues += chunk_rescues;
1190 stats.slab_reassign_busy_deletes += busy_deletes;
1191 stats_state.slab_reassign_running = false;
1192 STATS_UNLOCK();
1193
1194 if (settings.verbose > 1) {
1195 fprintf(stderr, "finished a slab move\n");
1196 }
1197 }
1198
1199 /* Slab mover thread.
1200 * Sits waiting for a condition to jump off and shovel some memory about
1201 */
1202 static void *slab_rebalance_thread(void *arg) {
1203 int was_busy = 0;
1204 int backoff_timer = 1;
1205 int backoff_max = 1000;
1206 /* So we first pass into cond_wait with the mutex held */
1207 mutex_lock(&slabs_rebalance_lock);
1208
1209 /* Must finish moving page before stopping */
1210 while (slab_rebalance_signal || do_run_slab_rebalance_thread) {
1211 if (slab_rebalance_signal == 1) {
1212 if (slab_rebalance_start() < 0) {
1213 /* Handle errors with more specificity as required. */
1214 slab_rebalance_signal = 0;
1215 }
1216
1217 was_busy = 0;
1218 } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
1219 was_busy = slab_rebalance_move();
1220 }
1221
1222 if (slab_rebal.done) {
1223 slab_rebalance_finish();
1224 } else if (was_busy) {
1225 /* Stuck waiting for some items to unlock, so slow down a bit
1226 * to give them a chance to free up */
1227 usleep(backoff_timer);
1228 backoff_timer = backoff_timer * 2;
1229 if (backoff_timer > backoff_max)
1230 backoff_timer = backoff_max;
1231 }
1232
1233 if (slab_rebalance_signal == 0) {
1234 /* always hold this lock while we're running */
1235 pthread_cond_wait(&slab_rebalance_cond, &slabs_rebalance_lock);
1236 }
1237 }
1238
1239 // TODO: cancel in-flight slab page move
1240 mutex_unlock(&slabs_rebalance_lock);
1241 return NULL;
1242 }
1243
1244 /* Iterate at most once through the slab classes and pick a "random" source.
1245 * I like this better than calling rand() since rand() is slow enough that we
1246 * can just check all of the classes once instead.
1247 */
1248 static int slabs_reassign_pick_any(int dst) {
1249 static int cur = POWER_SMALLEST - 1;
1250 int tries = power_largest - POWER_SMALLEST + 1;
1251 for (; tries > 0; tries--) {
1252 cur++;
1253 if (cur > power_largest)
1254 cur = POWER_SMALLEST;
1255 if (cur == dst)
1256 continue;
1257 if (slabclass[cur].slabs > 1) {
1258 return cur;
1259 }
1260 }
1261 return -1;
1262 }
1263
1264 static enum reassign_result_type do_slabs_reassign(int src, int dst) {
1265 bool nospare = false;
1266 if (slab_rebalance_signal != 0)
1267 return REASSIGN_RUNNING;
1268
1269 if (src == dst)
1270 return REASSIGN_SRC_DST_SAME;
1271
1272 /* Special indicator to choose ourselves. */
1273 if (src == -1) {
1274 src = slabs_reassign_pick_any(dst);
1275 /* TODO: If we end up back at -1, return a new error type */
1276 }
1277
1278 if (src < SLAB_GLOBAL_PAGE_POOL || src > power_largest ||
1279 dst < SLAB_GLOBAL_PAGE_POOL || dst > power_largest)
1280 return REASSIGN_BADCLASS;
1281
1282 pthread_mutex_lock(&slabs_lock);
1283 if (slabclass[src].slabs < 2)
1284 nospare = true;
1285 pthread_mutex_unlock(&slabs_lock);
1286 if (nospare)
1287 return REASSIGN_NOSPARE;
1288
1289 slab_rebal.s_clsid = src;
1290 slab_rebal.d_clsid = dst;
1291
1292 slab_rebalance_signal = 1;
1293 pthread_cond_signal(&slab_rebalance_cond);
1294
1295 return REASSIGN_OK;
1296 }
1297
1298 enum reassign_result_type slabs_reassign(int src, int dst) {
1299 enum reassign_result_type ret;
1300 if (pthread_mutex_trylock(&slabs_rebalance_lock) != 0) {
1301 return REASSIGN_RUNNING;
1302 }
1303 ret = do_slabs_reassign(src, dst);
1304 pthread_mutex_unlock(&slabs_rebalance_lock);
1305 return ret;
1306 }
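/* Typically driven by the "slabs reassign <src> <dst>" admin command or by the
 * slab automover thread; passing src == -1 lets slabs_reassign_pick_any()
 * choose a donor class. */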
1307
1308 /* If we hold this lock, rebalancer can't wake up or move */
1309 void slabs_rebalancer_pause(void) {
1310 pthread_mutex_lock(&slabs_rebalance_lock);
1311 }
1312
1313 void slabs_rebalancer_resume(void) {
1314 pthread_mutex_unlock(&slabs_rebalance_lock);
1315 }
1316
1317 static pthread_t rebalance_tid;
1318
1319 int start_slab_maintenance_thread(void) {
1320 int ret;
1321 slab_rebalance_signal = 0;
1322 slab_rebal.slab_start = NULL;
1323
1324 if ((ret = pthread_create(&rebalance_tid, NULL,
1325 slab_rebalance_thread, NULL)) != 0) {
1326 fprintf(stderr, "Can't create rebal thread: %s\n", strerror(ret));
1327 return -1;
1328 }
1329 return 0;
1330 }
1331
1332 /* The maintenance thread is on a sleep/loop cycle, so it should join after a
1333 * short wait */
1334 void stop_slab_maintenance_thread(void) {
1335 mutex_lock(&slabs_rebalance_lock);
1336 do_run_slab_rebalance_thread = 0;
1337 pthread_cond_signal(&slab_rebalance_cond);
1338 pthread_mutex_unlock(&slabs_rebalance_lock);
1339
1340 /* Wait for the maintenance thread to stop */
1341 pthread_join(rebalance_tid, NULL);
1342 }