author     Will Newton <will.newton@linaro.org>    2013-07-11 12:37:17 +0100
committer  Will Newton <will.newton@linaro.org>    2013-07-11 12:37:17 +0100
commit     18e703c46e36dbeaeed1ac12fad345dbb1bb402c (patch)
tree       971f9032633f449d91380f231f1ec4bfb4fcf5ef
parent     2fa8fd4d7bd6a6b976fadccc18bd1b65e704ba16 (diff)
Add ssmalloc reference code.
-rw-r--r--  Makefile.am                                       43
-rw-r--r--  configure.ac                                      20
-rw-r--r--  reference/ssmalloc/include-x86_64/atomic.h        99
-rw-r--r--  reference/ssmalloc/include-x86_64/bitops.h        64
-rw-r--r--  reference/ssmalloc/include-x86_64/cpu.h           30
-rw-r--r--  reference/ssmalloc/include-x86_64/double-list.h   64
-rw-r--r--  reference/ssmalloc/include-x86_64/queue.h        192
-rw-r--r--  reference/ssmalloc/ssmalloc.c                    785
-rw-r--r--  reference/ssmalloc/ssmalloc.h                    177
9 files changed, 1461 insertions(+), 13 deletions(-)
diff --git a/Makefile.am b/Makefile.am
index d7a813c..4137d9e 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -63,19 +63,6 @@ tests_tst_valloc_CFLAGS = $(tests_cflags)
 
 TESTS = $(check_PROGRAMS)
 
-## Benchmarks
-noinst_PROGRAMS = \
-	try-none \
-	try-this \
-	try-glibc \
-	try-jemalloc
-
-# Benchmark harness
-noinst_LIBRARIES = \
-	libmallocbench.a \
-	libglibc.a \
-	libjemalloc.a
-
 libmallocbench_a_SOURCES = \
 	benchmarks/mallocbench.c
 
@@ -126,6 +113,36 @@ try_glibc_LDADD = libmallocbench.a libglibc.a -lrt -lpthread
 try_jemalloc_SOURCES =
 try_jemalloc_LDADD = libmallocbench.a libjemalloc.a -lrt -lpthread
 
+if HOST_X86_64
+libssmalloc_a_SOURCES = \
+	reference/ssmalloc/ssmalloc.c
+
+libssmalloc_a_CFLAGS = \
+	-O3 -Wstrict-prototypes -fomit-frame-pointer -g -Wall \
+	-Ireference/ssmalloc/include-x86_64
+
+try_ssmalloc_SOURCES =
+try_ssmalloc_LDADD = libmallocbench.a libssmalloc.a -lrt -lpthread
+
+x86_64_programs = try-ssmalloc
+x86_64_libraries = libssmalloc.a
+endif
+
+## Benchmarks
+noinst_PROGRAMS = \
+	try-none \
+	try-this \
+	try-glibc \
+	try-jemalloc \
+	$(x86_64_programs)
+
+# Benchmark harness
+noinst_LIBRARIES = \
+	libmallocbench.a \
+	libglibc.a \
+	libjemalloc.a \
+	$(x86_64_libraries)
+
 # Main library
 libcortex_malloc_la_SOURCES = \
 	src/malloc.c
diff --git a/configure.ac b/configure.ac
index 1a4b75a..381cbcb 100644
--- a/configure.ac
+++ b/configure.ac
@@ -32,4 +32,24 @@ AM_PROG_AS
 AC_PROG_CC
 AC_PROG_LIBTOOL
 
+case $host in
+aarch64*-*-*)
+	arch=aarch64
+	;;
+arm*-*-*)
+	arch=aarch32
+	;;
+x86_64-*-*)
+	arch=x86_64
+	;;
+*)
+	arch=generic
+	;;
+esac
+
+AM_CONDITIONAL([HOST_AARCH32], [test x$arch = xaarch32])
+AM_CONDITIONAL([HOST_AARCH64], [test x$arch = xaarch64])
+AM_CONDITIONAL([HOST_X86_64], [test x$arch = xx86_64])
+AM_CONDITIONAL([HOST_GENERIC], [test x$arch = xgeneric])
+
 AC_OUTPUT
diff --git a/reference/ssmalloc/include-x86_64/atomic.h b/reference/ssmalloc/include-x86_64/atomic.h
new file mode 100644
index 0000000..2431300
--- /dev/null
+++ b/reference/ssmalloc/include-x86_64/atomic.h
@@ -0,0 +1,99 @@
+#ifndef __SYNCHRO_ATOMIC_H__
+#define __SYNCHRO_ATOMIC_H__
+
+/* Full barrier: "sync" is a PowerPC mnemonic; mfence is the x86-64 one. */
+#define mb() asm volatile ("mfence" : : : "memory")
+#define LOCK_PREFIX "lock ; "
+
+static inline unsigned long fetch_and_store(volatile unsigned int *address, unsigned int value)
+{
+	asm volatile("xchgl %k0,%1"
+		     : "=r" (value)
+		     : "m" (*address), "0" (value)
+		     : "memory");
+
+	return value;
+}
+
+static inline int atmc_fetch_and_add(volatile unsigned int *address, int value)
+{
+	int prev = value;
+
+	asm volatile(
+		LOCK_PREFIX "xaddl %0, %1"
+		: "+r" (value), "+m" (*address)
+		: : "memory");
+
+	return prev + value;
+}
+
+static inline long long atmc_fetch_and_add64(volatile unsigned long long *address, long long value)
+{
+	/* Was "int prev", which truncated the 64-bit addend in the sum below. */
+	long long prev = value;
+
+	asm volatile(
+		LOCK_PREFIX "xaddq %0, %1"
+		: "+r" (value), "+m" (*address)
+		: : "memory");
+
+	return prev + value;
+}
+
+static inline void atmc_add32(volatile unsigned int* address, int value)
+{
+	asm volatile(
+		LOCK_PREFIX "addl %1,%0"
+		: "=m" (*address)
+		: "ir" (value), "m" (*address));
+}
+
+static inline void atmc_add64(volatile unsigned long long* address, unsigned long long value)
+{
+	asm volatile(
+		LOCK_PREFIX "addq %1,%0"
+		: "=m" (*address)
+		: "ir" (value), "m" (*address));
+}
+
+static inline unsigned int compare_and_swap32(volatile unsigned int *address, unsigned int old_value, unsigned int new_value)
+{
+	unsigned long prev = 0;
+
+	asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2"
+		     : "=a"(prev)
+		     : "r"(new_value), "m"(*address), "0"(old_value)
+		     : "memory");
"memory"); + + return prev == old_value; +} + +static inline unsigned int compare_and_swap64(volatile unsigned long long *address, unsigned long old_value, unsigned long new_value) +{ + unsigned long prev = 0; + + asm volatile(LOCK_PREFIX "cmpxchgq %1,%2" + : "=a"(prev) + : "r"(new_value), "m"(*address), "0"(old_value) + : "memory"); + + return prev == old_value; +} + +static inline unsigned long compare_and_swap64_out(volatile unsigned long long *address, unsigned long old_value, unsigned long new_value) +{ + unsigned long prev = 0; + + asm volatile(LOCK_PREFIX "cmpxchgq %1,%2" + : "=a"(prev) + : "r"(new_value), "m"(*address), "0"(old_value) + : "memory"); + + return prev; +} + +static inline unsigned long compare_and_swap_ptr(volatile void *address, void* old_ptr, void* new_ptr) +{ + return compare_and_swap64((volatile unsigned long long *)address, (unsigned long)old_ptr, (unsigned long)new_ptr); +} + +#endif + diff --git a/reference/ssmalloc/include-x86_64/bitops.h b/reference/ssmalloc/include-x86_64/bitops.h new file mode 100644 index 0000000..879f237 --- /dev/null +++ b/reference/ssmalloc/include-x86_64/bitops.h @@ -0,0 +1,64 @@ +#ifndef __X86_64_BITOPS_H_ +#define __X86_64_BITOPS_H_ + +/* + * Copyright 1992, Linus Torvalds. + */ + +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) +/* Technically wrong, but this avoids compilation errors on some gcc + versions. */ +#define ADDR "=m" (*(volatile long *) addr) +#else +#define ADDR "+m" (*(volatile long *) addr) +#endif + +/** + * __change_bit - Toggle a bit in memory + * @nr: the bit to change + * @addr: the address to start counting from + * + * Unlike change_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static __inline__ void __change_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( + "btcl %1,%0" + :ADDR + :"dIr" (nr)); +} + +/* WARNING: non atomic and it can be reordered! 
+static __inline__ int __test_and_change_bit(int nr, volatile void * addr)
+{
+	int oldbit;
+
+	__asm__ __volatile__(
+		"btcl %2,%1\n\tsbbl %0,%0"
+		:"=r" (oldbit),ADDR
+		:"dIr" (nr) : "memory");
+	return oldbit;
+}
+
+static inline unsigned long __fls(unsigned long word)
+{
+	asm("bsr %1,%0"
+	    : "=r" (word)
+	    : "rm" (word));
+	return word;
+}
+
+/* Map a small size to its size class: effectively bsr((size - 1) >> 2),
+ * with cmovz picking class 0 when the shifted value is zero. */
+static __inline__ unsigned int __get_size_class(unsigned int word) {
+	asm("dec %1\n"
+	    "shr $2,%1\n"
+	    "bsr %1,%0\n"
+	    "cmovz %2,%0\n"
+	    : "=r" (word)
+	    : "rm" (word), "r" (0));
+	return word;
+}
+
+#endif /* __X86_64_BITOPS_H_ */
+
diff --git a/reference/ssmalloc/include-x86_64/cpu.h b/reference/ssmalloc/include-x86_64/cpu.h
new file mode 100644
index 0000000..1efce67
--- /dev/null
+++ b/reference/ssmalloc/include-x86_64/cpu.h
@@ -0,0 +1,30 @@
+#ifndef __CPU_H_
+#define __CPU_H_
+
+/* Machine related macros */
+#define PAGE_SIZE (4096)
+#define SUPER_PAGE_SIZE (4*1024*1024)
+#define CACHE_LINE_SIZE (64)
+#define DEFAULT_BLOCK_CLASS (100)
+#define MAX_CORE_ID (8)
+
+static inline int get_core_id(void) {
+	/* Note: the early return short-circuits the cpuid-based lookup
+	 * below, pinning everything to core 0; the rest is dead code. */
+	return 0;
+	int result;
+	__asm__ __volatile__ (
+		"mov $1, %%eax\n"
+		"cpuid\n"
+		:"=b"(result)
+		:
+		:"eax","ecx","edx");
+	return (result>>24)%8;
+}
+
+static inline unsigned long read_tsc(void)
+{
+	unsigned a, d;
+	__asm __volatile("rdtsc":"=a"(a), "=d"(d));
+	return ((unsigned long)a) | (((unsigned long) d) << 32);
+}
+
+#endif
diff --git a/reference/ssmalloc/include-x86_64/double-list.h b/reference/ssmalloc/include-x86_64/double-list.h
new file mode 100644
index 0000000..9edd49c
--- /dev/null
+++ b/reference/ssmalloc/include-x86_64/double-list.h
@@ -0,0 +1,64 @@
+#ifndef __DOUBLE_LIST_H_
+#define __DOUBLE_LIST_H_
+
+typedef struct double_list_elem double_list_elem_t;
+typedef struct double_list double_list_t;
+
+struct double_list_elem {
+	void* __padding;
+	struct double_list_elem* next;
+	struct double_list_elem* prev;
+};
+
+struct double_list {
+	struct double_list_elem* head;
+	struct double_list_elem* tail;
+};
+
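/*
 * [Editorial note, not part of the commit] The __padding slot appears to
 * be deliberate: the lock-free queues in queue.h link elements through
 * their FIRST machine word (mc_enqueue writes *(unsigned long long*)element),
 * and structures such as dchunk_t lead with a double_list_elem.  Keeping
 * next/prev out of word 0 lets the same object move between a double list
 * and a queue without the queue's link word clobbering the list links.
 */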
+
+/* Places new_node at the front of the list. */
+static void double_list_insert_front(void* new_node, double_list_t* list)
+{
+	double_list_elem_t* elem_new = (double_list_elem_t*)new_node;
+	double_list_elem_t* old_head = list->head;
+
+	if (old_head == NULL) {
+		list->tail = elem_new;
+	}
+	else {
+		old_head->prev = elem_new;
+	}
+
+	elem_new->next = old_head;
+	elem_new->prev = NULL;
+	list->head = elem_new;
+}
+
+/* Removes node from the list. */
+static void double_list_remove(void* node, double_list_t* list)
+{
+	double_list_elem_t* elem_node = (double_list_elem_t*)node;
+
+	if (elem_node->prev != NULL) {
+		elem_node->prev->next = elem_node->next;
+	}
+	else {
+		list->head = elem_node->next;
+	}
+
+	if (elem_node->next != NULL) {
+		elem_node->next->prev = elem_node->prev;
+	}
+	else {
+		list->tail = elem_node->prev;
+	}
+
+	if (list->head != NULL && list->head->next == NULL) {
+		list->tail = list->head;
+	}
+	else if (list->tail != NULL && list->tail->prev == NULL) {
+		list->head = list->tail;
+	}
+}
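/*
 * [Editorial sketch, not part of the commit] A minimal standalone use of
 * the list above.  The allocator itself passes dchunk_t pointers, whose
 * first member is a double_list_elem, as the node argument; the struct
 * and function names here are illustrative only.
 */
#if 0	/* illustration only */
struct item {
	double_list_elem_t link;	/* must be the first member */
	int payload;
};

static void example(void)
{
	double_list_t list = { NULL, NULL };
	struct item a = { { NULL, NULL, NULL }, 1 };
	struct item b = { { NULL, NULL, NULL }, 2 };

	double_list_insert_front(&a, &list);	/* list: a */
	double_list_insert_front(&b, &list);	/* list: b -> a */
	double_list_remove(&a, &list);		/* list: b */
	/* now list.head == list.tail == (double_list_elem_t*)&b */
}
#endif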
+ +#endif diff --git a/reference/ssmalloc/include-x86_64/queue.h b/reference/ssmalloc/include-x86_64/queue.h new file mode 100644 index 0000000..666a47c --- /dev/null +++ b/reference/ssmalloc/include-x86_64/queue.h @@ -0,0 +1,192 @@ +#ifndef __QUEUE_H_ +#define __QUEUE_H_ + +#include "atomic.h" +#include <stdlib.h> + +#define CACHE_LINE_SIZE (64) +#define CACHE_ALIGN __attribute__ ((aligned (CACHE_LINE_SIZE))) + +#define ABA_ADDR_BIT (48) +#define ABA_ADDR_MASK ((1L<<ABA_ADDR_BIT)-1) +#define ABA_COUNT_MASK (~ABA_ADDR_MASK) +#define ABA_COUNT_ONE (1L<<ABA_ADDR_BIT) +#define ABA_ADDR(e) ((void*)((unsigned long long)(e) & ABA_ADDR_MASK)) +#define ABA_COUNT(e) ((unsigned long long)(e) & ABA_COUNT_MASK) + +typedef struct { + CACHE_ALIGN volatile unsigned long long head; +} queue_head_t; + +typedef void *seq_queue_head_t; + +/* Multi-Consumer LIFO Queue */ + +static inline void mc_queue_init(queue_head_t *queue) +{ + queue->head = 0; +} + +static inline void mc_enqueue(queue_head_t *queue, void *element) +{ + unsigned long long old_head; + unsigned long long new_head; + + while(1) { + old_head = queue->head; + *(unsigned long long*)element = (unsigned long long )ABA_ADDR(old_head); + new_head = (unsigned long long)element; + new_head |= ABA_COUNT(old_head) + ABA_COUNT_ONE; + if (compare_and_swap64(&queue->head, old_head, new_head)) { + return; + } + } +} + +static inline void *mc_dequeue(queue_head_t *queue) +{ + unsigned long long old_head; + unsigned long long new_head; + void* old_addr; + + while(1) { + old_head = queue->head; + old_addr = ABA_ADDR(old_head); + if(old_addr == NULL) { + return NULL; + } + new_head = *(unsigned long long*)old_addr; + new_head |= ABA_COUNT(old_head) + ABA_COUNT_ONE; + if (compare_and_swap64(&queue->head, old_head, new_head)) { + return old_addr; + } + } +} + +/* Single-Consumer LIFO Queue */ + +static inline void sc_queue_init(queue_head_t *queue) +{ + queue->head = 0; +} + +static inline void sc_enqueue(queue_head_t *queue, void *element) +{ + unsigned long long old_head; + unsigned long long new_head; + + while(1) { + old_head = queue->head; + *(unsigned long long*)element = old_head; + new_head = (unsigned long long)element; + if (compare_and_swap64(&queue->head, old_head, new_head)) { + return; + } + } +} + +static inline void *sc_dequeue(queue_head_t *queue) +{ + unsigned long long old_head; + unsigned long long new_head; + + while(1) { + old_head = queue->head; + if(old_head == 0) { + return NULL; + } + new_head = *(unsigned long long*)old_head; + if (compare_and_swap64(&queue->head, old_head, new_head)) { + return (void*)old_head; + } + } +} + +static inline void *sc_chain_dequeue(queue_head_t *queue) +{ + unsigned long long old_head; + while(1) { + old_head = queue->head; + if(old_head == 0) { + return NULL; + } + if (compare_and_swap64(&queue->head, old_head, 0)) { + return (void*)old_head; + } + } +} + +/* Sequential LIFO Queue */ + +static inline void seq_queue_init(seq_queue_head_t *queue) +{ + *queue = NULL; +} + +static inline void seq_enqueue(seq_queue_head_t *queue, void *element) +{ + *(void**)element = *queue; + *queue = element; +} + +static inline void *seq_dequeue(seq_queue_head_t *queue) +{ + void* old_head = *queue; + if(old_head == NULL) { + return NULL; + } + *queue = *(void**)old_head; + return old_head; +} + +#define seq_head(queue) (queue) + +/* Counted Queue */ +static inline void* counted_enqueue(queue_head_t *queue, void* elem) { + unsigned long long old_head, new_head, prev; + do { + old_head = queue->head; + *(unsigned long 
long*)elem = (unsigned long long)ABA_ADDR(old_head); + new_head = (unsigned long long)elem; + new_head |= ABA_COUNT(old_head) + ABA_COUNT_ONE; + + } while((prev=compare_and_swap64_out ( + &queue->head, + old_head, + new_head + ))!=old_head); + + return (void*)prev; +} + +static inline void* counted_chain_enqueue(queue_head_t *queue, void* elems, void* tail, int cnt) { + unsigned long long old_head, new_head, prev; + do { + old_head = queue->head; + *(unsigned long long*)tail = (unsigned long long)ABA_ADDR(old_head); + new_head = (unsigned long long)elems; + new_head |= ABA_COUNT(old_head) + ABA_COUNT_ONE * cnt; + + } while((prev=compare_and_swap64_out ( + &queue->head, + old_head, + new_head + ))!=old_head); + + return (void*)prev; +} + +static inline void* counted_chain_dequeue(queue_head_t* queue, uint32_t *count) { + unsigned long long old_head; + while(1) { + old_head = *(unsigned long long*)queue; + if (old_head == 0) + return(NULL); + if (compare_and_swap64(&queue->head, old_head, 0)) { + *count = ABA_COUNT(old_head) >> ABA_ADDR_BIT; + return(ABA_ADDR(old_head)); + } + } +} + +#endif diff --git a/reference/ssmalloc/ssmalloc.c b/reference/ssmalloc/ssmalloc.c new file mode 100644 index 0000000..df79a89 --- /dev/null +++ b/reference/ssmalloc/ssmalloc.c @@ -0,0 +1,785 @@ +#define _GNU_SOURCE +#include "ssmalloc.h" + +/* Global metadata */ +init_state global_state = UNINITIALIZED; +gpool_t global_pool; +pthread_key_t destructor; +pthread_once_t init_once = PTHREAD_ONCE_INIT; + +/* Mappings */ +CACHE_ALIGN int cls2size[128]; +char sizemap[256]; +char sizemap2[128]; + +/* Private metadata */ +THREAD_LOCAL init_state thread_state = UNINITIALIZED; +THREAD_LOCAL lheap_t *local_heap = NULL; + +/* System init functions */ +static void maps_init(void); +static void thread_init(void); +static void thread_exit(void *dummy); +static void global_init(void); +inline static void check_init(void); + +/* Global pool management functions */ +inline static void gpool_check_size(void *target); +static int gpool_grow(void); +static void gpool_init(void); +static void *gpool_make_raw_chunk(void); +inline static chunk_t *gpool_acquire_chunk(void); +inline static void gpool_release_chunk(dchunk_t *dc); +static lheap_t *gpool_acquire_lheap(void); +static void gpool_release_lheap(lheap_t *lh); + +/* Local heap management functions */ +inline static void lheap_init(lheap_t *lh); +inline static void lheap_replace_foreground(lheap_t *lh, int size_cls); + +/* Data chunk management functions */ +inline static void dchunk_change_cls(dchunk_t *dc, int size_cls); +inline static void dchunk_init(dchunk_t *dc, int size_cls); +inline static void dchunk_collect_garbage(dchunk_t *dc); +inline static void *dchunk_alloc_obj(dchunk_t *dc); +inline static dchunk_t* dchunk_extract(void *ptr); + +/* Object buffer management functions */ +inline static void obj_buf_flush(obj_buf_t *obuf); +inline static void obj_buf_flush_all(lheap_t *lh); +inline static void obj_buf_put(obj_buf_t *bbuf, dchunk_t * dc, void *ptr); + +/* Allocator helpers */ +inline static void *large_malloc(size_t size); +inline static void *small_malloc(int size_cls); +inline static void large_free(void *ptr); +inline static void local_free(lheap_t *lh, dchunk_t *dc, void *ptr); +inline static void remote_free(lheap_t *lh, dchunk_t *dc, void *ptr); +static void *large_memalign(size_t boundary, size_t size); + +/* Misc functions */ +static void* page_alloc(void *pos, size_t size); +static void page_free(void *pos, size_t size); +static void touch_memory_range(void 
*start, size_t len); +inline static int size2cls(size_t size); + +#ifdef DEBUG +static void handler(int sig); +#endif + +/* Interface */ +void *malloc(size_t size); +void free(void* ptr); +void *realloc(void *ptr, size_t size); +void *calloc(size_t nmemb, size_t size); +void *memalign(size_t boundary, size_t size); +int posix_memalign(void **memptr, size_t alignment, size_t size); +void *valloc(size_t size); +void *pvalloc(size_t size); + +#ifdef RETURN_MEMORY +pthread_t gpool_gc_thread; +static void* gpool_gc(void* arg) +{ + pthread_detach(pthread_self()); + char *ptr = NULL; + + /* sleeptime = 100 ms */ + struct timespec sleeptime = {0, 10000000}; + + while(1) { + nanosleep(&sleeptime, NULL); + ptr = (char*) queue_fetch(&global_pool.free_dc_head[get_core_id()]); + if(ptr) { + void *ptr_end = PAGE_ROUNDDOWN(ptr + CHUNK_SIZE); + void *ptr_start = PAGE_ROUNDUP(ptr); + madvise(ptr_start, (uintptr_t)ptr_end - (uintptr_t)ptr_start, MADV_DONTNEED); + queue_put(&global_pool.released_dc_head[get_core_id()], ptr); + } + } +} +#endif + +static void maps_init() +{ + int size; + int class; + + /* 8 +4 64 */ + for (size = 8, class = 0; size <= 64; size += 4, class++) { + cls2size[class] = size; + } + + /* 80 +16 128 */ + for (size = 64 + 16; size <= 128; size += 16, class++) { + cls2size[class] = size; + } + + /* 160 +32 256 */ + for (size = 128 + 32; size <= 256; size += 32, class++) { + cls2size[class] = size; + } + + for (size = 256; size < 65536; size <<= 1) { + cls2size[class++] = size + (size >> 1); + cls2size[class++] = size << 1; + } + + int cur_class = 0; + int cur_size = 0; + + /* init sizemap */ + for (cur_size = 4; cur_size <= 1024; cur_size += 4) { + if (cur_size > cls2size[cur_class]) + cur_class++; + sizemap[(cur_size - 1) >> 2] = cur_class; + } + + /* init sizemap2 */ + for (cur_size = 1024; cur_size <= 65536; cur_size += 512) { + if (cur_size > cls2size[cur_class]) + cur_class++; + sizemap2[(cur_size - 1) >> 9] = cur_class; + } +} + +static void thread_init() +{ + /* Register the destructor */ + pthread_setspecific(destructor, ACTIVE); + + /* Initialize thread pool */ + local_heap = gpool_acquire_lheap(); + thread_state = READY; +} + +static void thread_exit(void *dummy) +{ + gpool_release_lheap(local_heap); +} + +static void global_init() +{ +#ifdef DEBUG + /* Register the signal handler for backtrace*/ + signal(SIGSEGV, handler); +#endif + + pthread_key_create(&destructor, thread_exit); + + /* Initialize global data */ + gpool_init(); + maps_init(); + global_state = READY; + +#ifdef RETURN_MEMORY + /* Create the gc thread */ + pthread_create(&gpool_gc_thread, NULL, gpool_gc, NULL); +#endif +} + +inline static void check_init() +{ + if (unlikely(thread_state != READY)) { + if (global_state != READY) { + pthread_once(&init_once, global_init); + } + thread_init(); + } +} + +inline static void gpool_check_size(void *target) +{ + if (global_pool.pool_end <= (char *)target) { + /* Global Pool Full */ + pthread_mutex_lock(&global_pool.lock); + while (global_pool.pool_end <= (char *)target) { + gpool_grow(); + } + pthread_mutex_unlock(&global_pool.lock); + } +} + +static int gpool_grow() +{ + /* Enlarge the raw memory pool */ + static int last_alloc = 8; + int alloc_size = ALLOC_UNIT * last_alloc; + if (last_alloc < 32) { + last_alloc *= 2; + } + + void *mem = page_alloc((void *)global_pool.pool_end, alloc_size); + if (mem == MAP_FAILED) { + exit(-1); + return -1; + } + + /* Increase the global pool size */ + global_pool.pool_end += alloc_size; + return 0; +} + +/* Initialize the global 
memory pool */ +static void gpool_init() +{ + global_pool.pool_start = RAW_POOL_START; + global_pool.pool_end = RAW_POOL_START; + global_pool.free_start = RAW_POOL_START; + //queue_init(&global_pool.free_dc_head); + pthread_mutex_init(&global_pool.lock, NULL); + gpool_grow(); +} + +inline static chunk_t *gpool_acquire_chunk() +{ + void *ptr = NULL; + + /* Try to alloc a freed chunk from the free list */ + ptr = queue_fetch(&global_pool.free_dc_head[get_core_id()]); + if (ptr) { + return (chunk_t *) ptr; + } + +#ifdef RETURN_MEMORY + ptr = queue_fetch(&global_pool.released_dc_head[get_core_id()]); + if (ptr) { + // XXX: Fix me + ((chunk_t *) ptr)->numa_node = get_core_id(); + touch_memory_range(ptr, CHUNK_SIZE); + return (chunk_t *) ptr; + } +#endif + + /* Or just alloc a new chunk */ + ptr = gpool_make_raw_chunk(); + gpool_check_size(ptr); + ptr -= CHUNK_SIZE; + ((chunk_t *) ptr)->numa_node = get_core_id(); + touch_memory_range(ptr, CHUNK_SIZE); + return (chunk_t *) ptr; +} + +static void *gpool_make_raw_chunk() +{ + /* Atomic increse the global pool size */ + void *ret = (void *)(atmc_fetch_and_add64((unsigned long long *) + &global_pool.free_start, + CHUNK_SIZE)); + return ret; +} + +inline static void gpool_release_chunk(dchunk_t *dc) +{ + queue_put(&global_pool.free_dc_head[dc->numa_node], dc); +} + +static lheap_t *gpool_acquire_lheap() +{ + lheap_t *lh; + + /* Try to alloc a freed thread pool from the list */ + lh = queue_fetch(&global_pool.free_lh_head[get_core_id()]); + + /* Alloc a new one */ + if (!lh) { + lh = (lheap_t *) gpool_acquire_chunk(); + lheap_init(lh); + } + return lh; +} + +static void gpool_release_lheap(lheap_t *lh) +{ + queue_put(&global_pool.free_lh_head[local_heap->numa_node], lh); +} + +inline static void lheap_init(lheap_t * lh) +{ + memset(&lh->free_head, 0, sizeof(lheap_t)); + + int size_cls; + lh->dummy_chunk.size_cls = DUMMY_CLASS; + lh->dummy_chunk.free_blk_cnt = 1; + + for (size_cls = 0; size_cls < DEFAULT_BLOCK_CLASS; size_cls++) { + /* Install the dummy chunk */ + lh->foreground[size_cls] = &lh->dummy_chunk; + } +} + +inline static void lheap_replace_foreground + (lheap_t * lh, int size_cls) { + dchunk_t *dc; + + /* Try to acquire the block from background list */ + dc = (dchunk_t *) lh->background[size_cls].head; + if (dc != NULL) { + double_list_remove(dc, &lh->background[size_cls]); + goto finish; + } + + /* Try to acquire a block in the remote freed list */ + dc = fast_queue_fetch(&lh->need_gc[size_cls]); + if (dc != NULL) { + dchunk_collect_garbage(dc); + goto finish; + } + + /* Try to acquire the chunk from local pool */ + dc = (dchunk_t *) seq_queue_fetch(&lh->free_head); + if (dc != NULL) { + lh->free_cnt--; + dchunk_change_cls(dc, size_cls); + goto finish; + } + + /* Acquire the chunk from global pool */ + + dc = (dchunk_t *) gpool_acquire_chunk(); + dc->owner = lh; + fast_queue_init((FastQueue *) & (dc->remote_free_head)); + dchunk_init(dc, size_cls); + + finish: + /* Set the foreground chunk */ + lh->foreground[size_cls] = dc; + dc->state = FOREGROUND; +} + +inline static void dchunk_change_cls(dchunk_t * dc, int size_cls) +{ + int size = cls2size[size_cls]; + int data_offset = DCH; + dc->blk_cnt = (CHUNK_SIZE - data_offset) / size; + dc->free_blk_cnt = dc->blk_cnt; + dc->block_size = size; + dc->free_mem = (char *)dc + data_offset; + dc->size_cls = size_cls; + seq_queue_init(&dc->free_head); +} + +inline static void dchunk_init(dchunk_t * dc, int size_cls) +{ + dc->active_link.next = NULL; + dc->active_link.prev = NULL; + 
dchunk_change_cls(dc, size_cls); +} + +inline static void dchunk_collect_garbage(dchunk_t * dc) +{ + seq_head(dc->free_head) = + counted_chain_dequeue(&dc->remote_free_head, &dc->free_blk_cnt); +} + +inline static void *dchunk_alloc_obj(dchunk_t * dc) +{ + void *ret; + + /* Dirty implementation of dequeue, avoid one branch */ + ret = seq_head(dc->free_head); + + if (unlikely(!ret)) { + ret = dc->free_mem; + dc->free_mem += dc->block_size; + } else { + seq_head(dc->free_head) = *(void**)ret; + } + +#if 0 + /* A clearer implementation with one more branch*/ + ret = seq_lifo_dequeue(&dc->free_head); + if (unlikely(!ret)) { + ret = dc->free_mem; + dc->free_mem += dc->block_size; + } +#endif + + return ret; +} + +inline static dchunk_t *dchunk_extract(void *ptr) +{ + return (dchunk_t *) ((uintptr_t)ptr - ((uintptr_t)ptr % CHUNK_SIZE)); +} + +inline static void obj_buf_flush(obj_buf_t * bbuf) +{ + void *prev; + + dchunk_t *dc = bbuf->dc; + lheap_t *lh = dc->owner; + + prev = counted_chain_enqueue(&(dc->remote_free_head), + seq_head(bbuf->free_head), bbuf->first, bbuf->count); + bbuf->count = 0; + bbuf->dc = NULL; + bbuf->first = NULL; + seq_head(bbuf->free_head) = NULL; + + /* If I am the first thread done remote free in this memory chunk*/ + if ((unsigned long long)prev == 0L) { + fast_queue_put(&(lh->need_gc[dc->size_cls]), dc); + } + return; +} + +inline static void obj_buf_flush_all(lheap_t *lh) { + int i; + for (i = 0; i < BLOCK_BUF_CNT; i++) { + obj_buf_t *buf = &lh->block_bufs[i]; + if (buf->count == 0) + continue; + obj_buf_flush(buf); + buf->dc = NULL; + } +} + +inline static void obj_buf_put(obj_buf_t *bbuf, dchunk_t * dc, void *ptr) { + if (unlikely(bbuf->dc != dc)) { + if (bbuf->dc != NULL) { + obj_buf_flush(bbuf); + } + bbuf->dc = dc; + bbuf->first = ptr; + bbuf->count = 0; + seq_head(bbuf->free_head) = NULL; + } + + seq_queue_put(&bbuf->free_head, ptr); + bbuf->count++; +} + +inline static void *large_malloc(size_t size) +{ + size_t alloc_size = PAGE_ROUNDUP(size + CHUNK_SIZE); + void *mem = page_alloc(NULL, alloc_size); + void *mem_start = (char*)mem + CHUNK_SIZE - CACHE_LINE_SIZE; + large_header_t *header = (large_header_t *)dchunk_extract(mem_start); + + /* If space is enough for the header of a large block */ + intptr_t distance = (intptr_t)mem_start - (intptr_t)header; + if (distance >= sizeof(large_header_t)) { + header->alloc_size = alloc_size; + header->mem = mem; + header->owner = LARGE_OWNER; + return mem_start; + } + + /* If not, Retry Allocation */ + void *ret = large_malloc(size); + page_free(mem, alloc_size); + return ret; +} + +inline static void *small_malloc(int size_cls) +{ + lheap_t *lh = local_heap; + dchunk_t *dc; + void *ret; + retry: + dc = lh->foreground[size_cls]; + ret = dchunk_alloc_obj(dc); + + /* Check if the datachunk is full */ + if (unlikely(--dc->free_blk_cnt == 0)) { + dc->state = FULL; + lheap_replace_foreground(lh, size_cls); + if (unlikely(dc->size_cls == DUMMY_CLASS)) { + /* A dummy chunk */ + dc->free_blk_cnt = 1; + goto retry; + } + } + + return ret; +} + +inline static void large_free(void *ptr) +{ + large_header_t *header = (large_header_t*)dchunk_extract(ptr); + page_free(header->mem, header->alloc_size); +} + +inline static void local_free(lheap_t * lh, dchunk_t * dc, void *ptr) +{ + unsigned int free_blk_cnt = ++dc->free_blk_cnt; + seq_queue_put(&dc->free_head, ptr); + + switch (dc->state) { + case FULL: + double_list_insert_front(dc, &lh->background[dc->size_cls]); + dc->state = BACKGROUND; + break; + case BACKGROUND: + if 
(unlikely(free_blk_cnt == dc->blk_cnt)) { + int free_cnt = lh->free_cnt; + double_list_remove(dc, &lh->background[dc->size_cls]); + + if (free_cnt >= MAX_FREE_CHUNK) { + gpool_release_chunk(dc); + } else { + seq_queue_put(&lh->free_head, dc); + lh->free_cnt = free_cnt + 1; + } + } + break; + case FOREGROUND: + /* Tada.. */ + break; + } +} + +THREAD_LOCAL int buf_cnt; +inline static void remote_free(lheap_t * lh, dchunk_t * dc, void *ptr) +{ + /* Put the object in a local buffer rather than return it to owner */ + int tag = ((unsigned long long)dc / CHUNK_SIZE) % BLOCK_BUF_CNT; + obj_buf_t *bbuf = &lh->block_bufs[tag]; + obj_buf_put(bbuf, dc, ptr); + + /* Periodically flush buffered remote objects */ + if ((buf_cnt++ & 0xFFFF) == 0) { + obj_buf_flush_all(lh); + } +} + +static void touch_memory_range(void *addr, size_t len) +{ + char *ptr = (char *)addr; + char *end = ptr + len; + + for (; ptr < end; ptr += PAGE_SIZE) { + *ptr = 0; + } +} + +static void *large_memalign(size_t boundary, size_t size) { + /* Alloc a large enough memory block */ + size_t padding = boundary + CHUNK_SIZE; + size_t alloc_size = PAGE_ROUNDUP(size + padding); + void *mem = page_alloc(NULL, alloc_size); + + /* Align up the address to boundary */ + void *mem_start = + (void*)((uintptr_t)((char*)mem + padding) & ~(boundary - 1)); + + /* Extract space for an header */ + large_header_t *header = + (large_header_t *)dchunk_extract(mem_start); + + /* If space is enough for the header of a large block */ + intptr_t distance = (intptr_t)mem_start - (intptr_t)header; + if (distance >= sizeof(large_header_t)) { + header->alloc_size = alloc_size; + header->mem = mem; + header->owner = LARGE_OWNER; + return mem_start; + } + + /* If not, retry allocation */ + void *ret = NULL; + + /* Avoid infinite loop if application call memalign(CHUNK_SIZE,size), + * althrough it is actually illegal + */ + if (boundary % CHUNK_SIZE != 0) { + ret = large_memalign(boundary, size); + } + page_free(mem, alloc_size); + return ret; +} + +#ifdef DEBUG +/* Signal handler for debugging use */ +static void handler(int sig) +{ + void *array[10]; + size_t size; + + /* get void*'s for all entries on the stack */ + size = backtrace(array, 10); + + /* print out all the frames to stderr */ + fprintf(stderr, "Error: signal %d:\n", sig); + backtrace_symbols_fd(array, size, 2); + exit(1); +} +#endif + +static void *page_alloc(void *pos, size_t size) +{ + return mmap(pos, + size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); +} + +static void page_free(void *pos, size_t size) +{ + munmap(pos, size); +} + +inline static int size2cls(size_t size) +{ + int ret; + if (likely(size <= 1024)) { + ret = sizemap[(size - 1) >> 2]; + } else if (size <= 65536) { + ret = sizemap2[(size - 1) >> 9]; + } else { + ret = LARGE_CLASS; + } + return ret; +} + +void *malloc(size_t size) +{ + void *ret = NULL; + + /* Initialize the allocator */ + check_init(); + + /* Deal with zero-size allocation */ + size += (size == 0); + +#if 0 + /* The expression above is equivalent to the code below */ + if (unlikely(size == 0)) { + size = 1; + } +#endif + + int size_cls = size2cls(size); + if (likely(size_cls < DEFAULT_BLOCK_CLASS)) { + ret = small_malloc(size_cls); + } else { + ret = large_malloc(size); + } + return ret; +} + +void free(void *ptr) +{ + if(ptr == NULL) { + return; + } + + dchunk_t *dc = dchunk_extract(ptr); + lheap_t *lh = local_heap; + lheap_t *target_lh = dc->owner; + + if (likely(target_lh == lh)) { + local_free(lh, dc, ptr); + } else if(likely(target_lh != 
LARGE_OWNER)){
+		check_init();
+		lh = local_heap;
+		remote_free(lh, dc, ptr);
+	} else {
+		large_free(ptr);
+	}
+}
+
+void *realloc(void* ptr, size_t size)
+{
+	/* Handle special cases */
+	if (ptr == NULL) {
+		void *ret = malloc(size);
+		return ret;
+	}
+
+	if (size == 0) {
+		free(ptr);
+		/* Without this return, the freed block was still used below. */
+		return NULL;
+	}
+
+	dchunk_t *dc = dchunk_extract(ptr);
+	if (dc->owner != LARGE_OWNER) {
+		int old_size = cls2size[dc->size_cls];
+
+		/* Does not exceed the current size; return as-is */
+		if (size <= old_size) {
+			return ptr;
+		}
+
+		/* Alloc a new block */
+		void *new_ptr = malloc(size);
+		memcpy(new_ptr, ptr, old_size);
+		free(ptr);
+		return new_ptr;
+	} else {
+		large_header_t *header = (large_header_t *)dc;
+		size_t alloc_size = header->alloc_size;
+		void* mem = header->mem;
+		size_t offset = (uintptr_t)ptr - (uintptr_t)mem;
+		size_t old_size = alloc_size - offset;
+
+		/* Does not exceed the current size; return as-is */
+		if(size <= old_size) {
+			return ptr;
+		}
+
+		/* Try to do mremap */
+		size_t new_size = PAGE_ROUNDUP(size + CHUNK_SIZE);
+		mem = mremap(mem, alloc_size, new_size, MREMAP_MAYMOVE);
+		void* mem_start = (void*)((uintptr_t)mem + offset);
+		header = (large_header_t*)dchunk_extract(mem_start);
+
+		intptr_t distance = (intptr_t)mem_start - (intptr_t)header;
+		if (distance >= sizeof(large_header_t)) {
+			header->alloc_size = new_size;
+			header->mem = mem;
+			header->owner = LARGE_OWNER;
+			return mem_start;
+		}
+
+		void* new_ptr = large_malloc(size);
+		memcpy(new_ptr, mem_start, old_size);
+		free(mem);
+		return new_ptr;
+	}
+}
+
+void *calloc(size_t nmemb, size_t size)
+{
+	void *ptr;
+	/* Guard against nmemb * size overflow, which the original code
+	 * did not check. */
+	if (size != 0 && nmemb > (size_t)-1 / size) {
+		return NULL;
+	}
+	ptr = malloc(nmemb * size);
+	if (!ptr) {
+		return NULL;
+	}
+	return memset(ptr, 0, nmemb * size);
+}
+
+void *memalign(size_t boundary, size_t size) {
+	/* Deal with zero-size allocation */
+	size += (size == 0);
+	if(boundary <= 256 && size <= 65536) {
+		/* In this case, we handle it as small allocations */
+		int boundary_cls = size2cls(boundary);
+		int size_cls = size2cls(size);
+		int alloc_cls = max(boundary_cls, size_cls);
+		return small_malloc(alloc_cls);
+	} else {
+		/* Handle it as a special large allocation */
+		return large_memalign(boundary, size);
+	}
+}
+
+int posix_memalign(void **memptr, size_t alignment, size_t size)
+{
+	*memptr = memalign(alignment, size);
+	if (*memptr) {
+		return 0;
+	} else {
+		/* POSIX requires an error code rather than -1; ENOMEM is
+		 * the appropriate one for a failed allocation. */
+		return ENOMEM;
+	}
+}
+
+void *valloc(size_t size)
+{
+	return memalign(PAGE_SIZE, size);
+}
+
+void *pvalloc(size_t size)
+{
+	fprintf(stderr, "pvalloc() called. Not implemented! Exiting.\n");
+	exit(1);
+}
diff --git a/reference/ssmalloc/ssmalloc.h b/reference/ssmalloc/ssmalloc.h
new file mode 100644
index 0000000..fb10597
--- /dev/null
+++ b/reference/ssmalloc/ssmalloc.h
@@ -0,0 +1,177 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <time.h>
+
+#include <assert.h>
+#include <execinfo.h>
+#include <signal.h>
+
+#include "atomic.h"
+#include "bitops.h"
+#include "queue.h"
+#include "double-list.h"
+#include "cpu.h"
+
+/* Configurations */
+#define CHUNK_DATA_SIZE (16*PAGE_SIZE)
+#define ALLOC_UNIT (4*1024*1024)
+#define MAX_FREE_SIZE (4*1024*1024)
+#define RAW_POOL_START ((void*)((0x600000000000/CHUNK_SIZE+1)*CHUNK_SIZE))
+#define BLOCK_BUF_CNT (16)
+
+// #define RETURN_MEMORY
+// #define DEBUG
+
+/* Other */
+#define CHUNK_SIZE (CHUNK_DATA_SIZE+sizeof(dchunk_t))
+#define CHUNK_MASK (~(CHUNK_SIZE-1))
+#define LARGE_CLASS (100)
+#define DUMMY_CLASS (101)
+#define DCH (sizeof(dchunk_t))
+#define MAX_FREE_CHUNK (MAX_FREE_SIZE/CHUNK_SIZE)
+#define LARGE_OWNER ((void*)0xDEAD)
+#define ACTIVE ((void*)1)
+
+/* Utility Macros */
+#define ROUNDUP(x,n) (((x)+(n)-1)&(~((n)-1)))
+/* was (((x-n)&(~(n-1)))+1), whose +1 misaligned the result */
+#define ROUNDDOWN(x,n) ((((x)-(n))&(~((n)-1)))+(n))
+#define PAGE_ROUNDUP(x) (ROUNDUP((uintptr_t)x,PAGE_SIZE))
+#define PAGE_ROUNDDOWN(x) (ROUNDDOWN((uintptr_t)x,PAGE_SIZE))
+#define CACHE_ALIGN __attribute__ ((aligned (CACHE_LINE_SIZE)))
+#define THREAD_LOCAL __attribute__ ((tls_model ("initial-exec"))) __thread
+#define likely(x) __builtin_expect(!!(x),1)
+#define unlikely(x) __builtin_expect(!!(x),0)
+
+/* Multi consumer queue */
+#define queue_init(head)\
+	mc_queue_init(head)
+#define queue_put(head,elem)\
+	mc_enqueue(head,elem)
+#define queue_fetch(head)\
+	mc_dequeue(head)
+typedef queue_head_t Queue;
+
+/* Single consumer queue */
+#define fast_queue_init(head)\
+	sc_queue_init(head)
+#define fast_queue_put(head,elem)\
+	sc_enqueue(head,elem)
+#define fast_queue_fetch(head)\
+	sc_dequeue(head)
+#define fast_queue_chain_fetch(head)\
+	sc_chain_dequeue(head)
+typedef queue_head_t FastQueue;
+
+/* Sequential queue (the self-referencing macro below is harmless:
+ * cpp does not expand a macro recursively) */
+#define seq_queue_init(head)\
+	seq_queue_init(head)
+#define seq_queue_put(head,elem)\
+	seq_enqueue(head,elem)
+#define seq_queue_fetch(head)\
+	seq_dequeue(head)
+#define fast_queue_chain_put(head)\
+	seq_chain_enqueue(head)
+typedef seq_queue_head_t SeqQueue;
+
+/* Type definitions */
+typedef enum {
+	UNINITIALIZED,
+	READY
+} init_state;
+
+typedef enum {
+	FOREGROUND,
+	BACKGROUND,
+	FULL
+} dchunk_state;
+
+typedef struct lheap_s lheap_t;
+typedef struct gpool_s gpool_t;
+typedef struct dchunk_s dchunk_t;
+typedef struct chunk_s chunk_t;
+typedef struct obj_buf_s obj_buf_t;
+typedef struct large_header_s large_header_t;
+
+typedef double_list_t LinkedList;
+typedef double_list_elem_t LinkedListElem;
+
+struct large_header_s {
+	CACHE_ALIGN size_t alloc_size;
+	void* mem;
+	CACHE_ALIGN lheap_t *owner;
+};
+
+struct chunk_s {
+	CACHE_ALIGN LinkedListElem active_link;
+	uint32_t numa_node;
+};
+
+/* Data chunk header */
+struct dchunk_s {
+	/* Local Area */
+	CACHE_ALIGN LinkedListElem active_link;
+	uint32_t numa_node;
+
+	/* Read Area */
+	CACHE_ALIGN lheap_t * owner;
+	uint32_t size_cls;
+
+	/* Local Write Area */
+	CACHE_ALIGN dchunk_state state;
+	uint32_t free_blk_cnt;
+	uint32_t blk_cnt;
+	SeqQueue free_head;
+	uint32_t block_size;
+	char *free_mem;
+
+	/* Remote Write Area */
+	CACHE_ALIGN FastQueue remote_free_head;
+};
+
+struct
gpool_s {
+	pthread_mutex_t lock;
+	volatile char *pool_start;
+	volatile char *pool_end;
+	volatile char *free_start;
+	Queue free_dc_head[MAX_CORE_ID];
+	Queue free_lh_head[MAX_CORE_ID];
+	Queue released_dc_head[MAX_CORE_ID];
+};
+
+struct obj_buf_s {
+	void *dc;
+	void *first;
+	SeqQueue free_head;
+	int count;
+};
+
+/* Per-thread data chunk pool */
+struct lheap_s {
+	CACHE_ALIGN LinkedListElem active_link;
+	uint32_t numa_node;
+	SeqQueue free_head;
+	uint32_t free_cnt;
+
+	dchunk_t *foreground[DEFAULT_BLOCK_CLASS];
+	LinkedList background[DEFAULT_BLOCK_CLASS];
+	dchunk_t dummy_chunk;
+	obj_buf_t block_bufs[BLOCK_BUF_CNT];
+
+	CACHE_ALIGN FastQueue need_gc[DEFAULT_BLOCK_CLASS];
+};
+
+static inline int max(int a, int b)
+{
+	return (a > b) ? a : b;
+}
+
+void *malloc(size_t __size);
+void *realloc(void *__ptr, size_t __size);
+void free(void *__ptr);
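Since the new files interpose the standard allocation entry points, the natural way to exercise them is to link the library into a harness (as Makefile.am above does for try-ssmalloc) and drive it through plain malloc/free calls. Below is a minimal smoke test along those lines; the file name, the helper thread, and the particular sizes are illustrative assumptions, not part of the commit.

/* ssmalloc-smoke.c: link against libssmalloc.a (or LD_PRELOAD a shared
 * build) so these calls resolve to the interposed implementations. */
#include <assert.h>
#include <malloc.h>
#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static void *remote_free_thread(void *p)
{
	/* Freeing from a non-owner thread exercises remote_free() rather
	 * than the owner's local free list. */
	free(p);
	return NULL;
}

int main(void)
{
	/* Small sizes land in the size-class bins via size2cls(); large
	 * ones take the mmap-backed large_malloc() path. */
	char *small = malloc(24);
	char *large = malloc(1 << 20);
	assert(small && large);
	memset(small, 0xaa, 24);

	/* memalign() promises the requested power-of-two alignment. */
	void *aligned = memalign(4096, 100);
	assert(((uintptr_t)aligned & 4095) == 0);

	pthread_t t;
	pthread_create(&t, NULL, remote_free_thread, small);
	pthread_join(t, NULL);

	free(large);
	free(aligned);
	return 0;
}

The cross-thread free is the interesting case: it lands in remote_free() in ssmalloc.c, which batches objects in per-thread obj_buf_t buffers and hands them back to the owning heap through the counted queues instead of touching the owner's free list directly.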