Diffstat (limited to 'drivers/staging/zcache/zcache-main.c')
-rw-r--r-- | drivers/staging/zcache/zcache-main.c | 2558
1 file changed, 1250 insertions(+), 1308 deletions(-)
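This patch replaces zcache's zsmalloc-backed persistent store and the old in-file zbud with a shared, reworked zbud allocator that packs at most two compressed pages ("zpages") into each physical pageframe, filling from opposite ends so whole frames stay easy to reclaim. A minimal sketch of that packing idea follows; the struct and helper here are illustrative stand-ins, not the real zbud API:

/* Sketch only: illustrates zbud-style buddy packing, not the real zbud API. */
#include <stddef.h>
#include <string.h>

#define SKETCH_PAGE_SIZE 4096
#define SKETCH_CHUNK_SIZE 64	/* allocations are rounded to 64-byte chunks */

struct zbud_page_sketch {
	size_t size0;	/* compressed size of buddy 0 (0 = unused) */
	size_t size1;	/* compressed size of buddy 1 (0 = unused) */
	unsigned char data[SKETCH_PAGE_SIZE - 2 * sizeof(size_t)];
};

/* Buddy 0 grows from the front of the frame, buddy 1 from the back. */
static void *zbud_sketch_place(struct zbud_page_sketch *p, int budnum,
			       const void *cdata, size_t clen)
{
	size_t rounded = (clen + SKETCH_CHUNK_SIZE - 1) &
			 ~((size_t)SKETCH_CHUNK_SIZE - 1);
	void *dst;

	if (budnum == 0) {
		dst = p->data;
		p->size0 = clen;
	} else {
		dst = p->data + sizeof(p->data) - rounded;
		p->size1 = clen;
	}
	memcpy(dst, cdata, clen);
	return dst;
}

Because buddy 0 grows from the front and buddy 1 from the back, a frame can hold any two zpages whose chunk-rounded sizes sum to at most the payload size; the "unbuddied" lists indexed by chunk count exploit this when matching a new zpage against a half-full frame.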
diff --git a/drivers/staging/zcache/zcache-main.c b/drivers/staging/zcache/zcache-main.c index 52b43b7b83d..328898ea76c 100644 --- a/drivers/staging/zcache/zcache-main.c +++ b/drivers/staging/zcache/zcache-main.c @@ -1,23 +1,15 @@ /* * zcache.c * - * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp. + * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp. * Copyright (c) 2010,2011, Nitin Gupta * * Zcache provides an in-kernel "host implementation" for transcendent memory - * and, thus indirectly, for cleancache and frontswap. Zcache includes two - * page-accessible memory [1] interfaces, both utilizing the crypto compression - * API: - * 1) "compression buddies" ("zbud") is used for ephemeral pages - * 2) zsmalloc is used for persistent pages. - * Xvmalloc (based on the TLSF allocator) has very low fragmentation - * so maximizes space efficiency, while zbud allows pairs (and potentially, - * in the future, more than a pair of) compressed pages to be closely linked - * so that reclaiming can be done via the kernel's physical-page-oriented - * "shrinker" interface. - * - * [1] For a definition of page-accessible memory (aka PAM), see: - * http://marc.info/?l=linux-mm&m=127811271605009 + * ("tmem") and, thus indirectly, for cleancache and frontswap. Zcache uses + * lzo1x compression to improve density and an embedded allocator called + * "zbud" which "buddies" two compressed pages semi-optimally in each physical + * pageframe. Zbud is integrally tied into tmem to allow pageframes to + * be "reclaimed" efficiently. */ #include <linux/module.h> @@ -30,70 +22,62 @@ #include <linux/atomic.h> #include <linux/math64.h> #include <linux/crypto.h> -#include <linux/string.h> -#include <linux/idr.h> -#include "tmem.h" - -#include "../zsmalloc/zsmalloc.h" +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> -#ifdef CONFIG_CLEANCACHE #include <linux/cleancache.h> -#endif -#ifdef CONFIG_FRONTSWAP #include <linux/frontswap.h> -#endif - -#if 0 -/* this is more aggressive but may cause other problems? 
*/ -#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN) +#include "tmem.h" +#include "zcache.h" +#include "zbud.h" +#include "ramster.h" +#ifdef CONFIG_RAMSTER +static int ramster_enabled; #else -#define ZCACHE_GFP_MASK \ - (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC) +#define ramster_enabled 0 #endif -#define MAX_CLIENTS 16 -#define LOCAL_CLIENT ((uint16_t)-1) - -MODULE_LICENSE("GPL"); - -struct zcache_client { - struct idr tmem_pools; - struct zs_pool *zspool; - bool allocated; - atomic_t refcount; -}; - -static struct zcache_client zcache_host; -static struct zcache_client zcache_clients[MAX_CLIENTS]; - -static inline uint16_t get_client_id_from_client(struct zcache_client *cli) +#ifndef __PG_WAS_ACTIVE +static inline bool PageWasActive(struct page *page) { - BUG_ON(cli == NULL); - if (cli == &zcache_host) - return LOCAL_CLIENT; - return cli - &zcache_clients[0]; + return true; } -static struct zcache_client *get_zcache_client(uint16_t cli_id) +static inline void SetPageWasActive(struct page *page) { - if (cli_id == LOCAL_CLIENT) - return &zcache_host; - - if ((unsigned int)cli_id < MAX_CLIENTS) - return &zcache_clients[cli_id]; - - return NULL; } +#endif -static inline bool is_local_client(struct zcache_client *cli) +#ifdef FRONTSWAP_HAS_EXCLUSIVE_GETS +static bool frontswap_has_exclusive_gets __read_mostly = true; +#else +static bool frontswap_has_exclusive_gets __read_mostly; +static inline void frontswap_tmem_exclusive_gets(bool b) { - return cli == &zcache_host; } +#endif + +/* enable (or fix code) when Seth's patches are accepted upstream */ +#define zcache_writeback_enabled 0 + +static int zcache_enabled __read_mostly; +static int disable_cleancache __read_mostly; +static int disable_frontswap __read_mostly; +static int disable_frontswap_ignore_nonactive __read_mostly; +static int disable_cleancache_ignore_nonactive __read_mostly; +static char *namestr __read_mostly = "zcache"; + +#define ZCACHE_GFP_MASK \ + (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC) + +MODULE_LICENSE("GPL"); /* crypto API for zcache */ #define ZCACHE_COMP_NAME_SZ CRYPTO_MAX_ALG_NAME -static char zcache_comp_name[ZCACHE_COMP_NAME_SZ]; -static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms; +static char zcache_comp_name[ZCACHE_COMP_NAME_SZ] __read_mostly; +static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms __read_mostly; enum comp_op { ZCACHE_COMPOP_COMPRESS, @@ -105,7 +89,7 @@ static inline int zcache_comp_op(enum comp_op op, u8 *dst, unsigned int *dlen) { struct crypto_comp *tfm; - int ret; + int ret = -1; BUG_ON(!zcache_comp_pcpu_tfms); tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu()); @@ -124,816 +108,247 @@ static inline int zcache_comp_op(enum comp_op op, return ret; } -/********** - * Compression buddies ("zbud") provides for packing two (or, possibly - * in the future, more) compressed ephemeral pages into a single "raw" - * (physical) page and tracking them with data structures so that - * the raw pages can be easily reclaimed. - * - * A zbud page ("zbpg") is an aligned page containing a list_head, - * a lock, and two "zbud headers". The remainder of the physical - * page is divided up into aligned 64-byte "chunks" which contain - * the compressed data for zero, one, or two zbuds. Each zbpg - * resides on: (1) an "unused list" if it has no zbuds; (2) a - * "buddied" list if it is fully populated with two zbuds; or - * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks - * the one unbuddied zbud uses. 
The data inside a zbpg cannot be - * read or written unless the zbpg's lock is held. - */ - -#define ZBH_SENTINEL 0x43214321 -#define ZBPG_SENTINEL 0xdeadbeef - -#define ZBUD_MAX_BUDS 2 - -struct zbud_hdr { - uint16_t client_id; - uint16_t pool_id; - struct tmem_oid oid; - uint32_t index; - uint16_t size; /* compressed size in bytes, zero means unused */ - DECL_SENTINEL -}; - -struct zbud_page { - struct list_head bud_list; - spinlock_t lock; - struct zbud_hdr buddy[ZBUD_MAX_BUDS]; - DECL_SENTINEL - /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */ -}; - -#define CHUNK_SHIFT 6 -#define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define CHUNK_MASK (~(CHUNK_SIZE-1)) -#define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \ - CHUNK_MASK) >> CHUNK_SHIFT) -#define MAX_CHUNK (NCHUNKS-1) - -static struct { - struct list_head list; - unsigned count; -} zbud_unbuddied[NCHUNKS]; -/* list N contains pages with N chunks USED and NCHUNKS-N unused */ -/* element 0 is never used but optimizing that isn't worth it */ -static unsigned long zbud_cumul_chunk_counts[NCHUNKS]; - -struct list_head zbud_buddied_list; -static unsigned long zcache_zbud_buddied_count; - -/* protects the buddied list and all unbuddied lists */ -static DEFINE_SPINLOCK(zbud_budlists_spinlock); - -static LIST_HEAD(zbpg_unused_list); -static unsigned long zcache_zbpg_unused_list_count; - -/* protects the unused page list */ -static DEFINE_SPINLOCK(zbpg_unused_list_spinlock); - -static atomic_t zcache_zbud_curr_raw_pages; -static atomic_t zcache_zbud_curr_zpages; -static unsigned long zcache_zbud_curr_zbytes; -static unsigned long zcache_zbud_cumul_zpages; -static unsigned long zcache_zbud_cumul_zbytes; -static unsigned long zcache_compress_poor; -static unsigned long zcache_mean_compress_poor; - -/* forward references */ -static void *zcache_get_free_page(void); -static void zcache_free_page(void *p); - -/* - * zbud helper functions - */ - -static inline unsigned zbud_max_buddy_size(void) -{ - return MAX_CHUNK << CHUNK_SHIFT; -} - -static inline unsigned zbud_size_to_chunks(unsigned size) -{ - BUG_ON(size == 0 || size > zbud_max_buddy_size()); - return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; -} - -static inline int zbud_budnum(struct zbud_hdr *zh) -{ - unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1); - struct zbud_page *zbpg = NULL; - unsigned budnum = -1U; - int i; - - for (i = 0; i < ZBUD_MAX_BUDS; i++) - if (offset == offsetof(typeof(*zbpg), buddy[i])) { - budnum = i; - break; - } - BUG_ON(budnum == -1U); - return budnum; -} - -static char *zbud_data(struct zbud_hdr *zh, unsigned size) -{ - struct zbud_page *zbpg; - char *p; - unsigned budnum; - - ASSERT_SENTINEL(zh, ZBH); - budnum = zbud_budnum(zh); - BUG_ON(size == 0 || size > zbud_max_buddy_size()); - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - ASSERT_SPINLOCK(&zbpg->lock); - p = (char *)zbpg; - if (budnum == 0) - p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) & - CHUNK_MASK); - else if (budnum == 1) - p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK); - return p; -} - -/* - * zbud raw page management - */ - -static struct zbud_page *zbud_alloc_raw_page(void) -{ - struct zbud_page *zbpg = NULL; - struct zbud_hdr *zh0, *zh1; - bool recycled = 0; - - /* if any pages on the zbpg list, use one */ - spin_lock(&zbpg_unused_list_spinlock); - if (!list_empty(&zbpg_unused_list)) { - zbpg = list_first_entry(&zbpg_unused_list, - struct zbud_page, bud_list); - list_del_init(&zbpg->bud_list); - zcache_zbpg_unused_list_count--; - recycled = 1; - } - 
spin_unlock(&zbpg_unused_list_spinlock); - if (zbpg == NULL) - /* none on zbpg list, try to get a kernel page */ - zbpg = zcache_get_free_page(); - if (likely(zbpg != NULL)) { - INIT_LIST_HEAD(&zbpg->bud_list); - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; - spin_lock_init(&zbpg->lock); - if (recycled) { - ASSERT_INVERTED_SENTINEL(zbpg, ZBPG); - SET_SENTINEL(zbpg, ZBPG); - BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); - BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); - } else { - atomic_inc(&zcache_zbud_curr_raw_pages); - INIT_LIST_HEAD(&zbpg->bud_list); - SET_SENTINEL(zbpg, ZBPG); - zh0->size = 0; zh1->size = 0; - tmem_oid_set_invalid(&zh0->oid); - tmem_oid_set_invalid(&zh1->oid); - } - } - return zbpg; -} - -static void zbud_free_raw_page(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1]; - - ASSERT_SENTINEL(zbpg, ZBPG); - BUG_ON(!list_empty(&zbpg->bud_list)); - ASSERT_SPINLOCK(&zbpg->lock); - BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); - BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); - INVERT_SENTINEL(zbpg, ZBPG); - spin_unlock(&zbpg->lock); - spin_lock(&zbpg_unused_list_spinlock); - list_add(&zbpg->bud_list, &zbpg_unused_list); - zcache_zbpg_unused_list_count++; - spin_unlock(&zbpg_unused_list_spinlock); -} - -/* - * core zbud handling routines - */ - -static unsigned zbud_free(struct zbud_hdr *zh) -{ - unsigned size; - - ASSERT_SENTINEL(zh, ZBH); - BUG_ON(!tmem_oid_valid(&zh->oid)); - size = zh->size; - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); - zh->size = 0; - tmem_oid_set_invalid(&zh->oid); - INVERT_SENTINEL(zh, ZBH); - zcache_zbud_curr_zbytes -= size; - atomic_dec(&zcache_zbud_curr_zpages); - return size; -} - -static void zbud_free_and_delist(struct zbud_hdr *zh) -{ - unsigned chunks; - struct zbud_hdr *zh_other; - unsigned budnum = zbud_budnum(zh), size; - struct zbud_page *zbpg = - container_of(zh, struct zbud_page, buddy[budnum]); - - spin_lock(&zbud_budlists_spinlock); - spin_lock(&zbpg->lock); - if (list_empty(&zbpg->bud_list)) { - /* ignore zombie page... see zbud_evict_pages() */ - spin_unlock(&zbpg->lock); - spin_unlock(&zbud_budlists_spinlock); - return; - } - size = zbud_free(zh); - ASSERT_SPINLOCK(&zbpg->lock); - zh_other = &zbpg->buddy[(budnum == 0) ? 
1 : 0]; - if (zh_other->size == 0) { /* was unbuddied: unlist and free */ - chunks = zbud_size_to_chunks(size) ; - BUG_ON(list_empty(&zbud_unbuddied[chunks].list)); - list_del_init(&zbpg->bud_list); - zbud_unbuddied[chunks].count--; - spin_unlock(&zbud_budlists_spinlock); - zbud_free_raw_page(zbpg); - } else { /* was buddied: move remaining buddy to unbuddied list */ - chunks = zbud_size_to_chunks(zh_other->size) ; - list_del_init(&zbpg->bud_list); - zcache_zbud_buddied_count--; - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list); - zbud_unbuddied[chunks].count++; - spin_unlock(&zbud_budlists_spinlock); - spin_unlock(&zbpg->lock); - } -} - -static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id, - struct tmem_oid *oid, - uint32_t index, struct page *page, - void *cdata, unsigned size) -{ - struct zbud_hdr *zh0, *zh1, *zh = NULL; - struct zbud_page *zbpg = NULL, *ztmp; - unsigned nchunks; - char *to; - int i, found_good_buddy = 0; - - nchunks = zbud_size_to_chunks(size) ; - for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) { - spin_lock(&zbud_budlists_spinlock); - if (!list_empty(&zbud_unbuddied[i].list)) { - list_for_each_entry_safe(zbpg, ztmp, - &zbud_unbuddied[i].list, bud_list) { - if (spin_trylock(&zbpg->lock)) { - found_good_buddy = i; - goto found_unbuddied; - } - } - } - spin_unlock(&zbud_budlists_spinlock); - } - /* didn't find a good buddy, try allocating a new page */ - zbpg = zbud_alloc_raw_page(); - if (unlikely(zbpg == NULL)) - goto out; - /* ok, have a page, now compress the data before taking locks */ - spin_lock(&zbud_budlists_spinlock); - spin_lock(&zbpg->lock); - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list); - zbud_unbuddied[nchunks].count++; - zh = &zbpg->buddy[0]; - goto init_zh; - -found_unbuddied: - ASSERT_SPINLOCK(&zbpg->lock); - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; - BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0))); - if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */ - ASSERT_SENTINEL(zh0, ZBH); - zh = zh1; - } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */ - ASSERT_SENTINEL(zh1, ZBH); - zh = zh0; - } else - BUG(); - list_del_init(&zbpg->bud_list); - zbud_unbuddied[found_good_buddy].count--; - list_add_tail(&zbpg->bud_list, &zbud_buddied_list); - zcache_zbud_buddied_count++; - -init_zh: - SET_SENTINEL(zh, ZBH); - zh->size = size; - zh->index = index; - zh->oid = *oid; - zh->pool_id = pool_id; - zh->client_id = client_id; - to = zbud_data(zh, size); - memcpy(to, cdata, size); - spin_unlock(&zbpg->lock); - spin_unlock(&zbud_budlists_spinlock); - - zbud_cumul_chunk_counts[nchunks]++; - atomic_inc(&zcache_zbud_curr_zpages); - zcache_zbud_cumul_zpages++; - zcache_zbud_curr_zbytes += size; - zcache_zbud_cumul_zbytes += size; -out: - return zh; -} - -static int zbud_decompress(struct page *page, struct zbud_hdr *zh) -{ - struct zbud_page *zbpg; - unsigned budnum = zbud_budnum(zh); - unsigned int out_len = PAGE_SIZE; - char *to_va, *from_va; - unsigned size; - int ret = 0; - - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - spin_lock(&zbpg->lock); - if (list_empty(&zbpg->bud_list)) { - /* ignore zombie page... 
see zbud_evict_pages() */ - ret = -EINVAL; - goto out; - } - ASSERT_SENTINEL(zh, ZBH); - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); - to_va = kmap_atomic(page); - size = zh->size; - from_va = zbud_data(zh, size); - ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, from_va, size, - to_va, &out_len); - BUG_ON(ret); - BUG_ON(out_len != PAGE_SIZE); - kunmap_atomic(to_va); -out: - spin_unlock(&zbpg->lock); - return ret; -} - /* - * The following routines handle shrinking of ephemeral pages by evicting - * pages "least valuable" first. + * policy parameters */ -static unsigned long zcache_evicted_raw_pages; -static unsigned long zcache_evicted_buddied_pages; -static unsigned long zcache_evicted_unbuddied_pages; - -static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, - uint16_t poolid); -static void zcache_put_pool(struct tmem_pool *pool); - -/* - * Flush and free all zbuds in a zbpg, then free the pageframe - */ -static void zbud_evict_zbpg(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh; - int i, j; - uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS]; - uint32_t index[ZBUD_MAX_BUDS]; - struct tmem_oid oid[ZBUD_MAX_BUDS]; - struct tmem_pool *pool; - - ASSERT_SPINLOCK(&zbpg->lock); - BUG_ON(!list_empty(&zbpg->bud_list)); - for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) { - zh = &zbpg->buddy[i]; - if (zh->size) { - client_id[j] = zh->client_id; - pool_id[j] = zh->pool_id; - oid[j] = zh->oid; - index[j] = zh->index; - j++; - zbud_free(zh); - } - } - spin_unlock(&zbpg->lock); - for (i = 0; i < j; i++) { - pool = zcache_get_pool_by_id(client_id[i], pool_id[i]); - if (pool != NULL) { - tmem_flush_page(pool, &oid[i], index[i]); - zcache_put_pool(pool); - } - } - ASSERT_SENTINEL(zbpg, ZBPG); - spin_lock(&zbpg->lock); - zbud_free_raw_page(zbpg); -} - -/* - * Free nr pages. This code is funky because we want to hold the locks - * protecting various lists for as short a time as possible, and in some - * circumstances the list may change asynchronously when the list lock is - * not held. In some cases we also trylock not only to avoid waiting on a - * page in use by another cpu, but also to avoid potential deadlock due to - * lock inversion. 
- */ -static void zbud_evict_pages(int nr) -{ - struct zbud_page *zbpg; - int i; - - /* first try freeing any pages on unused list */ -retry_unused_list: - spin_lock_bh(&zbpg_unused_list_spinlock); - if (!list_empty(&zbpg_unused_list)) { - /* can't walk list here, since it may change when unlocked */ - zbpg = list_first_entry(&zbpg_unused_list, - struct zbud_page, bud_list); - list_del_init(&zbpg->bud_list); - zcache_zbpg_unused_list_count--; - atomic_dec(&zcache_zbud_curr_raw_pages); - spin_unlock_bh(&zbpg_unused_list_spinlock); - zcache_free_page(zbpg); - zcache_evicted_raw_pages++; - if (--nr <= 0) - goto out; - goto retry_unused_list; - } - spin_unlock_bh(&zbpg_unused_list_spinlock); - - /* now try freeing unbuddied pages, starting with least space avail */ - for (i = 0; i < MAX_CHUNK; i++) { -retry_unbud_list_i: - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_unbuddied[i].list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - continue; - } - list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; - list_del_init(&zbpg->bud_list); - zbud_unbuddied[i].count--; - spin_unlock(&zbud_budlists_spinlock); - zcache_evicted_unbuddied_pages++; - /* want budlists unlocked when doing zbpg eviction */ - zbud_evict_zbpg(zbpg); - local_bh_enable(); - if (--nr <= 0) - goto out; - goto retry_unbud_list_i; - } - spin_unlock_bh(&zbud_budlists_spinlock); - } - - /* as a last resort, free buddied pages */ -retry_bud_list: - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_buddied_list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - goto out; - } - list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; - list_del_init(&zbpg->bud_list); - zcache_zbud_buddied_count--; - spin_unlock(&zbud_budlists_spinlock); - zcache_evicted_buddied_pages++; - /* want budlists unlocked when doing zbpg eviction */ - zbud_evict_zbpg(zbpg); - local_bh_enable(); - if (--nr <= 0) - goto out; - goto retry_bud_list; - } - spin_unlock_bh(&zbud_budlists_spinlock); -out: - return; -} - -static void __init zbud_init(void) -{ - int i; - - INIT_LIST_HEAD(&zbud_buddied_list); - - for (i = 0; i < NCHUNKS; i++) - INIT_LIST_HEAD(&zbud_unbuddied[i].list); -} - -#ifdef CONFIG_SYSFS -/* - * These sysfs routines show a nice distribution of how many zbpg's are - * currently (and have ever been placed) in each unbuddied list. It's fun - * to watch but can probably go away before final merge. - */ -static int zbud_show_unbuddied_list_counts(char *buf) -{ - int i; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) - p += sprintf(p, "%u ", zbud_unbuddied[i].count); - return p - buf; -} - -static int zbud_show_cumul_chunk_counts(char *buf) -{ - unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0; - unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0; - unsigned long total_chunks_lte_42 = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]); - chunks += zbud_cumul_chunk_counts[i]; - total_chunks += zbud_cumul_chunk_counts[i]; - sum_total_chunks += i * zbud_cumul_chunk_counts[i]; - if (i == 21) - total_chunks_lte_21 = total_chunks; - if (i == 32) - total_chunks_lte_32 = total_chunks; - if (i == 42) - total_chunks_lte_42 = total_chunks; - } - p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n", - total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42, - chunks == 0 ? 
0 : sum_total_chunks / chunks); - return p - buf; -} -#endif - -/********** - * This "zv" PAM implementation combines the slab-based zsmalloc - * with the crypto compression API to maximize the amount of data that can - * be packed into a physical page. - * - * Zv represents a PAM page with the index and object (plus a "size" value - * necessary for decompression) immediately preceding the compressed data. - */ - -#define ZVH_SENTINEL 0x43214321 - -struct zv_hdr { - uint32_t pool_id; - struct tmem_oid oid; - uint32_t index; - size_t size; - DECL_SENTINEL -}; - -/* rudimentary policy limits */ -/* total number of persistent pages may not exceed this percentage */ -static unsigned int zv_page_count_policy_percent = 75; /* * byte count defining poor compression; pages with greater zsize will be * rejected */ -static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7; +static unsigned int zbud_max_zsize __read_mostly = (PAGE_SIZE / 8) * 7; /* * byte count defining poor *mean* compression; pages with greater zsize * will be rejected until sufficient better-compressed pages are accepted * driving the mean below this threshold */ -static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5; - -static atomic_t zv_curr_dist_counts[NCHUNKS]; -static atomic_t zv_cumul_dist_counts[NCHUNKS]; +static unsigned int zbud_max_mean_zsize __read_mostly = (PAGE_SIZE / 8) * 5; -static unsigned long zv_create(struct zs_pool *pool, uint32_t pool_id, - struct tmem_oid *oid, uint32_t index, - void *cdata, unsigned clen) -{ - struct zv_hdr *zv; - u32 size = clen + sizeof(struct zv_hdr); - int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT; - unsigned long handle = 0; - - BUG_ON(!irqs_disabled()); - BUG_ON(chunks >= NCHUNKS); - handle = zs_malloc(pool, size); - if (!handle) - goto out; - atomic_inc(&zv_curr_dist_counts[chunks]); - atomic_inc(&zv_cumul_dist_counts[chunks]); - zv = zs_map_object(pool, handle, ZS_MM_WO); - zv->index = index; - zv->oid = *oid; - zv->pool_id = pool_id; - zv->size = clen; - SET_SENTINEL(zv, ZVH); - memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen); - zs_unmap_object(pool, handle); -out: - return handle; -} - -static void zv_free(struct zs_pool *pool, unsigned long handle) -{ - unsigned long flags; - struct zv_hdr *zv; - uint16_t size; - int chunks; - - zv = zs_map_object(pool, handle, ZS_MM_RW); - ASSERT_SENTINEL(zv, ZVH); - size = zv->size + sizeof(struct zv_hdr); - INVERT_SENTINEL(zv, ZVH); - zs_unmap_object(pool, handle); - - chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT; - BUG_ON(chunks >= NCHUNKS); - atomic_dec(&zv_curr_dist_counts[chunks]); - - local_irq_save(flags); - zs_free(pool, handle); - local_irq_restore(flags); -} - -static void zv_decompress(struct page *page, unsigned long handle) -{ - unsigned int clen = PAGE_SIZE; - char *to_va; - int ret; - struct zv_hdr *zv; - - zv = zs_map_object(zcache_host.zspool, handle, ZS_MM_RO); - BUG_ON(zv->size == 0); - ASSERT_SENTINEL(zv, ZVH); - to_va = kmap_atomic(page); - ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, (char *)zv + sizeof(*zv), - zv->size, to_va, &clen); - kunmap_atomic(to_va); - zs_unmap_object(zcache_host.zspool, handle); - BUG_ON(ret); - BUG_ON(clen != PAGE_SIZE); -} - -#ifdef CONFIG_SYSFS /* - * show a distribution of compression stats for zv pages. 
+ * for now, used named slabs so can easily track usage; later can + * either just use kmalloc, or perhaps add a slab-like allocator + * to more carefully manage total memory utilization */ +static struct kmem_cache *zcache_objnode_cache; +static struct kmem_cache *zcache_obj_cache; -static int zv_curr_dist_counts_show(char *buf) -{ - unsigned long i, n, chunks = 0, sum_total_chunks = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - n = atomic_read(&zv_curr_dist_counts[i]); - p += sprintf(p, "%lu ", n); - chunks += n; - sum_total_chunks += i * n; - } - p += sprintf(p, "mean:%lu\n", - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} - -static int zv_cumul_dist_counts_show(char *buf) -{ - unsigned long i, n, chunks = 0, sum_total_chunks = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - n = atomic_read(&zv_cumul_dist_counts[i]); - p += sprintf(p, "%lu ", n); - chunks += n; - sum_total_chunks += i * n; - } - p += sprintf(p, "mean:%lu\n", - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} +static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, }; -/* - * setting zv_max_zsize via sysfs causes all persistent (e.g. swap) - * pages that don't compress to less than this value (including metadata - * overhead) to be rejected. We don't allow the value to get too close - * to PAGE_SIZE. - */ -static ssize_t zv_max_zsize_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", zv_max_zsize); +/* we try to keep these statistics SMP-consistent */ +static ssize_t zcache_obj_count; +static atomic_t zcache_obj_atomic = ATOMIC_INIT(0); +static ssize_t zcache_obj_count_max; +static ssize_t zcache_objnode_count; +static atomic_t zcache_objnode_atomic = ATOMIC_INIT(0); +static ssize_t zcache_objnode_count_max; +static u64 zcache_eph_zbytes; +static atomic_long_t zcache_eph_zbytes_atomic = ATOMIC_INIT(0); +static u64 zcache_eph_zbytes_max; +static u64 zcache_pers_zbytes; +static atomic_long_t zcache_pers_zbytes_atomic = ATOMIC_INIT(0); +static u64 zcache_pers_zbytes_max; +static ssize_t zcache_eph_pageframes; +static atomic_t zcache_eph_pageframes_atomic = ATOMIC_INIT(0); +static ssize_t zcache_eph_pageframes_max; +static ssize_t zcache_pers_pageframes; +static atomic_t zcache_pers_pageframes_atomic = ATOMIC_INIT(0); +static ssize_t zcache_pers_pageframes_max; +static ssize_t zcache_pageframes_alloced; +static atomic_t zcache_pageframes_alloced_atomic = ATOMIC_INIT(0); +static ssize_t zcache_pageframes_freed; +static atomic_t zcache_pageframes_freed_atomic = ATOMIC_INIT(0); +static ssize_t zcache_eph_zpages; +static ssize_t zcache_eph_zpages; +static atomic_t zcache_eph_zpages_atomic = ATOMIC_INIT(0); +static ssize_t zcache_eph_zpages_max; +static ssize_t zcache_pers_zpages; +static atomic_t zcache_pers_zpages_atomic = ATOMIC_INIT(0); +static ssize_t zcache_pers_zpages_max; + +/* but for the rest of these, counting races are ok */ +static ssize_t zcache_flush_total; +static ssize_t zcache_flush_found; +static ssize_t zcache_flobj_total; +static ssize_t zcache_flobj_found; +static ssize_t zcache_failed_eph_puts; +static ssize_t zcache_failed_pers_puts; +static ssize_t zcache_failed_getfreepages; +static ssize_t zcache_failed_alloc; +static ssize_t zcache_put_to_flush; +static ssize_t zcache_compress_poor; +static ssize_t zcache_mean_compress_poor; +static ssize_t zcache_eph_ate_tail; +static ssize_t zcache_eph_ate_tail_failed; +static ssize_t zcache_pers_ate_eph; +static ssize_t 
zcache_pers_ate_eph_failed; +static ssize_t zcache_evicted_eph_zpages; +static ssize_t zcache_evicted_eph_pageframes; +static ssize_t zcache_last_active_file_pageframes; +static ssize_t zcache_last_inactive_file_pageframes; +static ssize_t zcache_last_active_anon_pageframes; +static ssize_t zcache_last_inactive_anon_pageframes; +static ssize_t zcache_eph_nonactive_puts_ignored; +static ssize_t zcache_pers_nonactive_puts_ignored; +static ssize_t zcache_writtenback_pages; +static ssize_t zcache_outstanding_writeback_pages; + +#ifdef CONFIG_DEBUG_FS +#include <linux/debugfs.h> +#define zdfs debugfs_create_size_t +#define zdfs64 debugfs_create_u64 +static int zcache_debugfs_init(void) +{ + struct dentry *root = debugfs_create_dir("zcache", NULL); + if (root == NULL) + return -ENXIO; + + zdfs("obj_count", S_IRUGO, root, &zcache_obj_count); + zdfs("obj_count_max", S_IRUGO, root, &zcache_obj_count_max); + zdfs("objnode_count", S_IRUGO, root, &zcache_objnode_count); + zdfs("objnode_count_max", S_IRUGO, root, &zcache_objnode_count_max); + zdfs("flush_total", S_IRUGO, root, &zcache_flush_total); + zdfs("flush_found", S_IRUGO, root, &zcache_flush_found); + zdfs("flobj_total", S_IRUGO, root, &zcache_flobj_total); + zdfs("flobj_found", S_IRUGO, root, &zcache_flobj_found); + zdfs("failed_eph_puts", S_IRUGO, root, &zcache_failed_eph_puts); + zdfs("failed_pers_puts", S_IRUGO, root, &zcache_failed_pers_puts); + zdfs("failed_get_free_pages", S_IRUGO, root, + &zcache_failed_getfreepages); + zdfs("failed_alloc", S_IRUGO, root, &zcache_failed_alloc); + zdfs("put_to_flush", S_IRUGO, root, &zcache_put_to_flush); + zdfs("compress_poor", S_IRUGO, root, &zcache_compress_poor); + zdfs("mean_compress_poor", S_IRUGO, root, &zcache_mean_compress_poor); + zdfs("eph_ate_tail", S_IRUGO, root, &zcache_eph_ate_tail); + zdfs("eph_ate_tail_failed", S_IRUGO, root, &zcache_eph_ate_tail_failed); + zdfs("pers_ate_eph", S_IRUGO, root, &zcache_pers_ate_eph); + zdfs("pers_ate_eph_failed", S_IRUGO, root, &zcache_pers_ate_eph_failed); + zdfs("evicted_eph_zpages", S_IRUGO, root, &zcache_evicted_eph_zpages); + zdfs("evicted_eph_pageframes", S_IRUGO, root, + &zcache_evicted_eph_pageframes); + zdfs("eph_pageframes", S_IRUGO, root, &zcache_eph_pageframes); + zdfs("eph_pageframes_max", S_IRUGO, root, &zcache_eph_pageframes_max); + zdfs("pers_pageframes", S_IRUGO, root, &zcache_pers_pageframes); + zdfs("pers_pageframes_max", S_IRUGO, root, &zcache_pers_pageframes_max); + zdfs("eph_zpages", S_IRUGO, root, &zcache_eph_zpages); + zdfs("eph_zpages_max", S_IRUGO, root, &zcache_eph_zpages_max); + zdfs("pers_zpages", S_IRUGO, root, &zcache_pers_zpages); + zdfs("pers_zpages_max", S_IRUGO, root, &zcache_pers_zpages_max); + zdfs("last_active_file_pageframes", S_IRUGO, root, + &zcache_last_active_file_pageframes); + zdfs("last_inactive_file_pageframes", S_IRUGO, root, + &zcache_last_inactive_file_pageframes); + zdfs("last_active_anon_pageframes", S_IRUGO, root, + &zcache_last_active_anon_pageframes); + zdfs("last_inactive_anon_pageframes", S_IRUGO, root, + &zcache_last_inactive_anon_pageframes); + zdfs("eph_nonactive_puts_ignored", S_IRUGO, root, + &zcache_eph_nonactive_puts_ignored); + zdfs("pers_nonactive_puts_ignored", S_IRUGO, root, + &zcache_pers_nonactive_puts_ignored); + zdfs64("eph_zbytes", S_IRUGO, root, &zcache_eph_zbytes); + zdfs64("eph_zbytes_max", S_IRUGO, root, &zcache_eph_zbytes_max); + zdfs64("pers_zbytes", S_IRUGO, root, &zcache_pers_zbytes); + zdfs64("pers_zbytes_max", S_IRUGO, root, &zcache_pers_zbytes_max); + 
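+	/* the two writeback counters registered below only move when
+	 * zcache writeback is configured in (see the
+	 * CONFIG_ZCACHE_WRITEBACK block near the end of this file) */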
zdfs("outstanding_writeback_pages", S_IRUGO, root, + &zcache_outstanding_writeback_pages); + zdfs("writtenback_pages", S_IRUGO, root, &zcache_writtenback_pages); + return 0; } +#undef zdebugfs +#undef zdfs64 +#endif -static ssize_t zv_max_zsize_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &val); - if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7)) - return -EINVAL; - zv_max_zsize = val; - return count; +#define ZCACHE_DEBUG +#ifdef ZCACHE_DEBUG +/* developers can call this in case of ooms, e.g. to find memory leaks */ +void zcache_dump(void) +{ + pr_info("zcache: obj_count=%zd\n", zcache_obj_count); + pr_info("zcache: obj_count_max=%zd\n", zcache_obj_count_max); + pr_info("zcache: objnode_count=%zd\n", zcache_objnode_count); + pr_info("zcache: objnode_count_max=%zd\n", zcache_objnode_count_max); + pr_info("zcache: flush_total=%zd\n", zcache_flush_total); + pr_info("zcache: flush_found=%zd\n", zcache_flush_found); + pr_info("zcache: flobj_total=%zd\n", zcache_flobj_total); + pr_info("zcache: flobj_found=%zd\n", zcache_flobj_found); + pr_info("zcache: failed_eph_puts=%zd\n", zcache_failed_eph_puts); + pr_info("zcache: failed_pers_puts=%zd\n", zcache_failed_pers_puts); + pr_info("zcache: failed_get_free_pages=%zd\n", + zcache_failed_getfreepages); + pr_info("zcache: failed_alloc=%zd\n", zcache_failed_alloc); + pr_info("zcache: put_to_flush=%zd\n", zcache_put_to_flush); + pr_info("zcache: compress_poor=%zd\n", zcache_compress_poor); + pr_info("zcache: mean_compress_poor=%zd\n", + zcache_mean_compress_poor); + pr_info("zcache: eph_ate_tail=%zd\n", zcache_eph_ate_tail); + pr_info("zcache: eph_ate_tail_failed=%zd\n", + zcache_eph_ate_tail_failed); + pr_info("zcache: pers_ate_eph=%zd\n", zcache_pers_ate_eph); + pr_info("zcache: pers_ate_eph_failed=%zd\n", + zcache_pers_ate_eph_failed); + pr_info("zcache: evicted_eph_zpages=%zd\n", zcache_evicted_eph_zpages); + pr_info("zcache: evicted_eph_pageframes=%zd\n", + zcache_evicted_eph_pageframes); + pr_info("zcache: eph_pageframes=%zd\n", zcache_eph_pageframes); + pr_info("zcache: eph_pageframes_max=%zd\n", zcache_eph_pageframes_max); + pr_info("zcache: pers_pageframes=%zd\n", zcache_pers_pageframes); + pr_info("zcache: pers_pageframes_max=%zd\n", + zcache_pers_pageframes_max); + pr_info("zcache: eph_zpages=%zd\n", zcache_eph_zpages); + pr_info("zcache: eph_zpages_max=%zd\n", zcache_eph_zpages_max); + pr_info("zcache: pers_zpages=%zd\n", zcache_pers_zpages); + pr_info("zcache: pers_zpages_max=%zd\n", zcache_pers_zpages_max); + pr_info("zcache: last_active_file_pageframes=%zd\n", + zcache_last_active_file_pageframes); + pr_info("zcache: last_inactive_file_pageframes=%zd\n", + zcache_last_inactive_file_pageframes); + pr_info("zcache: last_active_anon_pageframes=%zd\n", + zcache_last_active_anon_pageframes); + pr_info("zcache: last_inactive_anon_pageframes=%zd\n", + zcache_last_inactive_anon_pageframes); + pr_info("zcache: eph_nonactive_puts_ignored=%zd\n", + zcache_eph_nonactive_puts_ignored); + pr_info("zcache: pers_nonactive_puts_ignored=%zd\n", + zcache_pers_nonactive_puts_ignored); + pr_info("zcache: eph_zbytes=%llu\n", + zcache_eph_zbytes); + pr_info("zcache: eph_zbytes_max=%llu\n", + zcache_eph_zbytes_max); + pr_info("zcache: pers_zbytes=%llu\n", + zcache_pers_zbytes); + pr_info("zcache: pers_zbytes_max=%llu\n", + zcache_pers_zbytes_max); + pr_info("zcache: 
outstanding_writeback_pages=%zd\n", + zcache_outstanding_writeback_pages); + pr_info("zcache: writtenback_pages=%zd\n", zcache_writtenback_pages); } +#endif /* - * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap) - * pages that don't compress to less than this value (including metadata - * overhead) to be rejected UNLESS the mean compression is also smaller - * than this value. In other words, we are load-balancing-by-zsize the - * accepted pages. Again, we don't allow the value to get too close - * to PAGE_SIZE. + * zcache core code starts here */ -static ssize_t zv_max_mean_zsize_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", zv_max_mean_zsize); -} - -static ssize_t zv_max_mean_zsize_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &val); - if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7)) - return -EINVAL; - zv_max_mean_zsize = val; - return count; -} +static struct zcache_client zcache_host; +static struct zcache_client zcache_clients[MAX_CLIENTS]; -/* - * setting zv_page_count_policy_percent via sysfs sets an upper bound of - * persistent (e.g. swap) pages that will be retained according to: - * (zv_page_count_policy_percent * totalram_pages) / 100) - * when that limit is reached, further puts will be rejected (until - * some pages have been flushed). Note that, due to compression, - * this number may exceed 100; it defaults to 75 and we set an - * arbitary limit of 150. A poor choice will almost certainly result - * in OOM's, so this value should only be changed prudently. - */ -static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) +static inline bool is_local_client(struct zcache_client *cli) { - return sprintf(buf, "%u\n", zv_page_count_policy_percent); + return cli == &zcache_host; } -static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static struct zcache_client *zcache_get_client_by_id(uint16_t cli_id) { - unsigned long val; - int err; + struct zcache_client *cli = &zcache_host; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &val); - if (err || (val == 0) || (val > 150)) - return -EINVAL; - zv_page_count_policy_percent = val; - return count; + if (cli_id != LOCAL_CLIENT) { + if (cli_id >= MAX_CLIENTS) + goto out; + cli = &zcache_clients[cli_id]; + } +out: + return cli; } -static struct kobj_attribute zcache_zv_max_zsize_attr = { - .attr = { .name = "zv_max_zsize", .mode = 0644 }, - .show = zv_max_zsize_show, - .store = zv_max_zsize_store, -}; - -static struct kobj_attribute zcache_zv_max_mean_zsize_attr = { - .attr = { .name = "zv_max_mean_zsize", .mode = 0644 }, - .show = zv_max_mean_zsize_show, - .store = zv_max_mean_zsize_store, -}; - -static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = { - .attr = { .name = "zv_page_count_policy_percent", - .mode = 0644 }, - .show = zv_page_count_policy_percent_show, - .store = zv_page_count_policy_percent_store, -}; -#endif - -/* - * zcache core code starts here - */ - -/* useful stats not collected by cleancache or frontswap */ -static unsigned long zcache_flush_total; -static unsigned long zcache_flush_found; -static unsigned long zcache_flobj_total; -static unsigned long zcache_flobj_found; -static unsigned long 
zcache_failed_eph_puts; -static unsigned long zcache_failed_pers_puts; - /* * Tmem operations assume the poolid implies the invoking client. * Zcache only has one client (the kernel itself): LOCAL_CLIENT. @@ -941,24 +356,26 @@ static unsigned long zcache_failed_pers_puts; * of zcache would have one client per guest and each client might * have a poolid==N. */ -static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid) +struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid) { struct tmem_pool *pool = NULL; struct zcache_client *cli = NULL; - cli = get_zcache_client(cli_id); - if (!cli) + cli = zcache_get_client_by_id(cli_id); + if (cli == NULL) goto out; - - atomic_inc(&cli->refcount); - pool = idr_find(&cli->tmem_pools, poolid); - if (pool != NULL) - atomic_inc(&pool->refcount); + if (!is_local_client(cli)) + atomic_inc(&cli->refcount); + if (poolid < MAX_POOLS_PER_CLIENT) { + pool = cli->tmem_pools[poolid]; + if (pool != NULL) + atomic_inc(&pool->refcount); + } out: return pool; } -static void zcache_put_pool(struct tmem_pool *pool) +void zcache_put_pool(struct tmem_pool *pool) { struct zcache_client *cli = NULL; @@ -966,7 +383,8 @@ static void zcache_put_pool(struct tmem_pool *pool) BUG(); cli = pool->client; atomic_dec(&pool->refcount); - atomic_dec(&cli->refcount); + if (!is_local_client(cli)) + atomic_dec(&cli->refcount); } int zcache_new_client(uint16_t cli_id) @@ -974,120 +392,17 @@ int zcache_new_client(uint16_t cli_id) struct zcache_client *cli; int ret = -1; - cli = get_zcache_client(cli_id); - + cli = zcache_get_client_by_id(cli_id); if (cli == NULL) goto out; if (cli->allocated) goto out; cli->allocated = 1; -#ifdef CONFIG_FRONTSWAP - cli->zspool = zs_create_pool("zcache", ZCACHE_GFP_MASK); - if (cli->zspool == NULL) - goto out; - idr_init(&cli->tmem_pools); -#endif - ret = 0; -out: - return ret; -} - -/* counters for debugging */ -static unsigned long zcache_failed_get_free_pages; -static unsigned long zcache_failed_alloc; -static unsigned long zcache_put_to_flush; - -/* - * for now, used named slabs so can easily track usage; later can - * either just use kmalloc, or perhaps add a slab-like allocator - * to more carefully manage total memory utilization - */ -static struct kmem_cache *zcache_objnode_cache; -static struct kmem_cache *zcache_obj_cache; -static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_obj_count_max; -static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_objnode_count_max; - -/* - * to avoid memory allocation recursion (e.g. due to direct reclaim), we - * preload all necessary data structures so the hostops callbacks never - * actually do a malloc - */ -struct zcache_preload { - void *page; - struct tmem_obj *obj; - int nr; - struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH]; -}; -static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, }; - -static int zcache_do_preload(struct tmem_pool *pool) -{ - struct zcache_preload *kp; - struct tmem_objnode *objnode; - struct tmem_obj *obj; - void *page; - int ret = -ENOMEM; - - if (unlikely(zcache_objnode_cache == NULL)) - goto out; - if (unlikely(zcache_obj_cache == NULL)) - goto out; - - /* IRQ has already been disabled. 
*/ - kp = &__get_cpu_var(zcache_preloads); - while (kp->nr < ARRAY_SIZE(kp->objnodes)) { - objnode = kmem_cache_alloc(zcache_objnode_cache, - ZCACHE_GFP_MASK); - if (unlikely(objnode == NULL)) { - zcache_failed_alloc++; - goto out; - } - - kp->objnodes[kp->nr++] = objnode; - } - - if (!kp->obj) { - obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK); - if (unlikely(obj == NULL)) { - zcache_failed_alloc++; - goto out; - } - kp->obj = obj; - } - - if (!kp->page) { - page = (void *)__get_free_page(ZCACHE_GFP_MASK); - if (unlikely(page == NULL)) { - zcache_failed_get_free_pages++; - goto out; - } - kp->page = page; - } - ret = 0; out: return ret; } -static void *zcache_get_free_page(void) -{ - struct zcache_preload *kp; - void *page; - - kp = &__get_cpu_var(zcache_preloads); - page = kp->page; - BUG_ON(page == NULL); - kp->page = NULL; - return page; -} - -static void zcache_free_page(void *p) -{ - free_page((unsigned long)p); -} - /* * zcache implementation for tmem host ops */ @@ -1095,51 +410,53 @@ static void zcache_free_page(void *p) static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool) { struct tmem_objnode *objnode = NULL; - unsigned long count; struct zcache_preload *kp; + int i; kp = &__get_cpu_var(zcache_preloads); - if (kp->nr <= 0) - goto out; - objnode = kp->objnodes[kp->nr - 1]; + for (i = 0; i < ARRAY_SIZE(kp->objnodes); i++) { + objnode = kp->objnodes[i]; + if (objnode != NULL) { + kp->objnodes[i] = NULL; + break; + } + } BUG_ON(objnode == NULL); - kp->objnodes[kp->nr - 1] = NULL; - kp->nr--; - count = atomic_inc_return(&zcache_curr_objnode_count); - if (count > zcache_curr_objnode_count_max) - zcache_curr_objnode_count_max = count; -out: + zcache_objnode_count = atomic_inc_return(&zcache_objnode_atomic); + if (zcache_objnode_count > zcache_objnode_count_max) + zcache_objnode_count_max = zcache_objnode_count; return objnode; } static void zcache_objnode_free(struct tmem_objnode *objnode, struct tmem_pool *pool) { - atomic_dec(&zcache_curr_objnode_count); - BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0); + zcache_objnode_count = + atomic_dec_return(&zcache_objnode_atomic); + BUG_ON(zcache_objnode_count < 0); kmem_cache_free(zcache_objnode_cache, objnode); } static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool) { struct tmem_obj *obj = NULL; - unsigned long count; struct zcache_preload *kp; kp = &__get_cpu_var(zcache_preloads); obj = kp->obj; BUG_ON(obj == NULL); kp->obj = NULL; - count = atomic_inc_return(&zcache_curr_obj_count); - if (count > zcache_curr_obj_count_max) - zcache_curr_obj_count_max = count; + zcache_obj_count = atomic_inc_return(&zcache_obj_atomic); + if (zcache_obj_count > zcache_obj_count_max) + zcache_obj_count_max = zcache_obj_count; return obj; } static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool) { - atomic_dec(&zcache_curr_obj_count); - BUG_ON(atomic_read(&zcache_curr_obj_count) < 0); + zcache_obj_count = + atomic_dec_return(&zcache_obj_atomic); + BUG_ON(zcache_obj_count < 0); kmem_cache_free(zcache_obj_cache, obj); } @@ -1150,96 +467,302 @@ static struct tmem_hostops zcache_hostops = { .objnode_free = zcache_objnode_free, }; +static struct page *zcache_alloc_page(void) +{ + struct page *page = alloc_page(ZCACHE_GFP_MASK); + + if (page != NULL) + zcache_pageframes_alloced = + atomic_inc_return(&zcache_pageframes_alloced_atomic); + return page; +} + +static void zcache_free_page(struct page *page) +{ + long curr_pageframes; + static long max_pageframes, min_pageframes; + + if (page == NULL) + 
BUG(); + __free_page(page); + zcache_pageframes_freed = + atomic_inc_return(&zcache_pageframes_freed_atomic); + curr_pageframes = zcache_pageframes_alloced - + atomic_read(&zcache_pageframes_freed_atomic) - + atomic_read(&zcache_eph_pageframes_atomic) - + atomic_read(&zcache_pers_pageframes_atomic); + if (curr_pageframes > max_pageframes) + max_pageframes = curr_pageframes; + if (curr_pageframes < min_pageframes) + min_pageframes = curr_pageframes; +#ifdef ZCACHE_DEBUG + if (curr_pageframes > 2L || curr_pageframes < -2L) { + /* pr_info here */ + } +#endif +} + /* * zcache implementations for PAM page descriptor ops */ -static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_eph_pampd_count_max; -static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_pers_pampd_count_max; - /* forward reference */ -static int zcache_compress(struct page *from, void **out_va, unsigned *out_len); +static void zcache_compress(struct page *from, + void **out_va, unsigned *out_len); + +static struct page *zcache_evict_eph_pageframe(void); -static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph, - struct tmem_pool *pool, struct tmem_oid *oid, - uint32_t index) +static void *zcache_pampd_eph_create(char *data, size_t size, bool raw, + struct tmem_handle *th) { - void *pampd = NULL, *cdata; - unsigned clen; - int ret; - unsigned long count; - struct page *page = (struct page *)(data); - struct zcache_client *cli = pool->client; - uint16_t client_id = get_client_id_from_client(cli); - unsigned long zv_mean_zsize; - unsigned long curr_pers_pampd_count; - u64 total_zsize; + void *pampd = NULL, *cdata = data; + unsigned clen = size; + struct page *page = (struct page *)(data), *newpage; - if (eph) { - ret = zcache_compress(page, &cdata, &clen); - if (ret == 0) - goto out; - if (clen == 0 || clen > zbud_max_buddy_size()) { + if (!raw) { + zcache_compress(page, &cdata, &clen); + if (clen > zbud_max_buddy_size()) { zcache_compress_poor++; goto out; } - pampd = (void *)zbud_create(client_id, pool->pool_id, oid, - index, page, cdata, clen); - if (pampd != NULL) { - count = atomic_inc_return(&zcache_curr_eph_pampd_count); - if (count > zcache_curr_eph_pampd_count_max) - zcache_curr_eph_pampd_count_max = count; - } } else { - curr_pers_pampd_count = - atomic_read(&zcache_curr_pers_pampd_count); - if (curr_pers_pampd_count > - (zv_page_count_policy_percent * totalram_pages) / 100) - goto out; - ret = zcache_compress(page, &cdata, &clen); - if (ret == 0) - goto out; - /* reject if compression is too poor */ - if (clen > zv_max_zsize) { - zcache_compress_poor++; + BUG_ON(clen > zbud_max_buddy_size()); + } + + /* look for space via an existing match first */ + pampd = (void *)zbud_match_prep(th, true, cdata, clen); + if (pampd != NULL) + goto got_pampd; + + /* no match, now we need to find (or free up) a full page */ + newpage = zcache_alloc_page(); + if (newpage != NULL) + goto create_in_new_page; + + zcache_failed_getfreepages++; + /* can't allocate a page, evict an ephemeral page via LRU */ + newpage = zcache_evict_eph_pageframe(); + if (newpage == NULL) { + zcache_eph_ate_tail_failed++; + goto out; + } + zcache_eph_ate_tail++; + +create_in_new_page: + pampd = (void *)zbud_create_prep(th, true, cdata, clen, newpage); + BUG_ON(pampd == NULL); + zcache_eph_pageframes = + atomic_inc_return(&zcache_eph_pageframes_atomic); + if (zcache_eph_pageframes > zcache_eph_pageframes_max) + zcache_eph_pageframes_max = zcache_eph_pageframes; 
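+	/* both the matched-buddy path (goto got_pampd above) and the
+	 * new-page path converge below to account for the stored zpage
+	 * and its compressed bytes */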
+ +got_pampd: + zcache_eph_zbytes = + atomic_long_add_return(clen, &zcache_eph_zbytes_atomic); + if (zcache_eph_zbytes > zcache_eph_zbytes_max) + zcache_eph_zbytes_max = zcache_eph_zbytes; + zcache_eph_zpages = atomic_inc_return(&zcache_eph_zpages_atomic); + if (zcache_eph_zpages > zcache_eph_zpages_max) + zcache_eph_zpages_max = zcache_eph_zpages; + if (ramster_enabled && raw) + ramster_count_foreign_pages(true, 1); +out: + return pampd; +} + +static void *zcache_pampd_pers_create(char *data, size_t size, bool raw, + struct tmem_handle *th) +{ + void *pampd = NULL, *cdata = data; + unsigned clen = size; + struct page *page = (struct page *)(data), *newpage; + unsigned long zbud_mean_zsize; + unsigned long curr_pers_zpages, total_zsize; + + if (data == NULL) { + BUG_ON(!ramster_enabled); + goto create_pampd; + } + curr_pers_zpages = zcache_pers_zpages; +/* FIXME CONFIG_RAMSTER... subtract atomic remote_pers_pages here? */ + if (!raw) + zcache_compress(page, &cdata, &clen); + /* reject if compression is too poor */ + if (clen > zbud_max_zsize) { + zcache_compress_poor++; + goto out; + } + /* reject if mean compression is too poor */ + if ((clen > zbud_max_mean_zsize) && (curr_pers_zpages > 0)) { + total_zsize = zcache_pers_zbytes; + if ((long)total_zsize < 0) + total_zsize = 0; + zbud_mean_zsize = div_u64(total_zsize, + curr_pers_zpages); + if (zbud_mean_zsize > zbud_max_mean_zsize) { + zcache_mean_compress_poor++; goto out; } - /* reject if mean compression is too poor */ - if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) { - total_zsize = zs_get_total_size_bytes(cli->zspool); - zv_mean_zsize = div_u64(total_zsize, - curr_pers_pampd_count); - if (zv_mean_zsize > zv_max_mean_zsize) { - zcache_mean_compress_poor++; + } + +create_pampd: + /* look for space via an existing match first */ + pampd = (void *)zbud_match_prep(th, false, cdata, clen); + if (pampd != NULL) + goto got_pampd; + + /* no match, now we need to find (or free up) a full page */ + newpage = zcache_alloc_page(); + if (newpage != NULL) + goto create_in_new_page; + /* + * FIXME do the following only if eph is oversized? + * if (zcache_eph_pageframes > + * (global_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE) + + * global_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE))) + */ + zcache_failed_getfreepages++; + /* can't allocate a page, evict an ephemeral page via LRU */ + newpage = zcache_evict_eph_pageframe(); + if (newpage == NULL) { + zcache_pers_ate_eph_failed++; + goto out; + } + zcache_pers_ate_eph++; + +create_in_new_page: + pampd = (void *)zbud_create_prep(th, false, cdata, clen, newpage); + BUG_ON(pampd == NULL); + zcache_pers_pageframes = + atomic_inc_return(&zcache_pers_pageframes_atomic); + if (zcache_pers_pageframes > zcache_pers_pageframes_max) + zcache_pers_pageframes_max = zcache_pers_pageframes; + +got_pampd: + zcache_pers_zpages = atomic_inc_return(&zcache_pers_zpages_atomic); + if (zcache_pers_zpages > zcache_pers_zpages_max) + zcache_pers_zpages_max = zcache_pers_zpages; + zcache_pers_zbytes = + atomic_long_add_return(clen, &zcache_pers_zbytes_atomic); + if (zcache_pers_zbytes > zcache_pers_zbytes_max) + zcache_pers_zbytes_max = zcache_pers_zbytes; + if (ramster_enabled && raw) + ramster_count_foreign_pages(false, 1); +out: + return pampd; +} + +/* + * This is called directly from zcache_put_page to pre-allocate space + * to store a zpage. 
+ */ +void *zcache_pampd_create(char *data, unsigned int size, bool raw, + int eph, struct tmem_handle *th) +{ + void *pampd = NULL; + struct zcache_preload *kp; + struct tmem_objnode *objnode; + struct tmem_obj *obj; + int i; + + BUG_ON(!irqs_disabled()); + /* pre-allocate per-cpu metadata */ + BUG_ON(zcache_objnode_cache == NULL); + BUG_ON(zcache_obj_cache == NULL); + kp = &__get_cpu_var(zcache_preloads); + for (i = 0; i < ARRAY_SIZE(kp->objnodes); i++) { + objnode = kp->objnodes[i]; + if (objnode == NULL) { + objnode = kmem_cache_alloc(zcache_objnode_cache, + ZCACHE_GFP_MASK); + if (unlikely(objnode == NULL)) { + zcache_failed_alloc++; goto out; } + kp->objnodes[i] = objnode; } - pampd = (void *)zv_create(cli->zspool, pool->pool_id, - oid, index, cdata, clen); - if (pampd == NULL) - goto out; - count = atomic_inc_return(&zcache_curr_pers_pampd_count); - if (count > zcache_curr_pers_pampd_count_max) - zcache_curr_pers_pampd_count_max = count; } + if (kp->obj == NULL) { + obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK); + kp->obj = obj; + } + if (unlikely(kp->obj == NULL)) { + zcache_failed_alloc++; + goto out; + } + /* + * ok, have all the metadata pre-allocated, now do the data + * but since how we allocate the data is dependent on ephemeral + * or persistent, we split the call here to different sub-functions + */ + if (eph) + pampd = zcache_pampd_eph_create(data, size, raw, th); + else + pampd = zcache_pampd_pers_create(data, size, raw, th); out: return pampd; } /* + * This is a pamops called via tmem_put and is necessary to "finish" + * a pampd creation. + */ +void zcache_pampd_create_finish(void *pampd, bool eph) +{ + zbud_create_finish((struct zbudref *)pampd, eph); +} + +/* + * This is passed as a function parameter to zbud_decompress so that + * zbud need not be familiar with the details of crypto. It assumes that + * the bytes from_va and to_va through from_va+size-1 and to_va+size-1 are + * kmapped. It must be successful, else there is a logic bug somewhere. 
+ */ +static void zcache_decompress(char *from_va, unsigned int size, char *to_va) +{ + int ret; + unsigned int outlen = PAGE_SIZE; + + ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, from_va, size, + to_va, &outlen); + BUG_ON(ret); + BUG_ON(outlen != PAGE_SIZE); +} + +/* + * Decompress from the kernel va to a pageframe + */ +void zcache_decompress_to_page(char *from_va, unsigned int size, + struct page *to_page) +{ + char *to_va = kmap_atomic(to_page); + zcache_decompress(from_va, size, to_va); + kunmap_atomic(to_va); +} + +/* * fill the pageframe corresponding to the struct page with the data * from the passed pampd */ -static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw, +static int zcache_pampd_get_data(char *data, size_t *sizep, bool raw, void *pampd, struct tmem_pool *pool, struct tmem_oid *oid, uint32_t index) { - int ret = 0; - - BUG_ON(is_ephemeral(pool)); - zv_decompress((struct page *)(data), (unsigned long)pampd); + int ret; + bool eph = !is_persistent(pool); + + BUG_ON(preemptible()); + BUG_ON(eph); /* fix later if shared pools get implemented */ + BUG_ON(pampd_is_remote(pampd)); + if (raw) + ret = zbud_copy_from_zbud(data, (struct zbudref *)pampd, + sizep, eph); + else { + ret = zbud_decompress((struct page *)(data), + (struct zbudref *)pampd, false, + zcache_decompress); + *sizep = PAGE_SIZE; + } return ret; } @@ -1247,16 +770,50 @@ static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw, * fill the pageframe corresponding to the struct page with the data * from the passed pampd */ -static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw, +static int zcache_pampd_get_data_and_free(char *data, size_t *sizep, bool raw, void *pampd, struct tmem_pool *pool, struct tmem_oid *oid, uint32_t index) { - BUG_ON(!is_ephemeral(pool)); - if (zbud_decompress((struct page *)(data), pampd) < 0) - return -EINVAL; - zbud_free_and_delist((struct zbud_hdr *)pampd); - atomic_dec(&zcache_curr_eph_pampd_count); - return 0; + int ret; + bool eph = !is_persistent(pool); + struct page *page = NULL; + unsigned int zsize, zpages; + + BUG_ON(preemptible()); + BUG_ON(pampd_is_remote(pampd)); + if (raw) + ret = zbud_copy_from_zbud(data, (struct zbudref *)pampd, + sizep, eph); + else { + ret = zbud_decompress((struct page *)(data), + (struct zbudref *)pampd, eph, + zcache_decompress); + *sizep = PAGE_SIZE; + } + page = zbud_free_and_delist((struct zbudref *)pampd, eph, + &zsize, &zpages); + if (eph) { + if (page) + zcache_eph_pageframes = + atomic_dec_return(&zcache_eph_pageframes_atomic); + zcache_eph_zpages = + atomic_sub_return(zpages, &zcache_eph_zpages_atomic); + zcache_eph_zbytes = + atomic_long_sub_return(zsize, &zcache_eph_zbytes_atomic); + } else { + if (page) + zcache_pers_pageframes = + atomic_dec_return(&zcache_pers_pageframes_atomic); + zcache_pers_zpages = + atomic_sub_return(zpages, &zcache_pers_zpages_atomic); + zcache_pers_zbytes = + atomic_long_sub_return(zsize, &zcache_pers_zbytes_atomic); + } + if (!is_local_client(pool->client)) + ramster_count_foreign_pages(eph, -1); + if (page) + zcache_free_page(page); + return ret; } /* @@ -1264,48 +821,51 @@ static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw, * pampd must no longer be pointed to from any tmem data structures! 
*/ static void zcache_pampd_free(void *pampd, struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index) + struct tmem_oid *oid, uint32_t index, bool acct) { - struct zcache_client *cli = pool->client; + struct page *page = NULL; + unsigned int zsize, zpages; + BUG_ON(preemptible()); + if (pampd_is_remote(pampd)) { + BUG_ON(!ramster_enabled); + pampd = ramster_pampd_free(pampd, pool, oid, index, acct); + if (pampd == NULL) + return; + } if (is_ephemeral(pool)) { - zbud_free_and_delist((struct zbud_hdr *)pampd); - atomic_dec(&zcache_curr_eph_pampd_count); - BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0); + page = zbud_free_and_delist((struct zbudref *)pampd, + true, &zsize, &zpages); + if (page) + zcache_eph_pageframes = + atomic_dec_return(&zcache_eph_pageframes_atomic); + zcache_eph_zpages = + atomic_sub_return(zpages, &zcache_eph_zpages_atomic); + zcache_eph_zbytes = + atomic_long_sub_return(zsize, &zcache_eph_zbytes_atomic); + /* FIXME CONFIG_RAMSTER... check acct parameter? */ } else { - zv_free(cli->zspool, (unsigned long)pampd); - atomic_dec(&zcache_curr_pers_pampd_count); - BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0); + page = zbud_free_and_delist((struct zbudref *)pampd, + false, &zsize, &zpages); + if (page) + zcache_pers_pageframes = + atomic_dec_return(&zcache_pers_pageframes_atomic); + zcache_pers_zpages = + atomic_sub_return(zpages, &zcache_pers_zpages_atomic); + zcache_pers_zbytes = + atomic_long_sub_return(zsize, &zcache_pers_zbytes_atomic); } -} - -static void zcache_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj) -{ -} - -static void zcache_pampd_new_obj(struct tmem_obj *obj) -{ -} - -static int zcache_pampd_replace_in_obj(void *pampd, struct tmem_obj *obj) -{ - return -1; -} - -static bool zcache_pampd_is_remote(void *pampd) -{ - return 0; + if (!is_local_client(pool->client)) + ramster_count_foreign_pages(is_ephemeral(pool), -1); + if (page) + zcache_free_page(page); } static struct tmem_pamops zcache_pamops = { - .create = zcache_pampd_create, + .create_finish = zcache_pampd_create_finish, .get_data = zcache_pampd_get_data, .get_data_and_free = zcache_pampd_get_data_and_free, .free = zcache_pampd_free, - .free_obj = zcache_pampd_free_obj, - .new_obj = zcache_pampd_new_obj, - .replace_in_obj = zcache_pampd_replace_in_obj, - .is_remote = zcache_pampd_is_remote, }; /* @@ -1315,15 +875,15 @@ static struct tmem_pamops zcache_pamops = { static DEFINE_PER_CPU(unsigned char *, zcache_dstmem); #define ZCACHE_DSTMEM_ORDER 1 -static int zcache_compress(struct page *from, void **out_va, unsigned *out_len) +static void zcache_compress(struct page *from, void **out_va, unsigned *out_len) { - int ret = 0; + int ret; unsigned char *dmem = __get_cpu_var(zcache_dstmem); char *from_va; BUG_ON(!irqs_disabled()); - if (unlikely(dmem == NULL)) - goto out; /* no buffer or no compressor so can't compress */ + /* no buffer or no compressor so can't compress */ + BUG_ON(dmem == NULL); *out_len = PAGE_SIZE << ZCACHE_DSTMEM_ORDER; from_va = kmap_atomic(from); mb(); @@ -1332,9 +892,6 @@ static int zcache_compress(struct page *from, void **out_va, unsigned *out_len) BUG_ON(ret); *out_va = dmem; kunmap_atomic(from_va); - ret = 1; -out: - return ret; } static int zcache_comp_cpu_up(int cpu) @@ -1360,18 +917,21 @@ static void zcache_comp_cpu_down(int cpu) static int zcache_cpu_notifier(struct notifier_block *nb, unsigned long action, void *pcpu) { - int ret, cpu = (long)pcpu; + int ret, i, cpu = (long)pcpu; struct zcache_preload *kp; switch (action) { case 
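zcache_compress() hands the transform a destination buffer of PAGE_SIZE << ZCACHE_DSTMEM_ORDER, i.e. two pages, because a compressor may expand incompressible input; only *out_len bytes are later stored. The same contract in user space, with zlib standing in for the kernel crypto transform (an assumption made purely for illustration):

#include <stdlib.h>
#include <zlib.h>

#define PAGE_SIZE 4096UL

/* Compress one "page" into a worst-case-sized buffer so the compressor
 * can never overrun, mirroring the two-page per-cpu dstmem. */
static unsigned char *compress_page(const unsigned char *page,
				    unsigned long *out_len)
{
	uLongf dlen = compressBound(PAGE_SIZE);	/* >= worst-case expansion */
	unsigned char *dst = malloc(dlen);

	if (dst == NULL)
		return NULL;
	if (compress2(dst, &dlen, page, PAGE_SIZE, Z_BEST_SPEED) != Z_OK) {
		free(dst);
		return NULL;
	}
	*out_len = dlen;	/* caller stores only this many bytes */
	return dst;
}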
CPU_UP_PREPARE: ret = zcache_comp_cpu_up(cpu); if (ret != NOTIFY_OK) { - pr_err("zcache: can't allocate compressor transform\n"); + pr_err("%s: can't allocate compressor xform\n", + namestr); return ret; } per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages( GFP_KERNEL | __GFP_REPEAT, ZCACHE_DSTMEM_ORDER); + if (ramster_enabled) + ramster_cpu_up(cpu); break; case CPU_DEAD: case CPU_UP_CANCELED: @@ -1380,20 +940,17 @@ static int zcache_cpu_notifier(struct notifier_block *nb, ZCACHE_DSTMEM_ORDER); per_cpu(zcache_dstmem, cpu) = NULL; kp = &per_cpu(zcache_preloads, cpu); - while (kp->nr) { - kmem_cache_free(zcache_objnode_cache, - kp->objnodes[kp->nr - 1]); - kp->objnodes[kp->nr - 1] = NULL; - kp->nr--; + for (i = 0; i < ARRAY_SIZE(kp->objnodes); i++) { + if (kp->objnodes[i]) + kmem_cache_free(zcache_objnode_cache, + kp->objnodes[i]); } if (kp->obj) { kmem_cache_free(zcache_obj_cache, kp->obj); kp->obj = NULL; } - if (kp->page) { - free_page((unsigned long)kp->page); - kp->page = NULL; - } + if (ramster_enabled) + ramster_cpu_down(cpu); break; default: break; @@ -1405,116 +962,279 @@ static struct notifier_block zcache_cpu_notifier_block = { .notifier_call = zcache_cpu_notifier }; -#ifdef CONFIG_SYSFS -#define ZCACHE_SYSFS_RO(_name) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", zcache_##_name); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } +/* + * The following code interacts with the zbud eviction and zbud + * zombify code to access LRU pages + */ + +static struct page *zcache_evict_eph_pageframe(void) +{ + struct page *page; + unsigned int zsize = 0, zpages = 0; + + page = zbud_evict_pageframe_lru(&zsize, &zpages); + if (page == NULL) + goto out; + zcache_eph_zbytes = atomic_long_sub_return(zsize, + &zcache_eph_zbytes_atomic); + zcache_eph_zpages = atomic_sub_return(zpages, + &zcache_eph_zpages_atomic); + zcache_evicted_eph_zpages += zpages; + zcache_eph_pageframes = + atomic_dec_return(&zcache_eph_pageframes_atomic); + zcache_evicted_eph_pageframes++; +out: + return page; +} + +#ifdef CONFIG_ZCACHE_WRITEBACK + +static atomic_t zcache_outstanding_writeback_pages_atomic = ATOMIC_INIT(0); + +static void unswiz(struct tmem_oid oid, u32 index, + unsigned *type, pgoff_t *offset); + +/* + * Choose an LRU persistent pageframe and attempt to write it back to + * the backing swap disk by calling frontswap_writeback on both zpages. + * + * This is work-in-progress. 
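One detail worth flagging in the CPU_DEAD path just above: the replacement loop frees kp->objnodes[i] but, unlike the kp->obj branch below it, never clears the slot. If nothing else resets the array before the same CPU goes through CPU_DEAD again, the stale pointers would be freed twice. A defensive version of the loop (a sketch; whether the double free is actually reachable depends on hotplug ordering not visible in this hunk):

	for (i = 0; i < ARRAY_SIZE(kp->objnodes); i++) {
		if (kp->objnodes[i]) {
			kmem_cache_free(zcache_objnode_cache,
					kp->objnodes[i]);
			kp->objnodes[i] = NULL;	/* guard against a repeat CPU_DEAD */
		}
	}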
+ */ -#define ZCACHE_SYSFS_RO_ATOMIC(_name) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ +static void zcache_end_swap_write(struct bio *bio, int err) +{ + end_swap_bio_write(bio, err); + zcache_outstanding_writeback_pages = + atomic_dec_return(&zcache_outstanding_writeback_pages_atomic); + zcache_writtenback_pages++; +} + +/* + * zcache_get_swap_cache_page + * + * This is an adaptation of read_swap_cache_async() + * + * On success, the page is returned in new_page + * Returns 0 if page was already in the swap cache, page is not locked + * Returns 1 if the new page needs to be populated, page is locked + */ +static int zcache_get_swap_cache_page(int type, pgoff_t offset, + struct page *new_page) +{ + struct page *found_page; + swp_entry_t entry = swp_entry(type, offset); + int err; + + BUG_ON(new_page == NULL); + do { + /* + * First check the swap cache. Since this is normally + * called after lookup_swap_cache() failed, re-calling + * that would confuse statistics. + */ + found_page = find_get_page(&swapper_space, entry.val); + if (found_page) + return 0; + + /* + * call radix_tree_preload() while we can wait. + */ + err = radix_tree_preload(GFP_KERNEL); + if (err) + break; + + /* + * Swap entry may have been freed since our caller observed it. + */ + err = swapcache_prepare(entry); + if (err == -EEXIST) { /* seems racy */ + radix_tree_preload_end(); + continue; + } + if (err) { /* swp entry is obsolete ? */ + radix_tree_preload_end(); + break; + } + + /* May fail (-ENOMEM) if radix-tree node allocation failed. */ + __set_page_locked(new_page); + SetPageSwapBacked(new_page); + err = __add_to_swap_cache(new_page, entry); + if (likely(!err)) { + radix_tree_preload_end(); + lru_cache_add_anon(new_page); + return 1; + } + radix_tree_preload_end(); + ClearPageSwapBacked(new_page); + __clear_page_locked(new_page); + /* + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. + */ + swapcache_free(entry, NULL); + /* FIXME: is it possible to get here without err==-ENOMEM? + * If not, we can dispense with the do loop, use goto retry */ + } while (err != -ENOMEM); + + return -ENOMEM; +} + +/* + * Given a frontswap zpage in zcache (identified by type/offset) and + * an empty page, put the page into the swap cache, use frontswap + * to get the page from zcache into the empty page, then give it + * to the swap subsystem to send to disk (carefully avoiding the + * possibility that frontswap might snatch it back). + * Returns < 0 if error, 0 if successful, and 1 if successful but + * the newpage passed in is not needed and should be freed. 
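zcache_get_swap_cache_page() keys everything off a swp_entry_t built by swp_entry(type, offset) and then used as a radix-tree index via entry.val. A runnable model of that packing and its inverse (the bit layout here is an assumption; the real kernel layout is arch-specific and reserves bits for flags):

#include <assert.h>
#include <stdio.h>

#define TYPE_SHIFT 58	/* assumed; leaves ample room for the offset */

typedef unsigned long long swp_val;

static swp_val mk_entry(unsigned type, unsigned long long offset)
{
	return ((swp_val)type << TYPE_SHIFT) | offset;
}

static unsigned entry_type(swp_val e) { return e >> TYPE_SHIFT; }
static unsigned long long entry_offset(swp_val e)
{
	return e & (((swp_val)1 << TYPE_SHIFT) - 1);
}

int main(void)
{
	swp_val e = mk_entry(1, 12345);

	assert(entry_type(e) == 1 && entry_offset(e) == 12345);
	printf("entry=%llx\n", e);
	return 0;
}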
+ */ +static int zcache_frontswap_writeback_zpage(int type, pgoff_t offset, + struct page *newpage) +{ + struct page *page = newpage; + int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + }; + + ret = zcache_get_swap_cache_page(type, offset, page); + if (ret < 0) + return ret; + else if (ret == 0) { + /* more uptodate page is already in swapcache */ + __frontswap_invalidate_page(type, offset); + return 1; } -#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return _func(buf); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ + BUG_ON(!frontswap_has_exclusive_gets); /* load must also invalidate */ + /* FIXME: how is it possible to get here when page is unlocked? */ + __frontswap_load(page); + SetPageUptodate(page); /* above does SetPageDirty, is that enough? */ + + /* start writeback */ + SetPageReclaim(page); + /* + * Return value is ignored here because it doesn't change anything + * for us. Page is returned unlocked. + */ + (void)__swap_writepage(page, &wbc, zcache_end_swap_write); + page_cache_release(page); + zcache_outstanding_writeback_pages = + atomic_inc_return(&zcache_outstanding_writeback_pages_atomic); + + return 0; +} + +/* + * The following is still a magic number... we want to allow forward progress + * for writeback because it clears out needed RAM when under pressure, but + * we don't want to allow writeback to absorb and queue too many GFP_KERNEL + * pages if the swap device is very slow. + */ +#define ZCACHE_MAX_OUTSTANDING_WRITEBACK_PAGES 6400 + +/* + * Try to allocate two free pages, first using a non-aggressive alloc, + * then by evicting zcache ephemeral (clean pagecache) pages, and last + * by aggressive GFP_KERNEL alloc. We allow zbud to choose a pageframe + * consisting of 1-2 zbuds/zpages, then call the writeback_zpage helper + * function above for each. 
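The allocation strategy spelled out in the comment above is a three-tier fallback: try a cheap, non-aggressive alloc_page(ZCACHE_GFP_MASK) first, then reclaim a pageframe from zcache's own ephemeral pool, and only then fall back to a potentially sleeping GFP_KERNEL allocation. Distilled into a sketch (the helpers are stand-ins for the calls named in the comment):

struct page;
struct page *alloc_page_cheap(void);		/* alloc_page(ZCACHE_GFP_MASK) */
struct page *evict_eph_pageframe(void);		/* zcache_evict_eph_pageframe() */
struct page *alloc_page_aggressive(void);	/* alloc_page(GFP_KERNEL) */

static struct page *get_writeback_pageframe(void)
{
	struct page *p = alloc_page_cheap();

	if (p == NULL)
		p = evict_eph_pageframe();
	if (p == NULL)
		p = alloc_page_aggressive();
	return p;	/* NULL: give up on this writeback attempt */
}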
+ */ +static int zcache_frontswap_writeback(void) +{ + struct tmem_handle th[2]; + int ret = 0; + int nzbuds, writeback_ret; + unsigned type; + struct page *znewpage1 = NULL, *znewpage2 = NULL; + struct page *evictpage1 = NULL, *evictpage2 = NULL; + struct page *newpage1 = NULL, *newpage2 = NULL; + struct page *page1 = NULL, *page2 = NULL; + pgoff_t offset; + + znewpage1 = alloc_page(ZCACHE_GFP_MASK); + znewpage2 = alloc_page(ZCACHE_GFP_MASK); + if (znewpage1 == NULL) + evictpage1 = zcache_evict_eph_pageframe(); + if (znewpage2 == NULL) + evictpage2 = zcache_evict_eph_pageframe(); + + if ((evictpage1 == NULL || evictpage2 == NULL) && + atomic_read(&zcache_outstanding_writeback_pages_atomic) > + ZCACHE_MAX_OUTSTANDING_WRITEBACK_PAGES) { + goto free_and_out; + } + if (znewpage1 == NULL && evictpage1 == NULL) + newpage1 = alloc_page(GFP_KERNEL); + if (znewpage2 == NULL && evictpage2 == NULL) + newpage2 = alloc_page(GFP_KERNEL); + if (newpage1 == NULL || newpage2 == NULL) + goto free_and_out; + + /* ok, we have two pageframes pre-allocated, get a pair of zbuds */ + nzbuds = zbud_make_zombie_lru(&th[0], NULL, NULL, false); + if (nzbuds == 0) { + ret = -ENOENT; + goto free_and_out; } -ZCACHE_SYSFS_RO(curr_obj_count_max); -ZCACHE_SYSFS_RO(curr_objnode_count_max); -ZCACHE_SYSFS_RO(flush_total); -ZCACHE_SYSFS_RO(flush_found); -ZCACHE_SYSFS_RO(flobj_total); -ZCACHE_SYSFS_RO(flobj_found); -ZCACHE_SYSFS_RO(failed_eph_puts); -ZCACHE_SYSFS_RO(failed_pers_puts); -ZCACHE_SYSFS_RO(zbud_curr_zbytes); -ZCACHE_SYSFS_RO(zbud_cumul_zpages); -ZCACHE_SYSFS_RO(zbud_cumul_zbytes); -ZCACHE_SYSFS_RO(zbud_buddied_count); -ZCACHE_SYSFS_RO(zbpg_unused_list_count); -ZCACHE_SYSFS_RO(evicted_raw_pages); -ZCACHE_SYSFS_RO(evicted_unbuddied_pages); -ZCACHE_SYSFS_RO(evicted_buddied_pages); -ZCACHE_SYSFS_RO(failed_get_free_pages); -ZCACHE_SYSFS_RO(failed_alloc); -ZCACHE_SYSFS_RO(put_to_flush); -ZCACHE_SYSFS_RO(compress_poor); -ZCACHE_SYSFS_RO(mean_compress_poor); -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages); -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages); -ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count); -ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count); -ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts, - zbud_show_unbuddied_list_counts); -ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts, - zbud_show_cumul_chunk_counts); -ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts, - zv_curr_dist_counts_show); -ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts, - zv_cumul_dist_counts_show); - -static struct attribute *zcache_attrs[] = { - &zcache_curr_obj_count_attr.attr, - &zcache_curr_obj_count_max_attr.attr, - &zcache_curr_objnode_count_attr.attr, - &zcache_curr_objnode_count_max_attr.attr, - &zcache_flush_total_attr.attr, - &zcache_flobj_total_attr.attr, - &zcache_flush_found_attr.attr, - &zcache_flobj_found_attr.attr, - &zcache_failed_eph_puts_attr.attr, - &zcache_failed_pers_puts_attr.attr, - &zcache_compress_poor_attr.attr, - &zcache_mean_compress_poor_attr.attr, - &zcache_zbud_curr_raw_pages_attr.attr, - &zcache_zbud_curr_zpages_attr.attr, - &zcache_zbud_curr_zbytes_attr.attr, - &zcache_zbud_cumul_zpages_attr.attr, - &zcache_zbud_cumul_zbytes_attr.attr, - &zcache_zbud_buddied_count_attr.attr, - &zcache_zbpg_unused_list_count_attr.attr, - &zcache_evicted_raw_pages_attr.attr, - &zcache_evicted_unbuddied_pages_attr.attr, - &zcache_evicted_buddied_pages_attr.attr, - &zcache_failed_get_free_pages_attr.attr, - &zcache_failed_alloc_attr.attr, - &zcache_put_to_flush_attr.attr, - &zcache_zbud_unbuddied_list_counts_attr.attr, - &zcache_zbud_cumul_chunk_counts_attr.attr, - 
&zcache_zv_curr_dist_counts_attr.attr, - &zcache_zv_cumul_dist_counts_attr.attr, - &zcache_zv_max_zsize_attr.attr, - &zcache_zv_max_mean_zsize_attr.attr, - &zcache_zv_page_count_policy_percent_attr.attr, - NULL, -}; + /* process the first zbud */ + unswiz(th[0].oid, th[0].index, &type, &offset); + page1 = (znewpage1 != NULL) ? znewpage1 : + ((newpage1 != NULL) ? newpage1 : evictpage1); + writeback_ret = zcache_frontswap_writeback_zpage(type, offset, page1); + if (writeback_ret < 0) { + ret = -ENOMEM; + goto free_and_out; + } + if (evictpage1 != NULL) + zcache_pageframes_freed = + atomic_inc_return(&zcache_pageframes_freed_atomic); + if (writeback_ret == 0) { + /* zcache_get_swap_cache_page will free, don't double free */ + znewpage1 = NULL; + newpage1 = NULL; + evictpage1 = NULL; + } + if (nzbuds < 2) + goto free_and_out; + + /* if there is a second zbud, process it */ + unswiz(th[1].oid, th[1].index, &type, &offset); + page2 = (znewpage2 != NULL) ? znewpage2 : + ((newpage2 != NULL) ? newpage2 : evictpage2); + writeback_ret = zcache_frontswap_writeback_zpage(type, offset, page2); + if (writeback_ret < 0) { + ret = -ENOMEM; + goto free_and_out; + } + if (evictpage2 != NULL) + zcache_pageframes_freed = + atomic_inc_return(&zcache_pageframes_freed_atomic); + if (writeback_ret == 0) { + znewpage2 = NULL; + newpage2 = NULL; + evictpage2 = NULL; + } -static struct attribute_group zcache_attr_group = { - .attrs = zcache_attrs, - .name = "zcache", -}; +free_and_out: + if (znewpage1 != NULL) + page_cache_release(znewpage1); + if (znewpage2 != NULL) + page_cache_release(znewpage2); + if (newpage1 != NULL) + page_cache_release(newpage1); + if (newpage2 != NULL) + page_cache_release(newpage2); + if (evictpage1 != NULL) + zcache_free_page(evictpage1); + if (evictpage2 != NULL) + zcache_free_page(evictpage2); + return ret; +} +#endif /* CONFIG_ZCACHE_WRITEBACK */ -#endif /* CONFIG_SYSFS */ /* * When zcache is disabled ("frozen"), pools can be created and destroyed, * but all puts (and thus all other operations that require memory allocation) @@ -1525,23 +1245,81 @@ static struct attribute_group zcache_attr_group = { static bool zcache_freeze; /* - * zcache shrinker interface (only useful for ephemeral pages, so zbud only) + * This zcache shrinker interface reduces the number of ephemeral pageframes + * used by zcache to approximately the same as the total number of LRU_FILE + * pageframes in use, and now also reduces the number of persistent pageframes + * used by zcache to approximately the same as the total number of LRU_ANON + * pageframes in use. FIXME POLICY: Probably the writeback should only occur + * if the eviction doesn't free enough pages. 
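The policy the comment describes is a simple difference: evict (or write back) only the surplus of zcache pageframes over the corresponding LRU population. A runnable model of the target computation, applied identically to the file/ephemeral and anon/persistent sides:

#include <stdio.h>

static long surplus(long zcache_pageframes, long lru_active, long lru_inactive)
{
	long inuse = lru_active + lru_inactive;

	return zcache_pageframes > inuse ? zcache_pageframes - inuse : 0;
}

int main(void)
{
	/* e.g. 10000 ephemeral pageframes vs 9000 file-LRU pages: evict 1000 */
	printf("%ld\n", surplus(10000, 6000, 3000));
	return 0;
}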
*/ static int shrink_zcache_memory(struct shrinker *shrink, struct shrink_control *sc) { + static bool in_progress; int ret = -1; int nr = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; + int nr_evict = 0; + int nr_writeback = 0; + struct page *page; + int file_pageframes_inuse, anon_pageframes_inuse; + + if (nr <= 0) + goto skip_evict; + + /* don't allow more than one eviction thread at a time */ + if (in_progress) + goto skip_evict; + + in_progress = true; + + /* we are going to ignore nr, and target a different value */ + zcache_last_active_file_pageframes = + global_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); + zcache_last_inactive_file_pageframes = + global_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); + file_pageframes_inuse = zcache_last_active_file_pageframes + + zcache_last_inactive_file_pageframes; + if (zcache_eph_pageframes > file_pageframes_inuse) + nr_evict = zcache_eph_pageframes - file_pageframes_inuse; + else + nr_evict = 0; + while (nr_evict-- > 0) { + page = zcache_evict_eph_pageframe(); + if (page == NULL) + break; + zcache_free_page(page); + } - if (nr >= 0) { - if (!(gfp_mask & __GFP_FS)) - /* does this case really need to be skipped? */ - goto out; - zbud_evict_pages(nr); + zcache_last_active_anon_pageframes = + global_page_state(NR_LRU_BASE + LRU_ACTIVE_ANON); + zcache_last_inactive_anon_pageframes = + global_page_state(NR_LRU_BASE + LRU_INACTIVE_ANON); + anon_pageframes_inuse = zcache_last_active_anon_pageframes + + zcache_last_inactive_anon_pageframes; + if (zcache_pers_pageframes > anon_pageframes_inuse) + nr_writeback = zcache_pers_pageframes - anon_pageframes_inuse; + else + nr_writeback = 0; + while (nr_writeback-- > 0) { +#ifdef CONFIG_ZCACHE_WRITEBACK + int writeback_ret; + writeback_ret = zcache_frontswap_writeback(); + if (writeback_ret == -ENOMEM) +#endif + break; } - ret = (int)atomic_read(&zcache_zbud_curr_raw_pages); -out: + in_progress = false; + +skip_evict: + /* resample: has changed, but maybe not all the way yet */ + zcache_last_active_file_pageframes = + global_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); + zcache_last_inactive_file_pageframes = + global_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); + ret = zcache_eph_pageframes - zcache_last_active_file_pageframes + + zcache_last_inactive_file_pageframes; + if (ret < 0) + ret = 0; return ret; } @@ -1554,59 +1332,86 @@ static struct shrinker zcache_shrinker = { * zcache shims between cleancache/frontswap ops and tmem */ -static int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp, - uint32_t index, struct page *page) +/* FIXME rename these core routines to zcache_tmemput etc? 
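The static in_progress flag in shrink_zcache_memory() above is a plain bool read and written without synchronization, so two concurrent shrinker calls can both observe it as false and proceed; the evictions themselves are individually safe, so the guard is best-effort. A race-free variant costs one atomic flag; a C11 sketch:

#include <stdatomic.h>

static atomic_flag shrink_in_progress = ATOMIC_FLAG_INIT;

static int shrink_guarded(int (*do_shrink)(void))
{
	int ret = -1;

	if (atomic_flag_test_and_set(&shrink_in_progress))
		return ret;	/* another thread is already evicting */
	ret = do_shrink();
	atomic_flag_clear(&shrink_in_progress);
	return ret;
}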
*/ +int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp, + uint32_t index, void *page, + unsigned int size, bool raw, int ephemeral) { struct tmem_pool *pool; + struct tmem_handle th; int ret = -1; + void *pampd = NULL; BUG_ON(!irqs_disabled()); pool = zcache_get_pool_by_id(cli_id, pool_id); if (unlikely(pool == NULL)) goto out; - if (!zcache_freeze && zcache_do_preload(pool) == 0) { - /* preload does preempt_disable on success */ - ret = tmem_put(pool, oidp, index, (char *)(page), - PAGE_SIZE, 0, is_ephemeral(pool)); - if (ret < 0) { - if (is_ephemeral(pool)) + if (!zcache_freeze) { + ret = 0; + th.client_id = cli_id; + th.pool_id = pool_id; + th.oid = *oidp; + th.index = index; + pampd = zcache_pampd_create((char *)page, size, raw, + ephemeral, &th); + if (pampd == NULL) { + ret = -ENOMEM; + if (ephemeral) zcache_failed_eph_puts++; else zcache_failed_pers_puts++; + } else { + if (ramster_enabled) + ramster_do_preload_flnode(pool); + ret = tmem_put(pool, oidp, index, 0, pampd); + if (ret < 0) + BUG(); } + zcache_put_pool(pool); } else { zcache_put_to_flush++; + if (ramster_enabled) + ramster_do_preload_flnode(pool); if (atomic_read(&pool->obj_count) > 0) /* the put fails whether the flush succeeds or not */ (void)tmem_flush_page(pool, oidp, index); + zcache_put_pool(pool); } - - zcache_put_pool(pool); out: return ret; } -static int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp, - uint32_t index, struct page *page) +int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp, + uint32_t index, void *page, + size_t *sizep, bool raw, int get_and_free) { struct tmem_pool *pool; int ret = -1; - unsigned long flags; - size_t size = PAGE_SIZE; + bool eph; - local_irq_save(flags); + if (!raw) { + BUG_ON(irqs_disabled()); + BUG_ON(in_softirq()); + } pool = zcache_get_pool_by_id(cli_id, pool_id); + eph = is_ephemeral(pool); if (likely(pool != NULL)) { if (atomic_read(&pool->obj_count) > 0) ret = tmem_get(pool, oidp, index, (char *)(page), - &size, 0, is_ephemeral(pool)); + sizep, raw, get_and_free); zcache_put_pool(pool); } - local_irq_restore(flags); + WARN_ONCE((!is_ephemeral(pool) && (ret != 0)), + "zcache_get fails on persistent pool, " + "bad things are very likely to happen soon\n"); +#ifdef RAMSTER_TESTING + if (ret != 0 && ret != -1 && !(ret == -EINVAL && is_ephemeral(pool))) + pr_err("TESTING zcache_get tmem_get returns ret=%d\n", ret); +#endif return ret; } -static int zcache_flush_page(int cli_id, int pool_id, +int zcache_flush_page(int cli_id, int pool_id, struct tmem_oid *oidp, uint32_t index) { struct tmem_pool *pool; @@ -1616,6 +1421,8 @@ static int zcache_flush_page(int cli_id, int pool_id, local_irq_save(flags); zcache_flush_total++; pool = zcache_get_pool_by_id(cli_id, pool_id); + if (ramster_enabled) + ramster_do_preload_flnode(pool); if (likely(pool != NULL)) { if (atomic_read(&pool->obj_count) > 0) ret = tmem_flush_page(pool, oidp, index); @@ -1627,7 +1434,7 @@ static int zcache_flush_page(int cli_id, int pool_id, return ret; } -static int zcache_flush_object(int cli_id, int pool_id, +int zcache_flush_object(int cli_id, int pool_id, struct tmem_oid *oidp) { struct tmem_pool *pool; @@ -1637,6 +1444,8 @@ static int zcache_flush_object(int cli_id, int pool_id, local_irq_save(flags); zcache_flobj_total++; pool = zcache_get_pool_by_id(cli_id, pool_id); + if (ramster_enabled) + ramster_do_preload_flnode(pool); if (likely(pool != NULL)) { if (atomic_read(&pool->obj_count) > 0) ret = tmem_flush_object(pool, oidp); @@ -1648,24 +1457,25 @@ static 
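The put/get shims above are now the whole client-facing contract: a put first builds the pampd (compressing the page), then inserts it with tmem_put; a get copies or decompresses back out, optionally freeing the zpage in the same step. A hedged caller sketch against the new signatures (struct tmem_oid abbreviated to its known layout; in the kernel the put must run under local_irq_save(), as the frontswap shim shows):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct tmem_oid { uint64_t oid[3]; };

#define LOCAL_CLIENT ((uint16_t)-1)	/* zcache's local-client id */
#define PAGE_SIZE 4096

int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp,
		    uint32_t index, void *page,
		    unsigned int size, bool raw, int ephemeral);
int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp,
		    uint32_t index, void *page,
		    size_t *sizep, bool raw, int get_and_free);

static int roundtrip(int pool_id, struct tmem_oid *oid, uint32_t index,
		     void *pageframe)
{
	size_t size;

	/* raw=false: compressed on put, decompressed on get */
	if (zcache_put_page(LOCAL_CLIENT, pool_id, oid, index, pageframe,
			    PAGE_SIZE, false, 1 /* ephemeral */) < 0)
		return -1;
	/* get_and_free=0 leaves the zpage in place, as the cleancache
	 * shim does; frontswap passes 1 or -1 depending on exclusive gets */
	return zcache_get_page(LOCAL_CLIENT, pool_id, oid, index, pageframe,
			       &size, false, 0);
}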
int zcache_flush_object(int cli_id, int pool_id, return ret; } -static int zcache_destroy_pool(int cli_id, int pool_id) +static int zcache_client_destroy_pool(int cli_id, int pool_id) { struct tmem_pool *pool = NULL; - struct zcache_client *cli; + struct zcache_client *cli = NULL; int ret = -1; if (pool_id < 0) goto out; - - cli = get_zcache_client(cli_id); + if (cli_id == LOCAL_CLIENT) + cli = &zcache_host; + else if ((unsigned int)cli_id < MAX_CLIENTS) + cli = &zcache_clients[cli_id]; if (cli == NULL) goto out; - atomic_inc(&cli->refcount); - pool = idr_find(&cli->tmem_pools, pool_id); + pool = cli->tmem_pools[pool_id]; if (pool == NULL) goto out; - idr_remove(&cli->tmem_pools, pool_id); + cli->tmem_pools[pool_id] = NULL; /* wait for pool activity on other cpus to quiesce */ while (atomic_read(&pool->refcount) != 0) ; @@ -1674,56 +1484,119 @@ static int zcache_destroy_pool(int cli_id, int pool_id) ret = tmem_destroy_pool(pool); local_bh_enable(); kfree(pool); - pr_info("zcache: destroyed pool id=%d, cli_id=%d\n", - pool_id, cli_id); + if (cli_id == LOCAL_CLIENT) + pr_info("%s: destroyed local pool id=%d\n", namestr, pool_id); + else + pr_info("%s: destroyed pool id=%d, client=%d\n", + namestr, pool_id, cli_id); out: return ret; } -static int zcache_new_pool(uint16_t cli_id, uint32_t flags) +int zcache_new_pool(uint16_t cli_id, uint32_t flags) { int poolid = -1; struct tmem_pool *pool; struct zcache_client *cli = NULL; - int r; - cli = get_zcache_client(cli_id); + if (cli_id == LOCAL_CLIENT) + cli = &zcache_host; + else if ((unsigned int)cli_id < MAX_CLIENTS) + cli = &zcache_clients[cli_id]; if (cli == NULL) goto out; - atomic_inc(&cli->refcount); pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC); - if (pool == NULL) { - pr_info("zcache: pool creation failed: out of memory\n"); + if (pool == NULL) + goto out; + + for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++) + if (cli->tmem_pools[poolid] == NULL) + break; + if (poolid >= MAX_POOLS_PER_CLIENT) { + pr_info("%s: pool creation failed: max exceeded\n", namestr); + kfree(pool); + poolid = -1; goto out; } + atomic_set(&pool->refcount, 0); + pool->client = cli; + pool->pool_id = poolid; + tmem_new_pool(pool, flags); + cli->tmem_pools[poolid] = pool; + if (cli_id == LOCAL_CLIENT) + pr_info("%s: created %s local tmem pool, id=%d\n", namestr, + flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", + poolid); + else + pr_info("%s: created %s tmem pool, id=%d, client=%d\n", namestr, + flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", + poolid, cli_id); +out: + if (cli != NULL) + atomic_dec(&cli->refcount); + return poolid; +} - do { - r = idr_pre_get(&cli->tmem_pools, GFP_ATOMIC); - if (r != 1) { - kfree(pool); - pr_info("zcache: pool creation failed: out of memory\n"); +static int zcache_local_new_pool(uint32_t flags) +{ + return zcache_new_pool(LOCAL_CLIENT, flags); +} + +int zcache_autocreate_pool(unsigned int cli_id, unsigned int pool_id, bool eph) +{ + struct tmem_pool *pool; + struct zcache_client *cli; + uint32_t flags = eph ? 
0 : TMEM_POOL_PERSIST; + int ret = -1; + + BUG_ON(!ramster_enabled); + if (cli_id == LOCAL_CLIENT) + goto out; + if (pool_id >= MAX_POOLS_PER_CLIENT) + goto out; + if (cli_id >= MAX_CLIENTS) + goto out; + + cli = &zcache_clients[cli_id]; + if ((eph && disable_cleancache) || (!eph && disable_frontswap)) { + pr_err("zcache_autocreate_pool: pool type disabled\n"); + goto out; + } + if (!cli->allocated) { + if (zcache_new_client(cli_id)) { + pr_err("zcache_autocreate_pool: can't create client\n"); goto out; } - r = idr_get_new(&cli->tmem_pools, pool, &poolid); - } while (r == -EAGAIN); - if (r) { - pr_info("zcache: pool creation failed: error %d\n", r); - kfree(pool); + cli = &zcache_clients[cli_id]; + } + atomic_inc(&cli->refcount); + pool = cli->tmem_pools[pool_id]; + if (pool != NULL) { + if (pool->persistent && eph) { + pr_err("zcache_autocreate_pool: type mismatch\n"); + goto out; + } + ret = 0; goto out; } + pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL); + if (pool == NULL) + goto out; atomic_set(&pool->refcount, 0); pool->client = cli; - pool->pool_id = poolid; + pool->pool_id = pool_id; tmem_new_pool(pool, flags); - pr_info("zcache: created %s tmem pool, id=%d, client=%d\n", - flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", - poolid, cli_id); + cli->tmem_pools[pool_id] = pool; + pr_info("%s: AUTOcreated %s tmem poolid=%d, for remote client=%d\n", + namestr, flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", + pool_id, cli_id); + ret = 0; out: if (cli != NULL) atomic_dec(&cli->refcount); - return poolid; + return ret; } /********** @@ -1734,7 +1607,6 @@ out: * to translate in-kernel semantics to zcache semantics. */ -#ifdef CONFIG_CLEANCACHE static void zcache_cleancache_put_page(int pool_id, struct cleancache_filekey key, pgoff_t index, struct page *page) @@ -1742,8 +1614,13 @@ static void zcache_cleancache_put_page(int pool_id, u32 ind = (u32) index; struct tmem_oid oid = *(struct tmem_oid *)&key; + if (!disable_cleancache_ignore_nonactive && !PageWasActive(page)) { + zcache_eph_nonactive_puts_ignored++; + return; + } if (likely(ind == index)) - (void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index, page); + (void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index, + page, PAGE_SIZE, false, 1); } static int zcache_cleancache_get_page(int pool_id, @@ -1752,10 +1629,16 @@ static int zcache_cleancache_get_page(int pool_id, { u32 ind = (u32) index; struct tmem_oid oid = *(struct tmem_oid *)&key; + size_t size; int ret = -1; - if (likely(ind == index)) - ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index, page); + if (likely(ind == index)) { + ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index, + page, &size, false, 0); + BUG_ON(ret >= 0 && size != PAGE_SIZE); + if (ret == 0) + SetPageWasActive(page); + } return ret; } @@ -1781,7 +1664,7 @@ static void zcache_cleancache_flush_inode(int pool_id, static void zcache_cleancache_flush_fs(int pool_id) { if (pool_id >= 0) - (void)zcache_destroy_pool(LOCAL_CLIENT, pool_id); + (void)zcache_client_destroy_pool(LOCAL_CLIENT, pool_id); } static int zcache_cleancache_init_fs(size_t pagesize) @@ -1789,7 +1672,7 @@ static int zcache_cleancache_init_fs(size_t pagesize) BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); BUG_ON(pagesize != PAGE_SIZE); - return zcache_new_pool(LOCAL_CLIENT, 0); + return zcache_local_new_pool(0); } static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize) @@ -1798,7 +1681,7 @@ static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize) 
BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); BUG_ON(pagesize != PAGE_SIZE); - return zcache_new_pool(LOCAL_CLIENT, 0); + return zcache_local_new_pool(0); } static struct cleancache_ops zcache_cleancache_ops = { @@ -1818,17 +1701,15 @@ struct cleancache_ops zcache_cleancache_register_ops(void) return old_ops; } -#endif -#ifdef CONFIG_FRONTSWAP /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ -static int zcache_frontswap_poolid = -1; +static int zcache_frontswap_poolid __read_mostly = -1; /* * Swizzling increases objects per swaptype, increasing tmem concurrency * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS * Setting SWIZ_BITS to 27 basically reconstructs the swap entry from - * frontswap_load(), but has side-effects. Hence using 8. + * frontswap_get_page(), but has side-effects. Hence using 8. */ #define SWIZ_BITS 8 #define SWIZ_MASK ((1 << SWIZ_BITS) - 1) @@ -1842,8 +1723,18 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind) return oid; } -static int zcache_frontswap_store(unsigned type, pgoff_t offset, - struct page *page) +#ifdef CONFIG_ZCACHE_WRITEBACK +static void unswiz(struct tmem_oid oid, u32 index, + unsigned *type, pgoff_t *offset) +{ + *type = (unsigned)(oid.oid[0] >> SWIZ_BITS); + *offset = (pgoff_t)((index << SWIZ_BITS) | + (oid.oid[0] & SWIZ_MASK)); +} +#endif + +static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, + struct page *page) { u64 ind64 = (u64)offset; u32 ind = (u32)offset; @@ -1852,29 +1743,44 @@ static int zcache_frontswap_store(unsigned type, pgoff_t offset, unsigned long flags; BUG_ON(!PageLocked(page)); + if (!disable_frontswap_ignore_nonactive && !PageWasActive(page)) { + zcache_pers_nonactive_puts_ignored++; + ret = -ERANGE; + goto out; + } if (likely(ind64 == ind)) { local_irq_save(flags); ret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid, - &oid, iswiz(ind), page); + &oid, iswiz(ind), + page, PAGE_SIZE, false, 0); local_irq_restore(flags); } +out: return ret; } /* returns 0 if the page was successfully gotten from frontswap, -1 if it * was not present (should never happen!) 
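Swizzling is pure bit arithmetic: the low SWIZ_BITS of the swap offset are folded into the tmem object id together with the type, and the rest becomes the page index, so consecutive swap slots spread across 256 objects per swapfile. The body of oswiz() is elided by the hunk above, but it can be reconstructed from unswiz(); a runnable round-trip check of that reconstruction:

#include <assert.h>
#include <stdio.h>

#define SWIZ_BITS 8
#define SWIZ_MASK ((1 << SWIZ_BITS) - 1)

static unsigned long long swiz_oid(unsigned type, unsigned long long offset)
{
	return ((unsigned long long)type << SWIZ_BITS) | (offset & SWIZ_MASK);
}

static unsigned swiz_index(unsigned long long offset)
{
	return offset >> SWIZ_BITS;
}

/* Inverse, as in the CONFIG_ZCACHE_WRITEBACK unswiz() above. */
static void unswiz(unsigned long long oid0, unsigned index,
		   unsigned *type, unsigned long long *offset)
{
	*type = oid0 >> SWIZ_BITS;
	*offset = ((unsigned long long)index << SWIZ_BITS) | (oid0 & SWIZ_MASK);
}

int main(void)
{
	unsigned type;
	unsigned long long off;

	unswiz(swiz_oid(2, 0x12345), swiz_index(0x12345), &type, &off);
	assert(type == 2 && off == 0x12345);
	printf("round-trip ok\n");
	return 0;
}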
*/ -static int zcache_frontswap_load(unsigned type, pgoff_t offset, - struct page *page) +static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, + struct page *page) { u64 ind64 = (u64)offset; u32 ind = (u32)offset; struct tmem_oid oid = oswiz(type, ind); - int ret = -1; + size_t size; + int ret = -1, get_and_free; + if (frontswap_has_exclusive_gets) + get_and_free = 1; + else + get_and_free = -1; BUG_ON(!PageLocked(page)); - if (likely(ind64 == ind)) + if (likely(ind64 == ind)) { ret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid, - &oid, iswiz(ind), page); + &oid, iswiz(ind), + page, &size, false, get_and_free); + BUG_ON(ret >= 0 && size != PAGE_SIZE); + } return ret; } @@ -1908,12 +1814,12 @@ static void zcache_frontswap_init(unsigned ignored) /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ if (zcache_frontswap_poolid < 0) zcache_frontswap_poolid = - zcache_new_pool(LOCAL_CLIENT, TMEM_POOL_PERSIST); + zcache_local_new_pool(TMEM_POOL_PERSIST); } static struct frontswap_ops zcache_frontswap_ops = { - .store = zcache_frontswap_store, - .load = zcache_frontswap_load, + .store = zcache_frontswap_put_page, + .load = zcache_frontswap_get_page, .invalidate_page = zcache_frontswap_flush_page, .invalidate_area = zcache_frontswap_flush_area, .init = zcache_frontswap_init @@ -1926,16 +1832,13 @@ struct frontswap_ops zcache_frontswap_register_ops(void) return old_ops; } -#endif /* * zcache initialization - * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR - * NOTHING HAPPENS! + * NOTE FOR NOW zcache or ramster MUST BE PROVIDED AS A KERNEL BOOT PARAMETER + * OR NOTHING HAPPENS! */ -static int zcache_enabled; - static int __init enable_zcache(char *s) { zcache_enabled = 1; @@ -1943,28 +1846,58 @@ static int __init enable_zcache(char *s) } __setup("zcache", enable_zcache); -/* allow independent dynamic disabling of cleancache and frontswap */ +static int __init enable_ramster(char *s) +{ + zcache_enabled = 1; +#ifdef CONFIG_RAMSTER + ramster_enabled = 1; +#endif + return 1; +} +__setup("ramster", enable_ramster); -static int use_cleancache = 1; +/* allow independent dynamic disabling of cleancache and frontswap */ static int __init no_cleancache(char *s) { - use_cleancache = 0; + disable_cleancache = 1; return 1; } __setup("nocleancache", no_cleancache); -static int use_frontswap = 1; - static int __init no_frontswap(char *s) { - use_frontswap = 0; + disable_frontswap = 1; return 1; } __setup("nofrontswap", no_frontswap); +static int __init no_frontswap_exclusive_gets(char *s) +{ + frontswap_has_exclusive_gets = false; + return 1; +} + +__setup("nofrontswapexclusivegets", no_frontswap_exclusive_gets); + +static int __init no_frontswap_ignore_nonactive(char *s) +{ + disable_frontswap_ignore_nonactive = 1; + return 1; +} + +__setup("nofrontswapignorenonactive", no_frontswap_ignore_nonactive); + +static int __init no_cleancache_ignore_nonactive(char *s) +{ + disable_cleancache_ignore_nonactive = 1; + return 1; +} + +__setup("nocleancacheignorenonactive", no_cleancache_ignore_nonactive); + static int __init enable_zcache_compressor(char *s) { strncpy(zcache_comp_name, s, ZCACHE_COMP_NAME_SZ); @@ -2007,14 +1940,13 @@ static int __init zcache_init(void) { int ret = 0; -#ifdef CONFIG_SYSFS - ret = sysfs_create_group(mm_kobj, &zcache_attr_group); - if (ret) { - pr_err("zcache: can't create sysfs\n"); - goto out; + if (ramster_enabled) { + namestr = "ramster"; + ramster_register_pamops(&zcache_pamops); } -#endif /* CONFIG_SYSFS */ - +#ifdef 
CONFIG_DEBUG_FS + zcache_debugfs_init(); +#endif if (zcache_enabled) { unsigned int cpu; @@ -2022,12 +1954,13 @@ static int __init zcache_init(void) tmem_register_pamops(&zcache_pamops); ret = register_cpu_notifier(&zcache_cpu_notifier_block); if (ret) { - pr_err("zcache: can't register cpu notifier\n"); + pr_err("%s: can't register cpu notifier\n", namestr); goto out; } ret = zcache_comp_init(); if (ret) { - pr_err("zcache: compressor initialization failed\n"); + pr_err("%s: compressor initialization failed\n", + namestr); goto out; } for_each_online_cpu(cpu) { @@ -2042,36 +1975,45 @@ static int __init zcache_init(void) sizeof(struct tmem_obj), 0, 0, NULL); ret = zcache_new_client(LOCAL_CLIENT); if (ret) { - pr_err("zcache: can't create client\n"); + pr_err("%s: can't create client\n", namestr); goto out; } - -#ifdef CONFIG_CLEANCACHE - if (zcache_enabled && use_cleancache) { + zbud_init(); + if (zcache_enabled && !disable_cleancache) { struct cleancache_ops old_ops; - zbud_init(); register_shrinker(&zcache_shrinker); old_ops = zcache_cleancache_register_ops(); - pr_info("zcache: cleancache enabled using kernel " - "transcendent memory and compression buddies\n"); + pr_info("%s: cleancache enabled using kernel transcendent " + "memory and compression buddies\n", namestr); +#ifdef ZCACHE_DEBUG + pr_info("%s: cleancache: ignorenonactive = %d\n", + namestr, !disable_cleancache_ignore_nonactive); +#endif if (old_ops.init_fs != NULL) - pr_warning("zcache: cleancache_ops overridden"); + pr_warn("%s: cleancache_ops overridden\n", namestr); } -#endif -#ifdef CONFIG_FRONTSWAP - if (zcache_enabled && use_frontswap) { + if (zcache_enabled && !disable_frontswap) { struct frontswap_ops old_ops; old_ops = zcache_frontswap_register_ops(); - pr_info("zcache: frontswap enabled using kernel " - "transcendent memory and zsmalloc\n"); + if (frontswap_has_exclusive_gets) + frontswap_tmem_exclusive_gets(true); + pr_info("%s: frontswap enabled using kernel transcendent " - "memory and zsmalloc\n"); + "memory and compression buddies\n", namestr); +#ifdef ZCACHE_DEBUG + pr_info("%s: frontswap: excl gets = %d active only = %d\n", + namestr, frontswap_has_exclusive_gets, + !disable_frontswap_ignore_nonactive); +#endif if (old_ops.init != NULL) - pr_warning("zcache: frontswap_ops overridden"); + pr_warn("%s: frontswap_ops overridden\n", namestr); } -#endif + if (ramster_enabled) + ramster_init(!disable_cleancache, !disable_frontswap, + frontswap_has_exclusive_gets); out: return ret; } -module_init(zcache_init) +late_initcall(zcache_init);
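Since, as the NOTE earlier says, nothing happens without a boot parameter, enabling the code means appending one of the __setup() tokens defined above to the kernel command line, for example:

	[existing parameters] zcache

with `ramster` used instead on a ramster node, and `nocleancache` / `nofrontswap` available to disable either frontend independently. (A compressor-selection token is handled by enable_zcache_compressor(), but its __setup() string falls outside the hunks shown here, so it is not reproduced.)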