kmem_cache hardening and performance improvements
- Added a slab work queue task which gradually ages and frees slabs from the cache when they have not been used recently.
- Optimized the slab packing algorithm to ensure each slab contains the maximum number of objects without creating too large a slab.
- Fixed a deadlock: we can never call kv_free() under the skc_lock. We now unlink the objects and slabs from the cache itself and attach them to a private work list. The contents of the list are then freed outside the spin lock (see the sketch after this list).
- Moved the magazine create/destroy operations on to the local cpu.
- Further performance optimizations by minimizing use of the large per-cache skc_lock. This includes the addition of a KMC_BIT_REAPING bit mask which is used to prevent concurrent reaping, and to defer new slab creation while reaping is occurring.
- Added a KMC_BIT_DESTROYING bit mask which is set when the cache is being destroyed; it is used to catch any task accessing the cache while it is being destroyed.
- Added comments to all the functions and additional comments to try and make everything as clear as possible.
- Major cleanup and additions to the SPLAT kmem tests to more rigorously stress the cache implementation and look for any problems. This includes correctness and performance tests.
- Updated the portable work queue interfaces.
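A minimal sketch of the deferred-free pattern described in the deadlock fix above, using generic names (example_cache, example_slab) rather than the SPL types: victims are only unlinked onto a private list while the spinlock is held, and the actual frees happen after the lock is dropped. This is the same idea the reworked spl_slab_free()/spl_slab_reclaim() below implement with their sks_list/sko_list arguments.

	#include <linux/list.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct example_slab {
		struct list_head node;
		int refcount;
	};

	struct example_cache {
		spinlock_t lock;
		struct list_head partial_list;
	};

	/* Unlink idle slabs under the lock, free them only after dropping it. */
	static void example_reclaim(struct example_cache *cache)
	{
		struct example_slab *slab, *tmp;
		LIST_HEAD(private_list);

		spin_lock(&cache->lock);
		list_for_each_entry_safe(slab, tmp, &cache->partial_list, node) {
			if (slab->refcount == 0)
				list_move(&slab->node, &private_list);
		}
		spin_unlock(&cache->lock);

		/* The free path may sleep, so it must run outside the spinlock. */
		list_for_each_entry_safe(slab, tmp, &private_list, node) {
			list_del(&slab->node);
			kfree(slab);
		}
	}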
This commit is contained in:
parent 34e71c9e97
commit ea3e6ca9e5
@@ -45,6 +45,7 @@ extern "C" {
#include <asm/atomic_compat.h>
#include <sys/types.h>
#include <sys/debug.h>
#include <sys/workqueue.h>

/*
 * Memory allocation interfaces

@@ -161,17 +162,32 @@ kmem_alloc_tryhard(size_t size, size_t *alloc_size, int kmflags)
/*
 * Slab allocation interfaces
 */
#define KMC_NOTOUCH 0x00000001
#define KMC_NODEBUG 0x00000002 /* Default behavior */
#define KMC_NOMAGAZINE 0x00000004 /* XXX: No disable support available */
#define KMC_NOHASH 0x00000008 /* XXX: No hash available */
#define KMC_QCACHE 0x00000010 /* XXX: Unsupported */
#define KMC_KMEM 0x00000100 /* Use kmem cache */
#define KMC_VMEM 0x00000200 /* Use vmem cache */
#define KMC_OFFSLAB 0x00000400 /* Objects not on slab */
enum {
	KMC_BIT_NOTOUCH = 0, /* Don't update ages */
	KMC_BIT_NODEBUG = 1, /* Default behavior */
	KMC_BIT_NOMAGAZINE = 2, /* XXX: Unsupported */
	KMC_BIT_NOHASH = 3, /* XXX: Unsupported */
	KMC_BIT_QCACHE = 4, /* XXX: Unsupported */
	KMC_BIT_KMEM = 5, /* Use kmem cache */
	KMC_BIT_VMEM = 6, /* Use vmem cache */
	KMC_BIT_OFFSLAB = 7, /* Objects not on slab */
	KMC_BIT_REAPING = 16, /* Reaping in progress */
	KMC_BIT_DESTROY = 17, /* Destroy in progress */
};

#define KMC_REAP_CHUNK 256
#define KMC_DEFAULT_SEEKS DEFAULT_SEEKS
#define KMC_NOTOUCH (1 << KMC_BIT_NOTOUCH)
#define KMC_NODEBUG (1 << KMC_BIT_NODEBUG)
#define KMC_NOMAGAZINE (1 << KMC_BIT_NOMAGAZINE)
#define KMC_NOHASH (1 << KMC_BIT_NOHASH)
#define KMC_QCACHE (1 << KMC_BIT_QCACHE)
#define KMC_KMEM (1 << KMC_BIT_KMEM)
#define KMC_VMEM (1 << KMC_BIT_VMEM)
#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB)
#define KMC_REAPING (1 << KMC_BIT_REAPING)
#define KMC_DESTROY (1 << KMC_BIT_DESTROY)

#define KMC_REAP_CHUNK INT_MAX
#define KMC_DEFAULT_SEEKS 1

#ifdef DEBUG_KMEM_UNIMPLEMENTED
static __inline__ void kmem_init(void) {
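A minimal sketch, not the SPL code itself, of how the new KMC_BIT_REAPING bit is intended to be used with the kernel's atomic bitops; test_and_set_bit()/clear_bit() operate on an unsigned long flags word, so the cache's flag word is assumed to be usable that way here.

	#include <linux/bitops.h>

	/* Sketch only: skip the reap if another thread already holds the bit. */
	static int example_try_reap(unsigned long *flags)
	{
		if (test_and_set_bit(KMC_BIT_REAPING, flags))
			return 0; /* reap already in progress */

		/* ... release eligible slabs here ... */

		clear_bit(KMC_BIT_REAPING, flags);
		return 1;
	}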
@@ -223,9 +239,10 @@ extern struct rw_semaphore spl_kmem_cache_sem;
#define SKS_MAGIC 0x22222222
#define SKC_MAGIC 0x2c2c2c2c

#define SPL_KMEM_CACHE_DELAY 5
#define SPL_KMEM_CACHE_OBJ_PER_SLAB 32
#define SPL_KMEM_CACHE_ALIGN 8
#define SPL_KMEM_CACHE_DELAY 5 /* Minimum slab release age */
#define SPL_KMEM_CACHE_OBJ_PER_SLAB 32 /* Target objects per slab */
#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 8 /* Minimum objects per slab */
#define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */

typedef int (*spl_kmem_ctor_t)(void *, void *, int);
typedef void (*spl_kmem_dtor_t)(void *, void *);

@@ -258,24 +275,28 @@ typedef struct spl_kmem_slab {
} spl_kmem_slab_t;

typedef struct spl_kmem_cache {
	uint32_t skc_magic; /* Sanity magic */
	uint32_t skc_name_size; /* Name length */
	char *skc_name; /* Name string */
	uint32_t skc_magic; /* Sanity magic */
	uint32_t skc_name_size; /* Name length */
	char *skc_name; /* Name string */
	spl_kmem_magazine_t *skc_mag[NR_CPUS]; /* Per-CPU warm cache */
	uint32_t skc_mag_size; /* Magazine size */
	uint32_t skc_mag_refill; /* Magazine refill count */
	spl_kmem_ctor_t skc_ctor; /* Constructor */
	spl_kmem_dtor_t skc_dtor; /* Destructor */
	spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */
	void *skc_private; /* Private data */
	void *skc_vmp; /* Unused */
	spl_kmem_ctor_t skc_ctor; /* Constructor */
	spl_kmem_dtor_t skc_dtor; /* Destructor */
	spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */
	void *skc_private; /* Private data */
	void *skc_vmp; /* Unused */
	uint32_t skc_flags; /* Flags */
	uint32_t skc_obj_size; /* Object size */
	uint32_t skc_obj_align; /* Object alignment */
	uint32_t skc_slab_objs; /* Objects per slab */
	uint32_t skc_slab_size; /* Slab size */
	uint32_t skc_delay; /* slab reclaim interval */
	struct list_head skc_list; /* List of caches linkage */
	uint32_t skc_slab_size; /* Slab size */
	uint32_t skc_delay; /* Slab reclaim interval */
	atomic_t skc_ref; /* Ref count callers */
	struct delayed_work skc_work; /* Slab reclaim work */
	struct work_struct work;
	struct timer_list timer;
	struct list_head skc_list; /* List of caches linkage */
	struct list_head skc_complete_list;/* Completely alloc'ed */
	struct list_head skc_partial_list; /* Partially alloc'ed */
	spinlock_t skc_lock; /* Cache lock */

@@ -283,7 +304,7 @@ typedef struct spl_kmem_cache {
	uint64_t skc_slab_create;/* Slab creates */
	uint64_t skc_slab_destroy;/* Slab destroys */
	uint64_t skc_slab_total; /* Slab total current */
	uint64_t skc_slab_alloc; /* Slab alloc current */
	uint64_t skc_slab_alloc; /* Slab alloc current */
	uint64_t skc_slab_max; /* Slab max historic */
	uint64_t skc_obj_total; /* Obj total current */
	uint64_t skc_obj_alloc; /* Obj alloc current */

@@ -203,18 +203,6 @@ extern int ddi_strtoul(const char *str, char **nptr,
#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
#endif

#ifdef HAVE_3ARGS_INIT_WORK

#define spl_init_work(wq,cb,d) INIT_WORK((wq), (void *)(cb), (void *)(d))
#define spl_get_work_data(type,field,data) (data)

#else

#define spl_init_work(wq,cb,d) INIT_WORK((wq), (void *)(cb));
#define spl_get_work_data(type,field,data) container_of(data,type,field)

#endif

#ifdef __cplusplus
}
#endif

@@ -35,8 +35,7 @@
extern vmem_t *zio_alloc_arena; /* arena for zio caches */

#define physmem num_physpages
#define freemem nr_free_pages() // Expensive on linux,
                                // cheap on solaris
#define freemem nr_free_pages()
#define minfree 0
#define needfree 0 /* # of needed pages */
#define ptob(pages) (pages * PAGE_SIZE)

@@ -132,10 +132,6 @@ EXPORT_SYMBOL(kmem_set_warning);
 * small virtual address space on 32bit arches. This will seriously
 * constrain the size of the slab caches and their performance.
 *
 * XXX: Implement work requests to keep an eye on each cache and
 * shrink them via spl_slab_reclaim() when they are wasting lots
 * of space. Currently this process is driven by the reapers.
 *
 * XXX: Improve the partial slab list by carefully maintaining a
 * strict ordering of fullest to emptiest slabs based on
 * the slab reference count. This guarantees that when freeing
@@ -571,7 +567,8 @@ kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
	}
}

/* It's important that we pack the spl_kmem_obj_t structure and the
/*
 * It's important that we pack the spl_kmem_obj_t structure and the
 * actual objects in to one large address space to minimize the number
 * of calls to the allocator. It is far better to do a few large
 * allocations and then subdivide it ourselves. Now which allocator
@@ -662,14 +659,17 @@ out:
	RETURN(sks);
}

/* Removes slab from complete or partial list, so it must
 * be called with the 'skc->skc_lock' held.
/*
 * Remove a slab from complete or partial list, it must be called with
 * the 'skc->skc_lock' held but the actual free must be performed
 * outside the lock to prevent deadlocking on vmem addresses.
 */
static void
spl_slab_free(spl_kmem_slab_t *sks) {
spl_slab_free(spl_kmem_slab_t *sks,
	      struct list_head *sks_list, struct list_head *sko_list)
{
	spl_kmem_cache_t *skc;
	spl_kmem_obj_t *sko, *n;
	int size;
	ENTRY;

	ASSERT(sks->sks_magic == SKS_MAGIC);

@@ -682,114 +682,190 @@ spl_slab_free(spl_kmem_slab_t *sks) {
	skc->skc_obj_total -= sks->sks_objs;
	skc->skc_slab_total--;
	list_del(&sks->sks_list);
	size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) +
	       P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align);

	/* Run destructors as the slab is being released */
	list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
		ASSERT(sko->sko_magic == SKO_MAGIC);
		list_del(&sko->sko_list);

		if (skc->skc_dtor)
			skc->skc_dtor(sko->sko_addr, skc->skc_private);

		if (skc->skc_flags & KMC_OFFSLAB)
			kv_free(skc, sko->sko_addr, size);
		list_add(&sko->sko_list, sko_list);
	}

	kv_free(skc, sks, skc->skc_slab_size);
	list_add(&sks->sks_list, sks_list);
	EXIT;
}

static int
__spl_slab_reclaim(spl_kmem_cache_t *skc)
/*
 * Traverses all the partial slabs attached to a cache and free those
 * which are currently empty, and have not been touched for
 * skc_delay seconds. This is to avoid thrashing.
 */
static void
spl_slab_reclaim(spl_kmem_cache_t *skc, int flag)
{
	spl_kmem_slab_t *sks, *m;
	int rc = 0;
	spl_kmem_obj_t *sko, *n;
	LIST_HEAD(sks_list);
	LIST_HEAD(sko_list);
	int size;
	ENTRY;

	ASSERT(spin_is_locked(&skc->skc_lock));
	/*
	 * Free empty slabs which have not been touched in skc_delay
	 * seconds. This delay time is important to avoid thrashing.
	 * Empty slabs will be at the end of the skc_partial_list.
	 * Move empty slabs and objects which have not been touched in
	 * skc_delay seconds on to private lists to be freed outside
	 * the spin lock. This delay time is important to avoid
	 * thrashing however when flag is set the delay will not be
	 * used. Empty slabs will be at the end of the skc_partial_list.
	 */
	spin_lock(&skc->skc_lock);
	list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
					 sks_list) {
		if (sks->sks_ref > 0)
			break;

		if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ)) {
			spl_slab_free(sks);
			rc++;
		}
		if (flag || time_after(jiffies,sks->sks_age+skc->skc_delay*HZ))
			spl_slab_free(sks, &sks_list, &sko_list);
	}

	/* Returns number of slabs reclaimed */
	RETURN(rc);
}

static int
spl_slab_reclaim(spl_kmem_cache_t *skc)
{
	int rc;
	ENTRY;

	spin_lock(&skc->skc_lock);
	rc = __spl_slab_reclaim(skc);
	spin_unlock(&skc->skc_lock);

	RETURN(rc);
	/*
	 * We only have list of spl_kmem_obj_t's if they are located off
	 * the slab, otherwise they get freed with the spl_kmem_slab_t.
	 */
	if (!list_empty(&sko_list)) {
		ASSERT(skc->skc_flags & KMC_OFFSLAB);

		size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) +
		       P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align);

		list_for_each_entry_safe(sko, n, &sko_list, sko_list)
			kv_free(skc, sko->sko_addr, size);
	}

	list_for_each_entry_safe(sks, m, &sks_list, sks_list)
		kv_free(skc, sks, skc->skc_slab_size);

	EXIT;
}

/* Size slabs properly to ensure they are not too large */
/*
 * Called regularly on all caches to age objects out of the magazines
 * which have not been accessed in skc->skc_delay seconds. This prevents
 * idle magazines from holding memory which might be better used by
 * other caches or parts of the system. The delay is present to
 * prevent thrashing the magazine.
 */
static void
spl_magazine_age(void *data)
{
	spl_kmem_cache_t *skc = data;
	spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];

	if (skm->skm_avail > 0 &&
	    time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
		(void)spl_cache_flush(skc, skm, skm->skm_refill);
}

/*
 * Called regularly to keep a downward pressure on the size of idle
 * magazines and to release free slabs from the cache. This function
 * never calls the registered reclaim function, that only occurs
 * under memory pressure or with a direct call to spl_kmem_reap().
 */
static void
spl_cache_age(void *data)
{
	spl_kmem_cache_t *skc =
		spl_get_work_data(data, spl_kmem_cache_t, skc_work.work);

	ASSERT(skc->skc_magic == SKC_MAGIC);
	on_each_cpu(spl_magazine_age, skc, 0, 1);
	spl_slab_reclaim(skc, 0);

	if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
		schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ);
}

/*
 * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
 * When on-slab we want to target SPL_KMEM_CACHE_OBJ_PER_SLAB. However,
 * for very small objects we may end up with more than this so as not
 * to waste space in the minimal allocation of a single page. Also for
 * very large objects we may use as few as SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN,
 * lower than this and we will fail.
 */
static int
spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
{
	int max = ((uint64_t)1 << (MAX_ORDER - 1)) * PAGE_SIZE;
	int align = skc->skc_obj_align;

	*objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
	int sks_size, obj_size, max_size, align;

	if (skc->skc_flags & KMC_OFFSLAB) {
		*objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
		*size = sizeof(spl_kmem_slab_t);
	} else {
resize:
		*size = P2ROUNDUP(sizeof(spl_kmem_slab_t), align) +
			*objs * (P2ROUNDUP(skc->skc_obj_size, align) +
			P2ROUNDUP(sizeof(spl_kmem_obj_t), align));
		align = skc->skc_obj_align;
		sks_size = P2ROUNDUP(sizeof(spl_kmem_slab_t), align);
		obj_size = P2ROUNDUP(skc->skc_obj_size, align) +
			   P2ROUNDUP(sizeof(spl_kmem_obj_t), align);

		if (*size > max)
			GOTO(resize, *objs = *objs - 1);
		if (skc->skc_flags & KMC_KMEM)
			max_size = ((uint64_t)1 << (MAX_ORDER-1)) * PAGE_SIZE;
		else
			max_size = (32 * 1024 * 1024);

		ASSERT(*objs > 0);
		for (*size = PAGE_SIZE; *size <= max_size; *size += PAGE_SIZE) {
			*objs = (*size - sks_size) / obj_size;
			if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB)
				RETURN(0);
		}

		/*
		 * Unable to satisfy target objects per slab, fallback to
		 * allocating a maximally sized slab and assuming it can
		 * contain the minimum objects count use it. If not fail.
		 */
		*size = max_size;
		*objs = (*size - sks_size) / obj_size;
		if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN)
			RETURN(0);
	}

	ASSERTF(*size <= max, "%d < %d\n", *size, max);
	RETURN(0);
	RETURN(-ENOSPC);
}

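A rough worked example of the new sizing loop, using assumed numbers (4 KiB pages, an aligned object plus its spl_kmem_obj_t header totalling 256 bytes, and a slab header rounded to 128 bytes): at *size = 4096 the loop yields (4096 - 128) / 256 = 15 objects, at 8192 it yields 31, both short of the 32-object target; at *size = 12288 it yields 47 >= 32, so a three-page slab is selected. Only if even the maximally sized slab cannot hold SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN objects does the function return -ENOSPC.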
/*
 * Make a guess at reasonable per-cpu magazine size based on the size of
 * each object and the cost of caching N of them in each magazine. Long
 * term this should really adapt based on an observed usage heuristic.
 */
static int
spl_magazine_size(spl_kmem_cache_t *skc)
{
	int size, align = skc->skc_obj_align;
	ENTRY;

	/* Guesses for reasonable magazine sizes, they
	 * should really adapt based on observed usage. */
	/* Per-magazine sizes below assume a 4Kib page size */
	if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 256))
		size = 4;
		size = 4; /* Minimum 4Mib per-magazine */
	else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 32))
		size = 16;
		size = 16; /* Minimum 2Mib per-magazine */
	else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE))
		size = 64;
		size = 64; /* Minimum 256Kib per-magazine */
	else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE / 4))
		size = 128;
		size = 128; /* Minimum 128Kib per-magazine */
	else
		size = 512;
		size = 256;

	RETURN(size);
}

/*
 * Allocate a per-cpu magazine to associate with a specific core.
 */
static spl_kmem_magazine_t *
spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
{

@@ -798,19 +874,21 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
	       sizeof(void *) * skc->skc_mag_size;
	ENTRY;

	skm = kmem_alloc_node(size, GFP_KERNEL, node);
	skm = kmem_alloc_node(size, GFP_KERNEL | __GFP_NOFAIL, node);
	if (skm) {
		skm->skm_magic = SKM_MAGIC;
		skm->skm_avail = 0;
		skm->skm_size = skc->skc_mag_size;
		skm->skm_refill = skc->skc_mag_refill;
		if (!(skc->skc_flags & KMC_NOTOUCH))
			skm->skm_age = jiffies;
		skm->skm_age = jiffies;
	}

	RETURN(skm);
}

/*
 * Free a per-cpu magazine associated with a specific core.
 */
static void
spl_magazine_free(spl_kmem_magazine_t *skm)
{

@@ -825,44 +903,72 @@ spl_magazine_free(spl_kmem_magazine_t *skm)
	EXIT;
}

static void
__spl_magazine_create(void *data)
{
	spl_kmem_cache_t *skc = data;
	int id = smp_processor_id();

	skc->skc_mag[id] = spl_magazine_alloc(skc, cpu_to_node(id));
	ASSERT(skc->skc_mag[id]);
}

/*
 * Create all per-cpu magazines of reasonable sizes.
 */
static int
spl_magazine_create(spl_kmem_cache_t *skc)
{
	int i;
	ENTRY;

	skc->skc_mag_size = spl_magazine_size(skc);
	skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;

	for_each_online_cpu(i) {
		skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
		if (!skc->skc_mag[i]) {
			for (i--; i >= 0; i--)
				spl_magazine_free(skc->skc_mag[i]);

			RETURN(-ENOMEM);
		}
	}
	skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
	on_each_cpu(__spl_magazine_create, skc, 0, 1);

	RETURN(0);
}

static void
__spl_magazine_destroy(void *data)
{
	spl_kmem_cache_t *skc = data;
	spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];

	(void)spl_cache_flush(skc, skm, skm->skm_avail);
	spl_magazine_free(skm);
}

/*
 * Destroy all per-cpu magazines.
 */
static void
spl_magazine_destroy(spl_kmem_cache_t *skc)
{
	spl_kmem_magazine_t *skm;
	int i;
	ENTRY;

	for_each_online_cpu(i) {
		skm = skc->skc_mag[i];
		(void)spl_cache_flush(skc, skm, skm->skm_avail);
		spl_magazine_free(skm);
	}

	on_each_cpu(__spl_magazine_destroy, skc, 0, 1);
	EXIT;
}

/*
 * Create an object cache based on the following arguments:
 * name		cache name
 * size		cache object size
 * align	cache object alignment
 * ctor		cache object constructor
 * dtor		cache object destructor
 * reclaim	cache object reclaim
 * priv		cache private data for ctor/dtor/reclaim
 * vmp		unused must be NULL
 * flags
 *	KMC_NOTOUCH	Disable cache object aging (unsupported)
 *	KMC_NODEBUG	Disable debugging (unsupported)
 *	KMC_NOMAGAZINE	Disable magazine (unsupported)
 *	KMC_NOHASH	Disable hashing (unsupported)
 *	KMC_QCACHE	Disable qcache (unsupported)
 *	KMC_KMEM	Force kmem backed cache
 *	KMC_VMEM	Force vmem backed cache
 *	KMC_OFFSLAB	Locate objects off the slab
 */
spl_kmem_cache_t *
spl_kmem_cache_create(char *name, size_t size, size_t align,
		      spl_kmem_ctor_t ctor,

@@ -908,6 +1014,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
	skc->skc_obj_size = size;
	skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
	skc->skc_delay = SPL_KMEM_CACHE_DELAY;
	atomic_set(&skc->skc_ref, 0);

	INIT_LIST_HEAD(&skc->skc_list);
	INIT_LIST_HEAD(&skc->skc_complete_list);

@@ -947,6 +1054,9 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
	if (rc)
		GOTO(out, rc);

	spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc);
	schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ);

	down_write(&spl_kmem_cache_sem);
	list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
	up_write(&spl_kmem_cache_sem);

@@ -959,10 +1069,13 @@ out:
}
EXPORT_SYMBOL(spl_kmem_cache_create);

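A hedged usage sketch of the cache API, with the argument order taken from the comment block above; the object type, constructor, and destructor here are hypothetical and only illustrate the expected calling sequence.

	/* Hypothetical object and callbacks, matching spl_kmem_ctor_t/spl_kmem_dtor_t. */
	typedef struct my_obj {
		int mo_state;
	} my_obj_t;

	static int
	my_obj_ctor(void *obj, void *priv, int flags)
	{
		((my_obj_t *)obj)->mo_state = 0;
		return (0);
	}

	static void
	my_obj_dtor(void *obj, void *priv)
	{
		/* Undo anything my_obj_ctor() set up. */
	}

	static void
	my_cache_example(void)
	{
		spl_kmem_cache_t *cache;
		my_obj_t *obj;

		/* name, size, align, ctor, dtor, reclaim, priv, vmp, flags */
		cache = spl_kmem_cache_create("my_cache", sizeof (my_obj_t),
					      SPL_KMEM_CACHE_ALIGN, my_obj_ctor,
					      my_obj_dtor, NULL, NULL, NULL,
					      KMC_KMEM);

		obj = spl_kmem_cache_alloc(cache, KM_SLEEP);
		/* ... use obj ... */
		spl_kmem_cache_free(cache, obj);

		spl_kmem_cache_destroy(cache);
	}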
/*
 * Destroy a cache and all objects associated with the cache.
 */
void
spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
{
	spl_kmem_slab_t *sks, *m;
	DECLARE_WAIT_QUEUE_HEAD(wq);
	ENTRY;

	ASSERT(skc->skc_magic == SKC_MAGIC);

@@ -971,20 +1084,27 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
	list_del_init(&skc->skc_list);
	up_write(&spl_kmem_cache_sem);

	/* Cancel and wait for any pending delayed work */
	ASSERT(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
	cancel_delayed_work(&skc->skc_work);
	flush_scheduled_work();

	/* Wait until all current callers complete, this is mainly
	 * to catch the case where a low memory situation triggers a
	 * cache reaping action which races with this destroy. */
	wait_event(wq, atomic_read(&skc->skc_ref) == 0);

	spl_magazine_destroy(skc);
	spl_slab_reclaim(skc, 1);
	spin_lock(&skc->skc_lock);

	/* Validate there are no objects in use and free all the
	 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
	ASSERT3U(skc->skc_slab_alloc, ==, 0);
	ASSERT3U(skc->skc_obj_alloc, ==, 0);
	ASSERT3U(skc->skc_slab_total, ==, 0);
	ASSERT3U(skc->skc_obj_total, ==, 0);
	ASSERT(list_empty(&skc->skc_complete_list));
	ASSERT(skc->skc_slab_alloc == 0);
	ASSERT(skc->skc_obj_alloc == 0);

	list_for_each_entry_safe(sks, m, &skc->skc_partial_list, sks_list)
		spl_slab_free(sks);

	ASSERT(skc->skc_slab_total == 0);
	ASSERT(skc->skc_obj_total == 0);

	kmem_free(skc->skc_name, skc->skc_name_size);
	spin_unlock(&skc->skc_lock);

@@ -995,6 +1115,10 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
}
EXPORT_SYMBOL(spl_kmem_cache_destroy);

/*
 * Allocate an object from a slab attached to the cache. This is used to
 * repopulate the per-cpu magazine caches in batches when they run low.
 */
static void *
spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
{

@@ -1030,10 +1154,11 @@ spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
	return sko->sko_addr;
}

/* No available objects create a new slab. Since this is an
 * expensive operation we do it without holding the spinlock
 * and only briefly acquire it when we link in the fully
 * allocated and constructed slab.
/*
 * No available objects on any slabs, create a new slab. Since this
 * is an expensive operation we do it without holding the spinlock and
 * only briefly acquire it when we link in the fully allocated and
 * constructed slab.
 */
static spl_kmem_slab_t *
spl_cache_grow(spl_kmem_cache_t *skc, int flags)

@@ -1042,34 +1167,42 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags)
	ENTRY;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	local_irq_enable();
	might_sleep();

	if (flags & __GFP_WAIT) {
		flags |= __GFP_NOFAIL;
		local_irq_enable();
		might_sleep();
	/*
	 * Before allocating a new slab check if the slab is being reaped.
	 * If it is there is a good chance we can wait until it finishes
	 * and then use one of the newly freed but not aged-out slabs.
	 */
	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
		schedule();
		GOTO(out, sks = NULL);
	}

	sks = spl_slab_alloc(skc, flags);
	if (sks == NULL) {
		if (flags & __GFP_WAIT)
			local_irq_disable();
	/* Allocate a new slab for the cache */
	sks = spl_slab_alloc(skc, flags | __GFP_NORETRY | __GFP_NOWARN);
	if (sks == NULL)
		GOTO(out, sks = NULL);

		RETURN(NULL);
	}

	if (flags & __GFP_WAIT)
		local_irq_disable();

	/* Link the new empty slab in to the end of skc_partial_list */
	/* Link the new empty slab in to the end of skc_partial_list. */
	spin_lock(&skc->skc_lock);
	skc->skc_slab_total++;
	skc->skc_obj_total += sks->sks_objs;
	list_add_tail(&sks->sks_list, &skc->skc_partial_list);
	spin_unlock(&skc->skc_lock);
out:
	local_irq_disable();

	RETURN(sks);
}

/*
 * Refill a per-cpu magazine with objects from the slabs for this
 * cache. Ideally the magazine can be repopulated using existing
 * objects which have been released, however if we are unable to
 * locate enough free objects new slabs of objects will be created.
 */
static int
spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
{

@@ -1080,13 +1213,11 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(skm->skm_magic == SKM_MAGIC);

	/* XXX: Check for refill bouncing by age perhaps */
	refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);

	spin_lock(&skc->skc_lock);

	while (refill > 0) {
		/* No slabs available we must grow the cache */
		/* No slabs available we may need to grow the cache */
		if (list_empty(&skc->skc_partial_list)) {
			spin_unlock(&skc->skc_lock);

@@ -1135,6 +1266,9 @@ out:
	RETURN(rc);
}

/*
 * Release an object back to the slab from which it came.
 */
static void
spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
{

@@ -1176,6 +1310,13 @@ spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
	EXIT;
}

/*
 * Release a batch of objects from a per-cpu magazine back to their
 * respective slabs. This occurs when we exceed the magazine size,
 * are under memory pressure, when the cache is idle, or during
 * cache cleanup. The flush argument contains the number of entries
 * to remove from the magazine.
 */
static int
spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
{

@@ -1185,12 +1326,17 @@ spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(skm->skm_magic == SKM_MAGIC);

	/*
	 * XXX: Currently we simply return objects from the magazine to
	 * the slabs in fifo order. The ideal thing to do from a memory
	 * fragmentation standpoint is to cheaply determine the set of
	 * objects in the magazine which will result in the largest
	 * number of free slabs if released from the magazine.
	 */
	spin_lock(&skc->skc_lock);

	for (i = 0; i < count; i++)
		spl_cache_shrink(skc, skm->skm_objs[i]);

	// __spl_slab_reclaim(skc);
	skm->skm_avail -= count;
	memmove(skm->skm_objs, &(skm->skm_objs[count]),
		sizeof(void *) * skm->skm_avail);

@@ -1200,6 +1346,10 @@ spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
	RETURN(count);
}

/*
 * Allocate an object from the per-cpu magazine, or if the magazine
 * is empty directly allocate from a slab and repopulate the magazine.
 */
void *
spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
{

@@ -1209,7 +1359,9 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
	ENTRY;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(flags & KM_SLEEP); /* XXX: KM_NOSLEEP not yet supported */
	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
	ASSERT(flags & KM_SLEEP);
	atomic_inc(&skc->skc_ref);
	local_irq_save(irq_flags);

restart:

@@ -1225,8 +1377,7 @@ restart:
	if (likely(skm->skm_avail)) {
		/* Object available in CPU cache, use it */
		obj = skm->skm_objs[--skm->skm_avail];
		if (!(skc->skc_flags & KMC_NOTOUCH))
			skm->skm_age = jiffies;
		skm->skm_age = jiffies;
	} else {
		/* Per-CPU cache empty, directly allocate from
		 * the slab and refill the per-CPU cache. */

@@ -1240,11 +1391,18 @@ restart:

	/* Pre-emptively migrate object to CPU L1 cache */
	prefetchw(obj);
	atomic_dec(&skc->skc_ref);

	RETURN(obj);
}
EXPORT_SYMBOL(spl_kmem_cache_alloc);

/*
 * Free an object back to the local per-cpu magazine, there is no
 * guarantee that this is the same magazine the object was originally
 * allocated from. We may need to flush entries from the magazine
 * back to the slabs to make space.
 */
void
spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
{

@@ -1253,6 +1411,8 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
	ENTRY;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
	atomic_inc(&skc->skc_ref);
	local_irq_save(flags);

	/* Safe to update per-cpu structure without lock, but

@@ -1270,62 +1430,87 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
	skm->skm_objs[skm->skm_avail++] = obj;

	local_irq_restore(flags);
	atomic_dec(&skc->skc_ref);

	EXIT;
}
EXPORT_SYMBOL(spl_kmem_cache_free);

/*
 * The generic shrinker function for all caches. Under linux a shrinker
 * may not be tightly coupled with a slab cache. In fact linux always
 * systematically tries calling all registered shrinker callbacks which
 * report that they contain unused objects. Because of this we only
 * register one shrinker function in the shim layer for all slab caches.
 * We always attempt to shrink all caches when this generic shrinker
 * is called. The shrinker should return the number of free objects
 * in the cache when called with nr_to_scan == 0 but not attempt to
 * free any objects. When nr_to_scan > 0 it is a request that nr_to_scan
 * objects should be freed, because Solaris semantics are to free
 * all available objects we may free more objects than requested.
 */
static int
spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
{
	spl_kmem_cache_t *skc;
	int unused = 0;

	/* Under linux a shrinker is not tightly coupled with a slab
	 * cache. In fact linux always systematically tries calling all
	 * registered shrinker callbacks until its target reclamation level
	 * is reached. Because of this we only register one shrinker
	 * function in the shim layer for all slab caches. And we always
	 * attempt to shrink all caches when this generic shrinker is called.
	 */
	down_read(&spl_kmem_cache_sem);
	list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
		if (nr_to_scan)
			spl_kmem_cache_reap_now(skc);

	list_for_each_entry(skc, &spl_kmem_cache_list, skc_list)
		spl_kmem_cache_reap_now(skc);

		/*
		 * Presume everything alloc'ed is reclaimable, this ensures
		 * we are called again with nr_to_scan > 0 so we can try and
		 * reclaim. The exact number is not important either so
		 * we forgo taking this already highly contended lock.
		 */
		unused += skc->skc_obj_alloc;
	}
	up_read(&spl_kmem_cache_sem);

	/* XXX: Under linux we should return the remaining number of
	 * entries in the cache. We should do this as well.
	 */
	return 1;
	return (unused * sysctl_vfs_cache_pressure) / 100;
}

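As a worked example with assumed numbers: if the registered caches together report skc_obj_alloc totalling 2,000 objects and vfs_cache_pressure is at its default of 100, the shrinker reports (2000 * 100) / 100 = 2000 potentially reclaimable entries; doubling vfs_cache_pressure to 200 doubles the reported count and makes the VM press on these caches more aggressively.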
/*
 * Call the registered reclaim function for a cache. Depending on how
 * many and which objects are released it may simply repopulate the
 * local magazine which will then need to age-out. Objects which cannot
 * fit in the magazine will be released back to their slabs which will
 * also need to age out before being released. This is all just best
 * effort and we do not want to thrash creating and destroying slabs.
 */
void
spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
{
	spl_kmem_magazine_t *skm;
	int i;
	ENTRY;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));

	/* Prevent concurrent cache reaping when contended */
	if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
		EXIT;
		return;
	}

	atomic_inc(&skc->skc_ref);

	if (skc->skc_reclaim)
		skc->skc_reclaim(skc->skc_private);

	/* Ensure per-CPU caches which are idle gradually flush */
	for_each_online_cpu(i) {
		skm = skc->skc_mag[i];

		if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
			(void)spl_cache_flush(skc, skm, skm->skm_refill);
	}

	spl_slab_reclaim(skc);
	spl_slab_reclaim(skc, 0);
	clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
	atomic_dec(&skc->skc_ref);

	EXIT;
}
EXPORT_SYMBOL(spl_kmem_cache_reap_now);

/*
 * Reap all free slabs from all registered caches.
 */
void
spl_kmem_reap(void)
{

@@ -40,6 +40,7 @@
#include <linux/module.h>
#include <linux/device.h>
#include <linux/list.h>
#include <linux/swap.h>

#include <asm/ioctls.h>
#include <asm/uaccess.h>

File diff suppressed because it is too large