Implement per-cpu local caches. This seems to have bought me another
factor of 10x improvement on SMP systems due to reduced lock contention.
This may put me in the ballpark of what is needed.  We can still further
improve things on NUMA systems by creating an additional L3 cache per
memory node instead of the current global pool.  With luck this won't
be needed.  I should also take another look at the locking now that
everything is working; there's a good chance I can tighten it up and
improve things a bit more.

   kmem_lock: time (sec)        slabs           objs            hash
   kmem_lock:                   tot/max/calc    tot/max/calc    size/depth
   kmem_lock:  0.000999926      6/6/1           192/192/32      32768/0
   kmem_lock:  0.000999926      4/4/2           128/128/64      32768/0
   kmem_lock:  0.000999926      4/4/4           128/128/128     32768/0
   kmem_lock:  0.000999926      4/4/8           128/128/256     32768/0
   kmem_lock:  0.000999926      4/4/16          128/128/512     32768/0
   kmem_lock:  0.000999926      4/4/32          128/128/1024    32768/0
   kmem_lock:  0.000999926      4/4/64          128/128/2048    32768/0
   kmem_lock:  0.000999926      8/8/128         256/256/4096    32768/0
   kmem_lock:  0.003999704      24/23/256       768/736/8192    32768/1
   kmem_lock:  0.012999038      44/41/512       1408/1312/16384 32768/1
   kmem_lock:  0.051996153      96/93/1024      3072/2976/32768 32768/2
   kmem_lock:  0.181986536      187/184/2048    5984/5888/65536 32768/3
   kmem_lock:  0.655951469      342/339/4096    10944/10848/131072 32768/4



git-svn-id: https://outreach.scidac.gov/svn/spl/trunk@136 7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c
behlendo 2008-06-25 20:57:45 +00:00
parent d46630e0f3
commit 4afaaefa05
5 changed files with 427 additions and 169 deletions
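The patch below puts a per-CPU "magazine" of object pointers in front of the
shared slab lists, so the cache-wide spinlock is only taken when a magazine
has to be refilled or flushed rather than on every allocation and free.  The
following is a minimal userspace sketch of that fast path, for illustration
only: cache_alloc()/cache_refill(), the pthread locking, and malloc() stand
in for the real spl_kmem_cache_alloc(), spl_cache_refill(), skc_lock, and
slab carving found in the patch.

    #include <stdlib.h>
    #include <pthread.h>

    #define MAG_SIZE 128                    /* capacity of one magazine */
    #define MAX_CPUS 64                     /* stand-in for NR_CPUS */

    struct magazine {
            unsigned int avail;             /* objects currently cached */
            void *objs[MAG_SIZE];           /* cached object pointers */
    };

    struct cache {
            pthread_mutex_t lock;           /* protects the shared slab lists */
            size_t obj_size;
            struct magazine mag[MAX_CPUS];  /* one warm cache per CPU */
    };

    /* Slow path: refill half of the local magazine from the shared pool.
     * This is the only point where the cache-wide lock is taken; malloc()
     * stands in for carving objects out of the partial slab list. */
    static void cache_refill(struct cache *c, struct magazine *m)
    {
            pthread_mutex_lock(&c->lock);
            while (m->avail < MAG_SIZE / 2)
                    m->objs[m->avail++] = malloc(c->obj_size);
            pthread_mutex_unlock(&c->lock);
    }

    /* Fast path: most allocations pop a pointer off the per-CPU magazine
     * without taking the shared lock, which is where the SMP win in the
     * numbers above comes from.  Frees push back onto the same magazine
     * and only hit the lock when a full magazine has to be flushed. */
    void *cache_alloc(struct cache *c, int cpu)
    {
            struct magazine *m = &c->mag[cpu];

            if (m->avail == 0)
                    cache_refill(c, m);

            return m->avail ? m->objs[--m->avail] : NULL;
    }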


@ -360,6 +360,7 @@ kmem_debugging(void)
extern int kmem_set_warning(int flag);
#define SKM_MAGIC 0x2e2e2e2e
#define SKO_MAGIC 0x20202020
#define SKS_MAGIC 0x22222222
#define SKC_MAGIC 0x2c2c2c2c
@ -376,6 +377,15 @@ typedef int (*spl_kmem_ctor_t)(void *, void *, int);
typedef void (*spl_kmem_dtor_t)(void *, void *);
typedef void (*spl_kmem_reclaim_t)(void *);
typedef struct spl_kmem_magazine {
uint32_t skm_magic; /* Sanity magic */
uint32_t skm_avail; /* Available objects */
uint32_t skm_size; /* Magazine size */
uint32_t skm_refill; /* Batch refill size */
unsigned long skm_age; /* Last cache access */
void *skm_objs[0]; /* Object pointers */
} spl_kmem_magazine_t;
typedef struct spl_kmem_obj {
uint32_t sko_magic; /* Sanity magic */
uint32_t sko_flags; /* Per object flags */
@ -392,13 +402,16 @@ typedef struct spl_kmem_slab {
struct list_head sks_list; /* Slab list linkage */
struct list_head sks_free_list; /* Free object list */
unsigned long sks_age; /* Last modify jiffie */
atomic_t sks_ref; /* Ref count used objects */
uint32_t sks_ref; /* Ref count used objects */
} spl_kmem_slab_t;
typedef struct spl_kmem_cache {
uint32_t skc_magic; /* Sanity magic */
uint32_t skc_name_size; /* Name length */
char *skc_name; /* Name string */
spl_kmem_magazine_t *skc_mag[NR_CPUS]; /* Per-CPU warm cache */
uint32_t skc_mag_size; /* Magazine size */
uint32_t skc_mag_refill; /* Magazine refill count */
spl_kmem_ctor_t skc_ctor; /* Constructor */
spl_kmem_dtor_t skc_dtor; /* Destructor */
spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */
@ -427,8 +440,8 @@ typedef struct spl_kmem_cache {
uint64_t skc_obj_total; /* Obj total current */
uint64_t skc_obj_alloc; /* Obj alloc current */
uint64_t skc_obj_max; /* Obj max historic */
uint64_t skc_hash_depth; /* Hash depth */
uint64_t skc_hash_max; /* Hash depth max */
uint64_t skc_hash_depth; /* Lazy hash depth */
uint64_t skc_hash_count; /* Hash entries current */
} spl_kmem_cache_t;
extern spl_kmem_cache_t *


@ -109,13 +109,10 @@ EXPORT_SYMBOL(kmem_set_warning);
* small virtual address space on 32bit arches. This will seriously
* constrain the size of the slab caches and their performance.
*
* XXX: Refactor the below code in to smaller functions. This works
* for a first pass but each function is doing to much.
*
* XXX: Implement SPL proc interface to export full per cache stats.
*
* XXX: Implement work requests to keep an eye on each cache and
* shrink them via slab_reclaim() when they are wasting lots
* shrink them via spl_slab_reclaim() when they are wasting lots
* of space. Currently this process is driven by the reapers.
*
* XXX: Implement proper small cache object support by embedding
@ -138,6 +135,8 @@ EXPORT_SYMBOL(kmem_set_warning);
*
* XXX: Slab coloring may also yield performance improvements and would
* be desirable to implement.
*
* XXX: Proper hardware cache alignment would be good too.
*/
/* Ensure the __kmem_cache_create/__kmem_cache_destroy macros are
@ -155,18 +154,22 @@ static struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
static kmem_cache_t *spl_slab_cache; /* Cache for slab structs */
static kmem_cache_t *spl_obj_cache; /* Cache for obj structs */
static int spl_cache_flush(spl_kmem_cache_t *skc,
spl_kmem_magazine_t *skm, int flush);
#ifdef HAVE_SET_SHRINKER
static struct shrinker *spl_kmem_cache_shrinker;
#else
static int kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask);
static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
unsigned int gfp_mask);
static struct shrinker spl_kmem_cache_shrinker = {
.shrink = kmem_cache_generic_shrinker,
.shrink = spl_kmem_cache_generic_shrinker,
.seeks = KMC_DEFAULT_SEEKS,
};
#endif
static spl_kmem_slab_t *
slab_alloc(spl_kmem_cache_t *skc, int flags) {
spl_slab_alloc(spl_kmem_cache_t *skc, int flags) {
spl_kmem_slab_t *sks;
spl_kmem_obj_t *sko, *n;
int i;
@ -182,7 +185,7 @@ slab_alloc(spl_kmem_cache_t *skc, int flags) {
sks->sks_cache = skc;
INIT_LIST_HEAD(&sks->sks_list);
INIT_LIST_HEAD(&sks->sks_free_list);
atomic_set(&sks->sks_ref, 0);
sks->sks_ref = 0;
for (i = 0; i < sks->sks_objs; i++) {
sko = kmem_cache_alloc(spl_obj_cache, flags);
@ -224,21 +227,19 @@ out:
* be called with the 'skc->skc_lock' held.
* */
static void
slab_free(spl_kmem_slab_t *sks) {
spl_slab_free(spl_kmem_slab_t *sks) {
spl_kmem_cache_t *skc;
spl_kmem_obj_t *sko, *n;
int i = 0;
ENTRY;
ASSERT(sks->sks_magic == SKS_MAGIC);
ASSERT(atomic_read(&sks->sks_ref) == 0);
ASSERT(sks->sks_ref == 0);
skc = sks->sks_cache;
skc->skc_obj_total -= sks->sks_objs;
skc->skc_slab_total--;
//#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
ASSERT(spin_is_locked(&skc->skc_lock));
//#endif
list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
ASSERT(sko->sko_magic == SKO_MAGIC);
@ -261,15 +262,13 @@ slab_free(spl_kmem_slab_t *sks) {
}
static int
__slab_reclaim(spl_kmem_cache_t *skc)
__spl_slab_reclaim(spl_kmem_cache_t *skc)
{
spl_kmem_slab_t *sks, *m;
int rc = 0;
ENTRY;
//#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
ASSERT(spin_is_locked(&skc->skc_lock));
//#endif
/*
* Free empty slabs which have not been touched in skc_delay
* seconds. This delay time is important to avoid thrashing.
@ -277,11 +276,11 @@ __slab_reclaim(spl_kmem_cache_t *skc)
*/
list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
sks_list) {
if (atomic_read(&sks->sks_ref) > 0)
if (sks->sks_ref > 0)
break;
if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ)) {
slab_free(sks);
spl_slab_free(sks);
rc++;
}
}
@ -291,18 +290,110 @@ __slab_reclaim(spl_kmem_cache_t *skc)
}
static int
slab_reclaim(spl_kmem_cache_t *skc)
spl_slab_reclaim(spl_kmem_cache_t *skc)
{
int rc;
ENTRY;
spin_lock(&skc->skc_lock);
rc = __slab_reclaim(skc);
rc = __spl_slab_reclaim(skc);
spin_unlock(&skc->skc_lock);
RETURN(rc);
}
static int
spl_magazine_size(spl_kmem_cache_t *skc)
{
int size;
ENTRY;
/* Guesses for reasonable magazine sizes; they
* should really adapt based on observed usage. */
if (skc->skc_obj_size > (PAGE_SIZE * 256))
size = 1;
else if (skc->skc_obj_size > (PAGE_SIZE * 32))
size = 4;
else if (skc->skc_obj_size > (PAGE_SIZE))
size = 16;
else if (skc->skc_obj_size > (PAGE_SIZE / 4))
size = 32;
else if (skc->skc_obj_size > (PAGE_SIZE / 16))
size = 64;
else
size = 128;
RETURN(size);
}
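As a rough worked example of the heuristic above (assuming a 4096-byte
PAGE_SIZE and that skc_obj_size is the raw object size): the 256-byte
objects used by the benchmark at the top of this commit are not larger than
PAGE_SIZE / 16, so they fall through to the final case and get a 128-entry
magazine, with a refill batch of (128 + 1) / 2 = 64 set by
spl_magazine_create() below.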
static spl_kmem_magazine_t *
spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
{
spl_kmem_magazine_t *skm;
int size = sizeof(spl_kmem_magazine_t) +
sizeof(void *) * skc->skc_mag_size;
ENTRY;
skm = kmalloc_node(size, GFP_KERNEL, node);
if (skm) {
skm->skm_magic = SKM_MAGIC;
skm->skm_avail = 0;
skm->skm_size = skc->skc_mag_size;
skm->skm_refill = skc->skc_mag_refill;
skm->skm_age = jiffies;
}
RETURN(skm);
}
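Assuming a 64-bit kernel, the header of the structure allocated above is 24
bytes (four uint32_t fields plus one unsigned long), so a 128-entry magazine
costs roughly 24 + 128 * 8 = 1048 bytes per online CPU per cache.  Because
kmalloc_node() is passed cpu_to_node(), each magazine also lands on its
owning CPU's memory node, which is the NUMA-friendly piece the commit
message alludes to.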
static void
spl_magazine_free(spl_kmem_magazine_t *skm)
{
ENTRY;
ASSERT(skm->skm_magic == SKM_MAGIC);
ASSERT(skm->skm_avail == 0);
kfree(skm);
EXIT;
}
static int
spl_magazine_create(spl_kmem_cache_t *skc)
{
int i;
ENTRY;
skc->skc_mag_size = spl_magazine_size(skc);
skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
for_each_online_cpu(i) {
skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
if (!skc->skc_mag[i]) {
for (i--; i >= 0; i--)
spl_magazine_free(skc->skc_mag[i]);
RETURN(-ENOMEM);
}
}
RETURN(0);
}
static void
spl_magazine_destroy(spl_kmem_cache_t *skc)
{
spl_kmem_magazine_t *skm;
int i;
ENTRY;
for_each_online_cpu(i) {
skm = skc->skc_mag[i];
(void)spl_cache_flush(skc, skm, skm->skm_avail);
spl_magazine_free(skm);
}
EXIT;
}
spl_kmem_cache_t *
spl_kmem_cache_create(char *name, size_t size, size_t align,
spl_kmem_ctor_t ctor,
@ -311,7 +402,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
void *priv, void *vmp, int flags)
{
spl_kmem_cache_t *skc;
int i, kmem_flags = KM_SLEEP;
int i, rc, kmem_flags = KM_SLEEP;
ENTRY;
/* We may be called when there is a non-zero preempt_count or
@ -326,7 +417,6 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
RETURN(NULL);
skc->skc_magic = SKC_MAGIC;
skc->skc_name_size = strlen(name) + 1;
skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, kmem_flags);
if (skc->skc_name == NULL) {
@ -355,6 +445,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
if (skc->skc_hash == NULL) {
kmem_free(skc->skc_name, skc->skc_name_size);
kmem_free(skc, sizeof(*skc));
RETURN(NULL);
}
for (i = 0; i < skc->skc_hash_elts; i++)
@ -374,7 +465,15 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
skc->skc_obj_alloc = 0;
skc->skc_obj_max = 0;
skc->skc_hash_depth = 0;
skc->skc_hash_max = 0;
skc->skc_hash_count = 0;
rc = spl_magazine_create(skc);
if (rc) {
kmem_free(skc->skc_hash, skc->skc_hash_size);
kmem_free(skc->skc_name, skc->skc_name_size);
kmem_free(skc, sizeof(*skc));
RETURN(NULL);
}
down_write(&spl_kmem_cache_sem);
list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
@ -385,8 +484,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
EXPORT_SYMBOL(spl_kmem_cache_create);
/* The caller must ensure there are no racing calls to
* spl_kmem_cache_alloc() for this spl_kmem_cache_t when
* it is being destroyed.
* spl_kmem_cache_alloc() for this spl_kmem_cache_t.
*/
void
spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
@ -398,20 +496,22 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
list_del_init(&skc->skc_list);
up_write(&spl_kmem_cache_sem);
spl_magazine_destroy(skc);
spin_lock(&skc->skc_lock);
/* Validate there are no objects in use and free all the
* spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
*/
* spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
ASSERT(list_empty(&skc->skc_complete_list));
ASSERTF(skc->skc_hash_count == 0, "skc->skc_hash_count=%d\n",
skc->skc_hash_count);
list_for_each_entry_safe(sks, m, &skc->skc_partial_list, sks_list)
slab_free(sks);
spl_slab_free(sks);
kmem_free(skc->skc_hash, skc->skc_hash_size);
kmem_free(skc->skc_name, skc->skc_name_size);
kmem_free(skc, sizeof(*skc));
spin_unlock(&skc->skc_lock);
kmem_free(skc, sizeof(*skc));
EXIT;
}
@ -427,88 +527,92 @@ spl_hash_ptr(void *ptr, unsigned int bits)
return hash_long((unsigned long)ptr >> PAGE_SHIFT, bits);
}
#ifndef list_first_entry
#define list_first_entry(ptr, type, member) \
list_entry((ptr)->next, type, member)
#endif
void *
spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
static spl_kmem_obj_t *
spl_hash_obj(spl_kmem_cache_t *skc, void *obj)
{
struct hlist_node *node;
spl_kmem_obj_t *sko = NULL;
unsigned long key = spl_hash_ptr(obj, skc->skc_hash_bits);
int i = 0;
ASSERT(spin_is_locked(&skc->skc_lock));
hlist_for_each_entry(sko, node, &skc->skc_hash[key], sko_hlist) {
if (unlikely((++i) > skc->skc_hash_depth))
skc->skc_hash_depth = i;
if (sko->sko_addr == obj) {
ASSERT(sko->sko_magic == SKO_MAGIC);
RETURN(sko);
}
}
RETURN(NULL);
}
static void *
spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
{
spl_kmem_slab_t *sks;
spl_kmem_obj_t *sko;
void *obj;
unsigned long key;
ENTRY;
spin_lock(&skc->skc_lock);
restart:
/* Check for available objects from the partial slabs */
if (!list_empty(&skc->skc_partial_list)) {
sks = list_first_entry(&skc->skc_partial_list,
spl_kmem_slab_t, sks_list);
ASSERT(sks->sks_magic == SKS_MAGIC);
ASSERT(atomic_read(&sks->sks_ref) < sks->sks_objs);
ASSERT(!list_empty(&sks->sks_free_list));
ASSERT(spin_is_locked(&skc->skc_lock));
sko = list_first_entry(&sks->sks_free_list,
spl_kmem_obj_t, sko_list);
sko = list_entry((&sks->sks_free_list)->next, spl_kmem_obj_t, sko_list);
ASSERT(sko->sko_magic == SKO_MAGIC);
ASSERT(sko->sko_addr != NULL);
/* Remove from sks_free_list, add to used hash */
/* Remove from sks_free_list and add to used hash */
list_del_init(&sko->sko_list);
key = spl_hash_ptr(sko->sko_addr, skc->skc_hash_bits);
hlist_add_head(&sko->sko_hlist, &skc->skc_hash[key]);
sks->sks_age = jiffies;
atomic_inc(&sks->sks_ref);
sks->sks_ref++;
skc->skc_obj_alloc++;
skc->skc_hash_count++;
/* Track max obj usage statistics */
if (skc->skc_obj_alloc > skc->skc_obj_max)
skc->skc_obj_max = skc->skc_obj_alloc;
if (atomic_read(&sks->sks_ref) == 1) {
/* Track max slab usage statistics */
if (sks->sks_ref == 1) {
skc->skc_slab_alloc++;
if (skc->skc_slab_alloc > skc->skc_slab_max)
skc->skc_slab_max = skc->skc_slab_alloc;
}
/* Move slab to skc_complete_list when full */
if (atomic_read(&sks->sks_ref) == sks->sks_objs) {
list_del(&sks->sks_list);
list_add(&sks->sks_list, &skc->skc_complete_list);
return sko->sko_addr;
}
GOTO(out_lock, obj = sko->sko_addr);
}
spin_unlock(&skc->skc_lock);
/* No available objects, create a new slab. Since this is an
* expensive operation we do it without holding the semaphore
* expensive operation we do it without holding the spinlock
* and only briefly acquire it when we link in the fully
* allocated and constructed slab.
*/
static spl_kmem_slab_t *
spl_cache_grow(spl_kmem_cache_t *skc, int flags)
{
spl_kmem_slab_t *sks;
spl_kmem_obj_t *sko;
ENTRY;
/* Under Solaris if the KM_SLEEP flag is passed we may never
* fail, so sleep as long as needed. Additionally, since we are
* using vmem_alloc() KM_NOSLEEP is not an option and we must
* fail. Shifting to allocating our own pages and mapping the
* virtual address space may allow us to bypass this issue.
*/
if (!flags)
flags |= KM_SLEEP;
if (flags & KM_SLEEP)
if (flags & __GFP_WAIT) {
flags |= __GFP_NOFAIL;
else
GOTO(out, obj = NULL);
might_sleep();
local_irq_enable();
}
sks = slab_alloc(skc, flags);
if (sks == NULL)
GOTO(out, obj = NULL);
sks = spl_slab_alloc(skc, flags);
if (sks == NULL) {
if (flags & __GFP_WAIT)
local_irq_disable();
RETURN(NULL);
}
/* Run all the constructors now that the slab is fully allocated */
list_for_each_entry(sko, &sks->sks_free_list, sko_list) {
@ -518,18 +622,171 @@ restart:
skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
}
/* Link the newly created slab in to the skc_partial_list,
* and retry the allocation which will now succeed.
*/
if (flags & __GFP_WAIT)
local_irq_disable();
/* Link the new empty slab into the end of skc_partial_list */
spin_lock(&skc->skc_lock);
skc->skc_slab_total++;
skc->skc_obj_total += sks->sks_objs;
list_add_tail(&sks->sks_list, &skc->skc_partial_list);
GOTO(restart, obj = NULL);
spin_unlock(&skc->skc_lock);
RETURN(sks);
}
static int
spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
{
spl_kmem_slab_t *sks;
int refill = skm->skm_refill;
ENTRY;
/* XXX: Check for refill bouncing by age perhaps */
spin_lock(&skc->skc_lock);
while (refill > 0) {
/* No slabs available, we must grow the cache */
if (list_empty(&skc->skc_partial_list)) {
spin_unlock(&skc->skc_lock);
sks = spl_cache_grow(skc, flags);
if (!sks)
GOTO(out, refill);
/* Rescheduled to a different CPU, skm is not local */
if (skm != skc->skc_mag[smp_processor_id()])
GOTO(out, refill);
spin_lock(&skc->skc_lock);
continue;
}
/* Grab the next available slab */
sks = list_entry((&skc->skc_partial_list)->next,
spl_kmem_slab_t, sks_list);
ASSERT(sks->sks_magic == SKS_MAGIC);
ASSERT(sks->sks_ref < sks->sks_objs);
ASSERT(!list_empty(&sks->sks_free_list));
/* Consume as many objects as needed to refill the requested
* cache. We must be careful to lock here because our local
* magazine may not be local anymore due to spl_cache_grow. */
while ((sks->sks_ref < sks->sks_objs) && (refill-- > 0))
skm->skm_objs[skm->skm_avail++] = spl_cache_obj(skc, sks);
/* Move slab to skc_complete_list when full */
if (sks->sks_ref == sks->sks_objs) {
list_del(&sks->sks_list);
list_add(&sks->sks_list, &skc->skc_complete_list);
}
}
out_lock:
spin_unlock(&skc->skc_lock);
out:
/* Returns the number of entries added to cache */
RETURN(skm->skm_refill - refill);
}
static void
spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
{
spl_kmem_slab_t *sks = NULL;
spl_kmem_obj_t *sko = NULL;
ENTRY;
ASSERT(spin_is_locked(&skc->skc_lock));
sko = spl_hash_obj(skc, obj);
ASSERTF(sko, "Obj %p missing from in-use hash (%d) for cache %s\n",
obj, skc->skc_hash_count, skc->skc_name);
sks = sko->sko_slab;
ASSERTF(sks, "Obj %p/%p linked to invalid slab for cache %s\n",
obj, sko, skc->skc_name);
ASSERT(sks->sks_cache == skc);
hlist_del_init(&sko->sko_hlist);
list_add(&sko->sko_list, &sks->sks_free_list);
sks->sks_age = jiffies;
sks->sks_ref--;
skc->skc_obj_alloc--;
skc->skc_hash_count--;
/* Move slab to skc_partial_list when no longer full. Slabs
* are added to the head to keep the partial list in quasi-full
* sorted order. Fuller at the head, emptier at the tail. */
if (sks->sks_ref == (sks->sks_objs - 1)) {
list_del(&sks->sks_list);
list_add(&sks->sks_list, &skc->skc_partial_list);
}
/* Move empty slabs to the end of the partial list so
* they can be easily found and freed during reclamation. */
if (sks->sks_ref == 0) {
list_del(&sks->sks_list);
list_add_tail(&sks->sks_list, &skc->skc_partial_list);
skc->skc_slab_alloc--;
}
EXIT;
}
static int
spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
{
int i, count = MIN(flush, skm->skm_avail);
ENTRY;
spin_lock(&skc->skc_lock);
for (i = 0; i < count; i++)
spl_cache_shrink(skc, skm->skm_objs[i]);
__spl_slab_reclaim(skc);
skm->skm_avail -= count;
memmove(skm->skm_objs, &(skm->skm_objs[count]),
sizeof(void *) * skm->skm_avail);
spin_unlock(&skc->skc_lock);
RETURN(count);
}
void *
spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
{
spl_kmem_magazine_t *skm;
unsigned long irq_flags;
void *obj = NULL;
ENTRY;
ASSERT(flags & KM_SLEEP);
local_irq_save(irq_flags);
restart:
/* Safe to update per-cpu structure without lock, but
* in the restart case we must be careful to reacquire
* the local magazine since this may have changed
* when we need to grow the cache. */
skm = skc->skc_mag[smp_processor_id()];
if (likely(skm->skm_avail)) {
/* Object available in CPU cache, use it */
obj = skm->skm_objs[--skm->skm_avail];
skm->skm_age = jiffies;
} else {
/* Per-CPU cache empty, directly allocate from
* the slab and refill the per-CPU cache. */
(void)spl_cache_refill(skc, skm, flags);
GOTO(restart, obj = NULL);
}
local_irq_restore(irq_flags);
/* Pre-emptively migrate object to CPU L1 cache */
prefetchw(obj);
RETURN(obj);
}
EXPORT_SYMBOL(spl_kmem_cache_alloc);
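From a caller's point of view nothing changes; the magazine layer sits
behind the existing cache API.  A small hedged usage sketch, modeled on the
splat test further down in this diff (my_obj_t, my_ctor, my_dtor, and the
cache name are hypothetical; only the spl_kmem_cache_* calls come from this
file):

    /* Hypothetical consumer; only the spl_kmem_cache_* calls are real. */
    typedef struct { char buf[256]; } my_obj_t;

    static int my_ctor(void *obj, void *priv, int flags) { return 0; }
    static void my_dtor(void *obj, void *priv) { }

    static void my_example(void)
    {
            spl_kmem_cache_t *skc;
            void *obj;

            skc = spl_kmem_cache_create("my-cache", sizeof(my_obj_t), 0,
                                        my_ctor, my_dtor, NULL, NULL, NULL, 0);
            if (skc == NULL)
                    return;

            obj = spl_kmem_cache_alloc(skc, KM_SLEEP); /* pops from the CPU magazine */
            spl_kmem_cache_free(skc, obj);             /* pushes back to the magazine */

            spl_kmem_cache_destroy(skc);
    }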
@ -537,62 +794,33 @@ EXPORT_SYMBOL(spl_kmem_cache_alloc);
void
spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
{
struct hlist_node *node;
spl_kmem_slab_t *sks = NULL;
spl_kmem_obj_t *sko = NULL;
unsigned long key = spl_hash_ptr(obj, skc->skc_hash_bits);
int i = 0;
spl_kmem_magazine_t *skm;
unsigned long flags;
ENTRY;
spin_lock(&skc->skc_lock);
local_irq_save(flags);
hlist_for_each_entry(sko, node, &skc->skc_hash[key], sko_hlist) {
/* Safe to update per-cpu structure without lock, but since
* no remote memory allocation tracking is performed it is
* entirely possible to allocate an object from one CPU
* cache and return it to another. */
skm = skc->skc_mag[smp_processor_id()];
if (unlikely((++i) > skc->skc_hash_depth))
skc->skc_hash_depth = i;
/* Per-CPU cache full, flush it to make space */
if (unlikely(skm->skm_avail >= skm->skm_size))
(void)spl_cache_flush(skc, skm, skm->skm_refill);
if (sko->sko_addr == obj) {
ASSERT(sko->sko_magic == SKO_MAGIC);
sks = sko->sko_slab;
break;
}
}
/* Available space in cache, use it */
skm->skm_objs[skm->skm_avail++] = obj;
ASSERT(sko != NULL); /* Obj must be in hash */
ASSERT(sks != NULL); /* Obj must reference slab */
ASSERT(sks->sks_cache == skc);
hlist_del_init(&sko->sko_hlist);
list_add(&sko->sko_list, &sks->sks_free_list);
local_irq_restore(flags);
sks->sks_age = jiffies;
atomic_dec(&sks->sks_ref);
skc->skc_obj_alloc--;
/* Move slab to skc_partial_list when no longer full. Slabs
* are added to the kead to keep the partial list is quasi
* full sorted order. Fuller at the head, emptier at the tail.
*/
if (atomic_read(&sks->sks_ref) == (sks->sks_objs - 1)) {
list_del(&sks->sks_list);
list_add(&sks->sks_list, &skc->skc_partial_list);
}
/* Move emply slabs to the end of the partial list so
* they can be easily found and freed during reclamation.
*/
if (atomic_read(&sks->sks_ref) == 0) {
list_del(&sks->sks_list);
list_add_tail(&sks->sks_list, &skc->skc_partial_list);
skc->skc_slab_alloc--;
}
__slab_reclaim(skc);
spin_unlock(&skc->skc_lock);
EXIT;
}
EXPORT_SYMBOL(spl_kmem_cache_free);
static int
kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
{
spl_kmem_cache_t *skc;
@ -619,13 +847,24 @@ kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
void
spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
{
spl_kmem_magazine_t *skm;
int i;
ENTRY;
ASSERT(skc && skc->skc_magic == SKC_MAGIC);
if (skc->skc_reclaim)
skc->skc_reclaim(skc->skc_private);
slab_reclaim(skc);
/* Ensure idle per-CPU caches are gradually flushed */
for_each_online_cpu(i) {
skm = skc->skc_mag[i];
if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
(void)spl_cache_flush(skc, skm, skm->skm_refill);
}
spl_slab_reclaim(skc);
EXIT;
}
EXPORT_SYMBOL(spl_kmem_cache_reap_now);
@ -633,7 +872,7 @@ EXPORT_SYMBOL(spl_kmem_cache_reap_now);
void
spl_kmem_reap(void)
{
kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL);
spl_kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL);
}
EXPORT_SYMBOL(spl_kmem_reap);
@ -663,7 +902,7 @@ spl_kmem_init(void)
#ifdef HAVE_SET_SHRINKER
spl_kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS,
kmem_cache_generic_shrinker);
spl_kmem_cache_generic_shrinker);
if (spl_kmem_cache_shrinker == NULL)
GOTO(out_cache, rc = -ENOMEM);
#else
@ -703,7 +942,7 @@ out_cache:
#ifdef DEBUG_KMEM
static char *
sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
{
int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
int i, flag = 1;
@ -769,7 +1008,7 @@ spl_kmem_fini(void)
list_for_each_entry(kd, &kmem_list, kd_list)
CDEBUG(D_WARNING, "%p %-5d %-16s %s:%d\n",
kd->kd_addr, kd->kd_size,
sprintf_addr(kd, str, 17, 8),
spl_sprintf_addr(kd, str, 17, 8),
kd->kd_func, kd->kd_line);
spin_unlock_irqrestore(&kmem_lock, flags);
@ -786,7 +1025,7 @@ spl_kmem_fini(void)
list_for_each_entry(kd, &vmem_list, kd_list)
CDEBUG(D_WARNING, "%p %-5d %-16s %s:%d\n",
kd->kd_addr, kd->kd_size,
sprintf_addr(kd, str, 17, 8),
spl_sprintf_addr(kd, str, 17, 8),
kd->kd_func, kd->kd_line);
spin_unlock_irqrestore(&vmem_lock, flags);


@ -913,7 +913,9 @@ out:
if (rc) {
remove_proc_entry("kstat", proc_spl);
remove_proc_entry("kmem", proc_spl);
#ifdef DEBUG_MUTEX
remove_proc_entry("stats_per", proc_spl_mutex);
#endif
remove_proc_entry("mutex", proc_spl);
remove_proc_entry("spl", NULL);
#ifdef CONFIG_SYSCTL
@ -933,7 +935,9 @@ proc_fini(void)
#if defined(DEBUG_MUTEX) || defined(DEBUG_KMEM) || defined(DEBUG_KSTAT)
remove_proc_entry("kstat", proc_spl);
remove_proc_entry("kmem", proc_spl);
#ifdef DEBUG_MUTEX
remove_proc_entry("stats_per", proc_spl_mutex);
#endif
remove_proc_entry("mutex", proc_spl);
remove_proc_entry("spl", NULL);
#endif /* DEBUG_MUTEX || DEBUG_KMEM || DEBUG_KSTAT */


@ -486,7 +486,7 @@ vn_getf(int fd)
spin_unlock(&vn_file_lock);
/* File was not yet opened, create the object and set it up */
fp = kmem_cache_alloc(vn_file_cache, 0);
fp = kmem_cache_alloc(vn_file_cache, KM_SLEEP);
if (fp == NULL)
GOTO(out, rc);


@ -525,6 +525,9 @@ splat_kmem_test8_thread(void *arg)
objs = vmem_zalloc(count * sizeof(void *), KM_SLEEP);
if (!objs) {
splat_vprint(kcp->kcp_file, SPLAT_KMEM_TEST8_NAME,
"Unable to alloc objp array for cache '%s'\n",
kcp->kcp_cache->skc_name);
rc = -ENOMEM;
goto out;
}
@ -533,14 +536,13 @@ splat_kmem_test8_thread(void *arg)
objs[i] = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP);
if (!objs[i]) {
splat_vprint(kcp->kcp_file, SPLAT_KMEM_TEST8_NAME,
"Unable to allocate from '%s'\n",
SPLAT_KMEM_CACHE_NAME);
"Unable to allocate from cache '%s'\n",
kcp->kcp_cache->skc_name);
rc = -ENOMEM;
goto out_free;
break;
}
}
out_free:
for (i = 0; i < count; i++)
if (objs[i])
kmem_cache_free(kcp->kcp_cache, objs[i]);
@ -578,6 +580,7 @@ splat_kmem_test8(struct file *file, void *arg)
kmem_cache_priv_t kcp;
kthread_t *thr;
struct timespec start, stop, delta;
char cache_name[16];
int alloc, i;
kcp.kcp_magic = SPLAT_KMEM_TEST_MAGIC;
@ -588,7 +591,7 @@ splat_kmem_test8(struct file *file, void *arg)
splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%s",
" \ttot/max/calc\ttot/max/calc\tsize/depth\n");
for (alloc = 64; alloc <= 4096; alloc *= 2) {
for (alloc = 1; alloc <= 4096; alloc *= 2) {
kcp.kcp_size = 256;
kcp.kcp_count = 0;
kcp.kcp_threads = 0;
@ -597,9 +600,8 @@ splat_kmem_test8(struct file *file, void *arg)
spin_lock_init(&kcp.kcp_lock);
init_waitqueue_head(&kcp.kcp_waitq);
kcp.kcp_cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME,
kcp.kcp_size, 0,
sprintf(cache_name, "%s-%d", SPLAT_KMEM_CACHE_NAME, alloc);
kcp.kcp_cache = kmem_cache_create(cache_name, kcp.kcp_size, 0,
splat_kmem_cache_test_constructor,
splat_kmem_cache_test_destructor,
NULL, &kcp, NULL, 0);