From 165f13c33abadc06ccaea1c4f654fddfa316a80f Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Mon, 29 Oct 2012 16:51:59 -0700
Subject: [PATCH 1/4] Improved vmem cached deadlock detection

The entire goal of performing the slab allocations asynchronously is to
be able to detect when a vmalloc() deadlocks.  In this case, and only
this case, do we want to start allocating emergency objects.  The trick
here is to minimize false positives because the overhead of tracking
emergency objects is far higher than that of normal slab objects.

With that goal in mind the code was reworked to be less sensitive to
slow allocations by increasing the wait time.  Once a cache is marked
deadlocked all subsequent allocations which cannot be satisfied with
existing cache objects will immediately allocate new emergency objects.
This behavior persists until the asynchronous allocation completes and
clears the deadlocked flag.

The result of these tweaks is that far fewer emergency objects get
created, which is important because this minimizes the cost of
releasing them later in kmem_cache_free().

Signed-off-by: Brian Behlendorf
---
 include/sys/kmem.h    |  3 +++
 module/spl/spl-kmem.c | 35 +++++++++++++++++++++++++----------
 module/spl/spl-proc.c |  9 ++++++---
 3 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/include/sys/kmem.h b/include/sys/kmem.h
index e71a443a09..b0f4208bdc 100644
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -340,6 +340,7 @@ enum {
 	KMC_BIT_VMEM        = 6,    /* Use vmem cache */
 	KMC_BIT_OFFSLAB     = 7,    /* Objects not on slab */
 	KMC_BIT_NOEMERGENCY = 8,    /* Disable emergency objects */
+	KMC_BIT_DEADLOCKED  = 14,   /* Deadlock detected */
 	KMC_BIT_GROWING     = 15,   /* Growing in progress */
 	KMC_BIT_REAPING     = 16,   /* Reaping in progress */
 	KMC_BIT_DESTROY     = 17,   /* Destroy in progress */
@@ -366,6 +367,7 @@ typedef enum kmem_cbrc {
 #define KMC_VMEM        (1 << KMC_BIT_VMEM)
 #define KMC_OFFSLAB     (1 << KMC_BIT_OFFSLAB)
 #define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY)
+#define KMC_DEADLOCKED  (1 << KMC_BIT_DEADLOCKED)
 #define KMC_GROWING     (1 << KMC_BIT_GROWING)
 #define KMC_REAPING     (1 << KMC_BIT_REAPING)
 #define KMC_DESTROY     (1 << KMC_BIT_DESTROY)
@@ -473,6 +475,7 @@ typedef struct spl_kmem_cache {
 	uint64_t skc_obj_total;         /* Obj total current */
 	uint64_t skc_obj_alloc;         /* Obj alloc current */
 	uint64_t skc_obj_max;           /* Obj max historic */
+	uint64_t skc_obj_deadlock;      /* Obj emergency deadlocks */
 	uint64_t skc_obj_emergency;     /* Obj emergency current */
 	uint64_t skc_obj_emergency_max; /* Obj emergency max */
 } spl_kmem_cache_t;
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
index eca809c477..045075cc03 100644
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -1495,6 +1495,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
 	skc->skc_obj_total = 0;
 	skc->skc_obj_alloc = 0;
 	skc->skc_obj_max = 0;
+	skc->skc_obj_deadlock = 0;
 	skc->skc_obj_emergency = 0;
 	skc->skc_obj_emergency_max = 0;
@@ -1662,6 +1663,7 @@ spl_cache_grow_work(void *data)
 	atomic_dec(&skc->skc_ref);
 	clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+	clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
 	wake_up_all(&skc->skc_waitq);
 	spin_unlock(&skc->skc_lock);
@@ -1683,7 +1685,7 @@ spl_cache_grow_wait(spl_kmem_cache_t *skc)
 static int
 spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
 {
-	int remaining, rc = 0;
+	int remaining, rc;
 	SENTRY;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
@@ -1722,17 +1724,30 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
 	}
 
 	/*
-	 * Allow a single timer tick before falling back to synchronously
-	 * allocating the minimum about of memory required by the caller.
+	 * The goal here is to only detect the rare case where a virtual slab
+	 * allocation has deadlocked.  We must be careful to minimize the use
+	 * of emergency objects which are more expensive to track.  Therefore,
+	 * we set a very long timeout for the asynchronous allocation and if
+	 * the timeout is reached the cache is flagged as deadlocked.  From
+	 * this point only new emergency objects will be allocated until the
+	 * asynchronous allocation completes and clears the deadlocked flag.
 	 */
-	remaining = wait_event_timeout(skc->skc_waitq,
-	    spl_cache_grow_wait(skc), 1);
+	if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
+		rc = spl_emergency_alloc(skc, flags, obj);
+	} else {
+		remaining = wait_event_timeout(skc->skc_waitq,
+		    spl_cache_grow_wait(skc), HZ);
 
-	if (remaining == 0) {
-		if (test_bit(KMC_BIT_NOEMERGENCY, &skc->skc_flags))
-			rc = -ENOMEM;
-		else
-			rc = spl_emergency_alloc(skc, flags, obj);
+		if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
+			spin_lock(&skc->skc_lock);
+			if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
+				set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+				skc->skc_obj_deadlock++;
+			}
+			spin_unlock(&skc->skc_lock);
+		}
+
+		rc = -ENOMEM;
 	}
 
 	SRETURN(rc);
diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c
index 11a2d1068e..152abff7f5 100644
--- a/module/spl/spl-proc.c
+++ b/module/spl/spl-proc.c
@@ -625,12 +625,14 @@ slab_seq_show_headers(struct seq_file *f)
 	    "--------------------- cache ----------"
 	    "--------------------------------------------- "
 	    "----- slab ------ "
-	    "---- object -----------------\n");
+	    "---- object ----- "
+	    "--- emergency ---\n");
 	seq_printf(f,
 	    "name "
 	    " flags size alloc slabsize objsize "
 	    "total alloc max "
-	    "total alloc max emerg max\n");
+	    "total alloc max "
+	    "dlock alloc max\n");
 }
 
 static int
@@ -643,7 +645,7 @@ slab_seq_show(struct seq_file *f, void *p)
 	spin_lock(&skc->skc_lock);
 	seq_printf(f, "%-36s ", skc->skc_name);
 	seq_printf(f, "0x%05lx %9lu %9lu %8u %8u "
-	    "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
+	    "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
 	    (long unsigned)skc->skc_flags,
 	    (long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
 	    (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
@@ -655,6 +657,7 @@ slab_seq_show(struct seq_file *f, void *p)
 	    (long unsigned)skc->skc_obj_total,
 	    (long unsigned)skc->skc_obj_alloc,
 	    (long unsigned)skc->skc_obj_max,
+	    (long unsigned)skc->skc_obj_deadlock,
 	    (long unsigned)skc->skc_obj_emergency,
 	    (long unsigned)skc->skc_obj_emergency_max);
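
For reference, the grow path that results from this patch reduces to the
condensed sketch below.  It is an illustration rather than the literal
function body: the SPL debug macros (SENTRY, SRETURN) and the dispatch of
the asynchronous grow work are elided.

	/*
	 * No objects were available on any slab, and an asynchronous
	 * grow has already been dispatched; decide whether to wait for
	 * it or to declare a deadlock and fall back to emergency objects.
	 */
	if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
		/* A deadlock was already detected; go straight to an
		 * emergency object until the grow work clears the flag. */
		rc = spl_emergency_alloc(skc, flags, obj);
	} else {
		/* Wait a full HZ (one second), not a single tick, so
		 * slow but healthy vmalloc() calls are not misdiagnosed
		 * as deadlocks. */
		remaining = wait_event_timeout(skc->skc_waitq,
		    spl_cache_grow_wait(skc), HZ);

		if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
			spin_lock(&skc->skc_lock);
			/* Still growing after the timeout: flag the cache
			 * deadlocked and count the event for /proc. */
			if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
				set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
				skc->skc_obj_deadlock++;
			}
			spin_unlock(&skc->skc_lock);
		}

		rc = -ENOMEM;
	}
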
From ed3163484d2e70df8d9c50bad9678891b26c0fa0 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Tue, 30 Oct 2012 10:45:50 -0700
Subject: [PATCH 2/4] Track emergency object in rbtree

In the initial implementation emergency objects were tracked on a
per-cache list.  The assumption was that under normal operation we
would never allocate more than a handful of these objects, so the cost
of walking the list during free was expected to be negligible.

However, real-world usage has shown that emergency objects tend to be
allocated in batches.  A deadlock will be detected and several thousand
emergency objects will be allocated before the original blocked slab
allocation can complete.  Therefore the original list has been replaced
by a red-black tree which is sorted by the memory address of each
allocated object.  This bounds the worst-case insertion and removal
time to O(log n), which minimizes contention on the associated spin
lock.

Signed-off-by: Brian Behlendorf
---
 include/sys/kmem.h    |  5 ++-
 module/spl/spl-kmem.c | 98 +++++++++++++++++++++++++++++++------------
 2 files changed, 75 insertions(+), 28 deletions(-)

diff --git a/include/sys/kmem.h b/include/sys/kmem.h
index b0f4208bdc..83adc8d2a3 100644
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -31,6 +31,7 @@
 #include
 #include
 #include
+#include <linux/rbtree.h>
 #include
 #include
 #include
@@ -435,8 +436,8 @@ typedef struct spl_kmem_alloc {
 } spl_kmem_alloc_t;
 
 typedef struct spl_kmem_emergency {
+	struct rb_node ske_node;    /* Emergency tree linkage */
 	void *ske_obj;              /* Buffer address */
-	struct list_head ske_list;  /* Emergency list linkage */
 } spl_kmem_emergency_t;
 
 typedef struct spl_kmem_cache {
@@ -463,7 +464,7 @@ typedef struct spl_kmem_cache {
 	struct list_head skc_list;          /* List of caches linkage */
 	struct list_head skc_complete_list; /* Completely alloc'ed */
 	struct list_head skc_partial_list;  /* Partially alloc'ed */
-	struct list_head skc_emergency_list; /* Min sized objects */
+	struct rb_root skc_emergency_tree;  /* Min sized objects */
 	spinlock_t skc_lock;                /* Cache lock */
 	wait_queue_head_t skc_waitq;        /* Allocation waiters */
 	uint64_t skc_slab_fail;             /* Slab alloc failures */
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
index 045075cc03..7e68522ad3 100644
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -1116,8 +1116,54 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
 	SEXIT;
 }
 
+static spl_kmem_emergency_t *
+spl_emergency_search(struct rb_root *root, void *obj)
+{
+	struct rb_node *node = root->rb_node;
+	spl_kmem_emergency_t *ske;
+	unsigned long address = (unsigned long)obj;
+
+	while (node) {
+		ske = container_of(node, spl_kmem_emergency_t, ske_node);
+
+		if (address < (unsigned long)ske->ske_obj)
+			node = node->rb_left;
+		else if (address > (unsigned long)ske->ske_obj)
+			node = node->rb_right;
+		else
+			return ske;
+	}
+
+	return NULL;
+}
+
+static int
+spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	spl_kmem_emergency_t *ske_tmp;
+	unsigned long address = (unsigned long)ske->ske_obj;
+
+	while (*new) {
+		ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
+
+		parent = *new;
+		if (address < (unsigned long)ske_tmp->ske_obj)
+			new = &((*new)->rb_left);
+		else if (address > (unsigned long)ske_tmp->ske_obj)
+			new = &((*new)->rb_right);
+		else
+			return 0;
+	}
+
+	rb_link_node(&ske->ske_node, parent, new);
+	rb_insert_color(&ske->ske_node, root);
+
+	return 1;
+}
+
 /*
- * Allocate a single emergency object for use by the caller.
+ * Allocate a single emergency object and track it in a red black tree.
  */
 static int
 spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
@@ -1143,48 +1189,49 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
 		SRETURN(-ENOMEM);
 	}
 
+	spin_lock(&skc->skc_lock);
+	empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
+	if (likely(empty)) {
+		skc->skc_obj_total++;
+		skc->skc_obj_emergency++;
+		if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
+			skc->skc_obj_emergency_max = skc->skc_obj_emergency;
+	}
+	spin_unlock(&skc->skc_lock);
+
+	if (unlikely(!empty)) {
+		kfree(ske->ske_obj);
+		kfree(ske);
+		SRETURN(-EINVAL);
+	}
+
 	if (skc->skc_ctor)
 		skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
 
-	spin_lock(&skc->skc_lock);
-	skc->skc_obj_total++;
-	skc->skc_obj_emergency++;
-	if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
-		skc->skc_obj_emergency_max = skc->skc_obj_emergency;
-
-	list_add(&ske->ske_list, &skc->skc_emergency_list);
-	spin_unlock(&skc->skc_lock);
-
 	*obj = ske->ske_obj;
 
 	SRETURN(0);
 }
 
 /*
- * Free the passed object if it is an emergency object or a normal slab
- * object.  Currently this is done by walking what should be a short list of
- * emergency objects.  If this proves to be too inefficient we can replace
- * the simple list with a hash.
+ * Locate the passed object in the red black tree and free it.
  */
 static int
 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
 {
-	spl_kmem_emergency_t *m, *n, *ske = NULL;
+	spl_kmem_emergency_t *ske;
 	SENTRY;
 
 	spin_lock(&skc->skc_lock);
-	list_for_each_entry_safe(m, n, &skc->skc_emergency_list, ske_list) {
-		if (m->ske_obj == obj) {
-			list_del(&m->ske_list);
-			skc->skc_obj_emergency--;
-			skc->skc_obj_total--;
-			ske = m;
-			break;
-		}
+	ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
+	if (likely(ske)) {
+		rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
+		skc->skc_obj_emergency--;
+		skc->skc_obj_total--;
 	}
 	spin_unlock(&skc->skc_lock);
 
-	if (ske == NULL)
+	if (unlikely(ske == NULL))
 		SRETURN(-ENOENT);
 
 	if (skc->skc_dtor)
@@ -1483,7 +1530,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
 	INIT_LIST_HEAD(&skc->skc_list);
 	INIT_LIST_HEAD(&skc->skc_complete_list);
 	INIT_LIST_HEAD(&skc->skc_partial_list);
-	INIT_LIST_HEAD(&skc->skc_emergency_list);
+	skc->skc_emergency_tree = RB_ROOT;
 	spin_lock_init(&skc->skc_lock);
 	init_waitqueue_head(&skc->skc_waitq);
 	skc->skc_slab_fail = 0;
@@ -1590,7 +1637,6 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
 	ASSERT3U(skc->skc_obj_total, ==, 0);
 	ASSERT3U(skc->skc_obj_emergency, ==, 0);
 	ASSERT(list_empty(&skc->skc_complete_list));
-	ASSERT(list_empty(&skc->skc_emergency_list));
 	kmem_free(skc->skc_name, skc->skc_name_size);
 	spin_unlock(&skc->skc_lock);
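
The helpers above follow the stock Linux rbtree idiom: an rb_node embedded
in the tracked structure and a manual descent keyed, in this case, by the
object's address.  Condensed, and with the skc_lock acquisition and the
accounting counters stripped out, an emergency object's passage through
the tree looks like this sketch:

	/* Allocation side: O(log n) insert keyed on the buffer address.
	 * A zero return means a duplicate address, which indicates
	 * corruption; the allocation is then backed out with -EINVAL. */
	empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);

	/* Free side: O(log n) lookup by address followed by an O(log n)
	 * rb_erase(), replacing the old O(n) list walk. */
	ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
	if (ske != NULL)
		rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
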
From a1af8fb1eaa08e55f6e0799779a89f455a5017f2 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Tue, 30 Oct 2012 11:21:42 -0700
Subject: [PATCH 3/4] Optimize spl_kmem_cache_free()

Because only virtual slabs may have emergency objects, and because
those objects are guaranteed to have physical addresses, it can easily
be determined whether a passed object is a virtual slab object or an
emergency object.  This allows us to completely optimize the emergency
object free case out of the common free path.

Signed-off-by: Brian Behlendorf
---
 module/spl/spl-kmem.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
index 7e68522ad3..5627b5590b 100644
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -2023,11 +2023,12 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
 	atomic_inc(&skc->skc_ref);
 
 	/*
-	 * Emergency objects are never part of the virtual address space
-	 * so if we get a virtual address we can optimize this check out.
+	 * Only virtual slabs may have emergency objects and these objects
+	 * are guaranteed to have physical addresses.  They must be removed
+	 * from the tree of emergency objects and then freed.
 	 */
-	if (!kmem_virt(obj) && !spl_emergency_free(skc, obj))
-		SGOTO(out, 0);
+	if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj))
+		SGOTO(out, spl_emergency_free(skc, obj));
 
 	local_irq_save(flags);
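
The test leans on two facts established by the earlier patches: emergency
objects are always kmalloc()'d, so they carry physical addresses, while
the backing slabs of a KMC_VMEM cache come from vmalloc().  In the SPL
headers kmem_virt() amounts to an address-range check against the vmalloc
arena, roughly as paraphrased below (the exact macro may differ):

	/* Paraphrase of kmem_virt(): true when the pointer lies inside
	 * the vmalloc virtual address range. */
	#define kmem_virt(ptr)                          \
		(((ptr) >= (void *)VMALLOC_START) &&    \
		 ((ptr) <  (void *)VMALLOC_END))

	/*
	 * Resulting classification in spl_kmem_cache_free():
	 *   cache without KMC_VMEM      -> never an emergency object,
	 *                                  skip the tree entirely
	 *   KMC_VMEM &&  kmem_virt(obj) -> ordinary vmalloc'd slab object
	 *   KMC_VMEM && !kmem_virt(obj) -> emergency object; remove it
	 *                                  from the tree and free it
	 */
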
From dc1b30224f9b1587dbe383d9c8e16caa4b1f71d3 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Mon, 5 Nov 2012 13:54:20 -0800
Subject: [PATCH 4/4] Never spin in kmem_cache_alloc()

If we are reaping from the cache and a concurrent allocation occurs
then the caller must block until the reaping is complete.  This is
signaled by the clearing of the KMC_BIT_REAPING bit.

Otherwise the caller will be in a tight loop which takes and releases
the skc->skc_lock.  When there are multiple concurrent callers the
system will thrash on the lock and appear to lock up.

Signed-off-by: Brian Behlendorf
---
 module/spl/spl-kmem.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
index 5627b5590b..f3113e0f49 100644
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -1725,6 +1725,13 @@ spl_cache_grow_wait(spl_kmem_cache_t *skc)
 	return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
 }
 
+static int
+spl_cache_reclaim_wait(void *word)
+{
+	schedule();
+	return 0;
+}
+
 /*
  * No available objects on any slabs, create a new slab.
  */
@@ -1739,12 +1746,14 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
 	*obj = NULL;
 
 	/*
-	 * Before allocating a new slab check if the slab is being reaped.
-	 * If it is there is a good chance we can wait until it finishes
-	 * and then use one of the newly freed but not aged-out slabs.
+	 * Before allocating a new slab wait for any reaping to complete and
+	 * then return so the local magazine can be rechecked for new objects.
 	 */
-	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags))
-		SRETURN(-EAGAIN);
+	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
+		rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
+		    spl_cache_reclaim_wait, TASK_UNINTERRUPTIBLE);
+		SRETURN(rc ? rc : -EAGAIN);
+	}
 
 	/*
 	 * This is handled by dispatching a work request to the global work
@@ -2156,6 +2165,9 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
 	/* Reclaim from the cache, ignoring it's age and delay. */
 	spl_slab_reclaim(skc, count, 1);
 	clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
+
 	atomic_dec(&skc->skc_ref);
 
 	SEXIT;
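
Taken together, the two hunks implement the kernel's standard
wait_on_bit()/wake_up_bit() handshake.  Condensed from the hunks above,
waiter and waker pair up as follows; the memory barrier orders the
clear_bit() before the waitqueue check inside wake_up_bit(), so a sleeper
cannot miss the transition:

	/* Waiter, in spl_cache_grow(): sleep until the reap finishes
	 * instead of spinning on skc->skc_lock in a tight retry loop. */
	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
		rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
		    spl_cache_reclaim_wait, TASK_UNINTERRUPTIBLE);
		SRETURN(rc ? rc : -EAGAIN);
	}

	/* Waker, in spl_kmem_cache_reap_now(): clear the bit, publish
	 * the change, then wake any tasks sleeping on it. */
	clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
	smp_mb__after_clear_bit();
	wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
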