From 1ee159f423b5eb3c4646b0ba2dd0fb359503ba90 Mon Sep 17 00:00:00 2001
From: Boris Protopopov <boris.protopopov@actifio.com>
Date: Wed, 23 Sep 2015 12:34:51 -0400
Subject: [PATCH] Fix lock order inversion with zvol_open()

zfsonlinux issue #3681 - lock order inversion between zvol_open() and
dsl_pool_sync()...zvol_rename_minors()

Remove trylock of spa_namespace_lock as it is no longer needed when
zvol minor operations are performed in a separate context with no
prior locking state; the spa_namespace_lock is no longer held
when bdev->bd_mutex or zfs_state_lock might be taken in the code
paths originating from the zvol minor operation callbacks.

Signed-off-by: Boris Protopopov <boris.protopopov@actifio.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3681
---
 module/zfs/zvol.c | 65 +++++++++++++++--------------------------------
 1 file changed, 21 insertions(+), 44 deletions(-)

diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index ab4d3ceb74..ba482a4740 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -957,56 +957,27 @@ zvol_first_open(zvol_state_t *zv)
 {
 	objset_t *os;
 	uint64_t volsize;
-	int locked = 0;
 	int error;
 	uint64_t ro;
 
-	/*
-	 * In all other cases the spa_namespace_lock is taken before the
-	 * bdev->bd_mutex lock.  But in this case the Linux __blkdev_get()
-	 * function calls fops->open() with the bdev->bd_mutex lock held.
-	 *
-	 * To avoid a potential lock inversion deadlock we preemptively
-	 * try to take the spa_namespace_lock().  Normally it will not
-	 * be contended and this is safe because spa_open_common() handles
-	 * the case where the caller already holds the spa_namespace_lock.
-	 *
-	 * When it is contended we risk a lock inversion if we were to
-	 * block waiting for the lock.  Luckily, the __blkdev_get()
-	 * function allows us to return -ERESTARTSYS which will result in
-	 * bdev->bd_mutex being dropped, reacquired, and fops->open() being
-	 * called again.  This process can be repeated safely until both
-	 * locks are acquired.
-	 */
-	if (!mutex_owned(&spa_namespace_lock)) {
-		locked = mutex_tryenter(&spa_namespace_lock);
-		if (!locked)
-			return (-SET_ERROR(ERESTARTSYS));
-	}
-
-	error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
-	if (error)
-		goto out_mutex;
-
 	/* lie and say we're read-only */
 	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os);
 	if (error)
-		goto out_mutex;
-
-	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
-	if (error) {
-		dmu_objset_disown(os, zvol_tag);
-		zv->zv_objset = NULL;
-		goto out_mutex;
-	}
+		return (SET_ERROR(-error));
 
 	zv->zv_objset = os;
+
+	error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
+	if (error)
+		goto out_owned;
+
+	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+	if (error)
+		goto out_owned;
+
 	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
-	if (error) {
-		dmu_objset_disown(os, zvol_tag);
-		zv->zv_objset = NULL;
-		goto out_mutex;
-	}
+	if (error)
+		goto out_owned;
 
 	set_capacity(zv->zv_disk, volsize >> 9);
 	zv->zv_volsize = volsize;
@@ -1021,9 +992,11 @@ zvol_first_open(zvol_state_t *zv)
 		zv->zv_flags &= ~ZVOL_RDONLY;
 	}
 
-out_mutex:
-	if (locked)
-		mutex_exit(&spa_namespace_lock);
+out_owned:
+	if (error) {
+		dmu_objset_disown(os, zvol_tag);
+		zv->zv_objset = NULL;
+	}
 
 	return (SET_ERROR(-error));
 }
@@ -1526,6 +1499,8 @@ zvol_create_snap_minor_cb(const char *dsname, void *arg)
 {
 	const char *name = (const char *)arg;
 
+	ASSERT0(MUTEX_HELD(&spa_namespace_lock));
+
 	/* skip the designated dataset */
 	if (name && strcmp(dsname, name) == 0)
 		return (0);
@@ -1550,6 +1525,8 @@ zvol_create_minors_cb(const char *dsname, void *arg)
 	uint64_t snapdev;
 	int error;
 
+	ASSERT0(MUTEX_HELD(&spa_namespace_lock));
+
 	error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL);
 	if (error)
 		return (0);