Fix lock order inversion with zvol_open()

zfsonlinux issue #3681 - lock order inversion between zvol_open() and
dsl_pool_sync()...zvol_rename_minors()

Remove trylock of spa_namespace_lock as it is no longer needed when
zvol minor operations are performed in a separate context with no
prior locking state; the spa_namespace_lock is no longer held
when bdev->bd_mutex or zfs_state_lock might be taken in the code
paths originating from the zvol minor operation callbacks.

Signed-off-by: Boris Protopopov <boris.protopopov@actifio.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3681
This commit is contained in:
Boris Protopopov 2015-09-23 12:34:51 -04:00 committed by Ned Bass
parent 9d02f9557f
commit d0337e80ca
1 changed files with 21 additions and 44 deletions

View File

@ -901,56 +901,27 @@ zvol_first_open(zvol_state_t *zv)
{
objset_t *os;
uint64_t volsize;
int locked = 0;
int error;
uint64_t ro;
/*
* In all other cases the spa_namespace_lock is taken before the
* bdev->bd_mutex lock. But in this case the Linux __blkdev_get()
* function calls fops->open() with the bdev->bd_mutex lock held.
*
* To avoid a potential lock inversion deadlock we preemptively
* try to take the spa_namespace_lock(). Normally it will not
* be contended and this is safe because spa_open_common() handles
* the case where the caller already holds the spa_namespace_lock.
*
* When it is contended we risk a lock inversion if we were to
* block waiting for the lock. Luckily, the __blkdev_get()
* function allows us to return -ERESTARTSYS which will result in
* bdev->bd_mutex being dropped, reacquired, and fops->open() being
* called again. This process can be repeated safely until both
* locks are acquired.
*/
if (!mutex_owned(&spa_namespace_lock)) {
locked = mutex_tryenter(&spa_namespace_lock);
if (!locked)
return (-SET_ERROR(ERESTARTSYS));
}
error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
if (error)
goto out_mutex;
/* lie and say we're read-only */
error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os);
if (error)
goto out_mutex;
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
if (error) {
dmu_objset_disown(os, zvol_tag);
zv->zv_objset = NULL;
goto out_mutex;
}
return (SET_ERROR(-error));
zv->zv_objset = os;
error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
if (error)
goto out_owned;
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
if (error)
goto out_owned;
error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
if (error) {
dmu_objset_disown(os, zvol_tag);
zv->zv_objset = NULL;
goto out_mutex;
}
if (error)
goto out_owned;
set_capacity(zv->zv_disk, volsize >> 9);
zv->zv_volsize = volsize;
@ -965,9 +936,11 @@ zvol_first_open(zvol_state_t *zv)
zv->zv_flags &= ~ZVOL_RDONLY;
}
out_mutex:
if (locked)
mutex_exit(&spa_namespace_lock);
out_owned:
if (error) {
dmu_objset_disown(os, zvol_tag);
zv->zv_objset = NULL;
}
return (SET_ERROR(-error));
}
@ -1470,6 +1443,8 @@ zvol_create_snap_minor_cb(const char *dsname, void *arg)
{
const char *name = (const char *)arg;
ASSERT0(MUTEX_HELD(&spa_namespace_lock));
/* skip the designated dataset */
if (name && strcmp(dsname, name) == 0)
return (0);
@ -1494,6 +1469,8 @@ zvol_create_minors_cb(const char *dsname, void *arg)
uint64_t snapdev;
int error;
ASSERT0(MUTEX_HELD(&spa_namespace_lock));
error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL);
if (error)
return (0);