From 5428dc51fb55145fbac1c142402dafc11d1e7d28 Mon Sep 17 00:00:00 2001 From: Boris Protopopov Date: Tue, 16 Feb 2016 14:52:55 -0500 Subject: [PATCH] Make zvol minor functionality more robust Close the race window in zvol_open() to prevent removal of zvol_state in the 'first open' code path. Move the call to check_disk_change() under zvol_state_lock to make sure the zvol_media_changed() and zvol_revalidate_disk() called by check_disk_change() are invoked with positive zv_open_count. Skip opened zvols when removing minors and set private_data to NULL for zvols that are not in use whose minors are being removed, to indicate to zvol_open() that the state is gone. Skip opened zvols when renaming minors to avoid modifying zv_name that might be in use, e.g. in zvol_ioctl(). Drop zvol_state_lock before calling add_disk() when creating minors to avoid deadlocks with zvol_open(). Wrap dmu_objset_find() with spl_fstran_mark()/unmark(). Signed-off-by: Boris Protopopov Signed-off-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #4344 --- module/zfs/zvol.c | 93 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 77 insertions(+), 16 deletions(-) diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 52774a82ce..c90e4ec6b9 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -35,6 +35,7 @@ * needs to be run before opening and using a device. * * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ #include @@ -607,6 +608,8 @@ zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync) rl_t *rl; int error = 0; + ASSERT(zv && zv->zv_open_count > 0); + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, RL_WRITER); @@ -675,6 +678,8 @@ zvol_discard(struct bio *bio) rl_t *rl; dmu_tx_t *tx; + ASSERT(zv && zv->zv_open_count > 0); + if (end > zv->zv_volsize) return (SET_ERROR(EIO)); @@ -722,6 +727,8 @@ zvol_read(zvol_state_t *zv, uio_t *uio) rl_t *rl; int error = 0; + ASSERT(zv && zv->zv_open_count > 0); + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { @@ -1020,7 +1027,7 @@ zvol_last_close(zvol_state_t *zv) static int zvol_open(struct block_device *bdev, fmode_t flag) { - zvol_state_t *zv = bdev->bd_disk->private_data; + zvol_state_t *zv; int error = 0, drop_mutex = 0; /* @@ -1034,7 +1041,17 @@ zvol_open(struct block_device *bdev, fmode_t flag) drop_mutex = 1; } - ASSERT3P(zv, !=, NULL); + /* + * Obtain a copy of private_data under the lock to make sure + * that either the result of zvol_freeg() setting + * bdev->bd_disk->private_data to NULL is observed, or zvol_free() + * is not called on this zv because of the positive zv_open_count. + */ + zv = bdev->bd_disk->private_data; + if (zv == NULL) { + error = -ENXIO; + goto out_mutex; + } if (zv->zv_open_count == 0) { error = zvol_first_open(zv); @@ -1049,6 +1066,8 @@ zvol_open(struct block_device *bdev, fmode_t flag) zv->zv_open_count++; + check_disk_change(bdev); + out_open_count: if (zv->zv_open_count == 0) zvol_last_close(zv); @@ -1057,8 +1076,6 @@ out_mutex: if (drop_mutex) mutex_exit(&zvol_state_lock); - check_disk_change(bdev); - return (SET_ERROR(error)); } @@ -1072,16 +1089,16 @@ zvol_release(struct gendisk *disk, fmode_t mode) zvol_state_t *zv = disk->private_data; int drop_mutex = 0; + ASSERT(zv && zv->zv_open_count > 0); + if (!mutex_owned(&zvol_state_lock)) { mutex_enter(&zvol_state_lock); drop_mutex = 1; } - if (zv->zv_open_count > 0) { - zv->zv_open_count--; - if (zv->zv_open_count == 0) - zvol_last_close(zv); - } + zv->zv_open_count--; + if (zv->zv_open_count == 0) + zvol_last_close(zv); if (drop_mutex) mutex_exit(&zvol_state_lock); @@ -1098,8 +1115,7 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode, zvol_state_t *zv = bdev->bd_disk->private_data; int error = 0; - if (zv == NULL) - return (SET_ERROR(-ENXIO)); + ASSERT(zv && zv->zv_open_count > 0); switch (cmd) { case BLKFLSBUF: @@ -1133,6 +1149,8 @@ static int zvol_media_changed(struct gendisk *disk) { zvol_state_t *zv = disk->private_data; + ASSERT(zv && zv->zv_open_count > 0); + return (zv->zv_changed); } @@ -1140,6 +1158,8 @@ static int zvol_revalidate_disk(struct gendisk *disk) { zvol_state_t *zv = disk->private_data; + ASSERT(zv && zv->zv_open_count > 0); + zv->zv_changed = 0; set_capacity(zv->zv_disk, zv->zv_volsize >> 9); @@ -1156,7 +1176,11 @@ static int zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) { zvol_state_t *zv = bdev->bd_disk->private_data; - sector_t sectors = get_capacity(zv->zv_disk); + sector_t sectors; + + ASSERT(zv && zv->zv_open_count > 0); + + sectors = get_capacity(zv->zv_disk); if (sectors > 2048) { geo->heads = 16; @@ -1312,9 +1336,14 @@ out_kmem: static void zvol_free(zvol_state_t *zv) { + ASSERT(MUTEX_HELD(&zvol_state_lock)); + ASSERT(zv->zv_open_count == 0); + avl_destroy(&zv->zv_znode.z_range_avl); mutex_destroy(&zv->zv_znode.z_range_lock); + zv->zv_disk->private_data = NULL; + del_gendisk(zv->zv_disk); blk_cleanup_queue(zv->zv_queue); put_disk(zv->zv_disk); @@ -1448,7 +1477,15 @@ out: if (error == 0) { zvol_insert(zv); + /* + * Drop the lock to prevent deadlock with sys_open() -> + * zvol_open(), which first takes bd_disk->bd_mutex and then + * takes zvol_state_lock, whereas this code path first takes + * zvol_state_lock, and then takes bd_disk->bd_mutex. + */ + mutex_exit(&zvol_state_lock); add_disk(zv->zv_disk); + mutex_enter(&zvol_state_lock); } return (SET_ERROR(error)); @@ -1545,10 +1582,15 @@ int zvol_create_minors(const char *name) { int error = 0; + fstrans_cookie_t cookie; - if (!zvol_inhibit_dev) - error = dmu_objset_find((char *)name, zvol_create_minors_cb, - NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + if (zvol_inhibit_dev) + return (0); + + cookie = spl_fstrans_mark(); + error = dmu_objset_find((char *)name, zvol_create_minors_cb, + NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + spl_fstrans_unmark(cookie); return (SET_ERROR(error)); } @@ -1572,7 +1614,13 @@ zvol_remove_minors(const char *name) if (name == NULL || strcmp(zv->zv_name, name) == 0 || (strncmp(zv->zv_name, name, namelen) == 0 && - zv->zv_name[namelen] == '/')) { + (zv->zv_name[namelen] == '/' || + zv->zv_name[namelen] == '@'))) { + + /* If in use, leave alone */ + if (zv->zv_open_count > 0) + continue; + zvol_remove(zv); zvol_free(zv); } @@ -1603,6 +1651,10 @@ zvol_rename_minors(const char *oldname, const char *newname) for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); + /* If in use, leave alone */ + if (zv->zv_open_count > 0) + continue; + if (strcmp(zv->zv_name, oldname) == 0) { __zvol_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && @@ -1643,8 +1695,17 @@ snapdev_snapshot_changed_cb(const char *dsname, void *arg) { int zvol_set_snapdev(const char *dsname, uint64_t snapdev) { + fstrans_cookie_t cookie; + + if (zvol_inhibit_dev) + /* caller should continue to modify snapdev property */ + return (-1); + + cookie = spl_fstrans_mark(); (void) dmu_objset_find((char *) dsname, snapdev_snapshot_changed_cb, &snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); + spl_fstrans_unmark(cookie); + /* caller should continue to modify snapdev property */ return (-1); }