diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h index 777b1b93ee..3cd0d78c35 100644 --- a/include/sys/zvol_impl.h +++ b/include/sys/zvol_impl.h @@ -18,6 +18,9 @@ * * CDDL HEADER END */ +/* + * Copyright (c) 2024, Klara, Inc. + */ #ifndef _SYS_ZVOL_IMPL_H #define _SYS_ZVOL_IMPL_H @@ -27,6 +30,7 @@ #define ZVOL_RDONLY (1<<0) /* zvol is readonly (writes rejected) */ #define ZVOL_WRITTEN_TO (1<<1) /* zvol has been written to (needs flush) */ #define ZVOL_EXCL (1<<2) /* zvol has O_EXCL client right now */ +#define ZVOL_REMOVING (1<<3) /* zvol waiting to remove minor */ /* * The in-core state of each volume. @@ -50,6 +54,7 @@ typedef struct zvol_state { kmutex_t zv_state_lock; /* protects zvol_state_t */ atomic_t zv_suspend_ref; /* refcount for suspend */ krwlock_t zv_suspend_lock; /* suspend lock */ + kcondvar_t zv_removing_cv; /* ready to remove minor */ struct zvol_state_os *zv_zso; /* private platform state */ boolean_t zv_threading; /* volthreading property */ } zvol_state_t; diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index a53a541414..ddb20b0314 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -30,6 +30,7 @@ * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright (c) 2024, Klara, Inc. 
*/ /* Portions Copyright 2011 Martin Matuska */ @@ -250,7 +251,7 @@ retry: } mutex_enter(&zv->zv_state_lock); - if (zv->zv_zso->zso_dying) { + if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) { rw_exit(&zvol_state_lock); err = SET_ERROR(ENXIO); goto out_zv_locked; @@ -683,6 +684,11 @@ zvol_geom_bio_strategy(struct bio *bp) rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + if (zv->zv_flags & ZVOL_REMOVING) { + error = SET_ERROR(ENXIO); + goto resume; + } + switch (bp->bio_cmd) { case BIO_READ: doread = B_TRUE; @@ -1358,6 +1364,7 @@ zvol_os_free(zvol_state_t *zv) } mutex_destroy(&zv->zv_state_lock); + cv_destroy(&zv->zv_removing_cv); dataset_kstats_destroy(&zv->zv_kstat); kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); @@ -1415,6 +1422,7 @@ zvol_os_create_minor(const char *name) zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); zv->zv_hash = hash; mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_volmode = volmode; if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index ba6a24f312..83f80f62ae 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara, Inc. 
*/ #include @@ -526,6 +527,11 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, uint64_t size = io_size(bio, rq); int rw = io_data_dir(bio, rq); + if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { + END_IO(zv, bio, rq, -SET_ERROR(ENXIO)); + goto out; + } + if (zvol_request_sync || zv->zv_threading == B_FALSE) force_sync = 1; @@ -734,6 +740,13 @@ retry: } mutex_enter(&zv->zv_state_lock); + + if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zvol_state_lock); + return (-SET_ERROR(ENXIO)); + } + /* * Make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition @@ -1313,6 +1326,7 @@ zvol_alloc(dev_t dev, const char *name) list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); #ifdef HAVE_BLK_MQ zv->zv_zso->use_blk_mq = zvol_use_blk_mq; @@ -1438,6 +1452,7 @@ zvol_os_free(zvol_state_t *zv) ida_simple_remove(&zvol_ida, MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); + cv_destroy(&zv->zv_removing_cv); mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 5b6a3f5cb4..001f774a6d 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -37,6 +37,7 @@ * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara, Inc. 
*/ /* @@ -894,6 +895,9 @@ zvol_resume(zvol_state_t *zv) */ atomic_dec(&zv->zv_suspend_ref); + if (zv->zv_flags & ZVOL_REMOVING) + cv_broadcast(&zv->zv_removing_cv); + return (SET_ERROR(error)); } @@ -929,6 +933,9 @@ zvol_last_close(zvol_state_t *zv) ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + if (zv->zv_flags & ZVOL_REMOVING) + cv_broadcast(&zv->zv_removing_cv); + zvol_shutdown_zv(zv); dmu_objset_disown(zv->zv_objset, 1, zv); @@ -1221,6 +1228,41 @@ zvol_create_minor(const char *name) * Remove minors for specified dataset including children and snapshots. */ +/* + * Remove the minor for a given zvol. This will do it all: + * - flag the zvol for removal, so new requests are rejected + * - wait until outstanding requests are completed + * - remove it from lists + * - free it + * It's also usable as a taskq task, and smells nice too. + */ +static void +zvol_remove_minor_task(void *arg) +{ + zvol_state_t *zv = (zvol_state_t *)arg; + + ASSERT(!RW_LOCK_HELD(&zvol_state_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + + mutex_enter(&zv->zv_state_lock); + while (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { + zv->zv_flags |= ZVOL_REMOVING; + cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock); + } + mutex_exit(&zv->zv_state_lock); + + rw_enter(&zvol_state_lock, RW_WRITER); + mutex_enter(&zv->zv_state_lock); + + zvol_remove(zv); + zvol_os_clear_private(zv); + + mutex_exit(&zv->zv_state_lock); + rw_exit(&zvol_state_lock); + + zvol_os_free(zv); +} + static void zvol_free_task(void *arg) { @@ -1233,11 +1275,13 @@ zvol_remove_minors_impl(const char *name) zvol_state_t *zv, *zv_next; int namelen = ((name) ? 
strlen(name) : 0); taskqid_t t; - list_t free_list; + list_t delay_list, free_list; if (zvol_inhibit_dev) return; + list_create(&delay_list, sizeof (zvol_state_t), + offsetof(zvol_state_t, zv_next)); list_create(&free_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); @@ -1256,9 +1300,24 @@ zvol_remove_minors_impl(const char *name) * one is currently using this zv */ - /* If in use, leave alone */ + /* + * If in use, try to throw everyone off and try again + * later. + */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { + zv->zv_flags |= ZVOL_REMOVING; + t = taskq_dispatch( + zv->zv_objset->os_spa->spa_zvol_taskq, + zvol_remove_minor_task, zv, TQ_SLEEP); + if (t == TASKQID_INVALID) { + /* + * Couldn't create the task, so we'll + * do it in place once the loop is + * finished. + */ + list_insert_head(&delay_list, zv); + } mutex_exit(&zv->zv_state_lock); continue; } @@ -1285,7 +1344,11 @@ zvol_remove_minors_impl(const char *name) } rw_exit(&zvol_state_lock); - /* Drop zvol_state_lock before calling zvol_free() */ + /* Wait for zvols that we couldn't create a remove task for */ + while ((zv = list_remove_head(&delay_list)) != NULL) + zvol_remove_minor_task(zv); + + /* Free any that we couldn't free in parallel earlier */ while ((zv = list_remove_head(&free_list)) != NULL) zvol_os_free(zv); } @@ -1305,33 +1368,38 @@ zvol_remove_minor_impl(const char *name) zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); - if (strcmp(zv->zv_name, name) == 0) { - /* - * By holding zv_state_lock here, we guarantee that no - * one is currently using this zv - */ - - /* If in use, leave alone */ - if (zv->zv_open_count > 0 || - atomic_read(&zv->zv_suspend_ref)) { - mutex_exit(&zv->zv_state_lock); - continue; - } - zvol_remove(zv); - - zvol_os_clear_private(zv); - mutex_exit(&zv->zv_state_lock); + if (strcmp(zv->zv_name, name) == 0) + /* Found, leave the loop with zv_state_lock held */ break; - } else { - mutex_exit(&zv->zv_state_lock); - 
} + mutex_exit(&zv->zv_state_lock); } - /* Drop zvol_state_lock before calling zvol_free() */ + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return; + } + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { + /* + * In use, so try to throw everyone off, then wait + * until finished. + */ + zv->zv_flags |= ZVOL_REMOVING; + mutex_exit(&zv->zv_state_lock); + rw_exit(&zvol_state_lock); + zvol_remove_minor_task(zv); + return; + } + + zvol_remove(zv); + zvol_os_clear_private(zv); + + mutex_exit(&zv->zv_state_lock); rw_exit(&zvol_state_lock); - if (zv != NULL) - zvol_os_free(zv); + zvol_os_free(zv); } /*