zvol: ensure device minors are properly cleaned up
Currently, if a minor is in use when we try to remove it, we'll skip it and never come back to it again. Since the zvol state is hung off the minor in the kernel, this can get us into weird situations if something tries to use it after the removal fails. It's even worse at pool export, as there's now a vestigial zvol state with no pool under it. It's weirder again if the pool is subsequently reimported, as the zvol code (reasonably) assumes the zvol state has been properly set up, when it's actually left over from the previous import of the pool.

This commit attempts to tackle that by setting a flag on the zvol if its minor can't be removed, and then checking that flag when a request is made and rejecting it, thus stopping new work coming in.

The flag also causes a condvar to be signaled when the last client finishes. For the case where a single minor is being removed (eg changing volmode), it will wait for this signal before proceeding. Meanwhile, when removing all minors, a background task is created for each minor that couldn't be removed on the spot, and those tasks then wake and clean up. Since any new tasks are queued on to the pool's spa_zvol_taskq, spa_export_common() will continue to wait at export until all minors are removed.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #14872
Closes #16364
commit 670147be53
parent 88aab1d2d0
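The core of the change is a small flag-and-condvar handshake on the zvol state. The sketch below is condensed from the diff that follows (the flag, condvar and field names are the ones this commit adds; locking context and error paths are trimmed for illustration):

	/* Removal side: mark the zvol, then wait for existing users to drain. */
	mutex_enter(&zv->zv_state_lock);
	while (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
		zv->zv_flags |= ZVOL_REMOVING;	/* new opens and I/O now get ENXIO */
		cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock);
	}
	mutex_exit(&zv->zv_state_lock);

	/* Client side: the last close (or resume) wakes the waiting remover. */
	if (zv->zv_flags & ZVOL_REMOVING)
		cv_broadcast(&zv->zv_removing_cv);

Once the wait completes, the remover is the only user left, so it can take zvol_state_lock, unlink the minor and free the state.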
include/sys/zvol_impl.h
@@ -18,6 +18,9 @@
  *
  * CDDL HEADER END
  */
+/*
+ * Copyright (c) 2024, Klara, Inc.
+ */
 
 #ifndef _SYS_ZVOL_IMPL_H
 #define _SYS_ZVOL_IMPL_H
@@ -27,6 +30,7 @@
 #define ZVOL_RDONLY (1<<0) /* zvol is readonly (writes rejected) */
 #define ZVOL_WRITTEN_TO (1<<1) /* zvol has been written to (needs flush) */
 #define ZVOL_EXCL (1<<2) /* zvol has O_EXCL client right now */
+#define ZVOL_REMOVING (1<<3) /* zvol waiting to remove minor */
 
 /*
  * The in-core state of each volume.
@@ -50,6 +54,7 @@ typedef struct zvol_state {
 	kmutex_t zv_state_lock;		/* protects zvol_state_t */
 	atomic_t zv_suspend_ref;	/* refcount for suspend */
 	krwlock_t zv_suspend_lock;	/* suspend lock */
+	kcondvar_t zv_removing_cv;	/* ready to remove minor */
 	struct zvol_state_os *zv_zso;	/* private platform state */
 	boolean_t zv_threading;		/* volthreading property */
 } zvol_state_t;
module/os/freebsd/zfs/zvol_os.c
@@ -30,6 +30,7 @@
  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2024, Klara, Inc.
  */
 
 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
@@ -250,7 +251,7 @@ retry:
 	}
 
 	mutex_enter(&zv->zv_state_lock);
-	if (zv->zv_zso->zso_dying) {
+	if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
 		rw_exit(&zvol_state_lock);
 		err = SET_ERROR(ENXIO);
 		goto out_zv_locked;
@@ -683,6 +684,11 @@ zvol_geom_bio_strategy(struct bio *bp)
 
 	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
 
+	if (zv->zv_flags & ZVOL_REMOVING) {
+		error = SET_ERROR(ENXIO);
+		goto resume;
+	}
+
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		doread = B_TRUE;
@@ -1358,6 +1364,7 @@ zvol_os_free(zvol_state_t *zv)
 	}
 
 	mutex_destroy(&zv->zv_state_lock);
+	cv_destroy(&zv->zv_removing_cv);
 	dataset_kstats_destroy(&zv->zv_kstat);
 	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
 	kmem_free(zv, sizeof (zvol_state_t));
@@ -1415,6 +1422,7 @@ zvol_os_create_minor(const char *name)
 	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
 	zv->zv_hash = hash;
 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
 	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
 	zv->zv_volmode = volmode;
 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
module/os/linux/zfs/zvol_os.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2024, Klara, Inc.
  */
 
 #include <sys/dataset_kstats.h>
@@ -526,6 +527,11 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 	uint64_t size = io_size(bio, rq);
 	int rw = io_data_dir(bio, rq);
 
+	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
+		END_IO(zv, bio, rq, -SET_ERROR(ENXIO));
+		goto out;
+	}
+
 	if (zvol_request_sync || zv->zv_threading == B_FALSE)
 		force_sync = 1;
 
@@ -734,6 +740,13 @@ retry:
 	}
 
 	mutex_enter(&zv->zv_state_lock);
+
+	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
+		mutex_exit(&zv->zv_state_lock);
+		rw_exit(&zvol_state_lock);
+		return (-SET_ERROR(ENXIO));
+	}
+
 	/*
 	 * Make sure zvol is not suspended during first open
 	 * (hold zv_suspend_lock) and respect proper lock acquisition
@@ -1313,6 +1326,7 @@ zvol_alloc(dev_t dev, const char *name)
 
 	list_link_init(&zv->zv_next);
 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
 
 #ifdef HAVE_BLK_MQ
 	zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
@@ -1438,6 +1452,7 @@ zvol_os_free(zvol_state_t *zv)
 		ida_simple_remove(&zvol_ida,
 		    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
 
+	cv_destroy(&zv->zv_removing_cv);
 	mutex_destroy(&zv->zv_state_lock);
 	dataset_kstats_destroy(&zv->zv_kstat);
 
module/zfs/zvol.c
@@ -37,6 +37,7 @@
  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2024, Klara, Inc.
  */
 
 /*
@@ -894,6 +895,9 @@ zvol_resume(zvol_state_t *zv)
 	 */
 	atomic_dec(&zv->zv_suspend_ref);
 
+	if (zv->zv_flags & ZVOL_REMOVING)
+		cv_broadcast(&zv->zv_removing_cv);
+
 	return (SET_ERROR(error));
 }
 
@@ -929,6 +933,9 @@ zvol_last_close(zvol_state_t *zv)
 	ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
+	if (zv->zv_flags & ZVOL_REMOVING)
+		cv_broadcast(&zv->zv_removing_cv);
+
 	zvol_shutdown_zv(zv);
 
 	dmu_objset_disown(zv->zv_objset, 1, zv);
@@ -1221,6 +1228,41 @@ zvol_create_minor(const char *name)
  * Remove minors for specified dataset including children and snapshots.
  */
 
+/*
+ * Remove the minor for a given zvol. This will do it all:
+ *  - flag the zvol for removal, so new requests are rejected
+ *  - wait until outstanding requests are completed
+ *  - remove it from lists
+ *  - free it
+ * It's also usable as a taskq task, and smells nice too.
+ */
+static void
+zvol_remove_minor_task(void *arg)
+{
+	zvol_state_t *zv = (zvol_state_t *)arg;
+
+	ASSERT(!RW_LOCK_HELD(&zvol_state_lock));
+	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+
+	mutex_enter(&zv->zv_state_lock);
+	while (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
+		zv->zv_flags |= ZVOL_REMOVING;
+		cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock);
+	}
+	mutex_exit(&zv->zv_state_lock);
+
+	rw_enter(&zvol_state_lock, RW_WRITER);
+	mutex_enter(&zv->zv_state_lock);
+
+	zvol_remove(zv);
+	zvol_os_clear_private(zv);
+
+	mutex_exit(&zv->zv_state_lock);
+	rw_exit(&zvol_state_lock);
+
+	zvol_os_free(zv);
+}
+
 static void
 zvol_free_task(void *arg)
 {
@@ -1233,11 +1275,13 @@ zvol_remove_minors_impl(const char *name)
 	zvol_state_t *zv, *zv_next;
 	int namelen = ((name) ? strlen(name) : 0);
 	taskqid_t t;
-	list_t free_list;
+	list_t delay_list, free_list;
 
 	if (zvol_inhibit_dev)
 		return;
 
+	list_create(&delay_list, sizeof (zvol_state_t),
+	    offsetof(zvol_state_t, zv_next));
 	list_create(&free_list, sizeof (zvol_state_t),
 	    offsetof(zvol_state_t, zv_next));
 
@@ -1256,9 +1300,24 @@
 		 * one is currently using this zv
 		 */
 
-		/* If in use, leave alone */
+		/*
+		 * If in use, try to throw everyone off and try again
+		 * later.
+		 */
 		if (zv->zv_open_count > 0 ||
 		    atomic_read(&zv->zv_suspend_ref)) {
+			zv->zv_flags |= ZVOL_REMOVING;
+			t = taskq_dispatch(
+			    zv->zv_objset->os_spa->spa_zvol_taskq,
+			    zvol_remove_minor_task, zv, TQ_SLEEP);
+			if (t == TASKQID_INVALID) {
+				/*
+				 * Couldn't create the task, so we'll
+				 * do it in place once the loop is
+				 * finished.
+				 */
+				list_insert_head(&delay_list, zv);
+			}
 			mutex_exit(&zv->zv_state_lock);
 			continue;
 		}
@@ -1285,7 +1344,11 @@
 	}
 	rw_exit(&zvol_state_lock);
 
-	/* Drop zvol_state_lock before calling zvol_free() */
+	/* Wait for zvols that we couldn't create a remove task for */
+	while ((zv = list_remove_head(&delay_list)) != NULL)
+		zvol_remove_minor_task(zv);
+
+	/* Free any that we couldn't free in parallel earlier */
 	while ((zv = list_remove_head(&free_list)) != NULL)
 		zvol_os_free(zv);
 }
@@ -1305,32 +1368,37 @@ zvol_remove_minor_impl(const char *name)
 		zv_next = list_next(&zvol_state_list, zv);
 
 		mutex_enter(&zv->zv_state_lock);
-		if (strcmp(zv->zv_name, name) == 0) {
-			/*
-			 * By holding zv_state_lock here, we guarantee that no
-			 * one is currently using this zv
-			 */
-
-			/* If in use, leave alone */
-			if (zv->zv_open_count > 0 ||
-			    atomic_read(&zv->zv_suspend_ref)) {
-				mutex_exit(&zv->zv_state_lock);
-				continue;
-			}
-			zvol_remove(zv);
-
-			zvol_os_clear_private(zv);
-			mutex_exit(&zv->zv_state_lock);
+		if (strcmp(zv->zv_name, name) == 0)
+			/* Found, leave the the loop with zv_lock held */
 			break;
-		} else {
-			mutex_exit(&zv->zv_state_lock);
-		}
+		mutex_exit(&zv->zv_state_lock);
 	}
 
-	/* Drop zvol_state_lock before calling zvol_free() */
+	if (zv == NULL) {
+		rw_exit(&zvol_state_lock);
+		return;
+	}
+
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+	if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
+		/*
+		 * In use, so try to throw everyone off, then wait
+		 * until finished.
+		 */
+		zv->zv_flags |= ZVOL_REMOVING;
+		mutex_exit(&zv->zv_state_lock);
+		rw_exit(&zvol_state_lock);
+		zvol_remove_minor_task(zv);
+		return;
+	}
+
+	zvol_remove(zv);
+	zvol_os_clear_private(zv);
+
+	mutex_exit(&zv->zv_state_lock);
 	rw_exit(&zvol_state_lock);
 
-	if (zv != NULL)
-		zvol_os_free(zv);
+	zvol_os_free(zv);
 }