From 9f4d64e711f6cef57e1e3042d64db2098748e637 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 3 Sep 2024 17:41:16 -0700 Subject: [PATCH] Remove spa_namespace_lock from zpool status This commit removes spa_namespace_lock from the zpool status codepath. This means that zpool status will not hang if a pool fails while holding the spa_namespace_lock. Background: The spa_namespace_lock was originally meant to protect the spa_namespace_avl AVL tree. The spa_namespace_avl tree held the mappings from pool names to spa_t's. So if you wanted to lookup the spa_t for the "tank" pool, you would do an AVL search for "tank" while holding spa_namespace_lock. Over time though the spa_namespace_lock was re-purposed to protect other critical codepaths in the spa subsystem as well. In many cases we don't know what the original authors meant to protect with it, or if they needed it for read-only or read-write protection. It is simply "too big and risky to fix properly". The workaround is to add a new lightweight version of the spa_namespace_lock called spa_namespace_lite_lock. spa_namespace_lite_lock only protects the AVL tree, and nothing else. It can be used for read-only access to the AVL tree without requiring the spa_namespace_lock. Calls to spa_lookup_lite() and spa_next_lite() only need to acquire a reader lock on spa_namespace_lite_lock; they do not need to also acquire the old spa_namespace_lock. This allows us to still run zpool status even if the zfs module has spa_namespace_lock held. Note that these AVL tree locks only protect the tree, not the actual spa_t contents. 
Signed-off-by: Tony Hutter --- TEST | 2 +- include/sys/spa.h | 18 +++ include/sys/spa_impl.h | 1 + include/sys/zfs_context.h | 2 + include/sys/zfs_delay.h | 1 + module/os/freebsd/zfs/spa_os.c | 2 +- module/os/freebsd/zfs/zfs_ioctl_os.c | 2 +- module/zfs/arc.c | 2 +- module/zfs/mmp.c | 2 +- module/zfs/spa.c | 134 ++++++++++++---- module/zfs/spa_config.c | 11 +- module/zfs/spa_misc.c | 92 ++++++++++- module/zfs/zfs_fm.c | 2 +- module/zfs/zfs_ioctl.c | 23 +-- module/zfs/zio_inject.c | 4 +- tests/runfiles/common.run | 2 +- tests/zfs-tests/include/commands.cfg | 2 + tests/zfs-tests/include/tunables.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 1 + .../zpool_status_namespace_lock.ksh | 144 ++++++++++++++++++ 20 files changed, 375 insertions(+), 73 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_namespace_lock.ksh diff --git a/TEST b/TEST index 376d6eb691..25d04dd01e 100644 --- a/TEST +++ b/TEST @@ -35,7 +35,7 @@ #TEST_ZFSTESTS_ITERS="1" #TEST_ZFSTESTS_OPTIONS="-vx" #TEST_ZFSTESTS_RUNFILE="linux.run" -#TEST_ZFSTESTS_TAGS="functional" +TEST_ZFSTESTS_TAGS="zpool_import,zpool_export,zpool_status,zpool_add" ### zfsstress #TEST_ZFSSTRESS_SKIP="yes" diff --git a/include/sys/spa.h b/include/sys/spa.h index aa66d489ef..71013ca6b8 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -842,6 +842,22 @@ extern kmutex_t spa_namespace_lock; extern avl_tree_t spa_namespace_avl; extern kcondvar_t spa_namespace_cv; +extern krwlock_t spa_namespace_lite_lock; + +extern uint_t spa_namespace_delay_ms; +/* + * Special version of mutex_enter() used for testing the spa_namespace_lock. + * It inserts an artificial delay after acquiring the lock to simulate a failure + * while holding the lock. The delay is controlled by the + * spa_namespace_delay_ms module param and is only to be used by ZTS. 
+ */ +#define mutex_enter_ns(lock) do { \ + ASSERT(lock == &spa_namespace_lock); \ + mutex_enter(lock); \ + if (unlikely(spa_namespace_delay_ms != 0)) \ + zfs_msleep(spa_namespace_delay_ms); \ + } while (0) + /* * SPA configuration functions in spa_config.c */ @@ -866,9 +882,11 @@ extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); +extern spa_t *spa_lookup_lite(const char *name); extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); +extern spa_t *spa_next_lite(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, const void *tag); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 7811abbb9c..91850b1b13 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -484,6 +484,7 @@ struct spa { extern char *spa_config_path; extern const char *zfs_deadman_failmode; extern uint_t spa_slop_shift; +extern uint_t spa_namespace_delay_ms; extern void spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, zio_t *zio, boolean_t cutinline); extern void spa_load_spares(spa_t *spa); diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 998eaa5dd8..546589165f 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -769,6 +769,8 @@ void ksiddomain_rele(ksiddomain_t *); (void) nanosleep(&ts, NULL); \ } while (0) +#define zfs_msleep(ms) zfs_sleep_until(gethrtime() + (MSEC2NSEC(ms))) + typedef int fstrans_cookie_t; extern fstrans_cookie_t spl_fstrans_mark(void); diff --git a/include/sys/zfs_delay.h b/include/sys/zfs_delay.h index 56ac1f3c43..156d5043bc 100644 --- a/include/sys/zfs_delay.h +++ b/include/sys/zfs_delay.h @@ -37,5 +37,6 @@ usleep_range(delta_us, delta_us + 100); \ } \ } while (0) +#define zfs_msleep(ms) zfs_sleep_until(gethrtime() + (MSEC2NSEC(ms))) #endif /* 
_SYS_FS_ZFS_DELAY_H */ diff --git a/module/os/freebsd/zfs/spa_os.c b/module/os/freebsd/zfs/spa_os.c index 1b9f1a4ec9..d2b8587d82 100644 --- a/module/os/freebsd/zfs/spa_os.c +++ b/module/os/freebsd/zfs/spa_os.c @@ -192,7 +192,7 @@ spa_import_rootpool(const char *name, bool checkpointrewind) */ config = spa_generate_rootconf(name); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if (config != NULL) { pname = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); VERIFY0(strcmp(name, pname)); diff --git a/module/os/freebsd/zfs/zfs_ioctl_os.c b/module/os/freebsd/zfs/zfs_ioctl_os.c index 928cf25e94..61731665d9 100644 --- a/module/os/freebsd/zfs/zfs_ioctl_os.c +++ b/module/os/freebsd/zfs/zfs_ioctl_os.c @@ -107,7 +107,7 @@ zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) "command", &command) != 0) return (EINVAL); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa = spa_by_guid(pool_guid, vdev_guid); if (spa != NULL) strcpy(name, spa_name(spa)); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 714a30e863..e7f410c5fc 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8205,7 +8205,7 @@ l2arc_dev_get_next(void) * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. 
*/ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 7112254275..7e8e9cdf62 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -728,7 +728,7 @@ mmp_signal_all_threads(void) { spa_t *spa = NULL; - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); while ((spa = spa_next(spa))) { if (spa->spa_state == POOL_STATE_ACTIVE) mmp_signal_thread(spa); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 1a68a09535..e25bad9189 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1038,7 +1038,7 @@ spa_change_guid(spa_t *spa, const uint64_t *guidp) int error; mutex_enter(&spa->spa_vdev_top_lock); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if (guidp != NULL) { guid = *guidp; @@ -5635,7 +5635,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_load_note(spa, "LOADED"); fail: - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa->spa_load_thread = NULL; cv_broadcast(&spa_namespace_cv); @@ -5800,7 +5800,7 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, * avoid dsl_dir_open() calling this in the first place. */ if (MUTEX_NOT_HELD(&spa_namespace_lock)) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); locked = B_TRUE; } @@ -5898,6 +5898,46 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, return (0); } +/* + * This is a lightweight version of spa_open_common() used only for getting + * stats. It does not use the spa_namespace_lock. 
+ * + * Returns: + * + * 0 Success + * ENOENT No pool with that name + * EAGAIN Try doing a spa_open_common() instead + * + * On failure, *spapp will be set to NULL; + */ +static int +spa_read_config_lite(const char *pool, spa_t **spapp, nvlist_t **config) +{ + spa_t *spa; + ASSERT(RW_LOCK_HELD(&spa_namespace_lite_lock)); + + if ((spa = spa_lookup_lite(pool)) == NULL) { + *spapp = NULL; + return (SET_ERROR(ENOENT)); + } + + /* + * Special case: If the pool isn't initialized yet, then tell the caller + * to call spa_open_common() instead. That will do the full, + * heavyweight load of spa_t. + */ + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { + *spapp = NULL; + return (EAGAIN); + } + + if (config != NULL) + *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + + *spapp = spa; + return (0); +} + int spa_open_rewind(const char *name, spa_t **spapp, const void *tag, nvlist_t *policy, nvlist_t **config) @@ -5920,7 +5960,7 @@ spa_inject_addref(char *name) { spa_t *spa; - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); return (NULL); @@ -5934,7 +5974,7 @@ spa_inject_addref(char *name) void spa_inject_delref(spa_t *spa) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa->spa_inject_ref--; mutex_exit(&spa_namespace_lock); } @@ -6134,9 +6174,26 @@ spa_get_stats(const char *name, nvlist_t **config, { int error; spa_t *spa; + boolean_t is_lite = B_TRUE; *config = NULL; - error = spa_open_common(name, &spa, FTAG, NULL, config); + + rw_enter(&spa_namespace_lite_lock, RW_READER); + /* + * First, try a lightweight "read the config". This will avoid the + * spa_namespace_lock and will work if the pool is already imported. + */ + error = spa_read_config_lite(name, &spa, config); + + /* + * The pool is in a state where it needs the full, heavyweight, + * spa_open_common(). 
+ */ + if (error == EAGAIN) { + rw_exit(&spa_namespace_lite_lock); + error = spa_open_common(name, &spa, FTAG, NULL, config); + is_lite = B_FALSE; + } if (spa != NULL) { /* @@ -6179,14 +6236,21 @@ spa_get_stats(const char *name, nvlist_t **config, */ if (altroot) { if (spa == NULL) { - mutex_enter(&spa_namespace_lock); - spa = spa_lookup(name); + if (!is_lite) { + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(name); + } else { + spa = spa_lookup_lite(name); + } + if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; - mutex_exit(&spa_namespace_lock); + + if (!is_lite) + mutex_exit(&spa_namespace_lock); } else { spa_altroot(spa, altroot, buflen); } @@ -6194,9 +6258,12 @@ spa_get_stats(const char *name, nvlist_t **config, if (spa != NULL) { spa_config_exit(spa, SCL_CONFIG, FTAG); - spa_close(spa, FTAG); + if (!is_lite) + spa_close(spa, FTAG); } + if (is_lite) + rw_exit(&spa_namespace_lite_lock); return (error); } @@ -6406,7 +6473,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, /* * If this pool already exists, return failure. */ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if (spa_lookup(poolname) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); @@ -6710,7 +6777,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) /* * If a pool with this name exists, return failure. 
*/ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); @@ -6898,7 +6965,7 @@ spa_tryimport(nvlist_t *tryconfig) (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s", TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa = spa_add(name, tryconfig, NULL); spa_activate(spa, SPA_MODE_READ); kmem_free(name, MAXPATHLEN); @@ -7024,7 +7091,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (!(spa_mode_global & SPA_MODE_WRITE)) return (SET_ERROR(EROFS)); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if ((spa = spa_lookup(pool)) == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); @@ -7048,7 +7115,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, zvol_remove_minors(spa, spa_name(spa), B_TRUE); taskq_wait(spa->spa_zvol_taskq); } - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa->spa_export_thread = curthread; spa_close(spa, FTAG); @@ -7096,7 +7163,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { error = SET_ERROR(EXDEV); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); goto fail; } @@ -7167,7 +7234,7 @@ export_spa: /* * Take the namespace lock for the actual spa_t removal */ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); @@ -7202,6 +7269,7 @@ fail: return (error); } + /* * Destroy a storage pool. 
*/ @@ -7408,7 +7476,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) */ (void) spa_vdev_exit(spa, vd, txg, 0); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); mutex_exit(&spa_namespace_lock); @@ -8034,7 +8102,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) if (unspare) { spa_t *altspa = NULL; - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); while ((altspa = spa_next(altspa)) != NULL) { if (altspa->spa_state != POOL_STATE_ACTIVE || altspa == spa) @@ -8043,7 +8111,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); @@ -8053,7 +8121,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) } /* all done with the spa; OK to release */ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); @@ -8148,7 +8216,7 @@ spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, * we can properly assess the vdev state before we commit to * the initializing operation. */ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { @@ -8269,7 +8337,7 @@ spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, * we can properly assess the vdev state before we commit to * the TRIM operation. 
*/ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { @@ -8974,7 +9042,7 @@ spa_async_thread(void *arg) if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); old_space += metaslab_class_get_space(spa_special_class(spa)); old_space += metaslab_class_get_space(spa_dedup_class(spa)); @@ -9049,7 +9117,7 @@ spa_async_thread(void *arg) dsl_scan_restart_resilver(dp, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); @@ -9057,7 +9125,7 @@ spa_async_thread(void *arg) } if (tasks & SPA_ASYNC_TRIM_RESTART) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); @@ -9065,7 +9133,7 @@ spa_async_thread(void *arg) } if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); @@ -9076,7 +9144,7 @@ spa_async_thread(void *arg) * Kick off L2 cache whole device TRIM. */ if (tasks & SPA_ASYNC_L2CACHE_TRIM) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_l2arc(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); @@ -9087,7 +9155,7 @@ spa_async_thread(void *arg) * Kick off L2 cache rebuilding. 
*/ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); l2arc_spa_rebuild_start(spa); spa_config_exit(spa, SCL_L2ARC, FTAG); @@ -10292,7 +10360,7 @@ void spa_sync_allpools(void) { spa_t *spa = NULL; - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (spa_state(spa) != POOL_STATE_ACTIVE || !spa_writeable(spa) || spa_suspended(spa)) @@ -10300,7 +10368,7 @@ spa_sync_allpools(void) spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); txg_wait_synced(spa_get_dsl(spa), 0); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_close(spa, FTAG); } mutex_exit(&spa_namespace_lock); @@ -10450,7 +10518,7 @@ spa_evict_all(void) * Remove all cached state. All pools should be closed now, * so every spa in the AVL tree should be unreferenced. */ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); while ((spa = spa_next(NULL)) != NULL) { /* * Stop async tasks. The async thread may need to detach @@ -10460,7 +10528,7 @@ spa_evict_all(void) spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index a77874ea0d..4f35f225fd 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -134,7 +134,7 @@ spa_config_load(void) * Iterate over all elements in the nvlist, creating a new spa_t for * each one with the specified configuration. 
*/ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); nvpair = NULL; while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) @@ -375,12 +375,9 @@ spa_all_configs(uint64_t *generation, nvlist_t **pools) if (*generation == spa_config_generation) return (SET_ERROR(EEXIST)); - int error = mutex_enter_interruptible(&spa_namespace_lock); - if (error) - return (SET_ERROR(EINTR)); - + rw_enter(&spa_namespace_lite_lock, RW_READER); *pools = fnvlist_alloc(); - while ((spa = spa_next(spa)) != NULL) { + while ((spa = spa_next_lite(spa)) != NULL) { if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); @@ -390,7 +387,7 @@ spa_all_configs(uint64_t *generation, nvlist_t **pools) } } *generation = spa_config_generation; - mutex_exit(&spa_namespace_lock); + rw_exit(&spa_namespace_lite_lock); return (0); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 97191e7685..a346362a63 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -238,6 +238,30 @@ avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; + +/* + * The spa_namespace_lock was originally meant to protect the spa_namespace_avl + * AVL tree. The spa_namespace_avl tree held the mappings from pool names to + * spa_t's. So if you wanted to lookup the spa_t for the "tank" pool, you would + * do an AVL search for "tank" while holding spa_namespace_lock. + * + * Over time though the spa_namespace_lock was re-purposed to protect other + * critical codepaths in the spa subsystem as well. In many cases we don't know + * what the original authors meant to protect with it, or if they needed it for + * read-only or read-write protection. It is simply "too big and risky to fix + * properly". + * + * The workaround is to add a new lightweight version of the spa_namespace_lock + * called spa_namespace_lite_lock. 
spa_namespace_lite_lock only protects the + * AVL tree, and nothing else. It can be used for read-only access to the AVL + * tree without requiring the spa_namespace_lock. Calls to spa_lookup_lite() + * and spa_next_lite() only need to acquire a reader lock on + * spa_namespace_lite_lock; they do not need to also acquire the old + * spa_namespace_lock. This allows us to still run zpool status even if the zfs + * module has spa_namespace_lock held. Note that these AVL tree locks only + * protect the tree, not the actual spa_t contents. + */ +krwlock_t spa_namespace_lite_lock; kcondvar_t spa_namespace_cv; static const int spa_max_replication_override = SPA_DVAS_PER_BP; @@ -402,6 +426,12 @@ static int spa_cpus_per_allocator = 4; */ const char *zfs_active_allocator = "dynamic"; +/* + * Add a delay of 'spa_namespace_delay_ms' milliseconds after taking the + * spa_namespace lock. This is only used for ZTS testing. + */ +uint_t spa_namespace_delay_ms = 0; + void spa_load_failed(spa_t *spa, const char *fmt, ...) { @@ -607,6 +637,30 @@ spa_config_held(spa_t *spa, int locks, krw_t rw) * ========================================================================== */ +spa_t * +spa_lookup_lite(const char *name) +{ + static spa_t search; /* spa_t is large; don't allocate on stack */ + spa_t *spa; + avl_index_t where; + char *cp; + + ASSERT(RW_LOCK_HELD(&spa_namespace_lite_lock)); + + (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); + + /* + * If it's a full dataset name, figure out the pool name and + * just use that. + */ + cp = strpbrk(search.spa_name, "/@#"); + if (cp != NULL) + *cp = '\0'; + + spa = avl_find(&spa_namespace_avl, &search, &where); + return (spa); +} + /* * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. * Returns NULL if no matching spa_t is found. 
@@ -632,7 +686,9 @@ retry: if (cp != NULL) *cp = '\0'; + rw_enter(&spa_namespace_lite_lock, RW_READER); spa = avl_find(&spa_namespace_avl, &search, &where); + rw_exit(&spa_namespace_lite_lock); if (spa == NULL) return (NULL); @@ -746,7 +802,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa_stats_init(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); + rw_enter(&spa_namespace_lite_lock, RW_WRITER); avl_add(&spa_namespace_avl, spa); + rw_exit(&spa_namespace_lite_lock); /* * Set the alternate root, if there is one. @@ -850,7 +908,9 @@ spa_remove(spa_t *spa) nvlist_free(spa->spa_config_splitting); + rw_enter(&spa_namespace_lite_lock, RW_WRITER); avl_remove(&spa_namespace_avl, spa); + rw_exit(&spa_namespace_lite_lock); if (spa->spa_root) spa_strfree(spa->spa_root); @@ -935,6 +995,20 @@ spa_next(spa_t *prev) return (avl_first(&spa_namespace_avl)); } +/* + * Given a pool, return the next pool in the namespace, or NULL if there is + * none. If 'prev' is NULL, return the first pool. + */ +spa_t * +spa_next_lite(spa_t *prev) +{ + ASSERT(RW_LOCK_HELD(&spa_namespace_lite_lock)); + if (prev) + return (AVL_NEXT(&spa_namespace_avl, prev)); + else + return (avl_first(&spa_namespace_avl)); +} + /* * ========================================================================== * SPA refcount functions @@ -1238,7 +1312,7 @@ uint64_t spa_vdev_enter(spa_t *spa) { mutex_enter(&spa->spa_vdev_top_lock); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); ASSERT0(spa->spa_export_thread); @@ -1257,7 +1331,7 @@ uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid) { mutex_enter(&spa->spa_vdev_top_lock); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); ASSERT0(spa->spa_export_thread); @@ -1462,7 +1536,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) * If the config changed, update the config cache. 
*/ if (config_changed) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); mutex_exit(&spa_namespace_lock); } @@ -2164,7 +2238,7 @@ spa_set_deadman_ziotime(hrtime_t ns) spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_ziotime = ns; mutex_exit(&spa_namespace_lock); @@ -2177,7 +2251,7 @@ spa_set_deadman_synctime(hrtime_t ns) spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_synctime = ns; mutex_exit(&spa_namespace_lock); @@ -2536,6 +2610,7 @@ spa_init(spa_mode_t mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&spa_namespace_lite_lock, NULL, RW_DEFAULT, NULL); mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); @@ -2999,7 +3074,7 @@ param_set_deadman_failmode_common(const char *val) return (SET_ERROR(EINVAL)); if (spa_mode_global != SPA_MODE_UNINIT) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa_set_deadman_failmode(spa, val); mutex_exit(&spa_namespace_lock); @@ -3011,9 +3086,11 @@ param_set_deadman_failmode_common(const char *val) /* Namespace manipulation */ EXPORT_SYMBOL(spa_lookup); +EXPORT_SYMBOL(spa_lookup_lite); EXPORT_SYMBOL(spa_add); EXPORT_SYMBOL(spa_remove); EXPORT_SYMBOL(spa_next); +EXPORT_SYMBOL(spa_next_lite); /* Refcount functions */ EXPORT_SYMBOL(spa_open_ref); @@ -3120,6 +3197,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW, "Place user data indirect blocks into the special class"); 
+ZFS_MODULE_PARAM(zfs_spa, spa_, namespace_delay_ms, UINT, ZMOD_RW, + "millisecond sleep after taking spa_namespace_lock (for testing only)"); + /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode, param_set_deadman_failmode, param_get_charp, ZMOD_RW, diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index f7cecc9af8..0a674b092b 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -1562,7 +1562,7 @@ zfs_ereport_zvol_post(const char *subclass, const char *name, char *r; boolean_t locked = mutex_owned(&spa_namespace_lock); - if (!locked) mutex_enter(&spa_namespace_lock); + if (!locked) mutex_enter_ns(&spa_namespace_lock); spa_t *spa = spa_lookup(name); if (!locked) mutex_exit(&spa_namespace_lock); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 53366ad497..8dfe8850df 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3012,7 +3012,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, pair) == NULL) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); @@ -3060,27 +3060,14 @@ zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) props = NULL; } - if ((error = spa_open(pool, &spa, FTAG)) != 0) { - /* - * If the pool is faulted, there may be properties we can still - * get (such as altroot and cachefile), so attempt to get them - * anyway. 
- */ - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pool)) != NULL) { - error = spa_prop_get(spa, outnvl); - if (error == 0 && props != NULL) - error = spa_prop_get_nvlist(spa, props, n_props, - outnvl); - } - mutex_exit(&spa_namespace_lock); - } else { + rw_enter(&spa_namespace_lite_lock, RW_READER); + if ((spa = spa_lookup_lite(pool)) != NULL) { error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) error = spa_prop_get_nvlist(spa, props, n_props, outnvl); - spa_close(spa, FTAG); } + rw_exit(&spa_namespace_lite_lock); return (error); } @@ -5956,7 +5943,7 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * On zpool clear we also fix up missing slogs */ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 012a0e3c6c..f2443afe53 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -907,7 +907,7 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) if (zio_pool_handler_exists(name, record->zi_cmd)) return (SET_ERROR(EEXIST)); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); boolean_t has_spa = spa_lookup(name) != NULL; mutex_exit(&spa_namespace_lock); @@ -994,7 +994,7 @@ zio_inject_list_next(int *id, char *name, size_t buflen, inject_handler_t *handler; int ret; - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 088e46ce57..2bc2b38dd1 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -559,7 +559,7 @@ tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_003_pos', 'zpool_status_004_pos', 'zpool_status_005_pos', 'zpool_status_006_pos', 'zpool_status_007_pos', 'zpool_status_008_pos', - 
'zpool_status_features_001_pos'] + 'zpool_status_features_001_pos', 'zpool_status_namespace_lock'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 19770138bf..ccfde4067a 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -90,6 +90,7 @@ export SYSTEM_FILES_COMMON='awk sync tail tar + time timeout touch tr @@ -230,6 +231,7 @@ export ZFSTEST_FILES='badsend edonr_test skein_test sha2_test + time ctime truncate_test ereports diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 96943421f8..77a0b3099b 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -98,6 +98,7 @@ VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq +SPA_NAMESPACE_DELAY_MS spa.namespace_delay_ms spa_namespace_delay_ms BCLONE_ENABLED bclone_enabled zfs_bclone_enabled BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty XATTR_COMPAT xattr_compat zfs_xattr_compat diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index bbeabc6dfb..e1bbbda18f 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1256,6 +1256,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ functional/cli_root/zpool_status/zpool_status_008_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ + functional/cli_root/zpool_status/zpool_status_namespace_lock.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_namespace_lock.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_namespace_lock.ksh new file mode 100755 index 0000000000..fe94ece2be --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_namespace_lock.ksh @@ -0,0 +1,144 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Test out that zpool status|get|list are not affected by the +# 'spa_namespace_lock' being held. This was an issue in the past +# where a pool would hang while holding the lock, and 'zpool status' +# would hang waiting for the lock. +# +# STRATEGY: +# 1. Create a test pool. It should be created quickly. +# 2. Set SPA_NAMESPACE_DELAY_MS to 1 second. +# 3. Create another test pool. It should take over 1 second due to the +# delay. +# 4. Run zpool status|get|list. They should all run quickly. +# 5. In the background, destroy one of the pools and export the other one +# at the same time. This should cause a lot of lock contention. Note +# that the 1 sec delay is still in effect. +# 6. At the same time run some zpool commands and verify they still run in +# 200ms or less, and without error. 
+ +verify_runnable "global" + +FILEDEV1="$TEST_BASE_DIR/filedev1.$$" +FILEDEV2="$TEST_BASE_DIR/filedev2.$$" + +log_must truncate -s 100M "$FILEDEV1" "$FILEDEV2" +TESTPOOL1=testpool1 +TESTPOOL2=testpool2 + +function cleanup +{ + restore_tunable SPA_NAMESPACE_DELAY_MS + datasetexists $TESTPOOL1 && log_must zpool destroy $TESTPOOL1 + datasetexists $TESTPOOL2 && log_must zpool destroy $TESTPOOL2 + rm -f $FILEDEV1 $FILEDEV2 + + log_note "debug: $(cat /tmp/out)" +} + +# Run a command and test how long it takes. Fail the test if the command +# fails or takes the wrong amount of time to run. +# +# The arguments consist of an array like this: +# +# zpool get all -le 200 +# +# The first part of the array is the command and its arguments. +# The 2nd to last argument is a ksh comparator operation like "-le" or "-ge" +# The last argument is the expected time in milliseconds for the comparator. +# +# So the example above reads as "run 'zpool get all' and check that it runs in +# 200 milliseconds or less". +# +function timetest_cmd +{ + # Run the command and record 'real' time. dec will be a fractional value + # like '1.05' representing the number of seconds it took to run. + array=($@) + + # Save the last two arguments and remove them from the array + target_ms="${array[@]: -1}" + unset array[-1] + comp="${array[@]: -1}" + unset array[-1] + + out="$({ time -p "${array[@]}" 1> /dev/null; } 2>&1)" + rc=$? + dec=$(echo "$out" | awk '/^real /{print $2}') + + # Calculate milliseconds. The divide by one at the end removes the + # decimal places. + ms=$(bc <<< "scale=0; $dec * 1000 / 1") + + log_note "${array[@]}: $ms $comp $target_ms" + + # For debugging failures + echo "$@: dec=$dec; rc=$rc; ms=$ms; $out" >> /tmp/out2 + if [ -z "$ms" ] || [ $rc != 0 ] ; then + log_fail "Bad value: ms=$ms, rc=$rc: $out" + fi + + if eval !
[ $ms $comp $target_ms ] ; then + log_fail "Expected time wrong: $ms $comp $target_ms" + fi +} + +log_assert "Verify spa_namespace_lock isn't held by zpool status|get|list" + +log_onexit cleanup + +# A normal zpool create should take 200ms or less +timetest_cmd zpool create $TESTPOOL1 $FILEDEV1 -le 200 + +log_must save_tunable SPA_NAMESPACE_DELAY_MS +log_must set_tunable32 SPA_NAMESPACE_DELAY_MS 1000 + +# We added 1 sec hold time on spa_namespace_lock. zpool create +# should now take at least 1000ms. +timetest_cmd zpool create $TESTPOOL2 $FILEDEV2 -ge 1000 + +# zpool status|get|list should not take spa_namespace_lock, so they should run +# quickly, even loaded down with options. +timetest_cmd zpool status -pPstvLD -le 200 +timetest_cmd zpool get all -le 200 +timetest_cmd zpool list -gLPv -le 200 + +# In the background, destroy one of the pools and export the other one at the +# same time. This should cause a lot of lock contention. At the same time +# run some zpool commands and verify they still run in 200ms or less, and +# without error. +zpool destroy $TESTPOOL1 & +zpool export $TESTPOOL2 & +for i in 1 2 3 4 ; do + timetest_cmd zpool status -pPstvLD -le 200 + timetest_cmd zpool get all -le 200 + timetest_cmd zpool list -gLPv -le 200 + sleep 0.1 +done +wait + +log_pass "zpool status|get|list was not delayed by the spa_namespace_lock"