diff --git a/TEST b/TEST index 376d6eb691..25d04dd01e 100644 --- a/TEST +++ b/TEST @@ -35,7 +35,7 @@ #TEST_ZFSTESTS_ITERS="1" #TEST_ZFSTESTS_OPTIONS="-vx" #TEST_ZFSTESTS_RUNFILE="linux.run" -#TEST_ZFSTESTS_TAGS="functional" +TEST_ZFSTESTS_TAGS="zpool_import,zpool_export,zpool_status,zpool_add" ### zfsstress #TEST_ZFSSTRESS_SKIP="yes" diff --git a/include/sys/spa.h b/include/sys/spa.h index aa66d489ef..71013ca6b8 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -842,6 +842,22 @@ extern kmutex_t spa_namespace_lock; extern avl_tree_t spa_namespace_avl; extern kcondvar_t spa_namespace_cv; +extern krwlock_t spa_namespace_lite_lock; + +extern uint_t spa_namespace_delay_ms; +/* + * Special version of mutex_enter() used for testing the spa_namespace_lock. + * It inserts an artificial delay after acquiring the lock to simulate a failure + * while holding the lock. The delay is controlled by the + * spa_namespace_delay_ms module param and is only to be used by ZTS. + */ +#define mutex_enter_ns(lock) do { \ + ASSERT((lock) == &spa_namespace_lock); \ + mutex_enter(lock); \ + if (unlikely(spa_namespace_delay_ms != 0)) \ + zfs_msleep(spa_namespace_delay_ms); \ + } while (0) + /* * SPA configuration functions in spa_config.c */ @@ -866,9 +882,11 @@ extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); +extern spa_t *spa_lookup_lite(const char *name); extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); +extern spa_t *spa_next_lite(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, const void *tag); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 7811abbb9c..91850b1b13 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -484,6 +484,7 @@ struct spa { extern char *spa_config_path; extern const char *zfs_deadman_failmode; extern
uint_t spa_slop_shift; +extern uint_t spa_namespace_delay_ms; extern void spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, zio_t *zio, boolean_t cutinline); extern void spa_load_spares(spa_t *spa); diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 998eaa5dd8..546589165f 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -769,6 +769,8 @@ void ksiddomain_rele(ksiddomain_t *); (void) nanosleep(&ts, NULL); \ } while (0) +#define zfs_msleep(ms) zfs_sleep_until(gethrtime() + (MSEC2NSEC(ms))) + typedef int fstrans_cookie_t; extern fstrans_cookie_t spl_fstrans_mark(void); diff --git a/include/sys/zfs_delay.h b/include/sys/zfs_delay.h index 56ac1f3c43..156d5043bc 100644 --- a/include/sys/zfs_delay.h +++ b/include/sys/zfs_delay.h @@ -37,5 +37,6 @@ usleep_range(delta_us, delta_us + 100); \ } \ } while (0) +#define zfs_msleep(ms) zfs_sleep_until(gethrtime() + (MSEC2NSEC(ms))) #endif /* _SYS_FS_ZFS_DELAY_H */ diff --git a/module/os/freebsd/zfs/spa_os.c b/module/os/freebsd/zfs/spa_os.c index 1b9f1a4ec9..d2b8587d82 100644 --- a/module/os/freebsd/zfs/spa_os.c +++ b/module/os/freebsd/zfs/spa_os.c @@ -192,7 +192,7 @@ spa_import_rootpool(const char *name, bool checkpointrewind) */ config = spa_generate_rootconf(name); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if (config != NULL) { pname = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); VERIFY0(strcmp(name, pname)); diff --git a/module/os/freebsd/zfs/zfs_ioctl_os.c b/module/os/freebsd/zfs/zfs_ioctl_os.c index 928cf25e94..61731665d9 100644 --- a/module/os/freebsd/zfs/zfs_ioctl_os.c +++ b/module/os/freebsd/zfs/zfs_ioctl_os.c @@ -107,7 +107,7 @@ zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) "command", &command) != 0) return (EINVAL); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa = spa_by_guid(pool_guid, vdev_guid); if (spa != NULL) strcpy(name, spa_name(spa)); 
diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 714a30e863..e7f410c5fc 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8205,7 +8205,7 @@ l2arc_dev_get_next(void) * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 7112254275..7e8e9cdf62 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -728,7 +728,7 @@ mmp_signal_all_threads(void) { spa_t *spa = NULL; - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); while ((spa = spa_next(spa))) { if (spa->spa_state == POOL_STATE_ACTIVE) mmp_signal_thread(spa); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 1a68a09535..e25bad9189 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1038,7 +1038,7 @@ spa_change_guid(spa_t *spa, const uint64_t *guidp) int error; mutex_enter(&spa->spa_vdev_top_lock); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if (guidp != NULL) { guid = *guidp; @@ -5635,7 +5635,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_load_note(spa, "LOADED"); fail: - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa->spa_load_thread = NULL; cv_broadcast(&spa_namespace_cv); @@ -5800,7 +5800,7 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, * avoid dsl_dir_open() calling this in the first place. */ if (MUTEX_NOT_HELD(&spa_namespace_lock)) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); locked = B_TRUE; } @@ -5898,6 +5898,46 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, return (0); } +/* + * This is a lightweight version of spa_open_common() used only for getting + * stats. It does not use the spa_namespace_lock. 
+ * + * Returns: + * + * 0 Success + * ENOENT No pool with that name + * EAGAIN Try doing a spa_open_common() instead + * + * On failure, *spapp will be set to NULL; + */ +static int +spa_read_config_lite(const char *pool, spa_t **spapp, nvlist_t **config) +{ + spa_t *spa; + ASSERT(RW_LOCK_HELD(&spa_namespace_lite_lock)); + + if ((spa = spa_lookup_lite(pool)) == NULL) { + *spapp = NULL; + return (SET_ERROR(ENOENT)); + } + + /* + * Special case: If the pool isn't initialized yet, then tell the caller + * to call spa_open_common() instead. That will do the full, + * heavyweight load of spa_t. + */ + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { + *spapp = NULL; + return (EAGAIN); + } + + if (config != NULL) + *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + + *spapp = spa; + return (0); +} + int spa_open_rewind(const char *name, spa_t **spapp, const void *tag, nvlist_t *policy, nvlist_t **config) @@ -5920,7 +5960,7 @@ spa_inject_addref(char *name) { spa_t *spa; - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); return (NULL); @@ -5934,7 +5974,7 @@ spa_inject_addref(char *name) void spa_inject_delref(spa_t *spa) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa->spa_inject_ref--; mutex_exit(&spa_namespace_lock); } @@ -6134,9 +6174,26 @@ spa_get_stats(const char *name, nvlist_t **config, { int error; spa_t *spa; + boolean_t is_lite = B_TRUE; *config = NULL; - error = spa_open_common(name, &spa, FTAG, NULL, config); + + rw_enter(&spa_namespace_lite_lock, RW_READER); + /* + * First, try a lightweight "read the config". This will avoid the + * spa_namespace_lock and will work if the pool is already imported. + */ + error = spa_read_config_lite(name, &spa, config); + + /* + * The pool is in a state where it needs the full, heavyweight, + * spa_open_common(). 
+ */ + if (error == EAGAIN) { + rw_exit(&spa_namespace_lite_lock); + error = spa_open_common(name, &spa, FTAG, NULL, config); + is_lite = B_FALSE; + } if (spa != NULL) { /* @@ -6179,14 +6236,21 @@ spa_get_stats(const char *name, nvlist_t **config, */ if (altroot) { if (spa == NULL) { - mutex_enter(&spa_namespace_lock); - spa = spa_lookup(name); + if (!is_lite) { + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(name); + } else { + spa = spa_lookup_lite(name); + } + if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; - mutex_exit(&spa_namespace_lock); + + if (!is_lite) + mutex_exit(&spa_namespace_lock); } else { spa_altroot(spa, altroot, buflen); } @@ -6194,9 +6258,12 @@ spa_get_stats(const char *name, nvlist_t **config, if (spa != NULL) { spa_config_exit(spa, SCL_CONFIG, FTAG); - spa_close(spa, FTAG); + if (!is_lite) + spa_close(spa, FTAG); } + if (is_lite) + rw_exit(&spa_namespace_lite_lock); return (error); } @@ -6406,7 +6473,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, /* * If this pool already exists, return failure. */ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if (spa_lookup(poolname) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); @@ -6710,7 +6777,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) /* * If a pool with this name exists, return failure. 
*/ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); @@ -6898,7 +6965,7 @@ spa_tryimport(nvlist_t *tryconfig) (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s", TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa = spa_add(name, tryconfig, NULL); spa_activate(spa, SPA_MODE_READ); kmem_free(name, MAXPATHLEN); @@ -7024,7 +7091,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (!(spa_mode_global & SPA_MODE_WRITE)) return (SET_ERROR(EROFS)); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if ((spa = spa_lookup(pool)) == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); @@ -7048,7 +7115,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, zvol_remove_minors(spa, spa_name(spa), B_TRUE); taskq_wait(spa->spa_zvol_taskq); } - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa->spa_export_thread = curthread; spa_close(spa, FTAG); @@ -7096,7 +7163,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { error = SET_ERROR(EXDEV); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); goto fail; } @@ -7167,7 +7234,7 @@ export_spa: /* * Take the namespace lock for the actual spa_t removal */ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); @@ -7202,6 +7269,7 @@ fail: return (error); } + /* * Destroy a storage pool. 
*/ @@ -7408,7 +7476,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) */ (void) spa_vdev_exit(spa, vd, txg, 0); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); mutex_exit(&spa_namespace_lock); @@ -8034,7 +8102,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) if (unspare) { spa_t *altspa = NULL; - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); while ((altspa = spa_next(altspa)) != NULL) { if (altspa->spa_state != POOL_STATE_ACTIVE || altspa == spa) @@ -8043,7 +8111,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); @@ -8053,7 +8121,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) } /* all done with the spa; OK to release */ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); @@ -8148,7 +8216,7 @@ spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, * we can properly assess the vdev state before we commit to * the initializing operation. */ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { @@ -8269,7 +8337,7 @@ spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, * we can properly assess the vdev state before we commit to * the TRIM operation. 
*/ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { @@ -8974,7 +9042,7 @@ spa_async_thread(void *arg) if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); old_space += metaslab_class_get_space(spa_special_class(spa)); old_space += metaslab_class_get_space(spa_dedup_class(spa)); @@ -9049,7 +9117,7 @@ spa_async_thread(void *arg) dsl_scan_restart_resilver(dp, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); @@ -9057,7 +9125,7 @@ spa_async_thread(void *arg) } if (tasks & SPA_ASYNC_TRIM_RESTART) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); @@ -9065,7 +9133,7 @@ spa_async_thread(void *arg) } if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); @@ -9076,7 +9144,7 @@ spa_async_thread(void *arg) * Kick off L2 cache whole device TRIM. */ if (tasks & SPA_ASYNC_L2CACHE_TRIM) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_l2arc(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); @@ -9087,7 +9155,7 @@ spa_async_thread(void *arg) * Kick off L2 cache rebuilding. 
*/ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); l2arc_spa_rebuild_start(spa); spa_config_exit(spa, SCL_L2ARC, FTAG); @@ -10292,7 +10360,7 @@ void spa_sync_allpools(void) { spa_t *spa = NULL; - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (spa_state(spa) != POOL_STATE_ACTIVE || !spa_writeable(spa) || spa_suspended(spa)) @@ -10300,7 +10368,7 @@ spa_sync_allpools(void) spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); txg_wait_synced(spa_get_dsl(spa), 0); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_close(spa, FTAG); } mutex_exit(&spa_namespace_lock); @@ -10450,7 +10518,7 @@ spa_evict_all(void) * Remove all cached state. All pools should be closed now, * so every spa in the AVL tree should be unreferenced. */ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); while ((spa = spa_next(NULL)) != NULL) { /* * Stop async tasks. The async thread may need to detach @@ -10460,7 +10528,7 @@ spa_evict_all(void) spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index a77874ea0d..4f35f225fd 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -134,7 +134,7 @@ spa_config_load(void) * Iterate over all elements in the nvlist, creating a new spa_t for * each one with the specified configuration. 
*/ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); nvpair = NULL; while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) @@ -375,12 +375,9 @@ spa_all_configs(uint64_t *generation, nvlist_t **pools) if (*generation == spa_config_generation) return (SET_ERROR(EEXIST)); - int error = mutex_enter_interruptible(&spa_namespace_lock); - if (error) - return (SET_ERROR(EINTR)); - + rw_enter(&spa_namespace_lite_lock, RW_READER); *pools = fnvlist_alloc(); - while ((spa = spa_next(spa)) != NULL) { + while ((spa = spa_next_lite(spa)) != NULL) { if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); @@ -390,7 +387,7 @@ spa_all_configs(uint64_t *generation, nvlist_t **pools) } } *generation = spa_config_generation; - mutex_exit(&spa_namespace_lock); + rw_exit(&spa_namespace_lite_lock); return (0); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 97191e7685..a346362a63 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -238,6 +238,30 @@ avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; + +/* + * The spa_namespace_lock was originally meant to protect the spa_namespace_avl + * AVL tree. The spa_namespace_avl tree held the mappings from pool names to + * spa_t's. So if you wanted to lookup the spa_t for the "tank" pool, you would + * do an AVL search for "tank" while holding spa_namespace_lock. + * + * Over time though the spa_namespace_lock was re-purposed to protect other + * critical codepaths in the spa subsystem as well. In many cases we don't know + * what the original authors meant to protect with it, or if they needed it for + * read-only or read-write protection. It is simply "too big and risky to fix + * properly". + * + * The workaround is to add a new lightweight version of the spa_namespace_lock + * called spa_namespace_lite_lock. 
spa_namespace_lite_lock only protects the + * AVL tree, and nothing else. It can be used for read-only access to the AVL + * tree without requiring the spa_namespace_lock. Calls to spa_lookup_lite() + * and spa_next_lite() only need to acquire a reader lock on + * spa_namespace_lite_lock; they do not need to also acquire the old + * spa_namespace_lock. This allows us to still run zpool status even if the zfs + * module has spa_namespace_lock held. Note that these AVL tree locks only + * protect the tree, not the actual spa_t contents. + */ +krwlock_t spa_namespace_lite_lock; kcondvar_t spa_namespace_cv; static const int spa_max_replication_override = SPA_DVAS_PER_BP; @@ -402,6 +426,12 @@ static int spa_cpus_per_allocator = 4; */ const char *zfs_active_allocator = "dynamic"; +/* + * Add a delay of 'spa_namespace_delay_ms' milliseconds after taking the + * spa_namespace lock. This is only used for ZTS testing. + */ +uint_t spa_namespace_delay_ms = 0; + void spa_load_failed(spa_t *spa, const char *fmt, ...) { @@ -607,6 +637,30 @@ spa_config_held(spa_t *spa, int locks, krw_t rw) * ========================================================================== */ +spa_t * +spa_lookup_lite(const char *name) +{ + static spa_t search; /* spa_t is large; don't allocate on stack */ + spa_t *spa; + avl_index_t where; + char *cp; + + ASSERT(RW_LOCK_HELD(&spa_namespace_lite_lock)); + + (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); + + /* + * If it's a full dataset name, figure out the pool name and + * just use that. + */ + cp = strpbrk(search.spa_name, "/@#"); + if (cp != NULL) + *cp = '\0'; + + spa = avl_find(&spa_namespace_avl, &search, &where); + return (spa); +} + /* * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. * Returns NULL if no matching spa_t is found. 
@@ -632,7 +686,9 @@ retry: if (cp != NULL) *cp = '\0'; + rw_enter(&spa_namespace_lite_lock, RW_READER); spa = avl_find(&spa_namespace_avl, &search, &where); + rw_exit(&spa_namespace_lite_lock); if (spa == NULL) return (NULL); @@ -746,7 +802,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa_stats_init(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); + rw_enter(&spa_namespace_lite_lock, RW_WRITER); avl_add(&spa_namespace_avl, spa); + rw_exit(&spa_namespace_lite_lock); /* * Set the alternate root, if there is one. @@ -850,7 +908,9 @@ spa_remove(spa_t *spa) nvlist_free(spa->spa_config_splitting); + rw_enter(&spa_namespace_lite_lock, RW_WRITER); avl_remove(&spa_namespace_avl, spa); + rw_exit(&spa_namespace_lite_lock); if (spa->spa_root) spa_strfree(spa->spa_root); @@ -935,6 +995,20 @@ spa_next(spa_t *prev) return (avl_first(&spa_namespace_avl)); } +/* + * Given a pool, return the next pool in the namespace, or NULL if there is + * none. If 'prev' is NULL, return the first pool. + */ +spa_t * +spa_next_lite(spa_t *prev) +{ + ASSERT(RW_LOCK_HELD(&spa_namespace_lite_lock)); + if (prev) + return (AVL_NEXT(&spa_namespace_avl, prev)); + else + return (avl_first(&spa_namespace_avl)); +} + /* * ========================================================================== * SPA refcount functions @@ -1238,7 +1312,7 @@ uint64_t spa_vdev_enter(spa_t *spa) { mutex_enter(&spa->spa_vdev_top_lock); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); ASSERT0(spa->spa_export_thread); @@ -1257,7 +1331,7 @@ uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid) { mutex_enter(&spa->spa_vdev_top_lock); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); ASSERT0(spa->spa_export_thread); @@ -1462,7 +1536,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) * If the config changed, update the config cache. 
*/ if (config_changed) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); mutex_exit(&spa_namespace_lock); } @@ -2164,7 +2238,7 @@ spa_set_deadman_ziotime(hrtime_t ns) spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_ziotime = ns; mutex_exit(&spa_namespace_lock); @@ -2177,7 +2251,7 @@ spa_set_deadman_synctime(hrtime_t ns) spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_synctime = ns; mutex_exit(&spa_namespace_lock); @@ -2536,6 +2610,7 @@ spa_init(spa_mode_t mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&spa_namespace_lite_lock, NULL, RW_DEFAULT, NULL); mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); @@ -2999,7 +3074,7 @@ param_set_deadman_failmode_common(const char *val) return (SET_ERROR(EINVAL)); if (spa_mode_global != SPA_MODE_UNINIT) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa_set_deadman_failmode(spa, val); mutex_exit(&spa_namespace_lock); @@ -3011,9 +3086,11 @@ param_set_deadman_failmode_common(const char *val) /* Namespace manipulation */ EXPORT_SYMBOL(spa_lookup); +EXPORT_SYMBOL(spa_lookup_lite); EXPORT_SYMBOL(spa_add); EXPORT_SYMBOL(spa_remove); EXPORT_SYMBOL(spa_next); +EXPORT_SYMBOL(spa_next_lite); /* Refcount functions */ EXPORT_SYMBOL(spa_open_ref); @@ -3120,6 +3197,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW, "Place user data indirect blocks into the special class"); 
+ZFS_MODULE_PARAM(zfs_spa, spa_, namespace_delay_ms, UINT, ZMOD_RW, + "millisecond sleep after taking spa_namespace_lock (for testing only)"); + /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode, param_set_deadman_failmode, param_get_charp, ZMOD_RW, diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index f7cecc9af8..0a674b092b 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -1562,7 +1562,7 @@ zfs_ereport_zvol_post(const char *subclass, const char *name, char *r; boolean_t locked = mutex_owned(&spa_namespace_lock); - if (!locked) mutex_enter(&spa_namespace_lock); + if (!locked) mutex_enter_ns(&spa_namespace_lock); spa_t *spa = spa_lookup(name); if (!locked) mutex_exit(&spa_namespace_lock); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 53366ad497..8dfe8850df 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3012,7 +3012,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, pair) == NULL) { - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); @@ -3060,27 +3060,14 @@ zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) props = NULL; } - if ((error = spa_open(pool, &spa, FTAG)) != 0) { - /* - * If the pool is faulted, there may be properties we can still - * get (such as altroot and cachefile), so attempt to get them - * anyway. 
- */ - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pool)) != NULL) { - error = spa_prop_get(spa, outnvl); - if (error == 0 && props != NULL) - error = spa_prop_get_nvlist(spa, props, n_props, - outnvl); - } - mutex_exit(&spa_namespace_lock); - } else { + rw_enter(&spa_namespace_lite_lock, RW_READER); + if ((spa = spa_lookup_lite(pool)) == NULL) { error = SET_ERROR(ENOENT); } else { error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) error = spa_prop_get_nvlist(spa, props, n_props, outnvl); - spa_close(spa, FTAG); } + rw_exit(&spa_namespace_lite_lock); return (error); } @@ -5956,7 +5943,7 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * On zpool clear we also fix up missing slogs */ - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 012a0e3c6c..f2443afe53 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -907,7 +907,7 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) if (zio_pool_handler_exists(name, record->zi_cmd)) return (SET_ERROR(EEXIST)); - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); boolean_t has_spa = spa_lookup(name) != NULL; mutex_exit(&spa_namespace_lock); @@ -994,7 +994,7 @@ zio_inject_list_next(int *id, char *name, size_t buflen, inject_handler_t *handler; int ret; - mutex_enter(&spa_namespace_lock); + mutex_enter_ns(&spa_namespace_lock); rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 088e46ce57..2bc2b38dd1 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -559,7 +559,7 @@ tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_003_pos', 'zpool_status_004_pos', 'zpool_status_005_pos', 'zpool_status_006_pos', 'zpool_status_007_pos', 'zpool_status_008_pos', -
'zpool_status_features_001_pos'] + 'zpool_status_features_001_pos', 'zpool_status_namespace_lock'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 19770138bf..ccfde4067a 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -90,6 +90,7 @@ export SYSTEM_FILES_COMMON='awk sync tail tar + time timeout touch tr @@ -230,6 +231,7 @@ export ZFSTEST_FILES='badsend edonr_test skein_test sha2_test + time ctime truncate_test ereports diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 96943421f8..77a0b3099b 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -98,6 +98,7 @@ VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq +SPA_NAMESPACE_DELAY_MS spa.namespace_delay_ms spa_namespace_delay_ms BCLONE_ENABLED bclone_enabled zfs_bclone_enabled BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty XATTR_COMPAT xattr_compat zfs_xattr_compat diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index bbeabc6dfb..e1bbbda18f 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1256,6 +1256,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ functional/cli_root/zpool_status/zpool_status_008_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ + functional/cli_root/zpool_status/zpool_status_namespace_lock.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_namespace_lock.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_namespace_lock.ksh new file mode 100755 index 0000000000..fe94ece2be --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_namespace_lock.ksh @@ -0,0 +1,144 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Test out that zpool status|get|list are not affected by the +# 'spa_namespace_lock' being held. This was an issue in the past +# where a pool would hang while holding the lock, and 'zpool status' +# would hang waiting for the lock. +# +# STRATEGY: +# 1. Create a test pool. It should be created quickly. +# 2. Set SPA_NAMESPACE_DELAY_MS to 1 second. +# 3. Create another test pool. It should take over 1 second due to the +# delay. +# 4. Run zpool status|get|list. They should all run quickly. +# 5. In the background, destroy one of the pools and export the other one +# at the same time. This should cause a lot of lock contention. Note +# that the 1 sec delay is still in effect. +# 6. At the same time run some zpool commands and verify they still run in +# 200ms or less, and without error. 
+ +verify_runnable "global" + +FILEDEV1="$TEST_BASE_DIR/filedev1.$$" +FILEDEV2="$TEST_BASE_DIR/filedev2.$$" + +log_must truncate -s 100M "$FILEDEV1" "$FILEDEV2" +TESTPOOL1=testpool1 +TESTPOOL2=testpool2 + +function cleanup +{ + restore_tunable SPA_NAMESPACE_DELAY_MS + datasetexists $TESTPOOL1 && log_must zpool destroy $TESTPOOL1 + datasetexists $TESTPOOL2 && log_must zpool destroy $TESTPOOL2 + rm -f $FILEDEV1 $FILEDEV2 + + log_note "debug: $(cat /tmp/out)" +} + +# Run a command and test how long it takes. Fail the test if the command +# fails or takes the wrong amount of time to run. +# +# The arguments consist of an array like this: +# +# zpool get all -le 200 +# +# The first part of the array is the command and its arguments. +# The 2nd to last argument is a ksh comparator operation like "-le" or "-ge" +# The last argument is the expected time in milliseconds for the comparator. +# +# So the example above reads as "run 'zpool get all' and check that it runs in +# 200 milliseconds or less". +# +function timetest_cmd +{ + # Run the command and record 'real' time. dec will be a fractional value + # like '1.05' representing the number of seconds it took to run. + array=($@) + + # Save the last two arguments and remove them from the array + target_ms="${array[@]: -1}" + unset array[-1] + comp="${array[@]: -1}" + unset array[-1] + + out="$({ time -p "${array[@]}" 1> /dev/null; } 2>&1)" + rc=$? + dec=$(echo "$out" | awk '/^real /{print $2}') + + # Calculate milliseconds. The divide by one at the end removes the + # decimal places. + ms=$(bc <<< "scale=0; $dec * 1000 / 1") + + log_note "${array[@]}: $ms $comp $target_ms" + + # For debugging failures; cleanup() prints this file on exit + echo "$@: dec=$dec; rc=$rc; ms=$ms; $out" >> /tmp/out + if [ -z "$ms" ] || [ $rc != 0 ] ; then + log_fail "Bad value: ms=$ms, rc=$rc: $out" + fi + + if eval !
[ $ms $comp $target_ms ] ; then + log_fail "Expected time wrong: $ms $comp $target_ms" + fi +} + +log_assert "Verify spa_namespace_lock isn't held by zpool status|get|list" + +log_onexit cleanup + +# A normal zpool create should take 200ms or less +timetest_cmd zpool create $TESTPOOL1 $FILEDEV1 -le 200 + +log_must save_tunable SPA_NAMESPACE_DELAY_MS +log_must set_tunable32 SPA_NAMESPACE_DELAY_MS 1000 + +# We added 1 sec hold time on spa_namespace lock. zpool create +# should now take at least 1000ms. +timetest_cmd zpool create $TESTPOOL2 $FILEDEV2 -ge 1000 + +# zpool status|get|list should not take spa_namespace_lock, so they should run +# quickly, even loaded down with options. +timetest_cmd zpool status -pPstvLD -le 200 +timetest_cmd zpool get all -le 200 +timetest_cmd zpool list -gLPv -le 200 + +# In the background, destroy one of the pools and export the other one at the +# same time. This should cause a lot of lock contention. At the same time +# run some zpool commands and verify they still run in 200ms or less, and +# without error. +zpool destroy $TESTPOOL1 & +zpool export $TESTPOOL2 & +for i in 1 2 3 4 ; do + timetest_cmd zpool status -pPstvLD -le 200 + timetest_cmd zpool get all -le 200 + timetest_cmd zpool list -gLPv -le 200 + sleep 0.1 +done +wait + +log_pass "zpool status|get|list was not delayed by the spa_namespace_lock"