Remove spa_namespace_lock from zpool status

This commit removes spa_namespace_lock from the zpool status codepath.
This means that zpool status will not hang if a pool fails while holding
the spa_namespace_lock.

Background:

The spa_namespace_lock was originally meant to protect the
spa_namespace_avl AVL tree.  The spa_namespace_avl tree held the
mappings from pool names to spa_t's.  So if you wanted to lookup the
spa_t for the "tank" pool, you would do an AVL search for "tank" while
holding spa_namespace_lock.

Over time though the spa_namespace_lock was re-purposed to protect other
critical codepaths in the spa subsystem as well.  In many cases we don't
know what the original authors meant to protect with it, or if they
needed it for read-only or read-write protection.  It is simply "too big
and risky to fix properly".

The workaround is to add a new lightweight version of the
spa_namespace_lock called spa_namespace_lite_lock.
spa_namespace_lite_lock only protects the AVL tree, and nothing else.
It can be used for read-only access to the AVL tree without requiring
the spa_namespace_lock.  Calls to spa_lookup_lite() and spa_next_lite()
only need to acquire a reader lock on spa_namespace_lite_lock; they do
not need to also acquire the old spa_namespace_lock.  This allows us to
still run zpool status even if the zfs module has spa_namespace_lock
held.  Note that these AVL tree locks only protect the tree, not the
actual spa_t contents.

Signed-off-by: Tony Hutter <hutter2@llnl.gov>
This commit is contained in:
Tony Hutter 2024-09-03 17:41:16 -07:00
parent 5c67820265
commit 9f4d64e711
20 changed files with 375 additions and 73 deletions

2
TEST
View File

@ -35,7 +35,7 @@
#TEST_ZFSTESTS_ITERS="1"
#TEST_ZFSTESTS_OPTIONS="-vx"
#TEST_ZFSTESTS_RUNFILE="linux.run"
#TEST_ZFSTESTS_TAGS="functional"
TEST_ZFSTESTS_TAGS="zpool_import,zpool_export,zpool_status,zpool_add"
### zfsstress
#TEST_ZFSSTRESS_SKIP="yes"

View File

@ -842,6 +842,22 @@ extern kmutex_t spa_namespace_lock;
extern avl_tree_t spa_namespace_avl;
extern kcondvar_t spa_namespace_cv;
extern krwlock_t spa_namespace_lite_lock;
extern uint_t spa_namespace_delay_ms;
/*
 * Special version of mutex_enter() used for testing the spa_namespace_lock.
 * It inserts an artificial delay after acquiring the lock to simulate a failure
 * while holding the lock.  The delay is controlled by the
 * spa_namespace_delay_ms module param and is only to be used by ZTS.
 *
 * Note: no trailing semicolon after 'while (0)' -- the caller supplies it,
 * which keeps 'if (x) mutex_enter_ns(l); else ...' well-formed.
 */
#define	mutex_enter_ns(lock) do {				\
	ASSERT((lock) == &spa_namespace_lock);			\
	mutex_enter(lock);					\
	if (unlikely(spa_namespace_delay_ms != 0))		\
		zfs_msleep(spa_namespace_delay_ms);		\
} while (0)
/*
* SPA configuration functions in spa_config.c
*/
@ -866,9 +882,11 @@ extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv,
/* Namespace manipulation */
extern spa_t *spa_lookup(const char *name);
extern spa_t *spa_lookup_lite(const char *name);
extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
extern void spa_remove(spa_t *spa);
extern spa_t *spa_next(spa_t *prev);
extern spa_t *spa_next_lite(spa_t *prev);
/* Refcount functions */
extern void spa_open_ref(spa_t *spa, const void *tag);

View File

@ -484,6 +484,7 @@ struct spa {
extern char *spa_config_path;
extern const char *zfs_deadman_failmode;
extern uint_t spa_slop_shift;
extern uint_t spa_namespace_delay_ms;
extern void spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, zio_t *zio, boolean_t cutinline);
extern void spa_load_spares(spa_t *spa);

View File

@ -769,6 +769,8 @@ void ksiddomain_rele(ksiddomain_t *);
(void) nanosleep(&ts, NULL); \
} while (0)
#define zfs_msleep(ms) zfs_sleep_until(gethrtime() + (MSEC2NSEC(ms)))
typedef int fstrans_cookie_t;
extern fstrans_cookie_t spl_fstrans_mark(void);

View File

@ -37,5 +37,6 @@
usleep_range(delta_us, delta_us + 100); \
} \
} while (0)
#define zfs_msleep(ms) zfs_sleep_until(gethrtime() + (MSEC2NSEC(ms)))
#endif /* _SYS_FS_ZFS_DELAY_H */

View File

@ -192,7 +192,7 @@ spa_import_rootpool(const char *name, bool checkpointrewind)
*/
config = spa_generate_rootconf(name);
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
if (config != NULL) {
pname = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME);
VERIFY0(strcmp(name, pname));

View File

@ -107,7 +107,7 @@ zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
"command", &command) != 0)
return (EINVAL);
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa = spa_by_guid(pool_guid, vdev_guid);
if (spa != NULL)
strcpy(name, spa_name(spa));

View File

@ -8205,7 +8205,7 @@ l2arc_dev_get_next(void)
* of cache devices (l2arc_dev_mtx). Once a device has been selected,
* both locks will be dropped and a spa config lock held instead.
*/
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
mutex_enter(&l2arc_dev_mtx);
/* if there are no vdevs, there is nothing to do */

View File

@ -728,7 +728,7 @@ mmp_signal_all_threads(void)
{
spa_t *spa = NULL;
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
while ((spa = spa_next(spa))) {
if (spa->spa_state == POOL_STATE_ACTIVE)
mmp_signal_thread(spa);

View File

@ -1038,7 +1038,7 @@ spa_change_guid(spa_t *spa, const uint64_t *guidp)
int error;
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
if (guidp != NULL) {
guid = *guidp;
@ -5635,7 +5635,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
spa_load_note(spa, "LOADED");
fail:
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa->spa_load_thread = NULL;
cv_broadcast(&spa_namespace_cv);
@ -5800,7 +5800,7 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag,
* avoid dsl_dir_open() calling this in the first place.
*/
if (MUTEX_NOT_HELD(&spa_namespace_lock)) {
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
locked = B_TRUE;
}
@ -5898,6 +5898,46 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag,
return (0);
}
/*
 * This is a lightweight version of spa_open_common() used only for getting
 * stats.  It does not use the spa_namespace_lock; the caller must hold
 * spa_namespace_lite_lock (reader is sufficient).
 *
 * Returns:
 *
 * 0	Success
 * ENOENT	No pool with that name
 * EAGAIN	Try doing a spa_open_common() instead
 *
 * On failure, *spapp will be set to NULL.
 */
static int
spa_read_config_lite(const char *pool, spa_t **spapp, nvlist_t **config)
{
	spa_t *spa;

	ASSERT(RW_LOCK_HELD(&spa_namespace_lite_lock));

	if ((spa = spa_lookup_lite(pool)) == NULL) {
		*spapp = NULL;
		return (SET_ERROR(ENOENT));
	}

	/*
	 * Special case: If the pool isn't initialized yet, then tell the caller
	 * to call spa_open_common() instead.  That will do the full,
	 * heavyweight load of spa_t.
	 */
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
		*spapp = NULL;
		/* SET_ERROR() for consistency with the ENOENT path above */
		return (SET_ERROR(EAGAIN));
	}

	if (config != NULL)
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	*spapp = spa;

	return (0);
}
int
spa_open_rewind(const char *name, spa_t **spapp, const void *tag,
nvlist_t *policy, nvlist_t **config)
@ -5920,7 +5960,7 @@ spa_inject_addref(char *name)
{
spa_t *spa;
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
if ((spa = spa_lookup(name)) == NULL) {
mutex_exit(&spa_namespace_lock);
return (NULL);
@ -5934,7 +5974,7 @@ spa_inject_addref(char *name)
void
spa_inject_delref(spa_t *spa)
{
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa->spa_inject_ref--;
mutex_exit(&spa_namespace_lock);
}
@ -6134,9 +6174,26 @@ spa_get_stats(const char *name, nvlist_t **config,
{
int error;
spa_t *spa;
boolean_t is_lite = B_TRUE;
*config = NULL;
error = spa_open_common(name, &spa, FTAG, NULL, config);
rw_enter(&spa_namespace_lite_lock, RW_READER);
/*
* First, try a lightweight "read the config". This will avoid the
* spa_namespace_lock and will work if the pool is already imported.
*/
error = spa_read_config_lite(name, &spa, config);
/*
* The pool is in a state where it needs the full, heavyweight,
* spa_open_common().
*/
if (error == EAGAIN) {
rw_exit(&spa_namespace_lite_lock);
error = spa_open_common(name, &spa, FTAG, NULL, config);
is_lite = B_FALSE;
}
if (spa != NULL) {
/*
@ -6179,14 +6236,21 @@ spa_get_stats(const char *name, nvlist_t **config,
*/
if (altroot) {
if (spa == NULL) {
mutex_enter(&spa_namespace_lock);
spa = spa_lookup(name);
if (!is_lite) {
mutex_enter(&spa_namespace_lock);
spa = spa_lookup(name);
} else {
spa = spa_lookup_lite(name);
}
if (spa)
spa_altroot(spa, altroot, buflen);
else
altroot[0] = '\0';
spa = NULL;
mutex_exit(&spa_namespace_lock);
if (!is_lite)
mutex_exit(&spa_namespace_lock);
} else {
spa_altroot(spa, altroot, buflen);
}
@ -6194,9 +6258,12 @@ spa_get_stats(const char *name, nvlist_t **config,
if (spa != NULL) {
spa_config_exit(spa, SCL_CONFIG, FTAG);
spa_close(spa, FTAG);
if (!is_lite)
spa_close(spa, FTAG);
}
if (is_lite)
rw_exit(&spa_namespace_lite_lock);
return (error);
}
@ -6406,7 +6473,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
/*
* If this pool already exists, return failure.
*/
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
if (spa_lookup(poolname) != NULL) {
mutex_exit(&spa_namespace_lock);
return (SET_ERROR(EEXIST));
@ -6710,7 +6777,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
/*
* If a pool with this name exists, return failure.
*/
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
if (spa_lookup(pool) != NULL) {
mutex_exit(&spa_namespace_lock);
return (SET_ERROR(EEXIST));
@ -6898,7 +6965,7 @@ spa_tryimport(nvlist_t *tryconfig)
(void) snprintf(name, MAXPATHLEN, "%s-%llx-%s",
TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname);
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa = spa_add(name, tryconfig, NULL);
spa_activate(spa, SPA_MODE_READ);
kmem_free(name, MAXPATHLEN);
@ -7024,7 +7091,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
if (!(spa_mode_global & SPA_MODE_WRITE))
return (SET_ERROR(EROFS));
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
if ((spa = spa_lookup(pool)) == NULL) {
mutex_exit(&spa_namespace_lock);
return (SET_ERROR(ENOENT));
@ -7048,7 +7115,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
zvol_remove_minors(spa, spa_name(spa), B_TRUE);
taskq_wait(spa->spa_zvol_taskq);
}
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa->spa_export_thread = curthread;
spa_close(spa, FTAG);
@ -7096,7 +7163,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
if (!force && new_state == POOL_STATE_EXPORTED &&
spa_has_active_shared_spare(spa)) {
error = SET_ERROR(EXDEV);
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
goto fail;
}
@ -7167,7 +7234,7 @@ export_spa:
/*
* Take the namespace lock for the actual spa_t removal
*/
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
if (new_state != POOL_STATE_UNINITIALIZED) {
if (!hardforce)
spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
@ -7202,6 +7269,7 @@ fail:
return (error);
}
/*
* Destroy a storage pool.
*/
@ -7408,7 +7476,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift)
*/
(void) spa_vdev_exit(spa, vd, txg, 0);
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
mutex_exit(&spa_namespace_lock);
@ -8034,7 +8102,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
if (unspare) {
spa_t *altspa = NULL;
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
while ((altspa = spa_next(altspa)) != NULL) {
if (altspa->spa_state != POOL_STATE_ACTIVE ||
altspa == spa)
@ -8043,7 +8111,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
spa_open_ref(altspa, FTAG);
mutex_exit(&spa_namespace_lock);
(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa_close(altspa, FTAG);
}
mutex_exit(&spa_namespace_lock);
@ -8053,7 +8121,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
}
/* all done with the spa; OK to release */
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa_close(spa, FTAG);
mutex_exit(&spa_namespace_lock);
@ -8148,7 +8216,7 @@ spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
* we can properly assess the vdev state before we commit to
* the initializing operation.
*/
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
@ -8269,7 +8337,7 @@ spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
* we can properly assess the vdev state before we commit to
* the TRIM operation.
*/
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
@ -8974,7 +9042,7 @@ spa_async_thread(void *arg)
if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
uint64_t old_space, new_space;
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
old_space = metaslab_class_get_space(spa_normal_class(spa));
old_space += metaslab_class_get_space(spa_special_class(spa));
old_space += metaslab_class_get_space(spa_dedup_class(spa));
@ -9049,7 +9117,7 @@ spa_async_thread(void *arg)
dsl_scan_restart_resilver(dp, 0);
if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
vdev_initialize_restart(spa->spa_root_vdev);
spa_config_exit(spa, SCL_CONFIG, FTAG);
@ -9057,7 +9125,7 @@ spa_async_thread(void *arg)
}
if (tasks & SPA_ASYNC_TRIM_RESTART) {
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
vdev_trim_restart(spa->spa_root_vdev);
spa_config_exit(spa, SCL_CONFIG, FTAG);
@ -9065,7 +9133,7 @@ spa_async_thread(void *arg)
}
if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) {
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
vdev_autotrim_restart(spa);
spa_config_exit(spa, SCL_CONFIG, FTAG);
@ -9076,7 +9144,7 @@ spa_async_thread(void *arg)
* Kick off L2 cache whole device TRIM.
*/
if (tasks & SPA_ASYNC_L2CACHE_TRIM) {
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
vdev_trim_l2arc(spa);
spa_config_exit(spa, SCL_CONFIG, FTAG);
@ -9087,7 +9155,7 @@ spa_async_thread(void *arg)
* Kick off L2 cache rebuilding.
*/
if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER);
l2arc_spa_rebuild_start(spa);
spa_config_exit(spa, SCL_L2ARC, FTAG);
@ -10292,7 +10360,7 @@ void
spa_sync_allpools(void)
{
spa_t *spa = NULL;
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
while ((spa = spa_next(spa)) != NULL) {
if (spa_state(spa) != POOL_STATE_ACTIVE ||
!spa_writeable(spa) || spa_suspended(spa))
@ -10300,7 +10368,7 @@ spa_sync_allpools(void)
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
txg_wait_synced(spa_get_dsl(spa), 0);
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa_close(spa, FTAG);
}
mutex_exit(&spa_namespace_lock);
@ -10450,7 +10518,7 @@ spa_evict_all(void)
* Remove all cached state. All pools should be closed now,
* so every spa in the AVL tree should be unreferenced.
*/
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
while ((spa = spa_next(NULL)) != NULL) {
/*
* Stop async tasks. The async thread may need to detach
@ -10460,7 +10528,7 @@ spa_evict_all(void)
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
spa_async_suspend(spa);
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa_close(spa, FTAG);
if (spa->spa_state != POOL_STATE_UNINITIALIZED) {

View File

@ -134,7 +134,7 @@ spa_config_load(void)
* Iterate over all elements in the nvlist, creating a new spa_t for
* each one with the specified configuration.
*/
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
nvpair = NULL;
while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
@ -375,12 +375,9 @@ spa_all_configs(uint64_t *generation, nvlist_t **pools)
if (*generation == spa_config_generation)
return (SET_ERROR(EEXIST));
int error = mutex_enter_interruptible(&spa_namespace_lock);
if (error)
return (SET_ERROR(EINTR));
rw_enter(&spa_namespace_lite_lock, RW_READER);
*pools = fnvlist_alloc();
while ((spa = spa_next(spa)) != NULL) {
while ((spa = spa_next_lite(spa)) != NULL) {
if (INGLOBALZONE(curproc) ||
zone_dataset_visible(spa_name(spa), NULL)) {
mutex_enter(&spa->spa_props_lock);
@ -390,7 +387,7 @@ spa_all_configs(uint64_t *generation, nvlist_t **pools)
}
}
*generation = spa_config_generation;
mutex_exit(&spa_namespace_lock);
rw_exit(&spa_namespace_lite_lock);
return (0);
}

View File

@ -238,6 +238,30 @@
avl_tree_t spa_namespace_avl;
kmutex_t spa_namespace_lock;
/*
* The spa_namespace_lock was originally meant to protect the spa_namespace_avl
* AVL tree. The spa_namespace_avl tree held the mappings from pool names to
* spa_t's. So if you wanted to lookup the spa_t for the "tank" pool, you would
* do an AVL search for "tank" while holding spa_namespace_lock.
*
* Over time though the spa_namespace_lock was re-purposed to protect other
* critical codepaths in the spa subsystem as well. In many cases we don't know
* what the original authors meant to protect with it, or if they needed it for
* read-only or read-write protection. It is simply "too big and risky to fix
* properly".
*
* The workaround is to add a new lightweight version of the spa_namespace_lock
* called spa_namespace_lite_lock. spa_namespace_lite_lock only protects the
* AVL tree, and nothing else. It can be used for read-only access to the AVL
* tree without requiring the spa_namespace_lock. Calls to spa_lookup_lite()
* and spa_next_lite() only need to acquire a reader lock on
* spa_namespace_lite_lock; they do not need to also acquire the old
* spa_namespace_lock. This allows us to still run zpool status even if the zfs
* module has spa_namespace_lock held. Note that these AVL tree locks only
* protect the tree, not the actual spa_t contents.
*/
krwlock_t spa_namespace_lite_lock;
kcondvar_t spa_namespace_cv;
static const int spa_max_replication_override = SPA_DVAS_PER_BP;
@ -402,6 +426,12 @@ static int spa_cpus_per_allocator = 4;
*/
const char *zfs_active_allocator = "dynamic";
/*
* Add a delay of 'spa_namespace_delay_ms' milliseconds after taking the
* spa_namespace lock. This is only used for ZTS testing.
*/
uint_t spa_namespace_delay_ms = 0;
void
spa_load_failed(spa_t *spa, const char *fmt, ...)
{
@ -607,6 +637,30 @@ spa_config_held(spa_t *spa, int locks, krw_t rw)
* ==========================================================================
*/
spa_t *
spa_lookup_lite(const char *name)
{
static spa_t search; /* spa_t is large; don't allocate on stack */
spa_t *spa;
avl_index_t where;
char *cp;
ASSERT(RW_LOCK_HELD(&spa_namespace_lite_lock));
(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
/*
* If it's a full dataset name, figure out the pool name and
* just use that.
*/
cp = strpbrk(search.spa_name, "/@#");
if (cp != NULL)
*cp = '\0';
spa = avl_find(&spa_namespace_avl, &search, &where);
return (spa);
}
/*
* Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held.
* Returns NULL if no matching spa_t is found.
@ -632,7 +686,9 @@ retry:
if (cp != NULL)
*cp = '\0';
rw_enter(&spa_namespace_lite_lock, RW_READER);
spa = avl_find(&spa_namespace_avl, &search, &where);
rw_exit(&spa_namespace_lite_lock);
if (spa == NULL)
return (NULL);
@ -746,7 +802,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa_stats_init(spa);
ASSERT(MUTEX_HELD(&spa_namespace_lock));
rw_enter(&spa_namespace_lite_lock, RW_WRITER);
avl_add(&spa_namespace_avl, spa);
rw_exit(&spa_namespace_lite_lock);
/*
* Set the alternate root, if there is one.
@ -850,7 +908,9 @@ spa_remove(spa_t *spa)
nvlist_free(spa->spa_config_splitting);
rw_enter(&spa_namespace_lite_lock, RW_WRITER);
avl_remove(&spa_namespace_avl, spa);
rw_exit(&spa_namespace_lite_lock);
if (spa->spa_root)
spa_strfree(spa->spa_root);
@ -935,6 +995,20 @@ spa_next(spa_t *prev)
return (avl_first(&spa_namespace_avl));
}
/*
 * Return the pool that follows 'prev' in the namespace AVL tree, or the
 * first pool when 'prev' is NULL.  Returns NULL once the tree has been
 * exhausted.  The caller must hold spa_namespace_lite_lock; a reader
 * hold is sufficient since this only walks the tree.
 */
spa_t *
spa_next_lite(spa_t *prev)
{
	ASSERT(RW_LOCK_HELD(&spa_namespace_lite_lock));

	return ((prev == NULL) ? avl_first(&spa_namespace_avl) :
	    AVL_NEXT(&spa_namespace_avl, prev));
}
/*
* ==========================================================================
* SPA refcount functions
@ -1238,7 +1312,7 @@ uint64_t
spa_vdev_enter(spa_t *spa)
{
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
ASSERT0(spa->spa_export_thread);
@ -1257,7 +1331,7 @@ uint64_t
spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
{
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
ASSERT0(spa->spa_export_thread);
@ -1462,7 +1536,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
* If the config changed, update the config cache.
*/
if (config_changed) {
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
mutex_exit(&spa_namespace_lock);
}
@ -2164,7 +2238,7 @@ spa_set_deadman_ziotime(hrtime_t ns)
spa_t *spa = NULL;
if (spa_mode_global != SPA_MODE_UNINIT) {
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
while ((spa = spa_next(spa)) != NULL)
spa->spa_deadman_ziotime = ns;
mutex_exit(&spa_namespace_lock);
@ -2177,7 +2251,7 @@ spa_set_deadman_synctime(hrtime_t ns)
spa_t *spa = NULL;
if (spa_mode_global != SPA_MODE_UNINIT) {
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
while ((spa = spa_next(spa)) != NULL)
spa->spa_deadman_synctime = ns;
mutex_exit(&spa_namespace_lock);
@ -2536,6 +2610,7 @@ spa_init(spa_mode_t mode)
{
mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
rw_init(&spa_namespace_lite_lock, NULL, RW_DEFAULT, NULL);
mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
@ -2999,7 +3074,7 @@ param_set_deadman_failmode_common(const char *val)
return (SET_ERROR(EINVAL));
if (spa_mode_global != SPA_MODE_UNINIT) {
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
while ((spa = spa_next(spa)) != NULL)
spa_set_deadman_failmode(spa, val);
mutex_exit(&spa_namespace_lock);
@ -3011,9 +3086,11 @@ param_set_deadman_failmode_common(const char *val)
/* Namespace manipulation */
EXPORT_SYMBOL(spa_lookup);
EXPORT_SYMBOL(spa_lookup_lite);
EXPORT_SYMBOL(spa_add);
EXPORT_SYMBOL(spa_remove);
EXPORT_SYMBOL(spa_next);
EXPORT_SYMBOL(spa_next_lite);
/* Refcount functions */
EXPORT_SYMBOL(spa_open_ref);
@ -3120,6 +3197,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW,
"Place user data indirect blocks into the special class");
ZFS_MODULE_PARAM(zfs_spa, spa_, namespace_delay_ms, UINT, ZMOD_RW,
"millisecond sleep after taking spa_namespace_lock (for testing only)");
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode,
param_set_deadman_failmode, param_get_charp, ZMOD_RW,

View File

@ -1562,7 +1562,7 @@ zfs_ereport_zvol_post(const char *subclass, const char *name,
char *r;
boolean_t locked = mutex_owned(&spa_namespace_lock);
if (!locked) mutex_enter(&spa_namespace_lock);
if (!locked) mutex_enter_ns(&spa_namespace_lock);
spa_t *spa = spa_lookup(name);
if (!locked) mutex_exit(&spa_namespace_lock);

View File

@ -3012,7 +3012,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc)
if (pair != NULL && strcmp(nvpair_name(pair),
zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
nvlist_next_nvpair(props, pair) == NULL) {
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
if ((spa = spa_lookup(zc->zc_name)) != NULL) {
spa_configfile_set(spa, props, B_FALSE);
spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
@ -3060,27 +3060,14 @@ zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl)
props = NULL;
}
if ((error = spa_open(pool, &spa, FTAG)) != 0) {
/*
* If the pool is faulted, there may be properties we can still
* get (such as altroot and cachefile), so attempt to get them
* anyway.
*/
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(pool)) != NULL) {
error = spa_prop_get(spa, outnvl);
if (error == 0 && props != NULL)
error = spa_prop_get_nvlist(spa, props, n_props,
outnvl);
}
mutex_exit(&spa_namespace_lock);
} else {
rw_enter(&spa_namespace_lite_lock, RW_READER);
if ((spa = spa_lookup_lite(pool)) != NULL) {
error = spa_prop_get(spa, outnvl);
if (error == 0 && props != NULL)
error = spa_prop_get_nvlist(spa, props, n_props,
outnvl);
spa_close(spa, FTAG);
}
rw_exit(&spa_namespace_lite_lock);
return (error);
}
@ -5956,7 +5943,7 @@ zfs_ioc_clear(zfs_cmd_t *zc)
/*
* On zpool clear we also fix up missing slogs
*/
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
spa = spa_lookup(zc->zc_name);
if (spa == NULL) {
mutex_exit(&spa_namespace_lock);

View File

@ -907,7 +907,7 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
if (zio_pool_handler_exists(name, record->zi_cmd))
return (SET_ERROR(EEXIST));
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
boolean_t has_spa = spa_lookup(name) != NULL;
mutex_exit(&spa_namespace_lock);
@ -994,7 +994,7 @@ zio_inject_list_next(int *id, char *name, size_t buflen,
inject_handler_t *handler;
int ret;
mutex_enter(&spa_namespace_lock);
mutex_enter_ns(&spa_namespace_lock);
rw_enter(&inject_lock, RW_READER);
for (handler = list_head(&inject_handlers); handler != NULL;

View File

@ -559,7 +559,7 @@ tests = ['zpool_status_001_pos', 'zpool_status_002_pos',
'zpool_status_003_pos', 'zpool_status_004_pos',
'zpool_status_005_pos', 'zpool_status_006_pos',
'zpool_status_007_pos', 'zpool_status_008_pos',
'zpool_status_features_001_pos']
'zpool_status_features_001_pos', 'zpool_status_namespace_lock']
tags = ['functional', 'cli_root', 'zpool_status']
[tests/functional/cli_root/zpool_sync]

View File

@ -90,6 +90,7 @@ export SYSTEM_FILES_COMMON='awk
sync
tail
tar
time
timeout
touch
tr
@ -230,6 +231,7 @@ export ZFSTEST_FILES='badsend
edonr_test
skein_test
sha2_test
time
ctime
truncate_test
ereports

View File

@ -98,6 +98,7 @@ VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev
VOL_MODE vol.mode zvol_volmode
VOL_RECURSIVE vol.recursive UNSUPPORTED
VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
SPA_NAMESPACE_DELAY_MS spa.namespace_delay_ms spa_namespace_delay_ms
BCLONE_ENABLED bclone_enabled zfs_bclone_enabled
BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty
XATTR_COMPAT xattr_compat zfs_xattr_compat

View File

@ -1256,6 +1256,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_status/zpool_status_007_pos.ksh \
functional/cli_root/zpool_status/zpool_status_008_pos.ksh \
functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \
functional/cli_root/zpool_status/zpool_status_namespace_lock.ksh \
functional/cli_root/zpool_sync/cleanup.ksh \
functional/cli_root/zpool_sync/setup.ksh \
functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \

View File

@ -0,0 +1,144 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Test out that zpool status|get|list are not affected by the
# 'spa_namespace_lock' being held. This was an issue in the past
# where a pool would hang while holding the lock, and 'zpool status'
# would hang waiting for the lock.
#
# STRATEGY:
# 1. Create a test pool. It should be created quickly.
# 2. Set SPA_NAMESPACE_DELAY_MS to 1 second.
# 3. Create another test pool. It should take over 1 second due to the
# delay.
# 4. Run zpool status|get|list. They should all run quickly.
# 5. In the background, destroy one of the pools and export the other one
# at the same time. This should cause a lot of lock contention. Note
# that the 1 sec delay is still in effect.
# 6. At the same time run some zpool commands and verify they still run in
# 200ms or less, and without error.
verify_runnable "global"
FILEDEV1="$TEST_BASE_DIR/filedev1.$$"
FILEDEV2="$TEST_BASE_DIR/filedev2.$$"
log_must truncate -s 100M "$FILEDEV1" "$FILEDEV2"
TESTPOOL1=testpool1
TESTPOOL2=testpool2
# Tear down test state: restore the namespace-delay tunable, destroy any
# pools the test left behind, and remove the backing file vdevs.
function cleanup
{
	restore_tunable SPA_NAMESPACE_DELAY_MS
	datasetexists $TESTPOOL1 && log_must zpool destroy $TESTPOOL1
	datasetexists $TESTPOOL2 && log_must zpool destroy $TESTPOOL2
	rm -f $FILEDEV1 $FILEDEV2

	# timetest_cmd appends its per-command debug output to /tmp/out2
	# (not /tmp/out) -- dump the correct file, then remove it.
	log_note "debug: $(cat /tmp/out2)"
	rm -f /tmp/out2
}
# Run a command and test how long it takes. Fail the test if the command
# fails or takes the wrong amount of time to run.
#
# The arguments consist of an array like this:
#
#	zpool get all -le 200
#
# The first part of the array is the command and its arguments.
# The 2nd to last argument is a ksh comparator operation like "-le" or "-ge".
# The last argument is the expected time in milliseconds for the comparator.
#
# So the example above reads as "run 'zpool get all' and check that it runs in
# 200 milliseconds or less".
#
function timetest_cmd
{
	# Run the command and record 'real' time. dec will be a fractional value
	# like '1.05' representing the number of seconds it took to run.
	array=($@)

	# Peel the comparator and target time off the end of the argument
	# list; what remains in 'array' is the command to run.
	target_ms="${array[@]: -1}"
	unset array[-1]
	comp="${array[@]: -1}"
	unset array[-1]

	# 'time -p' prints POSIX-format timing on stderr; redirect it into
	# $out while discarding the command's own stdout.  $rc is the exit
	# status of the timed command.
	out="$({ time -p "${array[@]}" 1> /dev/null; } 2>&1)"
	rc=$?
	dec=$(echo "$out" | awk '/^real /{print $2}')

	# Calculate milliseconds. The divide by one at the end removes the
	# decimal places.
	ms=$(bc <<< "scale=0; $dec * 1000 / 1")

	log_note "${array[@]}: $ms $comp $target_ms"

	# For debugging failures
	echo "$@: dec=$dec; rc=$rc; ms=$ms; $out" >> /tmp/out2

	if [ -z "$ms" ] || [ $rc != 0 ] ; then
		log_fail "Bad value: ms=$ms, rc=$rc: $out"
	fi

	# eval is needed so $comp expands into the test operator before the
	# [ ... ] command is parsed.
	if eval ! [ $ms $comp $target_ms ] ; then
		log_fail "Expected time wrong: $ms $comp $target_ms"
	fi
}
log_assert "Verify spa_namespace_lock isn't held by zpool status|get|list"
log_onexit cleanup
# A normal zpool create should take 200ms or less
timetest_cmd zpool create $TESTPOOL1 $FILEDEV1 -le 200
log_must save_tunable SPA_NAMESPACE_DELAY_MS
log_must set_tunable32 SPA_NAMESPACE_DELAY_MS 1000
# We added 1 sec hold time on spa_namespace lock. zpool create
# should now take at least 1000ms.
timetest_cmd zpool create $TESTPOOL2 $FILEDEV2 -ge 1000
# zpool status|get|list should not take spa_namespace_lock, so they should run
# quickly, even loaded down with options.
timetest_cmd zpool status -pPstvLD -le 200
timetest_cmd zpool get all -le 200
timetest_cmd zpool list -gLPv -le 200
# In the background, destroy one of the pools and export the other one at the
# same time. This should cause a lot of lock contention. At the same time
# run some zpool commands and verify they still run in 200ms or less, and
# without error.
zpool destroy $TESTPOOL1 &
zpool export $TESTPOOL2 &
for i in 1 2 3 4 ; do
timetest_cmd zpool status -pPstvLD -le 200
timetest_cmd zpool get all -le 200
timetest_cmd zpool list -gLPv -le 200
sleep 0.1
done
wait
log_pass "zpool status|get|list was not delayed by the spa_namespace_lock"