From 65d10bd87c408bfa13fa27bb6ad3ecc0e2e3775b Mon Sep 17 00:00:00 2001 From: Kevin Jin <33590050+jxdking@users.noreply.github.com> Date: Tue, 28 Mar 2023 11:43:41 -0400 Subject: [PATCH] Fix short-lived txg caused by autotrim Current autotrim causes short-lived txg through: 1. calling txg_wait_synced() in metaslab_enable() 2. calling txg_wait_open() with should_quiesce = true This patch addresses all the issues mentioned above. A new cv, vdev_autotrim_kick_cv is added to kick autotrim activity. It will be signaled once a txg is synced so that it does not change the original autotrim pace. Also because it is a cv, the wait is interruptible which speeds up the vdev_autotrim_stop_wait() call. Finally, combining big zfs_txg_timeout, txg_wait_open() also causes delay when exporting a pool. Reviewed-by: Brian Behlendorf Signed-off-by: jxdking Issue #8993 Closes #12194 --- include/sys/vdev_impl.h | 1 + include/sys/vdev_trim.h | 1 + module/zfs/spa.c | 3 ++ module/zfs/vdev.c | 2 ++ module/zfs/vdev_trim.c | 72 +++++++++++++++++++++++++++++++++-------- 5 files changed, 65 insertions(+), 14 deletions(-) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 73c0206efa..7cfffe3b4e 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -329,6 +329,7 @@ struct vdev { list_node_t vdev_trim_node; kmutex_t vdev_autotrim_lock; kcondvar_t vdev_autotrim_cv; + kcondvar_t vdev_autotrim_kick_cv; kthread_t *vdev_autotrim_thread; /* Protects vdev_trim_thread and vdev_trim_state. */ kmutex_t vdev_trim_lock; diff --git a/include/sys/vdev_trim.h b/include/sys/vdev_trim.h index 2a217f0d43..7a94d4af09 100644 --- a/include/sys/vdev_trim.h +++ b/include/sys/vdev_trim.h @@ -41,6 +41,7 @@ extern void vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state); extern void vdev_trim_stop_wait(spa_t *spa, list_t *vd_list); extern void vdev_trim_restart(vdev_t *vd); extern void vdev_autotrim(spa_t *spa); +extern void vdev_autotrim_kick(spa_t *spa); extern void vdev_autotrim_stop_all(spa_t *spa); extern void vdev_autotrim_stop_wait(vdev_t *vd); extern void vdev_autotrim_restart(spa_t *spa); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 0573daa92f..dc202978c0 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -9449,6 +9449,9 @@ spa_sync(spa_t *spa, uint64_t txg) spa_update_dspace(spa); + if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) + vdev_autotrim_kick(spa); + /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 1af5baeecd..ad932a7ba7 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -696,6 +696,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); @@ -1148,6 +1149,7 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_trim_io_lock); cv_destroy(&vd->vdev_trim_cv); cv_destroy(&vd->vdev_autotrim_cv); + cv_destroy(&vd->vdev_autotrim_kick_cv); cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 5b5076c872..0d71b94343 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -182,6 +182,25 @@ vdev_autotrim_should_stop(vdev_t *tvd) spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF); } +/* + * Wait for given number of kicks, return true if the wait is aborted due to + * vdev_autotrim_exit_wanted. + */ +static boolean_t +vdev_autotrim_wait_kick(vdev_t *vd, int num_of_kick) +{ + mutex_enter(&vd->vdev_autotrim_lock); + for (int i = 0; i < num_of_kick; i++) { + if (vd->vdev_autotrim_exit_wanted) + break; + cv_wait(&vd->vdev_autotrim_kick_cv, &vd->vdev_autotrim_lock); + } + boolean_t exit_wanted = vd->vdev_autotrim_exit_wanted; + mutex_exit(&vd->vdev_autotrim_lock); + + return (exit_wanted); +} + /* * The sync task for updating the on-disk state of a manual TRIM. This * is scheduled by vdev_trim_change_state(). @@ -1190,7 +1209,6 @@ vdev_autotrim_thread(void *arg) while (!vdev_autotrim_should_stop(vd)) { int txgs_per_trim = MAX(zfs_trim_txg_batch, 1); - boolean_t issued_trim = B_FALSE; uint64_t extent_bytes_max = zfs_trim_extent_bytes_max; uint64_t extent_bytes_min = zfs_trim_extent_bytes_min; @@ -1224,6 +1242,8 @@ vdev_autotrim_thread(void *arg) i += txgs_per_trim) { metaslab_t *msp = vd->vdev_ms[i]; range_tree_t *trim_tree; + boolean_t issued_trim = B_FALSE; + boolean_t wait_aborted = B_FALSE; spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); @@ -1374,7 +1394,18 @@ vdev_autotrim_thread(void *arg) range_tree_vacate(trim_tree, NULL, NULL); range_tree_destroy(trim_tree); - metaslab_enable(msp, issued_trim, B_FALSE); + /* + * Wait for couples of kicks, to ensure the trim io is + * synced. If the wait is aborted due to + * vdev_autotrim_exit_wanted, we need to signal + * metaslab_enable() to wait for sync. + */ + if (issued_trim) { + wait_aborted = vdev_autotrim_wait_kick(vd, + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE); + } + + metaslab_enable(msp, wait_aborted, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (uint64_t c = 0; c < children; c++) { @@ -1388,17 +1419,14 @@ vdev_autotrim_thread(void *arg) } kmem_free(tap, sizeof (trim_args_t) * children); + + if (vdev_autotrim_should_stop(vd)) + break; } spa_config_exit(spa, SCL_CONFIG, FTAG); - /* - * After completing the group of metaslabs wait for the next - * open txg. This is done to make sure that a minimum of - * zfs_trim_txg_batch txgs will occur before these metaslabs - * are trimmed again. - */ - txg_wait_open(spa_get_dsl(spa), 0, issued_trim); + vdev_autotrim_wait_kick(vd, 1); shift++; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -1476,11 +1504,9 @@ vdev_autotrim_stop_wait(vdev_t *tvd) mutex_enter(&tvd->vdev_autotrim_lock); if (tvd->vdev_autotrim_thread != NULL) { tvd->vdev_autotrim_exit_wanted = B_TRUE; - - while (tvd->vdev_autotrim_thread != NULL) { - cv_wait(&tvd->vdev_autotrim_cv, - &tvd->vdev_autotrim_lock); - } + cv_broadcast(&tvd->vdev_autotrim_kick_cv); + cv_wait(&tvd->vdev_autotrim_cv, + &tvd->vdev_autotrim_lock); ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL); tvd->vdev_autotrim_exit_wanted = B_FALSE; @@ -1488,6 +1514,24 @@ vdev_autotrim_stop_wait(vdev_t *tvd) mutex_exit(&tvd->vdev_autotrim_lock); } +void +vdev_autotrim_kick(spa_t *spa) +{ + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + vdev_t *root_vd = spa->spa_root_vdev; + vdev_t *tvd; + + for (uint64_t i = 0; i < root_vd->vdev_children; i++) { + tvd = root_vd->vdev_child[i]; + + mutex_enter(&tvd->vdev_autotrim_lock); + if (tvd->vdev_autotrim_thread != NULL) + cv_broadcast(&tvd->vdev_autotrim_kick_cv); + mutex_exit(&tvd->vdev_autotrim_lock); + } +} + /* * Wait for all of the vdev_autotrim_thread associated with the pool to * be terminated (canceled or stopped).