From fd0893cf1ff3c66b4793100412f4822591acf940 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 24 Jan 2023 12:20:32 -0500 Subject: [PATCH] Introduce minimal ZIL block commit delay Despite all optimizations, tests on actual hardware show that FreeBSD kernel can't sleep for less than ~2us. Similar tests on Linux show ~50us delay at least from nanosleep() (haven't tested inside kernel). It means that on very fast log device ZIL may not be able to satisfy zfs_commit_timeout_pct block commit timeout, increasing log latency more than desired. Handle that by introducing the zil_min_commit_timeout parameter, specifying minimal timeout value where additional delays to aggregate writes may be skipped. Also skip delays if the LWB is more than 7/8 full, which often happens if I/O sizes are constant and match one of LWB sizes. Both things are applied only if there were no already outstanding log blocks, which may indicate a single-threaded workload, which by definition cannot benefit from the commit delays. While there, add short time moving average to zl_last_lwb_latency to make it more stable. Tests of single-threaded 4KB writes to NVDIMM SLOG on FreeBSD show IOPS increase by 9% instead of expected 5%. For zfs_commit_timeout_pct of 1 there IOPS increase by 5.5% instead of expected 1%. Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Reviewed-by: Prakash Surya Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14418 --- man/man4/zfs.4 | 7 +++++++ module/zfs/zil.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index ed89142763..dbddf1bf59 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2126,6 +2126,13 @@ On very fragmented pools, lowering this .Pq typically to Sy 36kB can improve performance. . +.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64 +This sets the minimum delay in nanoseconds ZIL cares to delay block commit, +waiting for more records. 
+If ZIL writes are too fast, the kernel may not be able to sleep for so short an interval, +increasing log latency above that allowed by +.Sy zfs_commit_timeout_pct . +. .It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable the cache flush commands that are normally sent to disk by the ZIL after an LWB write has completed. diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 11e05e4778..aaf509a2fc 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -91,6 +91,14 @@ */ int zfs_commit_timeout_pct = 5; +/* + * Minimal time we care to delay commit waiting for more ZIL records. + * At least FreeBSD kernel can't sleep for less than 2us at its best. + * So requests to sleep for less than 5us are a waste of CPU time with + * a risk of significant log latency increase due to oversleep. + */ +static unsigned long zil_min_commit_timeout = 5000; + /* * See zil.h for more information about these fields. */ @@ -1155,7 +1163,8 @@ zil_lwb_flush_vdevs_done(zio_t *zio) lwb->lwb_tx = NULL; ASSERT3U(lwb->lwb_issued_timestamp, >, 0); - zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp; + zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 3 + + gethrtime() - lwb->lwb_issued_timestamp) / 4; lwb->lwb_root_zio = NULL; @@ -2283,8 +2292,9 @@ zil_process_commit_list(zilog_t *zilog) spa_t *spa = zilog->zl_spa; list_t nolwb_itxs; list_t nolwb_waiters; - lwb_t *lwb; + lwb_t *lwb, *plwb; itx_t *itx; + boolean_t first = B_TRUE; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); @@ -2306,6 +2316,9 @@ zil_process_commit_list(zilog_t *zilog) ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + first = (lwb->lwb_state != LWB_STATE_OPENED) && + ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL || + plwb->lwb_state == LWB_STATE_FLUSH_DONE); } while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) { @@ -2476,7 +2489,23 @@ zil_process_commit_list(zilog_t *zilog) * try and pack 
as many itxs into as few lwbs as * possible, without significantly impacting the latency * of each individual itx. + * + * If we had no already running or open LWBs, it can be + * the workload is single-threaded. And if the ZIL write + * latency is very small or if the LWB is almost full, it + * may be cheaper to bypass the delay. */ + if (lwb->lwb_state == LWB_STATE_OPENED && first) { + hrtime_t sleep = zilog->zl_last_lwb_latency * + zfs_commit_timeout_pct / 100; + if (sleep < zil_min_commit_timeout || + lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) { + lwb = zil_lwb_write_issue(zilog, lwb); + zilog->zl_cur_used = 0; + if (lwb == NULL) + zil_commit_writer_stall(zilog); + } + } } } @@ -3726,6 +3755,9 @@ EXPORT_SYMBOL(zil_set_logbias); ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, INT, ZMOD_RW, "ZIL block open timeout percentage"); +ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, ULONG, ZMOD_RW, + "Minimum delay we care for ZIL block commit"); + ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, "Disable intent logging replay");