From 3afdc97d91c24192db51b67762126a8d99d433db Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 25 Oct 2023 18:22:25 -0400 Subject: [PATCH] ZIO: Remove READY pipeline stage from root ZIOs zio_root() has no arguments for a ready callback or a parent ZIO. Except for one recent case in the ZIL code, if a root ZIO ever has a parent, that parent is also a root ZIO. This means root ZIOs do not need the READY pipeline stage, which takes some time to process and even more time waiting for the children and being woken by them, both for no good reason. The most visible effect of this change is that it avoids one taskq wakeup per ZIL block written, previously used to run zio_ready() for lwb_root_zio and skipped now. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15398 --- cmd/zdb/zdb.c | 5 ++--- include/sys/zio_impl.h | 3 +++ module/zfs/zil.c | 4 ++-- module/zfs/zio.c | 51 ++++++++++++++++++++++++++++++++++-------- 4 files changed, 49 insertions(+), 14 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index b39a0e8825..3c282f3fc9 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -8716,8 +8716,6 @@ zdb_read_block(char *thing, spa_t *spa) BP_SET_CHECKSUM(bp, ck); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); - czio->io_bp = bp; - if (vd == vd->vdev_top) { zio_nowait(zio_read(czio, spa, bp, pabd, psize, NULL, NULL, @@ -8736,7 +8734,8 @@ zdb_read_block(char *thing, spa_t *spa) } error = zio_wait(czio); if (error == 0 || error == ECKSUM) { - zio_t *ck_zio = zio_root(spa, NULL, NULL, 0); + zio_t *ck_zio = zio_null(NULL, spa, NULL, + NULL, NULL, 0); ck_zio->io_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); ck_zio->io_bp = bp; diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index 29a05986cd..febe0a87b4 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -159,6 +159,9 @@ enum zio_stage { ZIO_STAGE_DONE = 1 << 25 /* RWFCI */ }; +#define ZIO_ROOT_PIPELINE \ + 
ZIO_STAGE_DONE + #define ZIO_INTERLOCK_STAGES \ (ZIO_STAGE_READY | \ ZIO_STAGE_DONE) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 218031a8af..ce2cb8b144 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -2155,8 +2155,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes, lrw->lr_length); if (lwb->lwb_child_zio == NULL) { - lwb->lwb_child_zio = zio_root( - zilog->zl_spa, NULL, NULL, + lwb->lwb_child_zio = zio_null(NULL, + zilog->zl_spa, NULL, NULL, NULL, ZIO_FLAG_CANFAIL); } } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 3b3b40fa73..3eb472a9fd 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -634,6 +634,11 @@ zio_add_child(zio_t *pio, zio_t *cio) */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + /* Parent should not have READY stage if child doesn't have it. */ + IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 && + (cio->io_child_type != ZIO_CHILD_VDEV), + (pio->io_pipeline & ZIO_STAGE_READY) == 0); + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; @@ -665,6 +670,11 @@ zio_add_child_first(zio_t *pio, zio_t *cio) */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + /* Parent should not have READY stage if child doesn't have it. 
*/ + IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 && + (cio->io_child_type != ZIO_CHILD_VDEV), + (pio->io_pipeline & ZIO_STAGE_READY) == 0); + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; @@ -901,7 +911,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; - zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); + zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) || + (pipeline & ZIO_STAGE_READY) == 0; zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) @@ -932,6 +943,10 @@ zio_destroy(zio_t *zio) kmem_cache_free(zio_cache, zio); } +/* + * ZIO intended to be between others. Provides synchronization at READY + * and DONE pipeline stages and calls the respective callbacks. + */ zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, zio_flag_t flags) @@ -945,10 +960,22 @@ zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, return (zio); } +/* + * ZIO intended to be a root of a tree. Unlike null ZIO does not have a + * READY pipeline stage (is ready on creation), so it should not be used + * as child of any ZIO that may need waiting for grandchildren READY stage + * (any other ZIO type). 
+ */ zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags) { - return (zio_null(NULL, spa, NULL, done, private, flags)); + zio_t *zio; + + zio = zio_create(NULL, spa, 0, NULL, NULL, 0, 0, done, private, + ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, + ZIO_STAGE_OPEN, ZIO_ROOT_PIPELINE); + + return (zio); } static int @@ -2396,13 +2423,14 @@ static void zio_reexecute(void *arg) { zio_t *pio = arg; - zio_t *cio, *cio_next; + zio_t *cio, *cio_next, *gio; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); + mutex_enter(&pio->io_lock); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; @@ -2410,8 +2438,16 @@ zio_reexecute(void *arg) pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_state[w] = 0; + pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) || + (pio->io_pipeline & ZIO_STAGE_READY) == 0; + pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE); + zio_link_t *zl = NULL; + while ((gio = zio_walk_parents(pio, &zl)) != NULL) { + for (int w = 0; w < ZIO_WAIT_TYPES; w++) { + gio->io_children[pio->io_child_type][w] += + !pio->io_state[w]; + } + } for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; @@ -2425,12 +2461,9 @@ zio_reexecute(void *arg) * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ - zio_link_t *zl = NULL; - mutex_enter(&pio->io_lock); + zl = NULL; for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); mutex_enter(&pio->io_lock);