From 12f2b1f65e91428465319dd3ebaa8c4466476f8e Mon Sep 17 00:00:00 2001 From: Rob N Date: Wed, 2 Aug 2023 01:56:30 +1000 Subject: [PATCH 01/11] zdb: include cloned blocks in block statistics This gives `zdb -b` support for clone blocks. Previously, it didn't know what clones were, so would count their space allocation multiple times and then report leaked space (or, in debug, would assert trying to claim blocks a second time). This commit fixes those bugs, and reports the number of clones and the space "used" (saved) by them. Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Closes #15123 --- cmd/zdb/zdb.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++- include/sys/brt.h | 1 + module/zfs/brt.c | 31 +++++++++++++++++++ 3 files changed, 108 insertions(+), 1 deletion(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 9568d2bbfe..4b9921d47b 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -5342,12 +5343,20 @@ static const char *zdb_ot_extname[] = { #define ZB_TOTAL DN_MAX_LEVELS #define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) +typedef struct zdb_brt_entry { + dva_t zbre_dva; + uint64_t zbre_refcount; + avl_node_t zbre_node; +} zdb_brt_entry_t; + typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_removing_size; uint64_t zcb_checkpoint_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; + uint64_t zcb_clone_asize; + uint64_t zcb_clone_blocks; uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; @@ -5368,6 +5377,8 @@ typedef struct zdb_cb { int zcb_haderrors; spa_t *zcb_spa; uint32_t **zcb_vd_obsolete_counts; + avl_tree_t zcb_brt; + boolean_t zcb_brt_is_active; } zdb_cb_t; /* test if two DVA offsets from same vdev are within the same metaslab */ @@ -5662,6 +5673,45 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); zcb->zcb_asize_total += BP_GET_ASIZE(bp); + if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) { + /* + * Cloned blocks are special. We need to count them, so we can + * later uncount them when reporting leaked space, and we must + * only claim them them once. + * + * To do this, we keep our own in-memory BRT. For each block + * we haven't seen before, we look it up in the real BRT and + * if its there, we note it and its refcount then proceed as + * normal. If we see the block again, we count it as a clone + * and then give it no further consideration. 
+ */ + zdb_brt_entry_t zbre_search, *zbre; + avl_index_t where; + + zbre_search.zbre_dva = bp->blk_dva[0]; + zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); + if (zbre != NULL) { + zcb->zcb_clone_asize += BP_GET_ASIZE(bp); + zcb->zcb_clone_blocks++; + + zbre->zbre_refcount--; + if (zbre->zbre_refcount == 0) { + avl_remove(&zcb->zcb_brt, zbre); + umem_free(zbre, sizeof (zdb_brt_entry_t)); + } + return; + } + + uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp); + if (crefcnt > 0) { + zbre = umem_zalloc(sizeof (zdb_brt_entry_t), + UMEM_NOFAIL); + zbre->zbre_dva = bp->blk_dva[0]; + zbre->zbre_refcount = crefcnt; + avl_insert(&zcb->zcb_brt, zbre, where); + } + } + if (dump_opt['L']) return; @@ -6664,6 +6714,20 @@ deleted_livelists_dump_mos(spa_t *spa) iterate_deleted_livelists(spa, dump_livelist_cb, NULL); } +static int +zdb_brt_entry_compare(const void *zcn1, const void *zcn2) +{ + const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva; + const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva; + int cmp; + + cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); + if (cmp == 0) + cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)); + + return (cmp); +} + static int dump_block_stats(spa_t *spa) { @@ -6678,6 +6742,13 @@ dump_block_stats(spa_t *spa) zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); + if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { + avl_create(&zcb->zcb_brt, zdb_brt_entry_compare, + sizeof (zdb_brt_entry_t), + offsetof(zdb_brt_entry_t, zbre_node)); + zcb->zcb_brt_is_active = B_TRUE; + } + (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ? "metadata " : "", @@ -6779,7 +6850,8 @@ dump_block_stats(spa_t *spa) metaslab_class_get_alloc(spa_special_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)) + get_unflushed_alloc_space(spa); - total_found = tzb->zb_asize - zcb->zcb_dedup_asize + + total_found = + tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize + zcb->zcb_removing_size + zcb->zcb_checkpoint_size; if (total_found == total_alloc && !dump_opt['L']) { @@ -6820,6 +6892,9 @@ dump_block_stats(spa_t *spa) "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize, (u_longlong_t)zcb->zcb_dedup_blocks, (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0); + (void) printf("\t%-16s %14llu count: %6llu\n", + "bp cloned:", (u_longlong_t)zcb->zcb_clone_asize, + (u_longlong_t)zcb->zcb_clone_blocks); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); diff --git a/include/sys/brt.h b/include/sys/brt.h index 0761159e3f..f73df95058 100644 --- a/include/sys/brt.h +++ b/include/sys/brt.h @@ -36,6 +36,7 @@ extern "C" { #endif extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp); +extern uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp); extern uint64_t brt_get_dspace(spa_t *spa); extern uint64_t brt_get_used(spa_t *spa); diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 877b503a1b..e8218fb268 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1544,6 +1544,37 @@ out: return (B_FALSE); } +uint64_t +brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) +{ + brt_t *brt = spa->spa_brt; + brt_vdev_t *brtvd; + brt_entry_t bre_search, *bre; + uint64_t vdevid, refcnt; + int error; + + brt_entry_fill(bp, &bre_search, &vdevid); + + brt_rlock(brt); + + brtvd = brt_vdev(brt, vdevid); + ASSERT(brtvd != NULL); + + bre = avl_find(&brtvd->bv_tree, 
&bre_search, NULL); + if (bre == NULL) { + error = brt_entry_lookup(brt, brtvd, &bre_search); + ASSERT(error == 0 || error == ENOENT); + if (error == ENOENT) + refcnt = 0; + else + refcnt = bre_search.bre_refcount; + } else + refcnt = bre->bre_refcount; + + brt_unlock(brt); + return (refcnt); +} + static void brt_prefetch(brt_t *brt, const blkptr_t *bp) { From c47f0f441759483bec05545572a8e6841393f879 Mon Sep 17 00:00:00 2001 From: Rob N Date: Wed, 2 Aug 2023 04:31:11 +1000 Subject: [PATCH 02/11] linux/copy_file_range: properly request a fallback copy on Linux <5.3 Before Linux 5.3, the filesystem's copy_file_range handler had to signal back to the kernel that we can't fulfill the request and it should fallback to a content copy. This is done by returning -EOPNOTSUPP. This commit converts the EXDEV return from zfs_clone_range to EOPNOTSUPP, to force the kernel to fallback for all the valid reasons it might be unable to clone. Without it the copy_file_range() syscall will return EXDEV to userspace, breaking its semantics. Add test for copy_file_range fallbacks. copy_file_range should always fallback to a content copy whenever ZFS can't service the request with cloning. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Closes #15131 --- module/os/linux/zfs/zpl_file_range.c | 7 ++ tests/runfiles/linux.run | 1 + tests/test-runner/bin/zts-report.py.in | 2 + tests/zfs-tests/tests/Makefile.am | 1 + .../block_cloning_copyfilerange_fallback.ksh | 86 +++++++++++++++++++ 5 files changed, 97 insertions(+) create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 18efebfc1d..72384b638b 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -106,6 +106,13 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, if (ret == -EOPNOTSUPP || ret == -EXDEV) ret = generic_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); +#else + /* + * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal + * to the kernel that it should fallback to a content copy. 
+ */ + if (ret == -EXDEV) + ret = -EOPNOTSUPP; #endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ return (ret); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index b68202d849..4747b98373 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -36,6 +36,7 @@ tags = ['functional', 'atime'] [tests/functional/block_cloning:Linux] tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', + 'block_cloning_copyfilerange_fallback', 'block_cloning_ficlone', 'block_cloning_ficlonerange', 'block_cloning_ficlonerange_partial', 'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone', diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index c9a2b4179a..5c4b3a7bcd 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -300,6 +300,8 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_partial': ['SKIP', cfr_reason], + 'block_cloning/block_cloning_copyfilerange_fallback': + ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_cross_dataset': ['SKIP', cfr_cross_reason], }) diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 0819cb6b57..3b6b2ef734 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -443,6 +443,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ functional/block_cloning/block_cloning_copyfilerange.ksh \ functional/block_cloning/block_cloning_copyfilerange_partial.ksh \ + functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \ functional/block_cloning/block_cloning_disabled_ficlone.ksh \ functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh new file mode 100755 index 0000000000..87f99eb5c0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# Copyright (c) 2023, Rob Norris +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="copy_file_range will fall back to copy when cloning not possible." 
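
The semantics this test relies on can also be seen from a caller's point of view: the kernel change above converts EXDEV into EOPNOTSUPP on pre-5.3 kernels so the VFS performs the content copy itself, while a portable userspace caller typically carries the same fallback explicitly. The sketch below is illustrative only and is not part of the test suite; the helper name and buffer size are arbitrary, and it assumes glibc 2.27+ for the copy_file_range() wrapper.

```
#define _GNU_SOURCE
#include <unistd.h>
#include <errno.h>

/*
 * Illustrative only: copy len bytes from sfd to dfd, preferring
 * copy_file_range(2) and falling back to a plain read/write copy when
 * the kernel reports that it cannot service the request.
 */
static int
copy_with_fallback(int sfd, int dfd, size_t len)
{
	char buf[65536];

	while (len > 0) {
		ssize_t n = copy_file_range(sfd, NULL, dfd, NULL, len, 0);
		if (n > 0) {
			len -= (size_t)n;
			continue;
		}
		if (n == 0)
			return (0);	/* ran out of source data */
		if (errno != EXDEV && errno != EOPNOTSUPP && errno != ENOSYS)
			return (-1);	/* real error, give up */

		/* Fall back to a manual content copy. */
		while (len > 0) {
			size_t want = len < sizeof (buf) ? len : sizeof (buf);
			ssize_t r = read(sfd, buf, want);
			if (r <= 0)
				return (r < 0 ? -1 : 0);
			if (write(dfd, buf, r) != r)
				return (-1);
			len -= (size_t)r;
		}
	}
	return (0);
}
```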
+ +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 +log_must sync_pool $TESTPOOL + + +log_note "Copying entire file with copy_file_range" + +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "1 2 3 4" ] + + +log_note "Copying within a block with copy_file_range" + +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 32768 32768 65536 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "2 3 4" ] + + +log_note "Copying across a block with copy_file_range" + +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 327680 327680 131072 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "2" ] + +log_pass $claim From b5e2456333694f06be26cef2ebb5e7e3412e6c57 Mon Sep 17 00:00:00 2001 From: oromenahar Date: Tue, 1 Aug 2023 17:26:12 +0200 Subject: [PATCH 03/11] Check the return value in clonefile test Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Kay Pedersen Closes #15128 --- tests/zfs-tests/cmd/clonefile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/cmd/clonefile.c b/tests/zfs-tests/cmd/clonefile.c index a7e7277ae4..696dc471d8 100644 --- a/tests/zfs-tests/cmd/clonefile.c +++ b/tests/zfs-tests/cmd/clonefile.c @@ -212,7 +212,7 @@ main(int argc, char **argv) int dfd = open(argv[optind+1], O_WRONLY|O_CREAT, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH); - if (sfd < 0) { + if (dfd < 0) { fprintf(stderr, "open: %s: %s\n", argv[optind+1], strerror(errno)); close(sfd); From b3c1807d776cb8ea3a911b464bb53f7c52444247 Mon Sep 17 00:00:00 2001 From: Zach Dykstra Date: Tue, 1 Aug 2023 11:01:32 -0500 Subject: [PATCH 04/11] readmmap.c: fix building with MUSL libc glibc includes sys/types.h from stdlib.h. This is not the case for MUSL, so explicitly include it. Fixes usage of uint_t. Reviewed-by: Brian Behlendorf Signed-off-by: Zach Dykstra Closes #15130 --- tests/zfs-tests/cmd/readmmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/zfs-tests/cmd/readmmap.c b/tests/zfs-tests/cmd/readmmap.c index 704ffd55c8..a5c8079d0e 100644 --- a/tests/zfs-tests/cmd/readmmap.c +++ b/tests/zfs-tests/cmd/readmmap.c @@ -44,6 +44,7 @@ #include #include #include +#include #include int From bd1eab16ebd0196ece3d0e5f50c8d0eb40b0bba8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Tue, 1 Aug 2023 17:50:17 +0200 Subject: [PATCH 05/11] linux: zfs: ctldir: set [amc]time to snapshot's creation property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If looking up a snapdir inode failed, hold pool config – hold the snapshot – get its creation property – release it – release it, then use that as the [amc]time in the allocated inode. If that fails then fall back to current time. No performance impact since this is only done when allocating a new snapdir inode. 
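
Condensed, the lookup described above amounts to the following (a paraphrase of the zfsctl_inode_lookup() change below rather than a separate function added by this patch; error handling is simplified and the helper name is invented):

```
static uint64_t
snapdir_creation_time(zfsvfs_t *zfsvfs, uint64_t id)
{
	dsl_pool_t *pool = dmu_objset_pool(zfsvfs->z_os);
	dsl_dataset_t *snap_ds;
	uint64_t creation = 0;

	dsl_pool_config_enter(pool, FTAG);
	if (!dsl_dataset_hold_obj(pool, ZFSCTL_INO_SNAPDIRS - id,
	    FTAG, &snap_ds)) {
		creation = dsl_get_creation(snap_ds);
		dsl_dataset_rele(snap_ds, FTAG);
	}
	dsl_pool_config_exit(pool, FTAG);

	/* 0 makes zfsctl_inode_alloc() fall back to current_time(). */
	return (creation);
}
```

Keeping the property read in zfsctl_inode_lookup() means the DSL hold is only taken on the slow path, after ilookup() has already missed.
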
Reviewed-by: Brian Behlendorf Signed-off-by: Ahelenia Ziemiańska Closes #15110 Closes #15117 --- module/os/linux/zfs/zfs_ctldir.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index dca48e1e40..c45a3eb5a4 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -478,17 +478,19 @@ zfsctl_is_snapdir(struct inode *ip) */ static struct inode * zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, - const struct file_operations *fops, const struct inode_operations *ops) + const struct file_operations *fops, const struct inode_operations *ops, + uint64_t creation) { - inode_timespec_t now; struct inode *ip; znode_t *zp; + inode_timespec_t now = {.tv_sec = creation}; ip = new_inode(zfsvfs->z_sb); if (ip == NULL) return (NULL); - now = current_time(ip); + if (!creation) + now = current_time(ip); zp = ITOZ(ip); ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); @@ -552,14 +554,28 @@ zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id, const struct file_operations *fops, const struct inode_operations *ops) { struct inode *ip = NULL; + uint64_t creation = 0; + dsl_dataset_t *snap_ds; + dsl_pool_t *pool; while (ip == NULL) { ip = ilookup(zfsvfs->z_sb, (unsigned long)id); if (ip) break; + if (id <= ZFSCTL_INO_SNAPDIRS && !creation) { + pool = dmu_objset_pool(zfsvfs->z_os); + dsl_pool_config_enter(pool, FTAG); + if (!dsl_dataset_hold_obj(pool, + ZFSCTL_INO_SNAPDIRS - id, FTAG, &snap_ds)) { + creation = dsl_get_creation(snap_ds); + dsl_dataset_rele(snap_ds, FTAG); + } + dsl_pool_config_exit(pool, FTAG); + } + /* May fail due to concurrent zfsctl_inode_alloc() */ - ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops); + ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops, creation); } return (ip); @@ -581,7 +597,7 @@ zfsctl_create(zfsvfs_t *zfsvfs) ASSERT(zfsvfs->z_ctldir == NULL); zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT, - &zpl_fops_root, &zpl_ops_root); + &zpl_fops_root, &zpl_ops_root, 0); if (zfsvfs->z_ctldir == NULL) return (SET_ERROR(ENOENT)); From 0ae7bfc0a42455cc8a8c7e6a95b2c3cfe1fd795d Mon Sep 17 00:00:00 2001 From: Serapheim Dimitropoulos Date: Tue, 1 Aug 2023 14:47:00 -0700 Subject: [PATCH 06/11] zpool_vdev_remove() should handle EALREADY error return When the vdev properties features was merged an extra check was added in `spa_vdev_remove_top_check()` which checked whether the vdev that we want to remove is already being removed and if so return an EALREADY error. ``` static int spa_vdev_remove_top_check(vdev_t *vd) { ... ... /* * This device is already being removed */ if (vd->vdev_removing) return (SET_ERROR(EALREADY)); ``` Before that change we'd still fail with an error but it was a more generic one - here is the check that failed later in the same function: ``` /* * There can not be a removal in progress. */ if (spa->spa_removing_phys.sr_state == DSS_SCANNING) return (SET_ERROR(EBUSY)); ``` Changing the error code returned from that function changed the behavior of the removal's library interface exposed to the userland - `spa_vdev_remove()` now returns `EZFS_UNKNOWN` instead of `EZFS_EBUSY` that was returning before. This patch adds logic to make `spa_vdev_remove()` mindful of the new EALREADY code and propagating `EZFS_EBUSY` reverting to the previously established semantics of that function. 
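
Seen from a libzfs consumer, the restored behaviour looks roughly like the following fragment (a hypothetical caller, shown only to illustrate the error mapping; `zhp`, `hdl` and `path` are assumed to be set up elsewhere):

```
	if (zpool_vdev_remove(zhp, path) != 0) {
		/*
		 * A removal that is already in progress surfaces as
		 * EZFS_BUSY again (it had become EZFS_UNKNOWN once the
		 * kernel started returning EALREADY).
		 */
		if (libzfs_errno(hdl) == EZFS_BUSY)
			(void) fprintf(stderr,
			    "%s: removal already in progress\n", path);
		else
			(void) fprintf(stderr, "%s: %s\n", path,
			    libzfs_error_description(hdl));
	}
```

This matches the pre-vdev-properties behaviour, where the same condition was reported via the later DSS_SCANNING check as EBUSY.
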
Reviewed-by: Mark Maybee Reviewed-by: Matthew Ahrens Signed-off-by: Serapheim Dimitropoulos Closes #15013 Closes #15129 --- lib/libzfs/libzfs_pool.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index d4af31c50c..85564edfd8 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3926,6 +3926,12 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) switch (errno) { + case EALREADY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "removal for this vdev is already in progress.")); + (void) zfs_error(hdl, EZFS_BUSY, errbuf); + break; + case EINVAL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid config; all top-level vdevs must " From 02ce9030e6ecdb6491096e1d35902da8bcc80fce Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 27 Jul 2023 12:07:09 -0400 Subject: [PATCH 07/11] Avoid waiting in dmu_sync_late_arrival(). The transaction there does not produce any dirty data or log blocks, so it should not be throttled. All other cases wait for TXG sync, by which time the log block we are writing will be obsolete, so we can skip waiting and just return error here instead. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15096 --- module/zfs/dmu.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 3a4560cec2..078811dbf4 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1656,7 +1656,13 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); - if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + /* + * This transaction does not produce any dirty data or log blocks, so + * it should not be throttled. All other cases wait for TXG sync, by + * which time the log block we are writing will be obsolete, so we can + * skip waiting and just return error here instead. + */ + if (dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); From ffaedf0a44bd71931d054217a31284d04fd751a0 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 28 Jul 2023 16:30:33 -0400 Subject: [PATCH 08/11] Remove fastwrite mechanism. Fastwrite was introduced many years ago to improve ZIL writes spread between multiple top-level vdevs by tracking number of allocated but not written blocks and choosing vdev with smaller count. It suposed to reduce ZIL knowledge about allocation, but actually made ZIL to even more actively report allocation code about the allocations, complicating both ZIL and metaslabs code. On top of that, it seems ZIO_FLAG_FASTWRITE setting in dmu_sync() was lost many years ago, that was one of the declared benefits. Plus introduction of embedded log metaslab class solved another problem with allocation rotor accounting both normal and log allocations, since in most cases those are now in different metaslab classes. After all that, I'd prefer to simplify already too complicated ZIL, ZIO and metaslab code if the benefit of complexity is not obvious. Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15107 --- include/sys/metaslab.h | 3 -- include/sys/vdev_impl.h | 1 - include/sys/zil_impl.h | 1 - include/sys/zio.h | 1 - module/zfs/metaslab.c | 67 ++--------------------------------------- module/zfs/vdev.c | 2 -- module/zfs/zil.c | 42 +++----------------------- module/zfs/zio.c | 14 +-------- 8 files changed, 8 insertions(+), 123 deletions(-) diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index fec080139a..0df6e5f81f 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -80,7 +80,6 @@ uint64_t metaslab_largest_allocatable(metaslab_t *); #define METASLAB_ASYNC_ALLOC 0x8 #define METASLAB_DONT_THROTTLE 0x10 #define METASLAB_MUST_RESERVE 0x20 -#define METASLAB_FASTWRITE 0x40 #define METASLAB_ZIL 0x80 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, @@ -96,8 +95,6 @@ void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); -void metaslab_fastwrite_mark(spa_t *, const blkptr_t *); -void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *); void metaslab_stat_init(void); void metaslab_stat_fini(void); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 5f4e82ad86..ad9dc3aefd 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -266,7 +266,6 @@ struct vdev { metaslab_group_t *vdev_mg; /* metaslab group */ metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ - uint64_t vdev_pending_fastwrite; /* allocated fastwrites */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index 03a409c525..b58dad9695 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -91,7 +91,6 @@ typedef enum { typedef struct lwb { zilog_t *lwb_zilog; /* back pointer to log struct */ blkptr_t lwb_blk; /* on disk address of this log blk */ - boolean_t lwb_fastwrite; /* is blk marked for fastwrite? 
*/ boolean_t lwb_slog; /* lwb_blk is on SLOG device */ boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */ int lwb_nused; /* # used bytes in buffer */ diff --git a/include/sys/zio.h b/include/sys/zio.h index f4da80783e..e1f4d5c044 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -222,7 +222,6 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_NOPWRITE (1ULL << 28) #define ZIO_FLAG_REEXECUTED (1ULL << 29) #define ZIO_FLAG_DELEGATED (1ULL << 30) -#define ZIO_FLAG_FASTWRITE (1ULL << 31) #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 9991e1a22c..8393e8dd91 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -5101,7 +5101,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, zio_alloc_list_t *zal, int allocator) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; - metaslab_group_t *mg, *fast_mg, *rotor; + metaslab_group_t *mg, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; @@ -5164,15 +5164,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; - } else if (flags & METASLAB_FASTWRITE) { - mg = fast_mg = mca->mca_rotor; - - do { - if (fast_mg->mg_vd->vdev_pending_fastwrite < - mg->mg_vd->vdev_pending_fastwrite) - mg = fast_mg; - } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor); - } else { ASSERT(mca->mca_rotor != NULL); mg = mca->mca_rotor; @@ -5297,7 +5288,7 @@ top: mg->mg_bias = 0; } - if ((flags & METASLAB_FASTWRITE) || + if ((flags & METASLAB_ZIL) || atomic_add_64_nv(&mca->mca_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mca->mca_rotor = mg->mg_next; @@ -5310,11 +5301,6 @@ top: ((flags & METASLAB_GANG_HEADER) ? 
1 : 0)); DVA_SET_ASIZE(&dva[d], asize); - if (flags & METASLAB_FASTWRITE) { - atomic_add_64(&vd->vdev_pending_fastwrite, - psize); - } - return (0); } next: @@ -5950,55 +5936,6 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) return (error); } -void -metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - uint64_t psize = BP_GET_PSIZE(bp); - int d; - vdev_t *vd; - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(psize > 0); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - for (d = 0; d < ndvas; d++) { - if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) - continue; - atomic_add_64(&vd->vdev_pending_fastwrite, psize); - } - - spa_config_exit(spa, SCL_VDEV, FTAG); -} - -void -metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - uint64_t psize = BP_GET_PSIZE(bp); - int d; - vdev_t *vd; - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(psize > 0); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - for (d = 0; d < ndvas; d++) { - if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) - continue; - ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); - atomic_sub_64(&vd->vdev_pending_fastwrite, psize); - } - - spa_config_exit(spa, SCL_VDEV, FTAG); -} - static void metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index f3812b843e..87c1455932 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1192,7 +1192,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) ASSERT(tvd == tvd->vdev_top); - tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; @@ -1655,7 +1654,6 @@ vdev_metaslab_fini(vdev_t *vd) } } ASSERT0(vd->vdev_ms_count); - ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } typedef struct vdev_probe_stats { diff --git a/module/zfs/zil.c b/module/zfs/zil.c index be5b9edf6e..6f04a7d4a7 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -761,15 +761,13 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) } static lwb_t * -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg, - boolean_t fastwrite) +zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; lwb->lwb_blk = *bp; - lwb->lwb_fastwrite = fastwrite; lwb->lwb_slog = slog; lwb->lwb_indirect = B_FALSE; if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { @@ -916,7 +914,6 @@ zil_create(zilog_t *zilog) dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; - boolean_t fastwrite = FALSE; boolean_t slog = FALSE; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); @@ -949,8 +946,6 @@ zil_create(zilog_t *zilog) error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, ZIL_MIN_BLKSZ, &slog); - fastwrite = TRUE; - if (error == 0) zil_init_log_chain(zilog, &blk); } @@ -959,7 +954,7 @@ zil_create(zilog_t *zilog) * Allocate a log write block (lwb) for the first log block. 
*/ if (error == 0) - lwb = zil_alloc_lwb(zilog, &blk, slog, txg, fastwrite); + lwb = zil_alloc_lwb(zilog, &blk, slog, txg); /* * If we just allocated the first log block, commit our transaction @@ -1044,9 +1039,6 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { - if (lwb->lwb_fastwrite) - metaslab_fastwrite_unmark(zilog->zl_spa, - &lwb->lwb_blk); if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); @@ -1551,7 +1543,6 @@ zil_lwb_write_done(zio_t *zio) ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); lwb->lwb_state = LWB_STATE_WRITE_DONE; lwb->lwb_write_zio = NULL; - lwb->lwb_fastwrite = FALSE; nlwb = list_next(&zilog->zl_lwb_list, lwb); mutex_exit(&zilog->zl_lock); @@ -1718,20 +1709,12 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); - /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ - mutex_enter(&zilog->zl_lock); - if (!lwb->lwb_fastwrite) { - metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 1; - } - lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), - zil_lwb_write_done, lwb, prio, - ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); + zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL, &zb); + mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_OPENED; - zil_lwb_set_zio_dependency(zilog, lwb); zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); @@ -1864,7 +1847,7 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs) /* * Allocate a new log write block (lwb). */ - nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE); + nlwb = zil_alloc_lwb(zilog, bp, slog, txg); } lwb->lwb_state = LWB_STATE_ISSUED; @@ -3651,18 +3634,6 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) BP_ZERO(&zh->zh_log); } - /* - * Remove fastwrite on any blocks that have been pre-allocated for - * the next commit. This prevents fastwrite counter pollution by - * unused, long-lived LWBs. - */ - for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) { - if (lwb->lwb_fastwrite && !lwb->lwb_write_zio) { - metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 0; - } - } - mutex_exit(&zilog->zl_lock); } @@ -3895,9 +3866,6 @@ zil_close(zilog_t *zilog) ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - if (lwb->lwb_fastwrite) - metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 3f5e6a08d8..b562710990 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3024,11 +3024,6 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - /* - * We didn't allocate this bp, so make sure it doesn't get unmarked. - */ - pio->io_flags &= ~ZIO_FLAG_FASTWRITE; - zio_nowait(zio); return (pio); @@ -3616,7 +3611,6 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? 
METASLAB_FASTWRITE : 0; if (zio->io_flags & ZIO_FLAG_NODATA) flags |= METASLAB_DONT_THROTTLE; if (zio->io_flags & ZIO_FLAG_GANG_CHILD) @@ -3776,7 +3770,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * of, so we just hash the objset ID to pick the allocator to get * some parallelism. */ - int flags = METASLAB_FASTWRITE | METASLAB_ZIL; + int flags = METASLAB_ZIL; int allocator = (uint_t)cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, @@ -4931,12 +4925,6 @@ zio_done(zio_t *zio) zfs_ereport_free_checksum(zcr); } - if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && - !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) && - !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { - metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); - } - /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as From c1801cbe59dfc99ba7aa34f09c0b6b8f35bb2d8f Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 11 Aug 2023 12:04:08 -0400 Subject: [PATCH 09/11] ZIL: Avoid dbuf_read() before dmu_sync(). In most cases dmu_sync() works with dirty records directly and does not need actual data. The only exception is dmu_sync_late_arrival(). To save some CPU time use dmu_buf_hold_noread*() in z*_get_data() and explicitly call dbuf_read() in dmu_sync_late_arrival(). There is also a chance that by that time TXG will already be synced and we won't have to do it at all. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15153 --- include/sys/dmu.h | 4 ++++ include/sys/dmu_impl.h | 2 -- module/zfs/dmu.c | 9 ++++++++- module/zfs/zfs_vnops.c | 4 ++-- module/zfs/zvol.c | 4 ++-- 5 files changed, 16 insertions(+), 7 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 7e57d133c2..615ba8fe74 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -572,11 +572,15 @@ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp); +int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, + const void *tag, dmu_buf_t **dbp); int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags); int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags); +int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, + dmu_buf_t **dbp); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. 
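
Taken together with the dmu.c, zfs_vnops.c and zvol.c hunks below, these declarations change the indirect-write path to the pattern sketched here (`need_data` is a stand-in for "we ended up in dmu_sync_late_arrival()", where the contents are actually required):

```
	/* Hold the dbuf without forcing its contents in ... */
	error = dmu_buf_hold_noread(os, object, offset, zgd, &db);

	/* ... and read it only where the data is actually needed. */
	if (error == 0 && need_data)
		error = dbuf_read((dmu_buf_impl_t *)db, NULL,
		    DB_RF_CANFAIL | DB_RF_NOPREFETCH);
```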
diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index ce6ae3c665..83ae2b76ba 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -247,8 +247,6 @@ typedef struct dmu_sendstatus { void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); -int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t, - const void *, dmu_buf_t **); #ifdef __cplusplus } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 078811dbf4..ddb29020b0 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -165,7 +165,7 @@ dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { zfs_acl_byteswap, "acl" } }; -static int +int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp) { @@ -185,6 +185,7 @@ dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, *dbp = &db->db; return (0); } + int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp) @@ -1653,6 +1654,12 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, { dmu_sync_arg_t *dsa; dmu_tx_t *tx; + int error; + + error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL, + DB_RF_CANFAIL | DB_RF_NOPREFETCH); + if (error != 0) + return (error); tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 54ea43363b..07c177f3bc 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -917,8 +917,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, } #endif if (error == 0) - error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); + error = dmu_buf_hold_noread(os, object, offset, zgd, + &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index cd4e6f0c75..f44d1b7b55 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -727,8 +727,8 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, offset = P2ALIGN_TYPED(offset, size, uint64_t); zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); - error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, - DMU_READ_NO_PREFETCH); + error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd, + &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; From bb31ded68b269fb222bfbc9e9bf56d236ca908f4 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 11 Aug 2023 12:04:44 -0400 Subject: [PATCH 10/11] ZIL: Replay blocks without next block pointer. If we get next block allocation error during log write, we trigger transaction commit. But the block we have just completed is still written and transactions it covers will be acknowledged normally. If after that we ignore the block during replay just because it is the last in the chain, we may not replay some transactions that we have acknowledged as synced, that is not right. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15132 --- module/zfs/zil.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 6f04a7d4a7..567787a19b 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -290,7 +290,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, char *lr = (char *)(zilc + 1); if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + sizeof (cksum)) || zilc->zc_nused < sizeof (*zilc) || zilc->zc_nused > size) { error = SET_ERROR(ECKSUM); @@ -304,7 +304,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + sizeof (cksum)) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { From df8c9f351dab56b056c4ac4d4c763c11fb30d35f Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 24 Aug 2023 20:08:49 -0400 Subject: [PATCH 11/11] ZIL: Second attempt to reduce scope of zl_issuer_lock. The previous patch #14841 appeared to have significant flaw, causing deadlocks if zl_get_data callback got blocked waiting for TXG sync. I already handled some of such cases in the original patch, but issue #14982 shown cases that were impossible to solve in that design. This patch fixes the problem by postponing log blocks allocation till the very end, just before the zios issue, leaving nothing blocking after that point to cause deadlocks. Before that point though any sleeps are now allowed, not causing sync thread blockage. This require slightly more complicated lwb state machine to allocate blocks and issue zios in proper order. But with removal of special early issue workarounds the new code is much cleaner now, and should even be more efficient. Since this patch uses null zios between write, I've found that null zios do not wait for logical children ready status in zio_ready(), that makes parent write to proceed prematurely, producing incorrect log blocks. Added ZIO_CHILD_LOGICAL_BIT to zio_wait_for_children() fixes it. Reviewed-by: Rob Norris Reviewed-by: Mark Maybee Reviewed-by: George Wilson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15122 --- cmd/ztest.c | 2 +- include/sys/zil_impl.h | 43 ++- module/zfs/zfs_vnops.c | 2 +- module/zfs/zil.c | 697 +++++++++++++++++++---------------------- module/zfs/zio.c | 4 +- module/zfs/zvol.c | 2 +- 6 files changed, 362 insertions(+), 388 deletions(-) diff --git a/cmd/ztest.c b/cmd/ztest.c index b6b99bfff6..398c519cfc 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -2412,7 +2412,6 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, int error; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); ztest_object_lock(zd, object, RL_READER); @@ -2446,6 +2445,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, DMU_READ_NO_PREFETCH); ASSERT0(error); } else { + ASSERT3P(zio, !=, NULL); size = doi.doi_data_block_size; if (ISP2(size)) { offset = P2ALIGN(offset, size); diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index b58dad9695..f780ad3d61 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -38,14 +38,22 @@ extern "C" { /* * Possible states for a given lwb structure. * - * An lwb will start out in the "closed" state, and then transition to - * the "opened" state via a call to zil_lwb_write_open(). 
When - * transitioning from "closed" to "opened" the zilog's "zl_issuer_lock" - * must be held. + * An lwb will start out in the "new" state, and transition to the "opened" + * state via a call to zil_lwb_write_open() on first itx assignment. When + * transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be + * held. * - * After the lwb is "opened", it can transition into the "issued" state - * via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must - * be held when making this transition. + * After the lwb is "opened", it can be assigned number of itxs and transition + * into the "closed" state via zil_lwb_write_close() when full or on timeout. + * When transitioning from "opened" to "closed" the zilog's "zl_issuer_lock" + * must be held. New lwb allocation also takes "zl_lock" to protect the list. + * + * After the lwb is "closed", it can transition into the "ready" state via + * zil_lwb_write_issue(). "zl_lock" must be held when making this transition. + * Since it is done by the same thread, "zl_issuer_lock" is not needed. + * + * When lwb in "ready" state receives its block pointer, it can transition to + * "issued". "zl_lock" must be held when making this transition. * * After the lwb's write zio completes, it transitions into the "write * done" state via zil_lwb_write_done(); and then into the "flush done" @@ -62,17 +70,20 @@ extern "C" { * * Additionally, correctness when reading an lwb's state is often * achieved by exploiting the fact that these state transitions occur in - * this specific order; i.e. "closed" to "opened" to "issued" to "done". + * this specific order; i.e. "new" to "opened" to "closed" to "ready" to + * "issued" to "write_done" and finally "flush_done". * - * Thus, if an lwb is in the "closed" or "opened" state, holding the + * Thus, if an lwb is in the "new" or "opened" state, holding the * "zl_issuer_lock" will prevent a concurrent thread from transitioning - * that lwb to the "issued" state. Likewise, if an lwb is already in the - * "issued" state, holding the "zl_lock" will prevent a concurrent - * thread from transitioning that lwb to the "write done" state. + * that lwb to the "closed" state. Likewise, if an lwb is already in the + * "ready" state, holding the "zl_lock" will prevent a concurrent thread + * from transitioning that lwb to the "issued" state. */ typedef enum { - LWB_STATE_CLOSED, + LWB_STATE_NEW, LWB_STATE_OPENED, + LWB_STATE_CLOSED, + LWB_STATE_READY, LWB_STATE_ISSUED, LWB_STATE_WRITE_DONE, LWB_STATE_FLUSH_DONE, @@ -91,17 +102,21 @@ typedef enum { typedef struct lwb { zilog_t *lwb_zilog; /* back pointer to log struct */ blkptr_t lwb_blk; /* on disk address of this log blk */ + boolean_t lwb_slim; /* log block has slim format */ boolean_t lwb_slog; /* lwb_blk is on SLOG device */ - boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */ + int lwb_error; /* log block allocation error */ + int lwb_nmax; /* max bytes in the buffer */ int lwb_nused; /* # used bytes in buffer */ int lwb_nfilled; /* # filled bytes in buffer */ int lwb_sz; /* size of block and buffer */ lwb_state_t lwb_state; /* the state of this lwb */ char *lwb_buf; /* log write buffer */ + zio_t *lwb_child_zio; /* parent zio for children */ zio_t *lwb_write_zio; /* zio for the lwb buffer */ zio_t *lwb_root_zio; /* root zio for lwb write and flushes */ hrtime_t lwb_issued_timestamp; /* when was the lwb issued? 
*/ uint64_t lwb_issued_txg; /* the txg when the write is issued */ + uint64_t lwb_alloc_txg; /* the txg when lwb_blk is allocated */ uint64_t lwb_max_txg; /* highest txg in this lwb */ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */ diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 07c177f3bc..8ffcf37134 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -839,7 +839,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, uint64_t zp_gen; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* @@ -889,6 +888,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ + ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 567787a19b..f2d279e36a 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -151,7 +151,6 @@ static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); -static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb); static itx_t *zil_itx_clone(itx_t *oitx); static int @@ -760,33 +759,52 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) return (TREE_CMP(v1, v2)); } +/* + * Allocate a new lwb. We may already have a block pointer for it, in which + * case we get size and version from there. Or we may not yet, in which case + * we choose them here and later make the block allocation match. + */ static lwb_t * -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg) +zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, + uint64_t txg, lwb_state_t state) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; - lwb->lwb_blk = *bp; - lwb->lwb_slog = slog; - lwb->lwb_indirect = B_FALSE; - if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); - lwb->lwb_sz = BP_GET_LSIZE(bp); + if (bp) { + lwb->lwb_blk = *bp; + lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2); + sz = BP_GET_LSIZE(bp); } else { - lwb->lwb_nused = lwb->lwb_nfilled = 0; - lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); + BP_ZERO(&lwb->lwb_blk); + lwb->lwb_slim = (spa_version(zilog->zl_spa) >= + SPA_VERSION_SLIM_ZIL); } - lwb->lwb_state = LWB_STATE_CLOSED; - lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); + lwb->lwb_slog = slog; + lwb->lwb_error = 0; + if (lwb->lwb_slim) { + lwb->lwb_nmax = sz; + lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); + } else { + lwb->lwb_nmax = sz - sizeof (zil_chain_t); + lwb->lwb_nused = lwb->lwb_nfilled = 0; + } + lwb->lwb_sz = sz; + lwb->lwb_state = state; + lwb->lwb_buf = zio_buf_alloc(sz); + lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; lwb->lwb_issued_timestamp = 0; lwb->lwb_issued_txg = 0; - lwb->lwb_max_txg = txg; + lwb->lwb_alloc_txg = txg; + lwb->lwb_max_txg = 0; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); + if (state != LWB_STATE_NEW) + zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); return (lwb); @@ -800,10 +818,12 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb) VERIFY(list_is_empty(&lwb->lwb_waiters)); VERIFY(list_is_empty(&lwb->lwb_itxs)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT3P(lwb->lwb_child_zio, ==, NULL); 
ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); + ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa)); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); - ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || + ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_FLUSH_DONE); /* @@ -954,7 +974,7 @@ zil_create(zilog_t *zilog) * Allocate a log write block (lwb) for the first log block. */ if (error == 0) - lwb = zil_alloc_lwb(zilog, &blk, slog, txg); + lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW); /* * If we just allocated the first log block, commit our transaction @@ -1041,7 +1061,8 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); + if (!BP_IS_HOLE(&lwb->lwb_blk)) + zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); } } else if (!keep_first) { @@ -1269,21 +1290,21 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { /* * The lwb_waiters field of the lwb is protected by the zilog's - * zl_lock, thus it must be held when calling this function. + * zl_issuer_lock while the lwb is open and zl_lock otherwise. + * zl_issuer_lock also protects leaving the open state. + * zcw_lwb setting is protected by zl_issuer_lock and state != + * flush_done, which transition is protected by zl_lock. */ - ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock)); + IMPLY(lwb->lwb_state != LWB_STATE_OPENED, + MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); - ASSERT3P(lwb, !=, NULL); - ASSERT(lwb->lwb_state == LWB_STATE_OPENED || - lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE); - list_insert_tail(&lwb->lwb_waiters, zcw); + ASSERT3P(zcw->zcw_lwb, ==, NULL); zcw->zcw_lwb = lwb; - mutex_exit(&zcw->zcw_lock); } /* @@ -1294,11 +1315,9 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) static void zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) { - mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); list_insert_tail(nolwb, zcw); - mutex_exit(&zcw->zcw_lock); + ASSERT3P(zcw->zcw_lwb, ==, NULL); } void @@ -1484,7 +1503,7 @@ zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg) mutex_enter(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); lwb_t *lwb = list_head(&zilog->zl_lwb_list); - while (lwb != NULL && lwb->lwb_max_txg <= txg) { + while (lwb != NULL) { if (lwb->lwb_issued_txg <= txg) { ASSERT(lwb->lwb_state != LWB_STATE_ISSUED); ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE); @@ -1527,14 +1546,6 @@ zil_lwb_write_done(zio_t *zio) ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); - ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); - ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); - ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); - ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); - ASSERT(!BP_IS_GANG(zio->io_bp)); - ASSERT(!BP_IS_HOLE(zio->io_bp)); - ASSERT(BP_GET_FILL(zio->io_bp) == 0); - abd_free(zio->io_abd); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); lwb->lwb_buf = NULL; @@ -1542,6 +1553,7 @@ zil_lwb_write_done(zio_t *zio) 
mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); lwb->lwb_state = LWB_STATE_WRITE_DONE; + lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; nlwb = list_next(&zilog->zl_lwb_list, lwb); mutex_exit(&zilog->zl_lock); @@ -1606,116 +1618,75 @@ zil_lwb_write_done(zio_t *zio) } } +/* + * Build the zio dependency chain, which is used to preserve the ordering of + * lwb completions that is required by the semantics of the ZIL. Each new lwb + * zio becomes a parent of the previous lwb zio, such that the new lwb's zio + * cannot complete until the previous lwb's zio completes. + * + * This is required by the semantics of zil_commit(): the commit waiters + * attached to the lwbs will be woken in the lwb zio's completion callback, + * so this zio dependency graph ensures the waiters are woken in the correct + * order (the same order the lwbs were created). + */ static void zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) { - lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zilog->zl_lock)); + lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb); + if (prev_lwb == NULL || + prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE) + return; + /* - * The zilog's "zl_last_lwb_opened" field is used to build the - * lwb/zio dependency chain, which is used to preserve the - * ordering of lwb completions that is required by the semantics - * of the ZIL. Each new lwb zio becomes a parent of the - * "previous" lwb zio, such that the new lwb's zio cannot - * complete until the "previous" lwb's zio completes. + * If the previous lwb's write hasn't already completed, we also want + * to order the completion of the lwb write zios (above, we only order + * the completion of the lwb root zios). This is required because of + * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb. * - * This is required by the semantics of zil_commit(); the commit - * waiters attached to the lwbs will be woken in the lwb zio's - * completion callback, so this zio dependency graph ensures the - * waiters are woken in the correct order (the same order the - * lwbs were created). + * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous + * lwb will rely on this lwb to flush the vdevs written to by that + * previous lwb. Thus, we need to ensure this lwb doesn't issue the + * flush until after the previous lwb's write completes. We ensure + * this ordering by setting the zio parent/child relationship here. + * + * Without this relationship on the lwb's write zio, it's possible + * for this lwb's write to complete prior to the previous lwb's write + * completing; and thus, the vdevs for the previous lwb would be + * flushed prior to that lwb's data being written to those vdevs (the + * vdevs are flushed in the lwb write zio's completion handler, + * zil_lwb_write_done()). */ - if (last_lwb_opened != NULL && - last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED || - last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE); - - ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL); - zio_add_child(lwb->lwb_root_zio, - last_lwb_opened->lwb_root_zio); - - /* - * If the previous lwb's write hasn't already completed, - * we also want to order the completion of the lwb write - * zios (above, we only order the completion of the lwb - * root zios). 
This is required because of how we can - * defer the DKIOCFLUSHWRITECACHE commands for each lwb. - * - * When the DKIOCFLUSHWRITECACHE commands are deferred, - * the previous lwb will rely on this lwb to flush the - * vdevs written to by that previous lwb. Thus, we need - * to ensure this lwb doesn't issue the flush until - * after the previous lwb's write completes. We ensure - * this ordering by setting the zio parent/child - * relationship here. - * - * Without this relationship on the lwb's write zio, - * it's possible for this lwb's write to complete prior - * to the previous lwb's write completing; and thus, the - * vdevs for the previous lwb would be flushed prior to - * that lwb's data being written to those vdevs (the - * vdevs are flushed in the lwb write zio's completion - * handler, zil_lwb_write_done()). - */ - if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED); - - ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL); - zio_add_child(lwb->lwb_write_zio, - last_lwb_opened->lwb_write_zio); - } + if (prev_lwb->lwb_state == LWB_STATE_ISSUED) { + ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL); + zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio); + } else { + ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); } + + ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL); + zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio); } /* * This function's purpose is to "open" an lwb such that it is ready to - * accept new itxs being committed to it. To do this, the lwb's zio - * structures are created, and linked to the lwb. This function is - * idempotent; if the passed in lwb has already been opened, this - * function is essentially a no-op. + * accept new itxs being committed to it. This function is idempotent; if + * the passed in lwb has already been opened, it is essentially a no-op. */ static void zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) { - zbookmark_phys_t zb; - zio_priority_t prio; - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb, !=, NULL); - EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); - EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); - if (lwb->lwb_root_zio != NULL) + if (lwb->lwb_state != LWB_STATE_NEW) { + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); return; - - lwb->lwb_root_zio = zio_root(zilog->zl_spa, - zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); - - abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, - BP_GET_LSIZE(&lwb->lwb_blk)); - - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) - prio = ZIO_PRIORITY_SYNC_WRITE; - else - prio = ZIO_PRIORITY_ASYNC_WRITE; - - SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], - ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, - lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); - - lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0, - &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), - zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL, &zb); + } mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_OPENED; - zil_lwb_set_zio_dependency(zilog, lwb); zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); } @@ -1752,57 +1723,21 @@ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; * Has to be called under zl_issuer_lock to chain more lwbs. 
*/ static lwb_t * -zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs) +zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) { - lwb_t *nlwb = NULL; - zil_chain_t *zilc; - spa_t *spa = zilog->zl_spa; - blkptr_t *bp; - dmu_tx_t *tx; - uint64_t txg; - uint64_t zil_blksz; - int i, error; - boolean_t slog; + int i; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + lwb->lwb_state = LWB_STATE_CLOSED; /* - * If this lwb includes indirect writes, we have to commit before - * creating the transaction, otherwise we may end up in dead lock. + * If there was an allocation failure then returned NULL will trigger + * zil_commit_writer_stall() at the caller. This is inherently racy, + * since allocation may not have happened yet. */ - if (lwb->lwb_indirect) { - for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; - itx = list_next(&lwb->lwb_itxs, itx)) - zil_lwb_commit(zilog, lwb, itx); - lwb->lwb_nused = lwb->lwb_nfilled; - } - - /* - * Allocate the next block and save its address in this block - * before writing it in order to establish the log chain. - */ - - tx = dmu_tx_create(zilog->zl_os); - - /* - * Since we are not going to create any new dirty data, and we - * can even help with clearing the existing dirty data, we - * should not be subject to the dirty data based delays. We - * use TXG_NOTHROTTLE to bypass the delay mechanism. - */ - VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); - - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - - mutex_enter(&zilog->zl_lwb_io_lock); - lwb->lwb_issued_txg = txg; - zilog->zl_lwb_inflight[txg & TXG_MASK]++; - zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg); - mutex_exit(&zilog->zl_lwb_io_lock); + if (lwb->lwb_error != 0) + return (NULL); /* * Log blocks are pre-allocated. Here we select the size of the next @@ -1820,7 +1755,7 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs) * the maximum block size because we can exhaust the available * pool log space. */ - zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); + uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) continue; zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); @@ -1832,94 +1767,149 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs) uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) - zilc = (zil_chain_t *)lwb->lwb_buf; - else - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - bp = &zilc->zc_next_blk; - BP_ZERO(bp); - error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog); - if (error == 0) { - ASSERT3U(bp->blk_birth, ==, txg); - bp->blk_cksum = lwb->lwb_blk.blk_cksum; - bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; - - /* - * Allocate a new log write block (lwb). - */ - nlwb = zil_alloc_lwb(zilog, bp, slog, txg); - } - - lwb->lwb_state = LWB_STATE_ISSUED; - - dmu_tx_commit(tx); - - /* - * We need to acquire the config lock for the lwb to issue it later. - * However, if we already have a queue of closed parent lwbs already - * holding the config lock (but not yet issued), we can't block here - * waiting on the lock or we will deadlock. In that case we must - * first issue to parent IOs before waiting on the lock. 
- */ - if (ilwbs && !list_is_empty(ilwbs)) { - if (!spa_config_tryenter(spa, SCL_STATE, lwb, RW_READER)) { - lwb_t *tlwb; - while ((tlwb = list_remove_head(ilwbs)) != NULL) - zil_lwb_write_issue(zilog, tlwb); - spa_config_enter(spa, SCL_STATE, lwb, RW_READER); - } - } else { - spa_config_enter(spa, SCL_STATE, lwb, RW_READER); - } - - if (ilwbs) - list_insert_tail(ilwbs, lwb); - - /* - * If there was an allocation failure then nlwb will be null which - * forces a txg_wait_synced(). - */ - return (nlwb); + return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state)); } /* * Finalize previously closed block and issue the write zio. - * Does not require locking. */ static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) { + spa_t *spa = zilog->zl_spa; zil_chain_t *zilc; - int wsz; + boolean_t slog; + zbookmark_phys_t zb; + zio_priority_t prio; + int error; - /* Actually fill the lwb with the data if not yet. */ - if (!lwb->lwb_indirect) { - for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; - itx = list_next(&lwb->lwb_itxs, itx)) - zil_lwb_commit(zilog, lwb, itx); - lwb->lwb_nused = lwb->lwb_nfilled; - } + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - /* For Slim ZIL only write what is used. */ - wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); - ASSERT3S(wsz, <=, lwb->lwb_sz); - zio_shrink(lwb->lwb_write_zio, wsz); - wsz = lwb->lwb_write_zio->io_size; + /* Actually fill the lwb with the data. */ + for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; + itx = list_next(&lwb->lwb_itxs, itx)) + zil_lwb_commit(zilog, lwb, itx); + lwb->lwb_nused = lwb->lwb_nfilled; - zilc = (zil_chain_t *)lwb->lwb_buf; - } else { - wsz = lwb->lwb_sz; - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - } - zilc->zc_pad = 0; - zilc->zc_nused = lwb->lwb_nused; - zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; + lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb, + ZIO_FLAG_CANFAIL); /* - * clear unused data for security + * The lwb is now ready to be issued, but it can be only if it already + * got its block pointer allocated or the allocation has failed. + * Otherwise leave it as-is, relying on some other thread to issue it + * after allocating its block pointer via calling zil_lwb_write_issue() + * for the previous lwb(s) in the chain. */ - memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); + mutex_enter(&zilog->zl_lock); + lwb->lwb_state = LWB_STATE_READY; + if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) { + mutex_exit(&zilog->zl_lock); + return; + } + mutex_exit(&zilog->zl_lock); + +next_lwb: + if (lwb->lwb_slim) + zilc = (zil_chain_t *)lwb->lwb_buf; + else + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax); + int wsz = lwb->lwb_sz; + if (lwb->lwb_error == 0) { + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); + if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + prio = ZIO_PRIORITY_SYNC_WRITE; + else + prio = ZIO_PRIORITY_ASYNC_WRITE; + SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, + lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); + lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, + &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, + lwb, prio, ZIO_FLAG_CANFAIL, &zb); + zil_lwb_add_block(lwb, &lwb->lwb_blk); + + if (lwb->lwb_slim) { + /* For Slim ZIL only write what is used. 
*/
+			wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ,
+			    int);
+			ASSERT3S(wsz, <=, lwb->lwb_sz);
+			zio_shrink(lwb->lwb_write_zio, wsz);
+			wsz = lwb->lwb_write_zio->io_size;
+		}
+		memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
+		zilc->zc_pad = 0;
+		zilc->zc_nused = lwb->lwb_nused;
+		zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
+	} else {
+		/*
+		 * We can't write the lwb if there was an allocation failure,
+		 * so create a null zio instead just to maintain dependencies.
+		 */
+		lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL,
+		    zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL);
+		lwb->lwb_write_zio->io_error = lwb->lwb_error;
+	}
+	if (lwb->lwb_child_zio)
+		zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio);
+
+	/*
+	 * Open transaction to allocate the next block pointer.
+	 */
+	dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
+	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
+	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+	uint64_t txg = dmu_tx_get_txg(tx);
+
+	/*
+	 * Allocate the next block pointer unless we are already in error.
+	 */
+	lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb);
+	blkptr_t *bp = &zilc->zc_next_blk;
+	BP_ZERO(bp);
+	error = lwb->lwb_error;
+	if (error == 0) {
+		error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz,
+		    &slog);
+	}
+	if (error == 0) {
+		ASSERT3U(bp->blk_birth, ==, txg);
+		BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
+		    ZIO_CHECKSUM_ZILOG);
+		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
+	}
+
+	/*
+	 * Reduce TXG open time by incrementing inflight counter and committing
+	 * the transaction. zil_sync() will wait for it to return to zero.
+	 */
+	mutex_enter(&zilog->zl_lwb_io_lock);
+	lwb->lwb_issued_txg = txg;
+	zilog->zl_lwb_inflight[txg & TXG_MASK]++;
+	zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg);
+	mutex_exit(&zilog->zl_lwb_io_lock);
+	dmu_tx_commit(tx);
+
+	spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
+
+	/*
+	 * We've completed all potentially blocking operations. Update the
+	 * nlwb and allow it to proceed without possible lock order reversals.
+	 */
+	mutex_enter(&zilog->zl_lock);
+	zil_lwb_set_zio_dependency(zilog, lwb);
+	lwb->lwb_state = LWB_STATE_ISSUED;
+
+	if (nlwb) {
+		nlwb->lwb_blk = *bp;
+		nlwb->lwb_error = error;
+		nlwb->lwb_slog = slog;
+		nlwb->lwb_alloc_txg = txg;
+		if (nlwb->lwb_state != LWB_STATE_READY)
+			nlwb = NULL;
+	}
+	mutex_exit(&zilog->zl_lock);
 
 	if (lwb->lwb_slog) {
 		ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
@@ -1938,11 +1928,19 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc,
 		    BP_GET_LSIZE(&lwb->lwb_blk));
 	}
-	ASSERT(spa_config_held(zilog->zl_spa, SCL_STATE, RW_READER));
-	zil_lwb_add_block(lwb, &lwb->lwb_blk);
 	lwb->lwb_issued_timestamp = gethrtime();
 	zio_nowait(lwb->lwb_root_zio);
 	zio_nowait(lwb->lwb_write_zio);
+	if (lwb->lwb_child_zio)
+		zio_nowait(lwb->lwb_child_zio);
+
+	/*
+	 * If nlwb was ready when we gave it the block pointer,
+	 * it is on us to issue it and possibly the following ones.
+	 */
+	lwb = nlwb;
+	if (lwb)
+		goto next_lwb;
 }
 
 /*
@@ -2014,10 +2012,7 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
 	 * For more details, see the comment above zil_commit().
*/ if (lr->lrc_txtype == TX_COMMIT) { - mutex_enter(&zilog->zl_lock); zil_commit_waiter_link_lwb(itx->itx_private, lwb); - itx->itx_private = NULL; - mutex_exit(&zilog->zl_lock); list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } @@ -2036,17 +2031,17 @@ cont: * If this record won't fit in the current log block, start a new one. * For WR_NEED_COPY optimize layout for minimal number of chunks. */ - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t)); if (reclen > lwb_sp || (reclen + dlen > lwb_sp && lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { - lwb = zil_lwb_write_close(zilog, lwb, ilwbs); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED); if (lwb == NULL) return (NULL); - zil_lwb_write_open(zilog, lwb); - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; /* * There must be enough space in the new, empty log block to @@ -2084,7 +2079,7 @@ cont: clr->lrc_seq = ++zilog->zl_lr_seq; lwb->lwb_nused += reclen + dnow; - ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); zil_lwb_add_txg(lwb, lr->lrc_txg); @@ -2096,22 +2091,9 @@ cont: goto cont; } - /* - * We have to really issue all queued LWBs before we may have to - * wait for a txg sync. Otherwise we may end up in a dead lock. - */ - if (lr->lrc_txtype == TX_WRITE) { - boolean_t frozen = lr->lrc_txg > spa_freeze_txg(zilog->zl_spa); - if (frozen || itx->itx_wr_state == WR_INDIRECT) { - lwb_t *tlwb; - while ((tlwb = list_remove_head(ilwbs)) != NULL) - zil_lwb_write_issue(zilog, tlwb); - } - if (itx->itx_wr_state == WR_INDIRECT) - lwb->lwb_indirect = B_TRUE; - if (frozen) - txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); - } + if (lr->lrc_txtype == TX_WRITE && + lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) + txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); return (lwb); } @@ -2174,26 +2156,24 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) ZIL_STAT_BUMP(zilog, zil_itx_indirect_count); ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes, lrw->lr_length); + if (lwb->lwb_child_zio == NULL) { + lwb->lwb_child_zio = zio_root( + zilog->zl_spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + } } /* - * We pass in the "lwb_write_zio" rather than - * "lwb_root_zio" so that the "lwb_write_zio" - * becomes the parent of any zio's created by - * the "zl_get_data" callback. The vdevs are - * flushed after the "lwb_write_zio" completes, - * so we want to make sure that completion - * callback waits for these additional zio's, - * such that the vdevs used by those zio's will - * be included in the lwb's vdev tree, and those - * vdevs will be properly flushed. If we passed - * in "lwb_root_zio" here, then these additional - * vdevs may not be flushed; e.g. if these zio's - * completed after "lwb_write_zio" completed. + * The "lwb_child_zio" we pass in will become a child of + * "lwb_write_zio", when one is created, so one will be + * a parent of any zio's created by the "zl_get_data". + * This way "lwb_write_zio" will first wait for children + * block pointers before own writing, and then for their + * writing completion before the vdev cache flushing. */ error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, - lwb->lwb_write_zio); + lwb->lwb_child_zio); if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block. 
*/ memset((char *)dbuf + lrwb->lr_length, 0, @@ -2226,12 +2206,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) error); zfs_fallthrough; case EIO: - if (lwb->lwb_indirect) { - txg_wait_synced(zilog->zl_dmu_pool, - lr->lrc_txg); - } else { - lwb->lwb_write_zio->io_error = error; - } + txg_wait_synced(zilog->zl_dmu_pool, + lr->lrc_txg); zfs_fallthrough; case ENOENT: zfs_fallthrough; @@ -2675,7 +2651,6 @@ zil_prune_commit_list(zilog_t *zilog) zil_commit_waiter_skip(itx->itx_private); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); - itx->itx_private = NULL; } mutex_exit(&zilog->zl_lock); @@ -2753,10 +2728,9 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * have already been created (zl_lwb_list not empty). */ zil_commit_activate_saxattr_feature(zilog); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - first = (lwb->lwb_state != LWB_STATE_OPENED) && + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_OPENED); + first = (lwb->lwb_state == LWB_STATE_NEW) && ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL || plwb->lwb_state == LWB_STATE_FLUSH_DONE); } @@ -2880,37 +2854,32 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_OPENED); /* * At this point, the ZIL block pointed at by the "lwb" - * variable is in one of the following states: "closed" - * or "open". + * variable is in "new" or "opened" state. * - * If it's "closed", then no itxs have been committed to - * it, so there's no point in issuing its zio (i.e. it's - * "empty"). + * If it's "new", then no itxs have been committed to it, so + * there's no point in issuing its zio (i.e. it's "empty"). * - * If it's "open", then it contains one or more itxs that + * If it's "opened", then it contains one or more itxs that * eventually need to be committed to stable storage. In * this case we intentionally do not issue the lwb's zio * to disk yet, and instead rely on one of the following * two mechanisms for issuing the zio: * - * 1. Ideally, there will be more ZIL activity occurring - * on the system, such that this function will be - * immediately called again (not necessarily by the same - * thread) and this lwb's zio will be issued via - * zil_lwb_assign(). This way, the lwb is guaranteed to - * be "full" when it is issued to disk, and we'll make - * use of the lwb's size the best we can. + * 1. Ideally, there will be more ZIL activity occurring on + * the system, such that this function will be immediately + * called again by different thread and this lwb will be + * closed by zil_lwb_assign(). This way, the lwb will be + * "full" when it is issued to disk, and we'll make use of + * the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occurring on - * the system, such that this lwb's zio isn't issued via - * zil_lwb_assign(), zil_commit_waiter() will issue the - * lwb's zio. If this occurs, the lwb is not guaranteed + * the system, zil_commit_waiter() will close it and issue + * the zio. 
If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, and means * the size of the lwb was "too large" given the amount * of ZIL activity occurring on the system at that time. @@ -2940,8 +2909,11 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) hrtime_t sleep = zilog->zl_last_lwb_latency * zfs_commit_timeout_pct / 100; if (sleep < zil_min_commit_timeout || - lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) { - lwb = zil_lwb_write_close(zilog, lwb, ilwbs); + lwb->lwb_nmax - lwb->lwb_nused < + lwb->lwb_nmax / 8) { + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, + LWB_STATE_NEW); zilog->zl_cur_used = 0; if (lwb == NULL) { while ((lwb = list_remove_head(ilwbs)) @@ -3024,7 +2996,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) lwb_t *lwb = zcw->zcw_lwb; ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); /* * If the lwb has already been issued by another thread, we can @@ -3033,9 +3005,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * do this prior to acquiring the zl_issuer_lock, to avoid * acquiring it when it's not necessary to do so. */ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) + if (lwb->lwb_state != LWB_STATE_OPENED) return; /* @@ -3058,8 +3028,8 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * wind up with a use-after-free error below. */ if (zcw->zcw_done) { - lwb = NULL; - goto out; + mutex_exit(&zilog->zl_issuer_lock); + return; } ASSERT3P(lwb, ==, zcw->zcw_lwb); @@ -3070,28 +3040,33 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * second time while holding the lock. * * We don't need to hold the zl_lock since the lwb cannot transition - * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb - * _can_ transition from ISSUED to DONE, but it's OK to race with + * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb + * _can_ transition from CLOSED to DONE, but it's OK to race with * that transition since we treat the lwb the same, whether it's in - * the ISSUED or DONE states. + * the CLOSED, ISSUED or DONE states. * * The important thing, is we treat the lwb differently depending on - * if it's ISSUED or OPENED, and block any other threads that might - * attempt to issue this lwb. For that reason we hold the + * if it's OPENED or CLOSED, and block any other threads that might + * attempt to close/issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call - * zil_lwb_write_close() if the lwb had already been issued. + * zil_lwb_write_close() if the lwb had already been closed/issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. */ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) { - lwb = NULL; - goto out; + if (lwb->lwb_state != LWB_STATE_OPENED) { + mutex_exit(&zilog->zl_issuer_lock); + return; } - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + /* + * We do not need zcw_lock once we hold zl_issuer_lock and know lwb + * is still open. 
But we have to drop it to avoid a deadlock in case
+	 * the callback of a zio issued by zil_lwb_write_issue() tries to get
+	 * it, while zil_lwb_write_issue() is blocked on an attempt to issue
+	 * the next lwb it found in LWB_STATE_READY state.
+	 */
+	mutex_exit(&zcw->zcw_lock);
 
 	/*
 	 * As described in the comments above zil_commit_waiter() and
@@ -3099,9 +3074,9 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
 	 * since we've reached the commit waiter's timeout and it still
 	 * hasn't been issued.
 	 */
-	lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, NULL);
+	lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
 
-	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
+	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
 
 	/*
 	 * Since the lwb's zio hadn't been issued by the time this thread
@@ -3124,34 +3099,15 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
 		 * "next" lwb on-disk. When this occurs, the ZIL write
 		 * pipeline must be stalled; see the comment within the
 		 * zil_commit_writer_stall() function for more details.
-		 *
-		 * We must drop the commit waiter's lock prior to
-		 * calling zil_commit_writer_stall() or else we can wind
-		 * up with the following deadlock:
-		 *
-		 * - This thread is waiting for the txg to sync while
-		 *   holding the waiter's lock; txg_wait_synced() is
-		 *   used within txg_commit_writer_stall().
-		 *
-		 * - The txg can't sync because it is waiting for this
-		 *   lwb's zio callback to call dmu_tx_commit().
-		 *
-		 * - The lwb's zio callback can't call dmu_tx_commit()
-		 *   because it's blocked trying to acquire the waiter's
-		 *   lock, which occurs prior to calling dmu_tx_commit()
 		 */
-		mutex_exit(&zcw->zcw_lock);
 		zil_lwb_write_issue(zilog, lwb);
-		lwb = NULL;
 		zil_commit_writer_stall(zilog);
-		mutex_enter(&zcw->zcw_lock);
-	}
-
-out:
-	mutex_exit(&zilog->zl_issuer_lock);
-	if (lwb)
+		mutex_exit(&zilog->zl_issuer_lock);
+	} else {
+		mutex_exit(&zilog->zl_issuer_lock);
 		zil_lwb_write_issue(zilog, lwb);
-	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+	}
+	mutex_enter(&zcw->zcw_lock);
 }
 
 /*
@@ -3216,7 +3172,7 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
 	 * where it's "zcw_lwb" field is NULL, and it hasn't yet
 	 * been skipped, so it's "zcw_done" field is still B_FALSE.
*/ - IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED); + IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW); if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { ASSERT3B(timedout, ==, B_FALSE); @@ -3264,6 +3220,8 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) */ IMPLY(lwb != NULL, + lwb->lwb_state == LWB_STATE_CLOSED || + lwb->lwb_state == LWB_STATE_READY || lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE); @@ -3618,10 +3576,11 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_state != LWB_STATE_FLUSH_DONE || - lwb->lwb_max_txg > txg) + lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); - zio_free(spa, txg, &lwb->lwb_blk); + if (!BP_IS_HOLE(&lwb->lwb_blk)) + zio_free(spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); /* @@ -3825,17 +3784,18 @@ zil_close(zilog_t *zilog) } mutex_enter(&zilog->zl_lock); + txg = zilog->zl_dirty_max_txg; lwb = list_tail(&zilog->zl_lwb_list); - if (lwb == NULL) - txg = zilog->zl_dirty_max_txg; - else - txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); + if (lwb != NULL) { + txg = MAX(txg, lwb->lwb_alloc_txg); + txg = MAX(txg, lwb->lwb_max_txg); + } mutex_exit(&zilog->zl_lock); /* * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends * on the time when the dmu_tx transaction is assigned in - * zil_lwb_write_close(). + * zil_lwb_write_issue(). */ mutex_enter(&zilog->zl_lwb_io_lock); txg = MAX(zilog->zl_lwb_max_issued_txg, txg); @@ -3864,8 +3824,7 @@ zil_close(zilog_t *zilog) lwb = list_remove_head(&zilog->zl_lwb_list); if (lwb != NULL) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } @@ -3986,7 +3945,7 @@ zil_suspend(const char *osname, void **cookiep) /* * We need to use zil_commit_impl to ensure we wait for all - * LWB_STATE_OPENED and LWB_STATE_ISSUED lwbs to be committed + * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed * to disk before proceeding. If we used zil_commit instead, it * would just call txg_wait_synced(), because zl_suspend is set. * txg_wait_synced() doesn't wait for these lwb's to be diff --git a/module/zfs/zio.c b/module/zfs/zio.c index b562710990..7458b416c2 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4466,8 +4466,8 @@ zio_ready(zio_t *zio) zio_t *pio, *pio_next; zio_link_t *zl = NULL; - if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, - ZIO_WAIT_READY)) { + if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | + ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index f44d1b7b55..53dcb4dee4 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -698,7 +698,6 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, int error; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); @@ -717,6 +716,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ + ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's written out * and its checksum is being calculated that no one can change