From 00d85a98ea10340cb017a4afc3c1c2ef1cf1914d Mon Sep 17 00:00:00 2001
From: Tony Hutter
Date: Tue, 6 Feb 2024 09:55:43 -0800
Subject: [PATCH 01/31] BRT: Fix FICLONE/FICLONERANGE shortened copy

On Linux the ioctl_ficlonerange() and ioctl_ficlone() system calls
are expected to either fully clone the specified range or return an
error. The range may be for an entire file. While internally ZFS
supports cloning partial ranges, there's no way to return the length
cloned to the caller, so we need to make this all-or-nothing.

As part of this change support for the REMAP_FILE_CAN_SHORTEN flag
has been added. When REMAP_FILE_CAN_SHORTEN is set, zfs_clone_range()
will return a shortened range when encountering pending dirty records.
When it's clear, zfs_clone_range() will block and wait for the records
to be written out, allowing the blocks to be cloned.

Furthermore, the file range lock is held over the region being cloned
to prevent it from being modified while cloning. This doesn't quite
provide atomic semantics, since if an error is encountered only a
portion of the range may be cloned. This will be converted to an
error, and returned to the caller, if REMAP_FILE_CAN_SHORTEN was not
provided. However, the destination file range is left in an undefined
state.

A test case has been added which exercises this functionality by
verifying that `cp --reflink=never|auto|always` works correctly.

Reviewed-by: Alexander Motin
Signed-off-by: Brian Behlendorf
Closes #15728
Closes #15842
---
 include/os/freebsd/zfs/sys/zfs_vfsops_os.h   |   1 -
 include/os/linux/zfs/sys/zfs_vfsops_os.h     |   2 -
 include/sys/zfs_vnops.h                      |   3 +
 man/man4/zfs.4                               |   9 +
 module/os/freebsd/zfs/zfs_vfsops.c           |   4 -
 module/os/linux/zfs/zfs_vnops_os.c           |   5 -
 module/os/linux/zfs/zpl_file_range.c         |  48 +++---
 module/zfs/zfs_vnops.c                       |  43 ++++-
 tests/runfiles/common.run                    |   2 +-
 tests/test-runner/bin/zts-report.py.in       |   2 +
 tests/zfs-tests/include/tunables.cfg         |   1 +
 tests/zfs-tests/tests/Makefile.am            |   1 +
 .../functional/cp_files/cp_files_002_pos.ksh | 161 ++++++++++++++++++
 13 files changed, 243 insertions(+), 39 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh

diff --git a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h
index 56a0ac96ac..24bb03575f 100644
--- a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h
+++ b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h
@@ -286,7 +286,6 @@ typedef struct zfid_long {
 
 extern uint_t zfs_fsyncer_key;
 extern int zfs_super_owner;
-extern int zfs_bclone_enabled;
 
 extern void zfs_init(void);
 extern void zfs_fini(void);
diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h
index 2204665502..b4d5db21f5 100644
--- a/include/os/linux/zfs/sys/zfs_vfsops_os.h
+++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h
@@ -45,8 +45,6 @@ extern "C" {
 typedef struct zfsvfs zfsvfs_t;
 struct znode;
 
-extern int zfs_bclone_enabled;
-
 /*
  * This structure emulates the vfs_t from other platforms.
It's purpose * is to facilitate the handling of mount options and minimize structural diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index 5da103f177..e60b99bed1 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -24,8 +24,11 @@ #ifndef _SYS_FS_ZFS_VNOPS_H #define _SYS_FS_ZFS_VNOPS_H + #include +extern int zfs_bclone_enabled; + extern int zfs_fsync(znode_t *, int, cred_t *); extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *); extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index c12ef1387c..352990e02d 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1142,6 +1142,15 @@ Enable the experimental block cloning feature. If this setting is 0, then even if feature@block_cloning is enabled, attempts to clone blocks will act as though the feature is disabled. . +.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int +When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be +written to disk. +This allows the clone operation to reliably succeed when a file is +modified and then immediately cloned. +For small files this may be slower than making a copy of the file. +Therefore, this setting defaults to 0 which causes a clone operation to +immediately fail when encountering a dirty block. +. .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string Select a BLAKE3 implementation. .Pp diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index 23b8da1845..a972c720df 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -89,10 +89,6 @@ int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); -int zfs_bclone_enabled = 0; -SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN, - &zfs_bclone_enabled, 0, "Enable block cloning"); - struct zfs_jailparam { int mount_snapshot; }; diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index ecfa4b54e2..c06a75662b 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -4248,9 +4248,4 @@ EXPORT_SYMBOL(zfs_map); /* CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); - -/* CSTYLED */ -module_param(zfs_bclone_enabled, uint, 0644); -MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning"); - #endif diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 139c51cf46..3065d54fa9 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -31,8 +31,6 @@ #include #include -int zfs_bclone_enabled = 0; - /* * Clone part of a file via block cloning. * @@ -40,7 +38,7 @@ int zfs_bclone_enabled = 0; * care of that depending on how it was called. */ static ssize_t -__zpl_clone_file_range(struct file *src_file, loff_t src_off, +zpl_clone_file_range_impl(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, size_t len) { struct inode *src_i = file_inode(src_file); @@ -96,11 +94,12 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, { ssize_t ret; + /* Flags is reserved for future extensions and must be zero. */ if (flags != 0) return (-EINVAL); - /* Try to do it via zfs_clone_range() */ - ret = __zpl_clone_file_range(src_file, src_off, + /* Try to do it via zfs_clone_range() and allow shortening. 
*/ + ret = zpl_clone_file_range_impl(src_file, src_off, dst_file, dst_off, len); #ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE @@ -137,6 +136,11 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the * range in both files and if they're the same, arrange for them to be backed * by the same storage. + * + * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range + * if we want. It's designed for filesystems that may need to shorten the + * length for alignment, EOF, or any other requirement. ZFS may shorten the + * request when there is outstanding dirty data which hasn't been written. */ loff_t zpl_remap_file_range(struct file *src_file, loff_t src_off, @@ -145,24 +149,21 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off, if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)) return (-EINVAL); - /* - * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given - * range if we want. Its designed for filesystems that make data past - * EOF available, and don't want it to be visible in both files. ZFS - * doesn't do that, so we just turn the flag off. - */ - flags &= ~REMAP_FILE_CAN_SHORTEN; - + /* No support for dedup yet */ if (flags & REMAP_FILE_DEDUP) - /* No support for dedup yet */ return (-EOPNOTSUPP); /* Zero length means to clone everything to the end of the file */ if (len == 0) len = i_size_read(file_inode(src_file)) - src_off; - return (__zpl_clone_file_range(src_file, src_off, - dst_file, dst_off, len)); + ssize_t ret = zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len); + + if (!(flags & REMAP_FILE_CAN_SHORTEN) && ret >= 0 && ret != len) + ret = -EINVAL; + + return (ret); } #endif /* HAVE_VFS_REMAP_FILE_RANGE */ @@ -179,8 +180,14 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off, if (len == 0) len = i_size_read(file_inode(src_file)) - src_off; - return (__zpl_clone_file_range(src_file, src_off, - dst_file, dst_off, len)); + /* The entire length must be cloned or this is an error. */ + ssize_t ret = zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len); + + if (ret >= 0 && ret != len) + ret = -EINVAL; + + return (ret); } #endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */ @@ -214,8 +221,7 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg) size_t len = i_size_read(file_inode(src_file)); - ssize_t ret = - __zpl_clone_file_range(src_file, 0, dst_file, 0, len); + ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len); fput(src_file); @@ -253,7 +259,7 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg) if (len == 0) len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset; - ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset, + ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset, dst_file, fcr.fcr_dest_offset, len); fput(src_file); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index e6ae574ad0..2b37834d5c 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -58,6 +58,26 @@ #include #include +/* + * Enable the experimental block cloning feature. If this setting is 0, then + * even if feature@block_cloning is enabled, attempts to clone blocks will act + * as though the feature is disabled. + */ +int zfs_bclone_enabled = 0; + +/* + * When set zfs_clone_range() waits for dirty data to be written to disk. 
+ * This allows the clone operation to reliably succeed when a file is modified + * and then immediately cloned. For small files this may be slower than making + * a copy of the file and is therefore not the default. However, in certain + * scenarios this behavior may be desirable so a tunable is provided. + */ +static int zfs_bclone_wait_dirty = 0; + +/* + * Maximum bytes to read per chunk in zfs_read(). + */ +static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; static ulong_t zfs_fsync_sync_cnt = 4; @@ -189,8 +209,6 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) return (error); } -static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ - /* * Read bytes from specified file into supplied buffer. * @@ -1055,6 +1073,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, size_t maxblocks, nbps; uint_t inblksz; uint64_t clear_setid_bits_txg = 0; + uint64_t last_synced_txg = 0; inoff = *inoffp; outoff = *outoffp; @@ -1293,15 +1312,23 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, } nbps = maxblocks; + last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos)); error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps, &nbps); if (error != 0) { /* * If we are trying to clone a block that was created - * in the current transaction group, error will be - * EAGAIN here, which we can just return to the caller - * so it can fallback if it likes. + * in the current transaction group, the error will be + * EAGAIN here. Based on zfs_bclone_wait_dirty either + * return a shortened range to the caller so it can + * fallback, or wait for the next TXG and check again. */ + if (error == EAGAIN && zfs_bclone_wait_dirty) { + txg_wait_synced(dmu_objset_pool(inos), + last_synced_txg + 1); + continue; + } + break; } @@ -1523,3 +1550,9 @@ EXPORT_SYMBOL(zfs_clone_range_replay); ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW, "Bytes to read per chunk"); + +ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW, + "Enable block cloning"); + +ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW, + "Wait for dirty blocks when cloning"); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index a3550d26ab..dd936ce598 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -630,7 +630,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', tags = ['functional', 'compression'] [tests/functional/cp_files] -tests = ['cp_files_001_pos', 'cp_stress'] +tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress'] tags = ['functional', 'cp_files'] [tests/functional/crtime] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index ae4aa62754..edfdd47ee6 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -176,6 +176,7 @@ if sys.platform.startswith('freebsd'): 'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason], 'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason], + 'cp_files/cp_files_002_pos': ['SKIP', na_reason], 'link_count/link_count_001': ['SKIP', na_reason], 'casenorm/mixed_create_failure': ['FAIL', 13215], 'mmap/mmap_sync_001_pos': ['SKIP', na_reason], @@ -312,6 +313,7 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_reason], 'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason], 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason], + 
'cp_files/cp_files_002_pos': ['SKIP', cfr_reason],
 'fault/auto_online_002_pos': ['FAIL', 11889],
 'fault/auto_replace_001_pos': ['FAIL', 14851],
 'fault/auto_spare_002_pos': ['FAIL', 11889],
diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg
index a0edad14d0..46cd42c4b8 100644
--- a/tests/zfs-tests/include/tunables.cfg
+++ b/tests/zfs-tests/include/tunables.cfg
@@ -91,6 +91,7 @@ VOL_MODE vol.mode zvol_volmode
 VOL_RECURSIVE vol.recursive UNSUPPORTED
 VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
 BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled
+BCLONE_WAIT_DIRTY zfs_bclone_wait_dirty zfs_bclone_wait_dirty
 XATTR_COMPAT xattr_compat zfs_xattr_compat
 ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
 ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 8bee07f480..7442c79857 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1393,6 +1393,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/compression/setup.ksh \
 	functional/cp_files/cleanup.ksh \
 	functional/cp_files/cp_files_001_pos.ksh \
+	functional/cp_files/cp_files_002_pos.ksh \
 	functional/cp_files/cp_stress.ksh \
 	functional/cp_files/setup.ksh \
 	functional/crtime/cleanup.ksh \
diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh
new file mode 100755
index 0000000000..60817449ab
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh
@@ -0,0 +1,161 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#

+#
+# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
+#

+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib

+#
+# DESCRIPTION:
+# Verify all cp --reflink modes work with modified file.
+#
+# STRATEGY:
+# 1. Verify "cp --reflink=never|auto|always" behaves as expected.
+#    Two different modes of operation are tested.
+#
+#    a. zfs_bclone_wait_dirty=0: FICLONE and FICLONERANGE fail with EINVAL
+#       when there are dirty blocks which cannot be immediately cloned.
+#       This is the default behavior.
+#
+#    b. zfs_bclone_wait_dirty=1: FICLONE and FICLONERANGE wait for
+#       dirty blocks to be written to disk allowing the clone to succeed.
+#       The downside to this is it may be slow which, depending on the
+#       situation, may defeat the point of making a clone.
+#

+verify_runnable "global"
+verify_block_cloning

+if ! is_linux; then
+	log_unsupported "cp --reflink is a GNU coreutils option"
+fi

+function cleanup
+{
+	datasetexists $TESTPOOL/cp-reflink && \
+	    destroy_dataset $TESTPOOL/cp-reflink -f
+	log_must set_tunable32 BCLONE_WAIT_DIRTY 0
+}

+function verify_copy
+{
+	src_cksum=$(sha256digest $1)
+	dst_cksum=$(sha256digest $2)

+	if [[ "$src_cksum" != "$dst_cksum" ]]; then
+		log_must ls -l $CP_TESTDIR
+		log_fail "checksum mismatch ($src_cksum != $dst_cksum)"
+	fi
+}

+log_assert "Verify all cp --reflink modes work with modified file"

+log_onexit cleanup

+SRC_FILE=src.data
+DST_FILE=dst.data
+SRC_SIZE=$(($RANDOM % 2048))

+# A smaller recordsize is used merely to speed up the test.
+RECORDSIZE=4096

+log_must zfs create -o recordsize=$RECORDSIZE $TESTPOOL/cp-reflink
+CP_TESTDIR=$(get_prop mountpoint $TESTPOOL/cp-reflink)

+log_must cd $CP_TESTDIR

+# Never wait on dirty blocks (zfs_bclone_wait_dirty=0)
+log_must set_tunable32 BCLONE_WAIT_DIRTY 0

+for mode in "never" "auto" "always"; do
+	log_note "Checking 'cp --reflink=$mode'"

+	# Create a new file and immediately copy it.
+	log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE

+	if [[ "$mode" == "always" ]]; then
+		log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
+		log_must ls -l $CP_TESTDIR
+	else
+		log_must cp --reflink=$mode $SRC_FILE $DST_FILE
+		verify_copy $SRC_FILE $DST_FILE
+	fi
+	log_must rm -f $DST_FILE

+	# Append to an existing file and immediately copy it.
+	sync_pool $TESTPOOL
+	log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \
+	    count=1 conv=notrunc
+	if [[ "$mode" == "always" ]]; then
+		log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
+		log_must ls -l $CP_TESTDIR
+	else
+		log_must cp --reflink=$mode $SRC_FILE $DST_FILE
+		verify_copy $SRC_FILE $DST_FILE
+	fi
+	log_must rm -f $DST_FILE

+	# Overwrite a random range of an existing file and immediately copy it.
+	sync_pool $TESTPOOL
+	log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
+	    seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
+	if [[ "$mode" == "always" ]]; then
+		log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
+		log_must ls -l $CP_TESTDIR
+	else
+		log_must cp --reflink=$mode $SRC_FILE $DST_FILE
+		verify_copy $SRC_FILE $DST_FILE
+	fi
+	log_must rm -f $SRC_FILE $DST_FILE
+done

+# Wait on dirty blocks (zfs_bclone_wait_dirty=1)
+log_must set_tunable32 BCLONE_WAIT_DIRTY 1

+for mode in "never" "auto" "always"; do
+	log_note "Checking 'cp --reflink=$mode'"

+	# Create a new file and immediately copy it.
+	log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE
+	log_must cp --reflink=$mode $SRC_FILE $DST_FILE
+	verify_copy $SRC_FILE $DST_FILE
+	log_must rm -f $DST_FILE

+	# Append to an existing file and immediately copy it.
+	log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \
+	    count=1 conv=notrunc
+	log_must cp --reflink=$mode $SRC_FILE $DST_FILE
+	verify_copy $SRC_FILE $DST_FILE
+	log_must rm -f $DST_FILE

+	# Overwrite a random range of an existing file and immediately copy it.
+ log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ + seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + log_must rm -f $SRC_FILE $DST_FILE +done + +log_pass From 08fd5ccc38c3b4575da91fc8b6ac350f444b5735 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Sat, 3 Feb 2024 00:51:51 +0500 Subject: [PATCH 02/31] Improve performance for zpool trim on linux On Linux, ZFS uses blkdev_issue_discard in vdev_disk_io_trim to issue trim command which is synchronous. This commit updates vdev_disk_io_trim to use __blkdev_issue_discard, which is asynchronous. Unfortunately there isn't any asynchronous version for blkdev_issue_secure_erase, so performance of secure trim will still suffer. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Umer Saleem Closes #15843 --- config/kernel-blkdev.m4 | 34 ++++++++++++--- module/os/linux/zfs/vdev_disk.c | 74 ++++++++++++++++++++++++++------- 2 files changed, 88 insertions(+), 20 deletions(-) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index 8e9e638b12..c5a353ca92 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -524,6 +524,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEVNAME], [ dnl # dnl # 5.19 API: blkdev_issue_secure_erase() +dnl # 4.7 API: __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) dnl # 3.10 API: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ @@ -539,6 +540,20 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ sector, nr_sects, GFP_KERNEL); ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_flags], [ + #include + ],[ + struct block_device *bdev = NULL; + sector_t sector = 0; + sector_t nr_sects = 0; + unsigned long flags = 0; + struct bio *biop = NULL; + int error __attribute__ ((unused)); + + error = __blkdev_issue_discard(bdev, + sector, nr_sects, GFP_KERNEL, flags, &biop); + ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [ #include ],[ @@ -562,13 +577,22 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [ ],[ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) - ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ + AC_MSG_CHECKING([whether __blkdev_issue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1, - [blkdev_issue_discard() is available]) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC, 1, + [__blkdev_issue_discard() is available]) ],[ - ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()]) + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1, + [blkdev_issue_discard() is available]) + ],[ + ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()]) + ]) ]) ]) ]) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index e7f0aa5738..b0bda5fa20 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -862,27 +862,66 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) return (0); } +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ + defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) +BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) +{ + zio_t *zio = bio->bi_private; +#ifdef HAVE_1ARG_BIO_END_IO_T + zio->io_error = BIO_END_IO_ERROR(bio); +#else + zio->io_error = -error; +#endif + 
bio_put(bio); + if (zio->io_error) + vdev_disk_error(zio); + zio_interrupt(zio); +} + +static int +vdev_issue_discard_trim(zio_t *zio, unsigned long flags) +{ + int ret; + struct bio *bio = NULL; + +#if defined(BLKDEV_DISCARD_SECURE) + ret = - __blkdev_issue_discard( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, flags, &bio); +#else + (void) flags; + ret = - __blkdev_issue_discard( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, &bio); +#endif + if (!ret && bio) { + bio->bi_private = zio; + bio->bi_end_io = vdev_disk_discard_end_io; + vdev_submit_bio(bio); + } + return (ret); +} +#endif + static int vdev_disk_io_trim(zio_t *zio) { - vdev_t *v = zio->io_vd; - vdev_disk_t *vd = v->vdev_tsd; - -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) { - return (-blkdev_issue_secure_erase(BDH_BDEV(vd->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); - } else { - return (-blkdev_issue_discard(BDH_BDEV(vd->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); - } -#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) unsigned long trim_flags = 0; -#if defined(BLKDEV_DISCARD_SECURE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) + if (zio->io_trim_flags & ZIO_TRIM_SECURE) { +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) + return (-blkdev_issue_secure_erase( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); +#elif defined(BLKDEV_DISCARD_SECURE) trim_flags |= BLKDEV_DISCARD_SECURE; #endif - return (-blkdev_issue_discard(BDH_BDEV(vd->vd_bdh), + } +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ + defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) + return (vdev_issue_discard_trim(zio, trim_flags)); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) + return (-blkdev_issue_discard( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags)); #else #error "Unsupported kernel" @@ -968,7 +1007,12 @@ vdev_disk_io_start(zio_t *zio) case ZIO_TYPE_TRIM: zio->io_error = vdev_disk_io_trim(zio); rw_exit(&vd->vd_lock); +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) + if (zio->io_trim_flags & ZIO_TRIM_SECURE) + zio_interrupt(zio); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) zio_interrupt(zio); +#endif return; default: From 9bb8d26bd5485b579dff60166cf7a51f6e57820a Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Fri, 8 Dec 2023 21:32:35 -0300 Subject: [PATCH 03/31] zed: fix typo in variable ZED_POWER_OFF_ENCLO*US*RE_SLOT_ON_FAULT Replace ENCLO_US_RE with ENCLO_SU_RE in the name of the variable. Note this changes the user-visible string in zed.rc, thus might break current users with the wrong string, but it's ~2 months since zfs-2.2.0 tag is out, thus should not be widespread yet. 
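[Editor's note] Users who enabled the old, misspelled setting in a local
zed.rc will need to rename it. As a hedged sketch (the zed.d install path
may differ by distribution), an existing deployment can be checked with:

    # look for the old spelling in the installed ZED configuration
    grep -r ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT /etc/zfs/zed.d/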
Mechanical change:

    $ grep -rl ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT
    cmd/zed/zed.d/zed.rc
    cmd/zed/zed.d/statechange-slot_off.sh

    $ sed -i 's/ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT/ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT/g' \
        cmd/zed/zed.d/zed.rc \
        cmd/zed/zed.d/statechange-slot_off.sh

    $ grep -rl ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT
    $

Fixes 11fbcacf37d1a66c7a40bb8920c70ce9a87270ea
("zed: Add zedlet to power off slot when drive is faulted")

Reviewed-by: Tony Hutter
Reviewed-by: Brian Behlendorf
Signed-off-by: Mauricio Faria de Oliveira
Closes #15651
---
 cmd/zed/zed.d/statechange-slot_off.sh | 6 +++---
 cmd/zed/zed.d/zed.rc                  | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cmd/zed/zed.d/statechange-slot_off.sh b/cmd/zed/zed.d/statechange-slot_off.sh
index 150012abe7..06acce93b8 100755
--- a/cmd/zed/zed.d/statechange-slot_off.sh
+++ b/cmd/zed/zed.d/statechange-slot_off.sh
@@ -5,7 +5,7 @@
 #
 # Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos
 # as they flip between FAULTED and ONLINE. If
-# ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets
+# ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT is set in zed.rc, and the disk gets
 # FAULTED, then power down the slot via sysfs:
 #
 #     /sys/class/enclosure/<enclosure>/<slot>/power_status
@@ -19,7 +19,7 @@
 # Exit codes:
 #  0: slot successfully powered off
 #  1: enclosure not available
-#  2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled
+#  2: ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT disabled
 #  3: vdev was not FAULTED
 #  4: The enclosure sysfs path passed from ZFS does not exist
 #  5: Enclosure slot didn't actually turn off after we told it to
@@ -32,7 +32,7 @@ if [ ! -d /sys/class/enclosure ] ; then
 	exit 1
 fi
 
-if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then
+if [ "${ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT}" != "1" ] ; then
 	exit 2
 fi
diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc
index 78dc1afc7b..48051544d1 100644
--- a/cmd/zed/zed.d/zed.rc
+++ b/cmd/zed/zed.d/zed.rc
@@ -146,4 +146,4 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event"
 # Power off the drive's slot in the enclosure if it becomes FAULTED. This can
 # help silence misbehaving drives. This assumes your drive enclosure fully
 # supports slot power control via sysfs.
-#ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1
+#ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT=1
From 40e20d808ce263ac6f62c96a5c9cb10dc4add151 Mon Sep 17 00:00:00 2001
From: Cameron Harr
Date: Wed, 7 Feb 2024 09:12:12 -0800
Subject: [PATCH 04/31] Add 'zpool status -e' flag to see unhealthy vdevs

When very large pools are present, it can be laborious to find the
reasons why a pool is degraded and/or where an unhealthy vdev is.
This option filters out vdevs that are ONLINE and have no errors, to
make it easier to see where the issues are. Root and parents of
unhealthy vdevs will always be printed.

Testing: ZFS errors and drive failures for multiple vdevs were
simulated with zinject.
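[Editor's note] For reference, injections of this sort can be reproduced
along the lines of the new test case below; the pool and file-vdev names
here are hypothetical:

    # degrade one leaf, fault another, and add a 20ms delay to a third
    zinject -d /var/tmp/vdev4 -A degrade tank
    zinject -d /var/tmp/vdev5 -A fault tank
    zinject -d /var/tmp/vdev6 -D20:100 tank
    zpool status -es tank    # print only unhealthy or slow vdevs
    zinject -c all           # clear all injection handlers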
Sample vdev listings with '-e' option

- All vdevs healthy
        NAME        STATE     READ WRITE CKSUM
        iron5       ONLINE       0     0     0

- ZFS errors
        NAME        STATE     READ WRITE CKSUM
        iron5       ONLINE       0     0     0
          raidz2-5  ONLINE       1     0     0
            L23     ONLINE       1     0     0
            L24     ONLINE       1     0     0
            L37     ONLINE       1     0     0

- Vdev faulted
        NAME        STATE     READ WRITE CKSUM
        iron5       DEGRADED     0     0     0
          raidz2-6  DEGRADED     0     0     0
            L67     FAULTED      0     0     0  too many errors

- Vdev faults and data errors
        NAME        STATE     READ WRITE CKSUM
        iron5       DEGRADED     0     0     0
          raidz2-1  DEGRADED     0     0     0
            L2      FAULTED      0     0     0  too many errors
          raidz2-5  ONLINE       1     0     0
            L23     ONLINE       1     0     0
            L24     ONLINE       1     0     0
            L37     ONLINE       1     0     0
          raidz2-6  DEGRADED     0     0     0
            L67     FAULTED      0     0     0  too many errors

- Vdev missing
        NAME        STATE     READ WRITE CKSUM
        iron5       DEGRADED     0     0     0
          raidz2-6  DEGRADED     0     0     0
            L67     UNAVAIL      3     1     0

- Slow devices when -s provided with -e
        NAME        STATE     READ WRITE CKSUM  SLOW
        iron5       DEGRADED     0     0     0     -
          raidz2-5  DEGRADED     0     0     0     -
            L10     FAULTED      0     0     0     0  external device fault
            L51     ONLINE       0     0     0    14

Reviewed-by: Tony Hutter
Reviewed-by: Brian Behlendorf
Signed-off-by: Cameron Harr
Closes #15769
---
 cmd/zpool/zpool_main.c                        |  58 +++++++++-
 man/man8/zpool-status.8                       |   4 +-
 tests/runfiles/common.run                     |   3 +-
 tests/zfs-tests/tests/Makefile.am             |   1 +
 .../zpool_status/zpool_status_002_pos.ksh     |   4 +-
 .../zpool_status/zpool_status_003_pos.ksh     |   2 +
 .../zpool_status/zpool_status_008_pos.ksh     | 104 ++++++++++++++++++
 7 files changed, 169 insertions(+), 7 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 6687a44644..69bf9649ac 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -2161,6 +2161,7 @@ typedef struct status_cbdata {
 	boolean_t	cb_explain;
 	boolean_t	cb_first;
 	boolean_t	cb_dedup_stats;
+	boolean_t	cb_print_unhealthy;
 	boolean_t	cb_print_status;
 	boolean_t	cb_print_slow_ios;
 	boolean_t	cb_print_vdev_init;
@@ -2357,6 +2358,35 @@ health_str_to_color(const char *health)
 	return (NULL);
 }
 
+/*
+ * Called for each leaf vdev.  Returns 0 if the vdev is healthy.
+ * A vdev is unhealthy if any of the following are true:
+ * 1) there are read, write, or checksum errors,
+ * 2) its state is not ONLINE, or
+ * 3) slow IO reporting was requested (-s) and there are slow IOs.
+ */
+static int
+vdev_health_check_cb(void *hdl_data, nvlist_t *nv, void *data)
+{
+	status_cbdata_t *cb = data;
+	vdev_stat_t *vs;
+	uint_t vsc;
+	(void) hdl_data;
+
+	if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &vsc) != 0)
+		return (1);
+
+	if (vs->vs_checksum_errors || vs->vs_read_errors ||
+	    vs->vs_write_errors || vs->vs_state != VDEV_STATE_HEALTHY)
+		return (1);
+
+	if (cb->cb_print_slow_ios && vs->vs_slow_ios)
+		return (1);
+
+	return (0);
+}
+
 /*
  * Print out configuration state as requested by status_callback.
  */
@@ -2375,7 +2405,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
 	const char *state;
 	const char *type;
 	const char *path = NULL;
-	const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL;
+	const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL,
+	    *scolor = NULL;
 
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child,
 	    &children) != 0)
@@ -2402,6 +2433,15 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
 		state = gettext("AVAIL");
 	}
 
+	/*
+	 * If '-e' is specified then top-level vdevs and their children
+	 * can be pruned if all of their leaves are healthy.
+ */ + if (cb->cb_print_unhealthy && depth > 0 && + for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0) { + return; + } + printf_color(health_str_to_color(state), "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); @@ -2416,6 +2456,9 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, if (vs->vs_checksum_errors) ccolor = ANSI_RED; + if (vs->vs_slow_ios) + scolor = ANSI_BLUE; + if (cb->cb_literal) { fputc(' ', stdout); printf_color(rcolor, "%5llu", @@ -2448,9 +2491,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, } if (cb->cb_literal) - printf(" %5llu", (u_longlong_t)vs->vs_slow_ios); + printf_color(scolor, " %5llu", + (u_longlong_t)vs->vs_slow_ios); else - printf(" %5s", rbuf); + printf_color(scolor, " %5s", rbuf); } if (cb->cb_print_power) { if (children == 0) { @@ -8999,9 +9043,11 @@ status_callback(zpool_handle_t *zhp, void *data) (void) printf(gettext( "errors: No known data errors\n")); } else if (!cbp->cb_verbose) { + color_start(ANSI_RED); (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); + color_end(); } else { print_error_log(zhp); } @@ -9022,6 +9068,7 @@ status_callback(zpool_handle_t *zhp, void *data) * [pool] [interval [count]] * * -c CMD For each vdev, run command CMD + * -e Display only unhealthy vdevs * -i Display vdev initialization status. * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. @@ -9053,7 +9100,7 @@ zpool_do_status(int argc, char **argv) }; /* check options */ - while ((c = getopt_long(argc, argv, "c:igLpPsvxDtT:", long_options, + while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options, NULL)) != -1) { switch (c) { case 'c': @@ -9080,6 +9127,9 @@ zpool_do_status(int argc, char **argv) } cmd = optarg; break; + case 'e': + cb.cb_print_unhealthy = B_TRUE; + break; case 'i': cb.cb_print_vdev_init = B_TRUE; break; diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 56fa4aed05..24ad6e643c 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm zpool .Cm status -.Op Fl DigLpPstvx +.Op Fl DeigLpPstvx .Op Fl T Sy u Ns | Ns Sy d .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … .Oo Ar pool Oc Ns … @@ -69,6 +69,8 @@ See the option of .Nm zpool Cm iostat for complete details. +.It Fl e +Only show unhealthy vdevs (not-ONLINE or with errors). .It Fl i Display vdev initialization status. 
.It Fl g diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index dd936ce598..7331244515 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -535,7 +535,8 @@ tags = ['functional', 'cli_root', 'zpool_split'] tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_003_pos', 'zpool_status_004_pos', 'zpool_status_005_pos', 'zpool_status_006_pos', - 'zpool_status_007_pos', 'zpool_status_features_001_pos'] + 'zpool_status_007_pos', 'zpool_status_008_pos', + 'zpool_status_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 7442c79857..e2824ee065 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1238,6 +1238,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_status/zpool_status_005_pos.ksh \ functional/cli_root/zpool_status/zpool_status_006_pos.ksh \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ + functional/cli_root/zpool_status/zpool_status_008_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh index 3bdd7db649..d6f32cdc7a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh @@ -51,7 +51,7 @@ else fi set -A args "" "-x" "-v" "-x $testpool" "-v $testpool" "-xv $testpool" \ - "-vx $testpool" + "-vx $testpool" "-e $testpool" "-es $testpool" log_assert "Executing 'zpool status' with correct options succeeds" @@ -64,4 +64,6 @@ while [[ $i -lt ${#args[*]} ]]; do (( i = i + 1 )) done +cleanup + log_pass "'zpool status' with correct options succeeded" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh index b501aac5ad..52b22dd833 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh @@ -37,6 +37,7 @@ # 3. Read the file # 4. Take a snapshot and make a clone # 5. 
Verify we see "snapshot, clone and filesystem" output in 'zpool status -v'
+#    and 'zpool status -ev'
 
 function cleanup
 {
@@ -68,6 +69,7 @@ log_must zpool status -v $TESTPOOL2
 log_must eval "zpool status -v | grep '$TESTPOOL2@snap:/10m_file'"
 log_must eval "zpool status -v | grep '$TESTPOOL2/clone/10m_file'"
 log_must eval "zpool status -v | grep '$TESTPOOL2/10m_file'"
+log_must eval "zpool status -ev | grep '$TESTPOOL2/10m_file'"
 log_mustnot eval "zpool status -v | grep '$TESTFS1'"
 
 log_pass "'zpool status -v' outputs affected filesystem, snapshot & clone"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh
new file mode 100755
index 0000000000..6be2ad5a74
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh
@@ -0,0 +1,104 @@
+#!/bin/ksh -p

+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#

+#
+# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
+#

+. $STF_SUITE/include/libtest.shlib

+#
+# DESCRIPTION:
+# Verify 'zpool status -e' only shows unhealthy devices.
+#
+# STRATEGY:
+# 1. Create zpool
+# 2. Force DEGRADE, FAULT, or inject slow IOs for vdevs
+# 3. Verify vdevs are reported correctly with -e and -s
+# 4. Verify parents are reported as DEGRADED
+# 5. Verify healthy children are not reported
+#

+function cleanup
+{
+	log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
+	zinject -c all
+	poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
+	log_must rm -f $all_vdevs
+}

+log_assert "Verify 'zpool status -e'"

+log_onexit cleanup

+all_vdevs=$(echo $TESTDIR/vdev{1..6})
+log_must mkdir -p $TESTDIR
+log_must truncate -s $MINVDEVSIZE $all_vdevs

+OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)

+for raid_type in "draid2:3d:6c:1s" "raidz2"; do

+	log_must zpool create -f $TESTPOOL2 $raid_type $all_vdevs

+	# Check DEGRADED vdevs are shown.
+	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev4 "ONLINE"
+	log_must zinject -d $TESTDIR/vdev4 -A degrade $TESTPOOL2
+	log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev4 | grep DEGRADED"

+	# Check FAULTED vdevs are shown.
+	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev5 "ONLINE"
+	log_must zinject -d $TESTDIR/vdev5 -A fault $TESTPOOL2
+	log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev5 | grep FAULTED"

+	# Check no ONLINE vdevs are shown
+	log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE"

+	# Check no ONLINE slow vdevs are shown. Then mark IOs greater than
+	# 10ms slow, delay IOs 20ms to vdev6, check slow IOs.
+	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev6 "ONLINE"
+	log_mustnot eval "zpool status -es $TESTPOOL2 | grep ONLINE"

+	log_must set_tunable64 ZIO_SLOW_IO_MS 10
+	log_must zinject -d $TESTDIR/vdev6 -D20:100 $TESTPOOL2
+	log_must mkfile 1048576 /$TESTPOOL2/testfile
+	sync_pool $TESTPOOL2
+	log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO

+	# Check vdev6 slow IOs are only shown when requested with -s.
+ log_mustnot eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE" + log_must eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE" + + # Pool level and top-vdev level status must be DEGRADED. + log_must eval "zpool status -e $TESTPOOL2 | grep $TESTPOOL2 | grep DEGRADED" + log_must eval "zpool status -e $TESTPOOL2 | grep $raid_type | grep DEGRADED" + + # Check that healthy vdevs[1-3] aren't shown with -e. + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev1 "ONLINE" + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev2 "ONLINE" + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev3 "ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev1 | grep ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev2 | grep ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev3 | grep ONLINE" + + log_must zinject -c all + log_must zpool status -es $TESTPOOL2 + + zpool destroy $TESTPOOL2 +done + +log_pass "Verify zpool status -e shows only unhealthy vdevs" From d22bf6a9bd216523e3f58195282be12d9da7fd33 Mon Sep 17 00:00:00 2001 From: the-Chain-Warden-thresh <18302010006@fudan.edu.cn> Date: Thu, 8 Feb 2024 03:53:05 +0800 Subject: [PATCH 05/31] LUA: Backport CVE-2020-24370's patch CVE-2020-24370 is a security vulnerability in lua. Although the CVE description in CVE-2020-24370 said that this CVE only affected lua 5.4.0, according to lua this CVE actually existed since lua 5.2. The root cause of this CVE is the negation overflow that occurs when you try to take the negative of 0x80000000. Thus, this CVE also exists in openzfs. Try to backport the fix to the lua in openzfs since the original fix is for 5.4 and several functions have been changed. https://github.com/advisories/GHSA-gfr4-c37g-mm3v https://nvd.nist.gov/vuln/detail/CVE-2020-24370 https://www.lua.org/bugs.html#5.4.0-11 https://github.com/lua/lua/commit/a585eae6e7ada1ca9271607a4f48dfb1786 Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: ChenHao Lu <18302010006@fudan.edu.cn> Closes #15847 --- module/lua/ldebug.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/module/lua/ldebug.c b/module/lua/ldebug.c index 0092474c76..23e321bb12 100644 --- a/module/lua/ldebug.c +++ b/module/lua/ldebug.c @@ -111,10 +111,11 @@ static const char *upvalname (Proto *p, int uv) { static const char *findvararg (CallInfo *ci, int n, StkId *pos) { int nparams = clLvalue(ci->func)->p->numparams; - if (n >= ci->u.l.base - ci->func - nparams) + int nvararg = cast_int(ci->u.l.base - ci->func) - nparams; + if (n <= -nvararg) return NULL; /* no such vararg */ else { - *pos = ci->func + nparams + n; + *pos = ci->func + nparams - n; return "(*vararg)"; /* generic name for any vararg */ } } @@ -126,7 +127,7 @@ static const char *findlocal (lua_State *L, CallInfo *ci, int n, StkId base; if (isLua(ci)) { if (n < 0) /* access to vararg values? */ - return findvararg(ci, -n, pos); + return findvararg(ci, n, pos); else { base = ci->u.l.base; name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci)); From b699dacb4ac8bb7622943ae8587474dbe1fc81b1 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 12 Feb 2024 13:06:09 -0800 Subject: [PATCH 06/31] [zfs-2.2.3] Enable zfs_bclone_enabled on cp_files tests cp_files_002_pos uses BRT, so enable block cloning in setup/cleanup. This is only something we need to do in zfs-2.2.3, since 2.2.x ships with block cloning disabled by default. 
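[Editor's note] As a point of reference, outside the test framework the
tunable can also be toggled by hand on Linux (a sketch; requires root and
the zfs module loaded):

    # enable block cloning at runtime; zfs-2.2.x defaults this to 0
    echo 1 > /sys/module/zfs/parameters/zfs_bclone_enabled
    cat /sys/module/zfs/parameters/zfs_bclone_enabled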
Signed-off-by: Tony Hutter --- tests/zfs-tests/tests/functional/cp_files/cleanup.ksh | 4 ++++ tests/zfs-tests/tests/functional/cp_files/setup.ksh | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/tests/zfs-tests/tests/functional/cp_files/cleanup.ksh b/tests/zfs-tests/tests/functional/cp_files/cleanup.ksh index 42fe70042d..c0bccab122 100755 --- a/tests/zfs-tests/tests/functional/cp_files/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cp_files/cleanup.ksh @@ -32,3 +32,7 @@ . $STF_SUITE/include/libtest.shlib default_cleanup + +if tunable_exists BCLONE_ENABLED ; then + log_must restore_tunable BCLONE_ENABLED +fi diff --git a/tests/zfs-tests/tests/functional/cp_files/setup.ksh b/tests/zfs-tests/tests/functional/cp_files/setup.ksh index b756d4e76c..4223386b36 100755 --- a/tests/zfs-tests/tests/functional/cp_files/setup.ksh +++ b/tests/zfs-tests/tests/functional/cp_files/setup.ksh @@ -32,4 +32,10 @@ . $STF_SUITE/include/libtest.shlib DISK=${DISKS%% *} + +if tunable_exists BCLONE_ENABLED ; then + log_must save_tunable BCLONE_ENABLED + log_must set_tunable32 BCLONE_ENABLED 1 +fi + default_setup $DISK From 36116b4612f85b1373fbcefad3fd34cd7efd65d4 Mon Sep 17 00:00:00 2001 From: Rob N Date: Tue, 13 Feb 2024 08:58:04 +1100 Subject: [PATCH 07/31] zfs list: add '-t fs' and '-t vol' options (#15883) Because "filesystem" and "volume" are just too long! Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15864 (cherry picked from commit a5a725440bcb2f4c4554be3e489f911e3dd60412) --- cmd/zfs/zfs_main.c | 22 ++++++++++++++++------ man/man8/zfs-list.8 | 11 ++++++++++- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 67b191d72e..3017de9ee7 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -3672,15 +3672,25 @@ zfs_do_list(int argc, char **argv) for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const type_subopts[] = { - "filesystem", "volume", - "snapshot", "snap", + "filesystem", + "fs", + "volume", + "vol", + "snapshot", + "snap", "bookmark", - "all" }; + "all" + }; static const int type_types[] = { - ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME, - ZFS_TYPE_SNAPSHOT, ZFS_TYPE_SNAPSHOT, + ZFS_TYPE_FILESYSTEM, + ZFS_TYPE_FILESYSTEM, + ZFS_TYPE_VOLUME, + ZFS_TYPE_VOLUME, + ZFS_TYPE_SNAPSHOT, + ZFS_TYPE_SNAPSHOT, ZFS_TYPE_BOOKMARK, - ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK }; + ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK + }; for (c = 0; c < ARRAY_SIZE(type_subopts); ++c) if (strcmp(tok, type_subopts[c]) == 0) { diff --git a/man/man8/zfs-list.8 b/man/man8/zfs-list.8 index 9f6a73ab95..85bd3fbafc 100644 --- a/man/man8/zfs-list.8 +++ b/man/man8/zfs-list.8 @@ -29,7 +29,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd February 8, 2024 .Dt ZFS-LIST 8 .Os . @@ -155,6 +155,15 @@ or For example, specifying .Fl t Sy snapshot displays only snapshots. +.Sy fs , +.Sy snap , +or +.Sy vol +can be used as aliases for +.Sy filesystem , +.Sy snapshot , +or +.Sy volume . .El . .Sh EXAMPLES From fc3d34bd08d81f9189fd06ac641da4e2d82a56c7 Mon Sep 17 00:00:00 2001 From: Bi11 Date: Tue, 13 Feb 2024 05:53:33 +0800 Subject: [PATCH 08/31] BRT: Fix slop space calculation with block cloning Similar to deduplication, the size of data duplicated by block cloning should not be included in the slop space calculation. 
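[Editor's note] In terms of the names used in spa_get_slop_space(), the
change below amounts to the following (a sketch of the resulting
calculation, using the identifiers visible in the diff):

    space = spa_get_dspace(spa) - spa_dedup_dspace - brt_get_dspace(spa)
    slop  = min(space / 2^spa_slop_shift, spa_max_slop)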
Reviewed-by: Brian Behlendorf
Reviewed-by: Alexander Motin
Signed-off-by: Yuxin Wang
Closes #15874
---
 module/zfs/spa_misc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 72b690162d..24f038ad7f 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1822,7 +1822,8 @@ spa_get_slop_space(spa_t *spa)
 	 * deduplicated data, so since it's not useful to reserve more
 	 * space with more deduplicated data, we subtract that out here.
 	 */
-	space = spa_get_dspace(spa) - spa->spa_dedup_dspace;
+	space =
+	    spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa);
 	slop = MIN(space >> spa_slop_shift, spa_max_slop);
 
 	/*
From a6f6c881ffde598898165f36e4e0a2ff15836cf6 Mon Sep 17 00:00:00 2001
From: Dex Wood
Date: Fri, 1 Dec 2023 17:25:17 -0600
Subject: [PATCH 09/31] Add Ntfy notification support to ZED

This commit adds the zed_notify_ntfy() function and hooks it into
zed_notify(). This will allow ZED to send notifications to ntfy.sh
or a self-hosted Ntfy service, which can be received on a desktop or
mobile device. It is configured with ZED_NTFY_TOPIC, ZED_NTFY_URL,
and ZED_NTFY_ACCESS_TOKEN variables in zed.rc.

Reviewed-by: @classabbyamp
Reviewed-by: Brian Behlendorf
Signed-off-by: Dex Wood
Closes #15584
---
 cmd/zed/zed.d/zed-functions.sh | 98 ++++++++++++++++++++++++++++++++++
 cmd/zed/zed.d/zed.rc           | 22 ++++++++
 2 files changed, 120 insertions(+)

diff --git a/cmd/zed/zed.d/zed-functions.sh b/cmd/zed/zed.d/zed-functions.sh
index 49b6b54029..3a2519633d 100644
--- a/cmd/zed/zed.d/zed-functions.sh
+++ b/cmd/zed/zed.d/zed-functions.sh
@@ -205,6 +205,10 @@ zed_notify()
     [ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
     [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))
 
+    zed_notify_ntfy "${subject}" "${pathname}"; rv=$?
+    [ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
+    [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))
+
     [ "${num_success}" -gt 0 ] && return 0
     [ "${num_failure}" -gt 0 ] && return 1
     return 2
@@ -527,6 +531,100 @@ zed_notify_pushover()
 }
 
 
+# zed_notify_ntfy (subject, pathname)
+#
+# Send a notification via Ntfy.sh.
+# The ntfy topic (ZED_NTFY_TOPIC) identifies the topic that the notification
+# will be sent to on the Ntfy.sh server. The ntfy url (ZED_NTFY_URL) defines
+# the location of the self-hosted or hosted ntfy service. The ntfy access
+# token (ZED_NTFY_ACCESS_TOKEN) represents an
+# access token that could be used if a topic is read/write protected. If a
+# topic can be written to publicly, a ZED_NTFY_ACCESS_TOKEN is not required.
+#
+# Requires curl and sed executables to be installed in the standard PATH.
+#
+# References
+#   https://docs.ntfy.sh
+#
+# Arguments
+#   subject: notification subject
+#   pathname: pathname containing the notification message (OPTIONAL)
+#
+# Globals
+#   ZED_NTFY_TOPIC
+#   ZED_NTFY_ACCESS_TOKEN (OPTIONAL)
+#   ZED_NTFY_URL
+#
+# Return
+#   0: notification sent
+#   1: notification failed
+#   2: not configured
+#
+zed_notify_ntfy()
+{
+    local subject="$1"
+    local pathname="${2:-"/dev/null"}"
+    local msg_body
+    local msg_out
+    local msg_err
+
+    [ -n "${ZED_NTFY_TOPIC}" ] || return 2
+    local url="${ZED_NTFY_URL:-"https://ntfy.sh"}/${ZED_NTFY_TOPIC}"
+
+    if [ ! -r "${pathname}" ]; then
+        zed_log_err "ntfy cannot read \"${pathname}\""
+        return 1
+    fi
+
+    zed_check_cmd "curl" "sed" || return 1
+
+    # Read the message body in.
+    #
+    msg_body="$(cat "${pathname}")"
+
+    if [ -z "${msg_body}" ]
+    then
+        msg_body=$subject
+        subject=""
+    fi
+
+    # Send the POST request and check for errors.
+    #
+    if [ -n "${ZED_NTFY_ACCESS_TOKEN}" ]; then
+        msg_out="$( \
+            curl \
+            -u ":${ZED_NTFY_ACCESS_TOKEN}" \
+            -H "Title: ${subject}" \
+            -d "${msg_body}" \
+            -H "Priority: high" \
+            "${url}" \
+            2>/dev/null \
+            )"; rv=$?
+    else
+        msg_out="$( \
+            curl \
+            -H "Title: ${subject}" \
+            -d "${msg_body}" \
+            -H "Priority: high" \
+            "${url}" \
+            2>/dev/null \
+            )"; rv=$?
+    fi
+    if [ "${rv}" -ne 0 ]; then
+        zed_log_err "curl exit=${rv}"
+        return 1
+    fi
+    msg_err="$(echo "${msg_out}" \
+        | sed -n -e 's/.*"errors" *:.*\[\(.*\)\].*/\1/p')"
+    if [ -n "${msg_err}" ]; then
+        zed_log_err "ntfy \"${msg_err}\""
+        return 1
+    fi
+    return 0
+}
+
+
 # zed_rate_limit (tag, [interval])
 #
 # Check whether an event of a given type [tag] has already occurred within the
diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc
index 48051544d1..bc269b155d 100644
--- a/cmd/zed/zed.d/zed.rc
+++ b/cmd/zed/zed.d/zed.rc
@@ -147,3 +147,25 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event"
 # help silence misbehaving drives. This assumes your drive enclosure fully
 # supports slot power control via sysfs.
 #ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT=1
+
+##
+# Ntfy topic
+# This defines which topic will receive the ntfy notification.
+#
+# Disabled by default; uncomment to enable.
+#ZED_NTFY_TOPIC=""
+
+##
+# Ntfy access token (optional for public topics)
+# This defines an access token which can be used
+# to authenticate when sending to protected topics.
+#
+# Disabled by default; uncomment to enable.
+#ZED_NTFY_ACCESS_TOKEN=""
+
+##
+# Ntfy Service URL
+# This defines which service the ntfy call will be directed toward.
+#
+# https://ntfy.sh by default; uncomment to enable an alternative service url.
+#ZED_NTFY_URL="https://ntfy.sh"
From a4978d260580d85491eee2ecb2435f56a7d4e50c Mon Sep 17 00:00:00 2001
From: Bi11
Date: Tue, 13 Feb 2024 08:58:47 +0800
Subject: [PATCH 10/31] zdb: Fix false leak report for BRT objects

Fix a misreport in 'zdb -d' where it falsely marked BRT objects as
leaked.

Reviewed-by: Brian Behlendorf
Reviewed-by: Alexander Motin
Signed-off-by: Yuxin Wang
Closes #15882
---
 cmd/zdb/zdb.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 19b0d61f09..d81199765c 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -8041,6 +8041,17 @@ dump_mos_leaks(spa_t *spa)
 		}
 	}
 
+	if (spa->spa_brt != NULL) {
+		brt_t *brt = spa->spa_brt;
+		for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+			brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
+			if (brtvd != NULL && brtvd->bv_initiated) {
+				mos_obj_refd(brtvd->bv_mos_brtvdev);
+				mos_obj_refd(brtvd->bv_mos_entries);
+			}
+		}
+	}
+
 	/*
 	 * Visit all allocated objects and make sure they are referenced.
 	 */
From d92fbe2150d72e70ff1fdf1ada89ed3245a47cd4 Mon Sep 17 00:00:00 2001
From: Tony Hutter
Date: Wed, 14 Feb 2024 14:29:19 -0800
Subject: [PATCH 11/31] [zfs-2.2.3] ZTS: Use correct bclone module param name
 on FreeBSD

The bclone module names are not prefixed with 'zfs' on FreeBSD. This
was causing test failures.
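[Editor's note] The naming difference is visible from the shell; a sketch
(tunable names as wired up in tunables.cfg below):

    # Linux: module parameters keep the zfs_ prefix
    cat /sys/module/zfs/parameters/zfs_bclone_wait_dirty
    # FreeBSD: the sysctl under the vfs.zfs tree drops the prefix
    sysctl vfs.zfs.bclone_wait_dirty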
Signed-off-by: Tony Hutter --- tests/zfs-tests/include/tunables.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 46cd42c4b8..718c4cf2d8 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -90,8 +90,8 @@ VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq -BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled -BCLONE_WAIT_DIRTY zfs_bclone_wait_dirty zfs_bclone_wait_dirty +BCLONE_ENABLED bclone_enabled zfs_bclone_enabled +BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max From b62fd2cef9baede3fb9ee7dca980a0eb10d694f8 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 16 Feb 2024 08:59:56 -0800 Subject: [PATCH 12/31] ZTS: Skip cross-fs bclone tests if FreeBSD < 14.0 Skip cross filesystem block cloning tests on FreeBSD if running less than version 14.0. Cross filesystem copy_file_range() was added in FreeBSD 14. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #15901 --- tests/test-runner/bin/zts-report.py.in | 22 ++++++++++++++- tests/zfs-tests/include/libtest.shlib | 27 ++++++++++++++----- .../functional/bclone/bclone_common.kshlib | 6 +++++ ...ck_cloning_copyfilerange_cross_dataset.ksh | 5 ++-- .../block_cloning_cross_enc_dataset.ksh | 5 ++-- 5 files changed, 51 insertions(+), 14 deletions(-) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index edfdd47ee6..ecc50f4871 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -138,7 +138,11 @@ idmap_reason = 'Idmapped mount needs kernel 5.12+' # copy_file_range() is not supported by all kernels # cfr_reason = 'Kernel copy_file_range support required' -cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+' + +if sys.platform.startswith('freebsd'): + cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs FreeBSD 14+' +else: + cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+' # # These tests are known to fail, thus we use this list to prevent these @@ -268,6 +272,22 @@ if sys.platform.startswith('freebsd'): 'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623], 'resilver/resilver_restart_001': ['FAIL', known_reason], 'snapshot/snapshot_002_pos': ['FAIL', '14831'], + 'bclone/bclone_crossfs_corner_cases': ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_corner_cases_limited': + ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_data': ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_embedded': ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_hole': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_all': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_checksum': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_compress': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_copies': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_recordsize': ['SKIP', cfr_cross_reason], + 'bclone/bclone_prop_sync': ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_cross_enc_dataset': + ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_copyfilerange_cross_dataset': + ['SKIP', cfr_cross_reason] }) elif sys.platform.startswith('linux'): 
maybe.update({ diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index b4d2b91dd4..dfab48d2cd 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -61,13 +61,8 @@ function compare_version_gte [ "$(printf "$1\n$2" | sort -V | tail -n1)" = "$1" ] } -# Linux kernel version comparison function -# -# $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version -# -# Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ] -# -function linux_version +# Helper function used by linux_version() and freebsd_version() +function kernel_version { typeset ver="$1" @@ -83,6 +78,24 @@ function linux_version echo $((version * 100000 + major * 1000 + minor)) } +# Linux kernel version comparison function +# +# $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version +# +# Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ] +function linux_version { + kernel_version "$1" +} + +# FreeBSD version comparison function +# +# $1 FreeBSD version ("13.2", "14.0") or blank for installed FreeBSD version +# +# Used for comparison: if [ $(freebsd_version) -ge $(freebsd_version "13.2") ] +function freebsd_version { + kernel_version "$1" +} + # Determine if this is a Linux test system # # Return 0 if platform Linux, 1 if otherwise diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib index beba01c0ed..3b8eaea5bb 100644 --- a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib +++ b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib @@ -42,6 +42,12 @@ function verify_crossfs_block_cloning if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" fi + + # Cross dataset block cloning only supported on FreeBSD 14+ + # https://github.com/freebsd/freebsd-src/commit/969071be938c + if is_freebsd && [ $(freebsd_version) -lt $(freebsd_version 14.0) ] ; then + log_unsupported "Cloning across datasets not supported in $(uname -r)" + fi } # Unused. diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh index 43323c207a..ad83d30291 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh @@ -26,12 +26,11 @@ . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib verify_runnable "global" -if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then - log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" -fi +verify_crossfs_block_cloning claim="The copy_file_range syscall can clone across datasets." diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh index 34d3d26925..702e23267f 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh @@ -26,12 +26,11 @@ . $STF_SUITE/include/libtest.shlib . 
$STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
+. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib
 
 verify_runnable "global"
 
-if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then
-	log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3"
-fi
+verify_crossfs_block_cloning
 
 claim="Block cloning across encrypted datasets."

From c0c4866f8a29a38b2bb683c267d7278e0020d90c Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Fri, 15 Dec 2023 12:51:41 -0500
Subject: [PATCH 13/31] dmu: Allow buffer fills to fail

When ZFS overwrites a whole block, it does not bother to read the old
content from disk. This is a good optimization, but if the buffer fill
fails due to a page fault or something else, the buffer ends up
corrupted, neither keeping the old content nor receiving the new one.

On FreeBSD this is additionally complicated by page faults being
blocked by the VFS layer, which always returns EFAULT on an attempt to
write from an mmap()'ed but not yet cached address range.

Normally this is not a big problem, since after the original failure
the VFS will retry the write after reading the required data. The
problem becomes worse in the specific case when somebody tries to
write into a file its own mmap()'ed content from the same location. In
that situation the only copy of the data is corrupted on the page
fault, and the following retries merely preserve that corrupted state.
Block cloning makes this issue easier to reproduce, since it does not
read the old data, unlike a traditional file copy, which may work by
chance.

This patch provides the fill status to dmu_buf_fill_done(), which in
case of error can destroy the corrupted buffer as if no write had
happened. One more complication in the block cloning case is that if
an error is possible during the fill, dmu_buf_will_fill() must read
the data by falling back to dmu_buf_will_dirty(). This is required so
that on error the buffer can be restored to its state after the
cloning, not before it, which is what would happen if we just called
dbuf_undirty().

Reviewed-by: Brian Behlendorf
Reviewed-by: Rob Norris
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
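[Editorial note: a minimal sketch of the revised fill contract, condensed
from the dmu_write_uio_dnode() hunk below. fill_whole_block() is a
hypothetical helper name, not part of this patch, and the snippet assumes
the surrounding ZFS kernel headers.]

    /*
     * Overwrite a whole block from a uio, tolerating a faulting copy-in.
     */
    static int
    fill_whole_block(dmu_buf_t *db, dmu_tx_t *tx, zfs_uio_t *uio)
    {
            offset_t off = zfs_uio_offset(uio);

            /* B_TRUE: the copy-in below may fault, so keep the fill undoable */
            dmu_buf_will_fill(db, tx, B_TRUE);

            int err = zfs_uio_fault_move((char *)db->db_data, db->db_size,
                UIO_WRITE, uio);

            /*
             * On failure dmu_buf_fill_done() reverts the buffer (back to the
             * cloned data, if the block was cloned in this txg) and returns
             * B_TRUE; any partial uio progress is then rewound via a
             * negative advance.
             */
            if (dmu_buf_fill_done(db, tx, err != 0))
                    zfs_uio_advance(uio, off - zfs_uio_offset(uio));

            return (err);
    }
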
Closes #15665 --- include/os/freebsd/spl/sys/uio.h | 2 +- include/os/linux/spl/sys/uio.h | 2 +- include/sys/dbuf.h | 4 ++-- lib/libspl/include/sys/uio.h | 2 +- module/os/freebsd/zfs/dmu_os.c | 4 ++-- module/zfs/dbuf.c | 33 +++++++++++++++++++++++--------- module/zfs/dmu.c | 21 +++++++++----------- module/zfs/dmu_recv.c | 2 +- 8 files changed, 41 insertions(+), 29 deletions(-) diff --git a/include/os/freebsd/spl/sys/uio.h b/include/os/freebsd/spl/sys/uio.h index b71f2f2e56..b9d41903ea 100644 --- a/include/os/freebsd/spl/sys/uio.h +++ b/include/os/freebsd/spl/sys/uio.h @@ -62,7 +62,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) } static inline void -zfs_uio_advance(zfs_uio_t *uio, size_t size) +zfs_uio_advance(zfs_uio_t *uio, ssize_t size) { zfs_uio_resid(uio) -= size; zfs_uio_offset(uio) += size; diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index a4b600004c..5e6ea8d3c2 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -95,7 +95,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) } static inline void -zfs_uio_advance(zfs_uio_t *uio, size_t size) +zfs_uio_advance(zfs_uio_t *uio, ssize_t size) { uio->uio_resid -= size; uio->uio_loffset += size; diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 1800a7e31d..f2a1535c91 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -379,8 +379,8 @@ dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); -void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); -void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail); +boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed); void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, diff --git a/lib/libspl/include/sys/uio.h b/lib/libspl/include/sys/uio.h index e9e21819d4..665bfc4230 100644 --- a/lib/libspl/include/sys/uio.h +++ b/lib/libspl/include/sys/uio.h @@ -90,7 +90,7 @@ zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len) } static inline void -zfs_uio_advance(zfs_uio_t *uio, size_t size) +zfs_uio_advance(zfs_uio_t *uio, ssize_t size) { uio->uio_resid -= size; uio->uio_loffset += size; diff --git a/module/os/freebsd/zfs/dmu_os.c b/module/os/freebsd/zfs/dmu_os.c index a5f486b95d..c33ce01ab3 100644 --- a/module/os/freebsd/zfs/dmu_os.c +++ b/module/os/freebsd/zfs/dmu_os.c @@ -110,7 +110,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); + dmu_buf_will_fill(db, tx, B_FALSE); else dmu_buf_will_dirty(db, tx); @@ -126,7 +126,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, } if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + dmu_buf_fill_done(db, tx, B_FALSE); offset += tocpy; size -= tocpy; diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 255add6cd2..280001bc34 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2734,7 +2734,7 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) } void -dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) +dmu_buf_will_fill(dmu_buf_t 
*db_fake, dmu_tx_t *tx, boolean_t canfail) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; @@ -2752,8 +2752,14 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) * Block cloning: We will be completely overwriting a block * cloned in this transaction group, so let's undirty the * pending clone and mark the block as uncached. This will be - * as if the clone was never done. + * as if the clone was never done. But if the fill can fail + * we should have a way to return back to the cloned data. */ + if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) { + mutex_exit(&db->db_mtx); + dmu_buf_will_dirty(db_fake, tx); + return; + } VERIFY(!dbuf_undirty(db, tx)); db->db_state = DB_UNCACHED; } @@ -2814,32 +2820,41 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) dl->dr_overridden_by.blk_birth = dr->dr_txg; } -void -dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx) +boolean_t +dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed) { (void) tx; dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; - dbuf_states_t old_state; mutex_enter(&db->db_mtx); DBUF_VERIFY(db); - old_state = db->db_state; - db->db_state = DB_CACHED; - if (old_state == DB_FILL) { + if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ memset(db->db.db_data, 0, db->db.db_size); db->db_freed_in_flight = FALSE; + db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "fill done handling freed in flight"); + failed = B_FALSE; + } else if (failed) { + VERIFY(!dbuf_undirty(db, tx)); + db->db_buf = NULL; + dbuf_clear_data(db); + DTRACE_SET_STATE(db, "fill failed"); } else { + db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "fill done"); } cv_broadcast(&db->db_changed); + } else { + db->db_state = DB_CACHED; + failed = B_FALSE; } mutex_exit(&db->db_mtx); + return (failed); } void @@ -2984,7 +2999,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) DTRACE_SET_STATE(db, "filling assigned arcbuf"); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); - dmu_buf_fill_done(&db->db, tx); + dmu_buf_fill_done(&db->db, tx, B_FALSE); } void diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 909605aa26..3215ab1c2a 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1115,14 +1115,14 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); + dmu_buf_will_fill(db, tx, B_FALSE); else dmu_buf_will_dirty(db, tx); (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + dmu_buf_fill_done(db, tx, B_FALSE); offset += tocpy; size -= tocpy; @@ -1330,27 +1330,24 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) ASSERT(size > 0); - bufoff = zfs_uio_offset(uio) - db->db_offset; + offset_t off = zfs_uio_offset(uio); + bufoff = off - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); + dmu_buf_will_fill(db, tx, B_TRUE); else dmu_buf_will_dirty(db, tx); - /* - * XXX zfs_uiomove could block forever (eg.nfs-backed - * pages). There needs to be a uiolockdown() function - * to lock the pages in memory, so that zfs_uiomove won't - * block. 
- */ err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) { + /* The fill was reverted. Undo any uio progress. */ + zfs_uio_advance(uio, off - zfs_uio_offset(uio)); + } if (err) break; diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 05ca91717c..54aa60259e 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -2532,7 +2532,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, * size of the provided arc_buf_t. */ if (db_spill->db_size != drrs->drr_length) { - dmu_buf_will_fill(db_spill, tx); + dmu_buf_will_fill(db_spill, tx, B_FALSE); VERIFY0(dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); } From c883088df83ced3a2b8b38e6d89a5e63fb153ee4 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 30 Jan 2024 13:34:05 -0800 Subject: [PATCH 14/31] Tag zfs-2.2.3 META file and changelog updated. Signed-off-by: Tony Hutter --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 05337a9c50..d64414e322 100644 --- a/META +++ b/META @@ -1,7 +1,7 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.2.2 +Version: 2.2.3 Release: 1 Release-Tags: relext License: CDDL From 58211157bf866bbcdd8720e92c27297db3ba75d6 Mon Sep 17 00:00:00 2001 From: Rob N Date: Thu, 21 Mar 2024 10:46:15 +1100 Subject: [PATCH 15/31] Linux 6.8 compat: use splice_copy_file_range() for fallback Linux 6.8 removes generic_copy_file_range(), which had been reduced to a simple wrapper around splice_copy_file_range(). Detect that function directly and use it if generic_ is not available. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Tony Hutter Reviewed by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15930 Closes #15931 (cherry picked from commit ef08a4d4065d21414d7fedccac20da6bfda4dfd0) --- config/kernel-vfs-file_range.m4 | 27 +++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/zfs/zpl_file_range.c | 16 ++++++++++++++-- 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/config/kernel-vfs-file_range.m4 b/config/kernel-vfs-file_range.m4 index cc96404d8b..8a5cbe2eee 100644 --- a/config/kernel-vfs-file_range.m4 +++ b/config/kernel-vfs-file_range.m4 @@ -16,6 +16,9 @@ dnl # dnl # 5.3: VFS copy_file_range() expected to do its own fallback, dnl # generic_copy_file_range() added to support it dnl # +dnl # 6.8: generic_copy_file_range() removed, replaced by +dnl # splice_copy_file_range() +dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [ ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [ #include @@ -72,6 +75,30 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [ ]) ]) +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE], [ + ZFS_LINUX_TEST_SRC([splice_copy_file_range], [ + #include + ], [ + struct file *src_file __attribute__ ((unused)) = NULL; + loff_t src_off __attribute__ ((unused)) = 0; + struct file *dst_file __attribute__ ((unused)) = NULL; + loff_t dst_off __attribute__ ((unused)) = 0; + size_t len __attribute__ ((unused)) = 0; + splice_copy_file_range(src_file, src_off, dst_file, dst_off, + len); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE], [ + AC_MSG_CHECKING([whether splice_copy_file_range() is available]) + ZFS_LINUX_TEST_RESULT([splice_copy_file_range], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_VFS_SPLICE_COPY_FILE_RANGE, 1, + [splice_copy_file_range() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + 
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [ ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [ #include diff --git a/config/kernel.m4 b/config/kernel.m4 index e3f8645774..1d0c5a27fc 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -118,6 +118,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_VFS_IOV_ITER ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE + ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE @@ -266,6 +267,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_VFS_IOV_ITER ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE + ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 3065d54fa9..64728fdb11 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -26,6 +26,9 @@ #include #endif #include +#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE +#include +#endif #include #include #include @@ -102,7 +105,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, ret = zpl_clone_file_range_impl(src_file, src_off, dst_file, dst_off, len); -#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE +#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE) /* * Since Linux 5.3 the filesystem driver is responsible for executing * an appropriate fallback, and a generic fallback function is provided. @@ -111,6 +114,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, ret == -EAGAIN) ret = generic_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); +#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE) + /* + * Since 6.8 the fallback function is called splice_copy_file_range + * and has a slightly different signature. + */ + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV || + ret == -EAGAIN) + ret = splice_copy_file_range(src_file, src_off, dst_file, + dst_off, len); #else /* * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal @@ -118,7 +130,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, */ if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN) ret = -EOPNOTSUPP; -#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ +#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */ return (ret); } From 220bb7341eb4b41f017cec8f492e0daf18660d4e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 13 Nov 2023 17:55:29 +1100 Subject: [PATCH 16/31] linux 5.4 compat: page_size() Before 5.4 we have to do a little math. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
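[Editorial note: a hedged sketch of the "little math" mentioned above.
compat_page_size() is an illustrative name; it mirrors the page_size()
fallback macro added in mm_compat.h later in this patch.]

    #include <linux/mm.h>

    /*
     * Pre-5.4 fallback: without page_size(), the size of a (possibly
     * compound) page is PAGE_SIZE scaled by its compound order.
     */
    static inline unsigned long
    compat_page_size(struct page *p)
    {
            /* 4K for order 0, 8K for order 1, 16K for order 2, ... */
            return ((unsigned long)(PAGE_SIZE << compound_order(p)));
    }
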
Closes #15533 Closes #15588 (cherry picked from commit df04efe321a49c650f1fbaa6fd701fa2928cbe21) --- config/kernel-mm-page-size.m4 | 17 +++++++++++ config/kernel.m4 | 2 ++ include/os/linux/Makefile.am | 1 + include/os/linux/kernel/linux/mm_compat.h | 36 +++++++++++++++++++++++ 4 files changed, 56 insertions(+) create mode 100644 config/kernel-mm-page-size.m4 create mode 100644 include/os/linux/kernel/linux/mm_compat.h diff --git a/config/kernel-mm-page-size.m4 b/config/kernel-mm-page-size.m4 new file mode 100644 index 0000000000..d5ebd92698 --- /dev/null +++ b/config/kernel-mm-page-size.m4 @@ -0,0 +1,17 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [ + ZFS_LINUX_TEST_SRC([page_size], [ + #include + ],[ + unsigned long s; + s = page_size(NULL); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [ + AC_MSG_CHECKING([whether page_size() is available]) + ZFS_LINUX_TEST_RESULT([page_size], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 1d0c5a27fc..548905ccd0 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -167,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ ZFS_AC_KERNEL_SRC_SYNC_BDEV + ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -316,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_COPY_SPLICE_READ ZFS_AC_KERNEL_SYNC_BDEV + ZFS_AC_KERNEL_MM_PAGE_SIZE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am index 3830d198df..51c27132b4 100644 --- a/include/os/linux/Makefile.am +++ b/include/os/linux/Makefile.am @@ -5,6 +5,7 @@ kernel_linux_HEADERS = \ %D%/kernel/linux/compiler_compat.h \ %D%/kernel/linux/dcache_compat.h \ %D%/kernel/linux/kmap_compat.h \ + %D%/kernel/linux/mm_compat.h \ %D%/kernel/linux/mod_compat.h \ %D%/kernel/linux/page_compat.h \ %D%/kernel/linux/percpu_compat.h \ diff --git a/include/os/linux/kernel/linux/mm_compat.h b/include/os/linux/kernel/linux/mm_compat.h new file mode 100644 index 0000000000..40056c68d6 --- /dev/null +++ b/include/os/linux/kernel/linux/mm_compat.h @@ -0,0 +1,36 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2023, 2024, Klara Inc. + */ + +#ifndef _ZFS_MM_COMPAT_H +#define _ZFS_MM_COMPAT_H + +#include + +/* 5.4 introduced page_size(). 
Older kernels can use a trivial macro instead */ +#ifndef HAVE_MM_PAGE_SIZE +#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p))) +#endif + +#endif /* _ZFS_MM_COMPAT_H */ From 52a2af6fd16470dd4ba8f7a1def459b43081ae5b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 11 Dec 2023 16:05:54 +1100 Subject: [PATCH 17/31] abd: add page iterator The regular ABD iterators yield data buffers, so they have to map and unmap pages into kernel memory. If the caller only wants to count chunks, or can use page pointers directly, then the map/unmap is just unnecessary overhead. This adds adb_iterate_page_func, which yields unmapped struct page instead. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 (cherry picked from commit 390b448726c580999dd337be7a40b0e95cf1d50b) --- include/sys/abd.h | 7 +++ include/sys/abd_impl.h | 26 ++++++++- module/os/freebsd/zfs/abd_os.c | 4 +- module/os/linux/zfs/abd_os.c | 104 ++++++++++++++++++++++++++++++--- module/zfs/abd.c | 42 +++++++++++++ 5 files changed, 169 insertions(+), 14 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index 750f9986c1..8a2df0bca9 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -79,6 +79,9 @@ typedef struct abd { typedef int abd_iter_func_t(void *buf, size_t len, void *priv); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); +#if defined(__linux__) && defined(_KERNEL) +typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); +#endif extern int zfs_abd_scatter_enabled; @@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *); int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, abd_iter_func2_t *, void *); +#if defined(__linux__) && defined(_KERNEL) +int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, + void *); +#endif void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index 40546d4af1..f88ea25e24 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. 
*/ #ifndef _ABD_IMPL_H @@ -38,12 +39,30 @@ typedef enum abd_stats_op { ABDSTAT_DECR /* Decrease abdstat values */ } abd_stats_op_t; -struct scatterlist; /* forward declaration */ +/* forward declarations */ +struct scatterlist; +struct page; struct abd_iter { /* public interface */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ + union { + /* for abd_iter_map()/abd_iter_unmap() */ + struct { + /* addr corresponding to iter_pos */ + void *iter_mapaddr; + /* length of data valid at mapaddr */ + size_t iter_mapsize; + }; + /* for abd_iter_page() */ + struct { + /* current page */ + struct page *iter_page; + /* offset of data in page */ + size_t iter_page_doff; + /* size of data in page */ + size_t iter_page_dsize; + }; + }; /* private */ abd_t *iter_abd; /* ABD being iterated through */ @@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *); void abd_iter_advance(struct abd_iter *, size_t); void abd_iter_map(struct abd_iter *); void abd_iter_unmap(struct abd_iter *); +void abd_iter_page(struct abd_iter *); /* * Helper macros diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index 58a37df62b..3b812271f9 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_pos = 0; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; } /* diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 24390fbbf1..dae1280121 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ /* @@ -59,6 +60,7 @@ #include #ifdef _KERNEL #include +#include #include #endif @@ -895,14 +897,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; - aiter->iter_pos = 0; - if (abd_is_linear(abd)) { - aiter->iter_offset = 0; - aiter->iter_sg = NULL; - } else { + if (!abd_is_linear(abd)) { aiter->iter_offset = ABD_SCATTER(abd).abd_offset; aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; } @@ -915,6 +912,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) boolean_t abd_iter_at_end(struct abd_iter *aiter) { + ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); return (aiter->iter_pos == aiter->iter_abd->abd_size); } @@ -926,8 +924,15 @@ abd_iter_at_end(struct abd_iter *aiter) void abd_iter_advance(struct abd_iter *aiter, size_t amount) { + /* + * Ensure that last chunk is not in use. abd_iterate_*() must clear + * this state (directly or abd_iter_unmap()) before advancing. + */ ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); + ASSERT3P(aiter->iter_page, ==, NULL); + ASSERT0(aiter->iter_page_doff); + ASSERT0(aiter->iter_page_dsize); /* There's nothing left to advance to, so do nothing */ if (abd_iter_at_end(aiter)) @@ -1009,6 +1014,88 @@ abd_cache_reap_now(void) } #if defined(_KERNEL) +/* + * Yield the next page struct and data offset and size within it, without + * mapping it into the address space. 
+ */ +void +abd_iter_page(struct abd_iter *aiter) +{ + if (abd_iter_at_end(aiter)) { + aiter->iter_page = NULL; + aiter->iter_page_doff = 0; + aiter->iter_page_dsize = 0; + return; + } + + struct page *page; + size_t doff, dsize; + + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + + /* memory address at iter_pos */ + void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos; + + /* struct page for address */ + page = is_vmalloc_addr(paddr) ? + vmalloc_to_page(paddr) : virt_to_page(paddr); + + /* offset of address within the page */ + doff = offset_in_page(paddr); + + /* total data remaining in abd from this position */ + dsize = aiter->iter_abd->abd_size - aiter->iter_offset; + } else { + ASSERT(!abd_is_gang(aiter->iter_abd)); + + /* current scatter page */ + page = sg_page(aiter->iter_sg); + + /* position within page */ + doff = aiter->iter_offset; + + /* remaining data in scatterlist */ + dsize = MIN(aiter->iter_sg->length - aiter->iter_offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + } + ASSERT(page); + + if (PageTail(page)) { + /* + * This page is part of a "compound page", which is a group of + * pages that can be referenced from a single struct page *. + * Its organised as a "head" page, followed by a series of + * "tail" pages. + * + * In OpenZFS, compound pages are allocated using the + * __GFP_COMP flag, which we get from scatter ABDs and SPL + * vmalloc slabs (ie >16K allocations). So a great many of the + * IO buffers we get are going to be of this type. + * + * The tail pages are just regular PAGE_SIZE pages, and can be + * safely used as-is. However, the head page has length + * covering itself and all the tail pages. If this ABD chunk + * spans multiple pages, then we can use the head page and a + * >PAGE_SIZE length, which is far more efficient. + * + * To do this, we need to adjust the offset to be counted from + * the head page. struct page for compound pages are stored + * contiguously, so we can just adjust by a simple offset. + */ + struct page *head = compound_head(page); + doff += ((page - head) * PAGESIZE); + page = head; + } + + /* final page and position within it */ + aiter->iter_page = page; + aiter->iter_page_doff = doff; + + /* amount of data in the chunk, up to the end of the page */ + aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); +} + /* * bio_nr_pages for ABD. 
* @off is the offset in @abd @@ -1163,4 +1250,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size, module_param(zfs_abd_scatter_max_order, uint, 0644); MODULE_PARM_DESC(zfs_abd_scatter_max_order, "Maximum order allocation used for a scatter ABD."); -#endif + +#endif /* _KERNEL */ diff --git a/module/zfs/abd.c b/module/zfs/abd.c index d982f201c9..3388e23573 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, return (ret); } +#if defined(__linux__) && defined(_KERNEL) +int +abd_iterate_page_func(abd_t *abd, size_t off, size_t size, + abd_iter_page_func_t *func, void *private) +{ + struct abd_iter aiter; + int ret = 0; + + if (size == 0) + return (0); + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); + + while (size > 0) { + IMPLY(abd_is_gang(abd), c_abd != NULL); + + abd_iter_page(&aiter); + + size_t len = MIN(aiter.iter_page_dsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_page, aiter.iter_page_doff, + len, private); + + aiter.iter_page = NULL; + aiter.iter_page_doff = 0; + aiter.iter_page_dsize = 0; + + if (ret != 0) + break; + + size -= len; + c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); + } + + return (ret); +} +#endif + struct buf_arg { void *arg_buf; }; From 4820185031d674cba29e95792f4b0f2c4fa576ff Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 9 Jan 2024 12:12:56 +1100 Subject: [PATCH 18/31] vdev_disk: rename existing functions to vdev_classic_* This is just renaming the existing functions we're about to replace and grouping them together to make the next commits easier to follow. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 (cherry picked from commit f3b85d706bae82957d2e3e0ef1d53a1cfab60eb4) --- include/sys/abd.h | 2 + module/os/linux/zfs/abd_os.c | 5 + module/os/linux/zfs/vdev_disk.c | 215 +++++++++++++++++--------------- 3 files changed, 120 insertions(+), 102 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index 8a2df0bca9..bee38b831b 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -220,6 +220,8 @@ void abd_fini(void); /* * Linux ABD bio functions + * Note: these are only needed to support vdev_classic. See comment in + * vdev_disk.c. */ #if defined(__linux__) && defined(_KERNEL) unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index dae1280121..3fe01c0b7d 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -1096,6 +1096,11 @@ abd_iter_page(struct abd_iter *aiter) aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); } +/* + * Note: ABD BIO functions only needed to support vdev_classic. See comments in + * vdev_disk.c. + */ + /* * bio_nr_pages for ABD. * @off is the offset in @abd diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index b0bda5fa20..957619b87a 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -83,17 +83,6 @@ static uint_t zfs_vdev_open_timeout_ms = 1000; */ #define EFI_MIN_RESV_SIZE (16 * 1024) -/* - * Virtual device vector for disks. 
- */ -typedef struct dio_request { - zio_t *dr_zio; /* Parent ZIO */ - atomic_t dr_ref; /* References */ - int dr_error; /* Bio error */ - int dr_bio_count; /* Count of bio's */ - struct bio *dr_bio[]; /* Attached bio's */ -} dio_request_t; - /* * BIO request failfast mask. */ @@ -467,85 +456,6 @@ vdev_disk_close(vdev_t *v) v->vdev_tsd = NULL; } -static dio_request_t * -vdev_disk_dio_alloc(int bio_count) -{ - dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + - sizeof (struct bio *) * bio_count, KM_SLEEP); - atomic_set(&dr->dr_ref, 0); - dr->dr_bio_count = bio_count; - dr->dr_error = 0; - - for (int i = 0; i < dr->dr_bio_count; i++) - dr->dr_bio[i] = NULL; - - return (dr); -} - -static void -vdev_disk_dio_free(dio_request_t *dr) -{ - int i; - - for (i = 0; i < dr->dr_bio_count; i++) - if (dr->dr_bio[i]) - bio_put(dr->dr_bio[i]); - - kmem_free(dr, sizeof (dio_request_t) + - sizeof (struct bio *) * dr->dr_bio_count); -} - -static void -vdev_disk_dio_get(dio_request_t *dr) -{ - atomic_inc(&dr->dr_ref); -} - -static void -vdev_disk_dio_put(dio_request_t *dr) -{ - int rc = atomic_dec_return(&dr->dr_ref); - - /* - * Free the dio_request when the last reference is dropped and - * ensure zio_interpret is called only once with the correct zio - */ - if (rc == 0) { - zio_t *zio = dr->dr_zio; - int error = dr->dr_error; - - vdev_disk_dio_free(dr); - - if (zio) { - zio->io_error = error; - ASSERT3S(zio->io_error, >=, 0); - if (zio->io_error) - vdev_disk_error(zio); - - zio_delay_interrupt(zio); - } - } -} - -BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) -{ - dio_request_t *dr = bio->bi_private; - - if (dr->dr_error == 0) { -#ifdef HAVE_1ARG_BIO_END_IO_T - dr->dr_error = BIO_END_IO_ERROR(bio); -#else - if (error) - dr->dr_error = -(error); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - dr->dr_error = EIO; -#endif - } - - /* Drop reference acquired by __vdev_disk_physio */ - vdev_disk_dio_put(dr); -} - static inline void vdev_submit_bio_impl(struct bio *bio) { @@ -697,8 +607,107 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, return (bio); } +/* ========== */ + +/* + * This is the classic, battle-tested BIO submission code. + * + * These functions have been renamed to vdev_classic_* to make it clear what + * they belong to, but their implementations are unchanged. + */ + +/* + * Virtual device vector for disks. 
+ */ +typedef struct dio_request { + zio_t *dr_zio; /* Parent ZIO */ + atomic_t dr_ref; /* References */ + int dr_error; /* Bio error */ + int dr_bio_count; /* Count of bio's */ + struct bio *dr_bio[]; /* Attached bio's */ +} dio_request_t; + +static dio_request_t * +vdev_classic_dio_alloc(int bio_count) +{ + dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + + sizeof (struct bio *) * bio_count, KM_SLEEP); + atomic_set(&dr->dr_ref, 0); + dr->dr_bio_count = bio_count; + dr->dr_error = 0; + + for (int i = 0; i < dr->dr_bio_count; i++) + dr->dr_bio[i] = NULL; + + return (dr); +} + +static void +vdev_classic_dio_free(dio_request_t *dr) +{ + int i; + + for (i = 0; i < dr->dr_bio_count; i++) + if (dr->dr_bio[i]) + bio_put(dr->dr_bio[i]); + + kmem_free(dr, sizeof (dio_request_t) + + sizeof (struct bio *) * dr->dr_bio_count); +} + +static void +vdev_classic_dio_get(dio_request_t *dr) +{ + atomic_inc(&dr->dr_ref); +} + +static void +vdev_classic_dio_put(dio_request_t *dr) +{ + int rc = atomic_dec_return(&dr->dr_ref); + + /* + * Free the dio_request when the last reference is dropped and + * ensure zio_interpret is called only once with the correct zio + */ + if (rc == 0) { + zio_t *zio = dr->dr_zio; + int error = dr->dr_error; + + vdev_classic_dio_free(dr); + + if (zio) { + zio->io_error = error; + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + + zio_delay_interrupt(zio); + } + } +} + +BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error) +{ + dio_request_t *dr = bio->bi_private; + + if (dr->dr_error == 0) { +#ifdef HAVE_1ARG_BIO_END_IO_T + dr->dr_error = BIO_END_IO_ERROR(bio); +#else + if (error) + dr->dr_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + dr->dr_error = EIO; +#endif + } + + /* Drop reference acquired by vdev_classic_physio */ + vdev_classic_dio_put(dr); +} + static inline unsigned int -vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) +vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) { unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, bio_size, abd_offset); @@ -711,7 +720,7 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) } static int -__vdev_disk_physio(struct block_device *bdev, zio_t *zio, +vdev_classic_physio(struct block_device *bdev, zio_t *zio, size_t io_size, uint64_t io_offset, int rw, int flags) { dio_request_t *dr; @@ -736,7 +745,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, } retry: - dr = vdev_disk_dio_alloc(bio_count); + dr = vdev_classic_dio_alloc(bio_count); if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && zio->io_vd->vdev_failfast == B_TRUE) { @@ -771,23 +780,23 @@ retry: * this should be rare - see the comment above. 
*/ if (dr->dr_bio_count == i) { - vdev_disk_dio_free(dr); + vdev_classic_dio_free(dr); bio_count *= 2; goto retry; } - nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset); + nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset); dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); if (unlikely(dr->dr_bio[i] == NULL)) { - vdev_disk_dio_free(dr); + vdev_classic_dio_free(dr); return (SET_ERROR(ENOMEM)); } - /* Matching put called by vdev_disk_physio_completion */ - vdev_disk_dio_get(dr); + /* Matching put called by vdev_classic_physio_completion */ + vdev_classic_dio_get(dr); BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; - dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; + dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion; dr->dr_bio[i]->bi_private = dr; bio_set_op_attrs(dr->dr_bio[i], rw, flags); @@ -801,7 +810,7 @@ retry: } /* Extra reference to protect dio_request during vdev_submit_bio */ - vdev_disk_dio_get(dr); + vdev_classic_dio_get(dr); if (dr->dr_bio_count > 1) blk_start_plug(&plug); @@ -815,11 +824,13 @@ retry: if (dr->dr_bio_count > 1) blk_finish_plug(&plug); - vdev_disk_dio_put(dr); + vdev_classic_dio_put(dr); return (error); } +/* ========== */ + BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) { zio_t *zio = bio->bi_private; @@ -1023,7 +1034,7 @@ vdev_disk_io_start(zio_t *zio) } zio->io_target_timestamp = zio_handle_io_delay(zio); - error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio, + error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio, zio->io_size, zio->io_offset, rw, 0); rw_exit(&vd->vd_lock); From 13b5348848b17aedc3c47017394ce3d7148be4b7 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 9 Jan 2024 12:23:30 +1100 Subject: [PATCH 19/31] vdev_disk: reorganise vdev_disk_io_start Light reshuffle to make it a bit more linear to read and get rid of a bunch of args that aren't needed in all cases. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 (cherry picked from commit 867178ae1db28e73051c8a7ce662f2f2f81cd8e6) --- module/os/linux/zfs/vdev_disk.c | 51 ++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 957619b87a..51e7cef2fc 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -720,9 +720,16 @@ vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) } static int -vdev_classic_physio(struct block_device *bdev, zio_t *zio, - size_t io_size, uint64_t io_offset, int rw, int flags) +vdev_classic_physio(zio_t *zio) { + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + struct block_device *bdev = BDH_BDEV(vd->vd_bdh); + size_t io_size = zio->io_size; + uint64_t io_offset = zio->io_offset; + int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE; + int flags = 0; + dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; @@ -944,7 +951,7 @@ vdev_disk_io_start(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; - int rw, error; + int error; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. 
@@ -1007,13 +1014,6 @@ vdev_disk_io_start(zio_t *zio) rw_exit(&vd->vd_lock); zio_execute(zio); return; - case ZIO_TYPE_WRITE: - rw = WRITE; - break; - - case ZIO_TYPE_READ: - rw = READ; - break; case ZIO_TYPE_TRIM: zio->io_error = vdev_disk_io_trim(zio); @@ -1026,23 +1026,34 @@ vdev_disk_io_start(zio_t *zio) #endif return; + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + zio->io_target_timestamp = zio_handle_io_delay(zio); + error = vdev_classic_physio(zio); + rw_exit(&vd->vd_lock); + if (error) { + zio->io_error = error; + zio_interrupt(zio); + } + return; + default: + /* + * Getting here means our parent vdev has made a very strange + * request of us, and shouldn't happen. Assert here to force a + * crash in dev builds, but in production return the IO + * unhandled. The pool will likely suspend anyway but that's + * nicer than crashing the kernel. + */ + ASSERT3S(zio->io_type, ==, -1); + rw_exit(&vd->vd_lock); zio->io_error = SET_ERROR(ENOTSUP); zio_interrupt(zio); return; } - zio->io_target_timestamp = zio_handle_io_delay(zio); - error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio, - zio->io_size, zio->io_offset, rw, 0); - rw_exit(&vd->vd_lock); - - if (error) { - zio->io_error = error; - zio_interrupt(zio); - return; - } + __builtin_unreachable(); } static void From 03ff875e09c6578941607581e9f08658e191bbd9 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 9 Jan 2024 12:29:19 +1100 Subject: [PATCH 20/31] vdev_disk: make read/write IO function configurable This is just setting up for the next couple of commits, which will add a new IO function and a parameter to select it. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 (cherry picked from commit c4a13ba483f08a81aa47479d2f763a470d95b2b0) --- module/os/linux/zfs/vdev_disk.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 51e7cef2fc..de4dba72fa 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -946,6 +946,8 @@ vdev_disk_io_trim(zio_t *zio) #endif } +int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; + static void vdev_disk_io_start(zio_t *zio) { @@ -1029,7 +1031,7 @@ vdev_disk_io_start(zio_t *zio) case ZIO_TYPE_READ: case ZIO_TYPE_WRITE: zio->io_target_timestamp = zio_handle_io_delay(zio); - error = vdev_classic_physio(zio); + error = vdev_disk_io_rw_fn(zio); rw_exit(&vd->vd_lock); if (error) { zio->io_error = error; @@ -1102,8 +1104,25 @@ vdev_disk_rele(vdev_t *vd) /* XXX: Implement me as a vnode rele for the device */ } +/* + * At first use vdev use, set the submission function from the default value if + * it hasn't been set already. + */ +static int +vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + (void) spa; + (void) nv; + (void) tsd; + + if (vdev_disk_io_rw_fn == NULL) + vdev_disk_io_rw_fn = vdev_classic_physio; + + return (0); +} + vdev_ops_t vdev_disk_ops = { - .vdev_op_init = NULL, + .vdev_op_init = vdev_disk_init, .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, From 51c2bd0def6462b489f5d4e8ac1b9d6ab854f47b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 18 Jul 2023 11:11:29 +1000 Subject: [PATCH 21/31] vdev_disk: rewrite BIO filling machinery to avoid split pages This commit tackles a number of issues in the way BIOs (`struct bio`) are constructed for submission to the Linux block layer. 
The kernel has a hard upper limit on the number of pages/segments that
can be added to a BIO, as well as a separate limit for each device
(related to its queue depth and other scheduling characteristics).

ZFS counts the number of memory pages in the request ABD
(`abd_nr_pages_off()`), and then uses that as the number of segments to
put into the BIO, up to the hard upper limit. If it requires more than
the limit, it will create multiple BIOs.

Leaving aside the fact that the page count method is wrong (see below),
not limiting to the device segment max means that the device driver
will need to split the BIO in half. This alone is not necessarily a
problem, but it interacts with another issue to cause a much larger
problem.

The kernel function to add a segment to a BIO (`bio_add_page()`) takes
a `struct page` pointer, and offset+len within it. A `struct page` can
represent a run of contiguous memory pages (known as a "compound
page"). It can be of arbitrary length.

The ZFS functions that count ABD pages and load them into the BIO
(`abd_nr_pages_off()`, `bio_map()` and `abd_bio_map_off()`) will never
consider a page to be more than `PAGE_SIZE` (4K), even if the `struct
page` is for multiple pages. In this case, it will load the same
`struct page` into the BIO multiple times, with the offset adjusted
each time.

With a sufficiently large ABD, this can easily lead to the BIO being
entirely filled much earlier than it could have been. This also further
contributes to the problem caused by the incorrect segment limit
calculation, as it's much easier to go past the device limit, and so
require a split. Again, this is not a problem on its own.

The logic for "never submit more than `PAGE_SIZE`" is actually a little
more subtle. It will actually never submit a buffer that crosses a 4K
page boundary.

In practice, this is fine, as most ABDs are scattered, that is, a list
of complete 4K pages, and so are loaded in as such.

Linear ABDs are typically allocated from slabs, and for small sizes
they are frequently not aligned to page boundaries. For example, a 12K
allocation can span four pages, eg:

     -- 4K -- -- 4K -- -- 4K -- -- 4K --
    |        |        |        |        |
       :## ######## ######## ######:        [1K, 4K, 4K, 3K]

Such an allocation would be loaded into a BIO as you see:

    [1K, 4K, 4K, 3K]

This tends not to be a problem in practice, because even if the BIO
were filled and needed to be split, each half would still have either a
start or end aligned to the logical block size of the device (assuming
4K at least).

---

In ideal circumstances, these shortcomings don't cause any particular
problems. It's when they start to interact with other ZFS features that
things get interesting.

Aggregation will create a "gang" ABD, which is simply a list of other
ABDs. Iterating over a gang ABD is just iterating over each ABD within
it in turn. Because the segments are simply loaded in order, we can end
up with uneven segments on either side of the "gap" between the two
ABDs. For example, two 12K ABDs might be aggregated and then loaded as:

    [1K, 4K, 4K, 3K, 2K, 4K, 4K, 2K]

Should a split occur, each individual BIO can end up either having a
start or end offset that is not aligned to the logical block size,
which some drivers (eg SCSI) will reject. However, this tends not to
happen because the default aggregation limit usually keeps the BIO
small enough to not require more than one split, and most pages are
actually full 4K pages, so hitting an uneven gap is very rare anyway.
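[Editorial note: a standalone, illustrative version of the alignment rule
just described. The real check in this patch is the page-at-a-time
callback vdev_disk_check_pages_cb() further below; segs_split_safely()
and its flattened offset/length arrays are hypothetical, shown only to
make the rule concrete.]

    /*
     * A run of segments can be split anywhere without producing a
     * misaligned BIO if, for logical block size lbs:
     *   - every segment after the first starts on an lbs boundary, and
     *   - only the final segment ends short of an lbs boundary.
     * The [1K, 4K, 4K, 3K] run above fails on a 4K-lbs device because
     * the 1K head shifts every later split point off alignment.
     */
    static int
    segs_split_safely(const size_t *off, const size_t *len, int n,
        size_t lbs)
    {
            size_t bmask = lbs - 1;
            size_t end = 0;         /* set once a segment ends mid-block */

            for (int i = 0; i < n; i++) {
                    if (end != 0)
                            return (0);     /* interior gap: unsafe */
                    if (i != 0 && (off[i] & bmask) != 0)
                            return (0);     /* unaligned interior start */
                    end = len[i] & bmask;   /* short tail ok only if last */
            }
            return (1);
    }
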
If the pool is under particular memory pressure, then an IO can be
broken down into a "gang block", a 512-byte block composed of a header
and up to three block pointers. Each points to a fragment of the
original write, or in turn, another gang block, breaking the original
data up over and over until space can be found in the pool for each of
them.

Each gang header is a separate 512-byte memory allocation from a slab,
which needs to be written down to disk. When the gang header is added
to the BIO, it's a single 512-byte segment.

Pulling all this together, consider a large aggregated write of gang
blocks. This results in a BIO containing lots of 512-byte segments.
Given our tendency to overfill the BIO, a split is likely, and most
possible split points will yield a pair of BIOs that are misaligned.
Drivers that care, like the SCSI driver, will reject them.

---

This commit is a substantial refactor and rewrite of much of
`vdev_disk` to sort all this out.

`vdev_bio_max_segs()` now returns the ideal maximum size for the
device, if available. There's also a tuneable `zfs_vdev_disk_max_segs`
to override this, to assist with testing.

We scan the ABD up front to count the number of pages within it, and to
confirm that if we submitted all those pages to one or more BIOs, it
could be split at any point without creating a misaligned BIO. If the
pages in the BIO are not usable (as in any of the above situations),
the ABD is linearised, and then checked again. This is the same
technique used in `vdev_geom` on FreeBSD, adjusted for Linux's variable
page size and allocator quirks.

`vbio_t` is a cleanup and enhancement of the old `dio_request_t`. The
idea is simply that it can hold all the state needed to create, submit
and return multiple BIOs, including all the refcounts, the ABD copy if
it was needed, and so on. Apart from what I hope is a clearer
interface, the major difference is that because we know how many BIOs
we'll need up front, we don't need the old overflow logic that would
grow the BIO array, throw away all the old work and restart. We can get
it right from the start.

Reviewed-by: Alexander Motin
Reviewed-by: Brian Behlendorf
Signed-off-by: Rob Norris
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
(cherry picked from commit 06a196020e6f70d2fedbd4d0d05bbe0c1ac6e4d8)
---
 include/os/linux/kernel/linux/mod_compat.h |   1 +
 man/man4/zfs.4                             |  10 +-
 module/os/linux/zfs/vdev_disk.c            | 439 ++++++++++++++++++++-
 3 files changed, 447 insertions(+), 3 deletions(-)

diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h
index 8e20a96135..039865b703 100644
--- a/include/os/linux/kernel/linux/mod_compat.h
+++ b/include/os/linux/kernel/linux/mod_compat.h
@@ -68,6 +68,7 @@ enum scope_prefix_types {
 	zfs_trim,
 	zfs_txg,
 	zfs_vdev,
+	zfs_vdev_disk,
 	zfs_vdev_file,
 	zfs_vdev_mirror,
 	zfs_vnops,
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 352990e02d..b5679f2f07 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -2,6 +2,7 @@
 .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved.
 .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
 .\" Copyright (c) 2019 Datto Inc.
+.\" Copyright (c) 2023, 2024 Klara, Inc.
 .\" The contents of this file are subject to the terms of the Common Development
 .\" and Distribution License (the "License"). You may not use this file except
 .\" in compliance with the License. You can obtain a copy of the license at
You can obtain a copy of the license at @@ -15,7 +16,7 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd July 21, 2023 +.Dd January 9, 2024 .Dt ZFS 4 .Os . @@ -1345,6 +1346,13 @@ _ 4 Driver No driver retries on driver errors. .TE . +.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint +Maximum number of segments to add to a BIO (min 4). +If this is higher than the maximum allowed by the device queue or the kernel +itself, it will be clamped. +Setting it to zero will cause the kernel's ideal size to be used. +This parameter only applies on Linux. +. .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int Time before expiring .Pa .zfs/snapshot . diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index de4dba72fa..0ccb9ad96f 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -24,6 +24,7 @@ * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ #include @@ -66,6 +67,13 @@ typedef struct vdev_disk { krwlock_t vd_lock; } vdev_disk_t; +/* + * Maximum number of segments to add to a bio (min 4). If this is higher than + * the maximum allowed by the device queue or the kernel itself, it will be + * clamped. Setting it to zero will cause the kernel's ideal size to be used. + */ +uint_t zfs_vdev_disk_max_segs = 0; + /* * Unique identifier for the exclusive vdev holder. */ @@ -607,10 +615,433 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, return (bio); } +static inline uint_t +vdev_bio_max_segs(struct block_device *bdev) +{ + /* + * Smallest of the device max segs and the tuneable max segs. Minimum + * 4, so there's room to finish split pages if they come up. + */ + const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev)); + const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ? + MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs; + const uint_t max_segs = MIN(tune_max_segs, dev_max_segs); + +#ifdef HAVE_BIO_MAX_SEGS + return (bio_max_segs(max_segs)); +#else + return (MIN(max_segs, BIO_MAX_PAGES)); +#endif +} + +static inline uint_t +vdev_bio_max_bytes(struct block_device *bdev) +{ + return (queue_max_sectors(bdev_get_queue(bdev)) << 9); +} + + +/* + * Virtual block IO object (VBIO) + * + * Linux block IO (BIO) objects have a limit on how many data segments (pages) + * they can hold. Depending on how they're allocated and structured, a large + * ZIO can require more than one BIO to be submitted to the kernel, which then + * all have to complete before we can return the completed ZIO back to ZFS. + * + * A VBIO is a wrapper around multiple BIOs, carrying everything needed to + * translate a ZIO down into the kernel block layer and back again. + * + * Note that these are only used for data ZIOs (read/write). Meta-operations + * (flush/trim) don't need multiple BIOs and so can just make the call + * directly. 
+ */ +typedef struct { + zio_t *vbio_zio; /* parent zio */ + + struct block_device *vbio_bdev; /* blockdev to submit bios to */ + + abd_t *vbio_abd; /* abd carrying borrowed linear buf */ + + atomic_t vbio_ref; /* bio refcount */ + int vbio_error; /* error from failed bio */ + + uint_t vbio_max_segs; /* max segs per bio */ + + uint_t vbio_max_bytes; /* max bytes per bio */ + uint_t vbio_lbs_mask; /* logical block size mask */ + + uint64_t vbio_offset; /* start offset of next bio */ + + struct bio *vbio_bio; /* pointer to the current bio */ + struct bio *vbio_bios; /* list of all bios */ +} vbio_t; + +static vbio_t * +vbio_alloc(zio_t *zio, struct block_device *bdev) +{ + vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); + + vbio->vbio_zio = zio; + vbio->vbio_bdev = bdev; + atomic_set(&vbio->vbio_ref, 0); + vbio->vbio_max_segs = vdev_bio_max_segs(bdev); + vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); + vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1); + vbio->vbio_offset = zio->io_offset; + + return (vbio); +} + +static int +vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) +{ + struct bio *bio; + uint_t ssize; + + while (size > 0) { + bio = vbio->vbio_bio; + if (bio == NULL) { + /* New BIO, allocate and set up */ + bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, + vbio->vbio_max_segs); + if (unlikely(bio == NULL)) + return (SET_ERROR(ENOMEM)); + BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; + + bio->bi_next = vbio->vbio_bios; + vbio->vbio_bios = vbio->vbio_bio = bio; + } + + /* + * Only load as much of the current page data as will fit in + * the space left in the BIO, respecting lbs alignment. Older + * kernels will error if we try to overfill the BIO, while + * newer ones will accept it and split the BIO. This ensures + * everything works on older kernels, and avoids an additional + * overhead on the new. + */ + ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) & + vbio->vbio_lbs_mask); + if (ssize > 0 && + bio_add_page(bio, page, ssize, offset) == ssize) { + /* Accepted, adjust and load any remaining. */ + size -= ssize; + offset += ssize; + continue; + } + + /* No room, set up for a new BIO and loop */ + vbio->vbio_offset += BIO_BI_SIZE(bio); + + /* Signal new BIO allocation wanted */ + vbio->vbio_bio = NULL; + } + + return (0); +} + +BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error); +static void vbio_put(vbio_t *vbio); + +static void +vbio_submit(vbio_t *vbio, int flags) +{ + ASSERT(vbio->vbio_bios); + struct bio *bio = vbio->vbio_bios; + vbio->vbio_bio = vbio->vbio_bios = NULL; + + /* + * We take a reference for each BIO as we submit it, plus one to + * protect us from BIOs completing before we're done submitting them + * all, causing vbio_put() to free vbio out from under us and/or the + * zio to be returned before all its IO has completed. + */ + atomic_set(&vbio->vbio_ref, 1); + + /* + * If we're submitting more than one BIO, inform the block layer so + * it can batch them if it wants. + */ + struct blk_plug plug; + boolean_t do_plug = (bio->bi_next != NULL); + if (do_plug) + blk_start_plug(&plug); + + /* Submit all the BIOs */ + while (bio != NULL) { + atomic_inc(&vbio->vbio_ref); + + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + + bio->bi_end_io = vdev_disk_io_rw_completion; + bio->bi_private = vbio; + bio_set_op_attrs(bio, + vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? 
+ WRITE : READ, flags);
+
+ vdev_submit_bio(bio);
+
+ bio = next;
+ }
+
+ /* Finish the batch */
+ if (do_plug)
+ blk_finish_plug(&plug);
+
+ /* Release the extra reference */
+ vbio_put(vbio);
+}
+
+static void
+vbio_return_abd(vbio_t *vbio)
+{
+ zio_t *zio = vbio->vbio_zio;
+ if (vbio->vbio_abd == NULL)
+ return;
+
+ /*
+ * If we copied the ABD before issuing it, clean up and return the copy
+ * to the ABD, with changes if appropriate.
+ */
+ void *buf = abd_to_buf(vbio->vbio_abd);
+ abd_free(vbio->vbio_abd);
+ vbio->vbio_abd = NULL;
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
+ else
+ abd_return_buf(zio->io_abd, buf, zio->io_size);
+}
+
+static void
+vbio_free(vbio_t *vbio)
+{
+ VERIFY0(atomic_read(&vbio->vbio_ref));
+
+ vbio_return_abd(vbio);
+
+ kmem_free(vbio, sizeof (vbio_t));
+}
+
+static void
+vbio_put(vbio_t *vbio)
+{
+ if (atomic_dec_return(&vbio->vbio_ref) > 0)
+ return;
+
+ /*
+ * This was the last reference, so the entire IO is completed. Clean
+ * up and submit it for processing.
+ */
+
+ /*
+ * Get any data buf back to the original ABD, if necessary. We do this
+ * now so we can get the ZIO into the pipeline as quickly as possible,
+ * and then do the remaining cleanup after.
+ */
+ vbio_return_abd(vbio);
+
+ zio_t *zio = vbio->vbio_zio;
+
+ /*
+ * Set the overall error. If multiple BIOs returned an error, only the
+ * first will be taken; the others are dropped (see
+ * vdev_disk_io_rw_completion()). It's pretty much impossible for
+ * multiple IOs to the same device to fail with different errors, so
+ * there's no real risk.
+ */
+ zio->io_error = vbio->vbio_error;
+ if (zio->io_error)
+ vdev_disk_error(zio);
+
+ /* All done, submit for processing */
+ zio_delay_interrupt(zio);
+
+ /* Finish cleanup */
+ vbio_free(vbio);
+}
+
+BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error)
+{
+ vbio_t *vbio = bio->bi_private;
+
+ if (vbio->vbio_error == 0) {
+#ifdef HAVE_1ARG_BIO_END_IO_T
+ vbio->vbio_error = BIO_END_IO_ERROR(bio);
+#else
+ if (error)
+ vbio->vbio_error = -(error);
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ vbio->vbio_error = EIO;
+#endif
+ }
+
+ /*
+ * Destroy the BIO. This is safe to do; the vbio owns its data and the
+ * kernel won't touch it again after the completion function runs.
+ */
+ bio_put(bio);
+
+ /* Drop this BIO's reference acquired by vbio_submit() */
+ vbio_put(vbio);
+}
+
+/*
+ * Iterator callback to count ABD pages and check their size & alignment.
+ *
+ * On Linux, each BIO segment can take a page pointer, and an offset+length of
+ * the data within that page. A page can be arbitrarily large ("compound"
+ * pages) but we still have to ensure the data portion is correctly sized and
+ * aligned to the logical block size, to ensure that if the kernel wants to
+ * split the BIO, the two halves will still be properly aligned.
+ */
+typedef struct {
+ uint_t bmask;
+ uint_t npages;
+ uint_t end;
+} vdev_disk_check_pages_t;
+
+static int
+vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
+{
+ vdev_disk_check_pages_t *s = priv;
+
+ /*
+ * If we didn't finish on a block size boundary last time, then there
+ * would be a gap if we tried to use this ABD as-is, so abort.
+ */
+ if (s->end != 0)
+ return (1);
+
+ /*
+ * Note if we're taking less than a full block, so we can check it
+ * above on the next call.
+ */
+ s->end = len & s->bmask;
+
+ /* All blocks after the first must start on a block size boundary. 
*/
+ if (s->npages != 0 && (off & s->bmask) != 0)
+ return (1);
+
+ s->npages++;
+ return (0);
+}
+
+/*
+ * Check if we can submit the pages in this ABD to the kernel as-is. Returns
+ * B_TRUE if they can be submitted as-is, B_FALSE if a linear copy is needed.
+ */
+static boolean_t
+vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
+{
+ vdev_disk_check_pages_t s = {
+ .bmask = bdev_logical_block_size(bdev)-1,
+ .npages = 0,
+ .end = 0,
+ };
+
+ if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/* Iterator callback to submit ABD pages to the vbio. */
+static int
+vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv)
+{
+ vbio_t *vbio = priv;
+ return (vbio_add_page(vbio, page, len, off));
+}
+
+static int
+vdev_disk_io_rw(zio_t *zio)
+{
+ vdev_t *v = zio->io_vd;
+ vdev_disk_t *vd = v->vdev_tsd;
+ struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
+ int flags = 0;
+
+ /*
+ * Accessing outside the block device is never allowed.
+ */
+ if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
+ vdev_dbgmsg(zio->io_vd,
+ "Illegal access %llu size %llu, device size %llu",
+ (u_longlong_t)zio->io_offset,
+ (u_longlong_t)zio->io_size,
+ (u_longlong_t)i_size_read(bdev->bd_inode));
+ return (SET_ERROR(EIO));
+ }
+
+ if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
+ v->vdev_failfast == B_TRUE) {
+ bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
+ zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
+ }
+
+ /*
+ * Check alignment of the incoming ABD. If any part of it would require
+ * submitting a page that is not aligned to the logical block size,
+ * then we take a copy into a linear buffer and submit that instead.
+ * This should be impossible on a 512b LBS, and fairly rare on 4K,
+ * usually requiring abnormally-small data blocks (eg gang blocks)
+ * mixed into the same ABD as larger ones (eg aggregated).
+ */
+ abd_t *abd = zio->io_abd;
+ if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
+ void *buf;
+ if (zio->io_type == ZIO_TYPE_READ)
+ buf = abd_borrow_buf(zio->io_abd, zio->io_size);
+ else
+ buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+
+ /*
+ * Wrap the copy in an abd_t, so we can use the same iterators
+ * to count and fill the vbio later.
+ */
+ abd = abd_get_from_buf(buf, zio->io_size);
+
+ /*
+ * False here would mean the borrowed copy has an invalid
+ * alignment too, which would mean we've somehow been passed a
+ * linear ABD with an interior page that has a non-zero offset
+ * or a size not a multiple of PAGE_SIZE. This is not possible.
+ * It would mean either zio_buf_alloc() or its underlying
+ * allocators have done something extremely strange, or our
+ * math in vdev_disk_check_pages() is wrong. In either case,
+ * something is seriously wrong and it's not safe to continue.
+ */
+ VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
+ }
+
+ /* Allocate vbio, with a pointer to the borrowed ABD if necessary */
+ int error = 0;
+ vbio_t *vbio = vbio_alloc(zio, bdev);
+ if (abd != zio->io_abd)
+ vbio->vbio_abd = abd;
+
+ /* Fill it with pages */
+ error = abd_iterate_page_func(abd, 0, zio->io_size,
+ vdev_disk_fill_vbio_cb, vbio);
+ if (error != 0) {
+ vbio_free(vbio);
+ return (error);
+ }
+
+ vbio_submit(vbio, flags);
+ return (0);
+}
+
 /* ========== */
 
 /*
- * This is the classic, battle-tested BIO submission code.
+ * This is the classic, battle-tested BIO submission code. 
Until we're totally
+ * sure that the new code is safe and correct in all cases, this will remain
+ * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
+ * load time.
 *
 * These functions have been renamed to vdev_classic_* to make it clear what
 * they belong to, but their implementations are unchanged.
@@ -1116,7 +1547,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
 (void) tsd;
 
 if (vdev_disk_io_rw_fn == NULL)
- vdev_disk_io_rw_fn = vdev_classic_physio;
+ /* XXX make configurable */
+ vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw;
 
 return (0);
 }
@@ -1215,3 +1647,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
 "Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
+
+ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
+ "Maximum number of data segments to add to an IO request (min 4)");

From af3a5bb40d89d6339ead886fbef96b24171962bd Mon Sep 17 00:00:00 2001
From: Rob Norris 
Date: Tue, 9 Jan 2024 13:28:57 +1100
Subject: [PATCH 22/31] vdev_disk: add module parameter to select BIO
 submission method

This makes the submission method selectable at module load time via the
`zfs_vdev_disk_classic` parameter, allowing this change to be backported
to 2.2 safely, and disabled in favour of the "classic" submission method
if new problems come up.

Reviewed-by: Alexander Motin
Reviewed-by: Brian Behlendorf
Signed-off-by: Rob Norris
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
(cherry picked from commit df2169d141aadc0c2cc728c5c5261d6f5c2a27f7)
---
 man/man4/zfs.4 | 16 ++++++++++++++++
 module/os/linux/zfs/vdev_disk.c | 31 +++++++++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index b5679f2f07..6a628e7f3e 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1352,6 +1352,22 @@ If this is higher than the maximum allowed by the device queue or the kernel
 itself, it will be clamped.
 Setting it to zero will cause the kernel's ideal size to be used.
 This parameter only applies on Linux.
+This parameter is ignored if
+.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
+.
+.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
+If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
+and earlier.
+This "classic" method has known issues with highly fragmented IO requests and
+is slower on many workloads, but it has been in use for many years and is known
+to be very stable.
+If you set this parameter, please also open a bug report explaining why you
+did so, including the workload involved and any error messages.
+.Pp
+This parameter and the classic submission method will be removed once we have
+total confidence in the new method.
+.Pp
+This parameter only applies on Linux, and can only be set at module load time.
 .
 .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
 Time before expiring
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 0ccb9ad96f..a9110623ac 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -1535,6 +1535,29 @@ vdev_disk_rele(vdev_t *vd)
 /* XXX: Implement me as a vnode rele for the device */
 }
 
+/*
+ * BIO submission method. See comment above about vdev_classic. 
+ * Set zfs_vdev_disk_classic=0 for new, =1 for classic
+ */
+static uint_t zfs_vdev_disk_classic = 0; /* default new */
+
+/* Set submission function from module parameter */
+static int
+vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
+{
+ int err = param_set_uint(buf, kp);
+ if (err < 0)
+ return (SET_ERROR(err));
+
+ vdev_disk_io_rw_fn =
+ zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;
+
+ printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
+ zfs_vdev_disk_classic ? "classic" : "new");
+
+ return (0);
+}
+
 /*
 * At first vdev use, set the submission function from the default value if
 * it hasn't been set already.
@@ -1547,8 +1570,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
 (void) tsd;
 
 if (vdev_disk_io_rw_fn == NULL)
- /* XXX make configurable */
- vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw;
+ vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
+ vdev_classic_physio : vdev_disk_io_rw;
 
 return (0);
 }
@@ -1650,3 +1673,7 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
 "Maximum number of data segments to add to an IO request (min 4)");
+
+ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
+ vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
+ "Use classic BIO submission method");

From cb599d27edf8788c53507c0a6588438fa0f120b8 Mon Sep 17 00:00:00 2001
From: Rob Norris 
Date: Wed, 21 Feb 2024 11:07:21 +1100
Subject: [PATCH 23/31] vdev_disk: use bio_chain() to submit multiple BIOs

Simplifies our code a lot, so we don't have to wait for each BIO
individually and reassemble the results ourselves.

Reviewed-by: Alexander Motin
Reviewed-by: Brian Behlendorf
Signed-off-by: Rob Norris
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc. 
Closes #15533 Closes #15588 (cherry picked from commit 72fd834c47558cb10d847948d1a4615e894c77c3) --- module/os/linux/zfs/vdev_disk.c | 255 ++++++++++++-------------------- 1 file changed, 92 insertions(+), 163 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index a9110623ac..36468fc211 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -454,10 +454,9 @@ vdev_disk_close(vdev_t *v) if (v->vdev_reopening || vd == NULL) return; - if (vd->vd_bdh != NULL) { + if (vd->vd_bdh != NULL) vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), zfs_vdev_holder); - } rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); @@ -663,9 +662,6 @@ typedef struct { abd_t *vbio_abd; /* abd carrying borrowed linear buf */ - atomic_t vbio_ref; /* bio refcount */ - int vbio_error; /* error from failed bio */ - uint_t vbio_max_segs; /* max segs per bio */ uint_t vbio_max_bytes; /* max bytes per bio */ @@ -674,43 +670,52 @@ typedef struct { uint64_t vbio_offset; /* start offset of next bio */ struct bio *vbio_bio; /* pointer to the current bio */ - struct bio *vbio_bios; /* list of all bios */ + int vbio_flags; /* bio flags */ } vbio_t; static vbio_t * -vbio_alloc(zio_t *zio, struct block_device *bdev) +vbio_alloc(zio_t *zio, struct block_device *bdev, int flags) { vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); vbio->vbio_zio = zio; vbio->vbio_bdev = bdev; - atomic_set(&vbio->vbio_ref, 0); + vbio->vbio_abd = NULL; vbio->vbio_max_segs = vdev_bio_max_segs(bdev); vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1); vbio->vbio_offset = zio->io_offset; + vbio->vbio_bio = NULL; + vbio->vbio_flags = flags; return (vbio); } +BIO_END_IO_PROTO(vbio_completion, bio, error); + static int vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) { - struct bio *bio; + struct bio *bio = vbio->vbio_bio; uint_t ssize; while (size > 0) { - bio = vbio->vbio_bio; if (bio == NULL) { /* New BIO, allocate and set up */ bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, vbio->vbio_max_segs); - if (unlikely(bio == NULL)) - return (SET_ERROR(ENOMEM)); - BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; + VERIFY(bio); - bio->bi_next = vbio->vbio_bios; - vbio->vbio_bios = vbio->vbio_bio = bio; + BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; + bio_set_op_attrs(bio, + vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? + WRITE : READ, vbio->vbio_flags); + + if (vbio->vbio_bio) { + bio_chain(vbio->vbio_bio, bio); + vdev_submit_bio(vbio->vbio_bio); + } + vbio->vbio_bio = bio; } /* @@ -735,157 +740,97 @@ vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) vbio->vbio_offset += BIO_BI_SIZE(bio); /* Signal new BIO allocation wanted */ - vbio->vbio_bio = NULL; + bio = NULL; } return (0); } -BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error); -static void vbio_put(vbio_t *vbio); - -static void -vbio_submit(vbio_t *vbio, int flags) +/* Iterator callback to submit ABD pages to the vbio. */ +static int +vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) { - ASSERT(vbio->vbio_bios); - struct bio *bio = vbio->vbio_bios; - vbio->vbio_bio = vbio->vbio_bios = NULL; - - /* - * We take a reference for each BIO as we submit it, plus one to - * protect us from BIOs completing before we're done submitting them - * all, causing vbio_put() to free vbio out from under us and/or the - * zio to be returned before all its IO has completed. 
- */
- atomic_set(&vbio->vbio_ref, 1);
-
- /*
- * If we're submitting more than one BIO, inform the block layer so
- * it can batch them if it wants.
- */
- struct blk_plug plug;
- boolean_t do_plug = (bio->bi_next != NULL);
- if (do_plug)
- blk_start_plug(&plug);
-
- /* Submit all the BIOs */
- while (bio != NULL) {
- atomic_inc(&vbio->vbio_ref);
-
- struct bio *next = bio->bi_next;
- bio->bi_next = NULL;
-
- bio->bi_end_io = vdev_disk_io_rw_completion;
- bio->bi_private = vbio;
- bio_set_op_attrs(bio,
- vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
- WRITE : READ, flags);
-
- vdev_submit_bio(bio);
-
- bio = next;
- }
-
- /* Finish the batch */
- if (do_plug)
- blk_finish_plug(&plug);
-
- /* Release the extra reference */
- vbio_put(vbio);
+ vbio_t *vbio = priv;
+ return (vbio_add_page(vbio, page, len, off));
 }
 
+/* Create some BIOs, fill them with data and submit them */
 static void
-vbio_return_abd(vbio_t *vbio)
+vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
 {
+ ASSERT(vbio->vbio_bdev);
+
+ /*
+ * We plug so we can submit the BIOs as we go and only unplug them when
+ * they are fully created and submitted. This is important; if we don't
+ * plug, then the kernel may start executing earlier BIOs while we're
+ * still creating and executing later ones, and if the device goes
+ * away while that's happening, older kernels can get confused and
+ * trample memory.
+ */
+ struct blk_plug plug;
+ blk_start_plug(&plug);
+
+ (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
+ ASSERT(vbio->vbio_bio);
+
+ vbio->vbio_bio->bi_end_io = vbio_completion;
+ vbio->vbio_bio->bi_private = vbio;
+
+ vdev_submit_bio(vbio->vbio_bio);
+
+ blk_finish_plug(&plug);
+
+ vbio->vbio_bio = NULL;
+ vbio->vbio_bdev = NULL;
+}
+
+/* IO completion callback */
+BIO_END_IO_PROTO(vbio_completion, bio, error)
+{
+ vbio_t *vbio = bio->bi_private;
 zio_t *zio = vbio->vbio_zio;
- if (vbio->vbio_abd == NULL)
- return;
+
+ ASSERT(zio);
+
+ /* Capture and log any errors */
+#ifdef HAVE_1ARG_BIO_END_IO_T
+ zio->io_error = BIO_END_IO_ERROR(bio);
+#else
+ zio->io_error = 0;
+ if (error)
+ zio->io_error = -(error);
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ zio->io_error = EIO;
+#endif
+ ASSERT3U(zio->io_error, >=, 0);
+
+ if (zio->io_error)
+ vdev_disk_error(zio);
+
+ /* Return the BIO to the kernel */
+ bio_put(bio);
 
 /*
 * If we copied the ABD before issuing it, clean up and return the copy
 * to the ABD, with changes if appropriate.
 */
- void *buf = abd_to_buf(vbio->vbio_abd);
- abd_free(vbio->vbio_abd);
- vbio->vbio_abd = NULL;
+ if (vbio->vbio_abd != NULL) {
+ void *buf = abd_to_buf(vbio->vbio_abd);
+ abd_free(vbio->vbio_abd);
+ vbio->vbio_abd = NULL;
 
- if (zio->io_type == ZIO_TYPE_READ)
- abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
- else
- abd_return_buf(zio->io_abd, buf, zio->io_size);
-}
-
-static void
-vbio_free(vbio_t *vbio)
-{
- VERIFY0(atomic_read(&vbio->vbio_ref));
-
- vbio_return_abd(vbio);
+ if (zio->io_type == ZIO_TYPE_READ)
+ abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
+ else
+ abd_return_buf(zio->io_abd, buf, zio->io_size);
+ }
 
+ /* Final cleanup */
 kmem_free(vbio, sizeof (vbio_t));
-}
-
-static void
-vbio_put(vbio_t *vbio)
-{
- if (atomic_dec_return(&vbio->vbio_ref) > 0)
- return;
-
- /*
- * This was the last reference, so the entire IO is completed. Clean
- * up and submit it for processing.
- */
-
- /*
- * Get any data buf back to the original ABD, if necessary. 
We do this
- * now so we can get the ZIO into the pipeline as quickly as possible,
- * and then do the remaining cleanup after.
- */
- vbio_return_abd(vbio);
-
- zio_t *zio = vbio->vbio_zio;
-
- /*
- * Set the overall error. If multiple BIOs returned an error, only the
- * first will be taken; the others are dropped (see
- * vdev_disk_io_rw_completion()). It's pretty much impossible for
- * multiple IOs to the same device to fail with different errors, so
- * there's no real risk.
- */
- zio->io_error = vbio->vbio_error;
- if (zio->io_error)
- vdev_disk_error(zio);
 
 /* All done, submit for processing */
 zio_delay_interrupt(zio);
-
- /* Finish cleanup */
- vbio_free(vbio);
-}
-
-BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error)
-{
- vbio_t *vbio = bio->bi_private;
-
- if (vbio->vbio_error == 0) {
-#ifdef HAVE_1ARG_BIO_END_IO_T
- vbio->vbio_error = BIO_END_IO_ERROR(bio);
-#else
- if (error)
- vbio->vbio_error = -(error);
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- vbio->vbio_error = EIO;
-#endif
- }
-
- /*
- * Destroy the BIO. This is safe to do; the vbio owns its data and the
- * kernel won't touch it again after the completion function runs.
- */
- bio_put(bio);
-
- /* Drop this BIO's reference acquired by vbio_submit() */
- vbio_put(vbio);
 }
 
 /*
@@ -948,14 +893,6 @@ vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
 return (B_TRUE);
 }
 
-/* Iterator callback to submit ABD pages to the vbio. */
-static int
-vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv)
-{
- vbio_t *vbio = priv;
- return (vbio_add_page(vbio, page, len, off));
-}
-
 static int
 vdev_disk_io_rw(zio_t *zio)
 {
@@ -1018,20 +955,12 @@ vdev_disk_io_rw(zio_t *zio)
 }
 
 /* Allocate vbio, with a pointer to the borrowed ABD if necessary */
- int error = 0;
- vbio_t *vbio = vbio_alloc(zio, bdev);
+ vbio_t *vbio = vbio_alloc(zio, bdev, flags);
 if (abd != zio->io_abd)
 vbio->vbio_abd = abd;
 
- /* Fill it with pages */
- error = abd_iterate_page_func(abd, 0, zio->io_size,
- vdev_disk_fill_vbio_cb, vbio);
- if (error != 0) {
- vbio_free(vbio);
- return (error);
- }
-
- vbio_submit(vbio, flags);
+ /* Fill it with data pages and submit it to the kernel */
+ vbio_submit(vbio, abd, zio->io_size);
 return (0);
 }

From d0b3be763f63bcc4ca05542eb9c2c635d93af03c Mon Sep 17 00:00:00 2001
From: Rob Norris 
Date: Thu, 14 Mar 2024 10:57:30 +1100
Subject: [PATCH 24/31] abd_iter_page: don't use compound heads on Linux <4.5

Before 4.5 (specifically, torvalds/linux@ddc58f2), head and tail pages
in a compound page were refcounted separately. This means that using the
head page without taking a reference to it could see it cleaned up later
before we're finished with it. Specifically, bio_add_page() would take a
reference, and drop its reference after the bio completion callback
returns.

If the zio is executed immediately from the completion callback, this is
usually ok, as any data is referenced through the tail page referenced by
the ABD, and so becomes "live" that way. If there's a delay in zio
execution (high load, error injection), then the head page can be freed,
along with any dirty flags or other indicators that the underlying memory
is used. Later, when the zio completes and that memory is accessed, it's
either unmapped and an unhandled fault takes down the entire system, or
it is mapped and we end up messing around in someone else's memory. Both
of these are very bad.

The solution on these older kernels is to take a reference to the head
page when we use it, and release it when we're done. 
There's not really a sensible way under our current structure to do
this; the "best" would be to keep a list of head page references in the
ABD, and release them when the ABD is freed.

Since this additional overhead is totally unnecessary on 4.5+, where
head and tail pages share refcounts, I've opted to simply not use the
compound head in ABD page iteration there. This is theoretically less
efficient (though cleaning up head page references would add overhead),
but it's safe, and we still get the other benefits of not mapping pages
before adding them to a bio and not mis-splitting pages.

There doesn't appear to be an obvious symbol name or config option we
can match on to discover this behaviour in configure (and the mm/page
APIs have changed a lot since then anyway), so I've gone with a simple
version check.

Reviewed-by: Alexander Motin
Reviewed-by: Brian Behlendorf
Signed-off-by: Rob Norris
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
(cherry picked from commit c6be6ce1755a3d9a3cbe70256cd8958ef83d8542)
---
 module/os/linux/zfs/abd_os.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index 3fe01c0b7d..d3255dcbc0 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -62,6 +62,7 @@
 #include 
 #include 
 #include 
+#include 
 #endif
 
 #ifdef _KERNEL
@@ -1061,6 +1062,7 @@ abd_iter_page(struct abd_iter *aiter)
 }
 
 ASSERT(page);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
 if (PageTail(page)) {
 /*
 * This page is part of a "compound page", which is a group of
@@ -1082,11 +1084,23 @@ abd_iter_page(struct abd_iter *aiter)
 * To do this, we need to adjust the offset to be counted from
 * the head page. struct page for compound pages are stored
 * contiguously, so we can just adjust by a simple offset.
+ *
+ * Before kernel 4.5, compound page heads were refcounted
+ * separately, such that moving back to the head page would
+ * require us to take a reference to it and release it once
+ * we're completely finished with it. In practice, that means
+ * when our caller is done with the ABD, which we have no
+ * insight into from here. Rather than contort this API to
+ * track head page references on such ancient kernels, we just
+ * compile this block out and use the tail pages directly. This
+ * is slightly less efficient, but makes everything far
+ * simpler.
 */
 struct page *head = compound_head(page);
 doff += ((page - head) * PAGESIZE);
 page = head;
 }
+#endif
 
 /* final page and position within it */
 aiter->iter_page = page;

From eebf00bee91de2c7b7d03a9c1e5e2f3e5fd66c9e Mon Sep 17 00:00:00 2001
From: Rob Norris 
Date: Wed, 27 Mar 2024 13:11:12 +1100
Subject: [PATCH 25/31] vdev_disk: default to classic submission for 2.2.x

We don't want to change to brand-new code in the middle of a stable
series, but we want it available to test for people running into page
splitting issues.

This commit makes zfs_vdev_disk_classic=1 the default and updates the
documentation to better explain what's going on.

Signed-off-by: Rob Norris
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
---
 man/man4/zfs.4 | 31 ++++++++++++++++++++++---------
 module/os/linux/zfs/vdev_disk.c | 8 +++++---
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 6a628e7f3e..a98ec519aa 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1355,17 +1355,30 @@ This parameter only applies on Linux. 
This parameter is ignored if
 .Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
 .
-.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
-If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
-and earlier.
-This "classic" method has known issues with highly fragmented IO requests and
-is slower on many workloads, but it has been in use for many years and is known
-to be very stable.
-If you set this parameter, please also open a bug report explaining why you
-did so, including the workload involved and any error messages.
+.It Sy zfs_vdev_disk_classic Ns = Ns 0 Ns | Ns Sy 1 Pq uint
+Controls the method used to submit IO to the Linux block layer
+(default
+.Sy 1 "classic" Ns
+)
+.Pp
+If set to 1, the "classic" method is used.
+This is the method that has been in use since the earliest versions of
+ZFS-on-Linux.
+It has known issues with highly fragmented IO requests and is less efficient on
+many workloads, but it is well known and well understood.
+.Pp
+If set to 0, the "new" method is used.
+This method is available since 2.2.4 and should resolve all known issues and be
+far more efficient, but has not had as much testing.
+In the 2.2.x series, this parameter defaults to 1, to use the "classic" method.
+.Pp
+It is not recommended that you change it except on advice from the OpenZFS
+developers.
+If you do change it, please also open a bug report describing why you did so,
 including the workload involved and any error messages.
 .Pp
-This parameter and the classic submission method will be removed once we have
-total confidence in the new method.
+This parameter and the "classic" submission method will be removed in a future
+release of OpenZFS once we have total confidence in the new method.
 .Pp
 This parameter only applies on Linux, and can only be set at module load time.
 .
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 36468fc211..e1c19a085b 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -969,8 +969,10 @@ vdev_disk_io_rw(zio_t *zio)
 /*
 * This is the classic, battle-tested BIO submission code. Until we're totally
 * sure that the new code is safe and correct in all cases, this will remain
- * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
- * load time.
+ * available.
+ *
+ * It is enabled by setting zfs_vdev_disk_classic=1 at module load time. It is
+ * enabled (=1) by default since 2.2.4, and disabled by default (=0) on master.
 *
 * These functions have been renamed to vdev_classic_* to make it clear what
 * they belong to, but their implementations are unchanged.
@@ -1468,7 +1470,7 @@ vdev_disk_rele(vdev_t *vd)
 * BIO submission method. See comment above about vdev_classic.
 * Set zfs_vdev_disk_classic=0 for new, =1 for classic
 */
-static uint_t zfs_vdev_disk_classic = 0; /* default new */
+static uint_t zfs_vdev_disk_classic = 1; /* default classic */
 
 /* Set submission function from module parameter */
 static int

From deb7a84231aff8d772bb4ce9fa486d1886f1a2b6 Mon Sep 17 00:00:00 2001
From: Robert Evans 
Date: Mon, 25 Mar 2024 17:56:49 -0400
Subject: [PATCH 26/31] Fix corruption caused by mmap flushing problems

1) Make mmap flushes synchronous. Linux may skip flushing dirty pages
already in writeback unless data-integrity sync is requested.

2) Change zfs_putpage to use TXG_WAIT. Otherwise dirty pages may be
skipped due to DMU pushing back on TX assign.

3) Add missing mmap flush when doing block cloning.

4) While here, pass errors from putpage to writepage/writepages. 
This change fixes corruption edge cases, but unfortunately adds
synchronous ZIL flushes for dirty mmap pages to llseek and bclone
operations. It may be possible to avoid these sync writes later, but
that would need trickier refactoring of the writeback code.

Reviewed-by: Alexander Motin
Reviewed-by: Brian Behlendorf
Signed-off-by: Robert Evans
Closes #15933
Closes #16019
---
 module/os/linux/zfs/zfs_vnops_os.c | 5 +----
 module/os/linux/zfs/zpl_file.c | 8 ++++----
 module/zfs/zfs_vnops.c | 6 +++++-
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index c06a75662b..7c473bc7e9 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -3792,11 +3792,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 zfs_sa_upgrade_txholds(tx, zp);
 
- err = dmu_tx_assign(tx, TXG_NOWAIT);
+ err = dmu_tx_assign(tx, TXG_WAIT);
 if (err != 0) {
- if (err == ERESTART)
- dmu_tx_wait(tx);
-
 dmu_tx_abort(tx);
 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
 filemap_dirty_folio(page_mapping(pp), page_folio(pp));
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index 3caa0fc6c2..9dec52215c 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -720,23 +720,23 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
 {
 boolean_t *for_sync = data;
 fstrans_cookie_t cookie;
+ int ret;
 
 ASSERT(PageLocked(pp));
 ASSERT(!PageWriteback(pp));
 
 cookie = spl_fstrans_mark();
- (void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
+ ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
 spl_fstrans_unmark(cookie);
 
- return (0);
+ return (ret);
 }
 
 #ifdef HAVE_WRITEPAGE_T_FOLIO
 static int
 zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
 {
- (void) zpl_putpage(&pp->page, wbc, data);
- return (0);
+ return (zpl_putpage(&pp->page, wbc, data));
 }
 #endif
 
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 2b37834d5c..7020f88ecf 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -130,7 +130,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
 
 /* Flush any mmap()'d data to disk */
 if (zn_has_cached_data(zp, 0, file_sz - 1))
- zn_flush_cached_data(zp, B_FALSE);
+ zn_flush_cached_data(zp, B_TRUE);
 
 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
 error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
@@ -1193,6 +1193,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
 }
 }
 
+ /* Flush any mmap()'d data to disk */
+ if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
+ zn_flush_cached_data(inzp, B_TRUE);
+
 /*
 * Maintain predictable lock order.
 */

From 99ef4190f22d2ba75e7546a1e87e515480c3d8f5 Mon Sep 17 00:00:00 2001
From: Alexander Motin 
Date: Wed, 20 Mar 2024 20:22:36 -0400
Subject: [PATCH 27/31] Update resume token at object receive.

Before this change the resume token was updated only on data receive.
Usually that is enough to resume replication without much overlap. But
we've got a report of a curious case, where the replication source was
traversed with a recursive grep, which, through enabled atime, modified
every object without modifying any data. It produced several gigabytes
of replication traffic without a single data write and so without a
single resume point. 
While the resume token was not designed to resume from an object, I've
found that the send implementation always sends the object before any
data. So by requesting resume from offset 0 we are effectively resuming
from the object, followed (or not) by the data at offset 0, just as we
need it.

Reviewed-by: Allan Jude
Reviewed-by: Paul Dagnelie
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #15927
---
 module/zfs/dmu_recv.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 54aa60259e..2cf1090973 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -2110,6 +2110,16 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
 dmu_buf_rele(db, FTAG);
 dnode_rele(dn, FTAG);
 }
+
+ /*
+ * If the receive fails, we want the resume stream to start with the
+ * same record that we last successfully received. There is no way to
+ * request resume from the object record, but we can benefit from the
+ * fact that the sender always sends the object record before anything
+ * else, after which it will "resend" data at offset 0 and resume
+ * normally.
+ */
+ save_resume_state(rwa, drro->drr_object, 0, tx);
+
 dmu_tx_commit(tx);
 
 return (0);

From 921491220801457c3bf5c04bf7df4bf657aa9092 Mon Sep 17 00:00:00 2001
From: Alexander Motin 
Date: Fri, 29 Mar 2024 12:11:09 -0400
Subject: [PATCH 28/31] Improve dbuf_read() error reporting

Previous code reported non-ZIO errors only via return value, but
not via parent ZIO. It could cause NULL-dereference panics due to
dmu_buf_hold_array_by_dnode() ignoring the return value, relying
solely on parent ZIO status.

Reported by: Ameer Hamza
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
---
 module/zfs/dbuf.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 280001bc34..5f8cc215a9 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1542,17 +1542,14 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
 * returning.
 */
 static int
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
+dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 db_lock_type_t dblt, const void *tag)
 {
- dnode_t *dn;
 zbookmark_phys_t zb;
 uint32_t aflags = ARC_FLAG_NOWAIT;
 int err, zio_flags;
 blkptr_t bp, *bpp;
 
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
 ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 ASSERT(MUTEX_HELD(&db->db_mtx));
 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
@@ -1627,8 +1624,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 if (err != 0)
 goto early_unlock;
 
- DB_DNODE_EXIT(db);
-
 db->db_state = DB_READ;
 DTRACE_SET_STATE(db, "read issued");
 mutex_exit(&db->db_mtx);
@@ -1653,12 +1648,11 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 * parent's rwlock, which would be a lock ordering violation. 
*/
 dmu_buf_unlock_parent(db, dblt, tag);
- (void) arc_read(zio, db->db_objset->os_spa, bpp,
+ return (arc_read(zio, db->db_objset->os_spa, bpp,
 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
- &aflags, &zb);
- return (err);
+ &aflags, &zb));
+
 early_unlock:
- DB_DNODE_EXIT(db);
 mutex_exit(&db->db_mtx);
 dmu_buf_unlock_parent(db, dblt, tag);
 return (err);
@@ -1743,7 +1737,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 }
 
 int
-dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 {
 int err = 0;
 boolean_t prefetch;
@@ -1759,7 +1753,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 dn = DB_DNODE(db);
 
 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL;
+ (flags & DB_RF_NOPREFETCH) == 0;
 
 mutex_enter(&db->db_mtx);
 if (flags & DB_RF_PARTIAL_FIRST)
@@ -1806,13 +1800,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 
 db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 
- if (zio == NULL && (db->db_state == DB_NOFILL ||
+ if (pio == NULL && (db->db_state == DB_NOFILL ||
 (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
 spa_t *spa = dn->dn_objset->os_spa;
- zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 need_wait = B_TRUE;
 }
- err = dbuf_read_impl(db, zio, flags, dblt, FTAG);
+ err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
 /*
 * dbuf_read_impl has dropped db_mtx and our parent's rwlock
 * for us
@@ -1833,9 +1827,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 */
 if (need_wait) {
 if (err == 0)
- err = zio_wait(zio);
+ err = zio_wait(pio);
 else
- VERIFY0(zio_wait(zio));
+ (void) zio_wait(pio);
+ pio = NULL;
 }
 } else {
 /*
@@ -1862,7 +1857,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 ASSERT(db->db_state == DB_READ ||
 (flags & DB_RF_HAVESTRUCT) == 0);
 DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
- db, zio_t *, zio);
+ db, zio_t *, pio);
 cv_wait(&db->db_changed, &db->db_mtx);
 }
 if (db->db_state == DB_UNCACHED)
@@ -1871,6 +1866,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 }
 }
 
+ if (pio && err != 0) {
+ zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ zio->io_error = err;
+ zio_nowait(zio);
+ }
+
 return (err);
 }

From 4b89c91c650095b99cf4e1beeaec8bef564e06c2 Mon Sep 17 00:00:00 2001
From: Alexander Motin 
Date: Thu, 28 Mar 2024 17:02:03 -0400
Subject: [PATCH 29/31] L2ARC: Relax locking during write

Previous code held the ARC state sublist lock throughout the whole
L2ARC write process, which included a number of allocations and even
ZIO issues. Being blocked in any of those places, the code could also
block ARC eviction, which could cause OOM activation or even deadlock
if the system is low on memory or memory is too fragmented.

Fix it by dropping the lock as soon as we see a block eligible for
L2ARC writing, and pick it up later using an earlier inserted marker.
While there, also reduce the scope of the hash lock, moving ZIO
allocation and other operations not requiring header access out of it.
All operations requiring header access move under the hash lock, since
the L2_WRITING flag does not prevent header eviction, only the
transition to the arc_l2c_only state with an L1 header.

To be able to manipulate the sublist lock and marker as needed, add a
few more multilist functions and modify one.

Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc. 
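For reference, the marker-based iteration pattern this patch introduces
looks roughly like the sketch below. It is illustrative only:
walk_headers() and process_header() are hypothetical stand-ins for the
real per-header work in l2arc_write_buffers(), while the multilist and
marker functions are the ones added by this patch. The real code also
pins each header (hash lock, L2_WRITING flag) before dropping the
sublist lock so the header cannot disappear underneath it.

static void
walk_headers(multilist_t *ml, int idx)
{
	multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, idx);
	arc_buf_hdr_t *marker = arc_state_alloc_marker();
	arc_buf_hdr_t *hdr = multilist_sublist_head(mls);

	while (hdr != NULL) {
		/* Save our position, then drop the sublist lock. */
		multilist_sublist_insert_after(mls, hdr, marker);
		multilist_sublist_unlock(mls);

		/*
		 * Blocking work (allocations, ZIO issue) happens here
		 * without the sublist lock held, so ARC eviction can
		 * proceed in parallel.
		 */
		process_header(hdr);

		/* Re-take the lock and continue from the marker. */
		multilist_sublist_lock(mls);
		hdr = multilist_sublist_next(mls, marker);
		multilist_sublist_remove(mls, marker);
	}
	multilist_sublist_unlock(mls);
	arc_state_free_marker(marker);
}

The marker is a dummy header (b_spa == 0) that eviction knows to skip,
so the saved position stays valid no matter which real headers are
evicted while the lock is dropped.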
--- include/sys/multilist.h | 5 +- module/zfs/arc.c | 189 +++++++++++++++++++++------------------- module/zfs/dbuf.c | 2 +- module/zfs/dmu_objset.c | 10 +-- module/zfs/metaslab.c | 8 +- module/zfs/multilist.c | 26 +++++- 6 files changed, 136 insertions(+), 104 deletions(-) diff --git a/include/sys/multilist.h b/include/sys/multilist.h index 26f37c37ab..e7de86f237 100644 --- a/include/sys/multilist.h +++ b/include/sys/multilist.h @@ -82,12 +82,15 @@ int multilist_is_empty(multilist_t *); unsigned int multilist_get_num_sublists(multilist_t *); unsigned int multilist_get_random_index(multilist_t *); -multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); +void multilist_sublist_lock(multilist_sublist_t *); +multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int); multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *); void multilist_sublist_unlock(multilist_sublist_t *); void multilist_sublist_insert_head(multilist_sublist_t *, void *); void multilist_sublist_insert_tail(multilist_sublist_t *, void *); +void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *); +void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *); void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); void multilist_sublist_remove(multilist_sublist_t *, void *); int multilist_sublist_is_empty(multilist_sublist_t *); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index f5fd0aa551..266fb187ed 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -3883,7 +3883,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, ASSERT3P(marker, !=, NULL); - mls = multilist_sublist_lock(ml, idx); + mls = multilist_sublist_lock_idx(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { @@ -3995,6 +3995,26 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, return (bytes_evicted); } +static arc_buf_hdr_t * +arc_state_alloc_marker(void) +{ + arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); + + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_evict_state_impl(). + */ + marker->b_spa = 0; + + return (marker); +} + +static void +arc_state_free_marker(arc_buf_hdr_t *marker) +{ + kmem_cache_free(hdr_full_cache, marker); +} + /* * Allocate an array of buffer headers used as placeholders during arc state * eviction. @@ -4005,16 +4025,8 @@ arc_state_alloc_markers(int count) arc_buf_hdr_t **markers; markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP); - for (int i = 0; i < count; i++) { - markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); - - /* - * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_evict_state_impl(). 
- */ - markers[i]->b_spa = 0; - - } + for (int i = 0; i < count; i++) + markers[i] = arc_state_alloc_marker(); return (markers); } @@ -4022,7 +4034,7 @@ static void arc_state_free_markers(arc_buf_hdr_t **markers, int count) { for (int i = 0; i < count; i++) - kmem_cache_free(hdr_full_cache, markers[i]); + arc_state_free_marker(markers[i]); kmem_free(markers, sizeof (*markers) * count); } @@ -4066,7 +4078,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } @@ -4131,7 +4143,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, } for (int i = 0; i < num_sublists; i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); } @@ -8639,7 +8651,7 @@ l2arc_sublist_lock(int list_num) * sublists being selected. */ idx = multilist_get_random_index(ml); - return (multilist_sublist_lock(ml, idx)); + return (multilist_sublist_lock_idx(ml, idx)); } /* @@ -9051,9 +9063,9 @@ l2arc_blk_fetch_done(zio_t *zio) static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; + arc_buf_hdr_t *hdr, *head, *marker; + uint64_t write_asize, write_psize, headroom; + boolean_t full, from_head = !arc_warm; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); @@ -9062,10 +9074,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; - write_lsize = write_asize = write_psize = 0; + write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); + marker = arc_state_alloc_marker(); /* * Copy buffers for L2ARC writing. @@ -9080,40 +9093,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) continue; } - multilist_sublist_t *mls = l2arc_sublist_lock(pass); uint64_t passed_sz = 0; - - VERIFY3P(mls, !=, NULL); - - /* - * L2ARC fast warmup. - * - * Until the ARC is warm and starts to evict, read from the - * head of the ARC lists rather than the tail. - */ - if (arc_warm == B_FALSE) - hdr = multilist_sublist_head(mls); - else - hdr = multilist_sublist_tail(mls); - headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; - for (; hdr; hdr = hdr_prev) { + /* + * Until the ARC is warm and starts to evict, read from the + * head of the ARC lists rather than the tail. + */ + multilist_sublist_t *mls = l2arc_sublist_lock(pass); + ASSERT3P(mls, !=, NULL); + if (from_head) + hdr = multilist_sublist_head(mls); + else + hdr = multilist_sublist_tail(mls); + + while (hdr != NULL) { kmutex_t *hash_lock; abd_t *to_write = NULL; - if (arc_warm == B_FALSE) - hdr_prev = multilist_sublist_next(mls, hdr); - else - hdr_prev = multilist_sublist_prev(mls, hdr); - hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { - /* - * Skip this buffer rather than waiting. - */ +skip: + /* Skip this buffer rather than waiting. 
*/ + if (from_head) + hdr = multilist_sublist_next(mls, hdr); + else + hdr = multilist_sublist_prev(mls, hdr); continue; } @@ -9128,11 +9135,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); - continue; + goto skip; } ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || @@ -9154,12 +9160,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) } /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. + * We should not sleep with sublist lock held or it + * may block ARC eviction. Insert a marker to save + * the position and drop the lock. */ - arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); + if (from_head) { + multilist_sublist_insert_after(mls, hdr, + marker); + } else { + multilist_sublist_insert_before(mls, hdr, + marker); + } + multilist_sublist_unlock(mls); /* * If this header has b_rabd, we can use this since it @@ -9190,32 +9202,45 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, - ARC_FLAG_L2_WRITING); + ARC_FLAG_L2CACHE); mutex_exit(hash_lock); - continue; + goto next; } l2arc_free_abd_on_write(to_write, asize, type); } + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_hits = 0; + hdr->b_l2hdr.b_arcs_state = + hdr->b_l1hdr.b_state->arcs_state; + mutex_enter(&dev->l2ad_mtx); if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ - mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); - mutex_exit(&dev->l2ad_mtx); + } + list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); + arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | + ARC_FLAG_L2_WRITING); + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); + l2arc_hdr_arcstats_increment(hdr); + + boolean_t commit = l2arc_log_blk_insert(dev, hdr); + mutex_exit(hash_lock); + + if (pio == NULL) { cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; - /* - * Create a list to save allocated abd buffers - * for l2arc_log_blk_commit(). 
- */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); @@ -9223,54 +9248,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ZIO_FLAG_CANFAIL); } - hdr->b_l2hdr.b_dev = dev; - hdr->b_l2hdr.b_hits = 0; - - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - hdr->b_l2hdr.b_arcs_state = - hdr->b_l1hdr.b_state->arcs_state; - arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); - - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, hdr); - mutex_exit(&dev->l2ad_mtx); - - (void) zfs_refcount_add_many(&dev->l2ad_alloc, - arc_hdr_size(hdr), hdr); - wzio = zio_write_phys(pio, dev->l2ad_vdev, - hdr->b_l2hdr.b_daddr, asize, to_write, + dev->l2ad_hand, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); - write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); + zio_nowait(wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; - l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); - mutex_exit(hash_lock); - - /* - * Append buf info to current log and commit if full. - * arcstat_l2_{size,asize} kstats are updated - * internally. - */ - if (l2arc_log_blk_insert(dev, hdr)) { - /* - * l2ad_hand will be adjusted in - * l2arc_log_blk_commit(). - */ + if (commit) { + /* l2ad_hand will be adjusted inside. */ write_asize += l2arc_log_blk_commit(dev, pio, cb); } - zio_nowait(wzio); +next: + multilist_sublist_lock(mls); + if (from_head) + hdr = multilist_sublist_next(mls, marker); + else + hdr = multilist_sublist_prev(mls, marker); + multilist_sublist_remove(mls, marker); } multilist_sublist_unlock(mls); @@ -9279,9 +9284,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } + arc_state_free_marker(marker); + /* No buffers selected for writing? 
*/ if (pio == NULL) { - ASSERT0(write_lsize); + ASSERT0(write_psize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); @@ -10609,7 +10616,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); - L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); + L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 5f8cc215a9..ef48cbf984 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -754,7 +754,7 @@ static void dbuf_evict_one(void) { int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache); - multilist_sublist_t *mls = multilist_sublist_lock( + multilist_sublist_t *mls = multilist_sublist_lock_idx( &dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index d134d4958f..f8bd7422a5 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1633,7 +1633,7 @@ sync_dnodes_task(void *arg) sync_dnodes_arg_t *sda = arg; multilist_sublist_t *ms = - multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); + multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx); dmu_objset_sync_dnodes(ms, sda->sda_tx); @@ -1987,8 +1987,8 @@ userquota_updates_task(void *arg) dnode_t *dn; userquota_cache_t cache = { { 0 } }; - multilist_sublist_t *list = - multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_t *list = multilist_sublist_lock_idx( + &os->os_synced_dnodes, uua->uua_sublist_idx); ASSERT(multilist_sublist_head(list) == NULL || dmu_objset_userused_enabled(os)); @@ -2070,8 +2070,8 @@ dnode_rele_task(void *arg) userquota_updates_arg_t *uua = arg; objset_t *os = uua->uua_os; - multilist_sublist_t *list = - multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_t *list = multilist_sublist_lock_idx( + &os->os_synced_dnodes, uua->uua_sublist_idx); dnode_t *dn; while ((dn = multilist_sublist_head(list)) != NULL) { diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 5809a832bc..dbfc00362f 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -641,7 +641,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; for (int i = 0; i < multilist_get_num_sublists(ml); i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL) { @@ -658,7 +658,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) i--; break; } - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); if (txg > @@ -2190,12 +2190,12 @@ metaslab_potentially_evict(metaslab_class_t *mc) unsigned int idx = multilist_get_random_index( &mc->mc_metaslab_txg_list); multilist_sublist_t *mls = - multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx); + multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < 
inuse * size) {
- VERIFY3P(mls, ==, multilist_sublist_lock(
+ VERIFY3P(mls, ==, multilist_sublist_lock_idx(
 &mc->mc_metaslab_txg_list, idx));
 ASSERT3U(idx, ==,
 metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c
index b1cdf1c5c5..3d3ef86e68 100644
--- a/module/zfs/multilist.c
+++ b/module/zfs/multilist.c
@@ -277,9 +277,15 @@ multilist_get_random_index(multilist_t *ml)
 return (random_in_range(ml->ml_num_sublists));
 }
 
+void
+multilist_sublist_lock(multilist_sublist_t *mls)
+{
+ mutex_enter(&mls->mls_lock);
+}
+
 /* Lock and return the sublist specified at the given index */
 multilist_sublist_t *
-multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
+multilist_sublist_lock_idx(multilist_t *ml, unsigned int sublist_idx)
 {
 multilist_sublist_t *mls;
 
@@ -294,7 +300,7 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
 multilist_sublist_t *
 multilist_sublist_lock_obj(multilist_t *ml, void *obj)
 {
- return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
+ return (multilist_sublist_lock_idx(ml, ml->ml_index_func(ml, obj)));
 }
 
 void
@@ -327,6 +333,22 @@ multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
 list_insert_tail(&mls->mls_list, obj);
 }
 
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_after(multilist_sublist_t *mls, void *prev, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_after(&mls->mls_list, prev, obj);
+}
+
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_before(multilist_sublist_t *mls, void *next, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_before(&mls->mls_list, next, obj);
+}
+
 /*
 * Move the object one element forward in the list.
 *

From d1487bcc88aa1bb884e7ea9161e5038aa83f5316 Mon Sep 17 00:00:00 2001
From: Alexander Motin 
Date: Sun, 24 Mar 2024 20:31:29 -0400
Subject: [PATCH 30/31] Speculative prefetch for reordered requests

Before this change the speculative prefetcher was able to detect a
stream only if all of its accesses were perfectly sequential. That was
easy to implement and is perfectly fine for single-threaded
applications. Unfortunately, multi-threaded network servers, such as
iSCSI, SMB or NFS, usually have plenty of threads and may often reorder
requests, preventing successful speculation and prefetch.

This change allows the speculative prefetcher to detect streams even if
requests are reordered, by introducing a list of 9 non-contiguous
ranges up to 16MB ahead of the current stream position and filling the
gaps as more requests arrive. It also allows a stream to proceed even
with holes, up to a certain configurable threshold (25%).

Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc. 
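As a rough sketch of the bookkeeping (illustrative only: the type and
function names here are simplified stand-ins, assuming the usual
OpenZFS headers for boolean_t, MIN and MAX; the real logic lives in
dmu_zfetch.c, stores zsrange_t offsets relative to zs_blkid, and also
merges neighbouring ranges):

#define	NRANGES	9	/* matches ZFETCH_RANGES below */

typedef struct range {
	uint16_t r_start;	/* blocks ahead of the stream position */
	uint16_t r_end;		/* exclusive; start == end means empty */
} range_t;

/*
 * Record a reordered access [start, end) ahead of the stream. Extend an
 * existing range when they touch or overlap, otherwise take an empty
 * slot. B_FALSE means the pattern is too fragmented to track, and the
 * caller treats the access as an ordinary miss.
 */
static boolean_t
range_add(range_t *r, uint16_t start, uint16_t end)
{
	for (int i = 0; i < NRANGES; i++) {
		if (r[i].r_start == r[i].r_end) {
			r[i].r_start = start;
			r[i].r_end = end;
			return (B_TRUE);
		}
		if (start <= r[i].r_end && end >= r[i].r_start) {
			r[i].r_start = MIN(r[i].r_start, start);
			r[i].r_end = MAX(r[i].r_end, end);
			return (B_TRUE);
		}
	}
	return (B_FALSE);
}

Once the tracked ranges cover enough of the gap -- that is, the hole
fraction falls below 1/(1 << zfetch_hole_shift), 25% by default -- the
stream position advances past the holes and prefetching continues.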
--- cmd/arc_summary | 11 +- include/sys/dmu_zfetch.h | 16 ++- man/man4/zfs.4 | 11 ++ module/zfs/dmu.c | 8 +- module/zfs/dmu_zfetch.c | 289 +++++++++++++++++++++++++++++++-------- 5 files changed, 272 insertions(+), 63 deletions(-) diff --git a/cmd/arc_summary b/cmd/arc_summary index 9c69ec4f8c..100fb1987a 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -793,18 +793,27 @@ def section_dmu(kstats_dict): zfetch_stats = isolate_section('zfetchstats', kstats_dict) - zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses']) + zfetch_access_total = int(zfetch_stats['hits']) +\ + int(zfetch_stats['future']) + int(zfetch_stats['stride']) +\ + int(zfetch_stats['past']) + int(zfetch_stats['misses']) prt_1('DMU predictive prefetcher calls:', f_hits(zfetch_access_total)) prt_i2('Stream hits:', f_perc(zfetch_stats['hits'], zfetch_access_total), f_hits(zfetch_stats['hits'])) + future = int(zfetch_stats['future']) + int(zfetch_stats['stride']) + prt_i2('Hits ahead of stream:', f_perc(future, zfetch_access_total), + f_hits(future)) + prt_i2('Hits behind stream:', + f_perc(zfetch_stats['past'], zfetch_access_total), + f_hits(zfetch_stats['past'])) prt_i2('Stream misses:', f_perc(zfetch_stats['misses'], zfetch_access_total), f_hits(zfetch_stats['misses'])) prt_i2('Streams limit reached:', f_perc(zfetch_stats['max_streams'], zfetch_stats['misses']), f_hits(zfetch_stats['max_streams'])) + prt_i1('Stream strides:', f_hits(zfetch_stats['stride'])) prt_i1('Prefetches issued', f_hits(zfetch_stats['io_issued'])) print() diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h index f00e13cf03..322472fb1a 100644 --- a/include/sys/dmu_zfetch.h +++ b/include/sys/dmu_zfetch.h @@ -45,18 +45,24 @@ typedef struct zfetch { int zf_numstreams; /* number of zstream_t's */ } zfetch_t; +typedef struct zsrange { + uint16_t start; + uint16_t end; +} zsrange_t; + +#define ZFETCH_RANGES 9 /* Fits zstream_t into 128 bytes */ + typedef struct zstream { + list_node_t zs_node; /* link for zf_stream */ uint64_t zs_blkid; /* expect next access at this blkid */ + uint_t zs_atime; /* time last prefetch issued */ + zsrange_t zs_ranges[ZFETCH_RANGES]; /* ranges from future */ unsigned int zs_pf_dist; /* data prefetch distance in bytes */ unsigned int zs_ipf_dist; /* L1 prefetch distance in bytes */ uint64_t zs_pf_start; /* first data block to prefetch */ uint64_t zs_pf_end; /* data block to prefetch up to */ uint64_t zs_ipf_start; /* first data block to prefetch L1 */ uint64_t zs_ipf_end; /* data block to prefetch L1 up to */ - - list_node_t zs_node; /* link for zf_stream */ - hrtime_t zs_atime; /* time last prefetch issued */ - zfetch_t *zs_fetch; /* parent fetch */ boolean_t zs_missed; /* stream saw cache misses */ boolean_t zs_more; /* need more distant prefetch */ zfs_refcount_t zs_callers; /* number of pending callers */ @@ -74,7 +80,7 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_fini(zfetch_t *); zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t); -void dmu_zfetch_run(zstream_t *, boolean_t, boolean_t); +void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t); void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t, boolean_t); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 242f62e5e2..e00eb354e3 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -531,6 +531,10 @@ However, this is limited by Maximum micro ZAP size. A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size. . 
+.It Sy zfetch_hole_shift Ns = Ns Sy 2 Pq uint
+Log2 fraction of holes in a speculative prefetch stream allowed for it to
+proceed.
+.
 .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
 Min bytes to prefetch per stream.
 Prefetch distance starts from the demand access size and quickly grows to
@@ -545,6 +549,13 @@ Max bytes to prefetch per stream.
 .It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint
 Max bytes to prefetch indirects for per stream.
 .
+.It Sy zfetch_max_reorder Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint
+Requests within this byte distance from the current prefetch stream position
+are considered parts of the stream, reordered due to parallel processing.
+Such requests do not advance the stream position immediately unless the
+.Sy zfetch_hole_shift
+fill threshold is reached, but are saved to fill holes in the stream later.
+.
 .It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint
 Max number of streams per zfetch (prefetch streams per file).
 .
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 3215ab1c2a..08bb185a3e 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -569,8 +569,10 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 		if (db == NULL) {
-			if (zs)
-				dmu_zfetch_run(zs, missed, B_TRUE);
+			if (zs) {
+				dmu_zfetch_run(&dn->dn_zfetch, zs, missed,
+				    B_TRUE);
+			}
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
 			if (read)
@@ -606,7 +608,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 		zfs_racct_write(length, nblks);

 	if (zs)
-		dmu_zfetch_run(zs, missed, B_TRUE);
+		dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
 	rw_exit(&dn->dn_struct_rwlock);

 	if (read) {
diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c
index d0acaf5020..e261951760 100644
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@@ -65,9 +65,16 @@ unsigned int	zfetch_max_distance = 64 * 1024 * 1024;
 #endif
 /* max bytes to prefetch indirects for per stream (default 64MB) */
 unsigned int	zfetch_max_idistance = 64 * 1024 * 1024;
+/* max request reorder distance within a stream (default 16MB) */
+unsigned int	zfetch_max_reorder = 16 * 1024 * 1024;
+/* Max log2 fraction of holes in a stream */
+unsigned int	zfetch_hole_shift = 2;

 typedef struct zfetch_stats {
 	kstat_named_t zfetchstat_hits;
+	kstat_named_t zfetchstat_future;
+	kstat_named_t zfetchstat_stride;
+	kstat_named_t zfetchstat_past;
 	kstat_named_t zfetchstat_misses;
 	kstat_named_t zfetchstat_max_streams;
 	kstat_named_t zfetchstat_io_issued;
@@ -76,6 +83,9 @@ typedef struct zfetch_stats {

 static zfetch_stats_t zfetch_stats = {
 	{ "hits",		KSTAT_DATA_UINT64 },
+	{ "future",		KSTAT_DATA_UINT64 },
+	{ "stride",		KSTAT_DATA_UINT64 },
+	{ "past",		KSTAT_DATA_UINT64 },
 	{ "misses",		KSTAT_DATA_UINT64 },
 	{ "max_streams",	KSTAT_DATA_UINT64 },
 	{ "io_issued",		KSTAT_DATA_UINT64 },
@@ -84,6 +94,9 @@ static zfetch_stats_t zfetch_stats = {

 struct {
 	wmsum_t zfetchstat_hits;
+	wmsum_t zfetchstat_future;
+	wmsum_t zfetchstat_stride;
+	wmsum_t zfetchstat_past;
 	wmsum_t zfetchstat_misses;
 	wmsum_t zfetchstat_max_streams;
 	wmsum_t zfetchstat_io_issued;
@@ -107,6 +120,12 @@ zfetch_kstats_update(kstat_t *ksp, int rw)
 		return (EACCES);
 	zs->zfetchstat_hits.value.ui64 =
 	    wmsum_value(&zfetch_sums.zfetchstat_hits);
+	zs->zfetchstat_future.value.ui64 =
+	    wmsum_value(&zfetch_sums.zfetchstat_future);
+	zs->zfetchstat_stride.value.ui64 =
+	    wmsum_value(&zfetch_sums.zfetchstat_stride);
+	zs->zfetchstat_past.value.ui64 =
+	    wmsum_value(&zfetch_sums.zfetchstat_past);
 	zs->zfetchstat_misses.value.ui64 =
 	    wmsum_value(&zfetch_sums.zfetchstat_misses);
 	zs->zfetchstat_max_streams.value.ui64 =
@@ -122,6 +141,9 @@ void
 zfetch_init(void)
 {
 	wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
+	wmsum_init(&zfetch_sums.zfetchstat_future, 0);
+	wmsum_init(&zfetch_sums.zfetchstat_stride, 0);
+	wmsum_init(&zfetch_sums.zfetchstat_past, 0);
 	wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
 	wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
 	wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
@@ -147,6 +169,9 @@ zfetch_fini(void)
 	}

 	wmsum_fini(&zfetch_sums.zfetchstat_hits);
+	wmsum_fini(&zfetch_sums.zfetchstat_future);
+	wmsum_fini(&zfetch_sums.zfetchstat_stride);
+	wmsum_fini(&zfetch_sums.zfetchstat_past);
 	wmsum_fini(&zfetch_sums.zfetchstat_misses);
 	wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
 	wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
@@ -222,22 +247,22 @@ static void
 dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 {
 	zstream_t *zs, *zs_next, *zs_old = NULL;
-	hrtime_t now = gethrtime(), t;
+	uint_t now = gethrestime_sec(), t;

 	ASSERT(MUTEX_HELD(&zf->zf_lock));

 	/*
 	 * Delete too old streams, reusing the first found one.
 	 */
-	t = now - SEC2NSEC(zfetch_max_sec_reap);
+	t = now - zfetch_max_sec_reap;
 	for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
 		zs_next = list_next(&zf->zf_stream, zs);
 		/*
 		 * Skip if still active.  1 -- zf_stream reference.
 		 */
-		if (zfs_refcount_count(&zs->zs_refs) != 1)
+		if ((int)(zs->zs_atime - t) >= 0)
 			continue;
-		if (zs->zs_atime > t)
+		if (zfs_refcount_count(&zs->zs_refs) != 1)
 			continue;
 		if (zs_old)
 			dmu_zfetch_stream_remove(zf, zs);
@@ -246,6 +271,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	}
 	if (zs_old) {
 		zs = zs_old;
+		list_remove(&zf->zf_stream, zs);
 		goto reuse;
 	}

@@ -255,21 +281,23 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	 * for all the streams to be non-overlapping.
 	 */
 	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
-	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
+	    (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) /
 	    zfetch_max_distance));
 	if (zf->zf_numstreams >= max_streams) {
-		t = now - SEC2NSEC(zfetch_min_sec_reap);
+		t = now - zfetch_min_sec_reap;
 		for (zs = list_head(&zf->zf_stream); zs != NULL;
 		    zs = list_next(&zf->zf_stream, zs)) {
+			if ((int)(zs->zs_atime - t) >= 0)
+				continue;
 			if (zfs_refcount_count(&zs->zs_refs) != 1)
 				continue;
-			if (zs->zs_atime > t)
-				continue;
-			if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime)
+			if (zs_old == NULL ||
+			    (int)(zs_old->zs_atime - zs->zs_atime) >= 0)
 				zs_old = zs;
 		}
 		if (zs_old) {
 			zs = zs_old;
+			list_remove(&zf->zf_stream, zs);
 			goto reuse;
 		}
 		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
@@ -277,24 +305,24 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	}

 	zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
-	zs->zs_fetch = zf;
 	zfs_refcount_create(&zs->zs_callers);
 	zfs_refcount_create(&zs->zs_refs);
 	/* One reference for zf_stream. */
 	zfs_refcount_add(&zs->zs_refs, NULL);
 	zf->zf_numstreams++;
-	list_insert_head(&zf->zf_stream, zs);

 reuse:
+	list_insert_head(&zf->zf_stream, zs);
 	zs->zs_blkid = blkid;
+	/* Allow immediate stream reuse until first hit. */
+	zs->zs_atime = now - zfetch_min_sec_reap;
+	memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges));
 	zs->zs_pf_dist = 0;
+	zs->zs_ipf_dist = 0;
 	zs->zs_pf_start = blkid;
 	zs->zs_pf_end = blkid;
-	zs->zs_ipf_dist = 0;
 	zs->zs_ipf_start = blkid;
 	zs->zs_ipf_end = blkid;
-	/* Allow immediate stream reuse until first hit. */
-	zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap);
 	zs->zs_missed = B_FALSE;
 	zs->zs_more = B_FALSE;
 }
@@ -311,6 +339,120 @@ dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
 		aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
 }

+/*
+ * Process stream hit access for nblks blocks starting at zs_blkid.  Return
+ * number of blocks to proceed for after aggregation with future ranges.
+ */
+static uint64_t
+dmu_zfetch_hit(zstream_t *zs, uint64_t nblks)
+{
+	uint_t i, j;
+
+	/* Optimize sequential accesses (no future ranges). */
+	if (zs->zs_ranges[0].start == 0)
+		goto done;
+
+	/* Look for intersections with further ranges. */
+	for (i = 0; i < ZFETCH_RANGES; i++) {
+		zsrange_t *r = &zs->zs_ranges[i];
+		if (r->start == 0 || r->start > nblks)
+			break;
+		if (r->end >= nblks) {
+			nblks = r->end;
+			i++;
+			break;
+		}
+	}
+
+	/* Delete all found intersecting ranges, updates remaining. */
+	for (j = 0; i < ZFETCH_RANGES; i++, j++) {
+		if (zs->zs_ranges[i].start == 0)
+			break;
+		ASSERT3U(zs->zs_ranges[i].start, >, nblks);
+		ASSERT3U(zs->zs_ranges[i].end, >, nblks);
+		zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks;
+		zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks;
+	}
+	if (j < ZFETCH_RANGES) {
+		zs->zs_ranges[j].start = 0;
+		zs->zs_ranges[j].end = 0;
+	}
+
+done:
+	zs->zs_blkid += nblks;
+	return (nblks);
+}
+
+/*
+ * Process future stream access for nblks blocks starting at blkid.  Return
+ * number of blocks to proceed for if future ranges reach fill threshold.
+ */
+static uint64_t
+dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks)
+{
+	ASSERT3U(blkid, >, zs->zs_blkid);
+	blkid -= zs->zs_blkid;
+	ASSERT3U(blkid + nblks, <=, UINT16_MAX);
+
+	/* Search for first and last intersection or insert point. */
+	uint_t f = ZFETCH_RANGES, l = 0, i;
+	for (i = 0; i < ZFETCH_RANGES; i++) {
+		zsrange_t *r = &zs->zs_ranges[i];
+		if (r->start == 0 || r->start > blkid + nblks)
+			break;
+		if (r->end < blkid)
+			continue;
+		if (f > i)
+			f = i;
+		if (l < i)
+			l = i;
+	}
+	if (f <= l) {
+		/* Got some intersecting range, expand it if needed. */
+		if (zs->zs_ranges[f].start > blkid)
+			zs->zs_ranges[f].start = blkid;
+		zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks);
+		if (f < l) {
+			/* Got more than one intersection, remove others. */
+			for (f++, l++; l < ZFETCH_RANGES; f++, l++) {
+				zs->zs_ranges[f].start = zs->zs_ranges[l].start;
+				zs->zs_ranges[f].end = zs->zs_ranges[l].end;
+			}
+			zs->zs_ranges[ZFETCH_RANGES - 1].start = 0;
+			zs->zs_ranges[ZFETCH_RANGES - 1].end = 0;
+		}
+	} else if (i < ZFETCH_RANGES) {
+		/* Got no intersecting ranges, insert new one. */
+		for (l = ZFETCH_RANGES - 1; l > i; l--) {
+			zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start;
+			zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end;
+		}
+		zs->zs_ranges[i].start = blkid;
+		zs->zs_ranges[i].end = blkid + nblks;
+	} else {
+		/* No space left to insert.  Drop the range. */
+		return (0);
+	}
+
+	/* Check if with the new access addition we reached fill threshold. */
+	if (zfetch_hole_shift >= 16)
+		return (0);
+	uint_t hole = 0;
+	for (i = f = l = 0; i < ZFETCH_RANGES; i++) {
+		zsrange_t *r = &zs->zs_ranges[i];
+		if (r->start == 0)
+			break;
+		hole += r->start - f;
+		f = r->end;
+		if (hole <= r->end >> zfetch_hole_shift)
+			l = r->end;
+	}
+	if (l > 0)
+		return (dmu_zfetch_hit(zs, l));
+
+	return (0);
+}
+
 /*
  * This is the predictive prefetch entry point.  dmu_zfetch_prepare()
  * associates dnode access specified with blkid and nblks arguments with
@@ -365,53 +507,92 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,

 	mutex_enter(&zf->zf_lock);

 	/*
-	 * Find matching prefetch stream.  Depending on whether the accesses
+	 * Find perfect prefetch stream.  Depending on whether the accesses
 	 * are block-aligned, first block of the new access may either follow
 	 * the last block of the previous access, or be equal to it.
 	 */
+	unsigned int dbs = zf->zf_dnode->dn_datablkshift;
+	uint64_t end_blkid = blkid + nblks;
 	for (zs = list_head(&zf->zf_stream); zs != NULL;
 	    zs = list_next(&zf->zf_stream, zs)) {
 		if (blkid == zs->zs_blkid) {
-			break;
+			goto hit;
 		} else if (blkid + 1 == zs->zs_blkid) {
 			blkid++;
 			nblks--;
-			break;
+			goto hit;
 		}
 	}

 	/*
-	 * If the file is ending, remove the matching stream if found.
-	 * If not found then it is too late to create a new one now.
+	 * Find close enough prefetch stream.  Access crossing stream position
+	 * is a hit in its new part.  Access ahead of stream position considered
+	 * a hit for metadata prefetch, since we do not care about fill percent,
+	 * or stored for future otherwise.  Access behind stream position is
+	 * silently ignored, since we already skipped it reaching fill percent.
 	 */
-	uint64_t end_of_access_blkid = blkid + nblks;
-	if (end_of_access_blkid >= maxblkid) {
-		if (zs != NULL)
-			dmu_zfetch_stream_remove(zf, zs);
-		mutex_exit(&zf->zf_lock);
-		if (!have_lock)
-			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		return (NULL);
+	uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX);
+	uint_t t = gethrestime_sec() - zfetch_max_sec_reap;
+	for (zs = list_head(&zf->zf_stream); zs != NULL;
+	    zs = list_next(&zf->zf_stream, zs)) {
+		if (blkid > zs->zs_blkid) {
+			if (end_blkid <= zs->zs_blkid + max_reorder) {
+				if (!fetch_data) {
+					nblks = dmu_zfetch_hit(zs,
+					    end_blkid - zs->zs_blkid);
+					ZFETCHSTAT_BUMP(zfetchstat_stride);
+					goto future;
+				}
+				nblks = dmu_zfetch_future(zs, blkid, nblks);
+				if (nblks > 0)
+					ZFETCHSTAT_BUMP(zfetchstat_stride);
+				else
+					ZFETCHSTAT_BUMP(zfetchstat_future);
+				goto future;
+			}
+		} else if (end_blkid >= zs->zs_blkid) {
+			nblks -= zs->zs_blkid - blkid;
+			blkid += zs->zs_blkid - blkid;
+			goto hit;
+		} else if (end_blkid + max_reorder > zs->zs_blkid &&
+		    (int)(zs->zs_atime - t) >= 0) {
+			ZFETCHSTAT_BUMP(zfetchstat_past);
+			zs->zs_atime = gethrestime_sec();
+			goto out;
+		}
 	}

-	/* Exit if we already prefetched this block before. */
-	if (nblks == 0) {
-		mutex_exit(&zf->zf_lock);
-		if (!have_lock)
-			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		return (NULL);
-	}
+	/*
+	 * This access is not part of any existing stream.  Create a new
+	 * stream for it unless we are at the end of file.
+	 */
+	if (end_blkid < maxblkid)
+		dmu_zfetch_stream_create(zf, end_blkid);
+	mutex_exit(&zf->zf_lock);
+	if (!have_lock)
+		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+	ZFETCHSTAT_BUMP(zfetchstat_misses);
+	return (NULL);

-	if (zs == NULL) {
-		/*
-		 * This access is not part of any existing stream.  Create
-		 * a new stream for it.
-		 */
-		dmu_zfetch_stream_create(zf, end_of_access_blkid);
+hit:
+	nblks = dmu_zfetch_hit(zs, nblks);
+	ZFETCHSTAT_BUMP(zfetchstat_hits);
+
+future:
+	zs->zs_atime = gethrestime_sec();
+
+	/* Exit if we already prefetched for this position before. */
+	if (nblks == 0)
+		goto out;
+
+	/* If the file is ending, remove the stream. */
+	end_blkid = zs->zs_blkid;
+	if (end_blkid >= maxblkid) {
+		dmu_zfetch_stream_remove(zf, zs);
+out:
 		mutex_exit(&zf->zf_lock);
 		if (!have_lock)
 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		ZFETCHSTAT_BUMP(zfetchstat_misses);
 		return (NULL);
 	}

@@ -427,7 +608,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
 	 * than ~6% of ARC held by active prefetches.  It should help with
 	 * getting out of RAM on some badly mispredicted read patterns.
 	 */
-	unsigned int dbs = zf->zf_dnode->dn_datablkshift;
 	unsigned int nbytes = nblks << dbs;
 	unsigned int pf_nblks;
 	if (fetch_data) {
@@ -447,10 +627,10 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
 	} else {
 		pf_nblks = 0;
 	}
-	if (zs->zs_pf_start < end_of_access_blkid)
-		zs->zs_pf_start = end_of_access_blkid;
-	if (zs->zs_pf_end < end_of_access_blkid + pf_nblks)
-		zs->zs_pf_end = end_of_access_blkid + pf_nblks;
+	if (zs->zs_pf_start < end_blkid)
+		zs->zs_pf_start = end_blkid;
+	if (zs->zs_pf_end < end_blkid + pf_nblks)
+		zs->zs_pf_end = end_blkid + pf_nblks;

 	/*
 	 * Do the same for indirects, starting where we will stop reading
@@ -468,9 +648,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
 	if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
 		zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;

-	zs->zs_blkid = end_of_access_blkid;
-	/* Protect the stream from reclamation. */
-	zs->zs_atime = gethrtime();
 	zfs_refcount_add(&zs->zs_refs, NULL);
 	/* Count concurrent callers. */
 	zfs_refcount_add(&zs->zs_callers, NULL);
@@ -478,15 +655,13 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,

 	if (!have_lock)
 		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-
-	ZFETCHSTAT_BUMP(zfetchstat_hits);
 	return (zs);
 }

 void
-dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
+dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
+    boolean_t have_lock)
 {
-	zfetch_t *zf = zs->zs_fetch;
 	int64_t pf_start, pf_end, ipf_start, ipf_end;
 	int epbs, issued;

@@ -562,7 +737,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,

 	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
 	if (zs)
-		dmu_zfetch_run(zs, missed, have_lock);
+		dmu_zfetch_run(zf, zs, missed, have_lock);
 }

 ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
@@ -585,3 +760,9 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,

 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
 	"Max bytes to prefetch indirects for per stream");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW,
+	"Max request reorder distance within a stream");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW,
+	"Max log2 fraction of holes in a stream");

From 3cd9b03030e82ba4ce68d4345f5a4cdb34e622d6 Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Tue, 2 Apr 2024 15:14:54 +1100
Subject: [PATCH 31/31] vdev_disk: don't touch vbio after it's handed off to
 the kernel

After the IO is unplugged, it may complete immediately and
vbio_completion() may be called in interrupt context.  That may
interrupt or deschedule our task.  If it's the last bio, the vbio will
be freed.  Then, when we get rescheduled, we would try to write to
freed memory through vbio->.

This patch removes the cleanup, and the corresponding assert.  These
were leftovers from a previous iteration of vbio_submit() and were
always "belt and suspenders" ops anyway, never strictly required.

Reported-by: Rich Ercolani
Signed-off-by: Rob Norris
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
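The rule behind the fix generalizes: once an object has been handed to
an asynchronous completion path that may free it, the submitter must
not dereference it again.  Below is a minimal userspace sketch of the
same hazard, with hypothetical names and a pthread standing in for the
interrupt-context BIO completion; it is an illustration, not the ZFS
code.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct job {
	int id;
} job_t;

static void *
completion_thread(void *arg)
{
	job_t *job = arg;

	printf("job %d done\n", job->id);
	free(job);		/* completion owns and frees the job */
	return (NULL);
}

int
main(void)
{
	pthread_t tid;
	job_t *job = malloc(sizeof (*job));

	job->id = 1;

	int id = job->id;	/* capture anything needed *before* hand-off */
	pthread_create(&tid, NULL, completion_thread, job);

	/*
	 * From this point 'job' may already have been freed on another
	 * CPU.  Writing job->id = 0 here would be a use-after-free,
	 * which is exactly the class of bug this patch removes from
	 * vbio_submit().
	 */
	printf("submitted job %d\n", id);

	pthread_join(tid, NULL);
	return (0);
}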
---
 module/os/linux/zfs/vdev_disk.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index e1c19a085b..6366d97b6e 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -758,8 +758,6 @@ vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
 static void
 vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
 {
-	ASSERT(vbio->vbio_bdev);
-
 	/*
 	 * We plug so we can submit the BIOs as we go and only unplug them when
 	 * they are fully created and submitted.  This is important; if we don't
@@ -780,9 +778,6 @@ vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
 	vdev_submit_bio(vbio->vbio_bio);
 	blk_finish_plug(&plug);
-
-	vbio->vbio_bio = NULL;
-	vbio->vbio_bdev = NULL;
 }

 /* IO completion callback */