From 1266cebf87a1f613b02cfb998a4c7153a6560cfe Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Thu, 13 Jul 2023 11:50:34 -0400
Subject: [PATCH 01/24] FreeBSD: Fix build on stable/13 after 1302506.

Starting approximately from version 1302506 vn_lock_pair() grown two
additional arguments following head.  There is a one week hole, but
that is closet reference point we have.

Reviewed-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by:  Alexander Motin <mav@FreeBSD.org>
Sponsored by:   iXsystems, Inc.
Closes #15047
---
 module/os/freebsd/zfs/zfs_vnops_os.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index d29f00a0cb..7692200ab2 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -6263,7 +6263,8 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
 			goto bad_write_fallback;
 		}
 	} else {
-#if __FreeBSD_version >= 1400086
+#if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \
+	__FreeBSD_version >= 1400086
 		vn_lock_pair(invp, false, LK_EXCLUSIVE, outvp, false,
 		    LK_EXCLUSIVE);
 #else

From bf6cd307967b9fea5173d122c57db728431993a4 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Thu, 13 Jul 2023 18:06:57 +0200
Subject: [PATCH 02/24] FreeBSD: catch up to __FreeBSD_version 1400093

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Closes #15036
---
 include/os/freebsd/spl/sys/vnode.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/os/freebsd/spl/sys/vnode.h b/include/os/freebsd/spl/sys/vnode.h
index ab1727dca0..0779e58e49 100644
--- a/include/os/freebsd/spl/sys/vnode.h
+++ b/include/os/freebsd/spl/sys/vnode.h
@@ -36,7 +36,11 @@ struct xucred;
 typedef struct flock	flock64_t;
 typedef	struct vnode	vnode_t;
 typedef	struct vattr	vattr_t;
+#if __FreeBSD_version < 1400093
 typedef enum vtype vtype_t;
+#else
+#define	vtype_t __enum_uint8(vtype)
+#endif
 
 #include <sys/types.h>
 #include <sys/queue.h>

From b4e630b00c996a0b11f34e51488869cf3ab15d5e Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Thu, 13 Jul 2023 12:12:55 -0400
Subject: [PATCH 03/24] Add missed DMU_PROJECTUSED_OBJECT prefetch.

It seems 9c5167d19f "Project Quota on ZFS" missed to add prefetch
for DMU_PROJECTUSED_OBJECT during scan (scrub/resilver).  It should
not cause visible problems, but may affect scub/resilver performance.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15024
---
 module/zfs/dsl_scan.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 50428bff3e..ecdeba80b7 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -2015,6 +2015,11 @@ dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
 		    zb->zb_objset, DMU_META_DNODE_OBJECT);
 
 		if (OBJSET_BUF_HAS_USERUSED(buf)) {
+			if (OBJSET_BUF_HAS_PROJECTUSED(buf)) {
+				dsl_scan_prefetch_dnode(scn,
+				    &osp->os_projectused_dnode, zb->zb_objset,
+				    DMU_PROJECTUSED_OBJECT);
+			}
 			dsl_scan_prefetch_dnode(scn,
 			    &osp->os_groupused_dnode, zb->zb_objset,
 			    DMU_GROUPUSED_OBJECT);

From e613e4bbe397ed4eea08502cc8694bc990ab5ebe Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 14 Jul 2023 19:11:46 -0400
Subject: [PATCH 04/24] Avoid extra snprintf() in dsl_deadlist_merge().

Since we are already iterating the ZAP, we have exact string key to
remove, we do not need to call zap_remove_int() with the int key we
just converted, we can call zap_remove() for the original string.

This should make no functional change, only a micro-optimization.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15056

(cherry picked from commit fdba8cbb796cb089c3d6eefa833f5176b0474c29)
---
 module/zfs/dsl_deadlist.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c
index 181efd8ed6..47c234f76c 100644
--- a/module/zfs/dsl_deadlist.c
+++ b/module/zfs/dsl_deadlist.c
@@ -892,9 +892,9 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
 	for (zap_cursor_init(&zc, dl->dl_os, obj);
 	    (error = zap_cursor_retrieve(&zc, za)) == 0;
 	    zap_cursor_advance(&zc)) {
-		uint64_t mintxg = zfs_strtonum(za->za_name, NULL);
-		dsl_deadlist_insert_bpobj(dl, za->za_first_integer, mintxg, tx);
-		VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
+		dsl_deadlist_insert_bpobj(dl, za->za_first_integer,
+		    zfs_strtonum(za->za_name, NULL), tx);
+		VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx));
 		if (perror == 0) {
 			dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
 			    zfs_strtonum(pza->za_name, NULL));

From 56ed389a575a63bf21fa293b21a2d27dd67b7bad Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 14 Jul 2023 19:16:40 -0400
Subject: [PATCH 05/24] Fix raw receive with different indirect block size.

Unlike regular receive, raw receive require destination to have the
same block structure as the source.  In case of dnode reclaim this
triggers two special cases, requiring special handling:
 - If dn_nlevels == 1, we can change the ibs, but dnode_set_blksz()
should not dirty the data buffer if block size does not change, or
durign receive dbuf_dirty_lightweight() will trigger assertion.
 - If dn_nlevels > 1, we just can't change the ibs, dnode_set_blksz()
would fail and receive_object would trigger assertion, so we should
destroy and recreate the dnode from scratch.

Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15039

(cherry picked from commit c4e8742149e31a77dc074f3baacfefed3ccb800e)
---
 module/zfs/dmu_recv.c | 22 ++++++++++++----------
 module/zfs/dnode.c    | 31 ++++++++++++++++---------------
 2 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 2fdd7c1ece..05ca91717c 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -1795,17 +1795,19 @@ receive_handle_existing_object(const struct receive_writer_arg *rwa,
 	}
 
 	/*
-	 * The dmu does not currently support decreasing nlevels
-	 * or changing the number of dnode slots on an object. For
-	 * non-raw sends, this does not matter and the new object
-	 * can just use the previous one's nlevels. For raw sends,
-	 * however, the structure of the received dnode (including
-	 * nlevels and dnode slots) must match that of the send
-	 * side. Therefore, instead of using dmu_object_reclaim(),
-	 * we must free the object completely and call
-	 * dmu_object_claim_dnsize() instead.
+	 * The dmu does not currently support decreasing nlevels or changing
+	 * indirect block size if there is already one, same as changing the
+	 * number of of dnode slots on an object.  For non-raw sends this
+	 * does not matter and the new object can just use the previous one's
+	 * parameters.  For raw sends, however, the structure of the received
+	 * dnode (including indirects and dnode slots) must match that of the
+	 * send side.  Therefore, instead of using dmu_object_reclaim(), we
+	 * must free the object completely and call dmu_object_claim_dnsize()
+	 * instead.
 	 */
-	if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) ||
+	if ((rwa->raw && ((doi->doi_indirection > 1 &&
+	    indblksz != doi->doi_metadata_block_size) ||
+	    drro->drr_nlevels < doi->doi_indirection)) ||
 	    dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) {
 		err = dmu_free_long_object(rwa->os, drro->drr_object);
 		if (err != 0)
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index d15268cd7b..7cf03264dc 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1882,7 +1882,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 	if (ibs == dn->dn_indblkshift)
 		ibs = 0;
 
-	if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
+	if (size == dn->dn_datablksz && ibs == 0)
 		return (0);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
@@ -1905,24 +1905,25 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 	if (ibs && dn->dn_nlevels != 1)
 		goto fail;
 
-	/* resize the old block */
-	err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
-	if (err == 0) {
-		dbuf_new_size(db, size, tx);
-	} else if (err != ENOENT) {
-		goto fail;
-	}
-
-	dnode_setdblksz(dn, size);
 	dnode_setdirty(dn, tx);
-	dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
+	if (size != dn->dn_datablksz) {
+		/* resize the old block */
+		err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
+		if (err == 0) {
+			dbuf_new_size(db, size, tx);
+		} else if (err != ENOENT) {
+			goto fail;
+		}
+
+		dnode_setdblksz(dn, size);
+		dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size;
+		if (db)
+			dbuf_rele(db, FTAG);
+	}
 	if (ibs) {
 		dn->dn_indblkshift = ibs;
-		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+		dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
 	}
-	/* release after we have fixed the blocksize in the dnode */
-	if (db)
-		dbuf_rele(db, FTAG);
 
 	rw_exit(&dn->dn_struct_rwlock);
 	return (0);

From f917cf1c0324de2816e73149a81c4c6dd43ce542 Mon Sep 17 00:00:00 2001
From: Alan Somers <asomers@gmail.com>
Date: Fri, 14 Jul 2023 17:13:15 -0600
Subject: [PATCH 06/24] Fix the ZFS checksum error histograms with larger
 record sizes

My analysis in PR #14716 was incorrect.  Each histogram bucket contains
the number of incorrect bits, by position in a 64-bit word, over the
entire record.  8-bit buckets can overflow for record sizes above 2k.
To forestall that, saturate each bucket at 255.  That should still get
the point across: either all bits are equally wrong, or just a couple
are.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alan Somers <asomers@gmail.com>
Sponsored-by: Axcient
Closes #15049
---
 module/zfs/zfs_fm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index c42ef048dd..2754ceec83 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -790,7 +790,7 @@ update_histogram(uint64_t value_arg, uint8_t *hist, uint32_t *count)
 	/* We store the bits in big-endian (largest-first) order */
 	for (i = 0; i < 64; i++) {
 		if (value & (1ull << i)) {
-			hist[63 - i]++;
+			hist[63 - i] = MAX(hist[63 - i], hist[63 - i] + 1);
 			++bits;
 		}
 	}

From 5299f4f289165421cceac5c2d8d6703781ad0ae9 Mon Sep 17 00:00:00 2001
From: Yuri Pankov <113725409+yuripv@users.noreply.github.com>
Date: Thu, 20 Jul 2023 18:06:55 +0200
Subject: [PATCH 07/24] set autotrim default to 'off' everywhere

As it turns out having autotrim default to 'on' on FreeBSD never really
worked due to mess with defines where userland and kernel module were
getting different default values (userland was defaulting to 'off',
module was thinking it's 'on').

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Yuri Pankov <yuripv@FreeBSD.org>
Closes #15079
---
 include/sys/spa.h           | 6 ------
 module/zcommon/zpool_prop.c | 2 +-
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/include/sys/spa.h b/include/sys/spa.h
index 1fa2044008..b908556874 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -723,16 +723,10 @@ typedef enum spa_mode {
  * Send TRIM commands in-line during normal pool operation while deleting.
  *	OFF: no
  *	ON: yes
- * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources.
  */
 typedef enum {
 	SPA_AUTOTRIM_OFF = 0,	/* default */
 	SPA_AUTOTRIM_ON,
-#ifdef IN_FREEBSD_BASE
-	SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON,
-#else
-	SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF,
-#endif
 } spa_autotrim_t;
 
 /*
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index 459ff62fc9..c4aca04a96 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -160,7 +160,7 @@ zpool_prop_init(void)
 	    "wait | continue | panic", "FAILMODE", failuremode_table,
 	    sfeatures);
 	zprop_register_index(ZPOOL_PROP_AUTOTRIM, "autotrim",
-	    SPA_AUTOTRIM_DEFAULT, PROP_DEFAULT, ZFS_TYPE_POOL,
+	    SPA_AUTOTRIM_OFF, PROP_DEFAULT, ZFS_TYPE_POOL,
 	    "on | off", "AUTOTRIM", boolean_table, sfeatures);
 
 	/* hidden properties */

From 931dc705505a83bf8d73adfca6e2cd4108b18a0a Mon Sep 17 00:00:00 2001
From: Coleman Kane <ckane@colemankane.org>
Date: Fri, 14 Jul 2023 19:32:49 -0400
Subject: [PATCH 08/24] Linux 6.5 compat: intptr_t definition is canonically
 signed

Make the version here match that elsewhere in the kernel and system
headers.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Coleman Kane <ckane@colemankane.org>
Closes #15058
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
---
 include/os/linux/spl/sys/types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/os/linux/spl/sys/types.h b/include/os/linux/spl/sys/types.h
index a7666187ec..d89a91c36f 100644
--- a/include/os/linux/spl/sys/types.h
+++ b/include/os/linux/spl/sys/types.h
@@ -38,7 +38,7 @@ typedef unsigned long		ulong_t;
 typedef unsigned long long	u_longlong_t;
 typedef long long		longlong_t;
 
-typedef unsigned long		intptr_t;
+typedef long			intptr_t;
 typedef unsigned long long	rlim64_t;
 
 typedef struct task_struct	kthread_t;

From 1bc244ae93bec6cc6877dad0c1f9fa2aa95acb36 Mon Sep 17 00:00:00 2001
From: Coleman Kane <ckane@colemankane.org>
Date: Fri, 14 Jul 2023 19:33:51 -0400
Subject: [PATCH 09/24] Linux 6.5 compat: BLK_STS_NEXUS renamed to
 BLK_STS_RESV_CONFLICT

This change was introduced in Linux commit
7ba150834b840f6f5cdd07ca69a4ccf39df59a66

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Coleman Kane <ckane@colemankane.org>
Closes #15059
---
 config/kernel-blkdev.m4                       | 25 +++++++++++++++++++
 include/os/linux/kernel/linux/blkdev_compat.h |  8 ++++++
 2 files changed, 33 insertions(+)

diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4
index 28e5364581..63d719f9c2 100644
--- a/config/kernel-blkdev.m4
+++ b/config/kernel-blkdev.m4
@@ -443,6 +443,29 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS], [
 	])
 ])
 
+dnl #
+dnl # 6.5.x API change
+dnl # BLK_STS_NEXUS replaced with BLK_STS_RESV_CONFLICT
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT], [
+	ZFS_LINUX_TEST_SRC([blk_sts_resv_conflict], [
+		#include <linux/blkdev.h>
+	],[
+		blk_status_t s __attribute__ ((unused)) = BLK_STS_RESV_CONFLICT;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT], [
+	AC_MSG_CHECKING([whether BLK_STS_RESV_CONFLICT is defined])
+		ZFS_LINUX_TEST_RESULT([blk_sts_resv_conflict], [
+			AC_DEFINE(HAVE_BLK_STS_RESV_CONFLICT, 1, [BLK_STS_RESV_CONFLICT is defined])
+			AC_MSG_RESULT(yes)
+		], [
+			AC_MSG_RESULT(no)
+		])
+	])
+])
+
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
 	ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH
 	ZFS_AC_KERNEL_SRC_BLKDEV_PUT
@@ -458,6 +481,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
 	ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ
 	ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV
+	ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
@@ -476,4 +500,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
 	ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE
 	ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ
 	ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV
+	ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT
 ])
diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h
index c5c6385be6..f1448587b9 100644
--- a/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/include/os/linux/kernel/linux/blkdev_compat.h
@@ -181,7 +181,11 @@ bi_status_to_errno(blk_status_t status)
 		return (ENOLINK);
 	case BLK_STS_TARGET:
 		return (EREMOTEIO);
+#ifdef HAVE_BLK_STS_RESV_CONFLICT
+	case BLK_STS_RESV_CONFLICT:
+#else
 	case BLK_STS_NEXUS:
+#endif
 		return (EBADE);
 	case BLK_STS_MEDIUM:
 		return (ENODATA);
@@ -215,7 +219,11 @@ errno_to_bi_status(int error)
 	case EREMOTEIO:
 		return (BLK_STS_TARGET);
 	case EBADE:
+#ifdef HAVE_BLK_STS_RESV_CONFLICT
+		return (BLK_STS_RESV_CONFLICT);
+#else
 		return (BLK_STS_NEXUS);
+#endif
 	case ENODATA:
 		return (BLK_STS_MEDIUM);
 	case EILSEQ:

From 73ba5df31a07bd6ab96dc85cc723ff8721822ce8 Mon Sep 17 00:00:00 2001
From: Coleman Kane <ckane@colemankane.org>
Date: Thu, 20 Jul 2023 12:09:25 -0400
Subject: [PATCH 10/24] Linux 6.5 compat: disk_check_media_change() was added

The disk_check_media_change() function was added which replaces
bdev_check_media_change.  This change was introduced in 6.5rc1
444aa2c58cb3b6cfe3b7cc7db6c294d73393a894 and the new function takes a
gendisk* as its argument, no longer a block_device*. Thus, bdev->bd_disk
is now used to pass the expected data.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Coleman Kane <ckane@colemankane.org>
Closes #15060
---
 config/kernel-blkdev.m4                       | 29 +++++++++++++++++++
 include/os/linux/kernel/linux/blkdev_compat.h |  2 ++
 2 files changed, 31 insertions(+)

diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4
index 63d719f9c2..887acee670 100644
--- a/config/kernel-blkdev.m4
+++ b/config/kernel-blkdev.m4
@@ -103,6 +103,33 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE], [
 	])
 ])
 
+dnl #
+dnl # 6.5.x API change
+dnl # disk_check_media_change() was added
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [
+	ZFS_LINUX_TEST_SRC([disk_check_media_change], [
+		#include <linux/fs.h>
+		#include <linux/blkdev.h>
+	], [
+		struct block_device *bdev = NULL;
+		bool error;
+
+		error = disk_check_media_change(bdev->bd_disk);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [
+	AC_MSG_CHECKING([whether disk_check_media_change() exists])
+	ZFS_LINUX_TEST_RESULT([disk_check_media_change], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_DISK_CHECK_MEDIA_CHANGE, 1,
+		    [disk_check_media_change() exists])
+	], [
+		AC_MSG_RESULT(no)
+	])
+])
+
 dnl #
 dnl # bdev_kobj() is introduced from 5.12
 dnl #
@@ -481,6 +508,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
 	ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ
 	ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV
+	ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE
 	ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT
 ])
 
@@ -500,5 +528,6 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
 	ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE
 	ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ
 	ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV
+	ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE
 	ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT
 ])
diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h
index f1448587b9..e0f20ba320 100644
--- a/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/include/os/linux/kernel/linux/blkdev_compat.h
@@ -345,6 +345,8 @@ zfs_check_media_change(struct block_device *bdev)
 	return (0);
 }
 #define	vdev_bdev_reread_part(bdev)	zfs_check_media_change(bdev)
+#elif defined(HAVE_DISK_CHECK_MEDIA_CHANGE)
+#define	vdev_bdev_reread_part(bdev)	disk_check_media_change(bdev->bd_disk)
 #else
 /*
  * This is encountered if check_disk_change() and bdev_check_media_change()

From 83b0967c1fe57ce9b59ab553d282090283ca09ed Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Thu, 20 Jul 2023 12:10:04 -0400
Subject: [PATCH 11/24] Do not request data L1 buffers on scan prefetch.

Set ARC_FLAG_NO_BUF when prefetching data L1 buffers for scan.  We
do not prefetch data L0 buffers, so we do not need the L1 buffers,
only want them to be ready in ARC. This saves some CPU time on the
buffers decompression.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15029
---
 module/zfs/dsl_scan.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index ecdeba80b7..34012db82d 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -2080,10 +2080,16 @@ dsl_scan_prefetch_thread(void *arg)
 			zio_flags |= ZIO_FLAG_RAW;
 		}
 
+		/* We don't need data L1 buffer since we do not prefetch L0. */
+		blkptr_t *bp = &spic->spic_bp;
+		if (BP_GET_LEVEL(bp) == 1 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
+		    BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+			flags |= ARC_FLAG_NO_BUF;
+
 		/* issue the prefetch asynchronously */
-		(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
-		    &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
-		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb);
+		(void) arc_read(scn->scn_zio_root, spa, bp,
+		    dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB,
+		    zio_flags, &flags, &spic->spic_zb);
 
 		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
 	}

From f5f5a2db9544bb09e14db47936b985264eb0ff87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wojciech=20Ma=C5=82ota-W=C3=B3jcik?=
 <59281144+outofforest@users.noreply.github.com>
Date: Thu, 20 Jul 2023 18:55:22 +0200
Subject: [PATCH 12/24] Rollback before zfs root is mounted
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On my machines I observe random failures caused by rollback happening
after zfs root is mounted. I've observed two types of failures:

- zfs-rollback-bootfs.service fails saying that rollback must be
  done just before mounting the dataset
- boot process fails and rescue console is entered.

After making this modification and testing it for couple of days
none of those problems have been observed anymore.

I don't know if `dracut-mount.service` is still needed in the
`After` directive. Maybe someone else is able to address this?

Reviewed-by: Gregory Bartholomew <gregory.lee.bartholomew@gmail.com>
Signed-off-by: Wojciech Małota-Wójcik <59281144+outofforest@users.noreply.github.com>
Closes #15025
---
 contrib/dracut/90zfs/zfs-rollback-bootfs.service.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in b/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in
index 68fdcb1f32..12d8ac703e 100644
--- a/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in
+++ b/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in
@@ -2,7 +2,7 @@
 Description=Rollback bootfs just before it is mounted
 Requisite=zfs-import.target
 After=zfs-import.target dracut-pre-mount.service zfs-snapshot-bootfs.service
-Before=dracut-mount.service
+Before=dracut-mount.service sysroot.mount
 DefaultDependencies=no
 ConditionKernelCommandLine=bootfs.rollback
 ConditionEnvironment=BOOTFS

From d8011707ccb0fa1ba551777923893a137bd198d8 Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Thu, 20 Jul 2023 21:57:16 +0500
Subject: [PATCH 13/24] Ignore pool ashift property during vdev attachment

Ashift can be set for a vdev only during its creation, and the
top-level vdev does not change when a vdev is attached or replaced.
The ashift property should not be used during attachment, as it
does not allow attaching/replacing a vdev if the pool's ashift
property is increased after the existing vdev was created. Instead,
we should be able to attach the vdev if the attached vdev can
satisfy the ashift requirement with its parent.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #15061
---
 include/sys/vdev_impl.h                       |  1 +
 module/zfs/vdev.c                             | 14 +++++---
 .../cli_root/zpool_attach/attach-o_ashift.ksh | 30 ++++++-----------
 .../zpool_replace/replace-o_ashift.ksh        | 32 +++++++------------
 .../zpool_replace/replace_prop_ashift.ksh     | 24 +++-----------
 5 files changed, 36 insertions(+), 65 deletions(-)

diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 2b22b973ba..5f4e82ad86 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -420,6 +420,7 @@ struct vdev {
 	boolean_t	vdev_copy_uberblocks;  /* post expand copy uberblocks */
 	boolean_t	vdev_resilver_deferred;  /* resilver deferred */
 	boolean_t	vdev_kobj_flag; /* kobj event record */
+	boolean_t	vdev_attaching; /* vdev attach ashift handling */
 	vdev_queue_t	vdev_queue;	/* I/O deadline schedule queue	*/
 	spa_aux_vdev_t	*vdev_aux;	/* for l2cache and spares vdevs	*/
 	zio_t		*vdev_probe_zio; /* root of current probe	*/
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 30551feb63..1199bf5d32 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -889,9 +889,15 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 	    &vd->vdev_not_present);
 
 	/*
-	 * Get the alignment requirement.
+	 * Get the alignment requirement. Ignore pool ashift for vdev
+	 * attach case.
 	 */
-	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
+	if (alloctype != VDEV_ALLOC_ATTACH) {
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
+		    &vd->vdev_ashift);
+	} else {
+		vd->vdev_attaching = B_TRUE;
+	}
 
 	/*
 	 * Retrieve the vdev creation time.
@@ -2144,9 +2150,9 @@ vdev_open(vdev_t *vd)
 				return (SET_ERROR(EDOM));
 			}
 
-			if (vd->vdev_top == vd) {
+			if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE)
 				vdev_ashift_optimize(vd);
-			}
+			vd->vdev_attaching = B_FALSE;
 		}
 		if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
 		    vd->vdev_ashift > ASHIFT_MAX)) {
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh
index 6ccec6abd6..574cb7654d 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh
@@ -35,7 +35,7 @@
 #
 # STRATEGY:
 #	1. Create various pools with different ashift values.
-#	2. Verify 'attach -o ashift=<n>' works only with allowed values.
+#	2. Verify 'attach' works.
 #
 
 verify_runnable "global"
@@ -66,26 +66,14 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16
 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
 for ashift in ${ashifts[@]}
 do
-	for cmdval in ${ashifts[@]}
-	do
-		log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1
-		log_must verify_ashift $disk1 $ashift
-
-		# ashift_of(attached_disk) <= ashift_of(existing_vdev)
-		if [[ $cmdval -le $ashift ]]
-		then
-			log_must zpool attach -o ashift=$cmdval $TESTPOOL1 \
-			    $disk1 $disk2
-			log_must verify_ashift $disk2 $ashift
-		else
-			log_mustnot zpool attach -o ashift=$cmdval $TESTPOOL1 \
-			    $disk1 $disk2
-		fi
-		# clean things for the next run
-		log_must zpool destroy $TESTPOOL1
-		log_must zpool labelclear $disk1
-		log_must zpool labelclear $disk2
-	done
+	log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1
+	log_must verify_ashift $disk1 $ashift
+	log_must zpool attach $TESTPOOL1 $disk1 $disk2
+	log_must verify_ashift $disk2 $ashift
+	# clean things for the next run
+	log_must zpool destroy $TESTPOOL1
+	log_must zpool labelclear $disk1
+	log_must zpool labelclear $disk2
 done
 
 typeset badvals=("off" "on" "1" "8" "17" "1b" "ff" "-")
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh
index 37ed0062e6..9595e51241 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh
@@ -35,7 +35,7 @@
 #
 # STRATEGY:
 #	1. Create various pools with different ashift values.
-#	2. Verify 'replace -o ashift=<n>' works only with allowed values.
+#	2. Verify 'replace' works.
 #
 
 verify_runnable "global"
@@ -66,26 +66,16 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16
 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
 for ashift in ${ashifts[@]}
 do
-	for cmdval in ${ashifts[@]}
-	do
-		log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1
-		log_must verify_ashift $disk1 $ashift
-		# ashift_of(replacing_disk) <= ashift_of(existing_vdev)
-		if [[ $cmdval -le $ashift ]]
-		then
-			log_must zpool replace -o ashift=$cmdval $TESTPOOL1 \
-			    $disk1 $disk2
-			log_must verify_ashift $disk2 $ashift
-			wait_replacing $TESTPOOL1
-		else
-			log_mustnot zpool replace -o ashift=$cmdval $TESTPOOL1 \
-			    $disk1 $disk2
-		fi
-		# clean things for the next run
-		log_must zpool destroy $TESTPOOL1
-		log_must zpool labelclear $disk1
-		log_must zpool labelclear $disk2
-	done
+	log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1
+	log_must verify_ashift $disk1 $ashift
+	# ashift_of(replacing_disk) <= ashift_of(existing_vdev)
+	log_must zpool replace $TESTPOOL1 $disk1 $disk2
+	log_must verify_ashift $disk2 $ashift
+	wait_replacing $TESTPOOL1
+	# clean things for the next run
+	log_must zpool destroy $TESTPOOL1
+	log_must zpool labelclear $disk1
+	log_must zpool labelclear $disk2
 done
 
 typeset badvals=("off" "on" "1" "8" "17" "1b" "ff" "-")
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh
index ffdaf91a28..b4ac18e5ea 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh
@@ -34,10 +34,8 @@
 #
 # STRATEGY:
 #	1. Create a pool with default values.
-#	2. Verify 'zpool replace' uses the ashift pool property value when
-#	   replacing an existing device.
-#	3. Verify the default ashift value can still be overridden by manually
-#	   specifying '-o ashift=<n>' from the command line.
+#	2. Override the pool ashift property.
+#	3. Verify 'zpool replace' works.
 #
 
 verify_runnable "global"
@@ -72,21 +70,9 @@ do
 	do
 		log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1
 		log_must zpool set ashift=$pprop $TESTPOOL1
-		# ashift_of(replacing_disk) <= ashift_of(existing_vdev)
-		if [[ $pprop -le $ashift ]]
-		then
-			log_must zpool replace $TESTPOOL1 $disk1 $disk2
-			wait_replacing $TESTPOOL1
-			log_must verify_ashift $disk2 $ashift
-		else
-			# cannot replace if pool prop ashift > vdev ashift
-			log_mustnot zpool replace $TESTPOOL1 $disk1 $disk2
-			# verify we can override the pool prop value manually
-			log_must zpool replace -o ashift=$ashift $TESTPOOL1 \
-			    $disk1 $disk2
-			wait_replacing $TESTPOOL1
-			log_must verify_ashift $disk2 $ashift
-		fi
+		log_must zpool replace $TESTPOOL1 $disk1 $disk2
+		wait_replacing $TESTPOOL1
+		log_must verify_ashift $disk2 $ashift
 		# clean things for the next run
 		log_must zpool destroy $TESTPOOL1
 		log_must zpool labelclear $disk1

From 1a2e486d25e7aa8ad42917f2d32a6c2d25a23232 Mon Sep 17 00:00:00 2001
From: Yuri Pankov <113725409+yuripv@users.noreply.github.com>
Date: Thu, 20 Jul 2023 19:21:47 +0200
Subject: [PATCH 14/24] Don't panic if setting vdev properties is unsupported
 for this vdev type

Check that vdev has valid zap and bail out early.

While here, move objid selection out of the loop, it's not going to
change.

Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Yuri Pankov <yuripv@FreeBSD.org>
Closes #15063
---
 module/zfs/vdev.c | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 1199bf5d32..b6f8c0ab30 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -5694,6 +5694,7 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx)
 	objset_t *mos = spa->spa_meta_objset;
 	nvpair_t *elem = NULL;
 	uint64_t vdev_guid;
+	uint64_t objid;
 	nvlist_t *nvprops;
 
 	vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV);
@@ -5704,31 +5705,28 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx)
 	if (vd == NULL)
 		return;
 
+	/*
+	 * Set vdev property values in the vdev props mos object.
+	 */
+	if (vd->vdev_root_zap != 0) {
+		objid = vd->vdev_root_zap;
+	} else if (vd->vdev_top_zap != 0) {
+		objid = vd->vdev_top_zap;
+	} else if (vd->vdev_leaf_zap != 0) {
+		objid = vd->vdev_leaf_zap;
+	} else {
+		panic("unexpected vdev type");
+	}
+
 	mutex_enter(&spa->spa_props_lock);
 
 	while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
-		uint64_t intval, objid = 0;
+		uint64_t intval;
 		const char *strval;
 		vdev_prop_t prop;
 		const char *propname = nvpair_name(elem);
 		zprop_type_t proptype;
 
-		/*
-		 * Set vdev property values in the vdev props mos object.
-		 */
-		if (vd->vdev_root_zap != 0) {
-			objid = vd->vdev_root_zap;
-		} else if (vd->vdev_top_zap != 0) {
-			objid = vd->vdev_top_zap;
-		} else if (vd->vdev_leaf_zap != 0) {
-			objid = vd->vdev_leaf_zap;
-		} else {
-			/*
-			 * XXX: implement vdev_props_set_check()
-			 */
-			panic("vdev not root/top/leaf");
-		}
-
 		switch (prop = vdev_name_to_prop(propname)) {
 		case VDEV_PROP_USERPROP:
 			if (vdev_prop_user(propname)) {
@@ -5797,6 +5795,12 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 
 	ASSERT(vd != NULL);
 
+	/* Check that vdev has a zap we can use */
+	if (vd->vdev_root_zap == 0 &&
+	    vd->vdev_top_zap == 0 &&
+	    vd->vdev_leaf_zap == 0)
+		return (SET_ERROR(EINVAL));
+
 	if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV,
 	    &vdev_guid) != 0)
 		return (SET_ERROR(EINVAL));

From e037327bfee02717fc8f53fd848a09f370e14499 Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Thu, 20 Jul 2023 22:23:52 +0500
Subject: [PATCH 15/24] spa_min_alloc should be GCD, not min

Since spa_min_alloc may not be a power of 2, unlike ashifts, in the
case of DRAID, we should not select the minimal value among several
vdevs. Rounding to a multiple of it is unlikely to work for other
vdevs. Instead, using the greatest common divisor produces smaller
yet more reasonable results.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #15067
---
 include/sys/spa_impl.h |  1 +
 module/zfs/spa_misc.c  |  1 +
 module/zfs/vdev.c      | 36 ++++++++++++++++++++++++++++++++----
 module/zfs/zio.c       | 22 +++++++++++++++++-----
 4 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 44afa76328..588c72f6e4 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -250,6 +250,7 @@ struct spa {
 	uint64_t	spa_min_ashift;		/* of vdevs in normal class */
 	uint64_t	spa_max_ashift;		/* of vdevs in normal class */
 	uint64_t	spa_min_alloc;		/* of vdevs in normal class */
+	uint64_t	spa_gcd_alloc;		/* of vdevs in normal class */
 	uint64_t	spa_config_guid;	/* config pool guid */
 	uint64_t	spa_load_guid;		/* spa_load initialized guid */
 	uint64_t	spa_last_synced_guid;	/* last synced guid */
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 06f6407690..3b355e0deb 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -772,6 +772,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	spa->spa_min_ashift = INT_MAX;
 	spa->spa_max_ashift = 0;
 	spa->spa_min_alloc = INT_MAX;
+	spa->spa_gcd_alloc = INT_MAX;
 
 	/* Reset cached value */
 	spa->spa_dedup_dspace = ~0ULL;
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index b6f8c0ab30..f3812b843e 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1399,6 +1399,36 @@ vdev_remove_parent(vdev_t *cvd)
 	vdev_free(mvd);
 }
 
+/*
+ * Choose GCD for spa_gcd_alloc.
+ */
+static uint64_t
+vdev_gcd(uint64_t a, uint64_t b)
+{
+	while (b != 0) {
+		uint64_t t = b;
+		b = a % b;
+		a = t;
+	}
+	return (a);
+}
+
+/*
+ * Set spa_min_alloc and spa_gcd_alloc.
+ */
+static void
+vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
+{
+	if (min_alloc < spa->spa_min_alloc)
+		spa->spa_min_alloc = min_alloc;
+	if (spa->spa_gcd_alloc == INT_MAX) {
+		spa->spa_gcd_alloc = min_alloc;
+	} else {
+		spa->spa_gcd_alloc = vdev_gcd(min_alloc,
+		    spa->spa_gcd_alloc);
+	}
+}
+
 void
 vdev_metaslab_group_create(vdev_t *vd)
 {
@@ -1451,8 +1481,7 @@ vdev_metaslab_group_create(vdev_t *vd)
 				spa->spa_min_ashift = vd->vdev_ashift;
 
 			uint64_t min_alloc = vdev_get_min_alloc(vd);
-			if (min_alloc < spa->spa_min_alloc)
-				spa->spa_min_alloc = min_alloc;
+			vdev_spa_set_alloc(spa, min_alloc);
 		}
 	}
 }
@@ -2213,8 +2242,7 @@ vdev_open(vdev_t *vd)
 	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
 	    vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
 		uint64_t min_alloc = vdev_get_min_alloc(vd);
-		if (min_alloc < spa->spa_min_alloc)
-			spa->spa_min_alloc = min_alloc;
+		vdev_spa_set_alloc(spa, min_alloc);
 	}
 
 	/*
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 10279fde89..3f5e6a08d8 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1596,6 +1596,19 @@ zio_shrink(zio_t *zio, uint64_t size)
 	}
 }
 
+/*
+ * Round provided allocation size up to a value that can be allocated
+ * by at least some vdev(s) in the pool with minimum or no additional
+ * padding and without extra space usage on others
+ */
+static uint64_t
+zio_roundup_alloc_size(spa_t *spa, uint64_t size)
+{
+	if (size > spa->spa_min_alloc)
+		return (roundup(size, spa->spa_gcd_alloc));
+	return (spa->spa_min_alloc);
+}
+
 /*
  * ==========================================================================
  * Prepare to read and write logical blocks
@@ -1802,9 +1815,8 @@ zio_write_compress(zio_t *zio)
 			 * in that we charge for the padding used to fill out
 			 * the last sector.
 			 */
-			ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT);
-			size_t rounded = (size_t)roundup(psize,
-			    spa->spa_min_alloc);
+			size_t rounded = (size_t)zio_roundup_alloc_size(spa,
+			    psize);
 			if (rounded >= lsize) {
 				compress = ZIO_COMPRESS_OFF;
 				zio_buf_free(cbuf, lsize);
@@ -1847,8 +1859,8 @@ zio_write_compress(zio_t *zio)
 		 * take this codepath because it will change the on-disk block
 		 * and decryption will fail.
 		 */
-		size_t rounded = MIN((size_t)roundup(psize,
-		    spa->spa_min_alloc), lsize);
+		size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize),
+		    lsize);
 
 		if (rounded != psize) {
 			abd_t *cdata = abd_alloc_linear(rounded, B_TRUE);

From b221f4394312ff23550fa60624e3f38bed2a1a1e Mon Sep 17 00:00:00 2001
From: Chunwei Chen <tuxoko@gmail.com>
Date: Thu, 20 Jul 2023 10:30:21 -0700
Subject: [PATCH 16/24] Fix zpl_test_super race with zfs_umount

We cannot call zpl_enter in zpl_test_super, because zpl_test_super is
under spinlock so we can't sleep, and also because zpl_test_super is
called without sb->s_umount taken, so it's possible we would race with
zfs_umount and call zpl_enter on freed zfsvfs.

Here's an stack trace when this happens:
[ 2379.114837] VERIFY(cvp->cv_magic == CV_MAGIC) failed
[ 2379.114845] PANIC at spl-condvar.c:497:__cv_broadcast()
[ 2379.114854] Kernel panic - not syncing: VERIFY(cvp->cv_magic == CV_MAGIC) failed
[ 2379.115012] Call Trace:
[ 2379.115019]  dump_stack+0x74/0x96
[ 2379.115024]  panic+0x114/0x2f6
[ 2379.115035]  spl_panic+0xcf/0xfc [spl]
[ 2379.115477]  __cv_broadcast+0x68/0xa0 [spl]
[ 2379.115585]  rrw_exit+0xb8/0x310 [zfs]
[ 2379.115696]  rrm_exit+0x4a/0x80 [zfs]
[ 2379.115808]  zpl_test_super+0xa9/0xd0 [zfs]
[ 2379.115920]  sget+0xd1/0x230
[ 2379.116033]  zpl_mount+0xdc/0x230 [zfs]
[ 2379.116037]  legacy_get_tree+0x28/0x50
[ 2379.116039]  vfs_get_tree+0x27/0xc0
[ 2379.116045]  path_mount+0x2fe/0xa70
[ 2379.116048]  do_mount+0x80/0xa0
[ 2379.116050]  __x64_sys_mount+0x8b/0xe0
[ 2379.116052]  do_syscall_64+0x35/0x50
[ 2379.116054]  entry_SYSCALL_64_after_hwframe+0x61/0xc6
[ 2379.116057] RIP: 0033:0x7f9912e8b26a

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #15077
---
 module/os/linux/zfs/zfs_vfsops.c |  1 +
 module/os/linux/zfs/zpl_super.c  | 39 ++++++++++++++++++++------------
 2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c
index 6b6293b9e4..87c4e6dcaf 100644
--- a/module/os/linux/zfs/zfs_vfsops.c
+++ b/module/os/linux/zfs/zfs_vfsops.c
@@ -1662,6 +1662,7 @@ zfs_umount(struct super_block *sb)
 	}
 
 	zfsvfs_free(zfsvfs);
+	sb->s_fs_info = NULL;
 	return (0);
 }
 
diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c
index c5c230bee1..ad52a11aad 100644
--- a/module/os/linux/zfs/zpl_super.c
+++ b/module/os/linux/zfs/zpl_super.c
@@ -277,8 +277,6 @@ zpl_test_super(struct super_block *s, void *data)
 {
 	zfsvfs_t *zfsvfs = s->s_fs_info;
 	objset_t *os = data;
-	int match;
-
 	/*
 	 * If the os doesn't match the z_os in the super_block, assume it is
 	 * not a match. Matching would imply a multimount of a dataset. It is
@@ -286,19 +284,7 @@ zpl_test_super(struct super_block *s, void *data)
 	 * that changes the z_os, e.g., rollback, where the match will be
 	 * missed, but in that case the user will get an EBUSY.
 	 */
-	if (zfsvfs == NULL || os != zfsvfs->z_os)
-		return (0);
-
-	/*
-	 * If they do match, recheck with the lock held to prevent mounting the
-	 * wrong dataset since z_os can be stale when the teardown lock is held.
-	 */
-	if (zpl_enter(zfsvfs, FTAG) != 0)
-		return (0);
-	match = (os == zfsvfs->z_os);
-	zpl_exit(zfsvfs, FTAG);
-
-	return (match);
+	return (zfsvfs != NULL && os == zfsvfs->z_os);
 }
 
 static struct super_block *
@@ -324,12 +310,35 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
 
 	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);
 
+	/*
+	 * Recheck with the lock held to prevent mounting the wrong dataset
+	 * since z_os can be stale when the teardown lock is held.
+	 *
+	 * We can't do this in zpl_test_super in since it's under spinlock and
+	 * also s_umount lock is not held there so it would race with
+	 * zfs_umount and zfsvfs can be freed.
+	 */
+	if (!IS_ERR(s) && s->s_fs_info != NULL) {
+		zfsvfs_t *zfsvfs = s->s_fs_info;
+		if (zpl_enter(zfsvfs, FTAG) == 0) {
+			if (os != zfsvfs->z_os)
+				err = -SET_ERROR(EBUSY);
+			zpl_exit(zfsvfs, FTAG);
+		} else {
+			err = -SET_ERROR(EBUSY);
+		}
+	}
 	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
 	dsl_dataset_rele(dmu_objset_ds(os), FTAG);
 
 	if (IS_ERR(s))
 		return (ERR_CAST(s));
 
+	if (err) {
+		deactivate_locked_super(s);
+		return (ERR_PTR(err));
+	}
+
 	if (s->s_root == NULL) {
 		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
 		if (err) {

From 8c81c0b05d63e69473d859870f7a6e32e46b0410 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Fri, 21 Jul 2023 11:46:58 -0700
Subject: [PATCH 17/24] zed: Fix zed ASSERT on slot power cycle

We would see zed assert on one of our systems if we powered off a
slot.  Further examination showed zfs_retire_recv() was reporting
a GUID of 0, which in turn would return a NULL nvlist.  Add
in a check for a zero GUID.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #15084
---
 cmd/zed/agents/zfs_retire.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index f83ae09259..a0e377a4a0 100644
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@@ -416,6 +416,11 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
 			return;
 
+		if (vdev_guid == 0) {
+			fmd_hdl_debug(hdl, "Got a zero GUID");
+			return;
+		}
+
 		if (spare) {
 			int nspares = find_and_remove_spares(zhdl, vdev_guid);
 			fmd_hdl_debug(hdl, "%d spares removed", nspares);

From 51a2b597679f948d6102c0e0376ba2273d82818e Mon Sep 17 00:00:00 2001
From: Alan Somers <asomers@FreeBSD.org>
Date: Tue, 11 Jul 2023 14:45:06 -0600
Subject: [PATCH 18/24] Don't emit checksum histograms in
 ereport.fs.zfs.checksum events

The checksum histograms were intended to be used with ATA and parallel
SCSI, which are obsolete.  With modern storage hardware, they will
almost always look like white noise; all bits will be wrong.  They only
serve to bloat the event.  That's a particular problem on FreeBSD, where
events must fit into a 1016 byte buffer.

This fixes issue #14717 for RAIDZ pools, but not for mirror pools.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
Signed-off-by: Alan Somers <asomers@gmail.com>
Sponsored-by: Axcient
Closes #15052
---
 include/sys/fm/fs/zfs.h |  2 --
 man/man8/zpool-events.8 | 19 +------------------
 module/zfs/zfs_fm.c     | 25 ++++---------------------
 3 files changed, 5 insertions(+), 41 deletions(-)

diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h
index b9bac7e252..3cf2b1274d 100644
--- a/include/sys/fm/fs/zfs.h
+++ b/include/sys/fm/fs/zfs.h
@@ -112,8 +112,6 @@ extern "C" {
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS	"bad_range_clears"
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS	"bad_set_bits"
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS	"bad_cleared_bits"
-#define	FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram"
-#define	FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram"
 #define	FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME	"snapshot_name"
 #define	FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME	"device_name"
 #define	FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME	"raw_name"
diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8
index 341f902fe6..c3bd3208e6 100644
--- a/man/man8/zpool-events.8
+++ b/man/man8/zpool-events.8
@@ -26,7 +26,7 @@
 .\" Copyright 2017 Nexenta Systems, Inc.
 .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
 .\"
-.Dd May 27, 2021
+.Dd July 11, 2023
 .Dt ZPOOL-EVENTS 8
 .Os
 .
@@ -362,23 +362,6 @@ Like
 but contains
 .Pq Ar good data No & ~( Ns Ar bad data ) ;
 that is, the bits set in the good data which are cleared in the bad data.
-.It Sy bad_set_histogram
-If this field exists, it is an array of counters.
-Each entry counts bits set in a particular bit of a big-endian uint64 type.
-The first entry counts bits
-set in the high-order bit of the first byte, the 9th byte, etc, and the last
-entry counts bits set of the low-order bit of the 8th byte, the 16th byte, etc.
-This information is useful for observing a stuck bit in a parallel data path,
-such as IDE or parallel SCSI.
-.It Sy bad_cleared_histogram
-If this field exists, it is an array of counters.
-Each entry counts bit clears in a particular bit of a big-endian uint64 type.
-The first entry counts bits
-clears of the high-order bit of the first byte, the 9th byte, etc, and the
-last entry counts clears of the low-order bit of the 8th byte, the 16th byte,
-etc.
-This information is useful for observing a stuck bit in a parallel data
-path, such as IDE or parallel SCSI.
 .El
 .
 .Sh I/O STAGES
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 2754ceec83..9365ca500d 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -754,10 +754,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 #define	MAX_RANGES		16
 
 typedef struct zfs_ecksum_info {
-	/* histograms of set and cleared bits by bit number in a 64-bit word */
-	uint8_t zei_histogram_set[sizeof (uint64_t) * NBBY];
-	uint8_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
-
 	/* inline arrays of bits set and cleared. */
 	uint64_t zei_bits_set[ZFM_MAX_INLINE];
 	uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
@@ -781,7 +777,7 @@ typedef struct zfs_ecksum_info {
 } zfs_ecksum_info_t;
 
 static void
-update_histogram(uint64_t value_arg, uint8_t *hist, uint32_t *count)
+update_bad_bits(uint64_t value_arg, uint32_t *count)
 {
 	size_t i;
 	size_t bits = 0;
@@ -789,10 +785,8 @@ update_histogram(uint64_t value_arg, uint8_t *hist, uint32_t *count)
 
 	/* We store the bits in big-endian (largest-first) order */
 	for (i = 0; i < 64; i++) {
-		if (value & (1ull << i)) {
-			hist[63 - i] = MAX(hist[63 - i], hist[63 - i] + 1);
+		if (value & (1ull << i))
 			++bits;
-		}
 	}
 	/* update the count of bits changed */
 	*count += bits;
@@ -1010,10 +1004,8 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
 				offset++;
 			}
 
-			update_histogram(set, eip->zei_histogram_set,
-			    &eip->zei_range_sets[range]);
-			update_histogram(cleared, eip->zei_histogram_cleared,
-			    &eip->zei_range_clears[range]);
+			update_bad_bits(set, &eip->zei_range_sets[range]);
+			update_bad_bits(cleared, &eip->zei_range_clears[range]);
 		}
 
 		/* convert to byte offsets */
@@ -1049,15 +1041,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
 		    DATA_TYPE_UINT8_ARRAY,
 		    inline_size, (uint8_t *)eip->zei_bits_cleared,
 		    NULL);
-	} else {
-		fm_payload_set(ereport,
-		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
-		    DATA_TYPE_UINT8_ARRAY,
-		    NBBY * sizeof (uint64_t), eip->zei_histogram_set,
-		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
-		    DATA_TYPE_UINT8_ARRAY,
-		    NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
-		    NULL);
 	}
 	return (eip);
 }

From b6f618f8ffda435f5df9e185c999245842add93d Mon Sep 17 00:00:00 2001
From: Alan Somers <asomers@FreeBSD.org>
Date: Tue, 11 Jul 2023 15:13:57 -0600
Subject: [PATCH 19/24] Don't emit cksum_{actual_expected} in
 ereport.fs.zfs.checksum events

With anything but fletcher-4, even a tiny change in the input will cause
the checksum value to change completely.  So knowing the actual and
expected checksums doesn't provide much more information than "they
don't match".  The harm in sending them is simply that they bloat the
event.  In particular, on FreeBSD the event must fit into a 1016 byte
buffer.

Fixes #14717 for mirrored pools.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
Signed-off-by: Alan Somers <asomers@gmail.com>
Sponsored-by: Axcient
Closes #14717
Closes #15052
---
 include/sys/fm/fs/zfs.h    | 2 --
 include/sys/zio_checksum.h | 2 --
 man/man8/zpool-events.8    | 4 ----
 module/zfs/vdev_indirect.c | 2 +-
 module/zfs/vdev_raidz.c    | 2 +-
 module/zfs/zfs_fm.c        | 8 --------
 module/zfs/zio_checksum.c  | 2 --
 7 files changed, 2 insertions(+), 20 deletions(-)

diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h
index 3cf2b1274d..fb9e864922 100644
--- a/include/sys/fm/fs/zfs.h
+++ b/include/sys/fm/fs/zfs.h
@@ -102,8 +102,6 @@ extern "C" {
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP	"zio_timestamp"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA	"zio_delta"
 #define	FM_EREPORT_PAYLOAD_ZFS_PREV_STATE	"prev_state"
-#define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED	"cksum_expected"
-#define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL	"cksum_actual"
 #define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO	"cksum_algorithm"
 #define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP	"cksum_byteswap"
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges"
diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h
index 9fb79ab4a5..37fd65b7cb 100644
--- a/include/sys/zio_checksum.h
+++ b/include/sys/zio_checksum.h
@@ -94,8 +94,6 @@ typedef const struct zio_checksum_info {
 } zio_checksum_info_t;
 
 typedef struct zio_bad_cksum {
-	zio_cksum_t		zbc_expected;
-	zio_cksum_t		zbc_actual;
 	const char		*zbc_checksum_name;
 	uint8_t			zbc_byteswapped;
 	uint8_t			zbc_injected;
diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8
index c3bd3208e6..e1436f6ded 100644
--- a/man/man8/zpool-events.8
+++ b/man/man8/zpool-events.8
@@ -305,10 +305,6 @@ The time when a given I/O request was submitted.
 The time required to service a given I/O request.
 .It Sy prev_state
 The previous state of the vdev.
-.It Sy cksum_expected
-The expected checksum value for the block.
-.It Sy cksum_actual
-The actual checksum value for an errant block.
 .It Sy cksum_algorithm
 Checksum algorithm used.
 See
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index 8966758534..acb7256966 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -1398,7 +1398,7 @@ vdev_indirect_checksum_error(zio_t *zio,
 	vd->vdev_stat.vs_checksum_errors++;
 	mutex_exit(&vd->vdev_stat_lock);
 
-	zio_bad_cksum_t zbc = {{{ 0 }}};
+	zio_bad_cksum_t zbc = { 0 };
 	abd_t *bad_abd = ic->ic_data;
 	abd_t *good_abd = is->is_good_child->ic_data;
 	(void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio,
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 14b98a76b8..3445fa9d35 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -1785,7 +1785,7 @@ vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
 static int
 raidz_checksum_verify(zio_t *zio)
 {
-	zio_bad_cksum_t zbc = {{{0}}};
+	zio_bad_cksum_t zbc = {0};
 	raidz_map_t *rm = zio->io_vsd;
 
 	int ret = zio_checksum_error(zio, &zbc);
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 9365ca500d..c4eb74e873 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -914,14 +914,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
 
 	if (info != NULL && info->zbc_has_cksum) {
 		fm_payload_set(ereport,
-		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
-		    DATA_TYPE_UINT64_ARRAY,
-		    sizeof (info->zbc_expected) / sizeof (uint64_t),
-		    (uint64_t *)&info->zbc_expected,
-		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
-		    DATA_TYPE_UINT64_ARRAY,
-		    sizeof (info->zbc_actual) / sizeof (uint64_t),
-		    (uint64_t *)&info->zbc_actual,
 		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
 		    DATA_TYPE_STRING,
 		    info->zbc_checksum_name,
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index 6090959c5b..9de515e876 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -515,8 +515,6 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
 	}
 
 	if (info != NULL) {
-		info->zbc_expected = expected_cksum;
-		info->zbc_actual = actual_cksum;
 		info->zbc_checksum_name = ci->ci_name;
 		info->zbc_byteswapped = byteswap;
 		info->zbc_injected = 0;

From 8a6fde8213797df1acd24d4afb9ada0d005f6b1d Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 21 Jul 2023 14:50:48 -0400
Subject: [PATCH 20/24] Add explicit prefetches to bpobj_iterate().

To simplify error handling bpobj_iterate_blkptrs() iterates through
the list of block pointers backwards.  Unfortunately speculative
prefetcher is currently unable to detect such patterns, that makes
each block read there synchronous and very slow on HDD pools.

According to my tests, added explicit prefetch reduces time needed
to asynchronously delete 8 snapshots of 4 million blocks each from
20 seconds to less than one, that should free sync thread for other
useful work, such as async writes, scrub, etc.

While there, plug one memory leak in case of bpobj_open() error and
harmonize some variable names.

Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15071
---
 include/sys/bpobj.h |  2 +-
 module/zfs/bpobj.c  | 49 ++++++++++++++++++++++++++++++++++-----------
 2 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h
index f3384f5264..81bc0fe210 100644
--- a/include/sys/bpobj.h
+++ b/include/sys/bpobj.h
@@ -60,7 +60,7 @@ typedef struct bpobj {
 	kmutex_t	bpo_lock;
 	objset_t	*bpo_os;
 	uint64_t	bpo_object;
-	int		bpo_epb;
+	uint32_t	bpo_epb;
 	uint8_t		bpo_havecomp;
 	uint8_t		bpo_havesubobj;
 	uint8_t		bpo_havefreed;
diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c
index 211bab5651..e772caead2 100644
--- a/module/zfs/bpobj.c
+++ b/module/zfs/bpobj.c
@@ -284,7 +284,17 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
 	dmu_buf_t *dbuf = NULL;
 	bpobj_t *bpo = bpi->bpi_bpo;
 
-	for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
+	int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1;
+	uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) *
+	    sizeof (blkptr_t);
+	uint64_t ps = start * sizeof (blkptr_t);
+	uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0,
+	    ps);
+	if (pe > pb) {
+		dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb,
+		    ZIO_PRIORITY_ASYNC_READ);
+	}
+	for (; i >= start; i--) {
 		uint64_t offset = i * sizeof (blkptr_t);
 		uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
 
@@ -292,9 +302,16 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
 			if (dbuf)
 				dmu_buf_rele(dbuf, FTAG);
 			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
-			    offset, FTAG, &dbuf, 0);
+			    offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH);
 			if (err)
 				break;
+			pe = pb;
+			pb = MAX((dbuf->db_offset > dmu_prefetch_max) ?
+			    dbuf->db_offset - dmu_prefetch_max : 0, ps);
+			if (pe > pb) {
+				dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
+				    pb, pe - pb, ZIO_PRIORITY_ASYNC_READ);
+			}
 		}
 
 		ASSERT3U(offset, >=, dbuf->db_offset);
@@ -466,22 +483,30 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
 			int64_t i = bpi->bpi_unprocessed_subobjs - 1;
 			uint64_t offset = i * sizeof (uint64_t);
 
-			uint64_t obj_from_sublist;
+			uint64_t subobj;
 			err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
-			    offset, sizeof (uint64_t), &obj_from_sublist,
-			    DMU_READ_PREFETCH);
+			    offset, sizeof (uint64_t), &subobj,
+			    DMU_READ_NO_PREFETCH);
 			if (err)
 				break;
-			bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
+
+			bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t),
 			    KM_SLEEP);
-
-			err = bpobj_open(sublist, bpo->bpo_os,
-			    obj_from_sublist);
-			if (err)
+			err = bpobj_open(subbpo, bpo->bpo_os, subobj);
+			if (err) {
+				kmem_free(subbpo, sizeof (bpobj_t));
 				break;
+			}
 
-			list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
-			mutex_enter(&sublist->bpo_lock);
+			if (subbpo->bpo_havesubobj &&
+			    subbpo->bpo_phys->bpo_subobjs != 0) {
+				dmu_prefetch(subbpo->bpo_os,
+				    subbpo->bpo_phys->bpo_subobjs, 0, 0, 0,
+				    ZIO_PRIORITY_ASYNC_READ);
+			}
+
+			list_insert_head(&stack, bpi_alloc(subbpo, bpi, i));
+			mutex_enter(&subbpo->bpo_lock);
 			bpi->bpi_unprocessed_subobjs--;
 		}
 	}

From 81be809a25b8d3ef859f203664d1981163d87983 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 21 Jul 2023 14:51:47 -0400
Subject: [PATCH 21/24] Adjust prefetch parameters.

- Reduce maximum prefetch distance for 32bit platforms to 8MB as it
was previously.  Those systems didn't grow much probably, so better
stay conservative there.
 - Retire array_rd_sz tunable, blocking prefetch for large requests.
We should not penalize applications trying to be more efficient. The
speculative prefetcher by itself has reasonable distance limits, and
1MB is not much at all these days.

Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15072
---
 include/sys/dmu_zfetch.h |  2 --
 man/man4/zfs.4           |  3 ---
 module/zfs/dmu.c         |  7 +++++--
 module/zfs/dmu_zfetch.c  | 12 +++++++-----
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h
index 0fbc3bacff..f00e13cf03 100644
--- a/include/sys/dmu_zfetch.h
+++ b/include/sys/dmu_zfetch.h
@@ -36,8 +36,6 @@
 extern "C" {
 #endif
 
-extern uint64_t	zfetch_array_rd_sz;
-
 struct dnode;				/* so we can reference dnode */
 
 typedef struct zfetch {
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 271b02b6ee..7959bfe33b 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -519,9 +519,6 @@ However, this is limited by
 Maximum micro ZAP size.
 A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size.
 .
-.It Sy zfetch_array_rd_sz Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64
-If prefetching is enabled, disable prefetching for reads larger than this size.
-.
 .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
 Min bytes to prefetch per stream.
 Prefetch distance starts from the demand access size and quickly grows to
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index dda869287c..3a4560cec2 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -89,7 +89,11 @@ static int zfs_dmu_offset_next_sync = 1;
  * helps to limit the amount of memory that can be used by prefetching.
  * Larger objects should be prefetched a bit at a time.
  */
+#ifdef _ILP32
+uint_t dmu_prefetch_max = 8 * 1024 * 1024;
+#else
 uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
+#endif
 
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "unallocated"		},
@@ -552,8 +556,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 		zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, 0, offset);
-	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
-	    length <= zfetch_array_rd_sz) {
+	if ((flags & DMU_READ_NO_PREFETCH) == 0) {
 		/*
 		 * Prepare the zfetch before initiating the demand reads, so
 		 * that if multiple threads block on same indirect block, we
diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c
index b70459380c..d0acaf5020 100644
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@@ -52,14 +52,19 @@ static unsigned int	zfetch_max_streams = 8;
 static unsigned int	zfetch_min_sec_reap = 1;
 /* max time before stream delete */
 static unsigned int	zfetch_max_sec_reap = 2;
+#ifdef _ILP32
+/* min bytes to prefetch per stream (default 2MB) */
+static unsigned int	zfetch_min_distance = 2 * 1024 * 1024;
+/* max bytes to prefetch per stream (default 8MB) */
+unsigned int	zfetch_max_distance = 8 * 1024 * 1024;
+#else
 /* min bytes to prefetch per stream (default 4MB) */
 static unsigned int	zfetch_min_distance = 4 * 1024 * 1024;
 /* max bytes to prefetch per stream (default 64MB) */
 unsigned int	zfetch_max_distance = 64 * 1024 * 1024;
+#endif
 /* max bytes to prefetch indirects for per stream (default 64MB) */
 unsigned int	zfetch_max_idistance = 64 * 1024 * 1024;
-/* max number of bytes in an array_read in which we allow prefetching (1MB) */
-uint64_t	zfetch_array_rd_sz = 1024 * 1024;
 
 typedef struct zfetch_stats {
 	kstat_named_t zfetchstat_hits;
@@ -580,6 +585,3 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
 	"Max bytes to prefetch indirects for per stream");
-
-ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, U64, ZMOD_RW,
-	"Number of bytes in a array_read");

From 685ae4429ffe7753359faa6cbae34e061d1e670b Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Sat, 22 Jul 2023 04:52:32 +1000
Subject: [PATCH 22/24] metaslab: tuneable to better control force ganging

metaslab_force_ganging isn't enough to actually force ganging, because
it still only forces 3% of the time. This adds
metaslab_force_ganging_pct so we can configure how often to force
ganging.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15088
---
 man/man4/zfs.4        |  7 ++++++-
 module/zfs/metaslab.c | 14 ++++++++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 7959bfe33b..3843419731 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -15,7 +15,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.Dd January 10, 2023
+.Dd July 21, 2023
 .Dt ZFS 4
 .Os
 .
@@ -239,6 +239,11 @@ relative to the pool.
 Make some blocks above a certain size be gang blocks.
 This option is used by the test suite to facilitate testing.
 .
+.It Sy metaslab_force_ganging_pct Ns = Ns Sy 3 Ns % Pq uint
+For blocks that could be forced to be a gang block (due to
+.Sy metaslab_force_ganging ) ,
+force this many of them to be gang blocks.
+.
 .It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
 Default DDT ZAP data block size as a power of 2. Note that changing this after
 creating a DDT on the pool will not affect existing DDTs, only newly created
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 176247d63b..9991e1a22c 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -58,6 +58,11 @@ static uint64_t metaslab_aliquot = 1024 * 1024;
  */
 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
 
+/*
+ * Of blocks of size >= metaslab_force_ganging, actually gang them this often.
+ */
+uint_t metaslab_force_ganging_pct = 3;
+
 /*
  * In pools where the log space map feature is not enabled we touch
  * multiple metaslabs (and their respective space maps) with each
@@ -5109,7 +5114,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	 * damage can result in extremely long reconstruction times.  This
 	 * will also test spilling from special to normal.
 	 */
-	if (psize >= metaslab_force_ganging && (random_in_range(100) < 3)) {
+	if (psize >= metaslab_force_ganging &&
+	    metaslab_force_ganging_pct > 0 &&
+	    (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
 		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
 		    allocator);
 		return (SET_ERROR(ENOSPC));
@@ -6266,7 +6273,10 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
 	"Segment-based metaslab selection maximum buckets before switching");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW,
-	"Blocks larger than this size are forced to be gang blocks");
+	"Blocks larger than this size are sometimes forced to be gang blocks");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW,
+	"Percentage of large blocks that will be forced to be gang blocks");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
 	"Max distance (bytes) to search forward before using size tree");

From c5273e0c317a941af9c8a4090ad38b1c7b55d716 Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Sat, 22 Jul 2023 04:53:06 +1000
Subject: [PATCH 23/24] shellcheck: disable "unreachable command" check
 [SC2317]

This new check in 0.9.0 appears to have some issues with various forms
of "early return", like trap, exit and return. This is tripping up (at
least):

  cmd/zed/zed.d/history_event-zfs-list-cacher.sh
  /etc/zfs/zfs-functions

Its not obvious what its complaining about or what the remedy is, so it
seems sensible to disable this check for now.

See also:

  https://www.shellcheck.net/wiki/SC2317
  https://github.com/koalaman/shellcheck/issues/2542
  https://github.com/koalaman/shellcheck/issues/2613

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #15089
---
 config/Shellcheck.am | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/config/Shellcheck.am b/config/Shellcheck.am
index 1cff81e04b..1ab1351606 100644
--- a/config/Shellcheck.am
+++ b/config/Shellcheck.am
@@ -4,6 +4,7 @@
 # Not following: a was not specified as input (see shellcheck -x). [SC1091]
 # Prefer putting braces around variable references even when not strictly required. [SC2250]
 # Consider invoking this command separately to avoid masking its return value (or use '|| true' to ignore). [SC2312]
+# Command appears to be unreachable. Check usage (or ignore if invoked indirectly). [SC2317]
 # In POSIX sh, 'local' is undefined. [SC2039] # older ShellCheck versions
 # In POSIX sh, 'local' is undefined. [SC3043] # newer ShellCheck versions
 
@@ -18,7 +19,7 @@ PHONY += shellcheck
 _STGT = $(subst ^,/,$(subst shellcheck-here-,,$@))
 shellcheck-here-%:
 if HAVE_SHELLCHECK
-	shellcheck --format=gcc --enable=all --exclude=SC1090,SC1091,SC2039,SC2250,SC2312,SC3043 $$([ -n "$(SHELLCHECK_SHELL)" ] && echo "--shell=$(SHELLCHECK_SHELL)") "$$([ -e "$(_STGT)" ] || echo "$(srcdir)/")$(_STGT)"
+	shellcheck --format=gcc --enable=all --exclude=SC1090,SC1091,SC2039,SC2250,SC2312,SC2317,SC3043 $$([ -n "$(SHELLCHECK_SHELL)" ] && echo "--shell=$(SHELLCHECK_SHELL)") "$$([ -e "$(_STGT)" ] || echo "$(srcdir)/")$(_STGT)"
 else
 	@echo "skipping shellcheck of" $(_STGT) "because shellcheck is not installed"
 endif

From 70232483b4d3d2ede5b6064e178706dfad070f14 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 21 Jul 2023 15:50:43 -0700
Subject: [PATCH 24/24] Tag 2.2.0-rc2

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
---
 META | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/META b/META
index 5f834d5cc7..f17be8f36e 100644
--- a/META
+++ b/META
@@ -2,7 +2,7 @@ Meta:          1
 Name:          zfs
 Branch:        1.0
 Version:       2.2.0
-Release:       rc1
+Release:       rc2
 Release-Tags:  relext
 License:       CDDL
 Author:        OpenZFS