From 35a6247c5fe788aa77e0b3c7e8010fedb9e60eb5 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Mon, 26 Jun 2023 16:57:12 -0400 Subject: [PATCH 01/18] Add a delay to tearing down threads. It's been observed that in certain workloads (zvol-related being a big one), ZFS will end up spending a large amount of time spinning up taskqs only to tear them down again almost immediately, then spin them up again... I noticed this when I looked at what my mostly-idle system was doing and wondered how on earth taskq creation/destroy was a bunch of time... So I added a configurable delay to avoid it tearing down tasks the first time it notices them idle, and the total number of threads at steady state went up, but the amount of time being burned just tearing down/turning up new ones almost vanished. Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #14938 --- include/os/linux/spl/sys/taskq.h | 1 + man/man4/spl.4 | 15 ++++++++++++++ module/os/linux/spl/spl-taskq.c | 34 +++++++++++++++++++++++++++++++- 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index 2a6cd8283d..6c1b4377a9 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -104,6 +104,7 @@ typedef struct taskq { /* list node for the cpu hotplug callback */ struct hlist_node tq_hp_cb_node; boolean_t tq_hp_support; + unsigned long lastshouldstop; /* when to purge dynamic */ } taskq_t; typedef struct taskq_ent { diff --git a/man/man4/spl.4 b/man/man4/spl.4 index 02efaf16dc..82455fb532 100644 --- a/man/man4/spl.4 +++ b/man/man4/spl.4 @@ -193,4 +193,19 @@ The proc file will walk the lists with lock held, reading it could cause a lock-up if the list grow too large without limiting the output. "(truncated)" will be shown if the list is larger than the limit. +. +.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 10000 Pq uint +(Linux-only) +How long a taskq has to have had no work before we tear it down. +Previously, we would tear down a dynamic taskq worker as soon +as we noticed it had no work, but it was observed that this led +to a lot of churn in tearing down things we then immediately +spawned anew. +In practice, it seems any nonzero value will remove the vast +majority of this churn, while the nontrivially larger value +was chosen to help filter out the little remaining churn on +a mostly idle system. +Setting this value to +.Sy 0 +will revert to the previous behavior. 
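As a rough, self-contained sketch of the behavior described above (hypothetical names throughout; the real logic lives in the spl-taskq.c hunk below, which additionally exits immediately when the whole taskq is being torn down), the idea is to record when a dynamic thread was first seen idle and only let it exit once the configured grace period has elapsed:

#include <stdbool.h>
#include <stdint.h>

struct tq_state {
	uint64_t idle_since_ms;	/* 0: was not idle last time we checked */
};

static bool
thread_should_exit(struct tq_state *tq, bool idle, uint64_t now_ms,
    uint64_t timeout_ms)
{
	if (!idle) {
		tq->idle_since_ms = 0;		/* busy again: reset the clock */
		return (false);
	}
	if (timeout_ms == 0)
		return (true);			/* timeout 0: old behavior, exit now */
	if (tq->idle_since_ms == 0) {
		tq->idle_since_ms = now_ms;	/* first time we saw it idle */
		return (false);
	}
	return (now_ms - tq->idle_since_ms >= timeout_ms);
}

With the default of 10000 ms, a dynamic thread has to sit idle for ten seconds before it is reaped, which is what removes the spawn/teardown churn on a mostly idle system.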
.El diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index 84497359ce..d18f935b16 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -36,6 +36,12 @@ static int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); +static uint_t spl_taskq_thread_timeout_ms = 10000; +/* BEGIN CSTYLED */ +module_param(spl_taskq_thread_timeout_ms, uint, 0644); +/* END CSTYLED */ +MODULE_PARM_DESC(spl_taskq_thread_timeout_ms, + "Time to require a dynamic thread be idle before it gets cleaned up"); static int spl_taskq_thread_dynamic = 1; module_param(spl_taskq_thread_dynamic, int, 0444); @@ -848,12 +854,37 @@ taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt) tqt_thread_list) == tqt) return (0); - return + int no_work = ((tq->tq_nspawn == 0) && /* No threads are being spawned */ (tq->tq_nactive == 0) && /* No threads are handling tasks */ (tq->tq_nthreads > 1) && /* More than 1 thread is running */ (!taskq_next_ent(tq)) && /* There are no pending tasks */ (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */ + + /* + * If we would have said stop before, let's instead wait a bit, maybe + * we'll see more work come our way soon... + */ + if (no_work) { + /* if it's 0, we want the old behavior. */ + /* if the taskq is being torn down, we also want to go away. */ + if (spl_taskq_thread_timeout_ms == 0 || + !(tq->tq_flags & TASKQ_ACTIVE)) + return (1); + unsigned long lasttime = tq->lastshouldstop; + if (lasttime > 0) { + if (time_after(jiffies, lasttime + + msecs_to_jiffies(spl_taskq_thread_timeout_ms))) + return (1); + else + return (0); + } else { + tq->lastshouldstop = jiffies; + } + } else { + tq->lastshouldstop = 0; + } + return (0); } static int @@ -1091,6 +1122,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri, tq->tq_flags = (flags | TASKQ_ACTIVE); tq->tq_next_id = TASKQID_INITIAL; tq->tq_lowest_id = TASKQID_INITIAL; + tq->lastshouldstop = 0; INIT_LIST_HEAD(&tq->tq_free_list); INIT_LIST_HEAD(&tq->tq_pend_list); INIT_LIST_HEAD(&tq->tq_prio_list); From 8469b5aac0cee4f0e8b13018c3e83129554a6945 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 27 Jun 2023 12:09:48 -0400 Subject: [PATCH 02/18] Another set of vdev queue optimizations. Switch FIFO queues (SYNC/TRIM) and active queue of vdev queue from time-sorted AVL-trees to simple lists. AVL-trees are too expensive for such a simple task. To change I/O priority without searching through the trees, add io_queue_state field to struct zio. To not check number of queued I/Os for each priority add vq_cqueued bitmap to struct vdev_queue. Update it when adding/removing I/Os. Make vq_cactive a separate array instead of struct vdev_queue_class member. Together those allow to avoid lots of cache misses when looking for work in vdev_queue_class_to_issue(). Introduce deadline of ~0.5s for LBA-sorted queues. Before this I saw some I/Os waiting in a queue for up to 8 seconds and possibly more due to starvation. With this change I no longer see it. I had to slightly more complicate the comparison function, but since it uses all the same cache lines the difference is minimal. For a sequential I/Os the new code in vdev_queue_io_to_issue() actually often uses more simple avl_first(), falling back to avl_find() and avl_nearest() only when needed. Arrange members in struct zio to access only one cache line when searching through vdev queues. 
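A quick back-of-the-envelope for the "~0.5s" deadline mentioned above (illustrative only, using the VDQ_T_SHIFT value from the vdev_queue.c hunk below; io_timestamp is an hrtime_t in nanoseconds): the new comparator sorts first by io_timestamp >> 29 and only then by offset, so queued I/Os fall into time buckets of

#include <stdio.h>

int
main(void)
{
	/* io_timestamp is in nanoseconds; VDQ_T_SHIFT is 29. */
	unsigned long long bucket_ns = 1ULL << 29;	/* 536,870,912 ns */
	printf("time bucket width: %.3f s\n", (double)bucket_ns / 1e9);
	return (0);
}

which prints roughly 0.537 s. Since the new vdev_queue_io_to_issue() only continues the LBA-ordered walk within the same bucket as the oldest queued I/O, nothing should be starved for much longer than about one bucket width, matching the ~0.5 s figure.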
While there, remove io_alloc_node, reusing the io_queue_node instead. Those two are never used same time. Remove zfs_vdev_aggregate_trim parameter. It was disabled for 4 years since implemented, while still wasted time maintaining the offset-sorted tree of TRIM requests. Just remove the tree. Remove locking from txg_all_lists_empty(). It is racy by design, while 2 pair of locks/unlocks take noticeable time under the vdev queue lock. With these changes in my tests with volblocksize=4KB I measure vdev queue lock spin time reduction by 50% on read and 75% on write. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14925 --- include/sys/vdev.h | 3 +- include/sys/vdev_impl.h | 17 +-- include/sys/zio.h | 15 +- man/man4/zfs.4 | 6 - module/zfs/spa_misc.c | 2 +- module/zfs/txg.c | 13 +- module/zfs/vdev.c | 16 +-- module/zfs/vdev_queue.c | 305 ++++++++++++++++++++++------------------ 8 files changed, 205 insertions(+), 172 deletions(-) diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 26c834ff57..03e1f438aa 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -164,8 +164,9 @@ extern zio_t *vdev_queue_io(zio_t *zio); extern void vdev_queue_io_done(zio_t *zio); extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); -extern int vdev_queue_length(vdev_t *vd); +extern uint32_t vdev_queue_length(vdev_t *vd); extern uint64_t vdev_queue_last_offset(vdev_t *vd); +extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 74b3737d8e..2b22b973ba 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -130,27 +130,24 @@ typedef const struct vdev_ops { /* * Virtual device properties */ -typedef struct vdev_queue_class { - uint32_t vqc_active; - - /* - * Sorted by offset or timestamp, depending on if the queue is - * LBA-ordered vs FIFO. - */ - avl_tree_t vqc_queued_tree; +typedef union vdev_queue_class { + list_t vqc_list; + avl_tree_t vqc_tree; } vdev_queue_class_t; struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; - avl_tree_t vq_active_tree; avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; - avl_tree_t vq_trim_offset_tree; uint64_t vq_last_offset; zio_priority_t vq_last_prio; /* Last sent I/O priority. */ + uint32_t vq_cqueued; /* Classes with queued I/Os. */ + uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE]; + uint32_t vq_active; /* Number of active I/Os. */ uint32_t vq_ia_active; /* Active interactive I/Os. */ uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */ + list_t vq_active_list; /* List of active I/Os. 
*/ hrtime_t vq_io_complete_ts; /* time last i/o completed */ hrtime_t vq_io_delta_ts; zio_t vq_io_search; /* used as local for stack reduction */ diff --git a/include/sys/zio.h b/include/sys/zio.h index ec32211f69..85217b873d 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -436,6 +436,12 @@ typedef struct zio_link { list_node_t zl_child_node; } zio_link_t; +enum zio_qstate { + ZIO_QS_NONE = 0, + ZIO_QS_QUEUED, + ZIO_QS_ACTIVE, +}; + struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; @@ -479,6 +485,12 @@ struct zio { const zio_vsd_ops_t *io_vsd_ops; metaslab_class_t *io_metaslab_class; /* dva throttle class */ + enum zio_qstate io_queue_state; /* vdev queue state */ + union { + list_node_t l; + avl_node_t a; + } io_queue_node ____cacheline_aligned; /* allocator and vdev queues */ + avl_node_t io_offset_node; /* vdev offset queues */ uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ hrtime_t io_queued_timestamp; @@ -486,9 +498,6 @@ struct zio { hrtime_t io_delta; /* vdev queue service delta */ hrtime_t io_delay; /* Device access time (disk or */ /* file). */ - avl_node_t io_queue_node; - avl_node_t io_offset_node; - avl_node_t io_alloc_node; zio_alloc_list_t io_alloc_list; /* Internal pipeline state */ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 5fbd9d7db9..04bbbc5fdf 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2016,12 +2016,6 @@ Historical statistics for this many latest TXGs will be available in Flush dirty data to disk at least every this many seconds (maximum TXG duration). . -.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint -Allow TRIM I/O operations to be aggregated. -This is normally not helpful because the extents to be trimmed -will have been already been aggregated by the metaslab. -This option is provided for debugging and performance analysis. -. .It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint Max vdev I/O aggregation size. . 
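A generic sketch of the node-sharing pattern introduced in the zio.h hunk above (generic names, not the actual ZFS types): because a zio is linked into at most one of the allocator tree, a class list/tree, or the active list at any given time, a single union can carry the link for all of them, with a small state field recording which interpretation is currently valid:

struct list_node { struct list_node *next, *prev; };
struct avl_node { struct avl_node *child[2], *parent; };

enum q_state { QS_NONE, QS_QUEUED, QS_ACTIVE };

struct io {
	enum q_state qstate;		/* which linkage (if any) is live */
	union {
		struct list_node l;	/* FIFO class list / active list */
		struct avl_node a;	/* sorted class tree / allocator tree */
	} qnode;			/* one link reused for all queues */
};

Keeping the state and the link together is presumably also why the real struct zio marks io_queue_node ____cacheline_aligned, so queue searches touch a single cache line.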
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 9ef948e9e4..8dc83445e1 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -730,7 +730,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, - sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index ec61cabcaa..a67c043446 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -895,15 +895,10 @@ txg_list_destroy(txg_list_t *tl) boolean_t txg_all_lists_empty(txg_list_t *tl) { - mutex_enter(&tl->tl_lock); - for (int i = 0; i < TXG_SIZE; i++) { - if (!txg_list_empty_impl(tl, i)) { - mutex_exit(&tl->tl_lock); - return (B_FALSE); - } - } - mutex_exit(&tl->tl_lock); - return (B_TRUE); + boolean_t res = B_TRUE; + for (int i = 0; i < TXG_SIZE; i++) + res &= (tl->tl_head[i] == NULL); + return (res); } /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 612e66c3a8..30551feb63 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4608,11 +4608,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); - for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { - vsx->vsx_active_queue[t] = - vd->vdev_queue.vq_class[t].vqc_active; - vsx->vsx_pend_queue[t] = avl_numnodes( - &vd->vdev_queue.vq_class[t].vqc_queued_tree); + for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { + vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; + vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); } } } @@ -5470,20 +5468,20 @@ vdev_deadman(vdev_t *vd, const char *tag) vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); - if (avl_numnodes(&vq->vq_active_tree) > 0) { + if (vq->vq_active > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; - zfs_dbgmsg("slow vdev: %s has %lu active IOs", - vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); + zfs_dbgmsg("slow vdev: %s has %u active IOs", + vd->vdev_path, vq->vq_active); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ - fio = avl_first(&vq->vq_active_tree); + fio = list_head(&vq->vq_active_list); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index abb7d0662b..08d918467d 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -228,13 +228,6 @@ uint_t zfs_vdev_queue_depth_pct = 300; */ uint_t zfs_vdev_def_queue_depth = 32; -/* - * Allow TRIM I/Os to be aggregated. This should normally not be needed since - * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted - * by the TRIM code in zfs_trim.c. 
- */ -static uint_t zfs_vdev_aggregate_trim = 0; - static int vdev_queue_offset_compare(const void *x1, const void *x2) { @@ -249,38 +242,60 @@ vdev_queue_offset_compare(const void *x1, const void *x2) return (TREE_PCMP(z1, z2)); } -static inline avl_tree_t * -vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) -{ - return (&vq->vq_class[p].vqc_queued_tree); -} - -static inline avl_tree_t * -vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) -{ - ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM); - if (t == ZIO_TYPE_READ) - return (&vq->vq_read_offset_tree); - else if (t == ZIO_TYPE_WRITE) - return (&vq->vq_write_offset_tree); - else - return (&vq->vq_trim_offset_tree); -} +#define VDQ_T_SHIFT 29 static int -vdev_queue_timestamp_compare(const void *x1, const void *x2) +vdev_queue_to_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp); + int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT, + z2->io_timestamp >> VDQ_T_SHIFT); + int ocmp = TREE_CMP(z1->io_offset, z2->io_offset); + int cmp = tcmp ? tcmp : ocmp; - if (likely(cmp)) + if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE))) return (cmp); return (TREE_PCMP(z1, z2)); } +static inline boolean_t +vdev_queue_class_fifo(zio_priority_t p) +{ + return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE || + p == ZIO_PRIORITY_TRIM); +} + +static void +vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + vq->vq_cqueued |= 1U << p; + if (vdev_queue_class_fifo(p)) + list_insert_tail(&vq->vq_class[p].vqc_list, zio); + else + avl_add(&vq->vq_class[p].vqc_tree, zio); +} + +static void +vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + uint32_t empty; + if (vdev_queue_class_fifo(p)) { + list_t *list = &vq->vq_class[p].vqc_list; + list_remove(list, zio); + empty = list_is_empty(list); + } else { + avl_tree_t *tree = &vq->vq_class[p].vqc_tree; + avl_remove(tree, zio); + empty = avl_is_empty(tree); + } + vq->vq_cqueued &= ~(empty << p); +} + static uint_t vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) { @@ -360,7 +375,7 @@ vdev_queue_max_async_writes(spa_t *spa) } static uint_t -vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) +vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: @@ -370,7 +385,7 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: - return (vdev_queue_max_async_writes(spa)); + return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa)); case ZIO_PRIORITY_SCRUB: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, @@ -414,10 +429,10 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { - spa_t *spa = vq->vq_vdev->vdev_spa; - zio_priority_t p, n; + uint32_t cq = vq->vq_cqueued; + zio_priority_t p, p1; - if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) + if (cq == 0 || vq->vq_active >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); /* @@ -425,14 +440,18 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * Do round-robin to reduce starvation due to zfs_vdev_max_active * and vq_nia_credit limits. 
*/ - for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) { - p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE; - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_min_active(vq, p)) { - vq->vq_last_prio = p; - return (p); - } + p1 = vq->vq_last_prio + 1; + if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE) + p1 = 0; + for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; + } + for (p = 0; p < p1; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; } /* @@ -440,16 +459,14 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * maximum # outstanding i/os. */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_max_active(spa, vq, p)) { - vq->vq_last_prio = p; - return (p); - } + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_max_active(vq, p)) + break; } - /* No eligible queued i/os */ - return (ZIO_PRIORITY_NUM_QUEUEABLE); +found: + vq->vq_last_prio = p; + return (p); } void @@ -458,42 +475,30 @@ vdev_queue_init(vdev_t *vd) vdev_queue_t *vq = &vd->vdev_queue; zio_priority_t p; - mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); vq->vq_vdev = vd; - taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent); - - avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - int (*compfn) (const void *, const void *); - - /* - * The synchronous/trim i/o queues are dispatched in FIFO rather - * than LBA order. This provides more consistent latency for - * these i/os. 
- */ - if (p == ZIO_PRIORITY_SYNC_READ || - p == ZIO_PRIORITY_SYNC_WRITE || - p == ZIO_PRIORITY_TRIM) { - compfn = vdev_queue_timestamp_compare; + if (vdev_queue_class_fifo(p)) { + list_create(&vq->vq_class[p].vqc_list, + sizeof (zio_t), + offsetof(struct zio, io_queue_node.l)); } else { - compfn = vdev_queue_offset_compare; + avl_create(&vq->vq_class[p].vqc_tree, + vdev_queue_to_compare, sizeof (zio_t), + offsetof(struct zio, io_queue_node.a)); } - avl_create(vdev_queue_class_tree(vq, p), compfn, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); } + avl_create(&vq->vq_read_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + avl_create(&vq->vq_write_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); vq->vq_last_offset = 0; + list_create(&vq->vq_active_list, sizeof (struct zio), + offsetof(struct zio, io_queue_node.l)); + mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); } void @@ -501,30 +506,39 @@ vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; - for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) - avl_destroy(vdev_queue_class_tree(vq, p)); - avl_destroy(&vq->vq_active_tree); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM)); + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if (vdev_queue_class_fifo(p)) + list_destroy(&vq->vq_class[p].vqc_list); + else + avl_destroy(&vq->vq_class[p].vqc_tree); + } + avl_destroy(&vq->vq_read_offset_tree); + avl_destroy(&vq->vq_write_offset_tree); + list_destroy(&vq->vq_active_list); mutex_destroy(&vq->vq_lock); } static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); + zio->io_queue_state = ZIO_QS_QUEUED; + vdev_queue_class_add(vq, zio); + if (zio->io_type == ZIO_TYPE_READ) + avl_add(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_add(&vq->vq_write_offset_tree, zio); } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); + vdev_queue_class_remove(vq, zio); + if (zio->io_type == ZIO_TYPE_READ) + avl_remove(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_remove(&vq->vq_write_offset_tree, zio); + zio->io_queue_state = ZIO_QS_NONE; } static boolean_t @@ -546,14 +560,16 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active++; + vq->vq_cactive[zio->io_priority]++; + vq->vq_active++; if (vdev_queue_is_interactive(zio->io_priority)) { if (++vq->vq_ia_active == 1) vq->vq_nia_credit = 1; } else if (vq->vq_ia_active > 0) { vq->vq_nia_credit--; } - avl_add(&vq->vq_active_tree, zio); + zio->io_queue_state = ZIO_QS_ACTIVE; + list_insert_tail(&vq->vq_active_list, zio); } static void @@ -561,7 +577,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active--; + 
vq->vq_cactive[zio->io_priority]--; + vq->vq_active--; if (vdev_queue_is_interactive(zio->io_priority)) { if (--vq->vq_ia_active == 0) vq->vq_nia_credit = 0; @@ -569,7 +586,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) vq->vq_nia_credit = zfs_vdev_nia_credit; } else if (vq->vq_ia_active == 0) vq->vq_nia_credit++; - avl_remove(&vq->vq_active_tree, zio); + list_remove(&vq->vq_active_list, zio); + zio->io_queue_state = ZIO_QS_NONE; } static void @@ -602,29 +620,28 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) uint64_t maxgap = 0; uint64_t size; uint64_t limit; - int maxblocksize; boolean_t stretch = B_FALSE; - avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); - zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; uint64_t next_offset; abd_t *abd; + avl_tree_t *t; + + /* + * TRIM aggregation should not be needed since code in zfs_trim.c can + * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M). + */ + if (zio->io_type == ZIO_TYPE_TRIM) + return (NULL); + + if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) + return (NULL); - maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); if (vq->vq_vdev->vdev_nonrot) limit = zfs_vdev_aggregation_limit_non_rotating; else limit = zfs_vdev_aggregation_limit; - limit = MIN(limit, maxblocksize); - - if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0) - return (NULL); - - /* - * While TRIM commands could be aggregated based on offset this - * behavior is disabled until it's determined to be beneficial. - */ - if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim) + if (limit == 0) return (NULL); + limit = MIN(limit, SPA_MAXBLOCKSIZE); /* * I/Os to distributed spares are directly dispatched to the dRAID @@ -635,8 +652,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) first = last = zio; - if (zio->io_type == ZIO_TYPE_READ) + if (zio->io_type == ZIO_TYPE_READ) { maxgap = zfs_vdev_read_gap_limit; + t = &vq->vq_read_offset_tree; + } else { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + t = &vq->vq_write_offset_tree; + } /* * We can aggregate I/Os that are sufficiently adjacent and of @@ -657,6 +679,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) * Walk backwards through sufficiently contiguous I/Os * recording the last non-optional I/O. */ + zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; while ((dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= limit && @@ -686,7 +709,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (IO_SPAN(first, dio) <= limit || (dio->io_flags & ZIO_FLAG_OPTIONAL)) && - IO_SPAN(first, dio) <= maxblocksize && + IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE && IO_GAP(last, dio) <= maxgap && dio->io_type == zio->io_type) { last = dio; @@ -740,7 +763,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) return (NULL); size = IO_SPAN(first, last); - ASSERT3U(size, <=, maxblocksize); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); abd = abd_alloc_gang(); if (abd == NULL) @@ -824,19 +847,30 @@ again: return (NULL); } - /* - * For LBA-ordered queues (async / scrub / initializing), issue the - * i/o which follows the most recently issued i/o in LBA (offset) order. - * - * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp. 
- */ - tree = vdev_queue_class_tree(vq, p); - vq->vq_io_search.io_timestamp = 0; - vq->vq_io_search.io_offset = vq->vq_last_offset - 1; - VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL); - zio = avl_nearest(tree, idx, AVL_AFTER); - if (zio == NULL) - zio = avl_first(tree); + if (vdev_queue_class_fifo(p)) { + zio = list_head(&vq->vq_class[p].vqc_list); + } else { + /* + * For LBA-ordered queues (async / scrub / initializing), + * issue the I/O which follows the most recently issued I/O + * in LBA (offset) order, but to avoid starvation only within + * the same 0.5 second interval as the first I/O. + */ + tree = &vq->vq_class[p].vqc_tree; + zio = aio = avl_first(tree); + if (zio->io_offset < vq->vq_last_offset) { + vq->vq_io_search.io_timestamp = zio->io_timestamp; + vq->vq_io_search.io_offset = vq->vq_last_offset; + zio = avl_find(tree, &vq->vq_io_search, &idx); + if (zio == NULL) { + zio = avl_nearest(tree, idx, AVL_AFTER); + if (zio == NULL || + (zio->io_timestamp >> VDQ_T_SHIFT) != + (aio->io_timestamp >> VDQ_T_SHIFT)) + zio = aio; + } + } + } ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); @@ -967,7 +1001,6 @@ void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; - avl_tree_t *tree; /* * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio @@ -1002,12 +1035,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * Otherwise, the zio is currently active and we cannot change its * priority. */ - tree = vdev_queue_class_tree(vq, zio->io_priority); - if (avl_find(tree, zio, NULL) == zio) { - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + if (zio->io_queue_state == ZIO_QS_QUEUED) { + vdev_queue_class_remove(vq, zio); zio->io_priority = priority; - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { + vdev_queue_class_add(vq, zio); + } else if (zio->io_queue_state == ZIO_QS_NONE) { zio->io_priority = priority; } @@ -1020,10 +1052,10 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * vq_lock mutex use here, instead we prefer to keep it lock free for * performance. 
*/ -int +uint32_t vdev_queue_length(vdev_t *vd) { - return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); + return (vd->vdev_queue.vq_active); } uint64_t @@ -1032,15 +1064,22 @@ vdev_queue_last_offset(vdev_t *vd) return (vd->vdev_queue.vq_last_offset); } +uint64_t +vdev_queue_class_length(vdev_t *vd, zio_priority_t p) +{ + vdev_queue_t *vq = &vd->vdev_queue; + if (vdev_queue_class_fifo(p)) + return (list_is_empty(&vq->vq_class[p].vqc_list) == 0); + else + return (avl_numnodes(&vq->vq_class[p].vqc_tree)); +} + ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW, "Max vdev I/O aggregation size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT, ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, UINT, ZMOD_RW, - "Allow TRIM I/O to be aggregated"); - ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW, "Aggregate read I/O over gap"); From bc9d0084ea82a96bfe52e6f7943a554f218f871e Mon Sep 17 00:00:00 2001 From: Laevos <5572812+Laevos@users.noreply.github.com> Date: Tue, 27 Jun 2023 16:58:32 -0700 Subject: [PATCH 03/18] Remove unnecessary commas in zpool-create.8 Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Laevos <5572812+Laevos@users.noreply.github.com> Closes #15011 --- man/man8/zpool-create.8 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/man8/zpool-create.8 b/man/man8/zpool-create.8 index da1a79c72c..8449520944 100644 --- a/man/man8/zpool-create.8 +++ b/man/man8/zpool-create.8 @@ -87,13 +87,13 @@ currently in use by another subsystem. However this check is not robust enough to detect simultaneous attempts to use a new device in different pools, even if .Sy multihost Ns = Sy enabled . -The administrator must ensure, that simultaneous invocations of any combination +The administrator must ensure that simultaneous invocations of any combination of .Nm zpool Cm replace , .Nm zpool Cm create , .Nm zpool Cm add , or -.Nm zpool Cm labelclear , +.Nm zpool Cm labelclear do not refer to the same device. Using the same device in two pools will result in pool corruption. .Pp From b0cbc1aa9a1f2a3329f5e483ef9e297e1eca4833 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 27 Jun 2023 20:00:30 -0400 Subject: [PATCH 04/18] Use big transactions for small recordsize writes. When ZFS appends files in chunks bigger than recordsize, it borrows buffer from ARC and fills it before opening transaction. This supposed to help in case of page faults to not hold transaction open indefinitely. The problem appears when recordsize is set lower than default 128KB. Since each block is committed in separate transaction, per-transaction overhead becomes significant, and what is even worse, active use of of per-dataset and per-pool locks to protect space use accounting for each transaction badly hurts the code SMP scalability. The same transaction size limitation applies in case of file rewrite, but without even excuse of buffer borrowing. To address the issue, disable the borrowing mechanism if recordsize is smaller than default and the write request is 4x bigger than it. In such case writes up to 32MB are executed in single transaction, that dramatically reduces overhead and lock contention. 
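To put numbers on the paragraph above (illustrative only; DMU_MAX_ACCESS is assumed to be 64 MiB, so DMU_MAX_ACCESS >> 1 is the 32 MiB per-transaction cap mentioned): a single 1 MiB write to a dataset with recordsize=32K previously needed one transaction per block, and now fits in one:

#include <stdio.h>

int
main(void)
{
	long recordsize = 32 * 1024;		/* dataset recordsize = 32K */
	long request = 1024 * 1024;		/* one 1 MiB write(2) */
	long chunk_new = 32L * 1024 * 1024;	/* DMU_MAX_ACCESS >> 1, assumed 32 MiB */

	printf("before: %ld transactions, after: %ld\n",
	    (request + recordsize - 1) / recordsize,	/* 32 */
	    (request + chunk_new - 1) / chunk_new);	/* 1 */
	return (0);
}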
Since the borrowing mechanism is not used for file rewrites, and it was never used by zvols, which seem to work fine, I don't think this change should create significant problems, partially because in addition to the borrowing mechanism there are also used pre-faults. My tests with 4/8 threads writing several files same time on datasets with 32KB recordsize in 1MB requests show reduction of CPU usage by the user threads by 25-35%. I would measure it in GB/s, but at that block size we are now limited by the lock contention of single write issue taskqueue, which is a separate problem we are going to work on. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14964 --- module/zfs/zfs_vnops.c | 106 ++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 60 deletions(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 86706469ac..7bdcc16393 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -462,14 +462,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) return (SET_ERROR(EINVAL)); } - const uint64_t max_blksz = zfsvfs->z_max_blksz; - /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. - * Skip this if uio contains loaned arc_buf. */ - if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { + ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1); + if (zfs_uio_prefaultpages(pfbytes, uio)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFAULT)); } @@ -544,10 +542,31 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) break; } + uint64_t blksz; + if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) { + if (zp->z_blksz > zfsvfs->z_max_blksz && + !ISP2(zp->z_blksz)) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. + */ + blksz = 1 << highbit64(zp->z_blksz); + } else { + blksz = zfsvfs->z_max_blksz; + } + blksz = MIN(blksz, P2ROUNDUP(end_size, + SPA_MINBLOCKSIZE)); + blksz = MAX(blksz, zp->z_blksz); + } else { + blksz = zp->z_blksz; + } + arc_buf_t *abuf = NULL; - if (n >= max_blksz && woff >= zp->z_size && - P2PHASE(woff, max_blksz) == 0 && - zp->z_blksz == max_blksz) { + ssize_t nbytes = n; + if (n >= blksz && woff >= zp->z_size && + P2PHASE(woff, blksz) == 0 && + (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter @@ -555,18 +574,26 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). 
*/ - size_t cbytes; - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - max_blksz); + blksz); ASSERT(abuf != NULL); - ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = zfs_uiocopy(abuf->b_data, max_blksz, - UIO_WRITE, uio, &cbytes))) { + ASSERT(arc_buf_size(abuf) == blksz); + if ((error = zfs_uiocopy(abuf->b_data, blksz, + UIO_WRITE, uio, &nbytes))) { dmu_return_arcbuf(abuf); break; } - ASSERT3S(cbytes, ==, max_blksz); + ASSERT3S(nbytes, ==, blksz); + } else { + nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) - + P2PHASE(woff, blksz)); + if (pfbytes < nbytes) { + if (zfs_uio_prefaultpages(nbytes, uio)) { + error = SET_ERROR(EFAULT); + break; + } + pfbytes = nbytes; + } } /* @@ -576,8 +603,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); DB_DNODE_ENTER(db); - dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, - MIN(n, max_blksz)); + dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); @@ -600,31 +626,10 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * shrink down lr_length to the appropriate size. */ if (lr->lr_length == UINT64_MAX) { - uint64_t new_blksz; - - if (zp->z_blksz > max_blksz) { - /* - * File's blocksize is already larger than the - * "recordsize" property. Only let it grow to - * the next power of 2. - */ - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, - 1 << highbit64(zp->z_blksz)); - } else { - new_blksz = MIN(end_size, max_blksz); - } - zfs_grow_blocksize(zp, new_blksz, tx); + zfs_grow_blocksize(zp, blksz, tx); zfs_rangelock_reduce(lr, woff, n); } - /* - * XXX - should we really limit each write to z_max_blksz? - * Perhaps we should use SPA_MAXBLOCKSIZE chunks? - */ - const ssize_t nbytes = - MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - ssize_t tx_bytes; if (abuf == NULL) { tx_bytes = zfs_uio_resid(uio); @@ -644,12 +649,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * zfs_uio_prefaultpages, or prefaultpages may * error, and we may break the loop early. */ - if (tx_bytes != zfs_uio_resid(uio)) - n -= tx_bytes - zfs_uio_resid(uio); - if (zfs_uio_prefaultpages(MIN(n, max_blksz), - uio)) { - break; - } + n -= tx_bytes - zfs_uio_resid(uio); + pfbytes -= tx_bytes - zfs_uio_resid(uio); continue; } #endif @@ -665,15 +666,6 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) } tx_bytes -= zfs_uio_resid(uio); } else { - /* Implied by abuf != NULL: */ - ASSERT3S(n, >=, max_blksz); - ASSERT0(P2PHASE(woff, max_blksz)); - /* - * We can simplify nbytes to MIN(n, max_blksz) since - * P2PHASE(woff, max_blksz) is 0, and knowing - * n >= max_blksz lets us simplify further: - */ - ASSERT3S(nbytes, ==, max_blksz); /* * Thus, we're writing a full block at a block-aligned * offset and extending the file past EOF. @@ -758,13 +750,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) break; ASSERT3S(tx_bytes, ==, nbytes); n -= nbytes; - - if (n > 0) { - if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { - error = SET_ERROR(EFAULT); - break; - } - } + pfbytes -= nbytes; } zfs_znode_update_vfs(zp); From a9d6b0690b1863f39a7efce08b1227f2e9e26abb Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 27 Jun 2023 20:03:37 -0400 Subject: [PATCH 05/18] ZIL: Fix another use-after-free. 
lwb->lwb_issued_txg can not be accessed after lwb_state is set to LWB_STATE_FLUSH_DONE and zl_lock is dropped, since the lwb may be freed by zil_sync(). We must save the txg number before that. This is similar to the 55b1842f92, but as I see the bug is not new. It existed for quite a while, just was not triggered due to smaller race window. Reviewed-by: Allan Jude Reviewed-by: Brian Atkinson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14988 Closes #14999 --- module/zfs/zil.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index ee8dcce3b3..ef6f52542d 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1425,6 +1425,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio) list_move_tail(&itxs, &lwb->lwb_itxs); list_move_tail(&waiters, &lwb->lwb_waiters); + txg = lwb->lwb_issued_txg; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); lwb->lwb_state = LWB_STATE_FLUSH_DONE; @@ -1465,7 +1466,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio) list_destroy(&waiters); mutex_enter(&zilog->zl_lwb_io_lock); - txg = lwb->lwb_issued_txg; ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); zilog->zl_lwb_inflight[txg & TXG_MASK]--; if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0) From 62ace21a149c34cfe1b5870a76eb036fe805f869 Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski <0mp@FreeBSD.org> Date: Thu, 29 Jun 2023 19:54:43 +0200 Subject: [PATCH 06/18] zdb: Add missing poolname to -C synopsis Reviewed-by: Tino Reichardt Reviewed-by: Rob Norris Signed-off-by: Mateusz Piotrowski <0mp@FreeBSD.org> Sponsored-by: Klara Inc. Closes #15014 --- cmd/zdb/zdb.c | 2 +- man/man8/zdb.8 | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 04a10c4eed..9568d2bbfe 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -794,7 +794,7 @@ usage(void) "\t\t[-o =]... [-t ] [-U ] [-x ]\n" "\t\t[-K ] / []\n" "\t%s [-v] \n" - "\t%s -C [-A] [-U ]\n" + "\t%s -C [-A] [-U ] []\n" "\t%s -l [-Aqu] \n" "\t%s -m [-AFLPX] [-e [-V] [-p ...]] [-t ] " "[-U ]\n\t\t [ [ ...]]\n" diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index 031953c543..52c8e452fa 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -14,7 +14,7 @@ .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC. .\" Copyright (c) 2017 Intel Corporation. .\" -.Dd June 4, 2023 +.Dd June 27, 2023 .Dt ZDB 8 .Os . @@ -51,6 +51,7 @@ .Fl C .Op Fl A .Op Fl U Ar cache +.Op Ar poolname .Nm .Fl E .Op Fl A From 77a3bb1f47e67c233eb1961b8746748c02bafde1 Mon Sep 17 00:00:00 2001 From: Yuri Pankov <113725409+yuripv@users.noreply.github.com> Date: Thu, 29 Jun 2023 20:50:52 +0200 Subject: [PATCH 07/18] spa.h: use IN_BASE instead of IN_FREEBSD_BASE Consistently get the proper default value for autotrim. Currently, only the kernel module is built with IN_FREEBSD_BASE, and libzfs get the wrong default value, leading to confusion and incorrect output when autotrim value was not set explicitly. Reviewed-by: Warner Losh Signed-off-by: Yuri Pankov Closes #15016 --- include/sys/spa.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index 1fa2044008..ac0847793c 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -723,12 +723,12 @@ typedef enum spa_mode { * Send TRIM commands in-line during normal pool operation while deleting. * OFF: no * ON: yes - * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources. + * NB: IN_BASE is defined within the FreeBSD sources. 
*/ typedef enum { SPA_AUTOTRIM_OFF = 0, /* default */ SPA_AUTOTRIM_ON, -#ifdef IN_FREEBSD_BASE +#ifdef IN_BASE SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON, #else SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF, From 24554082bd93cb90400c4cb751275debda229009 Mon Sep 17 00:00:00 2001 From: vimproved <66446404+vimproved@users.noreply.github.com> Date: Thu, 29 Jun 2023 19:54:37 +0000 Subject: [PATCH 08/18] contrib: dracut: Conditionalize copying of libgcc_s.so.1 to glibc only The issue that this is designed to work around is only applicable to glibc, since it's caused by glibc's pthread_cancel() implementation using dlopen on libgcc_s.so.1 (and therefor not triggering dracut to include it in the initramfs). This commit adds an extra condition to the workaround that tests for glibc via "ldconfig -p | grep -qF 'libc.so.6'" (which should only be present on glibc systems). Reviewed-by: Brian Behlendorf Signed-off-by: Violet Purcell Closes #14992 --- contrib/dracut/90zfs/module-setup.sh.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/dracut/90zfs/module-setup.sh.in b/contrib/dracut/90zfs/module-setup.sh.in index e55cb60e16..acad468edf 100755 --- a/contrib/dracut/90zfs/module-setup.sh.in +++ b/contrib/dracut/90zfs/module-setup.sh.in @@ -36,7 +36,7 @@ install() { { dfatal "Failed to install essential binaries"; exit 1; } # Adapted from https://github.com/zbm-dev/zfsbootmenu - if ! ldd "$(command -v zpool)" | grep -qF 'libgcc_s.so'; then + if ! ldd "$(command -v zpool)" | grep -qF 'libgcc_s.so' && ldconfig -p 2> /dev/null | grep -qF 'libc.so.6' ; then # On systems with gcc-config (Gentoo, Funtoo, etc.), use it to find libgcc_s if command -v gcc-config >/dev/null; then inst_simple "/usr/lib/gcc/$(s=$(gcc-config -c); echo "${s%-*}/${s##*-}")/libgcc_s.so.1" || From eda32dca92a854e5d23877166188c476c2ac75bd Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 30 Jun 2023 11:36:43 -0400 Subject: [PATCH 09/18] Fix remount when setting multiple properties. The previous code was checking zfs_is_namespace_prop() only for the last property on the list. If one was not "namespace", then remount wasn't called. To fix that move zfs_is_namespace_prop() inside the loop and remount if at least one of properties was "namespace". Reviewed-by: Umer Saleem Reviewed-by: Ameer Hamza Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15000 --- lib/libzfs/libzfs_dataset.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index fe9f3268d3..11d3eb6a3c 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -1789,7 +1789,8 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) nvlist_t *nvl; int nvl_len = 0; int added_resv = 0; - zfs_prop_t prop = 0; + zfs_prop_t prop; + boolean_t nsprop = B_FALSE; nvpair_t *elem; (void) snprintf(errbuf, sizeof (errbuf), @@ -1836,6 +1837,7 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) elem = nvlist_next_nvpair(nvl, elem)) { prop = zfs_name_to_prop(nvpair_name(elem)); + nsprop |= zfs_is_namespace_prop(prop); assert(cl_idx < nvl_len); /* @@ -1934,8 +1936,7 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) * if one of the options handled by the generic * Linux namespace layer has been modified. 
*/ - if (zfs_is_namespace_prop(prop) && - zfs_is_mounted(zhp, NULL)) + if (nsprop && zfs_is_mounted(zhp, NULL)) ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); } } From 6052060c133d0caed0e1bc3ec2c057f8c33e5f7a Mon Sep 17 00:00:00 2001 From: Arshad Hussain Date: Fri, 30 Jun 2023 21:07:26 +0530 Subject: [PATCH 10/18] Don't use hard-coded 'size' value in snprintf() This patch changes the passing of "size" to snprintf from hard-coded (openended) to sizeof(errbuf). This is bringing to standard with rest of the code where- ever 'errbuf' is used. Reviewed-by: Brian Behlendorf Signed-off-by: Arshad Hussain Closes #15003 --- cmd/zfs/zfs_main.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index e28f1d04f3..5ed25d1ea7 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -6057,8 +6057,8 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) if (p != NULL) rid = p->pw_uid; else if (*endch != '\0') { - (void) snprintf(errbuf, 256, gettext( - "invalid user %s\n"), curr); + (void) snprintf(errbuf, sizeof (errbuf), + gettext("invalid user %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else if (opts->group) { @@ -6071,8 +6071,9 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) if (g != NULL) rid = g->gr_gid; else if (*endch != '\0') { - (void) snprintf(errbuf, 256, gettext( - "invalid group %s\n"), curr); + (void) snprintf(errbuf, sizeof (errbuf), + gettext("invalid group %s\n"), + curr); allow_usage(un, B_TRUE, errbuf); } } else { @@ -6097,8 +6098,9 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) who_type = ZFS_DELEG_GROUP; rid = g->gr_gid; } else { - (void) snprintf(errbuf, 256, gettext( - "invalid user/group %s\n"), curr); + (void) snprintf(errbuf, sizeof (errbuf), + gettext("invalid user/group %s\n"), + curr); allow_usage(un, B_TRUE, errbuf); } } From fa7b2390d4982412f9dd27c151bb5ec2da89dcca Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 30 Jun 2023 11:47:13 -0400 Subject: [PATCH 11/18] Do not report bytes skipped by scan as issued. Scan process may skip blocks based on their birth time, DVA, etc. Traditionally those blocks were accounted as issued, that caused reporting of hugely over-inflated numbers, having nothing to do with actual disk I/O. This change utilizes never used field in struct dsl_scan_phys to account such skipped bytes, allowing to report how much data were actually scrubbed/resilvered and what is the actual I/O speed. While formally it is an on-disk format change, it should be compatible both ways, so should not need a feature flag. This should partially address the same issue as c85ac731a0e, but from a different perspective, complementing it. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Akash B Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15007 --- cmd/zpool/zpool_main.c | 94 +++++++++++++++++------------ cmd/zpool_influxdb/zpool_influxdb.c | 2 +- include/sys/dsl_scan.h | 2 +- include/sys/fs/zfs.h | 3 +- include/sys/vdev_rebuild.h | 1 + man/man8/zpool-scrub.8 | 4 +- module/zfs/dsl_scan.c | 22 +++++-- module/zfs/spa_misc.c | 2 +- module/zfs/vdev_rebuild.c | 6 +- 9 files changed, 84 insertions(+), 52 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 013dd4a233..10a3b5b14f 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -7662,11 +7662,11 @@ static void print_scan_scrub_resilver_status(pool_scan_stat_t *ps) { time_t start, end, pause; - uint64_t pass_scanned, scanned, pass_issued, issued, total; + uint64_t pass_scanned, scanned, pass_issued, issued, total_s, total_i; uint64_t elapsed, scan_rate, issue_rate; double fraction_done; - char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; - char srate_buf[7], irate_buf[7], time_buf[32]; + char processed_buf[7], scanned_buf[7], issued_buf[7], total_s_buf[7]; + char total_i_buf[7], srate_buf[7], irate_buf[7], time_buf[32]; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); @@ -7738,10 +7738,11 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) pass_scanned = ps->pss_pass_exam; issued = ps->pss_issued; pass_issued = ps->pss_pass_issued; - total = ps->pss_to_examine; + total_s = ps->pss_to_examine; + total_i = ps->pss_to_examine - ps->pss_skipped; /* we are only done with a block once we have issued the IO for it */ - fraction_done = (double)issued / total; + fraction_done = (double)issued / total_i; /* elapsed time for this pass, rounding up to 1 if it's 0 */ elapsed = time(NULL) - ps->pss_pass_start; @@ -7750,26 +7751,25 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) scan_rate = pass_scanned / elapsed; issue_rate = pass_issued / elapsed; - uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ? 
- ((total - issued) / issue_rate) : UINT64_MAX; - secs_to_dhms(total_secs_left, time_buf); /* format all of the numbers we will be reporting */ zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); zfs_nicebytes(issued, issued_buf, sizeof (issued_buf)); - zfs_nicebytes(total, total_buf, sizeof (total_buf)); - zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); - zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); + zfs_nicebytes(total_s, total_s_buf, sizeof (total_s_buf)); + zfs_nicebytes(total_i, total_i_buf, sizeof (total_i_buf)); /* do not print estimated time if we have a paused scrub */ - if (pause == 0) { - (void) printf(gettext("\t%s scanned at %s/s, " - "%s issued at %s/s, %s total\n"), - scanned_buf, srate_buf, issued_buf, irate_buf, total_buf); - } else { - (void) printf(gettext("\t%s scanned, %s issued, %s total\n"), - scanned_buf, issued_buf, total_buf); + (void) printf(gettext("\t%s / %s scanned"), scanned_buf, total_s_buf); + if (pause == 0 && scan_rate > 0) { + zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); + (void) printf(gettext(" at %s/s"), srate_buf); } + (void) printf(gettext(", %s / %s issued"), issued_buf, total_i_buf); + if (pause == 0 && issue_rate > 0) { + zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); + (void) printf(gettext(" at %s/s"), irate_buf); + } + (void) printf(gettext("\n")); if (is_resilver) { (void) printf(gettext("\t%s resilvered, %.2f%% done"), @@ -7782,16 +7782,16 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) if (pause == 0) { /* * Only provide an estimate iff: - * 1) the time remaining is valid, and + * 1) we haven't yet issued all we expected, and * 2) the issue rate exceeds 10 MB/s, and * 3) it's either: * a) a resilver which has started repairs, or * b) a scrub which has entered the issue phase. 
*/ - if (total_secs_left != UINT64_MAX && - issue_rate >= 10 * 1024 * 1024 && + if (total_i >= issued && issue_rate >= 10 * 1024 * 1024 && ((is_resilver && ps->pss_processed > 0) || (is_scrub && issued > 0))) { + secs_to_dhms((total_i - issued) / issue_rate, time_buf); (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " @@ -7803,7 +7803,7 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) } static void -print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) +print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, uint_t c, char *vdev_name) { if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) return; @@ -7815,17 +7815,20 @@ print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) uint64_t bytes_scanned = vrs->vrs_bytes_scanned; uint64_t bytes_issued = vrs->vrs_bytes_issued; uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; - uint64_t bytes_est = vrs->vrs_bytes_est; + uint64_t bytes_est_s = vrs->vrs_bytes_est; + uint64_t bytes_est_i = vrs->vrs_bytes_est; + if (c > offsetof(vdev_rebuild_stat_t, vrs_pass_bytes_skipped) / 8) + bytes_est_i -= vrs->vrs_pass_bytes_skipped; uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / (vrs->vrs_pass_time_ms + 1)) * 1000; uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / (vrs->vrs_pass_time_ms + 1)) * 1000; double scan_pct = MIN((double)bytes_scanned * 100 / - (bytes_est + 1), 100); + (bytes_est_s + 1), 100); /* Format all of the numbers we will be reporting */ char bytes_scanned_buf[7], bytes_issued_buf[7]; - char bytes_rebuilt_buf[7], bytes_est_buf[7]; + char bytes_rebuilt_buf[7], bytes_est_s_buf[7], bytes_est_i_buf[7]; char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; zfs_nicebytes(bytes_scanned, bytes_scanned_buf, sizeof (bytes_scanned_buf)); @@ -7833,9 +7836,8 @@ print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) sizeof (bytes_issued_buf)); zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf, sizeof (bytes_rebuilt_buf)); - zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf)); - zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); - zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); + zfs_nicebytes(bytes_est_s, bytes_est_s_buf, sizeof (bytes_est_s_buf)); + zfs_nicebytes(bytes_est_i, bytes_est_i_buf, sizeof (bytes_est_i_buf)); time_t start = vrs->vrs_start_time; time_t end = vrs->vrs_end_time; @@ -7858,17 +7860,29 @@ print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); - secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) / - MAX(scan_rate, 1), time_buf); + (void) printf(gettext("\t%s / %s scanned"), bytes_scanned_buf, + bytes_est_s_buf); + if (scan_rate > 0) { + zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); + (void) printf(gettext(" at %s/s"), scan_rate_buf); + } + (void) printf(gettext(", %s / %s issued"), bytes_issued_buf, + bytes_est_i_buf); + if (issue_rate > 0) { + zfs_nicebytes(issue_rate, issue_rate_buf, + sizeof (issue_rate_buf)); + (void) printf(gettext(" at %s/s"), issue_rate_buf); + } + (void) printf(gettext("\n")); - (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, " - "%s total\n"), bytes_scanned_buf, scan_rate_buf, - bytes_issued_buf, issue_rate_buf, bytes_est_buf); (void) printf(gettext("\t%s resilvered, %.2f%% done"), bytes_rebuilt_buf, scan_pct); if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { - if (scan_rate >= 10 * 1024 * 1024) { + if (bytes_est_s >= bytes_scanned && + scan_rate >= 10 * 1024 * 
1024) { + secs_to_dhms((bytes_est_s - bytes_scanned) / scan_rate, + time_buf); (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " @@ -7900,7 +7914,7 @@ print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot) ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { char *name = zpool_vdev_name(g_zfs, zhp, child[c], VDEV_NAME_TYPE_ID); - print_rebuild_status_impl(vrs, name); + print_rebuild_status_impl(vrs, i, name); free(name); } } @@ -8005,13 +8019,15 @@ print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) active_resilver = (ps->pss_state == DSS_SCANNING); } - have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); scrub_start = ps->pss_start_time; - have_errorscrub = (ps->pss_error_scrub_func == - POOL_SCAN_ERRORSCRUB); - errorscrub_start = ps->pss_error_scrub_start; + if (c > offsetof(pool_scan_stat_t, + pss_pass_error_scrub_pause) / 8) { + have_errorscrub = (ps->pss_error_scrub_func == + POOL_SCAN_ERRORSCRUB); + errorscrub_start = ps->pss_error_scrub_start; + } } boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); diff --git a/cmd/zpool_influxdb/zpool_influxdb.c b/cmd/zpool_influxdb/zpool_influxdb.c index 80d0848589..520e569269 100644 --- a/cmd/zpool_influxdb/zpool_influxdb.c +++ b/cmd/zpool_influxdb/zpool_influxdb.c @@ -238,6 +238,7 @@ print_scan_status(nvlist_t *nvroot, const char *pool_name) print_kv("end_ts", ps->pss_end_time); print_kv(",errors", ps->pss_errors); print_kv(",examined", examined); + print_kv(",skipped", ps->pss_skipped); print_kv(",issued", ps->pss_issued); print_kv(",pass_examined", pass_exam); print_kv(",pass_issued", ps->pss_pass_issued); @@ -249,7 +250,6 @@ print_scan_status(nvlist_t *nvroot, const char *pool_name) print_kv(",remaining_t", remaining_time); print_kv(",start_ts", ps->pss_start_time); print_kv(",to_examine", ps->pss_to_examine); - print_kv(",to_process", ps->pss_to_process); printf(" %llu\n", (u_longlong_t)timestamp); return (0); } diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 6753b4a8f3..2e3452e5eb 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -61,7 +61,7 @@ typedef struct dsl_scan_phys { uint64_t scn_end_time; uint64_t scn_to_examine; /* total bytes to be scanned */ uint64_t scn_examined; /* bytes scanned so far */ - uint64_t scn_to_process; + uint64_t scn_skipped; /* bytes skipped by scanner */ uint64_t scn_processed; uint64_t scn_errors; /* scan I/O error count */ uint64_t scn_ddt_class_max; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 93193fa142..bc940e8a79 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1088,7 +1088,7 @@ typedef struct pool_scan_stat { uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total bytes located by scanner */ - uint64_t pss_to_process; /* total bytes to process */ + uint64_t pss_skipped; /* total bytes skipped by scanner */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ @@ -1152,6 +1152,7 @@ typedef struct vdev_rebuild_stat { uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */ uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */ uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */ + uint64_t vrs_pass_bytes_skipped; /* bytes skipped since start/resume */ } vdev_rebuild_stat_t; /* diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h index 
c4cfe0c567..55ec6c5703 100644 --- a/include/sys/vdev_rebuild.h +++ b/include/sys/vdev_rebuild.h @@ -79,6 +79,7 @@ typedef struct vdev_rebuild { uint64_t vr_pass_start_time; uint64_t vr_pass_bytes_scanned; uint64_t vr_pass_bytes_issued; + uint64_t vr_pass_bytes_skipped; /* On-disk state updated by vdev_rebuild_zap_update_sync() */ vdev_rebuild_phys_t vr_rebuild_phys; diff --git a/man/man8/zpool-scrub.8 b/man/man8/zpool-scrub.8 index 138226e456..03f3ad4991 100644 --- a/man/man8/zpool-scrub.8 +++ b/man/man8/zpool-scrub.8 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd July 25, 2021 +.Dd June 22, 2023 .Dt ZPOOL-SCRUB 8 .Os . @@ -123,7 +123,7 @@ Status of pool with ongoing scrub: .No # Nm zpool Cm status ... scan: scrub in progress since Sun Jul 25 16:07:49 2021 - 403M scanned at 100M/s, 68.4M issued at 10.0M/s, 405M total + 403M / 405M scanned at 100M/s, 68.4M / 405M issued at 10.0M/s 0B repaired, 16.91% done, 00:00:04 to go ... .Ed diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 1dd44171c1..50428bff3e 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -573,7 +573,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) * counter to how far we've scanned. We know we're consistent * up to here. */ - scn->scn_issued_before_pass = scn->scn_phys.scn_examined; + scn->scn_issued_before_pass = scn->scn_phys.scn_examined - + scn->scn_phys.scn_skipped; if (dsl_scan_is_running(scn) && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { @@ -4362,7 +4363,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) * Disabled by default, set zfs_scan_report_txgs to report * average performance over the last zfs_scan_report_txgs TXGs. */ - if (!dsl_scan_is_paused_scrub(scn) && zfs_scan_report_txgs != 0 && + if (zfs_scan_report_txgs != 0 && tx->tx_txg % zfs_scan_report_txgs == 0) { scn->scn_issued_before_pass += spa->spa_scan_pass_issued; spa_scan_stat_init(spa); @@ -4564,6 +4565,15 @@ count_block_issued(spa_t *spa, const blkptr_t *bp, boolean_t all) all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); } +static void +count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all) +{ + if (BP_IS_EMBEDDED(bp)) + return; + atomic_add_64(&scn->scn_phys.scn_skipped, + all ? 
BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); +} + static void count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) { @@ -4709,7 +4719,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, count_block(dp->dp_blkstats, bp); if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) { - count_block_issued(spa, bp, B_TRUE); + count_block_skipped(scn, bp, B_TRUE); return (0); } @@ -4750,7 +4760,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, if (needs_io && !zfs_no_scrub_io) { dsl_scan_enqueue(dp, bp, zio_flags, zb); } else { - count_block_issued(spa, bp, B_TRUE); + count_block_skipped(scn, bp, B_TRUE); } /* do not relocate this block */ @@ -5119,9 +5129,9 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); range_tree_remove_fill(queue->q_exts_by_addr, start, size); - /* count the block as though we issued it */ + /* count the block as though we skipped it */ sio2bp(sio, &tmpbp); - count_block_issued(spa, &tmpbp, B_FALSE); + count_block_skipped(scn, &tmpbp, B_FALSE); sio_free(sio); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 8dc83445e1..06f6407690 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -2611,7 +2611,7 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_examined = scn->scn_phys.scn_examined; - ps->pss_to_process = scn->scn_phys.scn_to_process; + ps->pss_skipped = scn->scn_phys.scn_skipped; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 62aa61b3b9..75c3900cbb 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -571,8 +571,10 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) vdev_rebuild_blkptr_init(&blk, vd, start, size); uint64_t psize = BP_GET_PSIZE(&blk); - if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) + if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) { + vr->vr_pass_bytes_skipped += size; return (0); + } mutex_enter(&vr->vr_io_lock); @@ -786,6 +788,7 @@ vdev_rebuild_thread(void *arg) vr->vr_pass_start_time = gethrtime(); vr->vr_pass_bytes_scanned = 0; vr->vr_pass_bytes_issued = 0; + vr->vr_pass_bytes_skipped = 0; uint64_t update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, 0); @@ -1153,6 +1156,7 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) vr->vr_pass_start_time); vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; + vrs->vrs_pass_bytes_skipped = vr->vr_pass_bytes_skipped; mutex_exit(&tvd->vdev_rebuild_lock); } From b4a0873092353cefe0f1bc3ee6a50d5e16b35675 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 30 Jun 2023 11:54:00 -0400 Subject: [PATCH 12/18] Some ZIO micro-optimizations. - Pack struct zio_prop by 4 bytes from 84 to 80. - Skip new child ZIO locking while linking to parent. The newly allocated ZIO is not externally visible yet, so nobody should care. - Skip io_bp_copy writes when not used (write && non-debug). Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #14985 --- include/sys/zio.h | 3 ++- module/zfs/zio.c | 52 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/include/sys/zio.h b/include/sys/zio.h index 85217b873d..f4da80783e 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -341,9 +341,9 @@ typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; uint8_t zp_complevel; - dmu_object_type_t zp_type; uint8_t zp_level; uint8_t zp_copies; + dmu_object_type_t zp_type; boolean_t zp_dedup; boolean_t zp_dedup_verify; boolean_t zp_nopwrite; @@ -611,6 +611,7 @@ extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); extern zio_t *zio_unique_parent(zio_t *cio); extern void zio_add_child(zio_t *pio, zio_t *cio); +extern void zio_add_child_first(zio_t *pio, zio_t *cio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index fb8164f0ae..10279fde89 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -626,8 +626,6 @@ zio_unique_parent(zio_t *cio) void zio_add_child(zio_t *pio, zio_t *cio) { - zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); - /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. @@ -636,6 +634,7 @@ zio_add_child(zio_t *pio, zio_t *cio) */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; @@ -644,8 +643,9 @@ zio_add_child(zio_t *pio, zio_t *cio) ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; + countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); @@ -654,6 +654,37 @@ zio_add_child(zio_t *pio, zio_t *cio) mutex_exit(&pio->io_lock); } +void +zio_add_child_first(zio_t *pio, zio_t *cio) +{ + /* + * Logical I/Os can have logical, gang, or vdev children. + * Gang I/Os can have gang or vdev children. + * Vdev I/Os can only have vdev children. + * The following ASSERT captures all of these constraints. 
+ */ + ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); + zl->zl_parent = pio; + zl->zl_child = cio; + + ASSERT(list_is_empty(&cio->io_parent_list)); + list_insert_head(&cio->io_parent_list, zl); + + mutex_enter(&pio->io_lock); + + ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + + uint64_t *countp = pio->io_children[cio->io_child_type]; + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + countp[w] += !cio->io_state[w]; + + list_insert_head(&pio->io_child_list, zl); + + mutex_exit(&pio->io_lock); +} + static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { @@ -840,12 +871,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { - zio->io_bp = (blkptr_t *)bp; - zio->io_bp_copy = *bp; - zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE || - zio->io_child_type == ZIO_CHILD_DDT) + zio->io_child_type == ZIO_CHILD_DDT) { + zio->io_bp_copy = *bp; zio->io_bp = &zio->io_bp_copy; /* so caller can free */ + } else { + zio->io_bp = (blkptr_t *)bp; + } + zio->io_bp_orig = *bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) @@ -880,7 +913,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; - zio_add_child(pio, zio); + zio_add_child_first(pio, zio); } taskq_init_ent(&zio->io_tqent); @@ -1601,7 +1634,6 @@ zio_read_bp_init(zio_t *zio) abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); } if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) @@ -4442,8 +4474,10 @@ zio_ready(zio_t *zio) zio->io_ready(zio); } +#ifdef ZFS_DEBUG if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; +#endif if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; From 233425a153af74b7d5ef9730684f3a1d61ff8f11 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 30 Jun 2023 11:59:39 -0400 Subject: [PATCH 13/18] Again fix race between zil_commit() and zil_suspend(). With zl_suspend read in zil_commit() not protected by any locks it is possible for new ZIL writes to be in progress while zil_destroy() called by zil_suspend() freeing them. This patch closes the race by taking zl_issuer_lock in zil_suspend() and adding the second zl_suspend check to zil_get_commit_list(), protected by the lock. It allows all already queued transactions to be logged normally, while blocks any new ones, calling txg_wait_synced() for the TXGs. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #14979 --- module/zfs/zil.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index ef6f52542d..00d66a2481 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -798,8 +798,8 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - ASSERT(list_is_empty(&lwb->lwb_waiters)); - ASSERT(list_is_empty(&lwb->lwb_itxs)); + VERIFY(list_is_empty(&lwb->lwb_waiters)); + VERIFY(list_is_empty(&lwb->lwb_itxs)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); @@ -2525,10 +2525,10 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg) * This function will traverse the queue of itxs that need to be * committed, and move them onto the ZIL's zl_itx_commit_list. */ -static void +static uint64_t zil_get_commit_list(zilog_t *zilog) { - uint64_t otxg, txg; + uint64_t otxg, txg, wtxg = 0; list_t *commit_list = &zilog->zl_itx_commit_list; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); @@ -2562,10 +2562,22 @@ zil_get_commit_list(zilog_t *zilog) */ ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); - list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); + list_t *sync_list = &itxg->itxg_itxs->i_sync_list; + if (unlikely(zilog->zl_suspend > 0)) { + /* + * ZIL was just suspended, but we lost the race. + * Allow all earlier itxs to be committed, but ask + * caller to do txg_wait_synced(txg) for any new. + */ + if (!list_is_empty(sync_list)) + wtxg = MAX(wtxg, txg); + } else { + list_move_tail(commit_list, sync_list); + } mutex_exit(&itxg->itxg_lock); } + return (wtxg); } /* @@ -2953,11 +2965,12 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * not issued, we rely on future calls to zil_commit_writer() to issue * the lwb, or the timeout mechanism found in zil_commit_waiter(). 
*/ -static void +static uint64_t zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { list_t ilwbs; lwb_t *lwb; + uint64_t wtxg = 0; ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); @@ -2987,7 +3000,7 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) ZIL_STAT_BUMP(zilog, zil_commit_writer_count); - zil_get_commit_list(zilog); + wtxg = zil_get_commit_list(zilog); zil_prune_commit_list(zilog); zil_process_commit_list(zilog, zcw, &ilwbs); @@ -2996,6 +3009,7 @@ out: while ((lwb = list_remove_head(&ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); list_destroy(&ilwbs); + return (wtxg); } static void @@ -3511,7 +3525,7 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) zil_commit_waiter_t *zcw = zil_alloc_commit_waiter(); zil_commit_itx_assign(zilog, zcw); - zil_commit_writer(zilog, zcw); + uint64_t wtxg = zil_commit_writer(zilog, zcw); zil_commit_waiter(zilog, zcw); if (zcw->zcw_zio_error != 0) { @@ -3526,6 +3540,8 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog, zil_commit_waiter_t *, zcw); txg_wait_synced(zilog->zl_dmu_pool, 0); + } else if (wtxg != 0) { + txg_wait_synced(zilog->zl_dmu_pool, wtxg); } zil_free_commit_waiter(zcw); @@ -3905,11 +3921,13 @@ zil_suspend(const char *osname, void **cookiep) return (error); zilog = dmu_objset_zil(os); + mutex_enter(&zilog->zl_issuer_lock); mutex_enter(&zilog->zl_lock); zh = zilog->zl_header; if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); + mutex_exit(&zilog->zl_issuer_lock); dmu_objset_rele(os, suspend_tag); return (SET_ERROR(EBUSY)); } @@ -3923,6 +3941,7 @@ zil_suspend(const char *osname, void **cookiep) if (cookiep == NULL && !zilog->zl_suspending && (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { mutex_exit(&zilog->zl_lock); + mutex_exit(&zilog->zl_issuer_lock); dmu_objset_rele(os, suspend_tag); return (0); } @@ -3931,6 +3950,7 @@ zil_suspend(const char *osname, void **cookiep) dsl_pool_rele(dmu_objset_pool(os), suspend_tag); zilog->zl_suspend++; + mutex_exit(&zilog->zl_issuer_lock); if (zilog->zl_suspend > 1) { /* From 61ab05cac74830f2658cd16138c5876b4b31b4fa Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 1 Jul 2023 02:01:58 +1000 Subject: [PATCH 14/18] ddt_addref: remove unnecessary phys fill when refcount is 0 The previous comment wondered if this case could happen; it turns out that it really can't. This block can only be entered if dde_type and dde_class are "real"; that only happens when a ddt entry has been previously synced to a ddt store, that is, it was created on a previous txg. Since its gone through that sync, its dde_refcount must be >0. ddt_addref() is called from brt_pending_apply(), which is called at the beginning of spa_sync(), before pending DMU writes/frees are issued. Freeing a dedup block is the only thing that can decrement dde_refcount, so there's no way for it to drop to zero before applying the clone bumps it. Further, even if it _could_ go to zero, it wouldn't be necessary to fill the entry from the block. The phys content is not cleared until the free is issued, which happens when the refcount goes to zero, when the last real free comes through. The cloned block should be identical to what's in the phys already, so the fill should be a no-op anyway. I've replaced this with an assertion because this is all very dependent on the ordering in which BRT and DDT changes are applied, and that might change in the future. 
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-By: Klara, Inc. Closes #15004 --- module/zfs/ddt.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 33fea0ba3d..1fb1982199 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -1209,10 +1209,19 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ASSERT3S(dde->dde_class, <, DDT_CLASSES); ddp = &dde->dde_phys[BP_GET_NDVAS(bp)]; - if (ddp->ddp_refcnt == 0) { - /* This should never happen? */ - ddt_phys_fill(ddp, bp); - } + + /* + * This entry already existed (dde_type is real), so it must + * have refcnt >0 at the start of this txg. We are called from + * brt_pending_apply(), before frees are issued, so the refcnt + * can't be lowered yet. Therefore, it must be >0. We assert + * this because if the order of BRT and DDT interactions were + * ever to change and the refcnt was ever zero here, then + * likely further action is required to fill out the DDT entry, + * and this is a place that is likely to be missed in testing. + */ + ASSERT3U(ddp->ddp_refcnt, >, 0); + ddt_phys_addref(ddp); result = B_TRUE; } else { From 2b10e32561dff234144c0b0d998c60359864ac71 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Fri, 30 Jun 2023 12:42:02 -0400 Subject: [PATCH 15/18] Pack our DDT ZAPs a bit denser. The DDT is really inefficient on 4k and up vdevs, because it always allocates 4k blocks, and while compression could save us somewhat at ashift 9, that stops being true. So let's change the default to 32 KiB, which seems like a reasonable compromise between improved space savings and inflated write sizes for DDT updates. Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #14654 --- man/man4/zfs.4 | 10 ++++++++++ module/zfs/ddt_zap.c | 13 ++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 04bbbc5fdf..271b02b6ee 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -239,6 +239,16 @@ relative to the pool. Make some blocks above a certain size be gang blocks. This option is used by the test suite to facilitate testing. . +.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +Default DDT ZAP data block size as a power of 2. Note that changing this after +creating a DDT on the pool will not affect existing DDTs, only newly created +ones. +. +.It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +Default DDT ZAP indirect block size as a power of 2. Note that changing this +after creating a DDT on the pool will not affect existing DDTs, only newly +created ones. +. .It Sy zfs_default_bs Ns = Ns Sy 9 Po 512 B Pc Pq int Default dnode block size as a power of 2. . 
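As a usage sketch only: assuming the tunables are exposed under the names documented in the zfs.4 entries above (zfs_ddt_zap_default_bs and zfs_ddt_zap_default_ibs) and that the module options file is read at load time, the new defaults could be pinned or overridden from /etc/modprobe.d:

    # Hypothetical /etc/modprobe.d/zfs.conf entry; both values are powers of 2,
    # so 15 keeps the new 32 KiB default and 12 would restore the old 4 KiB blocks.
    options zfs zfs_ddt_zap_default_bs=15 zfs_ddt_zap_default_ibs=15

Per the man page text above, this only affects DDT ZAPs created afterwards; existing DDTs keep the block sizes they were created with.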
diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 27dbbc55f1..8f6397a6d1 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -31,8 +31,8 @@ #include #include -static const int ddt_zap_leaf_blockshift = 12; -static const int ddt_zap_indirect_blockshift = 12; +static unsigned int ddt_zap_default_bs = 15; +static unsigned int ddt_zap_default_ibs = 15; static int ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) @@ -43,7 +43,7 @@ ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) flags |= ZAP_FLAG_PRE_HASHED_KEY; *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, - ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, + ddt_zap_default_bs, ddt_zap_default_ibs, DMU_OT_NONE, 0, tx); return (*objectp == 0 ? SET_ERROR(ENOTSUP) : 0); @@ -166,3 +166,10 @@ const ddt_ops_t ddt_zap_ops = { ddt_zap_walk, ddt_zap_count, }; + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_bs, UINT, ZMOD_RW, + "DDT ZAP leaf blockshift"); +ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_ibs, UINT, ZMOD_RW, + "DDT ZAP indirect blockshift"); +/* END CSTYLED */ From ac8ae18d2255eab48a77e3fa4e9e6e3230bde015 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 30 Jun 2023 10:03:41 -0700 Subject: [PATCH 16/18] Revert "spa.h: use IN_BASE instead of IN_FREEBSD_BASE" This reverts commit 77a3bb1f47e67c233eb1961b8746748c02bafde1. Signed-off-by: Brian Behlendorf --- include/sys/spa.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index ac0847793c..1fa2044008 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -723,12 +723,12 @@ typedef enum spa_mode { * Send TRIM commands in-line during normal pool operation while deleting. * OFF: no * ON: yes - * NB: IN_BASE is defined within the FreeBSD sources. + * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources. */ typedef enum { SPA_AUTOTRIM_OFF = 0, /* default */ SPA_AUTOTRIM_ON, -#ifdef IN_BASE +#ifdef IN_FREEBSD_BASE SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON, #else SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF, From 945e39fc3a34dbffb9a630a99ae523f2e03e314b Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Fri, 30 Jun 2023 11:34:05 -0700 Subject: [PATCH 17/18] Enable tuning of ZVOL open timeout value The default timeout for ZVOL opens may not be sufficient for all cases, so we should enable the value to be more easily tuned to account for systems where the default value is insufficient. 
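As an illustration of the intended tuning, assuming the standard Linux sysfs path for zfs module parameters and that the parameter was compiled in at all (it only exists when the kernel lacks ERESTARTSYS handling in blkdev_get, per the #ifndef guard in the diff below), the retry window could be widened at runtime on a system where zvol device nodes take longer than the default one second to become openable:

    # Hypothetical example: allow up to 3 seconds of ZVOL open retries
    echo 3000 > /sys/module/zfs/parameters/zvol_open_timeout_ms

A persistent setting would instead use the usual options line in /etc/modprobe.d.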
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Matthew Ahrens Signed-off-by: Prakash Surya Closes #15023 --- module/os/linux/zfs/zvol_os.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index cdf32c78b4..38bc8e2c4e 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -54,7 +54,7 @@ static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS -static const unsigned int zvol_open_timeout_ms = 1000; +static unsigned int zvol_open_timeout_ms = 1000; #endif static unsigned int zvol_threads = 0; @@ -1612,4 +1612,9 @@ MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, "Process volblocksize blocks per thread"); #endif +#ifndef HAVE_BLKDEV_GET_ERESTARTSYS +module_param(zvol_open_timeout_ms, uint, 0644); +MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); +#endif + /* END CSTYLED */ From 009d3288dea524c7ad373b04b65bee8bb6f0bfea Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 30 Jun 2023 11:42:14 -0700 Subject: [PATCH 18/18] Tag 2.2.0-rc1 New features: - Fully adaptive ARC eviction (#14359) - Block cloning (#13392) - Scrub error log (#12812, #12355) - Linux container support (#14070, #14097, #12263) - BLAKE3 Checksums (#12918) - Corrective "zfs receive" (#9372) Signed-off-by: Brian Behlendorf --- META | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/META b/META index e4b476aff1..5f834d5cc7 100644 --- a/META +++ b/META @@ -1,8 +1,8 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.1.99 -Release: 1 +Version: 2.2.0 +Release: rc1 Release-Tags: relext License: CDDL Author: OpenZFS