Merge branch 'openzfs/master' into NAS-130821-2
Signed-off-by: Umer Saleem <usaleem@ixsystems.com>
commit cfa06493fa
META

@@ -6,5 +6,5 @@ Release: 1
 Release-Tags: relext
 License: CDDL
 Author: OpenZFS
-Linux-Maximum: 6.9
+Linux-Maximum: 6.10
 Linux-Minimum: 3.10

@@ -24,7 +24,7 @@ zfs_ids_to_path_LDADD = \
 	libzfs.la
 
-zhack_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+zhack_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
 
 sbin_PROGRAMS += zhack
 CPPCHECKTARGETS += zhack
 
@@ -39,7 +39,7 @@ zhack_LDADD = \
 
 
 ztest_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS)
-ztest_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+ztest_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
 
 sbin_PROGRAMS += ztest
 CPPCHECKTARGETS += ztest

@@ -269,8 +269,7 @@ main(int argc, char **argv)
 			return (MOUNT_USAGE);
 	}
 
-	if (!zfsutil || sloppy ||
-	    libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
+	if (sloppy || libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
 		zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt);
 	}
 
@@ -337,7 +336,7 @@ main(int argc, char **argv)
 	    dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt);
 
 	if (!fake) {
-		if (zfsutil && !sloppy &&
+		if (!remount && !sloppy &&
 		    !libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
 			error = zfs_mount_at(zhp, mntopts, mntflags, mntpoint);
 			if (error) {

@@ -1,5 +1,5 @@
 raidz_test_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS)
-raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
 
 bin_PROGRAMS += raidz_test
 CPPCHECKTARGETS += raidz_test

@@ -1,4 +1,4 @@
-zdb_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+zdb_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
 zdb_CFLAGS = $(AM_CFLAGS) $(LIBCRYPTO_CFLAGS)
 
 sbin_PROGRAMS += zdb

cmd/zdb/zdb.c

@@ -33,7 +33,7 @@
  * under sponsorship from the FreeBSD Foundation.
  * Copyright (c) 2021 Allan Jude
  * Copyright (c) 2021 Toomas Soome <tsoome@me.com>
- * Copyright (c) 2023, Klara Inc.
+ * Copyright (c) 2023, 2024, Klara Inc.
  * Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
  */
 
@@ -1914,23 +1914,25 @@ dump_log_spacemaps(spa_t *spa)
 }
 
 static void
-dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
+    uint64_t index)
 {
-	const ddt_phys_t *ddp = dde->dde_phys;
-	const ddt_key_t *ddk = &dde->dde_key;
-	const char *types[4] = { "ditto", "single", "double", "triple" };
+	const ddt_key_t *ddk = &ddlwe->ddlwe_key;
 	char blkbuf[BP_SPRINTF_LEN];
 	blkptr_t blk;
 	int p;
 
-	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
-		if (ddp->ddp_phys_birth == 0)
+	for (p = 0; p < DDT_NPHYS(ddt); p++) {
+		const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
+		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+
+		if (ddt_phys_birth(ddp, v) == 0)
 			continue;
-		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
-		(void) printf("index %llx refcnt %llu %s %s\n",
-		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
-		    types[p], blkbuf);
+		(void) printf("index %llx refcnt %llu phys %d %s\n",
+		    (u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v),
+		    p, blkbuf);
 	}
 }
 
@@ -1956,11 +1958,37 @@ dump_dedup_ratio(const ddt_stat_t *dds)
 	    dedup, compress, copies, dedup * compress / copies);
 }
 
+static void
+dump_ddt_log(ddt_t *ddt)
+{
+	for (int n = 0; n < 2; n++) {
+		ddt_log_t *ddl = &ddt->ddt_log[n];
+
+		uint64_t count = avl_numnodes(&ddl->ddl_tree);
+		if (count == 0)
+			continue;
+
+		printf(DMU_POOL_DDT_LOG ": %lu log entries\n",
+		    zio_checksum_table[ddt->ddt_checksum].ci_name, n, count);
+
+		if (dump_opt['D'] < 4)
+			continue;
+
+		ddt_lightweight_entry_t ddlwe;
+		uint64_t index = 0;
+		for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
+		    ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) {
+			DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
+			dump_ddt_entry(ddt, &ddlwe, index++);
+		}
+	}
+}
+
 static void
 dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
 {
 	char name[DDT_NAMELEN];
-	ddt_entry_t dde;
+	ddt_lightweight_entry_t ddlwe;
 	uint64_t walk = 0;
 	dmu_object_info_t doi;
 	uint64_t count, dspace, mspace;
 
@@ -2001,8 +2029,8 @@ dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
 
 	(void) printf("%s contents:\n\n", name);
 
-	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
-		dump_dde(ddt, &dde, walk);
+	while ((error = ddt_object_walk(ddt, type, class, &walk, &ddlwe)) == 0)
+		dump_ddt_entry(ddt, &ddlwe, walk);
 
 	ASSERT3U(error, ==, ENOENT);
 
@@ -2025,6 +2053,7 @@ dump_all_ddts(spa_t *spa)
 				dump_ddt(ddt, type, class);
 			}
 		}
+		dump_ddt_log(ddt);
 	}
 
 	ddt_get_dedup_stats(spa, &dds_total);
 
@@ -3287,9 +3316,45 @@ fuid_table_destroy(void)
 	}
 }
 
+/*
+ * Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on
+ * a live pool are normally cleaned up during ddt_sync(). We can't do that (and
+ * wouldn't want to anyway), but if we don't clean up the presence of stuff on
+ * ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves.
+ *
+ * Note that this is not a particularly efficient way to do this, but
+ * ddt_remove() is the only public method that can do the work we need, and it
+ * requires the right locks and etc to do the job. This is only ever called
+ * during zdb shutdown so efficiency is not especially important.
+ */
+static void
+zdb_ddt_cleanup(spa_t *spa)
+{
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		if (!ddt)
+			continue;
+
+		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+		ddt_enter(ddt);
+		ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next;
+		while (dde) {
+			next = AVL_NEXT(&ddt->ddt_tree, dde);
+			dde->dde_io = NULL;
+			ddt_remove(ddt, dde);
+			dde = next;
+		}
+		ddt_exit(ddt);
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
+	}
+}
+
 static void
 zdb_exit(int reason)
 {
+	if (spa != NULL)
+		zdb_ddt_cleanup(spa);
+
 	if (os != NULL) {
 		close_objset(os, FTAG);
 	} else if (spa != NULL) {
 
@@ -4592,7 +4657,6 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
 	l2arc_log_blk_phys_t this_lb;
 	uint64_t asize;
 	l2arc_log_blkptr_t lbps[2];
-	abd_t *abd;
 	zio_cksum_t cksum;
 	int failed = 0;
 	l2arc_dev_t dev;
 
@@ -4646,20 +4710,25 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
 		switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
 		case ZIO_COMPRESS_OFF:
 			break;
-		default:
-			abd = abd_alloc_for_io(asize, B_TRUE);
+		default: {
+			abd_t *abd = abd_alloc_linear(asize, B_TRUE);
 			abd_copy_from_buf_off(abd, &this_lb, 0, asize);
-			if (zio_decompress_data(L2BLK_GET_COMPRESS(
-			    (&lbps[0])->lbp_prop), abd, &this_lb,
-			    asize, sizeof (this_lb), NULL) != 0) {
+			abd_t dabd;
+			abd_get_from_buf_struct(&dabd, &this_lb,
+			    sizeof (this_lb));
+			int err = zio_decompress_data(L2BLK_GET_COMPRESS(
+			    (&lbps[0])->lbp_prop), abd, &dabd,
+			    asize, sizeof (this_lb), NULL);
+			abd_free(&dabd);
+			abd_free(abd);
+			if (err != 0) {
 				(void) printf("L2ARC block decompression "
 				    "failed\n");
-				abd_free(abd);
 				goto out;
 			}
-			abd_free(abd);
 			break;
+		}
 		}
 
 		if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
 			byteswap_uint64_array(&this_lb, sizeof (this_lb));
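
The pattern in this hunk recurs throughout the commit: abd_get_from_buf_struct() wraps an existing buffer in a caller-owned abd_t without allocating a new one, and abd_free() on such a wrapper releases only the wrapper state, never the underlying buffer. A minimal sketch of the idiom; decompress_into() is a hypothetical helper, not part of the patch:

static int
decompress_into(enum zio_compress c, void *src, void *dst,
    size_t psize, size_t lsize)
{
	/* Stack-allocated ABD wrappers around existing buffers. */
	abd_t sabd, dabd;
	abd_get_from_buf_struct(&sabd, src, psize);
	abd_get_from_buf_struct(&dabd, dst, lsize);

	/* zio_decompress_data() now takes ABDs on both ends. */
	int err = zio_decompress_data(c, &sabd, &dabd, psize, lsize, NULL);

	abd_free(&dabd);	/* releases the wrappers, not src/dst */
	abd_free(&sabd);
	return (err);
}
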
@@ -5633,7 +5702,6 @@ static void
 zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
     dmu_object_type_t type)
 {
-	uint64_t refcnt = 0;
 	int i;
 
 	ASSERT(type < ZDB_OT_TOTAL);
 
@@ -5641,8 +5709,161 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
 		return;
 
+	/*
+	 * This flag controls if we will issue a claim for the block while
+	 * counting it, to ensure that all blocks are referenced in space maps.
+	 * We don't issue claims if we're not doing leak tracking, because it's
+	 * expensive if the user isn't interested. We also don't claim the
+	 * second or later occurrences of cloned or dedup'd blocks, because we
+	 * already claimed them the first time.
+	 */
+	boolean_t do_claim = !dump_opt['L'];
+
+	spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
+
+	blkptr_t tempbp;
+	if (BP_GET_DEDUP(bp)) {
+		/*
+		 * Dedup'd blocks are special. We need to count them, so we can
+		 * later uncount them when reporting leaked space, and we must
+		 * only claim them once.
+		 *
+		 * We use the existing dedup system to track what we've seen.
+		 * The first time we see a block, we do a ddt_lookup() to see
+		 * if it exists in the DDT. If we're doing leak tracking, we
+		 * claim the block at this time.
+		 *
+		 * Each time we see a block, we reduce the refcount in the
+		 * entry by one, and add to the size and count of dedup'd
+		 * blocks to report at the end.
+		 */
+
+		ddt_t *ddt = ddt_select(zcb->zcb_spa, bp);
+
+		ddt_enter(ddt);
+
+		/*
+		 * Find the block. This will create the entry in memory, but
+		 * we'll know if that happened by its refcount.
+		 */
+		ddt_entry_t *dde = ddt_lookup(ddt, bp);
+
+		/*
+		 * ddt_lookup() can only return NULL if this block didn't exist
+		 * in the DDT and creating it would take the DDT over its
+		 * quota. Since we got the block from disk, it must exist in
+		 * the DDT, so this can't happen.
+		 */
+		VERIFY3P(dde, !=, NULL);
+
+		/* Get the phys for this variant */
+		ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
+
+		/*
+		 * This entry may have multiple sets of DVAs. We must claim
+		 * each set the first time we see them in a real block on disk,
+		 * or count them on subsequent occurrences. We don't have a
+		 * convenient way to track the first time we see each variant,
+		 * so we repurpose dde_io as a set of "seen" flag bits. We can
+		 * do this safely in zdb because it never writes, so it will
+		 * never have a writing zio for this block in that pointer.
+		 */
+		boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v));
+		if (!seen)
+			dde->dde_io =
+			    (void *)(((uintptr_t)dde->dde_io) | (1 << v));
+
+		/* Consume a reference for this block. */
+		VERIFY3U(ddt_phys_total_refcnt(ddt, dde->dde_phys), >, 0);
+		ddt_phys_decref(dde->dde_phys, v);
+
+		/*
+		 * If this entry has a single flat phys, it may have been
+		 * extended with additional DVAs at some time in its life.
+		 * This block might be from before it was fully extended, and
+		 * so have fewer DVAs.
+		 *
+		 * If this is the first time we've seen this block, and we
+		 * claimed it as-is, then we would miss the claim on some
+		 * number of DVAs, which would then be seen as leaked.
+		 *
+		 * In all cases, if we've had fewer DVAs, then the asize would
+		 * be too small, and would lead to the pool apparently using
+		 * more space than allocated.
+		 *
+		 * To handle this, we copy the canonical set of DVAs from the
+		 * entry back to the block pointer before we claim it.
+		 */
+		if (v == DDT_PHYS_FLAT) {
+			ASSERT3U(BP_GET_BIRTH(bp), ==,
+			    ddt_phys_birth(dde->dde_phys, v));
+			tempbp = *bp;
+			ddt_bp_fill(dde->dde_phys, v, &tempbp,
+			    BP_GET_BIRTH(bp));
+			bp = &tempbp;
+		}
+
+		if (seen) {
+			/*
+			 * The second or later time we see this block,
+			 * it's a duplicate and we count it.
+			 */
+			zcb->zcb_dedup_asize += BP_GET_ASIZE(bp);
+			zcb->zcb_dedup_blocks++;
+
+			/* Already claimed, don't do it again. */
+			do_claim = B_FALSE;
+		}
+
+		ddt_exit(ddt);
+	} else if (zcb->zcb_brt_is_active &&
+	    brt_maybe_exists(zcb->zcb_spa, bp)) {
+		/*
+		 * Cloned blocks are special. We need to count them, so we can
+		 * later uncount them when reporting leaked space, and we must
+		 * only claim them once.
+		 *
+		 * To do this, we keep our own in-memory BRT. For each block
+		 * we haven't seen before, we look it up in the real BRT and
+		 * if it's there, we note it and its refcount then proceed as
+		 * normal. If we see the block again, we count it as a clone
+		 * and then give it no further consideration.
+		 */
+		zdb_brt_entry_t zbre_search, *zbre;
+		avl_index_t where;
+
+		zbre_search.zbre_dva = bp->blk_dva[0];
+		zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
+		if (zbre == NULL) {
+			/* Not seen before; track it */
+			uint64_t refcnt =
+			    brt_entry_get_refcount(zcb->zcb_spa, bp);
+			if (refcnt > 0) {
+				zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
+				    UMEM_NOFAIL);
+				zbre->zbre_dva = bp->blk_dva[0];
+				zbre->zbre_refcount = refcnt;
+				avl_insert(&zcb->zcb_brt, zbre, where);
+			}
+		} else {
+			/*
+			 * Second or later occurrence, count it and take a
+			 * refcount.
+			 */
+			zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
+			zcb->zcb_clone_blocks++;
+
+			zbre->zbre_refcount--;
+			if (zbre->zbre_refcount == 0) {
+				avl_remove(&zcb->zcb_brt, zbre);
+				umem_free(zbre, sizeof (zdb_brt_entry_t));
+			}
+
+			/* Already claimed, don't do it again. */
+			do_claim = B_FALSE;
+		}
+	}
+
 	for (i = 0; i < 4; i++) {
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
 		int t = (i & 1) ? type : ZDB_OT_TOTAL;
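
The hunk above repurposes the otherwise-unused dde_io pointer as a per-variant "seen" bitset, which is safe only because zdb never issues writes and so never stores a real zio there. A standalone sketch of the trick, with hypothetical names:

#include <stdint.h>
#include <stdbool.h>

struct entry {
	void *io;	/* never holds a real pointer in this context */
};

/* Returns whether 'variant' was already marked, marking it if not. */
static bool
mark_seen(struct entry *e, int variant)
{
	uintptr_t bit = (uintptr_t)1 << variant;
	bool seen = ((uintptr_t)e->io & bit) != 0;

	if (!seen)
		e->io = (void *)((uintptr_t)e->io | bit);
	return (seen);
}
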
@@ -5745,71 +5966,12 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 	zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
 	zcb->zcb_asize_total += BP_GET_ASIZE(bp);
 
-	if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) {
-		/*
-		 * Cloned blocks are special. We need to count them, so we can
-		 * later uncount them when reporting leaked space, and we must
-		 * only claim them them once.
-		 *
-		 * To do this, we keep our own in-memory BRT. For each block
-		 * we haven't seen before, we look it up in the real BRT and
-		 * if its there, we note it and its refcount then proceed as
-		 * normal. If we see the block again, we count it as a clone
-		 * and then give it no further consideration.
-		 */
-		zdb_brt_entry_t zbre_search, *zbre;
-		avl_index_t where;
-
-		zbre_search.zbre_dva = bp->blk_dva[0];
-		zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
-		if (zbre != NULL) {
-			zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
-			zcb->zcb_clone_blocks++;
-
-			zbre->zbre_refcount--;
-			if (zbre->zbre_refcount == 0) {
-				avl_remove(&zcb->zcb_brt, zbre);
-				umem_free(zbre, sizeof (zdb_brt_entry_t));
-			}
-			return;
-		}
-
-		uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp);
-		if (crefcnt > 0) {
-			zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
-			    UMEM_NOFAIL);
-			zbre->zbre_dva = bp->blk_dva[0];
-			zbre->zbre_refcount = crefcnt;
-			avl_insert(&zcb->zcb_brt, zbre, where);
-		}
-	}
-
-	if (dump_opt['L'])
+	if (!do_claim)
 		return;
 
-	if (BP_GET_DEDUP(bp)) {
-		ddt_t *ddt;
-		ddt_entry_t *dde;
-
-		ddt = ddt_select(zcb->zcb_spa, bp);
-		ddt_enter(ddt);
-		dde = ddt_lookup(ddt, bp, B_FALSE);
-
-		if (dde == NULL) {
-			refcnt = 0;
-		} else {
-			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
-			ddt_phys_decref(ddp);
-			refcnt = ddp->ddp_refcnt;
-			if (ddt_phys_total_refcnt(dde) == 0)
-				ddt_remove(ddt, dde);
-		}
-		ddt_exit(ddt);
-	}
-
-	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
-	    refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
-	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
+	VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa,
+	    spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL,
+	    ZIO_FLAG_CANFAIL)));
 }
 
 static void
 
@@ -6120,49 +6282,6 @@ zdb_load_obsolete_counts(vdev_t *vd)
 	return (counts);
 }
 
-static void
-zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
-{
-	ddt_bookmark_t ddb = {0};
-	ddt_entry_t dde;
-	int error;
-	int p;
-
-	ASSERT(!dump_opt['L']);
-
-	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
-		blkptr_t blk;
-		ddt_phys_t *ddp = dde.dde_phys;
-
-		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
-			return;
-
-		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
-		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
-		VERIFY(ddt);
-
-		for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
-			if (ddp->ddp_phys_birth == 0)
-				continue;
-			ddt_bp_create(ddb.ddb_checksum,
-			    &dde.dde_key, ddp, &blk);
-			if (p == DDT_PHYS_DITTO) {
-				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
-			} else {
-				zcb->zcb_dedup_asize +=
-				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
-				zcb->zcb_dedup_blocks++;
-			}
-		}
-
-		ddt_enter(ddt);
-		VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
-		ddt_exit(ddt);
-	}
-
-	ASSERT(error == ENOENT);
-}
-
 typedef struct checkpoint_sm_exclude_entry_arg {
 	vdev_t *cseea_vd;
 	uint64_t cseea_checkpoint_size;
 
@@ -6546,10 +6665,6 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 		(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
 		    increment_indirect_mapping_cb, zcb, NULL);
 	}
-
-	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-	zdb_ddt_leak_init(spa, zcb);
-	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 static boolean_t
 
@@ -6814,6 +6929,8 @@ dump_block_stats(spa_t *spa)
 	int e, c, err;
 	bp_embedded_type_t i;
 
+	ddt_prefetch_all(spa);
+
 	zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
 
@@ -6938,7 +7055,6 @@ dump_block_stats(spa_t *spa)
 		    (u_longlong_t)total_alloc,
 		    (dump_opt['L']) ? "unreachable" : "leaked",
 		    (longlong_t)(total_alloc - total_found));
-		leaks = B_TRUE;
 	}
 
 	if (tzb->zb_count == 0) {
 
@@ -7272,29 +7388,27 @@ dump_simulated_ddt(spa_t *spa)
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
-		ddt_stat_t dds;
 		uint64_t refcnt = zdde->zdde_ref_blocks;
 		ASSERT(refcnt != 0);
 
-		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
-		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
-		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
-		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
+		ddt_stat_t *dds = &ddh_total.ddh_stat[highbit64(refcnt) - 1];
 
-		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
-		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
-		dds.dds_ref_psize = zdde->zdde_ref_psize;
-		dds.dds_ref_dsize = zdde->zdde_ref_dsize;
+		dds->dds_blocks += zdde->zdde_ref_blocks / refcnt;
+		dds->dds_lsize += zdde->zdde_ref_lsize / refcnt;
+		dds->dds_psize += zdde->zdde_ref_psize / refcnt;
+		dds->dds_dsize += zdde->zdde_ref_dsize / refcnt;
 
-		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
-		    &dds, 0);
+		dds->dds_ref_blocks += zdde->zdde_ref_blocks;
+		dds->dds_ref_lsize += zdde->zdde_ref_lsize;
+		dds->dds_ref_psize += zdde->zdde_ref_psize;
+		dds->dds_ref_dsize += zdde->zdde_ref_dsize;
 
 		umem_free(zdde, sizeof (*zdde));
 	}
 
 	avl_destroy(&t);
 
-	ddt_histogram_stat(&dds_total, &ddh_total);
+	ddt_histogram_total(&dds_total, &ddh_total);
 
 	(void) printf("Simulated DDT histogram:\n");
 
@@ -8022,16 +8136,25 @@ dump_mos_leaks(spa_t *spa)
 
 	mos_leak_vdev(spa->spa_root_vdev);
 
-	for (uint64_t class = 0; class < DDT_CLASSES; class++) {
-		for (uint64_t type = 0; type < DDT_TYPES; type++) {
-			for (uint64_t cksum = 0;
-			    cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
-				ddt_t *ddt = spa->spa_ddt[cksum];
-				if (!ddt)
-					continue;
+	for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		if (!ddt)
+			continue;
+
+		/* DDT store objects */
+		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+			for (ddt_class_t class = 0; class < DDT_CLASSES;
+			    class++) {
 				mos_obj_refd(ddt->ddt_object[type][class]);
 			}
 		}
+
+		/* FDT container */
+		mos_obj_refd(ddt->ddt_dir_object);
+
+		/* FDT log objects */
+		mos_obj_refd(ddt->ddt_log[0].ddl_object);
+		mos_obj_refd(ddt->ddt_log[1].ddl_object);
 	}
 
 	if (spa->spa_brt != NULL) {
 
@@ -8499,13 +8622,22 @@ try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize,
 	memset(lbuf, 0x00, lsize);
 	memset(lbuf2, 0xff, lsize);
 
+	abd_t labd, labd2;
+	abd_get_from_buf_struct(&labd, lbuf, lsize);
+	abd_get_from_buf_struct(&labd2, lbuf2, lsize);
+
+	boolean_t ret = B_FALSE;
 	if (zio_decompress_data(cfunc, pabd,
-	    lbuf, psize, lsize, NULL) == 0 &&
+	    &labd, psize, lsize, NULL) == 0 &&
 	    zio_decompress_data(cfunc, pabd,
-	    lbuf2, psize, lsize, NULL) == 0 &&
+	    &labd2, psize, lsize, NULL) == 0 &&
 	    memcmp(lbuf, lbuf2, lsize) == 0)
-		return (B_TRUE);
-	return (B_FALSE);
+		ret = B_TRUE;
+
+	abd_free(&labd2);
+	abd_free(&labd);
+
+	return (ret);
 }
 
 static uint64_t
 
@@ -9624,6 +9756,9 @@ retry_lookup:
 	}
 
 fini:
+	if (spa != NULL)
+		zdb_ddt_cleanup(spa);
+
 	if (os != NULL) {
 		close_objset(os, FTAG);
 	} else if (spa != NULL) {

@@ -844,7 +844,6 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 	const char *failmode = NULL;
 	boolean_t checkremove = B_FALSE;
 	uint32_t pri = 0;
-	int32_t flags = 0;
 
 	/*
 	 * If this is a checksum or I/O error, then toss it into the
 
@@ -922,18 +921,28 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 		}
 	} else if (fmd_nvl_class_match(hdl, nvl,
 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
+		uint64_t flags = 0;
+		int32_t flags32 = 0;
 		/*
 		 * We ignore ereports for checksum errors generated by
 		 * scrub/resilver I/O to avoid potentially further
 		 * degrading the pool while it's being repaired.
+		 *
+		 * Note that FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS used to
+		 * be int32. To allow newer zed to work on older
+		 * kernels, if we don't find the flags, we look for
+		 * the older ones too.
 		 */
 		if (((nvlist_lookup_uint32(nvl,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) &&
 		    (pri == ZIO_PRIORITY_SCRUB ||
 		    pri == ZIO_PRIORITY_REBUILD)) ||
-		    ((nvlist_lookup_int32(nvl,
+		    ((nvlist_lookup_uint64(nvl,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) &&
-		    (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
+		    (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) ||
+		    ((nvlist_lookup_int32(nvl,
+		    FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags32) == 0) &&
+		    (flags32 & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
 			fmd_hdl_debug(hdl, "ignoring '%s' for "
 			    "scrub/resilver I/O", class);
 			return;

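The checksum-ereport hunk above probes FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS under both its new uint64 type and its legacy int32 type, so a newer zed keeps working against older kernel modules. A minimal sketch of that fallback, using the libnvpair lookups the patch itself relies on (read_zio_flags() is a hypothetical helper):

#include <libnvpair.h>

static uint64_t
read_zio_flags(nvlist_t *nvl, const char *name)
{
	uint64_t flags = 0;
	int32_t flags32 = 0;

	/* Newer kernels publish the flags as uint64. */
	if (nvlist_lookup_uint64(nvl, name, &flags) == 0)
		return (flags);
	/* Older kernels used int32; widen without sign-extending. */
	if (nvlist_lookup_int32(nvl, name, &flags32) == 0)
		return ((uint64_t)(uint32_t)flags32);
	return (0);
}
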
@@ -537,7 +537,7 @@ get_usage(zpool_help_t idx)
 		    "\t    [-o property=value] <pool> <newpool> "
 		    "[<device> ...]\n"));
 	case HELP_REGUID:
-		return (gettext("\treguid <pool>\n"));
+		return (gettext("\treguid [-g guid] <pool>\n"));
 	case HELP_SYNC:
 		return (gettext("\tsync [pool] ...\n"));
 	case HELP_VERSION:
 
@@ -2025,7 +2025,7 @@ zpool_do_create(int argc, char **argv)
 				char *end;
 				u_longlong_t ver;
 
-				ver = strtoull(propval, &end, 10);
+				ver = strtoull(propval, &end, 0);
 				if (*end == '\0' &&
 				    ver < SPA_VERSION_FEATURES) {
 					enable_pool_features = B_FALSE;
 
@@ -8232,19 +8232,32 @@ zpool_do_clear(int argc, char **argv)
 }
 
 /*
- * zpool reguid <pool>
+ * zpool reguid [-g <guid>] <pool>
 */
 int
 zpool_do_reguid(int argc, char **argv)
 {
+	uint64_t guid;
+	uint64_t *guidp = NULL;
 	int c;
+	char *endptr;
 	char *poolname;
 	zpool_handle_t *zhp;
 	int ret = 0;
 
 	/* check options */
-	while ((c = getopt(argc, argv, "")) != -1) {
+	while ((c = getopt(argc, argv, "g:")) != -1) {
 		switch (c) {
+		case 'g':
+			errno = 0;
+			guid = strtoull(optarg, &endptr, 10);
+			if (errno != 0 || *endptr != '\0') {
+				(void) fprintf(stderr,
+				    gettext("invalid GUID: %s\n"), optarg);
+				usage(B_FALSE);
+			}
+			guidp = &guid;
+			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 
@@ -8270,7 +8283,7 @@ zpool_do_reguid(int argc, char **argv)
 	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
 		return (1);
 
-	ret = zpool_reguid(zhp);
+	ret = zpool_set_guid(zhp, guidp);
 
 	zpool_close(zhp);
 	return (ret);

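zpool_set_guid() is the new libzfs entry point behind `zpool reguid -g`: a NULL pointer keeps the old zpool_reguid() behavior of picking a random GUID, while a non-NULL pointer requests that specific value. A short usage sketch under those assumptions (set_pool_guid() is illustrative; error reporting elided):

#include <libzfs.h>

static int
set_pool_guid(libzfs_handle_t *g_zfs, const char *pool, const uint64_t *guidp)
{
	zpool_handle_t *zhp = zpool_open(g_zfs, pool);
	if (zhp == NULL)
		return (1);

	int ret = zpool_set_guid(zhp, guidp);	/* guidp may be NULL */

	zpool_close(zhp);
	return (ret);
}
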
@@ -1,3 +1,5 @@
+zstream_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
+
 sbin_PROGRAMS += zstream
 CPPCHECKTARGETS += zstream
 

@@ -22,6 +22,8 @@
 /*
  * Copyright 2022 Axcient. All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright (c) 2024, Klara, Inc.
 */
 
 #include <err.h>
 
@@ -257,83 +259,73 @@ zstream_do_decompress(int argc, char *argv[])
 			ENTRY e = {.key = key};
 
 			p = hsearch(e, FIND);
-			if (p != NULL) {
-				zio_decompress_func_t *xfunc = NULL;
-				switch ((enum zio_compress)(intptr_t)p->data) {
-				case ZIO_COMPRESS_OFF:
-					xfunc = NULL;
-					break;
-				case ZIO_COMPRESS_LZJB:
-					xfunc = lzjb_decompress;
-					break;
-				case ZIO_COMPRESS_GZIP_1:
-					xfunc = gzip_decompress;
-					break;
-				case ZIO_COMPRESS_ZLE:
-					xfunc = zle_decompress;
-					break;
-				case ZIO_COMPRESS_LZ4:
-					xfunc = lz4_decompress_zfs;
-					break;
-				case ZIO_COMPRESS_ZSTD:
-					xfunc = zfs_zstd_decompress;
-					break;
-				default:
-					assert(B_FALSE);
-				}
-
-
-				/*
-				 * Read and decompress the block
-				 */
-				char *lzbuf = safe_calloc(payload_size);
-				(void) sfread(lzbuf, payload_size, stdin);
-				if (xfunc == NULL) {
-					memcpy(buf, lzbuf, payload_size);
-					drrw->drr_compressiontype =
-					    ZIO_COMPRESS_OFF;
-					if (verbose)
-						fprintf(stderr, "Resetting "
-						    "compression type to off "
-						    "for ino %llu offset "
-						    "%llu\n",
-						    (u_longlong_t)
-						    drrw->drr_object,
-						    (u_longlong_t)
-						    drrw->drr_offset);
-				} else if (0 != xfunc(lzbuf, buf,
-				    payload_size, payload_size, 0)) {
-					/*
-					 * The block must not be compressed,
-					 * at least not with this compression
-					 * type, possibly because it gets
-					 * written multiple times in this
-					 * stream.
-					 */
-					warnx("decompression failed for "
-					    "ino %llu offset %llu",
-					    (u_longlong_t)drrw->drr_object,
-					    (u_longlong_t)drrw->drr_offset);
-					memcpy(buf, lzbuf, payload_size);
-				} else if (verbose) {
-					drrw->drr_compressiontype =
-					    ZIO_COMPRESS_OFF;
-					fprintf(stderr, "successfully "
-					    "decompressed ino %llu "
-					    "offset %llu\n",
-					    (u_longlong_t)drrw->drr_object,
-					    (u_longlong_t)drrw->drr_offset);
-				} else {
-					drrw->drr_compressiontype =
-					    ZIO_COMPRESS_OFF;
-				}
-				free(lzbuf);
-			} else {
+			if (p == NULL) {
 				/*
 				 * Read the contents of the block unaltered
 				 */
 				(void) sfread(buf, payload_size, stdin);
+				break;
 			}
+
+			/*
+			 * Read and decompress the block
+			 */
+			enum zio_compress c =
+			    (enum zio_compress)(intptr_t)p->data;
+
+			if (c == ZIO_COMPRESS_OFF) {
+				(void) sfread(buf, payload_size, stdin);
+				drrw->drr_compressiontype = 0;
+				drrw->drr_compressed_size = 0;
+				if (verbose)
+					fprintf(stderr,
+					    "Resetting compression type to "
+					    "off for ino %llu offset %llu\n",
+					    (u_longlong_t)drrw->drr_object,
+					    (u_longlong_t)drrw->drr_offset);
+				break;
+			}
+
+			uint64_t lsize = drrw->drr_logical_size;
+			ASSERT3U(payload_size, <=, lsize);
+
+			char *lzbuf = safe_calloc(payload_size);
+			(void) sfread(lzbuf, payload_size, stdin);
+
+			abd_t sabd, dabd;
+			abd_get_from_buf_struct(&sabd, lzbuf, payload_size);
+			abd_get_from_buf_struct(&dabd, buf, lsize);
+			int err = zio_decompress_data(c, &sabd, &dabd,
+			    payload_size, lsize, NULL);
+			abd_free(&dabd);
+			abd_free(&sabd);
+
+			if (err == 0) {
+				drrw->drr_compressiontype = 0;
+				drrw->drr_compressed_size = 0;
+				payload_size = lsize;
+				if (verbose) {
+					fprintf(stderr,
+					    "successfully decompressed "
+					    "ino %llu offset %llu\n",
+					    (u_longlong_t)drrw->drr_object,
+					    (u_longlong_t)drrw->drr_offset);
+				}
+			} else {
+				/*
+				 * The block must not be compressed, at least
+				 * not with this compression type, possibly
+				 * because it gets written multiple times in
+				 * this stream.
+				 */
+				warnx("decompression failed for "
+				    "ino %llu offset %llu",
+				    (u_longlong_t)drrw->drr_object,
+				    (u_longlong_t)drrw->drr_offset);
+				memcpy(buf, lzbuf, payload_size);
+			}
+
+			free(lzbuf);
+			break;
 		}

@@ -22,10 +22,9 @@
 /*
  * Copyright 2022 Axcient. All rights reserved.
  * Use is subject to license terms.
- */
-
-/*
+ *
  * Copyright (c) 2022 by Delphix. All rights reserved.
+ * Copyright (c) 2024, Klara, Inc.
 */
 
 #include <err.h>
 
@@ -72,7 +71,7 @@ zstream_do_recompress(int argc, char *argv[])
 	dmu_replay_record_t *drr = &thedrr;
 	zio_cksum_t stream_cksum;
 	int c;
-	int level = -1;
+	int level = 0;
 
 	while ((c = getopt(argc, argv, "l:")) != -1) {
 		switch (c) {
 
@@ -97,34 +96,22 @@ zstream_do_recompress(int argc, char *argv[])
 
 	if (argc != 1)
 		zstream_usage();
-	int type = 0;
-	zio_compress_info_t *cinfo = NULL;
-	if (0 == strcmp(argv[0], "off")) {
-		type = ZIO_COMPRESS_OFF;
-		cinfo = &zio_compress_table[type];
-	} else if (0 == strcmp(argv[0], "inherit") ||
-	    0 == strcmp(argv[0], "empty") ||
-	    0 == strcmp(argv[0], "on")) {
-		// Fall through to invalid compression type case
-	} else {
-		for (int i = 0; i < ZIO_COMPRESS_FUNCTIONS; i++) {
-			if (0 == strcmp(zio_compress_table[i].ci_name,
-			    argv[0])) {
-				cinfo = &zio_compress_table[i];
-				type = i;
-				break;
-			}
-		}
-	}
-	if (cinfo == NULL) {
-		fprintf(stderr, "Invalid compression type %s.\n",
-		    argv[0]);
-		exit(2);
-	}
-
-	if (cinfo->ci_compress == NULL) {
-		type = 0;
-		cinfo = &zio_compress_table[0];
+
+	enum zio_compress ctype;
+	if (strcmp(argv[0], "off") == 0) {
+		ctype = ZIO_COMPRESS_OFF;
+	} else {
+		for (ctype = 0; ctype < ZIO_COMPRESS_FUNCTIONS; ctype++) {
+			if (strcmp(argv[0],
+			    zio_compress_table[ctype].ci_name) == 0)
+				break;
+		}
+		if (ctype == ZIO_COMPRESS_FUNCTIONS ||
+		    zio_compress_table[ctype].ci_compress == NULL) {
+			fprintf(stderr, "Invalid compression type %s.\n",
+			    argv[0]);
+			exit(2);
+		}
 	}
 
 	if (isatty(STDIN_FILENO)) {
 
@@ -135,6 +122,7 @@ zstream_do_recompress(int argc, char *argv[])
 		exit(1);
 	}
 
+	abd_init();
 	fletcher_4_init();
 	zio_init();
 	zstd_init();
 
@@ -247,63 +235,78 @@ zstream_do_recompress(int argc, char *argv[])
 			(void) sfread(buf, payload_size, stdin);
 			break;
 		}
-			if (drrw->drr_compressiontype >=
-			    ZIO_COMPRESS_FUNCTIONS) {
+			enum zio_compress dtype = drrw->drr_compressiontype;
+			if (dtype >= ZIO_COMPRESS_FUNCTIONS) {
 				fprintf(stderr, "Invalid compression type in "
-				    "stream: %d\n", drrw->drr_compressiontype);
+				    "stream: %d\n", dtype);
 				exit(3);
 			}
-			zio_compress_info_t *dinfo =
-			    &zio_compress_table[drrw->drr_compressiontype];
+			if (zio_compress_table[dtype].ci_decompress == NULL)
+				dtype = ZIO_COMPRESS_OFF;
 
 			/* Set up buffers to minimize memcpys */
 			char *cbuf, *dbuf;
-			if (cinfo->ci_compress == NULL)
+			if (ctype == ZIO_COMPRESS_OFF)
 				dbuf = buf;
 			else
 				dbuf = safe_calloc(bufsz);
 
-			if (dinfo->ci_decompress == NULL)
+			if (dtype == ZIO_COMPRESS_OFF)
 				cbuf = dbuf;
 			else
 				cbuf = safe_calloc(payload_size);
 
 			/* Read and decompress the payload */
 			(void) sfread(cbuf, payload_size, stdin);
-			if (dinfo->ci_decompress != NULL) {
-				if (0 != dinfo->ci_decompress(cbuf, dbuf,
-				    payload_size, MIN(bufsz,
-				    drrw->drr_logical_size), dinfo->ci_level)) {
+			if (dtype != ZIO_COMPRESS_OFF) {
+				abd_t cabd, dabd;
+				abd_get_from_buf_struct(&cabd,
+				    cbuf, payload_size);
+				abd_get_from_buf_struct(&dabd, dbuf,
+				    MIN(bufsz, drrw->drr_logical_size));
+				if (zio_decompress_data(dtype, &cabd, &dabd,
+				    payload_size, abd_get_size(&dabd),
+				    NULL) != 0) {
					warnx("decompression type %d failed "
					    "for ino %llu offset %llu",
-					    type,
+					    dtype,
					    (u_longlong_t)drrw->drr_object,
					    (u_longlong_t)drrw->drr_offset);
					exit(4);
				}
				payload_size = drrw->drr_logical_size;
+				abd_free(&dabd);
+				abd_free(&cabd);
				free(cbuf);
			}
 
 			/* Recompress the payload */
-			if (cinfo->ci_compress != NULL) {
-				payload_size = P2ROUNDUP(cinfo->ci_compress(
-				    dbuf, buf, drrw->drr_logical_size,
-				    MIN(payload_size, bufsz), (level == -1 ?
-				    cinfo->ci_level : level)),
-				    SPA_MINBLOCKSIZE);
-				if (payload_size != drrw->drr_logical_size) {
-					drrw->drr_compressiontype = type;
-					drrw->drr_compressed_size =
-					    payload_size;
-				} else {
+			if (ctype != ZIO_COMPRESS_OFF) {
+				abd_t dabd, abd;
+				abd_get_from_buf_struct(&dabd,
+				    dbuf, drrw->drr_logical_size);
+				abd_t *pabd =
+				    abd_get_from_buf_struct(&abd, buf, bufsz);
+				size_t csize = zio_compress_data(ctype, &dabd,
+				    &pabd, drrw->drr_logical_size, level);
+				size_t rounded =
+				    P2ROUNDUP(csize, SPA_MINBLOCKSIZE);
+				if (rounded >= drrw->drr_logical_size) {
 					memcpy(buf, dbuf, payload_size);
 					drrw->drr_compressiontype = 0;
 					drrw->drr_compressed_size = 0;
+				} else {
+					abd_zero_off(pabd, csize,
+					    rounded - csize);
+					drrw->drr_compressiontype = ctype;
+					drrw->drr_compressed_size =
+					    payload_size = rounded;
 				}
+				abd_free(&abd);
+				abd_free(&dabd);
+				free(dbuf);
 			} else {
-				drrw->drr_compressiontype = type;
+				drrw->drr_compressiontype = 0;
 				drrw->drr_compressed_size = 0;
 			}
 			break;
 
@@ -371,6 +374,7 @@ zstream_do_recompress(int argc, char *argv[])
 	fletcher_4_fini();
 	zio_fini();
 	zstd_fini();
+	abd_fini();
 
 	return (0);
 }

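The recompress hunk above keeps a block compressed only when the compressed size, padded up to SPA_MINBLOCKSIZE with P2ROUNDUP, still beats the logical size; e.g. lsize 4096 with csize 3585 rounds to 4096 and is stored flat. A sketch of that decision (choose_payload_size() is a hypothetical helper):

static size_t
choose_payload_size(size_t csize, size_t lsize, boolean_t *compressed)
{
	size_t rounded = P2ROUNDUP(csize, SPA_MINBLOCKSIZE);

	if (rounded >= lsize) {
		*compressed = B_FALSE;	/* padding ate the benefit */
		return (lsize);
	}
	*compressed = B_TRUE;
	return (rounded);
}
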
@@ -6746,7 +6746,7 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id)
 	load = spa_load_guid(spa);
 
 	(void) pthread_rwlock_wrlock(&ztest_name_lock);
-	error = spa_change_guid(spa);
+	error = spa_change_guid(spa, NULL);
 	zs->zs_guid = spa_guid(spa);
 	(void) pthread_rwlock_unlock(&ztest_name_lock);
 

@@ -10,7 +10,8 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/module/icp/include \
 	-I$(top_srcdir)/lib/libspl/include \
-	-I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@
+	-I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@ \
+	-I$(top_srcdir)/lib/libzpool/include
 
 AM_LIBTOOLFLAGS = --silent
 
@@ -85,4 +86,7 @@ KERNEL_CFLAGS = $(FRAME_LARGER_THAN)
 LIBRARY_CFLAGS = -no-suppress
 
 # Forcibly enable asserts/debugging for libzpool &al.
-FORCEDEBUG_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG
+# Since ZFS_DEBUG can change shared data structures, all libzpool users must
+# be compiled with the same flags.
+# See https://github.com/openzfs/zfs/issues/16476
+LIBZPOOL_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG

@@ -25,6 +25,8 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_PLUG], [
 dnl #
 dnl # 2.6.32 - 4.11: statically allocated bdi in request_queue
 dnl # 4.12: dynamically allocated bdi in request_queue
+dnl # 6.11: bdi no longer available through request_queue, so get it from
+dnl #       the gendisk attached to the queue
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI], [
 	ZFS_LINUX_TEST_SRC([blk_queue_bdi], [
 
@@ -47,6 +49,30 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_BDI], [
 	])
 ])
 
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI], [
+	ZFS_LINUX_TEST_SRC([blk_queue_disk_bdi], [
+		#include <linux/blkdev.h>
+		#include <linux/backing-dev.h>
+	], [
+		struct request_queue q;
+		struct gendisk disk;
+		struct backing_dev_info bdi __attribute__ ((unused));
+		q.disk = &disk;
+		q.disk->bdi = &bdi;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI], [
+	AC_MSG_CHECKING([whether backing_dev_info is available through queue gendisk])
+	ZFS_LINUX_TEST_RESULT([blk_queue_disk_bdi], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLK_QUEUE_DISK_BDI, 1,
+		    [backing_dev_info is available through queue gendisk])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
 dnl #
 dnl # 5.9: added blk_queue_update_readahead(),
 dnl # 5.15: renamed to disk_update_readahead()
 
@@ -407,6 +433,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI
+	ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_UPDATE_READAHEAD
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISCARD
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_SECURE_ERASE
 
@@ -421,6 +448,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
 	ZFS_AC_KERNEL_BLK_QUEUE_PLUG
 	ZFS_AC_KERNEL_BLK_QUEUE_BDI
+	ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI
 	ZFS_AC_KERNEL_BLK_QUEUE_UPDATE_READAHEAD
 	ZFS_AC_KERNEL_BLK_QUEUE_DISCARD
 	ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE

@@ -58,6 +58,13 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [
 		disk = blk_alloc_disk(lim, NUMA_NO_NODE);
 	])
 
+	ZFS_LINUX_TEST_SRC([blkdev_queue_limits_features], [
+		#include <linux/blkdev.h>
+	],[
+		struct queue_limits *lim = NULL;
+		lim->features = 0;
+	])
+
 	ZFS_LINUX_TEST_SRC([blk_cleanup_disk], [
 		#include <linux/blkdev.h>
 	],[
 
@@ -114,6 +121,20 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE([HAVE_BLK_ALLOC_DISK_2ARG], 1, [blk_alloc_disk() exists and takes 2 args])
 
+		dnl #
+		dnl # Linux 6.11 API change:
+		dnl # struct queue_limits gains a 'features' field,
+		dnl # used to set flushing options
+		dnl #
+		AC_MSG_CHECKING([whether struct queue_limits has a features field])
+		ZFS_LINUX_TEST_RESULT([blkdev_queue_limits_features], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE([HAVE_BLKDEV_QUEUE_LIMITS_FEATURES], 1,
+			    [struct queue_limits has a features field])
+		], [
+			AC_MSG_RESULT(no)
+		])
+
 		dnl #
 		dnl # 5.20 API change,
 		dnl # Removed blk_cleanup_disk(), put_disk() should be used.

@@ -1,17 +0,0 @@
-AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
-	ZFS_LINUX_TEST_SRC([page_size], [
-		#include <linux/mm.h>
-	],[
-		unsigned long s;
-		s = page_size(NULL);
-	])
-])
-AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
-	AC_MSG_CHECKING([whether page_size() is available])
-	ZFS_LINUX_TEST_RESULT([page_size], [
-		AC_MSG_RESULT(yes)
-		AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
-	],[
-		AC_MSG_RESULT(no)
-	])
-])

@@ -0,0 +1,36 @@
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
+	ZFS_LINUX_TEST_SRC([page_size], [
+		#include <linux/mm.h>
+	],[
+		unsigned long s;
+		s = page_size(NULL);
+	])
+])
+AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
+	AC_MSG_CHECKING([whether page_size() is available])
+	ZFS_LINUX_TEST_RESULT([page_size], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING], [
+	ZFS_LINUX_TEST_SRC([page_mapping], [
+		#include <linux/pagemap.h>
+	],[
+		struct page *p = NULL;
+		struct address_space *m = page_mapping(NULL);
+	])
+])
+AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_MAPPING], [
+	AC_MSG_CHECKING([whether page_mapping() is available])
+	ZFS_LINUX_TEST_RESULT([page_mapping], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_MM_PAGE_MAPPING, 1, [page_mapping() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])

@@ -25,3 +25,62 @@ AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [
 		AC_MSG_RESULT([no])
 	])
 ])
+
+dnl #
+dnl # Linux 6.11 register_sysctl() enforces that sysctl tables no longer
+dnl # supply a sentinel end-of-table element. 6.6 introduces
+dnl # register_sysctl_sz() to enable callers to choose, so we use it if
+dnl # available for backward compatibility.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ], [
+	ZFS_LINUX_TEST_SRC([has_register_sysctl_sz], [
+		#include <linux/sysctl.h>
+	],[
+		struct ctl_table test_table[] __attribute__((unused)) = {0};
+		register_sysctl_sz("", test_table, 0);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ], [
+	AC_MSG_CHECKING([whether register_sysctl_sz exists])
+	ZFS_LINUX_TEST_RESULT([has_register_sysctl_sz], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_REGISTER_SYSCTL_SZ, 1,
+		    [register_sysctl_sz exists])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
+
+dnl #
+dnl # Linux 6.11 makes const the ctl_table arg of proc_handler
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST], [
+	ZFS_LINUX_TEST_SRC([has_proc_handler_ctl_table_const], [
+		#include <linux/sysctl.h>
+
+		static int test_handler(
+		    const struct ctl_table *ctl __attribute((unused)),
+		    int write __attribute((unused)),
+		    void *buffer __attribute((unused)),
+		    size_t *lenp __attribute((unused)),
+		    loff_t *ppos __attribute((unused)))
+		{
+			return (0);
+		}
+	], [
+		proc_handler *ph __attribute((unused)) =
+		    &test_handler;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST], [
+	AC_MSG_CHECKING([whether proc_handler ctl_table arg is const])
+	ZFS_LINUX_TEST_RESULT([has_proc_handler_ctl_table_const], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_PROC_HANDLER_CTL_TABLE_CONST, 1,
+		    [proc_handler ctl_table arg is const])
+	], [
+		AC_MSG_RESULT([no])
+	])
+])

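The HAVE_REGISTER_SYSCTL_SZ result defined above is typically consumed by a small wrapper: on 6.6+ kernels the table length is passed explicitly (6.11 rejects sentinel-terminated tables), otherwise the older sentinel-based register_sysctl() is used. A sketch under that assumption; the wrapper name is illustrative:

static struct ctl_table_header *
spl_sysctl_register(const char *path, struct ctl_table *table, size_t count)
{
#ifdef HAVE_REGISTER_SYSCTL_SZ
	/* Explicit size: no sentinel entry required. */
	return (register_sysctl_sz(path, table, count));
#else
	/* Older kernels find the end via the sentinel entry. */
	(void) count;
	return (register_sysctl(path, table));
#endif
}
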
@@ -167,9 +167,12 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_WRITEPAGE_T
 	ZFS_AC_KERNEL_SRC_RECLAIMED
 	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
+	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ
+	ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST
 	ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
 	ZFS_AC_KERNEL_SRC_SYNC_BDEV
 	ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
+	ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
 
@@ -319,9 +322,12 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_WRITEPAGE_T
 	ZFS_AC_KERNEL_RECLAIMED
 	ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
+	ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ
+	ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST
 	ZFS_AC_KERNEL_COPY_SPLICE_READ
 	ZFS_AC_KERNEL_SYNC_BDEV
 	ZFS_AC_KERNEL_MM_PAGE_SIZE
+	ZFS_AC_KERNEL_MM_PAGE_MAPPING
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_CPU_HAS_FEATURE

@@ -300,6 +300,7 @@ _LIBZFS_H int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *,
 
 _LIBZFS_H int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
 _LIBZFS_H int zpool_reguid(zpool_handle_t *);
+_LIBZFS_H int zpool_set_guid(zpool_handle_t *, const uint64_t *);
 _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *);
 
 _LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *);

@@ -77,6 +77,8 @@ noinst_HEADERS = \
 	%D%/spl/sys/zmod.h \
 	%D%/spl/sys/zone.h \
 	\
+	%D%/zfs/sys/abd_os.h \
+	%D%/zfs/sys/abd_impl_os.h \
 	%D%/zfs/sys/arc_os.h \
 	%D%/zfs/sys/freebsd_crypto.h \
 	%D%/zfs/sys/freebsd_event.h \

@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2023, 2024, Klara Inc.
+ */
+
+#ifndef _ABD_IMPL_OS_H
+#define	_ABD_IMPL_OS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	abd_enter_critical(flags)	critical_enter()
+#define	abd_exit_critical(flags)	critical_exit()
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _ABD_IMPL_OS_H */

@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ */
+
+#ifndef _ABD_OS_H
+#define	_ABD_OS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct abd_scatter {
+	uint_t	abd_offset;
+	void	*abd_chunks[1]; /* actually variable-length */
+};
+
+struct abd_linear {
+	void	*abd_buf;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _ABD_H */

@@ -20,6 +20,8 @@ kernel_linux_HEADERS = \
 
 kernel_sysdir = $(kerneldir)/sys
 kernel_sys_HEADERS = \
+	%D%/zfs/sys/abd_os.h \
+	%D%/zfs/sys/abd_impl_os.h \
 	%D%/zfs/sys/policy.h \
 	%D%/zfs/sys/trace_acl.h \
 	%D%/zfs/sys/trace_arc.h \

@@ -57,6 +57,11 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
 #endif
 
 /*
+ * 6.11 API
+ * Setting the flush flags directly is no longer possible; flush flags are set
+ * on the queue_limits structure and passed to blk_disk_alloc(). In this case
+ * we remove this function entirely.
+ *
 * 4.7 API,
 * The blk_queue_write_cache() interface has replaced blk_queue_flush()
 * interface. However, the new interface is GPL-only thus we implement
 
@@ -68,31 +73,33 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
 * new one is GPL-only. Thus if the GPL-only version is detected we
 * implement our own trivial helper.
 */
+#if !defined(HAVE_BLK_ALLOC_DISK_2ARG) || \
+	!defined(HAVE_BLKDEV_QUEUE_LIMITS_FEATURES)
 static inline void
-blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua)
+blk_queue_set_write_cache(struct request_queue *q, bool on)
 {
 #if defined(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY)
-	if (wc)
+	if (on) {
 		blk_queue_flag_set(QUEUE_FLAG_WC, q);
-	else
-		blk_queue_flag_clear(QUEUE_FLAG_WC, q);
-	if (fua)
 		blk_queue_flag_set(QUEUE_FLAG_FUA, q);
-	else
+	} else {
+		blk_queue_flag_clear(QUEUE_FLAG_WC, q);
 		blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
+	}
 #elif defined(HAVE_BLK_QUEUE_WRITE_CACHE)
-	blk_queue_write_cache(q, wc, fua);
+	blk_queue_write_cache(q, on, on);
 #elif defined(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY)
-	if (wc)
-		q->flush_flags |= REQ_FLUSH;
-	if (fua)
-		q->flush_flags |= REQ_FUA;
+	if (on)
+		q->flush_flags |= REQ_FLUSH | REQ_FUA;
+	else
+		q->flush_flags &= ~(REQ_FLUSH | REQ_FUA);
 #elif defined(HAVE_BLK_QUEUE_FLUSH)
-	blk_queue_flush(q, (wc ? REQ_FLUSH : 0) | (fua ? REQ_FUA : 0));
+	blk_queue_flush(q, on ? (REQ_FLUSH | REQ_FUA) : 0);
 #else
 #error "Unsupported kernel"
 #endif
 }
+#endif /* !HAVE_BLK_ALLOC_DISK_2ARG || !HAVE_BLKDEV_QUEUE_LIMITS_FEATURES */
 
 /*
 * Detect if a device has a write cache. Used to set the initial value for the
 
@@ -126,8 +133,10 @@ blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages)
 {
 #if !defined(HAVE_BLK_QUEUE_UPDATE_READAHEAD) && \
 	!defined(HAVE_DISK_UPDATE_READAHEAD)
-#ifdef HAVE_BLK_QUEUE_BDI_DYNAMIC
+#if defined(HAVE_BLK_QUEUE_BDI_DYNAMIC)
 	q->backing_dev_info->ra_pages = ra_pages;
+#elif defined(HAVE_BLK_QUEUE_DISK_BDI)
+	q->disk->bdi->ra_pages = ra_pages;
 #else
 	q->backing_dev_info.ra_pages = ra_pages;
 #endif

@@ -21,16 +21,23 @@
 
 /*
 * Copyright (c) 2023, 2024, Klara Inc.
+ * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
 */
 
 #ifndef _ZFS_MM_COMPAT_H
 #define	_ZFS_MM_COMPAT_H
 
 #include <linux/mm.h>
+#include <linux/pagemap.h>
 
 /* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
 #ifndef HAVE_MM_PAGE_SIZE
 #define	page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
 #endif
 
+/* 6.11 removed page_mapping(). A simple wrapper around folio_mapping() works */
+#ifndef HAVE_MM_PAGE_MAPPING
+#define	page_mapping(p) folio_mapping(page_folio(p))
+#endif
+
 #endif /* _ZFS_MM_COMPAT_H */

@ -20,6 +20,10 @@
|
|||
* You should have received a copy of the GNU General Public License along
|
||||
* with the SPL. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2024, Klara Inc.
|
||||
* Copyright (c) 2024, Syneto
|
||||
*/
|
||||
|
||||
#ifndef _SPL_TASKQ_H
|
||||
#define _SPL_TASKQ_H
|
||||
|
@ -33,6 +37,9 @@
|
|||
#include <sys/thread.h>
|
||||
#include <sys/rwlock.h>
|
||||
#include <sys/wait.h>
|
||||
#include <sys/wmsum.h>
|
||||
|
||||
typedef struct kstat_s kstat_t;
|
||||
|
||||
#define TASKQ_NAMELEN 31
|
||||
|
||||
|
@ -74,6 +81,32 @@ typedef enum tq_lock_role {
|
|||
typedef unsigned long taskqid_t;
|
||||
typedef void (task_func_t)(void *);
|
||||
|
||||
typedef struct taskq_sums {
|
||||
/* gauges (inc/dec counters, current value) */
|
||||
wmsum_t tqs_threads_active; /* threads running a task */
|
||||
wmsum_t tqs_threads_idle; /* threads waiting for work */
|
||||
wmsum_t tqs_threads_total; /* total threads */
|
||||
wmsum_t tqs_tasks_pending; /* tasks waiting to execute */
|
||||
wmsum_t tqs_tasks_priority; /* hi-pri tasks waiting */
|
||||
wmsum_t tqs_tasks_total; /* total waiting tasks */
|
||||
wmsum_t tqs_tasks_delayed; /* tasks deferred to future */
|
||||
wmsum_t tqs_entries_free; /* task entries on free list */
|
||||
|
||||
/* counters (inc only, since taskq creation) */
|
||||
wmsum_t tqs_threads_created; /* threads created */
|
||||
wmsum_t tqs_threads_destroyed; /* threads destroyed */
|
||||
wmsum_t tqs_tasks_dispatched; /* tasks dispatched */
|
||||
wmsum_t tqs_tasks_dispatched_delayed; /* tasks delayed to future */
|
||||
wmsum_t tqs_tasks_executed_normal; /* normal pri tasks executed */
|
||||
wmsum_t tqs_tasks_executed_priority; /* high pri tasks executed */
|
||||
wmsum_t tqs_tasks_executed; /* total tasks executed */
|
||||
wmsum_t tqs_tasks_delayed_requeued; /* delayed tasks requeued */
|
||||
wmsum_t tqs_tasks_cancelled; /* tasks cancelled before run */
|
||||
wmsum_t tqs_thread_wakeups; /* total thread wakeups */
|
||||
wmsum_t tqs_thread_wakeups_nowork; /* thread woken but no tasks */
|
||||
wmsum_t tqs_thread_sleeps; /* total thread sleeps */
|
||||
} taskq_sums_t;
|
||||
|
||||
typedef struct taskq {
|
||||
spinlock_t tq_lock; /* protects taskq_t */
|
||||
char *tq_name; /* taskq name */
|
||||
|
@@ -105,6 +138,8 @@ typedef struct taskq {
    struct hlist_node tq_hp_cb_node;
    boolean_t tq_hp_support;
    unsigned long lastspawnstop;	/* when to purge dynamic */
    taskq_sums_t tq_sums;
    kstat_t *tq_ksp;
} taskq_t;

typedef struct taskq_ent {
@@ -123,6 +158,13 @@ typedef struct taskq_ent {
#define TQENT_FLAG_PREALLOC	0x1
#define TQENT_FLAG_CANCEL	0x2

/* bits 2-3 are which list tqent is on */
#define TQENT_LIST_NONE		0x0
#define TQENT_LIST_PENDING	0x4
#define TQENT_LIST_PRIORITY	0x8
#define TQENT_LIST_DELAY	0xc
#define TQENT_LIST_MASK		0xc

typedef struct taskq_thread {
    struct list_head tqt_thread_list;
    struct list_head tqt_active_list;
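Keeping the list identity in bits 2-3 of the entry's flag word lets membership be tested and changed without a separate field. A hedged sketch of the accessors this layout implies (the helper names are hypothetical; the real code may manipulate the bits inline):

/* Illustrative helpers for the TQENT_LIST_* bits; names are hypothetical. */
static inline unsigned long
tqent_list_get(unsigned long flags)
{
    return (flags & TQENT_LIST_MASK);
}

static inline unsigned long
tqent_list_set(unsigned long flags, unsigned long list)
{
    return ((flags & ~TQENT_LIST_MASK) | list);
}
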
@@ -0,0 +1,41 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

#ifndef _ABD_IMPL_OS_H
#define _ABD_IMPL_OS_H

#ifdef __cplusplus
extern "C" {
#endif

#define abd_enter_critical(flags)	local_irq_save(flags)
#define abd_exit_critical(flags)	local_irq_restore(flags)

#ifdef __cplusplus
}
#endif

#endif /* _ABD_IMPL_OS_H */
@@ -0,0 +1,62 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
 */

#ifndef _ABD_OS_H
#define _ABD_OS_H

#ifdef __cplusplus
extern "C" {
#endif

struct abd_scatter {
    uint_t abd_offset;
    uint_t abd_nents;
    struct scatterlist *abd_sgl;
};

struct abd_linear {
    void *abd_buf;
    struct scatterlist *abd_sgl;	/* for LINEAR_PAGE */
};

typedef struct abd abd_t;

typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
    void *);

/*
 * Linux ABD bio functions
 * Note: these are only needed to support vdev_classic. See comment in
 * vdev_disk.c.
 */
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);

#ifdef __cplusplus
}
#endif

#endif /* _ABD_OS_H */
@@ -30,6 +30,7 @@
#include <sys/debug.h>
#include <sys/zfs_refcount.h>
#include <sys/uio.h>
#include <sys/abd_os.h>

#ifdef __cplusplus
extern "C" {

@@ -44,8 +45,7 @@ typedef enum abd_flags {
    ABD_FLAG_LINEAR_PAGE	= 1 << 5, /* linear but allocd from page */
    ABD_FLAG_GANG		= 1 << 6, /* mult ABDs chained together */
    ABD_FLAG_GANG_FREE		= 1 << 7, /* gang ABD is responsible for mem */
    ABD_FLAG_ZEROS		= 1 << 8, /* ABD for zero-filled buffer */
    ABD_FLAG_ALLOCD		= 1 << 9, /* we allocated the abd_t */
    ABD_FLAG_ALLOCD		= 1 << 8, /* we allocated the abd_t */
} abd_flags_t;

typedef struct abd {

@@ -58,19 +58,8 @@ typedef struct abd {
#endif
    kmutex_t abd_mtx;
    union {
        struct abd_scatter {
            uint_t abd_offset;
#if defined(__FreeBSD__) && defined(_KERNEL)
            void *abd_chunks[1]; /* actually variable-length */
#else
            uint_t abd_nents;
            struct scatterlist *abd_sgl;
#endif
        } abd_scatter;
        struct abd_linear {
            void *abd_buf;
            struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
        } abd_linear;
        struct abd_scatter abd_scatter;
        struct abd_linear abd_linear;
        struct abd_gang {
            list_t abd_gang_chain;
        } abd_gang;

@@ -79,9 +68,6 @@ typedef struct abd {

typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
#if defined(__linux__) && defined(_KERNEL)
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
#endif

extern int zfs_abd_scatter_enabled;

@@ -107,6 +93,7 @@ abd_t *abd_get_offset_size(abd_t *, size_t, size_t);
abd_t *abd_get_offset_struct(abd_t *, abd_t *, size_t, size_t);
abd_t *abd_get_zeros(size_t);
abd_t *abd_get_from_buf(void *, size_t);
abd_t *abd_get_from_buf_struct(abd_t *, void *, size_t);
void abd_cache_reap_now(void);

/*

@@ -128,10 +115,6 @@ void abd_release_ownership_of_buf(abd_t *);
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
    abd_iter_func2_t *, void *);
#if defined(__linux__) && defined(_KERNEL)
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
    void *);
#endif
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);

@@ -225,16 +208,6 @@ abd_get_size(abd_t *abd)
void abd_init(void);
void abd_fini(void);

/*
 * Linux ABD bio functions
 * Note: these are only needed to support vdev_classic. See comment in
 * vdev_disk.c.
 */
#if defined(__linux__) && defined(_KERNEL)
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
#endif

#ifdef __cplusplus
}
#endif

@@ -28,6 +28,7 @@
#define _ABD_IMPL_H

#include <sys/abd.h>
#include <sys/abd_impl_os.h>
#include <sys/wmsum.h>

#ifdef __cplusplus

@@ -111,19 +112,6 @@ void abd_iter_page(struct abd_iter *);
#define ABD_LINEAR_BUF(abd)	(abd->abd_u.abd_linear.abd_buf)
#define ABD_GANG(abd)		(abd->abd_u.abd_gang)

#if defined(_KERNEL)
#if defined(__FreeBSD__)
#define abd_enter_critical(flags)	critical_enter()
#define abd_exit_critical(flags)	critical_exit()
#else
#define abd_enter_critical(flags)	local_irq_save(flags)
#define abd_exit_critical(flags)	local_irq_restore(flags)
#endif
#else /* !_KERNEL */
#define abd_enter_critical(flags)	((void)0)
#define abd_exit_critical(flags)	((void)0)
#endif

#ifdef __cplusplus
}
#endif

@@ -39,6 +39,13 @@ extern "C" {

struct abd;

/*
 * DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
 */
#define DDT_FLAG_FLAT	(1 << 0)	/* single extensible phys */
#define DDT_FLAG_LOG	(1 << 1)	/* dedup log (journal) */
#define DDT_FLAG_MASK	(DDT_FLAG_FLAT|DDT_FLAG_LOG)

/*
 * DDT on-disk storage object types. Each one corresponds to a specific
 * implementation, see ddt_ops_t. The value itself is not stored on disk.

@@ -120,30 +127,80 @@ typedef struct {
 * characteristics of the stored block, such as its location on disk (DVAs),
 * birth txg and ref count.
 *
 * Note that an entry has an array of four ddt_phys_t, one for each number of
 * DVAs (copies= property) and another for additional "ditto" copies. Most
 * users of ddt_phys_t will handle indexing into or counting the phys they
 * want.
 * The "traditional" entry has an array of four, one for each number of DVAs
 * (copies= property) and another for additional "ditto" copies. Users of the
 * traditional struct will specify the variant (index) of the one they want.
 *
 * The newer "flat" entry has only a single form that is specified using the
 * DDT_PHYS_FLAT variant.
 *
 * Since the value size varies, use one of the size macros when interfacing
 * with the ddt zap.
 */
typedef struct {
    dva_t ddp_dva[SPA_DVAS_PER_BP];
    uint64_t ddp_refcnt;
    uint64_t ddp_phys_birth;
} ddt_phys_t;

#define DDT_PHYS_MAX	(4)

/*
 * Named indexes into the ddt_phys_t array in each entry.
 * Note - this can be used in a flexible array and allocated for
 * a specific size (ddp_trad or ddp_flat). So be careful not to
 * copy using "=" assignment but instead use ddt_phys_copy().
 */
typedef union {
    /*
     * Traditional physical payload value for DDT zap (256 bytes)
     */
    struct {
        dva_t ddp_dva[SPA_DVAS_PER_BP];
        uint64_t ddp_refcnt;
        uint64_t ddp_phys_birth;
    } ddp_trad[DDT_PHYS_MAX];

    /*
     * Flat physical payload value for DDT zap (72 bytes)
     */
    struct {
        dva_t ddp_dva[SPA_DVAS_PER_BP];
        uint64_t ddp_refcnt;
        uint64_t ddp_phys_birth;	/* txg based from BP */
        uint64_t ddp_class_start;	/* in realtime seconds */
    } ddp_flat;
} ddt_univ_phys_t;

/*
 * This enum denotes which variant of a ddt_univ_phys_t to target. For
 * a traditional DDT entry, it represents the indexes into the ddp_trad
 * array. Any consumer of a ddt_univ_phys_t needs to know which variant
 * is being targeted.
 *
 * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However,
 * we maintain the ability to free existing dedup-ditto blocks.
 */
enum ddt_phys_type {

typedef enum {
    DDT_PHYS_DITTO = 0,
    DDT_PHYS_SINGLE = 1,
    DDT_PHYS_DOUBLE = 2,
    DDT_PHYS_TRIPLE = 3,
    DDT_PHYS_TYPES
};
    DDT_PHYS_FLAT = 4,
    DDT_PHYS_NONE = 5
} ddt_phys_variant_t;

#define DDT_PHYS_VARIANT(ddt, p)	\
    (ASSERT((p) < DDT_PHYS_NONE),	\
    ((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p)))

#define DDT_TRAD_PHYS_SIZE	sizeof (((ddt_univ_phys_t *)0)->ddp_trad)
#define DDT_FLAT_PHYS_SIZE	sizeof (((ddt_univ_phys_t *)0)->ddp_flat)

#define _DDT_PHYS_SWITCH(ddt, flat, trad)	\
    (((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad))

#define DDT_PHYS_SIZE(ddt)	_DDT_PHYS_SWITCH(ddt,	\
    DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE)

#define DDT_NPHYS(ddt)			_DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX)
#define DDT_PHYS_FOR_COPIES(ddt, p)	_DDT_PHYS_SWITCH(ddt, 0, p)
#define DDT_PHYS_IS_DITTO(ddt, p)	_DDT_PHYS_SWITCH(ddt, 0, (p == 0))

/*
 * A "live" entry, holding changes to an entry made this txg, and other data to
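The warning against "=" assignment exists because a ddt_univ_phys_t frequently lives as a flexible array member allocated for only one variant, so copying the whole union would read or write past the allocation. A hedged sketch of the intended pattern, sizing with DDT_PHYS_SIZE() and copying through ddt_phys_copy(); the allocation shown is illustrative, not the actual ddt.c code, and src_phys stands for any existing ddt_univ_phys_t pointer:

/* Illustrative only: a variant-sized phys tail, copied safely. */
ddt_entry_t *dde = kmem_zalloc(sizeof (ddt_entry_t) +
    DDT_PHYS_SIZE(ddt), KM_SLEEP);

ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
ddt_phys_copy(dde->dde_phys, src_phys, v);	/* never "=" on the union */
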
@@ -153,17 +210,27 @@ enum ddt_phys_type {
/* State flags for dde_flags */
#define DDE_FLAG_LOADED		(1 << 0)	/* entry ready for use */
#define DDE_FLAG_OVERQUOTA	(1 << 1)	/* entry unusable, no space */
#define DDE_FLAG_LOGGED		(1 << 2)	/* loaded from log */

/*
 * Additional data to support entry update or repair. This is fixed size
 * because it's relatively rarely used.
 */
typedef struct {
    /* copy of data after a repair read, to be rewritten */
    abd_t *dde_repair_abd;

    /* original phys contents before update, for error handling */
    ddt_univ_phys_t dde_orig_phys;

    /* in-flight update IOs */
    zio_t *dde_lead_zio[DDT_PHYS_MAX];
} ddt_entry_io_t;

typedef struct {
    /* key must be first for ddt_key_compare */
    ddt_key_t dde_key;				/* ddt_tree key */
    ddt_phys_t dde_phys[DDT_PHYS_TYPES];	/* on-disk data */

    /* in-flight update IOs */
    zio_t *dde_lead_zio[DDT_PHYS_TYPES];

    /* copy of data after a repair read, to be rewritten */
    struct abd *dde_repair_abd;
    ddt_key_t dde_key;				/* ddt_tree key */
    avl_node_t dde_node;			/* ddt_tree_node */

    /* storage type and class the entry was loaded from */
    ddt_type_t dde_type;
@@ -173,9 +240,35 @@ typedef struct {
    kcondvar_t dde_cv;		/* signaled when load completes */
    uint64_t dde_waiters;	/* count of waiters on dde_cv */

    avl_node_t dde_node;	/* ddt_tree node */
    ddt_entry_io_t *dde_io;	/* IO support, when required */

    ddt_univ_phys_t dde_phys[];	/* flexible -- allocated size varies */
} ddt_entry_t;

/*
 * A lightweight entry is for short-lived or transient uses, like iterating or
 * inspecting, when you don't care where it came from.
 */
typedef struct {
    ddt_key_t ddlwe_key;
    ddt_type_t ddlwe_type;
    ddt_class_t ddlwe_class;
    ddt_univ_phys_t ddlwe_phys;
} ddt_lightweight_entry_t;

/*
 * In-core DDT log. A separate struct to make it easier to switch between the
 * appending and flushing logs.
 */
typedef struct {
    avl_tree_t ddl_tree;	/* logged entries */
    uint32_t ddl_flags;		/* flags for this log */
    uint64_t ddl_object;	/* log object id */
    uint64_t ddl_length;	/* on-disk log size */
    uint64_t ddl_first_txg;	/* txg log became active */
    ddt_key_t ddl_checkpoint;	/* last checkpoint */
} ddt_log_t;

/*
 * In-core DDT object. This covers all entries and stats for the whole pool
 * for a given checksum type.
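Because a lightweight entry is a plain fixed-size value (its phys member is the full union rather than a flexible tail), it can live on the stack, which is exactly what iteration wants. A sketch of the walk pattern implied by the ddt_walk() prototype declared further down (error handling elided, a spa_t pointer assumed in scope):

/* Illustrative: visit every dedup entry with stack-allocated state. */
ddt_bookmark_t ddb = { 0 };
ddt_lightweight_entry_t ddlwe;

while (ddt_walk(spa, &ddb, &ddlwe) == 0) {
    /* inspect ddlwe.ddlwe_key, ddlwe.ddlwe_phys, ... */
}
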
@@ -184,23 +277,49 @@ typedef struct {
    kmutex_t ddt_lock;			/* protects changes to all fields */

    avl_tree_t ddt_tree;		/* "live" (changed) entries this txg */
    avl_tree_t ddt_log_tree;		/* logged entries */

    avl_tree_t ddt_repair_tree;		/* entries being repaired */

    enum zio_checksum ddt_checksum;	/* checksum algorithm in use */
    spa_t *ddt_spa;			/* pool this ddt is on */
    objset_t *ddt_os;			/* ddt objset (always MOS) */
    ddt_log_t ddt_log[2];		/* active/flushing logs */
    ddt_log_t *ddt_log_active;		/* pointers into ddt_log */
    ddt_log_t *ddt_log_flushing;	/* swapped when flush starts */

    hrtime_t ddt_flush_start;		/* log flush start this txg */
    uint32_t ddt_flush_pass;		/* log flush pass this txg */

    int32_t ddt_flush_count;		/* entries flushed this txg */
    int32_t ddt_flush_min;		/* min rem entries to flush */
    int32_t ddt_log_ingest_rate;	/* rolling log ingest rate */
    int32_t ddt_log_flush_rate;		/* rolling log flush rate */
    int32_t ddt_log_flush_time_rate;	/* avg time spent flushing */

    uint64_t ddt_flush_force_txg;	/* flush hard before this txg */

    kstat_t *ddt_ksp;			/* kstats context */

    enum zio_checksum ddt_checksum;	/* checksum algorithm in use */
    spa_t *ddt_spa;			/* pool this ddt is on */
    objset_t *ddt_os;			/* ddt objset (always MOS) */

    uint64_t ddt_dir_object;		/* MOS dir holding ddt objects */
    uint64_t ddt_version;		/* DDT version */
    uint64_t ddt_flags;			/* FDT option flags */

    /* per-type/per-class entry store objects */
    uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];

    /* object ids for whole-ddt and per-type/per-class stats */
    /* object ids for stored, logged and per-type/per-class stats */
    uint64_t ddt_stat_object;
    ddt_object_t ddt_log_stats;
    ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];

    /* type/class stats by power-2-sized referenced blocks */
    ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
    ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];

    /* log stats power-2-sized referenced blocks */
    ddt_histogram_t ddt_log_histogram;
} ddt_t;

/*
@@ -215,20 +334,36 @@ typedef struct {
    uint64_t ddb_cursor;
} ddt_bookmark_t;

extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
    uint64_t txg);
extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
    blkptr_t *bp, uint64_t txg);
extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
    const ddt_phys_t *ddp, blkptr_t *bp);
    const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp);

extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
extern void ddt_phys_clear(ddt_phys_t *ddp);
extern void ddt_phys_addref(ddt_phys_t *ddp);
extern void ddt_phys_decref(ddt_phys_t *ddp);
extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
    const blkptr_t *bp);
extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
    ddt_phys_variant_t v);
extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp,
    ddt_phys_variant_t v);
extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
    const ddt_entry_t *dde, const blkptr_t *bp);
extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
    ddt_phys_variant_t v);
extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
    boolean_t encrypted);

extern void ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh,
    const ddt_lightweight_entry_t *ddlwe);
extern void ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh,
    const ddt_lightweight_entry_t *ddlwe);

extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
extern void ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh);
extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);

extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
extern uint64_t ddt_get_ddt_dsize(spa_t *spa);
extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
@@ -243,7 +378,7 @@ extern void ddt_enter(ddt_t *ddt);
extern void ddt_exit(ddt_t *ddt);
extern void ddt_init(void);
extern void ddt_fini(void);
extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp);
extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
extern void ddt_prefetch_all(spa_t *spa);

@@ -251,6 +386,8 @@ extern void ddt_prefetch_all(spa_t *spa);
extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class,
    const blkptr_t *bp);

extern void ddt_alloc_entry_io(ddt_entry_t *dde);

extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);

@@ -260,7 +397,11 @@ extern void ddt_create(spa_t *spa);
extern int ddt_load(spa_t *spa);
extern void ddt_unload(spa_t *spa);
extern void ddt_sync(spa_t *spa, uint64_t txg);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);

extern void ddt_walk_init(spa_t *spa, uint64_t txg);
extern boolean_t ddt_walk_ready(spa_t *spa);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
    ddt_lightweight_entry_t *ddlwe);

extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);

@@ -28,11 +28,129 @@
#define _SYS_DDT_IMPL_H

#include <sys/ddt.h>
#include <sys/bitops.h>

#ifdef __cplusplus
extern "C" {
#endif

/* DDT version numbers */
#define DDT_VERSION_LEGACY	(0)
#define DDT_VERSION_FDT		(1)

/* Names of interesting objects in the DDT root dir */
#define DDT_DIR_VERSION		"version"
#define DDT_DIR_FLAGS		"flags"

/* Fill a lightweight entry from a live entry. */
#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do {			\
	memset((ddlwe), 0, sizeof (*ddlwe));				\
	(ddlwe)->ddlwe_key = (dde)->dde_key;				\
	(ddlwe)->ddlwe_type = (dde)->dde_type;				\
	(ddlwe)->ddlwe_class = (dde)->dde_class;			\
	memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \
} while (0)

#define DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do {		\
	memset((ddlwe), 0, sizeof (*ddlwe));				\
	(ddlwe)->ddlwe_key = (ddle)->ddle_key;				\
	(ddlwe)->ddlwe_type = (ddle)->ddle_type;			\
	(ddlwe)->ddlwe_class = (ddle)->ddle_class;			\
	memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \
} while (0)

/*
 * An entry on the log tree. These are "frozen", and a record of what's in
 * the on-disk log. They can't be used in place, but can be "loaded" back into
 * the live tree.
 */
typedef struct {
    ddt_key_t ddle_key;		/* ddt_log_tree key */
    avl_node_t ddle_node;	/* ddt_log_tree node */

    ddt_type_t ddle_type;	/* storage type */
    ddt_class_t ddle_class;	/* storage class */

    /* extra allocation for flat/trad phys */
    ddt_univ_phys_t ddle_phys[];
} ddt_log_entry_t;

/* On-disk log record types. */
typedef enum {
    DLR_INVALID = 0,	/* end of block marker */
    DLR_ENTRY = 1,	/* an entry to add or replace in the log tree */
} ddt_log_record_type_t;

/* On-disk log record header. */
typedef struct {
    /*
     * dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to
     * access it.
     *
     * bits 0-7: record type (ddt_log_record_type_t)
     * bits 8-15: length of record header+payload
     * bits 16-47: reserved, all zero
     * bits 48-55: if type==DLR_ENTRY, storage type (ddt_type)
     *             otherwise all zero
     * bits 56-63: if type==DLR_ENTRY, storage class (ddt_class)
     *             otherwise all zero
     */
    uint64_t dlr_info;
    uint8_t dlr_payload[];
} ddt_log_record_t;

#define DLR_GET_TYPE(dlr)		BF64_GET((dlr)->dlr_info, 0, 8)
#define DLR_SET_TYPE(dlr, v)		BF64_SET((dlr)->dlr_info, 0, 8, v)
#define DLR_GET_RECLEN(dlr)		BF64_GET((dlr)->dlr_info, 8, 16)
#define DLR_SET_RECLEN(dlr, v)		BF64_SET((dlr)->dlr_info, 8, 16, v)
#define DLR_GET_ENTRY_TYPE(dlr)		BF64_GET((dlr)->dlr_info, 48, 8)
#define DLR_SET_ENTRY_TYPE(dlr, v)	BF64_SET((dlr)->dlr_info, 48, 8, v)
#define DLR_GET_ENTRY_CLASS(dlr)	BF64_GET((dlr)->dlr_info, 56, 8)
#define DLR_SET_ENTRY_CLASS(dlr, v)	BF64_SET((dlr)->dlr_info, 56, 8, v)

/* Payload for DLR_ENTRY. */
typedef struct {
    ddt_key_t dlre_key;
    ddt_univ_phys_t dlre_phys[];
} ddt_log_record_entry_t;

/* Log flags (ddl_flags, dlh_flags) */
#define DDL_FLAG_FLUSHING	(1 << 0)	/* this log is being flushed */
#define DDL_FLAG_CHECKPOINT	(1 << 1)	/* header has a checkpoint */

/* On-disk log header, stored in the bonus buffer. */
typedef struct {
    /*
     * dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to
     * access it.
     *
     * bits 0-7: log version
     * bits 8-15: log flags
     * bits 16-63: reserved, all zero
     */
    uint64_t dlh_info;

    uint64_t dlh_length;	/* log size in bytes */
    uint64_t dlh_first_txg;	/* txg this log went active */
    ddt_key_t dlh_checkpoint;	/* last checkpoint */
} ddt_log_header_t;

#define DLH_GET_VERSION(dlh)	BF64_GET((dlh)->dlh_info, 0, 8)
#define DLH_SET_VERSION(dlh, v)	BF64_SET((dlh)->dlh_info, 0, 8, v)
#define DLH_GET_FLAGS(dlh)	BF64_GET((dlh)->dlh_info, 8, 8)
#define DLH_SET_FLAGS(dlh, v)	BF64_SET((dlh)->dlh_info, 8, 8, v)

/* DDT log update state */
typedef struct {
    dmu_tx_t *dlu_tx;		/* tx the update is being applied to */
    dnode_t *dlu_dn;		/* log object dnode */
    dmu_buf_t **dlu_dbp;	/* array of block buffer pointers */
    int dlu_ndbp;		/* number of block buffer pointers */
    uint16_t dlu_reclen;	/* cached length of record */
    uint64_t dlu_block;		/* block for next entry */
    uint64_t dlu_offset;	/* offset for next entry */
} ddt_log_update_t;

/*
 * Ops vector to access a specific DDT object type.
 */

@@ -42,25 +160,50 @@ typedef struct {
    boolean_t prehash);
    int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
    int (*ddt_op_lookup)(objset_t *os, uint64_t object,
        const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
        const ddt_key_t *ddk, void *phys, size_t psize);
    int (*ddt_op_contains)(objset_t *os, uint64_t object,
        const ddt_key_t *ddk);
    void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
        const ddt_key_t *ddk);
    void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object);
    int (*ddt_op_update)(objset_t *os, uint64_t object,
        const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize,
        const ddt_key_t *ddk, const void *phys, size_t psize,
        dmu_tx_t *tx);
    int (*ddt_op_remove)(objset_t *os, uint64_t object,
        const ddt_key_t *ddk, dmu_tx_t *tx);
    int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk,
        ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
        ddt_key_t *ddk, void *phys, size_t psize);
    int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
} ddt_ops_t;

extern const ddt_ops_t ddt_zap_ops;

extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg);
/* Dedup log API */
extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx,
    ddt_log_update_t *dlu);
extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde,
    ddt_log_update_t *dlu);
extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu);

extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl,
    ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl,
    const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe);

extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
    dmu_tx_t *tx);
extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx);

extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx);

extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx);

extern int ddt_log_load(ddt_t *ddt);
extern void ddt_log_alloc(ddt_t *ddt);
extern void ddt_log_free(ddt_t *ddt);

extern void ddt_log_init(void);
extern void ddt_log_fini(void);

/*
 * These are only exposed so that zdb can access them. Try not to use them

@@ -74,16 +217,15 @@ extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg);
 */
#define DDT_NAMELEN	32

extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);
extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt,
    const ddt_univ_phys_t *ddp);

extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);

extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);

extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
    char *name);
extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
    uint64_t *walk, ddt_entry_t *dde);
    uint64_t *walk, ddt_lightweight_entry_t *ddlwe);
extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
    uint64_t *count);
extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
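The packed dlr_info and dlh_info words above are driven entirely through the BF64_GET/BF64_SET bitfield macros, so writing a record header is mechanical. A hedged sketch of packing and unpacking a DLR_ENTRY header; the buffer, the payload size, and the DDT_TYPE_DEFAULT/DDT_CLASS_UNIQUE enum values are assumptions made for illustration:

/* Illustrative: pack a DLR_ENTRY record header, then read it back. */
uint64_t buf[32] = { 0 };
ddt_log_record_t *dlr = (ddt_log_record_t *)buf;
size_t payload = sizeof (ddt_log_record_entry_t);	/* assumed payload */

DLR_SET_TYPE(dlr, DLR_ENTRY);
DLR_SET_RECLEN(dlr, sizeof (ddt_log_record_t) + payload);
DLR_SET_ENTRY_TYPE(dlr, DDT_TYPE_DEFAULT);	/* assumed enum value */
DLR_SET_ENTRY_CLASS(dlr, DDT_CLASS_UNIQUE);	/* assumed enum value */

ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);
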
@@ -375,7 +375,9 @@ typedef struct dmu_buf {
#define DMU_POOL_L2CACHE		"l2cache"
#define DMU_POOL_TMP_USERREFS		"tmp_userrefs"
#define DMU_POOL_DDT			"DDT-%s-%s-%s"
#define DMU_POOL_DDT_LOG		"DDT-log-%s-%u"
#define DMU_POOL_DDT_STATS		"DDT-statistics"
#define DMU_POOL_DDT_DIR		"DDT-%s"
#define DMU_POOL_CREATION_VERSION	"creation_version"
#define DMU_POOL_SCAN			"scan"
#define DMU_POOL_ERRORSCRUB		"error_scrub"

@@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
    ddt_entry_t *dde, dmu_tx_t *tx);
    ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,

@@ -1710,6 +1710,11 @@ typedef enum {
#define ZPOOL_INITIALIZE_COMMAND	"initialize_command"
#define ZPOOL_INITIALIZE_VDEVS		"initialize_vdevs"

/*
 * The following are names used when invoking ZFS_IOC_POOL_REGUID.
 */
#define ZPOOL_REGUID_GUID	"guid"

/*
 * The following are names used when invoking ZFS_IOC_POOL_TRIM.
 */

@@ -572,7 +572,7 @@ typedef struct blkptr {
#define BP_IS_RAIDZ(bp)		(DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
				BP_GET_PSIZE(bp))

#define BP_ZERO(bp)				\
#define BP_ZERO_DVAS(bp)			\
{						\
	(bp)->blk_dva[0].dva_word[0] = 0;	\
	(bp)->blk_dva[0].dva_word[1] = 0;	\
@@ -580,6 +580,11 @@ typedef struct blkptr {
	(bp)->blk_dva[1].dva_word[1] = 0;	\
	(bp)->blk_dva[2].dva_word[0] = 0;	\
	(bp)->blk_dva[2].dva_word[1] = 0;	\
}

#define BP_ZERO(bp)				\
{						\
	BP_ZERO_DVAS(bp);			\
	(bp)->blk_prop = 0;			\
	(bp)->blk_pad[0] = 0;			\
	(bp)->blk_pad[1] = 0;			\
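Splitting the macro gives callers a way to drop a block pointer's addresses while leaving the rest of it intact: BP_ZERO_DVAS clears only the three DVA words, where BP_ZERO goes on to clear the properties, padding, and the rest of the structure too. A hedged sketch of the distinction:

/* Illustrative: what survives each macro. */
blkptr_t bp;

BP_ZERO(&bp);		/* fully cleared: DVAs, blk_prop, pads, etc. */
/* ... populate bp ... */
BP_ZERO_DVAS(&bp);	/* only dva_word[0..1] of all three DVAs cleared;
			   blk_prop and the remaining fields are preserved */
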
@@ -1087,7 +1092,7 @@ extern void spa_strfree(char *);
extern uint64_t spa_generate_guid(spa_t *spa);
extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
extern void spa_freeze(spa_t *spa);
extern int spa_change_guid(spa_t *spa);
extern int spa_change_guid(spa_t *spa, const uint64_t *guidp);
extern void spa_upgrade(spa_t *spa, uint64_t version);
extern void spa_evict_all(void);
extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,

@@ -22,7 +22,7 @@
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2019, Allan Jude
 * Copyright (c) 2019, Klara Inc.
 * Copyright (c) 2019, 2024, Klara, Inc.
 * Use is subject to license terms.
 * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
 */
@@ -122,25 +122,15 @@ enum zio_zstd_levels {
struct zio_prop;

/* Common signature for all zio compress functions. */
typedef size_t zio_compress_func_t(void *src, void *dst,
typedef size_t zio_compress_func_t(abd_t *src, abd_t *dst,
    size_t s_len, size_t d_len, int);
/* Common signature for all zio decompress functions. */
typedef int zio_decompress_func_t(void *src, void *dst,
typedef int zio_decompress_func_t(abd_t *src, abd_t *dst,
    size_t s_len, size_t d_len, int);
/* Common signature for all zio decompress and get level functions. */
typedef int zio_decompresslevel_func_t(void *src, void *dst,
typedef int zio_decompresslevel_func_t(abd_t *src, abd_t *dst,
    size_t s_len, size_t d_len, uint8_t *level);
/* Common signature for all zio get-compression-level functions. */
typedef int zio_getlevel_func_t(void *src, size_t s_len, uint8_t *level);

/*
 * Common signature for all zio decompress functions using an ABD as input.
 * This is helpful if you have both compressed ARC and scatter ABDs enabled,
 * but is not a requirement for all compression algorithms.
 */
typedef int zio_decompress_abd_func_t(abd_t *src, void *dst,
    size_t s_len, size_t d_len, int);
/*
 * Information about each compression function.
 */
@@ -163,34 +153,66 @@ extern void lz4_fini(void);
/*
 * Compression routines.
 */
extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern size_t zfs_lzjb_compress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
extern int zfs_lzjb_decompress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
extern size_t zfs_gzip_compress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
extern int zfs_gzip_decompress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
extern size_t zfs_zle_compress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
extern int zfs_zle_decompress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
extern size_t zfs_lz4_compress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
extern int zfs_lz4_decompress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);

/*
 * Compress and decompress data if necessary.
 */
extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void **dst,
extern size_t zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst,
    size_t s_len, uint8_t level);
extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
    size_t s_len, size_t d_len, uint8_t *level);
extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
extern int zio_decompress_data(enum zio_compress c, abd_t *src, abd_t *abd,
    size_t s_len, size_t d_len, uint8_t *level);
extern int zio_compress_to_feature(enum zio_compress comp);

#define ZFS_COMPRESS_WRAP_DECL(name)					\
size_t									\
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n)		\
{									\
	void *s_buf = abd_borrow_buf_copy(src, s_len);			\
	void *d_buf = abd_borrow_buf(dst, d_len);			\
	size_t c_len = name##_buf(s_buf, d_buf, s_len, d_len, n);	\
	abd_return_buf(src, s_buf, s_len);				\
	abd_return_buf_copy(dst, d_buf, d_len);				\
	return (c_len);							\
}
#define ZFS_DECOMPRESS_WRAP_DECL(name)					\
int									\
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n)		\
{									\
	void *s_buf = abd_borrow_buf_copy(src, s_len);			\
	void *d_buf = abd_borrow_buf(dst, d_len);			\
	int err = name##_buf(s_buf, d_buf, s_len, d_len, n);		\
	abd_return_buf(src, s_buf, s_len);				\
	abd_return_buf_copy(dst, d_buf, d_len);				\
	return (err);							\
}
#define ZFS_DECOMPRESS_LEVEL_WRAP_DECL(name)				\
int									\
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, uint8_t *n)	\
{									\
	void *s_buf = abd_borrow_buf_copy(src, s_len);			\
	void *d_buf = abd_borrow_buf(dst, d_len);			\
	int err = name##_buf(s_buf, d_buf, s_len, d_len, n);		\
	abd_return_buf(src, s_buf, s_len);				\
	abd_return_buf_copy(dst, d_buf, d_len);				\
	return (err);							\
}

#ifdef __cplusplus
}
#endif

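Each wrapper macro above bridges a legacy buffer-based codec to the new ABD-typed signature: borrow linear views of the source and destination ABDs, call the name##_buf variant, and return the buffers (copying out only on the destination side). Assuming the name##_buf counterparts exist as the macros' naming convention implies, a codec's .c file would declare its entry points like this (hedged sketch):

/*
 * Illustrative: expands to zfs_zle_compress()/zfs_zle_decompress()
 * definitions that wrap zfs_zle_compress_buf()/zfs_zle_decompress_buf().
 */
ZFS_COMPRESS_WRAP_DECL(zfs_zle_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zle_decompress)
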
@@ -90,14 +90,12 @@ typedef struct zfs_zstd_meta {
int zstd_init(void);
void zstd_fini(void);

size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level);
size_t zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len,
size_t zfs_zstd_compress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
int zfs_zstd_get_level(void *s_start, size_t s_len, uint8_t *level);
int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
int zfs_zstd_decompress_level(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, uint8_t *level);
int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
int zfs_zstd_decompress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int n);
void zfs_zstd_cache_reap_now(void);

@@ -82,6 +82,7 @@ typedef enum spa_feature {
    SPA_FEATURE_AVZ_V2,
    SPA_FEATURE_REDACTION_LIST_SPILL,
    SPA_FEATURE_RAIDZ_EXPANSION,
    SPA_FEATURE_FAST_DEDUP,
    SPA_FEATURES
} spa_feature_t;

@@ -556,6 +556,7 @@
<elf-symbol name='zpool_scan' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_search_import' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_bootenv' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_guid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_prop' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_vdev_prop' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_skip_pool' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>

@@ -616,7 +617,7 @@
<elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2296' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2352' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>

@@ -6006,7 +6007,8 @@
<enumerator name='SPA_FEATURE_AVZ_V2' value='38'/>
<enumerator name='SPA_FEATURE_REDACTION_LIST_SPILL' value='39'/>
<enumerator name='SPA_FEATURE_RAIDZ_EXPANSION' value='40'/>
<enumerator name='SPA_FEATURES' value='41'/>
<enumerator name='SPA_FEATURE_FAST_DEDUP' value='41'/>
<enumerator name='SPA_FEATURES' value='42'/>
</enum-decl>
<typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
<qualified-type-def type-id='80f4b756' const='yes' id='b99c00c9'/>

@@ -6638,6 +6640,11 @@
<parameter type-id='9c313c2d' name='guid'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_set_guid' mangled-name='zpool_set_guid' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_set_guid'>
<parameter type-id='4c81de99' name='zhp'/>
<parameter type-id='713a56f5' name='guid'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_reguid' mangled-name='zpool_reguid' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_reguid'>
<parameter type-id='4c81de99' name='zhp'/>
<return type-id='95e97e5e'/>

@@ -9131,8 +9138,8 @@
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18368' id='b93e4d14'>
<subrange length='41' type-id='7359adad' id='cb834f44'/>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18816' id='b937914f'>
<subrange length='42' type-id='7359adad' id='cb7c937f'/>
</array-type-def>
<enum-decl name='zfeature_flags' id='6db816a4'>
<underlying-type type-id='9cac1fee'/>

@@ -9209,7 +9216,7 @@
<pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/>
<qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/>
<pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/>
<var-decl name='spa_feature_table' type-id='b93e4d14' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='spa_feature_table' type-id='b937914f' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/>
<function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>

@@ -3735,6 +3735,13 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk,
            (void) zpool_standard_error(hdl, errno, errbuf);
        }
        break;

    case ZFS_ERR_ASHIFT_MISMATCH:
        zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
            "The new device cannot have a higher alignment requirement "
            "than the top-level vdev."));
        (void) zfs_error(hdl, EZFS_BADTARGET, errbuf);
        break;
    default:
        (void) zpool_standard_error(hdl, errno, errbuf);
    }
@@ -4305,22 +4312,55 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)

/*
 * Change the GUID for a pool.
 *
 * Similar to zpool_reguid(), but may take a GUID.
 *
 * If the guid argument is NULL, then no GUID is passed in the nvlist to the
 * ioctl().
 */
int
zpool_reguid(zpool_handle_t *zhp)
zpool_set_guid(zpool_handle_t *zhp, const uint64_t *guid)
{
    char errbuf[ERRBUFLEN];
    libzfs_handle_t *hdl = zhp->zpool_hdl;
    nvlist_t *nvl = NULL;
    zfs_cmd_t zc = {"\0"};
    int error = -1;

    if (guid != NULL) {
        if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
            return (no_memory(hdl));

        if (nvlist_add_uint64(nvl, ZPOOL_REGUID_GUID, *guid) != 0) {
            nvlist_free(nvl);
            return (no_memory(hdl));
        }

        zcmd_write_src_nvlist(hdl, &zc, nvl);
    }

    (void) snprintf(errbuf, sizeof (errbuf),
        dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name);

    (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
    if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0)
        return (0);
    error = zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc);
    if (error) {
        return (zpool_standard_error(hdl, errno, errbuf));
    }
    if (guid != NULL) {
        zcmd_free_nvlists(&zc);
        nvlist_free(nvl);
    }
    return (0);
}

    return (zpool_standard_error(hdl, errno, errbuf));
/*
 * Change the GUID for a pool.
 */
int
zpool_reguid(zpool_handle_t *zhp)
{
    return (zpool_set_guid(zhp, NULL));
}

/*
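The refactor above leaves zpool_reguid() as a thin shim over zpool_set_guid(): a NULL guid asks the kernel to generate one, while a non-NULL pointer travels to the ioctl via the ZPOOL_REGUID_GUID nvlist entry. A hedged usage sketch (zhp setup assumed, value illustrative):

/* Illustrative: explicit pool GUID vs. kernel-generated. */
uint64_t guid = 0x1234567890abcdefULL;

if (zpool_set_guid(zhp, &guid) != 0)	/* caller-chosen GUID */
    return (1);

if (zpool_reguid(zhp) != 0)		/* random, kernel-generated GUID */
    return (1);
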
@@ -1,7 +1,9 @@
include $(srcdir)/%D%/include/Makefile.am

libzpool_la_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) $(LIBRARY_CFLAGS)
libzpool_la_CFLAGS += $(ZLIB_CFLAGS)

libzpool_la_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
libzpool_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
libzpool_la_CPPFLAGS += -I$(srcdir)/include/os/@ac_system_l@/zfs
libzpool_la_CPPFLAGS += -DLIB_ZPOOL_BUILD

@@ -9,6 +11,7 @@ lib_LTLIBRARIES += libzpool.la
CPPCHECKTARGETS += libzpool.la

dist_libzpool_la_SOURCES = \
	%D%/abd_os.c \
	%D%/kernel.c \
	%D%/taskq.c \
	%D%/util.c

@@ -39,7 +42,6 @@ nodist_libzpool_la_SOURCES = \
	module/lua/lvm.c \
	module/lua/lzio.c \
	\
	module/os/linux/zfs/abd_os.c \
	module/os/linux/zfs/arc_os.c \
	module/os/linux/zfs/trace.c \
	module/os/linux/zfs/vdev_file.c \

@@ -79,6 +81,7 @@ nodist_libzpool_la_SOURCES = \
	module/zfs/dbuf.c \
	module/zfs/dbuf_stats.c \
	module/zfs/ddt.c \
	module/zfs/ddt_log.c \
	module/zfs/ddt_stats.c \
	module/zfs/ddt_zap.c \
	module/zfs/dmu.c \

@@ -0,0 +1,365 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>

/*
 * We're simulating scatter/gather with 4K allocations, since that's more like
 * what a typical kernel does.
 */
#define ABD_PAGESIZE	(4096)
#define ABD_PAGESHIFT	(12)
#define ABD_PAGEMASK	(ABD_PAGESIZE-1)

/*
 * See rationale in module/os/linux/zfs/abd_os.c, but in userspace this is
 * mostly useful to get a mix of linear and scatter ABDs for testing.
 */
#define ABD_SCATTER_MIN_SIZE	(512 * 3)

abd_t *abd_zero_scatter = NULL;

static uint_t
abd_iovcnt_for_bytes(size_t size)
{
    /*
     * Each iovec points to a 4K page. There's no real reason to do this
     * in userspace, but our whole point here is to make it feel a bit
     * more like a real paged memory model.
     */
    return (P2ROUNDUP(size, ABD_PAGESIZE) / ABD_PAGESIZE);
}

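Since P2ROUNDUP rounds the size up to the next ABD_PAGESIZE boundary, the iovec count is simply the byte count divided by 4K, rounded up. A few illustrative values:

/* Assuming ABD_PAGESIZE == 4096: */
abd_iovcnt_for_bytes(1);	/* -> 1, one partial page */
abd_iovcnt_for_bytes(4096);	/* -> 1, exactly one page */
abd_iovcnt_for_bytes(4097);	/* -> 2 */
abd_iovcnt_for_bytes(131072);	/* -> 32, a 128K block */
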
abd_t *
|
||||
abd_alloc_struct_impl(size_t size)
|
||||
{
|
||||
/*
|
||||
* Zero-sized means it will be used for a linear or gang abd, so just
|
||||
* allocate the abd itself and return.
|
||||
*/
|
||||
if (size == 0)
|
||||
return (umem_alloc(sizeof (abd_t), UMEM_NOFAIL));
|
||||
|
||||
/*
|
||||
* Allocating for a scatter abd, so compute how many ABD_PAGESIZE
|
||||
* iovecs we will need to hold this size. Append that allocation to the
|
||||
* end. Note that struct abd_scatter has includes abd_iov[1], so we
|
||||
* allocate one less iovec than we need.
|
||||
*
|
||||
* Note we're not allocating the pages proper, just the iovec pointers.
|
||||
* That's down in abd_alloc_chunks. We _could_ do it here in a single
|
||||
* allocation, but it's fiddly and harder to read for no real gain.
|
||||
*/
|
||||
uint_t n = abd_iovcnt_for_bytes(size);
|
||||
abd_t *abd = umem_alloc(sizeof (abd_t) + (n-1) * sizeof (struct iovec),
|
||||
UMEM_NOFAIL);
|
||||
ABD_SCATTER(abd).abd_offset = 0;
|
||||
ABD_SCATTER(abd).abd_iovcnt = n;
|
||||
return (abd);
|
||||
}
|
||||
|
||||
void
|
||||
abd_free_struct_impl(abd_t *abd)
|
||||
{
|
||||
/* For scatter, compute the extra amount we need to free */
|
||||
uint_t iovcnt =
|
||||
abd_is_linear(abd) || abd_is_gang(abd) ?
|
||||
0 : (ABD_SCATTER(abd).abd_iovcnt - 1);
|
||||
umem_free(abd, sizeof (abd_t) + iovcnt * sizeof (struct iovec));
|
||||
}
|
||||
|
||||
void
|
||||
abd_alloc_chunks(abd_t *abd, size_t size)
|
||||
{
|
||||
/*
|
||||
* We've already allocated the iovec array; ensure that the wanted size
|
||||
* actually matches, otherwise the caller has made a mistake somewhere.
|
||||
*/
|
||||
uint_t n = ABD_SCATTER(abd).abd_iovcnt;
|
||||
ASSERT3U(n, ==, abd_iovcnt_for_bytes(size));
|
||||
|
||||
/*
|
||||
* Allocate a ABD_PAGESIZE region for each iovec.
|
||||
*/
|
||||
struct iovec *iov = ABD_SCATTER(abd).abd_iov;
|
||||
for (int i = 0; i < n; i++) {
|
||||
iov[i].iov_base =
|
||||
umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL);
|
||||
iov[i].iov_len = ABD_PAGESIZE;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
abd_free_chunks(abd_t *abd)
|
||||
{
|
||||
uint_t n = ABD_SCATTER(abd).abd_iovcnt;
|
||||
struct iovec *iov = ABD_SCATTER(abd).abd_iov;
|
||||
for (int i = 0; i < n; i++)
|
||||
umem_free_aligned(iov[i].iov_base, ABD_PAGESIZE);
|
||||
}
|
||||
|
||||
boolean_t
|
||||
abd_size_alloc_linear(size_t size)
|
||||
{
|
||||
return (size < ABD_SCATTER_MIN_SIZE);
|
||||
}
|
||||
|
||||
void
|
||||
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
|
||||
{
|
||||
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
|
||||
int waste = P2ROUNDUP(abd->abd_size, ABD_PAGESIZE) - abd->abd_size;
|
||||
if (op == ABDSTAT_INCR) {
|
||||
arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
|
||||
} else {
|
||||
arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
|
||||
{
|
||||
(void) abd;
|
||||
(void) op;
|
||||
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
|
||||
}
|
||||
|
||||
void
|
||||
abd_verify_scatter(abd_t *abd)
|
||||
{
|
||||
#ifdef ZFS_DEBUG
|
||||
/*
|
||||
* scatter abds shall have:
|
||||
* - at least one iovec
|
||||
* - all iov_base point somewhere
|
||||
* - all iov_len are ABD_PAGESIZE
|
||||
* - offset set within the abd pages somewhere
|
||||
*/
|
||||
uint_t n = ABD_SCATTER(abd).abd_iovcnt;
|
||||
ASSERT3U(n, >, 0);
|
||||
|
||||
uint_t len = 0;
|
||||
for (int i = 0; i < n; i++) {
|
||||
ASSERT3P(ABD_SCATTER(abd).abd_iov[i].iov_base, !=, NULL);
|
||||
ASSERT3U(ABD_SCATTER(abd).abd_iov[i].iov_len, ==, ABD_PAGESIZE);
|
||||
len += ABD_PAGESIZE;
|
||||
}
|
||||
|
||||
ASSERT3U(ABD_SCATTER(abd).abd_offset, <, len);
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
abd_init(void)
|
||||
{
|
||||
/*
|
||||
* Create the "zero" scatter abd. This is always the size of the
|
||||
* largest possible block, but only actually has a single allocated
|
||||
* page, which all iovecs in the abd point to.
|
||||
*/
|
||||
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
|
||||
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
|
||||
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
|
||||
|
||||
void *zero =
|
||||
umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL);
|
||||
memset(zero, 0, ABD_PAGESIZE);
|
||||
|
||||
uint_t n = abd_iovcnt_for_bytes(SPA_MAXBLOCKSIZE);
|
||||
struct iovec *iov = ABD_SCATTER(abd_zero_scatter).abd_iov;
|
||||
for (int i = 0; i < n; i++) {
|
||||
iov[i].iov_base = zero;
|
||||
iov[i].iov_len = ABD_PAGESIZE;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
abd_fini(void)
|
||||
{
|
||||
umem_free_aligned(
|
||||
ABD_SCATTER(abd_zero_scatter).abd_iov[0].iov_base, ABD_PAGESIZE);
|
||||
abd_free_struct(abd_zero_scatter);
|
||||
abd_zero_scatter = NULL;
|
||||
}
|
||||
|
||||
void
|
||||
abd_free_linear_page(abd_t *abd)
|
||||
{
|
||||
/*
|
||||
* LINEAR_PAGE is specific to the Linux kernel; we never set this
|
||||
* flag, so this will never be called.
|
||||
*/
|
||||
(void) abd;
|
||||
PANIC("unreachable");
|
||||
}
|
||||
|
||||
abd_t *
|
||||
abd_alloc_for_io(size_t size, boolean_t is_metadata)
|
||||
{
|
||||
return (abd_alloc(size, is_metadata));
|
||||
}
|
||||
|
||||
abd_t *
|
||||
abd_get_offset_scatter(abd_t *dabd, abd_t *sabd, size_t off, size_t size)
|
||||
{
|
||||
|
||||
/*
|
||||
* Create a new scatter dabd by borrowing data pages from sabd to cover
|
||||
* off+size.
|
||||
*
|
||||
* sabd is an existing scatter abd with a set of iovecs, each covering
|
||||
* an ABD_PAGESIZE (4K) allocation. It's "zero" is at abd_offset.
|
||||
*
|
||||
* [........][........][........][........]
|
||||
* ^- sabd_offset
|
||||
*
|
||||
* We want to produce a new abd, referencing those allocations at the
|
||||
* given offset.
|
||||
*
|
||||
* [........][........][........][........]
|
||||
* ^- dabd_offset = sabd_offset + off
|
||||
* ^- dabd_offset + size
|
||||
*
|
||||
* In this example, dabd needs three iovecs. The first iovec is offset
|
||||
* 0, so the final dabd_offset is masked back into the first iovec.
|
||||
*
|
||||
* [........][........][........]
|
||||
* ^- dabd_offset
|
||||
*/
	size_t soff = ABD_SCATTER(sabd).abd_offset + off;
	size_t doff = soff & ABD_PAGEMASK;
	size_t iovcnt = abd_iovcnt_for_bytes(doff + size);

	/*
	 * If the passed-in abd has enough allocated iovecs already, reuse it.
	 * Otherwise, make a new one. The caller will free the original if the
	 * one it gets back is not the same.
	 *
	 * Note that it's ok if we reuse an abd with more iovecs than we need.
	 * abd_size has the usable amount of data, and the abd does not own the
	 * pages referenced by the iovecs. At worst, they're holding dangling
	 * pointers that we'll never use anyway.
	 */
	if (dabd == NULL || ABD_SCATTER(dabd).abd_iovcnt < iovcnt)
		dabd = abd_alloc_struct(iovcnt << ABD_PAGESHIFT);

	/* Set offset into first page in view */
	ABD_SCATTER(dabd).abd_offset = doff;

	/* Copy the wanted iovecs from the source to the dest */
	memcpy(&ABD_SCATTER(dabd).abd_iov[0],
	    &ABD_SCATTER(sabd).abd_iov[soff >> ABD_PAGESHIFT],
	    iovcnt * sizeof (struct iovec));

	return (dabd);
}
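
/*
 * A minimal caller-side sketch (not part of this change) of the reuse
 * contract above: if a different struct comes back, the original is the
 * caller's to free. Real callers, in the common abd code, also maintain
 * abd flags not shown here; borrow_view is a hypothetical helper.
 */
static abd_t *
borrow_view(abd_t *reuse, abd_t *sabd, size_t off, size_t size)
{
	abd_t *view = abd_get_offset_scatter(reuse, sabd, off, size);

	if (reuse != NULL && view != reuse)
		abd_free_struct(reuse);

	view->abd_size = size;
	return (view);
}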

void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	ASSERT(!abd_is_gang(abd));
	abd_verify(abd);
	memset(aiter, 0, sizeof (struct abd_iter));
	aiter->iter_abd = abd;
}

boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
	return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	if (abd_iter_at_end(aiter))
		return;

	aiter->iter_pos += amount;
	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
}

void
abd_iter_map(struct abd_iter *aiter)
{
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	if (abd_iter_at_end(aiter))
		return;

	if (abd_is_linear(aiter->iter_abd)) {
		aiter->iter_mapaddr =
		    ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
		aiter->iter_mapsize =
		    aiter->iter_abd->abd_size - aiter->iter_pos;
		return;
	}

	/*
	 * For scatter, we index into the appropriate iovec, and return the
	 * smaller of the amount requested, or up to the end of the page.
	 */
	size_t poff = aiter->iter_pos + ABD_SCATTER(aiter->iter_abd).abd_offset;

	ASSERT3U(poff >> ABD_PAGESHIFT, <=,
	    ABD_SCATTER(aiter->iter_abd).abd_iovcnt);
	struct iovec *iov = &ABD_SCATTER(aiter->iter_abd).
	    abd_iov[poff >> ABD_PAGESHIFT];

	aiter->iter_mapsize = MIN(ABD_PAGESIZE - (poff & ABD_PAGEMASK),
	    aiter->iter_abd->abd_size - aiter->iter_pos);
	ASSERT3U(aiter->iter_mapsize, <=, ABD_PAGESIZE);

	aiter->iter_mapaddr = iov->iov_base + (poff & ABD_PAGEMASK);
}

void
abd_iter_unmap(struct abd_iter *aiter)
{
	if (abd_iter_at_end(aiter))
		return;

	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
	ASSERT3U(aiter->iter_mapsize, >, 0);

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

void
abd_cache_reap_now(void)
{
}
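
/*
 * A minimal consumer sketch (not part of this change) of the iteration
 * pattern the functions above support: map a chunk, use it, unmap, then
 * advance by the mapped length.
 */
static uint64_t
abd_sum_bytes(abd_t *abd)
{
	struct abd_iter aiter;
	uint64_t sum = 0;

	abd_iter_init(&aiter, abd);
	while (!abd_iter_at_end(&aiter)) {
		abd_iter_map(&aiter);

		const uint8_t *p = aiter.iter_mapaddr;
		size_t n = aiter.iter_mapsize;
		for (size_t i = 0; i < n; i++)
			sum += p[i];

		abd_iter_unmap(&aiter);
		abd_iter_advance(&aiter, n);
	}
	return (sum);
}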

@ -0,0 +1,4 @@
libzpooldir = $(includedir)/libzpool
libzpool_HEADERS = \
	%D%/sys/abd_os.h \
	%D%/sys/abd_impl_os.h

@ -0,0 +1,41 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

#ifndef _ABD_IMPL_OS_H
#define _ABD_IMPL_OS_H

#ifdef __cplusplus
extern "C" {
#endif

#define abd_enter_critical(flags)	((void)0)
#define abd_exit_critical(flags)	((void)0)

#ifdef __cplusplus
}
#endif

#endif /* _ABD_IMPL_OS_H */

@ -0,0 +1,47 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
 */

#ifndef _ABD_OS_H
#define _ABD_OS_H

#ifdef __cplusplus
extern "C" {
#endif

struct abd_scatter {
	uint_t abd_offset;
	uint_t abd_iovcnt;
	struct iovec abd_iov[1]; /* actually variable-length */
};

struct abd_linear {
	void *abd_buf;
};

#ifdef __cplusplus
}
#endif

#endif /* _ABD_OS_H */

@ -175,17 +175,6 @@ Increasing this value will
result in a slower thread creation rate which may be preferable for some
configurations.
.
.It Sy spl_max_show_tasks Ns = Ns Sy 512 Pq uint
The maximum number of tasks per pending list in each taskq shown in
.Pa /proc/spl/taskq{,-all} .
Write
.Sy 0
to turn off the limit.
The proc file will walk the lists with the lock held,
so reading it could cause a lock-up if the list grows too large
without limiting the output.
"(truncated)" will be shown if the list is larger than the limit.
.
.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 5000 Pq uint
Minimum idle threads exit interval for dynamic taskqs.
Smaller values allow idle threads to exit more often and potentially be

man/man4/zfs.4

@ -77,6 +77,17 @@ the array is dynamically sized based on total system memory.
dnode slots allocated in a single operation as a power of 2.
The default value minimizes lock contention for the bulk operation performed.
.
.It Sy dmu_ddt_copies Ns = Ns Sy 3 Pq uint
Controls the number of copies stored for DeDup Table
.Pq DDT
objects.
Reducing the number of copies to 1 from the previous default of 3
can reduce the write inflation caused by deduplication.
This assumes redundancy for this data is provided by the vdev layer.
If the DDT is damaged, space may be leaked
.Pq not freed
when the DDT can not report the correct reference count.
.
.It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint
Limit the amount we can prefetch with one call to this amount in bytes.
This helps to limit the amount of memory that can be used by prefetching.

@ -121,20 +132,26 @@ Controls whether buffers present on special vdevs are eligible for caching
into L2ARC.
If set to 1, exclude dbufs on special vdevs from being cached to L2ARC.
.
.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int
.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Ns | Ns 2 Pq int
Controls whether only MFU metadata and data are cached from ARC into L2ARC.
This may be desired to avoid wasting space on L2ARC when reading/writing large
amounts of data that are not expected to be accessed more than once.
.Pp
The default is off,
The default is 0,
meaning both MRU and MFU data and metadata are cached.
When turning off this feature, some MRU buffers will still be present
in ARC and eventually cached on L2ARC.
When turning off this feature (setting it to 0), some MRU buffers will
still be present in ARC and eventually cached on L2ARC.
.No If Sy l2arc_noprefetch Ns = Ns Sy 0 ,
some prefetched buffers will be cached to L2ARC, and those might later
transition to MRU, in which case the
.Sy l2arc_mru_asize No arcstat will not be Sy 0 .
.Pp
Setting it to 1 means to L2 cache only MFU data and metadata.
.Pp
Setting it to 2 means to L2 cache all metadata (MRU+MFU) but
only MFU data (i.e., MRU data are not cached).
This can be the right setting to cache as much metadata as possible
even with high data turnover.
.Pp
Regardless of
.Sy l2arc_noprefetch ,
some MFU buffers might be evicted from ARC,

@ -821,6 +838,7 @@ This is a limit on how many pages the ARC shrinker makes available for
eviction in response to one page allocation attempt.
Note that in practice, the kernel's shrinker can ask us to evict
up to about four times this for one allocation attempt.
To reduce OOM risk, this limit is applied for kswapd reclaims only.
.Pp
The default limit of
.Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages

@ -974,6 +992,88 @@ milliseconds until the operation completes.
.It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int
Enable prefetching dedup-ed blocks which are going to be freed.
.
.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint
Maximum number of dedup log flush passes (iterations) each transaction.
.Pp
At the start of each transaction, OpenZFS will estimate how many entries it
needs to flush out to keep up with the change rate, taking the amount and time
taken to flush on previous txgs into account (see
.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
It will spread this amount into a number of passes.
At each pass, it will use the amount already flushed and the total time taken
by flushing and by other IO to recompute how much it should do for the
remainder of the txg.
.Pp
Reducing the max number of passes will make flushing more aggressive, flushing
out more entries on each pass.
This can be faster, but also more likely to compete with other IO.
Increasing the max number of passes will put fewer entries onto each pass,
keeping the overhead of dedup changes to a minimum but possibly causing a large
number of changes to be dumped on the last pass, which can blow out the txg
sync time beyond
.Sy zfs_txg_timeout .
.
.It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint
Minimum time to spend on dedup log flush each transaction.
.Pp
At least this long will be spent flushing dedup log entries each transaction,
up to
.Sy zfs_txg_timeout .
This occurs even if doing so would delay the transaction, that is, even if
other IO completes under this time.
.
.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint
Flush at least this many entries each transaction.
.Pp
OpenZFS will estimate how many entries it needs to flush each transaction to
keep up with the ingest rate (see
.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
This sets the minimum for that estimate.
Raising it can force OpenZFS to flush more aggressively, keeping the log small
and so reducing pool import times, but can make it less able to back off if
log flushing would compete with other IO too much.
.
.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint
Number of transactions to use to compute the flow rate.
.Pp
OpenZFS will estimate how many entries it needs to flush each transaction by
monitoring the number of entries changed (ingest rate), number of entries
flushed (flush rate) and time spent flushing (flush time rate) and combining
these into an overall "flow rate".
It will use an exponential weighted moving average over some number of recent
transactions to compute these rates.
This sets the number of transactions to compute these averages over.
Setting it higher can help to smooth out the flow rate in the face of spiky
workloads, but will take longer for the flow rate to adjust to a sustained
change in the ingest rate.
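As a rough sketch of the averaging described above (illustrative only, not
the exact implementation), each new per-txg sample nudges the running rate
by one part in
.Sy zfs_dedup_log_flush_flow_rate_txgs
of the difference:
.Bd -literal -offset indent
/* n_txgs = zfs_dedup_log_flush_flow_rate_txgs */
avg += (sample - avg) / n_txgs;
.Ed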
.
.It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint
Max transactions before starting to flush dedup logs.
.Pp
OpenZFS maintains two dedup logs, one receiving new changes, one flushing.
If there is nothing to flush, it will accumulate changes for no more than this
many transactions before switching the logs and starting to flush entries out.
.
.It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64
Max memory to use for dedup logs.
.Pp
OpenZFS will spend no more than this much memory on maintaining the in-memory
dedup log.
Flushing will begin when around half this amount is being spent on logs.
The default value of
.Sy 0
will cause it to be set by
.Sy zfs_dedup_log_mem_max_percent
instead.
.
.It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint
Max memory to use for dedup logs, as a percentage of total memory.
.Pp
If
.Sy zfs_dedup_log_mem_max
is not set, it will be initialised as a percentage of the total memory in the
system.
.
.It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint
Start to delay each transaction once there is this amount of dirty data,
expressed as a percentage of
@ -17,8 +17,9 @@
.\" Copyright (c) 2019, Klara Inc.
.\" Copyright (c) 2019, Allan Jude
.\" Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
.\" Copyright (c) 2023, Klara Inc.
.\"
.Dd June 23, 2022
.Dd February 14, 2024
.Dt ZPOOL-FEATURES 7
.Os
.

@ -550,6 +551,20 @@ when an encrypted dataset is created and will be returned to the
.Sy enabled
state when all datasets that use this feature are destroyed.
.
.feature com.klarasystems fast_dedup yes
This feature allows more advanced deduplication features to be enabled on new
dedup tables.
.Pp
This feature will be
.Sy active
when the first deduplicated block is written after a new dedup table is created
(i.e., after a new pool creation, or a new checksum used on a dataset with
.Sy dedup
enabled).
It will be returned to the
.Sy enabled
state when all deduplicated blocks using it are freed.
.
.feature com.delphix extensible_dataset no
This feature allows more flexible use of internal ZFS data structures,
and exists for other features to depend on.
@ -25,8 +25,10 @@
.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\" Copyright (c) 2024, Klara Inc.
.\" Copyright (c) 2024, Mateusz Piotrowski
.\"
.Dd May 31, 2021
.Dd June 21, 2023
.Dt ZPOOL-REGUID 8
.Os
.

@ -36,6 +38,7 @@
.Sh SYNOPSIS
.Nm zpool
.Cm reguid
.Op Fl g Ar guid
.Ar pool
.
.Sh DESCRIPTION

@ -43,6 +46,15 @@ Generates a new unique identifier for the pool.
You must ensure that all devices in this pool are online and healthy before
performing this action.
.
.Bl -tag -width Ds
.It Fl g Ar guid
Set the pool GUID to the provided value.
The GUID can be any 64-bit value accepted by
.Xr strtoull 3
in base 10.
.Nm
will return an error if the provided GUID is already in use.
.El
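.Pp
For example, to set an explicitly chosen GUID (the value below is purely
illustrative) on a pool named tank:
.Dl # zpool reguid -g 9365361050919357513 tank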
.Sh SEE ALSO
.Xr zpool-export 8 ,
.Xr zpool-import 8
@ -16,8 +16,8 @@ src = @abs_srcdir@
obj = @abs_builddir@
else
zfs_include = $(srctree)/include/zfs
icp_include = $(srctree)/$(src)/icp/include
zstd_include = $(srctree)/$(src)/zstd/include
icp_include = $(src)/icp/include
zstd_include = $(src)/zstd/include
ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h
endif

@ -323,6 +323,7 @@ ZFS_OBJS := \
	dbuf.o \
	dbuf_stats.o \
	ddt.o \
	ddt_log.o \
	ddt_stats.o \
	ddt_zap.o \
	dmu.o \

@ -252,6 +252,7 @@ SRCS+= abd.c \
	dbuf.c \
	dbuf_stats.c \
	ddt.c \
	ddt_log.c \
	ddt_stats.c \
	ddt_zap.c \
	dmu.c \

@ -426,6 +427,7 @@ CFLAGS.gcc+= -Wno-pointer-to-int-cast

CFLAGS.abd.c= -Wno-cast-qual
CFLAGS.ddt.c= -Wno-cast-qual
CFLAGS.ddt_log.c= -Wno-cast-qual -Wno-pointer-arith
CFLAGS.ddt_zap.c= -Wno-cast-qual
CFLAGS.dmu.c= -Wno-cast-qual
CFLAGS.dmu_traverse.c= -Wno-cast-qual
@ -95,14 +95,12 @@ struct {
 */
static size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1;

#if defined(_KERNEL)
SYSCTL_DECL(_vfs_zfs);

SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
    &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN,
    &zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations.");
#endif

kmem_cache_t *abd_chunk_cache;
static kstat_t *abd_ksp;

@ -250,7 +248,7 @@ abd_alloc_zero_scatter(void)

	n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;

	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
|
|||
error == EOPNOTSUPP)
|
||||
goto bad_locked_fallback;
|
||||
*ap->a_lenp = (size_t)len;
|
||||
#ifdef MAC
|
||||
out_locked:
|
||||
#endif
|
||||
if (invp != outvp)
|
||||
VOP_UNLOCK(invp);
|
||||
VOP_UNLOCK(outvp);
|
||||
|
|
|
@ -868,16 +868,16 @@ spl_init(void)
	if ((rc = spl_tsd_init()))
		goto out2;

	if ((rc = spl_taskq_init()))
	if ((rc = spl_proc_init()))
		goto out3;

	if ((rc = spl_kmem_cache_init()))
	if ((rc = spl_kstat_init()))
		goto out4;

	if ((rc = spl_proc_init()))
	if ((rc = spl_taskq_init()))
		goto out5;

	if ((rc = spl_kstat_init()))
	if ((rc = spl_kmem_cache_init()))
		goto out6;

	if ((rc = spl_zlib_init()))

@ -891,13 +891,13 @@ spl_init(void)
out8:
	spl_zlib_fini();
out7:
	spl_kstat_fini();
out6:
	spl_proc_fini();
out5:
	spl_kmem_cache_fini();
out4:
out6:
	spl_taskq_fini();
out5:
	spl_kstat_fini();
out4:
	spl_proc_fini();
out3:
	spl_tsd_fini();
out2:

@ -913,10 +913,10 @@ spl_fini(void)
{
	spl_zone_fini();
	spl_zlib_fini();
	spl_kstat_fini();
	spl_proc_fini();
	spl_kmem_cache_fini();
	spl_taskq_fini();
	spl_kstat_fini();
	spl_proc_fini();
	spl_tsd_fini();
	spl_kvmem_fini();
	spl_random_fini();
@ -22,13 +22,15 @@
 *
 * Solaris Porting Layer (SPL) Proc Implementation.
 */
/*
 * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
 */

#include <sys/systeminfo.h>
#include <sys/kstat.h>
#include <sys/kmem.h>
#include <sys/kmem_cache.h>
#include <sys/vmem.h>
#include <sys/taskq.h>
#include <sys/proc.h>
#include <linux/ctype.h>
#include <linux/kmod.h>

@ -43,6 +45,12 @@ typedef struct ctl_table __no_const spl_ctl_table;
typedef struct ctl_table spl_ctl_table;
#endif

#ifdef HAVE_PROC_HANDLER_CTL_TABLE_CONST
#define CONST_CTL_TABLE const struct ctl_table
#else
#define CONST_CTL_TABLE struct ctl_table
#endif

static unsigned long table_min = 0;
static unsigned long table_max = ~0;

@ -54,13 +62,11 @@ static struct ctl_table_header *spl_kstat = NULL;
static struct proc_dir_entry *proc_spl = NULL;
static struct proc_dir_entry *proc_spl_kmem = NULL;
static struct proc_dir_entry *proc_spl_kmem_slab = NULL;
static struct proc_dir_entry *proc_spl_taskq_all = NULL;
static struct proc_dir_entry *proc_spl_taskq = NULL;
struct proc_dir_entry *proc_spl_kstat = NULL;

#ifdef DEBUG_KMEM
static int
proc_domemused(struct ctl_table *table, int write,
proc_domemused(CONST_CTL_TABLE *table, int write,
    void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int rc = 0;

@ -88,7 +94,7 @@ proc_domemused(struct ctl_table *table, int write,
#endif /* DEBUG_KMEM */

static int
proc_doslab(struct ctl_table *table, int write,
proc_doslab(CONST_CTL_TABLE *table, int write,
    void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int rc = 0;

@ -135,7 +141,7 @@ proc_doslab(struct ctl_table *table, int write,
}

static int
proc_dohostid(struct ctl_table *table, int write,
proc_dohostid(CONST_CTL_TABLE *table, int write,
    void __user *buffer, size_t *lenp, loff_t *ppos)
{
	char *end, str[32];

@ -168,195 +174,6 @@ proc_dohostid(struct ctl_table *table, int write,
	return (0);
}

static void
taskq_seq_show_headers(struct seq_file *f)
{
	seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n",
	    "taskq", "act", "nthr", "spwn", "maxt", "pri",
	    "mina", "maxa", "cura", "flags");
}

/* indices into the lheads array below */
#define LHEAD_PEND 0
#define LHEAD_PRIO 1
#define LHEAD_DELAY 2
#define LHEAD_WAIT 3
#define LHEAD_ACTIVE 4
#define LHEAD_SIZE 5

static unsigned int spl_max_show_tasks = 512;
/* CSTYLED */
module_param(spl_max_show_tasks, uint, 0644);
MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc");

static int
taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag)
{
	taskq_t *tq = p;
	taskq_thread_t *tqt = NULL;
	spl_wait_queue_entry_t *wq;
	struct task_struct *tsk;
	taskq_ent_t *tqe;
	char name[100];
	struct list_head *lheads[LHEAD_SIZE], *lh;
	static char *list_names[LHEAD_SIZE] =
	    {"pend", "prio", "delay", "wait", "active" };
	int i, j, have_lheads = 0;
	unsigned long wflags, flags;

	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
	spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags);

	/* get the various lists and check whether they're empty */
	lheads[LHEAD_PEND] = &tq->tq_pend_list;
	lheads[LHEAD_PRIO] = &tq->tq_prio_list;
	lheads[LHEAD_DELAY] = &tq->tq_delay_list;
#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
	lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head;
#else
	lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list;
#endif
	lheads[LHEAD_ACTIVE] = &tq->tq_active_list;

	for (i = 0; i < LHEAD_SIZE; ++i) {
		if (list_empty(lheads[i]))
			lheads[i] = NULL;
		else
			++have_lheads;
	}

	/* early return in non-"all" mode if lists are all empty */
	if (!allflag && !have_lheads) {
		spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
		spin_unlock_irqrestore(&tq->tq_lock, flags);
		return (0);
	}

	/* unlock the waitq quickly */
	if (!lheads[LHEAD_WAIT])
		spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);

	/* show the base taskq contents */
	snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance);
	seq_printf(f, "%-25s ", name);
	seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n",
	    tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn,
	    tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc,
	    tq->tq_nalloc, tq->tq_flags);

	/* show the active list */
	if (lheads[LHEAD_ACTIVE]) {
		j = 0;
		list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) {
			if (j == 0)
				seq_printf(f, "\t%s:",
				    list_names[LHEAD_ACTIVE]);
			else if (j == 2) {
				seq_printf(f, "\n\t ");
				j = 0;
			}
			seq_printf(f, " [%d]%pf(%ps)",
			    tqt->tqt_thread->pid,
			    tqt->tqt_task->tqent_func,
			    tqt->tqt_task->tqent_arg);
			++j;
		}
		seq_printf(f, "\n");
	}

	for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i)
		if (lheads[i]) {
			j = 0;
			list_for_each(lh, lheads[i]) {
				if (spl_max_show_tasks != 0 &&
				    j >= spl_max_show_tasks) {
					seq_printf(f, "\n\t(truncated)");
					break;
				}
				/* show the wait waitq list */
				if (i == LHEAD_WAIT) {
#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
					wq = list_entry(lh,
					    spl_wait_queue_entry_t, entry);
#else
					wq = list_entry(lh,
					    spl_wait_queue_entry_t, task_list);
#endif
					if (j == 0)
						seq_printf(f, "\t%s:",
						    list_names[i]);
					else if (j % 8 == 0)
						seq_printf(f, "\n\t ");

					tsk = wq->private;
					seq_printf(f, " %d", tsk->pid);
				/* pend, prio and delay lists */
				} else {
					tqe = list_entry(lh, taskq_ent_t,
					    tqent_list);
					if (j == 0)
						seq_printf(f, "\t%s:",
						    list_names[i]);
					else if (j % 2 == 0)
						seq_printf(f, "\n\t ");

					seq_printf(f, " %pf(%ps)",
					    tqe->tqent_func,
					    tqe->tqent_arg);
				}
				++j;
			}
			seq_printf(f, "\n");
		}
	if (lheads[LHEAD_WAIT])
		spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
	spin_unlock_irqrestore(&tq->tq_lock, flags);

	return (0);
}

static int
taskq_all_seq_show(struct seq_file *f, void *p)
{
	return (taskq_seq_show_impl(f, p, B_TRUE));
}

static int
taskq_seq_show(struct seq_file *f, void *p)
{
	return (taskq_seq_show_impl(f, p, B_FALSE));
}

static void *
taskq_seq_start(struct seq_file *f, loff_t *pos)
{
	struct list_head *p;
	loff_t n = *pos;

	down_read(&tq_list_sem);
	if (!n)
		taskq_seq_show_headers(f);

	p = tq_list.next;
	while (n--) {
		p = p->next;
		if (p == &tq_list)
			return (NULL);
	}

	return (list_entry(p, taskq_t, tq_taskqs));
}

static void *
taskq_seq_next(struct seq_file *f, void *p, loff_t *pos)
{
	taskq_t *tq = p;

	++*pos;
	return ((tq->tq_taskqs.next == &tq_list) ?
	    NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs));
}

static void
slab_seq_show_headers(struct seq_file *f)
{

@ -492,66 +309,6 @@ static const kstat_proc_op_t proc_slab_operations = {
#endif
};

static void
taskq_seq_stop(struct seq_file *f, void *v)
{
	up_read(&tq_list_sem);
}

static const struct seq_operations taskq_all_seq_ops = {
	.show = taskq_all_seq_show,
	.start = taskq_seq_start,
	.next = taskq_seq_next,
	.stop = taskq_seq_stop,
};

static const struct seq_operations taskq_seq_ops = {
	.show = taskq_seq_show,
	.start = taskq_seq_start,
	.next = taskq_seq_next,
	.stop = taskq_seq_stop,
};

static int
proc_taskq_all_open(struct inode *inode, struct file *filp)
{
	return (seq_open(filp, &taskq_all_seq_ops));
}

static int
proc_taskq_open(struct inode *inode, struct file *filp)
{
	return (seq_open(filp, &taskq_seq_ops));
}

static const kstat_proc_op_t proc_taskq_all_operations = {
#ifdef HAVE_PROC_OPS_STRUCT
	.proc_open = proc_taskq_all_open,
	.proc_read = seq_read,
	.proc_lseek = seq_lseek,
	.proc_release = seq_release,
#else
	.open = proc_taskq_all_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
#endif
};

static const kstat_proc_op_t proc_taskq_operations = {
#ifdef HAVE_PROC_OPS_STRUCT
	.proc_open = proc_taskq_open,
	.proc_read = seq_read,
	.proc_lseek = seq_lseek,
	.proc_release = seq_release,
#else
	.open = proc_taskq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
#endif
};

static struct ctl_table spl_kmem_table[] = {
#ifdef DEBUG_KMEM
	{

@ -668,8 +425,6 @@ static void spl_proc_cleanup(void)
	remove_proc_entry("kstat", proc_spl);
	remove_proc_entry("slab", proc_spl_kmem);
	remove_proc_entry("kmem", proc_spl);
	remove_proc_entry("taskq-all", proc_spl);
	remove_proc_entry("taskq", proc_spl);
	remove_proc_entry("spl", NULL);

#ifndef HAVE_REGISTER_SYSCTL_TABLE

@ -688,6 +443,37 @@ static void spl_proc_cleanup(void)
	}
}

#ifndef HAVE_REGISTER_SYSCTL_TABLE

/*
 * Traditionally, struct ctl_table arrays have been terminated by an "empty"
 * sentinel element (specifically, one with .procname == NULL).
 *
 * Linux 6.6 began migrating away from this, adding register_sysctl_sz() so
 * that callers could provide the size directly, and redefining
 * register_sysctl() to just call register_sysctl_sz() with the array size. It
 * retained support for the terminating element so that existing callers would
 * continue to work.
 *
 * Linux 6.11 removed support for the terminating element, instead interpreting
 * it as a real malformed element, and rejecting it.
 *
 * In order to continue supporting older kernels, we retain the terminating
 * sentinel element for our sysctl tables, but instead detect availability of
 * register_sysctl_sz(). If it exists, we pass it the array size -1, stopping
 * the kernel from trying to process the terminator. For pre-6.6 kernels that
 * don't have register_sysctl_sz(), we just use register_sysctl(), which can
 * handle the terminating element as it always has.
 */
#ifdef HAVE_REGISTER_SYSCTL_SZ
#define spl_proc_register_sysctl(p, t) \
	register_sysctl_sz(p, t, ARRAY_SIZE(t)-1)
#else
#define spl_proc_register_sysctl(p, t) \
	register_sysctl(p, t)
#endif
#endif
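
/*
 * Illustrative only, not part of this change: a table shaped the
 * traditional way. With register_sysctl_sz() available, ARRAY_SIZE(t)-1
 * registers only the one real entry, and the sentinel below is never
 * inspected by the kernel.
 */
static struct ctl_table example_table[] = {
	{
		.procname = "example",
		.mode = 0444,
	},
	{ },	/* terminating sentinel, skipped via ARRAY_SIZE-1 */
};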

int
spl_proc_init(void)
{

@ -698,16 +484,17 @@ spl_proc_init(void)
	if (spl_header == NULL)
		return (-EUNATCH);
#else
	spl_header = register_sysctl("kernel/spl", spl_table);
	spl_header = spl_proc_register_sysctl("kernel/spl", spl_table);
	if (spl_header == NULL)
		return (-EUNATCH);

	spl_kmem = register_sysctl("kernel/spl/kmem", spl_kmem_table);
	spl_kmem = spl_proc_register_sysctl("kernel/spl/kmem", spl_kmem_table);
	if (spl_kmem == NULL) {
		rc = -EUNATCH;
		goto out;
	}
	spl_kstat = register_sysctl("kernel/spl/kstat", spl_kstat_table);
	spl_kstat = spl_proc_register_sysctl("kernel/spl/kstat",
	    spl_kstat_table);
	if (spl_kstat == NULL) {
		rc = -EUNATCH;
		goto out;

@ -720,20 +507,6 @@ spl_proc_init(void)
		goto out;
	}

	proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl,
	    &proc_taskq_all_operations, NULL);
	if (proc_spl_taskq_all == NULL) {
		rc = -EUNATCH;
		goto out;
	}

	proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl,
	    &proc_taskq_operations, NULL);
	if (proc_spl_taskq == NULL) {
		rc = -EUNATCH;
		goto out;
	}

	proc_spl_kmem = proc_mkdir("kmem", proc_spl);
	if (proc_spl_kmem == NULL) {
		rc = -EUNATCH;
@ -22,16 +22,98 @@
 *
 * Solaris Porting Layer (SPL) Task Queue Implementation.
 */
/*
 * Copyright (c) 2024, Klara Inc.
 * Copyright (c) 2024, Syneto
 */

#include <sys/timer.h>
#include <sys/taskq.h>
#include <sys/kmem.h>
#include <sys/tsd.h>
#include <sys/trace_spl.h>
#include <sys/time.h>
#include <sys/atomic.h>
#include <sys/kstat.h>
#ifdef HAVE_CPU_HOTPLUG
#include <linux/cpuhotplug.h>
#endif

typedef struct taskq_kstats {
	/* static values, for completeness */
	kstat_named_t tqks_threads_max;
	kstat_named_t tqks_entry_pool_min;
	kstat_named_t tqks_entry_pool_max;

	/* gauges (inc/dec counters, current value) */
	kstat_named_t tqks_threads_active;
	kstat_named_t tqks_threads_idle;
	kstat_named_t tqks_threads_total;
	kstat_named_t tqks_tasks_pending;
	kstat_named_t tqks_tasks_priority;
	kstat_named_t tqks_tasks_total;
	kstat_named_t tqks_tasks_delayed;
	kstat_named_t tqks_entries_free;

	/* counters (inc only, since taskq creation) */
	kstat_named_t tqks_threads_created;
	kstat_named_t tqks_threads_destroyed;
	kstat_named_t tqks_tasks_dispatched;
	kstat_named_t tqks_tasks_dispatched_delayed;
	kstat_named_t tqks_tasks_executed_normal;
	kstat_named_t tqks_tasks_executed_priority;
	kstat_named_t tqks_tasks_executed;
	kstat_named_t tqks_tasks_delayed_requeued;
	kstat_named_t tqks_tasks_cancelled;
	kstat_named_t tqks_thread_wakeups;
	kstat_named_t tqks_thread_wakeups_nowork;
	kstat_named_t tqks_thread_sleeps;
} taskq_kstats_t;

static taskq_kstats_t taskq_kstats_template = {
	{ "threads_max", KSTAT_DATA_UINT64 },
	{ "entry_pool_min", KSTAT_DATA_UINT64 },
	{ "entry_pool_max", KSTAT_DATA_UINT64 },
	{ "threads_active", KSTAT_DATA_UINT64 },
	{ "threads_idle", KSTAT_DATA_UINT64 },
	{ "threads_total", KSTAT_DATA_UINT64 },
	{ "tasks_pending", KSTAT_DATA_UINT64 },
	{ "tasks_priority", KSTAT_DATA_UINT64 },
	{ "tasks_total", KSTAT_DATA_UINT64 },
	{ "tasks_delayed", KSTAT_DATA_UINT64 },
	{ "entries_free", KSTAT_DATA_UINT64 },

	{ "threads_created", KSTAT_DATA_UINT64 },
	{ "threads_destroyed", KSTAT_DATA_UINT64 },
	{ "tasks_dispatched", KSTAT_DATA_UINT64 },
	{ "tasks_dispatched_delayed", KSTAT_DATA_UINT64 },
	{ "tasks_executed_normal", KSTAT_DATA_UINT64 },
	{ "tasks_executed_priority", KSTAT_DATA_UINT64 },
	{ "tasks_executed", KSTAT_DATA_UINT64 },
	{ "tasks_delayed_requeued", KSTAT_DATA_UINT64 },
	{ "tasks_cancelled", KSTAT_DATA_UINT64 },
	{ "thread_wakeups", KSTAT_DATA_UINT64 },
	{ "thread_wakeups_nowork", KSTAT_DATA_UINT64 },
	{ "thread_sleeps", KSTAT_DATA_UINT64 },
};
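
/*
 * Each taskq publishes these counters through the "taskq" kstat module;
 * on Linux they surface as /proc/spl/kstat/taskq/<name>.<instance>, e.g.
 * /proc/spl/kstat/taskq/z_null_iss.0 (taskq name illustrative).
 */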

#define TQSTAT_INC(tq, stat)	wmsum_add(&tq->tq_sums.tqs_##stat, 1)
#define TQSTAT_DEC(tq, stat)	wmsum_add(&tq->tq_sums.tqs_##stat, -1)

#define _TQSTAT_MOD_LIST(mod, tq, t) do { \
	switch (t->tqent_flags & TQENT_LIST_MASK) { \
	case TQENT_LIST_NONE: ASSERT(list_empty(&t->tqent_list)); break;\
	case TQENT_LIST_PENDING: mod(tq, tasks_pending); break; \
	case TQENT_LIST_PRIORITY: mod(tq, tasks_priority); break; \
	case TQENT_LIST_DELAY: mod(tq, tasks_delayed); break; \
	} \
} while (0)
#define TQSTAT_INC_LIST(tq, t)	_TQSTAT_MOD_LIST(TQSTAT_INC, tq, t)
#define TQSTAT_DEC_LIST(tq, t)	_TQSTAT_MOD_LIST(TQSTAT_DEC, tq, t)

#define TQENT_SET_LIST(t, l) \
	t->tqent_flags = (t->tqent_flags & ~TQENT_LIST_MASK) | l;

static int spl_taskq_thread_bind = 0;
module_param(spl_taskq_thread_bind, int, 0644);
MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");

@ -134,6 +216,7 @@ retry:
	ASSERT(!timer_pending(&t->tqent_timer));

	list_del_init(&t->tqent_list);
	TQSTAT_DEC(tq, entries_free);
	return (t);
}

@ -204,12 +287,11 @@ task_done(taskq_t *tq, taskq_ent_t *t)
{
	ASSERT(tq);
	ASSERT(t);
	ASSERT(list_empty(&t->tqent_list));

	/* Wake tasks blocked in taskq_wait_id() */
	wake_up_all(&t->tqent_waitq);

	list_del_init(&t->tqent_list);

	if (tq->tq_nalloc <= tq->tq_minalloc) {
		t->tqent_id = TASKQID_INVALID;
		t->tqent_func = NULL;

@ -217,6 +299,7 @@ task_done(taskq_t *tq, taskq_ent_t *t)
		t->tqent_flags = 0;

		list_add_tail(&t->tqent_list, &tq->tq_free_list);
		TQSTAT_INC(tq, entries_free);
	} else {
		task_free(tq, t);
	}

@ -263,6 +346,8 @@ task_expire_impl(taskq_ent_t *t)
	spin_unlock_irqrestore(&tq->tq_lock, flags);

	wake_up(&tq->tq_work_waitq);

	TQSTAT_INC(tq, tasks_delayed_requeued);
}

static void

@ -534,7 +619,11 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id)
	t = taskq_find(tq, id);
	if (t && t != ERR_PTR(-EBUSY)) {
		list_del_init(&t->tqent_list);
		TQSTAT_DEC_LIST(tq, t);
		TQSTAT_DEC(tq, tasks_total);

		t->tqent_flags |= TQENT_FLAG_CANCEL;
		TQSTAT_INC(tq, tasks_cancelled);

		/*
		 * When canceling the lowest outstanding task id we

@ -604,13 +693,19 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
	spin_lock(&t->tqent_lock);

	/* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
	if (flags & TQ_NOQUEUE)
	if (flags & TQ_NOQUEUE) {
		TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
		list_add(&t->tqent_list, &tq->tq_prio_list);
	/* Queue to the priority list instead of the pending list */
	else if (flags & TQ_FRONT)
	} else if (flags & TQ_FRONT) {
		TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
		list_add_tail(&t->tqent_list, &tq->tq_prio_list);
	else
	} else {
		TQENT_SET_LIST(t, TQENT_LIST_PENDING);
		list_add_tail(&t->tqent_list, &tq->tq_pend_list);
	}
	TQSTAT_INC_LIST(tq, t);
	TQSTAT_INC(tq, tasks_total);

	t->tqent_id = rc = tq->tq_next_id;
	tq->tq_next_id++;

@ -629,6 +724,8 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)

	wake_up(&tq->tq_work_waitq);

	TQSTAT_INC(tq, tasks_dispatched);

	/* Spawn additional taskq threads if required. */
	if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
		(void) taskq_thread_spawn(tq);

@ -662,6 +759,9 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,

	/* Queue to the delay list for subsequent execution */
	list_add_tail(&t->tqent_list, &tq->tq_delay_list);
	TQENT_SET_LIST(t, TQENT_LIST_DELAY);
	TQSTAT_INC_LIST(tq, t);
	TQSTAT_INC(tq, tasks_total);

	t->tqent_id = rc = tq->tq_next_id;
	tq->tq_next_id++;

@ -676,6 +776,8 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,

	spin_unlock(&t->tqent_lock);

	TQSTAT_INC(tq, tasks_dispatched_delayed);

	/* Spawn additional taskq threads if required. */
	if (tq->tq_nactive == tq->tq_nthreads)
		(void) taskq_thread_spawn(tq);

@ -724,10 +826,15 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
	t->tqent_flags |= TQENT_FLAG_PREALLOC;

	/* Queue to the priority list instead of the pending list */
	if (flags & TQ_FRONT)
	if (flags & TQ_FRONT) {
		TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
		list_add_tail(&t->tqent_list, &tq->tq_prio_list);
	else
	} else {
		TQENT_SET_LIST(t, TQENT_LIST_PENDING);
		list_add_tail(&t->tqent_list, &tq->tq_pend_list);
	}
	TQSTAT_INC_LIST(tq, t);
	TQSTAT_INC(tq, tasks_total);

	t->tqent_id = tq->tq_next_id;
	tq->tq_next_id++;

@ -742,6 +849,8 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,

	wake_up(&tq->tq_work_waitq);

	TQSTAT_INC(tq, tasks_dispatched);

	/* Spawn additional taskq threads if required. */
	if (tq->tq_nactive == tq->tq_nthreads)
		(void) taskq_thread_spawn(tq);

@ -908,6 +1017,8 @@ taskq_thread(void *args)
	wake_up(&tq->tq_wait_waitq);
	set_current_state(TASK_INTERRUPTIBLE);

	TQSTAT_INC(tq, threads_total);

	while (!kthread_should_stop()) {

		if (list_empty(&tq->tq_pend_list) &&

@ -919,9 +1030,15 @@ taskq_thread(void *args)
			add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
			spin_unlock_irqrestore(&tq->tq_lock, flags);

			TQSTAT_INC(tq, thread_sleeps);
			TQSTAT_INC(tq, threads_idle);

			schedule();
			seq_tasks = 0;

			TQSTAT_DEC(tq, threads_idle);
			TQSTAT_INC(tq, thread_wakeups);

			spin_lock_irqsave_nested(&tq->tq_lock, flags,
			    tq->tq_lock_class);
			remove_wait_queue(&tq->tq_work_waitq, &wait);

@ -931,6 +1048,8 @@ taskq_thread(void *args)

		if ((t = taskq_next_ent(tq)) != NULL) {
			list_del_init(&t->tqent_list);
			TQSTAT_DEC_LIST(tq, t);
			TQSTAT_DEC(tq, tasks_total);

			/*
			 * A TQENT_FLAG_PREALLOC task may be reused or freed

@ -955,6 +1074,7 @@ taskq_thread(void *args)
			tq->tq_nactive++;
			spin_unlock_irqrestore(&tq->tq_lock, flags);

			TQSTAT_INC(tq, threads_active);
			DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t);

			/* Perform the requested task */

@ -962,8 +1082,17 @@ taskq_thread(void *args)

			DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t);

			TQSTAT_DEC(tq, threads_active);
			if ((t->tqent_flags & TQENT_LIST_MASK) ==
			    TQENT_LIST_PENDING)
				TQSTAT_INC(tq, tasks_executed_normal);
			else
				TQSTAT_INC(tq, tasks_executed_priority);
			TQSTAT_INC(tq, tasks_executed);

			spin_lock_irqsave_nested(&tq->tq_lock, flags,
			    tq->tq_lock_class);

			tq->tq_nactive--;
			list_del_init(&tqt->tqt_active_list);
			tqt->tqt_task = NULL;

@ -989,7 +1118,8 @@ taskq_thread(void *args)
				tqt->tqt_id = TASKQID_INVALID;
				tqt->tqt_flags = 0;
				wake_up_all(&tq->tq_wait_waitq);
			}
		} else
			TQSTAT_INC(tq, thread_wakeups_nowork);

		set_current_state(TASK_INTERRUPTIBLE);

@ -998,6 +1128,10 @@ taskq_thread(void *args)
	__set_current_state(TASK_RUNNING);
	tq->tq_nthreads--;
	list_del_init(&tqt->tqt_thread_list);

	TQSTAT_DEC(tq, threads_total);
	TQSTAT_INC(tq, threads_destroyed);

error:
	kmem_free(tqt, sizeof (taskq_thread_t));
	spin_unlock_irqrestore(&tq->tq_lock, flags);

@ -1037,9 +1171,156 @@ taskq_thread_create(taskq_t *tq)

	wake_up_process(tqt->tqt_thread);

	TQSTAT_INC(tq, threads_created);

	return (tqt);
}

static void
taskq_stats_init(taskq_t *tq)
{
	taskq_sums_t *tqs = &tq->tq_sums;
	wmsum_init(&tqs->tqs_threads_active, 0);
	wmsum_init(&tqs->tqs_threads_idle, 0);
	wmsum_init(&tqs->tqs_threads_total, 0);
	wmsum_init(&tqs->tqs_tasks_pending, 0);
	wmsum_init(&tqs->tqs_tasks_priority, 0);
	wmsum_init(&tqs->tqs_tasks_total, 0);
	wmsum_init(&tqs->tqs_tasks_delayed, 0);
	wmsum_init(&tqs->tqs_entries_free, 0);
	wmsum_init(&tqs->tqs_threads_created, 0);
	wmsum_init(&tqs->tqs_threads_destroyed, 0);
	wmsum_init(&tqs->tqs_tasks_dispatched, 0);
	wmsum_init(&tqs->tqs_tasks_dispatched_delayed, 0);
	wmsum_init(&tqs->tqs_tasks_executed_normal, 0);
	wmsum_init(&tqs->tqs_tasks_executed_priority, 0);
	wmsum_init(&tqs->tqs_tasks_executed, 0);
	wmsum_init(&tqs->tqs_tasks_delayed_requeued, 0);
	wmsum_init(&tqs->tqs_tasks_cancelled, 0);
	wmsum_init(&tqs->tqs_thread_wakeups, 0);
	wmsum_init(&tqs->tqs_thread_wakeups_nowork, 0);
	wmsum_init(&tqs->tqs_thread_sleeps, 0);
}

static void
taskq_stats_fini(taskq_t *tq)
{
	taskq_sums_t *tqs = &tq->tq_sums;
	wmsum_fini(&tqs->tqs_threads_active);
	wmsum_fini(&tqs->tqs_threads_idle);
	wmsum_fini(&tqs->tqs_threads_total);
	wmsum_fini(&tqs->tqs_tasks_pending);
	wmsum_fini(&tqs->tqs_tasks_priority);
	wmsum_fini(&tqs->tqs_tasks_total);
	wmsum_fini(&tqs->tqs_tasks_delayed);
	wmsum_fini(&tqs->tqs_entries_free);
	wmsum_fini(&tqs->tqs_threads_created);
	wmsum_fini(&tqs->tqs_threads_destroyed);
	wmsum_fini(&tqs->tqs_tasks_dispatched);
	wmsum_fini(&tqs->tqs_tasks_dispatched_delayed);
	wmsum_fini(&tqs->tqs_tasks_executed_normal);
	wmsum_fini(&tqs->tqs_tasks_executed_priority);
	wmsum_fini(&tqs->tqs_tasks_executed);
	wmsum_fini(&tqs->tqs_tasks_delayed_requeued);
	wmsum_fini(&tqs->tqs_tasks_cancelled);
	wmsum_fini(&tqs->tqs_thread_wakeups);
	wmsum_fini(&tqs->tqs_thread_wakeups_nowork);
	wmsum_fini(&tqs->tqs_thread_sleeps);
}

static int
taskq_kstats_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);

	taskq_t *tq = ksp->ks_private;
	taskq_kstats_t *tqks = ksp->ks_data;

	tqks->tqks_threads_max.value.ui64 = tq->tq_maxthreads;
	tqks->tqks_entry_pool_min.value.ui64 = tq->tq_minalloc;
	tqks->tqks_entry_pool_max.value.ui64 = tq->tq_maxalloc;

	taskq_sums_t *tqs = &tq->tq_sums;

	tqks->tqks_threads_active.value.ui64 =
	    wmsum_value(&tqs->tqs_threads_active);
	tqks->tqks_threads_idle.value.ui64 =
	    wmsum_value(&tqs->tqs_threads_idle);
	tqks->tqks_threads_total.value.ui64 =
	    wmsum_value(&tqs->tqs_threads_total);
	tqks->tqks_tasks_pending.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_pending);
	tqks->tqks_tasks_priority.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_priority);
	tqks->tqks_tasks_total.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_total);
	tqks->tqks_tasks_delayed.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_delayed);
	tqks->tqks_entries_free.value.ui64 =
	    wmsum_value(&tqs->tqs_entries_free);
	tqks->tqks_threads_created.value.ui64 =
	    wmsum_value(&tqs->tqs_threads_created);
	tqks->tqks_threads_destroyed.value.ui64 =
	    wmsum_value(&tqs->tqs_threads_destroyed);
	tqks->tqks_tasks_dispatched.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_dispatched);
	tqks->tqks_tasks_dispatched_delayed.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_dispatched_delayed);
	tqks->tqks_tasks_executed_normal.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_executed_normal);
	tqks->tqks_tasks_executed_priority.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_executed_priority);
	tqks->tqks_tasks_executed.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_executed);
	tqks->tqks_tasks_delayed_requeued.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_delayed_requeued);
	tqks->tqks_tasks_cancelled.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_cancelled);
	tqks->tqks_thread_wakeups.value.ui64 =
	    wmsum_value(&tqs->tqs_thread_wakeups);
	tqks->tqks_thread_wakeups_nowork.value.ui64 =
	    wmsum_value(&tqs->tqs_thread_wakeups_nowork);
	tqks->tqks_thread_sleeps.value.ui64 =
	    wmsum_value(&tqs->tqs_thread_sleeps);

	return (0);
}

static void
taskq_kstats_init(taskq_t *tq)
{
	char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
	snprintf(name, sizeof (name), "%s.%d", tq->tq_name, tq->tq_instance);

	kstat_t *ksp = kstat_create("taskq", 0, name, "misc",
	    KSTAT_TYPE_NAMED, sizeof (taskq_kstats_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (ksp == NULL)
		return;

	ksp->ks_private = tq;
	ksp->ks_update = taskq_kstats_update;
	ksp->ks_data = kmem_alloc(sizeof (taskq_kstats_t), KM_SLEEP);
	memcpy(ksp->ks_data, &taskq_kstats_template, sizeof (taskq_kstats_t));
	kstat_install(ksp);

	tq->tq_ksp = ksp;
}

static void
taskq_kstats_fini(taskq_t *tq)
{
	if (tq->tq_ksp == NULL)
		return;

	kmem_free(tq->tq_ksp->ks_data, sizeof (taskq_kstats_t));
	kstat_delete(tq->tq_ksp);

	tq->tq_ksp = NULL;
}

taskq_t *
taskq_create(const char *name, int threads_arg, pri_t pri,
    int minalloc, int maxalloc, uint_t flags)

@ -1104,6 +1385,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
	init_waitqueue_head(&tq->tq_wait_waitq);
	tq->tq_lock_class = TQ_LOCK_GENERAL;
	INIT_LIST_HEAD(&tq->tq_taskqs);
	taskq_stats_init(tq);

	if (flags & TASKQ_PREPOPULATE) {
		spin_lock_irqsave_nested(&tq->tq_lock, irqflags,

@ -1137,14 +1419,17 @@ taskq_create(const char *name, int threads_arg, pri_t pri,

	if (rc) {
		taskq_destroy(tq);
		tq = NULL;
	} else {
		down_write(&tq_list_sem);
		tq->tq_instance = taskq_find_by_name(name) + 1;
		list_add_tail(&tq->tq_taskqs, &tq_list);
		up_write(&tq_list_sem);
		return (NULL);
	}

	down_write(&tq_list_sem);
	tq->tq_instance = taskq_find_by_name(name) + 1;
	list_add_tail(&tq->tq_taskqs, &tq_list);
	up_write(&tq_list_sem);

	/* Install kstats late, because the name includes tq_instance */
	taskq_kstats_init(tq);

	return (tq);
}
EXPORT_SYMBOL(taskq_create);

@ -1177,6 +1462,8 @@ taskq_destroy(taskq_t *tq)

	taskq_wait(tq);

	taskq_kstats_fini(tq);

	/* remove taskq from global list used by the kstats */
	down_write(&tq_list_sem);
	list_del(&tq->tq_taskqs);

@ -1230,6 +1517,7 @@ taskq_destroy(taskq_t *tq)

	spin_unlock_irqrestore(&tq->tq_lock, flags);

	taskq_stats_fini(tq);
	kmem_strfree(tq->tq_name);
	kmem_free(tq, sizeof (taskq_t));
}

@ -1271,6 +1559,100 @@ taskq_create_synced(const char *name, int nthreads, pri_t pri,
}
EXPORT_SYMBOL(taskq_create_synced);

static kstat_t *taskq_summary_ksp = NULL;

static int
spl_taskq_kstat_headers(char *buf, size_t size)
{
	size_t n = snprintf(buf, size,
	    "%-20s | %-17s | %-23s\n"
	    "%-20s | %-17s | %-23s\n"
	    "%-20s | %-17s | %-23s\n",
	    "", "threads", "tasks on queue",
	    "taskq name", "tot [act idl] max", " pend [ norm high] dly",
	    "--------------------", "-----------------",
	    "-----------------------");
	return (n >= size ? ENOMEM : 0);
}
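
/*
 * The summary produced reads roughly like this (values illustrative):
 *
 *                      | threads           | tasks on queue
 * taskq name           | tot [act idl] max |  pend [ norm high] dly
 * -------------------- | ----------------- | -----------------------
 * z_wr_iss.0           |   6 [  2   4]  12 |     3 [    3     0]   0
 */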

static int
spl_taskq_kstat_data(char *buf, size_t size, void *data)
{
	struct list_head *tql = NULL;
	taskq_t *tq;
	char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
	char threads[25];
	char tasks[30];
	size_t n;
	int err = 0;

	down_read(&tq_list_sem);
	list_for_each_prev(tql, &tq_list) {
		tq = list_entry(tql, taskq_t, tq_taskqs);

		mutex_enter(tq->tq_ksp->ks_lock);
		taskq_kstats_update(tq->tq_ksp, KSTAT_READ);
		taskq_kstats_t *tqks = tq->tq_ksp->ks_data;

		snprintf(name, sizeof (name), "%s.%d", tq->tq_name,
		    tq->tq_instance);
		snprintf(threads, sizeof (threads), "%3llu [%3llu %3llu] %3llu",
		    tqks->tqks_threads_total.value.ui64,
		    tqks->tqks_threads_active.value.ui64,
		    tqks->tqks_threads_idle.value.ui64,
		    tqks->tqks_threads_max.value.ui64);
		snprintf(tasks, sizeof (tasks), "%5llu [%5llu %5llu] %3llu",
		    tqks->tqks_tasks_total.value.ui64,
		    tqks->tqks_tasks_pending.value.ui64,
		    tqks->tqks_tasks_priority.value.ui64,
		    tqks->tqks_tasks_delayed.value.ui64);

		mutex_exit(tq->tq_ksp->ks_lock);

		n = snprintf(buf, size, "%-20s | %-17s | %-23s\n",
		    name, threads, tasks);
		if (n >= size) {
			err = ENOMEM;
			break;
		}

		buf = &buf[n];
		size -= n;
	}

	up_read(&tq_list_sem);

	return (err);
}

static void
spl_taskq_kstat_init(void)
{
	kstat_t *ksp = kstat_create("taskq", 0, "summary", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);

	if (ksp == NULL)
		return;

	ksp->ks_data = (void *)(uintptr_t)1;
	ksp->ks_ndata = 1;
	kstat_set_raw_ops(ksp, spl_taskq_kstat_headers,
	    spl_taskq_kstat_data, NULL);
	kstat_install(ksp);

	taskq_summary_ksp = ksp;
}

static void
spl_taskq_kstat_fini(void)
{
	if (taskq_summary_ksp == NULL)
		return;

	kstat_delete(taskq_summary_ksp);
	taskq_summary_ksp = NULL;
}

static unsigned int spl_taskq_kick = 0;

/*

@ -1451,12 +1833,16 @@ spl_taskq_init(void)
	 */
	dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;

	spl_taskq_kstat_init();

	return (0);
}

void
spl_taskq_fini(void)
{
	spl_taskq_kstat_fini();

	taskq_destroy(dynamic_taskq);
	dynamic_taskq = NULL;
|
|||
|
||||
schedule();
|
||||
#endif
|
||||
/*
|
||||
* Dequeued SIGSTOP/SIGTSTP.
|
||||
* Check if process has other singal pending.
|
||||
*/
|
||||
if (signal_pending(current))
|
||||
return (1);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
|
|
|
@@ -58,22 +58,16 @@
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#include <linux/version.h>
#endif

#ifdef _KERNEL
#if defined(MAX_ORDER)
#define ABD_MAX_ORDER (MAX_ORDER)
#elif defined(MAX_PAGE_ORDER)
#define ABD_MAX_ORDER (MAX_PAGE_ORDER)
#endif
#else
#define ABD_MAX_ORDER (1)
#endif

typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;

@@ -193,11 +187,9 @@ abd_t *abd_zero_scatter = NULL;

struct page;
/*
 * _KERNEL - Will point to ZERO_PAGE if it is available or it will be
 *           an allocated zero'd PAGESIZE buffer.
 * Userspace - Will be an allocated zero'ed PAGESIZE buffer.
 *
 * abd_zero_page is assigned to each of the pages of abd_zero_scatter.
 * abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will
 * point to ZERO_PAGE if it is available or it will be an allocated zero'd
 * PAGESIZE buffer.
 */
static struct page *abd_zero_page = NULL;

@@ -232,7 +224,6 @@ abd_free_struct_impl(abd_t *abd)
	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
}

#ifdef _KERNEL
static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;

/*

@@ -509,7 +500,7 @@ abd_alloc_zero_scatter(void)
	ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK;

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);

@@ -520,134 +511,6 @@ abd_alloc_zero_scatter(void)
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#else /* _KERNEL */

#ifndef PAGE_SHIFT
#define PAGE_SHIFT (highbit64(PAGESIZE)-1)
#endif

#define zfs_kmap_local(chunk)		((void *)chunk)
#define zfs_kunmap_local(addr)		do { (void)(addr); } while (0)
#define local_irq_save(flags)		do { (void)(flags); } while (0)
#define local_irq_restore(flags)	do { (void)(flags); } while (0)
#define nth_page(pg, i) \
	((struct page *)((void *)(pg) + (i) * PAGESIZE))

struct scatterlist {
	struct page *page;
	int length;
	int end;
};

static void
sg_init_table(struct scatterlist *sg, int nr)
{
	memset(sg, 0, nr * sizeof (struct scatterlist));
	sg[nr - 1].end = 1;
}

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
 */
static void
abd_free_sg_table(abd_t *abd)
{
	int nents = ABD_SCATTER(abd).abd_nents;
	vmem_free(ABD_SCATTER(abd).abd_sgl,
	    nents * sizeof (struct scatterlist));
}

#define for_each_sg(sgl, sg, nr, i)	\
	for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))

static inline void
sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
    unsigned int offset)
{
	/* currently we don't use offset */
	ASSERT(offset == 0);
	sg->page = page;
	sg->length = len;
}

static inline struct page *
sg_page(struct scatterlist *sg)
{
	return (sg->page);
}

static inline struct scatterlist *
sg_next(struct scatterlist *sg)
{
	if (sg->end)
		return (NULL);

	return (sg + 1);
}

void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(size);
	struct scatterlist *sg;
	int i;

	ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);
	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
		sg_set_page(sg, p, PAGESIZE, 0);
	}
	ABD_SCATTER(abd).abd_nents = nr_pages;
}

void
abd_free_chunks(abd_t *abd)
{
	int i, n = ABD_SCATTER(abd).abd_nents;
	struct scatterlist *sg;

	abd_for_each_sg(abd, sg, n, i) {
		struct page *p = nth_page(sg_page(sg), 0);
		umem_free_aligned(p, PAGESIZE);
	}
	abd_free_sg_table(abd);
}

static void
abd_alloc_zero_scatter(void)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	struct scatterlist *sg;
	int i;

	abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
	memset(abd_zero_page, 0, PAGESIZE);
	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);

	sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#endif /* _KERNEL */

boolean_t
abd_size_alloc_linear(size_t size)
{

@@ -712,14 +575,10 @@ abd_free_zero_scatter(void)
	abd_free_struct(abd_zero_scatter);
	abd_zero_scatter = NULL;
	ASSERT3P(abd_zero_page, !=, NULL);
#if defined(_KERNEL)
#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
	abd_unmark_zfs_page(abd_zero_page);
	__free_page(abd_zero_page);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */
#else
	umem_free_aligned(abd_zero_page, PAGESIZE);
#endif /* _KERNEL */
}

static int

@@ -1014,8 +873,6 @@ abd_cache_reap_now(void)
{
}

#if defined(_KERNEL)

/*
 * This is abd_iter_page(), the function underneath abd_iterate_page_func().
 * It yields the next page struct and data offset and size within it, without

@@ -1297,5 +1154,3 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
	"Maximum order allocation used for a scatter ABD.");

#endif /* _KERNEL */

@@ -201,9 +201,9 @@ arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
	 * See also the comment above zfs_arc_shrinker_limit.
	 */
	int64_t can_free = btop(arc_evictable_memory());
	int64_t limit = zfs_arc_shrinker_limit != 0 ?
	    zfs_arc_shrinker_limit : INT64_MAX;
	return (MIN(can_free, limit));
	if (current_is_kswapd() && zfs_arc_shrinker_limit)
		can_free = MIN(can_free, zfs_arc_shrinker_limit);
	return (can_free);
}

static unsigned long

@@ -1101,8 +1101,8 @@ zfsctl_snapshot_mount(struct path *path, int flags)
	zfsvfs_t *snap_zfsvfs;
	zfs_snapentry_t *se;
	char *full_name, *full_path;
	char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL,
	    NULL };
	char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n",
	    NULL, NULL, NULL };
	char *envp[] = { NULL };
	int error;
	struct path spath;

@@ -1153,8 +1153,8 @@ zfsctl_snapshot_mount(struct path *path, int flags)
	 * value from call_usermodehelper() will be (exitcode << 8 + signal).
	 */
	dprintf("mount; name=%s path=%s\n", full_name, full_path);
	argv[5] = full_name;
	argv[6] = full_path;
	argv[6] = full_name;
	argv[7] = full_path;
	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	if (error) {
		if (!(error & MOUNT_BUSY << 8)) {

@@ -69,6 +69,7 @@
#include <sys/zpl.h>
#include <sys/zil.h>
#include <sys/sa_impl.h>
#include <linux/mm_compat.h>

/*
 * Programming rules.

@@ -1820,24 +1821,36 @@ zfs_setattr_dir(znode_t *dzp)
		    &gid, sizeof (gid));
	}

	if (zp->z_projid != dzp->z_projid) {

	uint64_t projid = dzp->z_projid;
	if (zp->z_projid != projid) {
		if (!(zp->z_pflags & ZFS_PROJID)) {
			zp->z_pflags |= ZFS_PROJID;
			SA_ADD_BULK_ATTR(bulk, count,
			    SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
			    sizeof (zp->z_pflags));
			err = sa_add_projid(zp->z_sa_hdl, tx, projid);
			if (unlikely(err == EEXIST)) {
				err = 0;
			} else if (err != 0) {
				goto sa_add_projid_err;
			} else {
				projid = ZFS_INVALID_PROJID;
			}
		}

		zp->z_projid = dzp->z_projid;
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
		    NULL, &zp->z_projid, sizeof (zp->z_projid));
		if (projid != ZFS_INVALID_PROJID) {
			zp->z_projid = projid;
			SA_ADD_BULK_ATTR(bulk, count,
			    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
			    sizeof (zp->z_projid));
		}
	}

sa_add_projid_err:
	mutex_exit(&dzp->z_lock);

	if (likely(count > 0)) {
		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		dmu_tx_commit(tx);
	} else if (projid == ZFS_INVALID_PROJID) {
		dmu_tx_commit(tx);
	} else {
		dmu_tx_abort(tx);
	}

@@ -295,6 +295,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
	struct super_block *s;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);

@@ -326,6 +327,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = -SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = -SET_ERROR(EBUSY);

@@ -349,7 +351,11 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
			return (ERR_PTR(err));
		}
		s->s_flags |= SB_ACTIVE;
	} else if ((flags ^ s->s_flags) & SB_RDONLY) {
	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
		/*
		 * Skip the ro check for snapshots, since a snapshot is
		 * always ro regardless of whether the ro flag was passed
		 * to mount.
		 */
		deactivate_locked_super(s);
		return (ERR_PTR(-EBUSY));
	}

@@ -20,6 +20,7 @@
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
 * Copyright (c) 2024, Klara, Inc.
 */

@@ -1089,11 +1090,42 @@ static const struct block_device_operations zvol_ops = {
#endif
};

/*
 * Since 6.9, Linux has been removing queue limit setters in favour of an
 * initial queue_limits struct applied when the device is open. Since 6.11,
 * queue_limits is being extended to allow more things to be applied when the
 * device is open. Setters are also being removed for this.
 *
 * For OpenZFS, this means that depending on kernel version, some options may
 * be set up before the device is open, and some applied to an open device
 * (queue) after the fact.
 *
 * We manage this complexity by having our own limits struct,
 * zvol_queue_limits_t, in which we carry any queue config that we're
 * interested in setting. This structure is the same on all kernels.
 *
 * These limits are then applied to the queue at device open time by the most
 * appropriate method for the kernel.
 *
 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
 * blk_alloc_disk() exists). This converts our limits struct to a proper Linux
 * struct queue_limits, and passes it in. Any fields added in later kernels are
 * (obviously) not set up here.
 *
 * zvol_queue_limits_apply() is called on all kernel versions after the queue
 * is created, and applies any remaining config. Before 6.9 that will be
 * everything, via setter methods. After 6.9 that will be whatever couldn't be
 * put into struct queue_limits. (This implies that zvol_queue_limits_apply()
 * will always be a no-op on the latest kernel we support).
 */
typedef struct zvol_queue_limits {
	unsigned int zql_max_hw_sectors;
	unsigned short zql_max_segments;
	unsigned int zql_max_segment_size;
	unsigned int zql_io_opt;
	unsigned int zql_physical_block_size;
	unsigned int zql_max_discard_sectors;
	unsigned int zql_discard_granularity;
} zvol_queue_limits_t;

static void

@@ -1162,6 +1194,11 @@ zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
	}

	limits->zql_io_opt = zv->zv_volblocksize;

	limits->zql_physical_block_size = zv->zv_volblocksize;
	limits->zql_max_discard_sectors =
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
	limits->zql_discard_granularity = zv->zv_volblocksize;
}

#ifdef HAVE_BLK_ALLOC_DISK_2ARG

@@ -1174,18 +1211,35 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits,
	qlimits->max_segments = limits->zql_max_segments;
	qlimits->max_segment_size = limits->zql_max_segment_size;
	qlimits->io_opt = limits->zql_io_opt;
	qlimits->physical_block_size = limits->zql_physical_block_size;
	qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
	qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
	qlimits->discard_granularity = limits->zql_discard_granularity;
#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
	qlimits->features =
	    BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;
#endif
}
#else
#endif

static void
zvol_queue_limits_apply(zvol_queue_limits_t *limits,
    struct request_queue *queue)
{
#ifndef HAVE_BLK_ALLOC_DISK_2ARG
	blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
	blk_queue_max_segments(queue, limits->zql_max_segments);
	blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
	blk_queue_io_opt(queue, limits->zql_io_opt);
}
	blk_queue_physical_block_size(queue, limits->zql_physical_block_size);
	blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);
	blk_queue_discard_granularity(queue, limits->zql_discard_granularity);
#endif
#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
	blk_queue_set_write_cache(queue, B_TRUE);
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);
#endif
}

static int
zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
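
Editor's note: a minimal sketch (not part of the diff) of how the pieces
above compose, using only names introduced in this diff; the caller shape is
illustrative rather than the actual allocation paths that follow:

    zvol_queue_limits_t limits;
    zvol_queue_limits_init(&limits, zv, use_blk_mq);

    #ifdef HAVE_BLK_ALLOC_DISK_2ARG
            /* 6.9+: convert and hand the limits to the kernel at allocation */
            struct queue_limits qlimits;
            zvol_queue_limits_convert(&limits, &qlimits);
            struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
    #else
            /* older kernels: allocate first, configure via setters after */
            struct gendisk *disk = blk_alloc_disk(NUMA_NO_NODE);
    #endif

    /* always called; a no-op where struct queue_limits already covered it */
    zvol_queue_limits_apply(&limits, disk->queue);
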
@@ -1198,7 +1252,6 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)

	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct queue_limits qlimits;
	zvol_queue_limits_convert(limits, &qlimits);

@@ -1211,6 +1264,7 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
	zso->zvo_disk = disk;
	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;

#else
	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)

@@ -1223,7 +1277,6 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
	}

	zso->zvo_disk->queue = zso->zvo_queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif /* HAVE_BLK_ALLOC_DISK */
#else
	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);

@@ -1237,8 +1290,10 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
	}

	zso->zvo_disk->queue = zso->zvo_queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */

	zvol_queue_limits_apply(limits, zso->zvo_queue);

	return (0);
}

@@ -1260,7 +1315,6 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
		return (1);
	}
	zso->zvo_queue = zso->zvo_disk->queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);
	zso->zvo_disk->minors = ZVOL_MINORS;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct queue_limits qlimits;

@@ -1291,10 +1345,11 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)

	/* Our queue is now created, assign it to our disk */
	zso->zvo_disk->queue = zso->zvo_queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif

	zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif
#endif

	return (0);
}

@@ -1303,7 +1358,7 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
 * request queue and generic disk structures for the block device.
 */
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name)
zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
{
	zvol_state_t *zv;
	struct zvol_state_os *zso;

@@ -1323,6 +1378,7 @@ zvol_alloc(dev_t dev, const char *name)
	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso = zso;
	zv->zv_volmode = volmode;
	zv->zv_volblocksize = volblocksize;

	list_link_init(&zv->zv_next);
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);

@@ -1360,8 +1416,6 @@ zvol_alloc(dev_t dev, const char *name)
	if (ret != 0)
		goto out_kmem;

	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);

	/* Limit read-ahead to a single page to prevent over-prefetching. */
	blk_queue_set_read_ahead(zso->zvo_queue, 1);

@@ -1370,9 +1424,6 @@ zvol_alloc(dev_t dev, const char *name)
		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
	}

	/* Enable /proc/diskstats */
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);

	zso->zvo_queue->queuedata = zv;
	zso->zvo_dev = dev;
	zv->zv_open_count = 0;

@@ -1617,7 +1668,8 @@ zvol_os_create_minor(const char *name)
	if (error)
		goto out_dmu_objset_disown;

	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
	zv = zvol_alloc(MKDEV(zvol_major, minor), name,
	    doi->doi_data_block_size);
	if (zv == NULL) {
		error = SET_ERROR(EAGAIN);
		goto out_dmu_objset_disown;

@@ -1627,7 +1679,6 @@ zvol_os_create_minor(const char *name)
	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

@@ -1639,14 +1690,6 @@ zvol_os_create_minor(const char *name)

	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);

	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
#ifdef QUEUE_FLAG_DISCARD
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
#endif

@@ -754,6 +754,12 @@ zpool_feature_init(void)
	    "Support for raidz expansion",
	    ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);

	zfeature_register(SPA_FEATURE_FAST_DEDUP,
	    "com.klarasystems:fast_dedup", "fast_dedup",
	    "Support for advanced deduplication",
	    ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL,
	    sfeatures);

	zfs_mod_list_supported_free(sfeatures);
}

@@ -113,7 +113,7 @@ abd_verify(abd_t *abd)
	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
	    ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
	    ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
	    ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD));
	    ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD));
	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
	if (abd_is_linear(abd)) {

@@ -603,13 +603,11 @@ abd_get_zeros(size_t size)
}

/*
 * Allocate a linear ABD structure for buf.
 * Create a linear ABD for an existing buf.
 */
abd_t *
abd_get_from_buf(void *buf, size_t size)
static abd_t *
abd_get_from_buf_impl(abd_t *abd, void *buf, size_t size)
{
	abd_t *abd = abd_alloc_struct(0);

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	/*

@@ -625,6 +623,20 @@ abd_get_from_buf(void *buf, size_t size)
	return (abd);
}

abd_t *
abd_get_from_buf(void *buf, size_t size)
{
	abd_t *abd = abd_alloc_struct(0);
	return (abd_get_from_buf_impl(abd, buf, size));
}

abd_t *
abd_get_from_buf_struct(abd_t *abd, void *buf, size_t size)
{
	abd_init_struct(abd);
	return (abd_get_from_buf_impl(abd, buf, size));
}
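
Editor's note: abd_get_from_buf_struct() exists so a caller can wrap an
existing buffer in a caller-provided (typically stack-allocated) abd_t with
no heap allocation; a sketch of the pattern the arc.c and embedded-bp hunks
below adopt:

    abd_t dabd;
    abd_get_from_buf_struct(&dabd, buf, len);  /* wrap, no allocation */
    /* ... use &dabd, e.g. as the output of zio_decompress_data() ... */
    abd_free(&dabd);                           /* tears down the wrapper only */
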

/*
 * Get the raw buffer associated with a linear ABD.
 */

@@ -1767,12 +1767,12 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
	uint64_t csize;
	uint64_t lsize = HDR_GET_LSIZE(hdr);
	uint64_t psize = HDR_GET_PSIZE(hdr);
	void *tmpbuf = NULL;
	abd_t *abd = hdr->b_l1hdr.b_pabd;
	boolean_t free_abd = B_FALSE;

	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
	ASSERT(HDR_AUTHENTICATED(hdr));
	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
	ASSERT3P(abd, !=, NULL);

	/*
	 * The MAC is calculated on the compressed data that is stored on disk.

@@ -1784,14 +1784,13 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
	 */
	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
	    !HDR_COMPRESSION_ENABLED(hdr)) {

		abd = NULL;
		csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
		    hdr->b_l1hdr.b_pabd, &tmpbuf, lsize, hdr->b_complevel);
		ASSERT3P(tmpbuf, !=, NULL);
		    hdr->b_l1hdr.b_pabd, &abd, lsize, hdr->b_complevel);
		ASSERT3P(abd, !=, NULL);
		ASSERT3U(csize, <=, psize);
		abd = abd_get_from_buf(tmpbuf, lsize);
		abd_take_ownership_of_buf(abd, B_TRUE);
		abd_zero_off(abd, csize, psize - csize);
		free_abd = B_TRUE;
	}

	/*

@@ -1810,16 +1809,10 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)

	if (ret == 0)
		arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
	else if (ret != ENOENT)
		goto error;
	else if (ret == ENOENT)
		ret = 0;

	if (tmpbuf != NULL)
		abd_free(abd);

	return (0);

error:
	if (tmpbuf != NULL)
	if (free_abd)
		abd_free(abd);

	return (ret);

@@ -1836,7 +1829,6 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
{
	int ret;
	abd_t *cabd = NULL;
	void *tmp = NULL;
	boolean_t no_crypt = B_FALSE;
	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);

@@ -1871,17 +1863,14 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
	 * linear buffer and wrapping it in an abd later.
	 */
	cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0);
	tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));

	ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
	    hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
	    hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
	    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
	if (ret != 0) {
		abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
		goto error;
	}

	abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
	arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
	    arc_hdr_size(hdr), hdr);
	hdr->b_l1hdr.b_pabd = cabd;

@@ -2123,10 +2112,14 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
			/* Skip byteswapping and checksumming (already done) */
			return (0);
		} else {
			abd_t dabd;
			abd_get_from_buf_struct(&dabd, buf->b_data,
			    HDR_GET_LSIZE(hdr));
			error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
			    hdr->b_l1hdr.b_pabd, buf->b_data,
			    hdr->b_l1hdr.b_pabd, &dabd,
			    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr),
			    &hdr->b_complevel);
			abd_free(&dabd);

			/*
			 * Absent hardware errors or software bugs, this should

@@ -8531,18 +8524,15 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
	    !HDR_COMPRESSION_ENABLED(hdr)) {
		abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
		    ARC_HDR_USE_RESERVE);
		void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));

		ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
		    hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
		    hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
		    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
		if (ret != 0) {
			abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
			arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
			goto error;
		}

		abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
		    arc_hdr_size(hdr), hdr);
		hdr->b_l1hdr.b_pabd = cabd;

@@ -9037,9 +9027,8 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
	}

	if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
		size_t bufsize = MAX(size, asize);
		void *buf = zio_buf_alloc(bufsize);
		uint64_t csize = zio_compress_data(compress, to_write, &buf,
		cabd = abd_alloc_for_io(MAX(size, asize), ismd);
		uint64_t csize = zio_compress_data(compress, to_write, &cabd,
		    size, hdr->b_complevel);
		if (csize > psize) {
			/*

@@ -9047,13 +9036,12 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
			 * psize. Even if it fits into asize, it does not
			 * matter, since checksum will never match on read.
			 */
			zio_buf_free(buf, bufsize);
			abd_free(cabd);
			return (SET_ERROR(EIO));
		}
		if (asize > csize)
			memset((char *)buf + csize, 0, asize - csize);
		to_write = cabd = abd_get_from_buf(buf, bufsize);
		abd_take_ownership_of_buf(cabd, B_TRUE);
			abd_zero_off(cabd, csize, asize - csize);
		to_write = cabd;
	}

	if (HDR_ENCRYPTED(hdr)) {

@@ -9158,12 +9146,17 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
	 */
	for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
		/*
		 * If pass == 1 or 3, we cache MRU metadata and data
		 * respectively.
		 * pass == 0: MFU meta
		 * pass == 1: MRU meta
		 * pass == 2: MFU data
		 * pass == 3: MRU data
		 */
		if (l2arc_mfuonly) {
		if (l2arc_mfuonly == 1) {
			if (pass == 1 || pass == 3)
				continue;
		} else if (l2arc_mfuonly > 1) {
			if (pass == 3)
				continue;
		}

		uint64_t passed_sz = 0;
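
Editor's note: as restructured above, l2arc_mfuonly=1 skips both MRU passes
(1 and 3), while any value greater than 1 skips only pass 3; that is, MRU
metadata is still cached but MRU data is not.
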
@@ -10179,7 +10172,6 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
{
	int err = 0;
	zio_cksum_t cksum;
	abd_t *abd = NULL;
	uint64_t asize;

	ASSERT(this_lbp != NULL && next_lbp != NULL);

@@ -10241,16 +10233,22 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
	switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
	case ZIO_COMPRESS_OFF:
		break;
	case ZIO_COMPRESS_LZ4:
		abd = abd_alloc_for_io(asize, B_TRUE);
	case ZIO_COMPRESS_LZ4: {
		abd_t *abd = abd_alloc_linear(asize, B_TRUE);
		abd_copy_from_buf_off(abd, this_lb, 0, asize);
		if ((err = zio_decompress_data(
		abd_t dabd;
		abd_get_from_buf_struct(&dabd, this_lb, sizeof (*this_lb));
		err = zio_decompress_data(
		    L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
		    abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) {
		    abd, &dabd, asize, sizeof (*this_lb), NULL);
		abd_free(&dabd);
		abd_free(abd);
		if (err != 0) {
			err = SET_ERROR(EINVAL);
			goto cleanup;
		}
		break;
	}
	default:
		err = SET_ERROR(EINVAL);
		goto cleanup;

@@ -10267,8 +10265,6 @@ cleanup:
		l2arc_log_blk_fetch_abort(*next_io);
		*next_io = NULL;
	}
	if (abd != NULL)
		abd_free(abd);
	return (err);
}

@@ -10504,7 +10500,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
	uint64_t psize, asize;
	zio_t *wzio;
	l2arc_lb_abd_buf_t *abd_buf;
	uint8_t *tmpbuf = NULL;
	abd_t *abd = NULL;
	l2arc_lb_ptr_buf_t *lb_ptr_buf;

	VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);

@@ -10527,7 +10523,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)

	/* try to compress the buffer */
	psize = zio_compress_data(ZIO_COMPRESS_LZ4,
	    abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0);
	    abd_buf->abd, &abd, sizeof (*lb), 0);

	/* a log block is never entirely zero */
	ASSERT(psize != 0);

@@ -10553,27 +10549,26 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
	    ZIO_CHECKSUM_FLETCHER_4);
	if (asize < sizeof (*lb)) {
		/* compression succeeded */
		memset(tmpbuf + psize, 0, asize - psize);
		abd_zero_off(abd, psize, asize - psize);
		L2BLK_SET_COMPRESS(
		    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
		    ZIO_COMPRESS_LZ4);
	} else {
		/* compression failed */
		memcpy(tmpbuf, lb, sizeof (*lb));
		abd_copy_from_buf_off(abd, lb, 0, sizeof (*lb));
		L2BLK_SET_COMPRESS(
		    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
		    ZIO_COMPRESS_OFF);
	}

	/* checksum what we're about to write */
	fletcher_4_native(tmpbuf, asize, NULL,
	abd_fletcher_4_native(abd, asize, NULL,
	    &l2dhdr->dh_start_lbps[0].lbp_cksum);

	abd_free(abd_buf->abd);

	/* perform the write itself */
	abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));
	abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
	abd_buf->abd = abd;
	wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
	    asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);

@@ -142,8 +142,13 @@ decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen)
	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
		uint8_t dstbuf[BPE_PAYLOAD_SIZE];
		decode_embedded_bp_compressed(bp, dstbuf);
		VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp),
		    dstbuf, buf, psize, buflen, NULL));
		abd_t cabd, dabd;
		abd_get_from_buf_struct(&cabd, dstbuf, psize);
		abd_get_from_buf_struct(&dabd, buf, buflen);
		VERIFY0(zio_decompress_data(BP_GET_COMPRESS(bp), &cabd,
		    &dabd, psize, buflen, NULL));
		abd_free(&dabd);
		abd_free(&cabd);
	} else {
		ASSERT3U(lsize, ==, psize);
		decode_embedded_bp_compressed(bp, buf);

@@ -40,6 +40,9 @@ static dataset_kstat_values_t empty_dataset_kstats = {
	{
		{ "zil_commit_count", KSTAT_DATA_UINT64 },
		{ "zil_commit_writer_count", KSTAT_DATA_UINT64 },
		{ "zil_commit_error_count", KSTAT_DATA_UINT64 },
		{ "zil_commit_stall_count", KSTAT_DATA_UINT64 },
		{ "zil_commit_suspend_count", KSTAT_DATA_UINT64 },
		{ "zil_itx_count", KSTAT_DATA_UINT64 },
		{ "zil_itx_indirect_count", KSTAT_DATA_UINT64 },
		{ "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 },

@@ -201,6 +204,9 @@ dataset_kstats_destroy(dataset_kstats_t *dk)
void
dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
{
	if (dk->dk_kstats == NULL)
		return;

	dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
	char *ds_name;

module/zfs/ddt.c — 1501 lines changed; diff suppressed because it is too large.

@@ -0,0 +1,764 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2023, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/ddt.h>
#include <sys/dmu_tx.h>
#include <sys/dmu.h>
#include <sys/ddt_impl.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>

/*
 * No more than this many txgs before swapping logs.
 */
uint_t zfs_dedup_log_txg_max = 8;

/*
 * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
 * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
 */
uint64_t zfs_dedup_log_mem_max = 0;
uint_t zfs_dedup_log_mem_max_percent = 1;


static kmem_cache_t *ddt_log_entry_flat_cache;
static kmem_cache_t *ddt_log_entry_trad_cache;

#define DDT_LOG_ENTRY_FLAT_SIZE \
	(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
#define DDT_LOG_ENTRY_TRAD_SIZE \
	(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)

#define DDT_LOG_ENTRY_SIZE(ddt) \
	_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)

void
ddt_log_init(void)
{
	ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
	    DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
	ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
	    DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * Max memory for log AVL entries. At least 1M, because we need
	 * something (that's ~3800 entries per tree). They can say 100% if they
	 * want; it just means they're at the mercy of the txg flush limit.
	 */
	if (zfs_dedup_log_mem_max == 0) {
		zfs_dedup_log_mem_max_percent =
		    MIN(zfs_dedup_log_mem_max_percent, 100);
		zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
		    zfs_dedup_log_mem_max_percent / 100;
	}
	zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
}
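
Editor's note: the "~3800 entries per tree" figure in the comment above is
just the 1 MiB floor divided by the in-memory entry size; a sketch assuming
a flat-phys entry of roughly 270 bytes (the exact sizeof is build-dependent):

    uint64_t floor_bytes = 1ULL << 20;              /* 1 MiB minimum */
    uint64_t approx_entries = floor_bytes / 270;    /* ~3880 entries */
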
void
ddt_log_fini(void)
{
	kmem_cache_destroy(ddt_log_entry_trad_cache);
	kmem_cache_destroy(ddt_log_entry_flat_cache);
}

static void
ddt_log_name(ddt_t *ddt, char *name, uint_t n)
{
	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
	    zio_checksum_table[ddt->ddt_checksum].ci_name, n);
}

static void
ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);

	ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
	DLH_SET_VERSION(hdr, 1);
	DLH_SET_FLAGS(hdr, ddl->ddl_flags);
	hdr->dlh_length = ddl->ddl_length;
	hdr->dlh_first_txg = ddl->ddl_first_txg;
	hdr->dlh_checkpoint = ddl->ddl_checkpoint;

	dmu_buf_rele(db, FTAG);
}

static void
ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);
	ASSERT3U(ddl->ddl_object, ==, 0);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
	    sizeof (uint64_t), 1, &ddl->ddl_object, tx));
	ddl->ddl_length = 0;
	ddl->ddl_first_txg = tx->tx_txg;
	ddt_log_update_header(ddt, ddl, tx);
}

static void
ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
	ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
}

static void
ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);

	if (ddl->ddl_object == 0)
		return;

	ASSERT0(ddl->ddl_length);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
	VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));

	ddl->ddl_object = 0;
}

void
ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
	ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
}

static void
ddt_log_update_stats(ddt_t *ddt)
{
	/*
	 * Log object stats. We count the number of live entries in the log
	 * tree, even if there are more than on disk, and even if the same
	 * entry is on both append and flush trees, because that's more what
	 * the user expects to see. This does mean the on-disk size is not
	 * really correlated with the number of entries, but I don't think
	 * that's reasonable to expect anyway.
	 */
	dmu_object_info_t doi;
	uint64_t nblocks;
	dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi);
	nblocks = doi.doi_physical_blocks_512;
	dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi);
	nblocks += doi.doi_physical_blocks_512;

	ddt_object_t *ddo = &ddt->ddt_log_stats;
	ddo->ddo_count =
	    avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
	    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
	ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
	ddo->ddo_dspace = nblocks << 9;
}

void
ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
{
	ASSERT3U(nentries, >, 0);
	ASSERT3P(dlu->dlu_dbp, ==, NULL);

	if (ddt->ddt_log_active->ddl_object == 0)
		ddt_log_create(ddt, tx);

	/*
	 * We want to store as many entries as we can in a block, but never
	 * split an entry across block boundaries.
	 */
	size_t reclen = P2ALIGN_TYPED(
	    sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
	    DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
	ASSERT3U(reclen, <=, UINT16_MAX);
	dlu->dlu_reclen = reclen;

	VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
	    &dlu->dlu_dn));
	dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);

	uint64_t nblocks = howmany(nentries,
	    dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
	uint64_t offset = ddt->ddt_log_active->ddl_length;
	uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;

	VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
	    B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
	    DMU_READ_NO_PREFETCH));

	dlu->dlu_tx = tx;
	dlu->dlu_block = dlu->dlu_offset = 0;
}
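
Editor's note: a worked example of the packing arithmetic above, with
illustrative numbers only (record and block sizes vary by build and pool):

    /*
     * Assume dn_datablksz = 128 KiB and a rounded-up reclen of 112 bytes:
     *   records per block        = 131072 / 112         = 1170
     *   blocks for 10000 entries = howmany(10000, 1170) = 9
     * Entries never straddle a block; the unused tail of each block stays
     * zeroed (DLR_INVALID), which is what the load path keys off.
     */
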
static ddt_log_entry_t *
ddt_log_alloc_entry(ddt_t *ddt)
{
	ddt_log_entry_t *ddle;

	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
		ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
		memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
	} else {
		ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
		memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
	}

	return (ddle);
}

static void
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
	/* Create the log tree entry from a live or stored entry */
	avl_index_t where;
	ddt_log_entry_t *ddle =
	    avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
	if (ddle == NULL) {
		ddle = ddt_log_alloc_entry(ddt);
		ddle->ddle_key = ddlwe->ddlwe_key;
		avl_insert(&ddl->ddl_tree, ddle, where);
	}
	ddle->ddle_type = ddlwe->ddlwe_type;
	ddle->ddle_class = ddlwe->ddlwe_class;
	memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
}

void
ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
{
	ASSERT3U(dlu->dlu_dbp, !=, NULL);

	ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
	ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	/* Get our block */
	ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
	dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];

	/*
	 * If this would take us past the end of the block, finish it and
	 * move to the next one.
	 */
	if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
		ASSERT3U(dlu->dlu_offset, >, 0);
		dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
		dlu->dlu_block++;
		dlu->dlu_offset = 0;
		ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
		db = dlu->dlu_dbp[dlu->dlu_block];
	}

	/*
	 * If this is the first time touching the block, inform the DMU that
	 * we will fill it, and zero it out.
	 */
	if (dlu->dlu_offset == 0) {
		dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE);
		memset(db->db_data, 0, db->db_size);
	}

	/* Create the log record directly in the buffer */
	ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
	DLR_SET_TYPE(dlr, DLR_ENTRY);
	DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
	DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
	DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);

	ddt_log_record_entry_t *dlre =
	    (ddt_log_record_entry_t *)&dlr->dlr_payload;
	dlre->dlre_key = ddlwe->ddlwe_key;
	memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));

	/* Advance offset for next record. */
	dlu->dlu_offset += dlu->dlu_reclen;
}

void
ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
{
	ASSERT3U(dlu->dlu_dbp, !=, NULL);
	ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
	ASSERT3U(dlu->dlu_offset, >, 0);

	/*
	 * Close out the last block. Whatever we haven't used will be zeroed,
	 * which matches DLR_INVALID, so we can detect this during load.
	 */
	dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);

	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);

	ddt->ddt_log_active->ddl_length +=
	    dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
	dnode_rele(dlu->dlu_dn, FTAG);

	ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);

	memset(dlu, 0, sizeof (ddt_log_update_t));

	ddt_log_update_stats(ddt);
}

boolean_t
ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle == NULL)
		return (B_FALSE);

	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);

	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);

	return (B_TRUE);
}

boolean_t
ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk,
    ddt_lightweight_entry_t *ddlwe)
{
	ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
	if (ddle == NULL)
		return (B_FALSE);

	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);

	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);

	return (B_TRUE);
}

void
ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	ASSERT3U(ddl->ddl_object, !=, 0);

#ifdef ZFS_DEBUG
	/*
	 * There should not be any entries on the log tree before the given
	 * checkpoint. Assert that this is the case.
	 */
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle != NULL)
		VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
		    >, 0);
#endif

	ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
	ddl->ddl_checkpoint = ddlwe->ddlwe_key;
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}

void
ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	if (ddl->ddl_object == 0)
		return;

	ASSERT(avl_is_empty(&ddl->ddl_tree));

	/* Eject the entire object */
	dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);

	ddl->ddl_length = 0;
	ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
	memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}

boolean_t
ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
{
	/* Swap the logs. The old flushing one must be empty */
	VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));

	/*
	 * If there are still blocks on the flushing log, truncate it first.
	 * This can happen if there were entries on the flushing log that were
	 * removed in memory via ddt_lookup(); their vestigial remains are
	 * on disk.
	 */
	if (ddt->ddt_log_flushing->ddl_length > 0)
		ddt_log_truncate(ddt, tx);

	/*
	 * Swap policy. We swap the logs (and so begin flushing) when the
	 * active tree grows too large, or when we haven't swapped it in
	 * some amount of time, or if something has requested the logs be
	 * flushed ASAP (see ddt_walk_init()).
	 */

	/*
	 * The log tree is too large if the memory usage of its entries is over
	 * half of the memory limit. This effectively gives each log tree half
	 * the available memory.
	 */
	const boolean_t too_large =
	    (avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
	    DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);

	const boolean_t too_old =
	    tx->tx_txg >=
	    (ddt->ddt_log_active->ddl_first_txg +
	    MAX(1, zfs_dedup_log_txg_max));

	const boolean_t force =
	    ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;

	if (!(too_large || too_old || force))
		return (B_FALSE);

	ddt_log_t *swap = ddt->ddt_log_active;
	ddt->ddt_log_active = ddt->ddt_log_flushing;
	ddt->ddt_log_flushing = swap;

	ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
	ddt->ddt_log_active->ddl_flags &=
	    ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);

	ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;

	ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;

	ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
	ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);

	ddt_log_update_stats(ddt);

	return (B_TRUE);
}
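
Editor's note: rough numbers for the too_large trigger above, under assumed
defaults (zfs_dedup_log_mem_max_percent = 1, 64 GiB of RAM, flat entries of
about 270 bytes):

    /*
     * zfs_dedup_log_mem_max ≈ 64 GiB / 100      ≈ 655 MiB
     * per-tree budget       = 655 MiB / 2       ≈ 327 MiB
     * entries before a swap ≈ 327 MiB / ~270 B  ≈ 1.27M entries
     * (plus the age trigger: at most zfs_dedup_log_txg_max = 8 txgs)
     */
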
|
||||
|
||||
static inline void
|
||||
ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
|
||||
const ddt_key_t *checkpoint)
|
||||
{
|
||||
ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);
|
||||
|
||||
ddt_log_record_entry_t *dlre =
|
||||
(ddt_log_record_entry_t *)dlr->dlr_payload;
|
||||
if (checkpoint != NULL &&
|
||||
ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
|
||||
/* Skip pre-checkpoint entries; they're already flushed. */
|
||||
return;
|
||||
}
|
||||
|
||||
ddt_lightweight_entry_t ddlwe;
|
||||
ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
|
||||
ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);
|
||||
|
||||
ddlwe.ddlwe_key = dlre->dlre_key;
|
||||
memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));
|
||||
|
||||
ddt_log_update_entry(ddt, ddl, &ddlwe);
|
||||
}
|
||||
|
||||
static void
|
||||
ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
|
||||
{
|
||||
void *cookie = NULL;
|
||||
ddt_log_entry_t *ddle;
|
||||
IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
|
||||
while ((ddle =
|
||||
avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
|
||||
kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
|
||||
ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
|
||||
}
|
||||
ASSERT(avl_is_empty(&ddl->ddl_tree));
|
||||
}
|
||||
|
||||
static int
|
||||
ddt_log_load_one(ddt_t *ddt, uint_t n)
|
||||
{
|
||||
ASSERT3U(n, <, 2);
|
||||
|
||||
ddt_log_t *ddl = &ddt->ddt_log[n];
|
||||
|
||||
char name[DDT_NAMELEN];
|
||||
ddt_log_name(ddt, name, n);
|
||||
|
||||
uint64_t obj;
|
||||
int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
|
||||
sizeof (uint64_t), 1, &obj);
|
||||
if (err == ENOENT)
|
||||
return (0);
|
||||
if (err != 0)
|
||||
return (err);
|
||||
|
||||
dnode_t *dn;
|
||||
err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
|
||||
if (err != 0)
|
||||
return (err);
|
||||
|
||||
ddt_log_header_t hdr;
|
||||
dmu_buf_t *db;
|
||||
err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
|
||||
if (err != 0) {
|
||||
dnode_rele(dn, FTAG);
|
||||
return (err);
|
||||
}
|
||||
memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
|
||||
dmu_buf_rele(db, FTAG);
|
||||
|
||||
if (DLH_GET_VERSION(&hdr) != 1) {
|
||||
dnode_rele(dn, FTAG);
|
||||
zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
|
||||
"unknown version=%llu", spa_name(ddt->ddt_spa), name,
|
||||
(u_longlong_t)DLH_GET_VERSION(&hdr));
|
||||
return (SET_ERROR(EINVAL));
|
||||
}
|
||||
|
||||
ddt_key_t *checkpoint = NULL;
|
||||
if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
|
||||
/*
|
||||
* If the log has a checkpoint, then we can ignore any entries
|
||||
* that have already been flushed.
|
||||
*/
|
||||
ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
|
||||
checkpoint = &hdr.dlh_checkpoint;
|
||||
}
|
||||
|
||||
if (hdr.dlh_length > 0) {
|
||||
dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
|
||||
ZIO_PRIORITY_SYNC_READ);
|
||||
|
||||
for (uint64_t offset = 0; offset < hdr.dlh_length;
|
||||
offset += dn->dn_datablksz) {
|
||||
err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
|
||||
DMU_READ_PREFETCH);
|
||||
if (err != 0) {
|
||||
dnode_rele(dn, FTAG);
|
||||
ddt_log_empty(ddt, ddl);
|
||||
return (err);
|
||||
}
|
||||
|
||||
			uint64_t boffset = 0;
			while (boffset < db->db_size) {
				ddt_log_record_t *dlr =
				    (ddt_log_record_t *)(db->db_data + boffset);

				/* Partially-filled block, skip the rest */
				if (DLR_GET_TYPE(dlr) == DLR_INVALID)
					break;

				switch (DLR_GET_TYPE(dlr)) {
				case DLR_ENTRY:
					ddt_log_load_entry(ddt, ddl, dlr,
					    checkpoint);
					break;

				default:
					dmu_buf_rele(db, FTAG);
					dnode_rele(dn, FTAG);
					ddt_log_empty(ddt, ddl);
					return (SET_ERROR(EINVAL));
				}

				boffset += DLR_GET_RECLEN(dlr);
			}

			dmu_buf_rele(db, FTAG);
		}
	}

	dnode_rele(dn, FTAG);

	ddl->ddl_object = obj;
	ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
	ddl->ddl_length = hdr.dlh_length;
	ddl->ddl_first_txg = hdr.dlh_first_txg;

	if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
		ddt->ddt_log_flushing = ddl;
	else
		ddt->ddt_log_active = ddl;

	return (0);
}

int
ddt_log_load(ddt_t *ddt)
{
	int err;

	if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
		/*
		 * The DDT is going to be freed again in a moment, so there's
		 * no point loading the log; it'll just slow down import.
		 */
		return (0);
	}

	ASSERT0(ddt->ddt_log[0].ddl_object);
	ASSERT0(ddt->ddt_log[1].ddl_object);
	if (ddt->ddt_dir_object == 0) {
		/*
		 * If we're configured but the containing dir doesn't exist
		 * yet, then the log object can't possibly exist either.
		 */
		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
		return (SET_ERROR(ENOENT));
	}

	if ((err = ddt_log_load_one(ddt, 0)) != 0)
		return (err);
	if ((err = ddt_log_load_one(ddt, 1)) != 0)
		return (err);

	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);

	/*
	 * We have two finalisation tasks:
	 *
	 * - rebuild the histogram. We do this at the end rather than while
	 *   we're loading so we don't need to uncount and recount entries
	 *   that appear multiple times in the log.
	 *
	 * - remove entries from the flushing tree that are on both trees.
	 *   This happens when ddt_lookup() rehydrates an entry from the
	 *   flushing tree, as ddt_log_take_key() removes the entry from the
	 *   in-memory tree but doesn't remove it from disk.
	 */

	/*
	 * We don't technically need a config lock here, since there shouldn't
	 * be pool config changes during DDT load. dva_get_dsize_sync() via
	 * ddt_stat_generate() is expecting it though, and it won't hurt
	 * anything, so we take it.
	 */
	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);

	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
	ddt_log_entry_t *ae = avl_first(al);
	ddt_log_entry_t *fe = avl_first(fl);
	while (ae != NULL || fe != NULL) {
		ddt_log_entry_t *ddle;
		if (ae == NULL) {
			/* active exhausted, take flushing */
			ddle = fe;
			fe = AVL_NEXT(fl, fe);
		} else if (fe == NULL) {
			/* flushing exhausted, take active */
			ddle = ae;
			ae = AVL_NEXT(al, ae);
		} else {
			/* compare active and flushing */
			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
			if (c < 0) {
				/* active behind, take and advance */
				ddle = ae;
				ae = AVL_NEXT(al, ae);
			} else if (c > 0) {
				/* flushing behind, take and advance */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
			} else {
				/* match. remove from flushing, take active */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
				avl_remove(fl, ddle);

				ddle = ae;
				ae = AVL_NEXT(al, ae);
			}
		}

		ddt_lightweight_entry_t ddlwe;
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
	}

	spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);

	ddt_log_update_stats(ddt);

	return (0);
}
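The merge loop above is a standard two-cursor sweep over two sorted containers: advance whichever side holds the smaller key, and on a tie drop the flushing copy so each entry is counted exactly once. A minimal standalone sketch of the same pattern over plain sorted arrays (toy merge_sweep helper, not OpenZFS code):

#include <stdio.h>

/* Toy model: a and f are sorted; keys present in both are taken once. */
static void
merge_sweep(const int *a, int an, const int *f, int fn)
{
	int ai = 0, fi = 0;
	while (ai < an || fi < fn) {
		int key;
		if (ai == an)			/* active exhausted */
			key = f[fi++];
		else if (fi == fn)		/* flushing exhausted */
			key = a[ai++];
		else if (a[ai] < f[fi])		/* active behind */
			key = a[ai++];
		else if (a[ai] > f[fi])		/* flushing behind */
			key = f[fi++];
		else {				/* match: drop flushing copy */
			fi++;
			key = a[ai++];
		}
		printf("%d\n", key);		/* histogram update goes here */
	}
}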

void
ddt_log_alloc(ddt_t *ddt)
{
	ASSERT3P(ddt->ddt_log_active, ==, NULL);
	ASSERT3P(ddt->ddt_log_flushing, ==, NULL);

	avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
	avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
	ddt->ddt_log_active = &ddt->ddt_log[0];
	ddt->ddt_log_flushing = &ddt->ddt_log[1];
	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
}

void
ddt_log_free(ddt_t *ddt)
{
	ddt_log_empty(ddt, &ddt->ddt_log[0]);
	ddt_log_empty(ddt, &ddt->ddt_log[1]);
	avl_destroy(&ddt->ddt_log[0].ddl_tree);
	avl_destroy(&ddt->ddt_log[1].ddl_tree);
}

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
	"Max transactions before starting to flush dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
	"Max memory for dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
	"Max memory for dedup logs, as % of total memory");

@ -33,27 +33,32 @@
#include <sys/ddt_impl.h>

static void
ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
    ddt_stat_t *dds)
{
	spa_t *spa = ddt->ddt_spa;
	ddt_phys_t *ddp = dde->dde_phys;
	ddt_key_t *ddk = &dde->dde_key;
	uint64_t lsize = DDK_GET_LSIZE(ddk);
	uint64_t psize = DDK_GET_PSIZE(ddk);
	uint64_t lsize = DDK_GET_LSIZE(&ddlwe->ddlwe_key);
	uint64_t psize = DDK_GET_PSIZE(&ddlwe->ddlwe_key);

	memset(dds, 0, sizeof (*dds));

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		uint64_t dsize = 0;
		uint64_t refcnt = ddp->ddp_refcnt;
	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
		const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);

		if (ddp->ddp_phys_birth == 0)
		if (ddt_phys_birth(ddp, v) == 0)
			continue;

		int ndvas = DDK_GET_CRYPT(&dde->dde_key) ?
		    SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;
		int ndvas = ddt_phys_dva_count(ddp, v,
		    DDK_GET_CRYPT(&ddlwe->ddlwe_key));
		const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
		    ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva;

		uint64_t dsize = 0;
		for (int d = 0; d < ndvas; d++)
			dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
			dsize += dva_get_dsize_sync(spa, &dvas[d]);

		uint64_t refcnt = ddt_phys_refcnt(ddp, v);

		dds->dds_blocks += 1;
		dds->dds_lsize += lsize;

@ -67,61 +72,108 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
	}
}

void
ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
static void
ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src)
{
	const uint64_t *s = (const uint64_t *)src;
	uint64_t *d = (uint64_t *)dst;
	uint64_t *d_end = (uint64_t *)(dst + 1);
	dst->dds_blocks += src->dds_blocks;
	dst->dds_lsize += src->dds_lsize;
	dst->dds_psize += src->dds_psize;
	dst->dds_dsize += src->dds_dsize;
	dst->dds_ref_blocks += src->dds_ref_blocks;
	dst->dds_ref_lsize += src->dds_ref_lsize;
	dst->dds_ref_psize += src->dds_ref_psize;
	dst->dds_ref_dsize += src->dds_ref_dsize;
}

	ASSERT(neg == 0 || neg == -1ULL);	/* add or subtract */
static void
ddt_stat_sub(ddt_stat_t *dst, const ddt_stat_t *src)
{
	/* This caught more during development than you might expect... */
	ASSERT3U(dst->dds_blocks, >=, src->dds_blocks);
	ASSERT3U(dst->dds_lsize, >=, src->dds_lsize);
	ASSERT3U(dst->dds_psize, >=, src->dds_psize);
	ASSERT3U(dst->dds_dsize, >=, src->dds_dsize);
	ASSERT3U(dst->dds_ref_blocks, >=, src->dds_ref_blocks);
	ASSERT3U(dst->dds_ref_lsize, >=, src->dds_ref_lsize);
	ASSERT3U(dst->dds_ref_psize, >=, src->dds_ref_psize);
	ASSERT3U(dst->dds_ref_dsize, >=, src->dds_ref_dsize);

	for (int i = 0; i < d_end - d; i++)
		d[i] += (s[i] ^ neg) - neg;
	dst->dds_blocks -= src->dds_blocks;
	dst->dds_lsize -= src->dds_lsize;
	dst->dds_psize -= src->dds_psize;
	dst->dds_dsize -= src->dds_dsize;
	dst->dds_ref_blocks -= src->dds_ref_blocks;
	dst->dds_ref_lsize -= src->dds_ref_lsize;
	dst->dds_ref_psize -= src->dds_ref_psize;
	dst->dds_ref_dsize -= src->dds_ref_dsize;
}
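The removed loop relied on a compact two's-complement identity: with neg == 0, (x ^ 0) - 0 is x (add); with neg == -1ULL, (x ^ ~0) - ~0 is ~x + 1, i.e. -x (subtract). The explicit per-field add/sub replacing it trades that compactness for underflow assertions on every field. A standalone sketch of the old trick, for reference:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Apply s to d field-by-field: add when neg == 0, subtract when neg == ~0. */
static void
stat_apply(uint64_t *d, const uint64_t *s, int n, uint64_t neg)
{
	assert(neg == 0 || neg == -1ULL);
	for (int i = 0; i < n; i++)
		d[i] += (s[i] ^ neg) - neg;
}

int
main(void)
{
	uint64_t d[2] = { 10, 20 }, s[2] = { 3, 4 };
	stat_apply(d, s, 2, -1ULL);		/* subtract */
	printf("%llu %llu\n", (unsigned long long)d[0],
	    (unsigned long long)d[1]);		/* prints: 7 16 */
	return (0);
}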

void
ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh,
    const ddt_lightweight_entry_t *ddlwe)
{
	ddt_stat_t dds;
	ddt_histogram_t *ddh;
	int bucket;

	ddt_stat_generate(ddt, dde, &dds);
	ddt_stat_generate(ddt, ddlwe, &dds);

	bucket = highbit64(dds.dds_ref_blocks) - 1;
	ASSERT3U(bucket, >=, 0);
	if (bucket < 0)
		return;

	ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
	ddt_stat_add(&ddh->ddh_stat[bucket], &dds);
}

	ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
void
ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh,
    const ddt_lightweight_entry_t *ddlwe)
{
	ddt_stat_t dds;
	int bucket;

	ddt_stat_generate(ddt, ddlwe, &dds);

	bucket = highbit64(dds.dds_ref_blocks) - 1;
	if (bucket < 0)
		return;

	ddt_stat_sub(&ddh->ddh_stat[bucket], &dds);
}
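Both functions bucket an entry by highbit64(refcount) - 1, i.e. floor(log2) of its reference count: bucket 0 holds refcount 1, bucket 1 holds 2-3, bucket 2 holds 4-7, and so on. A refcount of 0 yields -1, which is why both bail out early. A toy demonstration of the mapping (refcnt_bucket is a hypothetical name; the loop is a portable stand-in for highbit64()):

#include <stdio.h>

/* Stand-in for highbit64(): 1-based index of the highest set bit. */
static int
refcnt_bucket(unsigned long long refcnt)
{
	int hb = 0;
	while (refcnt != 0) {
		hb++;
		refcnt >>= 1;
	}
	return (hb - 1);	/* -1 when refcnt == 0 */
}

int
main(void)
{
	for (unsigned long long r = 0; r <= 8; r++)
		printf("refcnt %llu -> bucket %d\n", r, refcnt_bucket(r));
	return (0);
}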

void
ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
{
	for (int h = 0; h < 64; h++)
		ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
		ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h]);
}

void
ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh)
{
	memset(dds, 0, sizeof (*dds));

	for (int h = 0; h < 64; h++)
		ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
		ddt_stat_add(dds, &ddh->ddh_stat[h]);
}

boolean_t
ddt_histogram_empty(const ddt_histogram_t *ddh)
{
	const uint64_t *s = (const uint64_t *)ddh;
	const uint64_t *s_end = (const uint64_t *)(ddh + 1);
	for (int h = 0; h < 64; h++) {
		const ddt_stat_t *dds = &ddh->ddh_stat[h];

	while (s < s_end)
		if (*s++ != 0)
			return (B_FALSE);
		if (dds->dds_blocks == 0 &&
		    dds->dds_lsize == 0 &&
		    dds->dds_psize == 0 &&
		    dds->dds_dsize == 0 &&
		    dds->dds_ref_blocks == 0 &&
		    dds->dds_ref_lsize == 0 &&
		    dds->dds_ref_psize == 0 &&
		    dds->dds_ref_dsize == 0)
			continue;

		return (B_FALSE);
	}

	return (B_TRUE);
}

@ -170,6 +222,11 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
				ddo_total->ddo_mspace += ddo->ddo_mspace;
			}
		}

		ddt_object_t *ddo = &ddt->ddt_log_stats;
		ddo_total->ddo_count += ddo->ddo_count;
		ddo_total->ddo_dspace += ddo->ddo_dspace;
		ddo_total->ddo_mspace += ddo->ddo_mspace;
	}

	/*

@ -207,6 +264,8 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
				    &ddt->ddt_histogram_cache[type][class]);
			}
		}

		ddt_histogram_add(ddh, &ddt->ddt_log_histogram);
	}
}

@ -217,7 +276,7 @@ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)

	ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
	ddt_get_dedup_histogram(spa, ddh_total);
	ddt_histogram_stat(dds_total, ddh_total);
	ddt_histogram_total(dds_total, ddh_total);
	kmem_free(ddh_total, sizeof (ddt_histogram_t));
}

@ -22,6 +22,7 @@
/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2018 by Delphix. All rights reserved.
 * Copyright (c) 2023, Klara Inc.
 */

#include <sys/zfs_context.h>

@ -51,8 +52,13 @@ ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len)

	ASSERT3U(d_len, >=, s_len + 1);	/* no compression plus version byte */

	c_len = ci->ci_compress((void *)src, dst, s_len, d_len - 1,
	    ci->ci_level);
	/* Call compress function directly to avoid hole detection. */
	abd_t sabd, dabd;
	abd_get_from_buf_struct(&sabd, (void *)src, s_len);
	abd_get_from_buf_struct(&dabd, dst, d_len);
	c_len = ci->ci_compress(&sabd, &dabd, s_len, d_len - 1, ci->ci_level);
	abd_free(&dabd);
	abd_free(&sabd);

	if (c_len == s_len) {
		cpfunc = ZIO_COMPRESS_OFF;

@ -71,12 +77,18 @@ ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
{
	uchar_t version = *src++;
	int cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK;
	zio_compress_info_t *ci = &zio_compress_table[cpfunc];

	if (ci->ci_decompress != NULL)
		(void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
	else
	if (zio_compress_table[cpfunc].ci_decompress == NULL) {
		memcpy(dst, src, d_len);
		return;
	}

	abd_t sabd, dabd;
	abd_get_from_buf_struct(&sabd, src, s_len);
	abd_get_from_buf_struct(&dabd, dst, d_len);
	VERIFY0(zio_decompress_data(cpfunc, &sabd, &dabd, s_len, d_len, NULL));
	abd_free(&dabd);
	abd_free(&sabd);

	if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) !=
(ZFS_HOST_BYTEORDER != 0))
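A recurring pattern in this change: wrap existing plain buffers in caller-owned abd_t structs via abd_get_from_buf_struct(), hand them to the abd-based (de)compress entry points, then free only the wrappers. A minimal sketch of the same shape, with a hypothetical wrap_and_decompress helper whose calls mirror the ones in this diff (kernel context assumed, not independently compilable):

/* Hypothetical helper; signatures follow the calls shown above. */
static int
wrap_and_decompress(enum zio_compress c, void *src, void *dst,
    size_t s_len, size_t d_len)
{
	abd_t sabd, dabd;
	int err;

	abd_get_from_buf_struct(&sabd, src, s_len);	/* no allocation */
	abd_get_from_buf_struct(&dabd, dst, d_len);
	err = zio_decompress_data(c, &sabd, &dabd, s_len, d_len, NULL);
	abd_free(&dabd);	/* releases the wrapper, not the buffer */
	abd_free(&sabd);
	return (err);
}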

@ -108,7 +120,7 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)

static int
ddt_zap_lookup(objset_t *os, uint64_t object,
    const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize)
    const ddt_key_t *ddk, void *phys, size_t psize)
{
	uchar_t *cbuf;
	uint64_t one, csize;

@ -155,7 +167,7 @@ ddt_zap_prefetch_all(objset_t *os, uint64_t object)

static int
ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk,
    const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx)
    const void *phys, size_t psize, dmu_tx_t *tx)
{
	const size_t cbuf_size = psize + 1;

@ -181,7 +193,7 @@ ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk,

static int
ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk,
    ddt_phys_t *phys, size_t psize)
    void *phys, size_t psize)
{
	zap_cursor_t zc;
	zap_attribute_t za;

@ -95,6 +95,12 @@ uint_t dmu_prefetch_max = 8 * 1024 * 1024;
uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
#endif

/*
 * Override copies= for dedup state objects. 0 means the traditional
 * behaviour (i.e. the default for the containing objset, i.e. 3 for the MOS).
 */
uint_t dmu_ddt_copies = 0;

const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "unallocated"		},
	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "object directory"	},

@ -2272,6 +2278,28 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
		case ZFS_REDUNDANT_METADATA_NONE:
			break;
		}

		if (dmu_ddt_copies > 0) {
			/*
			 * If this tuneable is set, and this is a write for a
			 * dedup entry store (zap or log), then we treat it
			 * much like ZFS_REDUNDANT_METADATA_MOST on a
			 * regular dataset: this many copies, and one more for
			 * "higher" indirect blocks. This specific exception is
			 * necessary because dedup objects are stored in the
			 * MOS, which always has the highest possible copies.
			 */
			dmu_object_type_t stype =
			    dn ? dn->dn_storage_type : DMU_OT_NONE;
			if (stype == DMU_OT_NONE)
				stype = type;
			if (stype == DMU_OT_DDT_ZAP) {
				copies = dmu_ddt_copies;
				if (level >=
				    zfs_redundant_metadata_most_ditto_level)
					copies++;
			}
		}
	} else if (wp & WP_NOFILL) {
ASSERT(level == 0);
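A worked sketch of the copies computation in that branch, under assumed example values dmu_ddt_copies = 2 and zfs_redundant_metadata_most_ditto_level = 2 (both hypothetical settings): DDT ZAP levels 0-1 would get 2 copies, level 2 and above would get 3.

/* Illustrative only; mirrors the DMU_OT_DDT_ZAP branch above. */
static int
ddt_zap_copies(int level, int ddt_copies, int most_ditto_level)
{
	int copies = ddt_copies;	/* the tuneable's base value */
	if (level >= most_ditto_level)
		copies++;		/* one extra for high indirects */
	return (copies);
}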

@ -2824,3 +2852,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW,
	"Limit one prefetch call to this size");

/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_ddt_copies, UINT, ZMOD_RW,
	"Override copies= for dedup objects");

@ -1391,7 +1391,7 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
		abd_t *dabd = abd_alloc_linear(
		    drrw->drr_logical_size, B_FALSE);
		err = zio_decompress_data(drrw->drr_compressiontype,
		    abd, abd_to_buf(dabd), abd_get_size(abd),
		    abd, dabd, abd_get_size(abd),
		    abd_get_size(dabd), NULL);

		if (err != 0) {

@ -1407,9 +1407,8 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
		/* Recompress the data */
		abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp),
		    B_FALSE);
		void *buf = abd_to_buf(cabd);
		uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp),
		    abd, &buf, abd_get_size(abd),
		    abd, &cabd, abd_get_size(abd),
		    rwa->os->os_complevel);
		abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize);
		/* Swap in newly compressed data into the abd */

@ -2221,7 +2220,7 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)

			err = zio_decompress_data(
			    drrw->drr_compressiontype,
			    abd, abd_to_buf(decomp_abd),
			    abd, decomp_abd,
			    abd_get_size(abd),
			    abd_get_size(decomp_abd), NULL);

@ -2425,8 +2425,14 @@ get_receive_resume_token_impl(dsl_dataset_t *ds)
	fnvlist_free(token_nv);
	compressed = kmem_alloc(packed_size, KM_SLEEP);

	compressed_size = gzip_compress(packed, compressed,
	/* Call compress function directly to avoid hole detection. */
	abd_t pabd, cabd;
	abd_get_from_buf_struct(&pabd, packed, packed_size);
	abd_get_from_buf_struct(&cabd, compressed, packed_size);
	compressed_size = zfs_gzip_compress(&pabd, &cabd,
	    packed_size, packed_size, 6);
	abd_free(&cabd);
	abd_free(&pabd);

	zio_cksum_t cksum;
	fletcher_4_native_varsize(compressed, compressed_size, &cksum);

@ -630,6 +630,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
		zap_cursor_fini(&zc);
	}

	ddt_walk_init(spa, scn->scn_phys.scn_max_txg);

	spa_scan_stat_init(spa);
	vdev_scan_stat_init(spa->spa_root_vdev);

@ -951,6 +953,8 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)

	memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));

	ddt_walk_init(spa, scn->scn_phys.scn_max_txg);

	dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);

	spa_history_log_internal(spa, "scan setup", tx,

@ -1636,7 +1640,8 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
	    txg_sync_waiting(scn->scn_dp) ||
	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
	    spa_shutting_down(scn->scn_dp->dp_spa) ||
	    (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
	    (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) ||
	    !ddt_walk_ready(scn->scn_dp->dp_spa)) {
		if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
			dprintf("suspending at first available bookmark "
			    "%llx/%llx/%llx/%llx\n",

@ -2929,11 +2934,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)

void
dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
    ddt_entry_t *dde, dmu_tx_t *tx)
    ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
	(void) tx;
	const ddt_key_t *ddk = &dde->dde_key;
	ddt_phys_t *ddp = dde->dde_phys;
	const ddt_key_t *ddk = &ddlwe->ddlwe_key;
	blkptr_t bp;
	zbookmark_phys_t zb = { 0 };

@ -2954,11 +2958,13 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
	if (scn->scn_done_txg != 0)
		return;

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		if (ddp->ddp_phys_birth == 0 ||
		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
		uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v);

		if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg)
			continue;
		ddt_bp_create(checksum, ddk, ddp, &bp);
		ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp);

		scn->scn_visited_this_txg++;
		scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);

@ -3002,11 +3008,11 @@ static void
dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
{
	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
	ddt_entry_t dde = {{{{0}}}};
	ddt_lightweight_entry_t ddlwe = {0};
	int error;
	uint64_t n = 0;

	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &ddlwe)) == 0) {
		ddt_t *ddt;

		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)

@ -3021,16 +3027,28 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
		ASSERT(avl_first(&ddt->ddt_tree) == NULL);

		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx);
		n++;

		if (dsl_scan_check_suspend(scn, NULL))
			break;
	}

	zfs_dbgmsg("scanned %llu ddt entries on %s with class_max = %u; "
	    "suspending=%u", (longlong_t)n, scn->scn_dp->dp_spa->spa_name,
	    (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
	if (error == EAGAIN) {
		dsl_scan_check_suspend(scn, NULL);
		error = 0;

		zfs_dbgmsg("waiting for ddt to become ready for scan "
		    "on %s with class_max = %u; suspending=%u",
		    scn->scn_dp->dp_spa->spa_name,
		    (int)scn->scn_phys.scn_ddt_class_max,
		    (int)scn->scn_suspending);
	} else
		zfs_dbgmsg("scanned %llu ddt entries on %s with "
		    "class_max = %u; suspending=%u", (longlong_t)n,
		    scn->scn_dp->dp_spa->spa_name,
		    (int)scn->scn_phys.scn_ddt_class_max,
		    (int)scn->scn_suspending);

	ASSERT(error == 0 || error == ENOENT);
	ASSERT(error != ENOENT ||

@ -47,8 +47,9 @@ typedef uLongf zlen_t;

#endif

size_t
gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static size_t
zfs_gzip_compress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n)
{
	int ret;
	zlen_t dstlen = d_len;

@ -82,8 +83,9 @@ gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
	return ((size_t)dstlen);
}

int
gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static int
zfs_gzip_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n)
{
	(void) n;
	zlen_t dstlen = d_len;

@ -103,3 +105,6 @@ gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)

	return (0);
}

ZFS_COMPRESS_WRAP_DECL(zfs_gzip_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_gzip_decompress)
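These *_WRAP_DECL macros (defined in the headers, not shown in this diff) turn each buffer-based *_buf routine into the abd-based entry point the new compression table expects. A plausible sketch of what such a wrapper could expand to, assuming it borrows linear views of the abds; this is an illustration of the shape, not the real macro:

/* Hypothetical expansion sketch for a compress wrapper. */
#define	COMPRESS_WRAP_SKETCH(name)					\
size_t									\
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int level)	\
{									\
	void *s = abd_borrow_buf_copy(src, s_len);	/* read view */	\
	void *d = abd_borrow_buf(dst, d_len);		/* write view */\
	size_t c_len = name##_buf(s, d, s_len, d_len, level);		\
	abd_return_buf_copy(dst, d, d_len);	/* copy result back */	\
	abd_return_buf(src, s, s_len);					\
	return (c_len);							\
}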

@ -52,8 +52,8 @@ int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,

static kmem_cache_t *lz4_cache;

size_t
lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
static size_t
zfs_lz4_compress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n)
{
	(void) n;

@ -80,8 +80,8 @@ lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
	return (bufsiz + sizeof (bufsiz));
}

int
lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
static int
zfs_lz4_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n)
{
	(void) n;

@ -100,6 +100,9 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
	    d_start, bufsiz, d_len) < 0);
}

ZFS_COMPRESS_WRAP_DECL(zfs_lz4_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_lz4_decompress)

/*
 * LZ4 API Description:
 *

@ -45,8 +45,9 @@
#define	OFFSET_MASK	((1 << (16 - MATCH_BITS)) - 1)
#define	LEMPEL_SIZE	1024

size_t
lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static size_t
zfs_lzjb_compress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n)
{
	(void) n;
	uchar_t *src = s_start;

@ -100,8 +101,9 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
	return (dst - (uchar_t *)d_start);
}

int
lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static int
zfs_lzjb_decompress_buf(void *s_start, void *d_start,
    size_t s_len, size_t d_len, int n)
{
	(void) s_len, (void) n;
	uchar_t *src = s_start;

@ -130,3 +132,6 @@ lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
	}
	return (0);
}

ZFS_COMPRESS_WRAP_DECL(zfs_lzjb_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_lzjb_decompress)

@ -1040,16 +1040,34 @@ spa_change_guid_sync(void *arg, dmu_tx_t *tx)
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool. We are also going to issue a
 * sysevent to update any watchers.
 *
 * The GUID of the pool will be changed to the value pointed to by guidp.
 * The GUID may not be set to the reserved value of 0.
 * The new GUID will be generated if guidp is NULL.
 */
int
spa_change_guid(spa_t *spa)
spa_change_guid(spa_t *spa, const uint64_t *guidp)
{
	int error;
	uint64_t guid;
	int error;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	if (guidp != NULL) {
		guid = *guidp;
		if (guid == 0) {
			error = SET_ERROR(EINVAL);
			goto out;
		}

		if (spa_guid_exists(guid, 0)) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
	} else {
		guid = spa_generate_guid(NULL);
	}

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

@ -1068,6 +1086,7 @@ spa_change_guid(spa_t *spa)
		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
	}

out:
	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

@ -7602,8 +7621,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) {
		return (spa_vdev_exit(spa, newrootvd, txg,
		    ZFS_ERR_ASHIFT_MISMATCH));
	}

	/*
	 * RAIDZ-expansion-specific checks.

@ -645,7 +645,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
		    DATA_TYPE_INT32, zio->io_error, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
		    DATA_TYPE_INT32, zio->io_flags, NULL);
		    DATA_TYPE_UINT64, zio->io_flags, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
		    DATA_TYPE_UINT32, zio->io_stage, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,

@ -1794,17 +1794,45 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc)
	return (error);
}

/*
 * inputs:
 * zc_nvlist_src	nvlist optionally containing ZPOOL_REGUID_GUID
 * zc_nvlist_src_size	size of the nvlist
 */
static int
zfs_ioc_pool_reguid(zfs_cmd_t *zc)
{
	uint64_t *guidp = NULL;
	nvlist_t *props = NULL;
	spa_t *spa;
	uint64_t guid;
	int error;

	if (zc->zc_nvlist_src_size != 0) {
		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
		    zc->zc_iflags, &props);
		if (error != 0)
			return (error);

		error = nvlist_lookup_uint64(props, ZPOOL_REGUID_GUID, &guid);
		if (error == 0)
			guidp = &guid;
		else if (error == ENOENT)
			guidp = NULL;
		else
			goto out;
	}

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error == 0) {
		error = spa_change_guid(spa);
		error = spa_change_guid(spa, guidp);
		spa_close(spa, FTAG);
	}

out:
	if (props != NULL)
		nvlist_free(props);

	return (error);
}
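A sketch of how a userspace caller might feed the extended ioctl, assuming the usual libzfs plumbing: only the ZPOOL_REGUID_GUID key and ZFS_IOC_POOL_REGUID ioctl are taken from this diff; the helper name and the exact libzfs calls (zcmd_write_src_nvlist, zfs_ioctl) are assumptions based on libzfs internals and may differ:

/* Hypothetical caller; illustrative only. */
static int
reguid_with_guid(libzfs_handle_t *hdl, const char *pool, uint64_t guid)
{
	zfs_cmd_t zc = {"\0"};
	nvlist_t *nvl = fnvlist_alloc();
	int err;

	fnvlist_add_uint64(nvl, ZPOOL_REGUID_GUID, guid);
	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
	err = zcmd_write_src_nvlist(hdl, &zc, nvl);	/* packs zc_nvlist_src */
	if (err == 0)
		err = zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc);
	fnvlist_free(nvl);
	return (err);
}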

465	module/zfs/zio.c

@ -299,10 +299,13 @@ zio_fini(void)
 * ==========================================================================
 */

#ifdef ZFS_DEBUG
static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
#if defined(ZFS_DEBUG) && defined(_KERNEL)
#define	ZFS_ZIO_BUF_CANARY 1
#endif

#ifdef ZFS_ZIO_BUF_CANARY
static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;

/*
 * Use empty space after the buffer to detect overflows.
 *

@ -314,7 +317,6 @@ static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
static void
zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
{
#ifdef ZFS_DEBUG
	size_t off = P2ROUNDUP(size, sizeof (ulong_t));
	ulong_t *canary = p + off / sizeof (ulong_t);
	size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;

@ -323,13 +325,11 @@ zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
		asize = (c + 2) << SPA_MINBLOCKSHIFT;
	for (; off < asize; canary++, off += sizeof (ulong_t))
		*canary = zio_buf_canary;
#endif
}

static void
zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
{
#ifdef ZFS_DEBUG
	size_t off = P2ROUNDUP(size, sizeof (ulong_t));
	ulong_t *canary = p + off / sizeof (ulong_t);
	size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;

@ -343,8 +343,8 @@ zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
			    *canary, zio_buf_canary);
		}
	}
#endif
}
#endif
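The canary machinery above guards the slack between the size a caller asked for and the rounded-up size the kmem cache actually hands back: fill it with a known pattern at allocation, verify it at free. A freestanding toy version of the idea (plain malloc, hypothetical canary_alloc/canary_check names):

#include <assert.h>
#include <stdlib.h>

#define	CANARY	0xdeadc0dedead210bULL

/* Fill the slack between the requested and allocated size on alloc... */
static void *
canary_alloc(size_t want, size_t asize)
{
	unsigned long long *p = malloc(asize);
	for (size_t off = (want + 7) & ~(size_t)7; off + 8 <= asize; off += 8)
		p[off / 8] = CANARY;
	return (p);
}

/* ...and verify it on free: a mismatch means something wrote past the end. */
static void
canary_check(void *buf, size_t want, size_t asize)
{
	unsigned long long *p = buf;
	for (size_t off = (want + 7) & ~(size_t)7; off + 8 <= asize; off += 8)
		assert(p[off / 8] == CANARY);
}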
|
||||
|
||||
/*
|
||||
* Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
|
||||
|
@ -363,7 +363,9 @@ zio_buf_alloc(size_t size)
|
|||
#endif
|
||||
|
||||
void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE);
|
||||
#ifdef ZFS_ZIO_BUF_CANARY
|
||||
zio_buf_put_canary(p, size, zio_buf_cache, c);
|
||||
#endif
|
||||
return (p);
|
||||
}
|
||||
|
||||
|
@ -381,7 +383,9 @@ zio_data_buf_alloc(size_t size)
|
|||
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
||||
|
||||
void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE);
|
||||
#ifdef ZFS_ZIO_BUF_CANARY
|
||||
zio_buf_put_canary(p, size, zio_data_buf_cache, c);
|
||||
#endif
|
||||
return (p);
|
||||
}
|
||||
|
||||
|
@ -395,7 +399,9 @@ zio_buf_free(void *buf, size_t size)
|
|||
atomic_add_64(&zio_buf_cache_frees[c], 1);
|
||||
#endif
|
||||
|
||||
#ifdef ZFS_ZIO_BUF_CANARY
|
||||
zio_buf_check_canary(buf, size, zio_buf_cache, c);
|
||||
#endif
|
||||
kmem_cache_free(zio_buf_cache[c], buf);
|
||||
}
|
||||
|
||||
|
@ -406,7 +412,9 @@ zio_data_buf_free(void *buf, size_t size)
|
|||
|
||||
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
||||
|
||||
#ifdef ZFS_ZIO_BUF_CANARY
|
||||
zio_buf_check_canary(buf, size, zio_data_buf_cache, c);
|
||||
#endif
|
||||
kmem_cache_free(zio_data_buf_cache[c], buf);
|
||||
}
|
||||
|
||||
|
@ -479,11 +487,9 @@ static void
|
|||
zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
|
||||
{
|
||||
if (zio->io_error == 0) {
|
||||
void *tmp = abd_borrow_buf(data, size);
|
||||
int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
|
||||
zio->io_abd, tmp, zio->io_size, size,
|
||||
zio->io_abd, data, zio->io_size, size,
|
||||
&zio->io_prop.zp_complevel);
|
||||
abd_return_buf_copy(data, tmp, size);
|
||||
|
||||
if (zio_injection_enabled && ret == 0)
|
||||
ret = zio_handle_fault_injection(zio, EINVAL);
|
||||
|
@ -530,17 +536,18 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
|
|||
* from the indirect block. We decompress it now and
|
||||
* throw away the result after we are finished.
|
||||
*/
|
||||
tmp = zio_buf_alloc(lsize);
|
||||
abd_t *abd = abd_alloc_linear(lsize, B_TRUE);
|
||||
ret = zio_decompress_data(BP_GET_COMPRESS(bp),
|
||||
zio->io_abd, tmp, zio->io_size, lsize,
|
||||
zio->io_abd, abd, zio->io_size, lsize,
|
||||
&zio->io_prop.zp_complevel);
|
||||
if (ret != 0) {
|
||||
abd_free(abd);
|
||||
ret = SET_ERROR(EIO);
|
||||
goto error;
|
||||
}
|
||||
ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
|
||||
tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
|
||||
zio_buf_free(tmp, lsize);
|
||||
ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
|
||||
abd, lsize, BP_SHOULD_BYTESWAP(bp), mac);
|
||||
abd_free(abd);
|
||||
} else {
|
||||
ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
|
||||
zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
|
||||
|
@ -1858,30 +1865,32 @@ zio_write_compress(zio_t *zio)
|
|||
/* If it's a compressed write that is not raw, compress the buffer. */
|
||||
if (compress != ZIO_COMPRESS_OFF &&
|
||||
!(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
|
||||
void *cbuf = NULL;
|
||||
abd_t *cabd = NULL;
|
||||
if (abd_cmp_zero(zio->io_abd, lsize) == 0)
|
||||
psize = 0;
|
||||
else if (compress == ZIO_COMPRESS_EMPTY)
|
||||
psize = lsize;
|
||||
else
|
||||
psize = zio_compress_data(compress, zio->io_abd, &cbuf,
|
||||
psize = zio_compress_data(compress, zio->io_abd, &cabd,
|
||||
lsize, zp->zp_complevel);
|
||||
if (psize == 0) {
|
||||
compress = ZIO_COMPRESS_OFF;
|
||||
} else if (psize >= lsize) {
|
||||
compress = ZIO_COMPRESS_OFF;
|
||||
if (cbuf != NULL)
|
||||
zio_buf_free(cbuf, lsize);
|
||||
if (cabd != NULL)
|
||||
abd_free(cabd);
|
||||
} else if (!zp->zp_dedup && !zp->zp_encrypt &&
|
||||
psize <= BPE_PAYLOAD_SIZE &&
|
||||
zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
|
||||
spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
|
||||
void *cbuf = abd_borrow_buf_copy(cabd, lsize);
|
||||
encode_embedded_bp_compressed(bp,
|
||||
cbuf, compress, lsize, psize);
|
||||
BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
|
||||
BP_SET_TYPE(bp, zio->io_prop.zp_type);
|
||||
BP_SET_LEVEL(bp, zio->io_prop.zp_level);
|
||||
zio_buf_free(cbuf, lsize);
|
||||
abd_return_buf(cabd, cbuf, lsize);
|
||||
abd_free(cabd);
|
||||
BP_SET_LOGICAL_BIRTH(bp, zio->io_txg);
|
||||
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
||||
ASSERT(spa_feature_is_active(spa,
|
||||
|
@ -1900,14 +1909,12 @@ zio_write_compress(zio_t *zio)
|
|||
psize);
|
||||
if (rounded >= lsize) {
|
||||
compress = ZIO_COMPRESS_OFF;
|
||||
zio_buf_free(cbuf, lsize);
|
||||
abd_free(cabd);
|
||||
psize = lsize;
|
||||
} else {
|
||||
abd_t *cdata = abd_get_from_buf(cbuf, lsize);
|
||||
abd_take_ownership_of_buf(cdata, B_TRUE);
|
||||
abd_zero_off(cdata, psize, rounded - psize);
|
||||
abd_zero_off(cabd, psize, rounded - psize);
|
||||
psize = rounded;
|
||||
zio_push_transform(zio, cdata,
|
||||
zio_push_transform(zio, cabd,
|
||||
psize, lsize, NULL);
|
||||
}
|
||||
}
|
||||
|
@ -3254,17 +3261,21 @@ static void
|
|||
zio_ddt_child_read_done(zio_t *zio)
|
||||
{
|
||||
blkptr_t *bp = zio->io_bp;
|
||||
ddt_t *ddt;
|
||||
ddt_entry_t *dde = zio->io_private;
|
||||
ddt_phys_t *ddp;
|
||||
zio_t *pio = zio_unique_parent(zio);
|
||||
|
||||
mutex_enter(&pio->io_lock);
|
||||
ddp = ddt_phys_select(dde, bp);
|
||||
if (zio->io_error == 0)
|
||||
ddt_phys_clear(ddp); /* this ddp doesn't need repair */
|
||||
ddt = ddt_select(zio->io_spa, bp);
|
||||
|
||||
if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
|
||||
dde->dde_repair_abd = zio->io_abd;
|
||||
if (zio->io_error == 0) {
|
||||
ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
|
||||
/* this phys variant doesn't need repair */
|
||||
ddt_phys_clear(dde->dde_phys, v);
|
||||
}
|
||||
|
||||
if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL)
|
||||
dde->dde_io->dde_repair_abd = zio->io_abd;
|
||||
else
|
||||
abd_free(zio->io_abd);
|
||||
mutex_exit(&pio->io_lock);
|
||||
|
@ -3282,21 +3293,25 @@ zio_ddt_read_start(zio_t *zio)
|
|||
if (zio->io_child_error[ZIO_CHILD_DDT]) {
|
||||
ddt_t *ddt = ddt_select(zio->io_spa, bp);
|
||||
ddt_entry_t *dde = ddt_repair_start(ddt, bp);
|
||||
ddt_phys_t *ddp = dde->dde_phys;
|
||||
ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
|
||||
ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp);
|
||||
ddt_univ_phys_t *ddp = dde->dde_phys;
|
||||
blkptr_t blk;
|
||||
|
||||
ASSERT(zio->io_vsd == NULL);
|
||||
zio->io_vsd = dde;
|
||||
|
||||
if (ddp_self == NULL)
|
||||
if (v_self == DDT_PHYS_NONE)
|
||||
return (zio);
|
||||
|
||||
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
|
||||
if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
|
||||
/* issue I/O for the other copies */
|
||||
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
|
||||
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
|
||||
|
||||
if (ddt_phys_birth(ddp, v) == 0 || v == v_self)
|
||||
continue;
|
||||
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
|
||||
&blk);
|
||||
|
||||
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key,
|
||||
ddp, v, &blk);
|
||||
zio_nowait(zio_read(zio, zio->io_spa, &blk,
|
||||
abd_alloc_for_io(zio->io_size, B_TRUE),
|
||||
zio->io_size, zio_ddt_child_read_done, dde,
|
||||
|
@ -3338,8 +3353,8 @@ zio_ddt_read_done(zio_t *zio)
|
|||
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
|
||||
return (NULL);
|
||||
}
|
||||
if (dde->dde_repair_abd != NULL) {
|
||||
abd_copy(zio->io_abd, dde->dde_repair_abd,
|
||||
if (dde->dde_io->dde_repair_abd != NULL) {
|
||||
abd_copy(zio->io_abd, dde->dde_io->dde_repair_abd,
|
||||
zio->io_size);
|
||||
zio->io_child_error[ZIO_CHILD_DDT] = 0;
|
||||
}
|
||||
|
@ -3372,28 +3387,36 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
|
|||
* loaded).
|
||||
*/
|
||||
|
||||
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
|
||||
zio_t *lio = dde->dde_lead_zio[p];
|
||||
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
|
||||
if (DDT_PHYS_IS_DITTO(ddt, p))
|
||||
continue;
|
||||
|
||||
if (lio != NULL && do_raw) {
|
||||
if (dde->dde_io == NULL)
|
||||
continue;
|
||||
|
||||
zio_t *lio = dde->dde_io->dde_lead_zio[p];
|
||||
if (lio == NULL)
|
||||
continue;
|
||||
|
||||
if (do_raw)
|
||||
return (lio->io_size != zio->io_size ||
|
||||
abd_cmp(zio->io_abd, lio->io_abd) != 0);
|
||||
} else if (lio != NULL) {
|
||||
return (lio->io_orig_size != zio->io_orig_size ||
|
||||
abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
|
||||
}
|
||||
|
||||
return (lio->io_orig_size != zio->io_orig_size ||
|
||||
abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
|
||||
}
|
||||
|
||||
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
|
||||
ddt_phys_t *ddp = &dde->dde_phys[p];
|
||||
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
|
||||
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
|
||||
uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v);
|
||||
|
||||
if (ddp->ddp_phys_birth != 0 && do_raw) {
|
||||
if (phys_birth != 0 && do_raw) {
|
||||
blkptr_t blk = *zio->io_bp;
|
||||
uint64_t psize;
|
||||
abd_t *tmpabd;
|
||||
int error;
|
||||
|
||||
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
|
||||
ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
|
||||
psize = BP_GET_PSIZE(&blk);
|
||||
|
||||
if (psize != zio->io_size)
|
||||
|
@ -3416,13 +3439,13 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
|
|||
abd_free(tmpabd);
|
||||
ddt_enter(ddt);
|
||||
return (error != 0);
|
||||
} else if (ddp->ddp_phys_birth != 0) {
|
||||
} else if (phys_birth != 0) {
|
||||
arc_buf_t *abuf = NULL;
|
||||
arc_flags_t aflags = ARC_FLAG_WAIT;
|
||||
blkptr_t blk = *zio->io_bp;
|
||||
int error;
|
||||
|
||||
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
|
||||
ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
|
||||
|
||||
if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
|
||||
return (B_TRUE);
|
||||
|
@ -3450,50 +3473,87 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
|
|||
}
|
||||
|
||||
static void
|
||||
zio_ddt_child_write_ready(zio_t *zio)
|
||||
zio_ddt_child_write_done(zio_t *zio)
|
||||
{
|
||||
int p = zio->io_prop.zp_copies;
|
||||
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
|
||||
ddt_entry_t *dde = zio->io_private;
|
||||
ddt_phys_t *ddp = &dde->dde_phys[p];
|
||||
zio_t *pio;
|
||||
|
||||
if (zio->io_error)
|
||||
return;
|
||||
zio_link_t *zl = NULL;
|
||||
ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
|
||||
|
||||
int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
|
||||
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
|
||||
ddt_univ_phys_t *ddp = dde->dde_phys;
|
||||
|
||||
ddt_enter(ddt);
|
||||
|
||||
ASSERT(dde->dde_lead_zio[p] == zio);
|
||||
/* we're the lead, so once we're done there's no one else outstanding */
|
||||
if (dde->dde_io->dde_lead_zio[p] == zio)
|
||||
dde->dde_io->dde_lead_zio[p] = NULL;
|
||||
|
||||
ddt_phys_fill(ddp, zio->io_bp);
|
||||
ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys;
|
||||
|
||||
zio_link_t *zl = NULL;
|
||||
while ((pio = zio_walk_parents(zio, &zl)) != NULL)
|
||||
ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
|
||||
if (zio->io_error != 0) {
|
||||
/*
|
||||
* The write failed, so we're about to abort the entire IO
|
||||
* chain. We need to revert the entry back to what it was at
|
||||
* the last time it was successfully extended.
|
||||
*/
|
||||
ddt_phys_copy(ddp, orig, v);
|
||||
ddt_phys_clear(orig, v);
|
||||
|
||||
ddt_exit(ddt);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* We've successfully added new DVAs to the entry. Clear the saved
|
||||
* state or, if there's still outstanding IO, remember it so we can
|
||||
* revert to a known good state if that IO fails.
|
||||
*/
|
||||
if (dde->dde_io->dde_lead_zio[p] == NULL)
|
||||
ddt_phys_clear(orig, v);
|
||||
else
|
||||
ddt_phys_copy(orig, ddp, v);
|
||||
|
||||
/*
|
||||
* Add references for all dedup writes that were waiting on the
|
||||
* physical one, skipping any other physical writes that are waiting.
|
||||
*/
|
||||
zio_t *pio;
|
||||
zl = NULL;
|
||||
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
|
||||
if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
|
||||
ddt_phys_addref(ddp, v);
|
||||
}
|
||||
|
||||
ddt_exit(ddt);
|
||||
}
|
||||
|
||||
static void
|
||||
zio_ddt_child_write_done(zio_t *zio)
|
||||
zio_ddt_child_write_ready(zio_t *zio)
|
||||
{
|
||||
int p = zio->io_prop.zp_copies;
|
||||
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
|
||||
ddt_entry_t *dde = zio->io_private;
|
||||
ddt_phys_t *ddp = &dde->dde_phys[p];
|
||||
|
||||
zio_link_t *zl = NULL;
|
||||
ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
|
||||
|
||||
int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
|
||||
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
|
||||
|
||||
if (zio->io_error != 0)
|
||||
return;
|
||||
|
||||
ddt_enter(ddt);
|
||||
|
||||
ASSERT(ddp->ddp_refcnt == 0);
|
||||
ASSERT(dde->dde_lead_zio[p] == zio);
|
||||
dde->dde_lead_zio[p] = NULL;
|
||||
ddt_phys_extend(dde->dde_phys, v, zio->io_bp);
|
||||
|
||||
if (zio->io_error == 0) {
|
||||
zio_link_t *zl = NULL;
|
||||
while (zio_walk_parents(zio, &zl) != NULL)
|
||||
ddt_phys_addref(ddp);
|
||||
} else {
|
||||
ddt_phys_clear(ddp);
|
||||
zio_t *pio;
|
||||
zl = NULL;
|
||||
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
|
||||
if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
|
||||
ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg);
|
||||
}
|
||||
|
||||
ddt_exit(ddt);
|
||||
|
@ -3506,11 +3566,8 @@ zio_ddt_write(zio_t *zio)
|
|||
blkptr_t *bp = zio->io_bp;
|
||||
uint64_t txg = zio->io_txg;
|
||||
zio_prop_t *zp = &zio->io_prop;
|
||||
int p = zp->zp_copies;
|
||||
zio_t *cio = NULL;
|
||||
ddt_t *ddt = ddt_select(spa, bp);
|
||||
ddt_entry_t *dde;
|
||||
ddt_phys_t *ddp;
|
||||
|
||||
ASSERT(BP_GET_DEDUP(bp));
|
||||
ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
|
||||
|
@ -3518,7 +3575,7 @@ zio_ddt_write(zio_t *zio)
|
|||
ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
|
||||
|
||||
ddt_enter(ddt);
|
||||
dde = ddt_lookup(ddt, bp, B_TRUE);
|
||||
dde = ddt_lookup(ddt, bp);
|
||||
if (dde == NULL) {
|
||||
/* DDT size is over its quota so no new entries */
|
||||
zp->zp_dedup = B_FALSE;
|
||||
|
@ -3528,7 +3585,6 @@ zio_ddt_write(zio_t *zio)
|
|||
ddt_exit(ddt);
|
||||
return (zio);
|
||||
}
|
||||
ddp = &dde->dde_phys[p];
|
||||
|
||||
if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
|
||||
/*
|
||||
|
@ -3553,29 +3609,227 @@ zio_ddt_write(zio_t *zio)
|
|||
return (zio);
|
||||
}
|
||||
|
||||
if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
|
||||
if (ddp->ddp_phys_birth != 0)
|
||||
ddt_bp_fill(ddp, bp, txg);
|
||||
if (dde->dde_lead_zio[p] != NULL)
|
||||
zio_add_child(zio, dde->dde_lead_zio[p]);
|
||||
else
|
||||
ddt_phys_addref(ddp);
|
||||
} else if (zio->io_bp_override) {
|
||||
ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
|
||||
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
|
||||
ddt_phys_fill(ddp, bp);
|
||||
ddt_phys_addref(ddp);
|
||||
} else {
|
||||
cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
|
||||
zio->io_orig_size, zio->io_orig_size, zp,
|
||||
zio_ddt_child_write_ready, NULL,
|
||||
zio_ddt_child_write_done, dde, zio->io_priority,
|
||||
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
|
||||
int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies);
|
||||
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
|
||||
ddt_univ_phys_t *ddp = dde->dde_phys;
|
||||
|
||||
zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
|
||||
dde->dde_lead_zio[p] = cio;
|
||||
/*
|
||||
* In the common cases, at this point we have a regular BP with no
|
||||
* allocated DVAs, and the corresponding DDT entry for its checksum.
|
||||
* Our goal is to fill the BP with enough DVAs to satisfy its copies=
|
||||
* requirement.
|
||||
*
|
||||
* One of three things needs to happen to fulfill this:
|
||||
*
|
||||
* - if the DDT entry has enough DVAs to satisfy the BP, we just copy
|
||||
* them out of the entry and return;
|
||||
*
|
||||
* - if the DDT entry has no DVAs (ie its brand new), then we have to
|
||||
* issue the write as normal so that DVAs can be allocated and the
|
||||
* data land on disk. We then copy the DVAs into the DDT entry on
|
||||
* return.
|
||||
*
|
||||
* - if the DDT entry has some DVAs, but too few, we have to issue the
|
||||
* write, adjusted to have allocate fewer copies. When it returns, we
|
||||
* add the new DVAs to the DDT entry, and update the BP to have the
|
||||
* full amount it originally requested.
|
||||
*
|
||||
* In all cases, if there's already a writing IO in flight, we need to
|
||||
* defer the action until after the write is done. If our action is to
|
||||
* write, we need to adjust our request for additional DVAs to match
|
||||
* what will be in the DDT entry after it completes. In this way every
|
||||
* IO can be guaranteed to recieve enough DVAs simply by joining the
|
||||
* end of the chain and letting the sequence play out.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Number of DVAs in the DDT entry. If the BP is encrypted we ignore
|
||||
* the third one as normal.
|
||||
*/
|
||||
int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp));
|
||||
IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0);
|
||||
|
||||
/* Number of DVAs requested bya the IO. */
|
||||
uint8_t need_dvas = zp->zp_copies;
|
||||
|
||||
/*
|
||||
* What we do next depends on whether or not there's IO outstanding that
|
||||
* will update this entry.
|
||||
*/
|
||||
if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) {
|
||||
/*
|
||||
* No IO outstanding, so we only need to worry about ourselves.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Override BPs bring their own DVAs and their own problems.
|
||||
*/
|
||||
if (zio->io_bp_override) {
|
||||
/*
|
||||
* For a brand-new entry, all the work has been done
|
||||
* for us, and we can just fill it out from the provided
|
||||
* block and leave.
|
||||
*/
|
||||
if (have_dvas == 0) {
|
||||
ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
|
||||
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
|
||||
ddt_phys_extend(ddp, v, bp);
|
||||
ddt_phys_addref(ddp, v);
|
||||
ddt_exit(ddt);
|
||||
return (zio);
|
||||
}
|
||||
|
||||
/*
|
||||
* If we already have this entry, then we want to treat
|
||||
* it like a regular write. To do this we just wipe
|
||||
* them out and proceed like a regular write.
|
||||
*
|
||||
* Even if there are some DVAs in the entry, we still
|
||||
* have to clear them out. We can't use them to fill
|
||||
* out the dedup entry, as they are all referenced
|
||||
* together by a bp already on disk, and will be freed
|
||||
* as a group.
|
||||
*/
|
||||
BP_ZERO_DVAS(bp);
|
||||
BP_SET_BIRTH(bp, 0, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* If there are enough DVAs in the entry to service our request,
|
||||
* then we can just use them as-is.
|
||||
*/
|
||||
if (have_dvas >= need_dvas) {
|
||||
ddt_bp_fill(ddp, v, bp, txg);
|
||||
ddt_phys_addref(ddp, v);
|
||||
ddt_exit(ddt);
|
||||
return (zio);
|
||||
}
|
||||
|
||||
/*
|
||||
* Otherwise, we have to issue IO to fill the entry up to the
|
||||
* amount we need.
|
||||
*/
|
||||
need_dvas -= have_dvas;
|
||||
} else {
|
||||
/*
|
||||
* There's a write in-flight. If there's already enough DVAs on
|
||||
* the entry, then either there were already enough to start
|
||||
* with, or the in-flight IO is between READY and DONE, and so
|
||||
* has extended the entry with new DVAs. Either way, we don't
|
||||
* need to do anything, we can just slot in behind it.
|
||||
*/
|
||||
|
||||
if (zio->io_bp_override) {
|
||||
/*
|
||||
* If there's a write out, then we're soon going to
|
||||
* have our own copies of this block, so clear out the
|
||||
* override block and treat it as a regular dedup
|
||||
* write. See comment above.
|
||||
*/
|
||||
BP_ZERO_DVAS(bp);
|
||||
BP_SET_BIRTH(bp, 0, 0);
|
||||
}
|
||||
|
||||
if (have_dvas >= need_dvas) {
|
||||
/*
|
||||
* A minor point: there might already be enough
|
||||
* committed DVAs in the entry to service our request,
|
||||
* but we don't know which are completed and which are
|
||||
* allocated but not yet written. In this case, should
|
||||
* the IO for the new DVAs fail, we will be on the end
|
||||
* of the IO chain and will also recieve an error, even
|
||||
* though our request could have been serviced.
|
||||
*
|
||||
* This is an extremely rare case, as it requires the
|
||||
* original block to be copied with a request for a
|
||||
* larger number of DVAs, then copied again requesting
|
||||
* the same (or already fulfilled) number of DVAs while
|
||||
* the first request is active, and then that first
|
||||
* request errors. In return, the logic required to
|
||||
* catch and handle it is complex. For now, I'm just
|
||||
* not going to bother with it.
|
||||
*/
|
||||
|
||||
/*
|
||||
* We always fill the bp here as we may have arrived
|
||||
* after the in-flight write has passed READY, and so
|
||||
* missed out.
|
||||
*/
|
||||
ddt_bp_fill(ddp, v, bp, txg);
|
||||
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
|
||||
ddt_exit(ddt);
|
||||
return (zio);
|
||||
}
|
||||
|
||||
/*
|
||||
* There's not enough in the entry yet, so we need to look at
|
||||
* the write in-flight and see how many DVAs it will have once
|
||||
* it completes.
|
||||
*
|
||||
* The in-flight write has potentially had its copies request
|
||||
* reduced (if we're filling out an existing entry), so we need
|
||||
* to reach in and get the original write to find out what it is
|
||||
* expecting.
|
||||
*
|
||||
* Note that the parent of the lead zio will always have the
|
||||
* highest zp_copies of any zio in the chain, because ones that
|
||||
* can be serviced without additional IO are always added to
|
||||
* the back of the chain.
|
||||
*/
|
||||
zio_link_t *zl = NULL;
|
||||
zio_t *pio =
|
||||
zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl);
|
||||
ASSERT(pio);
|
||||
uint8_t parent_dvas = pio->io_prop.zp_copies;
|
||||
|
||||
if (parent_dvas >= need_dvas) {
|
||||
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
|
||||
ddt_exit(ddt);
|
||||
return (zio);
|
||||
}
|
||||
|
||||
/*
|
||||
* Still not enough, so we will need to issue to get the
|
||||
* shortfall.
|
||||
*/
|
||||
need_dvas -= parent_dvas;
|
||||
}
|
||||
|
||||
/*
|
||||
* We need to write. We will create a new write with the copies
|
||||
* property adjusted to match the number of DVAs we need to need to
|
||||
* grow the DDT entry by to satisfy the request.
|
||||
*/
|
||||
zio_prop_t czp = *zp;
|
||||
czp.zp_copies = need_dvas;
|
||||
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
|
||||
zio->io_orig_size, zio->io_orig_size, &czp,
|
||||
zio_ddt_child_write_ready, NULL,
|
||||
zio_ddt_child_write_done, dde, zio->io_priority,
|
||||
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
|
||||
|
||||
zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
|
||||
|
||||
/*
|
||||
* We are the new lead zio, because our parent has the highest
|
||||
* zp_copies that has been requested for this entry so far.
|
||||
*/
|
||||
ddt_alloc_entry_io(dde);
|
||||
if (dde->dde_io->dde_lead_zio[p] == NULL) {
|
||||
/*
|
||||
* First time out, take a copy of the stable entry to revert
|
||||
* to if there's an error (see zio_ddt_child_write_done())
|
||||
*/
|
||||
ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v);
|
||||
} else {
|
||||
/*
|
||||
* Make the existing chain our child, because it cannot
|
||||
* complete until we have.
|
||||
*/
|
||||
zio_add_child(cio, dde->dde_io->dde_lead_zio[p]);
|
||||
}
|
||||
dde->dde_io->dde_lead_zio[p] = cio;
|
||||
|
||||
ddt_exit(ddt);
|
||||
|
||||
zio_nowait(cio);
|
||||
|
@ -3591,18 +3845,17 @@ zio_ddt_free(zio_t *zio)
|
|||
spa_t *spa = zio->io_spa;
|
||||
blkptr_t *bp = zio->io_bp;
|
||||
ddt_t *ddt = ddt_select(spa, bp);
|
||||
ddt_entry_t *dde;
|
||||
ddt_phys_t *ddp;
|
||||
ddt_entry_t *dde = NULL;
|
||||
|
||||
ASSERT(BP_GET_DEDUP(bp));
|
||||
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
||||
|
||||
ddt_enter(ddt);
|
||||
freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
|
||||
freedde = dde = ddt_lookup(ddt, bp);
|
||||
if (dde) {
|
||||
ddp = ddt_phys_select(dde, bp);
|
||||
if (ddp)
|
||||
ddt_phys_decref(ddp);
|
||||
ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
|
||||
if (v != DDT_PHYS_NONE)
|
||||
ddt_phys_decref(dde->dde_phys, v);
|
||||
}
|
||||
ddt_exit(ddt);
|
||||
|
||||
|
|
|
@ -29,7 +29,7 @@
|
|||
|
||||
/*
|
||||
* Copyright (c) 2013, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2019, Klara Inc.
|
||||
* Copyright (c) 2019, 2024, Klara, Inc.
|
||||
* Copyright (c) 2019, Allan Jude
|
||||
*/
|
||||
|
||||
|
@ -48,26 +48,42 @@ static unsigned long zio_decompress_fail_fraction = 0;
|
|||
|
||||
/*
|
||||
* Compression vectors.
|
||||
*
|
||||
* NOTE: DO NOT CHANGE THE NAMES OF THESE COMPRESSION FUNCTIONS.
|
||||
* THEY ARE USED AS ZAP KEY NAMES BY FAST DEDUP AND THEREFORE
|
||||
* PART OF THE ON-DISK FORMAT.
|
||||
*/
|
||||
zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
|
||||
{"inherit", 0, NULL, NULL, NULL},
|
||||
{"on", 0, NULL, NULL, NULL},
|
||||
{"uncompressed", 0, NULL, NULL, NULL},
|
||||
{"lzjb", 0, lzjb_compress, lzjb_decompress, NULL},
|
||||
{"empty", 0, NULL, NULL, NULL},
|
||||
{"gzip-1", 1, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-2", 2, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-3", 3, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-4", 4, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-5", 5, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-6", 6, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-7", 7, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-8", 8, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-9", 9, gzip_compress, gzip_decompress, NULL},
|
||||
{"zle", 64, zle_compress, zle_decompress, NULL},
|
||||
{"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL},
|
||||
{"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress_wrap,
|
||||
zfs_zstd_decompress, zfs_zstd_decompress_level},
|
||||
{"inherit", 0, NULL, NULL, NULL},
|
||||
{"on", 0, NULL, NULL, NULL},
|
||||
{"uncompressed", 0, NULL, NULL, NULL},
|
||||
{"lzjb", 0,
|
||||
zfs_lzjb_compress, zfs_lzjb_decompress, NULL},
|
||||
{"empty", 0, NULL, NULL, NULL},
|
||||
{"gzip-1", 1,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-2", 2,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-3", 3,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-4", 4,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-5", 5,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-6", 6,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-7", 7,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-8", 8,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-9", 9,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"zle", 64,
|
||||
zfs_zle_compress, zfs_zle_decompress, NULL},
|
||||
{"lz4", 0,
|
||||
zfs_lz4_compress, zfs_lz4_decompress, NULL},
|
||||
{"zstd", ZIO_ZSTD_LEVEL_DEFAULT,
|
||||
zfs_zstd_compress, zfs_zstd_decompress, zfs_zstd_decompress_level},
|
||||
};
|
||||
|
||||
uint8_t
|
||||
|
@ -112,20 +128,16 @@ zio_compress_select(spa_t *spa, enum zio_compress child,
}

size_t
zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len,
zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst, size_t s_len,
uint8_t level)
{
size_t c_len, d_len;
uint8_t complevel;
zio_compress_info_t *ci = &zio_compress_table[c];

ASSERT3U(c, <, ZIO_COMPRESS_FUNCTIONS);
ASSERT3U(ci->ci_compress, !=, NULL);
ASSERT3U(s_len, >, 0);

/* Compress at least 12.5% */
d_len = s_len - (s_len >> 3);

complevel = ci->ci_level;

if (c == ZIO_COMPRESS_ZSTD) {

@ -142,12 +154,12 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len,
}

if (*dst == NULL)
*dst = zio_buf_alloc(s_len);
*dst = abd_alloc_sametype(src, s_len);

/* No compression algorithms can read from ABDs directly */
void *tmp = abd_borrow_buf_copy(src, s_len);
c_len = ci->ci_compress(tmp, *dst, s_len, d_len, complevel);
abd_return_buf(src, tmp, s_len);
/* Compress at least 12.5%, but limit to the size of the dest abd. */
d_len = MIN(s_len - (s_len >> 3), abd_get_size(*dst));

c_len = ci->ci_compress(src, *dst, s_len, d_len, complevel);

if (c_len > d_len)
return (s_len);

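These two hunks convert zio_compress_data() to operate on ABDs end to end: when the caller passes *dst == NULL, a destination ABD of the source's type is allocated; the "save at least 12.5%" target is clamped to the destination's actual size; and a result larger than that target is reported as s_len, telling the caller to store the block uncompressed. A standalone model of that return convention (plain sizes instead of ABDs; all names here are local to the sketch):

#include <stddef.h>
#include <stdio.h>

#define MIN(a, b)   ((a) < (b) ? (a) : (b))

/*
 * Standalone model of the new contract: the savings target is
 * clamped to the destination size, and a result larger than the
 * target is reported as s_len, meaning "store the block raw".
 */
static size_t
compress_result(size_t s_len, size_t dst_size, size_t c_len)
{
    size_t d_len = MIN(s_len - (s_len >> 3), dst_size);
    return (c_len > d_len ? s_len : c_len);
}

int
main(void)
{
    /* 128K source, 128K dest: 100000 bytes beats the 112K target. */
    printf("%zu\n", compress_result(131072, 131072, 100000));
    /* Same source, 64K dest: the same 100000 bytes no longer fits. */
    printf("%zu\n", compress_result(131072, 65536, 100000));
    return (0);
}
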
@ -157,26 +169,18 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len,
}

int
zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
zio_decompress_data(enum zio_compress c, abd_t *src, abd_t *dst,
size_t s_len, size_t d_len, uint8_t *level)
{
zio_compress_info_t *ci = &zio_compress_table[c];
if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
return (SET_ERROR(EINVAL));

int err;
if (ci->ci_decompress_level != NULL && level != NULL)
return (ci->ci_decompress_level(src, dst, s_len, d_len, level));

return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
}

int
zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
size_t s_len, size_t d_len, uint8_t *level)
{
void *tmp = abd_borrow_buf_copy(src, s_len);
int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len, level);
abd_return_buf(src, tmp, s_len);
err = ci->ci_decompress_level(src, dst, s_len, d_len, level);
else
err = ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);

/*
* Decompression shouldn't fail, because we've already verified

@ -185,9 +189,9 @@ zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
*/
if (zio_decompress_fail_fraction != 0 &&
random_in_range(zio_decompress_fail_fraction) == 0)
ret = SET_ERROR(EINVAL);
err = SET_ERROR(EINVAL);

return (ret);
return (err);
}

int

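The old zio_decompress_data_buf()/zio_decompress_data() pair collapses into a single ABD-based entry point that validates the enum, prefers the level-recovering callback when the caller wants the stored level, and otherwise uses the plain callback with the table's default level. A standalone sketch of that dispatch (illustrative types; plain errno stands in for SET_ERROR):

#include <errno.h>
#include <stddef.h>

/* Illustrative callback types; not the ZFS signatures. */
typedef int (*dec_fn)(const void *src, void *dst, size_t s_len,
    size_t d_len, int level);
typedef int (*dec_lvl_fn)(const void *src, void *dst, size_t s_len,
    size_t d_len, unsigned char *level);

typedef struct {
    dec_fn dec;             /* NULL for pass-through entries */
    dec_lvl_fn dec_level;   /* set only for zstd-style codecs */
    int default_level;
} dec_info_t;

int
decompress_dispatch(const dec_info_t *ci, const void *src, void *dst,
    size_t s_len, size_t d_len, unsigned char *level)
{
    if (ci->dec == NULL)
        return (EINVAL);
    /* Prefer the variant that can report the stored level. */
    if (ci->dec_level != NULL && level != NULL)
        return (ci->dec_level(src, dst, s_len, d_len, level));
    return (ci->dec(src, dst, s_len, d_len, ci->default_level));
}
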
@ -34,8 +34,9 @@
#include <sys/sysmacros.h>
#include <sys/zio_compress.h>

size_t
zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static size_t
zfs_zle_compress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{
uchar_t *src = s_start;
uchar_t *dst = d_start;

@ -64,8 +65,9 @@ zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
return (src == s_end ? dst - (uchar_t *)d_start : s_len);
}

int
zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static int
zfs_zle_decompress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{
uchar_t *src = s_start;
uchar_t *dst = d_start;

@ -89,3 +91,6 @@ zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
}
return (dst == d_end ? 0 : -1);
}

ZFS_COMPRESS_WRAP_DECL(zfs_zle_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zle_decompress)

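The pattern here repeats across the compression files: the buffer-based worker gains a _buf suffix and becomes static, while a ZFS_*_WRAP_DECL macro declares the public ABD-based entry point. The macro bodies are not part of this diff, so the following is only a guess at their general shape, with abd_t reduced to a flat buffer and the borrow/return of linear views elided:

#include <stddef.h>

/* Stand-in for abd_t: a flat, linear buffer. */
typedef struct {
    void *buf;
    size_t size;
} abd_t;

/*
 * Hypothetical expansion: declare an ABD-based wrapper named `name`
 * that forwards to the static buffer-based worker `name_buf`.
 */
#define COMPRESS_WRAP_DECL(name)                                    \
size_t                                                              \
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n)     \
{                                                                   \
    return (name##_buf(src->buf, dst->buf, s_len, d_len, n));       \
}

/* A trivial worker, standing in for zfs_zle_compress_buf(). */
static size_t
demo_compress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n)
{
    (void) s_start; (void) d_start; (void) n;
    return (s_len < d_len ? s_len : d_len);
}

COMPRESS_WRAP_DECL(demo_compress)
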
@ -429,68 +429,9 @@ zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
return (1);
}

size_t
zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
int level)
{
int16_t zstd_level;
if (zstd_enum_to_level(level, &zstd_level)) {
ZSTDSTAT_BUMP(zstd_stat_com_inval);
return (s_len);
}
/*
* A zstd early abort heuristic.
*
* - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
* 128k), don't try any of this, just go.
* (because experimentally that was a reasonable cutoff for a perf win
* with tiny ratio change)
* - First, we try LZ4 compression, and if it doesn't early abort, we
* jump directly to whatever compression level we intended to try.
* - Second, we try zstd-1 - if that errors out (usually, but not
* exclusively, if it would overflow), we give up early.
*
* If it works, we instead go on and compress anyway.
*
* Why two passes? LZ4 alone gets you a lot of the way, but on highly
* compressible data, it was losing up to 8.5% of the compressed
* savings versus no early abort, and all the zstd-fast levels are
* worse indications on their own than LZ4, and don't improve the LZ4
* pass noticeably if stacked like this.
*/
size_t actual_abort_size = zstd_abort_size;
if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
s_len >= actual_abort_size) {
int pass_len = 1;
pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0);
if (pass_len < d_len) {
ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
goto keep_trying;
}
ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
ZIO_ZSTD_LEVEL_1);
if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
return (s_len);
}
ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
} else {
ZSTDSTAT_BUMP(zstd_stat_passignored);
if (s_len < actual_abort_size) {
ZSTDSTAT_BUMP(zstd_stat_passignored_size);
}
}
keep_trying:
return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));

}

/* Compress block using zstd */
size_t
zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
static size_t
zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,
int level)
{
size_t c_len;

@ -594,9 +535,73 @@ zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
return (c_len + sizeof (*hdr));
}

static size_t
zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len,
int level)
{
int16_t zstd_level;
if (zstd_enum_to_level(level, &zstd_level)) {
ZSTDSTAT_BUMP(zstd_stat_com_inval);
return (s_len);
}
/*
* A zstd early abort heuristic.
*
* - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
* 128k), don't try any of this, just go.
* (because experimentally that was a reasonable cutoff for a perf win
* with tiny ratio change)
* - First, we try LZ4 compression, and if it doesn't early abort, we
* jump directly to whatever compression level we intended to try.
* - Second, we try zstd-1 - if that errors out (usually, but not
* exclusively, if it would overflow), we give up early.
*
* If it works, we instead go on and compress anyway.
*
* Why two passes? LZ4 alone gets you a lot of the way, but on highly
* compressible data, it was losing up to 8.5% of the compressed
* savings versus no early abort, and all the zstd-fast levels are
* worse indications on their own than LZ4, and don't improve the LZ4
* pass noticeably if stacked like this.
*/
size_t actual_abort_size = zstd_abort_size;
if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
s_len >= actual_abort_size) {
int pass_len = 1;
abd_t sabd, dabd;
abd_get_from_buf_struct(&sabd, s_start, s_len);
abd_get_from_buf_struct(&dabd, d_start, d_len);
pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
abd_free(&dabd);
abd_free(&sabd);
if (pass_len < d_len) {
ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
goto keep_trying;
}
ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len,
d_len, ZIO_ZSTD_LEVEL_1);
if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
return (s_len);
}
ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
} else {
ZSTDSTAT_BUMP(zstd_stat_passignored);
if (s_len < actual_abort_size) {
ZSTDSTAT_BUMP(zstd_stat_passignored_size);
}
}
keep_trying:
return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level));

}

/* Decompress block using zstd and return its stored level */
int
zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
static int
zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, uint8_t *level)
{
ZSTD_DCtx *dctx;

@ -671,15 +676,20 @@ zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
}

/* Decompress datablock using zstd */
int
zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
int level __maybe_unused)
static int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int level __maybe_unused)
{

return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
NULL));
}

ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress)
ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level)

/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)

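zfs_zstd_compress_buf() keeps the early-abort heuristic but now routes the LZ4 probe through the ABD entry point, wrapping the raw buffers in caller-owned abd structs (abd_get_from_buf_struct()/abd_free()) just for that call. The decision ladder itself is unchanged; a standalone model of it follows (stub probes and parameter names are local to the sketch; the thresholds correspond to the zstd_earlyabort_pass, zstd_cutoff_level, and zstd_abort_size tunables named in the comment):

#include <stddef.h>

/* Stub probes standing in for the LZ4 and zstd-1 passes. */
static int
lz4_probe_shrank(size_t s_len, size_t d_len)
{
    (void) s_len; (void) d_len;
    return (0);
}

static int
zstd1_probe_shrank(size_t s_len, size_t d_len)
{
    (void) s_len; (void) d_len;
    return (0);
}

/*
 * Returns 1 when the expensive zstd level is worth running,
 * 0 when the block should be stored uncompressed.
 */
int
early_abort_keep_trying(int zstd_level, size_t s_len, size_t d_len,
    int earlyabort_pass, int cutoff_level, size_t abort_size)
{
    /* Small blocks and low levels skip the heuristic entirely. */
    if (earlyabort_pass <= 0 || zstd_level < cutoff_level ||
        s_len < abort_size)
        return (1);
    /* Pass 1: cheap LZ4 probe. */
    if (lz4_probe_shrank(s_len, d_len))
        return (1);
    /* Pass 2: zstd-1 probe, catching what LZ4 misses. */
    if (zstd1_probe_shrank(s_len, d_len))
        return (1);
    return (0);
}
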
@ -145,6 +145,24 @@ for kernel_version in %{?kernel_versions}; do
%{?kernel_cc} \
%{?kernel_ld} \
%{?kernel_llvm}

# Pre-6.10 kernel builds didn't need to copy the source files over to the
# build directory. However, post-6.10 we do need to, due to these commits:
#
# b1992c3772e6 kbuild: use $(src) instead of $(srctree)/$(src) for source
# directory
#
# 9a0ebe5011f4 kbuild: use $(obj)/ instead of $(src)/ for common pattern
# rules
#
# Note that kmodtool actually copies the source over into the build
# directory, so what we're doing here is normal. For efficiency reasons,
# though, we just use hardlinks instead of copying.
#
# See https://github.com/openzfs/zfs/issues/16439 for more info.
cp -lR ../%{module}-%{version}/module/* module/

make %{?_smp_mflags}
cd ..
done

@ -514,6 +514,10 @@ tags = ['functional', 'cli_root', 'zpool_offline']
tests = ['zpool_online_001_pos', 'zpool_online_002_neg']
tags = ['functional', 'cli_root', 'zpool_online']

[tests/functional/cli_root/zpool_reguid]
tests = ['zpool_reguid_001_pos', 'zpool_reguid_002_neg']
tags = ['functional', 'cli_root', 'zpool_reguid']

[tests/functional/cli_root/zpool_remove]
tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos',
'zpool_remove_003_pos']

@ -672,7 +676,9 @@ post =
tags = ['functional', 'deadman']

[tests/functional/dedup]
tests = ['dedup_quota']
tests = ['dedup_legacy_create', 'dedup_fdt_create', 'dedup_fdt_import',
'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade',
'dedup_legacy_fdt_mixed', 'dedup_quota']
pre =
post =
tags = ['functional', 'dedup']

@ -24,7 +24,7 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/badsend

scripts_zfs_tests_bin_PROGRAMS += %D%/btree_test
%C%_btree_test_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
%C%_btree_test_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
%C%_btree_test_LDADD = \
libzpool.la \
libzfs_core.la

@ -31,6 +31,7 @@ DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift
DDT_ZAP_DEFAULT_BS dedup.ddt_zap_default_bs ddt_zap_default_bs
DDT_ZAP_DEFAULT_IBS dedup.ddt_zap_default_ibs ddt_zap_default_ibs
DDT_DATA_IS_SPECIAL ddt_data_is_special zfs_ddt_data_is_special
DEDUP_LOG_TXG_MAX dedup.log_txg_max zfs_dedup_log_txg_max
DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms
DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second
DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode

@ -1424,6 +1424,12 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/deadman/deadman_zio.ksh \
functional/dedup/cleanup.ksh \
functional/dedup/setup.ksh \
functional/dedup/dedup_fdt_create.ksh \
functional/dedup/dedup_fdt_import.ksh \
functional/dedup/dedup_legacy_create.ksh \
functional/dedup/dedup_legacy_import.ksh \
functional/dedup/dedup_legacy_fdt_upgrade.ksh \
functional/dedup/dedup_legacy_fdt_mixed.ksh \
functional/dedup/dedup_quota.ksh \
functional/delegate/cleanup.ksh \
functional/delegate/setup.ksh \

@ -55,7 +55,7 @@ function display_status
((ret |= $?))

typeset mntpnt=$(get_prop mountpoint $pool)
dd if=/dev/random of=$mntpnt/testfile.$$ &
dd if=/dev/urandom of=$mntpnt/testfile.$$ &
typeset pid=$!

zpool iostat -v 1 3 > /dev/null

@ -54,7 +54,7 @@ log_must truncate -s 1G $VDEV

log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV

log_must dd if=/dev/random of=/$TESTPOOL/file1 bs=1 count=1000
log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1 count=1000

ulimit -f 2
log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 all

@ -109,5 +109,6 @@ if is_linux || is_freebsd; then
"feature@block_cloning"
"feature@vdev_zaps_v2"
"feature@raidz_expansion"
"feature@fast_dedup"
)
fi

@ -95,6 +95,10 @@ while (( i < 16384 )); do
done
((i += 1))
done

# Force the DDT logs to disk with a scrub so they can be prefetched
log_must zpool scrub -w $TESTPOOL

log_note "Dataset generation completed."

typeset -A generated

@ -0,0 +1,6 @@
pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_reguid
dist_pkgdata_SCRIPTS = \
setup.ksh \
cleanup.ksh \
zpool_reguid_001_pos.ksh \
zpool_reguid_002_neg.ksh

@ -0,0 +1,32 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#

. $STF_SUITE/include/libtest.shlib

verify_runnable "global"

default_cleanup

@ -0,0 +1,34 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#

. $STF_SUITE/include/libtest.shlib

verify_runnable "global"

DISK=${DISKS%% *}

default_setup $DISK