Merge pull request #207 from truenas/truenas/zfs-2.2.3-staging-2

Sync with upstream zfs-2.2.3-staging for Dragonfish BETA.1
Alexander Motin 2024-01-17 11:07:23 -05:00 committed by GitHub
commit fc0a9f0cda
73 changed files with 1913 additions and 649 deletions


@ -168,7 +168,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
(u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
(u_longlong_t)lr->lr_length);
if (txtype == TX_WRITE2 || verbose < 5)
if (txtype == TX_WRITE2 || verbose < 4)
return;
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
@ -178,6 +178,8 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
"will claim" : "won't claim");
print_log_bp(bp, tab_prefix);
if (verbose < 5)
return;
if (BP_IS_HOLE(bp)) {
(void) printf("\t\t\tLSIZE 0x%llx\n",
(u_longlong_t)BP_GET_LSIZE(bp));
@ -202,6 +204,9 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
if (error)
goto out;
} else {
if (verbose < 5)
return;
/* data is stored after the end of the lr_write record */
data = abd_alloc(lr->lr_length, B_FALSE);
abd_copy_from_buf(data, lr + 1, lr->lr_length);
@ -217,6 +222,28 @@ out:
abd_free(data);
}
static void
zil_prt_rec_write_enc(zilog_t *zilog, int txtype, const void *arg)
{
(void) txtype;
const lr_write_t *lr = arg;
const blkptr_t *bp = &lr->lr_blkptr;
int verbose = MAX(dump_opt['d'], dump_opt['i']);
(void) printf("%s(encrypted)\n", tab_prefix);
if (verbose < 4)
return;
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
(void) printf("%shas blkptr, %s\n", tab_prefix,
!BP_IS_HOLE(bp) &&
bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
"will claim" : "won't claim");
print_log_bp(bp, tab_prefix);
}
}
static void
zil_prt_rec_truncate(zilog_t *zilog, int txtype, const void *arg)
{
@ -312,11 +339,34 @@ zil_prt_rec_clone_range(zilog_t *zilog, int txtype, const void *arg)
{
(void) zilog, (void) txtype;
const lr_clone_range_t *lr = arg;
int verbose = MAX(dump_opt['d'], dump_opt['i']);
(void) printf("%sfoid %llu, offset %llx, length %llx, blksize %llx\n",
tab_prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
(u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blksz);
if (verbose < 4)
return;
for (unsigned int i = 0; i < lr->lr_nbps; i++) {
(void) printf("%s[%u/%llu] ", tab_prefix, i + 1,
(u_longlong_t)lr->lr_nbps);
print_log_bp(&lr->lr_bps[i], "");
}
}
static void
zil_prt_rec_clone_range_enc(zilog_t *zilog, int txtype, const void *arg)
{
(void) zilog, (void) txtype;
const lr_clone_range_t *lr = arg;
int verbose = MAX(dump_opt['d'], dump_opt['i']);
(void) printf("%s(encrypted)\n", tab_prefix);
if (verbose < 4)
return;
for (unsigned int i = 0; i < lr->lr_nbps; i++) {
(void) printf("%s[%u/%llu] ", tab_prefix, i + 1,
(u_longlong_t)lr->lr_nbps);
@ -327,6 +377,7 @@ zil_prt_rec_clone_range(zilog_t *zilog, int txtype, const void *arg)
typedef void (*zil_prt_rec_func_t)(zilog_t *, int, const void *);
typedef struct zil_rec_info {
zil_prt_rec_func_t zri_print;
zil_prt_rec_func_t zri_print_enc;
const char *zri_name;
uint64_t zri_count;
} zil_rec_info_t;
@ -341,7 +392,9 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
{.zri_print = zil_prt_rec_remove, .zri_name = "TX_RMDIR "},
{.zri_print = zil_prt_rec_link, .zri_name = "TX_LINK "},
{.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME "},
{.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE "},
{.zri_print = zil_prt_rec_write,
.zri_print_enc = zil_prt_rec_write_enc,
.zri_name = "TX_WRITE "},
{.zri_print = zil_prt_rec_truncate, .zri_name = "TX_TRUNCATE "},
{.zri_print = zil_prt_rec_setattr, .zri_name = "TX_SETATTR "},
{.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_V0 "},
@ -358,6 +411,7 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
{.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_EXCHANGE "},
{.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_WHITEOUT "},
{.zri_print = zil_prt_rec_clone_range,
.zri_print_enc = zil_prt_rec_clone_range_enc,
.zri_name = "TX_CLONE_RANGE "},
};
@ -384,6 +438,8 @@ print_log_record(zilog_t *zilog, const lr_t *lr, void *arg, uint64_t claim_txg)
if (txtype && verbose >= 3) {
if (!zilog->zl_os->os_encrypted) {
zil_rec_info[txtype].zri_print(zilog, txtype, lr);
} else if (zil_rec_info[txtype].zri_print_enc) {
zil_rec_info[txtype].zri_print_enc(zilog, txtype, lr);
} else {
(void) printf("%s(encrypted)\n", tab_prefix);
}


@ -2,12 +2,15 @@ dnl #
dnl # 4.9, current_time() added
dnl # 4.18, return type changed from timespec to timespec64
dnl #
dnl # Note that we don't care about the return type in this check. If we have
dnl # to implement a fallback, we'll know we're <4.9, which was timespec.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_CURRENT_TIME], [
ZFS_LINUX_TEST_SRC([current_time], [
#include <linux/fs.h>
], [
struct inode ip __attribute__ ((unused));
ip.i_atime = current_time(&ip);
(void) current_time(&ip);
])
])
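If this probe fails we know we are on a pre-4.9 kernel, where inode times were struct timespec. A minimal sketch of the kind of fallback that implies, modeled on the existing shim in include/os/linux/kernel/linux/vfs_compat.h (hedged; the real shim may differ):

#if !defined(HAVE_CURRENT_TIME)
/* Pre-4.9 fallback: truncate the current kernel time to the time
 * granularity of the inode's superblock, as current_time() does. */
static inline struct timespec
current_time(struct inode *ip)
{
	return (timespec_trunc(current_kernel_time(),
	    ip->i_sb->s_time_gran));
}
#endif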


@ -1,7 +1,8 @@
dnl #
dnl # Starting from Linux 5.13, flush_dcache_page() becomes an inline
dnl # function and may indirectly reference GPL-only cpu_feature_keys on
dnl # powerpc
dnl # function and may indirectly reference GPL-only symbols:
dnl # on powerpc: cpu_feature_keys
dnl # on riscv: PageHuge (added in 6.2)
dnl #


@ -79,6 +79,12 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_FPU], [
__kernel_fpu_end();
], [], [])
ZFS_LINUX_TEST_SRC([kernel_neon], [
#include <asm/neon.h>
], [
kernel_neon_begin();
kernel_neon_end();
], [], [ZFS_META_LICENSE])
])
AC_DEFUN([ZFS_AC_KERNEL_FPU], [
@ -105,9 +111,20 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [
AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1,
[kernel exports FPU functions])
],[
AC_MSG_RESULT(internal)
AC_DEFINE(HAVE_KERNEL_FPU_INTERNAL, 1,
[kernel fpu internal])
dnl #
dnl # ARM neon symbols (only on arm and arm64)
dnl # could be GPL-only on arm64 after Linux 6.2
dnl #
ZFS_LINUX_TEST_RESULT([kernel_neon_license],[
AC_MSG_RESULT(kernel_neon_*)
AC_DEFINE(HAVE_KERNEL_NEON, 1,
[kernel has kernel_neon_* functions])
],[
# catch-all
AC_MSG_RESULT(internal)
AC_DEFINE(HAVE_KERNEL_FPU_INTERNAL, 1,
[kernel fpu internal])
])
])
])
])


@ -52,6 +52,48 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [
memset(&ip, 0, sizeof(ip));
inode_set_ctime_to_ts(&ip, ts);
])
dnl #
dnl # 6.7 API change
dnl # i_atime/i_mtime no longer directly accessible, must use
dnl # inode_get_mtime(ip), inode_set_mtime*(ip) to
dnl # read/write.
dnl #
ZFS_LINUX_TEST_SRC([inode_get_atime], [
#include <linux/fs.h>
],[
struct inode ip;
memset(&ip, 0, sizeof(ip));
inode_get_atime(&ip);
])
ZFS_LINUX_TEST_SRC([inode_get_mtime], [
#include <linux/fs.h>
],[
struct inode ip;
memset(&ip, 0, sizeof(ip));
inode_get_mtime(&ip);
])
ZFS_LINUX_TEST_SRC([inode_set_atime_to_ts], [
#include <linux/fs.h>
],[
struct inode ip;
struct timespec64 ts = {0};
memset(&ip, 0, sizeof(ip));
inode_set_atime_to_ts(&ip, ts);
])
ZFS_LINUX_TEST_SRC([inode_set_mtime_to_ts], [
#include <linux/fs.h>
],[
struct inode ip;
struct timespec64 ts = {0};
memset(&ip, 0, sizeof(ip));
inode_set_mtime_to_ts(&ip, ts);
])
])
AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [
@ -90,4 +132,40 @@ AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [
],[
AC_MSG_RESULT(no)
])
AC_MSG_CHECKING([whether inode_get_atime() exists])
ZFS_LINUX_TEST_RESULT([inode_get_atime], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_INODE_GET_ATIME, 1,
[inode_get_atime() exists in linux/fs.h])
],[
AC_MSG_RESULT(no)
])
AC_MSG_CHECKING([whether inode_set_atime_to_ts() exists])
ZFS_LINUX_TEST_RESULT([inode_set_atime_to_ts], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_INODE_SET_ATIME_TO_TS, 1,
[inode_set_atime_to_ts() exists in linux/fs.h])
],[
AC_MSG_RESULT(no)
])
AC_MSG_CHECKING([whether inode_get_mtime() exists])
ZFS_LINUX_TEST_RESULT([inode_get_mtime], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_INODE_GET_MTIME, 1,
[inode_get_mtime() exists in linux/fs.h])
],[
AC_MSG_RESULT(no)
])
AC_MSG_CHECKING([whether inode_set_mtime_to_ts() exists])
ZFS_LINUX_TEST_RESULT([inode_set_mtime_to_ts], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_INODE_SET_MTIME_TO_TS, 1,
[inode_set_mtime_to_ts() exists in linux/fs.h])
],[
AC_MSG_RESULT(no)
])
])


@ -19,12 +19,44 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK], [
],[])
])
dnl #
dnl # 6.7 API change
dnl # s_shrink is now a pointer.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK_PTR], [
ZFS_LINUX_TEST_SRC([super_block_s_shrink_ptr], [
#include <linux/fs.h>
unsigned long shrinker_cb(struct shrinker *shrink,
struct shrink_control *sc) { return 0; }
static struct shrinker shrinker = {
.count_objects = shrinker_cb,
.scan_objects = shrinker_cb,
.seeks = DEFAULT_SEEKS,
};
static const struct super_block
sb __attribute__ ((unused)) = {
.s_shrink = &shrinker,
};
],[])
])
AC_DEFUN([ZFS_AC_KERNEL_SUPER_BLOCK_S_SHRINK], [
AC_MSG_CHECKING([whether super_block has s_shrink])
ZFS_LINUX_TEST_RESULT([super_block_s_shrink], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_SUPER_BLOCK_S_SHRINK, 1,
[have super_block s_shrink])
],[
ZFS_LINUX_TEST_ERROR([sb->s_shrink()])
AC_MSG_RESULT(no)
AC_MSG_CHECKING([whether super_block has s_shrink pointer])
ZFS_LINUX_TEST_RESULT([super_block_s_shrink_ptr], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_SUPER_BLOCK_S_SHRINK_PTR, 1,
[have super_block s_shrink pointer])
],[
AC_MSG_RESULT(no)
ZFS_LINUX_TEST_ERROR([sb->s_shrink()])
])
])
])
@ -96,6 +128,25 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK], [
])
])
dnl #
dnl # 6.7 API change
dnl # register_shrinker has been replaced by shrinker_register.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER_REGISTER], [
ZFS_LINUX_TEST_SRC([shrinker_register], [
#include <linux/shrinker.h>
unsigned long shrinker_cb(struct shrinker *shrink,
struct shrink_control *sc) { return 0; }
],[
struct shrinker cache_shrinker = {
.count_objects = shrinker_cb,
.scan_objects = shrinker_cb,
.seeks = DEFAULT_SEEKS,
};
shrinker_register(&cache_shrinker);
])
])
AC_DEFUN([ZFS_AC_KERNEL_SHRINKER_CALLBACK],[
dnl #
dnl # 6.0 API change
@ -133,14 +184,36 @@ AC_DEFUN([ZFS_AC_KERNEL_SHRINKER_CALLBACK],[
dnl # cs->shrink() is logically split in to
dnl # cs->count_objects() and cs->scan_objects()
dnl #
AC_MSG_CHECKING([if cs->count_objects callback exists])
AC_MSG_CHECKING(
[whether cs->count_objects callback exists])
ZFS_LINUX_TEST_RESULT(
[shrinker_cb_shrink_control_split],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, 1,
[cs->count_objects exists])
[shrinker_cb_shrink_control_split],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, 1,
[cs->count_objects exists])
],[
AC_MSG_RESULT(no)
AC_MSG_CHECKING(
[whether shrinker_register exists])
ZFS_LINUX_TEST_RESULT([shrinker_register], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_SHRINKER_REGISTER, 1,
[shrinker_register exists])
dnl # We assume that the split shrinker
dnl # callback exists if
dnl # shrinker_register() exists,
dnl # because the latter is a much more
dnl # recent addition, and the macro
dnl # test for shrinker_register() only
dnl # works if the callback is split
AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK,
1, [cs->count_objects exists])
],[
AC_MSG_RESULT(no)
ZFS_LINUX_TEST_ERROR([shrinker])
])
])
])
])
@ -174,10 +247,12 @@ AC_DEFUN([ZFS_AC_KERNEL_SHRINK_CONTROL_STRUCT], [
AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER], [
ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK
ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK_PTR
ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_HAS_NID
ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK
ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_STRUCT
ZFS_AC_KERNEL_SRC_REGISTER_SHRINKER_VARARG
ZFS_AC_KERNEL_SRC_SHRINKER_REGISTER
])
AC_DEFUN([ZFS_AC_KERNEL_SHRINKER], [


@ -168,6 +168,9 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE
;;
riscv*)
ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE
;;
esac
AC_MSG_CHECKING([for available kernel interfaces])
@ -310,6 +313,9 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_CPU_HAS_FEATURE
ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE
;;
riscv*)
ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE
;;
esac
])


@ -91,6 +91,12 @@
#define param_set_max_auto_ashift_args(var) \
CTLTYPE_UINT, NULL, 0, param_set_max_auto_ashift, "IU"
#define spa_taskq_read_param_set_args(var) \
CTLTYPE_STRING, NULL, 0, spa_taskq_read_param, "A"
#define spa_taskq_write_param_set_args(var) \
CTLTYPE_STRING, NULL, 0, spa_taskq_write_param, "A"
#define fletcher_4_param_set_args(var) \
CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A"


@ -62,7 +62,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off)
}
static inline void
zfs_uio_advance(zfs_uio_t *uio, size_t size)
zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
{
zfs_uio_resid(uio) -= size;
zfs_uio_offset(uio) += size;


@ -42,8 +42,8 @@
/*
* Starting from Linux 5.13, flush_dcache_page() becomes an inline function
* and under some configurations, may indirectly reference GPL-only
* cpu_feature_keys on powerpc. Override this function when it is detected
* to be GPL-only.
* symbols, e.g., cpu_feature_keys on powerpc and PageHuge on riscv.
* Override this function when it is detected to be GPL-only.
*/
#if defined __powerpc__ && defined HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY
#include <linux/simd_powerpc.h>
@ -53,6 +53,17 @@
clear_bit(PG_dcache_clean, &(page)->flags); \
} while (0)
#endif
/*
* For the riscv implementation, the use of PageHuge can be safely removed,
* because it only matters for pages allocated by HugeTLB, while
* flush_dcache_page in the zfs module is only called on kernel pages.
*/
#if defined __riscv && defined HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY
#define flush_dcache_page(page) do { \
if (test_bit(PG_dcache_clean, &(page)->flags)) \
clear_bit(PG_dcache_clean, &(page)->flags); \
} while (0)
#endif
/*
* 2.6.30 API change,


@ -71,9 +71,15 @@
#define ID_AA64PFR0_EL1 sys_reg(3, 0, 0, 1, 0)
#define ID_AA64ISAR0_EL1 sys_reg(3, 0, 0, 6, 0)
#if (defined(HAVE_KERNEL_NEON) && defined(CONFIG_KERNEL_MODE_NEON))
#define kfpu_allowed() 1
#define kfpu_begin() kernel_neon_begin()
#define kfpu_end() kernel_neon_end()
#else
#define kfpu_allowed() 0
#define kfpu_begin() do {} while (0)
#define kfpu_end() do {} while (0)
#endif
#define kfpu_init() (0)
#define kfpu_fini() do {} while (0)


@ -53,9 +53,15 @@
#include <asm/elf.h>
#include <asm/hwcap.h>
#if (defined(HAVE_KERNEL_NEON) && defined(CONFIG_KERNEL_MODE_NEON))
#define kfpu_allowed() 1
#define kfpu_begin() kernel_neon_begin()
#define kfpu_end() kernel_neon_end()
#else
#define kfpu_allowed() 0
#define kfpu_begin() do {} while (0)
#define kfpu_end() do {} while (0)
#endif
#define kfpu_init() (0)
#define kfpu_fini() do {} while (0)
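These macros let callers gate SIMD use at run time. A minimal sketch of the consuming pattern, with hypothetical stand-ins for the real implementation-selection code:

/* Enter kernel-mode NEON only when permitted; otherwise fall back
 * to a scalar implementation. */
if (kfpu_allowed()) {
	kfpu_begin();
	fletcher_4_neon(buf, len);	/* hypothetical SIMD path */
	kfpu_end();
} else {
	fletcher_4_scalar(buf, len);	/* hypothetical scalar fallback */
}

With HAVE_KERNEL_NEON undefined (e.g. GPL-only kernel_neon_* on arm64 after Linux 6.2), kfpu_allowed() is 0 and the scalar path is taken, which is exactly what the new configure check above arranges.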


@ -70,8 +70,6 @@ typedef enum kmem_cbrc {
#define KMC_REAP_CHUNK INT_MAX
#define KMC_DEFAULT_SEEKS 1
#define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */
extern struct list_head spl_kmem_cache_list;
extern struct rw_semaphore spl_kmem_cache_sem;


@ -29,12 +29,13 @@
/*
* Due to frequent changes in the shrinker API the following
* compatibility wrappers should be used. They are as follows:
* compatibility wrapper should be used.
*
* SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost);
* shrinker = spl_register_shrinker(name, countfunc, scanfunc, seek_cost);
* spl_unregister_shrinker(shrinker);
*
* SPL_SHRINKER_DECLARE is used to declare a shrinker with the name varname,
* which is passed to spl_register_shrinker()/spl_unregister_shrinker().
* spl_register_shrinker is used to create and register a shrinker with the
* given name.
* The countfunc returns the number of free-able objects.
* The scanfunc returns the number of objects that were freed.
* The callbacks can return SHRINK_STOP if further calls can't make any more
@ -57,57 +58,28 @@
* ...scan objects in the cache and reclaim them...
* }
*
* SPL_SHRINKER_DECLARE(my_shrinker, my_count, my_scan, DEFAULT_SEEKS);
* static struct shrinker *my_shrinker;
*
* void my_init_func(void) {
* spl_register_shrinker(&my_shrinker);
* my_shrinker = spl_register_shrinker("my-shrinker",
* my_count, my_scan, DEFAULT_SEEKS);
* }
*
* void my_fini_func(void) {
* spl_unregister_shrinker(my_shrinker);
* }
*/
#ifdef HAVE_REGISTER_SHRINKER_VARARG
#define spl_register_shrinker(x) register_shrinker(x, "zfs-arc-shrinker")
#else
#define spl_register_shrinker(x) register_shrinker(x)
#endif
#define spl_unregister_shrinker(x) unregister_shrinker(x)
typedef unsigned long (*spl_shrinker_cb)
(struct shrinker *, struct shrink_control *);
/*
* Linux 3.0 to 3.11 Shrinker API Compatibility.
*/
#if defined(HAVE_SINGLE_SHRINKER_CALLBACK)
#define SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost) \
static int \
__ ## varname ## _wrapper(struct shrinker *shrink, struct shrink_control *sc)\
{ \
if (sc->nr_to_scan != 0) { \
(void) scanfunc(shrink, sc); \
} \
return (countfunc(shrink, sc)); \
} \
\
static struct shrinker varname = { \
.shrink = __ ## varname ## _wrapper, \
.seeks = seek_cost, \
}
struct shrinker *spl_register_shrinker(const char *name,
spl_shrinker_cb countfunc, spl_shrinker_cb scanfunc, int seek_cost);
void spl_unregister_shrinker(struct shrinker *);
#ifndef SHRINK_STOP
/* 3.0-3.11 compatibility */
#define SHRINK_STOP (-1)
/*
* Linux 3.12 and later Shrinker API Compatibility.
*/
#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
#define SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost) \
static struct shrinker varname = { \
.count_objects = countfunc, \
.scan_objects = scanfunc, \
.seeks = seek_cost, \
}
#else
/*
* Linux 2.x to 2.6.22, or a newer shrinker API has been introduced.
*/
#error "Unknown shrinker callback"
#endif
#endif /* SPL_SHRINKER_H */


@ -95,7 +95,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off)
}
static inline void
zfs_uio_advance(zfs_uio_t *uio, size_t size)
zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
{
uio->uio_resid -= size;
uio->uio_loffset += size;


@ -282,5 +282,25 @@ extern long zpl_ioctl_fideduperange(struct file *filp, void *arg);
#else
#define zpl_inode_set_ctime_to_ts(ip, ts) (ip->i_ctime = ts)
#endif
#ifdef HAVE_INODE_GET_ATIME
#define zpl_inode_get_atime(ip) inode_get_atime(ip)
#else
#define zpl_inode_get_atime(ip) (ip->i_atime)
#endif
#ifdef HAVE_INODE_SET_ATIME_TO_TS
#define zpl_inode_set_atime_to_ts(ip, ts) inode_set_atime_to_ts(ip, ts)
#else
#define zpl_inode_set_atime_to_ts(ip, ts) (ip->i_atime = ts)
#endif
#ifdef HAVE_INODE_GET_MTIME
#define zpl_inode_get_mtime(ip) inode_get_mtime(ip)
#else
#define zpl_inode_get_mtime(ip) (ip->i_mtime)
#endif
#ifdef HAVE_INODE_SET_MTIME_TO_TS
#define zpl_inode_set_mtime_to_ts(ip, ts) inode_set_mtime_to_ts(ip, ts)
#else
#define zpl_inode_set_mtime_to_ts(ip, ts) (ip->i_mtime = ts)
#endif
#endif /* _SYS_ZPL_H */
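Together these wrappers let common code read and write inode times without caring whether the kernel predates the 6.7 accessor API. A minimal usage sketch (the helper is hypothetical; inode_timespec_t and ZFS_TIME_ENCODE are existing ZFS definitions):

/* Hypothetical helper: encode an inode's mtime into the on-disk
 * uint64_t pair, portable across pre- and post-6.7 kernels. */
static inline void
example_encode_mtime(struct inode *ip, uint64_t mtime[2])
{
	inode_timespec_t ts = zpl_inode_get_mtime(ip);
	ZFS_TIME_ENCODE(&ts, mtime);
}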


@ -71,6 +71,7 @@ typedef struct dataset_kstats {
int dataset_kstats_create(dataset_kstats_t *, objset_t *);
void dataset_kstats_destroy(dataset_kstats_t *);
void dataset_kstats_rename(dataset_kstats_t *dk, const char *);
void dataset_kstats_update_write_kstats(dataset_kstats_t *, int64_t);
void dataset_kstats_update_read_kstats(dataset_kstats_t *, int64_t);


@ -379,8 +379,8 @@ dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail);
boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,


@ -206,6 +206,7 @@ void dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin,
dmu_tx_t *tx);
int dmu_objset_create_crypt_check(dsl_dir_t *parentdd,
dsl_crypto_params_t *dcp, boolean_t *will_encrypt);
boolean_t dmu_objset_crypto_key_equal(objset_t *osa, objset_t *osb);
void dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd,
struct dsl_dataset *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx);
uint64_t dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey,


@ -181,7 +181,7 @@ typedef struct zil_vdev_node {
avl_node_t zv_node; /* AVL tree linkage */
} zil_vdev_node_t;
#define ZIL_PREV_BLKS 16
#define ZIL_BURSTS 8
/*
* Stable storage intent log management structure. One per dataset.
@ -216,14 +216,18 @@ struct zilog {
uint64_t zl_parse_lr_count; /* number of log records parsed */
itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
list_t zl_itx_commit_list; /* itx list to be committed */
uint64_t zl_cur_used; /* current commit log size used */
uint64_t zl_cur_size; /* current burst full size */
uint64_t zl_cur_left; /* current burst remaining size */
uint64_t zl_cur_max; /* biggest record in current burst */
list_t zl_lwb_list; /* in-flight log write list */
avl_tree_t zl_bp_tree; /* track bps during log parse */
clock_t zl_replay_time; /* lbolt of when replay started */
uint64_t zl_replay_blks; /* number of log blocks replayed */
zil_header_t zl_old_header; /* debugging aid */
uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
uint_t zl_parallel; /* workload is multi-threaded */
uint_t zl_prev_rotor; /* rotor for zl_prev[] */
uint_t zl_prev_opt[ZIL_BURSTS]; /* optimal block size */
uint_t zl_prev_min[ZIL_BURSTS]; /* minimal first block size */
txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */
uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */


@ -64,6 +64,9 @@ libspl_assert(const char *buf, const char *file, const char *func, int line)
#undef verify
#endif
#define PANIC(fmt, a...) \
libspl_assertf(__FILE__, __FUNCTION__, __LINE__, fmt, ## a)
#define VERIFY(cond) \
(void) ((!(cond)) && \
libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__))


@ -90,7 +90,7 @@ zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len)
}
static inline void
zfs_uio_advance(zfs_uio_t *uio, size_t size)
zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
{
uio->uio_resid -= size;
uio->uio_loffset += size;


@ -31,14 +31,6 @@ for use by the kmem caches.
For the majority of systems and workloads only a small number of threads are
required.
.
.It Sy spl_kmem_cache_reclaim Ns = Ns Sy 0 Pq uint
When this is set it prevents Linux from being able to rapidly reclaim all the
memory held by the kmem caches.
This may be useful in circumstances where it's preferable that Linux
reclaim memory from some other subsystem first.
Setting this will increase the likelihood of out-of-memory events on a
memory-constrained system.
.
.It Sy spl_kmem_cache_obj_per_slab Ns = Ns Sy 8 Pq uint
The preferred number of objects per slab in the cache.
In general, a larger value will increase the caches memory footprint


@ -798,7 +798,7 @@ Note that this should not be set below the ZED thresholds
(currently 10 checksums over 10 seconds)
or else the daemon may not trigger any action.
.
.It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq uint
.It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint
This controls the amount of time that a ZIL block (lwb) will remain "open"
when it isn't "full", and it has a thread waiting for it to be committed to
stable storage.
@ -2160,13 +2160,6 @@ This sets the maximum number of write bytes logged via WR_COPIED.
It tunes a tradeoff between additional memory copy and possibly worse log
space efficiency vs additional range lock/unlock.
.
.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64
This sets the minimum delay in nanoseconds that the ZIL will wait before
committing a block, in the hope of accumulating more records.
If ZIL writes are too fast, the kernel may not be able to sleep for such a
short interval, increasing log latency above what
.Sy zfs_commit_timeout_pct
allows.
.
.It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int
Disable the cache flush commands that are normally sent to disk by
the ZIL after an LWB write has completed.
@ -2280,6 +2273,16 @@ If
.Sy 0 ,
generate a system-dependent value close to 6 threads per taskq.
.
.It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp
Set the queue and thread configuration for the IO read queues.
This is an advanced debugging parameter.
Don't change this unless you understand what it does.
.
.It Sy zio_taskq_write Ns = Ns Sy batch fixed,1,5 scale fixed,1,5 Pq charp
Set the queue and thread configuration for the IO write queues.
This is an advanced debugging parameter.
Don't change this unless you understand what it does.
.
.It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
Do not create zvol device nodes.
This may slightly improve startup time on


@ -364,9 +364,12 @@ When this feature is enabled ZFS will use block cloning for operations like
Block cloning allows creating multiple references to a single block.
It is much faster than copying the data (as the actual data is neither read nor
written) and takes no additional space.
Blocks can be cloned across datasets under some conditions (like disabled
encryption and equal
.Nm recordsize ) .
Blocks can be cloned across datasets under some conditions (like equal
.Nm recordsize ,
the same master encryption key, etc.).
ZFS tries its best to clone across datasets, including encrypted ones,
but cloning may still be limited for various (nontrivial) reasons
depending on the OS and/or ZFS internals.
.Pp
This feature becomes
.Sy active


@ -80,6 +80,7 @@ SPL_OBJS := \
spl-kstat.o \
spl-proc.o \
spl-procfs-list.o \
spl-shrinker.o \
spl-taskq.o \
spl-thread.o \
spl-trace.o \


@ -187,19 +187,18 @@ kstat_sysctl_dataset_string(SYSCTL_HANDLER_ARGS)
static int
kstat_sysctl_io(SYSCTL_HANDLER_ARGS)
{
struct sbuf *sb;
struct sbuf sb;
kstat_t *ksp = arg1;
kstat_io_t *kip = ksp->ks_data;
int rc;
sb = sbuf_new_auto();
if (sb == NULL)
return (ENOMEM);
sbuf_new_for_sysctl(&sb, NULL, 0, req);
/* Update the aggsums before reading */
(void) ksp->ks_update(ksp, KSTAT_READ);
/* though wlentime & friends are signed, they will never be negative */
sbuf_printf(sb,
sbuf_printf(&sb,
"%-8llu %-8llu %-8u %-8u %-8llu %-8llu "
"%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n",
kip->nread, kip->nwritten,
@ -207,25 +206,21 @@ kstat_sysctl_io(SYSCTL_HANDLER_ARGS)
kip->wtime, kip->wlentime, kip->wlastupdate,
kip->rtime, kip->rlentime, kip->rlastupdate,
kip->wcnt, kip->rcnt);
rc = sbuf_finish(sb);
if (rc == 0)
rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
sbuf_delete(sb);
rc = sbuf_finish(&sb);
sbuf_delete(&sb);
return (rc);
}
static int
kstat_sysctl_raw(SYSCTL_HANDLER_ARGS)
{
struct sbuf *sb;
struct sbuf sb;
void *data;
kstat_t *ksp = arg1;
void *(*addr_op)(kstat_t *ksp, loff_t index);
int n, has_header, rc = 0;
sb = sbuf_new_auto();
if (sb == NULL)
return (ENOMEM);
sbuf_new_for_sysctl(&sb, NULL, PAGE_SIZE, req);
if (ksp->ks_raw_ops.addr)
addr_op = ksp->ks_raw_ops.addr;
@ -258,8 +253,10 @@ restart_headers:
if (has_header) {
if (rc == ENOMEM && !kstat_resize_raw(ksp))
goto restart_headers;
if (rc == 0)
sbuf_printf(sb, "\n%s", ksp->ks_raw_buf);
if (rc == 0) {
sbuf_cat(&sb, "\n");
sbuf_cat(&sb, ksp->ks_raw_buf);
}
}
while ((data = addr_op(ksp, n)) != NULL) {
@ -270,22 +267,19 @@ restart:
if (rc == ENOMEM && !kstat_resize_raw(ksp))
goto restart;
if (rc == 0)
sbuf_printf(sb, "%s", ksp->ks_raw_buf);
sbuf_cat(&sb, ksp->ks_raw_buf);
} else {
ASSERT3U(ksp->ks_ndata, ==, 1);
sbuf_hexdump(sb, ksp->ks_data,
sbuf_hexdump(&sb, ksp->ks_data,
ksp->ks_data_size, NULL, 0);
}
n++;
}
free(ksp->ks_raw_buf, M_TEMP);
mutex_exit(ksp->ks_lock);
sbuf_trim(sb);
rc = sbuf_finish(sb);
if (rc == 0)
rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
sbuf_delete(sb);
rc = sbuf_finish(&sb);
sbuf_delete(&sb);
return (rc);
}


@ -110,7 +110,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size)
dmu_buf_will_fill(db, tx);
dmu_buf_will_fill(db, tx, B_FALSE);
else
dmu_buf_will_dirty(db, tx);
@ -126,7 +126,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
}
if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
dmu_buf_fill_done(db, tx, B_FALSE);
offset += tocpy;
size -= tocpy;


@ -1251,7 +1251,7 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
iovec_t *dst_iovecs;
zil_chain_t *zilc;
lr_t *lr;
uint64_t txtype, lr_len;
uint64_t txtype, lr_len, nused;
uint_t crypt_len, nr_iovecs, vec;
uint_t aad_len = 0, total_len = 0;
@ -1268,7 +1268,10 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
zilc = (zil_chain_t *)src;
slrp = src + sizeof (zil_chain_t);
aadp = aadbuf;
blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
ASSERT3U(nused, >=, sizeof (zil_chain_t));
ASSERT3U(nused, <=, datalen);
blkend = src + nused;
/*
* Calculate the number of encrypted iovecs we will need.
@ -1287,6 +1290,8 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
txtype = lr->lrc_txtype;
lr_len = lr->lrc_reclen;
}
ASSERT3U(lr_len, >=, sizeof (lr_t));
ASSERT3U(lr_len, <=, blkend - slrp);
nr_iovecs++;
if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))


@ -1333,6 +1333,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
}
}
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
dataset_kstats_rename(&zv->zv_kstat, newname);
}
/*


@ -76,17 +76,6 @@ module_param(spl_kmem_cache_magazine_size, uint, 0444);
MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
"Default magazine size (2-256), set automatically (0)");
/*
* The default behavior is to report the number of objects remaining in the
* cache. This allows the Linux VM to repeatedly reclaim objects from the
* cache when memory is low to satisfy other memory allocations. Alternately,
* setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
* is reclaimed. This may increase the likelihood of out of memory events.
*/
static unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
module_param(spl_kmem_cache_reclaim, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
static unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");


@ -0,0 +1,115 @@
/*
* Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
* Copyright (C) 2007 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Brian Behlendorf <behlendorf1@llnl.gov>.
* UCRL-CODE-235197
*
* This file is part of the SPL, Solaris Porting Layer.
*
* The SPL is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2 of the License, or (at your
* option) any later version.
*
* The SPL is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with the SPL. If not, see <http://www.gnu.org/licenses/>.
*
* Solaris Porting Layer (SPL) Shrinker Implementation.
*/
#include <sys/kmem.h>
#include <sys/shrinker.h>
#ifdef HAVE_SINGLE_SHRINKER_CALLBACK
/* 3.0-3.11: single shrink() callback, which we wrap to carry both functions */
struct spl_shrinker_wrap {
struct shrinker shrinker;
spl_shrinker_cb countfunc;
spl_shrinker_cb scanfunc;
};
static int
spl_shrinker_single_cb(struct shrinker *shrinker, struct shrink_control *sc)
{
struct spl_shrinker_wrap *sw = (struct spl_shrinker_wrap *)shrinker;
if (sc->nr_to_scan != 0)
(void) sw->scanfunc(&sw->shrinker, sc);
return (sw->countfunc(&sw->shrinker, sc));
}
#endif
struct shrinker *
spl_register_shrinker(const char *name, spl_shrinker_cb countfunc,
spl_shrinker_cb scanfunc, int seek_cost)
{
struct shrinker *shrinker;
/* allocate shrinker */
#if defined(HAVE_SHRINKER_REGISTER)
/* 6.7: kernel will allocate the shrinker for us */
shrinker = shrinker_alloc(0, name);
#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
/* 3.12-6.6: we allocate the shrinker */
shrinker = kmem_zalloc(sizeof (struct shrinker), KM_SLEEP);
#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK)
/* 3.0-3.11: allocate a wrapper */
struct spl_shrinker_wrap *sw =
kmem_zalloc(sizeof (struct spl_shrinker_wrap), KM_SLEEP);
shrinker = &sw->shrinker;
#else
/* 2.x-2.6.22, or a newer shrinker API has been introduced. */
#error "Unknown shrinker API"
#endif
if (shrinker == NULL)
return (NULL);
/* set callbacks */
#ifdef HAVE_SINGLE_SHRINKER_CALLBACK
sw->countfunc = countfunc;
sw->scanfunc = scanfunc;
shrinker->shrink = spl_shrinker_single_cb;
#else
shrinker->count_objects = countfunc;
shrinker->scan_objects = scanfunc;
#endif
/* set params */
shrinker->seeks = seek_cost;
/* register with kernel */
#if defined(HAVE_SHRINKER_REGISTER)
shrinker_register(shrinker);
#elif defined(HAVE_REGISTER_SHRINKER_VARARG)
register_shrinker(shrinker, name);
#else
register_shrinker(shrinker);
#endif
return (shrinker);
}
EXPORT_SYMBOL(spl_register_shrinker);
void
spl_unregister_shrinker(struct shrinker *shrinker)
{
#if defined(HAVE_SHRINKER_REGISTER)
shrinker_free(shrinker);
#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
unregister_shrinker(shrinker);
kmem_free(shrinker, sizeof (struct shrinker));
#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK)
unregister_shrinker(shrinker);
kmem_free(shrinker, sizeof (struct spl_shrinker_wrap));
#else
#error "Unknown shrinker API"
#endif
}
EXPORT_SYMBOL(spl_unregister_shrinker);


@ -247,8 +247,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
return (sc->nr_to_scan);
}
SPL_SHRINKER_DECLARE(arc_shrinker,
arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS);
static struct shrinker *arc_shrinker = NULL;
int
arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
@ -351,14 +350,18 @@ arc_lowmem_init(void)
* reclaim from the arc. This is done to prevent kswapd from
* swapping out pages when it is preferable to shrink the arc.
*/
spl_register_shrinker(&arc_shrinker);
arc_shrinker = spl_register_shrinker("zfs-arc-shrinker",
arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS);
VERIFY(arc_shrinker);
arc_set_sys_free(allmem);
}
void
arc_lowmem_fini(void)
{
spl_unregister_shrinker(&arc_shrinker);
spl_unregister_shrinker(arc_shrinker);
arc_shrinker = NULL;
}
int


@ -85,7 +85,7 @@ static blk_mode_t
#else
static fmode_t
#endif
vdev_bdev_mode(spa_mode_t spa_mode)
vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive)
{
#ifdef HAVE_BLK_MODE_T
blk_mode_t mode = 0;
@ -95,6 +95,9 @@ vdev_bdev_mode(spa_mode_t spa_mode)
if (spa_mode & SPA_MODE_WRITE)
mode |= BLK_OPEN_WRITE;
if (exclusive)
mode |= BLK_OPEN_EXCL;
#else
fmode_t mode = 0;
@ -103,6 +106,9 @@ vdev_bdev_mode(spa_mode_t spa_mode)
if (spa_mode & SPA_MODE_WRITE)
mode |= FMODE_WRITE;
if (exclusive)
mode |= FMODE_EXCL;
#endif
return (mode);
@ -225,10 +231,10 @@ vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder,
{
#ifdef HAVE_BLKDEV_GET_BY_PATH_4ARG
return (blkdev_get_by_path(path,
vdev_bdev_mode(mode) | BLK_OPEN_EXCL, holder, hops));
vdev_bdev_mode(mode, B_TRUE), holder, hops));
#else
return (blkdev_get_by_path(path,
vdev_bdev_mode(mode) | FMODE_EXCL, holder));
vdev_bdev_mode(mode, B_TRUE), holder));
#endif
}
@ -238,7 +244,7 @@ vdev_blkdev_put(struct block_device *bdev, spa_mode_t mode, void *holder)
#ifdef HAVE_BLKDEV_PUT_HOLDER
return (blkdev_put(bdev, holder));
#else
return (blkdev_put(bdev, vdev_bdev_mode(mode) | FMODE_EXCL));
return (blkdev_put(bdev, vdev_bdev_mode(mode, B_TRUE)));
#endif
}
@ -248,9 +254,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
{
struct block_device *bdev;
#ifdef HAVE_BLK_MODE_T
blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE);
#else
fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE);
#endif
hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
vdev_disk_t *vd;


@ -520,8 +520,8 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
ip->i_uid = SUID_TO_KUID(0);
ip->i_gid = SGID_TO_KGID(0);
ip->i_blkbits = SPA_MINBLOCKSHIFT;
ip->i_atime = now;
ip->i_mtime = now;
zpl_inode_set_atime_to_ts(ip, now);
zpl_inode_set_mtime_to_ts(ip, now);
zpl_inode_set_ctime_to_ts(ip, now);
ip->i_fop = fops;
ip->i_op = ops;


@ -1258,12 +1258,18 @@ zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
* and inode caches. This can occur when the ARC needs to free meta data
* blocks but can't because they are all pinned by entries in these caches.
*/
#if defined(HAVE_SUPER_BLOCK_S_SHRINK)
#define S_SHRINK(sb) (&(sb)->s_shrink)
#elif defined(HAVE_SUPER_BLOCK_S_SHRINK_PTR)
#define S_SHRINK(sb) ((sb)->s_shrink)
#endif
int
zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
{
zfsvfs_t *zfsvfs = sb->s_fs_info;
int error = 0;
struct shrinker *shrinker = &sb->s_shrink;
struct shrinker *shrinker = S_SHRINK(sb);
struct shrink_control sc = {
.nr_to_scan = nr_to_scan,
.gfp_mask = GFP_KERNEL,
@ -1275,7 +1281,7 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
#if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \
defined(SHRINK_CONTROL_HAS_NID) && \
defined(SHRINKER_NUMA_AWARE)
if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) {
if (shrinker->flags & SHRINKER_NUMA_AWARE) {
*objects = 0;
for_each_online_node(sc.nid) {
*objects += (*shrinker->scan_objects)(shrinker, &sc);


@ -2464,15 +2464,16 @@ top:
if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
zp->z_atime_dirty = B_FALSE;
ZFS_TIME_ENCODE(&ip->i_atime, atime);
inode_timespec_t tmp_atime = zpl_inode_get_atime(ip);
ZFS_TIME_ENCODE(&tmp_atime, atime);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
&atime, sizeof (atime));
}
if (mask & (ATTR_MTIME | ATTR_SIZE)) {
ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
vap->va_mtime, ZTOI(zp));
zpl_inode_set_mtime_to_ts(ZTOI(zp),
zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp)));
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
mtime, sizeof (mtime));
@ -3686,7 +3687,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
caddr_t va;
int err = 0;
uint64_t mtime[2], ctime[2];
inode_timespec_t tmp_ctime;
inode_timespec_t tmp_ts;
sa_bulk_attr_t bulk[3];
int cnt = 0;
struct address_space *mapping;
@ -3850,9 +3851,10 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
&zp->z_pflags, 8);
/* Preserve the mtime and ctime provided by the inode */
ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
tmp_ctime = zpl_inode_get_ctime(ip);
ZFS_TIME_ENCODE(&tmp_ctime, ctime);
tmp_ts = zpl_inode_get_mtime(ip);
ZFS_TIME_ENCODE(&tmp_ts, mtime);
tmp_ts = zpl_inode_get_ctime(ip);
ZFS_TIME_ENCODE(&tmp_ts, ctime);
zp->z_atime_dirty = B_FALSE;
zp->z_seq++;
@ -3902,7 +3904,7 @@ zfs_dirty_inode(struct inode *ip, int flags)
zfsvfs_t *zfsvfs = ITOZSB(ip);
dmu_tx_t *tx;
uint64_t mode, atime[2], mtime[2], ctime[2];
inode_timespec_t tmp_ctime;
inode_timespec_t tmp_ts;
sa_bulk_attr_t bulk[4];
int error = 0;
int cnt = 0;
@ -3947,10 +3949,12 @@ zfs_dirty_inode(struct inode *ip, int flags)
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
/* Preserve the mode, mtime and ctime provided by the inode */
ZFS_TIME_ENCODE(&ip->i_atime, atime);
ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
tmp_ctime = zpl_inode_get_ctime(ip);
ZFS_TIME_ENCODE(&tmp_ctime, ctime);
tmp_ts = zpl_inode_get_atime(ip);
ZFS_TIME_ENCODE(&tmp_ts, atime);
tmp_ts = zpl_inode_get_mtime(ip);
ZFS_TIME_ENCODE(&tmp_ts, mtime);
tmp_ts = zpl_inode_get_ctime(ip);
ZFS_TIME_ENCODE(&tmp_ts, ctime);
mode = ip->i_mode;
zp->z_mode = mode;
@ -3993,7 +3997,9 @@ zfs_inactive(struct inode *ip)
if (error) {
dmu_tx_abort(tx);
} else {
ZFS_TIME_ENCODE(&ip->i_atime, atime);
inode_timespec_t tmp_atime;
tmp_atime = zpl_inode_get_atime(ip);
ZFS_TIME_ENCODE(&tmp_atime, atime);
mutex_enter(&zp->z_lock);
(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
(void *)&atime, sizeof (atime), tx);


@ -542,7 +542,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
uint64_t links;
uint64_t z_uid, z_gid;
uint64_t atime[2], mtime[2], ctime[2], btime[2];
inode_timespec_t tmp_ctime;
inode_timespec_t tmp_ts;
uint64_t projid = ZFS_DEFAULT_PROJID;
sa_bulk_attr_t bulk[12];
int count = 0;
@ -614,10 +614,12 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
if (zp->z_pflags & ZFS_XATTR)
zp->z_xattr_parent = parent;
ZFS_TIME_DECODE(&ip->i_atime, atime);
ZFS_TIME_DECODE(&ip->i_mtime, mtime);
ZFS_TIME_DECODE(&tmp_ctime, ctime);
zpl_inode_set_ctime_to_ts(ip, tmp_ctime);
ZFS_TIME_DECODE(&tmp_ts, atime);
zpl_inode_set_atime_to_ts(ip, tmp_ts);
ZFS_TIME_DECODE(&tmp_ts, mtime);
zpl_inode_set_mtime_to_ts(ip, tmp_ts);
ZFS_TIME_DECODE(&tmp_ts, ctime);
zpl_inode_set_ctime_to_ts(ip, tmp_ts);
ZFS_TIME_DECODE(&zp->z_btime, btime);
ip->i_ino = zp->z_id;
@ -1197,7 +1199,7 @@ zfs_rezget(znode_t *zp)
uint64_t gen;
uint64_t z_uid, z_gid;
uint64_t atime[2], mtime[2], ctime[2], btime[2];
inode_timespec_t tmp_ctime;
inode_timespec_t tmp_ts;
uint64_t projid = ZFS_DEFAULT_PROJID;
znode_hold_t *zh;
@ -1290,10 +1292,12 @@ zfs_rezget(znode_t *zp)
zfs_uid_write(ZTOI(zp), z_uid);
zfs_gid_write(ZTOI(zp), z_gid);
ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
ZFS_TIME_DECODE(&tmp_ctime, ctime);
zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime);
ZFS_TIME_DECODE(&tmp_ts, atime);
zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts);
ZFS_TIME_DECODE(&tmp_ts, mtime);
zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
ZFS_TIME_DECODE(&tmp_ts, ctime);
zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
ZFS_TIME_DECODE(&zp->z_btime, btime);
if ((uint32_t)gen != ZTOI(zp)->i_generation) {
@ -1401,22 +1405,24 @@ zfs_zinactive(znode_t *zp)
boolean_t
zfs_relatime_need_update(const struct inode *ip)
{
inode_timespec_t now, tmp_ctime;
inode_timespec_t now, tmp_atime, tmp_ts;
gethrestime(&now);
tmp_atime = zpl_inode_get_atime(ip);
/*
* In relatime mode, only update the atime if the previous atime
* is earlier than either the ctime or mtime or if at least a day
* has passed since the last update of atime.
*/
if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0)
tmp_ts = zpl_inode_get_mtime(ip);
if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0)
return (B_TRUE);
tmp_ctime = zpl_inode_get_ctime(ip);
if (zfs_compare_timespec(&tmp_ctime, &ip->i_atime) >= 0)
tmp_ts = zpl_inode_get_ctime(ip);
if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0)
return (B_TRUE);
if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60)
if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60)
return (B_TRUE);
return (B_FALSE);
@ -1439,7 +1445,7 @@ void
zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
uint64_t ctime[2])
{
inode_timespec_t now, tmp_ctime;
inode_timespec_t now, tmp_ts;
gethrestime(&now);
@ -1447,7 +1453,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
if (flag & ATTR_MTIME) {
ZFS_TIME_ENCODE(&now, mtime);
ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
ZFS_TIME_DECODE(&tmp_ts, mtime);
zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
if (ZTOZSB(zp)->z_use_fuids) {
zp->z_pflags |= (ZFS_ARCHIVE |
ZFS_AV_MODIFIED);
@ -1456,8 +1463,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
if (flag & ATTR_CTIME) {
ZFS_TIME_ENCODE(&now, ctime);
ZFS_TIME_DECODE(&tmp_ctime, ctime);
zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime);
ZFS_TIME_DECODE(&tmp_ts, ctime);
zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
if (ZTOZSB(zp)->z_use_fuids)
zp->z_pflags |= ZFS_ARCHIVE;
}


@ -1405,7 +1405,7 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
boolean_t *no_crypt)
{
int ret;
uint64_t txtype, lr_len;
uint64_t txtype, lr_len, nused;
uint_t nr_src, nr_dst, crypt_len;
uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
@ -1432,7 +1432,10 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
zilc = (zil_chain_t *)src;
slrp = src + sizeof (zil_chain_t);
aadp = aadbuf;
blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
ASSERT3U(nused, >=, sizeof (zil_chain_t));
ASSERT3U(nused, <=, datalen);
blkend = src + nused;
/* calculate the number of encrypted iovecs we will need */
for (; slrp < blkend; slrp += lr_len) {
@ -1445,6 +1448,8 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
txtype = BSWAP_64(lr->lrc_txtype);
lr_len = BSWAP_64(lr->lrc_reclen);
}
ASSERT3U(lr_len, >=, sizeof (lr_t));
ASSERT3U(lr_len, <=, blkend - slrp);
nr_iovecs++;
if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))


@ -526,7 +526,8 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia)
vap->va_ctime = ia->ia_ctime;
if (vap->va_mask & ATTR_ATIME)
ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip);
zpl_inode_set_atime_to_ts(ip,
zpl_inode_timestamp_truncate(ia->ia_atime, ip));
cookie = spl_fstrans_mark();
#ifdef HAVE_USERNS_IOPS_SETATTR


@ -1528,6 +1528,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
*/
set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
set_disk_ro(zv->zv_zso->zvo_disk, readonly);
dataset_kstats_rename(&zv->zv_kstat, newname);
}
void


@ -802,13 +802,10 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
abd_verify(abd);
ASSERT3U(off + size, <=, abd->abd_size);
boolean_t gang = abd_is_gang(abd);
abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
while (size > 0) {
/* If we are at the end of the gang ABD we are done */
if (gang && !c_abd)
break;
IMPLY(abd_is_gang(abd), c_abd != NULL);
abd_iter_map(&aiter);
@ -930,7 +927,6 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
{
int ret = 0;
struct abd_iter daiter, saiter;
boolean_t dabd_is_gang_abd, sabd_is_gang_abd;
abd_t *c_dabd, *c_sabd;
if (size == 0)
@ -942,16 +938,12 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
ASSERT3U(doff + size, <=, dabd->abd_size);
ASSERT3U(soff + size, <=, sabd->abd_size);
dabd_is_gang_abd = abd_is_gang(dabd);
sabd_is_gang_abd = abd_is_gang(sabd);
c_dabd = abd_init_abd_iter(dabd, &daiter, doff);
c_sabd = abd_init_abd_iter(sabd, &saiter, soff);
while (size > 0) {
/* if we are at the end of the gang ABD we are done */
if ((dabd_is_gang_abd && !c_dabd) ||
(sabd_is_gang_abd && !c_sabd))
break;
IMPLY(abd_is_gang(dabd), c_dabd != NULL);
IMPLY(abd_is_gang(sabd), c_sabd != NULL);
abd_iter_map(&daiter);
abd_iter_map(&saiter);
@ -1032,66 +1024,40 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
int i;
ssize_t len, dlen;
struct abd_iter caiters[3];
struct abd_iter daiter = {0};
struct abd_iter daiter;
void *caddrs[3];
unsigned long flags __maybe_unused = 0;
abd_t *c_cabds[3];
abd_t *c_dabd = NULL;
boolean_t cabds_is_gang_abd[3];
boolean_t dabd_is_gang_abd = B_FALSE;
ASSERT3U(parity, <=, 3);
for (i = 0; i < parity; i++) {
cabds_is_gang_abd[i] = abd_is_gang(cabds[i]);
abd_verify(cabds[i]);
ASSERT3U(csize, <=, cabds[i]->abd_size);
c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0);
}
if (dabd) {
dabd_is_gang_abd = abd_is_gang(dabd);
ASSERT3S(dsize, >=, 0);
if (dsize > 0) {
ASSERT(dabd);
abd_verify(dabd);
ASSERT3U(dsize, <=, dabd->abd_size);
c_dabd = abd_init_abd_iter(dabd, &daiter, 0);
}
ASSERT3S(dsize, >=, 0);
abd_enter_critical(flags);
while (csize > 0) {
/* if we are at the end of the gang ABD we are done */
if (dabd_is_gang_abd && !c_dabd)
break;
len = csize;
for (i = 0; i < parity; i++) {
/*
* If we are at the end of the gang ABD we are
* done.
*/
if (cabds_is_gang_abd[i] && !c_cabds[i])
break;
IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
abd_iter_map(&caiters[i]);
caddrs[i] = caiters[i].iter_mapaddr;
len = MIN(caiters[i].iter_mapsize, len);
}
len = csize;
if (dabd && dsize > 0)
if (dsize > 0) {
IMPLY(abd_is_gang(dabd), c_dabd != NULL);
abd_iter_map(&daiter);
switch (parity) {
case 3:
len = MIN(caiters[2].iter_mapsize, len);
zfs_fallthrough;
case 2:
len = MIN(caiters[1].iter_mapsize, len);
zfs_fallthrough;
case 1:
len = MIN(caiters[0].iter_mapsize, len);
}
/* must be progressive */
ASSERT3S(len, >, 0);
if (dabd && dsize > 0) {
/* this needs precise iter.length */
len = MIN(daiter.iter_mapsize, len);
dlen = len;
} else
@ -1114,7 +1080,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
&caiters[i], len);
}
if (dabd && dsize > 0) {
if (dsize > 0) {
abd_iter_unmap(&daiter);
c_dabd =
abd_advance_abd_iter(dabd, c_dabd, &daiter,
@ -1153,16 +1119,16 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
struct abd_iter xiters[3];
void *caddrs[3], *xaddrs[3];
unsigned long flags __maybe_unused = 0;
boolean_t cabds_is_gang_abd[3];
boolean_t tabds_is_gang_abd[3];
abd_t *c_cabds[3];
abd_t *c_tabds[3];
ASSERT3U(parity, <=, 3);
for (i = 0; i < parity; i++) {
cabds_is_gang_abd[i] = abd_is_gang(cabds[i]);
tabds_is_gang_abd[i] = abd_is_gang(tabds[i]);
abd_verify(cabds[i]);
abd_verify(tabds[i]);
ASSERT3U(tsize, <=, cabds[i]->abd_size);
ASSERT3U(tsize, <=, tabds[i]->abd_size);
c_cabds[i] =
abd_init_abd_iter(cabds[i], &citers[i], 0);
c_tabds[i] =
@ -1171,36 +1137,18 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
abd_enter_critical(flags);
while (tsize > 0) {
len = tsize;
for (i = 0; i < parity; i++) {
/*
* If we are at the end of the gang ABD we
* are done.
*/
if (cabds_is_gang_abd[i] && !c_cabds[i])
break;
if (tabds_is_gang_abd[i] && !c_tabds[i])
break;
IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
IMPLY(abd_is_gang(tabds[i]), c_tabds[i] != NULL);
abd_iter_map(&citers[i]);
abd_iter_map(&xiters[i]);
caddrs[i] = citers[i].iter_mapaddr;
xaddrs[i] = xiters[i].iter_mapaddr;
len = MIN(citers[i].iter_mapsize, len);
len = MIN(xiters[i].iter_mapsize, len);
}
len = tsize;
switch (parity) {
case 3:
len = MIN(xiters[2].iter_mapsize, len);
len = MIN(citers[2].iter_mapsize, len);
zfs_fallthrough;
case 2:
len = MIN(xiters[1].iter_mapsize, len);
len = MIN(citers[1].iter_mapsize, len);
zfs_fallthrough;
case 1:
len = MIN(xiters[0].iter_mapsize, len);
len = MIN(citers[0].iter_mapsize, len);
}
/* must be progressive */
ASSERT3S(len, >, 0);
/*


@ -8042,9 +8042,8 @@ l2arc_write_size(l2arc_dev_t *dev)
*/
size = l2arc_write_max;
if (size == 0) {
cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
"be greater than zero, resetting it to the default (%d)",
L2ARC_WRITE_SIZE);
cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, "
"resetting it to the default (%d)", L2ARC_WRITE_SIZE);
size = l2arc_write_max = L2ARC_WRITE_SIZE;
}
@ -8067,30 +8066,9 @@ l2arc_write_size(l2arc_dev_t *dev)
* device. This is important in l2arc_evict(), otherwise infinite
* iteration can occur.
*/
if (size > dev->l2ad_end - dev->l2ad_start) {
cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
"plus the overhead of log blocks (persistent L2ARC, "
"%llu bytes) exceeds the size of the cache device "
"(guid %llu), resetting them to the default (%d)",
(u_longlong_t)l2arc_log_blk_overhead(size, dev),
(u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
size = MIN(size, (dev->l2ad_end - dev->l2ad_start) / 4);
size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
if (l2arc_trim_ahead > 1) {
cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1");
l2arc_trim_ahead = 1;
}
if (arc_warm == B_FALSE)
size += l2arc_write_boost;
size += l2arc_log_blk_overhead(size, dev);
if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
size += MAX(64 * 1024 * 1024,
(size * l2arc_trim_ahead) / 100);
}
}
size = P2ROUNDUP(size, 1ULL << dev->l2ad_vdev->vdev_ashift);
return (size);


@ -157,10 +157,8 @@
* (copying the file content to the new dataset and removing the source file).
* In that case Block Cloning will only be used briefly, because the BRT entries
* will be removed when the source is removed.
* Note: currently it is not possible to clone blocks between encrypted
* datasets, even if those datasets use the same encryption key (this includes
* snapshots of encrypted datasets). Cloning blocks between datasets that use
* the same keys should be possible and should be implemented in the future.
* Block Cloning across encrypted datasets is supported as long as both
* datasets share the same master key (e.g. snapshots and clones).
*
* Block Cloning flow through ZFS layers.
*
@ -344,7 +342,7 @@ brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
ASSERT3U(idx, <, brtvd->bv_size);
if (brtvd->bv_need_byteswap) {
if (unlikely(brtvd->bv_need_byteswap)) {
return (BSWAP_16(brtvd->bv_entcount[idx]));
} else {
return (brtvd->bv_entcount[idx]);
@ -357,7 +355,7 @@ brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
ASSERT3U(idx, <, brtvd->bv_size);
if (brtvd->bv_need_byteswap) {
if (unlikely(brtvd->bv_need_byteswap)) {
brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
} else {
brtvd->bv_entcount[idx] = entcnt;
@ -392,55 +390,39 @@ brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
#ifdef ZFS_DEBUG
static void
brt_vdev_dump(brt_t *brt)
brt_vdev_dump(brt_vdev_t *brtvd)
{
brt_vdev_t *brtvd;
uint64_t vdevid;
uint64_t idx;
if ((zfs_flags & ZFS_DEBUG_BRT) == 0) {
return;
}
if (brt->brt_nvdevs == 0) {
zfs_dbgmsg("BRT empty");
return;
}
zfs_dbgmsg("BRT vdev dump:");
for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
uint64_t idx;
brtvd = &brt->brt_vdevs[vdevid];
zfs_dbgmsg(" vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
"size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
(u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
(u_longlong_t)brtvd->bv_size,
(u_longlong_t)brtvd->bv_totalcount,
(u_longlong_t)brtvd->bv_nblocks,
(size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
if (brtvd->bv_totalcount > 0) {
zfs_dbgmsg(" entcounts:");
for (idx = 0; idx < brtvd->bv_size; idx++) {
if (brt_vdev_entcount_get(brtvd, idx) > 0) {
zfs_dbgmsg(" [%04llu] %hu",
(u_longlong_t)idx,
brt_vdev_entcount_get(brtvd, idx));
}
zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
"size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
(u_longlong_t)brtvd->bv_vdevid,
brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
(u_longlong_t)brtvd->bv_size,
(u_longlong_t)brtvd->bv_totalcount,
(u_longlong_t)brtvd->bv_nblocks,
(size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
if (brtvd->bv_totalcount > 0) {
zfs_dbgmsg(" entcounts:");
for (idx = 0; idx < brtvd->bv_size; idx++) {
uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx);
if (entcnt > 0) {
zfs_dbgmsg(" [%04llu] %hu",
(u_longlong_t)idx, entcnt);
}
}
if (brtvd->bv_entcount_dirty) {
char *bitmap;
}
if (brtvd->bv_entcount_dirty) {
char *bitmap;
bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
bitmap[idx] =
BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
}
bitmap[idx] = '\0';
zfs_dbgmsg(" bitmap: %s", bitmap);
kmem_free(bitmap, brtvd->bv_nblocks + 1);
bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
bitmap[idx] =
BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
}
bitmap[idx] = '\0';
zfs_dbgmsg(" dirty: %s", bitmap);
kmem_free(bitmap, brtvd->bv_nblocks + 1);
}
}
#endif
@ -769,7 +751,8 @@ brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
BT_SET(brtvd->bv_bitmap, idx);
#ifdef ZFS_DEBUG
brt_vdev_dump(brt);
if (zfs_flags & ZFS_DEBUG_BRT)
brt_vdev_dump(brtvd);
#endif
}
@ -805,7 +788,8 @@ brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
BT_SET(brtvd->bv_bitmap, idx);
#ifdef ZFS_DEBUG
brt_vdev_dump(brt);
if (zfs_flags & ZFS_DEBUG_BRT)
brt_vdev_dump(brtvd);
#endif
}


@ -198,6 +198,18 @@ dataset_kstats_destroy(dataset_kstats_t *dk)
zil_sums_fini(&dk->dk_zil_sums);
}
void
dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
{
dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
char *ds_name;
ds_name = KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name);
ASSERT3S(ds_name, !=, NULL);
(void) strlcpy(ds_name, name,
KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name));
}
void
dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
int64_t nwritten)


@ -1619,8 +1619,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
*/
if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
spa_log_error(db->db_objset->os_spa, &zb, &bpp->blk_birth);
zfs_panic_recover("unencrypted block in encrypted "
"object set %llu", dmu_objset_id(db->db_objset));
err = SET_ERROR(EIO);
goto early_unlock;
}
@ -1925,7 +1923,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
zio_free(db->db_objset->os_spa, txg, bp);
if (dr->dt.dl.dr_brtwrite) {
ASSERT0P(dr->dt.dl.dr_data);
ASSERT0(dr->dt.dl.dr_data);
dr->dt.dl.dr_data = db->db_buf;
}
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@ -2736,7 +2734,7 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
}
void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
@ -2754,8 +2752,14 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
* Block cloning: We will be completely overwriting a block
* cloned in this transaction group, so let's undirty the
* pending clone and mark the block as uncached. This will be
* as if the clone was never done.
* as if the clone was never done. But if the fill can fail,
* we need a way to return to the cloned data.
*/
if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
mutex_exit(&db->db_mtx);
dmu_buf_will_dirty(db_fake, tx);
return;
}
VERIFY(!dbuf_undirty(db, tx));
db->db_state = DB_UNCACHED;
}
@ -2816,32 +2820,41 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
dl->dr_overridden_by.blk_birth = dr->dr_txg;
}
void
dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx)
boolean_t
dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
{
(void) tx;
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
dbuf_states_t old_state;
mutex_enter(&db->db_mtx);
DBUF_VERIFY(db);
old_state = db->db_state;
db->db_state = DB_CACHED;
if (old_state == DB_FILL) {
if (db->db_state == DB_FILL) {
if (db->db_level == 0 && db->db_freed_in_flight) {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
memset(db->db.db_data, 0, db->db.db_size);
db->db_freed_in_flight = FALSE;
db->db_state = DB_CACHED;
DTRACE_SET_STATE(db,
"fill done handling freed in flight");
failed = B_FALSE;
} else if (failed) {
VERIFY(!dbuf_undirty(db, tx));
db->db_buf = NULL;
dbuf_clear_data(db);
DTRACE_SET_STATE(db, "fill failed");
} else {
db->db_state = DB_CACHED;
DTRACE_SET_STATE(db, "fill done");
}
cv_broadcast(&db->db_changed);
} else {
db->db_state = DB_CACHED;
failed = B_FALSE;
}
mutex_exit(&db->db_mtx);
return (failed);
}
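To make the new canfail contract concrete, here is a minimal caller-side sketch (illustrative only; copy_in() is a hypothetical fallible copy routine standing in for something like zfs_uio_fault_move()):
dmu_buf_will_fill(db, tx, B_TRUE);	/* this fill is allowed to fail */
int err = copy_in((char *)db->db_data, src, len);	/* hypothetical copy */
if (dmu_buf_fill_done(db, tx, err != 0)) {
	/* The fill was reverted to the pre-fill (e.g. cloned) data,
	 * so undo any progress accounting done for this buffer. */
}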
void
@ -2986,7 +2999,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
DTRACE_SET_STATE(db, "filling assigned arcbuf");
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
dmu_buf_fill_done(&db->db, tx);
dmu_buf_fill_done(&db->db, tx, B_FALSE);
}
void

View File

@ -1115,14 +1115,14 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size)
dmu_buf_will_fill(db, tx);
dmu_buf_will_fill(db, tx, B_FALSE);
else
dmu_buf_will_dirty(db, tx);
(void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
dmu_buf_fill_done(db, tx, B_FALSE);
offset += tocpy;
size -= tocpy;
@ -1330,27 +1330,24 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
ASSERT(size > 0);
bufoff = zfs_uio_offset(uio) - db->db_offset;
offset_t off = zfs_uio_offset(uio);
bufoff = off - db->db_offset;
tocpy = MIN(db->db_size - bufoff, size);
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size)
dmu_buf_will_fill(db, tx);
dmu_buf_will_fill(db, tx, B_TRUE);
else
dmu_buf_will_dirty(db, tx);
/*
* XXX zfs_uiomove could block forever (eg.nfs-backed
* pages). There needs to be a uiolockdown() function
* to lock the pages in memory, so that zfs_uiomove won't
* block.
*/
err = zfs_uio_fault_move((char *)db->db_data + bufoff,
tocpy, UIO_WRITE, uio);
if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) {
/* The fill was reverted. Undo any uio progress. */
zfs_uio_advance(uio, off - zfs_uio_offset(uio));
}
if (err)
break;
@ -1482,9 +1479,9 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, 0, offset);
db = dbuf_hold(dn, blkid, FTAG);
rw_exit(&dn->dn_struct_rwlock);
if (db == NULL)
return (SET_ERROR(EIO));
rw_exit(&dn->dn_struct_rwlock);
/*
* We can only assign if the offset is aligned and the arc buf is the

View File

@ -2532,7 +2532,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
* size of the provided arc_buf_t.
*/
if (db_spill->db_size != drrs->drr_length) {
dmu_buf_will_fill(db_spill, tx);
dmu_buf_will_fill(db_spill, tx, B_FALSE);
VERIFY0(dbuf_spill_set_blksz(db_spill,
drrs->drr_length, tx));
}

View File

@ -1124,8 +1124,6 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (sta->os->os_encrypted &&
!BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
spa_log_error(spa, zb, &bp->blk_birth);
zfs_panic_recover("unencrypted block in encrypted "
"object set %llu", dmu_objset_id(sta->os));
return (SET_ERROR(EIO));
}

View File

@ -266,6 +266,40 @@ spa_crypto_key_compare(const void *a, const void *b)
return (0);
}
/*
* This compares crypto keys based on zk_guid. See the comment on
* spa_crypto_key_compare for more information.
*/
boolean_t
dmu_objset_crypto_key_equal(objset_t *osa, objset_t *osb)
{
dsl_crypto_key_t *dcka = NULL;
dsl_crypto_key_t *dckb = NULL;
uint64_t obja, objb;
boolean_t equal;
spa_t *spa;
spa = dmu_objset_spa(osa);
if (spa != dmu_objset_spa(osb))
return (B_FALSE);
obja = dmu_objset_ds(osa)->ds_object;
objb = dmu_objset_ds(osb)->ds_object;
if (spa_keystore_lookup_key(spa, obja, FTAG, &dcka) != 0)
return (B_FALSE);
if (spa_keystore_lookup_key(spa, objb, FTAG, &dckb) != 0) {
spa_keystore_dsl_key_rele(spa, dcka, FTAG);
return (B_FALSE);
}
equal = (dcka->dck_key.zk_guid == dckb->dck_key.zk_guid);
spa_keystore_dsl_key_rele(spa, dcka, FTAG);
spa_keystore_dsl_key_rele(spa, dckb, FTAG);
return (equal);
}
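As used later in this diff by zfs_clone_range(), the check composes like this (condensed from that hunk):
/* Cloning across encrypted datasets requires a shared master key. */
if (inos != outos && inos->os_encrypted &&
    !dmu_objset_crypto_key_equal(inos, outos))
	return (SET_ERROR(EXDEV));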
static int
spa_key_mapping_compare(const void *a, const void *b)
{

View File

@ -1000,8 +1000,6 @@ livelist_compare(const void *larg, const void *rarg)
/* if vdevs are equal, sort by offsets. */
uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
if (l_dva0_offset == r_dva0_offset)
ASSERT3U(l->blk_birth, ==, r->blk_birth);
return (TREE_CMP(l_dva0_offset, r_dva0_offset));
}
@ -1016,9 +1014,9 @@ struct livelist_iter_arg {
* and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
* corresponding FREE are stored in the supplied bplist.
*
* Note that multiple FREE and ALLOC entries for the same blkptr may
* be encountered when dedup is involved. For this reason we keep a
* refcount for all the FREE entries of each blkptr and ensure that
* Note that multiple FREE and ALLOC entries for the same blkptr may be
* encountered when dedup or block cloning is involved. For this reason we
* keep a refcount for all the FREE entries of each blkptr and ensure that
* each of those FREE entries has a corresponding ALLOC preceding it.
*/
static int
@ -1037,6 +1035,13 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
livelist_entry_t node;
node.le_bp = *bp;
livelist_entry_t *found = avl_find(avl, &node, NULL);
if (found) {
ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp));
ASSERT3U(BP_GET_CHECKSUM(bp), ==,
BP_GET_CHECKSUM(&found->le_bp));
ASSERT3U(BP_PHYSICAL_BIRTH(bp), ==,
BP_PHYSICAL_BIRTH(&found->le_bp));
}
if (bp_freed) {
if (found == NULL) {
/* first free entry for this blkptr */
@ -1046,10 +1051,10 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
e->le_refcnt = 1;
avl_add(avl, e);
} else {
/* dedup block free */
ASSERT(BP_GET_DEDUP(bp));
ASSERT3U(BP_GET_CHECKSUM(bp), ==,
BP_GET_CHECKSUM(&found->le_bp));
/*
* Deduped or cloned block free. We could assert the D bit
* for dedup, but there is no such bit for cloning.
*/
ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt);
found->le_refcnt++;
}
@ -1065,14 +1070,6 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
/* all tracked free pairs have been matched */
avl_remove(avl, found);
kmem_free(found, sizeof (livelist_entry_t));
} else {
/*
* This is definitely a deduped blkptr so
* let's validate it.
*/
ASSERT(BP_GET_DEDUP(bp));
ASSERT3U(BP_GET_CHECKSUM(bp), ==,
BP_GET_CHECKSUM(&found->le_bp));
}
}
}

View File

@ -151,7 +151,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
* need to be handled with minimum delay.
*/
static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
{ ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */
@ -1164,6 +1164,275 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
tqs->stqs_taskq = NULL;
}
#ifdef _KERNEL
/*
* The READ and WRITE rows of zio_taskqs are configurable at module load time
* by setting zio_taskq_read or zio_taskq_write.
*
* Example (the defaults for READ and WRITE)
* zio_taskq_read='fixed,1,8 null scale null'
* zio_taskq_write='batch fixed,1,5 scale fixed,1,5'
*
* Each sets the entire row at a time.
*
* 'fixed' is parameterised: fixed,Q,T, where Q is the number of taskqs and T
* is the number of threads per taskq.
*
* 'null' can only be set on the high-priority queues (queue selection for
* high-priority queues will fall back to the regular queue if the high-pri
* queue is NULL).
*/
static const char *const modes[ZTI_NMODES] = {
"fixed", "batch", "scale", "null"
};
/* Parse the incoming config string. Modifies cfg */
static int
spa_taskq_param_set(zio_type_t t, char *cfg)
{
int err = 0;
zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}};
char *next = cfg, *tok, *c;
/*
* Parse out each element from the string and fill `row`. The entire
* row has to be set at once, so any errors are flagged by just
* breaking out of this loop early.
*/
uint_t q;
for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
/* `next` is the start of the config */
if (next == NULL)
break;
/* Eat up leading space */
while (isspace(*next))
next++;
if (*next == '\0')
break;
/* Mode ends at space or end of string */
tok = next;
next = strchr(tok, ' ');
if (next != NULL) *next++ = '\0';
/* Parameters start after a comma */
c = strchr(tok, ',');
if (c != NULL) *c++ = '\0';
/* Match mode string */
uint_t mode;
for (mode = 0; mode < ZTI_NMODES; mode++)
if (strcmp(tok, modes[mode]) == 0)
break;
if (mode == ZTI_NMODES)
break;
/* Invalid canary */
row[q].zti_mode = ZTI_NMODES;
/* Per-mode setup */
switch (mode) {
/*
* FIXED is parameterised: number of queues, and number of
* threads per queue.
*/
case ZTI_MODE_FIXED: {
/* No parameters? */
if (c == NULL || *c == '\0')
break;
/* Find next parameter */
tok = c;
c = strchr(tok, ',');
if (c == NULL)
break;
/* Take digits and convert */
unsigned long long nq;
if (!(isdigit(*tok)))
break;
err = ddi_strtoull(tok, &tok, 10, &nq);
/* Must succeed and also end at the next param sep */
if (err != 0 || tok != c)
break;
/* Move past the comma */
tok++;
/* Need another number */
if (!(isdigit(*tok)))
break;
/* Remember start to make sure we moved */
c = tok;
/* Take digits */
unsigned long long ntpq;
err = ddi_strtoull(tok, &tok, 10, &ntpq);
/* Must succeed, and moved forward */
if (err != 0 || tok == c || *tok != '\0')
break;
/*
* sanity; zero queues/threads make no sense, and
* 16K is almost certainly more than anyone will ever
* need and avoids silly numbers like UINT32_MAX
*/
if (nq == 0 || nq >= 16384 ||
ntpq == 0 || ntpq >= 16384)
break;
const zio_taskq_info_t zti = ZTI_P(ntpq, nq);
row[q] = zti;
break;
}
case ZTI_MODE_BATCH: {
const zio_taskq_info_t zti = ZTI_BATCH;
row[q] = zti;
break;
}
case ZTI_MODE_SCALE: {
const zio_taskq_info_t zti = ZTI_SCALE;
row[q] = zti;
break;
}
case ZTI_MODE_NULL: {
/*
* Can only null the high-priority queues; the general-
* purpose ones have to exist.
*/
if (q != ZIO_TASKQ_ISSUE_HIGH &&
q != ZIO_TASKQ_INTERRUPT_HIGH)
break;
const zio_taskq_info_t zti = ZTI_NULL;
row[q] = zti;
break;
}
default:
break;
}
/* Ensure we set a mode */
if (row[q].zti_mode == ZTI_NMODES)
break;
}
/* Didn't get a full row, fail */
if (q < ZIO_TASKQ_TYPES)
return (SET_ERROR(EINVAL));
/* Eat trailing space */
if (next != NULL)
while (isspace(*next))
next++;
/* If there's anything left over then fail */
if (next != NULL && *next != '\0')
return (SET_ERROR(EINVAL));
/* Success! Copy it into the real config */
for (q = 0; q < ZIO_TASKQ_TYPES; q++)
zio_taskqs[t][q] = row[q];
return (0);
}
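A minimal usage sketch, assuming only what the comments above state (the parser needs a writable string because it inserts NUL terminators while tokenizing):
char cfg[] = "fixed,1,5 scale scale null";	/* must be writable */
int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);	/* 0 on success */
/* A partial row, an unknown mode, 'null' on a general-purpose queue, or
 * trailing junk all return EINVAL and leave the live table untouched. */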
static int
spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline)
{
int pos = 0;
/* Build parameter string from live config */
const char *sep = "";
for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
const zio_taskq_info_t *zti = &zio_taskqs[t][q];
if (zti->zti_mode == ZTI_MODE_FIXED)
pos += sprintf(&buf[pos], "%s%s,%u,%u", sep,
modes[zti->zti_mode], zti->zti_count,
zti->zti_value);
else
pos += sprintf(&buf[pos], "%s%s", sep,
modes[zti->zti_mode]);
sep = " ";
}
if (add_newline)
buf[pos++] = '\n';
buf[pos] = '\0';
return (pos);
}
#ifdef __linux__
static int
spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
{
char *cfg = kmem_strdup(val);
int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg);
kmem_free(cfg, strlen(val)+1);
return (-err);
}
static int
spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
{
return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE));
}
static int
spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
{
char *cfg = kmem_strdup(val);
int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);
kmem_free(cfg, strlen(val)+1);
return (-err);
}
static int
spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
{
return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE));
}
#else
/*
* On FreeBSD load-time parameters can be set up before malloc() is available,
* so we have to do all the parsing work on the stack.
*/
#define SPA_TASKQ_PARAM_MAX (128)
static int
spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)
{
char buf[SPA_TASKQ_PARAM_MAX];
int err;
(void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE);
err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
if (err || req->newptr == NULL)
return (err);
return (spa_taskq_param_set(ZIO_TYPE_READ, buf));
}
static int
spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
{
char buf[SPA_TASKQ_PARAM_MAX];
int err;
(void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE);
err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
if (err || req->newptr == NULL)
return (err);
return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf));
}
#endif
#endif /* _KERNEL */
/*
* Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
* Note that a type may have multiple discrete taskqs to avoid lock contention
@ -10210,4 +10479,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
ZMOD_RW,
"Whether extra ALLOC blkptrs were added to a livelist entry while it "
"was being condensed");
#ifdef _KERNEL
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
"Configure IO queues for read IO");
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
"Configure IO queues for write IO");
#endif
/* END CSTYLED */

View File

@ -309,6 +309,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
uint64_t dnodesize;
int error;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lracl));
txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
if (byteswap) {
byteswap_uint64_array(lracl, sizeof (*lracl));
@ -470,6 +472,8 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
uint64_t dnodesize;
int error;
ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
if (byteswap) {
byteswap_uint64_array(lr, sizeof (*lr));
@ -613,6 +617,8 @@ zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
int error;
int vflg = 0;
ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -648,6 +654,8 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
int error;
int vflg = 0;
ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -715,12 +723,14 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
{
zfsvfs_t *zfsvfs = arg1;
lr_rename_t *lr = arg2;
char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
char *tname = sname + strlen(sname) + 1;
ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
char *tname = sname + strlen(sname) + 1;
return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, 0, NULL));
}
@ -730,12 +740,14 @@ zfs_replay_rename_exchange(void *arg1, void *arg2, boolean_t byteswap)
#ifdef __linux__
zfsvfs_t *zfsvfs = arg1;
lr_rename_t *lr = arg2;
char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
char *tname = sname + strlen(sname) + 1;
ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
char *tname = sname + strlen(sname) + 1;
return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, RENAME_EXCHANGE,
NULL));
#else
@ -750,14 +762,13 @@ zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap)
zfsvfs_t *zfsvfs = arg1;
lr_rename_whiteout_t *lr = arg2;
int error;
/* sname and tname follow lr_rename_whiteout_t */
char *sname = (char *)(lr + 1);
char *tname = sname + strlen(sname) + 1;
/* For the whiteout file. */
xvattr_t xva;
uint64_t objid;
uint64_t dnodesize;
ASSERT3U(lr->lr_rename.lr_common.lrc_reclen, >, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -783,6 +794,9 @@ zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap)
if (error)
return (error);
/* sname and tname follow lr_rename_whiteout_t */
char *sname = (char *)(lr + 1);
char *tname = sname + strlen(sname) + 1;
return (do_zfs_replay_rename(zfsvfs, &lr->lr_rename, sname, tname,
RENAME_WHITEOUT, &xva.xva_vattr));
#else
@ -800,6 +814,8 @@ zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
int error;
uint64_t eod, offset, length;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -863,6 +879,8 @@ zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap)
int error;
uint64_t end;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -910,6 +928,8 @@ zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
flock64_t fl = {0};
int error;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -940,6 +960,8 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
int error;
void *start;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
xva_init(&xva);
if (byteswap) {
byteswap_uint64_array(lr, sizeof (*lr));
@ -1002,6 +1024,9 @@ zfs_replay_setsaxattr(void *arg1, void *arg2, boolean_t byteswap)
size_t size;
int error = 0;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr) + lr->lr_size);
ASSERT(spa_feature_is_active(zfsvfs->z_os->os_spa,
SPA_FEATURE_ZILSAXATTR));
if (byteswap)
@ -1079,6 +1104,10 @@ zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap)
znode_t *zp;
int error;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) +
sizeof (ace_t) * lr->lr_aclcnt);
if (byteswap) {
byteswap_uint64_array(lr, sizeof (*lr));
zfs_oldace_byteswap(ace, lr->lr_aclcnt);
@ -1124,6 +1153,9 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap)
znode_t *zp;
int error;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + lr->lr_acl_bytes);
if (byteswap) {
byteswap_uint64_array(lr, sizeof (*lr));
zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE);
@ -1171,6 +1203,10 @@ zfs_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
znode_t *zp;
int error;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t,
lr_bps[lr->lr_nbps]));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));

View File

@ -47,6 +47,7 @@
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_crypt.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
@ -1103,6 +1104,16 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
return (SET_ERROR(EXDEV));
}
/*
* Cloning across encrypted datasets is possible only if they
* share the same master key.
*/
if (inos != outos && inos->os_encrypted &&
!dmu_objset_crypto_key_equal(inos, outos)) {
zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
return (SET_ERROR(EXDEV));
}
error = zfs_verify_zp(inzp);
if (error == 0)
error = zfs_verify_zp(outzp);
@ -1181,11 +1192,18 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
inblksz = inzp->z_blksz;
/*
* We cannot clone into files with different block size if we can't
* grow it (block size is already bigger or more than one block).
* We cannot clone into a file with a different block size if we can't
* grow it (block size is already bigger, it has more than one block,
* or it is not locked for growth). There are other possible reasons
* for the grow to fail, but we cover what we can before opening the
* transaction and detect the rest after we try to do it.
*/
if (inblksz < outzp->z_blksz) {
error = SET_ERROR(EINVAL);
goto unlock;
}
if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz ||
outzp->z_size > inblksz)) {
outlr->lr_length != UINT64_MAX)) {
error = SET_ERROR(EINVAL);
goto unlock;
}
@ -1286,20 +1304,6 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
*/
break;
}
/*
* Encrypted data is fine as long as it comes from the same
* dataset.
* TODO: We want to extend it in the future to allow cloning to
* datasets with the same keys, like clones or to be able to
* clone a file from a snapshot of an encrypted dataset into the
* dataset itself.
*/
if (BP_IS_PROTECTED(&bps[0])) {
if (inzfsvfs != outzfsvfs) {
error = SET_ERROR(EXDEV);
break;
}
}
/*
* Start a transaction.
@ -1318,12 +1322,24 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
}
/*
* Copy source znode's block size. This only happens on the
* first iteration since zfs_rangelock_reduce() will shrink down
* lr_len to the appropriate size.
* Copy source znode's block size. This is done only if the
* whole znode is locked (see zfs_rangelock_cb()) and only
* on the first iteration since zfs_rangelock_reduce() will
* shrink down lr_length to the appropriate size.
*/
if (outlr->lr_length == UINT64_MAX) {
zfs_grow_blocksize(outzp, inblksz, tx);
/*
* Block growth may fail for many reasons we cannot
* predict here. If it happens, the cloning is doomed.
*/
if (inblksz != outzp->z_blksz) {
error = SET_ERROR(EINVAL);
dmu_tx_abort(tx);
break;
}
/*
* Round range lock up to the block boundary, so we
* prevent appends until we are done.
@ -1339,6 +1355,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
break;
}
if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) {
update_pages(outzp, outoff, size, outos);
}
zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr,
&clear_setid_bits_txg, tx);

View File

@ -91,15 +91,7 @@
* committed to stable storage. Please refer to the zil_commit_waiter()
* function (and the comments within it) for more details.
*/
static uint_t zfs_commit_timeout_pct = 5;
/*
* Minimal time we care to delay commit waiting for more ZIL records.
* At least FreeBSD kernel can't sleep for less than 2us at its best.
* So requests to sleep for less then 5us is a waste of CPU time with
* a risk of significant log latency increase due to oversleep.
*/
static uint64_t zil_min_commit_timeout = 5000;
static uint_t zfs_commit_timeout_pct = 10;
/*
* See zil.h for more information about these fields.
@ -152,6 +144,7 @@ static kmem_cache_t *zil_zcw_cache;
static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
static itx_t *zil_itx_clone(itx_t *oitx);
static uint64_t zil_max_waste_space(zilog_t *zilog);
static int
zil_bp_compare(const void *x1, const void *x2)
@ -522,6 +515,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
lr_t *lr = (lr_t *)lrp;
reclen = lr->lrc_reclen;
ASSERT3U(reclen, >=, sizeof (lr_t));
ASSERT3U(reclen, <=, end - lrp);
if (lr->lrc_seq > claim_lr_seq) {
arc_buf_destroy(abuf, &abuf);
goto done;
@ -604,7 +598,7 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
lr_write_t *lr = (lr_write_t *)lrc;
int error;
ASSERT(lrc->lrc_txtype == TX_WRITE);
ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
/*
* If the block is not readable, don't claim it. This can happen
@ -632,7 +626,9 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
spa_t *spa = zilog->zl_spa;
uint_t ii;
ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);
ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
lr_bps[lr->lr_nbps]));
if (tx == NULL) {
return (0);
@ -646,9 +642,9 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
bp = &lr->lr_bps[ii];
/*
* When data are embedded into BP there is no need to create
* BRT entry as there is no data block. Just copy the BP as
* it contains the data.
* When data is embedded into the BP there is no need to create
* BRT entry as there is no data block. Just copy the BP as it
* contains the data.
*/
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
continue;
@ -709,7 +705,7 @@ zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
lr_write_t *lr = (lr_write_t *)lrc;
blkptr_t *bp = &lr->lr_blkptr;
ASSERT(lrc->lrc_txtype == TX_WRITE);
ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
/*
* If we previously claimed it, we need to free it.
@ -730,7 +726,9 @@ zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
spa_t *spa;
uint_t ii;
ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);
ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
lr_bps[lr->lr_nbps]));
if (tx == NULL) {
return (0);
@ -1625,7 +1623,7 @@ zil_lwb_write_done(zio_t *zio)
while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
if (vd != NULL && !vd->vdev_nowritecache) {
if (vd != NULL) {
/*
* The "ZIO_FLAG_DONT_PROPAGATE" is currently
* always used within "zio_flush". This means,
@ -1713,24 +1711,6 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
mutex_exit(&zilog->zl_lock);
}
/*
* Define a limited set of intent log block sizes.
*
* These must be a multiple of 4KB. Note only the amount used (again
* aligned to 4KB) actually gets written. However, we can't always just
* allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
*/
static const struct {
uint64_t limit;
uint64_t blksz;
} zil_block_buckets[] = {
{ 4096, 4096 }, /* non TX_WRITE */
{ 8192 + 4096, 8192 + 4096 }, /* database */
{ 32768 + 4096, 32768 + 4096 }, /* NFS writes */
{ 65536 + 4096, 65536 + 4096 }, /* 64KB writes */
{ UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */
};
/*
* Maximum block size used by the ZIL. This is picked up when the ZIL is
* initialized. Otherwise this should not be used directly; see
@ -1738,6 +1718,91 @@ static const struct {
*/
static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
/*
* Plan splitting of the provided burst size between several blocks.
*/
static uint_t
zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
{
uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t);
if (size <= md) {
/*
* Small bursts are written as-is in one block.
*/
*minsize = size;
return (size);
} else if (size > 8 * md) {
/*
* Big bursts use maximum blocks. The first block size
* is hard to predict, but it does not really matter.
*/
*minsize = 0;
return (md);
}
/*
* Medium bursts try to divide evenly to better utilize several SLOG
* VDEVs. The first block size is predicted assuming the worst case of
* the others maxing out. Fall back to using maximum blocks if, due to
* large records or wasted space, we cannot predict anything better.
*/
uint_t s = size;
uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t));
uint_t chunk = DIV_ROUND_UP(s, n);
uint_t waste = zil_max_waste_space(zilog);
waste = MAX(waste, zilog->zl_cur_max);
if (chunk <= md - waste) {
*minsize = MAX(s - (md - waste) * (n - 1), waste);
return (chunk);
} else {
*minsize = 0;
return (md);
}
}
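A worked example of the medium-burst branch may help; the sizes below are illustrative assumptions, not values taken from the change:
/*
 * Assume zl_max_block_size = 128 KiB, so md is slightly below 128 KiB,
 * and a 300 KiB burst arrives (md < size <= 8 * md):
 *   n     = DIV_ROUND_UP(300 KiB, md - sizeof (lr_write_t)) = 3
 *   chunk = DIV_ROUND_UP(300 KiB, 3) = 100 KiB
 * If 100 KiB <= md - waste, the burst is planned as three ~100 KiB
 * blocks, which spread across SLOG vdevs better than two full-size
 * blocks plus a small tail.
 */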
/*
* Try to predict the next block size from previous history. Make the
* prediction sufficient for 7 of 8 previous bursts. Don't try to save if the
* saving is less than 50%; extra writes may cost more, but we don't want a
* single spike to badly affect our predictions.
*/
*/
static uint_t
zil_lwb_predict(zilog_t *zilog)
{
uint_t m, o;
/* If we are in the middle of a burst, take it into account also. */
if (zilog->zl_cur_size > 0) {
o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m);
} else {
o = UINT_MAX;
m = 0;
}
/* Find minimum optimal size. We don't need to go below that. */
for (int i = 0; i < ZIL_BURSTS; i++)
o = MIN(o, zilog->zl_prev_opt[i]);
/* Find two biggest minimal first block sizes above the optimal. */
uint_t m1 = MAX(m, o), m2 = o;
for (int i = 0; i < ZIL_BURSTS; i++) {
m = zilog->zl_prev_min[i];
if (m >= m1) {
m2 = m1;
m1 = m;
} else if (m > m2) {
m2 = m;
}
}
/*
* If the second minimum size gives a 50% saving -- use it. It may cost
* us one additional write later, but the space saving is just too big.
*/
return ((m1 < m2 * 2) ? m1 : m2);
}
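An illustrative run of the selection logic (the numbers are assumptions):
/*
 * Say the smallest recorded optimal size is o = 24K and the recorded
 * first-block minimums include { 32K, 40K, 96K }: then m1 = 96K and
 * m2 = 40K. Since m1 >= 2 * m2, predicting 40K saves at least half
 * the space, so 40K is returned; the one burst that wanted 96K may
 * simply need an extra block later.
 */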
/*
* Close the log block for being issued and allocate the next one.
* Has to be called under zl_issuer_lock to chain more lwbs.
*/
@ -1745,7 +1810,7 @@ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
static lwb_t *
zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
{
int i;
uint64_t blksz, plan, plan2;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
@ -1760,34 +1825,40 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
return (NULL);
/*
* Log blocks are pre-allocated. Here we select the size of the next
* block, based on size used in the last block.
* - first find the smallest bucket that will fit the block from a
* limited set of block sizes. This is because it's faster to write
* blocks allocated from the same metaslab as they are adjacent or
* close.
* - next find the maximum from the new suggested size and an array of
* previous sizes. This lessens a picket fence effect of wrongly
* guessing the size if we have a stream of say 2k, 64k, 2k, 64k
* requests.
*
* Note we only write what is used, but we can't just allocate
* the maximum block size because we can exhaust the available
* pool log space.
* Log blocks are pre-allocated. Here we select the size of the next
* block, based on what's left of this burst and the previous history.
* While we try to write only the used part of the block, we can't
* just always allocate the maximum block size because we can exhaust
* all available pool log space, so we try to be reasonable.
*/
uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++)
continue;
zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size);
zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
for (i = 0; i < ZIL_PREV_BLKS; i++)
zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
DTRACE_PROBE3(zil__block__size, zilog_t *, zilog,
uint64_t, zil_blksz,
uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]);
zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
if (zilog->zl_cur_left > 0) {
/*
* We are in the middle of a burst and know how much is left.
* But if the workload is multi-threaded there may be more soon.
* Try to predict what it can be and plan for the worst case.
*/
uint_t m;
plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
if (zilog->zl_parallel) {
plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left +
zil_lwb_predict(zilog), &m);
if (plan < plan2)
plan = plan2;
}
} else {
/*
* The previous burst is done and we can only predict what
* will come next.
*/
plan = zil_lwb_predict(zilog);
}
blksz = plan + sizeof (zil_chain_t);
blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t);
blksz = MIN(blksz, zilog->zl_max_block_size);
DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz,
uint64_t, plan);
return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state));
return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state));
}
/*
@ -1810,6 +1881,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
itx = list_next(&lwb->lwb_itxs, itx))
zil_lwb_commit(zilog, lwb, itx);
lwb->lwb_nused = lwb->lwb_nfilled;
ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb,
ZIO_FLAG_CANFAIL);
@ -1837,7 +1909,7 @@ next_lwb:
int wsz = lwb->lwb_sz;
if (lwb->lwb_error == 0) {
abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz);
if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk)
prio = ZIO_PRIORITY_SYNC_WRITE;
else
prio = ZIO_PRIORITY_ASYNC_WRITE;
@ -1998,6 +2070,42 @@ zil_max_copied_data(zilog_t *zilog)
return (MIN(max_data, zil_maxcopied));
}
static uint64_t
zil_itx_record_size(itx_t *itx)
{
lr_t *lr = &itx->itx_lr;
if (lr->lrc_txtype == TX_COMMIT)
return (0);
ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t));
return (lr->lrc_reclen);
}
static uint64_t
zil_itx_data_size(itx_t *itx)
{
lr_t *lr = &itx->itx_lr;
lr_write_t *lrw = (lr_write_t *)lr;
if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
ASSERT3U(lr->lrc_reclen, ==, sizeof (lr_write_t));
return (P2ROUNDUP_TYPED(lrw->lr_length, sizeof (uint64_t),
uint64_t));
}
return (0);
}
static uint64_t
zil_itx_full_size(itx_t *itx)
{
lr_t *lr = &itx->itx_lr;
if (lr->lrc_txtype == TX_COMMIT)
return (0);
ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t));
return (lr->lrc_reclen + zil_itx_data_size(itx));
}
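For orientation, how these helpers decompose for the common cases (restating the code above, not adding behavior):
/*
 * TX_COMMIT:                record = data = full = 0.
 * TX_WRITE, WR_NEED_COPY:   record = sizeof (lr_write_t) (header only),
 *                           data = P2ROUNDUP(lr_length, 8),
 *                           full = record + data.
 * Anything else (e.g. WR_COPIED, where the payload already lives in
 * lrc_reclen): record = full = lrc_reclen, data = 0.
 */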
/*
* Estimate space needed in the lwb for the itx. Allocate more lwbs or
* split the itx as needed, but don't touch the actual transaction data.
@ -2039,14 +2147,10 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
return (lwb);
}
if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
dlen = P2ROUNDUP_TYPED(
lrw->lr_length, sizeof (uint64_t), uint64_t);
} else {
dlen = 0;
}
reclen = lr->lrc_reclen;
zilog->zl_cur_used += (reclen + dlen);
ASSERT3U(reclen, >=, sizeof (lr_t));
ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0));
dlen = zil_itx_data_size(itx);
cont:
/*
@ -2064,19 +2168,19 @@ cont:
if (lwb == NULL)
return (NULL);
lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
/*
* There must be enough space in the new, empty log block to
* hold reclen. For WR_COPIED, we need to fit the whole
* record in one block, and reclen is the header size + the
* data size. For WR_NEED_COPY, we can create multiple
* records, splitting the data into multiple blocks, so we
* only need to fit one word of data per block; in this case
* reclen is just the header size (no data).
*/
ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
}
/*
* There must be enough space in the log block to hold reclen.
* For WR_COPIED, we need to fit the whole record in one block,
* and reclen is the write record header size + the data size.
* For WR_NEED_COPY, we can create multiple records, splitting
* the data into multiple blocks, so we only need to fit one
* word of data per block; in this case reclen is just the header
* size (no data).
*/
ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
dnow = MIN(dlen, lwb_sp - reclen);
if (dlen > dnow) {
ASSERT3U(lr->lrc_txtype, ==, TX_WRITE);
@ -2087,6 +2191,7 @@ cont:
clrw->lr_length = dnow;
lrw->lr_offset += dnow;
lrw->lr_length -= dnow;
zilog->zl_cur_left -= dnow;
} else {
citx = itx;
clr = lr;
@ -2108,10 +2213,8 @@ cont:
list_insert_tail(&lwb->lwb_itxs, citx);
dlen -= dnow;
if (dlen > 0) {
zilog->zl_cur_used += reclen;
if (dlen > 0)
goto cont;
}
if (lr->lrc_txtype == TX_WRITE &&
lr->lrc_txg > spa_freeze_txg(zilog->zl_spa))
@ -2138,13 +2241,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
if (lr->lrc_txtype == TX_COMMIT)
return;
if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
dlen = P2ROUNDUP_TYPED(
lrw->lr_length, sizeof (uint64_t), uint64_t);
} else {
dlen = 0;
}
reclen = lr->lrc_reclen;
dlen = zil_itx_data_size(itx);
ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled);
lr_buf = lwb->lwb_buf + lwb->lwb_nfilled;
@ -2252,7 +2350,9 @@ zil_itx_create(uint64_t txtype, size_t olrsize)
size_t itxsize, lrsize;
itx_t *itx;
ASSERT3U(olrsize, >=, sizeof (lr_t));
lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t);
ASSERT3U(lrsize, >=, olrsize);
itxsize = offsetof(itx_t, itx_lr) + lrsize;
itx = zio_data_buf_alloc(itxsize);
@ -2271,6 +2371,10 @@ zil_itx_create(uint64_t txtype, size_t olrsize)
static itx_t *
zil_itx_clone(itx_t *oitx)
{
ASSERT3U(oitx->itx_size, >=, sizeof (itx_t));
ASSERT3U(oitx->itx_size, ==,
offsetof(itx_t, itx_lr) + oitx->itx_lr.lrc_reclen);
itx_t *itx = zio_data_buf_alloc(oitx->itx_size);
memcpy(itx, oitx, oitx->itx_size);
itx->itx_callback = NULL;
@ -2281,6 +2385,9 @@ zil_itx_clone(itx_t *oitx)
void
zil_itx_destroy(itx_t *itx)
{
ASSERT3U(itx->itx_size, >=, sizeof (itx_t));
ASSERT3U(itx->itx_lr.lrc_reclen, ==,
itx->itx_size - offsetof(itx_t, itx_lr));
IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL);
IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
@ -2364,7 +2471,7 @@ void
zil_remove_async(zilog_t *zilog, uint64_t oid)
{
uint64_t otxg, txg;
itx_async_node_t *ian;
itx_async_node_t *ian, ian_search;
avl_tree_t *t;
avl_index_t where;
list_t clean_list;
@ -2391,7 +2498,8 @@ zil_remove_async(zilog_t *zilog, uint64_t oid)
* Locate the object node and append its list.
*/
t = &itxg->itxg_itxs->i_async_tree;
ian = avl_find(t, &oid, &where);
ian_search.ia_foid = oid;
ian = avl_find(t, &ian_search, &where);
if (ian != NULL)
list_move_tail(&clean_list, &ian->ia_list);
mutex_exit(&itxg->itxg_lock);
@ -2565,6 +2673,7 @@ zil_get_commit_list(zilog_t *zilog)
ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
list_t *sync_list = &itxg->itxg_itxs->i_sync_list;
itx_t *itx = NULL;
if (unlikely(zilog->zl_suspend > 0)) {
/*
* ZIL was just suspended, but we lost the race.
@ -2574,10 +2683,20 @@ zil_get_commit_list(zilog_t *zilog)
if (!list_is_empty(sync_list))
wtxg = MAX(wtxg, txg);
} else {
itx = list_head(sync_list);
list_move_tail(commit_list, sync_list);
}
mutex_exit(&itxg->itxg_lock);
while (itx != NULL) {
uint64_t s = zil_itx_full_size(itx);
zilog->zl_cur_size += s;
zilog->zl_cur_left += s;
s = zil_itx_record_size(itx);
zilog->zl_cur_max = MAX(zilog->zl_cur_max, s);
itx = list_next(commit_list, itx);
}
}
return (wtxg);
}
@ -2589,7 +2708,7 @@ void
zil_async_to_sync(zilog_t *zilog, uint64_t foid)
{
uint64_t otxg, txg;
itx_async_node_t *ian;
itx_async_node_t *ian, ian_search;
avl_tree_t *t;
avl_index_t where;
@ -2619,7 +2738,8 @@ zil_async_to_sync(zilog_t *zilog, uint64_t foid)
*/
t = &itxg->itxg_itxs->i_async_tree;
if (foid != 0) {
ian = avl_find(t, &foid, &where);
ian_search.ia_foid = foid;
ian = avl_find(t, &ian_search, &where);
if (ian != NULL) {
list_move_tail(&itxg->itxg_itxs->i_sync_list,
&ian->ia_list);
@ -2712,6 +2832,26 @@ zil_commit_writer_stall(zilog_t *zilog)
ASSERT(list_is_empty(&zilog->zl_lwb_list));
}
static void
zil_burst_done(zilog_t *zilog)
{
if (!list_is_empty(&zilog->zl_itx_commit_list) ||
zilog->zl_cur_size == 0)
return;
if (zilog->zl_parallel)
zilog->zl_parallel--;
uint_t r = (zilog->zl_prev_rotor + 1) & (ZIL_BURSTS - 1);
zilog->zl_prev_rotor = r;
zilog->zl_prev_opt[r] = zil_lwb_plan(zilog, zilog->zl_cur_size,
&zilog->zl_prev_min[r]);
zilog->zl_cur_size = 0;
zilog->zl_cur_max = 0;
zilog->zl_cur_left = 0;
}
/*
* This function will traverse the commit list, creating new lwbs as
* needed, and committing the itxs from the commit list to these newly
@ -2726,7 +2866,6 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
list_t nolwb_waiters;
lwb_t *lwb, *plwb;
itx_t *itx;
boolean_t first = B_TRUE;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
@ -2752,9 +2891,22 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
zil_commit_activate_saxattr_feature(zilog);
ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
lwb->lwb_state == LWB_STATE_OPENED);
first = (lwb->lwb_state == LWB_STATE_NEW) &&
((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL ||
plwb->lwb_state == LWB_STATE_FLUSH_DONE);
/*
* If the lwb is still opened, it means the workload is really
* multi-threaded and we won the chance of write aggregation.
* If it is not opened yet, but the previous lwb is still not
* flushed, it still means the workload is multi-threaded, but
* there was too much time between the commits to aggregate, so
* we keep trying aggregation next time, but with less hope.
*/
if (lwb->lwb_state == LWB_STATE_OPENED) {
zilog->zl_parallel = ZIL_BURSTS;
} else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb))
!= NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) {
zilog->zl_parallel = MAX(zilog->zl_parallel,
ZIL_BURSTS / 2);
}
}
while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
@ -2829,7 +2981,9 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* Our lwb is done, leave the rest of the
* itx list to somebody else who cares.
*/
first = B_FALSE;
zilog->zl_parallel = ZIL_BURSTS;
zilog->zl_cur_left -=
zil_itx_full_size(itx);
break;
}
} else {
@ -2839,8 +2993,10 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
}
list_insert_tail(&nolwb_itxs, itx);
}
zilog->zl_cur_left -= zil_itx_full_size(itx);
} else {
ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT);
zilog->zl_cur_left -= zil_itx_full_size(itx);
zil_itx_destroy(itx);
}
}
@ -2921,28 +3077,15 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* try and pack as many itxs into as few lwbs as
* possible, without significantly impacting the latency
* of each individual itx.
*
* If we had no already running or open LWBs, it can be
* the workload is single-threaded. And if the ZIL write
* latency is very small or if the LWB is almost full, it
* may be cheaper to bypass the delay.
*/
if (lwb->lwb_state == LWB_STATE_OPENED && first) {
hrtime_t sleep = zilog->zl_last_lwb_latency *
zfs_commit_timeout_pct / 100;
if (sleep < zil_min_commit_timeout ||
lwb->lwb_nmax - lwb->lwb_nused <
lwb->lwb_nmax / 8) {
list_insert_tail(ilwbs, lwb);
lwb = zil_lwb_write_close(zilog, lwb,
LWB_STATE_NEW);
zilog->zl_cur_used = 0;
if (lwb == NULL) {
while ((lwb = list_remove_head(ilwbs))
!= NULL)
zil_lwb_write_issue(zilog, lwb);
zil_commit_writer_stall(zilog);
}
if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
zil_burst_done(zilog);
list_insert_tail(ilwbs, lwb);
lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
if (lwb == NULL) {
while ((lwb = list_remove_head(ilwbs)) != NULL)
zil_lwb_write_issue(zilog, lwb);
zil_commit_writer_stall(zilog);
}
}
}
@ -3096,24 +3239,11 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* since we've reached the commit waiter's timeout and it still
* hasn't been issued.
*/
zil_burst_done(zilog);
lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
/*
* Since the lwb's zio hadn't been issued by the time this thread
* reached its timeout, we reset the zilog's "zl_cur_used" field
* to influence the zil block size selection algorithm.
*
* By having to issue the lwb's zio here, it means the size of the
* lwb was too large, given the incoming throughput of itxs. By
* setting "zl_cur_used" to zero, we communicate this fact to the
* block size selection algorithm, so it can take this information
* into account, and potentially select a smaller size for the
* next lwb block that is allocated.
*/
zilog->zl_cur_used = 0;
if (nlwb == NULL) {
/*
* When zil_lwb_write_close() returns NULL, this
@ -3708,7 +3838,9 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
zilog->zl_dirty_max_txg = 0;
zilog->zl_last_lwb_opened = NULL;
zilog->zl_last_lwb_latency = 0;
zilog->zl_max_block_size = zil_maxblocksize;
zilog->zl_max_block_size = MIN(MAX(P2ALIGN_TYPED(zil_maxblocksize,
ZIL_MIN_BLKSZ, uint64_t), ZIL_MIN_BLKSZ),
spa_maxblocksize(dmu_objset_spa(os)));
mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
@ -3728,6 +3860,11 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL);
for (int i = 0; i < ZIL_BURSTS; i++) {
zilog->zl_prev_opt[i] = zilog->zl_max_block_size -
sizeof (zil_chain_t);
}
return (zilog);
}
@ -4230,9 +4367,6 @@ EXPORT_SYMBOL(zil_kstat_values_update);
ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW,
"ZIL block open timeout percentage");
ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW,
"Minimum delay we care for ZIL block commit");
ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
"Disable intent logging replay");

View File

@ -306,6 +306,53 @@ zio_fini(void)
* ==========================================================================
*/
#ifdef ZFS_DEBUG
static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
#endif
/*
* Use empty space after the buffer to detect overflows.
*
* Since zio_init() creates kmem caches only for a certain set of buffer sizes,
* allocations of different sizes may have some unused space after the data.
* Filling part of that space with a known pattern on allocation and checking
* it on free should allow us to detect some buffer overflows.
*/
static void
zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
{
#ifdef ZFS_DEBUG
size_t off = P2ROUNDUP(size, sizeof (ulong_t));
ulong_t *canary = p + off / sizeof (ulong_t);
size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;
if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT &&
cache[c] == cache[c + 1])
asize = (c + 2) << SPA_MINBLOCKSHIFT;
for (; off < asize; canary++, off += sizeof (ulong_t))
*canary = zio_buf_canary;
#endif
}
static void
zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
{
#ifdef ZFS_DEBUG
size_t off = P2ROUNDUP(size, sizeof (ulong_t));
ulong_t *canary = p + off / sizeof (ulong_t);
size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;
if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT &&
cache[c] == cache[c + 1])
asize = (c + 2) << SPA_MINBLOCKSHIFT;
for (; off < asize; canary++, off += sizeof (ulong_t)) {
if (unlikely(*canary != zio_buf_canary)) {
PANIC("ZIO buffer overflow %p (%zu) + %zu %#lx != %#lx",
p, size, (canary - p) * sizeof (ulong_t),
*canary, zio_buf_canary);
}
}
#endif
}
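A worked example under assumed sizes (SPA_MINBLOCKSHIFT = 9, i.e. 512-byte granularity):
/*
 * zio_buf_alloc(5000) selects cache c = (5000 - 1) >> 9 = 9, i.e. a
 * 5120-byte buffer. The canary words cover P2ROUNDUP(5000, 8) = 5000
 * up to 5120 (or 5632 if caches 9 and 10 alias the same kmem cache),
 * and zio_buf_free(buf, 5000) panics if any of them changed.
 */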
/*
* Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
* crashdump if the kernel panics, so use it judiciously. Obviously, it's
@ -322,7 +369,9 @@ zio_buf_alloc(size_t size)
atomic_add_64(&zio_buf_cache_allocs[c], 1);
#endif
return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE);
zio_buf_put_canary(p, size, zio_buf_cache, c);
return (p);
}
/*
@ -338,7 +387,9 @@ zio_data_buf_alloc(size_t size)
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE);
zio_buf_put_canary(p, size, zio_data_buf_cache, c);
return (p);
}
void
@ -351,6 +402,7 @@ zio_buf_free(void *buf, size_t size)
atomic_add_64(&zio_buf_cache_frees[c], 1);
#endif
zio_buf_check_canary(buf, size, zio_buf_cache, c);
kmem_cache_free(zio_buf_cache[c], buf);
}
@ -361,6 +413,7 @@ zio_data_buf_free(void *buf, size_t size)
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
zio_buf_check_canary(buf, size, zio_data_buf_cache, c);
kmem_cache_free(zio_data_buf_cache[c], buf);
}
@ -1382,23 +1435,10 @@ zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_done_func_t *done, void *private, zio_flag_t flags)
{
zio_t *zio;
int c;
if (vd->vdev_children == 0) {
zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
zio->io_cmd = cmd;
} else {
zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
for (c = 0; c < vd->vdev_children; c++)
zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
done, private, flags));
}
zio_t *zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
zio->io_cmd = cmd;
return (zio);
}
@ -1569,11 +1609,18 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
}
void
zio_flush(zio_t *zio, vdev_t *vd)
zio_flush(zio_t *pio, vdev_t *vd)
{
zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
NULL, NULL,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
if (vd->vdev_nowritecache)
return;
if (vd->vdev_children == 0) {
zio_nowait(zio_ioctl(pio, vd->vdev_spa, vd,
DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
} else {
for (uint64_t c = 0; c < vd->vdev_children; c++)
zio_flush(pio, vd->vdev_child[c]);
}
}
void

View File

@ -363,11 +363,14 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
zil_chain_t zilc;
abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t));
size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ,
uint64_t);
uint64_t nused = P2ROUNDUP_TYPED(zilc.zc_nused,
ZIL_MIN_BLKSZ, uint64_t);
ASSERT3U(size, >=, nused);
size = nused;
eck = zilc.zc_eck;
eck_offset = offsetof(zil_chain_t, zc_eck);
} else {
ASSERT3U(size, >=, sizeof (zio_eck_t));
eck_offset = size - sizeof (zio_eck_t);
abd_copy_to_buf_off(&eck, abd, eck_offset,
sizeof (zio_eck_t));
@ -448,12 +451,13 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
return (SET_ERROR(ECKSUM));
}
if (nused > size) {
nused = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
if (size < nused)
return (SET_ERROR(ECKSUM));
}
size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
size = nused;
} else {
if (size < sizeof (zio_eck_t))
return (SET_ERROR(ECKSUM));
eck_offset = size - sizeof (zio_eck_t);
abd_copy_to_buf_off(&eck, abd, eck_offset,
sizeof (zio_eck_t));

View File

@ -451,6 +451,8 @@ zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
lr_truncate_t *lr = arg2;
uint64_t offset, length;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -487,6 +489,8 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
dmu_tx_t *tx;
int error;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -516,60 +520,6 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
return (error);
}
/*
* Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed
* after a system failure.
*
* TODO: For now we drop block cloning transations for ZVOLs as they are
* unsupported, but we still need to inform BRT about that as we
* claimed them during pool import.
* This situation can occur when we try to import a pool from a ZFS
* version supporting block cloning for ZVOLs into a system that
* has this ZFS version, that doesn't support block cloning for ZVOLs.
*/
static int
zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
{
char name[ZFS_MAX_DATASET_NAME_LEN];
zvol_state_t *zv = arg1;
objset_t *os = zv->zv_objset;
lr_clone_range_t *lr = arg2;
blkptr_t *bp;
dmu_tx_t *tx;
spa_t *spa;
uint_t ii;
int error;
dmu_objset_name(os, name);
cmn_err(CE_WARN, "ZFS dropping block cloning transaction for %s.",
name);
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
tx = dmu_tx_create(os);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
return (error);
}
spa = os->os_spa;
for (ii = 0; ii < lr->lr_nbps; ii++) {
bp = &lr->lr_bps[ii];
if (!BP_IS_HOLE(bp)) {
zio_free(spa, dmu_tx_get_txg(tx), bp);
}
}
(void) zil_replaying(zv->zv_zilog, tx);
dmu_tx_commit(tx);
return (0);
}
static int
zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
{
@ -604,7 +554,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
zvol_replay_err, /* TX_SETSAXATTR */
zvol_replay_err, /* TX_RENAME_EXCHANGE */
zvol_replay_err, /* TX_RENAME_WHITEOUT */
zvol_replay_clone_range /* TX_CLONE_RANGE */
zvol_replay_err, /* TX_CLONE_RANGE */
};
/*

View File

@ -53,6 +53,12 @@ tags = ['functional', 'arc']
tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on']
tags = ['functional', 'atime']
[tests/functional/block_cloning]
tests = ['block_cloning_clone_mmap_cached',
'block_cloning_copyfilerange',
'block_cloning_copyfilerange_partial']
tags = ['functional', 'block_cloning']
[tests/functional/bootfs]
tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos',
'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos',

View File

@ -42,6 +42,7 @@ tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial',
'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone',
'block_cloning_disabled_ficlonerange',
'block_cloning_copyfilerange_cross_dataset',
'block_cloning_cross_enc_dataset',
'block_cloning_copyfilerange_fallback_same_txg']
tags = ['functional', 'block_cloning']

View File

@ -270,6 +270,7 @@ if sys.platform.startswith('freebsd'):
})
elif sys.platform.startswith('linux'):
maybe.update({
'block_cloning/block_cloning_clone_mmap_cached': ['SKIP', cfr_reason],
'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason],
'fault/auto_online_002_pos': ['FAIL', 11889],
@ -305,6 +306,8 @@ elif sys.platform.startswith('linux'):
['SKIP', cfr_cross_reason],
'block_cloning/block_cloning_copyfilerange_fallback_same_txg':
['SKIP', cfr_cross_reason],
'block_cloning/block_cloning_cross_enc_dataset':
['SKIP', cfr_cross_reason],
})

View File

@ -2,6 +2,7 @@
/btree_test
/chg_usr_exec
/clonefile
/clone_mmap_cached
/devname2devid
/dir_rd_update
/draid

View File

@ -2,6 +2,7 @@ scripts_zfs_tests_bindir = $(datadir)/$(PACKAGE)/zfs-tests/bin
scripts_zfs_tests_bin_PROGRAMS = %D%/chg_usr_exec
scripts_zfs_tests_bin_PROGRAMS += %D%/clone_mmap_cached
scripts_zfs_tests_bin_PROGRAMS += %D%/cp_files
scripts_zfs_tests_bin_PROGRAMS += %D%/ctime
scripts_zfs_tests_bin_PROGRAMS += %D%/dir_rd_update

View File

@ -0,0 +1,146 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2024 by Pawel Jakub Dawidek
*/
#include <sys/mman.h>
#include <sys/stat.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#ifdef __FreeBSD__
#define loff_t off_t
#endif
ssize_t
copy_file_range(int, loff_t *, int, loff_t *, size_t, unsigned int)
__attribute__((weak));
static void *
mmap_file(int fd, size_t size)
{
void *p;
p = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
if (p == MAP_FAILED) {
(void) fprintf(stderr, "mmap failed: %s\n", strerror(errno));
exit(2);
}
return (p);
}
static void
usage(const char *progname)
{
/*
* -i cache input before copy_file_range(2).
* -o cache output before copy_file_range(2).
*/
(void) fprintf(stderr, "usage: %s [-io] <input> <output>\n", progname);
exit(3);
}
int
main(int argc, char *argv[])
{
int dfd, sfd;
size_t dsize, ssize;
void *dmem, *smem, *ptr;
off_t doff, soff;
struct stat sb;
bool cache_input, cache_output;
const char *progname;
int c;
progname = argv[0];
cache_input = cache_output = false;
while ((c = getopt(argc, argv, "io")) != -1) {
switch (c) {
case 'i':
cache_input = true;
break;
case 'o':
cache_output = true;
break;
default:
usage(progname);
}
}
argc -= optind;
argv += optind;
if (argc != 2) {
usage(progname);
}
sfd = open(argv[0], O_RDONLY);
if (sfd == -1) {
(void) fprintf(stderr, "open failed: %s\n", strerror(errno));
exit(2);
}
if (fstat(sfd, &sb) == -1) {
(void) fprintf(stderr, "fstat failed: %s\n", strerror(errno));
exit(2);
}
ssize = sb.st_size;
smem = mmap_file(sfd, ssize);
dfd = open(argv[1], O_RDWR);
if (dfd == -1) {
(void) fprintf(stderr, "open failed: %s\n", strerror(errno));
exit(2);
}
if (fstat(dfd, &sb) == -1) {
(void) fprintf(stderr, "fstat failed: %s\n", strerror(errno));
exit(2);
}
dsize = sb.st_size;
dmem = mmap_file(dfd, dsize);
/*
* Touch every byte so the file data lands in the page cache;
* hopefully the memcpy() won't be compiled out.
*/
if (cache_input) {
ptr = malloc(ssize);
assert(ptr != NULL);
memcpy(ptr, smem, ssize);
free(ptr);
}
if (cache_output) {
ptr = malloc(dsize);
assert(ptr != NULL);
memcpy(ptr, dmem, dsize);
free(ptr);
}
soff = doff = 0;
if (copy_file_range(sfd, &soff, dfd, &doff, ssize, 0) < 0) {
(void) fprintf(stderr, "copy_file_range failed: %s\n",
strerror(errno));
exit(2);
}
exit(memcmp(smem, dmem, ssize) == 0 ? 0 : 1);
}
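(For context, a minimal sketch of driving this helper by hand; the /tank
paths are hypothetical, and the exit codes follow the code above.)

# -i primes the source file in the page cache and -o primes the
# destination, both before copy_file_range(2) is attempted; the
# destination must already exist since it is opened O_RDWR.
dd if=/dev/urandom of=/tank/src bs=1M count=1
dd if=/dev/urandom of=/tank/dst bs=1M count=1
clone_mmap_cached -io /tank/src /tank/dst
echo $?   # 0 = contents match, 1 = mismatch, 2 = syscall error, 3 = usage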

View File

@ -362,12 +362,20 @@ main(void)
return (1);
}
if (t1 == t2) {
(void) fprintf(stderr, "%s: t1(%ld) == t2(%ld)\n",
/*
* Ideally, time change would be exactly two seconds, but allow
* a little slack in case of scheduling delays or similar.
*/
long delta = (long)t2 - (long)t1;
if (delta < 2 || delta > 4) {
(void) fprintf(stderr,
"%s: BAD time change: t1(%ld), t2(%ld)\n",
timetest_table[i].name, (long)t1, (long)t2);
return (1);
} else {
(void) fprintf(stderr, "%s: t1(%ld) != t2(%ld)\n",
(void) fprintf(stderr,
"%s: good time change: t1(%ld), t2(%ld)\n",
timetest_table[i].name, (long)t1, (long)t2);
}
}

View File

@ -184,6 +184,7 @@ export ZFSTEST_FILES='badsend
btree_test
chg_usr_exec
clonefile
clone_mmap_cached
devname2devid
dir_rd_update
draid

View File

@ -440,6 +440,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/atime/setup.ksh \
functional/block_cloning/cleanup.ksh \
functional/block_cloning/setup.ksh \
functional/block_cloning/block_cloning_clone_mmap_cached.ksh \
functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \
functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \
functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \
@ -451,6 +452,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/block_cloning/block_cloning_ficlone.ksh \
functional/block_cloning/block_cloning_ficlonerange.ksh \
functional/block_cloning/block_cloning_ficlonerange_partial.ksh \
functional/block_cloning/block_cloning_cross_enc_dataset.ksh \
functional/bootfs/bootfs_001_pos.ksh \
functional/bootfs/bootfs_002_neg.ksh \
functional/bootfs/bootfs_003_pos.ksh \

View File

@ -28,8 +28,8 @@
function have_same_content
{
typeset hash1=$(cat $1 | md5sum)
typeset hash2=$(cat $2 | md5sum)
typeset hash1=$(md5digest $1)
typeset hash2=$(md5digest $2)
log_must [ "$hash1" = "$hash2" ]
}
@ -44,10 +44,14 @@ function have_same_content
#
function get_same_blocks
{
KEY=$5
if [ ${#KEY} -gt 0 ]; then
KEY="--key=$KEY"
fi
typeset zdbout=${TMPDIR:-$TEST_BASE_DIR}/zdbout.$$
zdb -vvvvv $1 -O $2 | \
zdb $KEY -vvvvv $1 -O $2 | \
awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.a
zdb -vvvvv $3 -O $4 | \
zdb $KEY -vvvvv $3 -O $4 | \
awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.b
echo $(sort $zdbout.a $zdbout.b | uniq -d | cut -f1 -d' ')
}
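(A sketch of what the updated helper now runs for an encrypted dataset;
the pool, dataset, and passphrase names are placeholders, and the awk
fields are simply the columns the function above picks out of zdb's L0
block lines.)

# With a 5th argument, zdb is handed the wrapping key so it can
# traverse the encrypted objset. Lines common to both files' output
# mark blocks shared by cloning.
zdb --key=top_secret -vvvvv tank/encrypted1 -O file | \
awk '/ L0 / { print l++ " " $3 " " $7 }'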

View File

@ -0,0 +1,86 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
#
# DESCRIPTION:
# When the destination file is mmapped and already cached, we need to
# update the mmapped pages after a successful clone.
#
# STRATEGY:
# 1. Create a pool.
# 2. Create two test files with random content.
# 3. mmap the files, read them, and clone from one to the other using
# clone_mmap_cached.
# 4. clone_mmap_cached also verifies that the content of the destination
# file was updated while reading it from mmapped memory.
#
verify_runnable "global"
if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then
log_unsupported "copy_file_range not available before Linux 4.5"
fi
VDIR=$TEST_BASE_DIR/disk-bclone
VDEV="$VDIR/a"
function cleanup
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
rm -rf $VDIR
}
log_onexit cleanup
log_assert "Test for clone into mmaped and cached file"
log_must rm -rf $VDIR
log_must mkdir -p $VDIR
log_must truncate -s 1G $VDEV
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV
log_must zfs create $TESTPOOL/$TESTFS
for opts in "--" "-i" "-o" "-io"
do
log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/src bs=1M count=1
log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/dst bs=1M count=1
# Clear cache.
log_must zpool export $TESTPOOL
log_must zpool import -d $VDIR $TESTPOOL
log_must clone_mmap_cached $opts /$TESTPOOL/$TESTFS/src /$TESTPOOL/$TESTFS/dst
sync_pool $TESTPOOL
log_must sync
log_must have_same_content /$TESTPOOL/$TESTFS/src /$TESTPOOL/$TESTFS/dst
blocks=$(get_same_blocks $TESTPOOL/$TESTFS src $TESTPOOL/$TESTFS dst)
# FreeBSD's seq(1) leaves a trailing space, remove it with sed(1).
log_must [ "$blocks" = "$(seq -s " " 0 7 | sed 's/ $//')" ]
done
log_pass "Clone properly updates mmapped and cached pages"

View File

@ -0,0 +1,170 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Kay Pedersen <mail@mkwg.de>
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
verify_runnable "global"
if [[ $(linux_version) -lt $(linux_version "5.3") ]]; then
log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3"
fi
claim="Block cloning across encrypted datasets."
log_assert $claim
DS1="$TESTPOOL/encrypted1"
DS2="$TESTPOOL/encrypted2"
DS1_NC="$TESTPOOL/notcrypted1"
PASSPHRASE="top_secret"
function prepare_enc
{
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \
"-o keyformat=passphrase -o keylocation=prompt $DS1"
log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \
"-o keyformat=passphrase -o keylocation=prompt $DS2"
log_must zfs create $DS1/child1
log_must zfs create $DS1/child2
log_must zfs create $DS1_NC
log_note "Create test file"
# we must wait until the src file txg is written to the disk otherwise we
# will fallback to normal copy. See "dmu_read_l0_bps" in
# "zfs/module/zfs/dmu.c" and "zfs_clone_range" in
# "zfs/module/zfs/zfs_vnops.c"
log_must dd if=/dev/urandom of=/$DS1/file bs=128K count=4
log_must dd if=/dev/urandom of=/$DS1/child1/file bs=128K count=4
log_must dd if=/dev/urandom of=/$DS1_NC/file bs=128K count=4
log_must sync_pool $TESTPOOL
}
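(A hedged illustration of the sync requirement described in the comment
above; the "fresh" file names are hypothetical. Without sync_pool, the
freshly written blocks have no block pointers on disk yet, so the kernel
falls back to an ordinary copy and no blocks end up shared.)

log_must dd if=/dev/urandom of=/$DS1/fresh bs=128K count=4
log_must clonefile -f /$DS1/fresh /$DS1/fresh_clone 0 0 524288
typeset blocks=$(get_same_blocks $DS1 fresh $DS1 fresh_clone $PASSPHRASE)
log_must [ "$blocks" = "" ]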
function cleanup_enc
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
}
function clone_and_check
{
I_FILE="$1"
O_FILE=$2
I_DS=$3
O_DS=$4
SAME_BLOCKS=$5
# The CLONE option selects between copy_file_range(2), which should
# clone, and dd(1), which always copies.
CLONE=$6
SNAPSHOT=$7
if [ ${#SNAPSHOT} -gt 0 ]; then
I_FILE=".zfs/snapshot/$SNAPSHOT/$1"
fi
if [ "$CLONE" = "true" ]; then
log_must clonefile -f "/$I_DS/$I_FILE" "/$O_DS/$O_FILE" 0 0 524288
else
log_must dd if="/$I_DS/$I_FILE" of="/$O_DS/$O_FILE" bs=128K
fi
log_must sync_pool $TESTPOOL
log_must have_same_content "/$I_DS/$I_FILE" "/$O_DS/$O_FILE"
if [ ${#SNAPSHOT} -gt 0 ]; then
I_DS="$I_DS@$SNAPSHOT"
I_FILE="$1"
fi
typeset blocks=$(get_same_blocks \
$I_DS $I_FILE $O_DS $O_FILE $PASSPHRASE)
log_must [ "$blocks" = "$SAME_BLOCKS" ]
}
log_onexit cleanup_enc
prepare_enc
log_note "Cloning entire file with copy_file_range across different enc" \
"roots, should fallback"
# we are expecting no same block map.
clone_and_check "file" "clone" $DS1 $DS2 "" true
log_note "check if the file is still readable and the same after" \
"unmount and key unload, shouldn't fail"
typeset hash1=$(md5digest "/$DS1/file")
log_must zfs umount $DS1
log_must zfs unload-key $DS1
typeset hash2=$(md5digest "/$DS2/clone")
log_must [ "$hash1" = "$hash2" ]
cleanup_enc
prepare_enc
log_note "Cloning entire file with copy_file_range across different child datasets"
# Cloning shouldn't work because the child derives its own master key;
# we expect no shared block map.
clone_and_check "file" "clone" $DS1 "$DS1/child1" "" true
clone_and_check "file" "clone" "$DS1/child1" "$DS1/child2" "" true
cleanup_enc
prepare_enc
log_note "Copying entire file with copy_file_range across same snapshot"
log_must zfs snapshot -r $DS1@s1
log_must sync_pool $TESTPOOL
log_must rm -f "/$DS1/file"
log_must sync_pool $TESTPOOL
clone_and_check "file" "clone" "$DS1" "$DS1" "0 1 2 3" true "s1"
cleanup_enc
prepare_enc
log_note "Copying entire file with copy_file_range across different snapshot"
clone_and_check "file" "file" $DS1 $DS2 "" true
log_must zfs snapshot -r $DS2@s1
log_must sync_pool $TESTPOOL
log_must rm -f "/$DS1/file" "/$DS2/file"
log_must sync_pool $TESTPOOL
clone_and_check "file" "clone" "$DS2" "$DS1" "" true "s1"
typeset hash1=$(md5digest "/$DS1/.zfs/snapshot/s1/file")
log_note "destroy the snapshot and check if the file is still readable and" \
"has the same content"
log_must zfs destroy -r $DS2@s1
log_must sync_pool $TESTPOOL
typeset hash2=$(md5digest "/$DS1/file")
log_must [ "$hash1" = "$hash2" ]
cleanup_enc
prepare_enc
log_note "Copying with copy_file_range from non encrypted to encrypted"
clone_and_check "file" "copy" $DS1_NC $DS1 "" true
cleanup_enc
prepare_enc
log_note "Copying with copy_file_range from encrypted to non encrypted"
clone_and_check "file" "copy" $DS1 $DS1_NC "" true
log_must sync_pool $TESTPOOL
log_pass $claim

View File

@ -30,6 +30,9 @@
if ! command -v clonefile > /dev/null ; then
log_unsupported "clonefile program required to test block cloning"
fi
if ! command -v clone_mmap_cached > /dev/null ; then
log_unsupported "clone_mmap_cached program required to test block cloning"
fi
verify_runnable "global"

View File

@ -31,15 +31,13 @@
# 2. Set l2arc_write_max to a value larger than the cache device.
# 3. Create a file larger than the cache device and random read
# for 10 sec.
# 4. Verify that l2arc_write_max is set back to the default.
# 5. Set l2arc_write_max to a value less than the cache device size but
# 4. Set l2arc_write_max to a value less than the cache device size but
# larger than the default (256MB).
# 6. Record the l2_size.
# 7. Random read for 1 sec.
# 8. Record the l2_size again.
# 9. If (6) <= (8) then we have not looped around yet.
# 10. If (6) > (8) then we looped around. Break out of the loop and test.
# 11. Destroy pool.
# 5. Record the l2_size.
# 6. Random read for 1 sec.
# 7. Record the l2_size again.
# 8. If (5) <= (7) then we have not looped around yet.
# 9. Destroy pool.
#
verify_runnable "global"
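(The loop-around detection those renumbered steps describe, sketched
with the suite's helpers; the get_arcstat/l2_size naming is assumed from
the test library rather than shown in this hunk.)

typeset do_once=true
while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do
typeset l2_size1=$(get_arcstat l2_size)
log_must fio $FIO_SCRIPTS/random_reads.fio
typeset l2_size2=$(get_arcstat l2_size)
do_once=false
done
# Once the later sample is smaller, the device has wrapped at least once.
log_must test $l2_size1 -gt $l2_size2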
@ -93,10 +91,6 @@ log_must zfs set relatime=off $TESTPOOL
log_must fio $FIO_SCRIPTS/mkfiles.fio
log_must fio $FIO_SCRIPTS/random_reads.fio
typeset write_max2=$(get_tunable L2ARC_WRITE_MAX)
log_must test $write_max2 -eq $write_max
log_must set_tunable32 L2ARC_WRITE_MAX $(( 256 * 1024 * 1024 ))
export RUNTIME=1
@ -108,8 +102,6 @@ while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do
do_once=false
done
log_must test $l2_size1 -gt $l2_size2
log_must zpool destroy $TESTPOOL
log_pass "Looping around a cache device succeeds."

View File

@ -44,6 +44,13 @@ if ! $(grep -q "CONFIG_IO_URING=y" /boot/config-$(uname -r)); then
log_unsupported "Requires io_uring support"
fi
if [ -e /etc/os-release ] ; then
source /etc/os-release
if [ -n "$REDHAT_SUPPORT_PRODUCT_VERSION" ] && ((floor($REDHAT_SUPPORT_PRODUCT_VERSION) == 9)) ; then
log_unsupported "Disabled on CentOS 9, fails with 'Operation not permitted'"
fi
fi
fio --ioengine=io_uring --parse-only || log_unsupported "fio io_uring support required"
function cleanup