Merge pull request #158 from truenas/zfs-2.2-release-cobia-rc1

Sync with upstream zfs-2.2-release branch
Alexander Motin 2023-08-25 16:27:57 -04:00 committed by GitHub
commit f8c61e8326
36 changed files with 893 additions and 566 deletions

View File

@ -79,6 +79,7 @@
#include <sys/dsl_crypt.h>
#include <sys/dsl_scan.h>
#include <sys/btree.h>
#include <sys/brt.h>
#include <zfs_comutil.h>
#include <sys/zstd/zstd.h>
@ -5342,12 +5343,20 @@ static const char *zdb_ot_extname[] = {
#define ZB_TOTAL DN_MAX_LEVELS
#define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1)
typedef struct zdb_brt_entry {
dva_t zbre_dva;
uint64_t zbre_refcount;
avl_node_t zbre_node;
} zdb_brt_entry_t;
typedef struct zdb_cb {
zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
uint64_t zcb_removing_size;
uint64_t zcb_checkpoint_size;
uint64_t zcb_dedup_asize;
uint64_t zcb_dedup_blocks;
uint64_t zcb_clone_asize;
uint64_t zcb_clone_blocks;
uint64_t zcb_psize_count[SPA_MAX_FOR_16M];
uint64_t zcb_lsize_count[SPA_MAX_FOR_16M];
uint64_t zcb_asize_count[SPA_MAX_FOR_16M];
@ -5368,6 +5377,8 @@ typedef struct zdb_cb {
int zcb_haderrors;
spa_t *zcb_spa;
uint32_t **zcb_vd_obsolete_counts;
avl_tree_t zcb_brt;
boolean_t zcb_brt_is_active;
} zdb_cb_t;
/* test if two DVA offsets from same vdev are within the same metaslab */
@ -5662,6 +5673,45 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
zcb->zcb_asize_total += BP_GET_ASIZE(bp);
if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) {
/*
* Cloned blocks are special. We need to count them, so we can
* later uncount them when reporting leaked space, and we must
* only claim them once.
*
* To do this, we keep our own in-memory BRT. For each block
* we haven't seen before, we look it up in the real BRT and
* if it's there, we note it and its refcount, then proceed as
* normal. If we see the block again, we count it as a clone
* and then give it no further consideration.
*/
zdb_brt_entry_t zbre_search, *zbre;
avl_index_t where;
zbre_search.zbre_dva = bp->blk_dva[0];
zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
if (zbre != NULL) {
zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
zcb->zcb_clone_blocks++;
zbre->zbre_refcount--;
if (zbre->zbre_refcount == 0) {
avl_remove(&zcb->zcb_brt, zbre);
umem_free(zbre, sizeof (zdb_brt_entry_t));
}
return;
}
uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp);
if (crefcnt > 0) {
zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
UMEM_NOFAIL);
zbre->zbre_dva = bp->blk_dva[0];
zbre->zbre_refcount = crefcnt;
avl_insert(&zcb->zcb_brt, zbre, where);
}
}
if (dump_opt['L'])
return;
@ -6664,6 +6714,20 @@ deleted_livelists_dump_mos(spa_t *spa)
iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
}
static int
zdb_brt_entry_compare(const void *zcn1, const void *zcn2)
{
const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva;
const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva;
int cmp;
cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
if (cmp == 0)
cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2));
return (cmp);
}
static int
dump_block_stats(spa_t *spa)
{
@ -6678,6 +6742,13 @@ dump_block_stats(spa_t *spa)
zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);
if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
avl_create(&zcb->zcb_brt, zdb_brt_entry_compare,
sizeof (zdb_brt_entry_t),
offsetof(zdb_brt_entry_t, zbre_node));
zcb->zcb_brt_is_active = B_TRUE;
}
(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
(dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
(dump_opt['c'] == 1) ? "metadata " : "",
@ -6779,7 +6850,8 @@ dump_block_stats(spa_t *spa)
metaslab_class_get_alloc(spa_special_class(spa)) +
metaslab_class_get_alloc(spa_dedup_class(spa)) +
get_unflushed_alloc_space(spa);
total_found = tzb->zb_asize - zcb->zcb_dedup_asize +
total_found =
tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize +
zcb->zcb_removing_size + zcb->zcb_checkpoint_size;
if (total_found == total_alloc && !dump_opt['L']) {
@ -6820,6 +6892,9 @@ dump_block_stats(spa_t *spa)
"bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize,
(u_longlong_t)zcb->zcb_dedup_blocks,
(double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0);
(void) printf("\t%-16s %14llu count: %6llu\n",
"bp cloned:", (u_longlong_t)zcb->zcb_clone_asize,
(u_longlong_t)zcb->zcb_clone_blocks);
(void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:",
(u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
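Why zcb_clone_asize is subtracted in total_found above: traversal counts a cloned block's asize once per referencing BP, but the block is only allocated once on disk. As a worked example (hypothetical numbers), a 128K block cloned three times has a BRT refcount of 3 and is reached through four BPs: the first visit is counted normally and stores the refcount, while each of the other three visits adds 128K to zcb_clone_asize, so the 384K of double-counting is removed before total_found is compared with the space the metaslabs report as allocated.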

View File

@ -0,0 +1,61 @@
#!/bin/sh
#
# Turn off disk's enclosure slot if it becomes FAULTED.
#
# Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos
# as they flip between FAULTED and ONLINE. If
# ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets
# FAULTED, then power down the slot via sysfs:
#
# /sys/class/enclosure/<enclosure>/<slot>/power_status
#
# We assume the user will be responsible for turning the slot back on again.
#
# Note that this script requires that your enclosure be supported by the
# Linux SCSI Enclosure services (SES) driver. The script will do nothing
# if you have no enclosure, or if your enclosure isn't supported.
#
# Exit codes:
# 0: slot successfully powered off
# 1: enclosure not available
# 2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled
# 3: vdev was not FAULTED
# 4: The enclosure sysfs path passed from ZFS does not exist
# 5: Enclosure slot didn't actually turn off after we told it to
[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
. "${ZED_ZEDLET_DIR}/zed-functions.sh"
if [ ! -d /sys/class/enclosure ] ; then
# No JBOD enclosure or NVMe slots
exit 1
fi
if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then
exit 2
fi
if [ "$ZEVENT_VDEV_STATE_STR" != "FAULTED" ] ; then
exit 3
fi
if [ ! -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] ; then
exit 4
fi
echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status"
# Wait for sysfs to report that the slot is off. It can take ~400ms on some
# enclosures.
for i in $(seq 1 20) ; do
if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then
break
fi
sleep 0.1
done
if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" != "off" ] ; then
exit 5
fi
zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH"
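As the header comment says, the script deliberately leaves the slot powered off. A hypothetical recovery sketch, assuming the same SES sysfs layout used above (the <enclosure> and <slot> path components are placeholders):
# List any slots currently powered off.
grep -l off /sys/class/enclosure/*/*/power_status
# After replacing or reseating the disk, turn the slot back on (as root).
echo "on" > /sys/class/enclosure/<enclosure>/<slot>/power_status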

View File

@ -142,3 +142,8 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event"
# Disabled by default, 1 to enable and 0 to disable.
#ZED_SYSLOG_DISPLAY_GUIDS=1
##
# Power off the drive's slot in the enclosure if it becomes FAULTED. This can
# help silence misbehaving drives. This assumes your drive enclosure fully
# supports slot power control via sysfs.
#ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1

View File

@ -2412,7 +2412,6 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
int error;
ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);
ztest_object_lock(zd, object, RL_READER);
@ -2446,6 +2445,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
DMU_READ_NO_PREFETCH);
ASSERT0(error);
} else {
ASSERT3P(zio, !=, NULL);
size = doi.doi_data_block_size;
if (ISP2(size)) {
offset = P2ALIGN(offset, size);

View File

@ -12,11 +12,12 @@ ExecStart=/bin/sh -c '
decode_root_args || exit 0; \
[ "$root" = "zfs:AUTO" ] && root="$(@sbindir@/zpool list -H -o bootfs | grep -m1 -vFx -)"; \
rootflags="$(getarg rootflags=)"; \
case ",$rootflags," in \
*,zfsutil,*) ;; \
,,) rootflags=zfsutil ;; \
*) rootflags="zfsutil,$rootflags" ;; \
esac; \
[ "$(@sbindir@/zfs get -H -o value mountpoint "$root")" = legacy ] || \
case ",$rootflags," in \
*,zfsutil,*) ;; \
,,) rootflags=zfsutil ;; \
*) rootflags="zfsutil,$rootflags" ;; \
esac; \
exec systemctl set-environment BOOTFS="$root" BOOTFSFLAGS="$rootflags"'
[Install]

View File

@ -198,6 +198,14 @@ extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache);
spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl)
#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move)
#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc)
/*
* This is necessary to be compatible with other kernel modules
* or in-tree filesystems that may define kmem_cache_alloc,
* as bcachefs does now.
*/
#ifdef kmem_cache_alloc
#undef kmem_cache_alloc
#endif
#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags)
#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj)
#define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc)

View File

@ -36,6 +36,7 @@ extern "C" {
#endif
extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp);
extern uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp);
extern uint64_t brt_get_dspace(spa_t *spa);
extern uint64_t brt_get_used(spa_t *spa);

View File

@ -572,11 +572,15 @@ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, int read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp);
int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **dbp);
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
const void *tag, dmu_buf_t **dbp, int flags);
int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
uint64_t length, boolean_t read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp, uint32_t flags);
int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag,
dmu_buf_t **dbp);
/*
* Add a reference to a dmu buffer that has already been held via
* dmu_buf_hold() in the current context.

View File

@ -247,8 +247,6 @@ typedef struct dmu_sendstatus {
void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t,
const void *, dmu_buf_t **);
#ifdef __cplusplus
}

View File

@ -80,7 +80,6 @@ uint64_t metaslab_largest_allocatable(metaslab_t *);
#define METASLAB_ASYNC_ALLOC 0x8
#define METASLAB_DONT_THROTTLE 0x10
#define METASLAB_MUST_RESERVE 0x20
#define METASLAB_FASTWRITE 0x40
#define METASLAB_ZIL 0x80
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
@ -96,8 +95,6 @@ void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
void metaslab_check_free(spa_t *, const blkptr_t *);
void metaslab_fastwrite_mark(spa_t *, const blkptr_t *);
void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *);
void metaslab_stat_init(void);
void metaslab_stat_fini(void);

View File

@ -313,7 +313,7 @@ struct metaslab_group {
* Each metaslab maintains a set of in-core trees to track metaslab
* operations. The in-core free tree (ms_allocatable) contains the list of
* free segments which are eligible for allocation. As blocks are
* allocated, the allocated segment are removed from the ms_allocatable and
* allocated, the allocated segments are removed from the ms_allocatable and
* added to a per txg allocation tree (ms_allocating). As blocks are
* freed, they are added to the free tree (ms_freeing). These trees
* allow us to process all allocations and frees in syncing context
@ -366,9 +366,9 @@ struct metaslab_group {
struct metaslab {
/*
* This is the main lock of the metaslab and its purpose is to
* coordinate our allocations and frees [e.g metaslab_block_alloc(),
* coordinate our allocations and frees [e.g., metaslab_block_alloc(),
* metaslab_free_concrete(), ..etc] with our various syncing
* procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc].
* procedures [e.g., metaslab_sync(), metaslab_sync_done(), ..etc].
*
* The lock is also used during some miscellaneous operations like
* using the metaslab's histogram for the metaslab group's histogram

View File

@ -266,7 +266,6 @@ struct vdev {
metaslab_group_t *vdev_mg; /* metaslab group */
metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */
metaslab_t **vdev_ms; /* metaslab array */
uint64_t vdev_pending_fastwrite; /* allocated fastwrites */
txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */

View File

@ -38,14 +38,22 @@ extern "C" {
/*
* Possible states for a given lwb structure.
*
* An lwb will start out in the "closed" state, and then transition to
* the "opened" state via a call to zil_lwb_write_open(). When
* transitioning from "closed" to "opened" the zilog's "zl_issuer_lock"
* must be held.
* An lwb will start out in the "new" state, and transition to the "opened"
* state via a call to zil_lwb_write_open() on first itx assignment. When
* transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be
* held.
*
* After the lwb is "opened", it can transition into the "issued" state
* via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must
* be held when making this transition.
* After the lwb is "opened", it can be assigned number of itxs and transition
* into the "closed" state via zil_lwb_write_close() when full or on timeout.
* When transitioning from "opened" to "closed" the zilog's "zl_issuer_lock"
* must be held. New lwb allocation also takes "zl_lock" to protect the list.
*
* After the lwb is "closed", it can transition into the "ready" state via
* zil_lwb_write_issue(). "zl_lock" must be held when making this transition.
* Since it is done by the same thread, "zl_issuer_lock" is not needed.
*
* When lwb in "ready" state receives its block pointer, it can transition to
* "issued". "zl_lock" must be held when making this transition.
*
* After the lwb's write zio completes, it transitions into the "write
* done" state via zil_lwb_write_done(); and then into the "flush done"
@ -62,17 +70,20 @@ extern "C" {
*
* Additionally, correctness when reading an lwb's state is often
* achieved by exploiting the fact that these state transitions occur in
* this specific order; i.e. "closed" to "opened" to "issued" to "done".
* this specific order; i.e. "new" to "opened" to "closed" to "ready" to
* "issued" to "write_done" and finally "flush_done".
*
* Thus, if an lwb is in the "closed" or "opened" state, holding the
* Thus, if an lwb is in the "new" or "opened" state, holding the
* "zl_issuer_lock" will prevent a concurrent thread from transitioning
* that lwb to the "issued" state. Likewise, if an lwb is already in the
* "issued" state, holding the "zl_lock" will prevent a concurrent
* thread from transitioning that lwb to the "write done" state.
* that lwb to the "closed" state. Likewise, if an lwb is already in the
* "ready" state, holding the "zl_lock" will prevent a concurrent thread
* from transitioning that lwb to the "issued" state.
*/
typedef enum {
LWB_STATE_CLOSED,
LWB_STATE_NEW,
LWB_STATE_OPENED,
LWB_STATE_CLOSED,
LWB_STATE_READY,
LWB_STATE_ISSUED,
LWB_STATE_WRITE_DONE,
LWB_STATE_FLUSH_DONE,
@ -91,18 +102,21 @@ typedef enum {
typedef struct lwb {
zilog_t *lwb_zilog; /* back pointer to log struct */
blkptr_t lwb_blk; /* on disk address of this log blk */
boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */
boolean_t lwb_slim; /* log block has slim format */
boolean_t lwb_slog; /* lwb_blk is on SLOG device */
boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */
int lwb_error; /* log block allocation error */
int lwb_nmax; /* max bytes in the buffer */
int lwb_nused; /* # used bytes in buffer */
int lwb_nfilled; /* # filled bytes in buffer */
int lwb_sz; /* size of block and buffer */
lwb_state_t lwb_state; /* the state of this lwb */
char *lwb_buf; /* log write buffer */
zio_t *lwb_child_zio; /* parent zio for children */
zio_t *lwb_write_zio; /* zio for the lwb buffer */
zio_t *lwb_root_zio; /* root zio for lwb write and flushes */
hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */
uint64_t lwb_issued_txg; /* the txg when the write is issued */
uint64_t lwb_alloc_txg; /* the txg when lwb_blk is allocated */
uint64_t lwb_max_txg; /* highest txg in this lwb */
list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */
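The state-ordering argument in the comment above can be relied on directly, since the LWB_STATE_* constants are declared in transition order. A minimal hypothetical sketch (lwb_state_before_close() is an invented name, not part of this change):

static boolean_t
lwb_state_before_close(const lwb_t *lwb)
{
	/*
	 * NEW < OPENED < CLOSED < READY < ISSUED, so a reader holding
	 * zl_issuer_lock that observes NEW or OPENED knows the lwb
	 * cannot concurrently advance to CLOSED.
	 */
	return (lwb->lwb_state < LWB_STATE_CLOSED);
}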

View File

@ -222,7 +222,6 @@ typedef uint64_t zio_flag_t;
#define ZIO_FLAG_NOPWRITE (1ULL << 28)
#define ZIO_FLAG_REEXECUTED (1ULL << 29)
#define ZIO_FLAG_DELEGATED (1ULL << 30)
#define ZIO_FLAG_FASTWRITE (1ULL << 31)
#define ZIO_FLAG_MUSTSUCCEED 0
#define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)

View File

@ -57,7 +57,7 @@ libzfs_la_LIBADD = \
libzutil.la \
libuutil.la
libzfs_la_LIBADD += -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL)
libzfs_la_LIBADD += -lrt -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL)
libzfs_la_LDFLAGS = -pthread

View File

@ -3928,6 +3928,12 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
switch (errno) {
case EALREADY:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"removal for this vdev is already in progress."));
(void) zfs_error(hdl, EZFS_BUSY, errbuf);
break;
case EINVAL:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid config; all top-level vdevs must "

View File

@ -928,6 +928,39 @@ zfs_send_progress(zfs_handle_t *zhp, int fd, uint64_t *bytes_written,
return (0);
}
static volatile boolean_t send_progress_thread_signal_duetotimer;
static void
send_progress_thread_act(int sig, siginfo_t *info, void *ucontext)
{
(void) sig, (void) ucontext;
send_progress_thread_signal_duetotimer = info->si_code == SI_TIMER;
}
struct timer_desirability {
timer_t timer;
boolean_t desired;
};
static void
timer_delete_cleanup(void *timer)
{
struct timer_desirability *td = timer;
if (td->desired)
timer_delete(td->timer);
}
#ifdef SIGINFO
#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO sigaddset(&new, SIGINFO)
#else
#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO
#endif
#define SEND_PROGRESS_THREAD_PARENT_BLOCK(old) { \
sigset_t new; \
sigemptyset(&new); \
sigaddset(&new, SIGUSR1); \
SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO; \
pthread_sigmask(SIG_BLOCK, &new, old); \
}
static void *
send_progress_thread(void *arg)
{
@ -941,6 +974,26 @@ send_progress_thread(void *arg)
struct tm tm;
int err;
const struct sigaction signal_action =
{.sa_sigaction = send_progress_thread_act, .sa_flags = SA_SIGINFO};
struct sigevent timer_cfg =
{.sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGUSR1};
const struct itimerspec timer_time =
{.it_value = {.tv_sec = 1}, .it_interval = {.tv_sec = 1}};
struct timer_desirability timer = {};
sigaction(SIGUSR1, &signal_action, NULL);
#ifdef SIGINFO
sigaction(SIGINFO, &signal_action, NULL);
#endif
if ((timer.desired = pa->pa_progress || pa->pa_astitle)) {
if (timer_create(CLOCK_MONOTONIC, &timer_cfg, &timer.timer))
return ((void *)(uintptr_t)errno);
(void) timer_settime(timer.timer, 0, &timer_time, NULL);
}
pthread_cleanup_push(timer_delete_cleanup, &timer);
if (!pa->pa_parsable && pa->pa_progress) {
(void) fprintf(stderr,
"TIME %s %sSNAPSHOT %s\n",
@ -953,12 +1006,12 @@ send_progress_thread(void *arg)
* Print the progress from ZFS_IOC_SEND_PROGRESS every second.
*/
for (;;) {
(void) sleep(1);
pause();
if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes,
&blocks)) != 0) {
if (err == EINTR || err == ENOENT)
return ((void *)0);
return ((void *)(uintptr_t)err);
err = 0;
pthread_exit(((void *)(uintptr_t)err));
}
(void) time(&t);
@ -991,21 +1044,25 @@ send_progress_thread(void *arg)
(void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n",
tm.tm_hour, tm.tm_min, tm.tm_sec,
(u_longlong_t)bytes, zhp->zfs_name);
} else if (pa->pa_progress) {
} else if (pa->pa_progress ||
!send_progress_thread_signal_duetotimer) {
zfs_nicebytes(bytes, buf, sizeof (buf));
(void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n",
tm.tm_hour, tm.tm_min, tm.tm_sec,
buf, zhp->zfs_name);
}
}
pthread_cleanup_pop(B_TRUE);
}
static boolean_t
send_progress_thread_exit(libzfs_handle_t *hdl, pthread_t ptid)
send_progress_thread_exit(
libzfs_handle_t *hdl, pthread_t ptid, sigset_t *oldmask)
{
void *status = NULL;
(void) pthread_cancel(ptid);
(void) pthread_join(ptid, &status);
pthread_sigmask(SIG_SETMASK, oldmask, NULL);
int error = (int)(uintptr_t)status;
if (error != 0 && status != PTHREAD_CANCELED)
return (zfs_standard_error(hdl, error,
@ -1199,7 +1256,8 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
* If progress reporting is requested, spawn a new thread to
* poll ZFS_IOC_SEND_PROGRESS at a regular interval.
*/
if (sdd->progress || sdd->progressastitle) {
sigset_t oldmask;
{
pa.pa_zhp = zhp;
pa.pa_fd = sdd->outfd;
pa.pa_parsable = sdd->parsable;
@ -1214,13 +1272,13 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
zfs_close(zhp);
return (err);
}
SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask);
}
err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
fromorigin, sdd->outfd, flags, sdd->debugnv);
if ((sdd->progress || sdd->progressastitle) &&
send_progress_thread_exit(zhp->zfs_hdl, tid))
if (send_progress_thread_exit(zhp->zfs_hdl, tid, &oldmask))
return (-1);
}
@ -1562,8 +1620,9 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags,
progress_arg_t pa = { 0 };
int err = 0;
pthread_t ptid;
sigset_t oldmask;
if (flags->progress || flags->progressastitle) {
{
pa.pa_zhp = zhp;
pa.pa_fd = fd;
pa.pa_parsable = flags->parsable;
@ -1577,6 +1636,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags,
return (zfs_error(zhp->zfs_hdl,
EZFS_THREADCREATEFAILED, errbuf));
}
SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask);
}
err = lzc_send_space_resume_redacted(zhp->zfs_name, from,
@ -1584,8 +1644,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags,
redactbook, fd, &size);
*sizep = size;
if ((flags->progress || flags->progressastitle) &&
send_progress_thread_exit(zhp->zfs_hdl, ptid))
if (send_progress_thread_exit(zhp->zfs_hdl, ptid, &oldmask))
return (-1);
if (!flags->progress && !flags->parsable)
@ -1876,11 +1935,12 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags,
if (!flags->dryrun) {
progress_arg_t pa = { 0 };
pthread_t tid;
sigset_t oldmask;
/*
* If progress reporting is requested, spawn a new thread to
* poll ZFS_IOC_SEND_PROGRESS at a regular interval.
*/
if (flags->progress || flags->progressastitle) {
{
pa.pa_zhp = zhp;
pa.pa_fd = outfd;
pa.pa_parsable = flags->parsable;
@ -1898,6 +1958,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags,
zfs_close(zhp);
return (error);
}
SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask);
}
error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd,
@ -1905,8 +1966,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags,
if (redact_book != NULL)
free(redact_book);
if ((flags->progressastitle || flags->progress) &&
send_progress_thread_exit(hdl, tid)) {
if (send_progress_thread_exit(hdl, tid, &oldmask)) {
zfs_close(zhp);
return (-1);
}
@ -2691,7 +2751,8 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd,
* If progress reporting is requested, spawn a new thread to poll
* ZFS_IOC_SEND_PROGRESS at a regular interval.
*/
if (flags->progress || flags->progressastitle) {
sigset_t oldmask;
{
pa.pa_zhp = zhp;
pa.pa_fd = fd;
pa.pa_parsable = flags->parsable;
@ -2708,13 +2769,13 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd,
return (zfs_error(zhp->zfs_hdl,
EZFS_THREADCREATEFAILED, errbuf));
}
SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask);
}
err = lzc_send_redacted(name, from, fd,
lzc_flags_from_sendflags(flags), redactbook);
if ((flags->progress || flags->progressastitle) &&
send_progress_thread_exit(hdl, ptid))
if (send_progress_thread_exit(hdl, ptid, &oldmask))
return (-1);
if (err == 0 && (flags->props || flags->holds || flags->backup)) {
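The mechanism above replaces the old sleep(1) loop with a POSIX interval timer that delivers SIGUSR1, so pause() wakes once per second or whenever a user sends the signal by hand, and si_code distinguishes the two. A minimal standalone sketch of that pattern (hypothetical, not part of the diff; the -lrt now added to libzfs_la_LIBADD is what timer_create() needs):

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static volatile sig_atomic_t due_to_timer;

static void
act(int sig, siginfo_t *info, void *uctx)
{
	(void) sig, (void) uctx;
	/* SI_TIMER: the interval timer fired; otherwise a user sent it. */
	due_to_timer = (info->si_code == SI_TIMER);
}

int
main(void)
{
	const struct sigaction sa =
	    {.sa_sigaction = act, .sa_flags = SA_SIGINFO};
	struct sigevent ev =
	    {.sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGUSR1};
	const struct itimerspec its =
	    {.it_value = {.tv_sec = 1}, .it_interval = {.tv_sec = 1}};
	timer_t t;

	sigaction(SIGUSR1, &sa, NULL);
	if (timer_create(CLOCK_MONOTONIC, &ev, &t) != 0)
		return (1);
	(void) timer_settime(t, 0, &its, NULL);
	for (int i = 0; i < 5; i++) {
		pause();	/* once per second, or on kill -USR1 */
		printf("report (%s)\n",
		    due_to_timer ? "timer" : "user signal");
	}
	(void) timer_delete(t);
	return (0);
}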

View File

@ -29,7 +29,7 @@
.\" Copyright 2018 Nexenta Systems, Inc.
.\" Copyright 2019 Joyent, Inc.
.\"
.Dd January 12, 2023
.Dd July 27, 2023
.Dt ZFS-SEND 8
.Os
.
@ -297,6 +297,12 @@ This flag can only be used in conjunction with
.It Fl v , -verbose
Print verbose information about the stream package generated.
This information includes a per-second report of how much data has been sent.
The same report can be requested by sending
.Dv SIGINFO
or
.Dv SIGUSR1 ,
regardless of
.Fl v .
.Pp
The format of the stream is committed.
You will be able to receive your streams on future versions of ZFS.
@ -433,6 +439,12 @@ and the verbose output goes to standard error
.It Fl v , -verbose
Print verbose information about the stream package generated.
This information includes a per-second report of how much data has been sent.
The same report can be requested by sending
.Dv SIGINFO
or
.Dv SIGUSR1 ,
regardless of
.Fl v .
.El
.It Xo
.Nm zfs
@ -669,6 +681,10 @@ ones on the source, and are ready to be used, while the parent snapshot on the
target contains none of the username and password data present on the source,
because it was removed by the redacted send operation.
.
.Sh SIGNALS
See
.Fl v .
.
.Sh EXAMPLES
.\" These are, respectively, examples 12, 13 from zfs.8
.\" Make sure to update them bidirectionally

View File

@ -6290,7 +6290,8 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
ap->a_outoffp, &len, ap->a_outcred);
if (error == EXDEV || error == EOPNOTSUPP)
if (error == EXDEV || error == EAGAIN || error == EINVAL ||
error == EOPNOTSUPP)
goto bad_locked_fallback;
*ap->a_lenp = (size_t)len;
out_locked:

View File

@ -478,17 +478,19 @@ zfsctl_is_snapdir(struct inode *ip)
*/
static struct inode *
zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
const struct file_operations *fops, const struct inode_operations *ops)
const struct file_operations *fops, const struct inode_operations *ops,
uint64_t creation)
{
inode_timespec_t now;
struct inode *ip;
znode_t *zp;
inode_timespec_t now = {.tv_sec = creation};
ip = new_inode(zfsvfs->z_sb);
if (ip == NULL)
return (NULL);
now = current_time(ip);
if (!creation)
now = current_time(ip);
zp = ITOZ(ip);
ASSERT3P(zp->z_dirlocks, ==, NULL);
ASSERT3P(zp->z_acl_cached, ==, NULL);
@ -552,14 +554,28 @@ zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id,
const struct file_operations *fops, const struct inode_operations *ops)
{
struct inode *ip = NULL;
uint64_t creation = 0;
dsl_dataset_t *snap_ds;
dsl_pool_t *pool;
while (ip == NULL) {
ip = ilookup(zfsvfs->z_sb, (unsigned long)id);
if (ip)
break;
if (id <= ZFSCTL_INO_SNAPDIRS && !creation) {
pool = dmu_objset_pool(zfsvfs->z_os);
dsl_pool_config_enter(pool, FTAG);
if (!dsl_dataset_hold_obj(pool,
ZFSCTL_INO_SNAPDIRS - id, FTAG, &snap_ds)) {
creation = dsl_get_creation(snap_ds);
dsl_dataset_rele(snap_ds, FTAG);
}
dsl_pool_config_exit(pool, FTAG);
}
/* May fail due to concurrent zfsctl_inode_alloc() */
ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops);
ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops, creation);
}
return (ip);
@ -581,7 +597,7 @@ zfsctl_create(zfsvfs_t *zfsvfs)
ASSERT(zfsvfs->z_ctldir == NULL);
zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT,
&zpl_fops_root, &zpl_ops_root);
&zpl_fops_root, &zpl_ops_root, 0);
if (zfsvfs->z_ctldir == NULL)
return (SET_ERROR(ENOENT));

View File

@ -103,9 +103,17 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
* Since Linux 5.3 the filesystem driver is responsible for executing
* an appropriate fallback, and a generic fallback function is provided.
*/
if (ret == -EOPNOTSUPP || ret == -EXDEV)
if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
ret == -EAGAIN)
ret = generic_copy_file_range(src_file, src_off, dst_file,
dst_off, len, flags);
#else
/*
* Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal
* to the kernel that it should fallback to a content copy.
*/
if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN)
ret = -EOPNOTSUPP;
#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */
return (ret);
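From userspace, the upshot of the pre-5.3 branch above is that a failed clone surfaces as EOPNOTSUPP, so portable callers should still keep a manual copy path. A minimal hypothetical sketch (copy_with_fallback() is an invented helper), assuming Linux with glibc's copy_file_range(2) wrapper:

#define _GNU_SOURCE
#include <errno.h>
#include <unistd.h>

/* Clone if the kernel can, else copy by hand; both fds' current
 * offsets are used, matching copy_file_range() with NULL offsets. */
static ssize_t
copy_with_fallback(int sfd, int dfd, size_t len)
{
	ssize_t n = copy_file_range(sfd, NULL, dfd, NULL, len, 0);
	if (n >= 0 || (errno != EOPNOTSUPP && errno != EXDEV))
		return (n);
	char buf[65536];
	size_t total = 0;
	while (total < len) {
		size_t want = len - total;
		if (want > sizeof (buf))
			want = sizeof (buf);
		ssize_t r = read(sfd, buf, want);
		if (r <= 0)
			return (r < 0 ? r : (ssize_t)total);
		for (ssize_t off = 0; off < r; ) {
			ssize_t w = write(dfd, buf + off, (size_t)(r - off));
			if (w < 0)
				return (-1);
			off += w;
		}
		total += (size_t)r;
	}
	return ((ssize_t)total);
}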

View File

@ -174,7 +174,7 @@
* size_t len, unsigned int flags);
*
* Even though offsets and length represent bytes, they have to be
* block-aligned or we will return the EXDEV error so the upper layer can
* block-aligned or we will return an error so the upper layer can
* fallback to the generic mechanism that will just copy the data.
* Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
* This function was implemented based on zfs_write(), but instead of writing
@ -192,9 +192,9 @@
* Some special cases to consider and how we address them:
* - The block we want to clone may have been created within the same
* transaction group that we are trying to clone. Such block has no BP
* allocated yet, so cannot be immediately cloned. We return EXDEV.
* allocated yet, so cannot be immediately cloned. We return EAGAIN.
* - The block we want to clone may have been modified within the same
* transaction group. We return EXDEV.
* transaction group. We return EAGAIN.
* - A block may be cloned multiple times during one transaction group (that's
* why pending list is actually a tree and not an append-only list - this
* way we can figure out faster if this block is cloned for the first time
@ -1544,6 +1544,37 @@ out:
return (B_FALSE);
}
uint64_t
brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
{
brt_t *brt = spa->spa_brt;
brt_vdev_t *brtvd;
brt_entry_t bre_search, *bre;
uint64_t vdevid, refcnt;
int error;
brt_entry_fill(bp, &bre_search, &vdevid);
brt_rlock(brt);
brtvd = brt_vdev(brt, vdevid);
ASSERT(brtvd != NULL);
bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
if (bre == NULL) {
error = brt_entry_lookup(brt, brtvd, &bre_search);
ASSERT(error == 0 || error == ENOENT);
if (error == ENOENT)
refcnt = 0;
else
refcnt = bre_search.bre_refcount;
} else
refcnt = bre->bre_refcount;
brt_unlock(brt);
return (refcnt);
}
static void
brt_prefetch(brt_t *brt, const blkptr_t *bp)
{

View File

@ -165,7 +165,7 @@ dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
{ zfs_acl_byteswap, "acl" }
};
static int
int
dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
const void *tag, dmu_buf_t **dbp)
{
@ -185,6 +185,7 @@ dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
*dbp = &db->db;
return (0);
}
int
dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **dbp)
@ -1653,10 +1654,22 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
{
dmu_sync_arg_t *dsa;
dmu_tx_t *tx;
int error;
error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL,
DB_RF_CANFAIL | DB_RF_NOPREFETCH);
if (error != 0)
return (error);
tx = dmu_tx_create(os);
dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
/*
* This transaction does not produce any dirty data or log blocks, so
* it should not be throttled. All other cases wait for TXG sync, by
* which time the log block we are writing will be obsolete, so we can
* skip waiting and just return error here instead.
*/
if (dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE) != 0) {
dmu_tx_abort(tx);
/* Make zl_get_data do txg_wait_synced() */
return (SET_ERROR(EIO));

View File

@ -1292,7 +1292,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
/*
* If this metaslab group is below its qmax or it's
* the only allocatable metasable group, then attempt
* the only allocatable metaslab group, then attempt
* to allocate from it.
*/
if (qdepth < qmax || mc->mc_alloc_groups == 1)
@ -5101,7 +5101,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
zio_alloc_list_t *zal, int allocator)
{
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
metaslab_group_t *mg, *fast_mg, *rotor;
metaslab_group_t *mg, *rotor;
vdev_t *vd;
boolean_t try_hard = B_FALSE;
@ -5164,15 +5164,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
} else if (d != 0) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
mg = vd->vdev_mg->mg_next;
} else if (flags & METASLAB_FASTWRITE) {
mg = fast_mg = mca->mca_rotor;
do {
if (fast_mg->mg_vd->vdev_pending_fastwrite <
mg->mg_vd->vdev_pending_fastwrite)
mg = fast_mg;
} while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor);
} else {
ASSERT(mca->mca_rotor != NULL);
mg = mca->mca_rotor;
@ -5297,7 +5288,7 @@ top:
mg->mg_bias = 0;
}
if ((flags & METASLAB_FASTWRITE) ||
if ((flags & METASLAB_ZIL) ||
atomic_add_64_nv(&mca->mca_aliquot, asize) >=
mg->mg_aliquot + mg->mg_bias) {
mca->mca_rotor = mg->mg_next;
@ -5310,11 +5301,6 @@ top:
((flags & METASLAB_GANG_HEADER) ? 1 : 0));
DVA_SET_ASIZE(&dva[d], asize);
if (flags & METASLAB_FASTWRITE) {
atomic_add_64(&vd->vdev_pending_fastwrite,
psize);
}
return (0);
}
next:
@ -5950,55 +5936,6 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
return (error);
}
void
metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
{
const dva_t *dva = bp->blk_dva;
int ndvas = BP_GET_NDVAS(bp);
uint64_t psize = BP_GET_PSIZE(bp);
int d;
vdev_t *vd;
ASSERT(!BP_IS_HOLE(bp));
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(psize > 0);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
for (d = 0; d < ndvas; d++) {
if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
continue;
atomic_add_64(&vd->vdev_pending_fastwrite, psize);
}
spa_config_exit(spa, SCL_VDEV, FTAG);
}
void
metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
{
const dva_t *dva = bp->blk_dva;
int ndvas = BP_GET_NDVAS(bp);
uint64_t psize = BP_GET_PSIZE(bp);
int d;
vdev_t *vd;
ASSERT(!BP_IS_HOLE(bp));
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(psize > 0);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
for (d = 0; d < ndvas; d++) {
if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
continue;
ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
}
spa_config_exit(spa, SCL_VDEV, FTAG);
}
static void
metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
uint64_t size, void *arg)

View File

@ -1192,7 +1192,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
ASSERT(tvd == tvd->vdev_top);
tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite;
tvd->vdev_ms_array = svd->vdev_ms_array;
tvd->vdev_ms_shift = svd->vdev_ms_shift;
tvd->vdev_ms_count = svd->vdev_ms_count;
@ -1655,7 +1654,6 @@ vdev_metaslab_fini(vdev_t *vd)
}
}
ASSERT0(vd->vdev_ms_count);
ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
}
typedef struct vdev_probe_stats {

View File

@ -839,7 +839,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
uint64_t zp_gen;
ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);
/*
@ -889,6 +888,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
}
ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */
ASSERT3P(zio, !=, NULL);
/*
* Have to lock the whole block to ensure when it's
* written out and its checksum is being calculated
@ -917,8 +917,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
}
#endif
if (error == 0)
error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
error = dmu_buf_hold_noread(os, object, offset, zgd,
&db);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;
@ -1028,6 +1028,10 @@ zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
*
* On success, the function returns the number of bytes copied in *lenp.
* Note that it does not return how many bytes are left to be copied.
* On errors caused by file system or BRT limitations, `EINVAL` is
* returned. In most cases this means the caller passed bad parameters:
* cloning the file may still be possible, but the given parameters do
* not match the requirements.
*/
int
zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
@ -1171,7 +1175,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
* We cannot clone into files with different block size.
*/
if (inblksz != outzp->z_blksz && outzp->z_size > inblksz) {
error = SET_ERROR(EXDEV);
error = SET_ERROR(EINVAL);
goto unlock;
}
@ -1179,7 +1183,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
* Offsets and len must be at block boundaries.
*/
if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
error = SET_ERROR(EXDEV);
error = SET_ERROR(EINVAL);
goto unlock;
}
/*
@ -1187,7 +1191,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
*/
if ((len % inblksz) != 0 &&
(len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
error = SET_ERROR(EXDEV);
error = SET_ERROR(EINVAL);
goto unlock;
}
@ -1242,13 +1246,11 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
&nbps);
if (error != 0) {
/*
* If we are tyring to clone a block that was created
* in the current transaction group. Return an error,
* so the caller can fallback to just copying the data.
* If we are trying to clone a block that was created
* in the current transaction group, error will be
* EAGAIN here, which we can just return to the caller
* so it can fallback if it likes.
*/
if (error == EAGAIN) {
error = SET_ERROR(EXDEV);
}
break;
}
/*

File diff suppressed because it is too large

View File

@ -3024,11 +3024,6 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
*/
pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
/*
* We didn't allocate this bp, so make sure it doesn't get unmarked.
*/
pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
zio_nowait(zio);
return (pio);
@ -3616,7 +3611,6 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
if (zio->io_flags & ZIO_FLAG_NODATA)
flags |= METASLAB_DONT_THROTTLE;
if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
@ -3776,7 +3770,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
* of, so we just hash the objset ID to pick the allocator to get
* some parallelism.
*/
int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
int flags = METASLAB_ZIL;
int allocator = (uint_t)cityhash4(0, 0, 0,
os->os_dsl_dataset->ds_object) % spa->spa_alloc_count;
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
@ -4472,8 +4466,8 @@ zio_ready(zio_t *zio)
zio_t *pio, *pio_next;
zio_link_t *zl = NULL;
if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
ZIO_WAIT_READY)) {
if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) {
return (NULL);
}
@ -4931,12 +4925,6 @@ zio_done(zio_t *zio)
zfs_ereport_free_checksum(zcr);
}
if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
!BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
!(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
}
/*
* It is the responsibility of the done callback to ensure that this
* particular zio is no longer discoverable for adoption, and as

View File

@ -698,7 +698,6 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
int error;
ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);
zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
@ -717,6 +716,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
DMU_READ_NO_PREFETCH);
} else { /* indirect write */
ASSERT3P(zio, !=, NULL);
/*
* Have to lock the whole block to ensure when it's written out
* and its checksum is being calculated that no one can change
@ -727,8 +727,8 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
offset = P2ALIGN_TYPED(offset, size, uint64_t);
zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
size, RL_READER);
error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd,
&db);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;

View File

@ -36,11 +36,13 @@ tags = ['functional', 'atime']
[tests/functional/block_cloning:Linux]
tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial',
'block_cloning_copyfilerange_fallback',
'block_cloning_ficlone', 'block_cloning_ficlonerange',
'block_cloning_ficlonerange_partial',
'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone',
'block_cloning_disabled_ficlonerange',
'block_cloning_copyfilerange_cross_dataset']
'block_cloning_copyfilerange_cross_dataset',
'block_cloning_copyfilerange_fallback_same_txg']
tags = ['functional', 'block_cloning']
[tests/functional/chattr:Linux]

View File

@ -300,8 +300,12 @@ elif sys.platform.startswith('linux'):
['SKIP', cfr_reason],
'block_cloning/block_cloning_copyfilerange_partial':
['SKIP', cfr_reason],
'block_cloning/block_cloning_copyfilerange_fallback':
['SKIP', cfr_reason],
'block_cloning/block_cloning_copyfilerange_cross_dataset':
['SKIP', cfr_cross_reason],
'block_cloning/block_cloning_copyfilerange_fallback_same_txg':
['SKIP', cfr_cross_reason],
})

View File

@ -212,7 +212,7 @@ main(int argc, char **argv)
int dfd = open(argv[optind+1], O_WRONLY|O_CREAT,
S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
if (sfd < 0) {
if (dfd < 0) {
fprintf(stderr, "open: %s: %s\n",
argv[optind+1], strerror(errno));
close(sfd);

View File

@ -44,6 +44,7 @@
#include <fcntl.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <time.h>
int

View File

@ -441,6 +441,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/block_cloning/cleanup.ksh \
functional/block_cloning/setup.ksh \
functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \
functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \
functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \
functional/block_cloning/block_cloning_copyfilerange.ksh \
functional/block_cloning/block_cloning_copyfilerange_partial.ksh \
functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \

View File

@ -0,0 +1,86 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Klara Inc.
# Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
verify_runnable "global"
if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then
log_unsupported "copy_file_range not available before Linux 4.5"
fi
claim="copy_file_range will fall back to copy when cloning not possible."
log_assert $claim
function cleanup
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
}
log_onexit cleanup
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4
log_must sync_pool $TESTPOOL
log_note "Copying entire file with copy_file_range"
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone)
log_must [ "$blocks" = "1 2 3 4" ]
log_note "Copying within a block with copy_file_range"
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 32768 32768 65536
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone)
log_must [ "$blocks" = "2 3 4" ]
log_note "Copying across a block with copy_file_range"
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 327680 327680 131072
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone)
log_must [ "$blocks" = "2" ]
log_pass $claim

View File

@ -0,0 +1,66 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Klara Inc.
# Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
verify_runnable "global"
if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then
log_unsupported "copy_file_range not available before Linux 4.5"
fi
claim="copy_file_range will fall back to copy when cloning on same txg"
log_assert $claim
typeset timeout=$(get_tunable TXG_TIMEOUT)
function cleanup
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
set_tunable64 TXG_TIMEOUT $timeout
}
log_onexit cleanup
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
log_must set_tunable64 TXG_TIMEOUT 5000
log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone)
log_must [ "$blocks" = "" ]
log_pass $claim