From e360d9bbca9bd205419371005cdd5d17fcb70f4e Mon Sep 17 00:00:00 2001 From: Don Brady Date: Wed, 26 Jun 2024 20:07:03 +0000 Subject: [PATCH 1/3] During pool export flush the ARC asynchronously This also includes removing L2 vdevs asynchronously Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Don Brady --- include/sys/arc.h | 8 ++ include/sys/arc_impl.h | 8 +- module/zfs/arc.c | 275 +++++++++++++++++++++++++++++++---------- module/zfs/dsl_pool.c | 22 ++-- 4 files changed, 236 insertions(+), 77 deletions(-) diff --git a/include/sys/arc.h b/include/sys/arc.h index c92b3eee61..45a70ba3a9 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -63,8 +63,15 @@ extern "C" { (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \ } while (0) +/* The asize in the header is only used by L2 cache */ +#define HDR_SET_ASIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_asize = ((x) >> SPA_MINBLOCKSHIFT); \ +} while (0) + #define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) #define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) +#define HDR_GET_ASIZE(hdr) ((hdr)->b_asize << SPA_MINBLOCKSHIFT) typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; @@ -323,6 +330,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp); int arc_cached(spa_t *spa, const blkptr_t *bp); void arc_flush(spa_t *spa, boolean_t retry); +void arc_flush_async(spa_t *spa); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 01693d72dd..4ecd7036db 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -378,8 +378,8 @@ typedef struct l2arc_lb_ptr_buf { * L2ARC Internals */ typedef struct l2arc_dev { - vdev_t *l2ad_vdev; /* vdev */ - spa_t *l2ad_spa; /* spa */ + vdev_t *l2ad_vdev; /* can be NULL during remove */ + spa_t *l2ad_spa; /* can be NULL during remove */ uint64_t l2ad_hand; /* next write location */ uint64_t l2ad_start; /* first addr on device */ uint64_t l2ad_end; /* last addr on device */ @@ -475,8 +475,8 @@ struct arc_buf_hdr { arc_buf_contents_t b_type; uint8_t b_complevel; - uint8_t b_reserved1; /* used for 4 byte alignment */ - uint16_t b_reserved2; /* used for 4 byte alignment */ + uint8_t b_reserved1; /* used for 4 byte alignment */ + uint16_t b_asize; /* alignment or L2-only asize */ arc_buf_hdr_t *b_hash_next; arc_flags_t b_flags; diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 3c657c979c..cbc3db128b 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -26,7 +26,7 @@ * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. - * Copyright (c) 2019, 2023, Klara Inc. + * Copyright (c) 2019, 2024, Klara Inc. 
* Copyright (c) 2019, Allan Jude * Copyright (c) 2020, The FreeBSD Foundation [1] * @@ -464,6 +464,9 @@ static uint_t zfs_arc_lotsfree_percent = 10; */ static int zfs_arc_prune_task_threads = 1; +/* Used by spa_export/spa_destroy to flush the arc asynchronously */ +static taskq_t *arc_flush_taskq; + /* The 7 states: */ arc_state_t ARC_anon; arc_state_t ARC_mru; @@ -1718,6 +1721,8 @@ arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, arc_buf_hdr_t *hdr; ASSERT(size != 0); + ASSERT(dev->l2ad_vdev != NULL); + hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); hdr->b_birth = birth; hdr->b_type = type; @@ -1725,6 +1730,7 @@ arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); HDR_SET_LSIZE(hdr, size); HDR_SET_PSIZE(hdr, psize); + HDR_SET_ASIZE(hdr, vdev_psize_to_asize(dev->l2ad_vdev, psize)); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = complevel; if (protected) @@ -3515,16 +3521,17 @@ static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only) { - l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; - l2arc_dev_t *dev = l2hdr->b_dev; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); - uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); + uint64_t asize = HDR_GET_ASIZE(hdr); arc_buf_contents_t type = hdr->b_type; int64_t lsize_s; int64_t psize_s; int64_t asize_s; + /* For L2 we expect the header's b_asize to be valid */ + ASSERT3U(asize, >=, psize); + if (incr) { lsize_s = lsize; psize_s = psize; @@ -3586,8 +3593,6 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; - uint64_t psize = HDR_GET_PSIZE(hdr); - uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); @@ -3595,7 +3600,10 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) list_remove(&dev->l2ad_buflist, hdr); l2arc_hdr_arcstats_decrement(hdr); - vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + if (dev->l2ad_vdev != NULL) { + uint64_t asize = HDR_GET_ASIZE(hdr); + vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + } (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); @@ -4382,20 +4390,10 @@ arc_evict(void) return (total_evicted); } -void -arc_flush(spa_t *spa, boolean_t retry) +static void +arc_flush_impl(uint64_t guid, boolean_t retry) { - uint64_t guid = 0; - - /* - * If retry is B_TRUE, a spa must not be specified since we have - * no good way to determine if all of a spa's buffers have been - * evicted from an arc state. - */ - ASSERT(!retry || spa == NULL); - - if (spa != NULL) - guid = spa_load_guid(spa); + ASSERT(!retry || guid == 0); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); @@ -4413,6 +4411,59 @@ arc_flush(spa_t *spa, boolean_t retry) (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry); } +void +arc_flush(spa_t *spa, boolean_t retry) +{ + /* + * If retry is B_TRUE, a spa must not be specified since we have + * no good way to determine if all of a spa's buffers have been + * evicted from an arc state. + */ + ASSERT(!retry || spa == NULL); + + arc_flush_impl(spa != NULL ? 
spa_load_guid(spa) : 0, retry); +} + +static void +arc_flush_task(void *arg) +{ + uint64_t guid = *((uint64_t *)arg); + hrtime_t start_time = gethrtime(); + + arc_flush_impl(guid, B_FALSE); + kmem_free(arg, sizeof (uint64_t *)); + + uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); + if (elaspsed > 0) { + zfs_dbgmsg("spa %llu arc flushed in %llu ms", + (u_longlong_t)guid, (u_longlong_t)elaspsed); + } +} + +/* + * ARC buffers use the spa's load guid and can continue to exist after + * the spa_t is gone (exported). The blocks are orphaned since each + * spa import has a different load guid. + * + * It's OK if the spa is re-imported while this asynchronous flush is + * still in progress. The new spa_load_guid will be different. + * + * Also, arc_fini will wait for any arc_flush_task to finish. + */ +void +arc_flush_async(spa_t *spa) +{ + uint64_t *guidp = kmem_alloc(sizeof (uint64_t *), KM_SLEEP); + + *guidp = spa_load_guid(spa); + + if (taskq_dispatch(arc_flush_taskq, arc_flush_task, guidp, + TQ_SLEEP) == TASKQID_INVALID) { + arc_flush_impl(*guidp, B_FALSE); + kmem_free(guidp, sizeof (uint64_t *)); + } +} + uint64_t arc_reduce_target_size(uint64_t to_free) { @@ -7751,6 +7802,9 @@ arc_init(void) arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads, defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + arc_flush_taskq = taskq_create("arc_flush", 75, defclsyspri, + 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); + arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -7816,6 +7870,10 @@ arc_fini(void) arc_lowmem_fini(); #endif /* _KERNEL */ + /* Wait for any background flushes */ + taskq_wait(arc_flush_taskq); + taskq_destroy(arc_flush_taskq); + /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); @@ -8198,6 +8256,18 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) return (next); } +static boolean_t +l2arc_dev_invalid(const l2arc_dev_t *dev) +{ + /* + * We want to skip devices that are being rebuilt, trimmed, + * removed, or belong to a spa that is being exported. + */ + return (dev->l2ad_vdev == NULL || vdev_is_dead(dev->l2ad_vdev) || + dev->l2ad_rebuild || dev->l2ad_trim_all || + dev->l2ad_spa == NULL || dev->l2ad_spa->spa_is_exporting); +} + /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. 
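/*
 * A minimal sketch of the dispatch-or-fallback idiom that arc_flush_async()
 * above relies on: hand the flush to the dynamic "arc_flush" taskq, and if
 * the dispatch cannot be satisfied, run the work inline so the caller never
 * silently skips the flush.  flush_work_t, flush_work_func(),
 * do_flush_inline() and flush_async() are illustrative stand-ins, not
 * symbols introduced by this patch.
 */
typedef struct flush_work {
	uint64_t fw_guid;		/* spa load guid to flush */
} flush_work_t;

static void
do_flush_inline(uint64_t guid)
{
	/* stand-in for arc_flush_impl(guid, B_FALSE) */
	(void) guid;
}

static void
flush_work_func(void *arg)
{
	flush_work_t *fw = arg;

	do_flush_inline(fw->fw_guid);
	kmem_free(fw, sizeof (*fw));
}

static void
flush_async(taskq_t *tq, uint64_t guid)
{
	flush_work_t *fw = kmem_alloc(sizeof (*fw), KM_SLEEP);

	fw->fw_guid = guid;
	if (taskq_dispatch(tq, flush_work_func, fw, TQ_SLEEP) ==
	    TASKQID_INVALID) {
		/* could not dispatch: run the flush synchronously */
		flush_work_func(fw);
	}
}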
@@ -8238,12 +8308,10 @@ l2arc_dev_get_next(void) break; ASSERT3P(next, !=, NULL); - } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting); + } while (l2arc_dev_invalid(next)); /* if we were unable to find any usable vdevs, return NULL */ - if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting) + if (l2arc_dev_invalid(next)) next = NULL; l2arc_dev_last = next; @@ -8373,6 +8441,8 @@ top: uint64_t psize = HDR_GET_PSIZE(hdr); l2arc_hdr_arcstats_decrement(hdr); + ASSERT(dev->l2ad_vdev != NULL); + bytes_dropped += vdev_psize_to_asize(dev->l2ad_vdev, psize); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, @@ -8757,6 +8827,8 @@ l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) if (dev->l2ad_log_entries == 0) { return (0); } else { + ASSERT(dev->l2ad_vdev != NULL); + uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; uint64_t log_blocks = (log_entries + @@ -8785,6 +8857,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) vdev_t *vd = dev->l2ad_vdev; boolean_t rerun; + ASSERT(vd != NULL || all); + ASSERT(dev->l2ad_spa != NULL || all); + buflist = &dev->l2ad_buflist; top: @@ -8877,7 +8952,8 @@ retry: if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { - vdev_space_update(vd, -asize, 0, 0); + if (vd != NULL) + vdev_space_update(vd, -asize, 0, 0); ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, @@ -9293,6 +9369,8 @@ skip: hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_arcs_state = hdr->b_l1hdr.b_state->arcs_state; + /* l2arc_hdr_arcstats_update() expects a valid asize */ + HDR_SET_ASIZE(hdr, asize); mutex_enter(&dev->l2ad_mtx); if (pio == NULL) { /* @@ -9458,8 +9536,10 @@ l2arc_feed_thread(void *unused) * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. */ - if ((dev = l2arc_dev_get_next()) == NULL) + if ((dev = l2arc_dev_get_next()) == NULL || + dev->l2ad_spa == NULL) { continue; + } spa = dev->l2ad_spa; ASSERT3P(spa, !=, NULL); @@ -9544,6 +9624,12 @@ l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen) uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; spa_t *spa = dev->l2ad_spa; + /* + * After a l2arc_remove_vdev(), the spa_t will no longer be valid + */ + if (spa == NULL) + return; + /* * The L2ARC has to hold at least the payload of one log block for * them to be restored (persistent L2ARC). The payload of a log block @@ -9711,39 +9797,18 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) l2arc_rebuild_dev(dev, reopen); } -/* - * Remove a vdev from the L2ARC. - */ -void -l2arc_remove_vdev(vdev_t *vd) +typedef struct { + l2arc_dev_t *rva_l2arc_dev; + uint64_t rva_spa_gid; + uint64_t rva_vdev_gid; +} remove_vdev_args_t; + +static void +l2arc_device_teardown(void *arg) { - l2arc_dev_t *remdev = NULL; - - /* - * Find the device by vdev - */ - remdev = l2arc_vdev_get(vd); - ASSERT3P(remdev, !=, NULL); - - /* - * Cancel any ongoing or scheduled rebuild. 
- */ - mutex_enter(&l2arc_rebuild_thr_lock); - if (remdev->l2ad_rebuild_began == B_TRUE) { - remdev->l2ad_rebuild_cancel = B_TRUE; - while (remdev->l2ad_rebuild == B_TRUE) - cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); - } - mutex_exit(&l2arc_rebuild_thr_lock); - - /* - * Remove device from global list - */ - mutex_enter(&l2arc_dev_mtx); - list_remove(l2arc_dev_list, remdev); - l2arc_dev_last = NULL; /* may have been invalidated */ - atomic_dec_64(&l2arc_ndev); - mutex_exit(&l2arc_dev_mtx); + remove_vdev_args_t *rva = arg; + l2arc_dev_t *remdev = rva->rva_l2arc_dev; + hrtime_t start_time = gethrtime(); /* * Clear all buflists and ARC references. L2ARC device flush. @@ -9758,6 +9823,79 @@ l2arc_remove_vdev(vdev_t *vd) zfs_refcount_destroy(&remdev->l2ad_lb_count); kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); + + uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); + if (elaspsed > 0) { + zfs_dbgmsg("spa %llu, vdev %llu removed in %llu ms", + (u_longlong_t)rva->rva_spa_gid, + (u_longlong_t)rva->rva_vdev_gid, + (u_longlong_t)elaspsed); + } + kmem_free(rva, sizeof (remove_vdev_args_t)); +} + +/* + * Remove a vdev from the L2ARC. + */ +void +l2arc_remove_vdev(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + boolean_t asynchronous = spa->spa_state == POOL_STATE_EXPORTED || + spa->spa_state == POOL_STATE_DESTROYED; + + /* + * Find the device by vdev + */ + l2arc_dev_t *remdev = l2arc_vdev_get(vd); + ASSERT3P(remdev, !=, NULL); + + /* + * Save info for final teardown + */ + remove_vdev_args_t *rva = kmem_alloc(sizeof (remove_vdev_args_t), + KM_SLEEP); + rva->rva_l2arc_dev = remdev; + rva->rva_spa_gid = spa_guid(remdev->l2ad_spa); + rva->rva_vdev_gid = remdev->l2ad_vdev->vdev_guid; + + /* + * Cancel any ongoing or scheduled rebuild. + */ + mutex_enter(&l2arc_rebuild_thr_lock); + if (remdev->l2ad_rebuild_began == B_TRUE) { + remdev->l2ad_rebuild_cancel = B_TRUE; + while (remdev->l2ad_rebuild == B_TRUE) + cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); + } else if (remdev->l2ad_rebuild == B_TRUE) { + /* Rebuild hasn't started yet so skip asynchronous teardown */ + asynchronous = B_FALSE; + } + mutex_exit(&l2arc_rebuild_thr_lock); + + /* + * Remove device from global list + */ + ASSERT(spa_config_held(spa, SCL_L2ARC, RW_WRITER) & SCL_L2ARC); + mutex_enter(&l2arc_dev_mtx); + list_remove(l2arc_dev_list, remdev); + l2arc_dev_last = NULL; /* may have been invalidated */ + atomic_dec_64(&l2arc_ndev); + + /* During a pool export spa & vdev will no longer be valid */ + if (asynchronous) { + remdev->l2ad_spa = NULL; + remdev->l2ad_vdev = NULL; + } + mutex_exit(&l2arc_dev_mtx); + + /* + * If possible, the teardown is completed asynchronously + */ + if (!asynchronous || taskq_dispatch(arc_flush_taskq, + l2arc_device_teardown, rva, TQ_SLEEP) == TASKQID_INVALID) { + l2arc_device_teardown(rva); + } } void @@ -10015,6 +10153,7 @@ l2arc_rebuild(l2arc_dev_t *dev) mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild = B_FALSE; + /* After signaling, the spa & vdev go away */ cv_signal(&l2arc_rebuild_thr_cv); mutex_exit(&l2arc_rebuild_thr_lock); err = SET_ERROR(ECANCELED); @@ -10054,7 +10193,15 @@ out: vmem_free(this_lb, sizeof (*this_lb)); vmem_free(next_lb, sizeof (*next_lb)); - if (!l2arc_rebuild_enabled) { + if (err == ECANCELED) { + /* + * In case the rebuild was canceled do not log to spa history + * log as the pool may be in the process of being removed. 
+ */ + zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + return (err); + } else if (!l2arc_rebuild_enabled) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "disabled"); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { @@ -10072,13 +10219,6 @@ out: "no valid log blocks"); memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize); l2arc_dev_hdr_update(dev); - } else if (err == ECANCELED) { - /* - * In case the rebuild was canceled do not log to spa history - * log as the pool may be in the process of being removed. - */ - zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks", - (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err != 0) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "aborted, restored %llu blocks", @@ -10363,6 +10503,7 @@ l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) L2BLK_GET_STATE((le)->le_prop)); asize = vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((le)->le_prop)); + ASSERT3U(asize, ==, HDR_GET_ASIZE(hdr)); /* * vdev_space_update() has to be called before arc_hdr_destroy() to @@ -10392,6 +10533,8 @@ l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) exists->b_l2hdr.b_daddr = le->le_daddr; exists->b_l2hdr.b_arcs_state = L2BLK_GET_STATE((le)->le_prop); + /* l2arc_hdr_arcstats_update() expects a valid asize */ + HDR_SET_ASIZE(exists, asize); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, exists); (void) zfs_refcount_add_many(&dev->l2ad_alloc, diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 342ec5c15c..d5af83659a 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -404,13 +404,21 @@ dsl_pool_close(dsl_pool_t *dp) taskq_destroy(dp->dp_zil_clean_taskq); spa_sync_tq_destroy(dp->dp_spa); - /* - * We can't set retry to TRUE since we're explicitly specifying - * a spa to flush. This is good enough; any missed buffers for - * this spa won't cause trouble, and they'll eventually fall - * out of the ARC just like any other unused buffer. - */ - arc_flush(dp->dp_spa, FALSE); + if (dp->dp_spa->spa_state == POOL_STATE_EXPORTED || + dp->dp_spa->spa_state == POOL_STATE_DESTROYED) { + /* + * On export/destroy perform the ARC flush asynchronously. + */ + arc_flush_async(dp->dp_spa); + } else { + /* + * We can't set retry to TRUE since we're explicitly specifying + * a spa to flush. This is good enough; any missed buffers for + * this spa won't cause trouble, and they'll eventually fall + * out of the ARC just like any other unused buffer. + */ + arc_flush(dp->dp_spa, FALSE); + } mmp_fini(dp->dp_spa); txg_fini(dp); From 52b68cd3421c3aad76232a8cdac69a46195bb9c3 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Wed, 26 Jun 2024 22:23:17 +0000 Subject: [PATCH 2/3] Guarantee that spa_load_guid is unique The zpool reguid feature introduced the spa_load_guid, which is a transient value used for runtime identification purposes in the ARC. This value is not the same as the spa's persistent pool guid. However, the value is seeded from spa_generate_load_guid() which does not check for uniqueness against the spa_load_guid from other pools. Although extremely rare, you can end up with two different pools sharing the same spa_load_guid value! This change guarantees that the value is always unique and additionally not still in use by an async arc flush task. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Signed-off-by: Don Brady --- include/sys/arc.h | 1 + include/sys/spa.h | 1 + module/zfs/arc.c | 127 +++++++++++++++++++++++++++++++++++++----- module/zfs/spa_misc.c | 28 ++++++++++ module/zfs/vdev.c | 2 +- 5 files changed, 143 insertions(+), 16 deletions(-) diff --git a/include/sys/arc.h b/include/sys/arc.h index 45a70ba3a9..5839f87087 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -333,6 +333,7 @@ void arc_flush(spa_t *spa, boolean_t retry); void arc_flush_async(spa_t *spa); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); +boolean_t arc_async_flush_guid_inuse(uint64_t load_guid); uint64_t arc_all_memory(void); uint64_t arc_default_max(uint64_t min, uint64_t allmem); diff --git a/include/sys/spa.h b/include/sys/spa.h index a70912335b..d99c73409e 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1090,6 +1090,7 @@ extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_generate_guid(spa_t *spa); +extern uint64_t spa_generate_load_guid(void); extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern int spa_change_guid(spa_t *spa); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index cbc3db128b..5a5e3498bd 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -775,6 +775,21 @@ static buf_hash_table_t buf_hash_table; uint64_t zfs_crc64_table[256]; +/* + * Asynchronous ARC flush + * + * We track these in a list for arc_async_flush_guid_inuse() + */ +static list_t arc_async_flush_list; +static kmutex_t arc_async_flush_lock; + +typedef struct arc_async_flush { + uint64_t af_spa_guid; + taskqid_t af_task_id; + list_node_t af_node; +} arc_async_flush_t; + + /* * Level 2 ARC */ @@ -4424,19 +4439,50 @@ arc_flush(spa_t *spa, boolean_t retry) arc_flush_impl(spa != NULL ? 
spa_load_guid(spa) : 0, retry); } +static arc_async_flush_t * +arc_async_flush_add(uint64_t spa_guid, taskqid_t task_id) +{ + arc_async_flush_t *af = kmem_alloc(sizeof (*af), KM_SLEEP); + af->af_spa_guid = spa_guid; + af->af_task_id = task_id; + list_link_init(&af->af_node); + + mutex_enter(&arc_async_flush_lock); + list_insert_tail(&arc_async_flush_list, af); + mutex_exit(&arc_async_flush_lock); + + return (af); +} + +static void +arc_async_flush_remove(uint64_t spa_guid, taskqid_t task_id) +{ + mutex_enter(&arc_async_flush_lock); + for (arc_async_flush_t *af = list_head(&arc_async_flush_list); + af != NULL; af = list_next(&arc_async_flush_list, af)) { + if (af->af_spa_guid == spa_guid && af->af_task_id == task_id) { + list_remove(&arc_async_flush_list, af); + kmem_free(af, sizeof (*af)); + break; + } + } + mutex_exit(&arc_async_flush_lock); +} + static void arc_flush_task(void *arg) { - uint64_t guid = *((uint64_t *)arg); + arc_async_flush_t *af = arg; hrtime_t start_time = gethrtime(); + uint64_t spa_guid = af->af_spa_guid; - arc_flush_impl(guid, B_FALSE); - kmem_free(arg, sizeof (uint64_t *)); + arc_flush_impl(spa_guid, B_FALSE); + arc_async_flush_remove(spa_guid, af->af_task_id); uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); if (elaspsed > 0) { zfs_dbgmsg("spa %llu arc flushed in %llu ms", - (u_longlong_t)guid, (u_longlong_t)elaspsed); + (u_longlong_t)spa_guid, (u_longlong_t)elaspsed); } } @@ -4453,17 +4499,46 @@ arc_flush_task(void *arg) void arc_flush_async(spa_t *spa) { - uint64_t *guidp = kmem_alloc(sizeof (uint64_t *), KM_SLEEP); + uint64_t spa_guid = spa_load_guid(spa); + arc_async_flush_t *af = arc_async_flush_add(spa_guid, TASKQID_INVALID); - *guidp = spa_load_guid(spa); + /* + * Note that arc_flush_task() needs arc_async_flush_lock to remove af + * list node. So by holding the lock we avoid a race for af removal + * with our use here. 
+ */ + mutex_enter(&arc_async_flush_lock); + taskqid_t tid = af->af_task_id = taskq_dispatch(arc_flush_taskq, + arc_flush_task, af, TQ_SLEEP); + mutex_exit(&arc_async_flush_lock); - if (taskq_dispatch(arc_flush_taskq, arc_flush_task, guidp, - TQ_SLEEP) == TASKQID_INVALID) { - arc_flush_impl(*guidp, B_FALSE); - kmem_free(guidp, sizeof (uint64_t *)); + /* + * unlikely, but if we couldn't dispatch then use an inline flush + */ + if (tid == TASKQID_INVALID) { + arc_async_flush_remove(spa_guid, TASKQID_INVALID); + arc_flush_impl(spa_guid, B_FALSE); } } +/* + * Check if a guid is still in-use as part of an async teardown task + */ +boolean_t +arc_async_flush_guid_inuse(uint64_t spa_guid) +{ + mutex_enter(&arc_async_flush_lock); + for (arc_async_flush_t *af = list_head(&arc_async_flush_list); + af != NULL; af = list_next(&arc_async_flush_list, af)) { + if (af->af_spa_guid == spa_guid) { + mutex_exit(&arc_async_flush_lock); + return (B_TRUE); + } + } + mutex_exit(&arc_async_flush_lock); + return (B_FALSE); +} + uint64_t arc_reduce_target_size(uint64_t to_free) { @@ -7802,6 +7877,9 @@ arc_init(void) arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads, defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + list_create(&arc_async_flush_list, sizeof (arc_async_flush_t), + offsetof(arc_async_flush_t, af_node)); + mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL); arc_flush_taskq = taskq_create("arc_flush", 75, defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); @@ -7885,6 +7963,9 @@ arc_fini(void) taskq_wait(arc_prune_taskq); taskq_destroy(arc_prune_taskq); + list_destroy(&arc_async_flush_list); + mutex_destroy(&arc_async_flush_lock); + mutex_enter(&arc_prune_mtx); while ((p = list_remove_head(&arc_prune_list)) != NULL) { (void) zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); @@ -9801,6 +9882,7 @@ typedef struct { l2arc_dev_t *rva_l2arc_dev; uint64_t rva_spa_gid; uint64_t rva_vdev_gid; + taskqid_t rva_task_id; } remove_vdev_args_t; static void @@ -9831,6 +9913,10 @@ l2arc_device_teardown(void *arg) (u_longlong_t)rva->rva_vdev_gid, (u_longlong_t)elaspsed); } + + if (rva->rva_task_id != TASKQID_INVALID) + arc_async_flush_remove(rva->rva_spa_gid, rva->rva_task_id); + kmem_free(rva, sizeof (remove_vdev_args_t)); } @@ -9889,11 +9975,22 @@ l2arc_remove_vdev(vdev_t *vd) } mutex_exit(&l2arc_dev_mtx); - /* - * If possible, the teardown is completed asynchronously - */ - if (!asynchronous || taskq_dispatch(arc_flush_taskq, - l2arc_device_teardown, rva, TQ_SLEEP) == TASKQID_INVALID) { + if (!asynchronous) { + l2arc_device_teardown(rva); + return; + } + + uint64_t spa_guid = spa_load_guid(spa); + arc_async_flush_t *af = arc_async_flush_add(spa_guid, TASKQID_INVALID); + + mutex_enter(&arc_async_flush_lock); + taskqid_t tid = taskq_dispatch(arc_flush_taskq, l2arc_device_teardown, + rva, TQ_SLEEP); + rva->rva_task_id = af->af_task_id = tid; + mutex_exit(&arc_async_flush_lock); + + if (tid == TASKQID_INVALID) { + arc_async_flush_remove(spa_guid, TASKQID_INVALID); l2arc_device_teardown(rva); } } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 97191e7685..67be9fe1d3 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1588,6 +1588,34 @@ spa_generate_guid(spa_t *spa) return (guid); } +static boolean_t +spa_load_guid_exists(uint64_t guid) +{ + avl_tree_t *t = &spa_namespace_avl; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + for (spa_t *spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { + if (spa_load_guid(spa) == guid) + 
return (B_TRUE); + } + + return (arc_async_flush_guid_inuse(guid)); +} + +uint64_t +spa_generate_load_guid(void) +{ + uint64_t guid; + + do { + (void) random_get_pseudo_bytes((void *)&guid, + sizeof (guid)); + } while (guid == 0 || spa_load_guid_exists(guid)); + + return (guid); +} + void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) { diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 6ae0a14127..ec723febf8 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -631,7 +631,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; - spa->spa_load_guid = spa_generate_guid(NULL); + spa->spa_load_guid = spa_generate_load_guid(); } if (guid == 0 && ops != &vdev_hole_ops) { From 3d835dddd777754203f23f805428b9fb1fac87af Mon Sep 17 00:00:00 2001 From: Don Brady Date: Thu, 22 Aug 2024 22:27:45 +0000 Subject: [PATCH 3/3] Changed arc evict to prioritize unloaded spas When there are active async flushes, then the eviction thread can focus exclusively on buffers belonging to any spa that is being flushed. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Don Brady --- module/zfs/arc.c | 76 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 12 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 5a5e3498bd..8f93d5ef44 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -789,6 +789,8 @@ typedef struct arc_async_flush { list_node_t af_node; } arc_async_flush_t; +static unsigned int arc_async_flush_init_spa_list(uint64_t spa_list[], + unsigned int list_len); /* * Level 2 ARC @@ -3884,9 +3886,20 @@ arc_set_need_free(void) } } +static boolean_t +arc_spa_is_list_member(uint64_t spa_guid, uint64_t spa_list[], + unsigned int spa_cnt) +{ + for (int i = 0; i < spa_cnt; i++) { + if (spa_list[i] == spa_guid) + return (B_TRUE); + } + return (B_FALSE); +} + static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, - uint64_t spa, uint64_t bytes) + uint64_t bytes, uint64_t spa_list[], unsigned int spa_cnt) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0, real_evicted = 0; @@ -3928,8 +3941,13 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, if (hdr->b_spa == 0) continue; - /* we're only interested in evicting buffers of a certain spa */ - if (spa != 0 && hdr->b_spa != spa) { + /* + * Check if we're only interested in evicting buffers from + * a specifc list of spas. This would typically be from + * spas that are being unloaded. + */ + if (spa_cnt > 0 && + !arc_spa_is_list_member(hdr->b_spa, spa_list, spa_cnt)) { ARCSTAT_BUMP(arcstat_evict_skip); continue; } @@ -4065,8 +4083,8 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count) * the given arc state; which is used by arc_flush(). 
*/ static uint64_t -arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, - uint64_t bytes) +arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t bytes, + uint64_t spa_list[], unsigned int spa_cnt) { uint64_t total_evicted = 0; multilist_t *ml = &state->arcs_list[type]; @@ -4121,7 +4139,8 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, break; bytes_evicted = arc_evict_state_impl(ml, sublist_idx, - markers[sublist_idx], spa, bytes_remaining); + markers[sublist_idx], bytes_remaining, spa_list, + spa_cnt); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; @@ -4186,9 +4205,11 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, boolean_t retry) { uint64_t evicted = 0; + uint64_t spa_list[1] = {spa}; while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { - evicted += arc_evict_state(state, type, spa, ARC_EVICT_ALL); + evicted += arc_evict_state(state, type, ARC_EVICT_ALL, + spa_list, spa == 0 ? 0 : 1); if (!retry) break; @@ -4212,7 +4233,15 @@ arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes) if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), bytes); - return (arc_evict_state(state, type, 0, delta)); + /* + * Create a list of guids from any active ARC async flushes. + * The underlying arc_evict_state() function will target + * only spa guids from this list when it is not empty. + */ + uint64_t spa_list[16]; + unsigned int spa_cnt = + arc_async_flush_init_spa_list(spa_list, 16); + return (arc_evict_state(state, type, delta, spa_list, spa_cnt)); } return (0); @@ -4516,8 +4545,8 @@ arc_flush_async(spa_t *spa) * unlikely, but if we couldn't dispatch then use an inline flush */ if (tid == TASKQID_INVALID) { - arc_async_flush_remove(spa_guid, TASKQID_INVALID); arc_flush_impl(spa_guid, B_FALSE); + arc_async_flush_remove(spa_guid, TASKQID_INVALID); } } @@ -4539,6 +4568,30 @@ arc_async_flush_guid_inuse(uint64_t spa_guid) return (B_FALSE); } +/* + * Initialize a list of spa guids that are being flushed. + * + * Used by arc_evict_state() to target headers belonging to spas on this list. + */ +static unsigned int +arc_async_flush_init_spa_list(uint64_t spa_list[], unsigned int list_len) +{ + unsigned int init_cnt = 0; + + /* + * Iterate until the end of the list or array slots are full. + */ + mutex_enter(&arc_async_flush_lock); + for (arc_async_flush_t *af = list_head(&arc_async_flush_list); + init_cnt < list_len && af != NULL; + af = list_next(&arc_async_flush_list, af)) { + spa_list[init_cnt++] = af->af_spa_guid; + } + mutex_exit(&arc_async_flush_lock); + + return (init_cnt); +} + uint64_t arc_reduce_target_size(uint64_t to_free) { @@ -9914,8 +9967,7 @@ l2arc_device_teardown(void *arg) (u_longlong_t)elaspsed); } - if (rva->rva_task_id != TASKQID_INVALID) - arc_async_flush_remove(rva->rva_spa_gid, rva->rva_task_id); + arc_async_flush_remove(rva->rva_spa_gid, rva->rva_task_id); kmem_free(rva, sizeof (remove_vdev_args_t)); } @@ -9990,8 +10042,8 @@ l2arc_remove_vdev(vdev_t *vd) mutex_exit(&arc_async_flush_lock); if (tid == TASKQID_INVALID) { - arc_async_flush_remove(spa_guid, TASKQID_INVALID); l2arc_device_teardown(rva); + arc_async_flush_remove(spa_guid, TASKQID_INVALID); } }
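For reference, the uniqueness guarantee added in patch 2 reduces to the following loop: keep drawing random 64-bit values until one is non-zero, does not collide with the load guid of any currently imported pool, and is not still referenced by an in-flight asynchronous ARC flush. This is a condensed sketch, not the patch's code; guid_is_imported() is a hypothetical stand-in for the walk of spa_namespace_avl done by spa_load_guid_exists().

static boolean_t guid_is_imported(uint64_t guid);	/* stand-in */

static uint64_t
generate_unique_load_guid(void)
{
	uint64_t guid;

	do {
		(void) random_get_pseudo_bytes((void *)&guid, sizeof (guid));
	} while (guid == 0 || guid_is_imported(guid) ||
	    arc_async_flush_guid_inuse(guid));

	return (guid);
}

The arc_async_flush_guid_inuse() term is what closes the window in which a freshly imported pool could be handed a load guid whose buffers are still being torn down by a background flush from a previous export.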