From 5d534f13718ebff1a2241163cf8e6f550f32a457 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Tue, 24 May 2022 09:36:07 -0700
Subject: [PATCH] zed: Take no action on scrub/resilver checksum errors

When scrubbing/resilvering a pool it can be counter productive to
cancel the scan and kick of a replace operation to a hot spare
when encountering checksum errors.  In this case, the best course
of action is to allow the scrub/resilver to complete as quickly
as possible and to keep the vdevs fully online if possible.

Realistically, this is less of an issue for a RAIDZ since a
traditional resilver must be used and checksums will be verified.
However, this is not the case for a mirror or dRAID pool which is
sequentially resilvered and checksum verification is deferred
until after the replace operation completes.

Regardless, we apply this policy to all pool types since it's
a good idea for all vdevs.  Degrading additional vdevs has the
potential to make a bad situation worse.  Note the checksum
errors will still be reported as both an event and by
`zpool status`.  This change only prevents the ZED from
proactively taking any action.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #13499
---
 cmd/zed/agents/zfs_diagnosis.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c
index 0b27f6702e..9f646f9e33 100644
--- a/cmd/zed/agents/zfs_diagnosis.c
+++ b/cmd/zed/agents/zfs_diagnosis.c
@@ -35,6 +35,7 @@
 #include <sys/fs/zfs.h>
 #include <sys/fm/protocol.h>
 #include <sys/fm/fs/zfs.h>
+#include <sys/zio.h>
 
 #include "zfs_agents.h"
 #include "fmd_api.h"
@@ -773,6 +774,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
 		char *failmode = NULL;
 		boolean_t checkremove = B_FALSE;
+		uint32_t pri = 0;
+		int32_t flags = 0;
 
 		/*
 		 * If this is a checksum or I/O error, then toss it into the
@@ -795,6 +798,23 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 				checkremove = B_TRUE;
 		} else if (fmd_nvl_class_match(hdl, nvl,
 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
+			/*
+			 * We ignore ereports for checksum errors generated by
+			 * scrub/resilver I/O to avoid potentially further
+			 * degrading the pool while it's being repaired.
+			 */
+			if (((nvlist_lookup_uint32(nvl,
+			    FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) &&
+			    (pri == ZIO_PRIORITY_SCRUB ||
+			    pri == ZIO_PRIORITY_REBUILD)) ||
+			    ((nvlist_lookup_int32(nvl,
+			    FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) &&
+			    (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
+				fmd_hdl_debug(hdl, "ignoring '%s' for "
+				    "scrub/resilver I/O", class);
+				return;
+			}
+
 			if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
 				zfs_serd_name(zcp->zc_data.zc_serd_checksum,
 				    pool_guid, vdev_guid, "checksum");