From cf70c0f8ae01c24699767b5ecfbe1882c009d53a Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 24 May 2022 09:36:07 -0700 Subject: [PATCH] zed: Take no action on scrub/resilver checksum errors When scrubbing/resilvering a pool it can be counter productive to cancel the scan and kick of a replace operation to a hot spare when encountering checksum errors. In this case, the best course of action is to allow the scrub/resilver to complete as quickly as possible and to keep the vdevs fully online if possible. Realistically, this is less of an issue for a RAIDZ since a traditional resilver must be used and checksums will be verified. However, this is not the case for a mirror or dRAID pool which is sequentially resilvered and checksum verification is deferred until after the replace operation completes. Regardless, we apply this policy to all pool types since it's a good idea for all vdevs. Degrading additional vdevs has the potential to make a bad situation worse. Note the checksum errors will still be reported as both an event and by `zpool status`. This change only prevents the ZED from proactively taking any action. Reviewed-by: Tony Hutter Reviewed-by: Tony Nguyen Signed-off-by: Brian Behlendorf Closes #13499 --- cmd/zed/agents/zfs_diagnosis.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c index 4fe9ee2bc5..813916d2e8 100644 --- a/cmd/zed/agents/zfs_diagnosis.c +++ b/cmd/zed/agents/zfs_diagnosis.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "zfs_agents.h" #include "fmd_api.h" @@ -770,6 +771,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { char *failmode = NULL; boolean_t checkremove = B_FALSE; + uint32_t pri = 0; + int32_t flags = 0; /* * If this is a checksum or I/O error, then toss it into the @@ -792,6 +795,23 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) checkremove = B_TRUE; } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) { + /* + * We ignore ereports for checksum errors generated by + * scrub/resilver I/O to avoid potentially further + * degrading the pool while it's being repaired. + */ + if (((nvlist_lookup_uint32(nvl, + FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) && + (pri == ZIO_PRIORITY_SCRUB || + pri == ZIO_PRIORITY_REBUILD)) || + ((nvlist_lookup_int32(nvl, + FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) && + (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) { + fmd_hdl_debug(hdl, "ignoring '%s' for " + "scrub/resilver I/O", class); + return; + } + if (zcp->zc_data.zc_serd_checksum[0] == '\0') { zfs_serd_name(zcp->zc_data.zc_serd_checksum, pool_guid, vdev_guid, "checksum");