zed: Take no action on scrub/resilver checksum errors
When scrubbing/resilvering a pool it can be counter productive to cancel the scan and kick of a replace operation to a hot spare when encountering checksum errors. In this case, the best course of action is to allow the scrub/resilver to complete as quickly as possible and to keep the vdevs fully online if possible. Realistically, this is less of an issue for a RAIDZ since a traditional resilver must be used and checksums will be verified. However, this is not the case for a mirror or dRAID pool which is sequentially resilvered and checksum verification is deferred until after the replace operation completes. Regardless, we apply this policy to all pool types since it's a good idea for all vdevs. Degrading additional vdevs has the potential to make a bad situation worse. Note the checksum errors will still be reported as both an event and by `zpool status`. This change only prevents the ZED from proactively taking any action. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #13499
This commit is contained in:
parent
2cd0f98f4a
commit
cf70c0f8ae
cmd/zed/agents
|
@ -34,6 +34,7 @@
|
|||
#include <sys/fs/zfs.h>
|
||||
#include <sys/fm/protocol.h>
|
||||
#include <sys/fm/fs/zfs.h>
|
||||
#include <sys/zio.h>
|
||||
|
||||
#include "zfs_agents.h"
|
||||
#include "fmd_api.h"
|
||||
|
@ -770,6 +771,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
|
|||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
|
||||
char *failmode = NULL;
|
||||
boolean_t checkremove = B_FALSE;
|
||||
uint32_t pri = 0;
|
||||
int32_t flags = 0;
|
||||
|
||||
/*
|
||||
* If this is a checksum or I/O error, then toss it into the
|
||||
|
@ -792,6 +795,23 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
|
|||
checkremove = B_TRUE;
|
||||
} else if (fmd_nvl_class_match(hdl, nvl,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
|
||||
/*
|
||||
* We ignore ereports for checksum errors generated by
|
||||
* scrub/resilver I/O to avoid potentially further
|
||||
* degrading the pool while it's being repaired.
|
||||
*/
|
||||
if (((nvlist_lookup_uint32(nvl,
|
||||
FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) &&
|
||||
(pri == ZIO_PRIORITY_SCRUB ||
|
||||
pri == ZIO_PRIORITY_REBUILD)) ||
|
||||
((nvlist_lookup_int32(nvl,
|
||||
FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) &&
|
||||
(flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
|
||||
fmd_hdl_debug(hdl, "ignoring '%s' for "
|
||||
"scrub/resilver I/O", class);
|
||||
return;
|
||||
}
|
||||
|
||||
if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
|
||||
zfs_serd_name(zcp->zc_data.zc_serd_checksum,
|
||||
pool_guid, vdev_guid, "checksum");
|
||||
|
|
Loading…
Reference in New Issue