zed: Take no action on scrub/resilver checksum errors
When scrubbing/resilvering a pool it can be counterproductive to cancel the scan and kick off a replace operation to a hot spare when encountering checksum errors. In this case, the best course of action is to allow the scrub/resilver to complete as quickly as possible and to keep the vdevs fully online if possible. Realistically, this is less of an issue for a RAIDZ since a traditional resilver must be used and checksums will be verified. However, this is not the case for a mirror or dRAID pool, which is sequentially resilvered, with checksum verification deferred until after the replace operation completes. Regardless, we apply this policy to all pool types since it's a good idea for all vdevs. Degrading additional vdevs has the potential to make a bad situation worse. Note that the checksum errors will still be reported both as an event and by `zpool status`. This change only prevents the ZED from proactively taking any action. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #13499
This commit is contained in:
parent
4184b78be1
commit
5d534f1371
|
@ -35,6 +35,7 @@
|
||||||
#include <sys/fs/zfs.h>
|
#include <sys/fs/zfs.h>
|
||||||
#include <sys/fm/protocol.h>
|
#include <sys/fm/protocol.h>
|
||||||
#include <sys/fm/fs/zfs.h>
|
#include <sys/fm/fs/zfs.h>
|
||||||
|
#include <sys/zio.h>
|
||||||
|
|
||||||
#include "zfs_agents.h"
|
#include "zfs_agents.h"
|
||||||
#include "fmd_api.h"
|
#include "fmd_api.h"
|
||||||
|
@ -773,6 +774,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
|
||||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
|
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
|
||||||
char *failmode = NULL;
|
char *failmode = NULL;
|
||||||
boolean_t checkremove = B_FALSE;
|
boolean_t checkremove = B_FALSE;
|
||||||
|
uint32_t pri = 0;
|
||||||
|
int32_t flags = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If this is a checksum or I/O error, then toss it into the
|
* If this is a checksum or I/O error, then toss it into the
|
||||||
|
@ -795,6 +798,23 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
|
||||||
checkremove = B_TRUE;
|
checkremove = B_TRUE;
|
||||||
} else if (fmd_nvl_class_match(hdl, nvl,
|
} else if (fmd_nvl_class_match(hdl, nvl,
|
||||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
|
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
|
||||||
|
/*
|
||||||
|
* We ignore ereports for checksum errors generated by
|
||||||
|
* scrub/resilver I/O to avoid potentially further
|
||||||
|
* degrading the pool while it's being repaired.
|
||||||
|
*/
|
||||||
|
if (((nvlist_lookup_uint32(nvl,
|
||||||
|
FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) &&
|
||||||
|
(pri == ZIO_PRIORITY_SCRUB ||
|
||||||
|
pri == ZIO_PRIORITY_REBUILD)) ||
|
||||||
|
((nvlist_lookup_int32(nvl,
|
||||||
|
FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) &&
|
||||||
|
(flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
|
||||||
|
fmd_hdl_debug(hdl, "ignoring '%s' for "
|
||||||
|
"scrub/resilver I/O", class);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
|
if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
|
||||||
zfs_serd_name(zcp->zc_data.zc_serd_checksum,
|
zfs_serd_name(zcp->zc_data.zc_serd_checksum,
|
||||||
pool_guid, vdev_guid, "checksum");
|
pool_guid, vdev_guid, "checksum");
|
||||||
|
|
Loading…
Reference in New Issue