Avoid posting duplicate zpool events

Duplicate io and checksum ereport events can make things appear
worse than they really are. Ideally, the zpool events and the
corresponding vdev stat error counts in zpool status should reflect
unique errors -- not the same error being counted over and over.
This can be demonstrated in a simple example. With a single bad
block in a datafile and just 5 reads of the file we end up with a
degraded vdev, even though there is only one unique error in the pool.

The proposed solution to the above issue is to eliminate duplicates
when posting events and when updating vdev error stats. We now save
recent error events of interest when posting events so that we can
easily check for duplicates when posting an error.

Reviewed-by: Brad Lewis <brad.lewis@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Don Brady <don.brady@delphix.com>
Closes #10861
This commit is contained in:
Don Brady 2020-09-04 11:34:28 -06:00 committed by Brian Behlendorf
parent bd724261d2
commit 8afac5dc55
22 changed files with 799 additions and 79 deletions

View File

@ -23,6 +23,10 @@
* Use is subject to license terms. * Use is subject to license terms.
*/ */
/*
* Copyright (c) 2020 by Delphix. All rights reserved.
*/
#ifndef _SYS_FM_FS_ZFS_H #ifndef _SYS_FM_FS_ZFS_H
#define _SYS_FM_FS_ZFS_H #define _SYS_FM_FS_ZFS_H
@ -88,6 +92,7 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS "zio_flags" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS "zio_flags"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY "zio_priority"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp"

View File

@ -104,6 +104,9 @@ extern int zfs_zevent_seek(zfs_zevent_t *, uint64_t);
extern void zfs_zevent_init(zfs_zevent_t **); extern void zfs_zevent_init(zfs_zevent_t **);
extern void zfs_zevent_destroy(zfs_zevent_t *); extern void zfs_zevent_destroy(zfs_zevent_t *);
extern void zfs_zevent_track_duplicate(void);
extern void zfs_ereport_init(void);
extern void zfs_ereport_fini(void);
#else #else
static inline void fm_init(void) { } static inline void fm_init(void) { }

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved.
@ -1145,10 +1145,10 @@ extern const char *spa_state_to_name(spa_t *spa);
struct zbookmark_phys; struct zbookmark_phys;
extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb); extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb);
extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd,
const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state);
uint64_t length);
extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd,
zio_t *zio); zio_t *zio);
extern void zfs_ereport_taskq_fini(void);
extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
const char *name, nvlist_t *aux); const char *name, nvlist_t *aux);
extern void zfs_post_remove(spa_t *spa, vdev_t *vd); extern void zfs_post_remove(spa_t *spa, vdev_t *vd);

View File

@ -22,7 +22,7 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright 2016 Toomas Soome <tsoome@me.com> * Copyright 2016 Toomas Soome <tsoome@me.com>
@ -680,7 +680,7 @@ extern hrtime_t zio_handle_io_delay(zio_t *zio);
/* /*
* Checksum ereport functions * Checksum ereport functions
*/ */
extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, extern int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
uint64_t length, void *arg, struct zio_bad_cksum *info); uint64_t length, void *arg, struct zio_bad_cksum *info);
extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,

View File

@ -1,6 +1,6 @@
'\" te '\" te
.\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved. .\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
.\" Copyright (c) 2019 by Delphix. All rights reserved. .\" Copyright (c) 2019, 2020 by Delphix. All rights reserved.
.\" Copyright (c) 2019 Datto Inc. .\" Copyright (c) 2019 Datto Inc.
.\" The contents of this file are subject to the terms of the Common Development .\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License"). You may not use this file except .\" and Distribution License (the "License"). You may not use this file except
@ -3656,6 +3656,27 @@ Default value: \fB0\fR.
.sp .sp
.ne 2 .ne 2
.na
\fBzfs_zevent_retain_max\fR (uint)
.ad
.RS 12n
Maximum recent zevent records to retain for duplicate checking. Setting
this value to zero disables duplicate detection.
.sp
Default value: \fB2000\fR.
.RE
.sp
.ne 2
.na
\fBzfs_zevent_retain_expire_secs\fR (uint)
.ad
.RS 12n
Lifespan for a recent ereport that was retained for duplicate checking.
.sp
Default value: \fB900\fR.
.RE
.na .na
\fBzfs_zil_clean_taskq_maxalloc\fR (int) \fBzfs_zil_clean_taskq_maxalloc\fR (int)
.ad .ad

View File

@ -21,7 +21,7 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2018, Joyent, Inc.
* Copyright (c) 2011, 2019, Delphix. All rights reserved. * Copyright (c) 2011, 2020, Delphix. All rights reserved.
* Copyright (c) 2014, Saso Kiselkov. All rights reserved. * Copyright (c) 2014, Saso Kiselkov. All rights reserved.
* Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
@ -2188,7 +2188,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
ret = SET_ERROR(EIO); ret = SET_ERROR(EIO);
spa_log_error(spa, zb); spa_log_error(spa, zb);
(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
spa, NULL, zb, NULL, 0, 0); spa, NULL, zb, NULL, 0);
} }
return (ret); return (ret);
@ -5654,7 +5654,7 @@ arc_read_done(zio_t *zio)
spa_log_error(zio->io_spa, &acb->acb_zb); spa_log_error(zio->io_spa, &acb->acb_zb);
(void) zfs_ereport_post( (void) zfs_ereport_post(
FM_EREPORT_ZFS_AUTHENTICATION, FM_EREPORT_ZFS_AUTHENTICATION,
zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0); zio->io_spa, NULL, &acb->acb_zb, zio, 0);
} }
} }
@ -5931,7 +5931,7 @@ top:
spa_log_error(spa, zb); spa_log_error(spa, zb);
(void) zfs_ereport_post( (void) zfs_ereport_post(
FM_EREPORT_ZFS_AUTHENTICATION, FM_EREPORT_ZFS_AUTHENTICATION,
spa, NULL, zb, NULL, 0, 0); spa, NULL, zb, NULL, 0);
} }
} }
if (rc != 0) { if (rc != 0) {

View File

@ -104,13 +104,15 @@ struct erpt_kstat {
kstat_named_t erpt_set_failed; /* num erpt set failures */ kstat_named_t erpt_set_failed; /* num erpt set failures */
kstat_named_t fmri_set_failed; /* num fmri set failures */ kstat_named_t fmri_set_failed; /* num fmri set failures */
kstat_named_t payload_set_failed; /* num payload set failures */ kstat_named_t payload_set_failed; /* num payload set failures */
kstat_named_t erpt_duplicates; /* num duplicate erpts */
}; };
static struct erpt_kstat erpt_kstat_data = { static struct erpt_kstat erpt_kstat_data = {
{ "erpt-dropped", KSTAT_DATA_UINT64 }, { "erpt-dropped", KSTAT_DATA_UINT64 },
{ "erpt-set-failed", KSTAT_DATA_UINT64 }, { "erpt-set-failed", KSTAT_DATA_UINT64 },
{ "fmri-set-failed", KSTAT_DATA_UINT64 }, { "fmri-set-failed", KSTAT_DATA_UINT64 },
{ "payload-set-failed", KSTAT_DATA_UINT64 } { "payload-set-failed", KSTAT_DATA_UINT64 },
{ "erpt-duplicates", KSTAT_DATA_UINT64 }
}; };
kstat_t *fm_ksp; kstat_t *fm_ksp;
@ -568,6 +570,12 @@ out:
return (error); return (error);
} }
/*
 * Account for one suppressed (duplicate) ereport by atomically bumping
 * the "erpt-duplicates" kstat counter.
 */
void
zfs_zevent_track_duplicate(void)
{
	volatile uint64_t *dup_counter =
	    &erpt_kstat_data.erpt_duplicates.value.ui64;

	atomic_inc_64(dup_counter);
}
static int static int
zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze) zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze)
{ {
@ -1633,6 +1641,8 @@ fm_init(void)
list_create(&zevent_list, sizeof (zevent_t), list_create(&zevent_list, sizeof (zevent_t),
offsetof(zevent_t, ev_node)); offsetof(zevent_t, ev_node));
cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL); cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL);
zfs_ereport_init();
} }
void void
@ -1640,6 +1650,8 @@ fm_fini(void)
{ {
int count; int count;
zfs_ereport_fini();
zfs_zevent_drain_all(&count); zfs_zevent_drain_all(&count);
mutex_enter(&zevent_lock); mutex_enter(&zevent_lock);

View File

@ -21,7 +21,7 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved.
@ -2868,7 +2868,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
} }
if (error != EBADF) { if (error != EBADF) {
(void) zfs_ereport_post(ereport, spa, (void) zfs_ereport_post(ereport, spa,
NULL, NULL, NULL, 0, 0); NULL, NULL, NULL, 0);
} }
} }
spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;

View File

@ -22,7 +22,7 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright 2017 Joyent, Inc. * Copyright 2017 Joyent, Inc.
*/ */
@ -316,7 +316,7 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
if (target->spa_ccw_fail_time == 0) { if (target->spa_ccw_fail_time == 0) {
(void) zfs_ereport_post( (void) zfs_ereport_post(
FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
target, NULL, NULL, NULL, 0, 0); target, NULL, NULL, NULL, 0);
} }
target->spa_ccw_fail_time = gethrtime(); target->spa_ccw_fail_time = gethrtime();
spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE);

View File

@ -1481,7 +1481,7 @@ vdev_probe_done(zio_t *zio)
ASSERT(zio->io_error != 0); ASSERT(zio->io_error != 0);
vdev_dbgmsg(vd, "failed probe"); vdev_dbgmsg(vd, "failed probe");
(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
spa, vd, NULL, NULL, 0, 0); spa, vd, NULL, NULL, 0);
zio->io_error = SET_ERROR(ENXIO); zio->io_error = SET_ERROR(ENXIO);
} }
@ -1862,11 +1862,10 @@ vdev_open(vdev_t *vd)
vd->vdev_ops->vdev_op_leaf) { vd->vdev_ops->vdev_op_leaf) {
(void) zfs_ereport_post( (void) zfs_ereport_post(
FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
spa, vd, NULL, NULL, 0, 0); spa, vd, NULL, NULL, 0);
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL); VDEV_AUX_BAD_LABEL);
return (SET_ERROR(EDOM)); return (SET_ERROR(EDOM));
} }
vd->vdev_max_asize = max_asize; vd->vdev_max_asize = max_asize;
} }
@ -4759,7 +4758,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
} }
(void) zfs_ereport_post(class, spa, vd, NULL, NULL, (void) zfs_ereport_post(class, spa, vd, NULL, NULL,
save_state, 0); save_state);
} }
/* Erase any notion of persistent removed state */ /* Erase any notion of persistent removed state */

View File

@ -16,7 +16,7 @@
/* /*
* Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
* Copyright (c) 2014, 2019 by Delphix. All rights reserved. * Copyright (c) 2014, 2020 by Delphix. All rights reserved.
*/ */
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
@ -1473,13 +1473,14 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
vdev_t *vd = ic->ic_vdev; vdev_t *vd = ic->ic_vdev;
int ret = zfs_ereport_post_checksum(zio->io_spa, vd,
NULL, zio, is->is_target_offset, is->is_size,
NULL, NULL, NULL);
if (ret != EALREADY) {
mutex_enter(&vd->vdev_stat_lock); mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_checksum_errors++; vd->vdev_stat.vs_checksum_errors++;
mutex_exit(&vd->vdev_stat_lock); mutex_exit(&vd->vdev_stat_lock);
}
(void) zfs_ereport_post_checksum(zio->io_spa, vd,
NULL, zio, is->is_target_offset, is->is_size,
NULL, NULL, NULL);
} }
} }
} }

View File

@ -21,7 +21,7 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2016 Gvozden Nešković. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
*/ */
@ -1790,16 +1790,17 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
zio_bad_cksum_t zbc; zio_bad_cksum_t zbc;
raidz_map_t *rm = zio->io_vsd; raidz_map_t *rm = zio->io_vsd;
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_checksum_errors++;
mutex_exit(&vd->vdev_stat_lock);
zbc.zbc_has_cksum = 0; zbc.zbc_has_cksum = 0;
zbc.zbc_injected = rm->rm_ecksuminjected; zbc.zbc_injected = rm->rm_ecksuminjected;
(void) zfs_ereport_post_checksum(zio->io_spa, vd, int ret = zfs_ereport_post_checksum(zio->io_spa, vd,
&zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
rc->rc_abd, bad_data, &zbc); rc->rc_abd, bad_data, &zbc);
if (ret != EALREADY) {
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_checksum_errors++;
mutex_exit(&vd->vdev_stat_lock);
}
} }
} }
@ -2279,21 +2280,21 @@ vdev_raidz_io_done(zio_t *zio)
vdev_t *cvd; vdev_t *cvd;
rc = &rm->rm_col[c]; rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx]; cvd = vd->vdev_child[rc->rc_devidx];
if (rc->rc_error == 0) { if (rc->rc_error != 0)
continue;
zio_bad_cksum_t zbc; zio_bad_cksum_t zbc;
zbc.zbc_has_cksum = 0; zbc.zbc_has_cksum = 0;
zbc.zbc_injected = zbc.zbc_injected = rm->rm_ecksuminjected;
rm->rm_ecksuminjected;
int ret = zfs_ereport_start_checksum(
zio->io_spa, cvd, &zio->io_bookmark, zio,
rc->rc_offset, rc->rc_size,
(void *)(uintptr_t)c, &zbc);
if (ret != EALREADY) {
mutex_enter(&cvd->vdev_stat_lock); mutex_enter(&cvd->vdev_stat_lock);
cvd->vdev_stat.vs_checksum_errors++; cvd->vdev_stat.vs_checksum_errors++;
mutex_exit(&cvd->vdev_stat_lock); mutex_exit(&cvd->vdev_stat_lock);
zfs_ereport_start_checksum(
zio->io_spa, cvd,
&zio->io_bookmark, zio,
rc->rc_offset, rc->rc_size,
(void *)(uintptr_t)c, &zbc);
} }
} }
} }

View File

@ -24,7 +24,7 @@
*/ */
/* /*
* Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012,2020 by Delphix. All rights reserved.
*/ */
#include <sys/spa.h> #include <sys/spa.h>
@ -101,7 +101,251 @@
* good and bad versions of the buffer (if available), and we annotate the * good and bad versions of the buffer (if available), and we annotate the
* ereport with information about the differences. * ereport with information about the differences.
*/ */
#ifdef _KERNEL #ifdef _KERNEL
/*
 * Duplicate ereport Detection
 *
 * Some ereports are retained momentarily for detecting duplicates. These
 * are kept in a recent_events_node_t in both a time-ordered list and an AVL
 * tree of recent unique ereports.
 *
 * The lifespan of these recent ereports is bounded (15 mins by default, see
 * zfs_zevent_retain_expire_secs) and a cleaner task is used to purge stale
 * entries.
 */

/* Retained nodes ordered by last-seen time: newest at head, oldest at tail */
static list_t recent_events_list;
/* The same nodes, indexed by recent_events_compare() for duplicate lookup */
static avl_tree_t recent_events_tree;
/*
 * Taken by the duplicate check, the cleaner and zfs_ereport_taskq_fini()
 * around every access to the list, the tree and the cleaner task id.
 */
static kmutex_t recent_events_lock;
/* Id of the pending delayed cleaner task; 0 when none is scheduled */
static taskqid_t recent_events_cleaner_tqid;

/*
 * Each node is about 128 bytes so 2,000 would consume 1/4 MiB.
 *
 * This setting can be changed dynamically and setting it to zero
 * disables duplicate detection.
 *
 * Exposed as the zfs_zevent_retain_max module parameter.
 */
unsigned int zfs_zevent_retain_max = 2000;

/*
 * The lifespan for a recent ereport entry. The default of 15 minutes is
 * intended to outlive the zfs diagnosis engine's threshold of 10 errors
 * over a period of 10 minutes.
 *
 * Exposed as the zfs_zevent_retain_expire_secs module parameter.
 */
unsigned int zfs_zevent_retain_expire_secs = 900;
/*
 * Compact tag for the ereport subclasses that take part in duplicate
 * detection. zfs_ereport_is_duplicate() maps the subclass string onto one
 * of these values; any other subclass is never treated as a duplicate.
 */
typedef enum zfs_subclass {
	ZSC_IO,		/* FM_EREPORT_ZFS_IO */
	ZSC_DATA,	/* FM_EREPORT_ZFS_DATA */
	ZSC_CHECKSUM	/* FM_EREPORT_ZFS_CHECKSUM */
} zfs_subclass_t;
/*
 * One retained ereport. Two ereports are duplicates when every "criteria"
 * field below compares equal (see recent_events_compare()); the "internal
 * state" fields do not take part in the comparison.
 */
typedef struct {
	/* common criteria */
	uint64_t re_pool_guid;		/* spa_guid() of the pool */
	uint64_t re_vdev_guid;		/* vdev_guid of the reporting vdev */
	int re_io_error;		/* zio->io_error */
	/*
	 * Size and offset passed by the poster when size is non-zero,
	 * otherwise zio->io_size and zio->io_offset.
	 */
	uint64_t re_io_size;
	uint64_t re_io_offset;
	zfs_subclass_t re_subclass;	/* which ereport subclass */
	zio_priority_t re_io_priority;	/* zio->io_priority */

	/*
	 * logical zio criteria (optional): left zeroed when the poster
	 * supplied no bookmark
	 */
	zbookmark_phys_t re_io_bookmark;

	/* internal state */
	avl_node_t re_tree_link;	/* linkage in recent_events_tree */
	list_node_t re_list_link;	/* linkage in recent_events_list */
	uint64_t re_timestamp;		/* gethrtime() when last seen */
} recent_events_node_t;
/*
 * AVL comparator for recent_events_node_t.
 *
 * Walks the duplicate-detection criteria one field at a time and yields the
 * result of the first field that differs. The field order carries no
 * meaning of its own; the only property callers rely on is that 0 is
 * returned exactly when every criterion matches, i.e. the two nodes
 * describe the same ereport.
 */
static int
recent_events_compare(const void *a, const void *b)
{
	const recent_events_node_t *lhs = a;
	const recent_events_node_t *rhs = b;
	const zbookmark_phys_t *lzb = &lhs->re_io_bookmark;
	const zbookmark_phys_t *rzb = &rhs->re_io_bookmark;
	int order;

	/* common criteria */
	order = TREE_CMP(lhs->re_subclass, rhs->re_subclass);
	if (order == 0)
		order = TREE_CMP(lhs->re_pool_guid, rhs->re_pool_guid);
	if (order == 0)
		order = TREE_CMP(lhs->re_vdev_guid, rhs->re_vdev_guid);
	if (order == 0)
		order = TREE_CMP(lhs->re_io_error, rhs->re_io_error);
	if (order == 0)
		order = TREE_CMP(lhs->re_io_priority, rhs->re_io_priority);
	if (order == 0)
		order = TREE_CMP(lhs->re_io_size, rhs->re_io_size);
	if (order == 0)
		order = TREE_CMP(lhs->re_io_offset, rhs->re_io_offset);

	/* logical zio criteria */
	if (order == 0)
		order = TREE_CMP(lzb->zb_objset, rzb->zb_objset);
	if (order == 0)
		order = TREE_CMP(lzb->zb_object, rzb->zb_object);
	if (order == 0)
		order = TREE_CMP(lzb->zb_level, rzb->zb_level);
	if (order == 0)
		order = TREE_CMP(lzb->zb_blkid, rzb->zb_blkid);

	return (order);
}
static void zfs_ereport_schedule_cleaner(void);
/*
 * Delayed taskq callback that discards retained ereports older than
 * zfs_zevent_retain_expire_secs.
 *
 * The list is kept newest-first, so expired nodes are found by peeling
 * entries off the tail until one is still young enough. When nodes are
 * left over afterwards the cleaner re-arms itself; otherwise the task id
 * stays 0 so that the next retained ereport schedules a fresh cleaner.
 */
/*ARGSUSED*/
static void
zfs_ereport_cleaner(void *arg)
{
	uint64_t current = gethrtime();

	mutex_enter(&recent_events_lock);

	for (;;) {
		recent_events_node_t *oldest =
		    list_tail(&recent_events_list);

		if (oldest == NULL)
			break;

		uint64_t elapsed = NSEC2SEC(current - oldest->re_timestamp);
		if (elapsed <= zfs_zevent_retain_expire_secs)
			break;

		/* this one has outlived its lifespan */
		avl_remove(&recent_events_tree, oldest);
		list_remove(&recent_events_list, oldest);
		kmem_free(oldest, sizeof (*oldest));
	}

	/* this run is over; re-arm only if something is still retained */
	recent_events_cleaner_tqid = 0;
	if (!list_is_empty(&recent_events_list))
		zfs_ereport_schedule_cleaner();

	mutex_exit(&recent_events_lock);
}
/*
 * Arm the cleaner to run one second after the current expiration period
 * has elapsed and remember its task id. The caller must hold
 * recent_events_lock.
 */
static void
zfs_ereport_schedule_cleaner(void)
{
	ASSERT(MUTEX_HELD(&recent_events_lock));

	uint64_t delay_ns = SEC2NSEC(zfs_zevent_retain_expire_secs + 1);

	recent_events_cleaner_tqid =
	    taskq_dispatch_delay(system_delay_taskq, zfs_ereport_cleaner,
	    NULL, TQ_SLEEP, ddi_get_lbolt() + NSEC_TO_TICK(delay_ns));
}
/*
 * Check if an ereport would be a duplicate of one recently posted.
 *
 * An ereport is considered a duplicate if the set of criteria in
 * recent_events_node_t all match.
 *
 * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM
 * are candidates for duplicate checking.
 *
 * 'offset' and 'size' override zio->io_offset and zio->io_size when 'size'
 * is non-zero. 'zb' is optional; without it the bookmark criteria are zero.
 *
 * Returns B_TRUE when a matching ereport was seen within the last
 * zfs_zevent_retain_expire_secs seconds; only then is the duplicate kstat
 * bumped. Returns B_FALSE when 'vd' or 'zio' is NULL, when detection is
 * disabled, for other subclasses, and for an ereport that is new or whose
 * previous sighting has expired. In the last two cases the ereport is
 * (re)recorded as the most recent one.
 */
static boolean_t
zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd,
    const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size)
{
	recent_events_node_t search = {0}, *entry;
	/*
	 * The tunable can be changed at any time. Work from one snapshot so
	 * the "disabled" test and the capacity test below cannot disagree.
	 */
	unsigned int retain_max = zfs_zevent_retain_max;

	if (vd == NULL || zio == NULL)
		return (B_FALSE);

	if (retain_max == 0)
		return (B_FALSE);

	if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0)
		search.re_subclass = ZSC_IO;
	else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0)
		search.re_subclass = ZSC_DATA;
	else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
		search.re_subclass = ZSC_CHECKSUM;
	else
		return (B_FALSE);

	search.re_pool_guid = spa_guid(spa);
	search.re_vdev_guid = vd->vdev_guid;
	search.re_io_error = zio->io_error;
	search.re_io_priority = zio->io_priority;
	/* if size is supplied use it over what's in zio */
	if (size) {
		search.re_io_size = size;
		search.re_io_offset = offset;
	} else {
		search.re_io_size = zio->io_size;
		search.re_io_offset = zio->io_offset;
	}

	/* grab optional logical zio criteria */
	if (zb != NULL) {
		search.re_io_bookmark.zb_objset = zb->zb_objset;
		search.re_io_bookmark.zb_object = zb->zb_object;
		search.re_io_bookmark.zb_level = zb->zb_level;
		search.re_io_bookmark.zb_blkid = zb->zb_blkid;
	}

	uint64_t now = gethrtime();

	mutex_enter(&recent_events_lock);

	/* check if we have seen this one recently */
	entry = avl_find(&recent_events_tree, &search, NULL);
	if (entry != NULL) {
		uint64_t age = NSEC2SEC(now - entry->re_timestamp);
		boolean_t is_dup = (age <= zfs_zevent_retain_expire_secs) ?
		    B_TRUE : B_FALSE;

		/*
		 * There is still an active cleaner (since we're here).
		 * Reset the last seen time for this duplicate entry
		 * so that its lifespan gets extended.
		 */
		list_remove(&recent_events_list, entry);
		list_insert_head(&recent_events_list, entry);
		entry->re_timestamp = now;

		/*
		 * An entry that expired but has not been purged yet is not
		 * a duplicate: the ereport will be posted, so it must not
		 * be counted in the duplicate kstat.
		 */
		if (is_dup)
			zfs_zevent_track_duplicate();
		mutex_exit(&recent_events_lock);

		return (is_dup);
	}

	/*
	 * At capacity, recycle the oldest node. Should there be nothing to
	 * recycle, fall back to allocating rather than touching a NULL node.
	 */
	entry = NULL;
	if (avl_numnodes(&recent_events_tree) >= retain_max)
		entry = list_tail(&recent_events_list);
	if (entry != NULL) {
		list_remove(&recent_events_list, entry);
		avl_remove(&recent_events_tree, entry);
	} else {
		entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP);
	}

	/* record this as a recent ereport */
	*entry = search;
	avl_add(&recent_events_tree, entry);
	list_insert_head(&recent_events_list, entry);
	entry->re_timestamp = now;

	/* Start a cleaner if not already scheduled */
	if (recent_events_cleaner_tqid == 0)
		zfs_ereport_schedule_cleaner();

	mutex_exit(&recent_events_lock);
	return (B_FALSE);
}
void void
zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
{ {
@ -153,9 +397,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
uint64_t ena; uint64_t ena;
char class[64]; char class[64];
if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
return (B_FALSE);
if ((ereport = fm_nvlist_create(NULL)) == NULL) if ((ereport = fm_nvlist_create(NULL)) == NULL)
return (B_FALSE); return (B_FALSE);
@ -336,6 +577,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_UINT64, zio->io_timestamp, NULL); DATA_TYPE_UINT64, zio->io_timestamp, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
DATA_TYPE_UINT64, zio->io_delta, NULL); DATA_TYPE_UINT64, zio->io_delta, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
DATA_TYPE_UINT32, zio->io_priority, NULL);
/* /*
* If the 'size' parameter is non-zero, it indicates this is a * If the 'size' parameter is non-zero, it indicates this is a
@ -788,24 +1031,34 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
} }
/* /*
* Return 0 if event was posted, EINVAL if there was a problem posting it or * Post an ereport for the given subclass
* EBUSY if the event was rate limited. *
* Returns
* - 0 if an event was posted
* - EINVAL if there was a problem posting event
* - EBUSY if the event was rate limited
* - EALREADY if the event was already posted (duplicate)
*/ */
int int
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state)
uint64_t size)
{ {
int rc = 0; int rc = 0;
#ifdef _KERNEL #ifdef _KERNEL
nvlist_t *ereport = NULL; nvlist_t *ereport = NULL;
nvlist_t *detector = NULL; nvlist_t *detector = NULL;
if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
return (EINVAL);
if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0))
return (SET_ERROR(EALREADY));
if (zfs_is_ratelimiting_event(subclass, vd)) if (zfs_is_ratelimiting_event(subclass, vd))
return (SET_ERROR(EBUSY)); return (SET_ERROR(EBUSY));
if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
zb, zio, stateoroffset, size)) zb, zio, state, 0))
return (SET_ERROR(EINVAL)); /* couldn't post event */ return (SET_ERROR(EINVAL)); /* couldn't post event */
if (ereport == NULL) if (ereport == NULL)
@ -817,7 +1070,16 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
return (rc); return (rc);
} }
void /*
* Prepare a checksum ereport
*
* Returns
* - 0 if an event was posted
* - EINVAL if there was a problem posting event
* - EBUSY if the event was rate limited
* - EALREADY if the event was already posted (duplicate)
*/
int
zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
struct zio *zio, uint64_t offset, uint64_t length, void *arg, struct zio *zio, uint64_t offset, uint64_t length, void *arg,
zio_bad_cksum_t *info) zio_bad_cksum_t *info)
@ -825,8 +1087,15 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
zio_cksum_report_t *report; zio_cksum_report_t *report;
#ifdef _KERNEL #ifdef _KERNEL
if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
return (SET_ERROR(EINVAL));
if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
offset, length))
return (SET_ERROR(EALREADY));
if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
return; return (SET_ERROR(EBUSY));
#endif #endif
report = kmem_zalloc(sizeof (*report), KM_SLEEP); report = kmem_zalloc(sizeof (*report), KM_SLEEP);
@ -851,7 +1120,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
if (report->zcr_ereport == NULL) { if (report->zcr_ereport == NULL) {
zfs_ereport_free_checksum(report); zfs_ereport_free_checksum(report);
return; return (0);
} }
#endif #endif
@ -859,6 +1128,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
report->zcr_next = zio->io_logical->io_cksum_report; report->zcr_next = zio->io_logical->io_cksum_report;
zio->io_logical->io_cksum_report = report; zio->io_logical->io_cksum_report = report;
mutex_exit(&spa->spa_errlist_lock); mutex_exit(&spa->spa_errlist_lock);
return (0);
} }
void void
@ -901,7 +1171,15 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
kmem_free(rpt, sizeof (*rpt)); kmem_free(rpt, sizeof (*rpt));
} }
/*
* Post a checksum ereport
*
* Returns
* - 0 if an event was posted
* - EINVAL if there was a problem posting event
* - EBUSY if the event was rate limited
* - EALREADY if the event was already posted (duplicate)
*/
int int
zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
struct zio *zio, uint64_t offset, uint64_t length, struct zio *zio, uint64_t offset, uint64_t length,
@ -913,8 +1191,15 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
nvlist_t *detector = NULL; nvlist_t *detector = NULL;
zfs_ecksum_info_t *info; zfs_ecksum_info_t *info;
if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
return (SET_ERROR(EINVAL));
if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
offset, length))
return (SET_ERROR(EALREADY));
if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
return (EBUSY); return (SET_ERROR(EBUSY));
if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
spa, vd, zb, zio, offset, length) || (ereport == NULL)) { spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
@ -1073,11 +1358,57 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
#endif #endif
} }
#if defined(_KERNEL) #ifdef _KERNEL
/*
 * Set up the duplicate-detection state: the lock, the AVL index ordered by
 * recent_events_compare(), and the list, both of which link
 * recent_events_node_t nodes through their embedded link fields.
 */
void
zfs_ereport_init(void)
{
	const size_t node_size = sizeof (recent_events_node_t);

	mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL);

	avl_create(&recent_events_tree, recent_events_compare, node_size,
	    offsetof(recent_events_node_t, re_tree_link));

	list_create(&recent_events_list, node_size,
	    offsetof(recent_events_node_t, re_list_link));
}
/*
 * 'Early' teardown step: cancel the pending cleaner task, if any.
 *
 * This has to happen ahead of zfs_fini(), which on Linux waits for the
 * system_delay_taskq to drain.
 *
 * NOTE(review): the cancel is issued with recent_events_lock held, and the
 * cleaner takes the same lock. If taskq_cancel_id() can block on a cleaner
 * that is already running, this could deadlock -- confirm.
 */
void
zfs_ereport_taskq_fini(void)
{
	mutex_enter(&recent_events_lock);

	/* nothing scheduled, nothing to cancel */
	if (recent_events_cleaner_tqid == 0) {
		mutex_exit(&recent_events_lock);
		return;
	}

	taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid);
	recent_events_cleaner_tqid = 0;

	mutex_exit(&recent_events_lock);
}
void
zfs_ereport_fini(void)
{
recent_events_node_t *entry;
while ((entry = list_head(&recent_events_list)) != NULL) {
avl_remove(&recent_events_tree, entry);
list_remove(&recent_events_list, entry);
kmem_free(entry, sizeof (*entry));
}
avl_destroy(&recent_events_tree);
list_destroy(&recent_events_list);
mutex_destroy(&recent_events_lock);
}
EXPORT_SYMBOL(zfs_ereport_post); EXPORT_SYMBOL(zfs_ereport_post);
EXPORT_SYMBOL(zfs_ereport_is_valid); EXPORT_SYMBOL(zfs_ereport_is_valid);
EXPORT_SYMBOL(zfs_ereport_post_checksum); EXPORT_SYMBOL(zfs_ereport_post_checksum);
EXPORT_SYMBOL(zfs_post_remove); EXPORT_SYMBOL(zfs_post_remove);
EXPORT_SYMBOL(zfs_post_autoreplace); EXPORT_SYMBOL(zfs_post_autoreplace);
EXPORT_SYMBOL(zfs_post_state_change); EXPORT_SYMBOL(zfs_post_state_change);
ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW,
"Maximum recent zevents records to retain for duplicate checking");
ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW,
"Expiration time for recent zevents records");
#endif /* _KERNEL */ #endif /* _KERNEL */

View File

@ -7615,6 +7615,7 @@ zfs_kmod_fini(void)
kmem_free(zs, sizeof (zfsdev_state_t)); kmem_free(zs, sizeof (zfsdev_state_t));
} }
zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */
zfs_fini(); zfs_fini();
spa_fini(); spa_fini();
zvol_fini(); zvol_fini();

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2017, Intel Corporation. * Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Klara Inc.
@ -547,7 +547,7 @@ error:
if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
spa_log_error(spa, &zio->io_bookmark); spa_log_error(spa, &zio->io_bookmark);
(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
spa, NULL, &zio->io_bookmark, zio, 0, 0); spa, NULL, &zio->io_bookmark, zio, 0);
} }
} else { } else {
zio->io_error = ret; zio->io_error = ret;
@ -2004,7 +2004,7 @@ zio_deadman_impl(zio_t *pio, int ziodepth)
zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
pio->io_offset, pio->io_size, pio->io_error); pio->io_offset, pio->io_size, pio->io_error);
(void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
pio->io_spa, vd, zb, pio, 0, 0); pio->io_spa, vd, zb, pio, 0);
if (failmode == ZIO_FAILURE_MODE_CONTINUE && if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
taskq_empty_ent(&pio->io_tqent)) { taskq_empty_ent(&pio->io_tqent)) {
@ -2331,7 +2331,7 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
"failure and has been suspended.\n", spa_name(spa)); "failure and has been suspended.\n", spa_name(spa));
(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
NULL, NULL, 0, 0); NULL, NULL, 0);
mutex_enter(&spa->spa_suspend_lock); mutex_enter(&spa->spa_suspend_lock);
@ -4217,13 +4217,15 @@ zio_checksum_verify(zio_t *zio)
zio->io_error = error; zio->io_error = error;
if (error == ECKSUM && if (error == ECKSUM &&
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
int ret = zfs_ereport_start_checksum(zio->io_spa,
zio->io_vd, &zio->io_bookmark, zio,
zio->io_offset, zio->io_size, NULL, &info);
if (ret != EALREADY) {
mutex_enter(&zio->io_vd->vdev_stat_lock); mutex_enter(&zio->io_vd->vdev_stat_lock);
zio->io_vd->vdev_stat.vs_checksum_errors++; zio->io_vd->vdev_stat.vs_checksum_errors++;
mutex_exit(&zio->io_vd->vdev_stat_lock); mutex_exit(&zio->io_vd->vdev_stat_lock);
}
zfs_ereport_start_checksum(zio->io_spa,
zio->io_vd, &zio->io_bookmark, zio,
zio->io_offset, zio->io_size, NULL, &info);
} }
} }
@ -4543,7 +4545,7 @@ zio_done(zio_t *zio)
(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
zio->io_spa, zio->io_vd, &zio->io_bookmark, zio->io_spa, zio->io_vd, &zio->io_bookmark,
zio, 0, 0); zio, 0);
} }
} }
} }
@ -4557,16 +4559,16 @@ zio_done(zio_t *zio)
*/ */
if (zio->io_error != ECKSUM && zio->io_vd != NULL && if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
!vdev_is_dead(zio->io_vd)) { !vdev_is_dead(zio->io_vd)) {
int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
if (ret != EALREADY) {
mutex_enter(&zio->io_vd->vdev_stat_lock); mutex_enter(&zio->io_vd->vdev_stat_lock);
if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_type == ZIO_TYPE_READ)
zio->io_vd->vdev_stat.vs_read_errors++; zio->io_vd->vdev_stat.vs_read_errors++;
} else if (zio->io_type == ZIO_TYPE_WRITE) { else if (zio->io_type == ZIO_TYPE_WRITE)
zio->io_vd->vdev_stat.vs_write_errors++; zio->io_vd->vdev_stat.vs_write_errors++;
}
mutex_exit(&zio->io_vd->vdev_stat_lock); mutex_exit(&zio->io_vd->vdev_stat_lock);
}
(void) zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
zio->io_vd, &zio->io_bookmark, zio, 0, 0);
} }
if ((zio->io_error == EIO || !(zio->io_flags & if ((zio->io_error == EIO || !(zio->io_flags &
@ -4578,7 +4580,7 @@ zio_done(zio_t *zio)
*/ */
spa_log_error(zio->io_spa, &zio->io_bookmark); spa_log_error(zio->io_spa, &zio->io_bookmark);
(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
zio->io_spa, NULL, &zio->io_bookmark, zio, 0, 0); zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
} }
} }

View File

@ -346,7 +346,7 @@ tags = ['functional', 'cli_root', 'zpool_detach']
[tests/functional/cli_root/zpool_events] [tests/functional/cli_root/zpool_events]
tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow', tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow',
'zpool_events_poolname', 'zpool_events_errors'] 'zpool_events_poolname', 'zpool_events_errors', 'zpool_events_duplicates']
tags = ['functional', 'cli_root', 'zpool_events'] tags = ['functional', 'cli_root', 'zpool_events']
[tests/functional/cli_root/zpool_export] [tests/functional/cli_root/zpool_export]

View File

@ -105,6 +105,7 @@ export SYSTEM_FILES_COMMON='arp
umask umask
umount umount
uname uname
uniq
uuidgen uuidgen
vmstat vmstat
wait wait

View File

@ -82,6 +82,7 @@ VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev
VOL_MODE vol.mode zvol_volmode VOL_MODE vol.mode zvol_volmode
VOL_RECURSIVE vol.recursive UNSUPPORTED VOL_RECURSIVE vol.recursive UNSUPPORTED
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max
ZIO_SLOW_IO_MS zio.slow_io_ms zio_slow_io_ms ZIO_SLOW_IO_MS zio.slow_io_ms zio_slow_io_ms
%%%% %%%%
while read name FreeBSD Linux; do while read name FreeBSD Linux; do

View File

@ -0,0 +1 @@
/ereports

View File

@ -1,4 +1,8 @@
include $(top_srcdir)/config/Rules.am
pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_events pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_events
pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_events
dist_pkgdata_SCRIPTS = \ dist_pkgdata_SCRIPTS = \
setup.ksh \ setup.ksh \
cleanup.ksh \ cleanup.ksh \
@ -6,8 +10,16 @@ dist_pkgdata_SCRIPTS = \
zpool_events_cliargs.ksh \ zpool_events_cliargs.ksh \
zpool_events_follow.ksh \ zpool_events_follow.ksh \
zpool_events_poolname.ksh \ zpool_events_poolname.ksh \
zpool_events_errors.ksh zpool_events_errors.ksh \
zpool_events_duplicates.ksh
dist_pkgdata_DATA = \ dist_pkgdata_DATA = \
zpool_events.cfg \ zpool_events.cfg \
zpool_events.kshlib zpool_events.kshlib
ereports_LDADD = \
$(abs_top_builddir)/lib/libnvpair/libnvpair.la \
$(abs_top_builddir)/lib/libzfs/libzfs.la
pkgexec_PROGRAMS = ereports
ereports_SOURCES = ereports.c

View File

@ -0,0 +1,174 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2020 by Delphix. All rights reserved.
*/
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <libzfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/nvpair.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>
/*
* Command to output io and checksum ereport values, one per line.
* Used by zpool_events_duplicates.ksh to check for duplicate events.
*
* example output line:
*
* checksum "error_pool" 0x856dd01ce52e336 0x000034 0x000400 0x000a402c00
* 0x000004 0x000000 0x000000 0x000000 0x000001
*/
/*
 * Our ereport duplicate criteria
 *
 * When the class and all of these values match, then an ereport is
 * considered to be a duplicate.
 *
 * print_ereport_line() emits one column per entry, in the order listed
 * here, and prints a placeholder for any entry missing from the ereport.
 */
static const char *criteria_name[] = {
	FM_EREPORT_PAYLOAD_ZFS_POOL,
	FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
	FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
	FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
	FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
	FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
	/* logical zio criteria (optional) */
	FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
	FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
	FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
	FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
};
/* Number of entries in criteria_name[]. */
#define CRITERIA_NAMES_COUNT ARRAY_SIZE(criteria_name)
/*
 * Print one line describing an ereport: its subclass followed by one
 * tab-separated column per entry of criteria_name[].
 *
 * nvl is the event nvlist.  Events whose FM_CLASS is missing, does not
 * start with "ereport.fs.zfs.", or is exactly "ereport.fs.zfs.data" produce
 * no output.  A criterion that is absent from the event is printed as
 * "--------" so every line keeps the same number of columns.
 */
static void
print_ereport_line(nvlist_t *nvl)
{
	static const char prefix[] = "ereport.fs.zfs.";
	const size_t prefix_len = sizeof (prefix) - 1;
	char *class;
	int last = CRITERIA_NAMES_COUNT - 1;

	/*
	 * For the test case context, the interesting subclasses are 'io'
	 * and 'checksum'. We skip 'data' to minimize the output.
	 *
	 * The class must *begin* with the prefix: the subclass is printed
	 * by skipping prefix_len bytes from the start of the string, which
	 * would be wrong for a class that merely contains the prefix.
	 */
	if (nvlist_lookup_string(nvl, FM_CLASS, &class) != 0 ||
	    strncmp(class, prefix, prefix_len) != 0 ||
	    strcmp(class, "ereport.fs.zfs.data") == 0) {
		return;
	}

	(void) printf("%s\t", class + prefix_len);

	for (int i = 0; i <= last; i++) {
		nvpair_t *nvp;
		uint32_t i32 = 0;
		uint64_t i64 = 0;
		char *str = NULL;

		if (nvlist_lookup_nvpair(nvl, criteria_name[i], &nvp) != 0) {
			/* print a proxy for optional criteria */
			(void) printf("--------");
			(void) printf("%c", i == last ? '\n' : '\t');
			continue;
		}

		switch (nvpair_type(nvp)) {
		case DATA_TYPE_STRING:
			(void) nvpair_value_string(nvp, &str);
			(void) printf("\"%s\"", str ? str : "<NULL>");
			break;

		case DATA_TYPE_INT32:
			(void) nvpair_value_int32(nvp, (void *)&i32);
			(void) printf("0x%06x", i32);
			break;
		case DATA_TYPE_UINT32:
			(void) nvpair_value_uint32(nvp, &i32);
			(void) printf("0x%06x", i32);
			break;

		case DATA_TYPE_INT64:
			(void) nvpair_value_int64(nvp, (void *)&i64);
			(void) printf("0x%06llx", (u_longlong_t)i64);
			break;
		case DATA_TYPE_UINT64:
			(void) nvpair_value_uint64(nvp, &i64);
			/* Offsets get a wider field than the other values. */
			if (strcmp(FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
			    criteria_name[i]) == 0)
				(void) printf("0x%010llx", (u_longlong_t)i64);
			else
				(void) printf("0x%06llx", (u_longlong_t)i64);
			break;
		default:
			(void) printf("<unknown>");
			break;
		}
		/* Tab between columns, newline after the last one. */
		(void) printf("%c", i == last ? '\n' : '\t');
	}
}
/*
 * Read events through zevent_fd with zpool_events_next() (called with
 * ZEVENT_NONBLOCK) and print each one via print_ereport_line().  The loop
 * ends as soon as the call fails or hands back no event.  A positive
 * dropped-event count is reported on stdout before the event itself.
 */
static void
ereports_dump(libzfs_handle_t *zhdl, int zevent_fd)
{
	for (;;) {
		nvlist_t *event;
		int ndropped;

		if (zpool_events_next(zhdl, &event, &ndropped,
		    ZEVENT_NONBLOCK, zevent_fd) != 0)
			break;
		if (event == NULL)
			break;

		if (ndropped > 0) {
			(void) fprintf(stdout, "dropped %d events\n",
			    ndropped);
		}

		print_ereport_line(event);
		(void) fflush(stdout);
		nvlist_free(event);
	}
}
/*
 * Entry point: initialise libzfs, open ZFS_DEV read/write, dump the
 * available ereports to stdout, then release both handles.
 *
 * Returns 0 on success; exits with status 2 if libzfs_init() or open()
 * fails, after printing the reason to stderr.  Arguments are unused.
 */
/* ARGSUSED */
int
main(int argc, char **argv)
{
	libzfs_handle_t *hdl = libzfs_init();

	if (hdl == NULL) {
		(void) fprintf(stderr, "libzfs_init: %s\n", strerror(errno));
		exit(2);
	}

	int fd = open(ZFS_DEV, O_RDWR);

	if (fd >= 0) {
		ereports_dump(hdl, fd);
		(void) close(fd);
		libzfs_fini(hdl);
		return (0);
	}

	/* Report the open() failure before libzfs_fini() can disturb errno. */
	(void) fprintf(stderr, "open: %s\n", strerror(errno));
	libzfs_fini(hdl);
	exit(2);
}

View File

@ -0,0 +1,155 @@
#!/bin/ksh -p
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
# Copyright (c) 2020 by Delphix. All rights reserved.
#
# DESCRIPTION:
# Verify that duplicate I/O ereport errors are not posted
#
# STRATEGY:
# 1. Create a mirror pool
# 2. Inject duplicate read/write IO errors and checksum errors
# 3. Verify there are no duplicate events being posted
#
. $STF_SUITE/include/libtest.shlib
verify_runnable "both"
# Mountpoint given to 'zpool create -m' and the file written/read under it.
MOUNTDIR=$TEST_BASE_DIR/mount
FILEPATH=$MOUNTDIR/badfile
# Backing files for the two sides of the mirror.
VDEV1=$TEST_BASE_DIR/vfile1
VDEV2=$TEST_BASE_DIR/vfile2
POOL=error_pool
FILESIZE="10M"
# Saved so cleanup can restore the tunable after the test raises it.
OLD_LEN_MAX=$(get_tunable ZEVENT_LEN_MAX)
# Only logged for reference; the test does not change this tunable.
RETAIN_MAX=$(get_tunable ZEVENT_RETAIN_MAX)
# Helper program that prints one line per ereport.
EREPORTS="$STF_SUITE/tests/functional/cli_root/zpool_events/ereports"
# Set to true by do_dup_test when any scenario sees duplicate ereports.
duplicates=false
#
# Exit handler: restore the saved ZEVENT_LEN_MAX value, clear all injection
# handlers, destroy the test pool if it still exists and delete the vdev
# backing files.
#
function cleanup
{
	log_must set_tunable64 ZEVENT_LEN_MAX $OLD_LEN_MAX

	log_must zinject -c all

	# The pool is normally destroyed by do_dup_test already.
	if poolexists $POOL; then
		destroy_pool $POOL
	fi

	log_must rm -f $VDEV1 $VDEV2
}
log_assert "Duplicate I/O ereport errors are not posted"
log_note "zevent retain max setting: $RETAIN_MAX"
log_onexit cleanup

# Set our threshold high to avoid dropping events.  This must succeed:
# with the default limit events could be dropped and the duplicate count
# below would no longer be meaningful.
log_must set_tunable64 ZEVENT_LEN_MAX 20000

# Create the two mirror backing files and the pool mountpoint.
log_must truncate -s $MINVDEVSIZE $VDEV1 $VDEV2
log_must mkdir -p $MOUNTDIR
#
# Run one duplicate-ereport scenario against a freshly created mirror pool
# and destroy the pool again afterwards.
#
# $1: test type - corrupt (checksum error), io
# $2: read, write
#
# Sets the global $duplicates to true when the sorted ereport output
# contains repeated lines; it is never reset to false here.
#
function do_dup_test
{
	# Keep the working variables local so that repeated calls do not
	# leak state into the rest of the script.
	typeset ERR=$1
	typeset RW=$2
	typeset ereports actual unique i

	log_note "Testing $ERR $RW ereports"
	log_must zpool create -f -m $MOUNTDIR -o failmode=continue $POOL mirror $VDEV1 $VDEV2
	log_must zpool events -c
	log_must zfs set compression=off $POOL

	if [ "$RW" == "read" ] ; then
		log_must mkfile $FILESIZE $FILEPATH

		# unmount and mount filesystems to purge file from ARC
		# to force reads to go through error inject handler
		log_must zfs unmount $POOL
		log_must zfs mount $POOL

		# all reads from this file get an error
		if [ "$ERR" == "corrupt" ] ; then
			log_must zinject -a -t data -e checksum -T read $FILEPATH
		else
			log_must zinject -a -t data -e io -T read $FILEPATH
		fi

		# Read the file a few times to generate some
		# duplicate errors of the same blocks
		# shellcheck disable=SC2034
		for i in {1..15}; do
			dd if=$FILEPATH of=/dev/null bs=128K > /dev/null 2>&1
		done
		log_must zinject -c all
	fi

	log_must zinject -d $VDEV1 -e $ERR -T $RW -f 100 $POOL

	if [ "$RW" == "write" ] ; then
		log_must mkfile $FILESIZE $FILEPATH
		log_must zpool sync $POOL
	else
		# scrub twice to generate some duplicates
		log_must zpool scrub $POOL
		log_must zpool wait -t scrub $POOL
		log_must zpool scrub $POOL
		log_must zpool wait -t scrub $POOL
	fi

	log_must zinject -c all

	# Wait for the pool to settle down and finish resilvering (if
	# necessary).  We want the errors to stop incrementing before we
	# check for duplicates.
	zpool wait -t resilver $POOL

	# Sorted so that identical lines are adjacent for uniq.
	ereports="$($EREPORTS | sort)"
	actual=$(echo "$ereports" | wc -l)
	unique=$(echo "$ereports" | uniq | wc -l)
	log_note "$actual total $ERR $RW ereports where $unique were unique"

	if [ $actual -gt $unique ] ; then
		log_note "UNEXPECTED -- $((actual-unique)) duplicate $ERR $RW ereports"
		echo "$ereports"
		duplicates=true
	fi

	log_must zpool destroy $POOL
}
# Exercise each error type / operation combination in turn.
do_dup_test "corrupt" "read"
do_dup_test "io" "read"
do_dup_test "io" "write"

# $duplicates holds the command name 'true' or 'false'; run it to decide.
if ! $duplicates; then
	log_pass "Duplicate I/O ereport errors are not posted"
else
	log_fail "FAILED -- Duplicate I/O ereport errors encountered"
fi