diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index 9bfb123c76..6491606d32 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2020 by Delphix. All rights reserved. + */ + #ifndef _SYS_FM_FS_ZFS_H #define _SYS_FM_FS_ZFS_H @@ -88,6 +92,7 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS "zio_flags" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY "zio_priority" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp" diff --git a/include/sys/fm/util.h b/include/sys/fm/util.h index ff54b05bb6..ea8c61a8b9 100644 --- a/include/sys/fm/util.h +++ b/include/sys/fm/util.h @@ -104,6 +104,9 @@ extern int zfs_zevent_seek(zfs_zevent_t *, uint64_t); extern void zfs_zevent_init(zfs_zevent_t **); extern void zfs_zevent_destroy(zfs_zevent_t *); +extern void zfs_zevent_track_duplicate(void); +extern void zfs_ereport_init(void); +extern void zfs_ereport_fini(void); #else static inline void fm_init(void) { } diff --git a/include/sys/spa.h b/include/sys/spa.h index e53d0d64c3..ddce8cc914 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. 
@@ -1145,10 +1145,10 @@ extern const char *spa_state_to_name(spa_t *spa); struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb); extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, - const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, - uint64_t length); + const zbookmark_phys_t *zb, zio_t *zio, uint64_t state); extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, zio_t *zio); +extern void zfs_ereport_taskq_fini(void); extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); diff --git a/include/sys/zio.h b/include/sys/zio.h index f3b5a12079..4959831716 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Toomas Soome @@ -680,7 +680,7 @@ extern hrtime_t zio_handle_io_delay(zio_t *zio); /* * Checksum ereport functions */ -extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, +extern int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info); extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 893bbf6522..32e28a13a4 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1,6 +1,6 @@ '\" te .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. -.\" Copyright (c) 2019 by Delphix. 
All rights reserved. +.\" Copyright (c) 2019, 2020 by Delphix. All rights reserved. .\" Copyright (c) 2019 Datto Inc. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except @@ -3656,6 +3656,27 @@ Default value: \fB0\fR. .sp .ne 2 +.na +\fBzfs_zevent_retain_max\fR (int) +.ad +.RS 12n +Maximum recent zevent records to retain for duplicate checking. Setting +this value to zero disables duplicate detection. +.sp +Default value: \fB2000\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_zevent_retain_expire_secs\fR (int) +.ad +.RS 12n +Lifespan for a recent ereport that was retained for duplicate checking. +.sp +Default value: \fB900\fR. +.RE + .na \fBzfs_zil_clean_taskq_maxalloc\fR (int) .ad diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 0c33a4535b..12837104a9 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. - * Copyright (c) 2011, 2019, Delphix. All rights reserved. + * Copyright (c) 2011, 2020, Delphix. All rights reserved. * Copyright (c) 2014, Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. 
@@ -2188,7 +2188,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, ret = SET_ERROR(EIO); spa_log_error(spa, zb); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, - spa, NULL, zb, NULL, 0, 0); + spa, NULL, zb, NULL, 0); } return (ret); @@ -5654,7 +5654,7 @@ arc_read_done(zio_t *zio) spa_log_error(zio->io_spa, &acb->acb_zb); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, - zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0); + zio->io_spa, NULL, &acb->acb_zb, zio, 0); } } @@ -5931,7 +5931,7 @@ top: spa_log_error(spa, zb); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, - spa, NULL, zb, NULL, 0, 0); + spa, NULL, zb, NULL, 0); } } if (rc != 0) { diff --git a/module/zfs/fm.c b/module/zfs/fm.c index c00e08b8d0..a5003f85d6 100644 --- a/module/zfs/fm.c +++ b/module/zfs/fm.c @@ -104,13 +104,15 @@ struct erpt_kstat { kstat_named_t erpt_set_failed; /* num erpt set failures */ kstat_named_t fmri_set_failed; /* num fmri set failures */ kstat_named_t payload_set_failed; /* num payload set failures */ + kstat_named_t erpt_duplicates; /* num duplicate erpts */ }; static struct erpt_kstat erpt_kstat_data = { { "erpt-dropped", KSTAT_DATA_UINT64 }, { "erpt-set-failed", KSTAT_DATA_UINT64 }, { "fmri-set-failed", KSTAT_DATA_UINT64 }, - { "payload-set-failed", KSTAT_DATA_UINT64 } + { "payload-set-failed", KSTAT_DATA_UINT64 }, + { "erpt-duplicates", KSTAT_DATA_UINT64 } }; kstat_t *fm_ksp; @@ -568,6 +570,12 @@ out: return (error); } +void +zfs_zevent_track_duplicate(void) +{ + atomic_inc_64(&erpt_kstat_data.erpt_duplicates.value.ui64); +} + static int zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze) { @@ -1633,6 +1641,8 @@ fm_init(void) list_create(&zevent_list, sizeof (zevent_t), offsetof(zevent_t, ev_node)); cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL); + + zfs_ereport_init(); } void @@ -1640,6 +1650,8 @@ fm_fini(void) { int count; + zfs_ereport_fini(); + zfs_zevent_drain_all(&count); mutex_enter(&zevent_lock); diff --git 
a/module/zfs/spa.c b/module/zfs/spa.c index 31fa52d1d0..015996d152 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -2868,7 +2868,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) } if (error != EBADF) { (void) zfs_ereport_post(ereport, spa, - NULL, NULL, NULL, 0, 0); + NULL, NULL, NULL, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index b98b7badba..81059c69d4 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. 
*/ @@ -316,7 +316,7 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) if (target->spa_ccw_fail_time == 0) { (void) zfs_ereport_post( FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, - target, NULL, NULL, NULL, 0, 0); + target, NULL, NULL, NULL, 0); } target->spa_ccw_fail_time = gethrtime(); spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 95a2f5947d..0a3b8bd83b 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1481,7 +1481,7 @@ vdev_probe_done(zio_t *zio) ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, - spa, vd, NULL, NULL, 0, 0); + spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); } @@ -1862,11 +1862,10 @@ vdev_open(vdev_t *vd) vd->vdev_ops->vdev_op_leaf) { (void) zfs_ereport_post( FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, - spa, vd, NULL, NULL, 0, 0); + spa, vd, NULL, NULL, 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EDOM)); - } vd->vdev_max_asize = max_asize; } @@ -4759,7 +4758,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) } (void) zfs_ereport_post(class, spa, vd, NULL, NULL, - save_state, 0); + save_state); } /* Erase any notion of persistent removed state */ diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 5301e0665a..12ee393bd5 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -16,7 +16,7 @@ /* * Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. - * Copyright (c) 2014, 2019 by Delphix. All rights reserved. + * Copyright (c) 2014, 2020 by Delphix. All rights reserved. 
*/ #include @@ -1473,13 +1473,14 @@ vdev_indirect_all_checksum_errors(zio_t *zio) vdev_t *vd = ic->ic_vdev; - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - - (void) zfs_ereport_post_checksum(zio->io_spa, vd, + int ret = zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, is->is_target_offset, is->is_size, NULL, NULL, NULL); + if (ret != EALREADY) { + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + } } } } diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 4320078b6f..47312e02f7 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. */ @@ -1790,16 +1790,17 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; - (void) zfs_ereport_post_checksum(zio->io_spa, vd, + int ret = zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, rc->rc_abd, bad_data, &zbc); + if (ret != EALREADY) { + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + } } } @@ -2279,21 +2280,21 @@ vdev_raidz_io_done(zio_t *zio) vdev_t *cvd; rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; - if (rc->rc_error == 0) { - zio_bad_cksum_t zbc; - zbc.zbc_has_cksum = 0; - zbc.zbc_injected = - rm->rm_ecksuminjected; + if (rc->rc_error != 0) + continue; + zio_bad_cksum_t zbc; + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = 
rm->rm_ecksuminjected; + + int ret = zfs_ereport_start_checksum( + zio->io_spa, cvd, &zio->io_bookmark, zio, + rc->rc_offset, rc->rc_size, + (void *)(uintptr_t)c, &zbc); + if (ret != EALREADY) { mutex_enter(&cvd->vdev_stat_lock); cvd->vdev_stat.vs_checksum_errors++; mutex_exit(&cvd->vdev_stat_lock); - - zfs_ereport_start_checksum( - zio->io_spa, cvd, - &zio->io_bookmark, zio, - rc->rc_offset, rc->rc_size, - (void *)(uintptr_t)c, &zbc); } } } diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index ad13ccedfc..a8341f50ba 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012,2020 by Delphix. All rights reserved. */ #include @@ -101,7 +101,251 @@ * good and bad versions of the buffer (if available), and we annotate the * ereport with information about the differences. */ + #ifdef _KERNEL +/* + * Duplicate ereport Detection + * + * Some ereports are retained momentarily for detecting duplicates. These + * are kept in a recent_events_node_t in both a time-ordered list and an AVL + * tree of recent unique ereports. + * + * The lifespan of these recent ereports is bounded (15 mins) and a cleaner + * task is used to purge stale entries. + */ +static list_t recent_events_list; +static avl_tree_t recent_events_tree; +static kmutex_t recent_events_lock; +static taskqid_t recent_events_cleaner_tqid; + +/* + * Each node is about 128 bytes so 2,000 would consume 1/4 MiB. + * + * This setting can be changed dynamically and setting it to zero + * disables duplicate detection. + */ +unsigned int zfs_zevent_retain_max = 2000; + +/* + * The lifespan for a recent ereport entry. The default of 15 minutes is + * intended to outlive the zfs diagnosis engine's threshold of 10 errors + * over a period of 10 minutes. 
+ */ +unsigned int zfs_zevent_retain_expire_secs = 900; + +typedef enum zfs_subclass { + ZSC_IO, + ZSC_DATA, + ZSC_CHECKSUM +} zfs_subclass_t; + +typedef struct { + /* common criteria */ + uint64_t re_pool_guid; + uint64_t re_vdev_guid; + int re_io_error; + uint64_t re_io_size; + uint64_t re_io_offset; + zfs_subclass_t re_subclass; + zio_priority_t re_io_priority; + + /* logical zio criteria (optional) */ + zbookmark_phys_t re_io_bookmark; + + /* internal state */ + avl_node_t re_tree_link; + list_node_t re_list_link; + uint64_t re_timestamp; +} recent_events_node_t; + +static int +recent_events_compare(const void *a, const void *b) +{ + const recent_events_node_t *node1 = a; + const recent_events_node_t *node2 = b; + int cmp; + + /* + * The comparison order here is somewhat arbitrary. + * What's important is that if every criteria matches, then it + * is a duplicate (i.e. compare returns 0) + */ + if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0) + return (cmp); + if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0) + return (cmp); + + const zbookmark_phys_t *zb1 = &node1->re_io_bookmark; + const zbookmark_phys_t *zb2 = &node2->re_io_bookmark; + + if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0) + return (cmp); + if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0) + return (cmp); + if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0) + return (cmp); + if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0) + return (cmp); + + return (0); +} + +static void 
zfs_ereport_schedule_cleaner(void); + +/* + * background task to clean stale recent event nodes. + */ +/*ARGSUSED*/ +static void +zfs_ereport_cleaner(void *arg) +{ + recent_events_node_t *entry; + uint64_t now = gethrtime(); + + /* + * purge expired entries + */ + mutex_enter(&recent_events_lock); + while ((entry = list_tail(&recent_events_list)) != NULL) { + uint64_t age = NSEC2SEC(now - entry->re_timestamp); + if (age <= zfs_zevent_retain_expire_secs) + break; + + /* remove expired node */ + avl_remove(&recent_events_tree, entry); + list_remove(&recent_events_list, entry); + kmem_free(entry, sizeof (*entry)); + } + + /* Restart the cleaner if more entries remain */ + recent_events_cleaner_tqid = 0; + if (!list_is_empty(&recent_events_list)) + zfs_ereport_schedule_cleaner(); + + mutex_exit(&recent_events_lock); +} + +static void +zfs_ereport_schedule_cleaner(void) +{ + ASSERT(MUTEX_HELD(&recent_events_lock)); + + uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1); + + recent_events_cleaner_tqid = taskq_dispatch_delay( + system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP, + ddi_get_lbolt() + NSEC_TO_TICK(timeout)); +} + +/* + * Check if an ereport would be a duplicate of one recently posted. + * + * An ereport is considered a duplicate if the set of criteria in + * recent_events_node_t all match. + * + * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM + * are candidates for duplicate checking. 
+ */ +static boolean_t +zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd, + const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size) +{ + recent_events_node_t search = {0}, *entry; + + if (vd == NULL || zio == NULL) + return (B_FALSE); + + if (zfs_zevent_retain_max == 0) + return (B_FALSE); + + if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) + search.re_subclass = ZSC_IO; + else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0) + search.re_subclass = ZSC_DATA; + else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) + search.re_subclass = ZSC_CHECKSUM; + else + return (B_FALSE); + + search.re_pool_guid = spa_guid(spa); + search.re_vdev_guid = vd->vdev_guid; + search.re_io_error = zio->io_error; + search.re_io_priority = zio->io_priority; + /* if size is supplied use it over what's in zio */ + if (size) { + search.re_io_size = size; + search.re_io_offset = offset; + } else { + search.re_io_size = zio->io_size; + search.re_io_offset = zio->io_offset; + } + + /* grab optional logical zio criteria */ + if (zb != NULL) { + search.re_io_bookmark.zb_objset = zb->zb_objset; + search.re_io_bookmark.zb_object = zb->zb_object; + search.re_io_bookmark.zb_level = zb->zb_level; + search.re_io_bookmark.zb_blkid = zb->zb_blkid; + } + + uint64_t now = gethrtime(); + + mutex_enter(&recent_events_lock); + + /* check if we have seen this one recently */ + entry = avl_find(&recent_events_tree, &search, NULL); + if (entry != NULL) { + uint64_t age = NSEC2SEC(now - entry->re_timestamp); + + /* + * There is still an active cleaner (since we're here). + * Reset the last seen time for this duplicate entry + * so that its lifespan gets extended. 
+ */ + list_remove(&recent_events_list, entry); + list_insert_head(&recent_events_list, entry); + entry->re_timestamp = now; + + zfs_zevent_track_duplicate(); + mutex_exit(&recent_events_lock); + + return (age <= zfs_zevent_retain_expire_secs); + } + + if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) { + /* recycle oldest node */ + entry = list_tail(&recent_events_list); + ASSERT(entry != NULL); + list_remove(&recent_events_list, entry); + avl_remove(&recent_events_tree, entry); + } else { + entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP); + } + + /* record this as a recent ereport */ + *entry = search; + avl_add(&recent_events_tree, entry); + list_insert_head(&recent_events_list, entry); + entry->re_timestamp = now; + + /* Start a cleaner if not already scheduled */ + if (recent_events_cleaner_tqid == 0) + zfs_ereport_schedule_cleaner(); + + mutex_exit(&recent_events_lock); + return (B_FALSE); +} + void zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) { @@ -153,9 +397,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, uint64_t ena; char class[64]; - if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) - return (B_FALSE); - if ((ereport = fm_nvlist_create(NULL)) == NULL) return (B_FALSE); @@ -336,6 +577,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_UINT64, zio->io_timestamp, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, DATA_TYPE_UINT64, zio->io_delta, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, + DATA_TYPE_UINT32, zio->io_priority, NULL); /* * If the 'size' parameter is non-zero, it indicates this is a @@ -788,24 +1031,34 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) } /* - * Return 0 if event was posted, EINVAL if there was a problem posting it or - * EBUSY if the event was rate limited. 
+ * Post an ereport for the given subclass + * + * Returns + * - 0 if an event was posted + * - EINVAL if there was a problem posting event + * - EBUSY if the event was rate limited + * - EALREADY if the event was already posted (duplicate) */ int zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, - const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, - uint64_t size) + const zbookmark_phys_t *zb, zio_t *zio, uint64_t state) { int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; + if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) + return (EINVAL); + + if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0)) + return (SET_ERROR(EALREADY)); + if (zfs_is_ratelimiting_event(subclass, vd)) return (SET_ERROR(EBUSY)); if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, - zb, zio, stateoroffset, size)) + zb, zio, state, 0)) return (SET_ERROR(EINVAL)); /* couldn't post event */ if (ereport == NULL) @@ -817,7 +1070,16 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, return (rc); } -void +/* + * Prepare a checksum ereport + * + * Returns + * - 0 if an event was posted + * - EINVAL if there was a problem posting event + * - EBUSY if the event was rate limited + * - EALREADY if the event was already posted (duplicate) + */ +int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, void *arg, zio_bad_cksum_t *info) @@ -825,8 +1087,15 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_cksum_report_t *report; #ifdef _KERNEL + if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) + return (SET_ERROR(EINVAL)); + + if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, + offset, length)) + return (SET_ERROR(EALREADY)); + if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) - return; + return (SET_ERROR(EBUSY)); #endif report = kmem_zalloc(sizeof 
(*report), KM_SLEEP); @@ -851,7 +1120,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, if (report->zcr_ereport == NULL) { zfs_ereport_free_checksum(report); - return; + return (0); } #endif @@ -859,6 +1128,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, report->zcr_next = zio->io_logical->io_cksum_report; zio->io_logical->io_cksum_report = report; mutex_exit(&spa->spa_errlist_lock); + return (0); } void @@ -901,7 +1171,15 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt) kmem_free(rpt, sizeof (*rpt)); } - +/* + * Post a checksum ereport + * + * Returns + * - 0 if an event was posted + * - EINVAL if there was a problem posting event + * - EBUSY if the event was rate limited + * - EALREADY if the event was already posted (duplicate) + */ int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, @@ -913,8 +1191,15 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, nvlist_t *detector = NULL; zfs_ecksum_info_t *info; + if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) + return (SET_ERROR(EINVAL)); + + if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, + offset, length)) + return (SET_ERROR(EALREADY)); + if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) - return (EBUSY); + return (SET_ERROR(EBUSY)); if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length) || (ereport == NULL)) { @@ -1073,11 +1358,57 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) #endif } -#if defined(_KERNEL) +#ifdef _KERNEL +void +zfs_ereport_init(void) +{ + mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&recent_events_list, sizeof (recent_events_node_t), + offsetof(recent_events_node_t, re_list_link)); + avl_create(&recent_events_tree, recent_events_compare, + sizeof 
(recent_events_node_t), offsetof(recent_events_node_t, + re_tree_link)); +} + +/* + * This 'early' fini needs to run before zfs_fini() which on Linux waits + * for the system_delay_taskq to drain. + */ +void +zfs_ereport_taskq_fini(void) +{ + mutex_enter(&recent_events_lock); + if (recent_events_cleaner_tqid != 0) { + taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid); + recent_events_cleaner_tqid = 0; + } + mutex_exit(&recent_events_lock); +} + +void +zfs_ereport_fini(void) +{ + recent_events_node_t *entry; + + while ((entry = list_head(&recent_events_list)) != NULL) { + avl_remove(&recent_events_tree, entry); + list_remove(&recent_events_list, entry); + kmem_free(entry, sizeof (*entry)); + } + avl_destroy(&recent_events_tree); + list_destroy(&recent_events_list); + mutex_destroy(&recent_events_lock); +} + EXPORT_SYMBOL(zfs_ereport_post); EXPORT_SYMBOL(zfs_ereport_is_valid); EXPORT_SYMBOL(zfs_ereport_post_checksum); EXPORT_SYMBOL(zfs_post_remove); EXPORT_SYMBOL(zfs_post_autoreplace); EXPORT_SYMBOL(zfs_post_state_change); + +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW, + "Maximum recent zevents records to retain for duplicate checking"); +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW, + "Expiration time for recent zevents records"); #endif /* _KERNEL */ diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 495ff4707d..c9322a8265 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -7615,6 +7615,7 @@ zfs_kmod_fini(void) kmem_free(zs, sizeof (zfsdev_state_t)); } + zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ zfs_fini(); spa_fini(); zvol_fini(); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index f956a9ef76..8a8fbccd7d 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. 
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Klara Inc. @@ -547,7 +547,7 @@ error: if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, - spa, NULL, &zio->io_bookmark, zio, 0, 0); + spa, NULL, &zio->io_bookmark, zio, 0); } } else { zio->io_error = ret; @@ -2004,7 +2004,7 @@ zio_deadman_impl(zio_t *pio, int ziodepth) zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, pio->io_offset, pio->io_size, pio->io_error); (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, - pio->io_spa, vd, zb, pio, 0, 0); + pio->io_spa, vd, zb, pio, 0); if (failmode == ZIO_FAILURE_MODE_CONTINUE && taskq_empty_ent(&pio->io_tqent)) { @@ -2331,7 +2331,7 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) "failure and has been suspended.\n", spa_name(spa)); (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, - NULL, NULL, 0, 0); + NULL, NULL, 0); mutex_enter(&spa->spa_suspend_lock); @@ -4217,13 +4217,15 @@ zio_checksum_verify(zio_t *zio) zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - mutex_enter(&zio->io_vd->vdev_stat_lock); - zio->io_vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&zio->io_vd->vdev_stat_lock); - - zfs_ereport_start_checksum(zio->io_spa, + int ret = zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, zio->io_offset, zio->io_size, NULL, &info); + + if (ret != EALREADY) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + } } } @@ -4543,7 +4545,7 @@ zio_done(zio_t *zio) (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, &zio->io_bookmark, - zio, 0, 0); + zio, 0); } } } @@ -4557,16 +4559,16 @@ zio_done(zio_t *zio) */ if (zio->io_error != ECKSUM && 
zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { - mutex_enter(&zio->io_vd->vdev_stat_lock); - if (zio->io_type == ZIO_TYPE_READ) { - zio->io_vd->vdev_stat.vs_read_errors++; - } else if (zio->io_type == ZIO_TYPE_WRITE) { - zio->io_vd->vdev_stat.vs_write_errors++; + int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, + zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); + if (ret != EALREADY) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + if (zio->io_type == ZIO_TYPE_READ) + zio->io_vd->vdev_stat.vs_read_errors++; + else if (zio->io_type == ZIO_TYPE_WRITE) + zio->io_vd->vdev_stat.vs_write_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); } - mutex_exit(&zio->io_vd->vdev_stat_lock); - - (void) zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, - zio->io_vd, &zio->io_bookmark, zio, 0, 0); } if ((zio->io_error == EIO || !(zio->io_flags & @@ -4578,7 +4580,7 @@ zio_done(zio_t *zio) */ spa_log_error(zio->io_spa, &zio->io_bookmark); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, - zio->io_spa, NULL, &zio->io_bookmark, zio, 0, 0); + zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index fcd9684603..725afe2f05 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -346,7 +346,7 @@ tags = ['functional', 'cli_root', 'zpool_detach'] [tests/functional/cli_root/zpool_events] tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow', - 'zpool_events_poolname', 'zpool_events_errors'] + 'zpool_events_poolname', 'zpool_events_errors', 'zpool_events_duplicates'] tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index bf8b67e750..4c11bf1463 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -105,6 +105,7 @@ export SYSTEM_FILES_COMMON='arp umask umount uname + uniq uuidgen vmstat wait diff --git 
a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index ad28113953..da7bc1613d 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -82,6 +82,7 @@ VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max +ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max ZIO_SLOW_IO_MS zio.slow_io_ms zio_slow_io_ms %%%% while read name FreeBSD Linux; do diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/.gitignore b/tests/zfs-tests/tests/functional/cli_root/zpool_events/.gitignore new file mode 100644 index 0000000000..a1f8c14838 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/.gitignore @@ -0,0 +1 @@ +/ereports diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_events/Makefile.am index 7fb6e4f7a5..99c46f0143 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_events/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/Makefile.am @@ -1,4 +1,8 @@ +include $(top_srcdir)/config/Rules.am + pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_events +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_events + dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ @@ -6,8 +10,16 @@ dist_pkgdata_SCRIPTS = \ zpool_events_cliargs.ksh \ zpool_events_follow.ksh \ zpool_events_poolname.ksh \ - zpool_events_errors.ksh + zpool_events_errors.ksh \ + zpool_events_duplicates.ksh dist_pkgdata_DATA = \ zpool_events.cfg \ zpool_events.kshlib + +ereports_LDADD = \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libzfs/libzfs.la + +pkgexec_PROGRAMS = ereports +ereports_SOURCES = ereports.c diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/ereports.c 
b/tests/zfs-tests/tests/functional/cli_root/zpool_events/ereports.c
new file mode 100644
index 0000000000..f825240000
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/ereports.c
@@ -0,0 +1,191 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2020 by Delphix. All rights reserved.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Command to output io and checksum ereport values, one per line.
+ * Used by zpool_events_duplicates.ksh to check for duplicate events.
+ *
+ * example output line:
+ *
+ * checksum "error_pool" 0x856dd01ce52e336 0x000034 0x000400 0x000a402c00
+ * 0x000004 0x000000 0x000000 0x000000 0x000001
+ */
+
+/*
+ * Our ereport duplicate criteria
+ *
+ * When the class and all of these values match, then an ereport is
+ * considered to be a duplicate.
+ */
+static const char *criteria_name[] = {
+	FM_EREPORT_PAYLOAD_ZFS_POOL,
+	FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+	FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
+	FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+	FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+	FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
+
+	/* logical zio criteria (optional) */
+	FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
+	FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
+	FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
+	FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
+};
+
+#define	CRITERIA_NAMES_COUNT	ARRAY_SIZE(criteria_name)
+
+/*
+ * Print one line for an ereport: its class with the "ereport.fs.zfs."
+ * prefix removed, followed by each criteria_name[] value, tab-separated.
+ * Criteria missing from the nvlist are printed as "--------". Events
+ * whose class lacks the prefix, and 'data' ereports, produce no output.
+ */
+static void
+print_ereport_line(nvlist_t *nvl)
+{
+	char *class;
+	int last = CRITERIA_NAMES_COUNT - 1;
+
+	/*
+	 * For the test case context, we only want to see 'io' and
+	 * 'checksum' subclass. We skip 'data' to minimize the output.
+	 * The prefix must match at the start of the class, since the
+	 * subclass is printed from a fixed offset below.
+	 */
+	if (nvlist_lookup_string(nvl, FM_CLASS, &class) != 0 ||
+	    strstr(class, "ereport.fs.zfs.") != class ||
+	    strcmp(class, "ereport.fs.zfs.data") == 0) {
+		return;
+	}
+
+	(void) printf("%s\t", class + strlen("ereport.fs.zfs."));
+
+	for (int i = 0; i < CRITERIA_NAMES_COUNT; i++) {
+		nvpair_t *nvp;
+		uint32_t i32 = 0;
+		uint64_t i64 = 0;
+		char *str = NULL;
+
+		if (nvlist_lookup_nvpair(nvl, criteria_name[i], &nvp) != 0) {
+			/* print a proxy for optional criteria */
+			(void) printf("--------");
+			(void) printf("%c", i == last ? '\n' : '\t');
+			continue;
+		}
+
+		switch (nvpair_type(nvp)) {
+		case DATA_TYPE_STRING:
+			(void) nvpair_value_string(nvp, &str);
+			(void) printf("\"%s\"", str ? str : "");
+			break;
+
+		case DATA_TYPE_INT32:
+			(void) nvpair_value_int32(nvp, (void *)&i32);
+			(void) printf("0x%06x", i32);
+			break;
+
+		case DATA_TYPE_UINT32:
+			(void) nvpair_value_uint32(nvp, &i32);
+			(void) printf("0x%06x", i32);
+			break;
+
+		case DATA_TYPE_INT64:
+			(void) nvpair_value_int64(nvp, (void *)&i64);
+			(void) printf("0x%06llx", (u_longlong_t)i64);
+			break;
+
+		case DATA_TYPE_UINT64:
+			(void) nvpair_value_uint64(nvp, &i64);
+			if (strcmp(FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+			    criteria_name[i]) == 0)
+				(void) printf("0x%010llx", (u_longlong_t)i64);
+			else
+				(void) printf("0x%06llx", (u_longlong_t)i64);
+			break;
+		default:
+			(void) printf("");
+			break;
+		}
+		(void) printf("%c", i == last ? '\n' : '\t');
+	}
+}
+
+/*
+ * Print every event returned by zpool_events_next() for zevent_fd, using
+ * ZEVENT_NONBLOCK, until the call fails or returns no event. A non-zero
+ * dropped count is reported on stdout as well.
+ */
+static void
+ereports_dump(libzfs_handle_t *zhdl, int zevent_fd)
+{
+	nvlist_t *nvl;
+	int ret, dropped;
+
+	while (1) {
+		ret = zpool_events_next(zhdl, &nvl, &dropped, ZEVENT_NONBLOCK,
+		    zevent_fd);
+		if (ret || nvl == NULL)
+			break;
+		if (dropped > 0)
+			(void) fprintf(stdout, "dropped %d events\n", dropped);
+		print_ereport_line(nvl);
+		(void) fflush(stdout);
+		nvlist_free(nvl);
+	}
+}
+
+/*
+ * Initialize libzfs, open ZFS_DEV, dump the available ereports to stdout,
+ * and return 0. Exits with status 2 if libzfs_init() or open() fails.
+ */
+/* ARGSUSED */
+int
+main(int argc, char **argv)
+{
+	libzfs_handle_t *hdl;
+	int fd;
+
+	hdl = libzfs_init();
+	if (hdl == NULL) {
+		(void) fprintf(stderr, "libzfs_init: %s\n", strerror(errno));
+		exit(2);
+	}
+	fd = open(ZFS_DEV, O_RDWR);
+	if (fd < 0) {
+		(void) fprintf(stderr, "open: %s\n", strerror(errno));
+		libzfs_fini(hdl);
+		exit(2);
+	}
+
+	ereports_dump(hdl, fd);
+
+	(void) close(fd);
+	libzfs_fini(hdl);
+
+	return (0);
+}
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_duplicates.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_duplicates.ksh
new file mode 100755
index 0000000000..1ba7b1b344
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_duplicates.ksh
@@ -0,0 +1,159 @@
+#!/bin/ksh -p
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+# Copyright (c) 2020 by Delphix. All rights reserved.
+#
+
+# DESCRIPTION:
+# Verify that duplicate I/O ereport errors are not posted
+#
+# STRATEGY:
+# 1. Create a mirror pool
+# 2. Inject duplicate read/write IO errors and checksum errors
+# 3. Verify there are no duplicate events being posted
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "both"
+
+MOUNTDIR=$TEST_BASE_DIR/mount
+FILEPATH=$MOUNTDIR/badfile
+VDEV1=$TEST_BASE_DIR/vfile1
+VDEV2=$TEST_BASE_DIR/vfile2
+POOL=error_pool
+FILESIZE="10M"
+OLD_LEN_MAX=$(get_tunable ZEVENT_LEN_MAX)
+RETAIN_MAX=$(get_tunable ZEVENT_RETAIN_MAX)
+
+EREPORTS="$STF_SUITE/tests/functional/cli_root/zpool_events/ereports"
+
+duplicates=false
+
+#
+# Restore ZEVENT_LEN_MAX to its saved value, run "zinject -c all", destroy
+# the test pool if it still exists, and remove the vdev files.
+#
+function cleanup
+{
+	log_must set_tunable64 ZEVENT_LEN_MAX $OLD_LEN_MAX
+
+	log_must zinject -c all
+	if poolexists $POOL ; then
+		destroy_pool $POOL
+	fi
+	log_must rm -f $VDEV1 $VDEV2
+}
+
+log_assert "Duplicate I/O ereport errors are not posted"
+log_note "zevent retain max setting: $RETAIN_MAX"
+
+log_onexit cleanup
+
+# Set our threshold high to avoid dropping events.
+log_must set_tunable64 ZEVENT_LEN_MAX 20000
+
+log_must truncate -s $MINVDEVSIZE $VDEV1 $VDEV2
+log_must mkdir -p $MOUNTDIR
+
+#
+# $1: test type - corrupt (checksum error), io
+# $2: read, write
+function do_dup_test
+{
+	ERR=$1
+	RW=$2
+
+	log_note "Testing $ERR $RW ereports"
+	log_must zpool create -f -m $MOUNTDIR -o failmode=continue $POOL mirror $VDEV1 $VDEV2
+	log_must zpool events -c
+	log_must zfs set compression=off $POOL
+
+	if [ "$RW" == "read" ] ; then
+		log_must mkfile $FILESIZE $FILEPATH
+
+		# unmount and mount filesystems to purge file from ARC
+		# to force reads to go through error inject handler
+		log_must zfs unmount $POOL
+		log_must zfs mount $POOL
+
+		# all reads from this file get an error
+		if [ "$ERR" == "corrupt" ] ; then
+			log_must zinject -a -t data -e checksum -T read $FILEPATH
+		else
+			log_must zinject -a -t data -e io -T read $FILEPATH
+		fi
+
+		# Read the file a few times to generate some
+		# duplicate errors of the same blocks
+		# shellcheck disable=SC2034
+		for i in {1..15}; do
+			dd if=$FILEPATH of=/dev/null bs=128K > /dev/null 2>&1
+		done
+		log_must zinject -c all
+	fi
+
+	log_must zinject -d $VDEV1 -e $ERR -T $RW -f 100 $POOL
+
+	if [ "$RW" == "write" ] ; then
+		log_must mkfile $FILESIZE $FILEPATH
+		log_must zpool sync $POOL
+	else
+		# scrub twice to generate some duplicates
+		log_must zpool scrub $POOL
+		log_must zpool wait -t scrub $POOL
+		log_must zpool scrub $POOL
+		log_must zpool wait -t scrub $POOL
+	fi
+
+	log_must zinject -c all
+
+	# Wait for the pool to settle down and finish resilvering (if
+	# necessary). We want the errors to stop incrementing before we
+	# check for duplicates.
+	zpool wait -t resilver $POOL
+
+	ereports="$($EREPORTS | sort)"
+	actual=$(echo "$ereports" | wc -l)
+	unique=$(echo "$ereports" | uniq | wc -l)
+	log_note "$actual total $ERR $RW ereports where $unique were unique"
+
+	if [ $actual -gt $unique ] ; then
+		log_note "UNEXPECTED -- $((actual-unique)) duplicate $ERR $RW ereports"
+		echo "$ereports"
+		duplicates=true
+	fi
+
+	log_must zpool destroy $POOL
+}
+
+do_dup_test "corrupt" "read"
+do_dup_test "io" "read"
+do_dup_test "io" "write"
+
+if $duplicates; then
+	log_fail "FAILED -- Duplicate I/O ereport errors encountered"
+else
+	log_pass "Duplicate I/O ereport errors are not posted"
+fi
+