Add slow disk diagnosis to ZED

Slow disk response times can be indicative of a failing drive. ZFS
already tracks slow I/Os (those slower than zio_slow_io_ms) and
generates events (ereport.fs.zfs.delay), but ZED takes no action on
them the way it does for checksum or I/O errors.  This change adds
slow disk diagnosis to ZED; it is opt-in via two new vdev properties:
  VDEV_PROP_SLOW_IO_N
  VDEV_PROP_SLOW_IO_T

If multiple vdevs in a pool are experiencing slow I/Os, the diagnosis
skips the zpool_vdev_degrade() call, since a shared cause is then more
likely than a single failing disk.
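
A minimal opt-in sketch (pool and disk names are placeholders):

  # Degrade the vdev once 5 slow I/Os are seen within a 60 second window
  zpool set slow_io_n=5 tank sda
  zpool set slow_io_t=60 tank sda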

Sponsored-By: OpenDrives Inc.
Sponsored-By: Klara Inc.
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Rob Wing <rob.wing@klarasystems.com>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Closes #15469
Don Brady 2024-02-08 10:19:52 -07:00 committed by GitHub
parent 229b9f4ed0
commit cbe882298e
29 changed files with 655 additions and 71 deletions

View File

@@ -22,6 +22,7 @@
  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  *
  * Copyright (c) 2016, Intel Corporation.
+ * Copyright (c) 2023, Klara Inc.
  */
 /*
@@ -231,28 +232,6 @@ fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name)
 	if (strcmp(name, "spare_on_remove") == 0)
 		return (1);
-	if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0)
-		return (10);	/* N = 10 events */
-
-	return (0);
-}
-
-int64_t
-fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name)
-{
-	(void) hdl;
-
-	/*
-	 * These can be looked up in mp->modinfo->fmdi_props
-	 * For now we just hard code for phase 2. In the
-	 * future, there can be a ZED based override.
-	 */
-	if (strcmp(name, "remove_timeout") == 0)
-		return (15ULL * 1000ULL * 1000ULL * 1000ULL);	/* 15 sec */
-	if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0)
-		return (1000ULL * 1000ULL * 1000ULL * 600ULL);	/* 10 min */
-
 	return (0);
 }
@@ -535,6 +514,19 @@ fmd_serd_exists(fmd_hdl_t *hdl, const char *name)
 	return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL);
 }
 
+int
+fmd_serd_active(fmd_hdl_t *hdl, const char *name)
+{
+	fmd_module_t *mp = (fmd_module_t *)hdl;
+	fmd_serd_eng_t *sgp;
+
+	if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
+		zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
+		return (0);
+	}
+	return (fmd_serd_eng_fired(sgp) || !fmd_serd_eng_empty(sgp));
+}
+
 void
 fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
 {
@@ -543,12 +535,10 @@ fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
 	if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
 		zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
-		return;
+	} else {
+		fmd_serd_eng_reset(sgp);
+		fmd_hdl_debug(hdl, "serd_reset %s", name);
 	}
-
-	fmd_serd_eng_reset(sgp);
-
-	fmd_hdl_debug(hdl, "serd_reset %s", name);
 }
 int
@@ -556,16 +546,21 @@ fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep)
 {
 	fmd_module_t *mp = (fmd_module_t *)hdl;
 	fmd_serd_eng_t *sgp;
-	int err;
 
 	if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
 		zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'",
 		    name);
 		return (0);
 	}
-	err = fmd_serd_eng_record(sgp, ep->ev_hrt);
-
-	return (err);
+	return (fmd_serd_eng_record(sgp, ep->ev_hrt));
+}
+
+void
+fmd_serd_gc(fmd_hdl_t *hdl)
+{
+	fmd_module_t *mp = (fmd_module_t *)hdl;
+
+	fmd_serd_hash_apply(&mp->mod_serds, fmd_serd_eng_gc, NULL);
 }
 
 /* FMD Timers */
@@ -579,7 +574,7 @@ _timer_notify(union sigval sv)
 	const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
 	struct itimerspec its;
 
-	fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid);
+	fmd_hdl_debug(hdl, "%s timer fired (%p)", mp->mod_name, ftp->ft_tid);
 
 	/* disarm the timer */
 	memset(&its, 0, sizeof (struct itimerspec));

View File

@@ -151,7 +151,6 @@ extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list);
 extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...);
 
 extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *);
-extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *);
 
 #define	FMD_STAT_NOALLOC	0x0	/* fmd should use caller's memory */
 #define	FMD_STAT_ALLOC		0x1	/* fmd should allocate stats memory */
@@ -195,10 +194,12 @@ extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *);
 extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t);
 extern void fmd_serd_destroy(fmd_hdl_t *, const char *);
 extern int fmd_serd_exists(fmd_hdl_t *, const char *);
+extern int fmd_serd_active(fmd_hdl_t *, const char *);
 extern void fmd_serd_reset(fmd_hdl_t *, const char *);
 extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *);
 extern int fmd_serd_fired(fmd_hdl_t *, const char *);
 extern int fmd_serd_empty(fmd_hdl_t *, const char *);
+extern void fmd_serd_gc(fmd_hdl_t *);
 
 extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t);
 extern void fmd_timer_remove(fmd_hdl_t *, id_t);

View File

@@ -310,8 +310,9 @@ fmd_serd_eng_reset(fmd_serd_eng_t *sgp)
 }
 
 void
-fmd_serd_eng_gc(fmd_serd_eng_t *sgp)
+fmd_serd_eng_gc(fmd_serd_eng_t *sgp, void *arg)
 {
+	(void) arg;
 	fmd_serd_elem_t *sep, *nep;
 	hrtime_t hrt;

View File

@@ -77,7 +77,7 @@ extern int fmd_serd_eng_fired(fmd_serd_eng_t *);
 extern int fmd_serd_eng_empty(fmd_serd_eng_t *);
 extern void fmd_serd_eng_reset(fmd_serd_eng_t *);
-extern void fmd_serd_eng_gc(fmd_serd_eng_t *);
+extern void fmd_serd_eng_gc(fmd_serd_eng_t *, void *);
 
 #ifdef __cplusplus
 }

View File

@@ -23,6 +23,7 @@
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2016, Intel Corporation.
+ * Copyright (c) 2023, Klara Inc.
  */
 
 #include <stddef.h>
@@ -47,11 +48,16 @@
 #define	DEFAULT_CHECKSUM_T	600	/* seconds */
 #define	DEFAULT_IO_N		10	/* events */
 #define	DEFAULT_IO_T		600	/* seconds */
+#define	DEFAULT_SLOW_IO_N	10	/* events */
+#define	DEFAULT_SLOW_IO_T	30	/* seconds */
+
+#define	CASE_GC_TIMEOUT_SECS	43200	/* 12 hours */
 
 /*
- * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This
- * #define reserves enough space for two 64-bit hex values plus the length of
- * the longest string.
+ * Our serd engines are named in the following format:
+ *     'zfs_<pool_guid>_<vdev_guid>_{checksum,io,slow_io}'
+ * This #define reserves enough space for two 64-bit hex values plus the
+ * length of the longest string.
  */
 #define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))
@ -68,6 +74,7 @@ typedef struct zfs_case_data {
int zc_pool_state; int zc_pool_state;
char zc_serd_checksum[MAX_SERDLEN]; char zc_serd_checksum[MAX_SERDLEN];
char zc_serd_io[MAX_SERDLEN]; char zc_serd_io[MAX_SERDLEN];
char zc_serd_slow_io[MAX_SERDLEN];
int zc_has_remove_timer; int zc_has_remove_timer;
} zfs_case_data_t; } zfs_case_data_t;
@@ -114,7 +121,8 @@ zfs_de_stats_t zfs_stats = {
 	{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
 };
 
-static hrtime_t zfs_remove_timeout;
+/* wait 15 seconds after a removal */
+static hrtime_t zfs_remove_timeout = SEC2NSEC(15);
 
 uu_list_pool_t *zfs_case_pool;
 uu_list_t *zfs_cases;
@ -124,6 +132,8 @@ uu_list_t *zfs_cases;
#define ZFS_MAKE_EREPORT(type) \ #define ZFS_MAKE_EREPORT(type) \
FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
static void zfs_purge_cases(fmd_hdl_t *hdl);
/* /*
* Write out the persistent representation of an active case. * Write out the persistent representation of an active case.
*/ */
@ -170,6 +180,42 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
return (zcp); return (zcp);
} }
/*
* count other unique slow-io cases in a pool
*/
static uint_t
zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case)
{
zfs_case_t *zcp;
uint_t cases = 0;
static hrtime_t next_check = 0;
/*
* Note that plumbing in some external GC would require adding locking,
* since most of this module code is not thread safe and assumes there
* is only one thread running against the module. So we perform GC here
* inline periodically so that future delay induced faults will be
* possible once the issue causing multiple vdev delays is resolved.
*/
if (gethrestime_sec() > next_check) {
/* Periodically purge old SERD entries and stale cases */
fmd_serd_gc(hdl);
zfs_purge_cases(hdl);
next_check = gethrestime_sec() + CASE_GC_TIMEOUT_SECS;
}
for (zcp = uu_list_first(zfs_cases); zcp != NULL;
zcp = uu_list_next(zfs_cases, zcp)) {
if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid &&
zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid &&
zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) {
cases++;
}
}
return (cases);
}
/* /*
* Iterate over any active cases. If any cases are associated with a pool or * Iterate over any active cases. If any cases are associated with a pool or
* vdev which is no longer present on the system, close the associated case. * vdev which is no longer present on the system, close the associated case.
@ -376,6 +422,14 @@ zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
(long long unsigned int)vdev_guid, type); (long long unsigned int)vdev_guid, type);
} }
static void
zfs_case_retire(fmd_hdl_t *hdl, zfs_case_t *zcp)
{
fmd_hdl_debug(hdl, "retiring case");
fmd_case_close(hdl, zcp->zc_case);
}
/* /*
* Solve a given ZFS case. This first checks to make sure the diagnosis is * Solve a given ZFS case. This first checks to make sure the diagnosis is
* still valid, as well as cleaning up any pending timer associated with the * still valid, as well as cleaning up any pending timer associated with the
@@ -632,9 +686,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 		if (strcmp(class,
 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
 		    strcmp(class,
-		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
-		    strcmp(class,
-		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0) {
 			zfs_stats.resource_drops.fmds_value.ui64++;
 			return;
 		}
@ -702,6 +754,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
if (zcp->zc_data.zc_serd_checksum[0] != '\0') if (zcp->zc_data.zc_serd_checksum[0] != '\0')
fmd_serd_reset(hdl, fmd_serd_reset(hdl,
zcp->zc_data.zc_serd_checksum); zcp->zc_data.zc_serd_checksum);
if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
fmd_serd_reset(hdl,
zcp->zc_data.zc_serd_slow_io);
} else if (fmd_nvl_class_match(hdl, nvl, } else if (fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) { ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
uint64_t state = 0; uint64_t state = 0;
@@ -730,7 +785,11 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 		if (fmd_case_solved(hdl, zcp->zc_case))
 			return;
 
-	fmd_hdl_debug(hdl, "error event '%s'", class);
+	if (vdev_guid)
+		fmd_hdl_debug(hdl, "error event '%s', vdev %llu", class,
+		    vdev_guid);
+	else
+		fmd_hdl_debug(hdl, "error event '%s'", class);
/* /*
* Determine if we should solve the case and generate a fault. We solve * Determine if we should solve the case and generate a fault. We solve
@ -779,6 +838,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
fmd_nvl_class_match(hdl, nvl, fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) || ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
fmd_nvl_class_match(hdl, nvl, fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) ||
fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
const char *failmode = NULL; const char *failmode = NULL;
boolean_t checkremove = B_FALSE; boolean_t checkremove = B_FALSE;
@ -814,6 +875,51 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
} }
if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep)) if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
checkremove = B_TRUE; checkremove = B_TRUE;
} else if (fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) {
uint64_t slow_io_n, slow_io_t;
/*
* Create a slow io SERD engine when the VDEV has the
* 'vdev_slow_io_n' and 'vdev_slow_io_t' properties.
*/
if (zcp->zc_data.zc_serd_slow_io[0] == '\0' &&
nvlist_lookup_uint64(nvl,
FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
&slow_io_n) == 0 &&
nvlist_lookup_uint64(nvl,
FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
&slow_io_t) == 0) {
zfs_serd_name(zcp->zc_data.zc_serd_slow_io,
pool_guid, vdev_guid, "slow_io");
fmd_serd_create(hdl,
zcp->zc_data.zc_serd_slow_io,
slow_io_n,
SEC2NSEC(slow_io_t));
zfs_case_serialize(zcp);
}
/* Pass event to SERD engine and see if this triggers */
if (zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io,
ep)) {
/*
* Ignore a slow io diagnosis when other
* VDEVs in the pool show signs of being slow.
*/
if (zfs_other_slow_cases(hdl, &zcp->zc_data)) {
zfs_case_retire(hdl, zcp);
fmd_hdl_debug(hdl, "pool %llu has "
"multiple slow io cases -- skip "
"degrading vdev %llu",
(u_longlong_t)
zcp->zc_data.zc_pool_guid,
(u_longlong_t)
zcp->zc_data.zc_vdev_guid);
} else {
zfs_case_solve(hdl, zcp,
"fault.fs.zfs.vdev.slow_io");
}
}
} else if (fmd_nvl_class_match(hdl, nvl, } else if (fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) { ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
/* /*
@ -924,6 +1030,8 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum); fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
if (zcp->zc_data.zc_serd_io[0] != '\0') if (zcp->zc_data.zc_serd_io[0] != '\0')
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io); fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_slow_io);
if (zcp->zc_data.zc_has_remove_timer) if (zcp->zc_data.zc_has_remove_timer)
fmd_timer_remove(hdl, zcp->zc_remove_timer); fmd_timer_remove(hdl, zcp->zc_remove_timer);
@@ -932,30 +1040,15 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
 	fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
 }
 
-/*
- * We use the fmd gc entry point to look for old cases that no longer apply.
- * This allows us to keep our set of case data small in a long running system.
- */
-static void
-zfs_fm_gc(fmd_hdl_t *hdl)
-{
-	zfs_purge_cases(hdl);
-}
-
 static const fmd_hdl_ops_t fmd_ops = {
 	zfs_fm_recv,	/* fmdo_recv */
 	zfs_fm_timeout,	/* fmdo_timeout */
 	zfs_fm_close,	/* fmdo_close */
 	NULL,		/* fmdo_stats */
-	zfs_fm_gc,	/* fmdo_gc */
+	NULL,		/* fmdo_gc */
 };
 
 static const fmd_prop_t fmd_props[] = {
-	{ "checksum_N", FMD_TYPE_UINT32, "10" },
-	{ "checksum_T", FMD_TYPE_TIME, "10min" },
-	{ "io_N", FMD_TYPE_UINT32, "10" },
-	{ "io_T", FMD_TYPE_TIME, "10min" },
-	{ "remove_timeout", FMD_TYPE_TIME, "15sec" },
 	{ NULL, 0, NULL }
 };
 
@@ -996,8 +1089,6 @@ _zfs_diagnosis_init(fmd_hdl_t *hdl)
 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
 	    sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);
-
-	zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
 }
 
 void

View File

@@ -523,6 +523,9 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 		} else if (fmd_nvl_class_match(hdl, fault,
 		    "fault.fs.zfs.vdev.checksum")) {
 			degrade_device = B_TRUE;
+		} else if (fmd_nvl_class_match(hdl, fault,
+		    "fault.fs.zfs.vdev.slow_io")) {
+			degrade_device = B_TRUE;
 		} else if (fmd_nvl_class_match(hdl, fault,
 		    "fault.fs.zfs.device")) {
 			fault_device = B_FALSE;
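
For context, a typical recovery sketch once the underlying slowness is resolved (standard zpool workflow; pool and device names are placeholders):

  # Clear the degraded state and error counters, then re-check slow I/O counts
  zpool clear tank sda
  zpool status -s tank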

View File

@@ -1083,6 +1083,22 @@ main(int argc, char **argv)
 			libzfs_fini(g_zfs);
 			return (1);
 		}
 
+		if (record.zi_nlanes) {
+			switch (io_type) {
+			case ZIO_TYPE_READ:
+			case ZIO_TYPE_WRITE:
+			case ZIO_TYPES:
+				break;
+			default:
+				(void) fprintf(stderr, "I/O type for a delay "
+				    "must be 'read' or 'write'\n");
+				usage();
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+		}
+
 		if (!error)
 			error = ENXIO;
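
A usage sketch for the new delay-injection restriction, mirroring the test scripts later in this commit (vdev path and pool name are placeholders):

  # Inject a 10ms delay with one lane on reads only, then clean up
  zinject -d /var/tmp/vdevfile.0 -D10:1 -T read tank
  zinject -c all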

View File

@@ -2569,7 +2569,13 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
 			break;
 		case VDEV_AUX_ERR_EXCEEDED:
-			(void) printf(gettext("too many errors"));
+			if (vs->vs_read_errors + vs->vs_write_errors +
+			    vs->vs_checksum_errors == 0 && children == 0 &&
+			    vs->vs_slow_ios > 0) {
+				(void) printf(gettext("too many slow I/Os"));
+			} else {
+				(void) printf(gettext("too many errors"));
+			}
 			break;
 		case VDEV_AUX_IO_FAILURE:
View File

@ -82,6 +82,8 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T "vdev_cksum_t" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T "vdev_cksum_t"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N "vdev_io_n" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N "vdev_io_n"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T "vdev_io_t" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T "vdev_io_t"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N "vdev_slow_io_n"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T "vdev_slow_io_t"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"

View File

@ -366,6 +366,8 @@ typedef enum {
VDEV_PROP_IO_N, VDEV_PROP_IO_N,
VDEV_PROP_IO_T, VDEV_PROP_IO_T,
VDEV_PROP_RAIDZ_EXPANDING, VDEV_PROP_RAIDZ_EXPANDING,
VDEV_PROP_SLOW_IO_N,
VDEV_PROP_SLOW_IO_T,
VDEV_NUM_PROPS VDEV_NUM_PROPS
} vdev_prop_t; } vdev_prop_t;

View File

@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2023, Klara Inc.
  */
 
 #ifndef _SYS_VDEV_IMPL_H
@@ -454,12 +455,14 @@ struct vdev {
 	zfs_ratelimit_t vdev_checksum_rl;
 
 	/*
-	 * Checksum and IO thresholds for tuning ZED
+	 * Vdev properties for tuning ZED
 	 */
 	uint64_t	vdev_checksum_n;
 	uint64_t	vdev_checksum_t;
 	uint64_t	vdev_io_n;
 	uint64_t	vdev_io_t;
+	uint64_t	vdev_slow_io_n;
+	uint64_t	vdev_slow_io_t;
 };
 
 #define	VDEV_PAD_SIZE	(8 << 10)

View File

@@ -5626,7 +5626,9 @@
     <enumerator name='VDEV_PROP_IO_N' value='44'/>
     <enumerator name='VDEV_PROP_IO_T' value='45'/>
     <enumerator name='VDEV_PROP_RAIDZ_EXPANDING' value='46'/>
-    <enumerator name='VDEV_NUM_PROPS' value='47'/>
+    <enumerator name='VDEV_PROP_SLOW_IO_N' value='47'/>
+    <enumerator name='VDEV_PROP_SLOW_IO_T' value='48'/>
+    <enumerator name='VDEV_NUM_PROPS' value='49'/>
     </enum-decl>
     <typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
     <class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>

View File

@ -5264,6 +5264,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_CHECKSUM_T:
case VDEV_PROP_IO_N: case VDEV_PROP_IO_N:
case VDEV_PROP_IO_T: case VDEV_PROP_IO_T:
case VDEV_PROP_SLOW_IO_N:
case VDEV_PROP_SLOW_IO_T:
if (intval == UINT64_MAX) { if (intval == UINT64_MAX) {
(void) strlcpy(buf, "-", len); (void) strlcpy(buf, "-", len);
} else { } else {

View File

@@ -1704,7 +1704,9 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop,
 		    (prop == VDEV_PROP_CHECKSUM_N ||
 		    prop == VDEV_PROP_CHECKSUM_T ||
 		    prop == VDEV_PROP_IO_N ||
-		    prop == VDEV_PROP_IO_T)) {
+		    prop == VDEV_PROP_IO_T ||
+		    prop == VDEV_PROP_SLOW_IO_N ||
+		    prop == VDEV_PROP_SLOW_IO_T)) {
 			*ivalp = UINT64_MAX;
 		}

View File

@@ -44,7 +44,7 @@ section, below.
 Every vdev has a set of properties that export statistics about the vdev
 as well as control various behaviors.
 Properties are not inherited from top-level vdevs, with the exception of
-checksum_n, checksum_t, io_n, and io_t.
+checksum_n, checksum_t, io_n, io_t, slow_io_n, and slow_io_t.
 .Pp
 The values of numeric properties can be specified using human-readable suffixes
 .Po for example,
@@ -117,7 +117,7 @@ If this device is currently being removed from the pool
 .Pp
 The following native properties can be used to change the behavior of a vdev.
 .Bl -tag -width "allocating"
-.It Sy checksum_n , checksum_t , io_n , io_t
+.It Sy checksum_n , checksum_t , io_n , io_t , slow_io_n , slow_io_t
 Tune the fault management daemon by specifying checksum/io thresholds of <N>
 errors in <T> seconds, respectively.
 These properties can be set on leaf and top-level vdevs.
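
An illustrative tuning sketch (pool and vdev names are placeholders); because slow_io_n and slow_io_t are among the properties inherited from a top-level vdev, setting them there covers its leaves:

  # Require 5 slow I/Os within 60 seconds before ZED degrades a leaf vdev
  zpool set slow_io_n=5 tank raidz1-0
  zpool set slow_io_t=60 tank raidz1-0
  zpool get slow_io_n,slow_io_t tank raidz1-0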

View File

@@ -260,8 +260,8 @@ sufficient replicas exist to continue functioning.
 The underlying conditions are as follows:
 .Bl -bullet -compact
 .It
-The number of checksum errors exceeds acceptable levels and the device is
-degraded as an indication that something may be wrong.
+The number of checksum errors or slow I/Os exceeds acceptable levels and the
+device is degraded as an indication that something may be wrong.
 ZFS continues to use the device as necessary.
 .It
 The number of I/O errors exceeds acceptable levels.

View File

@ -69,6 +69,7 @@ Force a vdev into the DEGRADED or FAULTED state.
.Nm zinject .Nm zinject
.Fl d Ar vdev .Fl d Ar vdev
.Fl D Ar latency : Ns Ar lanes .Fl D Ar latency : Ns Ar lanes
.Op Fl T Ar read|write
.Ar pool .Ar pool
.Xc .Xc
Add an artificial delay to I/O requests on a particular Add an artificial delay to I/O requests on a particular

View File

@ -431,6 +431,12 @@ vdev_prop_init(void)
zprop_register_number(VDEV_PROP_IO_T, "io_t", UINT64_MAX, zprop_register_number(VDEV_PROP_IO_T, "io_t", UINT64_MAX,
PROP_DEFAULT, ZFS_TYPE_VDEV, "<seconds>", "IO_T", B_FALSE, PROP_DEFAULT, ZFS_TYPE_VDEV, "<seconds>", "IO_T", B_FALSE,
sfeatures); sfeatures);
zprop_register_number(VDEV_PROP_SLOW_IO_N, "slow_io_n", UINT64_MAX,
PROP_DEFAULT, ZFS_TYPE_VDEV, "<events>", "SLOW_IO_N", B_FALSE,
sfeatures);
zprop_register_number(VDEV_PROP_SLOW_IO_T, "slow_io_t", UINT64_MAX,
PROP_DEFAULT, ZFS_TYPE_VDEV, "<seconds>", "SLOW_IO_T", B_FALSE,
sfeatures);
/* default index (boolean) properties */ /* default index (boolean) properties */
zprop_register_index(VDEV_PROP_REMOVING, "removing", 0, zprop_register_index(VDEV_PROP_REMOVING, "removing", 0,

View File

@ -677,6 +677,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T); vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);
vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N); vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T); vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);
vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_state_dirty_node);
@ -3755,6 +3757,18 @@ vdev_load(vdev_t *vd)
if (error && error != ENOENT) if (error && error != ENOENT)
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
"failed [error=%d]", (u_longlong_t)zapobj, error); "failed [error=%d]", (u_longlong_t)zapobj, error);
error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N,
&vd->vdev_slow_io_n);
if (error && error != ENOENT)
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
"failed [error=%d]", (u_longlong_t)zapobj, error);
error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T,
&vd->vdev_slow_io_t);
if (error && error != ENOENT)
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
"failed [error=%d]", (u_longlong_t)zapobj, error);
} }
/* /*
@ -5970,6 +5984,20 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
} }
vd->vdev_io_t = intval; vd->vdev_io_t = intval;
break; break;
case VDEV_PROP_SLOW_IO_N:
if (nvpair_value_uint64(elem, &intval) != 0) {
error = EINVAL;
break;
}
vd->vdev_slow_io_n = intval;
break;
case VDEV_PROP_SLOW_IO_T:
if (nvpair_value_uint64(elem, &intval) != 0) {
error = EINVAL;
break;
}
vd->vdev_slow_io_t = intval;
break;
default: default:
/* Most processing is done in vdev_props_set_sync */ /* Most processing is done in vdev_props_set_sync */
break; break;
@ -6313,6 +6341,8 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_CHECKSUM_T:
case VDEV_PROP_IO_N: case VDEV_PROP_IO_N:
case VDEV_PROP_IO_T: case VDEV_PROP_IO_T:
case VDEV_PROP_SLOW_IO_N:
case VDEV_PROP_SLOW_IO_T:
err = vdev_prop_get_int(vd, prop, &intval); err = vdev_prop_get_int(vd, prop, &intval);
if (err && err != ENOENT) if (err && err != ENOENT)
break; break;

View File

@ -222,6 +222,12 @@ vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop)
case VDEV_PROP_IO_T: case VDEV_PROP_IO_T:
propval = vd->vdev_io_t; propval = vd->vdev_io_t;
break; break;
case VDEV_PROP_SLOW_IO_N:
propval = vd->vdev_slow_io_n;
break;
case VDEV_PROP_SLOW_IO_T:
propval = vd->vdev_slow_io_t;
break;
default: default:
propval = propdef; propval = propdef;
break; break;
@ -741,6 +747,26 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
NULL); NULL);
} }
if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
uint64_t slow_io_n, slow_io_t;
slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N);
if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N))
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
DATA_TYPE_UINT64,
slow_io_n,
NULL);
slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T);
if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T))
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
DATA_TYPE_UINT64,
slow_io_t,
NULL);
}
mutex_exit(&spa->spa_errlist_lock); mutex_exit(&spa->spa_errlist_lock);
*ereport_out = ereport; *ereport_out = ereport;

View File

@ -605,6 +605,10 @@ zio_handle_io_delay(zio_t *zio)
if (vd->vdev_guid != handler->zi_record.zi_guid) if (vd->vdev_guid != handler->zi_record.zi_guid)
continue; continue;
if (handler->zi_record.zi_iotype != ZIO_TYPES &&
handler->zi_record.zi_iotype != zio->io_type)
continue;
/* /*
* Defensive; should never happen as the array allocation * Defensive; should never happen as the array allocation
* occurs prior to inserting this handler on the list. * occurs prior to inserting this handler on the list.

View File

@@ -104,7 +104,8 @@ tags = ['functional', 'devices']
 [tests/functional/events:Linux]
 tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill',
-    'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config']
+    'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config',
+    'zed_slow_io', 'zed_slow_io_many_vdevs']
 tags = ['functional', 'events']
 
 [tests/functional/fadvise:Linux]

View File

@ -1447,6 +1447,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/events/zed_fd_spill.ksh \ functional/events/zed_fd_spill.ksh \
functional/events/zed_io_config.ksh \ functional/events/zed_io_config.ksh \
functional/events/zed_rc_filter.ksh \ functional/events/zed_rc_filter.ksh \
functional/events/zed_slow_io.ksh \
functional/events/zed_slow_io_many_vdevs.ksh \
functional/exec/cleanup.ksh \ functional/exec/cleanup.ksh \
functional/exec/exec_001_pos.ksh \ functional/exec/exec_001_pos.ksh \
functional/exec/exec_002_neg.ksh \ functional/exec/exec_002_neg.ksh \

View File

@ -70,4 +70,6 @@ typeset -a properties=(
checksum_t checksum_t
io_n io_n
io_t io_t
slow_io_n
slow_io_t
) )

View File

@@ -26,8 +26,10 @@
 . $STF_SUITE/include/libtest.shlib
 
-zed_cleanup all-debug.sh all-syslog.sh all-dumpfds
 zed_stop
+zed_cleanup all-debug.sh all-syslog.sh all-dumpfds
+zed_events_drain
 
 default_cleanup

View File

@ -0,0 +1,205 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Klara Inc.
#
# DESCRIPTION:
# Verify that vdev properties, slow_io_n and slow_io_t, work with ZED.
#
# STRATEGY:
# 1. Create a pool with single vdev
# 2. Set slow_io_n/slow_io_t to non-default values
# 3. Inject slow io errors
# 4. Verify that ZED degrades vdev
#
. $STF_SUITE/include/libtest.shlib
TESTDIR="$TEST_BASE_DIR/zed_slow_io"
VDEV="$TEST_BASE_DIR/vdevfile.$$"
TESTPOOL="slow_io_pool"
FILEPATH="$TESTDIR/slow_io.testfile"
OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)
OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND)
verify_runnable "both"
function do_setup
{
log_must truncate -s 1G $VDEV
default_setup_noexit $VDEV
zed_events_drain
log_must zfs set compression=off $TESTPOOL
log_must zfs set primarycache=none $TESTPOOL
log_must zfs set prefetch=none $TESTPOOL
log_must zfs set recordsize=512 $TESTPOOL
for i in {1..10}; do
dd if=/dev/urandom of=${FILEPATH}$i bs=512 count=1 2>/dev/null
done
zpool sync
}
# intermediate cleanup
function do_clean
{
log_must zinject -c all
log_must zpool destroy $TESTPOOL
log_must rm -f $VDEV
}
# final cleanup
function cleanup
{
log_must zinject -c all
# if pool still exists then something failed so log additional info
if poolexists $TESTPOOL ; then
log_note "$(zpool status -s $TESTPOOL)"
echo "=================== zed log search ==================="
grep "Diagnosis Engine" $ZEDLET_DIR/zed.log
destroy_pool $TESTPOOL
fi
log_must zed_stop
log_must rm -f $VDEV
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
}
function start_slow_io
{
zpool sync
log_must set_tunable64 ZIO_SLOW_IO_MS 10
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000
log_must zinject -d $VDEV -D10:1 -T read $TESTPOOL
zpool sync
}
function stop_slow_io
{
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
log_must zinject -c all
}
# Test default ZED settings:
# inject 10 events over 2.5 seconds, should not degrade.
function default_degrade
{
do_setup
start_slow_io
for i in {1..10}; do
dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null
sleep 0.25
done
stop_slow_io
log_note "$(zpool status -s $TESTPOOL)"
# give slow ZED a chance to process the delay events
sleep 18
log_note "$(zpool status -s $TESTPOOL)"
degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l)
log_note $degrades vdev degrades in ZED log
[ $degrades -eq "0" ] || \
log_fail "expecting no degrade events, found $degrades"
do_clean
}
# change slow_io_n, slow_io_t to 5 events in 60 seconds
# fire more than 5 events, should degrade
function slow_io_degrade
{
do_setup
zpool set slow_io_n=5 $TESTPOOL $VDEV
zpool set slow_io_t=60 $TESTPOOL $VDEV
start_slow_io
for i in {1..16}; do
dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null
sleep 0.5
done
stop_slow_io
zpool sync
#
# wait up to 60 seconds for kernel to produce at least 5 delay events
#
typeset -i i=0
typeset -i events=0
while [[ $i -lt 60 ]]; do
events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l)
[[ $events -ge "5" ]] && break
i=$((i+1))
sleep 1
done
log_note "$events delay events found"
if [[ $events -ge "5" ]]; then
log_must wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 10
fi
do_clean
}
# change slow_io_n, slow_io_t to 10 events in 1 second
# inject events spaced 0.5 seconds apart, should not degrade
function slow_io_no_degrade
{
do_setup
zpool set slow_io_n=10 $TESTPOOL $VDEV
zpool set slow_io_t=1 $TESTPOOL $VDEV
start_slow_io
for i in {1..16}; do
dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null
sleep 0.5
done
stop_slow_io
zpool sync
log_mustnot wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 45
do_clean
}
log_assert "Test ZED slow io configurability"
log_onexit cleanup
log_must zed_events_drain
log_must zed_start
default_degrade
slow_io_degrade
slow_io_no_degrade
log_pass "Test ZED slow io configurability"

View File

@ -0,0 +1,177 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Klara Inc.
#
# DESCRIPTION:
# Verify that delay events from multiple vdevs do not degrade any vdev
#
# STRATEGY:
# 1. Create a pool with a 3 disk raidz vdev
# 2. Inject slow io errors
# 3. Verify that ZED detects slow I/Os but doesn't degrade any vdevs
#
. $STF_SUITE/include/libtest.shlib
TESTDIR="$TEST_BASE_DIR/zed_slow_io"
VDEV1="$TEST_BASE_DIR/vdevfile1.$$"
VDEV2="$TEST_BASE_DIR/vdevfile2.$$"
VDEV3="$TEST_BASE_DIR/vdevfile3.$$"
VDEV4="$TEST_BASE_DIR/vdevfile4.$$"
VDEVS="$VDEV1 $VDEV2 $VDEV3 $VDEV4"
TESTPOOL="slow_io_pool"
FILEPATH="$TESTDIR/slow_io.testfile"
OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)
OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND)
verify_runnable "both"
function cleanup
{
log_must zinject -c all
# if pool still exists then something failed so log additional info
if poolexists $TESTPOOL ; then
log_note "$(zpool status -s $TESTPOOL)"
echo "=================== zed log search ==================="
grep "Diagnosis Engine" $ZEDLET_DIR/zed.log
destroy_pool $TESTPOOL
fi
log_must zed_stop
log_must rm -f $VDEVS
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
}
function start_slow_io
{
for vdev in $VDEVS
do
log_must zpool set slow_io_n=4 $TESTPOOL $vdev
log_must zpool set slow_io_t=60 $TESTPOOL $vdev
done
zpool sync
log_must set_tunable64 ZIO_SLOW_IO_MS 10
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000
for vdev in $VDEVS
do
log_must zinject -d $vdev -D10:1 $TESTPOOL
done
zpool sync
}
function stop_slow_io
{
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
log_must zinject -c all
}
function multiple_slow_vdevs_test
{
log_must truncate -s 1G $VDEVS
default_raidz_setup_noexit $VDEVS
log_must zpool events -c
log_must zfs set compression=off $TESTPOOL
log_must zfs set primarycache=none $TESTPOOL
log_must zfs set recordsize=4K $TESTPOOL
log_must dd if=/dev/urandom of=$FILEPATH bs=1M count=20
zpool sync
#
# Read the file with slow io injected on the disks
# This will cause multiple errors on each disk to trip ZED SERD
#
# pool: slow_io_pool
# state: ONLINE
# config:
#
# NAME STATE READ WRITE CKSUM SLOW
# slow_io_pool ONLINE 0 0 0 -
# raidz1-0 ONLINE 0 0 0 -
# /var/tmp/vdevfile1.499278 ONLINE 0 0 0 113
# /var/tmp/vdevfile2.499278 ONLINE 0 0 0 109
# /var/tmp/vdevfile3.499278 ONLINE 0 0 0 96
# /var/tmp/vdevfile4.499278 ONLINE 0 0 0 109
#
start_slow_io
dd if=$FILEPATH of=/dev/null bs=1M count=20 2>/dev/null
stop_slow_io
# count events available for processing
typeset -i i=0
typeset -i events=0
while [[ $i -lt 60 ]]; do
events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l)
[[ $events -ge "50" ]] && break
i=$((i+1))
sleep 1
done
log_note "$events delay events found"
if [[ $events -lt "50" ]]; then
log_note "bailing: not enough events to complete the test"
destroy_pool $TESTPOOL
return
fi
#
# give slow ZED a chance to process the delay events
#
typeset -i i=0
typeset -i skips=0
while [[ $i -lt 75 ]]; do
skips=$(grep "retiring case" \
$ZEDLET_DIR/zed.log | wc -l)
[[ $skips -gt "0" ]] && break
i=$((i+1))
sleep 1
done
log_note $skips degrade skips in ZED log after $i seconds
[ $skips -gt "0" ] || log_fail "expecting to see skips"
degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l)
log_note $degrades vdev degrades in ZED log
[ $degrades -eq "0" ] || \
log_fail "expecting no degrade events, found $degrades"
destroy_pool $TESTPOOL
}
log_assert "Test ZED slow io across multiple vdevs"
log_onexit cleanup
log_must zed_events_drain
log_must zed_start
multiple_slow_vdevs_test
log_pass "Test ZED slow io across multiple vdevs"

View File

@ -32,5 +32,6 @@ cleanup_devices $DISKS
zed_stop zed_stop
zed_cleanup resilver_finish-start-scrub.sh zed_cleanup resilver_finish-start-scrub.sh
zed_events_drain
log_pass log_pass

View File

@ -28,6 +28,7 @@
verify_runnable "global" verify_runnable "global"
zed_events_drain
zed_setup resilver_finish-start-scrub.sh zed_setup resilver_finish-start-scrub.sh
zed_start zed_start