Add slow disk diagnosis to ZED
Slow disk response times can be indicative of a failing drive. ZFS currently tracks slow I/Os (slower than zio_slow_io_ms) and generates events (ereport.fs.zfs.delay). However, unlike for checksum or I/O errors, ZED takes no action on these events. This change adds slow disk diagnosis to ZED, which is opt-in using the new VDEV properties VDEV_PROP_SLOW_IO_N and VDEV_PROP_SLOW_IO_T. If multiple VDEVs in a pool are undergoing slow I/Os, then it skips the zpool_vdev_degrade(). Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Allan Jude <allan@klarasystems.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Co-authored-by: Rob Wing <rob.wing@klarasystems.com> Signed-off-by: Don Brady <don.brady@klarasystems.com> Closes #15469
This commit is contained in:
parent
db65272aef
commit
c1c26a77ff
|
@ -22,6 +22,7 @@
|
|||
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2016, Intel Corporation.
|
||||
* Copyright (c) 2023, Klara Inc.
|
||||
*/
|
||||
|
||||
/*
|
||||
|
@ -231,28 +232,6 @@ fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name)
|
|||
if (strcmp(name, "spare_on_remove") == 0)
|
||||
return (1);
|
||||
|
||||
if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0)
|
||||
return (10); /* N = 10 events */
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
int64_t
|
||||
fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name)
|
||||
{
|
||||
(void) hdl;
|
||||
|
||||
/*
|
||||
* These can be looked up in mp->modinfo->fmdi_props
|
||||
* For now we just hard code for phase 2. In the
|
||||
* future, there can be a ZED based override.
|
||||
*/
|
||||
if (strcmp(name, "remove_timeout") == 0)
|
||||
return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */
|
||||
|
||||
if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0)
|
||||
return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
|
@ -535,6 +514,19 @@ fmd_serd_exists(fmd_hdl_t *hdl, const char *name)
|
|||
return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL);
|
||||
}
|
||||
|
||||
int
|
||||
fmd_serd_active(fmd_hdl_t *hdl, const char *name)
|
||||
{
|
||||
fmd_module_t *mp = (fmd_module_t *)hdl;
|
||||
fmd_serd_eng_t *sgp;
|
||||
|
||||
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
|
||||
zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
|
||||
return (0);
|
||||
}
|
||||
return (fmd_serd_eng_fired(sgp) || !fmd_serd_eng_empty(sgp));
|
||||
}
|
||||
|
||||
void
|
||||
fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
|
||||
{
|
||||
|
@ -543,12 +535,10 @@ fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
|
|||
|
||||
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
|
||||
zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
|
||||
return;
|
||||
} else {
|
||||
fmd_serd_eng_reset(sgp);
|
||||
fmd_hdl_debug(hdl, "serd_reset %s", name);
|
||||
}
|
||||
|
||||
fmd_serd_eng_reset(sgp);
|
||||
|
||||
fmd_hdl_debug(hdl, "serd_reset %s", name);
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -556,16 +546,21 @@ fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep)
|
|||
{
|
||||
fmd_module_t *mp = (fmd_module_t *)hdl;
|
||||
fmd_serd_eng_t *sgp;
|
||||
int err;
|
||||
|
||||
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
|
||||
zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'",
|
||||
name);
|
||||
return (0);
|
||||
}
|
||||
err = fmd_serd_eng_record(sgp, ep->ev_hrt);
|
||||
return (fmd_serd_eng_record(sgp, ep->ev_hrt));
|
||||
}
|
||||
|
||||
return (err);
|
||||
void
|
||||
fmd_serd_gc(fmd_hdl_t *hdl)
|
||||
{
|
||||
fmd_module_t *mp = (fmd_module_t *)hdl;
|
||||
|
||||
fmd_serd_hash_apply(&mp->mod_serds, fmd_serd_eng_gc, NULL);
|
||||
}
|
||||
|
||||
/* FMD Timers */
|
||||
|
@ -579,7 +574,7 @@ _timer_notify(union sigval sv)
|
|||
const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
|
||||
struct itimerspec its;
|
||||
|
||||
fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid);
|
||||
fmd_hdl_debug(hdl, "%s timer fired (%p)", mp->mod_name, ftp->ft_tid);
|
||||
|
||||
/* disarm the timer */
|
||||
memset(&its, 0, sizeof (struct itimerspec));
|
||||
|
|
|
@ -151,7 +151,6 @@ extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list);
|
|||
extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...);
|
||||
|
||||
extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *);
|
||||
extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *);
|
||||
|
||||
#define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */
|
||||
#define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */
|
||||
|
@ -195,10 +194,12 @@ extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *);
|
|||
extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t);
|
||||
extern void fmd_serd_destroy(fmd_hdl_t *, const char *);
|
||||
extern int fmd_serd_exists(fmd_hdl_t *, const char *);
|
||||
extern int fmd_serd_active(fmd_hdl_t *, const char *);
|
||||
extern void fmd_serd_reset(fmd_hdl_t *, const char *);
|
||||
extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *);
|
||||
extern int fmd_serd_fired(fmd_hdl_t *, const char *);
|
||||
extern int fmd_serd_empty(fmd_hdl_t *, const char *);
|
||||
extern void fmd_serd_gc(fmd_hdl_t *);
|
||||
|
||||
extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t);
|
||||
extern void fmd_timer_remove(fmd_hdl_t *, id_t);
|
||||
|
|
|
@ -310,8 +310,9 @@ fmd_serd_eng_reset(fmd_serd_eng_t *sgp)
|
|||
}
|
||||
|
||||
void
|
||||
fmd_serd_eng_gc(fmd_serd_eng_t *sgp)
|
||||
fmd_serd_eng_gc(fmd_serd_eng_t *sgp, void *arg)
|
||||
{
|
||||
(void) arg;
|
||||
fmd_serd_elem_t *sep, *nep;
|
||||
hrtime_t hrt;
|
||||
|
||||
|
|
|
@ -77,7 +77,7 @@ extern int fmd_serd_eng_fired(fmd_serd_eng_t *);
|
|||
extern int fmd_serd_eng_empty(fmd_serd_eng_t *);
|
||||
|
||||
extern void fmd_serd_eng_reset(fmd_serd_eng_t *);
|
||||
extern void fmd_serd_eng_gc(fmd_serd_eng_t *);
|
||||
extern void fmd_serd_eng_gc(fmd_serd_eng_t *, void *);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2016, Intel Corporation.
|
||||
* Copyright (c) 2023, Klara Inc.
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
|
@ -47,11 +48,16 @@
|
|||
#define DEFAULT_CHECKSUM_T 600 /* seconds */
|
||||
#define DEFAULT_IO_N 10 /* events */
|
||||
#define DEFAULT_IO_T 600 /* seconds */
|
||||
#define DEFAULT_SLOW_IO_N 10 /* events */
|
||||
#define DEFAULT_SLOW_IO_T 30 /* seconds */
|
||||
|
||||
#define CASE_GC_TIMEOUT_SECS 43200 /* 12 hours */
|
||||
|
||||
/*
|
||||
* Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This
|
||||
* #define reserves enough space for two 64-bit hex values plus the length of
|
||||
* the longest string.
|
||||
* Our serd engines are named in the following format:
|
||||
* 'zfs_<pool_guid>_<vdev_guid>_{checksum,io,slow_io}'
|
||||
* This #define reserves enough space for two 64-bit hex values plus the
|
||||
* length of the longest string.
|
||||
*/
|
||||
#define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum"))
|
||||
|
||||
|
@ -68,6 +74,7 @@ typedef struct zfs_case_data {
|
|||
int zc_pool_state;
|
||||
char zc_serd_checksum[MAX_SERDLEN];
|
||||
char zc_serd_io[MAX_SERDLEN];
|
||||
char zc_serd_slow_io[MAX_SERDLEN];
|
||||
int zc_has_remove_timer;
|
||||
} zfs_case_data_t;
|
||||
|
||||
|
@ -114,7 +121,8 @@ zfs_de_stats_t zfs_stats = {
|
|||
{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
|
||||
};
|
||||
|
||||
static hrtime_t zfs_remove_timeout;
|
||||
/* wait 15 seconds after a removal */
|
||||
static hrtime_t zfs_remove_timeout = SEC2NSEC(15);
|
||||
|
||||
uu_list_pool_t *zfs_case_pool;
|
||||
uu_list_t *zfs_cases;
|
||||
|
@ -124,6 +132,8 @@ uu_list_t *zfs_cases;
|
|||
#define ZFS_MAKE_EREPORT(type) \
|
||||
FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
|
||||
|
||||
static void zfs_purge_cases(fmd_hdl_t *hdl);
|
||||
|
||||
/*
|
||||
* Write out the persistent representation of an active case.
|
||||
*/
|
||||
|
@ -170,6 +180,42 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
|
|||
return (zcp);
|
||||
}
|
||||
|
||||
/*
|
||||
* count other unique slow-io cases in a pool
|
||||
*/
|
||||
static uint_t
|
||||
zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case)
|
||||
{
|
||||
zfs_case_t *zcp;
|
||||
uint_t cases = 0;
|
||||
static hrtime_t next_check = 0;
|
||||
|
||||
/*
|
||||
* Note that plumbing in some external GC would require adding locking,
|
||||
* since most of this module code is not thread safe and assumes there
|
||||
* is only one thread running against the module. So we perform GC here
|
||||
* inline periodically so that future delay induced faults will be
|
||||
* possible once the issue causing multiple vdev delays is resolved.
|
||||
*/
|
||||
if (gethrestime_sec() > next_check) {
|
||||
/* Periodically purge old SERD entries and stale cases */
|
||||
fmd_serd_gc(hdl);
|
||||
zfs_purge_cases(hdl);
|
||||
next_check = gethrestime_sec() + CASE_GC_TIMEOUT_SECS;
|
||||
}
|
||||
|
||||
for (zcp = uu_list_first(zfs_cases); zcp != NULL;
|
||||
zcp = uu_list_next(zfs_cases, zcp)) {
|
||||
if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid &&
|
||||
zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid &&
|
||||
zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
|
||||
fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) {
|
||||
cases++;
|
||||
}
|
||||
}
|
||||
return (cases);
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterate over any active cases. If any cases are associated with a pool or
|
||||
* vdev which is no longer present on the system, close the associated case.
|
||||
|
@ -376,6 +422,14 @@ zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
|
|||
(long long unsigned int)vdev_guid, type);
|
||||
}
|
||||
|
||||
static void
|
||||
zfs_case_retire(fmd_hdl_t *hdl, zfs_case_t *zcp)
|
||||
{
|
||||
fmd_hdl_debug(hdl, "retiring case");
|
||||
|
||||
fmd_case_close(hdl, zcp->zc_case);
|
||||
}
|
||||
|
||||
/*
|
||||
* Solve a given ZFS case. This first checks to make sure the diagnosis is
|
||||
* still valid, as well as cleaning up any pending timer associated with the
|
||||
|
@ -632,9 +686,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
|
|||
if (strcmp(class,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
|
||||
strcmp(class,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
|
||||
strcmp(class,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0) {
|
||||
zfs_stats.resource_drops.fmds_value.ui64++;
|
||||
return;
|
||||
}
|
||||
|
@ -702,6 +754,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
|
|||
if (zcp->zc_data.zc_serd_checksum[0] != '\0')
|
||||
fmd_serd_reset(hdl,
|
||||
zcp->zc_data.zc_serd_checksum);
|
||||
if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
|
||||
fmd_serd_reset(hdl,
|
||||
zcp->zc_data.zc_serd_slow_io);
|
||||
} else if (fmd_nvl_class_match(hdl, nvl,
|
||||
ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
|
||||
uint64_t state = 0;
|
||||
|
@ -730,7 +785,11 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
|
|||
if (fmd_case_solved(hdl, zcp->zc_case))
|
||||
return;
|
||||
|
||||
fmd_hdl_debug(hdl, "error event '%s'", class);
|
||||
if (vdev_guid)
|
||||
fmd_hdl_debug(hdl, "error event '%s', vdev %llu", class,
|
||||
vdev_guid);
|
||||
else
|
||||
fmd_hdl_debug(hdl, "error event '%s'", class);
|
||||
|
||||
/*
|
||||
* Determine if we should solve the case and generate a fault. We solve
|
||||
|
@ -779,6 +838,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
|
|||
fmd_nvl_class_match(hdl, nvl,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
|
||||
fmd_nvl_class_match(hdl, nvl,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) ||
|
||||
fmd_nvl_class_match(hdl, nvl,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
|
||||
const char *failmode = NULL;
|
||||
boolean_t checkremove = B_FALSE;
|
||||
|
@ -814,6 +875,51 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
|
|||
}
|
||||
if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
|
||||
checkremove = B_TRUE;
|
||||
} else if (fmd_nvl_class_match(hdl, nvl,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) {
|
||||
uint64_t slow_io_n, slow_io_t;
|
||||
|
||||
/*
|
||||
* Create a slow io SERD engine when the VDEV has the
|
||||
* 'vdev_slow_io_n' and 'vdev_slow_io_t' properties.
|
||||
*/
|
||||
if (zcp->zc_data.zc_serd_slow_io[0] == '\0' &&
|
||||
nvlist_lookup_uint64(nvl,
|
||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
|
||||
&slow_io_n) == 0 &&
|
||||
nvlist_lookup_uint64(nvl,
|
||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
|
||||
&slow_io_t) == 0) {
|
||||
zfs_serd_name(zcp->zc_data.zc_serd_slow_io,
|
||||
pool_guid, vdev_guid, "slow_io");
|
||||
fmd_serd_create(hdl,
|
||||
zcp->zc_data.zc_serd_slow_io,
|
||||
slow_io_n,
|
||||
SEC2NSEC(slow_io_t));
|
||||
zfs_case_serialize(zcp);
|
||||
}
|
||||
/* Pass event to SERD engine and see if this triggers */
|
||||
if (zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
|
||||
fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io,
|
||||
ep)) {
|
||||
/*
|
||||
* Ignore a slow io diagnosis when other
|
||||
* VDEVs in the pool show signs of being slow.
|
||||
*/
|
||||
if (zfs_other_slow_cases(hdl, &zcp->zc_data)) {
|
||||
zfs_case_retire(hdl, zcp);
|
||||
fmd_hdl_debug(hdl, "pool %llu has "
|
||||
"multiple slow io cases -- skip "
|
||||
"degrading vdev %llu",
|
||||
(u_longlong_t)
|
||||
zcp->zc_data.zc_pool_guid,
|
||||
(u_longlong_t)
|
||||
zcp->zc_data.zc_vdev_guid);
|
||||
} else {
|
||||
zfs_case_solve(hdl, zcp,
|
||||
"fault.fs.zfs.vdev.slow_io");
|
||||
}
|
||||
}
|
||||
} else if (fmd_nvl_class_match(hdl, nvl,
|
||||
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
|
||||
/*
|
||||
|
@ -924,6 +1030,8 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
|
|||
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
|
||||
if (zcp->zc_data.zc_serd_io[0] != '\0')
|
||||
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
|
||||
if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
|
||||
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_slow_io);
|
||||
if (zcp->zc_data.zc_has_remove_timer)
|
||||
fmd_timer_remove(hdl, zcp->zc_remove_timer);
|
||||
|
||||
|
@ -932,30 +1040,15 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
|
|||
fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* We use the fmd gc entry point to look for old cases that no longer apply.
|
||||
* This allows us to keep our set of case data small in a long running system.
|
||||
*/
|
||||
static void
|
||||
zfs_fm_gc(fmd_hdl_t *hdl)
|
||||
{
|
||||
zfs_purge_cases(hdl);
|
||||
}
|
||||
|
||||
static const fmd_hdl_ops_t fmd_ops = {
|
||||
zfs_fm_recv, /* fmdo_recv */
|
||||
zfs_fm_timeout, /* fmdo_timeout */
|
||||
zfs_fm_close, /* fmdo_close */
|
||||
NULL, /* fmdo_stats */
|
||||
zfs_fm_gc, /* fmdo_gc */
|
||||
NULL, /* fmdo_gc */
|
||||
};
|
||||
|
||||
static const fmd_prop_t fmd_props[] = {
|
||||
{ "checksum_N", FMD_TYPE_UINT32, "10" },
|
||||
{ "checksum_T", FMD_TYPE_TIME, "10min" },
|
||||
{ "io_N", FMD_TYPE_UINT32, "10" },
|
||||
{ "io_T", FMD_TYPE_TIME, "10min" },
|
||||
{ "remove_timeout", FMD_TYPE_TIME, "15sec" },
|
||||
{ NULL, 0, NULL }
|
||||
};
|
||||
|
||||
|
@ -996,8 +1089,6 @@ _zfs_diagnosis_init(fmd_hdl_t *hdl)
|
|||
|
||||
(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
|
||||
sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);
|
||||
|
||||
zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -523,6 +523,9 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
|
|||
} else if (fmd_nvl_class_match(hdl, fault,
|
||||
"fault.fs.zfs.vdev.checksum")) {
|
||||
degrade_device = B_TRUE;
|
||||
} else if (fmd_nvl_class_match(hdl, fault,
|
||||
"fault.fs.zfs.vdev.slow_io")) {
|
||||
degrade_device = B_TRUE;
|
||||
} else if (fmd_nvl_class_match(hdl, fault,
|
||||
"fault.fs.zfs.device")) {
|
||||
fault_device = B_FALSE;
|
||||
|
|
|
@ -1083,6 +1083,22 @@ main(int argc, char **argv)
|
|||
libzfs_fini(g_zfs);
|
||||
return (1);
|
||||
}
|
||||
|
||||
if (record.zi_nlanes) {
|
||||
switch (io_type) {
|
||||
case ZIO_TYPE_READ:
|
||||
case ZIO_TYPE_WRITE:
|
||||
case ZIO_TYPES:
|
||||
break;
|
||||
default:
|
||||
(void) fprintf(stderr, "I/O type for a delay "
|
||||
"must be 'read' or 'write'\n");
|
||||
usage();
|
||||
libzfs_fini(g_zfs);
|
||||
return (1);
|
||||
}
|
||||
}
|
||||
|
||||
if (!error)
|
||||
error = ENXIO;
|
||||
|
||||
|
|
|
@ -2569,7 +2569,13 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
|
|||
break;
|
||||
|
||||
case VDEV_AUX_ERR_EXCEEDED:
|
||||
(void) printf(gettext("too many errors"));
|
||||
if (vs->vs_read_errors + vs->vs_write_errors +
|
||||
vs->vs_checksum_errors == 0 && children == 0 &&
|
||||
vs->vs_slow_ios > 0) {
|
||||
(void) printf(gettext("too many slow I/Os"));
|
||||
} else {
|
||||
(void) printf(gettext("too many errors"));
|
||||
}
|
||||
break;
|
||||
|
||||
case VDEV_AUX_IO_FAILURE:
|
||||
|
|
|
@ -82,6 +82,8 @@ extern "C" {
|
|||
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T "vdev_cksum_t"
|
||||
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N "vdev_io_n"
|
||||
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T "vdev_io_t"
|
||||
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N "vdev_slow_io_n"
|
||||
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T "vdev_slow_io_t"
|
||||
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays"
|
||||
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
|
||||
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
|
||||
|
|
|
@ -364,6 +364,8 @@ typedef enum {
|
|||
VDEV_PROP_IO_N,
|
||||
VDEV_PROP_IO_T,
|
||||
VDEV_PROP_RAIDZ_EXPANDING,
|
||||
VDEV_PROP_SLOW_IO_N,
|
||||
VDEV_PROP_SLOW_IO_T,
|
||||
VDEV_NUM_PROPS
|
||||
} vdev_prop_t;
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2020 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
* Copyright (c) 2023, Klara Inc.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_VDEV_IMPL_H
|
||||
|
@ -453,12 +454,14 @@ struct vdev {
|
|||
zfs_ratelimit_t vdev_checksum_rl;
|
||||
|
||||
/*
|
||||
* Checksum and IO thresholds for tuning ZED
|
||||
* Vdev properties for tuning ZED
|
||||
*/
|
||||
uint64_t vdev_checksum_n;
|
||||
uint64_t vdev_checksum_t;
|
||||
uint64_t vdev_io_n;
|
||||
uint64_t vdev_io_t;
|
||||
uint64_t vdev_slow_io_n;
|
||||
uint64_t vdev_slow_io_t;
|
||||
};
|
||||
|
||||
#define VDEV_PAD_SIZE (8 << 10)
|
||||
|
|
|
@ -5672,7 +5672,9 @@
|
|||
<enumerator name='VDEV_PROP_IO_N' value='44'/>
|
||||
<enumerator name='VDEV_PROP_IO_T' value='45'/>
|
||||
<enumerator name='VDEV_PROP_RAIDZ_EXPANDING' value='46'/>
|
||||
<enumerator name='VDEV_NUM_PROPS' value='47'/>
|
||||
<enumerator name='VDEV_PROP_SLOW_IO_N' value='47'/>
|
||||
<enumerator name='VDEV_PROP_SLOW_IO_T' value='48'/>
|
||||
<enumerator name='VDEV_NUM_PROPS' value='49'/>
|
||||
</enum-decl>
|
||||
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
|
||||
<class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>
|
||||
|
|
|
@ -5224,6 +5224,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
|
|||
case VDEV_PROP_CHECKSUM_T:
|
||||
case VDEV_PROP_IO_N:
|
||||
case VDEV_PROP_IO_T:
|
||||
case VDEV_PROP_SLOW_IO_N:
|
||||
case VDEV_PROP_SLOW_IO_T:
|
||||
if (intval == UINT64_MAX) {
|
||||
(void) strlcpy(buf, "-", len);
|
||||
} else {
|
||||
|
|
|
@ -1699,7 +1699,9 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop,
|
|||
(prop == VDEV_PROP_CHECKSUM_N ||
|
||||
prop == VDEV_PROP_CHECKSUM_T ||
|
||||
prop == VDEV_PROP_IO_N ||
|
||||
prop == VDEV_PROP_IO_T)) {
|
||||
prop == VDEV_PROP_IO_T ||
|
||||
prop == VDEV_PROP_SLOW_IO_N ||
|
||||
prop == VDEV_PROP_SLOW_IO_T)) {
|
||||
*ivalp = UINT64_MAX;
|
||||
}
|
||||
|
||||
|
|
|
@ -44,7 +44,7 @@ section, below.
|
|||
Every vdev has a set of properties that export statistics about the vdev
|
||||
as well as control various behaviors.
|
||||
Properties are not inherited from top-level vdevs, with the exception of
|
||||
checksum_n, checksum_t, io_n, and io_t.
|
||||
checksum_n, checksum_t, io_n, io_t, slow_io_n, and slow_io_t.
|
||||
.Pp
|
||||
The values of numeric properties can be specified using human-readable suffixes
|
||||
.Po for example,
|
||||
|
@ -117,7 +117,7 @@ If this device is currently being removed from the pool
|
|||
.Pp
|
||||
The following native properties can be used to change the behavior of a vdev.
|
||||
.Bl -tag -width "allocating"
|
||||
.It Sy checksum_n , checksum_t , io_n , io_t
|
||||
.It Sy checksum_n , checksum_t , io_n , io_t , slow_io_n , slow_io_t
|
||||
Tune the fault management daemon by specifying checksum/io/slow_io thresholds
|
||||
of <N> events in <T> seconds, respectively.
|
||||
These properties can be set on leaf and top-level vdevs.
|
||||
|
|
|
@ -260,8 +260,8 @@ sufficient replicas exist to continue functioning.
|
|||
The underlying conditions are as follows:
|
||||
.Bl -bullet -compact
|
||||
.It
|
||||
The number of checksum errors exceeds acceptable levels and the device is
|
||||
degraded as an indication that something may be wrong.
|
||||
The number of checksum errors or slow I/Os exceeds acceptable levels and the
|
||||
device is degraded as an indication that something may be wrong.
|
||||
ZFS continues to use the device as necessary.
|
||||
.It
|
||||
The number of I/O errors exceeds acceptable levels.
|
||||
|
|
|
@ -69,6 +69,7 @@ Force a vdev into the DEGRADED or FAULTED state.
|
|||
.Nm zinject
|
||||
.Fl d Ar vdev
|
||||
.Fl D Ar latency : Ns Ar lanes
|
||||
.Op Fl T Ar read|write
|
||||
.Ar pool
|
||||
.Xc
|
||||
Add an artificial delay to I/O requests on a particular
|
||||
|
|
|
@ -431,6 +431,12 @@ vdev_prop_init(void)
|
|||
zprop_register_number(VDEV_PROP_IO_T, "io_t", UINT64_MAX,
|
||||
PROP_DEFAULT, ZFS_TYPE_VDEV, "<seconds>", "IO_T", B_FALSE,
|
||||
sfeatures);
|
||||
zprop_register_number(VDEV_PROP_SLOW_IO_N, "slow_io_n", UINT64_MAX,
|
||||
PROP_DEFAULT, ZFS_TYPE_VDEV, "<events>", "SLOW_IO_N", B_FALSE,
|
||||
sfeatures);
|
||||
zprop_register_number(VDEV_PROP_SLOW_IO_T, "slow_io_t", UINT64_MAX,
|
||||
PROP_DEFAULT, ZFS_TYPE_VDEV, "<seconds>", "SLOW_IO_T", B_FALSE,
|
||||
sfeatures);
|
||||
|
||||
/* default index (boolean) properties */
|
||||
zprop_register_index(VDEV_PROP_REMOVING, "removing", 0,
|
||||
|
|
|
@ -676,6 +676,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
|||
vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);
|
||||
vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
|
||||
vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);
|
||||
vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
|
||||
vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
|
||||
|
||||
list_link_init(&vd->vdev_config_dirty_node);
|
||||
list_link_init(&vd->vdev_state_dirty_node);
|
||||
|
@ -3730,6 +3732,18 @@ vdev_load(vdev_t *vd)
|
|||
if (error && error != ENOENT)
|
||||
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
|
||||
"failed [error=%d]", (u_longlong_t)zapobj, error);
|
||||
|
||||
error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N,
|
||||
&vd->vdev_slow_io_n);
|
||||
if (error && error != ENOENT)
|
||||
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
|
||||
"failed [error=%d]", (u_longlong_t)zapobj, error);
|
||||
|
||||
error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T,
|
||||
&vd->vdev_slow_io_t);
|
||||
if (error && error != ENOENT)
|
||||
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
|
||||
"failed [error=%d]", (u_longlong_t)zapobj, error);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -5934,6 +5948,20 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
|
|||
}
|
||||
vd->vdev_io_t = intval;
|
||||
break;
|
||||
case VDEV_PROP_SLOW_IO_N:
|
||||
if (nvpair_value_uint64(elem, &intval) != 0) {
|
||||
error = EINVAL;
|
||||
break;
|
||||
}
|
||||
vd->vdev_slow_io_n = intval;
|
||||
break;
|
||||
case VDEV_PROP_SLOW_IO_T:
|
||||
if (nvpair_value_uint64(elem, &intval) != 0) {
|
||||
error = EINVAL;
|
||||
break;
|
||||
}
|
||||
vd->vdev_slow_io_t = intval;
|
||||
break;
|
||||
default:
|
||||
/* Most processing is done in vdev_props_set_sync */
|
||||
break;
|
||||
|
@ -6269,6 +6297,8 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
|
|||
case VDEV_PROP_CHECKSUM_T:
|
||||
case VDEV_PROP_IO_N:
|
||||
case VDEV_PROP_IO_T:
|
||||
case VDEV_PROP_SLOW_IO_N:
|
||||
case VDEV_PROP_SLOW_IO_T:
|
||||
err = vdev_prop_get_int(vd, prop, &intval);
|
||||
if (err && err != ENOENT)
|
||||
break;
|
||||
|
|
|
@ -222,6 +222,12 @@ vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop)
|
|||
case VDEV_PROP_IO_T:
|
||||
propval = vd->vdev_io_t;
|
||||
break;
|
||||
case VDEV_PROP_SLOW_IO_N:
|
||||
propval = vd->vdev_slow_io_n;
|
||||
break;
|
||||
case VDEV_PROP_SLOW_IO_T:
|
||||
propval = vd->vdev_slow_io_t;
|
||||
break;
|
||||
default:
|
||||
propval = propdef;
|
||||
break;
|
||||
|
@ -741,6 +747,26 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
|||
NULL);
|
||||
}
|
||||
|
||||
if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
|
||||
uint64_t slow_io_n, slow_io_t;
|
||||
|
||||
slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N);
|
||||
if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N))
|
||||
fm_payload_set(ereport,
|
||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
|
||||
DATA_TYPE_UINT64,
|
||||
slow_io_n,
|
||||
NULL);
|
||||
|
||||
slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T);
|
||||
if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T))
|
||||
fm_payload_set(ereport,
|
||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
|
||||
DATA_TYPE_UINT64,
|
||||
slow_io_t,
|
||||
NULL);
|
||||
}
|
||||
|
||||
mutex_exit(&spa->spa_errlist_lock);
|
||||
|
||||
*ereport_out = ereport;
|
||||
|
|
|
@ -605,6 +605,10 @@ zio_handle_io_delay(zio_t *zio)
|
|||
if (vd->vdev_guid != handler->zi_record.zi_guid)
|
||||
continue;
|
||||
|
||||
if (handler->zi_record.zi_iotype != ZIO_TYPES &&
|
||||
handler->zi_record.zi_iotype != zio->io_type)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Defensive; should never happen as the array allocation
|
||||
* occurs prior to inserting this handler on the list.
|
||||
|
|
|
@ -104,7 +104,8 @@ tags = ['functional', 'devices']
|
|||
|
||||
[tests/functional/events:Linux]
|
||||
tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill',
|
||||
'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config']
|
||||
'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config',
|
||||
'zed_slow_io', 'zed_slow_io_many_vdevs']
|
||||
tags = ['functional', 'events']
|
||||
|
||||
[tests/functional/fadvise:Linux]
|
||||
|
|
|
@ -1447,6 +1447,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
|||
functional/events/zed_fd_spill.ksh \
|
||||
functional/events/zed_io_config.ksh \
|
||||
functional/events/zed_rc_filter.ksh \
|
||||
functional/events/zed_slow_io.ksh \
|
||||
functional/events/zed_slow_io_many_vdevs.ksh \
|
||||
functional/exec/cleanup.ksh \
|
||||
functional/exec/exec_001_pos.ksh \
|
||||
functional/exec/exec_002_neg.ksh \
|
||||
|
|
|
@ -70,4 +70,6 @@ typeset -a properties=(
|
|||
checksum_t
|
||||
io_n
|
||||
io_t
|
||||
slow_io_n
|
||||
slow_io_t
|
||||
)
|
||||
|
|
|
@ -26,8 +26,10 @@
|
|||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
zed_cleanup all-debug.sh all-syslog.sh all-dumpfds
|
||||
|
||||
zed_stop
|
||||
|
||||
zed_cleanup all-debug.sh all-syslog.sh all-dumpfds
|
||||
|
||||
zed_events_drain
|
||||
|
||||
default_cleanup
|
||||
|
|
|
@ -0,0 +1,205 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2023, Klara Inc.
|
||||
#
|
||||
|
||||
# DESCRIPTION:
|
||||
# Verify that vdev properties, slow_io_n and slow_io_t, work with ZED.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create a pool with single vdev
|
||||
# 2. Set slow_io_n/slow_io_t to non-default values
|
||||
# 3. Inject slow io errors
|
||||
# 4. Verify that ZED degrades vdev
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
TESTDIR="$TEST_BASE_DIR/zed_slow_io"
|
||||
VDEV="$TEST_BASE_DIR/vdevfile.$$"
|
||||
TESTPOOL="slow_io_pool"
|
||||
FILEPATH="$TESTDIR/slow_io.testfile"
|
||||
|
||||
OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)
|
||||
OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND)
|
||||
|
||||
verify_runnable "both"
|
||||
|
||||
function do_setup
|
||||
{
|
||||
log_must truncate -s 1G $VDEV
|
||||
default_setup_noexit $VDEV
|
||||
zed_events_drain
|
||||
log_must zfs set compression=off $TESTPOOL
|
||||
log_must zfs set primarycache=none $TESTPOOL
|
||||
log_must zfs set prefetch=none $TESTPOOL
|
||||
log_must zfs set recordsize=512 $TESTPOOL
|
||||
for i in {1..10}; do
|
||||
dd if=/dev/urandom of=${FILEPATH}$i bs=512 count=1 2>/dev/null
|
||||
done
|
||||
zpool sync
|
||||
}
|
||||
|
||||
# intermediate cleanup
|
||||
function do_clean
|
||||
{
|
||||
log_must zinject -c all
|
||||
log_must zpool destroy $TESTPOOL
|
||||
log_must rm -f $VDEV
|
||||
}
|
||||
|
||||
# final cleanup
|
||||
function cleanup
|
||||
{
|
||||
log_must zinject -c all
|
||||
|
||||
# if pool still exists then something failed so log additional info
|
||||
if poolexists $TESTPOOL ; then
|
||||
log_note "$(zpool status -s $TESTPOOL)"
|
||||
echo "=================== zed log search ==================="
|
||||
grep "Diagnosis Engine" $ZEDLET_DIR/zed.log
|
||||
destroy_pool $TESTPOOL
|
||||
fi
|
||||
log_must zed_stop
|
||||
|
||||
log_must rm -f $VDEV
|
||||
|
||||
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
|
||||
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
|
||||
}
|
||||
|
||||
function start_slow_io
|
||||
{
|
||||
zpool sync
|
||||
log_must set_tunable64 ZIO_SLOW_IO_MS 10
|
||||
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000
|
||||
|
||||
log_must zinject -d $VDEV -D10:1 -T read $TESTPOOL
|
||||
zpool sync
|
||||
}
|
||||
|
||||
function stop_slow_io
|
||||
{
|
||||
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
|
||||
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
|
||||
|
||||
log_must zinject -c all
|
||||
}
|
||||
|
||||
#
# Scenario: default ZED settings.
#
# Issue 10 reads spaced 0.25 seconds apart (about 2.5 seconds in total) while
# slow I/O injection is active. With no slow_io_n/slow_io_t set on the vdev,
# ZED must not degrade it.
#
function default_degrade
{
	do_setup

	start_slow_io
	for i in {1..10}; do
		dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null
		sleep 0.25
	done
	stop_slow_io
	log_note "$(zpool status -s $TESTPOOL)"

	# ZED can be slow; give it time to work through the delay events
	sleep 18
	log_note "$(zpool status -s $TESTPOOL)"

	# any degrade request recorded in the ZED log is a failure
	degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l)
	log_note $degrades vdev degrades in ZED log
	if ! [ $degrades -eq "0" ]; then
		log_fail "expecting no degrade events, found $degrades"
	fi

	do_clean
}
|
||||
|
||||
#
# Scenario: slow_io_n=5, slow_io_t=60 (5 events within 60 seconds).
#
# Fire more than 5 slow reads; ZED is expected to degrade the vdev.
#
function slow_io_degrade
{
	do_setup

	#
	# The whole scenario depends on these vdev properties being in
	# effect, so fail right away if they cannot be set.
	#
	log_must zpool set slow_io_n=5 $TESTPOOL $VDEV
	log_must zpool set slow_io_t=60 $TESTPOOL $VDEV

	start_slow_io
	for i in {1..16}; do
		dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null
		sleep 0.5
	done
	stop_slow_io
	zpool sync

	#
	# wait up to 60 seconds for kernel to produce at least 5 delay events
	#
	typeset -i i=0
	typeset -i events=0
	while [[ $i -lt 60 ]]; do
		# every '.' is escaped so only the literal event class matches
		events=$(zpool events | grep "ereport\.fs\.zfs\.delay" | wc -l)
		[[ $events -ge "5" ]] && break
		i=$((i+1))
		sleep 1
	done
	log_note "$events delay events found"

	#
	# The degrade is only verified when enough delay events were
	# produced to reach slow_io_n; otherwise note that the check was
	# skipped instead of passing silently.
	#
	if [[ $events -ge "5" ]]; then
		log_must wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 10
	else
		log_note "fewer than 5 delay events, degrade check skipped"
	fi

	do_clean
}
|
||||
|
||||
#
# Scenario: slow_io_n=10, slow_io_t=1 (10 events within 1 second).
#
# Slow reads are issued 0.5 seconds apart, so the threshold is never reached
# and the vdev must not be degraded.
#
function slow_io_no_degrade
{
	do_setup

	#
	# If these properties were silently not applied the test would run
	# against the defaults and pass without exercising anything, so make
	# a failure to set them fatal.
	#
	log_must zpool set slow_io_n=10 $TESTPOOL $VDEV
	log_must zpool set slow_io_t=1 $TESTPOOL $VDEV

	start_slow_io
	for i in {1..16}; do
		dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null
		sleep 0.5
	done
	stop_slow_io
	zpool sync

	# the vdev must not reach DEGRADED within 45 seconds
	log_mustnot wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 45

	do_clean
}
|
||||
|
||||
log_assert "Test ZED slow io configurability"
log_onexit cleanup

# start ZED with an empty event queue
log_must zed_events_drain
log_must zed_start

# run the scenarios in order; each one sets up and tears down its own pool
for scenario in default_degrade slow_io_degrade slow_io_no_degrade; do
	$scenario
done

log_pass "Test ZED slow io configurability"
|
|
@ -0,0 +1,177 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2023, Klara Inc.
|
||||
#
|
||||
|
||||
# DESCRIPTION:
|
||||
# Verify that delay events from multiple vdevs do not degrade any vdev
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create a pool with a 4 disk raidz vdev
|
||||
# 2. Inject slow io errors
|
||||
# 3. Verify that ZED detects slow I/Os but doesn't degrade any vdevs
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
# pool name and data file used by this test
TESTDIR="${TEST_BASE_DIR}/zed_slow_io"
TESTPOOL="slow_io_pool"
FILEPATH="${TESTDIR}/slow_io.testfile"

# backing files for the raidz vdevs (suffixed with this shell's PID)
VDEV1="${TEST_BASE_DIR}/vdevfile1.$$"
VDEV2="${TEST_BASE_DIR}/vdevfile2.$$"
VDEV3="${TEST_BASE_DIR}/vdevfile3.$$"
VDEV4="${TEST_BASE_DIR}/vdevfile4.$$"
VDEVS="$VDEV1 $VDEV2 $VDEV3 $VDEV4"

# remember the current tunable values so they can be restored later
OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)
OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND)

verify_runnable "both"
|
||||
|
||||
#
# Cleanup handler registered through log_onexit.
#
function cleanup
{
	log_must zinject -c all

	#
	# The test destroys the pool itself on every normal path, so a pool
	# that still exists here means something failed: log extra
	# information before destroying it.
	#
	if poolexists $TESTPOOL; then
		log_note "$(zpool status -s $TESTPOOL)"
		echo "=================== zed log search ==================="
		grep "Diagnosis Engine" $ZEDLET_DIR/zed.log
		destroy_pool $TESTPOOL
	fi

	log_must zed_stop
	log_must rm -f $VDEVS

	# restore the tunable values that were saved when the script started
	log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
	log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
}
|
||||
|
||||
#
# Begin slow I/O injection on every vdev in $VDEVS.
#
# Each vdev gets slow_io_n=4 and slow_io_t=60, the slow I/O tunables are set
# to 10 (ZIO_SLOW_IO_MS) and 1000 (SLOW_IO_EVENTS_PER_SECOND), and a zinject
# delay handler (-D10:1) is installed on each vdev.
#
function start_slow_io
{
	for vdev in $VDEVS; do
		log_must zpool set slow_io_n=4 $TESTPOOL $vdev
		log_must zpool set slow_io_t=60 $TESTPOOL $vdev
	done
	zpool sync

	log_must set_tunable64 ZIO_SLOW_IO_MS 10
	log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000

	for vdev in $VDEVS; do
		log_must zinject -d $vdev -D10:1 $TESTPOOL
	done
	zpool sync
}
|
||||
|
||||
#
# End slow I/O injection: put both slow I/O tunables back to the values
# saved at script start and clear all zinject handlers.
#
function stop_slow_io
{
	log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
	log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS

	log_must zinject -c all
}
|
||||
|
||||
#
# Create a raidz pool on $VDEVS, read a 20M file back while every vdev has
# slow I/O injected, and verify from the ZED log that cases were retired
# ("retiring case") without any zpool_vdev_degrade being issued.
#
# If fewer than 50 delay events show up within 60 seconds the test bails out
# early without checking the ZED log.
#
function multiple_slow_vdevs_test
{
	log_must truncate -s 1G $VDEVS
	default_raidz_setup_noexit $VDEVS

	log_must zpool events -c
	log_must zfs set compression=off $TESTPOOL
	log_must zfs set primarycache=none $TESTPOOL
	log_must zfs set recordsize=4K $TESTPOOL

	log_must dd if=/dev/urandom of=$FILEPATH bs=1M count=20
	zpool sync

	#
	# Read the file with slow io injected on the disks
	# This will cause multiple errors on each disk to trip ZED SERD
	#
	#   pool: slow_io_pool
	#  state: ONLINE
	# config:
	#
	#   NAME                         STATE   READ WRITE CKSUM  SLOW
	#   slow_io_pool                 ONLINE     0     0     0     -
	#     raidz1-0                   ONLINE     0     0     0     -
	#       /var/tmp/vdevfile1.499278 ONLINE    0     0     0   113
	#       /var/tmp/vdevfile2.499278 ONLINE    0     0     0   109
	#       /var/tmp/vdevfile3.499278 ONLINE    0     0     0    96
	#       /var/tmp/vdevfile4.499278 ONLINE    0     0     0   109
	#
	start_slow_io
	dd if=$FILEPATH of=/dev/null bs=1M count=20 2>/dev/null
	stop_slow_io

	# count events available for processing (wait up to 60 seconds)
	typeset -i i=0
	typeset -i events=0
	while [[ $i -lt 60 ]]; do
		# every '.' is escaped so only the literal event class matches
		events=$(zpool events | grep "ereport\.fs\.zfs\.delay" | wc -l)
		[[ $events -ge "50" ]] && break
		i=$((i+1))
		sleep 1
	done
	log_note "$events delay events found"
	if [[ $events -lt "50" ]]; then
		log_note "bailing: not enough events to complete the test"
		destroy_pool $TESTPOOL
		return
	fi

	#
	# give slow ZED a chance to process the delay events
	# (poll the ZED log for up to 75 seconds)
	#
	i=0
	typeset -i skips=0
	while [[ $i -lt 75 ]]; do
		skips=$(grep "retiring case" \
		    $ZEDLET_DIR/zed.log | wc -l)
		[[ $skips -gt "0" ]] && break
		i=$((i+1))
		sleep 1
	done

	log_note $skips degrade skips in ZED log after $i seconds
	[ $skips -gt "0" ] || log_fail "expecting to see skips"

	# no vdev may have been degraded
	degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l)
	log_note $degrades vdev degrades in ZED log
	[ $degrades -eq "0" ] || \
	    log_fail "expecting no degrade events, found $degrades"

	destroy_pool $TESTPOOL
}
|
||||
|
||||
log_assert "Test ZED slow io across multiple vdevs"
log_onexit cleanup

# start ZED with an empty event queue, then run the single scenario
log_must zed_events_drain
log_must zed_start

multiple_slow_vdevs_test

log_pass "Test ZED slow io across multiple vdevs"
|
|
@ -32,5 +32,6 @@ cleanup_devices $DISKS
|
|||
|
||||
zed_stop
|
||||
zed_cleanup resilver_finish-start-scrub.sh
|
||||
zed_events_drain
|
||||
|
||||
log_pass
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
|
||||
verify_runnable "global"
|
||||
|
||||
zed_events_drain
|
||||
zed_setup resilver_finish-start-scrub.sh
|
||||
zed_start
|
||||
|
||||
|
|
Loading…
Reference in New Issue