Ratelimit deadman zevents as with delay zevents

Just as delay zevents can flood the zevent pipe when a vdev becomes
unresponsive, so do the deadman zevents.

Ratelimit deadman zevents according to the same tunable as for delay
zevents.

Enable deadman tests on FreeBSD and add a test for deadman event
ratelimiting. 

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Don Brady <don.brady@delphix.com>
Signed-off-by: Ryan Moeller <ryan@iXsystems.com>
Closes #11786
This commit is contained in:
Ryan Moeller 2021-04-07 19:23:57 -04:00 committed by Tony Hutter
parent 932d471bec
commit 214196e9f5
9 changed files with 102 additions and 12 deletions

View File

@ -441,10 +441,11 @@ struct vdev {
kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */
/* /*
* We rate limit ZIO delay and ZIO checksum events, since they * We rate limit ZIO delay, deadman, and checksum events, since they
* can flood ZED with tons of events when a drive is acting up. * can flood ZED with tons of events when a drive is acting up.
*/ */
zfs_ratelimit_t vdev_delay_rl; zfs_ratelimit_t vdev_delay_rl;
zfs_ratelimit_t vdev_deadman_rl;
zfs_ratelimit_t vdev_checksum_rl; zfs_ratelimit_t vdev_checksum_rl;
}; };

View File

@ -1620,7 +1620,8 @@ Default value: \fB64\fR.
\fBzfs_slow_io_events_per_second\fR (int) \fBzfs_slow_io_events_per_second\fR (int)
.ad .ad
.RS 12n .RS 12n
Rate limit delay zevents (which report slow I/Os) to this many per second. Rate limit delay and deadman zevents (which report slow I/Os) to this many per
second.
.sp .sp
Default value: 20 Default value: 20
.RE .RE

View File

@ -543,6 +543,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
*/ */
zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
1); 1);
zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second,
1);
zfs_ratelimit_init(&vd->vdev_checksum_rl, zfs_ratelimit_init(&vd->vdev_checksum_rl,
&zfs_checksum_events_per_second, 1); &zfs_checksum_events_per_second, 1);
@ -1033,6 +1035,7 @@ vdev_free(vdev_t *vd)
cv_destroy(&vd->vdev_rebuild_io_cv); cv_destroy(&vd->vdev_rebuild_io_cv);
zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_deadman_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl);
if (vd == spa->spa_root_vdev) if (vd == spa->spa_root_vdev)

View File

@ -357,8 +357,8 @@ zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
} }
/* /*
* We want to rate limit ZIO delay and checksum events so as to not * We want to rate limit ZIO delay, deadman, and checksum events so as to not
* flood ZED when a disk is acting up. * flood zevent consumers when a disk is acting up.
* *
* Returns 1 if we're ratelimiting, 0 if not. * Returns 1 if we're ratelimiting, 0 if not.
*/ */
@ -367,11 +367,13 @@ zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
{ {
int rc = 0; int rc = 0;
/* /*
* __ratelimit() returns 1 if we're *not* ratelimiting and 0 if we * zfs_ratelimit() returns 1 if we're *not* ratelimiting and 0 if we
* are. Invert it to get our return value. * are. Invert it to get our return value.
*/ */
if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
rc = !zfs_ratelimit(&vd->vdev_delay_rl); rc = !zfs_ratelimit(&vd->vdev_delay_rl);
} else if (strcmp(subclass, FM_EREPORT_ZFS_DEADMAN) == 0) {
rc = !zfs_ratelimit(&vd->vdev_deadman_rl);
} else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
rc = !zfs_ratelimit(&vd->vdev_checksum_rl); rc = !zfs_ratelimit(&vd->vdev_checksum_rl);
} }

View File

@ -571,6 +571,12 @@ tags = ['functional', 'cp_files']
tests = ['ctime_001_pos' ] tests = ['ctime_001_pos' ]
tags = ['functional', 'ctime'] tags = ['functional', 'ctime']
[tests/functional/deadman]
tests = ['deadman_ratelimit', 'deadman_sync', 'deadman_zio']
pre =
post =
tags = ['functional', 'deadman']
[tests/functional/delegate] [tests/functional/delegate]
tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos',
'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos', 'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos',

View File

@ -85,12 +85,6 @@ tags = ['functional', 'cli_root', 'zpool_split']
tests = ['compress_004_pos'] tests = ['compress_004_pos']
tags = ['functional', 'compression'] tags = ['functional', 'compression']
[tests/functional/deadman:Linux]
tests = ['deadman_sync', 'deadman_zio']
pre =
post =
tags = ['functional', 'deadman']
[tests/functional/devices:Linux] [tests/functional/devices:Linux]
tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos'] tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos']
tags = ['functional', 'devices'] tags = ['functional', 'devices']

View File

@ -1,5 +1,6 @@
pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/deadman pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/deadman
dist_pkgdata_SCRIPTS = \ dist_pkgdata_SCRIPTS = \
deadman_ratelimit.ksh \
deadman_sync.ksh \ deadman_sync.ksh \
deadman_zio.ksh deadman_zio.ksh

View File

@ -0,0 +1,78 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Portions Copyright 2021 iXsystems, Inc.
#
# DESCRIPTION:
# Verify spa deadman events are rate limited
#
# STRATEGY:
# 1. Reduce the zfs_slow_io_events_per_second to 1.
# 2. Reduce the zfs_deadman_ziotime_ms to 1ms.
# 3. Write data to a pool and read it back.
# 4. Verify deadman events have been produced at a reasonable rate.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/deadman/deadman.cfg
verify_runnable "both"
function cleanup
{
zinject -c all
default_cleanup_noexit
set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
set_tunable64 DEADMAN_ZIOTIME_MS $ZIOTIME_DEFAULT
}
log_assert "Verify spa deadman events are rate limited"
log_onexit cleanup
OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND)
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1
log_must set_tunable64 DEADMAN_ZIOTIME_MS 1
# Create a new pool in order to use the updated deadman settings.
default_setup_noexit $DISK1
log_must zpool events -c
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
log_must file_write -b 1048576 -c 8 -o create -d 0 -f $mntpnt/file
log_must zpool export $TESTPOOL
log_must zpool import $TESTPOOL
log_must zinject -d $DISK1 -D 5:1 $TESTPOOL
log_must dd if=$mntpnt/file of=$TEST_BASE_DIR/devnull oflag=sync
events=$(zpool events $TESTPOOL | grep -c ereport.fs.zfs.deadman)
log_note "events=$events"
if [ "$events" -lt 1 ]; then
log_fail "Expect >= 1 deadman events, $events found"
fi
if [ "$events" -gt 10 ]; then
log_fail "Expect <= 10 deadman events, $events found"
fi
log_pass "Verify spa deadman events are rate limited"

View File

@ -73,7 +73,11 @@ log_must zinject -c all
log_must zpool sync log_must zpool sync
# Log txg sync times for reference and the zpool event summary. # Log txg sync times for reference and the zpool event summary.
log_must cat /proc/spl/kstat/zfs/$TESTPOOL/txgs if is_freebsd; then
log_must sysctl -n kstat.zfs.$TESTPOOL.txgs
else
log_must cat /proc/spl/kstat/zfs/$TESTPOOL/txgs
fi
log_must zpool events log_must zpool events
# Verify at least 5 deadman events were logged. The first after 5 seconds, # Verify at least 5 deadman events were logged. The first after 5 seconds,