zed: Add deadman-slot_off.sh zedlet
Optionally turn off disk's enclosure slot if an I/O is hung triggering the deadman. It's possible for outstanding I/O to a misbehaving SCSI disk to neither promptly complete nor return an error. This can occur due to retry and recovery actions taken by the SCSI layer, driver, or disk. When it occurs the pool will be unresponsive even though there may be sufficient redundancy configured to proceed without this single disk. When a hung I/O is detected by the kmods it will be posted as a deadman event. By default an I/O is considered to be hung after 5 minutes. This value can be changed with the zfs_deadman_ziotime_ms module parameter. If ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN is set the disk's enclosure slot will be powered off causing the outstanding I/O to fail. The ZED will then handle this like a normal disk failure. By default ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN is not set. As part of this change `zfs_deadman_events_per_second` is added to control the rate limiting of deadman events independently of delay events. In practice, a single deadman event is sufficient and more aren't particularly useful. Alphabetize the zfs_deadman_* entries in zfs.4. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #16226
This commit is contained in:
parent
800d59d577
commit
6b95031f56
|
@ -9,6 +9,7 @@ dist_zedexec_SCRIPTS = \
|
||||||
%D%/all-debug.sh \
|
%D%/all-debug.sh \
|
||||||
%D%/all-syslog.sh \
|
%D%/all-syslog.sh \
|
||||||
%D%/data-notify.sh \
|
%D%/data-notify.sh \
|
||||||
|
%D%/deadman-slot_off.sh \
|
||||||
%D%/generic-notify.sh \
|
%D%/generic-notify.sh \
|
||||||
%D%/pool_import-led.sh \
|
%D%/pool_import-led.sh \
|
||||||
%D%/resilver_finish-notify.sh \
|
%D%/resilver_finish-notify.sh \
|
||||||
|
@ -29,6 +30,7 @@ SUBSTFILES += $(nodist_zedexec_SCRIPTS)
|
||||||
zedconfdefaults = \
|
zedconfdefaults = \
|
||||||
all-syslog.sh \
|
all-syslog.sh \
|
||||||
data-notify.sh \
|
data-notify.sh \
|
||||||
|
deadman-slot_off.sh \
|
||||||
history_event-zfs-list-cacher.sh \
|
history_event-zfs-list-cacher.sh \
|
||||||
pool_import-led.sh \
|
pool_import-led.sh \
|
||||||
resilver_finish-notify.sh \
|
resilver_finish-notify.sh \
|
||||||
|
|
|
@ -0,0 +1,71 @@
|
||||||
|
#!/bin/sh
|
||||||
|
# shellcheck disable=SC3014,SC2154,SC2086,SC2034
|
||||||
|
#
|
||||||
|
# Turn off disk's enclosure slot if an I/O is hung triggering the deadman.
|
||||||
|
#
|
||||||
|
# It's possible for outstanding I/O to a misbehaving SCSI disk to neither
|
||||||
|
# promptly complete nor return an error. This can occur due to retry and
|
||||||
|
# recovery actions taken by the SCSI layer, driver, or disk. When it occurs
|
||||||
|
# the pool will be unresponsive even though there may be sufficient redundancy
|
||||||
|
# configured to proceed without this single disk.
|
||||||
|
#
|
||||||
|
# When a hung I/O is detected by the kmods it will be posted as a deadman
|
||||||
|
# event. By default an I/O is considered to be hung after 5 minutes. This
|
||||||
|
# value can be changed with the zfs_deadman_ziotime_ms module parameter.
|
||||||
|
# If ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN is set the disk's enclosure
|
||||||
|
# slot will be powered off causing the outstanding I/O to fail. The ZED
|
||||||
|
# will then handle this like a normal disk failure and FAULT the vdev.
|
||||||
|
#
|
||||||
|
# We assume the user will be responsible for turning the slot back on
|
||||||
|
# after replacing the disk.
|
||||||
|
#
|
||||||
|
# Note that this script requires that your enclosure be supported by the
|
||||||
|
# Linux SCSI Enclosure services (SES) driver. The script will do nothing
|
||||||
|
# if you have no enclosure, or if your enclosure isn't supported.
|
||||||
|
#
|
||||||
|
# Exit codes:
|
||||||
|
# 0: slot successfully powered off
|
||||||
|
# 1: enclosure not available
|
||||||
|
# 2: ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN disabled
|
||||||
|
# 3: System not configured to wait on deadman
|
||||||
|
# 4: The enclosure sysfs path passed from ZFS does not exist
|
||||||
|
# 5: Enclosure slot didn't actually turn off after we told it to
|
||||||
|
|
||||||
|
# Load the optional zed.rc configuration when present, then zed-functions.sh.
if [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] ; then
	. "${ZED_ZEDLET_DIR}/zed.rc"
fi
. "${ZED_ZEDLET_DIR}/zed-functions.sh"

# Exit 1: no enclosure class in sysfs (no JBOD enclosure or NVMe slots).
[ -d /sys/class/enclosure ] || exit 1

# Exit 2: powering off the slot on a deadman event has not been enabled.
[ "${ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN}" = "1" ] || exit 2

# Exit 3: the failmode reported with the event is not "wait".
[ "$ZEVENT_POOL_FAILMODE" = "wait" ] || exit 3

# Exit 4: the enclosure sysfs path from the event has no power_status file.
[ -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] || exit 4
|
||||||
|
|
||||||
|
# Turn off the slot and wait for sysfs to report that the slot is off.
# It can take ~400ms on some enclosures and multiple retries may be needed.
#
# Up to 20 write attempts are made. After each write the reported status is
# polled up to 5 times, 0.1 seconds apart, before the write is retried.
for i in $(seq 1 20) ; do
	echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status"

	for j in $(seq 1 5) ; do
		# Compare with the POSIX '=' operator. '==' is not part of POSIX
		# test(1), and a /bin/sh that rejects it would never take this
		# early exit. The path is quoted so it is passed to cat as a
		# single word.
		if [ "$(cat "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status")" = "off" ] ; then
			break 2
		fi
		sleep 0.1
	done
done

# Re-read the status once more: exit 5 if the slot never reported "off".
if [ "$(cat "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status")" != "off" ] ; then
	exit 5
fi

zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH"
|
|
@ -148,6 +148,13 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event"
|
||||||
# supports slot power control via sysfs.
|
# supports slot power control via sysfs.
|
||||||
#ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT=1
|
#ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT=1
|
||||||
|
|
||||||
|
##
|
||||||
|
# Power off the drive's slot in the enclosure if there is a hung I/O which
|
||||||
|
# exceeds the deadman timeout. This can help prevent a single misbehaving
|
||||||
|
# drive from rendering a redundant pool unavailable. This assumes your drive
|
||||||
|
# enclosure fully supports slot power control via sysfs.
|
||||||
|
#ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN=1
|
||||||
|
|
||||||
##
|
##
|
||||||
# Ntfy topic
|
# Ntfy topic
|
||||||
# This defines which topic will receive the ntfy notification.
|
# This defines which topic will receive the ntfy notification.
|
||||||
|
|
|
@ -906,6 +906,13 @@ Historically used for controlling what reporting was available under
|
||||||
.Pa /proc/spl/kstat/zfs .
|
.Pa /proc/spl/kstat/zfs .
|
||||||
No effect.
|
No effect.
|
||||||
.
|
.
|
||||||
|
.It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1 min Pc Pq u64
|
||||||
|
Check time in milliseconds.
|
||||||
|
This defines the frequency at which we check for hung I/O requests
|
||||||
|
and potentially invoke the
|
||||||
|
.Sy zfs_deadman_failmode
|
||||||
|
behavior.
|
||||||
|
.
|
||||||
.It Sy zfs_deadman_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
|
.It Sy zfs_deadman_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
|
||||||
When a pool sync operation takes longer than
|
When a pool sync operation takes longer than
|
||||||
.Sy zfs_deadman_synctime_ms ,
|
.Sy zfs_deadman_synctime_ms ,
|
||||||
|
@ -921,6 +928,10 @@ By default, the deadman is enabled and set to
|
||||||
which results in "hung" I/O operations only being logged.
|
which results in "hung" I/O operations only being logged.
|
||||||
The deadman is automatically disabled when a pool gets suspended.
|
The deadman is automatically disabled when a pool gets suspended.
|
||||||
.
|
.
|
||||||
|
.It Sy zfs_deadman_events_per_second Ns = Ns Sy 1 Ns /s Pq int
|
||||||
|
Rate limit deadman zevents (which report hung I/O operations) to this many per
|
||||||
|
second.
|
||||||
|
.
|
||||||
.It Sy zfs_deadman_failmode Ns = Ns Sy wait Pq charp
|
.It Sy zfs_deadman_failmode Ns = Ns Sy wait Pq charp
|
||||||
Controls the failure behavior when the deadman detects a "hung" I/O operation.
|
Controls the failure behavior when the deadman detects a "hung" I/O operation.
|
||||||
Valid values are:
|
Valid values are:
|
||||||
|
@ -938,13 +949,6 @@ This can be used to facilitate automatic fail-over
|
||||||
to a properly configured fail-over partner.
|
to a properly configured fail-over partner.
|
||||||
.El
|
.El
|
||||||
.
|
.
|
||||||
.It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1 min Pc Pq u64
|
|
||||||
Check time in milliseconds.
|
|
||||||
This defines the frequency at which we check for hung I/O requests
|
|
||||||
and potentially invoke the
|
|
||||||
.Sy zfs_deadman_failmode
|
|
||||||
behavior.
|
|
||||||
.
|
|
||||||
.It Sy zfs_deadman_synctime_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq u64
|
.It Sy zfs_deadman_synctime_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq u64
|
||||||
Interval in milliseconds after which the deadman is triggered and also
|
Interval in milliseconds after which the deadman is triggered and also
|
||||||
the interval after which a pool sync operation is considered to be "hung".
|
the interval after which a pool sync operation is considered to be "hung".
|
||||||
|
@ -1002,8 +1006,7 @@ will result in objects waiting when there is not actually contention on the
|
||||||
same object.
|
same object.
|
||||||
.
|
.
|
||||||
.It Sy zfs_slow_io_events_per_second Ns = Ns Sy 20 Ns /s Pq int
|
.It Sy zfs_slow_io_events_per_second Ns = Ns Sy 20 Ns /s Pq int
|
||||||
Rate limit delay and deadman zevents (which report slow I/O operations) to this
|
Rate limit delay zevents (which report slow I/O operations) to this many per
|
||||||
many per
|
|
||||||
second.
|
second.
|
||||||
.
|
.
|
||||||
.It Sy zfs_unflushed_max_mem_amt Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64
|
.It Sy zfs_unflushed_max_mem_amt Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64
|
||||||
|
|
|
@ -112,6 +112,11 @@ int zfs_vdev_dtl_sm_blksz = (1 << 12);
|
||||||
*/
|
*/
|
||||||
static unsigned int zfs_slow_io_events_per_second = 20;
|
static unsigned int zfs_slow_io_events_per_second = 20;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Rate limit deadman "hung IO" events to this many per second.
|
||||||
|
*/
|
||||||
|
static unsigned int zfs_deadman_events_per_second = 1;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Rate limit checksum events after this many checksum errors per second.
|
* Rate limit checksum events after this many checksum errors per second.
|
||||||
*/
|
*/
|
||||||
|
@ -666,7 +671,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
||||||
*/
|
*/
|
||||||
zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
|
zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
|
||||||
1);
|
1);
|
||||||
zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second,
|
zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second,
|
||||||
1);
|
1);
|
||||||
zfs_ratelimit_init(&vd->vdev_checksum_rl,
|
zfs_ratelimit_init(&vd->vdev_checksum_rl,
|
||||||
&zfs_checksum_events_per_second, 1);
|
&zfs_checksum_events_per_second, 1);
|
||||||
|
@ -6476,6 +6481,9 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW,
|
||||||
ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
|
ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
|
||||||
"Rate limit slow IO (delay) events to this many per second");
|
"Rate limit slow IO (delay) events to this many per second");
|
||||||
|
|
||||||
|
ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW,
|
||||||
|
"Rate limit hung IO (deadman) events to this many per second");
|
||||||
|
|
||||||
/* BEGIN CSTYLED */
|
/* BEGIN CSTYLED */
|
||||||
ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
|
ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
|
||||||
"Rate limit checksum events to this many checksum errors per second "
|
"Rate limit checksum events to this many checksum errors per second "
|
||||||
|
|
|
@ -29,6 +29,7 @@ CONDENSE_INDIRECT_OBSOLETE_PCT condense.indirect_obsolete_pct zfs_condense_indir
|
||||||
CONDENSE_MIN_MAPPING_BYTES condense.min_mapping_bytes zfs_condense_min_mapping_bytes
|
CONDENSE_MIN_MAPPING_BYTES condense.min_mapping_bytes zfs_condense_min_mapping_bytes
|
||||||
DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift
|
DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift
|
||||||
DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms
|
DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms
|
||||||
|
DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second
|
||||||
DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode
|
DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode
|
||||||
DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms
|
DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms
|
||||||
DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms
|
DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms
|
||||||
|
|
|
@ -28,7 +28,7 @@
|
||||||
# Verify spa deadman events are rate limited
|
# Verify spa deadman events are rate limited
|
||||||
#
|
#
|
||||||
# STRATEGY:
|
# STRATEGY:
|
||||||
# 1. Reduce the zfs_slow_io_events_per_second to 1.
|
# 1. Reduce the zfs_deadman_events_per_second to 1.
|
||||||
# 2. Reduce the zfs_deadman_ziotime_ms to 1ms.
|
# 2. Reduce the zfs_deadman_ziotime_ms to 1ms.
|
||||||
# 3. Write data to a pool and read it back.
|
# 3. Write data to a pool and read it back.
|
||||||
# 4. Verify deadman events have been produced at a reasonable rate.
|
# 4. Verify deadman events have been produced at a reasonable rate.
|
||||||
|
@ -44,15 +44,15 @@ function cleanup
|
||||||
zinject -c all
|
zinject -c all
|
||||||
default_cleanup_noexit
|
default_cleanup_noexit
|
||||||
|
|
||||||
set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
|
set_tunable64 DEADMAN_EVENTS_PER_SECOND $OLD_DEADMAN_EVENTS
|
||||||
set_tunable64 DEADMAN_ZIOTIME_MS $ZIOTIME_DEFAULT
|
set_tunable64 DEADMAN_ZIOTIME_MS $ZIOTIME_DEFAULT
|
||||||
}
|
}
|
||||||
|
|
||||||
log_assert "Verify spa deadman events are rate limited"
|
log_assert "Verify spa deadman events are rate limited"
|
||||||
log_onexit cleanup
|
log_onexit cleanup
|
||||||
|
|
||||||
OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND)
|
OLD_DEADMAN_EVENTS=$(get_tunable DEADMAN_EVENTS_PER_SECOND)
|
||||||
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1
|
log_must set_tunable64 DEADMAN_EVENTS_PER_SECOND 1
|
||||||
log_must set_tunable64 DEADMAN_ZIOTIME_MS 1
|
log_must set_tunable64 DEADMAN_ZIOTIME_MS 1
|
||||||
|
|
||||||
# Create a new pool in order to use the updated deadman settings.
|
# Create a new pool in order to use the updated deadman settings.
|
||||||
|
|
Loading…
Reference in New Issue