zfs/cmd/zed/zed.d/io-spare.sh

126 lines
3.8 KiB
Bash
Executable File

#!/bin/sh
#
# Replace a device with a hot spare in response to IO or checksum errors.
# The following actions will be performed automatically when the number
# of errors exceed the limit set by ZED_SPARE_ON_IO_ERRORS or
# ZED_SPARE_ON_CHECKSUM_ERRORS.
#
# 1) FAULT the device on IO errors, no futher IO will be attempted.
# DEGRADE the device on checksum errors, the device is still
# functional and can be used to service IO requests.
# 2) Set the SES fault beacon for the device.
# 3) Replace the device with a hot spare if any are available.
#
# Once the hot sparing operation is complete either the failed device or
# the hot spare must be manually retired using the 'zpool detach' command.
# The 'autoreplace' functionality which would normally take care of this
# under Illumos has not yet been implemented.
#
# Full support for autoreplace is planned, but it requires that the full
# ZFS Diagnosis Engine be ported. In the meanwhile this script provides
# the majority of the expected hot spare functionality.
#
# Exit codes:
# 0: replaced by hot spare
# 1: no hot spare device available
# 2: hot sparing disabled
# 3: already faulted or degraded
# 4: unsupported event class
# 5: internal error
#
test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc"
test -n "${ZEVENT_POOL}" || exit 5
test -n "${ZEVENT_SUBCLASS}" || exit 5
test -n "${ZEVENT_VDEV_PATH}" || exit 5
test -n "${ZEVENT_VDEV_GUID}" || exit 5
# Defaults to disabled, enable in the zed.rc file.
ZED_SPARE_ON_IO_ERRORS=${ZED_SPARE_ON_IO_ERRORS:-0}
ZED_SPARE_ON_CHECKSUM_ERRORS=${ZED_SPARE_ON_CHECKSUM_ERRORS:-0}
if [ ${ZED_SPARE_ON_IO_ERRORS} -eq 0 -a \
${ZED_SPARE_ON_CHECKSUM_ERRORS} -eq 0 ]; then
exit 2
fi
# A lock file is used to serialize execution.
ZED_LOCKDIR=${ZED_LOCKDIR:-/var/lock}
LOCKFILE="${ZED_LOCKDIR}/zed.spare.lock"
exec 8> "${LOCKFILE}"
flock -x 8
# Given a <pool> and <device> return the status, (ONLINE, FAULTED, etc...).
vdev_status() {
local POOL=$1
local VDEV=`basename $2`
${ZPOOL} status ${POOL} | \
awk -v pat="${VDEV}|${VDEV/-part?}" '$0 ~ pat { print $1" "$2 }'
return 0
}
# Fault devices after N I/O errors.
if [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.io" ]; then
ERRORS=`expr ${ZEVENT_VDEV_READ_ERRORS} + ${ZEVENT_VDEV_WRITE_ERRORS}`
if [ ${ZED_SPARE_ON_IO_ERRORS} -gt 0 -a \
${ERRORS} -ge ${ZED_SPARE_ON_IO_ERRORS} ]; then
ACTION="fault"
fi
# Degrade devices after N checksum errors.
elif [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.checksum" ]; then
ERRORS=${ZEVENT_VDEV_CKSUM_ERRORS}
if [ ${ZED_SPARE_ON_CHECKSUM_ERRORS} -gt 0 -a \
${ERRORS} -ge ${ZED_SPARE_ON_CHECKSUM_ERRORS} ]; then
ACTION="degrade"
fi
else
ACTION=
fi
if [ -n "${ACTION}" ]; then
# Device is already FAULTED or DEGRADED
set -- `vdev_status ${ZEVENT_POOL} ${ZEVENT_VDEV_PATH}`
ZEVENT_VDEV_PATH_FOUND=$1
STATUS=$2
if [ "${STATUS}" = "FAULTED" -o "${STATUS}" = "DEGRADED" ]; then
exit 3
fi
# Step 1) FAULT or DEGRADE the device
#
${ZINJECT} -d ${ZEVENT_VDEV_GUID} -A ${ACTION} ${ZEVENT_POOL}
# Step 2) Set the SES fault beacon.
#
# XXX: Set the 'fault' or 'ident' beacon for the device. This can
# be done through the sg_ses utility, the only hard part is to map
# the sd device to its corresponding enclosure and slot. We may
# be able to leverage the existing vdev_id scripts for this.
#
# $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3
# $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3
# Step 3) Replace the device with a hot spare.
#
# Round robin through the spares selecting those which are available.
#
for SPARE in ${ZEVENT_VDEV_SPARE_PATHS}; do
set -- `vdev_status ${ZEVENT_POOL} ${SPARE}`
SPARE_VDEV_FOUND=$1
STATUS=$2
if [ "${STATUS}" = "AVAIL" ]; then
${ZPOOL} replace ${ZEVENT_POOL} \
${ZEVENT_VDEV_GUID} ${SPARE_VDEV_FOUND} && exit 0
fi
done
exit 1
fi
exit 4