diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am index b907f6af9b..8bdc097c77 100644 --- a/cmd/zed/Makefile.am +++ b/cmd/zed/Makefile.am @@ -4,6 +4,8 @@ DEFAULT_INCLUDES += \ -I$(top_srcdir)/include \ -I$(top_srcdir)/lib/libspl/include +EXTRA_DIST = $(top_srcdir)/cmd/zed/zed.d/README + sbin_PROGRAMS = zed zed_SOURCES = \ @@ -33,6 +35,7 @@ zed_LDADD = \ zedconfdir = $(sysconfdir)/zfs/zed.d dist_zedconf_DATA = \ + $(top_srcdir)/cmd/zed/zed.d/zed-functions.sh \ $(top_srcdir)/cmd/zed/zed.d/zed.rc zedexecdir = $(libexecdir)/zfs/zed.d diff --git a/cmd/zed/zed.d/README b/cmd/zed/zed.d/README new file mode 100644 index 0000000000..b4cb115143 --- /dev/null +++ b/cmd/zed/zed.d/README @@ -0,0 +1,30 @@ +Shell scripts are the recommended choice for ZEDLETs that mostly call +other utilities and do relatively little data manipulation. + +Shell scripts MUST work on both bash and dash. + +Shell scripts MUST run cleanly through ShellCheck: + http://www.shellcheck.net/ + +General functions reside in "zed-functions.sh". Use them where applicable. + +Additional references that may be of use: + + Google Shell Style Guide + https://google-styleguide.googlecode.com/svn/trunk/shell.xml + + Dash as /bin/sh + https://wiki.ubuntu.com/DashAsBinSh + + Common shell script mistakes + http://www.pixelbeat.org/programming/shell_script_mistakes.html + + Filenames and Pathnames in Shell: How to do it Correctly + http://www.dwheeler.com/essays/filenames-in-shell.html + + Autoconf: Portable Shell Programming + https://www.gnu.org/software/autoconf/manual/autoconf.html#Portable-Shell + +Please BE CONSISTENT with the existing style, check for errors, +minimize dependencies where possible, try to be portable, +and comment anything non-obvious. Festina lente. diff --git a/cmd/zed/zed.d/all-debug.sh b/cmd/zed/zed.d/all-debug.sh index aa20ef2686..057e39b504 100755 --- a/cmd/zed/zed.d/all-debug.sh +++ b/cmd/zed/zed.d/all-debug.sh @@ -2,16 +2,23 @@ # # Log all environment variables to ZED_DEBUG_LOG. # -test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc" +# This can be a useful aid when developing/debugging ZEDLETs since it shows the +# environment variables defined for each zevent. + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +: "${ZED_DEBUG_LOG:="${TMPDIR:="/tmp"}/zed.debug.log"}" + +lockfile="$(basename -- "${ZED_DEBUG_LOG}").lock" -# Override the default umask to restrict access to a newly-created logfile. umask 077 - -# Append stdout to the logfile after obtaining an advisory lock. -exec >> "${ZED_DEBUG_LOG:=/tmp/zed.debug.log}" -flock -x 1 +zed_lock "${lockfile}" +exec >> "${ZED_DEBUG_LOG}" printenv | sort echo +exec >&- +zed_unlock "${lockfile}" exit 0 diff --git a/cmd/zed/zed.d/all-syslog.sh b/cmd/zed/zed.d/all-syslog.sh index acf9e83bde..b34d17cef1 100755 --- a/cmd/zed/zed.d/all-syslog.sh +++ b/cmd/zed/zed.d/all-syslog.sh @@ -1,11 +1,10 @@ #!/bin/sh # # Log the zevent via syslog. -# -test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc" -logger -t "${ZED_SYSLOG_TAG:=zed}" -p "${ZED_SYSLOG_PRIORITY:=daemon.notice}" \ - eid="${ZEVENT_EID}" class="${ZEVENT_SUBCLASS}" \ - "${ZEVENT_POOL:+pool=$ZEVENT_POOL}" +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" +zed_log_msg "eid=${ZEVENT_EID}" "class=${ZEVENT_SUBCLASS}" \ + "${ZEVENT_POOL:+"pool=${ZEVENT_POOL}"}" exit 0 diff --git a/cmd/zed/zed.d/data-email.sh b/cmd/zed/zed.d/data-email.sh index 543b8fe55c..2dae8ff6b4 100755 --- a/cmd/zed/zed.d/data-email.sh +++ b/cmd/zed/zed.d/data-email.sh @@ -1,81 +1,53 @@ #!/bin/sh # -# Send email to ZED_EMAIL in response to a DATA zevent. -# Only one message per ZED_EMAIL_INTERVAL_SECS will be sent for a given -# class/pool combination. This protects against spamming the recipient -# should multiple events occur together in time for the same pool. +# Send email to ZED_EMAIL in response to a DATA error. +# +# Only one email per ZED_EMAIL_INTERVAL_SECS will be sent for a given +# class/pool combination. This protects against spamming the recipient +# should multiple events occur together in time for the same pool. +# # Exit codes: # 0: email sent # 1: email failed -# 2: email suppressed -# 3: missing executable -# 4: unsupported event class -# 5: internal error -# State File Format: -# POOL;TIME_OF_LAST_EMAIL -# -test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc" +# 2: email not configured +# 3: email suppressed +# 9: internal error -test -n "${ZEVENT_POOL}" || exit 5 -test -n "${ZEVENT_SUBCLASS}" || exit 5 +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" -if test "${ZEVENT_SUBCLASS}" != "data"; then \ - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: unsupported event class \"${ZEVENT_SUBCLASS}\" - exit 4 +[ -n "${ZED_EMAIL}" ] || exit 2 + +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 + +if [ "${ZEVENT_SUBCLASS}" != "data" ]; then \ + zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" + exit 9 fi -# Only send email if ZED_EMAIL has been configured. -test -n "${ZED_EMAIL}" || exit 2 +zed_check_cmd "mail" || exit 9 -# Ensure requisite executables are installed. -if ! command -v "${MAIL:=mail}" >/dev/null 2>&1; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${MAIL}" not installed - exit 3 -fi +zed_rate_limit "${ZEVENT_POOL};${ZEVENT_SUBCLASS};email" || exit 3 -NAME="zed.${ZEVENT_SUBCLASS}.email" -LOCKFILE="${ZED_LOCKDIR:=/var/lock}/${NAME}.lock" -STATEFILE="${ZED_RUNDIR:=/var/run}/${NAME}.state" - -# Obtain lock to ensure mutual exclusion for accessing state. -exec 8> "${LOCKFILE}" -flock -x 8 - -# Query state for last time email was sent for this pool. -TIME_NOW=`date +%s` -TIME_LAST=`egrep "^${ZEVENT_POOL};" "${STATEFILE}" 2>/dev/null | cut -d ";" -f2` -if test -n "${TIME_LAST}"; then - TIME_DELTA=`expr "${TIME_NOW}" - "${TIME_LAST}"` - if test "${TIME_DELTA}" -lt "${ZED_EMAIL_INTERVAL_SECS:=3600}"; then - exit 2 - fi -fi - -"${MAIL}" -s "ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on `hostname`" \ - "${ZED_EMAIL}" < "${email_pathname}" </dev/null > "${STATEFILE}.$$" -echo "${ZEVENT_POOL};${TIME_NOW}" >> "${STATEFILE}.$$" -mv -f "${STATEFILE}.$$" "${STATEFILE}" +mail -s "${email_subject}" "${ZED_EMAIL}" < "${email_pathname}" +mail_status=$? -if test "${MAIL_STATUS}" -ne 0; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${MAIL}" exit="${MAIL_STATUS}" - exit 1 +if [ "${mail_status}" -ne 0 ]; then + zed_log_msg "mail exit=${mail_status}" + exit 1 fi - +rm -f "${email_pathname}" exit 0 diff --git a/cmd/zed/zed.d/generic-email.sh b/cmd/zed/zed.d/generic-email.sh index 357aedee5f..ad022e0343 100755 --- a/cmd/zed/zed.d/generic-email.sh +++ b/cmd/zed/zed.d/generic-email.sh @@ -1,59 +1,59 @@ #!/bin/sh # # Send email to ZED_EMAIL in response to a given zevent. -# This is a generic script than can be symlinked to a file in the zed -# enabled-scripts directory in order to have email sent when a particular -# class of zevents occurs. The symlink filename must begin with the zevent -# (sub)class string (eg, "probe_failure-email.sh" for the "probe_failure" -# subclass). Refer to the zed(8) manpage for details. +# +# This is a generic script than can be symlinked to a file in the +# enabled-zedlets directory to have an email sent when a particular class of +# zevents occurs. The symlink filename must begin with the zevent (sub)class +# string (e.g., "probe_failure-email.sh" for the "probe_failure" subclass). +# Refer to the zed(8) manpage for details. +# # Exit codes: # 0: email sent # 1: email failed -# 2: email suppressed -# 3: missing executable +# 2: email not configured +# 3: email suppressed + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZED_EMAIL}" ] || exit 2 + +# Rate-limit the message based in part on the filename. # -test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc" +rate_limit_tag="${ZEVENT_POOL};${ZEVENT_SUBCLASS};$(basename -- "$0")" +rate_limit_interval="${ZED_EMAIL_INTERVAL_SECS}" +zed_rate_limit "${rate_limit_tag}" "${rate_limit_interval}" || exit 3 -# Only send email if ZED_EMAIL has been configured. -test -n "${ZED_EMAIL}" || exit 2 - -# Ensure requisite executables are installed. -if ! command -v "${MAIL:=mail}" >/dev/null 2>&1; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${MAIL}" not installed - exit 3 -fi - -# Override the default umask to restrict access to the msgbody tmpfile. umask 077 - -SUBJECT="ZFS ${ZEVENT_SUBCLASS} event" -test -n "${ZEVENT_POOL}" && SUBJECT="${SUBJECT} for ${ZEVENT_POOL}" -SUBJECT="${SUBJECT} on `hostname`" - -MSGBODY="${TMPDIR:=/tmp}/`basename \"$0\"`.$$" +pool_str="${ZEVENT_POOL:+" for ${ZEVENT_POOL}"}" +host_str=" on $(hostname)" +email_subject="ZFS ${ZEVENT_SUBCLASS} event${pool_str}${host_str}" +email_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" { - echo "A ZFS ${ZEVENT_SUBCLASS} event has been posted:" - echo - echo " eid: ${ZEVENT_EID}" - echo " host: `hostname`" - echo " time: ${ZEVENT_TIME_STRING}" - test -n "${ZEVENT_VDEV_TYPE}" -a -n "${ZEVENT_VDEV_PATH}" && \ - echo " vdev: ${ZEVENT_VDEV_TYPE}:${ZEVENT_VDEV_PATH}" - test -n "${ZEVENT_POOL}" -a -x "${ZPOOL}" && \ - "${ZPOOL}" status "${ZEVENT_POOL}" -} > "${MSGBODY}" + echo "ZFS has posted the following event:" + echo + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" -test -f "${MSGBODY}" && "${MAIL}" -s "${SUBJECT}" "${ZED_EMAIL}" < "${MSGBODY}" -MAIL_STATUS=$? -rm -f "${MSGBODY}" + if [ -n "${ZEVENT_VDEV_PATH}" ]; then + echo " vpath: ${ZEVENT_VDEV_PATH}" + [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}" + fi -if test "${MAIL_STATUS}" -ne 0; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${MAIL}" exit="${MAIL_STATUS}" - exit 1 + [ -n "${ZEVENT_POOL}" ] && [ -x "${ZPOOL}" ] \ + && "${ZPOOL}" status "${ZEVENT_POOL}" + +} > "${email_pathname}" + +mail -s "${email_subject}" "${ZED_EMAIL}" < "${email_pathname}" +mail_status=$? + +if [ "${mail_status}" -ne 0 ]; then + zed_log_msg "mail exit=${mail_status}" + exit 1 fi - +rm -f "${email_pathname}" exit 0 diff --git a/cmd/zed/zed.d/io-email.sh b/cmd/zed/zed.d/io-email.sh index 9edbe6670d..1854b15933 100755 --- a/cmd/zed/zed.d/io-email.sh +++ b/cmd/zed/zed.d/io-email.sh @@ -1,86 +1,57 @@ #!/bin/sh # -# Send email to ZED_EMAIL in response to a CHECKSUM or IO zevent. -# Only one message per ZED_EMAIL_INTERVAL_SECS will be sent for a given -# class/pool/vdev combination. This protects against spamming the recipient -# should multiple events occur together in time for the same pool/device. +# Send email to ZED_EMAIL in response to a CHECKSUM or IO error. +# +# Only one email per ZED_EMAIL_INTERVAL_SECS will be sent for a given +# class/pool/vdev combination. This protects against spamming the recipient +# should multiple events occur together in time for the same pool/device. +# # Exit codes: # 0: email sent # 1: email failed -# 2: email suppressed -# 3: missing executable -# 4: unsupported event class -# 5: internal error -# State File Format: -# POOL;VDEV_PATH;TIME_OF_LAST_EMAIL -# -test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc" +# 2: email not configured +# 3: email suppressed +# 9: internal error -test -n "${ZEVENT_POOL}" || exit 5 -test -n "${ZEVENT_SUBCLASS}" || exit 5 -test -n "${ZEVENT_VDEV_PATH}" || exit 5 +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" -if test "${ZEVENT_SUBCLASS}" != "checksum" \ - -a "${ZEVENT_SUBCLASS}" != "io"; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: unsupported event class \"${ZEVENT_SUBCLASS}\" - exit 4 +[ -n "${ZED_EMAIL}" ] || exit 2 + +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 +[ -n "${ZEVENT_VDEV_PATH}" ] || exit 9 + +if [ "${ZEVENT_SUBCLASS}" != "checksum" ] \ + && [ "${ZEVENT_SUBCLASS}" != "io" ]; then + zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" + exit 9 fi -# Only send email if ZED_EMAIL has been configured. -test -n "${ZED_EMAIL}" || exit 2 +zed_check_cmd "mail" || exit 9 -# Ensure requisite executables are installed. -if ! command -v "${MAIL:=mail}" >/dev/null 2>&1; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${MAIL}" not installed - exit 3 -fi +zed_rate_limit "${ZEVENT_POOL};${ZEVENT_VDEV_PATH};${ZEVENT_SUBCLASS};email" \ + || exit 3 -NAME="zed.${ZEVENT_SUBCLASS}.email" -LOCKFILE="${ZED_LOCKDIR:=/var/lock}/${NAME}.lock" -STATEFILE="${ZED_RUNDIR:=/var/run}/${NAME}.state" - -# Obtain lock to ensure mutual exclusion for accessing state. -exec 8> "${LOCKFILE}" -flock -x 8 - -# Query state for last time email was sent for this pool/vdev. -TIME_NOW=`date +%s` -TIME_LAST=`egrep "^${ZEVENT_POOL};${ZEVENT_VDEV_PATH};" "${STATEFILE}" \ - 2>/dev/null | cut -d ";" -f3` -if test -n "${TIME_LAST}"; then - TIME_DELTA=`expr "${TIME_NOW}" - "${TIME_LAST}"` - if test "${TIME_DELTA}" -lt "${ZED_EMAIL_INTERVAL_SECS:=3600}"; then - exit 2 - fi -fi - -"${MAIL}" -s "ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on `hostname`" \ - "${ZED_EMAIL}" < "${email_pathname}" </dev/null > "${STATEFILE}.$$" -echo "${ZEVENT_POOL};${ZEVENT_VDEV_PATH};${TIME_NOW}" >> "${STATEFILE}.$$" -mv -f "${STATEFILE}.$$" "${STATEFILE}" +mail -s "${email_subject}" "${ZED_EMAIL}" < "${email_pathname}" +mail_status=$? -if test "${MAIL_STATUS}" -ne 0; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${MAIL}" exit="${MAIL_STATUS}" - exit 1 +if [ "${mail_status}" -ne 0 ]; then + zed_log_msg "mail exit=${mail_status}" + exit 1 fi - +rm -f "${email_pathname}" exit 0 diff --git a/cmd/zed/zed.d/io-spare.sh b/cmd/zed/zed.d/io-spare.sh index b64b2a9f11..9667dedcb7 100755 --- a/cmd/zed/zed.d/io-spare.sh +++ b/cmd/zed/zed.d/io-spare.sh @@ -1,6 +1,6 @@ #!/bin/sh # -# Replace a device with a hot spare in response to IO or checksum errors. +# Replace a device with a hot spare in response to IO or CHECKSUM errors. # The following actions will be performed automatically when the number # of errors exceed the limit set by ZED_SPARE_ON_IO_ERRORS or # ZED_SPARE_ON_CHECKSUM_ERRORS. @@ -21,106 +21,171 @@ # the majority of the expected hot spare functionality. # # Exit codes: -# 0: replaced by hot spare -# 1: no hot spare device available -# 2: hot sparing disabled -# 3: already faulted or degraded -# 4: unsupported event class -# 5: internal error +# 0: hot spare replacement successful +# 1: hot spare device not available +# 2: hot sparing disabled or threshold not reached +# 3: device already faulted or degraded +# 9: internal error + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +# Disabled by default. Enable in the zed.rc file. +: "${ZED_SPARE_ON_CHECKSUM_ERRORS:=0}" +: "${ZED_SPARE_ON_IO_ERRORS:=0}" + + +# query_vdev_status (pool, vdev) # -test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc" +# Given a [pool] and [vdev], return the matching vdev path & status on stdout. +# +# Warning: This function does not handle the case of [pool] or [vdev] +# containing whitespace. Beware of ShellCheck SC2046. Caveat emptor. +# +# Arguments +# pool: pool name +# vdev: virtual device name +# +# StdOut +# arg1: vdev pathname +# arg2: vdev status +# +query_vdev_status() +{ + local pool="$1" + local vdev="$2" + local t -test -n "${ZEVENT_POOL}" || exit 5 -test -n "${ZEVENT_SUBCLASS}" || exit 5 -test -n "${ZEVENT_VDEV_PATH}" || exit 5 -test -n "${ZEVENT_VDEV_GUID}" || exit 5 + vdev="$(basename -- "${vdev}")" + ([ -n "${pool}" ] && [ -n "${vdev}" ]) || return + t="$(printf '\t')" -# Defaults to disabled, enable in the zed.rc file. -ZED_SPARE_ON_IO_ERRORS=${ZED_SPARE_ON_IO_ERRORS:-0} -ZED_SPARE_ON_CHECKSUM_ERRORS=${ZED_SPARE_ON_CHECKSUM_ERRORS:-0} - -if [ ${ZED_SPARE_ON_IO_ERRORS} -eq 0 -a \ - ${ZED_SPARE_ON_CHECKSUM_ERRORS} -eq 0 ]; then - exit 2 -fi - -# A lock file is used to serialize execution. -ZED_LOCKDIR=${ZED_LOCKDIR:-/var/lock} -LOCKFILE="${ZED_LOCKDIR}/zed.spare.lock" - -exec 8> "${LOCKFILE}" -flock -x 8 - -# Given a and return the status, (ONLINE, FAULTED, etc...). -vdev_status() { - local POOL=$1 - local VDEV=`basename $2` - local T=' ' # tab character since '\t' isn't portable - - ${ZPOOL} status ${POOL} | sed -n -e \ - "s,^[ $T]*\(.*$VDEV\(-part[0-9]\+\)\?\)[ $T]*\([A-Z]\+\).*,\1 \3,p" - return 0 + "${ZPOOL}" status "${pool}" 2>/dev/null | sed -n -e \ + "s,^[ $t]*\(.*${vdev}\(-part[0-9]\+\)\?\)[ $t]*\([A-Z]\+\).*,\1 \3,p" \ + | tail -1 } -# Fault devices after N I/O errors. -if [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.io" ]; then - ERRORS=`expr ${ZEVENT_VDEV_READ_ERRORS} + ${ZEVENT_VDEV_WRITE_ERRORS}` - if [ ${ZED_SPARE_ON_IO_ERRORS} -gt 0 -a \ - ${ERRORS} -ge ${ZED_SPARE_ON_IO_ERRORS} ]; then - ACTION="fault" - fi -# Degrade devices after N checksum errors. -elif [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.checksum" ]; then - ERRORS=${ZEVENT_VDEV_CKSUM_ERRORS} +# main +# +# Arguments +# none +# +# Return +# see above +# +main() +{ + local num_errors + local action + local lockfile + local vdev_path + local vdev_status + local spare + local zpool_err + local zpool_rv + local rv - if [ ${ZED_SPARE_ON_CHECKSUM_ERRORS} -gt 0 -a \ - ${ERRORS} -ge ${ZED_SPARE_ON_CHECKSUM_ERRORS} ]; then - ACTION="degrade" - fi -else - ACTION= -fi + # Avoid hot-sparing a hot-spare. + # + # Note: ZEVENT_VDEV_PATH is not defined for ZEVENT_VDEV_TYPE=spare. + # + [ "${ZEVENT_VDEV_TYPE}" = "spare" ] && exit 2 -if [ -n "${ACTION}" ]; then + [ -n "${ZEVENT_POOL}" ] || exit 9 + [ -n "${ZEVENT_VDEV_GUID}" ] || exit 9 + [ -n "${ZEVENT_VDEV_PATH}" ] || exit 9 - # Device is already FAULTED or DEGRADED - set -- `vdev_status ${ZEVENT_POOL} ${ZEVENT_VDEV_PATH}` - ZEVENT_VDEV_PATH_FOUND=$1 - STATUS=$2 - if [ "${STATUS}" = "FAULTED" -o "${STATUS}" = "DEGRADED" ]; then - exit 3 - fi + zed_check_cmd "${ZPOOL}" "${ZINJECT}" || exit 9 - # Step 1) FAULT or DEGRADE the device - # - ${ZINJECT} -d ${ZEVENT_VDEV_GUID} -A ${ACTION} ${ZEVENT_POOL} + # Fault the device after a given number of I/O errors. + # + if [ "${ZEVENT_SUBCLASS}" = "io" ]; then + if [ "${ZED_SPARE_ON_IO_ERRORS}" -gt 0 ]; then + num_errors=$((ZEVENT_VDEV_READ_ERRORS + ZEVENT_VDEV_WRITE_ERRORS)) + [ "${num_errors}" -ge "${ZED_SPARE_ON_IO_ERRORS}" ] \ + && action="fault" + fi 2>/dev/null - # Step 2) Set the SES fault beacon. - # - # XXX: Set the 'fault' or 'ident' beacon for the device. This can - # be done through the sg_ses utility, the only hard part is to map - # the sd device to its corresponding enclosure and slot. We may - # be able to leverage the existing vdev_id scripts for this. - # - # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3 - # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3 + # Degrade the device after a given number of checksum errors. + # + elif [ "${ZEVENT_SUBCLASS}" = "checksum" ]; then + if [ "${ZED_SPARE_ON_CHECKSUM_ERRORS}" -gt 0 ]; then + num_errors="${ZEVENT_VDEV_CKSUM_ERRORS}" + [ "${num_errors}" -ge "${ZED_SPARE_ON_CHECKSUM_ERRORS}" ] \ + && action="degrade" + fi 2>/dev/null - # Step 3) Replace the device with a hot spare. - # - # Round robin through the spares selecting those which are available. - # - for SPARE in ${ZEVENT_VDEV_SPARE_PATHS}; do - set -- `vdev_status ${ZEVENT_POOL} ${SPARE}` - SPARE_VDEV_FOUND=$1 - STATUS=$2 - if [ "${STATUS}" = "AVAIL" ]; then - ${ZPOOL} replace ${ZEVENT_POOL} \ - ${ZEVENT_VDEV_GUID} ${SPARE_VDEV_FOUND} && exit 0 - fi - done + else + zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" + exit 9 + fi - exit 1 -fi + # Error threshold not reached. + # + if [ -z "${action}" ]; then + exit 2 + fi -exit 4 + lockfile="zed.spare.lock" + zed_lock "${lockfile}" + + # shellcheck disable=SC2046 + set -- $(query_vdev_status "${ZEVENT_POOL}" "${ZEVENT_VDEV_PATH}") + vdev_path="$1" + vdev_status="$2" + + # Device is already FAULTED or DEGRADED. + # + if [ "${vdev_status}" = "FAULTED" ] \ + || [ "${vdev_status}" = "DEGRADED" ]; then + rv=3 + + else + rv=1 + + # 1) FAULT or DEGRADE the device. + # + "${ZINJECT}" -d "${ZEVENT_VDEV_GUID}" -A "${action}" "${ZEVENT_POOL}" + + # 2) Set the SES fault beacon. + # + # TODO: Set the 'fault' or 'ident' beacon for the device. This can + # be done through the sg_ses utility. The only hard part is to map + # the sd device to its corresponding enclosure and slot. We may + # be able to leverage the existing vdev_id scripts for this. + # + # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3 + # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3 + + # 3) Replace the device with a hot spare. + # + # Round-robin through the spares trying those that are available. + # + for spare in ${ZEVENT_VDEV_SPARE_PATHS}; do + + # shellcheck disable=SC2046 + set -- $(query_vdev_status "${ZEVENT_POOL}" "${spare}") + vdev_path="$1" + vdev_status="$2" + + [ "${vdev_status}" = "AVAIL" ] || continue + + zpool_err="$("${ZPOOL}" replace "${ZEVENT_POOL}" \ + "${ZEVENT_VDEV_GUID}" "${vdev_path}" 2>&1)"; zpool_rv=$? + + if [ "${zpool_rv}" -ne 0 ]; then + [ -n "${zpool_err}" ] && zed_log_err "zpool ${zpool_err}" + else + rv=0 + break + fi + done + fi + + zed_unlock "${lockfile}" + exit "${rv}" +} + + +main "$@" diff --git a/cmd/zed/zed.d/scrub.finish-email.sh b/cmd/zed/zed.d/scrub.finish-email.sh index d92ccfea12..4a8155caf0 100755 --- a/cmd/zed/zed.d/scrub.finish-email.sh +++ b/cmd/zed/zed.d/scrub.finish-email.sh @@ -1,73 +1,63 @@ #!/bin/sh # # Send email to ZED_EMAIL in response to a RESILVER.FINISH or SCRUB.FINISH. -# By default, "zpool status" output will only be included in the email for -# a scrub.finish zevent if the pool is not healthy; to always include its -# output, set ZED_EMAIL_VERBOSE=1. +# +# By default, "zpool status" output will only be included for a scrub.finish +# zevent if the pool is not healthy; to always include its output, set +# ZED_EMAIL_VERBOSE=1. +# # Exit codes: # 0: email sent # 1: email failed -# 2: email suppressed -# 3: missing executable -# 4: unsupported event class -# 5: internal error -# -test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc" +# 2: email not configured +# 3: email suppressed +# 9: internal error -test -n "${ZEVENT_POOL}" || exit 5 -test -n "${ZEVENT_SUBCLASS}" || exit 5 +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" -if test "${ZEVENT_SUBCLASS}" = "resilver.finish"; then - ACTION="resilvering" -elif test "${ZEVENT_SUBCLASS}" = "scrub.finish"; then - ACTION="scrubbing" +[ -n "${ZED_EMAIL}" ] || exit 2 + +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 + +if [ "${ZEVENT_SUBCLASS}" = "resilver.finish" ]; then + action="resilver" +elif [ "${ZEVENT_SUBCLASS}" = "scrub.finish" ]; then + action="scrub" else - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: unsupported event class \"${ZEVENT_SUBCLASS}\" - exit 4 + zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" + exit 9 fi -# Only send email if ZED_EMAIL has been configured. -test -n "${ZED_EMAIL}" || exit 2 - -# Ensure requisite executables are installed. -if ! command -v "${MAIL:=mail}" >/dev/null 2>&1; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${MAIL}" not installed - exit 3 -fi -if ! test -x "${ZPOOL}"; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${ZPOOL}" not installed - exit 3 -fi +zed_check_cmd "mail" "${ZPOOL}" || exit 9 # For scrub, suppress email if pool is healthy and verbosity is not enabled. -if test "${ZEVENT_SUBCLASS}" = "scrub.finish"; then - HEALTHY=`"${ZPOOL}" status -x "${ZEVENT_POOL}" | \ - grep "'${ZEVENT_POOL}' is healthy"` - test -n "${HEALTHY}" -a "${ZED_EMAIL_VERBOSE:=0}" = 0 && exit 2 +# +if [ "${ZEVENT_SUBCLASS}" = "scrub.finish" ]; then + healthy="$("${ZPOOL}" status -x "${ZEVENT_POOL}" \ + | grep "'${ZEVENT_POOL}' is healthy")" + [ -n "${healthy}" ] && [ "${ZED_EMAIL_VERBOSE}" -eq 0 ] && exit 3 fi -"${MAIL}" -s "ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on `hostname`" \ - "${ZED_EMAIL}" < "${email_pathname}" </dev/null 2>&1; then + zed_log_err "\"${cmd}\" not installed" + rv=$((rv + 1)) + fi + done + return "${rv}" +} + + +# zed_log_msg (msg, ...) +# +# Write all argument strings to the system log. +# +# Globals +# ZED_SYSLOG_PRIORITY +# ZED_SYSLOG_TAG +# +# Return +# nothing +# +zed_log_msg() +{ + logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "$@" +} + + +# zed_log_err (msg, ...) +# +# Write an error message to the system log. This message will contain the +# script name, EID, and all argument strings. +# +# Globals +# ZED_SYSLOG_PRIORITY +# ZED_SYSLOG_TAG +# ZEVENT_EID +# +# Return +# nothing +# +zed_log_err() +{ + logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "error:" \ + "$(basename -- "$0"):" "${ZEVENT_EID:+"eid=${ZEVENT_EID}:"}" "$@" +} + + +# zed_lock (lockfile, [fd]) +# +# Obtain an exclusive (write) lock on [lockfile]. If the lock cannot be +# immediately acquired, wait until it becomes available. +# +# Every zed_lock() must be paired with a corresponding zed_unlock(). +# +# By default, flock-style locks associate the lockfile with file descriptor 8. +# The bash manpage warns that file descriptors >9 should be used with care as +# they may conflict with file descriptors used internally by the shell. File +# descriptor 9 is reserved for zed_rate_limit(). If concurrent locks are held +# within the same process, they must use different file descriptors (preferably +# decrementing from 8); otherwise, obtaining a new lock with a given file +# descriptor will release the previous lock associated with that descriptor. +# +# Arguments +# lockfile: pathname of the lock file; the lock will be stored in +# ZED_LOCKDIR unless the pathname contains a "/". +# fd: integer for the file descriptor used by flock (OPTIONAL unless holding +# concurrent locks) +# +# Globals +# ZED_FLOCK_FD +# ZED_LOCKDIR +# +# Return +# nothing +# +zed_lock() +{ + local lockfile="$1" + local fd="${2:-${ZED_FLOCK_FD}}" + local umask_bak + local err + + [ -n "${lockfile}" ] || return + if ! expr "${lockfile}" : '.*/' >/dev/null 2>&1; then + lockfile="${ZED_LOCKDIR}/${lockfile}" + fi + + umask_bak="$(umask)" + umask 077 + + # Obtain a lock on the file bound to the given file descriptor. + # + eval "exec ${fd}> '${lockfile}'" + err="$(flock --exclusive "${fd}" 2>&1)" + if [ $? -ne 0 ]; then + zed_log_err "failed to lock \"${lockfile}\": ${err}" + fi + + umask "${umask_bak}" +} + + +# zed_unlock (lockfile, [fd]) +# +# Release the lock on [lockfile]. +# +# Arguments +# lockfile: pathname of the lock file +# fd: integer for the file descriptor used by flock (must match the file +# descriptor passed to the zed_lock function call) +# +# Globals +# ZED_FLOCK_FD +# ZED_LOCKDIR +# +# Return +# nothing +# +zed_unlock() +{ + local lockfile="$1" + local fd="${2:-${ZED_FLOCK_FD}}" + local err + + [ -n "${lockfile}" ] || return + if ! expr "${lockfile}" : '.*/' >/dev/null 2>&1; then + lockfile="${ZED_LOCKDIR}/${lockfile}" + fi + + # Release the lock and close the file descriptor. + # + err="$(flock --unlock "${fd}" 2>&1)" + if [ $? -ne 0 ]; then + zed_log_err "failed to unlock \"${lockfile}\": ${err}" + fi + eval "exec ${fd}>&-" +} + + +# zed_rate_limit (tag, [interval]) +# +# Check whether an event of a given type [tag] has already occurred within the +# last [interval] seconds. +# +# This function obtains a lock on the statefile using file descriptor 9. +# +# Arguments +# tag: arbitrary string for grouping related events to rate-limit +# interval: time interval in seconds (OPTIONAL) +# +# Globals +# ZED_EMAIL_INTERVAL_SECS +# ZED_RUNDIR +# +# Return +# 0 if the event should be processed +# 1 if the event should be dropped +# +# State File Format +# time;tag +# +zed_rate_limit() +{ + local tag="$1" + local interval="${2:-${ZED_EMAIL_INTERVAL_SECS}}" + local lockfile="zed.zedlet.state.lock" + local lockfile_fd=9 + local statefile="${ZED_RUNDIR}/zed.zedlet.state" + local time_now + local time_prev + local umask_bak + local rv=0 + + [ -n "${tag}" ] || return 0 + + zed_lock "${lockfile}" "${lockfile_fd}" + time_now="$(date +%s)" + time_prev="$(egrep "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \ + | tail -1 | cut -d\; -f1)" + + if [ -n "${time_prev}" ] \ + && [ "$((time_now - time_prev))" -lt "${interval}" ]; then + rv=1 + else + umask_bak="$(umask)" + umask 077 + egrep -v "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \ + > "${statefile}.$$" + echo "${time_now};${tag}" >> "${statefile}.$$" + mv -f "${statefile}.$$" "${statefile}" + umask "${umask_bak}" + fi + + zed_unlock "${lockfile}" "${lockfile_fd}" + return "${rv}" +} diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc index 69989f9531..4c53207d74 100644 --- a/cmd/zed/zed.d/zed.rc +++ b/cmd/zed/zed.d/zed.rc @@ -1,34 +1,60 @@ +## # zed.rc +## +## # Absolute path to the debug output file. +# #ZED_DEBUG_LOG="/tmp/zed.debug.log" +## # Email address of the zpool administrator. # Email will only be sent if ZED_EMAIL is defined. +# Disabled by default; uncomment to enable. +# #ZED_EMAIL="root" +## +# Minimum number of seconds between emails for a similar event. +# +#ZED_EMAIL_INTERVAL_SECS=3600 + +## # Email verbosity. # If set to 0, suppress email if the pool is healthy. # If set to 1, send email regardless of pool health. +# #ZED_EMAIL_VERBOSE=0 -# Minimum number of seconds between emails sent for a similar event. -#ZED_EMAIL_INTERVAL_SECS="3600" - +## # Default directory for zed lock files. +# #ZED_LOCKDIR="/var/lock" +## # Default directory for zed state files. +# #ZED_RUNDIR="/var/run" -# The syslog priority (eg, specified as a "facility.level" pair). -#ZED_SYSLOG_PRIORITY="daemon.notice" - -# The syslog tag for marking zed events. -#ZED_SYSLOG_TAG="zed" +## +# Replace a device with a hot spare after N checksum errors are detected. +# Disabled by default; uncomment to enable. +# +#ZED_SPARE_ON_CHECKSUM_ERRORS=10 +## # Replace a device with a hot spare after N I/O errors are detected. +# Disabled by default; uncomment to enable. +# #ZED_SPARE_ON_IO_ERRORS=1 -# Replace a device with a hot spare after N checksum errors are detected. -#ZED_SPARE_ON_CHECKSUM_ERRORS=10 +## +# The syslog priority (e.g., specified as a "facility.level" pair). +# +#ZED_SYSLOG_PRIORITY="daemon.notice" + +## +# The syslog tag for marking zed events. +# +#ZED_SYSLOG_TAG="zed" +