ZED: Do not offline a missing device if no spare is available

Due to commit d48091d a removed device is now explicitly offlined by
the ZED if no spare is available, rather than letting ZFS detect
it as UNAVAIL. This broke auto-replacing of whole-disk devices, as
described in issue #10577.  In short, when a new device is reinserted
in the same slot, the ZED will try to ONLINE it without letting ZFS
recreate the necessary partition table.
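
As an illustration, a minimal sketch of the failing sequence, reusing the
ZTS helpers (remove_disk, insert_disk, wait_vdev_state) and variable names
from the test below; the exact pool layout and the autoreplace setting are
assumptions:

    # Pool with a removable scsi_debug device and no spare configured.
    zpool create -f $TESTPOOL mirror $filedev1 $removedev
    zpool set autoreplace=on $TESTPOOL

    # Pull the disk: since d48091d the ZED explicitly OFFLINEs the vdev.
    remove_disk $removedev
    wait_vdev_state $TESTPOOL $removedev "OFFLINE"

    # Reinsert a blank disk in the same slot: the ZED ONLINEs the OFFLINE
    # vdev directly, so ZFS never recreates the whole-disk partition table
    # and the auto-replace fails.
    insert_disk $removedev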

This change simply avoids setting the device OFFLINE when it is removed
and no spare is available (or spare_on_remove is false).  The change has
been kept minimal so it can be backported to the 0.8.x releases.
The auto_offline_001_pos ZTS test has been updated accordingly.
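
In outline, the updated test now expects the vdev to be reported UNAVAIL
rather than OFFLINE after a removal with no usable spare (paraphrasing the
test diff below; mntpnt is the pool mountpoint fetched earlier in the test):

    remove_disk $removedev
    log_must mkfile 1m $mntpnt/file
    log_must zpool sync $TESTPOOL
    log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"   # was "OFFLINE"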

Some follow-up work is planned to update the ZED so it transitions the
vdev to a REMOVED state.  This state has always existed, but there is
currently no interface the ZED can use to set it, so that work is being
left to a follow-up PR.

Reviewed-by: Gionatan Danti <g.danti@assyoma.it>
Co-authored-by: Gionatan Danti <g.danti@assyoma.it>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10577
Closes #10730
Brian Behlendorf 2020-08-18 22:13:17 -07:00 committed by GitHub
parent cfd59f904b
commit 5266a0728a
2 changed files with 39 additions and 24 deletions

cmd/zed/agents/zfs_retire.c

@@ -351,9 +351,8 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
zpool_vdev_offline(zhp, devname, B_TRUE);
} else if (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
replace_with_spare(hdl, zhp, vdev) == B_FALSE) {
- /* Could not handle with spare: offline the device */
- fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname);
- zpool_vdev_offline(zhp, devname, B_TRUE);
+ /* Could not handle with spare */
+ fmd_hdl_debug(hdl, "no spare for '%s'", devname);
}
free(devname);

tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh

@@ -25,23 +25,29 @@
#
# DESCRIPTION:
# Testing Fault Management Agent ZED Logic - Physically removed device is
- # offlined and onlined when reattached
+ # made unavail and onlined when reattached
#
# STRATEGY:
# 1. Create a pool
# 2. Simulate physical removal of one device
- # 3. Verify the device is offlined
+ # 3. Verify the device is unavailable
# 4. Reattach the device
# 5. Verify the device is onlined
- # 6. Repeat the same tests with a spare device: zed will use the spare to handle
- #    the removed data device
- # 7. Repeat the same tests again with a faulted spare device: zed should offline
- #    the removed data device if no spare is available
+ # 6. Repeat the same tests with a spare device:
+ #    zed will use the spare to handle the removed data device
+ # 7. Repeat the same tests again with a faulted spare device:
+ #    the removed data device should be unavailable
#
# NOTE: the use of 'block_device_wait' throughout the test helps avoid race
# conditions caused by mixing creation/removal events from partitioning the
# disk (zpool create) and events from physically removing it (remove_disk).
#
+ # NOTE: the test relies on 'zpool sync' to prompt the kmods to transition a
+ # vdev to the unavailable state. The ZED does receive a removal notification
+ # but only relies on it to activate a hot spare. Additional work is planned
+ # to extend an existing ioctl interface to allow the ZED to transition the
+ # vdev into a removed state.
+ #
verify_runnable "both"
if is_linux; then
@@ -76,7 +82,6 @@ removedev=$(get_debug_device)
typeset poolconfs=(
"mirror $filedev1 $removedev"
"raidz3 $filedev1 $filedev2 $filedev3 $removedev"
"$filedev1 cache $removedev"
"mirror $filedev1 $filedev2 special mirror $filedev3 $removedev"
)
@@ -91,11 +96,16 @@ do
log_must zpool create -f $TESTPOOL $conf
block_device_wait ${DEV_DSKDIR}/${removedev}
+ mntpnt=$(get_prop mountpoint /$TESTPOOL) ||
+     log_fail "get_prop mountpoint /$TESTPOOL"
# 2. Simulate physical removal of one device
remove_disk $removedev
+ log_must mkfile 1m $mntpnt/file
+ log_must zpool sync $TESTPOOL
- # 3. Verify the device is offlined
- log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE"
+ # 3. Verify the device is unavailable.
+ log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"
# 4. Reattach the device
insert_disk $removedev
@@ -118,21 +128,22 @@ do
block_device_wait ${DEV_DSKDIR}/${removedev}
log_must zpool add $TESTPOOL spare $sparedev
- # 3. Simulate physical removal of one device
+ mntpnt=$(get_prop mountpoint /$TESTPOOL) ||
+     log_fail "get_prop mountpoint /$TESTPOOL"
+ # 2. Simulate physical removal of one device
remove_disk $removedev
+ log_must mkfile 1m $mntpnt/file
+ log_must zpool sync $TESTPOOL
- # 4. Verify the device is handled by the spare unless is a l2arc disk
- # which can only be offlined
- if [[ $(echo "$conf" | grep -c 'cache') -eq 0 ]]; then
+ # 3. Verify the device is handled by the spare.
log_must wait_hotspare_state $TESTPOOL $sparedev "INUSE"
- else
-     log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE"
- fi
+ log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"
- # 5. Reattach the device
+ # 4. Reattach the device
insert_disk $removedev
- # 6. Verify the device is onlined
+ # 5. Verify the device is onlined
log_must wait_vdev_state $TESTPOOL $removedev "ONLINE"
# cleanup
@@ -150,15 +161,20 @@ do
block_device_wait ${DEV_DSKDIR}/${removedev}
log_must zpool add $TESTPOOL spare $sparedev
+ mntpnt=$(get_prop mountpoint /$TESTPOOL) ||
+     log_fail "get_prop mountpoint /$TESTPOOL"
# 2. Fault the spare device making it unavailable
log_must zpool offline -f $TESTPOOL $sparedev
log_must wait_hotspare_state $TESTPOOL $sparedev "FAULTED"
# 3. Simulate physical removal of one device
remove_disk $removedev
+ log_must mkfile 1m $mntpnt/file
+ log_must zpool sync $TESTPOOL
- # 4. Verify the device is offlined
- log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE"
+ # 4. Verify the device is unavailable
+ log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"
# 5. Reattach the device
insert_disk $removedev