Fix ZED auto-replace for VDEVs using by-id paths

The change is simple -- restore the original code so that the VDEV 
path is updated when using by-id paths.  The more challenging part 
was to devise a second ZTS test, that would test auto-replace for 
'by-id' and help prevent a future regression.

With that new test, we can now do an A|B test with , and without, 
the fix to confirm that auto-replace for by-id paths works. The 
existing auto-replace test, functional/fault/auto_replace_001_pos, 
will confirm that we didn't break auto-replace for 'by-vdev' paths.

In the original functional/fault/auto_replace_001_pos test, the disk 
wipe (using dd) was not effective in removing the partitioning since 
the kernel was never informed of the wipe.

Added a call to wipefs(8) so that the kernel is informed and ZED will 
re-partition the device.
    
Added a validation step that the re-partitioning occurred by
confirming  that the GPT partition UUID changes.

Sponsored-By: OpenDrives Inc.
Sponsored-By: Klara Inc.
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Closes 
This commit is contained in:
Don Brady 2023-10-20 10:29:02 -06:00 committed by Tony Hutter
parent 78fd79eacd
commit 0bcd1151f0
9 changed files with 279 additions and 35 deletions
cmd/zed/agents
include
lib/libzutil/os/linux
tests

View File

@ -24,6 +24,7 @@
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016, 2017, Intel Corporation.
* Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
* Copyright (c) 2023, Klara Inc.
*/
/*
@ -204,7 +205,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
uint64_t is_spare = 0;
const char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
char rawpath[PATH_MAX], fullpath[PATH_MAX];
char devpath[PATH_MAX];
char pathbuf[PATH_MAX];
int ret;
int online_flag = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE;
boolean_t is_sd = B_FALSE;
@ -214,6 +215,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
char **lines = NULL;
int lines_cnt = 0;
/*
* Get the persistent path, typically under the '/dev/disk/by-id' or
* '/dev/disk/by-vdev' directories. Note that this path can change
* when a vdev is replaced with a new disk.
*/
if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
return;
@ -370,15 +376,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);
if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) {
if (realpath(rawpath, pathbuf) == NULL && !is_mpath_wholedisk) {
zed_log_msg(LOG_INFO, " realpath: %s failed (%s)",
rawpath, strerror(errno));
(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
&newstate);
int err = zpool_vdev_online(zhp, fullpath,
ZFS_ONLINE_FORCEFAULT, &newstate);
zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)",
fullpath, libzfs_error_description(g_zfshdl));
zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s) "
"err %d, new state %d",
fullpath, libzfs_error_description(g_zfshdl), err,
err ? (int)newstate : 0);
return;
}
@ -428,7 +436,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
* to trigger a ZFS fault for the device (and any hot spare
* replacement).
*/
leafname = strrchr(devpath, '/') + 1;
leafname = strrchr(pathbuf, '/') + 1;
/*
* If this is a request to label a whole disk, then attempt to
@ -436,7 +444,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
*/
if (zpool_prepare_and_label_disk(g_zfshdl, zhp, leafname,
vdev, "autoreplace", &lines, &lines_cnt) != 0) {
zed_log_msg(LOG_INFO,
zed_log_msg(LOG_WARNING,
" zpool_prepare_and_label_disk: could not "
"label '%s' (%s)", leafname,
libzfs_error_description(g_zfshdl));
@ -468,7 +476,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
sizeof (device->pd_physpath));
list_insert_tail(&g_device_list, device);
zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)",
zed_log_msg(LOG_NOTICE, " zpool_label_disk: async '%s' (%llu)",
leafname, (u_longlong_t)guid);
return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */
@ -491,8 +499,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
}
if (!found) {
/* unexpected partition slice encountered */
zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
fullpath);
zed_log_msg(LOG_WARNING, "labeled disk %s was "
"unexpected here", fullpath);
(void) zpool_vdev_online(zhp, fullpath,
ZFS_ONLINE_FORCEFAULT, &newstate);
return;
@ -501,8 +509,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)",
physpath, (u_longlong_t)guid);
(void) snprintf(devpath, sizeof (devpath), "%s%s",
DEV_BYID_PATH, new_devid);
/*
* Paths that begin with '/dev/disk/by-id/' will change and so
* they must be updated before calling zpool_vdev_attach().
*/
if (strncmp(path, DEV_BYID_PATH, strlen(DEV_BYID_PATH)) == 0) {
(void) snprintf(pathbuf, sizeof (pathbuf), "%s%s",
DEV_BYID_PATH, new_devid);
zed_log_msg(LOG_INFO, " zpool_label_disk: path '%s' "
"replaced by '%s'", path, pathbuf);
path = pathbuf;
}
}
libzfs_free_str_array(lines, lines_cnt);
@ -545,9 +562,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
* Wait for udev to verify the links exist, then auto-replace
* the leaf disk at same physical location.
*/
if (zpool_label_disk_wait(path, 3000) != 0) {
zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement "
"disk %s is missing", path);
if (zpool_label_disk_wait(path, DISK_LABEL_WAIT) != 0) {
zed_log_msg(LOG_WARNING, "zfs_mod: pool '%s', after labeling "
"replacement disk, the expected disk partition link '%s' "
"is missing after waiting %u ms",
zpool_get_name(zhp), path, DISK_LABEL_WAIT);
nvlist_free(nvroot);
return;
}
@ -562,7 +581,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
B_TRUE, B_FALSE);
}
zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)",
zed_log_msg(LOG_WARNING, " zpool_vdev_replace: %s with %s (%s)",
fullpath, path, (ret == 0) ? "no errors" :
libzfs_error_description(g_zfshdl));
@ -660,7 +679,7 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
dp->dd_prop, path);
dp->dd_found = B_TRUE;
/* pass the new devid for use by replacing code */
/* pass the new devid for use by auto-replacing code */
if (dp->dd_new_devid != NULL) {
(void) nvlist_add_string(nvl, "new_devid",
dp->dd_new_devid);

View File

@ -34,7 +34,7 @@ extern "C" {
#endif
/*
* Default wait time for a device name to be created.
* Default wait time in milliseconds for a device name to be created.
*/
#define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */

View File

@ -582,9 +582,8 @@ zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
* Wait up to timeout_ms for udev to set up the device node. The device is
* considered ready when libudev determines it has been initialized, all of
* the device links have been verified to exist, and it has been allowed to
* settle. At this point the device the device can be accessed reliably.
* Depending on the complexity of the udev rules this process could take
* several seconds.
* settle. At this point the device can be accessed reliably. Depending on
* the complexity of the udev rules this process could take several seconds.
*/
int
zpool_label_disk_wait(const char *path, int timeout_ms)

View File

@ -122,10 +122,10 @@ tags = ['functional', 'fallocate']
[tests/functional/fault:Linux]
tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos',
'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared',
'decrypt_fault', 'decompress_fault', 'scrub_after_resilver',
'zpool_status_-s']
'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift',
'auto_spare_shared', 'decrypt_fault', 'decompress_fault',
'scrub_after_resilver', 'zpool_status_-s']
tags = ['functional', 'fault']
[tests/functional/features/large_dnode:Linux]

View File

@ -328,6 +328,7 @@ if os.environ.get('CI') == 'true':
'fault/auto_online_001_pos': ['SKIP', ci_reason],
'fault/auto_online_002_pos': ['SKIP', ci_reason],
'fault/auto_replace_001_pos': ['SKIP', ci_reason],
'fault/auto_replace_002_pos': ['SKIP', ci_reason],
'fault/auto_spare_ashift': ['SKIP', ci_reason],
'fault/auto_spare_shared': ['SKIP', ci_reason],
'procfs/pool_state': ['SKIP', ci_reason],

View File

@ -130,12 +130,14 @@ export SYSTEM_FILES_LINUX='attr
chattr
exportfs
fallocate
flock
free
getfattr
groupadd
groupdel
groupmod
hostid
logger
losetup
lsattr
lsblk
@ -145,21 +147,20 @@ export SYSTEM_FILES_LINUX='attr
md5sum
mkswap
modprobe
mountpoint
mpstat
nsenter
parted
perf
setfattr
setpriv
sha256sum
udevadm
unshare
useradd
userdel
usermod
setpriv
mountpoint
flock
logger'
wipefs'
export ZFS_FILES='zdb
zfs

View File

@ -1431,6 +1431,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/fault/auto_online_001_pos.ksh \
functional/fault/auto_online_002_pos.ksh \
functional/fault/auto_replace_001_pos.ksh \
functional/fault/auto_replace_002_pos.ksh \
functional/fault/auto_spare_001_pos.ksh \
functional/fault/auto_spare_002_pos.ksh \
functional/fault/auto_spare_ashift.ksh \

View File

@ -34,13 +34,14 @@
# 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path.
# This creates keys ID_VDEV and ID_VDEV_PATH and set phys_path="scsidebug".
# 2. Create a pool and set autoreplace=on (auto-replace is opt-in)
# 3. Export a pool
# 3. Export the pool
# 4. Wipe and offline the scsi_debug disk
# 5. Import pool with missing disk
# 5. Import the pool with missing disk
# 6. Re-online the wiped scsi_debug disk
# 7. Verify the ZED detects the new unused disk and adds it back to the pool
# 7. Verify ZED detects the new blank disk and replaces the missing vdev
# 8. Verify that the scsi_debug disk was re-partitioned
#
# Creates a raidz1 zpool using persistent disk path names
# Creates a raidz1 zpool using persistent /dev/disk/by-vdev path names
# (ie not /dev/sdc)
#
# Auto-replace is opt in, and matches by phys_path.
@ -83,11 +84,27 @@ log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE $DISK1 $DISK2 $DISK3
log_must zpool set autoreplace=on $TESTPOOL
# Add some data to the pool
log_must mkfile $FSIZE /$TESTPOOL/data
log_must zfs create $TESTPOOL/fs
log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z
log_must zpool export $TESTPOOL
# Record the partition UUID for later comparison
part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
--value /dev/disk/by-id/$SD_DEVICE_ID)
[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid}
#
# Wipe and offline the disk
#
# Note that it is not enough to zero the disk to expunge the partitions.
# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe').
#
# Using partprobe is overkill and hdparm is not as common as wipefs. So
# we use wipefs which lets the kernel know the partition was removed
# from the device (i.e., calls BLKRRPART ioctl).
#
log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE
log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID
remove_disk $SD
block_device_wait
@ -106,4 +123,18 @@ log_must wait_replacing $TESTPOOL 60
# Validate auto-replace was successful
log_must check_state $TESTPOOL "" "ONLINE"
#
# Confirm the partition UUID changed so we know the new disk was relabeled
#
# Note: some older versions of udevadm don't support "--property" option so
# we'll # skip this test when it is not supported
#
if [ ! -z "$part_uuid" ]; then
new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
--value /dev/disk/by-id/$SD_DEVICE_ID)
log_note new disk GPT uuid ${new_uuid}
[[ "$part_uuid" = "$new_uuid" ]] && \
log_fail "The new disk was not relabeled as expected"
fi
log_pass "Auto-replace test successful"

View File

@ -0,0 +1,192 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2017 by Intel Corporation. All rights reserved.
# Copyright (c) 2023 by Klara, Inc. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/fault/fault.cfg
#
# DESCRIPTION:
# Testing Fault Management Agent ZED Logic - Automated Auto-Replace Test.
# Verifys that auto-replace works with by-id paths.
#
# STRATEGY:
# 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path.
# This creates keys ID_VDEV and ID_VDEV_PATH and set phys_path="scsidebug".
# 2. Create a pool and set autoreplace=on (auto-replace is opt-in)
# 3. Export the pool
# 4. Wipe and offline the scsi_debug disk
# 5. Import the pool with missing disk
# 6. Re-online the wiped scsi_debug disk with a new serial number
# 7. Verify ZED detects the new blank disk and replaces the missing vdev
# 8. Verify that the scsi_debug disk was re-partitioned
#
# Creates a raidz1 zpool using persistent /dev/disk/by-id path names
#
# Auto-replace is opt in, and matches by phys_path.
#
verify_runnable "both"
if ! is_physical_device $DISKS; then
log_unsupported "Unsupported disks for this test."
fi
function cleanup
{
zpool status $TESTPOOL
destroy_pool $TESTPOOL
sed -i '/alias scsidebug/d' $VDEVID_CONF
unload_scsi_debug
}
#
# Wait until a vdev transitions to its replacement vdev
#
# Return 0 when vdev reaches expected state, 1 on timeout.
#
# Note: index +2 is to skip over root and raidz-0 vdevs
#
function wait_vdev_online # pool index oldguid timeout
{
typeset pool=$1
typeset -i index=$2+2
typeset guid=$3
typeset timeout=${4:-60}
typeset -i i=0
while [[ $i -lt $timeout ]]; do
vdev_guids=( $(zpool get -H -o value guid $pool all-vdevs) )
if [ "${vdev_guids[$index]}" != "${guid}" ]; then
log_note "new vdev[$((index-2))]: ${vdev_guids[$index]}, replacing ${guid}"
return 0
fi
i=$((i+1))
sleep 1
done
return 1
}
log_assert "automated auto-replace with by-id paths"
log_onexit cleanup
load_scsi_debug $SDSIZE $SDHOSTS $SDTGTS $SDLUNS '512b'
SD=$(get_debug_device)
SD_DEVICE_ID=$(get_persistent_disk_name $SD)
SD_HOST=$(get_scsi_host $SD)
# Register vdev_id alias for scsi_debug device to create a persistent path
echo "alias scsidebug /dev/disk/by-id/$SD_DEVICE_ID" >>$VDEVID_CONF
block_device_wait
SD_DEVICE=$(udevadm info -q all -n $DEV_DSKDIR/$SD | \
awk -F'=' '/ID_VDEV=/ {print $2; exit}')
[ -z $SD_DEVICE ] && log_fail "vdev rule was not registered properly"
log_must zpool events -c
log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE_ID $DISK1 $DISK2 $DISK3
vdev_guid=$(zpool get guid -H -o value $TESTPOOL $SD_DEVICE_ID)
log_note original vdev guid ${vdev_guid}
# Auto-replace is opt-in so need to set property
log_must zpool set autoreplace=on $TESTPOOL
# Add some data to the pool
log_must zfs create $TESTPOOL/fs
log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z
log_must zpool export $TESTPOOL
# Record the partition UUID for later comparison
part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
--value /dev/disk/by-id/$SD_DEVICE_ID)
[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid}
#
# Wipe and offline the disk
#
# Note that it is not enough to zero the disk to expunge the partitions.
# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe').
#
# Using partprobe is overkill and hdparm is not as common as wipefs. So
# we use wipefs which lets the kernel know the partition was removed
# from the device (i.e., calls BLKRRPART ioctl).
#
log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE
log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID
remove_disk $SD
block_device_wait
# Re-import pool with drive missing
log_must zpool import $TESTPOOL
log_must check_state $TESTPOOL "" "DEGRADED"
block_device_wait
#
# Online an empty disk in the same physical location, with a different by-id
# symlink. We use vpd_use_hostno to make sure the underlying serial number
# changes for the new disk which in turn gives us a different by-id path.
#
# The original names were something like:
# /dev/disk/by-id/scsi-SLinux_scsi_debug_16000-part1
# /dev/disk/by-id/wwn-0x33333330000007d0-part1
#
# This new inserted disk, will have different links like:
# /dev/disk/by-id/scsi-SLinux_scsi_debug_2000-part1
# /dev/disk/by-id/wwn-0x0x3333333000003e80 -part1
#
echo '0' > /sys/bus/pseudo/drivers/scsi_debug/vpd_use_hostno
insert_disk $SD $SD_HOST
# make sure the physical path points to the same scsi-debug device
SD_DEVICE_ID=$(get_persistent_disk_name $SD)
echo "alias scsidebug /dev/disk/by-id/$SD_DEVICE_ID" >>$VDEVID_CONF
block_device_wait
# Wait for the new disk to be online and replaced
log_must wait_vdev_online $TESTPOOL 0 $vdev_guid 45
log_must wait_replacing $TESTPOOL 45
# Validate auto-replace was successful
log_must check_state $TESTPOOL "" "ONLINE"
#
# Confirm the partition UUID changed so we know the new disk was relabeled
#
# Note: some older versions of udevadm don't support "--property" option so
# we'll # skip this test when it is not supported
#
if [ ! -z "$part_uuid" ]; then
new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
--value /dev/disk/by-id/$SD_DEVICE_ID)
log_note new disk GPT uuid ${new_uuid}
[[ "$part_uuid" = "$new_uuid" ]] && \
log_fail "The new disk was not relabeled as expected"
fi
log_pass "automated auto-replace with by-id paths"