Probe vdevs before marking removed
Before allowing the ZED to mark a vdev as REMOVED due to a hotplug event, confirm that it is non-responsive by probing it. Any device which can be successfully probed should be left ONLINE to prevent a healthy pool from being incorrectly SUSPENDED. Spurious removal events may occur in at least the following two scenarios: 1) Drive expansion (zpool online -e) in VMware environments. If, during the partition resize operation, a partition is removed and re-created, then udev will send a remove event. 2) Re-scanning the namespaces of an NVMe device (nvme ns-rescan), which may result in udev remove and add events being delivered. Finally, update the ZED to only kick in a spare when the removal was successful. Reviewed-by: Ameer Hamza <ahamza@ixsystems.com> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #14859 Closes #14861
This commit is contained in:
parent
c2f0aaeb3c
commit
e2176f12a9
|
@ -444,14 +444,16 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
|
||||||
return;
|
return;
|
||||||
|
|
||||||
/* Remove the vdev since device is unplugged */
|
/* Remove the vdev since device is unplugged */
|
||||||
|
int remove_status = 0;
|
||||||
if (l2arc || (strcmp(class, "resource.fs.zfs.removed") == 0)) {
|
if (l2arc || (strcmp(class, "resource.fs.zfs.removed") == 0)) {
|
||||||
int status = zpool_vdev_remove_wanted(zhp, devname);
|
remove_status = zpool_vdev_remove_wanted(zhp, devname);
|
||||||
fmd_hdl_debug(hdl, "zpool_vdev_remove_wanted '%s'"
|
fmd_hdl_debug(hdl, "zpool_vdev_remove_wanted '%s'"
|
||||||
", ret:%d", devname, status);
|
", err:%d", devname, libzfs_errno(zhdl));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Replace the vdev with a spare if its not a l2arc */
|
/* Replace the vdev with a spare if its not a l2arc */
|
||||||
if (!l2arc && (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
|
if (!l2arc && !remove_status &&
|
||||||
|
(!fmd_prop_get_int32(hdl, "spare_on_remove") ||
|
||||||
replace_with_spare(hdl, zhp, vdev) == B_FALSE)) {
|
replace_with_spare(hdl, zhp, vdev) == B_FALSE)) {
|
||||||
/* Could not handle with spare */
|
/* Could not handle with spare */
|
||||||
fmd_hdl_debug(hdl, "no spare for '%s'", devname);
|
fmd_hdl_debug(hdl, "no spare for '%s'", devname);
|
||||||
|
|
|
@ -3994,11 +3994,18 @@ vdev_remove_wanted(spa_t *spa, uint64_t guid)
|
||||||
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
|
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the vdev is already removed, then don't do anything.
|
* If the vdev is already removed, or expanding which can trigger
|
||||||
|
* repartition add/remove events, then don't do anything.
|
||||||
*/
|
*/
|
||||||
if (vd->vdev_removed)
|
if (vd->vdev_removed || vd->vdev_expanding)
|
||||||
return (spa_vdev_state_exit(spa, NULL, 0));
|
return (spa_vdev_state_exit(spa, NULL, 0));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Confirm the vdev has been removed, otherwise don't do anything.
|
||||||
|
*/
|
||||||
|
if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL)))
|
||||||
|
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST)));
|
||||||
|
|
||||||
vd->vdev_remove_wanted = B_TRUE;
|
vd->vdev_remove_wanted = B_TRUE;
|
||||||
spa_async_request(spa, SPA_ASYNC_REMOVE);
|
spa_async_request(spa, SPA_ASYNC_REMOVE);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue