From fc1c0053f9c6fd8e894c3378b489fa817f5d0330 Mon Sep 17 00:00:00 2001 From: samwyc <115969550+samwyc@users.noreply.github.com> Date: Thu, 20 Oct 2022 04:18:13 +0530 Subject: [PATCH] Fix sequential resilver drive failure race condition This patch handles a race condition on the simultaneous failure of 2 drives, in which vdev_rebuild_thread misses the vdev_rebuild_reset_wanted signal. The flag is now checked again inside the vdev_rebuild_complete_sync function so that the pending reset is still caught. Reviewed-by: Brian Behlendorf Reviewed-by: Richard Yao Reviewed-by: Dipak Ghosh Reviewed-by: Akash B Signed-off-by: Samuel Wycliffe J Closes #14041 Closes #14050 --- module/zfs/vdev_rebuild.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 4d7de0c6c4..9dfbe0cf6f 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -22,6 +22,7 @@ * * Copyright (c) 2018, Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. */ #include @@ -134,6 +135,7 @@ int zfs_rebuild_scrub_enabled = 1; * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync(). */ static void vdev_rebuild_thread(void *arg); +static void vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx); /* * Clear the per-vdev rebuild bytes value for a vdev tree. @@ -307,6 +309,17 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); + + /* + * Handle a second device failure if it occurs after all rebuild I/O + * has completed but before this sync task has been executed. 
+ */ + if (vd->vdev_rebuild_reset_wanted) { + mutex_exit(&vd->vdev_rebuild_lock); + vdev_rebuild_reset_sync(arg, tx); + return; + } + vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE; vrp->vrp_end_time = gethrestime_sec(); @@ -760,7 +773,6 @@ vdev_rebuild_thread(void *arg) ASSERT(vd->vdev_rebuilding); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE); - ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;