Fix dRAID sequential resilver silent damage handling

This change addresses two distinct scenarios which are possible
when performing a sequential resilver to a dRAID pool with vdevs
that contain silent unknown damage. Which in this circumstance
took the form of the devices being intentionally overwritten with
zeros. However, it could also result from a device returning incorrect
data while a sequential resilver was in progress.

Scenario 1) A sequential resilver is performed while all of the
dRAID vdevs are ONLINE and there is silent damage present on the
vdev being resilvered. In this case, nothing will be repaired
by vdev_raidz_io_done_reconstruct_known_missing() because
rc->rc_error isn't set on any of the raid columns. To address
this vdev_draid_io_start_read() has been updated to always mark
the resilvering column as ESTALE for sequential resilver IO.

Scenario 2) Multiple columns contain silent damage for the same
block and a sequential resilver is performed. In this case it's
impossible to generate the correct data from parity unless all of
the damaged columns are being sequentially resilvered (and thus
only good data is used to generate parity). This is as expected
and there's nothing which can be done about it. However, we need
to be careful not to make to situation worse. Since we can't
verify the data is actually good without a checksum, we must
only repair the devices which are being sequentially resilvered.
Otherwise, an incorrect repair to a device which previously
contained good data could effectively lock in the damage and
make reconstruction impossible. A check for this was added to
vdev_raidz_io_done_verified() along with a new test case.

Lastly, this change updates the redundancy_draid_spare1 and
redundancy_draid_spare3 test cases to be more representative
of normal dRAID replacement operation.  Specifically, what we
care about is that the scrub run after a sequential resilver
does not find additional blocks which need repair.  This would
indicate the sequential resilver failed to rebuild a section of
one of the devices. Note also the tests were switched to using
the verify_pool() function which still checks for checksum errors.

Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #12061
This commit is contained in:
Brian Behlendorf 2021-05-20 15:05:26 -07:00 committed by GitHub
parent 6ac2d7f76f
commit 8fb577ae6d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 236 additions and 56 deletions

View File

@ -113,7 +113,8 @@ typedef struct raidz_col {
uint8_t rc_tried; /* Did we attempt this I/O column? */ uint8_t rc_tried; /* Did we attempt this I/O column? */
uint8_t rc_skipped; /* Did we skip this I/O column? */ uint8_t rc_skipped; /* Did we skip this I/O column? */
uint8_t rc_need_orig_restore; /* need to restore from orig_data? */ uint8_t rc_need_orig_restore; /* need to restore from orig_data? */
uint8_t rc_repair; /* Write good data to this column */ uint8_t rc_force_repair; /* Write good data to this column */
uint8_t rc_allow_repair; /* Allow repair I/O to this column */
} raidz_col_t; } raidz_col_t;
typedef struct raidz_row { typedef struct raidz_row {

View File

@ -1009,7 +1009,8 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
rc->rc_error = 0; rc->rc_error = 0;
rc->rc_tried = 0; rc->rc_tried = 0;
rc->rc_skipped = 0; rc->rc_skipped = 0;
rc->rc_repair = 0; rc->rc_force_repair = 0;
rc->rc_allow_repair = 1;
rc->rc_need_orig_restore = B_FALSE; rc->rc_need_orig_restore = B_FALSE;
if (q == 0 && i >= bc) if (q == 0 && i >= bc)
@ -1890,6 +1891,36 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
if (zio->io_flags & ZIO_FLAG_RESILVER) { if (zio->io_flags & ZIO_FLAG_RESILVER) {
vdev_t *svd; vdev_t *svd;
/*
* Sequential rebuilds need to always consider the data
* on the child being rebuilt to be stale. This is
* important when all columns are available to aid
* known reconstruction in identifing which columns
* contain incorrect data.
*
* Furthermore, all repairs need to be constrained to
* the devices being rebuilt because without a checksum
* we cannot verify the data is actually correct and
* performing an incorrect repair could result in
* locking in damage and making the data unrecoverable.
*/
if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
if (vdev_draid_rebuilding(cvd)) {
if (c >= rr->rr_firstdatacol)
rr->rr_missingdata++;
else
rr->rr_missingparity++;
rc->rc_error = SET_ERROR(ESTALE);
rc->rc_skipped = 1;
rc->rc_allow_repair = 1;
continue;
} else {
rc->rc_allow_repair = 0;
}
} else {
rc->rc_allow_repair = 1;
}
/* /*
* If this child is a distributed spare then the * If this child is a distributed spare then the
* offset might reside on the vdev being replaced. * offset might reside on the vdev being replaced.
@ -1903,7 +1934,10 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
rc->rc_offset); rc->rc_offset);
if (svd && (svd->vdev_ops == &vdev_spare_ops || if (svd && (svd->vdev_ops == &vdev_spare_ops ||
svd->vdev_ops == &vdev_replacing_ops)) { svd->vdev_ops == &vdev_replacing_ops)) {
rc->rc_repair = 1; rc->rc_force_repair = 1;
if (vdev_draid_rebuilding(svd))
rc->rc_allow_repair = 1;
} }
} }
@ -1914,7 +1948,8 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
if ((cvd->vdev_ops == &vdev_spare_ops || if ((cvd->vdev_ops == &vdev_spare_ops ||
cvd->vdev_ops == &vdev_replacing_ops) && cvd->vdev_ops == &vdev_replacing_ops) &&
vdev_draid_rebuilding(cvd)) { vdev_draid_rebuilding(cvd)) {
rc->rc_repair = 1; rc->rc_force_repair = 1;
rc->rc_allow_repair = 1;
} }
} }
} }

View File

@ -269,7 +269,8 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
rc->rc_error = 0; rc->rc_error = 0;
rc->rc_tried = 0; rc->rc_tried = 0;
rc->rc_skipped = 0; rc->rc_skipped = 0;
rc->rc_repair = 0; rc->rc_force_repair = 0;
rc->rc_allow_repair = 1;
rc->rc_need_orig_restore = B_FALSE; rc->rc_need_orig_restore = B_FALSE;
if (c >= acols) if (c >= acols)
@ -1811,8 +1812,10 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
vdev_t *vd = zio->io_vd; vdev_t *vd = zio->io_vd;
vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
if ((rc->rc_error == 0 || rc->rc_size == 0) && if (!rc->rc_allow_repair) {
(rc->rc_repair == 0)) { continue;
} else if (!rc->rc_force_repair &&
(rc->rc_error == 0 || rc->rc_size == 0)) {
continue; continue;
} }

View File

@ -743,10 +743,10 @@ tags = ['functional', 'raidz']
[tests/functional/redundancy] [tests/functional/redundancy]
tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2', tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2',
'redundancy_draid3', 'redundancy_draid_spare1', 'redundancy_draid_spare2', 'redundancy_draid3', 'redundancy_draid_damaged', 'redundancy_draid_spare1',
'redundancy_draid_spare3', 'redundancy_mirror', 'redundancy_raidz', 'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror',
'redundancy_raidz1', 'redundancy_raidz2', 'redundancy_raidz3', 'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2',
'redundancy_stripe'] 'redundancy_raidz3', 'redundancy_stripe']
tags = ['functional', 'redundancy'] tags = ['functional', 'redundancy']
timeout = 1200 timeout = 1200

View File

@ -6,6 +6,7 @@ dist_pkgdata_SCRIPTS = \
redundancy_draid1.ksh \ redundancy_draid1.ksh \
redundancy_draid2.ksh \ redundancy_draid2.ksh \
redundancy_draid3.ksh \ redundancy_draid3.ksh \
redundancy_draid_damaged.ksh \
redundancy_draid_spare1.ksh \ redundancy_draid_spare1.ksh \
redundancy_draid_spare2.ksh \ redundancy_draid_spare2.ksh \
redundancy_draid_spare3.ksh \ redundancy_draid_spare3.ksh \

View File

@ -0,0 +1,153 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib
#
# DESCRIPTION:
# When sequentially resilvering a dRAID pool with multiple vdevs
# that contain silent damage a sequential resilver should never
# introduce additional unrecoverable damage.
#
# STRATEGY:
# 1. Create block device files for the test draid pool
# 2. For each parity value [1..3]
# - create draid pool
# - fill it with some directories/files
# - overwrite the maximum number of repairable devices
# - sequentially resilver each overwritten device one at a time;
# the device will not be correctly repaired because the silent
# damage on the other vdevs will cause the parity calculations
# to generate incorrect data for the resilvering vdev.
# - verify that only the resilvering devices had invalid data
# written and that a scrub is still able to repair the pool
# - destroy the draid pool
#
typeset -r devs=7
typeset -r dev_size_mb=512
typeset -a disks
prefetch_disable=$(get_tunable PREFETCH_DISABLE)
rebuild_scrub_enabled=$(get_tunable REBUILD_SCRUB_ENABLED)
function cleanup
{
poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL"
for i in {0..$devs}; do
rm -f "$TEST_BASE_DIR/dev-$i"
done
set_tunable32 PREFETCH_DISABLE $prefetch_disable
set_tunable32 REBUILD_SCRUB_ENABLED $rebuild_scrub_enabled
}
function test_sequential_resilver # <pool> <parity> <dir>
{
typeset pool=$1
typeset nparity=$2
typeset dir=$3
log_must zpool export $pool
for (( i=0; i<$nparity; i=i+1 )); do
log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \
bs=1M seek=4 count=$(($dev_size_mb-4))
done
log_must zpool import -o cachefile=none -d $dir $pool
for (( i=0; i<$nparity; i=i+1 )); do
spare=draid${nparity}-0-$i
log_must zpool replace -fsw $pool $dir/dev-$i $spare
done
log_must zpool scrub -w $pool
# When only a single child was overwritten the sequential resilver
# can fully repair the damange from parity and the scrub will have
# nothing to repair. When multiple children are silently damaged
# the sequential resilver will calculate the wrong data since only
# the parity information is used and it cannot be verified with
# the checksum. However, since only the resilvering devices are
# written to with the bad data a subsequent scrub will be able to
# fully repair the pool.
#
if [[ $nparity == 1 ]]; then
log_must check_pool_status $pool "scan" "repaired 0B"
else
log_mustnot check_pool_status $pool "scan" "repaired 0B"
fi
log_must check_pool_status $pool "errors" "No known data errors"
log_must check_pool_status $pool "scan" "with 0 errors"
}
log_onexit cleanup
log_must set_tunable32 PREFETCH_DISABLE 1
log_must set_tunable32 REBUILD_SCRUB_ENABLED 0
# Disk files which will be used by pool
for i in {0..$(($devs - 1))}; do
device=$TEST_BASE_DIR/dev-$i
log_must truncate -s ${dev_size_mb}M $device
disks[${#disks[*]}+1]=$device
done
# Disk file which will be attached
log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs
for nparity in 1 2 3; do
raid=draid${nparity}:${nparity}s
dir=$TEST_BASE_DIR
log_must zpool create -f -o cachefile=none $TESTPOOL $raid ${disks[@]}
log_must zfs set primarycache=metadata $TESTPOOL
log_must zfs create $TESTPOOL/fs
log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R
log_must zfs create -o compress=on $TESTPOOL/fs2
log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R
log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3
log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R
log_must zpool export $TESTPOOL
log_must zpool import -o cachefile=none -d $dir $TESTPOOL
log_must check_pool_status $TESTPOOL "errors" "No known data errors"
test_sequential_resilver $TESTPOOL $nparity $dir
log_must zpool destroy "$TESTPOOL"
done
log_pass "draid damaged device(s) test succeeded."

View File

@ -40,7 +40,15 @@
log_assert "Verify resilver to dRAID distributed spares" log_assert "Verify resilver to dRAID distributed spares"
log_onexit cleanup function cleanup_tunable
{
log_must set_tunable32 REBUILD_SCRUB_ENABLED 1
cleanup
}
log_onexit cleanup_tunable
log_must set_tunable32 REBUILD_SCRUB_ENABLED 0
for replace_mode in "healing" "sequential"; do for replace_mode in "healing" "sequential"; do
@ -74,32 +82,15 @@ for replace_mode in "healing" "sequential"; do
log_must check_vdev_state $spare_vdev "ONLINE" log_must check_vdev_state $spare_vdev "ONLINE"
log_must check_hotspare_state $TESTPOOL $spare_vdev "INUSE" log_must check_hotspare_state $TESTPOOL $spare_vdev "INUSE"
log_must zpool detach $TESTPOOL $fault_vdev log_must zpool detach $TESTPOOL $fault_vdev
log_must verify_pool $TESTPOOL
resilver_cksum=$(cksum_pool $TESTPOOL)
if [[ $resilver_cksum != 0 ]]; then
log_must zpool status -v $TESTPOOL
log_fail "$replace_mode resilver "
"cksum errors: $resilver_cksum"
fi
if [[ "$replace_mode" = "healing" ]]; then
log_must zpool scrub $TESTPOOL
fi
log_must wait_scrubbed $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B" log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors" log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
scrub_cksum=$(cksum_pool $TESTPOOL)
if [[ $scrub_cksum != 0 ]]; then
log_must zpool status -v $TESTPOOL
log_fail "scrub cksum errors: $scrub_cksum"
fi
(( i += 1 )) (( i += 1 ))
done done
log_must is_data_valid $TESTPOOL log_must is_data_valid $TESTPOOL
log_must check_pool_status $TESTPOOL "errors" "No known data errors"
cleanup cleanup
done done

View File

@ -114,6 +114,9 @@ for replace_mode in "healing" "sequential"; do
log_must zpool detach $TESTPOOL $BASEDIR/vdev7 log_must zpool detach $TESTPOOL $BASEDIR/vdev7
log_must check_vdev_state $TESTPOOL draid1-0-0 "ONLINE" log_must check_vdev_state $TESTPOOL draid1-0-0 "ONLINE"
log_must check_hotspare_state $TESTPOOL draid1-0-0 "INUSE" log_must check_hotspare_state $TESTPOOL draid1-0-0 "INUSE"
log_must verify_pool $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
# Distributed spare in mirror with original device faulted # Distributed spare in mirror with original device faulted
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev8 log_must zpool offline -f $TESTPOOL $BASEDIR/vdev8
@ -122,6 +125,9 @@ for replace_mode in "healing" "sequential"; do
log_must check_vdev_state $TESTPOOL spare-8 "DEGRADED" log_must check_vdev_state $TESTPOOL spare-8 "DEGRADED"
log_must check_vdev_state $TESTPOOL draid1-0-1 "ONLINE" log_must check_vdev_state $TESTPOOL draid1-0-1 "ONLINE"
log_must check_hotspare_state $TESTPOOL draid1-0-1 "INUSE" log_must check_hotspare_state $TESTPOOL draid1-0-1 "INUSE"
log_must verify_pool $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
# Distributed spare in mirror with original device still online # Distributed spare in mirror with original device still online
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev9 "ONLINE" log_must check_vdev_state $TESTPOOL $BASEDIR/vdev9 "ONLINE"
@ -129,6 +135,9 @@ for replace_mode in "healing" "sequential"; do
log_must check_vdev_state $TESTPOOL spare-9 "ONLINE" log_must check_vdev_state $TESTPOOL spare-9 "ONLINE"
log_must check_vdev_state $TESTPOOL draid1-0-2 "ONLINE" log_must check_vdev_state $TESTPOOL draid1-0-2 "ONLINE"
log_must check_hotspare_state $TESTPOOL draid1-0-2 "INUSE" log_must check_hotspare_state $TESTPOOL draid1-0-2 "INUSE"
log_must verify_pool $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
# Normal faulted device replacement # Normal faulted device replacement
new_vdev0="$BASEDIR/new_vdev0" new_vdev0="$BASEDIR/new_vdev0"
@ -137,6 +146,9 @@ for replace_mode in "healing" "sequential"; do
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev0 "FAULTED" log_must check_vdev_state $TESTPOOL $BASEDIR/vdev0 "FAULTED"
log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev0 $new_vdev0 log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev0 $new_vdev0
log_must check_vdev_state $TESTPOOL $new_vdev0 "ONLINE" log_must check_vdev_state $TESTPOOL $new_vdev0 "ONLINE"
log_must verify_pool $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
# Distributed spare faulted device replacement # Distributed spare faulted device replacement
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev2 log_must zpool offline -f $TESTPOOL $BASEDIR/vdev2
@ -145,6 +157,9 @@ for replace_mode in "healing" "sequential"; do
log_must check_vdev_state $TESTPOOL spare-2 "DEGRADED" log_must check_vdev_state $TESTPOOL spare-2 "DEGRADED"
log_must check_vdev_state $TESTPOOL draid1-0-3 "ONLINE" log_must check_vdev_state $TESTPOOL draid1-0-3 "ONLINE"
log_must check_hotspare_state $TESTPOOL draid1-0-3 "INUSE" log_must check_hotspare_state $TESTPOOL draid1-0-3 "INUSE"
log_must verify_pool $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
# Normal online device replacement # Normal online device replacement
new_vdev1="$BASEDIR/new_vdev1" new_vdev1="$BASEDIR/new_vdev1"
@ -152,6 +167,9 @@ for replace_mode in "healing" "sequential"; do
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev1 "ONLINE" log_must check_vdev_state $TESTPOOL $BASEDIR/vdev1 "ONLINE"
log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev1 $new_vdev1 log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev1 $new_vdev1
log_must check_vdev_state $TESTPOOL $new_vdev1 "ONLINE" log_must check_vdev_state $TESTPOOL $new_vdev1 "ONLINE"
log_must verify_pool $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
# Distributed spare online device replacement (then fault) # Distributed spare online device replacement (then fault)
log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev3 draid1-0-4 log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev3 draid1-0-4
@ -161,35 +179,13 @@ for replace_mode in "healing" "sequential"; do
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev3 log_must zpool offline -f $TESTPOOL $BASEDIR/vdev3
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev3 "FAULTED" log_must check_vdev_state $TESTPOOL $BASEDIR/vdev3 "FAULTED"
log_must check_vdev_state $TESTPOOL spare-3 "DEGRADED" log_must check_vdev_state $TESTPOOL spare-3 "DEGRADED"
log_must verify_pool $TESTPOOL
resilver_cksum=$(cksum_pool $TESTPOOL)
if [[ $resilver_cksum != 0 ]]; then
log_must zpool status -v $TESTPOOL
log_fail "$replace_mode resilver cksum errors: $resilver_cksum"
fi
if [[ "$replace_mode" = "healing" ]]; then
log_must zpool scrub -w $TESTPOOL
else
if [[ $(get_tunable REBUILD_SCRUB_ENABLED) -eq 0 ]]; then
log_must zpool scrub -w $TESTPOOL
else
log_must wait_scrubbed $TESTPOOL
fi
fi
log_must is_pool_scrubbed $TESTPOOL
scrub_cksum=$(cksum_pool $TESTPOOL)
if [[ $scrub_cksum != 0 ]]; then
log_must zpool status -v $TESTPOOL
log_fail "scrub cksum errors: $scrub_cksum"
fi
log_must check_pool_status $TESTPOOL "scan" "repaired 0B" log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors" log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
# Verify the original data is valid
log_must is_data_valid $TESTPOOL log_must is_data_valid $TESTPOOL
log_must check_pool_status $TESTPOOL "errors" "No known data errors"
cleanup cleanup
done done