diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h
index b94d59eb77..908723da0c 100644
--- a/include/sys/vdev_raidz_impl.h
+++ b/include/sys/vdev_raidz_impl.h
@@ -113,7 +113,8 @@ typedef struct raidz_col {
 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
 	uint8_t rc_need_orig_restore;	/* need to restore from orig_data? */
-	uint8_t rc_repair;		/* Write good data to this column */
+	uint8_t rc_force_repair;	/* Write good data to this column */
+	uint8_t rc_allow_repair;	/* Allow repair I/O to this column */
 } raidz_col_t;
 
 typedef struct raidz_row {
diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c
index c65ce1cd60..20b1457f0c 100644
--- a/module/zfs/vdev_draid.c
+++ b/module/zfs/vdev_draid.c
@@ -1009,7 +1009,8 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
 		rc->rc_error = 0;
 		rc->rc_tried = 0;
 		rc->rc_skipped = 0;
-		rc->rc_repair = 0;
+		rc->rc_force_repair = 0;
+		rc->rc_allow_repair = 1;
 		rc->rc_need_orig_restore = B_FALSE;
 
 		if (q == 0 && i >= bc)
@@ -1890,6 +1891,36 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
 		if (zio->io_flags & ZIO_FLAG_RESILVER) {
 			vdev_t *svd;
 
+			/*
+			 * Sequential rebuilds need to always consider the data
+			 * on the child being rebuilt to be stale. This is
+			 * important when all columns are available to aid
+			 * known reconstruction in identifying which columns
+			 * contain incorrect data.
+			 *
+			 * Furthermore, all repairs need to be constrained to
+			 * the devices being rebuilt because without a checksum
+			 * we cannot verify the data is actually correct and
+			 * performing an incorrect repair could result in
+			 * locking in damage and making the data unrecoverable.
+			 */
+			if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
+				if (vdev_draid_rebuilding(cvd)) {
+					if (c >= rr->rr_firstdatacol)
+						rr->rr_missingdata++;
+					else
+						rr->rr_missingparity++;
+					rc->rc_error = SET_ERROR(ESTALE);
+					rc->rc_skipped = 1;
+					rc->rc_allow_repair = 1;
+					continue;
+				} else {
+					rc->rc_allow_repair = 0;
+				}
+			} else {
+				rc->rc_allow_repair = 1;
+			}
+
 			/*
 			 * If this child is a distributed spare then the
 			 * offset might reside on the vdev being replaced.
@@ -1903,7 +1934,10 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
 			    rc->rc_offset);
 			if (svd && (svd->vdev_ops == &vdev_spare_ops ||
 			    svd->vdev_ops == &vdev_replacing_ops)) {
-				rc->rc_repair = 1;
+				rc->rc_force_repair = 1;
+
+				if (vdev_draid_rebuilding(svd))
+					rc->rc_allow_repair = 1;
 			}
 		}
 
@@ -1914,7 +1948,8 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
 		if ((cvd->vdev_ops == &vdev_spare_ops ||
 		    cvd->vdev_ops == &vdev_replacing_ops) &&
 		    vdev_draid_rebuilding(cvd)) {
-			rc->rc_repair = 1;
+			rc->rc_force_repair = 1;
+			rc->rc_allow_repair = 1;
 		}
 	}
 }
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 020b3bc956..1feebf7089 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -269,7 +269,8 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
 		rc->rc_error = 0;
 		rc->rc_tried = 0;
 		rc->rc_skipped = 0;
-		rc->rc_repair = 0;
+		rc->rc_force_repair = 0;
+		rc->rc_allow_repair = 1;
 		rc->rc_need_orig_restore = B_FALSE;
 
 		if (c >= acols)
@@ -1811,8 +1812,10 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
 		vdev_t *vd = zio->io_vd;
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
-		if ((rc->rc_error == 0 || rc->rc_size == 0) &&
-		    (rc->rc_repair == 0)) {
+		if (!rc->rc_allow_repair) {
+			continue;
+		} else if (!rc->rc_force_repair &&
+		    (rc->rc_error == 0 || rc->rc_size == 0)) {
 			continue;
 		}
 
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index f0432205db..4ed5d16af0 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -743,10 +743,10 @@ tags = ['functional', 'raidz']
 
 [tests/functional/redundancy]
 tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2',
-    'redundancy_draid3', 'redundancy_draid_spare1', 'redundancy_draid_spare2',
-    'redundancy_draid_spare3', 'redundancy_mirror', 'redundancy_raidz',
-    'redundancy_raidz1', 'redundancy_raidz2', 'redundancy_raidz3',
-    'redundancy_stripe']
+    'redundancy_draid3', 'redundancy_draid_damaged', 'redundancy_draid_spare1',
+    'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror',
+    'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2',
+    'redundancy_raidz3', 'redundancy_stripe']
 tags = ['functional', 'redundancy']
 timeout = 1200
 
diff --git a/tests/zfs-tests/tests/functional/redundancy/Makefile.am b/tests/zfs-tests/tests/functional/redundancy/Makefile.am
index ac323c893d..42c11c4aa9 100644
--- a/tests/zfs-tests/tests/functional/redundancy/Makefile.am
+++ b/tests/zfs-tests/tests/functional/redundancy/Makefile.am
@@ -6,6 +6,7 @@ dist_pkgdata_SCRIPTS = \
 	redundancy_draid1.ksh \
 	redundancy_draid2.ksh \
 	redundancy_draid3.ksh \
+	redundancy_draid_damaged.ksh \
 	redundancy_draid_spare1.ksh \
 	redundancy_draid_spare2.ksh \
 	redundancy_draid_spare3.ksh \
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged.ksh
new file mode 100755
index 0000000000..6796cc78a1
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged.ksh
@@ -0,0 +1,153 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib
+
+#
+# DESCRIPTION:
+#	When sequentially resilvering a dRAID pool with multiple vdevs
+#	that contain silent damage, a sequential resilver should never
+#	introduce additional unrecoverable damage.
+#
+# STRATEGY:
+#	1. Create block device files for the test draid pool
+#	2. For each parity value [1..3]
+#	    - create draid pool
+#	    - fill it with some directories/files
+#	    - overwrite the maximum number of repairable devices
+#	    - sequentially resilver each overwritten device one at a time;
+#	      the device will not be correctly repaired because the silent
+#	      damage on the other vdevs will cause the parity calculations
+#	      to generate incorrect data for the resilvering vdev.
+#	    - verify that only the resilvering devices had invalid data
+#	      written and that a scrub is still able to repair the pool
+#	    - destroy the draid pool
+#
+
+typeset -r devs=7
+typeset -r dev_size_mb=512
+
+typeset -a disks
+
+prefetch_disable=$(get_tunable PREFETCH_DISABLE)
+rebuild_scrub_enabled=$(get_tunable REBUILD_SCRUB_ENABLED)
+
+function cleanup
+{
+	poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL"
+
+	for i in {0..$devs}; do
+		rm -f "$TEST_BASE_DIR/dev-$i"
+	done
+
+	set_tunable32 PREFETCH_DISABLE $prefetch_disable
+	set_tunable32 REBUILD_SCRUB_ENABLED $rebuild_scrub_enabled
+}
+
+function test_sequential_resilver # <pool> <nparity> <dir>
+{
+	typeset pool=$1
+	typeset nparity=$2
+	typeset dir=$3
+
+	log_must zpool export $pool
+
+	for (( i=0; i<$nparity; i=i+1 )); do
+		log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \
+		    bs=1M seek=4 count=$(($dev_size_mb-4))
+	done
+
+	log_must zpool import -o cachefile=none -d $dir $pool
+
+	for (( i=0; i<$nparity; i=i+1 )); do
+		spare=draid${nparity}-0-$i
+		log_must zpool replace -fsw $pool $dir/dev-$i $spare
+	done
+
+	log_must zpool scrub -w $pool
+
+	# When only a single child was overwritten, the sequential resilver
+	# can fully repair the damage from parity and the scrub will have
+	# nothing to repair. When multiple children are silently damaged,
+	# the sequential resilver will calculate the wrong data since only
+	# the parity information is used and it cannot be verified with
+	# the checksum. However, since only the resilvering devices are
+	# written to with the bad data, a subsequent scrub will be able to
+	# fully repair the pool.
+	#
+	if [[ $nparity == 1 ]]; then
+		log_must check_pool_status $pool "scan" "repaired 0B"
+	else
+		log_mustnot check_pool_status $pool "scan" "repaired 0B"
+	fi
+
+	log_must check_pool_status $pool "errors" "No known data errors"
+	log_must check_pool_status $pool "scan" "with 0 errors"
+}
+
+log_onexit cleanup
+
+log_must set_tunable32 PREFETCH_DISABLE 1
+log_must set_tunable32 REBUILD_SCRUB_ENABLED 0
+
+# Disk files which will be used by pool
+for i in {0..$(($devs - 1))}; do
+	device=$TEST_BASE_DIR/dev-$i
+	log_must truncate -s ${dev_size_mb}M $device
+	disks[${#disks[*]}+1]=$device
+done
+
+# Disk file which will be attached
+log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs
+
+for nparity in 1 2 3; do
+	raid=draid${nparity}:${nparity}s
+	dir=$TEST_BASE_DIR
+
+	log_must zpool create -f -o cachefile=none $TESTPOOL $raid ${disks[@]}
+	log_must zfs set primarycache=metadata $TESTPOOL
+
+	log_must zfs create $TESTPOOL/fs
+	log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R
+
+	log_must zfs create -o compress=on $TESTPOOL/fs2
+	log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R
+
+	log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3
+	log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R
+
+	log_must zpool export $TESTPOOL
+	log_must zpool import -o cachefile=none -d $dir $TESTPOOL
+
+	log_must check_pool_status $TESTPOOL "errors" "No known data errors"
+
+	test_sequential_resilver $TESTPOOL $nparity $dir
+
+	log_must zpool destroy "$TESTPOOL"
+done
+
+log_pass "draid damaged device(s) test succeeded."
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh
index 3b7951596d..8acee15679 100755
--- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh
@@ -40,7 +40,15 @@
 
 log_assert "Verify resilver to dRAID distributed spares"
 
-log_onexit cleanup
+function cleanup_tunable
+{
+	log_must set_tunable32 REBUILD_SCRUB_ENABLED 1
+	cleanup
+}
+
+log_onexit cleanup_tunable
+
+log_must set_tunable32 REBUILD_SCRUB_ENABLED 0
 
 for replace_mode in "healing" "sequential"; do
 
@@ -74,32 +82,15 @@ for replace_mode in "healing" "sequential"; do
 		log_must check_vdev_state $spare_vdev "ONLINE"
 		log_must check_hotspare_state $TESTPOOL $spare_vdev "INUSE"
 		log_must zpool detach $TESTPOOL $fault_vdev
-
-		resilver_cksum=$(cksum_pool $TESTPOOL)
-		if [[ $resilver_cksum != 0 ]]; then
-			log_must zpool status -v $TESTPOOL
-			log_fail "$replace_mode resilver "
-			    "cksum errors: $resilver_cksum"
-		fi
-
-		if [[ "$replace_mode" = "healing" ]]; then
-			log_must zpool scrub $TESTPOOL
-		fi
-
-		log_must wait_scrubbed $TESTPOOL
+		log_must verify_pool $TESTPOOL
 		log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
 		log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
 
-		scrub_cksum=$(cksum_pool $TESTPOOL)
-		if [[ $scrub_cksum != 0 ]]; then
-			log_must zpool status -v $TESTPOOL
-			log_fail "scrub cksum errors: $scrub_cksum"
-		fi
-
 		(( i += 1 ))
 	done
 
 	log_must is_data_valid $TESTPOOL
+	log_must check_pool_status $TESTPOOL "errors" "No known data errors"
 
 	cleanup
 done
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh
index 587a1be0a6..28e8e3c6d7 100755
--- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh
@@ -114,6 +114,9 @@ for replace_mode in "healing" "sequential"; do
for replace_mode in "healing" "sequential"; do log_must zpool detach $TESTPOOL $BASEDIR/vdev7 log_must check_vdev_state $TESTPOOL draid1-0-0 "ONLINE" log_must check_hotspare_state $TESTPOOL draid1-0-0 "INUSE" + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" # Distributed spare in mirror with original device faulted log_must zpool offline -f $TESTPOOL $BASEDIR/vdev8 @@ -122,6 +125,9 @@ for replace_mode in "healing" "sequential"; do log_must check_vdev_state $TESTPOOL spare-8 "DEGRADED" log_must check_vdev_state $TESTPOOL draid1-0-1 "ONLINE" log_must check_hotspare_state $TESTPOOL draid1-0-1 "INUSE" + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" # Distributed spare in mirror with original device still online log_must check_vdev_state $TESTPOOL $BASEDIR/vdev9 "ONLINE" @@ -129,6 +135,9 @@ for replace_mode in "healing" "sequential"; do log_must check_vdev_state $TESTPOOL spare-9 "ONLINE" log_must check_vdev_state $TESTPOOL draid1-0-2 "ONLINE" log_must check_hotspare_state $TESTPOOL draid1-0-2 "INUSE" + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" # Normal faulted device replacement new_vdev0="$BASEDIR/new_vdev0" @@ -137,6 +146,9 @@ for replace_mode in "healing" "sequential"; do log_must check_vdev_state $TESTPOOL $BASEDIR/vdev0 "FAULTED" log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev0 $new_vdev0 log_must check_vdev_state $TESTPOOL $new_vdev0 "ONLINE" + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" # Distributed spare faulted device replacement log_must zpool offline -f $TESTPOOL $BASEDIR/vdev2 @@ -145,6 +157,9 @@ for replace_mode in "healing" "sequential"; do log_must check_vdev_state $TESTPOOL spare-2 "DEGRADED" log_must check_vdev_state $TESTPOOL draid1-0-3 "ONLINE" log_must check_hotspare_state $TESTPOOL draid1-0-3 "INUSE" + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" # Normal online device replacement new_vdev1="$BASEDIR/new_vdev1" @@ -152,6 +167,9 @@ for replace_mode in "healing" "sequential"; do log_must check_vdev_state $TESTPOOL $BASEDIR/vdev1 "ONLINE" log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev1 $new_vdev1 log_must check_vdev_state $TESTPOOL $new_vdev1 "ONLINE" + log_must verify_pool $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" # Distributed spare online device replacement (then fault) log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev3 draid1-0-4 @@ -161,35 +179,13 @@ for replace_mode in "healing" "sequential"; do log_must zpool offline -f $TESTPOOL $BASEDIR/vdev3 log_must check_vdev_state $TESTPOOL $BASEDIR/vdev3 "FAULTED" log_must check_vdev_state $TESTPOOL spare-3 "DEGRADED" - - resilver_cksum=$(cksum_pool $TESTPOOL) - if [[ $resilver_cksum != 0 ]]; then - log_must zpool status -v $TESTPOOL - log_fail "$replace_mode resilver cksum errors: $resilver_cksum" - fi - - if [[ "$replace_mode" = "healing" ]]; then - log_must zpool scrub -w $TESTPOOL - else - if [[ $(get_tunable REBUILD_SCRUB_ENABLED) -eq 0 ]]; then - log_must zpool scrub -w 
-		else
-			log_must wait_scrubbed $TESTPOOL
-		fi
-	fi
-
-	log_must is_pool_scrubbed $TESTPOOL
-
-	scrub_cksum=$(cksum_pool $TESTPOOL)
-	if [[ $scrub_cksum != 0 ]]; then
-		log_must zpool status -v $TESTPOOL
-		log_fail "scrub cksum errors: $scrub_cksum"
-	fi
-
+	log_must verify_pool $TESTPOOL
 	log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
 	log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
 
+	# Verify the original data is valid
 	log_must is_data_valid $TESTPOOL
+	log_must check_pool_status $TESTPOOL "errors" "No known data errors"
 
 	cleanup
 done
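
The effect of the two new flags can be summarized with a standalone sketch. The snippet below is only an illustration of the gating that vdev_raidz_io_done_verified() performs after this patch: the column_t struct, should_issue_repair() helper, and the example values are hypothetical stand-ins, not ZFS types or APIs. A repair write is issued only when the allow flag is set, and then either unconditionally (the force flag, used for rebuild and replacement targets) or because the column actually returned an error for a non-empty mapping.

/*
 * Standalone sketch (not ZFS code): column_t and should_issue_repair()
 * are simplified stand-ins for raidz_col_t and the check performed in
 * vdev_raidz_io_done_verified() after this patch.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef struct column {
	int	error;		/* read error for this column, 0 if none */
	size_t	size;		/* bytes mapped to this column */
	bool	force_repair;	/* rebuild/replacement target: always rewrite */
	bool	allow_repair;	/* repair writes permitted at all */
} column_t;

static bool
should_issue_repair(const column_t *col)
{
	if (!col->allow_repair)
		return (false);		/* e.g. healthy peers during a rebuild */
	if (col->force_repair)
		return (true);		/* child being rebuilt or replaced */
	return (col->error != 0 && col->size != 0);	/* normal self-heal */
}

int
main(void)
{
	column_t rebuild_target = { 0, 4096, true, true };
	column_t peer_during_rebuild = { 0, 4096, false, false };
	column_t failed_read = { 5, 4096, false, true };	/* EIO-like */

	printf("rebuild target: %d\n", should_issue_repair(&rebuild_target));
	printf("peer during rebuild: %d\n",
	    should_issue_repair(&peer_during_rebuild));
	printf("failed read: %d\n", should_issue_repair(&failed_read));
	return (0);
}

During a sequential rebuild the dRAID read path clears the allow flag on children that are not being rebuilt, so a column that happens to return an error is never overwritten with parity-derived data that cannot be checksum-verified; only the rebuilding child, which is marked ESTALE and left repairable, receives the reconstructed data.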