Fix dRAID self-healing short columns

When dRAID performs a normal read operation only the data columns
in the raid map are read from disk.  This is enough information to
calculate the checksum, verify it, and return the needed data to the
application.  It's only in the event of a checksum failure that the
additional parity and any empty columns must be read since they are
required for parity reconstruction.

Reading these additional columns is handled by vdev_raidz_read_all()
which calls vdev_draid_map_alloc_empty() to expand the raid_map_t
and submit IOs for the missing columns.  This all works correctly,
but it fails to account for any "short" columns.  These are data
columns which are padded with an empty skip sector at the end.
Since that empty sector is not needed for a normal read it's not
read when the column is first read from disk.  However, like the parity
and empty columns the skip sector is needed to perform reconstruction.

The fix is to mark any "short" columns as never being read by clearing
the rc_tried flag when expanding the raid_map_t.  This will cause
the entire column to be re-read from disk in the event of a checksum
failure allowing the self-healing functionality to repair the block.

Note that this only affects the self-healing feature because when
scrubbing a pool the parity, data, and empty columns are all read
initially to verify their contents.  Furthermore, only blocks which
contain "short" columns would be affected, and only when the memory
backing the skip sector wasn't already zeroed out.

This change extends the existing redundancy_raidz.ksh test case to
verify self-healing (as well as resilver and scrub).  Then applies
the same test case to dRAID with a slightly modified version of
the test script called redundancy_draid.ksh.  The unused variable
combrec was also removed from both test cases.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #12010
This commit is contained in:
Brian Behlendorf 2021-05-08 08:57:25 -07:00
parent b1dd6351bb
commit 2085a5f992
5 changed files with 309 additions and 4 deletions

View File

@ -812,7 +812,12 @@ vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr)
/* this is a "big column", nothing to add */ /* this is a "big column", nothing to add */
ASSERT3P(rc->rc_abd, !=, NULL); ASSERT3P(rc->rc_abd, !=, NULL);
} else { } else {
/* short data column, add a skip sector */ /*
* short data column, add a skip sector and clear
* rc_tried to force the entire column to be re-read
* thereby including the missing skip sector data
* which is needed for reconstruction.
*/
ASSERT3U(rc->rc_size + skip_size, ==, parity_size); ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
ASSERT3U(rr->rr_nempty, !=, 0); ASSERT3U(rr->rr_nempty, !=, 0);
ASSERT3P(rc->rc_abd, !=, NULL); ASSERT3P(rc->rc_abd, !=, NULL);
@ -823,6 +828,7 @@ vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr)
abd_gang_add(rc->rc_abd, abd_get_offset_size( abd_gang_add(rc->rc_abd, abd_get_offset_size(
rr->rr_abd_empty, skip_off, skip_size), B_TRUE); rr->rr_abd_empty, skip_off, skip_size), B_TRUE);
skip_off += skip_size; skip_off += skip_size;
rc->rc_tried = 0;
} }
/* /*

View File

@ -741,8 +741,8 @@ tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos']
tags = ['functional', 'raidz'] tags = ['functional', 'raidz']
[tests/functional/redundancy] [tests/functional/redundancy]
tests = ['redundancy_draid1', 'redundancy_draid2', 'redundancy_draid3', tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2',
'redundancy_draid_spare1', 'redundancy_draid_spare2', 'redundancy_draid3', 'redundancy_draid_spare1', 'redundancy_draid_spare2',
'redundancy_draid_spare3', 'redundancy_mirror', 'redundancy_raidz', 'redundancy_draid_spare3', 'redundancy_mirror', 'redundancy_raidz',
'redundancy_raidz1', 'redundancy_raidz2', 'redundancy_raidz3', 'redundancy_raidz1', 'redundancy_raidz2', 'redundancy_raidz3',
'redundancy_stripe'] 'redundancy_stripe']

View File

@ -2,6 +2,7 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/redundancy
dist_pkgdata_SCRIPTS = \ dist_pkgdata_SCRIPTS = \
setup.ksh \ setup.ksh \
cleanup.ksh \ cleanup.ksh \
redundancy_draid.ksh \
redundancy_draid1.ksh \ redundancy_draid1.ksh \
redundancy_draid2.ksh \ redundancy_draid2.ksh \
redundancy_draid3.ksh \ redundancy_draid3.ksh \

View File

@ -0,0 +1,248 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2020 by vStack. All rights reserved.
# Copyright (c) 2021 by Delphix. All rights reserved.
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib
#
# DESCRIPTION:
# dRAID should provide redundancy
#
# STRATEGY:
# 1. Create block device files for the test draid pool
# 2. For each parity value [1..3]
# - create draid pool
# - fill it with some directories/files
# - verify self-healing by overwriting devices
# - verify resilver by replacing devices
# - verify scrub by zeroing devices
# - destroy the draid pool
typeset -r devs=6
typeset -r dev_size_mb=512
typeset -a disks
prefetch_disable=$(get_tunable PREFETCH_DISABLE)
function cleanup
{
poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL"
for i in {0..$devs}; do
rm -f "$TEST_BASE_DIR/dev-$i"
done
set_tunable32 PREFETCH_DISABLE $prefetch_disable
}
function test_selfheal # <pool> <parity> <dir>
{
typeset pool=$1
typeset nparity=$2
typeset dir=$3
log_must zpool export $pool
for (( i=0; i<$nparity; i=i+1 )); do
log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \
bs=1M seek=4 count=$(($dev_size_mb-4))
done
log_must zpool import -o cachefile=none -d $dir $pool
typeset mntpnt=$(get_prop mountpoint $pool/fs)
log_must find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1
log_must check_pool_status $pool "errors" "No known data errors"
#
# Scrub the pool because the find command will only self-heal blocks
# from the files which were read. Before overwriting additional
# devices we need to repair all of the blocks in the pool.
#
log_must zpool scrub -w $pool
log_must check_pool_status $pool "errors" "No known data errors"
log_must zpool clear $pool
log_must zpool export $pool
for (( i=$nparity; i<$nparity*2; i=i+1 )); do
log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \
bs=1M seek=4 count=$(($dev_size_mb-4))
done
log_must zpool import -o cachefile=none -d $dir $pool
typeset mntpnt=$(get_prop mountpoint $pool/fs)
log_must find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1
log_must check_pool_status $pool "errors" "No known data errors"
log_must zpool scrub -w $pool
log_must check_pool_status $pool "errors" "No known data errors"
log_must zpool clear $pool
}
function test_resilver # <pool> <parity> <dir>
{
typeset pool=$1
typeset nparity=$2
typeset dir=$3
for (( i=0; i<$nparity; i=i+1 )); do
log_must zpool offline $pool $dir/dev-$i
done
log_must zpool export $pool
for (( i=0; i<$nparity; i=i+1 )); do
log_must zpool labelclear -f $dir/dev-$i
done
log_must zpool import -o cachefile=none -d $dir $pool
for (( i=0; i<$nparity; i=i+1 )); do
log_must zpool replace -fw $pool $dir/dev-$i
done
log_must check_pool_status $pool "errors" "No known data errors"
resilver_cksum=$(cksum_pool $pool)
if [[ $resilver_cksum != 0 ]]; then
log_must zpool status -v $pool
log_fail "resilver cksum errors: $resilver_cksum"
fi
log_must zpool clear $pool
for (( i=$nparity; i<$nparity*2; i=i+1 )); do
log_must zpool offline $pool $dir/dev-$i
done
log_must zpool export $pool
for (( i=$nparity; i<$nparity*2; i=i+1 )); do
log_must zpool labelclear -f $dir/dev-$i
done
log_must zpool import -o cachefile=none -d $dir $pool
for (( i=$nparity; i<$nparity*2; i=i+1 )); do
log_must zpool replace -fw $pool $dir/dev-$i
done
log_must check_pool_status $pool "errors" "No known data errors"
resilver_cksum=$(cksum_pool $pool)
if [[ $resilver_cksum != 0 ]]; then
log_must zpool status -v $pool
log_fail "resilver cksum errors: $resilver_cksum"
fi
log_must zpool clear $pool
}
function test_scrub # <pool> <parity> <dir>
{
typeset pool=$1
typeset nparity=$2
typeset dir=$3
log_must zpool export $pool
for (( i=0; i<$nparity; i=i+1 )); do
dd conv=notrunc if=/dev/zero of=$dir/dev-$i \
bs=1M seek=4 count=$(($dev_size_mb-4))
done
log_must zpool import -o cachefile=none -d $dir $pool
log_must zpool scrub -w $pool
log_must check_pool_status $pool "errors" "No known data errors"
log_must zpool clear $pool
log_must zpool export $pool
for (( i=$nparity; i<$nparity*2; i=i+1 )); do
dd conv=notrunc if=/dev/zero of=$dir/dev-$i \
bs=1M seek=4 count=$(($dev_size_mb-4))
done
log_must zpool import -o cachefile=none -d $dir $pool
log_must zpool scrub -w $pool
log_must check_pool_status $pool "errors" "No known data errors"
log_must zpool clear $pool
}
log_onexit cleanup
log_must set_tunable32 PREFETCH_DISABLE 1
# Disk files which will be used by pool
for i in {0..$(($devs - 1))}; do
device=$TEST_BASE_DIR/dev-$i
log_must truncate -s ${dev_size_mb}M $device
disks[${#disks[*]}+1]=$device
done
# Disk file which will be attached
log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs
for nparity in 1 2 3; do
raid=draid$nparity
dir=$TEST_BASE_DIR
log_must zpool create -f -o cachefile=none $TESTPOOL $raid ${disks[@]}
log_must zfs set primarycache=metadata $TESTPOOL
log_must zfs create $TESTPOOL/fs
log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R
log_must zfs create -o compress=on $TESTPOOL/fs2
log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R
log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3
log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R
typeset pool_size=$(get_pool_prop size $TESTPOOL)
log_must zpool export $TESTPOOL
log_must zpool import -o cachefile=none -d $dir $TESTPOOL
log_must check_pool_status $TESTPOOL "errors" "No known data errors"
test_selfheal $TESTPOOL $nparity $dir
test_resilver $TESTPOOL $nparity $dir
test_scrub $TESTPOOL $nparity $dir
log_must zpool destroy "$TESTPOOL"
done
log_pass "draid redundancy test succeeded."

View File

@ -23,6 +23,7 @@
# #
# Copyright (c) 2020 by vStack. All rights reserved. # Copyright (c) 2020 by vStack. All rights reserved.
# Copyright (c) 2021 by Delphix. All rights reserved. # Copyright (c) 2021 by Delphix. All rights reserved.
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
# #
. $STF_SUITE/include/libtest.shlib . $STF_SUITE/include/libtest.shlib
@ -37,6 +38,7 @@
# 2. For each parity value [1..3] # 2. For each parity value [1..3]
# - create raidz pool # - create raidz pool
# - fill it with some directories/files # - fill it with some directories/files
# - verify self-healing by overwriting devices
# - verify resilver by replacing devices # - verify resilver by replacing devices
# - verify scrub by zeroing devices # - verify scrub by zeroing devices
# - destroy the raidz pool # - destroy the raidz pool
@ -59,6 +61,54 @@ function cleanup
set_tunable32 PREFETCH_DISABLE $prefetch_disable set_tunable32 PREFETCH_DISABLE $prefetch_disable
} }
function test_selfheal # <pool> <parity> <dir>
{
typeset pool=$1
typeset nparity=$2
typeset dir=$3
log_must zpool export $pool
for (( i=0; i<$nparity; i=i+1 )); do
log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \
bs=1M seek=4 count=$(($dev_size_mb-4))
done
log_must zpool import -o cachefile=none -d $dir $pool
typeset mntpnt=$(get_prop mountpoint $pool/fs)
log_must find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1
log_must check_pool_status $pool "errors" "No known data errors"
#
# Scrub the pool because the find command will only self-heal blocks
# from the files which were read. Before overwriting additional
# devices we need to repair all of the blocks in the pool.
#
log_must zpool scrub -w $pool
log_must check_pool_status $pool "errors" "No known data errors"
log_must zpool clear $pool
log_must zpool export $pool
for (( i=$nparity; i<$nparity*2; i=i+1 )); do
log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \
bs=1M seek=4 count=$(($dev_size_mb-4))
done
log_must zpool import -o cachefile=none -d $dir $pool
typeset mntpnt=$(get_prop mountpoint $pool/fs)
log_must find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1
log_must check_pool_status $pool "errors" "No known data errors"
log_must zpool scrub -w $pool
log_must check_pool_status $pool "errors" "No known data errors"
log_must zpool clear $pool
}
function test_resilver # <pool> <parity> <dir> function test_resilver # <pool> <parity> <dir>
{ {
typeset pool=$1 typeset pool=$1
@ -121,7 +171,6 @@ function test_scrub # <pool> <parity> <dir>
typeset pool=$1 typeset pool=$1
typeset nparity=$2 typeset nparity=$2
typeset dir=$3 typeset dir=$3
typeset combrec=$4
log_must zpool export $pool log_must zpool export $pool
@ -189,6 +238,7 @@ for nparity in 1 2 3; do
log_must check_pool_status $TESTPOOL "errors" "No known data errors" log_must check_pool_status $TESTPOOL "errors" "No known data errors"
test_selfheal $TESTPOOL $nparity $dir
test_resilver $TESTPOOL $nparity $dir test_resilver $TESTPOOL $nparity $dir
test_scrub $TESTPOOL $nparity $dir test_scrub $TESTPOOL $nparity $dir