From 93c8e91fe7f5c55612131db64297106d7e5d1cdd Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Sat, 8 May 2021 08:57:25 -0700 Subject: [PATCH] Fix dRAID self-healing short columns When dRAID performs a normal read operation only the data columns in the raid map are read from disk. This is enough information to calculate the checksum, verify it, and return the needed data to the application. It's only in the event of a checksum failure that the additional parity and any empty columns must be read since they are required for parity reconstruction. Reading these additional columns is handled by vdev_raidz_read_all() which calls vdev_draid_map_alloc_empty() to expand the raid_map_t and submit IOs for the missing columns. This all works correctly, but it fails to account for any "short" columns. These are data columns which are padded with an empty skip sector at the end. Since that empty sector is not needed for a normal read it's not read when the column is first read from disk. However, like the parity and empty columns the skip sector is needed to perform reconstruction. The fix is to mark any "short" columns as never being read by clearing the rc_tried flag when expanding the raid_map_t. This will cause the entire column to be re-read from disk in the event of a checksum failure allowing the self-healing functionality to repair the block. Note that this only affects the self-healing feature because when scrubbing a pool the parity, data, and empty columns are all read initially to verify their contents. Furthermore, only blocks which contain "short" columns would be affected, and only when the memory backing the skip sector wasn't already zeroed out. This change extends the existing redundancy_raidz.ksh test case to verify self-healing (as well as resilver and scrub). It then applies the same test case to dRAID with a slightly modified version of the test script called redundancy_draid.ksh. The unused variable combrec was also removed from both test cases. 
Reviewed-by: Matthew Ahrens Reviewed-by: Mark Maybee Signed-off-by: Brian Behlendorf Closes #12010 --- module/zfs/vdev_draid.c | 8 +- tests/runfiles/common.run | 4 +- .../tests/functional/redundancy/Makefile.am | 1 + .../redundancy/redundancy_draid.ksh | 248 ++++++++++++++++++ .../redundancy/redundancy_raidz.ksh | 52 +++- 5 files changed, 309 insertions(+), 4 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index fb2143e946..c65ce1cd60 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -812,7 +812,12 @@ vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr) /* this is a "big column", nothing to add */ ASSERT3P(rc->rc_abd, !=, NULL); } else { - /* short data column, add a skip sector */ + /* + * short data column, add a skip sector and clear + * rc_tried to force the entire column to be re-read + * thereby including the missing skip sector data + * which is needed for reconstruction. 
+ */ ASSERT3U(rc->rc_size + skip_size, ==, parity_size); ASSERT3U(rr->rr_nempty, !=, 0); ASSERT3P(rc->rc_abd, !=, NULL); @@ -823,6 +828,7 @@ vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr) abd_gang_add(rc->rc_abd, abd_get_offset_size( rr->rr_abd_empty, skip_off, skip_size), B_TRUE); skip_off += skip_size; + rc->rc_tried = 0; } /* diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 07c816f52f..f1aa649cb0 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -741,8 +741,8 @@ tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos'] tags = ['functional', 'raidz'] [tests/functional/redundancy] -tests = ['redundancy_draid1', 'redundancy_draid2', 'redundancy_draid3', - 'redundancy_draid_spare1', 'redundancy_draid_spare2', +tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2', + 'redundancy_draid3', 'redundancy_draid_spare1', 'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror', 'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2', 'redundancy_raidz3', 'redundancy_stripe'] diff --git a/tests/zfs-tests/tests/functional/redundancy/Makefile.am b/tests/zfs-tests/tests/functional/redundancy/Makefile.am index 7b85d6a1bf..ac323c893d 100644 --- a/tests/zfs-tests/tests/functional/redundancy/Makefile.am +++ b/tests/zfs-tests/tests/functional/redundancy/Makefile.am @@ -2,6 +2,7 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/redundancy dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ + redundancy_draid.ksh \ redundancy_draid1.ksh \ redundancy_draid2.ksh \ redundancy_draid3.ksh \ diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh new file mode 100755 index 0000000000..8015e682c8 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh @@ -0,0 +1,248 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject 
to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# dRAID should provide redundancy +# +# STRATEGY: +# 1. Create block device files for the test draid pool +# 2. 
For each parity value [1..3] +# - create draid pool +# - fill it with some directories/files +# - verify self-healing by overwriting devices +# - verify resilver by replacing devices +# - verify scrub by zeroing devices +# - destroy the draid pool + +typeset -r devs=6 +typeset -r dev_size_mb=512 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) + +function cleanup +{ + poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL" + + for i in {0..$devs}; do + rm -f "$TEST_BASE_DIR/dev-$i" + done + + set_tunable32 PREFETCH_DISABLE $prefetch_disable +} + +function test_selfheal # <pool> <parity> <dir> +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + typeset mntpnt=$(get_prop mountpoint $pool/fs) + log_must find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1 + log_must check_pool_status $pool "errors" "No known data errors" + + # + # Scrub the pool because the find command will only self-heal blocks + # from the files which were read. Before overwriting additional + # devices we need to repair all of the blocks in the pool. 
+ # + log_must zpool scrub -w $pool + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool + + log_must zpool export $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + typeset mntpnt=$(get_prop mountpoint $pool/fs) + log_must find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1 + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool scrub -w $pool + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool +} + +function test_resilver # <pool> <parity> <dir> +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool offline $pool $dir/dev-$i + done + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool labelclear -f $dir/dev-$i + done + + log_must zpool import -o cachefile=none -d $dir $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool replace -fw $pool $dir/dev-$i + done + + log_must check_pool_status $pool "errors" "No known data errors" + resilver_cksum=$(cksum_pool $pool) + if [[ $resilver_cksum != 0 ]]; then + log_must zpool status -v $pool + log_fail "resilver cksum errors: $resilver_cksum" + fi + + log_must zpool clear $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool offline $pool $dir/dev-$i + done + + log_must zpool export $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool labelclear -f $dir/dev-$i + done + + log_must zpool import -o cachefile=none -d $dir $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool replace -fw $pool $dir/dev-$i + done + + log_must check_pool_status $pool "errors" "No known data errors" + resilver_cksum=$(cksum_pool $pool) + if [[ $resilver_cksum != 0 ]]; then + log_must zpool status -v $pool + log_fail 
"resilver cksum errors: $resilver_cksum" + fi + + log_must zpool clear $pool +} + +function test_scrub # <pool> <parity> <dir> +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + log_must zpool scrub -w $pool + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool + + log_must zpool export $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + log_must zpool scrub -w $pool + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 + +# Disk files which will be used by pool +for i in {0..$(($devs - 1))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +# Extra disk file, not a member of the pool (removed by cleanup) +log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs + +for nparity in 1 2 3; do + raid=draid$nparity + dir=$TEST_BASE_DIR + + log_must zpool create -f -o cachefile=none $TESTPOOL $raid ${disks[@]} + log_must zfs set primarycache=metadata $TESTPOOL + + log_must zfs create $TESTPOOL/fs + log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + + log_must zfs create -o compress=on $TESTPOOL/fs2 + log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + + log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 + log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + + typeset pool_size=$(get_pool_prop size $TESTPOOL) + + log_must zpool export $TESTPOOL + log_must zpool import -o cachefile=none -d $dir $TESTPOOL + + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + + test_selfheal 
$TESTPOOL $nparity $dir + test_resilver $TESTPOOL $nparity $dir + test_scrub $TESTPOOL $nparity $dir + + log_must zpool destroy "$TESTPOOL" +done + +log_pass "draid redundancy test succeeded." diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh index 8d32e0603a..d736883916 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh @@ -23,6 +23,7 @@ # # Copyright (c) 2020 by vStack. All rights reserved. # Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. # . $STF_SUITE/include/libtest.shlib @@ -37,6 +38,7 @@ # 2. For each parity value [1..3] # - create raidz pool # - fill it with some directories/files +# - verify self-healing by overwriting devices # - verify resilver by replacing devices # - verify scrub by zeroing devices # - destroy the raidz pool @@ -59,6 +61,54 @@ function cleanup set_tunable32 PREFETCH_DISABLE $prefetch_disable } +function test_selfheal # <pool> <parity> <dir> +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + typeset mntpnt=$(get_prop mountpoint $pool/fs) + log_must find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1 + log_must check_pool_status $pool "errors" "No known data errors" + + # + # Scrub the pool because the find command will only self-heal blocks + # from the files which were read. Before overwriting additional + # devices we need to repair all of the blocks in the pool. 
+ # + log_must zpool scrub -w $pool + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool + + log_must zpool export $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + typeset mntpnt=$(get_prop mountpoint $pool/fs) + log_must find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1 + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool scrub -w $pool + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool +} + function test_resilver # { typeset pool=$1 @@ -121,7 +171,6 @@ function test_scrub # typeset pool=$1 typeset nparity=$2 typeset dir=$3 - typeset combrec=$4 log_must zpool export $pool @@ -189,6 +238,7 @@ for nparity in 1 2 3; do log_must check_pool_status $TESTPOOL "errors" "No known data errors" + test_selfheal $TESTPOOL $nparity $dir test_resilver $TESTPOOL $nparity $dir test_scrub $TESTPOOL $nparity $dir