OpenZFS 9318 - vol_volsize_to_reservation does not account for raidz skip blocks

When a volume is created in a pool with raidz vdevs and volblocksize != 128k, the volume can reference more space than is reserved with the automatically calculated refreservation. There are two deficiencies in vol_volsize_to_reservation that contribute to this: 1) Skip blocks may be added to keep each allocation a multiple of parity + 1. This is the dominating factor when volblocksize is close to 2^ashift. 2) raidz deflation for 128 KB blocks is different for most other block sizes. See "The theory of raidz space accounting" comment in libzfs_dataset.c for a full explanation. Authored by: Mike Gerdts <mike.gerdts@joyent.com> Reviewed by: Richard Elling <Richard.Elling@RichardElling.com> Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com> Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com> Reviewed by: Matt Ahrens <matt@delphix.com> Reviewed by: Kody Kantor <kody.kantor@joyent.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Approved by: Dan McDonald <danmcd@joyent.com> Ported-by: Mike Gerdts <mike.gerdts@joyent.com> Porting Notes: * ZTS: wait for zvols to exist before writing * ZTS: use log_must_busy with {zpool|zfs} destroy OpenZFS-issue: https://www.illumos.org/issues/9318 OpenZFS-commit: https://github.com/illumos/illumos-gate/commit/b73ccab0 Closes #8973
2019-06-30 23:38:07 +00:00 · 2019-06-30 23:38:07 +00:00 · 341166c843
parent 6dbca94f0c
commit 341166c843
7 changed files with 520 additions and 14 deletions
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@ -29,6 +29,7 @@
 * Copyright 2016 Nexenta Systems, Inc.
 * Copyright (c) 2019 Datto Inc.
 * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>
 * Copyright 2019 Joyent, Inc.
 */
 #include <assert.h>
@ -998,10 +999,11 @@ zfs_do_create(int argc, char **argv)
 			zpool_close(zpool_handle);
 			goto error;
 		}
 		zpool_close(zpool_handle);
-		volsize = zvol_volsize_to_reservation(volsize, real_props);
+		volsize = zvol_volsize_to_reservation(zpool_handle, volsize,
 		    real_props);
 		nvlist_free(real_props);
 		zpool_close(zpool_handle);
 		if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
 		    &strval) != 0) {
--- a/include/libzfs.h
+++ b/include/libzfs.h
@ -22,7 +22,7 @@
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright Joyent, Inc.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2016, Intel Corporation.
 * Copyright 2016 Nexenta Systems, Inc.
@ -688,7 +688,8 @@ extern int zfs_hold(zfs_handle_t *, const char *, const char *,
 extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *);
 extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
 extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
-extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
+extern uint64_t zvol_volsize_to_reservation(zpool_handle_t *, uint64_t,
    nvlist_t *);
 typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
    uid_t rid, uint64_t space);
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@ -21,7 +21,7 @@
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2018, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2012 DEY Storage Systems, Inc.  All rights reserved.
 * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
@ -1617,6 +1617,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
 	uint64_t new_reservation;
 	zfs_prop_t resv_prop;
 	nvlist_t *props;
 	zpool_handle_t *zph = zpool_handle(zhp);
 	/*
 	 * If this is an existing volume, and someone is setting the volsize,
@ -1631,7 +1632,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
 	fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 	    zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE));
-	if ((zvol_volsize_to_reservation(old_volsize, props) !=
+	if ((zvol_volsize_to_reservation(zph, old_volsize, props) !=
 	    old_reservation) || nvlist_exists(nvl,
 	    zfs_prop_to_name(resv_prop))) {
 		fnvlist_free(props);
@ -1642,7 +1643,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
 		fnvlist_free(props);
 		return (-1);
 	}
-	new_reservation = zvol_volsize_to_reservation(new_volsize, props);
+	new_reservation = zvol_volsize_to_reservation(zph, new_volsize, props);
 	fnvlist_free(props);
 	if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop),
@ -1697,7 +1698,8 @@ zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl)
 		volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
 	}
-	resvsize = zvol_volsize_to_reservation(volsize, props);
+	resvsize = zvol_volsize_to_reservation(zpool_handle(zhp), volsize,
 	    props);
 	fnvlist_free(props);
 	(void) nvlist_remove_all(nvl, zfs_prop_to_name(prop));
@ -5376,12 +5378,176 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
 }
 /*
- * Convert the zvol's volume size to an appropriate reservation.
+ * The theory of raidz space accounting
 *
 * The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block
 * will "reference" 128KB, even though it allocates more than that, to store the
 * parity information (and perhaps skip sectors). This concept of the
 * "referenced" (and other DMU space accounting) being lower than the allocated
 * space by a constant factor is called "raidz deflation."
 *
 * As mentioned above, the constant factor for raidz deflation assumes a 128KB
 * block size. However, zvols typically have a much smaller block size (default
 * 8KB). These smaller blocks may require proportionally much more parity
 * information (and perhaps skip sectors). In this case, the change to the
 * "referenced" property may be much more than the logical block size.
 *
 * Suppose a raidz vdev has 5 disks with ashift=12.  A 128k block may be written
 * as follows.
 *
 * +-------+-------+-------+-------+-------+
 * | disk1 | disk2 | disk3 | disk4 | disk5 |
 * +-------+-------+-------+-------+-------+
 * |  P0   |  D0   |  D8   |  D16  |  D24  |
 * |  P1   |  D1   |  D9   |  D17  |  D25  |
 * |  P2   |  D2   |  D10  |  D18  |  D26  |
 * |  P3   |  D3   |  D11  |  D19  |  D27  |
 * |  P4   |  D4   |  D12  |  D20  |  D28  |
 * |  P5   |  D5   |  D13  |  D21  |  D29  |
 * |  P6   |  D6   |  D14  |  D22  |  D30  |
 * |  P7   |  D7   |  D15  |  D23  |  D31  |
 * +-------+-------+-------+-------+-------+
 *
 * Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data
 * sectors.  The dataset's referenced will increase by 128k and the pool's
 * allocated and free properties will be adjusted by 160k.
 *
 * A 4k block written to the same raidz vdev will require two 4k sectors.  The
 * blank cells represent unallocated space.
 *
 * +-------+-------+-------+-------+-------+
 * | disk1 | disk2 | disk3 | disk4 | disk5 |
 * +-------+-------+-------+-------+-------+
 * |  P0   |  D0   |       |       |       |
 * +-------+-------+-------+-------+-------+
 *
 * Above, notice that the 4k block required one sector for parity and another
 * for data.  vdev_raidz_asize() will return 8k and as such the pool's allocated
 * and free properties will be adjusted by 8k.  The dataset will not be charged
 * 8k.  Rather, it will be charged a value that is scaled according to the
 * overhead of the 128k block on the same vdev.  This 8k allocation will be
 * charged 8k * 128k / 160k.  128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as
 * calculated in the 128k block example above.
 *
 * Every raidz allocation is sized to be a multiple of nparity+1 sectors.  That
 * is, every raidz1 allocation will be a multiple of 2 sectors, raidz2
 * allocations are a multiple of 3 sectors, and raidz3 allocations are a
 * multiple of of 4 sectors.  When a block does not fill the required number of
 * sectors, skip blocks (sectors) are used.
 *
 * An 8k block being written to a raidz vdev may be written as follows:
 *
 * +-------+-------+-------+-------+-------+
 * | disk1 | disk2 | disk3 | disk4 | disk5 |
 * +-------+-------+-------+-------+-------+
 * |  P0   |  D0   |  D1   |  S0   |       |
 * +-------+-------+-------+-------+-------+
 *
 * In order to maintain the nparity+1 allocation size, a skip block (S0) was
 * added.  For this 8k block, the pool's allocated and free properties are
 * adjusted by 16k and the dataset's referenced is increased by 16k * 128k /
 * 160k.  Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in
 * the 128k block example above.
 *
 * Compression may lead to a variety of block sizes being written for the same
 * volume or file.  There is no clear way to reserve just the amount of space
 * that will be required, so the worst case (no compression) is assumed.
 * Note that metadata blocks will typically be compressed, so the reservation
 * size returned by zvol_volsize_to_reservation() will generally be slightly
 * larger than the maximum that the volume can reference.
 */
 /*
 * Derived from function of same name in module/zfs/vdev_raidz.c.  Returns the
 * amount of space (in bytes) that will be allocated for the specified block
 * size. Note that the "referenced" space accounted will be less than this, but
 * not necessarily equal to "blksize", due to RAIDZ deflation.
 */
 static uint64_t
 vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
    uint64_t blksize)
 {
 	uint64_t asize, ndata;
 	ASSERT3U(ndisks, >, nparity);
 	ndata = ndisks - nparity;
 	asize = ((blksize - 1) >> ashift) + 1;
 	asize += nparity * ((asize + ndata - 1) / ndata);
 	asize = roundup(asize, nparity + 1) << ashift;
 	return (asize);
 }
 /*
 * Determine how much space will be allocated if it lands on the most space-
 * inefficient top-level vdev.  Returns the size in bytes required to store one
 * copy of the volume data.  See theory comment above.
 */
 static uint64_t
 volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
 {
 	nvlist_t *config, *tree, **vdevs;
 	uint_t nvdevs, v;
 	uint64_t ret = 0;
 	config = zpool_get_config(zhp, NULL);
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 ||
 	    nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
 	    &vdevs, &nvdevs) != 0) {
 		return (nblocks * blksize);
 	}
 	for (v = 0; v < nvdevs; v++) {
 		char *type;
 		uint64_t nparity, ashift, asize, tsize;
 		nvlist_t **disks;
 		uint_t ndisks;
 		uint64_t volsize;
 		if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE,
 		    &type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 ||
 		    nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY,
 		    &nparity) != 0 ||
 		    nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT,
 		    &ashift) != 0 ||
 		    nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN,
 		    &disks, &ndisks) != 0) {
 			continue;
 		}
 		/* allocation size for the "typical" 128k block */
 		tsize = vdev_raidz_asize(ndisks, nparity, ashift,
 		    SPA_OLD_MAXBLOCKSIZE);
 		/* allocation size for the blksize block */
 		asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize);
 		/*
 		 * Scale this size down as a ratio of 128k / tsize.  See theory
 		 * statement above.
 		 */
 		volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize;
 		if (volsize > ret) {
 			ret = volsize;
 		}
 	}
 	if (ret == 0) {
 		ret = nblocks * blksize;
 	}
 	return (ret);
 }
 /*
 * Convert the zvol's volume size to an appropriate reservation.  See theory
 * comment above.
 *
 * Note: If this routine is updated, it is necessary to update the ZFS test
- * suite's shell version in reservation.kshlib.
+ * suite's shell version in reservation.shlib.
 */
 uint64_t
-zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
+zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize,
    nvlist_t *props)
 {
 	uint64_t numdb;
 	uint64_t nblocks, volblocksize;
@ -5397,7 +5563,14 @@ zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 	    &volblocksize) != 0)
 		volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
-	nblocks = volsize/volblocksize;
+
 	nblocks = volsize / volblocksize;
 	/*
 	 * Metadata defaults to using 128k blocks, not volblocksize blocks.  For
 	 * this reason, only the data blocks are scaled based on vdev config.
 	 */
 	volsize = volsize_from_vdevs(zph, nblocks, volblocksize);
 	/* start with metadnode L0-L6 */
 	numdb = 7;
 	/* calculate number of indirects */
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@ -756,7 +756,8 @@ tags = ['functional', 'refquota']
 [tests/functional/refreserv]
 tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
-    'refreserv_004_pos', 'refreserv_005_pos']
+    'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz',
    'refreserv_raidz']
 tags = ['functional', 'refreserv']
 [tests/functional/removal]
--- a/tests/zfs-tests/tests/functional/refreserv/Makefile.am
+++ b/tests/zfs-tests/tests/functional/refreserv/Makefile.am
@ -6,7 +6,9 @@ dist_pkgdata_SCRIPTS = \
 	refreserv_002_pos.ksh \
 	refreserv_003_pos.ksh \
 	refreserv_004_pos.ksh \
-	refreserv_005_pos.ksh
+	refreserv_005_pos.ksh \
 	refreserv_multi_raidz.ksh \
 	refreserv_raidz.ksh
 dist_pkgdata_DATA = \
 	refreserv.cfg
--- a/tests/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh
+++ b/tests/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh
@ -0,0 +1,197 @@
 #!/bin/ksh -p
 #
 # This file and its contents are supplied under the terms of the
 # Common Development and Distribution License ("CDDL"), version 1.0.
 # You may only use this file in accordance with the terms of version
 # 1.0 of the CDDL.
 #
 # A full copy of the text of the CDDL should have accompanied this
 # source.  A copy of the CDDL is also available via the Internet at
 # http://www.illumos.org/license/CDDL.
 #
 #
 # Copyright 2019 Joyent, Inc.
 #
 . $STF_SUITE/include/libtest.shlib
 . $STF_SUITE/tests/functional/refreserv/refreserv.cfg
 #
 # DESCRIPTION:
 #	raidz refreservation=auto picks worst raidz vdev
 #
 # STRATEGY:
 #	1. Create a pool with a single raidz vdev
 #	2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k]
 #	    - create a volume
 #	    - remember its refreservation
 #	    - destroy the volume
 #	3. Destroy the pool
 #	4. Recreate the pool with one more disk in the vdev, then repeat steps
 #	   2 and 3.
 #
 # NOTES:
 #	1. This test will use up to 14 disks but can cover the key concepts with
 #	   5 disks.
 #	2. If the disks are a mixture of 4Kn and 512n/512e, failures are likely.
 #
 verify_runnable "global"
 typeset -a alldisks=($DISKS)
 # The larger the volsize, the better zvol_volsize_to_reservation() is at
 # guessing the right number - though it is horrible with tiny blocks.  At 10M on
 # ashift=12, the estimate may be over 26% too high.
 volsize=100
 function cleanup
 {
 	default_cleanup_noexit
 	default_setup_noexit "${alldisks[0]}"
 }
 log_assert "raidz refreservation=auto picks worst raidz vdev"
 log_onexit cleanup
 poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL"
 # Testing tiny block sizes on ashift=12 pools causes so much size inflation
 # that small test disks may fill before creating small volumes.  However,
 # testing 512b and 1K blocks on ashift=9 pools is an ok approximation for
 # testing the problems that arise from 4K and 8K blocks on ashift=12 pools.
 bps=$(lsblk -nrdo min-io /dev/${alldisks[0]})
 case "$bps" in
 512)
 	allshifts=(9 10 17)
 	;;
 4096)
 	allshifts=(12 13 17)
 	;;
 *)
 	log_fail "bytes/sector: $bps != (512|4096)"
 	;;
 esac
 log_note "Testing in ashift=${allshifts[0]} mode"
 typeset -A sizes=
 #
 # Determine the refreservation for a $volsize MiB volume on each raidz type at
 # various block sizes.
 #
 for parity in 1 2 3; do
 	raid=raidz$parity
 	typeset -A sizes["$raid"]
 	# Ensure we hit scenarios with and without skip blocks
 	for ndisks in $((parity * 2)) $((parity * 2 + 1)); do
 		typeset -a disks=(${alldisks[0..$((ndisks - 1))]})
 		if (( ${#disks[@]} < ndisks )); then
 			log_note "Too few disks to test $raid-$ndisks"
 			continue
 		fi
 		typeset -A sizes["$raid"]["$ndisks"]
 		log_must zpool create "$TESTPOOL" "$raid" "${disks[@]}"
 		for bits in "${allshifts[@]}"; do
 			vbs=$((1 << bits))
 			log_note "Gathering refreservation for $raid-$ndisks" \
 			    "volblocksize=$vbs"
 			vol=$TESTPOOL/$TESTVOL
 			log_must zfs create -V ${volsize}m \
 			    -o volblocksize=$vbs "$vol"
 			refres=$(zfs get -Hpo value refreservation "$vol")
 			log_must test -n "$refres"
 			sizes["$raid"]["$ndisks"]["$vbs"]=$refres
 			log_must_busy zfs destroy "$vol"
 		done
 		log_must_busy zpool destroy "$TESTPOOL"
 	done
 done
 # A little extra info is always helpful when diagnosing problems.  To
 # pretty-print what you find in the log, do this in ksh:
 #   typeset -A sizes=(...)
 #   print -v sizes
 log_note "sizes=$(print -C sizes)"
 #
 # Helper furnction for checking that refreservation is calculated properly in
 # multi-vdev pools.  "Properly" is defined as assuming that all vdevs are as
 # space inefficient as the worst one.
 #
 function check_vdevs {
 	typeset raid=$1
 	typeset nd1=$2
 	typeset nd2=$3
 	typeset -a disks1 disks2
 	typeset vbs vol refres refres1 refres2 expect
 	disks1=(${alldisks[0..$((nd1 - 1))]})
 	disks2=(${alldisks[$nd1..$((nd1 + nd2 - 1))]})
 	if (( ${#disks2[@]} < nd2 )); then
 		log_note "Too few disks to test $raid-$nd1 + $raid=$nd2"
 		return
 	fi
 	log_must zpool create -f "$TESTPOOL" \
 	    "$raid" "${disks1[@]}" "$raid" "${disks2[@]}"
 	for bits in "${allshifts[@]}"; do
 		vbs=$((1 << bits))
 		log_note "Verifying $raid-$nd1 $raid-$nd2 volblocksize=$vbs"
 		vol=$TESTPOOL/$TESTVOL
 		log_must zfs create -V ${volsize}m -o volblocksize=$vbs "$vol"
 		refres=$(zfs get -Hpo value refreservation "$vol")
 		log_must test -n "$refres"
 		refres1=${sizes["$raid"]["$nd1"]["$vbs"]}
 		refres2=${sizes["$raid"]["$nd2"]["$vbs"]}
 		if (( refres1 > refres2 )); then
 			log_note "Expecting refres ($refres) to match refres" \
 			   "from $raid-$nd1 ($refres1)"
 			log_must test "$refres" -eq "$refres1"
 		else
 			log_note "Expecting refres ($refres) to match refres" \
 			   "from $raid-$nd1 ($refres2)"
 			log_must test "$refres" -eq "$refres2"
 		fi
 		log_must zfs destroy "$vol"
 	done
 	log_must zpool destroy "$TESTPOOL"
 }
 #
 # Verify that multi-vdev pools use the last optimistic size for all the
 # permutations within a particular raidz variant.
 #
 for raid in "${!sizes[@]}"; do
 	# ksh likes to create a [0] item for us.  Thanks, ksh!
 	[[ $raid == "0" ]] && continue
 	for nd1 in "${!sizes["$raid"][@]}"; do
 		# And with an empty array we get one key, ''.  Thanks, ksh!
 		[[ $nd1 == "0" || -z "$nd1" ]] && continue
 		for nd2 in "${!sizes["$raid"][@]}"; do
 			[[ $nd2 == "0" || -z "$nd2" ]] && continue
 			check_vdevs "$raid" "$nd1" "$nd2"
 		done
 	done
 done
 log_pass "raidz refreservation=auto picks worst raidz vdev"
--- a/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh
+++ b/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh
@ -0,0 +1,130 @@
 #!/bin/ksh -p
 #
 # This file and its contents are supplied under the terms of the
 # Common Development and Distribution License ("CDDL"), version 1.0.
 # You may only use this file in accordance with the terms of version
 # 1.0 of the CDDL.
 #
 # A full copy of the text of the CDDL should have accompanied this
 # source.  A copy of the CDDL is also available via the Internet at
 # http://www.illumos.org/license/CDDL.
 #
 #
 # Copyright 2019 Joyent, Inc.
 #
 . $STF_SUITE/include/libtest.shlib
 . $STF_SUITE/tests/functional/refreserv/refreserv.cfg
 #
 # DESCRIPTION:
 #	raidz refreservation=auto accounts for extra parity and skip blocks
 #
 # STRATEGY:
 #	1. Create a pool with a single raidz vdev
 #	2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k]
 #	    - create a volume
 #	    - fully overwrite it
 #	    - verify that referenced is less than or equal to reservation
 #	    - destroy the volume
 #	3. Destroy the pool
 #	4. Recreate the pool with one more disk in the vdev, then repeat steps
 #	   2 and 3.
 #	5. Repeat all steps above for raidz2 and raidz3.
 #
 # NOTES:
 #	1. This test will use up to 14 disks but can cover the key concepts with
 #	   5 disks.
 #	2. If the disks are a mixture of 4Kn and 512n/512e, failures are likely.
 #
 verify_runnable "global"
 typeset -a alldisks=($DISKS)
 # The larger the volsize, the better zvol_volsize_to_reservation() is at
 # guessing the right number.  At 10M on ashift=12, the estimate may be over 26%
 # too high.
 volsize=100
 function cleanup
 {
 	default_cleanup_noexit
 	default_setup_noexit "${alldisks[0]}"
 }
 log_assert "raidz refreservation=auto accounts for extra parity and skip blocks"
 log_onexit cleanup
 poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL"
 # Testing tiny block sizes on ashift=12 pools causes so much size inflation
 # that small test disks may fill before creating small volumes.  However,
 # testing 512b and 1K blocks on ashift=9 pools is an ok approximation for
 # testing the problems that arise from 4K and 8K blocks on ashift=12 pools.
 bps=$(lsblk -nrdo min-io /dev/${alldisks[0]})
 log_must test "$bps" -eq 512 -o "$bps" -eq 4096
 case "$bps" in
 512)
 	allshifts=(9 10 17)
 	maxpct=151
 	;;
 4096)
 	allshifts=(12 13 17)
 	maxpct=110
 	;;
 *)
 	log_fail "bytes/sector: $bps != (512|4096)"
 	;;
 esac
 log_note "Testing in ashift=${allshifts[0]} mode"
 # This loop handles all iterations of steps 1 through 4 described in strategy
 # comment above,
 for parity in 1 2 3; do
 	raid=raidz$parity
 	# Ensure we hit scenarios with and without skip blocks
 	for ndisks in $((parity * 2)) $((parity * 2 + 1)); do
 		typeset -a disks=(${alldisks[0..$((ndisks - 1))]})
 		if (( ${#disks[@]} < ndisks )); then
 			log_note "Too few disks to test $raid-$ndisks"
 			continue
 		fi
 		log_must zpool create "$TESTPOOL" "$raid" "${disks[@]}"
 		for bits in "${allshifts[@]}"; do
 			vbs=$((1 << bits))
 			log_note "Testing $raid-$ndisks volblocksize=$vbs"
 			vol=$TESTPOOL/$TESTVOL
 			log_must zfs create -V ${volsize}m \
 			    -o volblocksize=$vbs "$vol"
 			block_device_wait "/dev/zvol/$vol"
 			log_must dd if=/dev/zero of=/dev/zvol/$vol \
 			    bs=1024k count=$volsize
 			sync
 			ref=$(zfs get -Hpo value referenced "$vol")
 			refres=$(zfs get -Hpo value refreservation "$vol")
 			log_must test -n "$ref"
 			log_must test -n "$refres"
 			typeset -F2 deltapct=$((refres * 100.0 / ref))
 			log_note "$raid-$ndisks refreservation $refres" \
 			    "is $deltapct% of reservation $res"
 			log_must test "$ref" -le "$refres"
 			log_must test "$deltapct" -le $maxpct
 			log_must_busy zfs destroy "$vol"
 		done
 		log_must_busy zpool destroy "$TESTPOOL"
 	done
 done
 log_pass "raidz refreservation=auto accounts for extra parity and skip blocks"