OpenZFS 9318 - vol_volsize_to_reservation does not account for raidz skip blocks
When a volume is created in a pool with raidz vdevs and volblocksize != 128k, the volume can reference more space than is reserved with the automatically calculated refreservation. There are two deficiencies in vol_volsize_to_reservation that contribute to this: 1) Skip blocks may be added to keep each allocation a multiple of parity + 1. This is the dominating factor when volblocksize is close to 2^ashift. 2) raidz deflation for 128 KB blocks is different for most other block sizes. See "The theory of raidz space accounting" comment in libzfs_dataset.c for a full explanation. Authored by: Mike Gerdts <mike.gerdts@joyent.com> Reviewed by: Richard Elling <Richard.Elling@RichardElling.com> Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com> Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com> Reviewed by: Matt Ahrens <matt@delphix.com> Reviewed by: Kody Kantor <kody.kantor@joyent.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Approved by: Dan McDonald <danmcd@joyent.com> Ported-by: Mike Gerdts <mike.gerdts@joyent.com> Porting Notes: * ZTS: wait for zvols to exist before writing * ZTS: use log_must_busy with {zpool|zfs} destroy OpenZFS-issue: https://www.illumos.org/issues/9318 OpenZFS-commit: https://github.com/illumos/illumos-gate/commit/b73ccab0 Closes #8973
This commit is contained in:
parent
350646563f
commit
f3f46b0e45
|
@ -29,6 +29,7 @@
|
||||||
* Copyright 2016 Nexenta Systems, Inc.
|
* Copyright 2016 Nexenta Systems, Inc.
|
||||||
* Copyright (c) 2019 Datto Inc.
|
* Copyright (c) 2019 Datto Inc.
|
||||||
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>
|
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>
|
||||||
|
* Copyright 2019 Joyent, Inc.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
@ -992,10 +993,11 @@ zfs_do_create(int argc, char **argv)
|
||||||
zpool_close(zpool_handle);
|
zpool_close(zpool_handle);
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
zpool_close(zpool_handle);
|
|
||||||
|
|
||||||
volsize = zvol_volsize_to_reservation(volsize, real_props);
|
volsize = zvol_volsize_to_reservation(zpool_handle, volsize,
|
||||||
|
real_props);
|
||||||
nvlist_free(real_props);
|
nvlist_free(real_props);
|
||||||
|
zpool_close(zpool_handle);
|
||||||
|
|
||||||
if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
|
if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
|
||||||
&strval) != 0) {
|
&strval) != 0) {
|
||||||
|
|
|
@ -22,7 +22,7 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||||
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
* Copyright Joyent, Inc.
|
||||||
* Copyright (c) 2013 Steven Hartland. All rights reserved.
|
* Copyright (c) 2013 Steven Hartland. All rights reserved.
|
||||||
* Copyright (c) 2016, Intel Corporation.
|
* Copyright (c) 2016, Intel Corporation.
|
||||||
* Copyright 2016 Nexenta Systems, Inc.
|
* Copyright 2016 Nexenta Systems, Inc.
|
||||||
|
@ -687,7 +687,8 @@ extern int zfs_hold(zfs_handle_t *, const char *, const char *,
|
||||||
extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *);
|
extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *);
|
||||||
extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
|
extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
|
||||||
extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
|
extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
|
||||||
extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
|
extern uint64_t zvol_volsize_to_reservation(zpool_handle_t *, uint64_t,
|
||||||
|
nvlist_t *);
|
||||||
|
|
||||||
typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
|
typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
|
||||||
uid_t rid, uint64_t space);
|
uid_t rid, uint64_t space);
|
||||||
|
|
|
@ -21,7 +21,7 @@
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2018, Joyent, Inc. All rights reserved.
|
* Copyright 2019 Joyent, Inc.
|
||||||
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
|
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
|
||||||
* Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved.
|
* Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
|
* Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
|
||||||
|
@ -1618,6 +1618,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
|
||||||
uint64_t new_reservation;
|
uint64_t new_reservation;
|
||||||
zfs_prop_t resv_prop;
|
zfs_prop_t resv_prop;
|
||||||
nvlist_t *props;
|
nvlist_t *props;
|
||||||
|
zpool_handle_t *zph = zpool_handle(zhp);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If this is an existing volume, and someone is setting the volsize,
|
* If this is an existing volume, and someone is setting the volsize,
|
||||||
|
@ -1632,7 +1633,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
|
||||||
fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
|
fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
|
||||||
zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE));
|
zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE));
|
||||||
|
|
||||||
if ((zvol_volsize_to_reservation(old_volsize, props) !=
|
if ((zvol_volsize_to_reservation(zph, old_volsize, props) !=
|
||||||
old_reservation) || nvlist_exists(nvl,
|
old_reservation) || nvlist_exists(nvl,
|
||||||
zfs_prop_to_name(resv_prop))) {
|
zfs_prop_to_name(resv_prop))) {
|
||||||
fnvlist_free(props);
|
fnvlist_free(props);
|
||||||
|
@ -1643,7 +1644,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
|
||||||
fnvlist_free(props);
|
fnvlist_free(props);
|
||||||
return (-1);
|
return (-1);
|
||||||
}
|
}
|
||||||
new_reservation = zvol_volsize_to_reservation(new_volsize, props);
|
new_reservation = zvol_volsize_to_reservation(zph, new_volsize, props);
|
||||||
fnvlist_free(props);
|
fnvlist_free(props);
|
||||||
|
|
||||||
if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop),
|
if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop),
|
||||||
|
@ -1698,7 +1699,8 @@ zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl)
|
||||||
volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
|
volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
|
||||||
}
|
}
|
||||||
|
|
||||||
resvsize = zvol_volsize_to_reservation(volsize, props);
|
resvsize = zvol_volsize_to_reservation(zpool_handle(zhp), volsize,
|
||||||
|
props);
|
||||||
fnvlist_free(props);
|
fnvlist_free(props);
|
||||||
|
|
||||||
(void) nvlist_remove_all(nvl, zfs_prop_to_name(prop));
|
(void) nvlist_remove_all(nvl, zfs_prop_to_name(prop));
|
||||||
|
@ -5363,12 +5365,176 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Convert the zvol's volume size to an appropriate reservation.
|
* The theory of raidz space accounting
|
||||||
|
*
|
||||||
|
* The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block
|
||||||
|
* will "reference" 128KB, even though it allocates more than that, to store the
|
||||||
|
* parity information (and perhaps skip sectors). This concept of the
|
||||||
|
* "referenced" (and other DMU space accounting) being lower than the allocated
|
||||||
|
* space by a constant factor is called "raidz deflation."
|
||||||
|
*
|
||||||
|
* As mentioned above, the constant factor for raidz deflation assumes a 128KB
|
||||||
|
* block size. However, zvols typically have a much smaller block size (default
|
||||||
|
* 8KB). These smaller blocks may require proportionally much more parity
|
||||||
|
* information (and perhaps skip sectors). In this case, the change to the
|
||||||
|
* "referenced" property may be much more than the logical block size.
|
||||||
|
*
|
||||||
|
* Suppose a raidz vdev has 5 disks with ashift=12. A 128k block may be written
|
||||||
|
* as follows.
|
||||||
|
*
|
||||||
|
* +-------+-------+-------+-------+-------+
|
||||||
|
* | disk1 | disk2 | disk3 | disk4 | disk5 |
|
||||||
|
* +-------+-------+-------+-------+-------+
|
||||||
|
* | P0 | D0 | D8 | D16 | D24 |
|
||||||
|
* | P1 | D1 | D9 | D17 | D25 |
|
||||||
|
* | P2 | D2 | D10 | D18 | D26 |
|
||||||
|
* | P3 | D3 | D11 | D19 | D27 |
|
||||||
|
* | P4 | D4 | D12 | D20 | D28 |
|
||||||
|
* | P5 | D5 | D13 | D21 | D29 |
|
||||||
|
* | P6 | D6 | D14 | D22 | D30 |
|
||||||
|
* | P7 | D7 | D15 | D23 | D31 |
|
||||||
|
* +-------+-------+-------+-------+-------+
|
||||||
|
*
|
||||||
|
* Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data
|
||||||
|
* sectors. The dataset's referenced will increase by 128k and the pool's
|
||||||
|
* allocated and free properties will be adjusted by 160k.
|
||||||
|
*
|
||||||
|
* A 4k block written to the same raidz vdev will require two 4k sectors. The
|
||||||
|
* blank cells represent unallocated space.
|
||||||
|
*
|
||||||
|
* +-------+-------+-------+-------+-------+
|
||||||
|
* | disk1 | disk2 | disk3 | disk4 | disk5 |
|
||||||
|
* +-------+-------+-------+-------+-------+
|
||||||
|
* | P0 | D0 | | | |
|
||||||
|
* +-------+-------+-------+-------+-------+
|
||||||
|
*
|
||||||
|
* Above, notice that the 4k block required one sector for parity and another
|
||||||
|
* for data. vdev_raidz_asize() will return 8k and as such the pool's allocated
|
||||||
|
* and free properties will be adjusted by 8k. The dataset will not be charged
|
||||||
|
* 8k. Rather, it will be charged a value that is scaled according to the
|
||||||
|
* overhead of the 128k block on the same vdev. This 8k allocation will be
|
||||||
|
* charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as
|
||||||
|
* calculated in the 128k block example above.
|
||||||
|
*
|
||||||
|
* Every raidz allocation is sized to be a multiple of nparity+1 sectors. That
|
||||||
|
* is, every raidz1 allocation will be a multiple of 2 sectors, raidz2
|
||||||
|
* allocations are a multiple of 3 sectors, and raidz3 allocations are a
|
||||||
|
* multiple of of 4 sectors. When a block does not fill the required number of
|
||||||
|
* sectors, skip blocks (sectors) are used.
|
||||||
|
*
|
||||||
|
* An 8k block being written to a raidz vdev may be written as follows:
|
||||||
|
*
|
||||||
|
* +-------+-------+-------+-------+-------+
|
||||||
|
* | disk1 | disk2 | disk3 | disk4 | disk5 |
|
||||||
|
* +-------+-------+-------+-------+-------+
|
||||||
|
* | P0 | D0 | D1 | S0 | |
|
||||||
|
* +-------+-------+-------+-------+-------+
|
||||||
|
*
|
||||||
|
* In order to maintain the nparity+1 allocation size, a skip block (S0) was
|
||||||
|
* added. For this 8k block, the pool's allocated and free properties are
|
||||||
|
* adjusted by 16k and the dataset's referenced is increased by 16k * 128k /
|
||||||
|
* 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in
|
||||||
|
* the 128k block example above.
|
||||||
|
*
|
||||||
|
* Compression may lead to a variety of block sizes being written for the same
|
||||||
|
* volume or file. There is no clear way to reserve just the amount of space
|
||||||
|
* that will be required, so the worst case (no compression) is assumed.
|
||||||
|
* Note that metadata blocks will typically be compressed, so the reservation
|
||||||
|
* size returned by zvol_volsize_to_reservation() will generally be slightly
|
||||||
|
* larger than the maximum that the volume can reference.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Derived from function of same name in module/zfs/vdev_raidz.c. Returns the
|
||||||
|
* amount of space (in bytes) that will be allocated for the specified block
|
||||||
|
* size. Note that the "referenced" space accounted will be less than this, but
|
||||||
|
* not necessarily equal to "blksize", due to RAIDZ deflation.
|
||||||
|
*/
|
||||||
|
static uint64_t
|
||||||
|
vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
|
||||||
|
uint64_t blksize)
|
||||||
|
{
|
||||||
|
uint64_t asize, ndata;
|
||||||
|
|
||||||
|
ASSERT3U(ndisks, >, nparity);
|
||||||
|
ndata = ndisks - nparity;
|
||||||
|
asize = ((blksize - 1) >> ashift) + 1;
|
||||||
|
asize += nparity * ((asize + ndata - 1) / ndata);
|
||||||
|
asize = roundup(asize, nparity + 1) << ashift;
|
||||||
|
|
||||||
|
return (asize);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Determine how much space will be allocated if it lands on the most space-
|
||||||
|
* inefficient top-level vdev. Returns the size in bytes required to store one
|
||||||
|
* copy of the volume data. See theory comment above.
|
||||||
|
*/
|
||||||
|
static uint64_t
|
||||||
|
volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
|
||||||
|
{
|
||||||
|
nvlist_t *config, *tree, **vdevs;
|
||||||
|
uint_t nvdevs, v;
|
||||||
|
uint64_t ret = 0;
|
||||||
|
|
||||||
|
config = zpool_get_config(zhp, NULL);
|
||||||
|
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 ||
|
||||||
|
nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
|
||||||
|
&vdevs, &nvdevs) != 0) {
|
||||||
|
return (nblocks * blksize);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (v = 0; v < nvdevs; v++) {
|
||||||
|
char *type;
|
||||||
|
uint64_t nparity, ashift, asize, tsize;
|
||||||
|
nvlist_t **disks;
|
||||||
|
uint_t ndisks;
|
||||||
|
uint64_t volsize;
|
||||||
|
|
||||||
|
if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE,
|
||||||
|
&type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 ||
|
||||||
|
nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY,
|
||||||
|
&nparity) != 0 ||
|
||||||
|
nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT,
|
||||||
|
&ashift) != 0 ||
|
||||||
|
nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN,
|
||||||
|
&disks, &ndisks) != 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* allocation size for the "typical" 128k block */
|
||||||
|
tsize = vdev_raidz_asize(ndisks, nparity, ashift,
|
||||||
|
SPA_OLD_MAXBLOCKSIZE);
|
||||||
|
/* allocation size for the blksize block */
|
||||||
|
asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Scale this size down as a ratio of 128k / tsize. See theory
|
||||||
|
* statement above.
|
||||||
|
*/
|
||||||
|
volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize;
|
||||||
|
if (volsize > ret) {
|
||||||
|
ret = volsize;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ret == 0) {
|
||||||
|
ret = nblocks * blksize;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (ret);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Convert the zvol's volume size to an appropriate reservation. See theory
|
||||||
|
* comment above.
|
||||||
|
*
|
||||||
* Note: If this routine is updated, it is necessary to update the ZFS test
|
* Note: If this routine is updated, it is necessary to update the ZFS test
|
||||||
* suite's shell version in reservation.kshlib.
|
* suite's shell version in reservation.shlib.
|
||||||
*/
|
*/
|
||||||
uint64_t
|
uint64_t
|
||||||
zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
|
zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize,
|
||||||
|
nvlist_t *props)
|
||||||
{
|
{
|
||||||
uint64_t numdb;
|
uint64_t numdb;
|
||||||
uint64_t nblocks, volblocksize;
|
uint64_t nblocks, volblocksize;
|
||||||
|
@ -5384,7 +5550,14 @@ zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
|
||||||
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
|
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
|
||||||
&volblocksize) != 0)
|
&volblocksize) != 0)
|
||||||
volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
|
volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
|
||||||
nblocks = volsize/volblocksize;
|
|
||||||
|
nblocks = volsize / volblocksize;
|
||||||
|
/*
|
||||||
|
* Metadata defaults to using 128k blocks, not volblocksize blocks. For
|
||||||
|
* this reason, only the data blocks are scaled based on vdev config.
|
||||||
|
*/
|
||||||
|
volsize = volsize_from_vdevs(zph, nblocks, volblocksize);
|
||||||
|
|
||||||
/* start with metadnode L0-L6 */
|
/* start with metadnode L0-L6 */
|
||||||
numdb = 7;
|
numdb = 7;
|
||||||
/* calculate number of indirects */
|
/* calculate number of indirects */
|
||||||
|
|
|
@ -755,7 +755,8 @@ tags = ['functional', 'refquota']
|
||||||
|
|
||||||
[tests/functional/refreserv]
|
[tests/functional/refreserv]
|
||||||
tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
|
tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
|
||||||
'refreserv_004_pos', 'refreserv_005_pos']
|
'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz',
|
||||||
|
'refreserv_raidz']
|
||||||
tags = ['functional', 'refreserv']
|
tags = ['functional', 'refreserv']
|
||||||
|
|
||||||
[tests/functional/removal]
|
[tests/functional/removal]
|
||||||
|
|
|
@ -6,7 +6,9 @@ dist_pkgdata_SCRIPTS = \
|
||||||
refreserv_002_pos.ksh \
|
refreserv_002_pos.ksh \
|
||||||
refreserv_003_pos.ksh \
|
refreserv_003_pos.ksh \
|
||||||
refreserv_004_pos.ksh \
|
refreserv_004_pos.ksh \
|
||||||
refreserv_005_pos.ksh
|
refreserv_005_pos.ksh \
|
||||||
|
refreserv_multi_raidz.ksh \
|
||||||
|
refreserv_raidz.ksh
|
||||||
|
|
||||||
dist_pkgdata_DATA = \
|
dist_pkgdata_DATA = \
|
||||||
refreserv.cfg
|
refreserv.cfg
|
||||||
|
|
|
@ -0,0 +1,197 @@
|
||||||
|
#!/bin/ksh -p
|
||||||
|
#
|
||||||
|
# This file and its contents are supplied under the terms of the
|
||||||
|
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||||
|
# You may only use this file in accordance with the terms of version
|
||||||
|
# 1.0 of the CDDL.
|
||||||
|
#
|
||||||
|
# A full copy of the text of the CDDL should have accompanied this
|
||||||
|
# source. A copy of the CDDL is also available via the Internet at
|
||||||
|
# http://www.illumos.org/license/CDDL.
|
||||||
|
#
|
||||||
|
|
||||||
|
#
|
||||||
|
# Copyright 2019 Joyent, Inc.
|
||||||
|
#
|
||||||
|
|
||||||
|
. $STF_SUITE/include/libtest.shlib
|
||||||
|
. $STF_SUITE/tests/functional/refreserv/refreserv.cfg
|
||||||
|
|
||||||
|
#
|
||||||
|
# DESCRIPTION:
|
||||||
|
# raidz refreservation=auto picks worst raidz vdev
|
||||||
|
#
|
||||||
|
# STRATEGY:
|
||||||
|
# 1. Create a pool with a single raidz vdev
|
||||||
|
# 2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k]
|
||||||
|
# - create a volume
|
||||||
|
# - remember its refreservation
|
||||||
|
# - destroy the volume
|
||||||
|
# 3. Destroy the pool
|
||||||
|
# 4. Recreate the pool with one more disk in the vdev, then repeat steps
|
||||||
|
# 2 and 3.
|
||||||
|
#
|
||||||
|
# NOTES:
|
||||||
|
# 1. This test will use up to 14 disks but can cover the key concepts with
|
||||||
|
# 5 disks.
|
||||||
|
# 2. If the disks are a mixture of 4Kn and 512n/512e, failures are likely.
|
||||||
|
#
|
||||||
|
|
||||||
|
verify_runnable "global"
|
||||||
|
|
||||||
|
typeset -a alldisks=($DISKS)
|
||||||
|
|
||||||
|
# The larger the volsize, the better zvol_volsize_to_reservation() is at
|
||||||
|
# guessing the right number - though it is horrible with tiny blocks. At 10M on
|
||||||
|
# ashift=12, the estimate may be over 26% too high.
|
||||||
|
volsize=100
|
||||||
|
|
||||||
|
function cleanup
|
||||||
|
{
|
||||||
|
default_cleanup_noexit
|
||||||
|
default_setup_noexit "${alldisks[0]}"
|
||||||
|
}
|
||||||
|
|
||||||
|
log_assert "raidz refreservation=auto picks worst raidz vdev"
|
||||||
|
log_onexit cleanup
|
||||||
|
|
||||||
|
poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL"
|
||||||
|
|
||||||
|
# Testing tiny block sizes on ashift=12 pools causes so much size inflation
|
||||||
|
# that small test disks may fill before creating small volumes. However,
|
||||||
|
# testing 512b and 1K blocks on ashift=9 pools is an ok approximation for
|
||||||
|
# testing the problems that arise from 4K and 8K blocks on ashift=12 pools.
|
||||||
|
bps=$(lsblk -nrdo min-io /dev/${alldisks[0]})
|
||||||
|
case "$bps" in
|
||||||
|
512)
|
||||||
|
allshifts=(9 10 17)
|
||||||
|
;;
|
||||||
|
4096)
|
||||||
|
allshifts=(12 13 17)
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
log_fail "bytes/sector: $bps != (512|4096)"
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
log_note "Testing in ashift=${allshifts[0]} mode"
|
||||||
|
|
||||||
|
typeset -A sizes=
|
||||||
|
|
||||||
|
#
|
||||||
|
# Determine the refreservation for a $volsize MiB volume on each raidz type at
|
||||||
|
# various block sizes.
|
||||||
|
#
|
||||||
|
for parity in 1 2 3; do
|
||||||
|
raid=raidz$parity
|
||||||
|
typeset -A sizes["$raid"]
|
||||||
|
|
||||||
|
# Ensure we hit scenarios with and without skip blocks
|
||||||
|
for ndisks in $((parity * 2)) $((parity * 2 + 1)); do
|
||||||
|
typeset -a disks=(${alldisks[0..$((ndisks - 1))]})
|
||||||
|
|
||||||
|
if (( ${#disks[@]} < ndisks )); then
|
||||||
|
log_note "Too few disks to test $raid-$ndisks"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
typeset -A sizes["$raid"]["$ndisks"]
|
||||||
|
|
||||||
|
log_must zpool create "$TESTPOOL" "$raid" "${disks[@]}"
|
||||||
|
|
||||||
|
for bits in "${allshifts[@]}"; do
|
||||||
|
vbs=$((1 << bits))
|
||||||
|
log_note "Gathering refreservation for $raid-$ndisks" \
|
||||||
|
"volblocksize=$vbs"
|
||||||
|
|
||||||
|
vol=$TESTPOOL/$TESTVOL
|
||||||
|
log_must zfs create -V ${volsize}m \
|
||||||
|
-o volblocksize=$vbs "$vol"
|
||||||
|
|
||||||
|
refres=$(zfs get -Hpo value refreservation "$vol")
|
||||||
|
log_must test -n "$refres"
|
||||||
|
sizes["$raid"]["$ndisks"]["$vbs"]=$refres
|
||||||
|
|
||||||
|
log_must_busy zfs destroy "$vol"
|
||||||
|
done
|
||||||
|
|
||||||
|
log_must_busy zpool destroy "$TESTPOOL"
|
||||||
|
done
|
||||||
|
done
|
||||||
|
|
||||||
|
# A little extra info is always helpful when diagnosing problems. To
|
||||||
|
# pretty-print what you find in the log, do this in ksh:
|
||||||
|
# typeset -A sizes=(...)
|
||||||
|
# print -v sizes
|
||||||
|
log_note "sizes=$(print -C sizes)"
|
||||||
|
|
||||||
|
#
|
||||||
|
# Helper furnction for checking that refreservation is calculated properly in
|
||||||
|
# multi-vdev pools. "Properly" is defined as assuming that all vdevs are as
|
||||||
|
# space inefficient as the worst one.
|
||||||
|
#
|
||||||
|
function check_vdevs {
|
||||||
|
typeset raid=$1
|
||||||
|
typeset nd1=$2
|
||||||
|
typeset nd2=$3
|
||||||
|
typeset -a disks1 disks2
|
||||||
|
typeset vbs vol refres refres1 refres2 expect
|
||||||
|
|
||||||
|
disks1=(${alldisks[0..$((nd1 - 1))]})
|
||||||
|
disks2=(${alldisks[$nd1..$((nd1 + nd2 - 1))]})
|
||||||
|
if (( ${#disks2[@]} < nd2 )); then
|
||||||
|
log_note "Too few disks to test $raid-$nd1 + $raid=$nd2"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
|
log_must zpool create -f "$TESTPOOL" \
|
||||||
|
"$raid" "${disks1[@]}" "$raid" "${disks2[@]}"
|
||||||
|
|
||||||
|
for bits in "${allshifts[@]}"; do
|
||||||
|
vbs=$((1 << bits))
|
||||||
|
log_note "Verifying $raid-$nd1 $raid-$nd2 volblocksize=$vbs"
|
||||||
|
|
||||||
|
vol=$TESTPOOL/$TESTVOL
|
||||||
|
log_must zfs create -V ${volsize}m -o volblocksize=$vbs "$vol"
|
||||||
|
refres=$(zfs get -Hpo value refreservation "$vol")
|
||||||
|
log_must test -n "$refres"
|
||||||
|
|
||||||
|
refres1=${sizes["$raid"]["$nd1"]["$vbs"]}
|
||||||
|
refres2=${sizes["$raid"]["$nd2"]["$vbs"]}
|
||||||
|
|
||||||
|
if (( refres1 > refres2 )); then
|
||||||
|
log_note "Expecting refres ($refres) to match refres" \
|
||||||
|
"from $raid-$nd1 ($refres1)"
|
||||||
|
log_must test "$refres" -eq "$refres1"
|
||||||
|
else
|
||||||
|
log_note "Expecting refres ($refres) to match refres" \
|
||||||
|
"from $raid-$nd1 ($refres2)"
|
||||||
|
log_must test "$refres" -eq "$refres2"
|
||||||
|
fi
|
||||||
|
|
||||||
|
log_must zfs destroy "$vol"
|
||||||
|
done
|
||||||
|
|
||||||
|
log_must zpool destroy "$TESTPOOL"
|
||||||
|
}
|
||||||
|
|
||||||
|
#
|
||||||
|
# Verify that multi-vdev pools use the last optimistic size for all the
|
||||||
|
# permutations within a particular raidz variant.
|
||||||
|
#
|
||||||
|
for raid in "${!sizes[@]}"; do
|
||||||
|
# ksh likes to create a [0] item for us. Thanks, ksh!
|
||||||
|
[[ $raid == "0" ]] && continue
|
||||||
|
|
||||||
|
for nd1 in "${!sizes["$raid"][@]}"; do
|
||||||
|
# And with an empty array we get one key, ''. Thanks, ksh!
|
||||||
|
[[ $nd1 == "0" || -z "$nd1" ]] && continue
|
||||||
|
|
||||||
|
for nd2 in "${!sizes["$raid"][@]}"; do
|
||||||
|
[[ $nd2 == "0" || -z "$nd2" ]] && continue
|
||||||
|
|
||||||
|
check_vdevs "$raid" "$nd1" "$nd2"
|
||||||
|
done
|
||||||
|
done
|
||||||
|
done
|
||||||
|
|
||||||
|
log_pass "raidz refreservation=auto picks worst raidz vdev"
|
|
@ -0,0 +1,130 @@
|
||||||
|
#!/bin/ksh -p
|
||||||
|
#
|
||||||
|
# This file and its contents are supplied under the terms of the
|
||||||
|
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||||
|
# You may only use this file in accordance with the terms of version
|
||||||
|
# 1.0 of the CDDL.
|
||||||
|
#
|
||||||
|
# A full copy of the text of the CDDL should have accompanied this
|
||||||
|
# source. A copy of the CDDL is also available via the Internet at
|
||||||
|
# http://www.illumos.org/license/CDDL.
|
||||||
|
#
|
||||||
|
|
||||||
|
#
|
||||||
|
# Copyright 2019 Joyent, Inc.
|
||||||
|
#
|
||||||
|
|
||||||
|
. $STF_SUITE/include/libtest.shlib
|
||||||
|
. $STF_SUITE/tests/functional/refreserv/refreserv.cfg
|
||||||
|
|
||||||
|
#
|
||||||
|
# DESCRIPTION:
|
||||||
|
# raidz refreservation=auto accounts for extra parity and skip blocks
|
||||||
|
#
|
||||||
|
# STRATEGY:
|
||||||
|
# 1. Create a pool with a single raidz vdev
|
||||||
|
# 2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k]
|
||||||
|
# - create a volume
|
||||||
|
# - fully overwrite it
|
||||||
|
# - verify that referenced is less than or equal to reservation
|
||||||
|
# - destroy the volume
|
||||||
|
# 3. Destroy the pool
|
||||||
|
# 4. Recreate the pool with one more disk in the vdev, then repeat steps
|
||||||
|
# 2 and 3.
|
||||||
|
# 5. Repeat all steps above for raidz2 and raidz3.
|
||||||
|
#
|
||||||
|
# NOTES:
|
||||||
|
# 1. This test will use up to 14 disks but can cover the key concepts with
|
||||||
|
# 5 disks.
|
||||||
|
# 2. If the disks are a mixture of 4Kn and 512n/512e, failures are likely.
|
||||||
|
#
|
||||||
|
|
||||||
|
verify_runnable "global"
|
||||||
|
|
||||||
|
typeset -a alldisks=($DISKS)
|
||||||
|
|
||||||
|
# The larger the volsize, the better zvol_volsize_to_reservation() is at
|
||||||
|
# guessing the right number. At 10M on ashift=12, the estimate may be over 26%
|
||||||
|
# too high.
|
||||||
|
volsize=100
|
||||||
|
|
||||||
|
function cleanup
|
||||||
|
{
|
||||||
|
default_cleanup_noexit
|
||||||
|
default_setup_noexit "${alldisks[0]}"
|
||||||
|
}
|
||||||
|
|
||||||
|
log_assert "raidz refreservation=auto accounts for extra parity and skip blocks"
|
||||||
|
log_onexit cleanup
|
||||||
|
|
||||||
|
poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL"
|
||||||
|
|
||||||
|
# Testing tiny block sizes on ashift=12 pools causes so much size inflation
|
||||||
|
# that small test disks may fill before creating small volumes. However,
|
||||||
|
# testing 512b and 1K blocks on ashift=9 pools is an ok approximation for
|
||||||
|
# testing the problems that arise from 4K and 8K blocks on ashift=12 pools.
|
||||||
|
bps=$(lsblk -nrdo min-io /dev/${alldisks[0]})
|
||||||
|
log_must test "$bps" -eq 512 -o "$bps" -eq 4096
|
||||||
|
case "$bps" in
|
||||||
|
512)
|
||||||
|
allshifts=(9 10 17)
|
||||||
|
maxpct=151
|
||||||
|
;;
|
||||||
|
4096)
|
||||||
|
allshifts=(12 13 17)
|
||||||
|
maxpct=110
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
log_fail "bytes/sector: $bps != (512|4096)"
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
log_note "Testing in ashift=${allshifts[0]} mode"
|
||||||
|
|
||||||
|
# This loop handles all iterations of steps 1 through 4 described in strategy
|
||||||
|
# comment above,
|
||||||
|
for parity in 1 2 3; do
|
||||||
|
raid=raidz$parity
|
||||||
|
|
||||||
|
# Ensure we hit scenarios with and without skip blocks
|
||||||
|
for ndisks in $((parity * 2)) $((parity * 2 + 1)); do
|
||||||
|
typeset -a disks=(${alldisks[0..$((ndisks - 1))]})
|
||||||
|
|
||||||
|
if (( ${#disks[@]} < ndisks )); then
|
||||||
|
log_note "Too few disks to test $raid-$ndisks"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
log_must zpool create "$TESTPOOL" "$raid" "${disks[@]}"
|
||||||
|
|
||||||
|
for bits in "${allshifts[@]}"; do
|
||||||
|
vbs=$((1 << bits))
|
||||||
|
log_note "Testing $raid-$ndisks volblocksize=$vbs"
|
||||||
|
|
||||||
|
vol=$TESTPOOL/$TESTVOL
|
||||||
|
log_must zfs create -V ${volsize}m \
|
||||||
|
-o volblocksize=$vbs "$vol"
|
||||||
|
block_device_wait "/dev/zvol/$vol"
|
||||||
|
log_must dd if=/dev/zero of=/dev/zvol/$vol \
|
||||||
|
bs=1024k count=$volsize
|
||||||
|
sync
|
||||||
|
|
||||||
|
ref=$(zfs get -Hpo value referenced "$vol")
|
||||||
|
refres=$(zfs get -Hpo value refreservation "$vol")
|
||||||
|
log_must test -n "$ref"
|
||||||
|
log_must test -n "$refres"
|
||||||
|
|
||||||
|
typeset -F2 deltapct=$((refres * 100.0 / ref))
|
||||||
|
log_note "$raid-$ndisks refreservation $refres" \
|
||||||
|
"is $deltapct% of reservation $res"
|
||||||
|
|
||||||
|
log_must test "$ref" -le "$refres"
|
||||||
|
log_must test "$deltapct" -le $maxpct
|
||||||
|
|
||||||
|
log_must_busy zfs destroy "$vol"
|
||||||
|
done
|
||||||
|
|
||||||
|
log_must_busy zpool destroy "$TESTPOOL"
|
||||||
|
done
|
||||||
|
done
|
||||||
|
|
||||||
|
log_pass "raidz refreservation=auto accounts for extra parity and skip blocks"
|
Loading…
Reference in New Issue