Improve too large physical ashift handling

When iterating through children physical ashifts for vdev, prefer
ones above the maximum logical ashift, that we can actually use,
but within the administrator defined maximum.

When selecting top-level vdev ashift, do not set it to the defined
maximum in case physical ashift is even higher, but just ignore one.
Using the maximum does not prevent misaligned writes, but reduces
space efficiency.  Since ZFS tries to write data sequentially and
aggregates the writes, in many cases large misanigned writes may be
not as bad as the space penalty otherwise.

Allow internal physical ashifts for vdevs higher than SHIFT_MAX.
May be one day allocator or aggregation could benefit from that.

Reduce zfs_vdev_max_auto_ashift default from 16 (64KB) to 14 (16KB),
so that ZFS may still use bigger ashifts up to SHIFT_MAX (64KB),
but only if it really has to or explicitly told to, but not as an
"optimization".

There are some read-intensive NVMe SSDs that report Preferred Write
Alignment of 64KB, and attempt to build RAIDZ2 of those leads to a
space inefficiency that can't be justified.  Instead these changes
make ZFS fall back to logical ashift of 12 (4KB) by default and
only warn user that it may be suboptimal for performance.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #13798
This commit is contained in:
Alexander Motin 2022-09-08 13:30:53 -04:00 committed by Tony Hutter
parent ebbbe01e31
commit 44cec45f72
9 changed files with 69 additions and 13 deletions

View File

@ -642,6 +642,7 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise);
*/ */
int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj);
void vdev_metaslab_group_create(vdev_t *vd); void vdev_metaslab_group_create(vdev_t *vd);
uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b);
/* /*
* Vdev ashift optimization tunables * Vdev ashift optimization tunables

View File

@ -347,9 +347,12 @@ When a vdev is added, target this number of metaslabs per top-level vdev.
.It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512MB Pc Pq int .It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512MB Pc Pq int
Default limit for metaslab size. Default limit for metaslab size.
. .
.It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy ASHIFT_MAX Po 16 Pc Pq ulong .It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy 14 Pq ulong
Maximum ashift used when optimizing for logical -> physical sector size on new Maximum ashift used when optimizing for logical -> physical sector size on new
top-level vdevs. top-level vdevs.
May be increased up to
.Sy ASHIFT_MAX Po 16 Pc ,
but this may negatively impact pool space efficiency.
. .
.It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq ulong .It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq ulong
Minimum ashift used when creating new top-level vdevs. Minimum ashift used when creating new top-level vdevs.

View File

@ -956,8 +956,7 @@ skip_open:
*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
*physical_ashift = 0; *physical_ashift = 0;
if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) && if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) &&
ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) && ISP2(pp->stripesize) && pp->stripeoffset == 0)
pp->stripeoffset == 0)
*physical_ashift = highbit(pp->stripesize) - 1; *physical_ashift = highbit(pp->stripesize) - 1;
/* /*

View File

@ -134,7 +134,15 @@ int zfs_vdev_standard_sm_blksz = (1 << 17);
*/ */
int zfs_nocacheflush = 0; int zfs_nocacheflush = 0;
uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX; /*
* Maximum and minimum ashift values that can be automatically set based on
* vdev's physical ashift (disk's physical sector size). While ASHIFT_MAX
* is higher than the maximum value, it is intentionally limited here to not
* excessively impact pool space efficiency. Higher ashift values may still
* be forced by vdev logical ashift or by user via ashift property, but won't
* be set automatically as a performance optimization.
*/
uint64_t zfs_vdev_max_auto_ashift = 14;
uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
/*PRINTFLIKE2*/ /*PRINTFLIKE2*/
@ -1835,6 +1843,24 @@ vdev_set_deflate_ratio(vdev_t *vd)
} }
} }
/*
* Choose the best of two ashifts, preferring one between logical ashift
* (absolute minimum) and administrator defined maximum, otherwise take
* the biggest of the two.
*/
uint64_t
vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b)
{
if (a > logical && a <= zfs_vdev_max_auto_ashift) {
if (b <= logical || b > zfs_vdev_max_auto_ashift)
return (a);
else
return (MAX(a, b));
} else if (b <= logical || b > zfs_vdev_max_auto_ashift)
return (MAX(a, b));
return (b);
}
/* /*
* Maximize performance by inflating the configured ashift for top level * Maximize performance by inflating the configured ashift for top level
* vdevs to be as close to the physical ashift as possible while maintaining * vdevs to be as close to the physical ashift as possible while maintaining
@ -1846,7 +1872,8 @@ vdev_ashift_optimize(vdev_t *vd)
{ {
ASSERT(vd == vd->vdev_top); ASSERT(vd == vd->vdev_top);
if (vd->vdev_ashift < vd->vdev_physical_ashift) { if (vd->vdev_ashift < vd->vdev_physical_ashift &&
vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) {
vd->vdev_ashift = MIN( vd->vdev_ashift = MIN(
MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
MAX(zfs_vdev_min_auto_ashift, MAX(zfs_vdev_min_auto_ashift,
@ -4452,7 +4479,10 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vs->vs_configured_ashift = vd->vdev_top != NULL vs->vs_configured_ashift = vd->vdev_top != NULL
? vd->vdev_top->vdev_ashift : vd->vdev_ashift; ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
vs->vs_logical_ashift = vd->vdev_logical_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift;
vs->vs_physical_ashift = vd->vdev_physical_ashift; if (vd->vdev_physical_ashift <= ASHIFT_MAX)
vs->vs_physical_ashift = vd->vdev_physical_ashift;
else
vs->vs_physical_ashift = 0;
/* /*
* Report fragmentation and rebuild progress for top-level, * Report fragmentation and rebuild progress for top-level,

View File

@ -1496,8 +1496,14 @@ vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep,
asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1; asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1;
max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1; max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1;
logical_ashift = MAX(logical_ashift, cvd->vdev_ashift); logical_ashift = MAX(logical_ashift, cvd->vdev_ashift);
physical_ashift = MAX(physical_ashift, }
cvd->vdev_physical_ashift); for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
if (cvd->vdev_ops == &vdev_draid_spare_ops)
continue;
physical_ashift = vdev_best_ashift(logical_ashift,
physical_ashift, cvd->vdev_physical_ashift);
} }
*asizep = asize; *asizep = asize;

View File

@ -409,8 +409,14 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
*physical_ashift = MAX(*physical_ashift, }
cvd->vdev_physical_ashift); for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
if (cvd->vdev_open_error)
continue;
*physical_ashift = vdev_best_ashift(*logical_ashift,
*physical_ashift, cvd->vdev_physical_ashift);
} }
if (numerrors == vd->vdev_children) { if (numerrors == vd->vdev_children) {

View File

@ -1426,8 +1426,14 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
*physical_ashift = MAX(*physical_ashift, }
cvd->vdev_physical_ashift); for (c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
if (cvd->vdev_open_error != 0)
continue;
*physical_ashift = vdev_best_ashift(*logical_ashift,
*physical_ashift, cvd->vdev_physical_ashift);
} }
*asize *= vd->vdev_children; *asize *= vd->vdev_children;

View File

@ -81,7 +81,9 @@ TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch
TXG_HISTORY txg.history zfs_txg_history TXG_HISTORY txg.history zfs_txg_history
TXG_TIMEOUT txg.timeout zfs_txg_timeout TXG_TIMEOUT txg.timeout zfs_txg_timeout
UNLINK_SUSPEND_PROGRESS UNSUPPORTED zfs_unlink_suspend_progress UNLINK_SUSPEND_PROGRESS UNSUPPORTED zfs_unlink_suspend_progress
VDEV_FILE_LOGICAL_ASHIFT vdev.file.logical_ashift vdev_file_logical_ashift
VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift
VDEV_MAX_AUTO_ASHIFT vdev.max_auto_ashift zfs_vdev_max_auto_ashift
VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count
VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip
VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev

View File

@ -57,7 +57,9 @@ disk2=$TEST_BASE_DIR/disk2
log_must mkfile $SIZE $disk1 log_must mkfile $SIZE $disk1
log_must mkfile $SIZE $disk2 log_must mkfile $SIZE $disk2
logical_ashift=$(get_tunable VDEV_FILE_LOGICAL_ASHIFT)
orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT) orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT)
max_auto_ashift=$(get_tunable VDEV_MAX_AUTO_ASHIFT)
typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
for ashift in ${ashifts[@]} for ashift in ${ashifts[@]}
@ -81,7 +83,8 @@ do
log_must zpool create $TESTPOOL $disk1 log_must zpool create $TESTPOOL $disk1
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $ashift log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $ashift
log_must zpool add $TESTPOOL $disk2 log_must zpool add $TESTPOOL $disk2
verify_ashift $disk2 $ashift exp=$(( (ashift <= max_auto_ashift) ? ashift : logical_ashift ))
verify_ashift $disk2 $exp
if [[ $? -ne 0 ]] if [[ $? -ne 0 ]]
then then
log_fail "Device was added without setting ashift value to "\ log_fail "Device was added without setting ashift value to "\