diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index d22abfbc25..470eaa763d 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -641,6 +641,7 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); void vdev_metaslab_group_create(vdev_t *vd); +uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); /* * Vdev ashift optimization tunables diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index cc55ee32ba..cecaf7e7f0 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -347,9 +347,12 @@ When a vdev is added, target this number of metaslabs per top-level vdev. .It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512 MiB Pc Pq int Default limit for metaslab size. . -.It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy ASHIFT_MAX Po 16 Pc Pq ulong +.It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy 14 Pq ulong Maximum ashift used when optimizing for logical \[->] physical sector size on new top-level vdevs. +May be increased up to +.Sy ASHIFT_MAX Po 16 Pc , +but this may negatively impact pool space efficiency. . .It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq ulong Minimum ashift used when creating new top-level vdevs. diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index f3b4846f4e..fef6a1b88e 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -955,8 +955,7 @@ skip_open: *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; *physical_ashift = 0; if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) && - ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) && - pp->stripeoffset == 0) + ISP2(pp->stripesize) && pp->stripeoffset == 0) *physical_ashift = highbit(pp->stripesize) - 1; /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index ea0245610f..048616c253 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -136,7 +136,15 @@ int zfs_vdev_standard_sm_blksz = (1 << 17); */ int zfs_nocacheflush = 0; -uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX; +/* + * Maximum and minimum ashift values that can be automatically set based on + * vdev's physical ashift (disk's physical sector size). While ASHIFT_MAX + * is higher than the maximum value, it is intentionally limited here to not + * excessively impact pool space efficiency. Higher ashift values may still + * be forced by vdev logical ashift or by user via ashift property, but won't + * be set automatically as a performance optimization. + */ +uint64_t zfs_vdev_max_auto_ashift = 14; uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; void @@ -1845,6 +1853,24 @@ vdev_set_deflate_ratio(vdev_t *vd) } } +/* + * Choose the best of two ashifts, preferring one between logical ashift + * (absolute minimum) and administrator defined maximum, otherwise take + * the biggest of the two. + */ +uint64_t +vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) +{ + if (a > logical && a <= zfs_vdev_max_auto_ashift) { + if (b <= logical || b > zfs_vdev_max_auto_ashift) + return (a); + else + return (MAX(a, b)); + } else if (b <= logical || b > zfs_vdev_max_auto_ashift) + return (MAX(a, b)); + return (b); +} + /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining @@ -1856,7 +1882,8 @@ vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); - if (vd->vdev_ashift < vd->vdev_physical_ashift) { + if (vd->vdev_ashift < vd->vdev_physical_ashift && + vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, @@ -4463,7 +4490,10 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; - vs->vs_physical_ashift = vd->vdev_physical_ashift; + if (vd->vdev_physical_ashift <= ASHIFT_MAX) + vs->vs_physical_ashift = vd->vdev_physical_ashift; + else + vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 24034d9d93..24ea5d2cbe 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1496,8 +1496,14 @@ vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep, asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1; max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1; logical_ashift = MAX(logical_ashift, cvd->vdev_ashift); - physical_ashift = MAX(physical_ashift, - cvd->vdev_physical_ashift); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_ops == &vdev_draid_spare_ops) + continue; + physical_ashift = vdev_best_ashift(logical_ashift, + physical_ashift, cvd->vdev_physical_ashift); } *asizep = asize; diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 3879de6804..f9a01c9f53 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -409,8 +409,14 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error) + continue; + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); } if (numerrors == vd->vdev_children) { diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index b4daf642ed..5a44983e55 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1527,8 +1527,14 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); + } + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) + continue; + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); } *asize *= vd->vdev_children; diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index d6a2fe5db7..80e7bcb3bd 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -81,7 +81,9 @@ TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch TXG_HISTORY txg.history zfs_txg_history TXG_TIMEOUT txg.timeout zfs_txg_timeout UNLINK_SUSPEND_PROGRESS UNSUPPORTED zfs_unlink_suspend_progress +VDEV_FILE_LOGICAL_ASHIFT vdev.file.logical_ashift vdev_file_logical_ashift VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift +VDEV_MAX_AUTO_ASHIFT vdev.max_auto_ashift zfs_vdev_max_auto_ashift VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh index 8d5ce5efa5..0166e84baa 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh @@ -57,7 +57,9 @@ disk2=$TEST_BASE_DIR/disk2 log_must mkfile $SIZE $disk1 log_must mkfile $SIZE $disk2 +logical_ashift=$(get_tunable VDEV_FILE_LOGICAL_ASHIFT) orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT) +max_auto_ashift=$(get_tunable VDEV_MAX_AUTO_ASHIFT) typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} @@ -77,7 +79,8 @@ do log_must zpool create $TESTPOOL $disk1 log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $ashift log_must zpool add $TESTPOOL $disk2 - log_must verify_ashift $disk2 $ashift + exp=$(( (ashift <= max_auto_ashift) ? ashift : logical_ashift )) + log_must verify_ashift $disk2 $exp # clean things for the next run log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift