Change target size of metaslabs from 256GB to 16GB

= Old Behavior

For vdev sizes from 100GB to 50TB we keep ~200 metaslabs
per vdev, and the metaslab size grows from 512MB to 256GB.
For vdevs bigger than that we start increasing the
number of metaslabs until we hit the 128K limit.

= New Behavior

For vdev sizes from 100GB to 3TB we keep ~200 metaslabs
per vdev, and the metaslab size grows from 512MB to 16GB.
For vdevs bigger than that we start increasing the
number of metaslabs until we hit the 128K limit.

= Reasoning

With the old behavior, metaslabs grow beyond 16GB when
the vdev size is between 3TB (ms_size 16GB) and
32PB (ms_size 256GB). Even though keeping the number
of metaslabs low is good in terms of the potential number
of I/Os per TXG, these bigger metaslabs take longer
to load, and once they are loaded they can take up
a lot of memory because of their range trees.

This change tries to put a bound on the memory usage and
loading time of metaslabs for that range of vdev sizes.

Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Don Brady <don.brady@delphix.com>
Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Closes #8324
This commit is contained in:
Serapheim Dimitropoulos 2019-01-25 16:38:27 -08:00 committed by Brian Behlendorf
parent df72b8bebe
commit c853f382db
3 changed files with 54 additions and 40 deletions

View File

@ -320,7 +320,7 @@ Use \fB1\fR for yes (default) and \fB0\fR for no.
.sp .sp
.ne 2 .ne 2
.na .na
\fBvdev_max_ms_count\fR (int) \fBzfs_vdev_default_ms_count\fR (int)
.ad .ad
.RS 12n .RS 12n
When a vdev is added target this number of metaslabs per top-level vdev. When a vdev is added target this number of metaslabs per top-level vdev.
@ -331,7 +331,7 @@ Default value: \fB200\fR.
.sp .sp
.ne 2 .ne 2
.na .na
\fBvdev_min_ms_count\fR (int) \fBzfs_vdev_min_ms_count\fR (int)
.ad .ad
.RS 12n .RS 12n
Minimum number of metaslabs to create in a top-level vdev. Minimum number of metaslabs to create in a top-level vdev.

View File

@ -54,20 +54,20 @@
#include <sys/zvol.h> #include <sys/zvol.h>
#include <sys/zfs_ratelimit.h> #include <sys/zfs_ratelimit.h>
/* target number of metaslabs per top-level vdev */ /* default target for number of metaslabs per top-level vdev */
int vdev_max_ms_count = 200; int zfs_vdev_default_ms_count = 200;
/* minimum number of metaslabs per top-level vdev */ /* minimum number of metaslabs per top-level vdev */
int vdev_min_ms_count = 16; int zfs_vdev_min_ms_count = 16;
/* practical upper limit of total metaslabs per top-level vdev */ /* practical upper limit of total metaslabs per top-level vdev */
int vdev_ms_count_limit = 1ULL << 17; int zfs_vdev_ms_count_limit = 1ULL << 17;
/* lower limit for metaslab size (512M) */ /* lower limit for metaslab size (512M) */
int vdev_default_ms_shift = 29; int zfs_vdev_default_ms_shift = 29;
/* upper limit for metaslab size (256G) */ /* upper limit for metaslab size (16G) */
int vdev_max_ms_shift = 38; int zfs_vdev_max_ms_shift = 34;
int vdev_validate_skip = B_FALSE; int vdev_validate_skip = B_FALSE;
@ -2281,16 +2281,24 @@ void
vdev_metaslab_set_size(vdev_t *vd) vdev_metaslab_set_size(vdev_t *vd)
{ {
uint64_t asize = vd->vdev_asize; uint64_t asize = vd->vdev_asize;
uint64_t ms_count = asize >> vdev_default_ms_shift; uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
uint64_t ms_shift; uint64_t ms_shift;
/* /*
* There are two dimensions to the metaslab sizing calculation: * There are two dimensions to the metaslab sizing calculation:
* the size of the metaslab and the count of metaslabs per vdev. * the size of the metaslab and the count of metaslabs per vdev.
* In general, we aim for vdev_max_ms_count (200) metaslabs. The
* range of the dimensions are as follows:
* *
* 2^29 <= ms_size <= 2^38 * The default values used below are a good balance between memory
* usage (larger metaslab size means more memory needed for loaded
* metaslabs; more metaslabs means more memory needed for the
* metaslab_t structs), metaslab load time (larger metaslabs take
* longer to load), and metaslab sync time (more metaslabs means
* more time spent syncing all of them).
*
* In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
* The range of the dimensions are as follows:
*
* 2^29 <= ms_size <= 2^34
* 16 <= ms_count <= 131,072 * 16 <= ms_count <= 131,072
* *
* On the lower end of vdev sizes, we aim for metaslabs sizes of * On the lower end of vdev sizes, we aim for metaslabs sizes of
@ -2299,35 +2307,41 @@ vdev_metaslab_set_size(vdev_t *vd)
* of at least 16 metaslabs will override this minimum size goal. * of at least 16 metaslabs will override this minimum size goal.
* *
* On the upper end of vdev sizes, we aim for a maximum metaslab * On the upper end of vdev sizes, we aim for a maximum metaslab
* size of 256GB. However, we will cap the total count to 2^17 * size of 16GB. However, we will cap the total count to 2^17
* metaslabs to keep our memory footprint in check. * metaslabs to keep our memory footprint in check and let the
* metaslab size grow from there if that limit is hit.
* *
* The net effect of applying above constrains is summarized below. * The net effect of applying above constrains is summarized below.
* *
* vdev size metaslab count * vdev size metaslab count
* -------------|----------------- * --------------|-----------------
* < 8GB ~16 * < 8GB ~16
* 8GB - 100GB one per 512MB * 8GB - 100GB one per 512MB
* 100GB - 50TB ~200 * 100GB - 3TB ~200
* 50TB - 32PB one per 256GB * 3TB - 2PB one per 16GB
* > 32PB ~131,072 * > 2PB ~131,072
* ------------------------------- * --------------------------------
*
* Finally, note that all of the above calculate the initial
* number of metaslabs. Expanding a top-level vdev will result
* in additional metaslabs being allocated making it possible
* to exceed the zfs_vdev_ms_count_limit.
*/ */
if (ms_count < vdev_min_ms_count) if (ms_count < zfs_vdev_min_ms_count)
ms_shift = highbit64(asize / vdev_min_ms_count); ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
else if (ms_count > vdev_max_ms_count) else if (ms_count > zfs_vdev_default_ms_count)
ms_shift = highbit64(asize / vdev_max_ms_count); ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
else else
ms_shift = vdev_default_ms_shift; ms_shift = zfs_vdev_default_ms_shift;
if (ms_shift < SPA_MAXBLOCKSHIFT) { if (ms_shift < SPA_MAXBLOCKSHIFT) {
ms_shift = SPA_MAXBLOCKSHIFT; ms_shift = SPA_MAXBLOCKSHIFT;
} else if (ms_shift > vdev_max_ms_shift) { } else if (ms_shift > zfs_vdev_max_ms_shift) {
ms_shift = vdev_max_ms_shift; ms_shift = zfs_vdev_max_ms_shift;
/* cap the total count to constrain memory footprint */ /* cap the total count to constrain memory footprint */
if ((asize >> ms_shift) > vdev_ms_count_limit) if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
ms_shift = highbit64(asize / vdev_ms_count_limit); ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
} }
vd->vdev_ms_shift = ms_shift; vd->vdev_ms_shift = ms_shift;
@ -4674,16 +4688,16 @@ EXPORT_SYMBOL(vdev_online);
EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_offline);
EXPORT_SYMBOL(vdev_clear); EXPORT_SYMBOL(vdev_clear);
/* BEGIN CSTYLED */ /* BEGIN CSTYLED */
module_param(vdev_max_ms_count, int, 0644); module_param(zfs_vdev_default_ms_count, int, 0644);
MODULE_PARM_DESC(vdev_max_ms_count, MODULE_PARM_DESC(zfs_vdev_default_ms_count,
"Target number of metaslabs per top-level vdev"); "Target number of metaslabs per top-level vdev");
module_param(vdev_min_ms_count, int, 0644); module_param(zfs_vdev_min_ms_count, int, 0644);
MODULE_PARM_DESC(vdev_min_ms_count, MODULE_PARM_DESC(zfs_vdev_min_ms_count,
"Minimum number of metaslabs per top-level vdev"); "Minimum number of metaslabs per top-level vdev");
module_param(vdev_ms_count_limit, int, 0644); module_param(zfs_vdev_ms_count_limit, int, 0644);
MODULE_PARM_DESC(vdev_ms_count_limit, MODULE_PARM_DESC(zfs_vdev_ms_count_limit,
"Practical upper limit of total metaslabs per top-level vdev"); "Practical upper limit of total metaslabs per top-level vdev");
module_param(zfs_slow_io_events_per_second, uint, 0644); module_param(zfs_slow_io_events_per_second, uint, 0644);

View File

@ -48,7 +48,7 @@ function custom_cleanup
{ {
set_vdev_validate_skip 0 set_vdev_validate_skip 0
cleanup cleanup
log_must set_tunable64 vdev_min_ms_count 16 log_must set_tunable64 zfs_vdev_min_ms_count 16
} }
log_onexit custom_cleanup log_onexit custom_cleanup
@ -208,7 +208,7 @@ increase_device_sizes $(( FILE_SIZE * 4 ))
# Increase the number of metaslabs for small pools temporarily to # Increase the number of metaslabs for small pools temporarily to
# reduce the chance of reusing a metaslab that holds old MOS metadata. # reduce the chance of reusing a metaslab that holds old MOS metadata.
log_must set_tunable64 vdev_min_ms_count 150 log_must set_tunable64 zfs_vdev_min_ms_count 150
# Part of the rewind test is to see how it reacts to path changes # Part of the rewind test is to see how it reacts to path changes
typeset pathstochange="$VDEV0 $VDEV1 $VDEV2 $VDEV3" typeset pathstochange="$VDEV0 $VDEV1 $VDEV2 $VDEV3"