looping in metaslab_block_picker impacts performance on fragmented pools

On fragmented pools with high-performance storage, the looping in
metaslab_block_picker() can become the performance-limiting bottleneck.
When looking for a larger block (e.g. a 128K block for the ZIL), we may
search through many free segments (up to hundreds of thousands) to find
one that is large enough to satisfy the allocation. This can take a long
time (up to dozens of ms), and is done while holding the ms_lock, which
other threads may spin waiting for.

When this performance problem is encountered, profiling will show
high CPU time in metaslab_block_picker, as well as in mutex_enter from
various callers.

The problem is very evident on a test system with a sync write workload
with 8K writes to a recordsize=8k filesystem, with 4TB of SSD storage,
84% full and 88% fragmented. It has also been observed on production
systems with 90TB of storage, 76% full and 87% fragmented.

The fix is to change metaslab_df_alloc() to search only up to 16MB
forward from the previous allocation (of this alignment). If no
large-enough segment is found within that distance, we instead pick a
segment of exactly the requested size (or larger) from the size-sorted
tree. This reduces the number of iterations on fragmented pools to a
few hundred (a ~100x improvement).
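
For illustration only, here is a minimal, self-contained C sketch of that
two-phase policy. seg_t, pick_block, and the sorted-array representation are
hypothetical stand-ins for ZFS's AVL-based range trees; only the policy
(bounded first-fit from a cursor, then best-fit fallback) mirrors the real
metaslab_block_picker()/metaslab_df_alloc() pair:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

typedef struct { uint64_t start, end; } seg_t;	/* free range [start, end) */

/*
 * Phase 1: first fit, scanning forward from *cursor, giving up once we have
 * looked more than max_search bytes past the first candidate segment.
 * Phase 2: best fit - the smallest free segment that still holds `size`.
 */
static uint64_t
pick_block(const seg_t *segs, size_t nsegs, uint64_t *cursor, uint64_t size,
    uint64_t max_search)
{
	uint64_t first_found = 0;
	int found_any = 0;

	for (size_t i = 0; i < nsegs; i++) {
		if (segs[i].start < *cursor)
			continue;	/* before the cursor; skip */
		if (!found_any) {
			first_found = segs[i].start;
			found_any = 1;
		}
		if (segs[i].start - first_found > max_search)
			break;		/* bounded search exhausted */
		if (segs[i].start + size <= segs[i].end) {
			uint64_t off = segs[i].start;
			*cursor = off + size;
			return (off);
		}
	}

	*cursor = 0;	/* give up on the cursor; next call rescans from 0 */

	const seg_t *best = NULL;
	for (size_t i = 0; i < nsegs; i++) {
		uint64_t len = segs[i].end - segs[i].start;
		if (len >= size &&
		    (best == NULL || len < best->end - best->start))
			best = &segs[i];
	}
	if (best != NULL) {
		*cursor = best->start + size;
		return (best->start);
	}
	return (UINT64_MAX);	/* no segment fits anywhere */
}

int
main(void)
{
	/* three free segments; only the last can hold a 128K block */
	seg_t segs[] = {
		{ 0, 4096 }, { 65536, 131072 }, { 1048576, 2097152 }
	};
	uint64_t cursor = 0;
	uint64_t off = pick_block(segs, 3, &cursor, 131072, 16 << 20);
	printf("allocated 128K at offset %llu\n", (unsigned long long)off);
	return (0);
}

Note the cursor reset before the fallback: as in the real code, a failed
bounded search causes the next allocation's scan to restart from the
beginning of the metaslab.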

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-62324
Closes #8877
Authored by Matthew Ahrens on 2019-06-13 13:06:15 -07:00; committed by Brian Behlendorf
parent 9c7da9a95a
commit d3230d761a
2 changed files with 117 additions and 60 deletions

man/man5/zfs-module-parameters.5

@@ -325,6 +325,40 @@ Enable use of the fragmentation metric in computing metaslab weights.
 Use \fB1\fR for yes (default) and \fB0\fR for no.
 .RE
+.sp
+.ne 2
+.na
+\fBmetaslab_df_max_search\fR (int)
+.ad
+.RS 12n
+Maximum distance to search forward from the last offset. Without this limit,
+fragmented pools can see >100,000 iterations and metaslab_block_picker()
+becomes the performance limiting factor on high-performance storage.
+.sp
+With the default setting of 16MB, we typically see less than 500 iterations,
+even with very fragmented, ashift=9 pools. The maximum number of iterations
+possible is: \fBmetaslab_df_max_search / (2 * (1<<ashift))\fR.
+With the default setting of 16MB this is 16*1024 (with ashift=9) or 2048
+(with ashift=12).
+.sp
+Default value: \fB16,777,216\fR (16MB)
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslab_df_use_largest_segment\fR (int)
+.ad
+.RS 12n
+If we are not searching forward (due to metaslab_df_max_search,
+metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable controls
+what segment is used. If it is set, we will use the largest free segment.
+If it is not set, we will use a segment of exactly the requested size (or
+larger).
+.sp
+Use \fB1\fR for yes and \fB0\fR for no (default).
+.RE
+
 .sp
 .ne 2
 .na
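
A quick sanity check of the iteration bound documented above: in the worst
case, a maximally fragmented metaslab alternates one-sector free segments
with one-sector allocations, so each loop iteration advances the search by
2 * (1<<ashift) bytes. With the default metaslab_df_max_search of
16,777,216 bytes, that yields 16,777,216 / (2 * 512) = 16,384 iterations at
ashift=9, and 16,777,216 / (2 * 4096) = 2,048 at ashift=12, matching the
figures given in the entry.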

module/zfs/metaslab.c

@@ -159,6 +159,30 @@ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
  */
 int metaslab_df_free_pct = 4;
 
+/*
+ * Maximum distance to search forward from the last offset. Without this
+ * limit, fragmented pools can see >100,000 iterations and
+ * metaslab_block_picker() becomes the performance limiting factor on
+ * high-performance storage.
+ *
+ * With the default setting of 16MB, we typically see less than 500
+ * iterations, even with very fragmented, ashift=9 pools. The maximum number
+ * of iterations possible is:
+ *     metaslab_df_max_search / (2 * (1<<ashift))
+ * With the default setting of 16MB this is 16*1024 (with ashift=9) or
+ * 2048 (with ashift=12).
+ */
+int metaslab_df_max_search = 16 * 1024 * 1024;
+
+/*
+ * If we are not searching forward (due to metaslab_df_max_search,
+ * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
+ * controls what segment is used. If it is set, we will use the largest free
+ * segment. If it is not set, we will use a segment of exactly the requested
+ * size (or larger).
+ */
+int metaslab_df_use_largest_segment = B_FALSE;
+
 /*
  * Percentage of all cpus that can be used by the metaslab taskq.
  */
@@ -1200,8 +1224,7 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
 	return (rs);
 }
 
-#if defined(WITH_FF_BLOCK_ALLOCATOR) || \
-    defined(WITH_DF_BLOCK_ALLOCATOR) || \
+#if defined(WITH_DF_BLOCK_ALLOCATOR) || \
     defined(WITH_CF_BLOCK_ALLOCATOR)
 /*
  * This is a helper function that can be used by the allocator to find
@@ -1210,13 +1233,16 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
  */
 static uint64_t
 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
-    uint64_t align)
+    uint64_t max_search)
 {
 	range_seg_t *rs = metaslab_block_find(t, *cursor, size);
+	uint64_t first_found;
 
-	while (rs != NULL) {
-		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
+	if (rs != NULL)
+		first_found = rs->rs_start;
 
+	while (rs != NULL && rs->rs_start - first_found <= max_search) {
+		uint64_t offset = rs->rs_start;
 		if (offset + size <= rs->rs_end) {
 			*cursor = offset + size;
 			return (offset);
@@ -1224,55 +1250,30 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
 		rs = AVL_NEXT(t, rs);
 	}
 
-	/*
-	 * If we know we've searched the whole map (*cursor == 0), give up.
-	 * Otherwise, reset the cursor to the beginning and try again.
-	 */
-	if (*cursor == 0)
-		return (-1ULL);
-
 	*cursor = 0;
-	return (metaslab_block_picker(t, cursor, size, align));
+	return (-1ULL);
 }
-#endif /* WITH_FF/DF/CF_BLOCK_ALLOCATOR */
-
-#if defined(WITH_FF_BLOCK_ALLOCATOR)
-/*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
- */
-static uint64_t
-metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
-{
-	/*
-	 * Find the largest power of 2 block size that evenly divides the
-	 * requested size. This is used to try to allocate blocks with similar
-	 * alignment from the same area of the metaslab (i.e. same cursor
-	 * bucket) but it does not guarantee that other allocations sizes
-	 * may exist in the same region.
-	 */
-	uint64_t align = size & -size;
-	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
-	avl_tree_t *t = &msp->ms_allocatable->rt_root;
-
-	return (metaslab_block_picker(t, cursor, size, align));
-}
-
-static metaslab_ops_t metaslab_ff_ops = {
-	metaslab_ff_alloc
-};
-
-metaslab_ops_t *zfs_metaslab_ops = &metaslab_ff_ops;
-#endif /* WITH_FF_BLOCK_ALLOCATOR */
+#endif /* WITH_DF/CF_BLOCK_ALLOCATOR */
 
 #if defined(WITH_DF_BLOCK_ALLOCATOR)
 /*
  * ==========================================================================
- * Dynamic block allocator -
- * Uses the first fit allocation scheme until space get low and then
- * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
- * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ * Dynamic Fit (df) block allocator
+ *
+ * Search for a free chunk of at least this size, starting from the last
+ * offset (for this alignment of block) looking for up to
+ * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not
+ * found within 16MB, then return a free chunk of exactly the requested size
+ * (or larger).
+ *
+ * If it seems like searching from the last offset will be unproductive, skip
+ * that and just return a free chunk of exactly the requested size (or
+ * larger). This is based on metaslab_df_alloc_threshold and
+ * metaslab_df_free_pct. This mechanism is probably not very useful and may
+ * be removed in the future.
+ *
+ * The behavior when not searching can be changed to return the largest free
+ * chunk, instead of a free chunk of exactly the requested size, by setting
+ * metaslab_df_use_largest_segment.
 * ==========================================================================
 */
 static uint64_t
@@ -1288,28 +1289,42 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 	uint64_t align = size & -size;
 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
 	range_tree_t *rt = msp->ms_allocatable;
-	avl_tree_t *t = &rt->rt_root;
-	uint64_t max_size = metaslab_block_maxsize(msp);
 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
+	uint64_t offset;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
-	ASSERT3U(avl_numnodes(t), ==,
+	ASSERT3U(avl_numnodes(&rt->rt_root), ==,
 	    avl_numnodes(&msp->ms_allocatable_by_size));
 
-	if (max_size < size)
-		return (-1ULL);
-
 	/*
-	 * If we're running low on space switch to using the size
-	 * sorted AVL tree (best-fit).
+	 * If we're running low on space, find a segment based on size,
+	 * rather than iterating based on offset.
 	 */
-	if (max_size < metaslab_df_alloc_threshold ||
+	if (metaslab_block_maxsize(msp) < metaslab_df_alloc_threshold ||
 	    free_pct < metaslab_df_free_pct) {
-		t = &msp->ms_allocatable_by_size;
-		*cursor = 0;
+		offset = -1;
+	} else {
+		offset = metaslab_block_picker(&rt->rt_root,
+		    cursor, size, metaslab_df_max_search);
 	}
 
-	return (metaslab_block_picker(t, cursor, size, 1ULL));
+	if (offset == -1) {
+		range_seg_t *rs;
+		if (metaslab_df_use_largest_segment) {
+			/* use largest free segment */
+			rs = avl_last(&msp->ms_allocatable_by_size);
+		} else {
+			/* use segment of this size, or next largest */
+			rs = metaslab_block_find(&msp->ms_allocatable_by_size,
+			    0, size);
+		}
+		if (rs != NULL && rs->rs_start + size <= rs->rs_end) {
+			offset = rs->rs_start;
+			*cursor = offset + size;
+		}
+	}
+
+	return (offset);
 }
 
 static metaslab_ops_t metaslab_df_ops = {
@@ -4823,6 +4838,14 @@ MODULE_PARM_DESC(zfs_metaslab_switch_threshold,
 module_param(metaslab_force_ganging, ulong, 0644);
 MODULE_PARM_DESC(metaslab_force_ganging,
 	"blocks larger than this size are forced to be gang blocks");
 
+module_param(metaslab_df_max_search, int, 0644);
+MODULE_PARM_DESC(metaslab_df_max_search,
+	"max distance (bytes) to search forward before using size tree");
+
+module_param(metaslab_df_use_largest_segment, int, 0644);
+MODULE_PARM_DESC(metaslab_df_use_largest_segment,
+	"when looking in size tree, use largest segment instead of exact fit");
+
 /* END CSTYLED */
 #endif
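
Since both new tunables are registered with module_param(..., 0644), on
Linux they can be inspected and adjusted at runtime through
/sys/module/zfs/parameters/metaslab_df_max_search and
/sys/module/zfs/parameters/metaslab_df_use_largest_segment, with no module
reload required.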