Only examine best metaslabs on each vdev

On a system with very high fragmentation, we may need to do lots of gang allocations (e.g. most indirect block allocations (~50KB) may need to gang). Before failing a "normal" allocation and resorting to ganging, we try every metaslab. This has the impact of loading every metaslab (not a huge deal since we now typically keep all metaslabs loaded), and also iterating over every metaslab for every failing allocation. If there are many metaslabs (more than the typical ~200, e.g. due to vdev expansion or very large vdevs), the CPU cost of this iteration can be very impactful. This iteration is done with the mg_lock held, creating long hold times and high lock contention for concurrent allocations, ultimately causing long txg sync times and poor application performance.

To address this, this commit changes the behavior of "normal" (not try_hard, not ZIL) allocations. These will now only examine the 100 best metaslabs (as determined by their ms_weight). If none of these have a large enough free segment, then the allocation will fail and we'll fall back on ganging.

To accomplish this, we will now (normally) gang before doing a `try_hard` allocation. Non-try_hard allocations will only examine the 100 best metaslabs of each vdev. In summary, we will first try normal allocation. If that fails then we will do a gang allocation. If that fails then we will do a "try hard" gang allocation. If that fails then we will have a multi-layer gang block.

Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #11327
2020-12-16 14:40:05 -08:00 · 2020-12-16 14:40:05 -08:00 · be5c6d9653
parent f8020c9363
commit be5c6d9653
4 changed files with 95 additions and 55 deletions
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@ -78,6 +78,7 @@ uint64_t metaslab_largest_allocatable(metaslab_t *);
 #define	METASLAB_DONT_THROTTLE		0x10
 #define	METASLAB_MUST_RESERVE		0x20
 #define	METASLAB_FASTWRITE		0x40
+#define	METASLAB_ZIL			0x80

 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
    blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *,
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@ -526,6 +526,40 @@ memory that is the threshold.
 Default value: \fB25 percent\fR
 .RE

+.sp
+.ne 2
+.na
+\fBzfs_metaslab_try_hard_before_gang\fR (int)
+.ad
+.RS 12n
+If not set (the default), we will first try normal allocation.
+If that fails then we will do a gang allocation.
+If that fails then we will do a "try hard" gang allocation.
+If that fails then we will have a multi-layer gang block.
+.sp
+If set, we will first try normal allocation.
+If that fails then we will do a "try hard" allocation.
+If that fails we will do a gang allocation.
+If that fails we will do a "try hard" gang allocation.
+If that fails then we will have a multi-layer gang block.
+.sp
+Default value: \fB0 (false)\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_metaslab_find_max_tries\fR (int)
+.ad
+.RS 12n
+When not trying hard, we only consider this number of the best metaslabs.
+This improves performance, especially when there are many metaslabs per vdev
+and the allocation can't actually be satisfied (so we would otherwise iterate
+all the metaslabs).
+.sp
+Default value: \fB100\fR
+.RE
+
 .sp
 .ne 2
 .na
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@ -264,9 +264,7 @@ int zfs_metaslab_switch_threshold = 2;
 * Internal switch to enable/disable the metaslab allocation tracing
 * facility.
 */
-#ifdef _METASLAB_TRACING
-boolean_t metaslab_trace_enabled = B_TRUE;
-#endif
+boolean_t metaslab_trace_enabled = B_FALSE;

 /*
 * Maximum entries that the metaslab allocation tracing facility will keep
@ -276,9 +274,7 @@ boolean_t metaslab_trace_enabled = B_TRUE;
 * to ever exceed this value. In debug mode, the system will panic if this
 * limit is ever reached allowing for further investigation.
 */
-#ifdef _METASLAB_TRACING
 uint64_t metaslab_trace_max_entries = 5000;
-#endif

 /*
 * Maximum number of metaslabs per group that can be disabled
@ -314,6 +310,35 @@ boolean_t zfs_metaslab_force_large_segs = B_FALSE;
 */
 uint32_t metaslab_by_size_min_shift = 14;

+/*
+ * If not set, we will first try normal allocation.  If that fails then
+ * we will do a gang allocation.  If that fails then we will do a "try hard"
+ * gang allocation.  If that fails then we will have a multi-layer gang
+ * block.
+ *
+ * If set, we will first try normal allocation.  If that fails then
+ * we will do a "try hard" allocation.  If that fails we will do a gang
+ * allocation.  If that fails we will do a "try hard" gang allocation.  If
+ * that fails then we will have a multi-layer gang block.
+ */
+int zfs_metaslab_try_hard_before_gang = B_FALSE;
+
+/*
+ * When not trying hard, we only consider the best zfs_metaslab_find_max_tries
+ * metaslabs.  This improves performance, especially when there are many
+ * metaslabs per vdev and the allocation can't actually be satisfied (so we
+ * would otherwise iterate all the metaslabs).  If there is a metaslab with a
+ * worse weight but it can actually satisfy the allocation, we won't find it
+ * until trying hard.  This may happen if the worse metaslab is not loaded
+ * (and the true weight is better than we have calculated), or due to weight
+ * bucketization.  E.g. we are looking for a 60K segment, and the best
+ * metaslabs all have free segments in the 32-63K bucket, but the best
+ * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a
+ * subsequent metaslab has ms_max_size >60KB (but fewer segments in this
+ * bucket, and therefore a lower weight).
+ */
+int zfs_metaslab_find_max_tries = 100;
+
 static uint64_t metaslab_weight(metaslab_t *, boolean_t);
 static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
@ -325,19 +350,20 @@ static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
 static unsigned int metaslab_idx_func(multilist_t *, void *);
 static void metaslab_evict(metaslab_t *, uint64_t);
 static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
-#ifdef _METASLAB_TRACING
 kmem_cache_t *metaslab_alloc_trace_cache;

 typedef struct metaslab_stats {
 	kstat_named_t metaslabstat_trace_over_limit;
-	kstat_named_t metaslabstat_df_find_under_floor;
 	kstat_named_t metaslabstat_reload_tree;
+	kstat_named_t metaslabstat_too_many_tries;
+	kstat_named_t metaslabstat_try_hard;
 } metaslab_stats_t;

 static metaslab_stats_t metaslab_stats = {
 	{ "trace_over_limit",		KSTAT_DATA_UINT64 },
-	{ "df_find_under_floor",	KSTAT_DATA_UINT64 },
 	{ "reload_tree",		KSTAT_DATA_UINT64 },
+	{ "too_many_tries",		KSTAT_DATA_UINT64 },
+	{ "try_hard",			KSTAT_DATA_UINT64 },
 };

 #define	METASLABSTAT_BUMP(stat) \
@ -373,18 +399,6 @@ metaslab_stat_fini(void)
 	kmem_cache_destroy(metaslab_alloc_trace_cache);
 	metaslab_alloc_trace_cache = NULL;
 }
-#else
-
-void
-metaslab_stat_init(void)
-{
-}
-
-void
-metaslab_stat_fini(void)
-{
-}
-#endif

 /*
 * ==========================================================================
@ -1355,9 +1369,7 @@ static void
 metaslab_size_tree_full_load(range_tree_t *rt)
 {
 	metaslab_rt_arg_t *mrap = rt->rt_arg;
-#ifdef _METASLAB_TRACING
 	METASLABSTAT_BUMP(metaslabstat_reload_tree);
-#endif
 	ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
 	mrap->mra_floor_shift = 0;
 	struct mssa_arg arg = {0};
@ -1667,13 +1679,6 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 		} else {
 			zfs_btree_index_t where;
 			/* use segment of this size, or next largest */
-#ifdef _METASLAB_TRACING
-			metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg;
-			if (size < (1 << mrap->mra_floor_shift)) {
-				METASLABSTAT_BUMP(
-				    metaslabstat_df_find_under_floor);
-			}
-#endif
 			rs = metaslab_block_find(&msp->ms_allocatable_by_size,
 			    rt, msp->ms_start, size, &where);
 		}
@ -4404,7 +4409,6 @@ metaslab_is_unique(metaslab_t *msp, dva_t *dva)
 * Metaslab allocation tracing facility
 * ==========================================================================
 */
-#ifdef _METASLAB_TRACING

 /*
 * Add an allocation trace element to the allocation tracing list.
@ -4479,21 +4483,6 @@ metaslab_trace_fini(zio_alloc_list_t *zal)
 	list_destroy(&zal->zal_list);
 	zal->zal_size = 0;
 }
-#else
-
-#define	metaslab_trace_add(zal, mg, msp, psize, id, off, alloc)
-
-void
-metaslab_trace_init(zio_alloc_list_t *zal)
-{
-}
-
-void
-metaslab_trace_fini(zio_alloc_list_t *zal)
-{
-}
-
-#endif /* _METASLAB_TRACING */

 /*
 * ==========================================================================
@ -4634,8 +4623,16 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
 	if (msp == NULL)
 		msp = avl_nearest(t, idx, AVL_AFTER);

+	int tries = 0;
 	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
 		int i;
+
+		if (!try_hard && tries > zfs_metaslab_find_max_tries) {
+			METASLABSTAT_BUMP(metaslabstat_too_many_tries);
+			return (NULL);
+		}
+		tries++;
+
 		if (!metaslab_should_allocate(msp, asize, try_hard)) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_TOO_SMALL, allocator);
@ -5287,9 +5284,12 @@ next:
 	} while ((mg = mg->mg_next) != rotor);

 	/*
-	 * If we haven't tried hard, do so now.
+	 * If we haven't tried hard, perhaps do so now.
 	 */
-	if (!try_hard) {
+	if (!try_hard && (zfs_metaslab_try_hard_before_gang ||
+	    GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
+	    psize <= 1 << spa->spa_min_ashift)) {
+		METASLABSTAT_BUMP(metaslabstat_try_hard);
 		try_hard = B_TRUE;
 		goto top;
 	}
@ -6245,3 +6245,9 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG,

 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW,
 	"Percentage of memory that can be used to store metaslab range trees");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
+	ZMOD_RW, "Try hard to allocate before ganging");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, INT, ZMOD_RW,
+	"Normally only consider this many of the best metaslabs in each vdev");
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@ -3585,17 +3585,16 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 	 * of, so we just hash the objset ID to pick the allocator to get
 	 * some parallelism.
 	 */
-	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
-	    txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL,
-	    cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
-	    spa->spa_alloc_count);
+	int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
+	int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
+	    spa->spa_alloc_count;
+	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp,
+	    1, txg, NULL, flags, &io_alloc_list, NULL, allocator);
 	if (error == 0) {
 		*slog = TRUE;
 	} else {
-		error = metaslab_alloc(spa, spa_normal_class(spa), size,
-		    new_bp, 1, txg, NULL, METASLAB_FASTWRITE,
-		    &io_alloc_list, NULL, cityhash4(0, 0, 0,
-		    os->os_dsl_dataset->ds_object) % spa->spa_alloc_count);
+		error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp,
+		    1, txg, NULL, flags, &io_alloc_list, NULL, allocator);
 		if (error == 0)
 			*slog = FALSE;
 	}