From 685ae4429ffe7753359faa6cbae34e061d1e670b Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Sat, 22 Jul 2023 04:52:32 +1000
Subject: [PATCH] metaslab: tuneable to better control force ganging

metaslab_force_ganging isn't enough to actually force ganging, because
it still only forces 3% of the time. This adds
metaslab_force_ganging_pct so we can configure how often to force
ganging.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15088
---
 man/man4/zfs.4        |  7 ++++++-
 module/zfs/metaslab.c | 14 ++++++++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 7959bfe33b..3843419731 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -15,7 +15,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.Dd January 10, 2023
+.Dd July 21, 2023
 .Dt ZFS 4
 .Os
 .
@@ -239,6 +239,11 @@ relative to the pool.
 Make some blocks above a certain size be gang blocks.
 This option is used by the test suite to facilitate testing.
 .
+.It Sy metaslab_force_ganging_pct Ns = Ns Sy 3 Ns % Pq uint
+For blocks that could be forced to be a gang block (due to
+.Sy metaslab_force_ganging ) ,
+force this many of them to be gang blocks.
+.
 .It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
 Default DDT ZAP data block size as a power of 2. Note that changing this after
 creating a DDT on the pool will not affect existing DDTs, only newly created
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 176247d63b..9991e1a22c 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -58,6 +58,11 @@ static uint64_t metaslab_aliquot = 1024 * 1024;
  */
 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
 
+/*
+ * Of blocks of size >= metaslab_force_ganging, actually gang them this often.
+ */
+uint_t metaslab_force_ganging_pct = 3;
+
 /*
  * In pools where the log space map feature is not enabled we touch
  * multiple metaslabs (and their respective space maps) with each
@@ -5109,7 +5114,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	 * damage can result in extremely long reconstruction times.  This
 	 * will also test spilling from special to normal.
 	 */
-	if (psize >= metaslab_force_ganging && (random_in_range(100) < 3)) {
+	if (psize >= metaslab_force_ganging &&
+	    metaslab_force_ganging_pct > 0 &&
+	    (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
 		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
 		    allocator);
 		return (SET_ERROR(ENOSPC));
@@ -6266,7 +6273,10 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
 	"Segment-based metaslab selection maximum buckets before switching");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW,
-	"Blocks larger than this size are forced to be gang blocks");
+	"Blocks larger than this size are sometimes forced to be gang blocks");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW,
+	"Percentage of large blocks that will be forced to be gang blocks");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
 	"Max distance (bytes) to search forward before using size tree");