From 645b83307918085ab2f0e12618809e348635b34f Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 1 May 2024 14:07:20 -0400
Subject: [PATCH] Improve write issue taskqs utilization

 - Reduce number of allocators on small systems down to one per 4
CPU cores, keeping maximum at 4 on 16+ core systems. Small systems
should not have the lock contention multiple allocators are supposed
to solve, while having several metaslabs open and modified each
TXG is not free.
 - Reduce number of write issue taskqs down to one per 16 CPU
cores and an integer fraction of number of allocators.  On mid-
sized systems, where multiple allocators already make sense, too
many write issue taskqs may reduce write speed on single-file
workloads, since single file is handled by only one taskq to
reduce fragmentation. On large systems, which can actually benefit
from the better IOPS of many taskqs, the bottleneck is less important,
since in worst case there will be at least 16 cores to handle it.
 - Distribute dnodes between allocators (and taskqs) in a round-
robin fashion instead of relying on sync taskqs to be balanced.
The latter is not guaranteed and may depend on scheduling.
 - Remove io_wr_iss_tq from struct zio.  io_allocator is enough.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16130
---
 include/sys/spa.h       |  2 +
 include/sys/spa_impl.h  |  9 ++++-
 include/sys/zio.h       |  3 --
 man/man4/zfs.4          | 25 ++++++++-----
 module/zfs/dmu_objset.c |  2 +
 module/zfs/spa.c        | 81 ++++++++++++++++++++++++++---------------
 module/zfs/spa_misc.c   | 22 +++++++++--
 module/zfs/zio.c        |  1 -
 8 files changed, 98 insertions(+), 47 deletions(-)

diff --git a/include/sys/spa.h b/include/sys/spa.h
index 001c221fb4..3073c4d1b9 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -829,6 +829,8 @@ extern uint_t zfs_sync_pass_deferred_free;
 /* spa sync taskqueues */
 taskq_t *spa_sync_tq_create(spa_t *spa, const char *name);
 void spa_sync_tq_destroy(spa_t *spa);
+uint_t spa_acq_allocator(spa_t *spa);
+void spa_rel_allocator(spa_t *spa, uint_t allocator);
 void spa_select_allocator(zio_t *zio);
 
 /* spa namespace global mutex */
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index d7da085ab3..a40914ec5f 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -63,6 +63,12 @@ typedef struct spa_alloc {
 	avl_tree_t	spaa_tree;
 } ____cacheline_aligned spa_alloc_t;
 
+typedef struct spa_allocs_use {
+	kmutex_t	sau_lock;	/* held while picking an allocator */
+	uint_t		sau_rotor;	/* last allocator index handed out */
+	boolean_t	sau_inuse[];	/* per-allocator in-use flags */
+} spa_allocs_use_t;
+
 typedef struct spa_error_entry {
 	zbookmark_phys_t	se_bookmark;
 	char			*se_name;
@@ -192,7 +198,7 @@ typedef struct spa_taskqs {
 /* one for each thread in the spa sync taskq */
 typedef struct spa_syncthread_info {
 	kthread_t	*sti_thread;
-	taskq_t		*sti_wr_iss_tq;		/* assigned wr_iss taskq */
+	uint_t		sti_allocator;		/* assigned allocator */
 } spa_syncthread_info_t;
 
 typedef enum spa_all_vdev_zap_action {
@@ -270,6 +276,7 @@ struct spa {
 	 * allocation performance in write-heavy workloads.
 	 */
 	spa_alloc_t	*spa_allocs;
+	spa_allocs_use_t *spa_allocs_use;
 	int		spa_alloc_count;
 	int		spa_active_allocator;	/* selectable allocator */
 
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 4037b42998..77c70b9b48 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -528,9 +528,6 @@ struct zio {
 
 	/* Taskq dispatching state */
 	taskq_ent_t	io_tqent;
-
-	/* write issue taskq selection, based upon sync thread */
-	taskq_t		*io_wr_iss_tq;
 };
 
 enum blk_verify_flag {
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index ef0385d42b..5edd80659e 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -525,10 +525,17 @@ most ZPL operations (e.g. write, create) will return
 .
 .It Sy spa_num_allocators Ns = Ns Sy 4 Pq int
 Determines the number of block alloctators to use per spa instance.
-Capped by the number of actual CPUs in the system.
+Capped by the number of actual CPUs in the system via
+.Sy spa_cpus_per_allocator .
 .Pp
 Note that setting this value too high could result in performance
 degredation and/or excess fragmentation.
+Set value only applies to pools imported/created after that.
+.
+.It Sy spa_cpus_per_allocator Ns = Ns Sy 4 Pq int
+Determines the minimum number of CPUs in a system for each block allocator
+per spa instance.
+Set value only applies to pools imported/created after that.
 .
 .It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint
 Limits the number of on-disk error log entries that will be converted to the
@@ -2339,21 +2346,19 @@ Set value only applies to pools imported/created after that.
 .
 .It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint
 Number of worker threads per taskq.
-Lower values improve I/O ordering and CPU utilization,
-while higher reduces lock contention.
+Higher values improve I/O ordering and CPU utilization,
+while lower values reduce lock contention.
+Set value only applies to pools imported/created after that.
 .Pp
 If
 .Sy 0 ,
 generate a system-dependent value close to 6 threads per taskq.
 Set value only applies to pools imported/created after that.
 .
-.It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 0 Pq uint
-Determines the number of CPUs to run write issue taskqs.
-.Pp
-When 0 (the default), the value to use is computed internally
-as the number of actual CPUs in the system divided by the
-.Sy spa_num_allocators
-value.
+.It Sy zio_taskq_write_tpq Ns = Ns Sy 16 Pq uint
+Determines the minimum number of threads per write issue taskq.
+Higher values improve CPU utilization on high throughput,
+while lower values reduce taskq lock contention on high IOPS.
 Set value only applies to pools imported/created after that.
 .
 .It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 5ea99f7428..f1818ae155 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1664,12 +1664,14 @@ sync_dnodes_task(void *arg)
 	sync_objset_arg_t *soa = sda->sda_soa;
 	objset_t *os = soa->soa_os;
 
+	uint_t allocator = spa_acq_allocator(os->os_spa);
 	multilist_sublist_t *ms =
 	    multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);
 
 	dmu_objset_sync_dnodes(ms, soa->soa_tx);
 
 	multilist_sublist_unlock(ms);
+	spa_rel_allocator(os->os_spa, allocator);
 
 	kmem_free(sda, sizeof (*sda));
 
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 147165ee85..ec2b674fb7 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -208,7 +208,7 @@ static const uint_t	zio_taskq_basedc = 80;	  /* base duty cycle */
 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
 #endif
 
-static uint_t	zio_taskq_wr_iss_ncpus = 0;
+static uint_t	zio_taskq_write_tpq = 16;
 
 /*
  * Report any spa_load_verify errors found, but do not fail spa_load.
@@ -1067,17 +1067,16 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 	case ZTI_MODE_SYNC:
 
 		/*
-		 * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus',
-		 * not to exceed the number of spa allocators.
+		 * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs,
+		 * not to exceed the number of spa allocators, and align to it.
 		 */
-		if (zio_taskq_wr_iss_ncpus == 0) {
-			count = MAX(boot_ncpus / spa->spa_alloc_count, 1);
-		} else {
-			count = MAX(1,
-			    boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
-		}
+		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
+		count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq));
 		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
 		count = MIN(count, spa->spa_alloc_count);
+		while (spa->spa_alloc_count % count != 0 &&
+		    spa->spa_alloc_count < count * 2)
+			count--;
 
 		/*
 		 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
@@ -1495,15 +1494,11 @@ spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
 	ASSERT3P(tqs->stqs_taskq, !=, NULL);
 	ASSERT3U(tqs->stqs_count, !=, 0);
 
-	if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
-	    (zio != NULL) && (zio->io_wr_iss_tq != NULL)) {
-		/* dispatch to assigned write issue taskq */
-		tq = zio->io_wr_iss_tq;
-		return (tq);
-	}
-
 	if (tqs->stqs_count == 1) {
 		tq = tqs->stqs_taskq[0];
+	} else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
+	    (zio != NULL) && ZIO_HAS_ALLOCATOR(zio)) {
+		tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count];
 	} else {
 		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
 	}
@@ -10233,16 +10228,10 @@ spa_sync_tq_create(spa_t *spa, const char *name)
 	VERIFY(spa->spa_sync_tq != NULL);
 	VERIFY(kthreads != NULL);
 
-	spa_taskqs_t *tqs =
-	    &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE];
-
 	spa_syncthread_info_t *ti = spa->spa_syncthreads;
-	for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) {
+	for (int i = 0; i < nthreads; i++, ti++) {
 		ti->sti_thread = kthreads[i];
-		if (w == tqs->stqs_count) {
-			w = 0;
-		}
-		ti->sti_wr_iss_tq = tqs->stqs_taskq[w];
+		ti->sti_allocator = i;
 	}
 
 	kmem_free(kthreads, sizeof (*kthreads) * nthreads);
@@ -10261,6 +10250,42 @@ spa_sync_tq_destroy(spa_t *spa)
 	spa->spa_sync_tq = NULL;
 }
 
+uint_t
+spa_acq_allocator(spa_t *spa)
+{
+	int i;
+
+	if (spa->spa_alloc_count == 1)
+		return (0);	/* single allocator: always 0 */
+
+	mutex_enter(&spa->spa_allocs_use->sau_lock);
+	uint_t r = spa->spa_allocs_use->sau_rotor;
+	do {	/* advance round-robin, skipping allocators in use */
+		if (++r == spa->spa_alloc_count)
+			r = 0;
+	} while (spa->spa_allocs_use->sau_inuse[r]);	/* until a free one */
+	spa->spa_allocs_use->sau_inuse[r] = B_TRUE;	/* claim it */
+	spa->spa_allocs_use->sau_rotor = r;	/* next scan starts past r */
+	mutex_exit(&spa->spa_allocs_use->sau_lock);
+
+	spa_syncthread_info_t *ti = spa->spa_syncthreads;
+	for (i = 0; i < spa->spa_alloc_count; i++, ti++) {
+		if (ti->sti_thread == curthread) {
+			ti->sti_allocator = r;	/* remember for this thread */
+			break;
+		}
+	}
+	ASSERT3S(i, <, spa->spa_alloc_count);	/* curthread must be listed */
+	return (r);
+}
+
+void
+spa_rel_allocator(spa_t *spa, uint_t allocator)
+{
+	if (spa->spa_alloc_count > 1)	/* no-op if single; no lock taken */
+		spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE;
+}
+
 void
 spa_select_allocator(zio_t *zio)
 {
@@ -10288,8 +10313,7 @@ spa_select_allocator(zio_t *zio)
 		spa_syncthread_info_t *ti = spa->spa_syncthreads;
 		for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
 			if (ti->sti_thread == curthread) {
-				zio->io_allocator = i;
-				zio->io_wr_iss_tq = ti->sti_wr_iss_tq;
+				zio->io_allocator = ti->sti_allocator;
 				return;
 			}
 		}
@@ -10306,7 +10330,6 @@ spa_select_allocator(zio_t *zio)
 	    bm->zb_blkid >> 20);
 
 	zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
-	zio->io_wr_iss_tq = NULL;
 }
 
 /*
@@ -10919,5 +10942,5 @@ ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
 #endif
 /* END CSTYLED */
 
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,
-	"Number of CPUs to run write issue taskqs");
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW,
+	"Number of CPUs per write issue taskq");
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 5fb7847b5d..e6d4a9bdb2 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -394,6 +394,7 @@ static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
  * Number of allocators to use, per spa instance
  */
 static int spa_num_allocators = 4;
+static int spa_cpus_per_allocator = 4;
 
 /*
  * Spa active allocator.
@@ -747,8 +748,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	if (altroot)
 		spa->spa_root = spa_strdup(altroot);
 
-	/* Do not allow more allocators than CPUs. */
-	spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus);
+	/* Do not allow more allocators than fraction of CPUs. */
+	spa->spa_alloc_count = MAX(MIN(spa_num_allocators,
+	    boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1);
 
 	spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
 	    sizeof (spa_alloc_t), KM_SLEEP);
@@ -758,6 +760,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 		avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
 		    sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
 	}
+	if (spa->spa_alloc_count > 1) {
+		spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t,
+		    sau_inuse[spa->spa_alloc_count]), KM_SLEEP);
+		mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT,
+		    NULL);
+	}
 
 	avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
 	    sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
@@ -853,6 +861,11 @@ spa_remove(spa_t *spa)
 	}
 	kmem_free(spa->spa_allocs, spa->spa_alloc_count *
 	    sizeof (spa_alloc_t));
+	if (spa->spa_alloc_count > 1) {
+		mutex_destroy(&spa->spa_allocs_use->sau_lock);
+		kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t,
+		    sau_inuse[spa->spa_alloc_count]));
+	}
 
 	avl_destroy(&spa->spa_metaslabs_by_flushed);
 	avl_destroy(&spa->spa_sm_logs_by_txg);
@@ -3097,4 +3110,7 @@ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
 	param_get_uint, ZMOD_RW, "Reserved free space in pool");
 
 ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW,
-	"Number of allocators per spa, capped by ncpus");
+	"Number of allocators per spa");
+
+ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW,
+	"Minimum number of CPUs per allocator");
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 0e7993d87e..870343bf4f 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2925,7 +2925,6 @@ static void
 zio_gang_inherit_allocator(zio_t *pio, zio_t *cio)
 {
 	cio->io_allocator = pio->io_allocator;
-	cio->io_wr_iss_tq = pio->io_wr_iss_tq;
 }
 
 static void