Linux 6.10: rework queue limits setup
Linux has started moving to a model where, instead of applying block
queue limits through individual modification functions, a complete
limits structure is built up and applied atomically, either when the
block device is opened or some time afterwards. As of 6.10 this
transition appears only partly complete.

This commit matches that model within OpenZFS in a way that should work
for past and future kernels. We set up a queue limits structure with
any limits that have had their modification functions removed. For
newer kernels that can have limits applied at block device open
(HAVE_BLK_ALLOC_DISK_2ARG), we have a conversion function to turn the
OpenZFS queue limits structure into Linux's queue_limits structure,
which can then be passed in. For older kernels, we provide an
application function that just calls the old functions for each limit
in the structure.

Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
parent d7bf0e5259
commit 0342c4a6b2
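For orientation before the diff: the snippet below is a condensed sketch of the flow the patch introduces, not the patch itself. It omits error handling, and `zv`/`zso` stand in for the zvol state pointers already used in the existing code; the real definitions of zvol_queue_limits_init(), zvol_queue_limits_convert() and zvol_queue_limits_apply() appear in the hunks that follow.

/*
 * Condensed sketch (illustrative only): build the OpenZFS-side limits
 * once, then hand them to the kernel in whichever way this kernel
 * supports.
 */
zvol_queue_limits_t limits;
zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);

#ifdef HAVE_BLK_ALLOC_DISK_2ARG
/* Newer kernels: convert and apply atomically at disk allocation. */
struct queue_limits qlimits;
zvol_queue_limits_convert(&limits, &qlimits);
struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
#else
/* Older kernels: fall back to the individual blk_queue_*() setters. */
zvol_queue_limits_apply(&limits, zso->zvo_queue);
#endif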
@@ -332,7 +332,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS], [
 	ZFS_LINUX_TEST_RESULT([blk_queue_max_hw_sectors], [
 		AC_MSG_RESULT(yes)
 	],[
-		ZFS_LINUX_TEST_ERROR([blk_queue_max_hw_sectors])
+		AC_MSG_RESULT(no)
 	])
 ])
@@ -355,7 +355,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [
 	ZFS_LINUX_TEST_RESULT([blk_queue_max_segments], [
 		AC_MSG_RESULT(yes)
 	], [
-		ZFS_LINUX_TEST_ERROR([blk_queue_max_segments])
+		AC_MSG_RESULT(no)
 	])
 ])
@@ -1074,8 +1074,106 @@ static const struct block_device_operations zvol_ops = {
 #endif
 };
 
+typedef struct zvol_queue_limits {
+	unsigned int	zql_max_hw_sectors;
+	unsigned short	zql_max_segments;
+	unsigned int	zql_max_segment_size;
+	unsigned int	zql_io_opt;
+} zvol_queue_limits_t;
+
+static void
+zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
+    boolean_t use_blk_mq)
+{
+	limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;
+
+	if (use_blk_mq) {
+		/*
+		 * IO requests can be really big (1MB). When an IO request
+		 * comes in, it is passed off to zvol_read() or zvol_write()
+		 * in a new thread, where it is chunked up into 'volblocksize'
+		 * sized pieces and processed. So for example, if the request
+		 * is a 1MB write and your volblocksize is 128k, one zvol_write
+		 * thread will take that request and sequentially do ten 128k
+		 * IOs. This is due to the fact that the thread needs to lock
+		 * each volblocksize sized block. So you might be wondering:
+		 * "instead of passing the whole 1MB request to one thread,
+		 * why not pass ten individual 128k chunks to ten threads and
+		 * process the whole write in parallel?" The short answer is
+		 * that there's a sweet spot number of chunks that balances
+		 * the greater parallelism with the added overhead of more
+		 * threads. The sweet spot can be different depending on if you
+		 * have a read or write heavy workload. Writes typically want
+		 * high chunk counts while reads typically want lower ones. On
+		 * a test pool with 6 NVMe drives in a 3x 2-disk mirror
+		 * configuration, with volblocksize=8k, the sweet spot for good
+		 * sequential reads and writes was at 8 chunks.
+		 */
+
+		/*
+		 * Below we tell the kernel how big we want our requests
+		 * to be. You would think that blk_queue_io_opt() would be
+		 * used to do this since it is used to "set optimal request
+		 * size for the queue", but that doesn't seem to do
+		 * anything - the kernel still gives you huge requests
+		 * with tons of little PAGE_SIZE segments contained within it.
+		 *
+		 * Knowing that the kernel will just give you PAGE_SIZE segments
+		 * no matter what, you can say "ok, I want PAGE_SIZE byte
+		 * segments, and I want 'N' of them per request", where N is
+		 * the correct number of segments for the volblocksize and
+		 * number of chunks you want.
+		 */
+#ifdef HAVE_BLK_MQ
+		if (zvol_blk_mq_blocks_per_thread != 0) {
+			unsigned int chunks;
+			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
+
+			limits->zql_max_segment_size = PAGE_SIZE;
+			limits->zql_max_segments =
+			    (zv->zv_volblocksize * chunks) / PAGE_SIZE;
+		} else {
+			/*
+			 * Special case: zvol_blk_mq_blocks_per_thread = 0
+			 * Max everything out.
+			 */
+			limits->zql_max_segments = UINT16_MAX;
+			limits->zql_max_segment_size = UINT_MAX;
+		}
+	} else {
+#endif
+		limits->zql_max_segments = UINT16_MAX;
+		limits->zql_max_segment_size = UINT_MAX;
+	}
+
+	limits->zql_io_opt = zv->zv_volblocksize;
+}
+
+#ifdef HAVE_BLK_ALLOC_DISK_2ARG
+static void
+zvol_queue_limits_convert(zvol_queue_limits_t *limits,
+    struct queue_limits *qlimits)
+{
+	memset(qlimits, 0, sizeof (struct queue_limits));
+	qlimits->max_hw_sectors = limits->zql_max_hw_sectors;
+	qlimits->max_segments = limits->zql_max_segments;
+	qlimits->max_segment_size = limits->zql_max_segment_size;
+	qlimits->io_opt = limits->zql_io_opt;
+}
+#else
+static void
+zvol_queue_limits_apply(zvol_queue_limits_t *limits,
+    struct request_queue *queue)
+{
+	blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
+	blk_queue_max_segments(queue, limits->zql_max_segments);
+	blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
+	blk_queue_io_opt(queue, limits->zql_io_opt);
+}
+#endif
+
 static int
-zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
+zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
 {
 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
 #if defined(HAVE_BLK_ALLOC_DISK)
@@ -1085,8 +1183,11 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
 
 	zso->zvo_disk->minors = ZVOL_MINORS;
 	zso->zvo_queue = zso->zvo_disk->queue;
+	zvol_queue_limits_apply(limits, zso->zvo_queue);
 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
-	struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
+	struct queue_limits qlimits;
+	zvol_queue_limits_convert(limits, &qlimits);
+	struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
 	if (IS_ERR(disk)) {
 		zso->zvo_disk = NULL;
 		return (1);
@@ -1107,6 +1208,7 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
 	}
 
 	zso->zvo_disk->queue = zso->zvo_queue;
+	zvol_queue_limits_apply(limits, zso->zvo_queue);
 #endif /* HAVE_BLK_ALLOC_DISK */
 #else
 	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
@@ -1120,13 +1222,14 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
 	}
 
 	zso->zvo_disk->queue = zso->zvo_queue;
+	zvol_queue_limits_apply(limits, zso->zvo_queue);
 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
 	return (0);
 
 }
 
 static int
-zvol_alloc_blk_mq(zvol_state_t *zv)
+zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
 {
 #ifdef HAVE_BLK_MQ
 	struct zvol_state_os *zso = zv->zv_zso;
@@ -1142,9 +1245,12 @@ zvol_alloc_blk_mq(zvol_state_t *zv)
 		return (1);
 	}
 	zso->zvo_queue = zso->zvo_disk->queue;
+	zvol_queue_limits_apply(limits, zso->zvo_queue);
 	zso->zvo_disk->minors = ZVOL_MINORS;
 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
-	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv);
+	struct queue_limits qlimits;
+	zvol_queue_limits_convert(limits, &qlimits);
+	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);
 	if (IS_ERR(disk)) {
 		zso->zvo_disk = NULL;
 		blk_mq_free_tag_set(&zso->tag_set);
@@ -1170,6 +1276,7 @@ zvol_alloc_blk_mq(zvol_state_t *zv)
 
 	/* Our queue is now created, assign it to our disk */
 	zso->zvo_disk->queue = zso->zvo_queue;
+	zvol_queue_limits_apply(limits, zso->zvo_queue);
 
 #endif
 #endif
@@ -1209,6 +1316,9 @@ zvol_alloc(dev_t dev, const char *name)
 	zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
 #endif
 
+	zvol_queue_limits_t limits;
+	zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);
+
 	/*
 	 * The block layer has 3 interfaces for getting BIOs:
 	 *
@@ -1225,10 +1335,10 @@ zvol_alloc(dev_t dev, const char *name)
 	 * disk and the queue separately. (5.13 kernel or older)
 	 */
 	if (zv->zv_zso->use_blk_mq) {
-		ret = zvol_alloc_blk_mq(zv);
+		ret = zvol_alloc_blk_mq(zv, &limits);
 		zso->zvo_disk->fops = &zvol_ops_blk_mq;
 	} else {
-		ret = zvol_alloc_non_blk_mq(zso);
+		ret = zvol_alloc_non_blk_mq(zso, &limits);
 		zso->zvo_disk->fops = &zvol_ops;
 	}
 	if (ret != 0)
@@ -1505,74 +1615,10 @@ zvol_os_create_minor(const char *name)
 
 	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
 
-	blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
-	    (DMU_MAX_ACCESS / 4) >> 9);
-
-	if (zv->zv_zso->use_blk_mq) {
-		/*
-		 * IO requests can be really big (1MB). When an IO request
-		 * comes in, it is passed off to zvol_read() or zvol_write()
-		 * in a new thread, where it is chunked up into 'volblocksize'
-		 * sized pieces and processed. So for example, if the request
-		 * is a 1MB write and your volblocksize is 128k, one zvol_write
-		 * thread will take that request and sequentially do ten 128k
-		 * IOs. This is due to the fact that the thread needs to lock
-		 * each volblocksize sized block. So you might be wondering:
-		 * "instead of passing the whole 1MB request to one thread,
-		 * why not pass ten individual 128k chunks to ten threads and
-		 * process the whole write in parallel?" The short answer is
-		 * that there's a sweet spot number of chunks that balances
-		 * the greater parallelism with the added overhead of more
-		 * threads. The sweet spot can be different depending on if you
-		 * have a read or write heavy workload. Writes typically want
-		 * high chunk counts while reads typically want lower ones. On
-		 * a test pool with 6 NVMe drives in a 3x 2-disk mirror
-		 * configuration, with volblocksize=8k, the sweet spot for good
-		 * sequential reads and writes was at 8 chunks.
-		 */
-
-		/*
-		 * Below we tell the kernel how big we want our requests
-		 * to be. You would think that blk_queue_io_opt() would be
-		 * used to do this since it is used to "set optimal request
-		 * size for the queue", but that doesn't seem to do
-		 * anything - the kernel still gives you huge requests
-		 * with tons of little PAGE_SIZE segments contained within it.
-		 *
-		 * Knowing that the kernel will just give you PAGE_SIZE segments
-		 * no matter what, you can say "ok, I want PAGE_SIZE byte
-		 * segments, and I want 'N' of them per request", where N is
-		 * the correct number of segments for the volblocksize and
-		 * number of chunks you want.
-		 */
-#ifdef HAVE_BLK_MQ
-		if (zvol_blk_mq_blocks_per_thread != 0) {
-			unsigned int chunks;
-			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
-
-			blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
-			    PAGE_SIZE);
-			blk_queue_max_segments(zv->zv_zso->zvo_queue,
-			    (zv->zv_volblocksize * chunks) / PAGE_SIZE);
-		} else {
-			/*
-			 * Special case: zvol_blk_mq_blocks_per_thread = 0
-			 * Max everything out.
-			 */
-			blk_queue_max_segments(zv->zv_zso->zvo_queue,
-			    UINT16_MAX);
-			blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
-			    UINT_MAX);
-		}
-#endif
-	} else {
-		blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
-		blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
-	}
-
 	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
 	    zv->zv_volblocksize);
-	blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
 	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
 	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
 	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,