From 98bcc390e86d796be613a46a0721a936eaeae577 Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Tue, 18 Jul 2023 11:11:29 +1000
Subject: [PATCH] vdev_disk: rework bio max segment calculation

A single "page" in an ABD does not necessarily correspond to one segment
in a bio, because of how ZFS does ABD allocations and how it breaks them
up when adding them to a bio. Because of this, simply dividing the ABD
size by the page size can only ever give a minimum number of segments
required, rather than the correct number.

Until we can fix that, we'll just make each bio as large as it can be
for as many segments as the device queue will permit without needing to
split the bio. This is a little wasteful if we don't intend to put that
many segments in the bio, but it's not a lot of memory and it's only
lost until the bio is completed.

This also adds a tuneable, vdev_disk_max_segs, to allow this value to be
set by the operator. This is very useful for debugging.

Signed-off-by: Rob Norris
(cherry picked from commit a3a438d1bedb0626417cd73ba10b1479a06bef7f)
---
 include/os/linux/kernel/linux/mod_compat.h |  1 +
 module/os/linux/zfs/vdev_disk.c            | 27 ++++++++++++++++------
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h
index cc42c3f7c7..983cdb93bc 100644
--- a/include/os/linux/kernel/linux/mod_compat.h
+++ b/include/os/linux/kernel/linux/mod_compat.h
@@ -72,6 +72,7 @@ enum scope_prefix_types {
 	zfs_txg,
 	zfs_vdev,
 	zfs_vdev_cache,
+	zfs_vdev_disk,
 	zfs_vdev_file,
 	zfs_vdev_mirror,
 	zfs_vnops,
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 37963bafde..0ed1a112f5 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -47,6 +47,14 @@ typedef struct vdev_disk {
 } vdev_disk_t;
 
 int zio_suppress_zero_writes = B_TRUE;
+
+/*
+ * Maximum number of segments to add to a bio. If this is higher than the
+ * maximum allowed by the device queue or the kernel itself, it will be
+ * clamped. Setting it to zero will cause the kernel's ideal size to be used.
+ */
+unsigned long vdev_disk_max_segs = 0;
+
 /*
  * Unique identifier for the exclusive vdev holder.
  */
@@ -591,15 +599,17 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
 }
 
 static inline unsigned int
-vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
-{
-	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
-	    bio_size, abd_offset);
+vdev_bio_max_segs(struct block_device *bdev) {
+	const unsigned long tune_max_segs =
+	    vdev_disk_max_segs > 0 ? vdev_disk_max_segs : ULONG_MAX;
+	const unsigned long dev_max_segs =
+	    queue_max_segments(bdev_get_queue(bdev));
+	const unsigned long max_segs = MIN(tune_max_segs, dev_max_segs);
 
 #ifdef HAVE_BIO_MAX_SEGS
-	return (bio_max_segs(nr_segs));
+	return (bio_max_segs(max_segs));
 #else
-	return (MIN(nr_segs, BIO_MAX_PAGES));
+	return (MIN(max_segs, BIO_MAX_PAGES));
 #endif
 }
 
@@ -664,7 +674,7 @@ retry:
 			goto retry;
 		}
 
-		nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
+		nr_vecs = vdev_bio_max_segs(bdev);
 		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
 		if (unlikely(dr->dr_bio[i] == NULL)) {
 			vdev_disk_dio_free(dr);
@@ -1030,3 +1040,6 @@ param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, suppress_zero_writes, INT, ZMOD_RW,
 	"Do not send zero byte writes to hardware");
+
+ZFS_MODULE_PARAM(zfs_vdev_disk, vdev_disk_, max_segs, ULONG, ZMOD_RW,
+	"Maximum number of data segments to add to an IO request");