From f2330bd1568489ae1fb16d975a5a9bcfe12ed219 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Thu, 28 Apr 2022 18:12:24 -0400 Subject: [PATCH] Default zfs_max_recordsize to 16M Increase the default allowed maximum recordsize from 1M to 16M. As described in the zfs(4) man page, there are significant costs which need to be considered before using very large blocks. However, there are scenarios where they make good sense and it should no longer be necessary to artificially restrict their use behind a module option. Note that for 32-bit platforms we continue to leave this restriction in place due to the limited virtual address space available (256-512MB). On these systems only a handful of blocks could be cached at any one time severely impacting performance and potentially stability. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rich Ercolani Closes #12830 Closes #13302 --- man/man4/zfs.4 | 6 ++--- module/zfs/dsl_dataset.c | 25 ++++++++++--------- module/zfs/zio.c | 9 ------- .../alloc_class/alloc_class_011_neg.ksh | 2 +- .../zfs_create/zfs_create_008_neg.ksh | 2 +- .../cli_root/zfs_set/zfs_set_001_neg.ksh | 2 +- .../zpool_create/zpool_create_023_neg.ksh | 2 +- 7 files changed, 20 insertions(+), 28 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index a18917eb1e..290ecd22e1 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1469,15 +1469,15 @@ feature uses to estimate incoming log blocks. .It Sy zfs_max_logsm_summary_length Ns = Ns Sy 10 Pq ulong Maximum number of rows allowed in the summary of the spacemap log. . -.It Sy zfs_max_recordsize Ns = Ns Sy 1048576 Po 1MB Pc Pq int +.It Sy zfs_max_recordsize Ns = Ns Sy 16777216 Po 16MB Pc Pq int We currently support block sizes from .Em 512B No to Em 16MB . The benefits of larger blocks, and thus larger I/O, need to be weighed against the cost of COWing a giant block to modify one byte. 
Additionally, very large blocks can have an impact on I/O latency, and also potentially on the memory allocator. -Therefore, we do not allow the recordsize to be set larger than this tunable. -Larger blocks can be created by changing it, +Therefore, we formerly forbade creating blocks larger than 1M. +Larger blocks could be created by changing this tunable, and pools with larger blocks can always be imported and used, regardless of this setting. . diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 2d98c2f04d..ca894c3525 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -73,12 +73,19 @@ * The SPA supports block sizes up to 16MB. However, very large blocks * can have an impact on i/o latency (e.g. tying up a spinning disk for * ~300ms), and also potentially on the memory allocator. Therefore, - * we do not allow the recordsize to be set larger than zfs_max_recordsize - * (default 1MB). Larger blocks can be created by changing this tunable, - * and pools with larger blocks can always be imported and used, regardless - * of this setting. + * we did not allow the recordsize to be set larger than zfs_max_recordsize + * (former default: 1MB). Larger blocks could be created by changing this + * tunable, and pools with larger blocks could always be imported and used, + * regardless of this setting. + * + * We do, however, still limit it by default to 1M on 32-bit platforms, since + * e.g. Linux's 3/1 memory split doesn't leave much room for 16M chunks. 
*/ -int zfs_max_recordsize = 1 * 1024 * 1024; +#ifdef _ILP32 +int zfs_max_recordsize = 1 * 1024 * 1024; +#else +int zfs_max_recordsize = 16 * 1024 * 1024; +#endif static int zfs_allow_redacted_dataset_mount = 0; #define SWITCH64(x, y) \ @@ -4964,13 +4971,7 @@ dsl_dataset_oldest_snapshot(spa_t *spa, uint64_t head_ds, uint64_t min_txg, return (0); } -#if defined(_LP64) -#define RECORDSIZE_PERM ZMOD_RW -#else -/* Limited to 1M on 32-bit platforms due to lack of virtual address space */ -#define RECORDSIZE_PERM ZMOD_RD -#endif -ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, RECORDSIZE_PERM, +ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, ZMOD_RW, "Max allowed record size"); ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW, diff --git a/module/zfs/zio.c b/module/zfs/zio.c index f6adea5724..2a16d5cef2 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -166,15 +166,6 @@ zio_init(void) cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; -#if defined(_ILP32) && defined(_KERNEL) - /* - * Cache size limited to 1M on 32-bit platforms until ARC - * buffers no longer require virtual address space. 
- */ - if (size > zfs_max_recordsize) - break; -#endif - while (!ISP2(p2)) p2 &= p2 - 1; diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh index d804e5371e..0be49b8587 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh @@ -35,7 +35,7 @@ log_must disk_setup log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \ $CLASS_DISK0 $CLASS_DISK1 -for value in 256 1025 2097152 +for value in 256 1025 33554432 do log_mustnot zfs set special_small_blocks=$value $TESTPOOL done diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_008_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_008_neg.ksh index a905e50dfa..d82f10f71f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_008_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_008_neg.ksh @@ -56,7 +56,7 @@ set -A args "ab" "-?" 
"-cV" "-Vc" "-c -V" "c" "V" "--c" "-e" "-s" \ "-blah" "-cV 12k" "-s -cV 1P" "-sc" "-Vs 5g" "-o" "--o" "-O" "--O" \ "-o QuOta=none" "-o quota=non" "-o quota=abcd" "-o quota=0" "-o quota=" \ "-o ResErVaTi0n=none" "-o reserV=none" "-o reservation=abcd" "-o reserv=" \ - "-o recorDSize=64k" "-o recordsize=2048K" "-o recordsize=2M" \ + "-o recorDSize=64k" "-o recordsize=32768K" "-o recordsize=32M" \ "-o recordsize=256" "-o recsize=" "-o recsize=zero" "-o recordsize=0" \ "-o mountPoint=/tmp/tmpfile$$" "-o mountpoint=non0" "-o mountpoint=" \ "-o mountpoint=LEGACY" "-o mounpoint=none" \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_001_neg.ksh index 5cfaec55e4..e58fe9bfe9 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_001_neg.ksh @@ -78,7 +78,7 @@ while (( i < ${#dataset[@]} )); do (( j += 1 )) done # Additional recordsize - set_n_check_prop "2048K" "recordsize" "${dataset[i]}" false + set_n_check_prop "32768K" "recordsize" "${dataset[i]}" false set_n_check_prop "128B" "recordsize" "${dataset[i]}" false (( i += 1 )) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_023_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_023_neg.ksh index f101521bd3..780cf86d6c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_023_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_023_neg.ksh @@ -52,7 +52,7 @@ log_onexit cleanup set -A args "QuOta=none" "quota=non" "quota=abcd" "quota=0" "quota=" \ "ResErVaTi0n=none" "reserV=none" "reservation=abcd" "reserv=" \ - "recorDSize=64k" "recordsize=2M" "recordsize=2048K" \ + "recorDSize=64k" "recordsize=32M" "recordsize=32768K" \ "recordsize=256" "recsize=" "recsize=zero" "recordsize=0" \ "mountPoint=/tmp/tmpfile$$" "mountpoint=non0" 
"mountpoint=" \ "mountpoint=LEGACY" "mounpoint=none" \