From b3626f0a3576152256bbbd7fedab90e063037ba1 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 16 Dec 2023 18:01:45 +1100 Subject: [PATCH 01/31] linux 6.7 compat: simplify current_time() check 6.7 changed the names of the time members in struct inode, so we can't assign back to it because we don't know its name. In practice this doesn't matter though - if we're missing current_time(), then we must be on <4.9, and we know our fallback will need to return timespec. Signed-off-by: Rob Norris Sponsored-by: https://github.com/sponsors/robn --- config/kernel-current-time.m4 | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/config/kernel-current-time.m4 b/config/kernel-current-time.m4 index 3ceb5f63ef..ab7d9c5ced 100644 --- a/config/kernel-current-time.m4 +++ b/config/kernel-current-time.m4 @@ -2,12 +2,15 @@ dnl # dnl # 4.9, current_time() added dnl # 4.18, return type changed from timespec to timespec64 dnl # +dnl # Note that we don't care about the return type in this check. If we have +dnl # to implement a fallback, we'll know we're <4.9, which was timespec. +dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_CURRENT_TIME], [ ZFS_LINUX_TEST_SRC([current_time], [ #include ], [ struct inode ip __attribute__ ((unused)); - ip.i_atime = current_time(&ip); + (void) current_time(&ip); ]) ]) From 3c13601a12b1739d09cec36eb5057b24141b4ae7 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 16 Dec 2023 22:31:32 +1100 Subject: [PATCH 02/31] linux 6.7 compat: use inode atime/mtime accessors 6.6 made i_ctime inaccessible; 6.7 has done the same for i_atime and i_mtime. This extends the method used for ctime in b37f29341 to atime and mtime as well. Signed-off-by: Rob Norris Sponsored-by: https://github.com/sponsors/robn --- config/kernel-inode-times.m4 | 78 ++++++++++++++++++++++++++++++ include/os/linux/zfs/sys/zpl.h | 20 ++++++++ module/os/linux/zfs/zfs_ctldir.c | 4 +- module/os/linux/zfs/zfs_vnops_os.c | 33 ++++++++----- module/os/linux/zfs/zfs_znode.c | 45 +++++++++-------- module/os/linux/zfs/zpl_inode.c | 3 +- 6 files changed, 148 insertions(+), 35 deletions(-) diff --git a/config/kernel-inode-times.m4 b/config/kernel-inode-times.m4 index aae95abf17..4d861596ed 100644 --- a/config/kernel-inode-times.m4 +++ b/config/kernel-inode-times.m4 @@ -52,6 +52,48 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [ memset(&ip, 0, sizeof(ip)); inode_set_ctime_to_ts(&ip, ts); ]) + + dnl # + dnl # 6.7 API change + dnl # i_atime/i_mtime no longer directly accessible, must use + dnl # inode_get_mtime(ip), inode_set_mtime*(ip) to + dnl # read/write. 
+ dnl # + ZFS_LINUX_TEST_SRC([inode_get_atime], [ + #include + ],[ + struct inode ip; + + memset(&ip, 0, sizeof(ip)); + inode_get_atime(&ip); + ]) + ZFS_LINUX_TEST_SRC([inode_get_mtime], [ + #include + ],[ + struct inode ip; + + memset(&ip, 0, sizeof(ip)); + inode_get_mtime(&ip); + ]) + + ZFS_LINUX_TEST_SRC([inode_set_atime_to_ts], [ + #include + ],[ + struct inode ip; + struct timespec64 ts = {0}; + + memset(&ip, 0, sizeof(ip)); + inode_set_atime_to_ts(&ip, ts); + ]) + ZFS_LINUX_TEST_SRC([inode_set_mtime_to_ts], [ + #include + ],[ + struct inode ip; + struct timespec64 ts = {0}; + + memset(&ip, 0, sizeof(ip)); + inode_set_mtime_to_ts(&ip, ts); + ]) ]) AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [ @@ -90,4 +132,40 @@ AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [ ],[ AC_MSG_RESULT(no) ]) + + AC_MSG_CHECKING([whether inode_get_atime() exists]) + ZFS_LINUX_TEST_RESULT([inode_get_atime], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_GET_ATIME, 1, + [inode_get_atime() exists in linux/fs.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([whether inode_set_atime_to_ts() exists]) + ZFS_LINUX_TEST_RESULT([inode_set_atime_to_ts], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_SET_ATIME_TO_TS, 1, + [inode_set_atime_to_ts() exists in linux/fs.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([whether inode_get_mtime() exists]) + ZFS_LINUX_TEST_RESULT([inode_get_mtime], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_GET_MTIME, 1, + [inode_get_mtime() exists in linux/fs.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([whether inode_set_mtime_to_ts() exists]) + ZFS_LINUX_TEST_RESULT([inode_set_mtime_to_ts], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_SET_MTIME_TO_TS, 1, + [inode_set_mtime_to_ts() exists in linux/fs.h]) + ],[ + AC_MSG_RESULT(no) + ]) ]) diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index 9b729be6d7..91a4751fff 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -273,5 +273,25 @@ extern long zpl_ioctl_fideduperange(struct file *filp, void *arg); #else #define zpl_inode_set_ctime_to_ts(ip, ts) (ip->i_ctime = ts) #endif +#ifdef HAVE_INODE_GET_ATIME +#define zpl_inode_get_atime(ip) inode_get_atime(ip) +#else +#define zpl_inode_get_atime(ip) (ip->i_atime) +#endif +#ifdef HAVE_INODE_SET_ATIME_TO_TS +#define zpl_inode_set_atime_to_ts(ip, ts) inode_set_atime_to_ts(ip, ts) +#else +#define zpl_inode_set_atime_to_ts(ip, ts) (ip->i_atime = ts) +#endif +#ifdef HAVE_INODE_GET_MTIME +#define zpl_inode_get_mtime(ip) inode_get_mtime(ip) +#else +#define zpl_inode_get_mtime(ip) (ip->i_mtime) +#endif +#ifdef HAVE_INODE_SET_MTIME_TO_TS +#define zpl_inode_set_mtime_to_ts(ip, ts) inode_set_mtime_to_ts(ip, ts) +#else +#define zpl_inode_set_mtime_to_ts(ip, ts) (ip->i_mtime = ts) +#endif #endif /* _SYS_ZPL_H */ diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 94e25fa0ae..54ed70d039 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -520,8 +520,8 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, ip->i_uid = SUID_TO_KUID(0); ip->i_gid = SGID_TO_KGID(0); ip->i_blkbits = SPA_MINBLOCKSHIFT; - ip->i_atime = now; - ip->i_mtime = now; + zpl_inode_set_atime_to_ts(ip, now); + zpl_inode_set_mtime_to_ts(ip, now); zpl_inode_set_ctime_to_ts(ip, now); ip->i_fop = fops; ip->i_op = ops; diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index b464f615cd..65d1d786ae 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c 
@@ -2435,15 +2435,17 @@ top: if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { zp->z_atime_dirty = B_FALSE; - ZFS_TIME_ENCODE(&ip->i_atime, atime); + inode_timespec_t tmp_atime; + ZFS_TIME_ENCODE(&tmp_atime, atime); + zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, sizeof (atime)); } if (mask & (ATTR_MTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); - ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate( - vap->va_mtime, ZTOI(zp)); + zpl_inode_set_mtime_to_ts(ZTOI(zp), + zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp))); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); @@ -3657,7 +3659,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, caddr_t va; int err = 0; uint64_t mtime[2], ctime[2]; - inode_timespec_t tmp_ctime; + inode_timespec_t tmp_ts; sa_bulk_attr_t bulk[3]; int cnt = 0; struct address_space *mapping; @@ -3821,9 +3823,10 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, &zp->z_pflags, 8); /* Preserve the mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - tmp_ctime = zpl_inode_get_ctime(ip); - ZFS_TIME_ENCODE(&tmp_ctime, ctime); + tmp_ts = zpl_inode_get_mtime(ip); + ZFS_TIME_ENCODE(&tmp_ts, mtime); + tmp_ts = zpl_inode_get_ctime(ip); + ZFS_TIME_ENCODE(&tmp_ts, ctime); zp->z_atime_dirty = B_FALSE; zp->z_seq++; @@ -3873,7 +3876,7 @@ zfs_dirty_inode(struct inode *ip, int flags) zfsvfs_t *zfsvfs = ITOZSB(ip); dmu_tx_t *tx; uint64_t mode, atime[2], mtime[2], ctime[2]; - inode_timespec_t tmp_ctime; + inode_timespec_t tmp_ts; sa_bulk_attr_t bulk[4]; int error = 0; int cnt = 0; @@ -3918,10 +3921,12 @@ zfs_dirty_inode(struct inode *ip, int flags) SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); /* Preserve the mode, mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_atime, atime); - ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - tmp_ctime = zpl_inode_get_ctime(ip); - ZFS_TIME_ENCODE(&tmp_ctime, ctime); + tmp_ts = zpl_inode_get_atime(ip); + ZFS_TIME_ENCODE(&tmp_ts, atime); + tmp_ts = zpl_inode_get_mtime(ip); + ZFS_TIME_ENCODE(&tmp_ts, mtime); + tmp_ts = zpl_inode_get_ctime(ip); + ZFS_TIME_ENCODE(&tmp_ts, ctime); mode = ip->i_mode; zp->z_mode = mode; @@ -3964,7 +3969,9 @@ zfs_inactive(struct inode *ip) if (error) { dmu_tx_abort(tx); } else { - ZFS_TIME_ENCODE(&ip->i_atime, atime); + inode_timespec_t tmp_atime; + tmp_atime = zpl_inode_get_atime(ip); + ZFS_TIME_ENCODE(&tmp_atime, atime); mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&atime, sizeof (atime), tx); diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index f71026da83..b99df188c6 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -542,7 +542,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, uint64_t links; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; - inode_timespec_t tmp_ctime; + inode_timespec_t tmp_ts; uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[12]; int count = 0; @@ -614,10 +614,12 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, if (zp->z_pflags & ZFS_XATTR) zp->z_xattr_parent = parent; - ZFS_TIME_DECODE(&ip->i_atime, atime); - ZFS_TIME_DECODE(&ip->i_mtime, mtime); - ZFS_TIME_DECODE(&tmp_ctime, ctime); - zpl_inode_set_ctime_to_ts(ip, tmp_ctime); + ZFS_TIME_DECODE(&tmp_ts, atime); + zpl_inode_set_atime_to_ts(ip, tmp_ts); + 
ZFS_TIME_DECODE(&tmp_ts, mtime); + zpl_inode_set_mtime_to_ts(ip, tmp_ts); + ZFS_TIME_DECODE(&tmp_ts, ctime); + zpl_inode_set_ctime_to_ts(ip, tmp_ts); ZFS_TIME_DECODE(&zp->z_btime, btime); ip->i_ino = zp->z_id; @@ -1197,7 +1199,7 @@ zfs_rezget(znode_t *zp) uint64_t gen; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; - inode_timespec_t tmp_ctime; + inode_timespec_t tmp_ts; uint64_t projid = ZFS_DEFAULT_PROJID; znode_hold_t *zh; @@ -1290,10 +1292,12 @@ zfs_rezget(znode_t *zp) zfs_uid_write(ZTOI(zp), z_uid); zfs_gid_write(ZTOI(zp), z_gid); - ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); - ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); - ZFS_TIME_DECODE(&tmp_ctime, ctime); - zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime); + ZFS_TIME_DECODE(&tmp_ts, atime); + zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts); + ZFS_TIME_DECODE(&tmp_ts, mtime); + zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts); + ZFS_TIME_DECODE(&tmp_ts, ctime); + zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts); ZFS_TIME_DECODE(&zp->z_btime, btime); if ((uint32_t)gen != ZTOI(zp)->i_generation) { @@ -1401,22 +1405,24 @@ zfs_zinactive(znode_t *zp) boolean_t zfs_relatime_need_update(const struct inode *ip) { - inode_timespec_t now, tmp_ctime; + inode_timespec_t now, tmp_atime, tmp_ts; gethrestime(&now); + tmp_atime = zpl_inode_get_atime(ip); /* * In relatime mode, only update the atime if the previous atime * is earlier than either the ctime or mtime or if at least a day * has passed since the last update of atime. */ - if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0) + tmp_ts = zpl_inode_get_mtime(ip); + if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0) return (B_TRUE); - tmp_ctime = zpl_inode_get_ctime(ip); - if (zfs_compare_timespec(&tmp_ctime, &ip->i_atime) >= 0) + tmp_ts = zpl_inode_get_ctime(ip); + if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0) return (B_TRUE); - if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60) + if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60) return (B_TRUE); return (B_FALSE); @@ -1439,7 +1445,7 @@ void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { - inode_timespec_t now, tmp_ctime; + inode_timespec_t now, tmp_ts; gethrestime(&now); @@ -1447,7 +1453,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], if (flag & ATTR_MTIME) { ZFS_TIME_ENCODE(&now, mtime); - ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime); + ZFS_TIME_DECODE(&tmp_ts, mtime); + zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts); if (ZTOZSB(zp)->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); @@ -1456,8 +1463,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], if (flag & ATTR_CTIME) { ZFS_TIME_ENCODE(&now, ctime); - ZFS_TIME_DECODE(&tmp_ctime, ctime); - zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime); + ZFS_TIME_DECODE(&tmp_ts, ctime); + zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts); if (ZTOZSB(zp)->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c index 96f65b9e94..ad1753f7a0 100644 --- a/module/os/linux/zfs/zpl_inode.c +++ b/module/os/linux/zfs/zpl_inode.c @@ -526,7 +526,8 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) vap->va_ctime = ia->ia_ctime; if (vap->va_mask & ATTR_ATIME) - ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip); + zpl_inode_set_atime_to_ts(ip, + zpl_inode_timestamp_truncate(ia->ia_atime, ip)); cookie = spl_fstrans_mark(); #ifdef HAVE_USERNS_IOPS_SETATTR From 
18a9185165e2713e690e52347a37de1878e2a9fc Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 16 Dec 2023 17:39:07 +1100 Subject: [PATCH 03/31] linux 6.7 compat: handle superblock shrinker member change In 6.7 the superblock shrinker member s_shrink has changed from being an embedded struct to a pointer. Detect this, and don't take a reference if it already is one. Signed-off-by: Rob Norris Sponsored-by: https://github.com/sponsors/robn --- config/kernel-shrink.m4 | 35 +++++++++++++++++++++++++++++++- module/os/linux/zfs/zfs_vfsops.c | 10 +++++++-- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/config/kernel-shrink.m4 b/config/kernel-shrink.m4 index 0c702153e8..1c5f753d41 100644 --- a/config/kernel-shrink.m4 +++ b/config/kernel-shrink.m4 @@ -19,12 +19,44 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK], [ ],[]) ]) +dnl # +dnl # 6.7 API change +dnl # s_shrink is now a pointer. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK_PTR], [ + ZFS_LINUX_TEST_SRC([super_block_s_shrink_ptr], [ + #include + unsigned long shrinker_cb(struct shrinker *shrink, + struct shrink_control *sc) { return 0; } + static struct shrinker shrinker = { + .count_objects = shrinker_cb, + .scan_objects = shrinker_cb, + .seeks = DEFAULT_SEEKS, + }; + static const struct super_block + sb __attribute__ ((unused)) = { + .s_shrink = &shrinker, + }; + ],[]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SUPER_BLOCK_S_SHRINK], [ AC_MSG_CHECKING([whether super_block has s_shrink]) ZFS_LINUX_TEST_RESULT([super_block_s_shrink], [ AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SUPER_BLOCK_S_SHRINK, 1, + [have super_block s_shrink]) ],[ - ZFS_LINUX_TEST_ERROR([sb->s_shrink()]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether super_block has s_shrink pointer]) + ZFS_LINUX_TEST_RESULT([super_block_s_shrink_ptr], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SUPER_BLOCK_S_SHRINK_PTR, 1, + [have super_block s_shrink pointer]) + ],[ + AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([sb->s_shrink()]) + ]) ]) ]) @@ -174,6 +206,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SHRINK_CONTROL_STRUCT], [ AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER], [ ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK + ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK_PTR ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_HAS_NID ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_STRUCT diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 2792bc0272..2015c20d73 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1240,12 +1240,18 @@ zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) * and inode caches. This can occur when the ARC needs to free meta data * blocks but can't because they are all pinned by entries in these caches. 
*/ +#if defined(HAVE_SUPER_BLOCK_S_SHRINK) +#define S_SHRINK(sb) (&(sb)->s_shrink) +#elif defined(HAVE_SUPER_BLOCK_S_SHRINK_PTR) +#define S_SHRINK(sb) ((sb)->s_shrink) +#endif + int zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) { zfsvfs_t *zfsvfs = sb->s_fs_info; int error = 0; - struct shrinker *shrinker = &sb->s_shrink; + struct shrinker *shrinker = S_SHRINK(sb); struct shrink_control sc = { .nr_to_scan = nr_to_scan, .gfp_mask = GFP_KERNEL, @@ -1257,7 +1263,7 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) #if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \ defined(SHRINK_CONTROL_HAS_NID) && \ defined(SHRINKER_NUMA_AWARE) - if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) { + if (shrinker->flags & SHRINKER_NUMA_AWARE) { *objects = 0; for_each_online_node(sc.nid) { *objects += (*shrinker->scan_objects)(shrinker, &sc); From 03b84099d9c4d3f1b4d1b123abc967e81f6d15db Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 17 Dec 2023 00:36:21 +1100 Subject: [PATCH 04/31] linux 6.7 compat: rework shrinker setup for heap allocations 6.7 changes the shrinker API such that shrinkers must be allocated dynamically by the kernel. To accomodate this, this commit reworks spl_register_shrinker() to do something similar against earlier kernels. Signed-off-by: Rob Norris Sponsored-by: https://github.com/sponsors/robn --- config/kernel-shrink.m4 | 52 +++++++++++-- include/os/linux/spl/sys/shrinker.h | 66 +++++----------- module/Kbuild.in | 1 + module/os/linux/spl/spl-shrinker.c | 115 ++++++++++++++++++++++++++++ module/os/linux/zfs/arc_os.c | 11 ++- 5 files changed, 189 insertions(+), 56 deletions(-) create mode 100644 module/os/linux/spl/spl-shrinker.c diff --git a/config/kernel-shrink.m4 b/config/kernel-shrink.m4 index 1c5f753d41..4a529c43b5 100644 --- a/config/kernel-shrink.m4 +++ b/config/kernel-shrink.m4 @@ -128,6 +128,25 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK], [ ]) ]) +dnl # +dnl # 6.7 API change +dnl # register_shrinker has been replaced by shrinker_register. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER_REGISTER], [ + ZFS_LINUX_TEST_SRC([shrinker_register], [ + #include + unsigned long shrinker_cb(struct shrinker *shrink, + struct shrink_control *sc) { return 0; } + ],[ + struct shrinker cache_shrinker = { + .count_objects = shrinker_cb, + .scan_objects = shrinker_cb, + .seeks = DEFAULT_SEEKS, + }; + shrinker_register(&cache_shrinker); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SHRINKER_CALLBACK],[ dnl # dnl # 6.0 API change @@ -165,14 +184,36 @@ AC_DEFUN([ZFS_AC_KERNEL_SHRINKER_CALLBACK],[ dnl # cs->shrink() is logically split in to dnl # cs->count_objects() and cs->scan_objects() dnl # - AC_MSG_CHECKING([if cs->count_objects callback exists]) + AC_MSG_CHECKING( + [whether cs->count_objects callback exists]) ZFS_LINUX_TEST_RESULT( - [shrinker_cb_shrink_control_split],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, 1, - [cs->count_objects exists]) + [shrinker_cb_shrink_control_split],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, 1, + [cs->count_objects exists]) ],[ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING( + [whether shrinker_register exists]) + ZFS_LINUX_TEST_RESULT([shrinker_register], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SHRINKER_REGISTER, 1, + [shrinker_register exists]) + + dnl # We assume that the split shrinker + dnl # callback exists if + dnl # shrinker_register() exists, + dnl # because the latter is a much more + dnl # recent addition, and the macro + dnl # test for shrinker_register() only + dnl # works if the callback is split + AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, + 1, [cs->count_objects exists]) + ],[ + AC_MSG_RESULT(no) ZFS_LINUX_TEST_ERROR([shrinker]) + ]) ]) ]) ]) @@ -211,6 +252,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER], [ ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_STRUCT ZFS_AC_KERNEL_SRC_REGISTER_SHRINKER_VARARG + ZFS_AC_KERNEL_SRC_SHRINKER_REGISTER ]) AC_DEFUN([ZFS_AC_KERNEL_SHRINKER], [ diff --git a/include/os/linux/spl/sys/shrinker.h b/include/os/linux/spl/sys/shrinker.h index d472754be4..bca4c85069 100644 --- a/include/os/linux/spl/sys/shrinker.h +++ b/include/os/linux/spl/sys/shrinker.h @@ -29,12 +29,13 @@ /* * Due to frequent changes in the shrinker API the following - * compatibility wrappers should be used. They are as follows: + * compatibility wrapper should be used. * - * SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost); + * shrinker = spl_register_shrinker(name, countfunc, scanfunc, seek_cost); + * spl_unregister_shrinker(shrinker); * - * SPL_SHRINKER_DECLARE is used to declare a shrinker with the name varname, - * which is passed to spl_register_shrinker()/spl_unregister_shrinker(). + * spl_register_shrinker is used to create and register a shrinker with the + * given name. * The countfunc returns the number of free-able objects. * The scanfunc returns the number of objects that were freed. * The callbacks can return SHRINK_STOP if further calls can't make any more @@ -57,57 +58,28 @@ * ...scan objects in the cache and reclaim them... 
* } * - * SPL_SHRINKER_DECLARE(my_shrinker, my_count, my_scan, DEFAULT_SEEKS); + * static struct shrinker *my_shrinker; * * void my_init_func(void) { - * spl_register_shrinker(&my_shrinker); + * my_shrinker = spl_register_shrinker("my-shrinker", + * my_count, my_scan, DEFAULT_SEEKS); + * } + * + * void my_fini_func(void) { + * spl_unregister_shrinker(my_shrinker); * } */ -#ifdef HAVE_REGISTER_SHRINKER_VARARG -#define spl_register_shrinker(x) register_shrinker(x, "zfs-arc-shrinker") -#else -#define spl_register_shrinker(x) register_shrinker(x) -#endif -#define spl_unregister_shrinker(x) unregister_shrinker(x) +typedef unsigned long (*spl_shrinker_cb) + (struct shrinker *, struct shrink_control *); -/* - * Linux 3.0 to 3.11 Shrinker API Compatibility. - */ -#if defined(HAVE_SINGLE_SHRINKER_CALLBACK) -#define SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost) \ -static int \ -__ ## varname ## _wrapper(struct shrinker *shrink, struct shrink_control *sc)\ -{ \ - if (sc->nr_to_scan != 0) { \ - (void) scanfunc(shrink, sc); \ - } \ - return (countfunc(shrink, sc)); \ -} \ - \ -static struct shrinker varname = { \ - .shrink = __ ## varname ## _wrapper, \ - .seeks = seek_cost, \ -} +struct shrinker *spl_register_shrinker(const char *name, + spl_shrinker_cb countfunc, spl_shrinker_cb scanfunc, int seek_cost); +void spl_unregister_shrinker(struct shrinker *); +#ifndef SHRINK_STOP +/* 3.0-3.11 compatibility */ #define SHRINK_STOP (-1) - -/* - * Linux 3.12 and later Shrinker API Compatibility. - */ -#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) -#define SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost) \ -static struct shrinker varname = { \ - .count_objects = countfunc, \ - .scan_objects = scanfunc, \ - .seeks = seek_cost, \ -} - -#else -/* - * Linux 2.x to 2.6.22, or a newer shrinker API has been introduced. - */ -#error "Unknown shrinker callback" #endif #endif /* SPL_SHRINKER_H */ diff --git a/module/Kbuild.in b/module/Kbuild.in index b9c284a244..f1a145779d 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -79,6 +79,7 @@ SPL_OBJS := \ spl-kstat.o \ spl-proc.o \ spl-procfs-list.o \ + spl-shrinker.o \ spl-taskq.o \ spl-thread.o \ spl-trace.o \ diff --git a/module/os/linux/spl/spl-shrinker.c b/module/os/linux/spl/spl-shrinker.c new file mode 100644 index 0000000000..d5c8da471c --- /dev/null +++ b/module/os/linux/spl/spl-shrinker.c @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + * + * Solaris Porting Layer (SPL) Shrinker Implementation. 
+ */ + +#include +#include + +#ifdef HAVE_SINGLE_SHRINKER_CALLBACK +/* 3.0-3.11: single shrink() callback, which we wrap to carry both functions */ +struct spl_shrinker_wrap { + struct shrinker shrinker; + spl_shrinker_cb countfunc; + spl_shrinker_cb scanfunc; +}; + +static int +spl_shrinker_single_cb(struct shrinker *shrinker, struct shrink_control *sc) +{ + struct spl_shrinker_wrap *sw = (struct spl_shrinker_wrap *)shrinker; + + if (sc->nr_to_scan != 0) + (void) sw->scanfunc(&sw->shrinker, sc); + return (sw->countfunc(&sw->shrinker, sc)); +} +#endif + +struct shrinker * +spl_register_shrinker(const char *name, spl_shrinker_cb countfunc, + spl_shrinker_cb scanfunc, int seek_cost) +{ + struct shrinker *shrinker; + + /* allocate shrinker */ +#if defined(HAVE_SHRINKER_REGISTER) + /* 6.7: kernel will allocate the shrinker for us */ + shrinker = shrinker_alloc(0, name); +#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) + /* 3.12-6.6: we allocate the shrinker */ + shrinker = kmem_zalloc(sizeof (struct shrinker), KM_SLEEP); +#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK) + /* 3.0-3.11: allocate a wrapper */ + struct spl_shrinker_wrap *sw = + kmem_zalloc(sizeof (struct spl_shrinker_wrap), KM_SLEEP); + shrinker = &sw->shrinker; +#else + /* 2.x-2.6.22, or a newer shrinker API has been introduced. */ +#error "Unknown shrinker API" +#endif + + if (shrinker == NULL) + return (NULL); + + /* set callbacks */ +#ifdef HAVE_SINGLE_SHRINKER_CALLBACK + sw->countfunc = countfunc; + sw->scanfunc = scanfunc; + shrinker->shrink = spl_shrinker_single_cb; +#else + shrinker->count_objects = countfunc; + shrinker->scan_objects = scanfunc; +#endif + + /* set params */ + shrinker->seeks = seek_cost; + + /* register with kernel */ +#if defined(HAVE_SHRINKER_REGISTER) + shrinker_register(shrinker); +#elif defined(HAVE_REGISTER_SHRINKER_VARARG) + register_shrinker(shrinker, name); +#else + register_shrinker(shrinker); +#endif + + return (shrinker); +} +EXPORT_SYMBOL(spl_register_shrinker); + +void +spl_unregister_shrinker(struct shrinker *shrinker) +{ +#if defined(HAVE_SHRINKER_REGISTER) + shrinker_free(shrinker); +#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) + unregister_shrinker(shrinker); + kmem_free(shrinker, sizeof (struct shrinker)); +#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK) + unregister_shrinker(shrinker); + kmem_free(shrinker, sizeof (struct spl_shrinker_wrap)); +#else +#error "Unknown shrinker API" +#endif +} +EXPORT_SYMBOL(spl_unregister_shrinker); diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index 43ed087e2d..1fa9f3eb3f 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -247,8 +247,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) return (sc->nr_to_scan); } -SPL_SHRINKER_DECLARE(arc_shrinker, - arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS); +static struct shrinker *arc_shrinker = NULL; int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) @@ -351,14 +350,18 @@ arc_lowmem_init(void) * reclaim from the arc. This is done to prevent kswapd from * swapping out pages when it is preferable to shrink the arc. 
*/ - spl_register_shrinker(&arc_shrinker); + arc_shrinker = spl_register_shrinker("zfs-arc-shrinker", + arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS); + VERIFY(arc_shrinker); + arc_set_sys_free(allmem); } void arc_lowmem_fini(void) { - spl_unregister_shrinker(&arc_shrinker); + spl_unregister_shrinker(arc_shrinker); + arc_shrinker = NULL; } int From 3c502e376b77b463501c2c218f286a1c2735afb4 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 8 Dec 2023 17:31:31 -0800 Subject: [PATCH 05/31] ZTS: Disable io_uring test on CentOS 9 The io_uring test fails on CentOS 9 with the following fio error. Disable the test for the benefit of the CI until this can be fully investigated. This basic test passes as expected on newer kernels. Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #15636 --- tests/zfs-tests/tests/functional/io/io_uring.ksh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/zfs-tests/tests/functional/io/io_uring.ksh b/tests/zfs-tests/tests/functional/io/io_uring.ksh index 47e439d0f4..2fa1465563 100755 --- a/tests/zfs-tests/tests/functional/io/io_uring.ksh +++ b/tests/zfs-tests/tests/functional/io/io_uring.ksh @@ -44,6 +44,13 @@ if ! $(grep -q "CONFIG_IO_URING=y" /boot/config-$(uname -r)); then log_unsupported "Requires io_uring support" fi +if [ -e /etc/os-release ] ; then + source /etc/os-release + if [ -n "$REDHAT_SUPPORT_PRODUCT_VERSION" ] && ((floor($REDHAT_SUPPORT_PRODUCT_VERSION) == 9)) ; then + log_unsupported "Disabled on CentOS 9, fails with 'Operation not permitted'" + fi +fi + fio --ioengine=io_uring --parse-only || log_unsupported "fio io_uring support required" function cleanup From d530d5d8a567c0cf64a434f0303929dc0bb338da Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 21 Dec 2023 11:22:56 -0800 Subject: [PATCH 06/31] Linux 6.5 compat: check BLK_OPEN_EXCL is defined On some systems we already have blkdev_get_by_path() with 4 args but still the old FMODE_EXCL and not BLK_OPEN_EXCL defined. The vdev_bdev_mode() function was added to handle this case but there was no generic way to specify exclusive access. 
Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #15692 --- module/os/linux/zfs/vdev_disk.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 48ac55f070..8b5aa94fe4 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -85,7 +85,7 @@ static blk_mode_t #else static fmode_t #endif -vdev_bdev_mode(spa_mode_t spa_mode) +vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive) { #ifdef HAVE_BLK_MODE_T blk_mode_t mode = 0; @@ -95,6 +95,9 @@ vdev_bdev_mode(spa_mode_t spa_mode) if (spa_mode & SPA_MODE_WRITE) mode |= BLK_OPEN_WRITE; + + if (exclusive) + mode |= BLK_OPEN_EXCL; #else fmode_t mode = 0; @@ -103,6 +106,9 @@ vdev_bdev_mode(spa_mode_t spa_mode) if (spa_mode & SPA_MODE_WRITE) mode |= FMODE_WRITE; + + if (exclusive) + mode |= FMODE_EXCL; #endif return (mode); @@ -225,10 +231,10 @@ vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder, { #ifdef HAVE_BLKDEV_GET_BY_PATH_4ARG return (blkdev_get_by_path(path, - vdev_bdev_mode(mode) | BLK_OPEN_EXCL, holder, hops)); + vdev_bdev_mode(mode, B_TRUE), holder, hops)); #else return (blkdev_get_by_path(path, - vdev_bdev_mode(mode) | FMODE_EXCL, holder)); + vdev_bdev_mode(mode, B_TRUE), holder)); #endif } @@ -238,7 +244,7 @@ vdev_blkdev_put(struct block_device *bdev, spa_mode_t mode, void *holder) #ifdef HAVE_BLKDEV_PUT_HOLDER return (blkdev_put(bdev, holder)); #else - return (blkdev_put(bdev, vdev_bdev_mode(mode) | FMODE_EXCL)); + return (blkdev_put(bdev, vdev_bdev_mode(mode, B_TRUE))); #endif } @@ -248,9 +254,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, { struct block_device *bdev; #ifdef HAVE_BLK_MODE_T - blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); + blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); #else - fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); + fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); #endif hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; From db2db50e370162e0d94e21fcff0be0891c02e99e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 25 Oct 2023 15:11:37 +1100 Subject: [PATCH 07/31] spa: make read/write queues configurable We are finding that as customers get larger and faster machines (hundreds of cores, large NVMe-backed pools) they keep hitting relatively low performance ceilings. Our profiling work almost always finds that they're running into bottlenecks on the SPA IO taskqs. Unfortunately there's often little we can advise at that point, because there's very few ways to change behaviour without patching. This commit adds two load-time parameters `zio_taskq_read` and `zio_taskq_write` that can configure the READ and WRITE IO taskqs directly. This achieves two goals: it gives operators (and those that support them) a way to tune things without requiring a custom build of OpenZFS, which is often not possible, and it lets us easily try different config variations in a variety of environments to inform the development of better defaults for these kind of systems. Because tuning the IO taskqs really requires a fairly deep understanding of how IO in ZFS works, and generally isn't needed without a pretty serious workload and an ability to identify bottlenecks, only minimal documentation is provided. Its expected that anyone using this is going to have the source code there as well. Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. 
Sponsored-by: Wasabi Technology, Inc. --- man/man4/zfs.4 | 10 ++ module/zfs/spa.c | 297 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 306 insertions(+), 1 deletion(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 4ec52a2fb6..c12ef1387c 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2280,6 +2280,16 @@ If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. . +.It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp +Set the queue and thread configuration for the IO read queues. +This is an advanced debugging parameter. +Don't change this unless you understand what it does. +. +.It Sy zio_taskq_write Ns = Ns Sy batch fixed,1,5 scale fixed,1,5 Pq charp +Set the queue and thread configuration for the IO write queues. +This is an advanced debugging parameter. +Don't change this unless you understand what it does. +. .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint Do not create zvol device nodes. This may slightly improve startup time on diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 1410651c63..32a5852921 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -151,7 +151,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * need to be handled with minimum delay. */ -static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { +static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ @@ -1164,6 +1164,292 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) tqs->stqs_taskq = NULL; } +#ifdef _KERNEL +/* + * The READ and WRITE rows of zio_taskqs are configurable at module load time + * by setting zio_taskq_read or zio_taskq_write. + * + * Example (the defaults for READ and WRITE) + * zio_taskq_read='fixed,1,8 null scale null' + * zio_taskq_write='batch fixed,1,5 scale fixed,1,5' + * + * Each sets the entire row at a time. + * + * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number + * of threads per taskq. + * + * 'null' can only be set on the high-priority queues (queue selection for + * high-priority queues will fall back to the regular queue if the high-pri + * is NULL. + */ +static const char *const modes[ZTI_NMODES] = { + "fixed", "batch", "scale", "null" +}; + +/* Parse the incoming config string. Modifies cfg */ +static int +spa_taskq_param_set(zio_type_t t, char *cfg) +{ + int err = 0; + + zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; + + char *next = cfg, *tok, *c; + + /* + * Parse out each element from the string and fill `row`. The entire + * row has to be set at once, so any errors are flagged by just + * breaking out of this loop early. 
+ */ + uint_t q; + for (q = 0; q < ZIO_TASKQ_TYPES; q++) { + /* `next` is the start of the config */ + if (next == NULL) + break; + + /* Eat up leading space */ + while (isspace(*next)) + next++; + if (*next == '\0') + break; + + /* Mode ends at space or end of string */ + tok = next; + next = strchr(tok, ' '); + if (next != NULL) *next++ = '\0'; + + /* Parameters start after a comma */ + c = strchr(tok, ','); + if (c != NULL) *c++ = '\0'; + + /* Match mode string */ + uint_t mode; + for (mode = 0; mode < ZTI_NMODES; mode++) + if (strcmp(tok, modes[mode]) == 0) + break; + if (mode == ZTI_NMODES) + break; + + /* Invalid canary */ + row[q].zti_mode = ZTI_NMODES; + + /* Per-mode setup */ + switch (mode) { + + /* + * FIXED is parameterised: number of queues, and number of + * threads per queue. + */ + case ZTI_MODE_FIXED: { + /* No parameters? */ + if (c == NULL || *c == '\0') + break; + + /* Find next parameter */ + tok = c; + c = strchr(tok, ','); + if (c == NULL) + break; + + /* Take digits and convert */ + unsigned long long nq; + if (!(isdigit(*tok))) + break; + err = ddi_strtoull(tok, &tok, 10, &nq); + /* Must succeed and also end at the next param sep */ + if (err != 0 || tok != c) + break; + + /* Move past the comma */ + tok++; + /* Need another number */ + if (!(isdigit(*tok))) + break; + /* Remember start to make sure we moved */ + c = tok; + + /* Take digits */ + unsigned long long ntpq; + err = ddi_strtoull(tok, &tok, 10, &ntpq); + /* Must succeed, and moved forward */ + if (err != 0 || tok == c || *tok != '\0') + break; + + /* + * sanity; zero queues/threads make no sense, and + * 16K is almost certainly more than anyone will ever + * need and avoids silly numbers like UINT32_MAX + */ + if (nq == 0 || nq >= 16384 || + ntpq == 0 || ntpq >= 16384) + break; + + const zio_taskq_info_t zti = ZTI_P(ntpq, nq); + row[q] = zti; + break; + } + + case ZTI_MODE_BATCH: { + const zio_taskq_info_t zti = ZTI_BATCH; + row[q] = zti; + break; + } + + case ZTI_MODE_SCALE: { + const zio_taskq_info_t zti = ZTI_SCALE; + row[q] = zti; + break; + } + + case ZTI_MODE_NULL: { + /* + * Can only null the high-priority queues; the general- + * purpose ones have to exist. + */ + if (q != ZIO_TASKQ_ISSUE_HIGH && + q != ZIO_TASKQ_INTERRUPT_HIGH) + break; + + const zio_taskq_info_t zti = ZTI_NULL; + row[q] = zti; + break; + } + + default: + break; + } + + /* Ensure we set a mode */ + if (row[q].zti_mode == ZTI_NMODES) + break; + } + + /* Didn't get a full row, fail */ + if (q < ZIO_TASKQ_TYPES) + return (SET_ERROR(EINVAL)); + + /* Eat trailing space */ + if (next != NULL) + while (isspace(*next)) + next++; + + /* If there's anything left over then fail */ + if (next != NULL && *next != '\0') + return (SET_ERROR(EINVAL)); + + /* Success! 
Copy it into the real config */ + for (q = 0; q < ZIO_TASKQ_TYPES; q++) + zio_taskqs[t][q] = row[q]; + + return (0); +} + +static int +spa_taskq_param_get(zio_type_t t, char *buf) +{ + int pos = 0; + + /* Build paramater string from live config */ + const char *sep = ""; + for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { + const zio_taskq_info_t *zti = &zio_taskqs[t][q]; + if (zti->zti_mode == ZTI_MODE_FIXED) + pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, + modes[zti->zti_mode], zti->zti_count, + zti->zti_value); + else + pos += sprintf(&buf[pos], "%s%s", sep, + modes[zti->zti_mode]); + sep = " "; + } + + buf[pos++] = '\n'; + buf[pos] = '\0'; + + return (pos); +} + +#ifdef __linux__ +static int +spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_READ, buf)); +} + +static int +spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf)); +} +#else +#include + +/* + * On FreeBSD load-time parameters can be set up before malloc() is available, + * so we have to do all the parsing work on the stack. + */ +#define SPA_TASKQ_PARAM_MAX (128) + +static int +spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err = 0; + + if (req->newptr == NULL) { + int len = spa_taskq_param_get(ZIO_TYPE_READ, buf); + struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); + sbuf_cpy(s, buf); + err = sbuf_finish(s); + sbuf_delete(s); + return (err); + } + + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); +} + +static int +spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err = 0; + + if (req->newptr == NULL) { + int len = spa_taskq_param_get(ZIO_TYPE_WRITE, buf); + struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); + sbuf_cpy(s, buf); + err = sbuf_finish(s); + sbuf_delete(s); + return (err); + } + + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); +} +#endif +#endif /* _KERNEL */ + /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 
* Note that a type may have multiple discrete taskqs to avoid lock contention @@ -10210,4 +10496,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); + +#ifdef _KERNEL +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, + spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD, + "Configure IO queues for read IO"); +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, + spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD, + "Configure IO queues for write IO"); +#endif /* END CSTYLED */ From 2a59b6bfa96648bc2c8c83eed0e026010e8da864 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 24 Oct 2023 17:33:58 -0400 Subject: [PATCH 08/31] ABD: Be more assertive in iterators Once we verified the ABDs and asserted the sizes we should never see premature ABDs ends. Assert that and remove extra branches from production builds. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15428 --- module/zfs/abd.c | 104 ++++++++++++----------------------------------- 1 file changed, 26 insertions(+), 78 deletions(-) diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 745ee8f02e..d982f201c9 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -802,13 +802,10 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); - boolean_t gang = abd_is_gang(abd); abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); while (size > 0) { - /* If we are at the end of the gang ABD we are done */ - if (gang && !c_abd) - break; + IMPLY(abd_is_gang(abd), c_abd != NULL); abd_iter_map(&aiter); @@ -930,7 +927,6 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, { int ret = 0; struct abd_iter daiter, saiter; - boolean_t dabd_is_gang_abd, sabd_is_gang_abd; abd_t *c_dabd, *c_sabd; if (size == 0) @@ -942,16 +938,12 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, ASSERT3U(doff + size, <=, dabd->abd_size); ASSERT3U(soff + size, <=, sabd->abd_size); - dabd_is_gang_abd = abd_is_gang(dabd); - sabd_is_gang_abd = abd_is_gang(sabd); c_dabd = abd_init_abd_iter(dabd, &daiter, doff); c_sabd = abd_init_abd_iter(sabd, &saiter, soff); while (size > 0) { - /* if we are at the end of the gang ABD we are done */ - if ((dabd_is_gang_abd && !c_dabd) || - (sabd_is_gang_abd && !c_sabd)) - break; + IMPLY(abd_is_gang(dabd), c_dabd != NULL); + IMPLY(abd_is_gang(sabd), c_sabd != NULL); abd_iter_map(&daiter); abd_iter_map(&saiter); @@ -1032,66 +1024,40 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, int i; ssize_t len, dlen; struct abd_iter caiters[3]; - struct abd_iter daiter = {0}; + struct abd_iter daiter; void *caddrs[3]; unsigned long flags __maybe_unused = 0; abd_t *c_cabds[3]; abd_t *c_dabd = NULL; - boolean_t cabds_is_gang_abd[3]; - boolean_t dabd_is_gang_abd = B_FALSE; ASSERT3U(parity, <=, 3); - for (i = 0; i < parity; i++) { - cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); + abd_verify(cabds[i]); + ASSERT3U(csize, <=, cabds[i]->abd_size); c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0); } - if (dabd) { - dabd_is_gang_abd = abd_is_gang(dabd); + ASSERT3S(dsize, >=, 0); + if (dsize > 0) { + ASSERT(dabd); + abd_verify(dabd); + ASSERT3U(dsize, <=, dabd->abd_size); c_dabd = abd_init_abd_iter(dabd, &daiter, 0); } - ASSERT3S(dsize, >=, 0); - abd_enter_critical(flags); while (csize > 0) { - /* if we are at 
the end of the gang ABD we are done */ - if (dabd_is_gang_abd && !c_dabd) - break; - + len = csize; for (i = 0; i < parity; i++) { - /* - * If we are at the end of the gang ABD we are - * done. - */ - if (cabds_is_gang_abd[i] && !c_cabds[i]) - break; + IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL); abd_iter_map(&caiters[i]); caddrs[i] = caiters[i].iter_mapaddr; + len = MIN(caiters[i].iter_mapsize, len); } - len = csize; - - if (dabd && dsize > 0) + if (dsize > 0) { + IMPLY(abd_is_gang(dabd), c_dabd != NULL); abd_iter_map(&daiter); - - switch (parity) { - case 3: - len = MIN(caiters[2].iter_mapsize, len); - zfs_fallthrough; - case 2: - len = MIN(caiters[1].iter_mapsize, len); - zfs_fallthrough; - case 1: - len = MIN(caiters[0].iter_mapsize, len); - } - - /* must be progressive */ - ASSERT3S(len, >, 0); - - if (dabd && dsize > 0) { - /* this needs precise iter.length */ len = MIN(daiter.iter_mapsize, len); dlen = len; } else @@ -1114,7 +1080,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, &caiters[i], len); } - if (dabd && dsize > 0) { + if (dsize > 0) { abd_iter_unmap(&daiter); c_dabd = abd_advance_abd_iter(dabd, c_dabd, &daiter, @@ -1153,16 +1119,16 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, struct abd_iter xiters[3]; void *caddrs[3], *xaddrs[3]; unsigned long flags __maybe_unused = 0; - boolean_t cabds_is_gang_abd[3]; - boolean_t tabds_is_gang_abd[3]; abd_t *c_cabds[3]; abd_t *c_tabds[3]; ASSERT3U(parity, <=, 3); for (i = 0; i < parity; i++) { - cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); - tabds_is_gang_abd[i] = abd_is_gang(tabds[i]); + abd_verify(cabds[i]); + abd_verify(tabds[i]); + ASSERT3U(tsize, <=, cabds[i]->abd_size); + ASSERT3U(tsize, <=, tabds[i]->abd_size); c_cabds[i] = abd_init_abd_iter(cabds[i], &citers[i], 0); c_tabds[i] = @@ -1171,36 +1137,18 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, abd_enter_critical(flags); while (tsize > 0) { - + len = tsize; for (i = 0; i < parity; i++) { - /* - * If we are at the end of the gang ABD we - * are done. - */ - if (cabds_is_gang_abd[i] && !c_cabds[i]) - break; - if (tabds_is_gang_abd[i] && !c_tabds[i]) - break; + IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL); + IMPLY(abd_is_gang(tabds[i]), c_tabds[i] != NULL); abd_iter_map(&citers[i]); abd_iter_map(&xiters[i]); caddrs[i] = citers[i].iter_mapaddr; xaddrs[i] = xiters[i].iter_mapaddr; + len = MIN(citers[i].iter_mapsize, len); + len = MIN(xiters[i].iter_mapsize, len); } - len = tsize; - switch (parity) { - case 3: - len = MIN(xiters[2].iter_mapsize, len); - len = MIN(citers[2].iter_mapsize, len); - zfs_fallthrough; - case 2: - len = MIN(xiters[1].iter_mapsize, len); - len = MIN(citers[1].iter_mapsize, len); - zfs_fallthrough; - case 1: - len = MIN(xiters[0].iter_mapsize, len); - len = MIN(citers[0].iter_mapsize, len); - } /* must be progressive */ ASSERT3S(len, >, 0); /* From c34fe8dcbcb710081d8927b76bab06dd43c20c8c Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Tue, 7 Nov 2023 12:34:50 -0700 Subject: [PATCH 09/31] Update the kstat dataset_name when renaming a zvol Add a dataset_kstats_rename function, and call it when renaming a zvol on FreeBSD and Linux. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Alan Somers Sponsored-by: Axcient Closes #15482 Closes #15486 --- include/sys/dataset_kstats.h | 1 + module/os/freebsd/zfs/zvol_os.c | 1 + module/os/linux/zfs/zvol_os.c | 2 ++ module/zfs/dataset_kstats.c | 12 ++++++++++++ 4 files changed, 16 insertions(+) diff --git a/include/sys/dataset_kstats.h b/include/sys/dataset_kstats.h index 40cf5258a2..c81a07f0c1 100644 --- a/include/sys/dataset_kstats.h +++ b/include/sys/dataset_kstats.h @@ -71,6 +71,7 @@ typedef struct dataset_kstats { int dataset_kstats_create(dataset_kstats_t *, objset_t *); void dataset_kstats_destroy(dataset_kstats_t *); +void dataset_kstats_rename(dataset_kstats_t *dk, const char *); void dataset_kstats_update_write_kstats(dataset_kstats_t *, int64_t); void dataset_kstats_update_read_kstats(dataset_kstats_t *, int64_t); diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 2520507b98..b6edac434d 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -1333,6 +1333,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) } } strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); + dataset_kstats_rename(&zv->zv_kstat, newname); } /* diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index f94ce69fb9..8562e98973 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1521,6 +1521,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) */ set_disk_ro(zv->zv_zso->zvo_disk, !readonly); set_disk_ro(zv->zv_zso->zvo_disk, readonly); + + dataset_kstats_rename(&zv->zv_kstat, newname); } void diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c index 767a461e00..2ac058fd2c 100644 --- a/module/zfs/dataset_kstats.c +++ b/module/zfs/dataset_kstats.c @@ -198,6 +198,18 @@ dataset_kstats_destroy(dataset_kstats_t *dk) zil_sums_fini(&dk->dk_zil_sums); } +void +dataset_kstats_rename(dataset_kstats_t *dk, const char *name) +{ + dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; + char *ds_name; + + ds_name = KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name); + ASSERT3S(ds_name, !=, NULL); + (void) strlcpy(ds_name, name, + KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name)); +} + void dataset_kstats_update_write_kstats(dataset_kstats_t *dk, int64_t nwritten) From f13593619b074dff63f6940d32033d2f147166e3 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 7 Nov 2023 14:35:40 -0500 Subject: [PATCH 10/31] FreeBSD: Optimize large kstat outputs - Use sbuf_new_for_sysctl() to reduce double-buffering on sysctl output. - Use much faster sbuf_cat() instead of sbuf_printf("%s"). Together it reduces `sysctl kstat.zfs.misc.dbufs` time from minutes to seconds, making dbufstat almost usable. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15495 --- module/os/freebsd/spl/spl_kstat.c | 38 +++++++++++++------------------ 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/module/os/freebsd/spl/spl_kstat.c b/module/os/freebsd/spl/spl_kstat.c index 9f5f92e194..43cd4da02e 100644 --- a/module/os/freebsd/spl/spl_kstat.c +++ b/module/os/freebsd/spl/spl_kstat.c @@ -187,19 +187,18 @@ kstat_sysctl_dataset_string(SYSCTL_HANDLER_ARGS) static int kstat_sysctl_io(SYSCTL_HANDLER_ARGS) { - struct sbuf *sb; + struct sbuf sb; kstat_t *ksp = arg1; kstat_io_t *kip = ksp->ks_data; int rc; - sb = sbuf_new_auto(); - if (sb == NULL) - return (ENOMEM); + sbuf_new_for_sysctl(&sb, NULL, 0, req); + /* Update the aggsums before reading */ (void) ksp->ks_update(ksp, KSTAT_READ); /* though wlentime & friends are signed, they will never be negative */ - sbuf_printf(sb, + sbuf_printf(&sb, "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", kip->nread, kip->nwritten, @@ -207,25 +206,21 @@ kstat_sysctl_io(SYSCTL_HANDLER_ARGS) kip->wtime, kip->wlentime, kip->wlastupdate, kip->rtime, kip->rlentime, kip->rlastupdate, kip->wcnt, kip->rcnt); - rc = sbuf_finish(sb); - if (rc == 0) - rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); - sbuf_delete(sb); + rc = sbuf_finish(&sb); + sbuf_delete(&sb); return (rc); } static int kstat_sysctl_raw(SYSCTL_HANDLER_ARGS) { - struct sbuf *sb; + struct sbuf sb; void *data; kstat_t *ksp = arg1; void *(*addr_op)(kstat_t *ksp, loff_t index); int n, has_header, rc = 0; - sb = sbuf_new_auto(); - if (sb == NULL) - return (ENOMEM); + sbuf_new_for_sysctl(&sb, NULL, PAGE_SIZE, req); if (ksp->ks_raw_ops.addr) addr_op = ksp->ks_raw_ops.addr; @@ -258,8 +253,10 @@ restart_headers: if (has_header) { if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart_headers; - if (rc == 0) - sbuf_printf(sb, "\n%s", ksp->ks_raw_buf); + if (rc == 0) { + sbuf_cat(&sb, "\n"); + sbuf_cat(&sb, ksp->ks_raw_buf); + } } while ((data = addr_op(ksp, n)) != NULL) { @@ -270,22 +267,19 @@ restart: if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart; if (rc == 0) - sbuf_printf(sb, "%s", ksp->ks_raw_buf); + sbuf_cat(&sb, ksp->ks_raw_buf); } else { ASSERT3U(ksp->ks_ndata, ==, 1); - sbuf_hexdump(sb, ksp->ks_data, + sbuf_hexdump(&sb, ksp->ks_data, ksp->ks_data_size, NULL, 0); } n++; } free(ksp->ks_raw_buf, M_TEMP); mutex_exit(ksp->ks_lock); - sbuf_trim(sb); - rc = sbuf_finish(sb); - if (rc == 0) - rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); - sbuf_delete(sb); + rc = sbuf_finish(&sb); + sbuf_delete(&sb); return (rc); } From a8c29a79df2dd0ca8cf31f0f2b35475d10567fdb Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 10 Nov 2023 13:34:46 -0500 Subject: [PATCH 11/31] Linux: Reclaim unused spl_kmem_cache_reclaim It is unused for 3 years since #10576. Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15507 --- include/os/linux/spl/sys/kmem_cache.h | 2 -- man/man4/spl.4 | 8 -------- module/os/linux/spl/spl-kmem-cache.c | 11 ----------- 3 files changed, 21 deletions(-) diff --git a/include/os/linux/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h index 82d50b6034..b159bb52d1 100644 --- a/include/os/linux/spl/sys/kmem_cache.h +++ b/include/os/linux/spl/sys/kmem_cache.h @@ -70,8 +70,6 @@ typedef enum kmem_cbrc { #define KMC_REAP_CHUNK INT_MAX #define KMC_DEFAULT_SEEKS 1 -#define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */ - extern struct list_head spl_kmem_cache_list; extern struct rw_semaphore spl_kmem_cache_sem; diff --git a/man/man4/spl.4 b/man/man4/spl.4 index 82455fb532..414a923948 100644 --- a/man/man4/spl.4 +++ b/man/man4/spl.4 @@ -31,14 +31,6 @@ for use by the kmem caches. For the majority of systems and workloads only a small number of threads are required. . -.It Sy spl_kmem_cache_reclaim Ns = Ns Sy 0 Pq uint -When this is set it prevents Linux from being able to rapidly reclaim all the -memory held by the kmem caches. -This may be useful in circumstances where it's preferable that Linux -reclaim memory from some other subsystem first. -Setting this will increase the likelihood out of memory events on a memory -constrained system. -. .It Sy spl_kmem_cache_obj_per_slab Ns = Ns Sy 8 Pq uint The preferred number of objects per slab in the cache. In general, a larger value will increase the caches memory footprint diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index 3c30dfc577..a2920c7466 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -76,17 +76,6 @@ module_param(spl_kmem_cache_magazine_size, uint, 0444); MODULE_PARM_DESC(spl_kmem_cache_magazine_size, "Default magazine size (2-256), set automatically (0)"); -/* - * The default behavior is to report the number of objects remaining in the - * cache. This allows the Linux VM to repeatedly reclaim objects from the - * cache when memory is low satisfy other memory allocations. Alternately, - * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache - * is reclaimed. This may increase the likelihood of out of memory events. - */ -static unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */; -module_param(spl_kmem_cache_reclaim, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)"); - static unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; module_param(spl_kmem_cache_obj_per_slab, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); From 2e259c6f00142165588b492fd52cb8267d7aa753 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 14 Nov 2023 16:47:57 -0500 Subject: [PATCH 12/31] L2ARC: Restrict write size to 1/4 of the device PR #15457 exposed weird logic in L2ARC write sizing. If it appeared bigger than device size, instead of liming write it reset all the system-wide tunables to their default. Aside of being excessive, it did not actually help with the problem, still allowing infinite loop to happen. This patch removes the tunables reverting logic, but instead limits L2ARC writes (or at least eviction/trim) to 1/4 of the capacity. Reviewed-by: Brian Behlendorf Reviewed-by: George Amanakis Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15519 --- module/zfs/arc.c | 30 +++---------------- .../tests/functional/cache/cache_012_pos.ksh | 20 ++++--------- 2 files changed, 10 insertions(+), 40 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index dfea15b743..4db6c06148 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8031,9 +8031,8 @@ l2arc_write_size(l2arc_dev_t *dev) */ size = l2arc_write_max; if (size == 0) { - cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " - "be greater than zero, resetting it to the default (%d)", - L2ARC_WRITE_SIZE); + cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, " + "resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } @@ -8056,30 +8055,9 @@ l2arc_write_size(l2arc_dev_t *dev) * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. */ - if (size > dev->l2ad_end - dev->l2ad_start) { - cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " - "plus the overhead of log blocks (persistent L2ARC, " - "%llu bytes) exceeds the size of the cache device " - "(guid %llu), resetting them to the default (%d)", - (u_longlong_t)l2arc_log_blk_overhead(size, dev), - (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); + size = MIN(size, (dev->l2ad_end - dev->l2ad_start) / 4); - size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; - - if (l2arc_trim_ahead > 1) { - cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1"); - l2arc_trim_ahead = 1; - } - - if (arc_warm == B_FALSE) - size += l2arc_write_boost; - - size += l2arc_log_blk_overhead(size, dev); - if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { - size += MAX(64 * 1024 * 1024, - (size * l2arc_trim_ahead) / 100); - } - } + size = P2ROUNDUP(size, 1ULL << dev->l2ad_vdev->vdev_ashift); return (size); diff --git a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh index 74caa12a9c..945db71bf1 100755 --- a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh +++ b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh @@ -31,15 +31,13 @@ # 2. Set l2arc_write_max to a value larger than the cache device. # 3. Create a file larger than the cache device and random read # for 10 sec. -# 4. Verify that l2arc_write_max is set back to the default. -# 5. Set l2arc_write_max to a value less than the cache device size but +# 4. Set l2arc_write_max to a value less than the cache device size but # larger than the default (256MB). -# 6. Record the l2_size. -# 7. Random read for 1 sec. -# 8. Record the l2_size again. -# 9. If (6) <= (8) then we have not looped around yet. -# 10. If (6) > (8) then we looped around. Break out of the loop and test. -# 11. Destroy pool. +# 5. Record the l2_size. +# 6. Random read for 1 sec. +# 7. Record the l2_size again. +# 8. If (5) <= (7) then we have not looped around yet. +# 9. Destroy pool. # verify_runnable "global" @@ -93,10 +91,6 @@ log_must zfs set relatime=off $TESTPOOL log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio -typeset write_max2=$(get_tunable L2ARC_WRITE_MAX) - -log_must test $write_max2 -eq $write_max - log_must set_tunable32 L2ARC_WRITE_MAX $(( 256 * 1024 * 1024 )) export RUNTIME=1 @@ -108,8 +102,6 @@ while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do do_once=false done -log_must test $l2_size1 -gt $l2_size2 - log_must zpool destroy $TESTPOOL log_pass "Looping around a cache device succeeds." 
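[Editor's note] For quick reference, the sizing logic that replaces the old tunable-reset path in this patch boils down to a clamp plus an alignment step. The sketch below is a simplified editorial condensation with abbreviated names, not the literal arc.c code; the real l2arc_write_size() also folds in l2arc_write_boost and related overheads before the clamp, and MIN/P2ROUNDUP are the usual ZFS macros.

	/*
	 * Simplified sketch of l2arc_write_size() after this change.
	 */
	static uint64_t
	l2arc_write_size_sketch(uint64_t write_max, uint64_t dev_start,
	    uint64_t dev_end, uint64_t ashift)
	{
		uint64_t size = write_max;

		/* Never write/evict/trim more than 1/4 of the device per pass. */
		size = MIN(size, (dev_end - dev_start) / 4);

		/* Keep the result aligned to the device's allocation size. */
		size = P2ROUNDUP(size, 1ULL << ashift);

		return (size);
	}

Unlike the previous behaviour, a misconfigured l2arc_write_max no longer resets the system-wide tunables; the size is simply clamped per device.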
From ad47eca195d048792a07a3d2dea05d369ad40900 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 28 Nov 2023 16:35:14 -0500 Subject: [PATCH 13/31] ZIL: Assert record sizes in different places This should make sure we have log written without overflows. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15517 --- module/os/freebsd/zfs/zio_crypt.c | 9 +++-- module/os/linux/zfs/zio_crypt.c | 9 +++-- module/zfs/zfs_replay.c | 50 ++++++++++++++++++++++---- module/zfs/zil.c | 60 ++++++++++++++++++++----------- module/zfs/zio_checksum.c | 16 +++++---- module/zfs/zvol.c | 8 +++++ 6 files changed, 115 insertions(+), 37 deletions(-) diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c index 024a931d78..b08916b317 100644 --- a/module/os/freebsd/zfs/zio_crypt.c +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -1251,7 +1251,7 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, iovec_t *dst_iovecs; zil_chain_t *zilc; lr_t *lr; - uint64_t txtype, lr_len; + uint64_t txtype, lr_len, nused; uint_t crypt_len, nr_iovecs, vec; uint_t aad_len = 0, total_len = 0; @@ -1268,7 +1268,10 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; - blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + ASSERT3U(nused, >=, sizeof (zil_chain_t)); + ASSERT3U(nused, <=, datalen); + blkend = src + nused; /* * Calculate the number of encrypted iovecs we will need. @@ -1287,6 +1290,8 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } + ASSERT3U(lr_len, >=, sizeof (lr_t)); + ASSERT3U(lr_len, <=, blkend - slrp); nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c index 775ab8efbc..2114be2819 100644 --- a/module/os/linux/zfs/zio_crypt.c +++ b/module/os/linux/zfs/zio_crypt.c @@ -1405,7 +1405,7 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, boolean_t *no_crypt) { int ret; - uint64_t txtype, lr_len; + uint64_t txtype, lr_len, nused; uint_t nr_src, nr_dst, crypt_len; uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; @@ -1432,7 +1432,10 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; - blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + nused = ((byteswap) ? 
BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + ASSERT3U(nused, >=, sizeof (zil_chain_t)); + ASSERT3U(nused, <=, datalen); + blkend = src + nused; /* calculate the number of encrypted iovecs we will need */ for (; slrp < blkend; slrp += lr_len) { @@ -1445,6 +1448,8 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } + ASSERT3U(lr_len, >=, sizeof (lr_t)); + ASSERT3U(lr_len, <=, blkend - slrp); nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index 09c7be853b..2e0af60f6d 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -309,6 +309,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) uint64_t dnodesize; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lracl)); + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lracl, sizeof (*lracl)); @@ -470,6 +472,8 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) uint64_t dnodesize; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); @@ -613,6 +617,8 @@ zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap) int error; int vflg = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -648,6 +654,8 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) int error; int vflg = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -715,12 +723,14 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_rename_t *lr = arg2; - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; + + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, 0, NULL)); } @@ -730,12 +740,14 @@ zfs_replay_rename_exchange(void *arg1, void *arg2, boolean_t byteswap) #ifdef __linux__ zfsvfs_t *zfsvfs = arg1; lr_rename_t *lr = arg2; - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; + + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, RENAME_EXCHANGE, NULL)); #else @@ -750,14 +762,13 @@ zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap) zfsvfs_t *zfsvfs = arg1; lr_rename_whiteout_t *lr = arg2; int error; - /* sname and tname follow lr_rename_whiteout_t */ - char *sname = (char *)(lr + 1); - char *tname = sname + strlen(sname) + 1; /* For the whiteout file. 
*/ xvattr_t xva; uint64_t objid; uint64_t dnodesize; + ASSERT3U(lr->lr_rename.lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -783,6 +794,9 @@ zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap) if (error) return (error); + /* sname and tname follow lr_rename_whiteout_t */ + char *sname = (char *)(lr + 1); + char *tname = sname + strlen(sname) + 1; return (do_zfs_replay_rename(zfsvfs, &lr->lr_rename, sname, tname, RENAME_WHITEOUT, &xva.xva_vattr)); #else @@ -800,6 +814,8 @@ zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) int error; uint64_t eod, offset, length; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -863,6 +879,8 @@ zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap) int error; uint64_t end; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -910,6 +928,8 @@ zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) flock64_t fl = {0}; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -940,6 +960,8 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) int error; void *start; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + xva_init(&xva); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); @@ -1002,6 +1024,9 @@ zfs_replay_setsaxattr(void *arg1, void *arg2, boolean_t byteswap) size_t size; int error = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr) + lr->lr_size); + ASSERT(spa_feature_is_active(zfsvfs->z_os->os_spa, SPA_FEATURE_ZILSAXATTR)); if (byteswap) @@ -1079,6 +1104,10 @@ zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + + sizeof (ace_t) * lr->lr_aclcnt); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_oldace_byteswap(ace, lr->lr_aclcnt); @@ -1124,6 +1153,9 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + lr->lr_acl_bytes); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); @@ -1171,6 +1203,10 @@ zfs_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index a118861369..37fb792f5d 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -522,6 +522,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); + ASSERT3U(reclen, <=, end - lrp); if (lr->lrc_seq > claim_lr_seq) { arc_buf_destroy(abuf, &abuf); goto done; @@ -604,7 +605,7 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) lr_write_t *lr = (lr_write_t *)lrc; int error; - ASSERT(lrc->lrc_txtype == TX_WRITE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); /* * If the block is not readable, don't claim it. 
This can happen @@ -631,7 +632,9 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) spa_t *spa; uint_t ii; - ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); if (tx == NULL) { return (0); @@ -691,7 +694,7 @@ zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; - ASSERT(lrc->lrc_txtype == TX_WRITE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); /* * If we previously claimed it, we need to free it. @@ -712,7 +715,9 @@ zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) spa_t *spa; uint_t ii; - ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); if (tx == NULL) { return (0); @@ -1794,6 +1799,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) itx = list_next(&lwb->lwb_itxs, itx)) zil_lwb_commit(zilog, lwb, itx); lwb->lwb_nused = lwb->lwb_nfilled; + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); @@ -2023,13 +2029,16 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) return (lwb); } + reclen = lr->lrc_reclen; if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + ASSERT3U(reclen, ==, sizeof (lr_write_t)); dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); } else { + ASSERT3U(reclen, >=, sizeof (lr_t)); dlen = 0; } - reclen = lr->lrc_reclen; + ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0)); zilog->zl_cur_used += (reclen + dlen); cont: @@ -2048,19 +2057,19 @@ cont: if (lwb == NULL) return (NULL); lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; - - /* - * There must be enough space in the new, empty log block to - * hold reclen. For WR_COPIED, we need to fit the whole - * record in one block, and reclen is the header size + the - * data size. For WR_NEED_COPY, we can create multiple - * records, splitting the data into multiple blocks, so we - * only need to fit one word of data per block; in this case - * reclen is just the header size (no data). - */ - ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); } + /* + * There must be enough space in the log block to hold reclen. + * For WR_COPIED, we need to fit the whole record in one block, + * and reclen is the write record header size + the data size. + * For WR_NEED_COPY, we can create multiple records, splitting + * the data into multiple blocks, so we only need to fit one + * word of data per block; in this case reclen is just the header + * size (no data). 
+ */ + ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); + dnow = MIN(dlen, lwb_sp - reclen); if (dlen > dnow) { ASSERT3U(lr->lrc_txtype, ==, TX_WRITE); @@ -2236,7 +2245,9 @@ zil_itx_create(uint64_t txtype, size_t olrsize) size_t itxsize, lrsize; itx_t *itx; + ASSERT3U(olrsize, >=, sizeof (lr_t)); lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t); + ASSERT3U(lrsize, >=, olrsize); itxsize = offsetof(itx_t, itx_lr) + lrsize; itx = zio_data_buf_alloc(itxsize); @@ -2255,6 +2266,10 @@ zil_itx_create(uint64_t txtype, size_t olrsize) static itx_t * zil_itx_clone(itx_t *oitx) { + ASSERT3U(oitx->itx_size, >=, sizeof (itx_t)); + ASSERT3U(oitx->itx_size, ==, + offsetof(itx_t, itx_lr) + oitx->itx_lr.lrc_reclen); + itx_t *itx = zio_data_buf_alloc(oitx->itx_size); memcpy(itx, oitx, oitx->itx_size); itx->itx_callback = NULL; @@ -2265,6 +2280,9 @@ zil_itx_clone(itx_t *oitx) void zil_itx_destroy(itx_t *itx) { + ASSERT3U(itx->itx_size, >=, sizeof (itx_t)); + ASSERT3U(itx->itx_lr.lrc_reclen, ==, + itx->itx_size - offsetof(itx_t, itx_lr)); IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL); IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); @@ -2348,7 +2366,7 @@ void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; - itx_async_node_t *ian; + itx_async_node_t *ian, ian_search; avl_tree_t *t; avl_index_t where; list_t clean_list; @@ -2375,7 +2393,8 @@ zil_remove_async(zilog_t *zilog, uint64_t oid) * Locate the object node and append its list. */ t = &itxg->itxg_itxs->i_async_tree; - ian = avl_find(t, &oid, &where); + ian_search.ia_foid = oid; + ian = avl_find(t, &ian_search, &where); if (ian != NULL) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); @@ -2573,7 +2592,7 @@ void zil_async_to_sync(zilog_t *zilog, uint64_t foid) { uint64_t otxg, txg; - itx_async_node_t *ian; + itx_async_node_t *ian, ian_search; avl_tree_t *t; avl_index_t where; @@ -2603,7 +2622,8 @@ zil_async_to_sync(zilog_t *zilog, uint64_t foid) */ t = &itxg->itxg_itxs->i_async_tree; if (foid != 0) { - ian = avl_find(t, &foid, &where); + ian_search.ia_foid = foid; + ian = avl_find(t, &ian_search, &where); if (ian != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index 9de515e876..e511b31fee 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -363,11 +363,14 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, zil_chain_t zilc; abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t)); - size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ, - uint64_t); + uint64_t nused = P2ROUNDUP_TYPED(zilc.zc_nused, + ZIL_MIN_BLKSZ, uint64_t); + ASSERT3U(size, >=, nused); + size = nused; eck = zilc.zc_eck; eck_offset = offsetof(zil_chain_t, zc_eck); } else { + ASSERT3U(size, >=, sizeof (zio_eck_t)); eck_offset = size - sizeof (zio_eck_t); abd_copy_to_buf_off(&eck, abd, eck_offset, sizeof (zio_eck_t)); @@ -448,12 +451,13 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, return (SET_ERROR(ECKSUM)); } - if (nused > size) { + nused = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + if (size < nused) return (SET_ERROR(ECKSUM)); - } - - size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + size = nused; } else { + if (size < sizeof (zio_eck_t)) + return (SET_ERROR(ECKSUM)); eck_offset = size - sizeof (zio_eck_t); abd_copy_to_buf_off(&eck, abd, eck_offset, sizeof (zio_eck_t)); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 
53dcb4dee4..91b2d9fcb5 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -417,6 +417,8 @@ zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) lr_truncate_t *lr = arg2; uint64_t offset, length; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -453,6 +455,8 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) dmu_tx_t *tx; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -506,6 +510,10 @@ zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) uint_t ii; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + dmu_objset_name(os, name); cmn_err(CE_WARN, "ZFS dropping block cloning transaction for %s.", name); From e48195c816edbea0efeb41436811af353ae26a35 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 1 Dec 2023 14:50:10 -0500 Subject: [PATCH 14/31] ZIO: Add overflow checks for linear buffers Since we use a limited set of kmem caches, quite often we have unused memory after the end of the buffer. Put there up to a 512-byte canary when built with debug to detect buffer overflows at the free time. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15553 --- lib/libspl/include/assert.h | 3 ++ module/zfs/zio.c | 57 +++++++++++++++++++++++++++++++++++-- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h index c5bf0f0cc8..af4957dfba 100644 --- a/lib/libspl/include/assert.h +++ b/lib/libspl/include/assert.h @@ -64,6 +64,9 @@ libspl_assert(const char *buf, const char *file, const char *func, int line) #undef verify #endif +#define PANIC(fmt, a...) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) + #define VERIFY(cond) \ (void) ((!(cond)) && \ libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__)) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 3b3b40fa73..d8eb075eef 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -306,6 +306,53 @@ zio_fini(void) * ========================================================================== */ +#ifdef ZFS_DEBUG +static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b; +#endif + +/* + * Use empty space after the buffer to detect overflows. + * + * Since zio_init() creates kmem caches only for certain set of buffer sizes, + * allocations of different sizes may have some unused space after the data. + * Filling part of that space with a known pattern on allocation and checking + * it on free should allow us to detect some buffer overflows. 
+ */ +static void +zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) +{ +#ifdef ZFS_DEBUG + size_t off = P2ROUNDUP(size, sizeof (ulong_t)); + ulong_t *canary = p + off / sizeof (ulong_t); + size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; + if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && + cache[c] == cache[c + 1]) + asize = (c + 2) << SPA_MINBLOCKSHIFT; + for (; off < asize; canary++, off += sizeof (ulong_t)) + *canary = zio_buf_canary; +#endif +} + +static void +zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) +{ +#ifdef ZFS_DEBUG + size_t off = P2ROUNDUP(size, sizeof (ulong_t)); + ulong_t *canary = p + off / sizeof (ulong_t); + size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; + if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && + cache[c] == cache[c + 1]) + asize = (c + 2) << SPA_MINBLOCKSHIFT; + for (; off < asize; canary++, off += sizeof (ulong_t)) { + if (unlikely(*canary != zio_buf_canary)) { + PANIC("ZIO buffer overflow %p (%zu) + %zu %#lx != %#lx", + p, size, (canary - p) * sizeof (ulong_t), + *canary, zio_buf_canary); + } + } +#endif +} + /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's @@ -322,7 +369,9 @@ zio_buf_alloc(size_t size) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif - return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); + void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE); + zio_buf_put_canary(p, size, zio_buf_cache, c); + return (p); } /* @@ -338,7 +387,9 @@ zio_data_buf_alloc(size_t size) VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); + void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE); + zio_buf_put_canary(p, size, zio_data_buf_cache, c); + return (p); } void @@ -351,6 +402,7 @@ zio_buf_free(void *buf, size_t size) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif + zio_buf_check_canary(buf, size, zio_buf_cache, c); kmem_cache_free(zio_buf_cache[c], buf); } @@ -361,6 +413,7 @@ zio_data_buf_free(void *buf, size_t size) VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + zio_buf_check_canary(buf, size, zio_data_buf_cache, c); kmem_cache_free(zio_data_buf_cache[c], buf); } From 3b8f2273622318461836dc27fa65e50126caff78 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 1 Dec 2023 18:23:20 -0500 Subject: [PATCH 15/31] ZIL: Remove TX_CLONE_RANGE replay for ZVOLs. zil_claim_clone_range() takes references on cloned blocks before ZIL replay. Later zil_free_clone_range() drops them after replay or on dataset destroy. The total balance is neutral. It means we do not need to do anything (drop the references) for not implemented yet TX_CLONE_RANGE replay for ZVOLs. This is a logical follow up to #15603. Reviewed-by: Kay Pedersen Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15612 --- module/zfs/zvol.c | 60 +---------------------------------------------- 1 file changed, 1 insertion(+), 59 deletions(-) diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 91b2d9fcb5..20ea71f233 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -486,64 +486,6 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) return (error); } -/* - * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed - * after a system failure. 
- * - * TODO: For now we drop block cloning transations for ZVOLs as they are - * unsupported, but we still need to inform BRT about that as we - * claimed them during pool import. - * This situation can occur when we try to import a pool from a ZFS - * version supporting block cloning for ZVOLs into a system that - * has this ZFS version, that doesn't support block cloning for ZVOLs. - */ -static int -zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) -{ - char name[ZFS_MAX_DATASET_NAME_LEN]; - zvol_state_t *zv = arg1; - objset_t *os = zv->zv_objset; - lr_clone_range_t *lr = arg2; - blkptr_t *bp; - dmu_tx_t *tx; - spa_t *spa; - uint_t ii; - int error; - - ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); - ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, - lr_bps[lr->lr_nbps])); - - dmu_objset_name(os, name); - cmn_err(CE_WARN, "ZFS dropping block cloning transaction for %s.", - name); - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - tx = dmu_tx_create(os); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - - spa = os->os_spa; - - for (ii = 0; ii < lr->lr_nbps; ii++) { - bp = &lr->lr_bps[ii]; - - if (!BP_IS_HOLE(bp)) { - zio_free(spa, dmu_tx_get_txg(tx), bp); - } - } - - (void) zil_replaying(zv->zv_zilog, tx); - dmu_tx_commit(tx); - - return (0); -} - static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { @@ -578,7 +520,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_SETSAXATTR */ zvol_replay_err, /* TX_RENAME_EXCHANGE */ zvol_replay_err, /* TX_RENAME_WHITEOUT */ - zvol_replay_clone_range /* TX_CLONE_RANGE */ + zvol_replay_err, /* TX_CLONE_RANGE */ }; /* From e11b3eb1c608754d98ecb91e2c3dc8d5700ec7a8 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 5 Dec 2023 13:58:11 -0500 Subject: [PATCH 16/31] ZIL: Do not clone blocks from the future ZIL claim can not handle block pointers cloned from the future, since they are not yet allocated at that point. It may happen either if the block was just written when it was cloned, or if the pool was frozen or somehow else rewound on import. Handle it from two sides: prevent cloning of blocks with physical birth time from not yet synced or frozen TXG, and abort ZIL claim if we still detect such blocks due to rewind or something else. While there, assert that any cloned blocks we claim are really allocated by calling metaslab_check_free(). Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15617 --- module/zfs/dmu.c | 15 +++++++++++++++ module/zfs/zil.c | 38 ++++++++++++++++++++++++++++---------- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 3f626031de..63464d7474 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2255,6 +2255,21 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, goto out; } + /* + * If the block was allocated in transaction group that is not + * yet synced, we could clone it, but we couldn't write this + * operation into ZIL, or it may be impossible to replay, since + * the block may appear not yet allocated at that point. 
+ */ + if (BP_PHYSICAL_BIRTH(bp) > spa_freeze_txg(os->os_spa)) { + error = SET_ERROR(EINVAL); + goto out; + } + if (BP_PHYSICAL_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) { + error = SET_ERROR(EAGAIN); + goto out; + } + bps[i] = *bp; } diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 37fb792f5d..7670e17295 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -625,11 +625,12 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) } static int -zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) +zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx, + uint64_t first_txg) { const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; const blkptr_t *bp; - spa_t *spa; + spa_t *spa = zilog->zl_spa; uint_t ii; ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); @@ -644,19 +645,36 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) * XXX: Do we need to byteswap lr? */ - spa = zilog->zl_spa; - for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; /* - * When data in embedded into BP there is no need to create - * BRT entry as there is no data block. Just copy the BP as - * it contains the data. + * When data is embedded into the BP there is no need to create + * BRT entry as there is no data block. Just copy the BP as it + * contains the data. */ - if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + continue; + + /* + * We can not handle block pointers from the future, since they + * are not yet allocated. It should not normally happen, but + * just in case lets be safe and just stop here now instead of + * corrupting the pool. + */ + if (BP_PHYSICAL_BIRTH(bp) >= first_txg) + return (SET_ERROR(ENOENT)); + + /* + * Assert the block is really allocated before we reference it. + */ + metaslab_check_free(spa, bp); + } + + for (ii = 0; ii < lr->lr_nbps; ii++) { + bp = &lr->lr_bps[ii]; + if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) brt_pending_add(spa, bp, tx); - } } return (0); @@ -671,7 +689,7 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, case TX_WRITE: return (zil_claim_write(zilog, lrc, tx, first_txg)); case TX_CLONE_RANGE: - return (zil_claim_clone_range(zilog, lrc, tx)); + return (zil_claim_clone_range(zilog, lrc, tx, first_txg)); default: return (0); } From 121924575e48f45b402a9fe2282055946e065c4a Mon Sep 17 00:00:00 2001 From: oromenahar Date: Tue, 5 Dec 2023 20:03:48 +0100 Subject: [PATCH 17/31] Allow block cloning across encrypted datasets When two datasets share the same master encryption key, it is safe to clone encrypted blocks. Currently only snapshots and clones of a dataset share with it the same encryption key. 
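[Editor's note] The mechanism reduces to a single gate in the clone path: when source and destination objsets differ and the source is encrypted, the clone only proceeds if both datasets resolve to DSL crypto keys with the same zk_guid. A condensed editorial sketch of that gate follows; the real code is in the zfs_vnops.c and dsl_crypt.c hunks below.

	/*
	 * Condensed from the zfs_clone_range() change below.
	 * dmu_objset_crypto_key_equal() (added in dsl_crypt.c) looks up
	 * both DSL crypto keys and compares their zk_guid fields.
	 */
	if (inos != outos && inos->os_encrypted &&
	    !dmu_objset_crypto_key_equal(inos, outos)) {
		/* Different master keys: refuse and let the caller fall back. */
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}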
Added a test for: - Clone from encrypted sibling to encrypted sibling with non encrypted parent - Clone from encrypted parent to inherited encrypted child - Clone from child to sibling with encrypted parent - Clone from snapshot to the original datasets - Clone from foreign snapshot to a foreign dataset - Cloning from non-encrypted to encrypted datasets - Cloning from encrypted to non-encrypted datasets Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Original-patch-by: Pawel Jakub Dawidek Signed-off-by: Kay Pedersen Closes #15544 --- include/sys/dsl_crypt.h | 1 + man/man7/zpool-features.7 | 9 +- module/zfs/brt.c | 6 +- module/zfs/dsl_crypt.c | 34 ++++ module/zfs/zfs_vnops.c | 25 ++- tests/runfiles/linux.run | 1 + tests/test-runner/bin/zts-report.py.in | 2 + tests/zfs-tests/tests/Makefile.am | 1 + .../block_cloning/block_cloning.kshlib | 12 +- .../block_cloning_cross_enc_dataset.ksh | 170 ++++++++++++++++++ 10 files changed, 236 insertions(+), 25 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh diff --git a/include/sys/dsl_crypt.h b/include/sys/dsl_crypt.h index 72716e296c..fbcae37153 100644 --- a/include/sys/dsl_crypt.h +++ b/include/sys/dsl_crypt.h @@ -206,6 +206,7 @@ void dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin, dmu_tx_t *tx); int dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp, boolean_t *will_encrypt); +boolean_t dmu_objset_crypto_key_equal(objset_t *osa, objset_t *osb); void dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd, struct dsl_dataset *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx); uint64_t dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey, diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index 8ca4bd927b..8456a9aa76 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -364,9 +364,12 @@ When this feature is enabled ZFS will use block cloning for operations like Block cloning allows to create multiple references to a single block. It is much faster than copying the data (as the actual data is neither read nor written) and takes no additional space. -Blocks can be cloned across datasets under some conditions (like disabled -encryption and equal -.Nm recordsize ) . +Blocks can be cloned across datasets under some conditions (like equal +.Nm recordsize , +the same master encryption key, etc.). +ZFS tries its best to clone across datasets including encrypted ones. +This is limited for various (nontrivial) reasons depending on the OS +and/or ZFS internals. .Pp This feature becomes .Sy active diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 759bc8d2e2..a701c70fcf 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -157,10 +157,8 @@ * (copying the file content to the new dataset and removing the source file). * In that case Block Cloning will only be used briefly, because the BRT entries * will be removed when the source is removed. - * Note: currently it is not possible to clone blocks between encrypted - * datasets, even if those datasets use the same encryption key (this includes - * snapshots of encrypted datasets). Cloning blocks between datasets that use - * the same keys should be possible and should be implemented in the future. + * Block Cloning across encrypted datasets is supported as long as both + * datasets share the same master key (e.g. snapshots and clones) * * Block Cloning flow through ZFS layers. 
* diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c index 5e6e4e3d6c..8e1055d9bc 100644 --- a/module/zfs/dsl_crypt.c +++ b/module/zfs/dsl_crypt.c @@ -266,6 +266,40 @@ spa_crypto_key_compare(const void *a, const void *b) return (0); } +/* + * this compares a crypto key based on zk_guid. See comment on + * spa_crypto_key_compare for more information. + */ +boolean_t +dmu_objset_crypto_key_equal(objset_t *osa, objset_t *osb) +{ + dsl_crypto_key_t *dcka = NULL; + dsl_crypto_key_t *dckb = NULL; + uint64_t obja, objb; + boolean_t equal; + spa_t *spa; + + spa = dmu_objset_spa(osa); + if (spa != dmu_objset_spa(osb)) + return (B_FALSE); + obja = dmu_objset_ds(osa)->ds_object; + objb = dmu_objset_ds(osb)->ds_object; + + if (spa_keystore_lookup_key(spa, obja, FTAG, &dcka) != 0) + return (B_FALSE); + if (spa_keystore_lookup_key(spa, objb, FTAG, &dckb) != 0) { + spa_keystore_dsl_key_rele(spa, dcka, FTAG); + return (B_FALSE); + } + + equal = (dcka->dck_key.zk_guid == dckb->dck_key.zk_guid); + + spa_keystore_dsl_key_rele(spa, dcka, FTAG); + spa_keystore_dsl_key_rele(spa, dckb, FTAG); + + return (equal); +} + static int spa_key_mapping_compare(const void *a, const void *b) { diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 3a5fa75df2..17e990451e 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1103,6 +1104,16 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, return (SET_ERROR(EXDEV)); } + /* + * Cloning across encrypted datasets is possible only if they + * share the same master key. + */ + if (inos != outos && inos->os_encrypted && + !dmu_objset_crypto_key_equal(inos, outos)) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EXDEV)); + } + error = zfs_verify_zp(inzp); if (error == 0) error = zfs_verify_zp(outzp); @@ -1286,20 +1297,6 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, */ break; } - /* - * Encrypted data is fine as long as it comes from the same - * dataset. - * TODO: We want to extend it in the future to allow cloning to - * datasets with the same keys, like clones or to be able to - * clone a file from a snapshot of an encrypted dataset into the - * dataset itself. - */ - if (BP_IS_PROTECTED(&bps[0])) { - if (inzfsvfs != outzfsvfs) { - error = SET_ERROR(EXDEV); - break; - } - } /* * Start a transaction. 
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 8bc55a1b4b..fb78d96fb5 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -42,6 +42,7 @@ tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', 'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone', 'block_cloning_disabled_ficlonerange', 'block_cloning_copyfilerange_cross_dataset', + 'block_cloning_cross_enc_dataset', 'block_cloning_copyfilerange_fallback_same_txg'] tags = ['functional', 'block_cloning'] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 4608e87522..b188a101c2 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -305,6 +305,8 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_cross_reason], 'block_cloning/block_cloning_copyfilerange_fallback_same_txg': ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_cross_enc_dataset': + ['SKIP', cfr_cross_reason], }) diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 87b50f59ca..21b830126b 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -451,6 +451,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/block_cloning/block_cloning_ficlone.ksh \ functional/block_cloning/block_cloning_ficlonerange.ksh \ functional/block_cloning/block_cloning_ficlonerange_partial.ksh \ + functional/block_cloning/block_cloning_cross_enc_dataset.ksh \ functional/bootfs/bootfs_001_pos.ksh \ functional/bootfs/bootfs_002_neg.ksh \ functional/bootfs/bootfs_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib index 8e16366b4c..526bd54a2b 100644 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib @@ -28,8 +28,8 @@ function have_same_content { - typeset hash1=$(cat $1 | md5sum) - typeset hash2=$(cat $2 | md5sum) + typeset hash1=$(md5digest $1) + typeset hash2=$(md5digest $2) log_must [ "$hash1" = "$hash2" ] } @@ -44,10 +44,14 @@ function have_same_content # function get_same_blocks { + KEY=$5 + if [ ${#KEY} -gt 0 ]; then + KEY="--key=$KEY" + fi typeset zdbout=${TMPDIR:-$TEST_BASE_DIR}/zdbout.$$ - zdb -vvvvv $1 -O $2 | \ + zdb $KEY -vvvvv $1 -O $2 | \ awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.a - zdb -vvvvv $3 -O $4 | \ + zdb $KEY -vvvvv $3 -O $4 | \ awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.b echo $(sort $zdbout.a $zdbout.b | uniq -d | cut -f1 -d' ') } diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh new file mode 100755 index 0000000000..fe8f0867b9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh @@ -0,0 +1,170 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Kay Pedersen +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "5.3") ]]; then + log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" +fi + +claim="Block cloning across encrypted datasets." + +log_assert $claim + +DS1="$TESTPOOL/encrypted1" +DS2="$TESTPOOL/encrypted2" +DS1_NC="$TESTPOOL/notcrypted1" +PASSPHRASE="top_secret" + +function prepare_enc +{ + log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ + "-o keyformat=passphrase -o keylocation=prompt $DS1" + log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ + "-o keyformat=passphrase -o keylocation=prompt $DS2" + log_must zfs create $DS1/child1 + log_must zfs create $DS1/child2 + log_must zfs create $DS1_NC + + log_note "Create test file" + # we must wait until the src file txg is written to the disk otherwise we + # will fallback to normal copy. See "dmu_read_l0_bps" in + # "zfs/module/zfs/dmu.c" and "zfs_clone_range" in + # "zfs/module/zfs/zfs_vnops.c" + log_must dd if=/dev/urandom of=/$DS1/file bs=128K count=4 + log_must dd if=/dev/urandom of=/$DS1/child1/file bs=128K count=4 + log_must dd if=/dev/urandom of=/$DS1_NC/file bs=128K count=4 + log_must sync_pool $TESTPOOL +} + +function cleanup_enc +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +function clone_and_check +{ + I_FILE="$1" + O_FILE=$2 + I_DS=$3 + O_DS=$4 + SAME_BLOCKS=$5 + # the CLONE option provides a choice between copy_file_range + # which should clone and a dd which is a copy no matter what + CLONE=$6 + SNAPSHOT=$7 + if [ ${#SNAPSHOT} -gt 0 ]; then + I_FILE=".zfs/snapshot/$SNAPSHOT/$1" + fi + if [ $CLONE ]; then + log_must clonefile -f "/$I_DS/$I_FILE" "/$O_DS/$O_FILE" 0 0 524288 + else + log_must dd if="/$I_DS/$I_FILE" of="/$O_DS/$O_FILE" bs=128K + fi + log_must sync_pool $TESTPOOL + + log_must have_same_content "/$I_DS/$I_FILE" "/$O_DS/$O_FILE" + + if [ ${#SNAPSHOT} -gt 0 ]; then + I_DS="$I_DS@$SNAPSHOT" + I_FILE="$1" + fi + typeset blocks=$(get_same_blocks \ + $I_DS $I_FILE $O_DS $O_FILE $PASSPHRASE) + log_must [ "$blocks" = "$SAME_BLOCKS" ] +} + +log_onexit cleanup_enc + +prepare_enc + +log_note "Cloning entire file with copy_file_range across different enc" \ + "roots, should fallback" +# we are expecting no same block map. +clone_and_check "file" "clone" $DS1 $DS2 "" true +log_note "check if the file is still readable and the same after" \ + "unmount and key unload, shouldn't fail" +typeset hash1=$(md5digest "/$DS1/file") +log_must zfs umount $DS1 && zfs unload-key $DS1 +typeset hash2=$(md5digest "/$DS2/clone") +log_must [ "$hash1" = "$hash2" ] + +cleanup_enc +prepare_enc + +log_note "Cloning entire file with copy_file_range across different child datasets" +# clone shouldn't work because of deriving a new master key for the child +# we are expecting no same block map. 
+clone_and_check "file" "clone" $DS1 "$DS1/child1" "" true +clone_and_check "file" "clone" "$DS1/child1" "$DS1/child2" "" true + +cleanup_enc +prepare_enc + +log_note "Copying entire file with copy_file_range across same snapshot" +log_must zfs snapshot -r $DS1@s1 +log_must sync_pool $TESTPOOL +log_must rm -f "/$DS1/file" +log_must sync_pool $TESTPOOL +clone_and_check "file" "clone" "$DS1" "$DS1" "0 1 2 3" true "s1" + +cleanup_enc +prepare_enc + +log_note "Copying entire file with copy_file_range across different snapshot" +clone_and_check "file" "file" $DS1 $DS2 "" true +log_must zfs snapshot -r $DS2@s1 +log_must sync_pool $TESTPOOL +log_must rm -f "/$DS1/file" "/$DS2/file" +log_must sync_pool $TESTPOOL +clone_and_check "file" "clone" "$DS2" "$DS1" "" true "s1" +typeset hash1=$(md5digest "/$DS1/.zfs/snapshot/s1/file") +log_note "destroy the snapshot and check if the file is still readable and" \ + "has the same content" +log_must zfs destroy -r $DS2@s1 +log_must sync_pool $TESTPOOL +typeset hash2=$(md5digest "/$DS1/file") +log_must [ "$hash1" = "$hash2" ] + +cleanup_enc +prepare_enc + +log_note "Copying with copy_file_range from non encrypted to encrypted" +clone_and_check "file" "copy" $DS1_NC $DS1 "" true + +cleanup_enc +prepare_enc + +log_note "Copying with copy_file_range from encrypted to non encrypted" +clone_and_check "file" "copy" $DS1 $DS1_NC "" true + +log_must sync_pool $TESTPOOL + +log_pass $claim From dea2d3c6cda7f61bd53deb133926bee77819f9bd Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 6 Dec 2023 15:39:12 -0500 Subject: [PATCH 18/31] zdb: Dump encrypted write and clone ZIL records Block pointers are not encrypted in TX_WRITE and TX_CLONE_RANGE records, so we can dump them, that may be useful for debugging. Related to #15543. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15629 --- cmd/zdb/zdb_il.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index 970c45c9b3..63d95ddedc 100644 --- a/cmd/zdb/zdb_il.c +++ b/cmd/zdb/zdb_il.c @@ -168,7 +168,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg) (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset, (u_longlong_t)lr->lr_length); - if (txtype == TX_WRITE2 || verbose < 5) + if (txtype == TX_WRITE2 || verbose < 4) return; if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { @@ -178,6 +178,8 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg) "will claim" : "won't claim"); print_log_bp(bp, tab_prefix); + if (verbose < 5) + return; if (BP_IS_HOLE(bp)) { (void) printf("\t\t\tLSIZE 0x%llx\n", (u_longlong_t)BP_GET_LSIZE(bp)); @@ -202,6 +204,9 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg) if (error) goto out; } else { + if (verbose < 5) + return; + /* data is stored after the end of the lr_write record */ data = abd_alloc(lr->lr_length, B_FALSE); abd_copy_from_buf(data, lr + 1, lr->lr_length); @@ -217,6 +222,28 @@ out: abd_free(data); } +static void +zil_prt_rec_write_enc(zilog_t *zilog, int txtype, const void *arg) +{ + (void) txtype; + const lr_write_t *lr = arg; + const blkptr_t *bp = &lr->lr_blkptr; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + + (void) printf("%s(encrypted)\n", tab_prefix); + + if (verbose < 4) + return; + + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + (void) printf("%shas blkptr, %s\n", tab_prefix, + !BP_IS_HOLE(bp) && + bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ? 
+ "will claim" : "won't claim"); + print_log_bp(bp, tab_prefix); + } +} + static void zil_prt_rec_truncate(zilog_t *zilog, int txtype, const void *arg) { @@ -312,11 +339,34 @@ zil_prt_rec_clone_range(zilog_t *zilog, int txtype, const void *arg) { (void) zilog, (void) txtype; const lr_clone_range_t *lr = arg; + int verbose = MAX(dump_opt['d'], dump_opt['i']); (void) printf("%sfoid %llu, offset %llx, length %llx, blksize %llx\n", tab_prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset, (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blksz); + if (verbose < 4) + return; + + for (unsigned int i = 0; i < lr->lr_nbps; i++) { + (void) printf("%s[%u/%llu] ", tab_prefix, i + 1, + (u_longlong_t)lr->lr_nbps); + print_log_bp(&lr->lr_bps[i], ""); + } +} + +static void +zil_prt_rec_clone_range_enc(zilog_t *zilog, int txtype, const void *arg) +{ + (void) zilog, (void) txtype; + const lr_clone_range_t *lr = arg; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + + (void) printf("%s(encrypted)\n", tab_prefix); + + if (verbose < 4) + return; + for (unsigned int i = 0; i < lr->lr_nbps; i++) { (void) printf("%s[%u/%llu] ", tab_prefix, i + 1, (u_longlong_t)lr->lr_nbps); @@ -327,6 +377,7 @@ zil_prt_rec_clone_range(zilog_t *zilog, int txtype, const void *arg) typedef void (*zil_prt_rec_func_t)(zilog_t *, int, const void *); typedef struct zil_rec_info { zil_prt_rec_func_t zri_print; + zil_prt_rec_func_t zri_print_enc; const char *zri_name; uint64_t zri_count; } zil_rec_info_t; @@ -341,7 +392,9 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = { {.zri_print = zil_prt_rec_remove, .zri_name = "TX_RMDIR "}, {.zri_print = zil_prt_rec_link, .zri_name = "TX_LINK "}, {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME "}, - {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE "}, + {.zri_print = zil_prt_rec_write, + .zri_print_enc = zil_prt_rec_write_enc, + .zri_name = "TX_WRITE "}, {.zri_print = zil_prt_rec_truncate, .zri_name = "TX_TRUNCATE "}, {.zri_print = zil_prt_rec_setattr, .zri_name = "TX_SETATTR "}, {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_V0 "}, @@ -358,6 +411,7 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = { {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_EXCHANGE "}, {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_WHITEOUT "}, {.zri_print = zil_prt_rec_clone_range, + .zri_print_enc = zil_prt_rec_clone_range_enc, .zri_name = "TX_CLONE_RANGE "}, }; @@ -384,6 +438,8 @@ print_log_record(zilog_t *zilog, const lr_t *lr, void *arg, uint64_t claim_txg) if (txtype && verbose >= 3) { if (!zilog->zl_os->os_encrypted) { zil_rec_info[txtype].zri_print(zilog, txtype, lr); + } else if (zil_rec_info[txtype].zri_print_enc) { + zil_rec_info[txtype].zri_print_enc(zilog, txtype, lr); } else { (void) printf("%s(encrypted)\n", tab_prefix); } From 1e1d748cae2edf4afc08cb4a9d063b843dc9f396 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 6 Dec 2023 18:02:05 -0500 Subject: [PATCH 19/31] ZIL: Remove 128K into 2x68K LWB split optimization To improve 128KB block write performance in case of multiple VDEVs ZIL used to spit those writes into two 64KB ones. Unfortunately it was found to cause LWB buffer overflow, trying to write maximum- sizes 128KB TX_CLONE_RANGE record with 1022 block pointers into 68KB buffer, since unlike TX_WRITE ZIL code can't split it. This is a minimally-invasive temporary block cloning fix until the following more invasive prediction code refactoring. 
Reviewed-by: Brian Behlendorf Reviewed-by: Ameer Hamza Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15634 --- module/zfs/zil.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 7670e17295..5642f082bd 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1733,8 +1733,6 @@ static const struct { { 8192 + 4096, 8192 + 4096 }, /* database */ { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ - { 131072, 131072 }, /* < 128KB writes */ - { 131072 +4096, 65536 + 4096 }, /* 128KB writes */ { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ }; From e09356fa05ff174ed02ade8ea8e4ab98effa0ccd Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 6 Dec 2023 18:37:27 -0500 Subject: [PATCH 20/31] BRT: Limit brt_vdev_dump() to only one vdev Without this patch on pool of 60 vdevs with ZFS_DEBUG enabled clone takes much more time than copy, while heavily trashing dbgmsg for no good reason, repeatedly dumping all vdevs BRTs again and again, even unmodified ones. I am generally not sure this dumping is not excessive, but decided to keep it for now, just restricting its scope to more reasonable. Reviewed-by: Kay Pedersen Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15625 --- module/zfs/brt.c | 80 ++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 47 deletions(-) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index a701c70fcf..225ddaca1e 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -342,7 +342,7 @@ brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) ASSERT3U(idx, <, brtvd->bv_size); - if (brtvd->bv_need_byteswap) { + if (unlikely(brtvd->bv_need_byteswap)) { return (BSWAP_16(brtvd->bv_entcount[idx])); } else { return (brtvd->bv_entcount[idx]); @@ -355,7 +355,7 @@ brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt) ASSERT3U(idx, <, brtvd->bv_size); - if (brtvd->bv_need_byteswap) { + if (unlikely(brtvd->bv_need_byteswap)) { brtvd->bv_entcount[idx] = BSWAP_16(entcnt); } else { brtvd->bv_entcount[idx] = entcnt; @@ -390,55 +390,39 @@ brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx) #ifdef ZFS_DEBUG static void -brt_vdev_dump(brt_t *brt) +brt_vdev_dump(brt_vdev_t *brtvd) { - brt_vdev_t *brtvd; - uint64_t vdevid; + uint64_t idx; - if ((zfs_flags & ZFS_DEBUG_BRT) == 0) { - return; - } - - if (brt->brt_nvdevs == 0) { - zfs_dbgmsg("BRT empty"); - return; - } - - zfs_dbgmsg("BRT vdev dump:"); - for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - uint64_t idx; - - brtvd = &brt->brt_vdevs[vdevid]; - zfs_dbgmsg(" vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d " - "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n", - (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid, - brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, - (u_longlong_t)brtvd->bv_size, - (u_longlong_t)brtvd->bv_totalcount, - (u_longlong_t)brtvd->bv_nblocks, - (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); - if (brtvd->bv_totalcount > 0) { - zfs_dbgmsg(" entcounts:"); - for (idx = 0; idx < brtvd->bv_size; idx++) { - if (brt_vdev_entcount_get(brtvd, idx) > 0) { - zfs_dbgmsg(" [%04llu] %hu", - (u_longlong_t)idx, - brt_vdev_entcount_get(brtvd, idx)); - } + zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d " + "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n", + (u_longlong_t)brtvd->bv_vdevid, + brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, + (u_longlong_t)brtvd->bv_size, + 
(u_longlong_t)brtvd->bv_totalcount, + (u_longlong_t)brtvd->bv_nblocks, + (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); + if (brtvd->bv_totalcount > 0) { + zfs_dbgmsg(" entcounts:"); + for (idx = 0; idx < brtvd->bv_size; idx++) { + uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx); + if (entcnt > 0) { + zfs_dbgmsg(" [%04llu] %hu", + (u_longlong_t)idx, entcnt); } } - if (brtvd->bv_entcount_dirty) { - char *bitmap; + } + if (brtvd->bv_entcount_dirty) { + char *bitmap; - bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); - for (idx = 0; idx < brtvd->bv_nblocks; idx++) { - bitmap[idx] = - BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; - } - bitmap[idx] = '\0'; - zfs_dbgmsg(" bitmap: %s", bitmap); - kmem_free(bitmap, brtvd->bv_nblocks + 1); + bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); + for (idx = 0; idx < brtvd->bv_nblocks; idx++) { + bitmap[idx] = + BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; } + bitmap[idx] = '\0'; + zfs_dbgmsg(" dirty: %s", bitmap); + kmem_free(bitmap, brtvd->bv_nblocks + 1); } } #endif @@ -767,7 +751,8 @@ brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, BT_SET(brtvd->bv_bitmap, idx); #ifdef ZFS_DEBUG - brt_vdev_dump(brt); + if (zfs_flags & ZFS_DEBUG_BRT) + brt_vdev_dump(brtvd); #endif } @@ -803,7 +788,8 @@ brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, BT_SET(brtvd->bv_bitmap, idx); #ifdef ZFS_DEBUG - brt_vdev_dump(brt); + if (zfs_flags & ZFS_DEBUG_BRT) + brt_vdev_dump(brtvd); #endif } From b13c91bb2997cb121cdf935496f92ae773672773 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 8 Dec 2023 19:43:39 -0500 Subject: [PATCH 21/31] DMU: Fix lock leak on dbuf_hold() error dmu_assign_arcbuf_by_dnode() should drop dn_struct_rwlock lock in case dbuf_hold() failed. I don't have reproduction for this, but it looks inconsistent with dmu_buf_hold_noread_by_dnode() and co. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15644 --- module/zfs/dmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 63464d7474..909605aa26 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1482,9 +1482,9 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, FTAG); + rw_exit(&dn->dn_struct_rwlock); if (db == NULL) return (SET_ERROR(EIO)); - rw_exit(&dn->dn_struct_rwlock); /* * We can only assign if the offset is aligned and the arc buf is the From a701548eb4a8ed47ebda9b28a23e1de80adf1b91 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 12 Dec 2023 15:53:59 -0500 Subject: [PATCH 22/31] dbuf: Handle arcbuf assignment after block cloning In some cases dbuf_assign_arcbuf() may be called on a block that was recently cloned. If it happened in current TXG we must undo the block cloning first, since the only one dirty record per TXG can't and shouldn't mean both cloning and overwrite same time. Reviewed-by: Kay Pedersen Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15653 --- module/zfs/dbuf.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 5a7fe42b60..7691cd85f6 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2930,7 +2930,8 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); - ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); if (db->db_state == DB_CACHED && zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { @@ -2967,6 +2968,15 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; + } else if (db->db_state == DB_NOFILL) { + /* + * We will be completely replacing the cloned block. In case + * it was cloned in this transaction group, let's undirty the + * pending clone and mark the block as uncached. This will be + * as if the clone was never done. + */ + VERIFY(!dbuf_undirty(db, tx)); + db->db_state = DB_UNCACHED; } ASSERT(db->db_buf == NULL); dbuf_set_data(db, buf); From 9c40ae02199668d1e9a07d14c4ea713b2c5e584e Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 12 Dec 2023 15:59:24 -0500 Subject: [PATCH 23/31] dbuf: Set dr_data when unoverriding after clone Block cloning normally creates dirty record without dr_data. But if the block is read after cloning, it is moved into DB_CACHED state and receives the data buffer. If after that we call dbuf_unoverride() to convert the dirty record into normal write, we should give it the data buffer from dbuf and release one. Reviewed-by: Kay Pedersen Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15654 Closes #15656 --- module/zfs/dbuf.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 7691cd85f6..e4c59b5934 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1904,7 +1904,6 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; - boolean_t release; ASSERT(MUTEX_HELD(&db->db_mtx)); /* @@ -1925,7 +1924,10 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); - release = !dr->dt.dl.dr_brtwrite; + if (dr->dt.dl.dr_brtwrite) { + ASSERT0(dr->dt.dl.dr_data); + dr->dt.dl.dr_data = db->db_buf; + } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; dr->dt.dl.dr_brtwrite = B_FALSE; @@ -1939,7 +1941,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ - if (release) + if (dr->dt.dl.dr_data) arc_release(dr->dt.dl.dr_data, db); } From f71c16a66126b7a1896b3b0c59a42a7b9186e56b Mon Sep 17 00:00:00 2001 From: chrisperedun <126915832+chrisperedun@users.noreply.github.com> Date: Thu, 21 Dec 2023 14:12:30 -0500 Subject: [PATCH 24/31] Don't panic on unencrypted block in encrypted dataset While 763ca47 closes the situation of block cloning creating unencrypted records in encrypted datasets, existing data still causes panic on read. Setting zfs_recover bypasses this but at the cost of potentially ignoring more serious issues. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Chris Peredun Closes #15677 --- module/zfs/dbuf.c | 2 -- module/zfs/dmu_send.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index e4c59b5934..255add6cd2 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1619,8 +1619,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, */ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) { spa_log_error(db->db_objset->os_spa, &zb, &bpp->blk_birth); - zfs_panic_recover("unencrypted block in encrypted " - "object set %llu", dmu_objset_id(db->db_objset)); err = SET_ERROR(EIO); goto early_unlock; } diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 2d37ed2cdf..37c68528bf 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1124,8 +1124,6 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (sta->os->os_encrypted && !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { spa_log_error(spa, zb, &bp->blk_birth); - zfs_panic_recover("unencrypted block in encrypted " - "object set %llu", dmu_objset_id(sta->os)); return (SET_ERROR(EIO)); } From 976bf9b6a61919638d42ed79cd207132785d128a Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Tue, 9 Jan 2024 08:05:24 +0800 Subject: [PATCH 25/31] Linux 6.2 compat: add check for kernel_neon_* availability This patch adds check for `kernel_neon_*` symbols on arm and arm64 platforms to address the following issues: 1. Linux 6.2+ on arm64 has exported them with `EXPORT_SYMBOL_GPL`, so license compatibility must be checked before use. 2. On both arm and arm64, the definitions of these symbols are guarded by `CONFIG_KERNEL_MODE_NEON`, but their declarations are still present. Checking in configuration phase only leads to MODPOST errors (undefined references). 
Reviewed-by: Brian Behlendorf Signed-off-by: Shengqi Chen Closes #15711 Closes #14555 Closes #15401 --- config/kernel-fpu.m4 | 23 +++++++++++++++++--- include/os/linux/kernel/linux/simd_aarch64.h | 6 +++++ include/os/linux/kernel/linux/simd_arm.h | 6 +++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4 index c6efebd8cf..edfde1a02d 100644 --- a/config/kernel-fpu.m4 +++ b/config/kernel-fpu.m4 @@ -79,6 +79,12 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_FPU], [ __kernel_fpu_end(); ], [], [ZFS_META_LICENSE]) + ZFS_LINUX_TEST_SRC([kernel_neon], [ + #include + ], [ + kernel_neon_begin(); + kernel_neon_end(); + ], [], [ZFS_META_LICENSE]) ]) AC_DEFUN([ZFS_AC_KERNEL_FPU], [ @@ -105,9 +111,20 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [ AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions]) ],[ - AC_MSG_RESULT(internal) - AC_DEFINE(HAVE_KERNEL_FPU_INTERNAL, 1, - [kernel fpu internal]) + dnl # + dnl # ARM neon symbols (only on arm and arm64) + dnl # could be GPL-only on arm64 after Linux 6.2 + dnl # + ZFS_LINUX_TEST_RESULT([kernel_neon_license],[ + AC_MSG_RESULT(kernel_neon_*) + AC_DEFINE(HAVE_KERNEL_NEON, 1, + [kernel has kernel_neon_* functions]) + ],[ + # catch-all + AC_MSG_RESULT(internal) + AC_DEFINE(HAVE_KERNEL_FPU_INTERNAL, 1, + [kernel fpu internal]) + ]) ]) ]) ]) diff --git a/include/os/linux/kernel/linux/simd_aarch64.h b/include/os/linux/kernel/linux/simd_aarch64.h index 16276b08c7..123a0c72bc 100644 --- a/include/os/linux/kernel/linux/simd_aarch64.h +++ b/include/os/linux/kernel/linux/simd_aarch64.h @@ -71,9 +71,15 @@ #define ID_AA64PFR0_EL1 sys_reg(3, 0, 0, 1, 0) #define ID_AA64ISAR0_EL1 sys_reg(3, 0, 0, 6, 0) +#if (defined(HAVE_KERNEL_NEON) && defined(CONFIG_KERNEL_MODE_NEON)) #define kfpu_allowed() 1 #define kfpu_begin() kernel_neon_begin() #define kfpu_end() kernel_neon_end() +#else +#define kfpu_allowed() 0 +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#endif #define kfpu_init() (0) #define kfpu_fini() do {} while (0) diff --git a/include/os/linux/kernel/linux/simd_arm.h b/include/os/linux/kernel/linux/simd_arm.h index c432a6d4ab..bc70eaef30 100644 --- a/include/os/linux/kernel/linux/simd_arm.h +++ b/include/os/linux/kernel/linux/simd_arm.h @@ -53,9 +53,15 @@ #include #include +#if (defined(HAVE_KERNEL_NEON) && defined(CONFIG_KERNEL_MODE_NEON)) #define kfpu_allowed() 1 #define kfpu_begin() kernel_neon_begin() #define kfpu_end() kernel_neon_end() +#else +#define kfpu_allowed() 0 +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#endif #define kfpu_init() (0) #define kfpu_fini() do {} while (0) From 152a775eac59e026100835cb213ccafa3a163ad6 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 9 Jan 2024 12:46:43 -0500 Subject: [PATCH 26/31] Improve block sizes checks during cloning - Fail if source block is smaller than destination. We can only grow blocks, not shrink them. - Fail if we do not have full znode range lock. In that case grow is not even called. We should improve zfs_rangelock_cb() somehow to know when cloning needs to grow the block size unlike write. - Fail if we tried to resize, but failed. There are many reasons for it to fail that we can not predict at this level, so be ready for them. Unlike write, which may proceed after growth failure, block cloning can't and must return an error. This fixes an assertion inside dmu_brt_clone() when it sees a different number of blocks held in the destination than it got block pointers for.
Builds without ZFS_DEBUG returned EXDEV, so are not affected much. Reviewed-by: Pawel Jakub Dawidek Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15724 Closes #15735 --- module/zfs/zfs_vnops.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 17e990451e..812e42f645 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1192,11 +1192,18 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, inblksz = inzp->z_blksz; /* - * We cannot clone into files with different block size if we can't - * grow it (block size is already bigger or more than one block). + * We cannot clone into a file with different block size if we can't + * grow it (block size is already bigger, has more than one block, or + * not locked for growth). There are other possible reasons for the + * grow to fail, but we cover what we can before opening transaction + * and the rest detect after we try to do it. */ + if (inblksz < outzp->z_blksz) { + error = SET_ERROR(EINVAL); + goto unlock; + } if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz || - outzp->z_size > inblksz)) { + outlr->lr_length != UINT64_MAX)) { error = SET_ERROR(EINVAL); goto unlock; } @@ -1315,12 +1322,24 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, } /* - * Copy source znode's block size. This only happens on the - * first iteration since zfs_rangelock_reduce() will shrink down - * lr_len to the appropriate size. + * Copy source znode's block size. This is done only if the + * whole znode is locked (see zfs_rangelock_cb()) and only + * on the first iteration since zfs_rangelock_reduce() will + * shrink down lr_length to the appropriate size. */ if (outlr->lr_length == UINT64_MAX) { zfs_grow_blocksize(outzp, inblksz, tx); + + /* + * Block growth may fail for many reasons we can not + * predict here. If it happen the cloning is doomed. + */ + if (inblksz != outzp->z_blksz) { + error = SET_ERROR(EINVAL); + dmu_tx_abort(tx); + break; + } + /* * Round range lock up to the block boundary, so we * prevent appends until we are done. From ac592318b83a7d1cddb47f68d5b77789865f9768 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 9 Jan 2024 12:48:40 -0500 Subject: [PATCH 27/31] Fix livelist assertions for dedup and cloning Two block pointers in livelist pointing to the same location may be caused not only by dedup, but also by block cloning. We should not assert D bit set in them. Two block pointers in livelist pointing to the same location may have different logical birth time in case of dedup or cloning. We should assert identical physical birth time instead. Assert identical physical block size between pointers in addition to checksum, since that is what checksums are calculated on. Reviewed-by: Matthew Ahrens Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15732 --- module/zfs/dsl_deadlist.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 47c234f76c..ac30a37081 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -1000,8 +1000,6 @@ livelist_compare(const void *larg, const void *rarg) /* if vdevs are equal, sort by offsets. 
*/ uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); - if (l_dva0_offset == r_dva0_offset) - ASSERT3U(l->blk_birth, ==, r->blk_birth); return (TREE_CMP(l_dva0_offset, r_dva0_offset)); } @@ -1016,9 +1014,9 @@ struct livelist_iter_arg { * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a * corresponding FREE are stored in the supplied bplist. * - * Note that multiple FREE and ALLOC entries for the same blkptr may - * be encountered when dedup is involved. For this reason we keep a - * refcount for all the FREE entries of each blkptr and ensure that + * Note that multiple FREE and ALLOC entries for the same blkptr may be + * encountered when dedup or block cloning is involved. For this reason we + * keep a refcount for all the FREE entries of each blkptr and ensure that * each of those FREE entries has a corresponding ALLOC preceding it. */ static int @@ -1037,6 +1035,13 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, livelist_entry_t node; node.le_bp = *bp; livelist_entry_t *found = avl_find(avl, &node, NULL); + if (found) { + ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp)); + ASSERT3U(BP_GET_CHECKSUM(bp), ==, + BP_GET_CHECKSUM(&found->le_bp)); + ASSERT3U(BP_PHYSICAL_BIRTH(bp), ==, + BP_PHYSICAL_BIRTH(&found->le_bp)); + } if (bp_freed) { if (found == NULL) { /* first free entry for this blkptr */ @@ -1046,10 +1051,10 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, e->le_refcnt = 1; avl_add(avl, e); } else { - /* dedup block free */ - ASSERT(BP_GET_DEDUP(bp)); - ASSERT3U(BP_GET_CHECKSUM(bp), ==, - BP_GET_CHECKSUM(&found->le_bp)); + /* + * Deduped or cloned block free. We could assert D bit + * for dedup, but there is no such one for cloning. + */ ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt); found->le_refcnt++; } @@ -1065,14 +1070,6 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, /* all tracked free pairs have been matched */ avl_remove(avl, found); kmem_free(found, sizeof (livelist_entry_t)); - } else { - /* - * This is definitely a deduped blkptr so - * let's validate it. - */ - ASSERT(BP_GET_DEDUP(bp)); - ASSERT3U(BP_GET_CHECKSUM(bp), ==, - BP_GET_CHECKSUM(&found->le_bp)); } } } From 3bd23fd78dce7b5e3b76cce5a210a2977e1e05a8 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 11 Jan 2024 19:43:38 +1100 Subject: [PATCH 28/31] freebsd: fix compile for spa_taskq_read/spa_taskq_write params Missed in #15695, backporting #15675. Signed-off-by: Rob Norris --- include/os/freebsd/spl/sys/mod_os.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index 77ce75ca3f..150e50380d 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -91,6 +91,12 @@ #define param_set_max_auto_ashift_args(var) \ CTLTYPE_UINT, NULL, 0, param_set_max_auto_ashift, "IU" +#define spa_taskq_read_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, spa_taskq_read_param, "A" + +#define spa_taskq_write_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, spa_taskq_write_param, "A" + #define fletcher_4_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A" From 9181e94f0b24e3459e1e9b6b2b096ff40b9461eb Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 29 Dec 2023 10:22:58 -0500 Subject: [PATCH 29/31] spa: Fix FreeBSD sysctl handlers sbuf_cpy() resets the sbuf state, which is wrong for sbufs allocated by sbuf_new_for_sysctl(). 
In particular, this code triggers an assertion failure in sbuf_clear(). Simplify by just using sysctl_handle_string() for both reading and setting the tunable. Fixes: 6930ecbb7 ("spa: make read/write queues configurable") Reviewed-by: Rob Norris Reviewed-by: Alexander Motin Reported-by: Peter Holm Signed-off-by: Mark Johnston Closes #15719 --- module/zfs/spa.c | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 32a5852921..739e2cb7c2 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1398,8 +1398,6 @@ spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf)); } #else -#include - /* * On FreeBSD load-time parameters can be set up before malloc() is available, * so we have to do all the parsing work on the stack. @@ -1410,19 +1408,11 @@ static int spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) { char buf[SPA_TASKQ_PARAM_MAX]; - int err = 0; - - if (req->newptr == NULL) { - int len = spa_taskq_param_get(ZIO_TYPE_READ, buf); - struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); - sbuf_cpy(s, buf); - err = sbuf_finish(s); - sbuf_delete(s); - return (err); - } + int err; + (void) spa_taskq_param_get(ZIO_TYPE_READ, buf); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); - if (err) + if (err || req->newptr == NULL) return (err); return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); } @@ -1431,19 +1421,11 @@ static int spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) { char buf[SPA_TASKQ_PARAM_MAX]; - int err = 0; - - if (req->newptr == NULL) { - int len = spa_taskq_param_get(ZIO_TYPE_WRITE, buf); - struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); - sbuf_cpy(s, buf); - err = sbuf_finish(s); - sbuf_delete(s); - return (err); - } + int err; + (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); - if (err) + if (err || req->newptr == NULL) return (err); return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); } From a00231a3fc9909aa5ccf91af9c3a473665e9dea4 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 29 Dec 2023 12:56:35 -0500 Subject: [PATCH 30/31] spa: Let spa_taskq_param_get()'s addition of a newline be optional For FreeBSD sysctls, we don't want the extra newline, since the sysctl(8) utility will format strings appropriately. 
Reviewed-by: Rob Norris Reviewed-by: Alexander Motin Reported-by: Peter Holm Signed-off-by: Mark Johnston Closes #15719 --- module/zfs/spa.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 739e2cb7c2..d7fe96cde6 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1345,7 +1345,7 @@ spa_taskq_param_set(zio_type_t t, char *cfg) } static int -spa_taskq_param_get(zio_type_t t, char *buf) +spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) { int pos = 0; @@ -1363,7 +1363,8 @@ spa_taskq_param_get(zio_type_t t, char *buf) sep = " "; } - buf[pos++] = '\n'; + if (add_newline) + buf[pos++] = '\n'; buf[pos] = '\0'; return (pos); @@ -1381,7 +1382,7 @@ spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) static int spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) { - return (spa_taskq_param_get(ZIO_TYPE_READ, buf)); + return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE)); } static int @@ -1395,7 +1396,7 @@ spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) static int spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) { - return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf)); + return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); } #else /* @@ -1410,7 +1411,7 @@ spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) char buf[SPA_TASKQ_PARAM_MAX]; int err; - (void) spa_taskq_param_get(ZIO_TYPE_READ, buf); + (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (err || req->newptr == NULL) return (err); @@ -1423,7 +1424,7 @@ spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) char buf[SPA_TASKQ_PARAM_MAX]; int err; - (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf); + (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (err || req->newptr == NULL) return (err); From 9ecd112dc1dcdb903ebd3f1971cb1256094e73c4 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Thu, 7 Dec 2023 04:37:50 +0800 Subject: [PATCH 31/31] compact: workaround for GPL-only symbols on riscv from Linux 6.2 Since Linux 6.2, the implementation of flush_dcache_page on riscv references GPL-only symbol `PageHuge`, breaking the build of zfs. This patch uses existing mechanism to override flush_dcache_page, removing the call to `PageHuge`. According to comments in kernel, it is only used to do some check against HugeTLB pages, which only exist in userspace. ZFS uses flush_dcache_page only on kernel pages, thus this patch will not introduce any behaviour change. 
See also: torvalds/linux@d33deda, openzfs/zfs@589f59b Reviewed-by: Brian Behlendorf Signed-off-by: Shengqi Chen Closes #14974 Closes #15627 --- config/kernel-flush_dcache_page.m4 | 5 +++-- config/kernel.m4 | 6 ++++++ include/os/linux/kernel/linux/dcache_compat.h | 15 +++++++++++++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/config/kernel-flush_dcache_page.m4 b/config/kernel-flush_dcache_page.m4 index 2340c386ef..aa916c87d5 100644 --- a/config/kernel-flush_dcache_page.m4 +++ b/config/kernel-flush_dcache_page.m4 @@ -1,7 +1,8 @@ dnl # dnl # Starting from Linux 5.13, flush_dcache_page() becomes an inline -dnl # function and may indirectly referencing GPL-only cpu_feature_keys on -dnl # powerpc +dnl # function and may indirectly referencing GPL-only symbols: +dnl # on powerpc: cpu_feature_keys +dnl # on riscv: PageHuge (added from 6.2) dnl # dnl # diff --git a/config/kernel.m4 b/config/kernel.m4 index 056517a841..d25b65994f 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -168,6 +168,9 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE ;; + riscv*) + ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE + ;; esac AC_MSG_CHECKING([for available kernel interfaces]) @@ -310,6 +313,9 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_CPU_HAS_FEATURE ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE ;; + riscv*) + ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE + ;; esac ]) diff --git a/include/os/linux/kernel/linux/dcache_compat.h b/include/os/linux/kernel/linux/dcache_compat.h index 1e35204932..ab1711b99f 100644 --- a/include/os/linux/kernel/linux/dcache_compat.h +++ b/include/os/linux/kernel/linux/dcache_compat.h @@ -42,8 +42,8 @@ /* * Starting from Linux 5.13, flush_dcache_page() becomes an inline function * and under some configurations, may indirectly referencing GPL-only - * cpu_feature_keys on powerpc. Override this function when it is detected - * being GPL-only. + * symbols, e.g., cpu_feature_keys on powerpc and PageHuge on riscv. + * Override this function when it is detected being GPL-only. */ #if defined __powerpc__ && defined HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY #include @@ -53,6 +53,17 @@ clear_bit(PG_dcache_clean, &(page)->flags); \ } while (0) #endif +/* + * For riscv implementation, the use of PageHuge can be safely removed. + * Because it handles pages allocated by HugeTLB, while flush_dcache_page + * in zfs module is only called on kernel pages. + */ +#if defined __riscv && defined HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY +#define flush_dcache_page(page) do { \ + if (test_bit(PG_dcache_clean, &(page)->flags)) \ + clear_bit(PG_dcache_clean, &(page)->flags); \ + } while (0) +#endif /* * 2.6.30 API change,