diff --git a/config/kernel-vfs-file_range.m4 b/config/kernel-vfs-file_range.m4
new file mode 100644
index 0000000000..cc96404d8b
--- /dev/null
+++ b/config/kernel-vfs-file_range.m4
@@ -0,0 +1,164 @@
+dnl #
+dnl # The *_file_range APIs have a long history:
+dnl #
+dnl # 2.6.29: BTRFS_IOC_CLONE and BTRFS_IOC_CLONE_RANGE ioctl introduced
+dnl # 3.12: BTRFS_IOC_FILE_EXTENT_SAME ioctl introduced
+dnl #
+dnl # 4.5: copy_file_range() syscall introduced, added to VFS
+dnl # 4.5: BTRFS_IOC_CLONE and BTRFS_IOC_CLONE_RANGE renamed to FICLONE and
+dnl #      FICLONERANGE, added to VFS as clone_file_range()
+dnl # 4.5: BTRFS_IOC_FILE_EXTENT_SAME renamed to FIDEDUPERANGE, added to VFS
+dnl #      as dedupe_file_range()
+dnl #
+dnl # 4.20: VFS clone_file_range() and dedupe_file_range() replaced by
+dnl #       remap_file_range()
+dnl #
+dnl # 5.3: VFS copy_file_range() expected to do its own fallback,
+dnl #      generic_copy_file_range() added to support it
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [
+	ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [
+		#include <linux/fs.h>
+
+		static ssize_t test_copy_file_range(struct file *src_file,
+		    loff_t src_off, struct file *dst_file, loff_t dst_off,
+		    size_t len, unsigned int flags) {
+			(void) src_file; (void) src_off;
+			(void) dst_file; (void) dst_off;
+			(void) len; (void) flags;
+			return (0);
+		}
+
+		static const struct file_operations
+		    fops __attribute__ ((unused)) = {
+			.copy_file_range	= test_copy_file_range,
+		};
+	],[])
+])
+AC_DEFUN([ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE], [
+	AC_MSG_CHECKING([whether fops->copy_file_range() is available])
+	ZFS_LINUX_TEST_RESULT([vfs_copy_file_range], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_VFS_COPY_FILE_RANGE, 1,
+		    [fops->copy_file_range() is available])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE], [
+	ZFS_LINUX_TEST_SRC([generic_copy_file_range], [
+		#include <linux/fs.h>
+	], [
+		struct file *src_file __attribute__ ((unused)) = NULL;
+		loff_t src_off __attribute__ ((unused)) = 0;
+		struct file *dst_file __attribute__ ((unused)) = NULL;
+		loff_t dst_off __attribute__ ((unused)) = 0;
+		size_t len __attribute__ ((unused)) = 0;
+		unsigned int flags __attribute__ ((unused)) = 0;
+		generic_copy_file_range(src_file, src_off, dst_file, dst_off,
+		    len, flags);
+	])
+])
+AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [
+	AC_MSG_CHECKING([whether generic_copy_file_range() is available])
+	ZFS_LINUX_TEST_RESULT_SYMBOL([generic_copy_file_range],
+	[generic_copy_file_range], [fs/read_write.c], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_VFS_GENERIC_COPY_FILE_RANGE, 1,
+		    [generic_copy_file_range() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [
+	ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [
+		#include <linux/fs.h>
+
+		static int test_clone_file_range(struct file *src_file,
+		    loff_t src_off, struct file *dst_file, loff_t dst_off,
+		    u64 len) {
+			(void) src_file; (void) src_off;
+			(void) dst_file; (void) dst_off;
+			(void) len;
+			return (0);
+		}
+
+		static const struct file_operations
+		    fops __attribute__ ((unused)) = {
+			.clone_file_range	= test_clone_file_range,
+		};
+	],[])
+])
+AC_DEFUN([ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE], [
+	AC_MSG_CHECKING([whether fops->clone_file_range() is available])
+	ZFS_LINUX_TEST_RESULT([vfs_clone_file_range], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_VFS_CLONE_FILE_RANGE, 1,
+		    [fops->clone_file_range() is available])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE], [
+	ZFS_LINUX_TEST_SRC([vfs_dedupe_file_range], [
+		#include <linux/fs.h>
+
+		static int test_dedupe_file_range(struct file *src_file,
+		    loff_t src_off, struct file *dst_file, loff_t dst_off,
+		    u64 len) {
+			(void) src_file; (void) src_off;
+			(void) dst_file; (void) dst_off;
+			(void) len;
+			return (0);
+		}
+
+		static const struct file_operations
+		    fops __attribute__ ((unused)) = {
+			.dedupe_file_range	= test_dedupe_file_range,
+		};
+	],[])
+])
+AC_DEFUN([ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE], [
+	AC_MSG_CHECKING([whether fops->dedupe_file_range() is available])
+	ZFS_LINUX_TEST_RESULT([vfs_dedupe_file_range], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_VFS_DEDUPE_FILE_RANGE, 1,
+		    [fops->dedupe_file_range() is available])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE], [
+	ZFS_LINUX_TEST_SRC([vfs_remap_file_range], [
+		#include <linux/fs.h>
+
+		static loff_t test_remap_file_range(struct file *src_file,
+		    loff_t src_off, struct file *dst_file, loff_t dst_off,
+		    loff_t len, unsigned int flags) {
+			(void) src_file; (void) src_off;
+			(void) dst_file; (void) dst_off;
+			(void) len; (void) flags;
+			return (0);
+		}
+
+		static const struct file_operations
+		    fops __attribute__ ((unused)) = {
+			.remap_file_range	= test_remap_file_range,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE], [
+	AC_MSG_CHECKING([whether fops->remap_file_range() is available])
+	ZFS_LINUX_TEST_RESULT([vfs_remap_file_range], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_VFS_REMAP_FILE_RANGE, 1,
+		    [fops->remap_file_range() is available])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
diff --git a/config/kernel.m4 b/config/kernel.m4
index cb7e736c9a..b17ccfdeec 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -116,6 +116,11 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE
 	ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS
 	ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
+	ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE
+	ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
+	ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE
+	ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE
+	ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS
 	ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE
 	ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN
@@ -249,6 +254,11 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_VFS_RW_ITERATE
 	ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS
 	ZFS_AC_KERNEL_VFS_IOV_ITER
+	ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE
+	ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
+	ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE
+	ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE
+	ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE
 	ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS
 	ZFS_AC_KERNEL_FOLLOW_DOWN_ONE
 	ZFS_AC_KERNEL_MAKE_REQUEST_FN
diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h
index 2b302e9dab..8b0e79afb0 100644
--- a/include/os/linux/zfs/sys/zpl.h
+++ b/include/os/linux/zfs/sys/zpl.h
@@ -180,6 +180,20 @@ zpl_dir_emit_dots(struct file *file, zpl_dir_context_t *ctx)
 }
 #endif /* HAVE_VFS_ITERATE */
 
+
+/* zpl_file_range.c */
+
+/* handlers for file_operations of the same name */
+extern ssize_t zpl_copy_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags);
+extern loff_t zpl_remap_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags);
+extern int zpl_clone_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, uint64_t len);
+extern int zpl_dedupe_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, uint64_t len);
+
+
 #if defined(HAVE_INODE_TIMESTAMP_TRUNCATE)
 #define zpl_inode_timestamp_truncate(ts, ip) timestamp_truncate(ts, ip)
 #elif defined(HAVE_INODE_TIMESPEC64_TIMES)
diff --git a/module/Kbuild.in b/module/Kbuild.in
index 485331ac65..c132171592 100644
--- a/module/Kbuild.in
+++ b/module/Kbuild.in
@@ -461,6 +461,7 @@ ZFS_OBJS_OS := \
 	zpl_ctldir.o \
 	zpl_export.o \
 	zpl_file.o \
+	zpl_file_range.o \
 	zpl_inode.o \
 	zpl_super.o \
 	zpl_xattr.o \
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index e690525d3c..92b603e98a 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -1283,7 +1283,6 @@ zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 }
 #endif /* CONFIG_COMPAT */
 
-
 const struct address_space_operations zpl_address_space_operations = {
 #ifdef HAVE_VFS_READPAGES
 	.readpages	= zpl_readpages,
@@ -1333,6 +1332,18 @@ const struct file_operations zpl_file_operations = {
 	.aio_fsync	= zpl_aio_fsync,
 #endif
 	.fallocate	= zpl_fallocate,
+#ifdef HAVE_VFS_COPY_FILE_RANGE
+	.copy_file_range	= zpl_copy_file_range,
+#endif
+#ifdef HAVE_VFS_REMAP_FILE_RANGE
+	.remap_file_range	= zpl_remap_file_range,
+#endif
+#ifdef HAVE_VFS_CLONE_FILE_RANGE
+	.clone_file_range	= zpl_clone_file_range,
+#endif
+#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
+	.dedupe_file_range	= zpl_dedupe_file_range,
+#endif
 #ifdef HAVE_FILE_FADVISE
 	.fadvise	= zpl_fadvise,
 #endif
diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c
new file mode 100644
index 0000000000..db387a7481
--- /dev/null
+++ b/module/os/linux/zfs/zpl_file_range.c
@@ -0,0 +1,197 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2023, Klara Inc.
+ */
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+#endif
+#include <linux/fs.h>
+#include <sys/file.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfeature.h>
+
+/*
+ * Clone part of a file via block cloning.
+ *
+ * Note that we are not required to update file offsets; the kernel will take
+ * care of that depending on how it was called.
+ */
+static ssize_t
+__zpl_clone_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, size_t len)
+{
+	struct inode *src_i = file_inode(src_file);
+	struct inode *dst_i = file_inode(dst_file);
+	uint64_t src_off_o = (uint64_t)src_off;
+	uint64_t dst_off_o = (uint64_t)dst_off;
+	uint64_t len_o = (uint64_t)len;
+	cred_t *cr = CRED();
+	fstrans_cookie_t cookie;
+	int err;
+
+	if (!spa_feature_is_enabled(
+	    dmu_objset_spa(ITOZSB(dst_i)->z_os), SPA_FEATURE_BLOCK_CLONING))
+		return (-EOPNOTSUPP);
+
+	if (src_i != dst_i)
+		spl_inode_lock_shared(src_i);
+	spl_inode_lock(dst_i);
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+
+	err = -zfs_clone_range(ITOZ(src_i), &src_off_o, ITOZ(dst_i),
+	    &dst_off_o, &len_o, cr);
+
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+
+	spl_inode_unlock(dst_i);
+	if (src_i != dst_i)
+		spl_inode_unlock_shared(src_i);
+
+	if (err < 0)
+		return (err);
+
+	return ((ssize_t)len_o);
+}
+
+#ifdef HAVE_VFS_COPY_FILE_RANGE
+/*
+ * Entry point for copy_file_range(). Copy len bytes from src_off in src_file
+ * to dst_off in dst_file. We are permitted to do this however we like, so we
+ * try to just clone the blocks, and if we can't support it, fall back to the
+ * kernel's generic byte copy function.
+ */
+ssize_t
+zpl_copy_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags)
+{
+	ssize_t ret;
+
+	if (flags != 0)
+		return (-EINVAL);
+
+	/* Try to do it via zfs_clone_range() */
+	ret = __zpl_clone_file_range(src_file, src_off,
+	    dst_file, dst_off, len);
+
+#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
+	/*
+	 * Since Linux 5.3 the filesystem driver is responsible for executing
+	 * an appropriate fallback, and a generic fallback function is provided.
+	 */
+	if (ret == -EOPNOTSUPP || ret == -EXDEV)
+		ret = generic_copy_file_range(src_file, src_off, dst_file,
+		    dst_off, len, flags);
+#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */
+
+	return (ret);
+}
+#endif /* HAVE_VFS_COPY_FILE_RANGE */
+
+#ifdef HAVE_VFS_REMAP_FILE_RANGE
+/*
+ * Entry point for FICLONE/FICLONERANGE/FIDEDUPERANGE.
+ *
+ * FICLONE and FICLONERANGE are basically the same as copy_file_range(), except
+ * that they must clone - they cannot fall back to copying. FICLONE is exactly
+ * FICLONERANGE, for the entire file. We don't need to try to tell them apart;
+ * the kernel will sort that out for us.
+ *
+ * FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the
+ * range in both files and if they're the same, arrange for them to be backed
+ * by the same storage.
+ */
+loff_t
+zpl_remap_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags)
+{
+	if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN))
+		return (-EINVAL);
+
+	/*
+	 * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given
+	 * range if we want. It's designed for filesystems that make data past
+	 * EOF available, and don't want it to be visible in both files. ZFS
+	 * doesn't do that, so we just turn the flag off.
+	 */
+	flags &= ~REMAP_FILE_CAN_SHORTEN;
+
+	if (flags & REMAP_FILE_DEDUP)
+		/* No support for dedup yet */
+		return (-EOPNOTSUPP);
+
+	/* Zero length means to clone everything to the end of the file */
+	if (len == 0)
+		len = i_size_read(file_inode(src_file)) - src_off;
+
+	return (__zpl_clone_file_range(src_file, src_off,
+	    dst_file, dst_off, len));
+}
+#endif /* HAVE_VFS_REMAP_FILE_RANGE */
+
+#ifdef HAVE_VFS_CLONE_FILE_RANGE
+/*
+ * Entry point for FICLONE and FICLONERANGE, before Linux 4.20.
+ */
+int
+zpl_clone_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, uint64_t len)
+{
+	ssize_t ret;
+
+	/* Zero length means to clone everything to the end of the file */
+	if (len == 0)
+		len = i_size_read(file_inode(src_file)) - src_off;
+
+	/*
+	 * Before Linux 4.20 this hook reports success as 0, not as a byte
+	 * count, so the length returned by the helper must not leak out.
+	 */
+	ret = __zpl_clone_file_range(src_file, src_off,
+	    dst_file, dst_off, len);
+	if (ret < 0)
+		return ((int)ret);
+
+	/* This hook cannot report a short clone, so treat it as a failure. */
+	if ((uint64_t)ret != len)
+		return (-EINVAL);
+
+	return (0);
+}
+#endif /* HAVE_VFS_CLONE_FILE_RANGE */
+
+#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
+/*
+ * Entry point for FIDEDUPERANGE, before Linux 4.20.
+ */
+int
+zpl_dedupe_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, uint64_t len)
+{
+	/* No support for dedup yet */
+	return (-EOPNOTSUPP);
+}
+#endif /* HAVE_VFS_DEDUPE_FILE_RANGE */