From 5d12545da8c112aa813560950f39315956338963 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 27 Jun 2023 23:44:53 +1000
Subject: [PATCH] linux: implement filesystem-side copy/clone functions

This implements the Linux VFS ops required to service the file
copy/clone APIs:

  .copy_file_range    (4.5+)
  .clone_file_range   (4.5-4.19)
  .dedupe_file_range  (4.5-4.19)
  .remap_file_range   (4.20+)

Note that dedupe_file_range() and remap_file_range(REMAP_FILE_DEDUP) are
hooked up here, but are not implemented yet.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Kay Pedersen <mail@mkwg.de>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-By: OpenDrives Inc.
Sponsored-By: Klara Inc.
Closes #15050
---
 config/kernel-vfs-file_range.m4      | 164 ++++++++++++++++++++++++
 config/kernel.m4                     |  10 ++
 include/os/linux/zfs/sys/zpl.h       |  14 ++
 module/Kbuild.in                     |   1 +
 module/os/linux/zfs/zpl_file.c       |  13 +-
 module/os/linux/zfs/zpl_file_range.c | 183 +++++++++++++++++++++++++++
 6 files changed, 384 insertions(+), 1 deletion(-)
 create mode 100644 config/kernel-vfs-file_range.m4
 create mode 100644 module/os/linux/zfs/zpl_file_range.c

diff --git a/config/kernel-vfs-file_range.m4 b/config/kernel-vfs-file_range.m4
new file mode 100644
index 0000000000..cc96404d8b
--- /dev/null
+++ b/config/kernel-vfs-file_range.m4
@@ -0,0 +1,164 @@
+dnl #
+dnl # The *_file_range APIs have a long history:
+dnl #
+dnl # 2.6.29: BTRFS_IOC_CLONE and BTRFS_IOC_CLONE_RANGE ioctl introduced
+dnl # 3.12: BTRFS_IOC_FILE_EXTENT_SAME ioctl introduced
+dnl #
+dnl # 4.5: copy_file_range() syscall introduced, added to VFS
+dnl # 4.5: BTRFS_IOC_CLONE and BTRFS_IOC_CLONE_RANGE renamed to FICLONE and
+dnl #      FICLONERANGE, added to VFS as clone_file_range()
+dnl # 4.5: BTRFS_IOC_FILE_EXTENT_SAME renamed to FIDEDUPERANGE, added to VFS
+dnl #      as dedupe_file_range()
+dnl #
+dnl # 4.20: VFS clone_file_range() and dedupe_file_range() replaced by
+dnl #       remap_file_range()
+dnl #
+dnl # 5.3: VFS copy_file_range() expected to do its own fallback,
+dnl #      generic_copy_file_range() added to support it
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [
+	ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [
+		#include <linux/fs.h>
+		/* fops->copy_file_range() was added to the VFS in Linux 4.5 */
+		static ssize_t test_copy_file_range(struct file *src_file,
+		    loff_t src_off, struct file *dst_file, loff_t dst_off,
+		    size_t len, unsigned int flags) {
+			(void) src_file; (void) src_off;
+			(void) dst_file; (void) dst_off;
+			(void) len; (void) flags;
+			return (0);
+		}
+
+		static const struct file_operations
+		    fops __attribute__ ((unused)) = {
+			.copy_file_range	= test_copy_file_range,
+		};
+	],[])
+])
+AC_DEFUN([ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE], [
+	AC_MSG_CHECKING([whether fops->copy_file_range() is available])
+	ZFS_LINUX_TEST_RESULT([vfs_copy_file_range], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_VFS_COPY_FILE_RANGE, 1,
+		    [fops->copy_file_range() is available])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE], [
+	ZFS_LINUX_TEST_SRC([generic_copy_file_range], [
+		#include <linux/fs.h>
+	], [
+		struct file *src_file __attribute__ ((unused)) = NULL;
+		loff_t src_off __attribute__ ((unused)) = 0;
+		struct file *dst_file __attribute__ ((unused)) = NULL;
+		loff_t dst_off __attribute__ ((unused)) = 0;
+		size_t len __attribute__ ((unused)) = 0;
+		unsigned int flags __attribute__ ((unused)) = 0;
+		generic_copy_file_range(src_file, src_off, dst_file, dst_off,
+		    len, flags);
+	])
+])
+AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [
+	AC_MSG_CHECKING([whether generic_copy_file_range() is available])
+	ZFS_LINUX_TEST_RESULT_SYMBOL([generic_copy_file_range],
+	    [generic_copy_file_range], [fs/read_write.c], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_VFS_GENERIC_COPY_FILE_RANGE, 1,
+		    [generic_copy_file_range() is available])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [
+	ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [
+		#include <linux/fs.h>
+		/* fops->clone_file_range() exists in Linux 4.5 through 4.19 */
+		static int test_clone_file_range(struct file *src_file,
+		    loff_t src_off, struct file *dst_file, loff_t dst_off,
+		    u64 len) {
+			(void) src_file; (void) src_off;
+			(void) dst_file; (void) dst_off;
+			(void) len;
+			return (0);
+		}
+
+		static const struct file_operations
+		    fops __attribute__ ((unused)) = {
+			.clone_file_range	= test_clone_file_range,
+		};
+	],[])
+])
+AC_DEFUN([ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE], [
+	AC_MSG_CHECKING([whether fops->clone_file_range() is available])
+	ZFS_LINUX_TEST_RESULT([vfs_clone_file_range], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_VFS_CLONE_FILE_RANGE, 1,
+		    [fops->clone_file_range() is available])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE], [
+	ZFS_LINUX_TEST_SRC([vfs_dedupe_file_range], [
+		#include <linux/fs.h>
+		/* fops->dedupe_file_range() exists in Linux 4.5 through 4.19 */
+		static int test_dedupe_file_range(struct file *src_file,
+		    loff_t src_off, struct file *dst_file, loff_t dst_off,
+		    u64 len) {
+			(void) src_file; (void) src_off;
+			(void) dst_file; (void) dst_off;
+			(void) len;
+			return (0);
+		}
+
+		static const struct file_operations
+		    fops __attribute__ ((unused)) = {
+			.dedupe_file_range	= test_dedupe_file_range,
+		};
+	],[])
+])
+AC_DEFUN([ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE], [
+	AC_MSG_CHECKING([whether fops->dedupe_file_range() is available])
+	ZFS_LINUX_TEST_RESULT([vfs_dedupe_file_range], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_VFS_DEDUPE_FILE_RANGE, 1,
+		    [fops->dedupe_file_range() is available])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE], [
+	ZFS_LINUX_TEST_SRC([vfs_remap_file_range], [
+		#include <linux/fs.h>
+		/* fops->remap_file_range() was added to the VFS in Linux 4.20 */
+		static loff_t test_remap_file_range(struct file *src_file,
+		    loff_t src_off, struct file *dst_file, loff_t dst_off,
+		    loff_t len, unsigned int flags) {
+			(void) src_file; (void) src_off;
+			(void) dst_file; (void) dst_off;
+			(void) len; (void) flags;
+			return (0);
+		}
+
+		static const struct file_operations
+		    fops __attribute__ ((unused)) = {
+			.remap_file_range	= test_remap_file_range,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE], [
+	AC_MSG_CHECKING([whether fops->remap_file_range() is available])
+	ZFS_LINUX_TEST_RESULT([vfs_remap_file_range], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_VFS_REMAP_FILE_RANGE, 1,
+		    [fops->remap_file_range() is available])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
diff --git a/config/kernel.m4 b/config/kernel.m4
index cb7e736c9a..b17ccfdeec 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -116,6 +116,11 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE
 	ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS
 	ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
+	ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE
+	ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
+	ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE
+	ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE
+	ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS
 	ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE
 	ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN
@@ -249,6 +254,11 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_VFS_RW_ITERATE
 	ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS
 	ZFS_AC_KERNEL_VFS_IOV_ITER
+	ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE
+	ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
+	ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE
+	ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE
+	ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE
 	ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS
 	ZFS_AC_KERNEL_FOLLOW_DOWN_ONE
 	ZFS_AC_KERNEL_MAKE_REQUEST_FN
diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h
index 2b302e9dab..8b0e79afb0 100644
--- a/include/os/linux/zfs/sys/zpl.h
+++ b/include/os/linux/zfs/sys/zpl.h
@@ -180,6 +180,20 @@ zpl_dir_emit_dots(struct file *file, zpl_dir_context_t *ctx)
 }
 #endif /* HAVE_VFS_ITERATE */
 
+
+/* zpl_file_range.c */
+
+/* handlers for file_operations of the same name */
+extern ssize_t zpl_copy_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags);
+extern loff_t zpl_remap_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags);
+extern int zpl_clone_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, uint64_t len);
+extern int zpl_dedupe_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, uint64_t len);
+
+
 #if defined(HAVE_INODE_TIMESTAMP_TRUNCATE)
 #define	zpl_inode_timestamp_truncate(ts, ip)	timestamp_truncate(ts, ip)
 #elif defined(HAVE_INODE_TIMESPEC64_TIMES)
diff --git a/module/Kbuild.in b/module/Kbuild.in
index 485331ac65..c132171592 100644
--- a/module/Kbuild.in
+++ b/module/Kbuild.in
@@ -461,6 +461,7 @@ ZFS_OBJS_OS := \
 	zpl_ctldir.o \
 	zpl_export.o \
 	zpl_file.o \
+	zpl_file_range.o \
 	zpl_inode.o \
 	zpl_super.o \
 	zpl_xattr.o \
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index e690525d3c..92b603e98a 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -1283,7 +1283,6 @@ zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 }
 #endif /* CONFIG_COMPAT */
 
-
 const struct address_space_operations zpl_address_space_operations = {
 #ifdef HAVE_VFS_READPAGES
 	.readpages	= zpl_readpages,
@@ -1333,6 +1332,18 @@ const struct file_operations zpl_file_operations = {
 	.aio_fsync	= zpl_aio_fsync,
 #endif
 	.fallocate	= zpl_fallocate,
+#ifdef HAVE_VFS_COPY_FILE_RANGE
+	.copy_file_range	= zpl_copy_file_range,
+#endif
+#ifdef HAVE_VFS_REMAP_FILE_RANGE
+	.remap_file_range	= zpl_remap_file_range,
+#endif
+#ifdef HAVE_VFS_CLONE_FILE_RANGE
+	.clone_file_range	= zpl_clone_file_range,
+#endif
+#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
+	.dedupe_file_range	= zpl_dedupe_file_range,
+#endif
 #ifdef HAVE_FILE_FADVISE
 	.fadvise	= zpl_fadvise,
 #endif
diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c
new file mode 100644
index 0000000000..db387a7481
--- /dev/null
+++ b/module/os/linux/zfs/zpl_file_range.c
@@ -0,0 +1,183 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2023, Klara Inc.
+ */
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+#endif
+#include <linux/fs.h>
+#include <sys/file.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfeature.h>
+
+/*
+ * Shared worker for the copy/clone entry points: asks zfs_clone_range() to
+ * clone len bytes at src_off in src_file to dst_off in dst_file. Returns the
+ * length zfs_clone_range() reports back, or a negative errno. File offsets
+ * are not touched here; the kernel updates them as the calling path requires.
+ */
+static ssize_t
+__zpl_clone_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, size_t len)
+{
+	struct inode *src_ip = file_inode(src_file);
+	struct inode *dst_ip = file_inode(dst_file);
+	uint64_t inoff = (uint64_t)src_off;
+	uint64_t outoff = (uint64_t)dst_off;
+	uint64_t nbytes = (uint64_t)len;
+	cred_t *cr = CRED();
+	fstrans_cookie_t cookie;
+	int err;
+
+	/* Block cloning must be enabled for the destination dataset's SPA. */
+	if (!spa_feature_is_enabled(dmu_objset_spa(ITOZSB(dst_ip)->z_os),
+	    SPA_FEATURE_BLOCK_CLONING))
+		return (-EOPNOTSUPP);
+
+	/* Lock the source shared, unless it is also the destination. */
+	if (src_ip != dst_ip)
+		spl_inode_lock_shared(src_ip);
+	spl_inode_lock(dst_ip);
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+
+	/* Negate the result so that failures are negative errnos. */
+	err = -zfs_clone_range(ITOZ(src_ip), &inoff, ITOZ(dst_ip), &outoff,
+	    &nbytes, cr);
+
+	/* Release everything in the reverse order it was taken. */
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+	spl_inode_unlock(dst_ip);
+	if (src_ip != dst_ip)
+		spl_inode_unlock_shared(src_ip);
+
+	/* Hand back the length as zfs_clone_range() left it. */
+	return (err < 0 ? (ssize_t)err : (ssize_t)nbytes);
+}
+
+#ifdef HAVE_VFS_COPY_FILE_RANGE
+/*
+ * Entry point for copy_file_range(). Copy len bytes from src_off in src_file
+ * to dst_off in dst_file, returning the byte count or a negative errno. We
+ * may do this however we like, so we try to clone the blocks and, if that is
+ * not supported, arrange for a plain byte copy instead.
+ */
+ssize_t
+zpl_copy_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags)
+{
+	ssize_t ret;
+
+	if (flags != 0)
+		return (-EINVAL);
+
+	/* Try to do it via zfs_clone_range() */
+	ret = __zpl_clone_file_range(src_file, src_off,
+	    dst_file, dst_off, len);
+#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
+	/* Since Linux 5.3 we must run the fallback (a generic one exists). */
+	if (ret == -EOPNOTSUPP || ret == -EXDEV)
+		ret = generic_copy_file_range(src_file, src_off, dst_file,
+		    dst_off, len, flags);
+#else
+	/* Before 5.3 the kernel falls back itself, but only on -EOPNOTSUPP. */
+	if (ret == -EXDEV)
+		ret = -EOPNOTSUPP;
+#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */
+
+	return (ret);
+}
+#endif /* HAVE_VFS_COPY_FILE_RANGE */
+
+#ifdef HAVE_VFS_REMAP_FILE_RANGE
+/*
+ * Entry point for FICLONE/FICLONERANGE/FIDEDUPERANGE.
+ *
+ * FICLONE and FICLONERANGE work much like copy_file_range(), with one
+ * difference: they have to clone, and may not fall back to copying. FICLONE
+ * is just FICLONERANGE applied to the entire file. Telling the two apart is
+ * not our job; the kernel sorts that out for us.
+ *
+ * FIDEDUPERANGE turns a non-clone into a clone: the range is compared in
+ * both files and, if the contents match, both end up backed by the same
+ * storage. That is not supported yet and is refused with -EOPNOTSUPP.
+ */
+loff_t
+zpl_remap_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags)
+{
+	if ((flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)) != 0)
+		return (-EINVAL);
+
+	/* No support for dedup yet */
+	if ((flags & REMAP_FILE_DEDUP) != 0)
+		return (-EOPNOTSUPP);
+
+	/*
+	 * REMAP_FILE_CAN_SHORTEN tells us that cloning less than the given
+	 * range is acceptable. It's designed for filesystems that make data
+	 * past EOF available and don't want it visible in both files. ZFS
+	 * doesn't do that, so we just turn the flag off.
+	 */
+	flags &= ~REMAP_FILE_CAN_SHORTEN;
+
+	/* A length of zero selects everything from src_off to end of file */
+	if (len == 0)
+		len = i_size_read(file_inode(src_file)) - src_off;
+
+	return (__zpl_clone_file_range(src_file, src_off, dst_file, dst_off,
+	    len));
+}
+#endif /* HAVE_VFS_REMAP_FILE_RANGE */
+
+#ifdef HAVE_VFS_CLONE_FILE_RANGE
+/*
+ * Entry point for FICLONE and FICLONERANGE, before Linux 4.20.
+ */
+int
+zpl_clone_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, uint64_t len)
+{
+	/* Zero length means to clone everything to the end of the file */
+	ssize_t ret = __zpl_clone_file_range(src_file, src_off, dst_file,
+	    dst_off, len != 0 ? len :
+	    i_size_read(file_inode(src_file)) - src_off);
+	/* The VFS expects 0 on success from this op, not a byte count. */
+	return (ret < 0 ? (int)ret : 0);
+}
+#endif /* HAVE_VFS_CLONE_FILE_RANGE */
+
+#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
+/*
+ * Entry point for FIDEDUPERANGE, before Linux 4.20. There is no support for
+ * dedup yet, so every request is refused with -EOPNOTSUPP.
+ */
+int
+zpl_dedupe_file_range(struct file *src_file, loff_t src_off,
+    struct file *dst_file, loff_t dst_off, uint64_t len)
+{
+	return (-EOPNOTSUPP);
+}
+#endif /* HAVE_VFS_DEDUPE_FILE_RANGE */