From d93a3febba8155127cedab0a4d803183c1dbfb4e Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Tue, 24 Nov 2020 13:00:00 -0700 Subject: [PATCH 1/7] Adding Direct IO Support Adding O_DIRECT support to ZFS to bypass the ARC for writes/reads. O_DIRECT support in ZFS will always ensure there is coherency between buffered and O_DIRECT IO requests. This ensures that all IO requests, whether buffered or direct, will see the same file contents at all times. Just as in other FS's, O_DIRECT does not imply O_SYNC. While data is written directly to VDEV disks, metadata will not be synced until the associated TXG is synced. For both O_DIRECT read and write requests the offset and request sizes, at a minimum, must be PAGE_SIZE aligned. In the event they are not, then EINVAL is returned unless the direct property is set to always (see below). For O_DIRECT writes: The request also must be block aligned (recordsize) or the write request will take the normal (buffered) write path. In the event that the request is block aligned and a cached copy of the buffer exists in the ARC, then it will be discarded from the ARC forcing all further reads to retrieve the data from disk. For O_DIRECT reads: The only alignment restrictions are PAGE_SIZE alignment. In the event that the requested data is buffered (in the ARC) it will just be copied from the ARC into the user buffer. For both O_DIRECT writes and reads the O_DIRECT flag will be ignored in the event that file contents are mmap'ed. In this case, all requests that are at least PAGE_SIZE aligned will just fall back to the buffered paths. If the request however is not PAGE_SIZE aligned, EINVAL will be returned as always regardless of whether the file's contents are mmap'ed. 
Since O_DIRECT writes go through the normal ZIO pipeline, the following operations are supported just as with normal buffered writes: Checksum Compression Dedup Encryption Erasure Coding There is one caveat for the data integrity of O_DIRECT writes that is distinct for each of the OS's supported by ZFS. FreeBSD - FreeBSD is able to place user pages under write protection so any data in the user buffers and written directly down to the VDEV disks is guaranteed to not change. There is no concern with data integrity and O_DIRECT writes. Linux - Linux is not able to place anonymous user pages under write protection. Because of this, if the user decides to manipulate the page contents while the write operation is occurring, data integrity can not be guaranteed. However, there is a module parameter `zfs_vdev_direct_write_verify` that controls whether a checksum verify is run on an O_DIRECT write to a top-level VDEV before the contents of the I/O buffer are committed to disk. In the event of a checksum verification failure the write will return EIO. The number of O_DIRECT write checksum verification errors can be observed by doing `zpool status -d`, which will list all verification errors that have occurred on a top-level VDEV. Along with `zpool status`, a ZED event will be issued as `dio_verify` when a checksum verification error occurs. A new dataset property `direct` has been added with the following 3 allowable values: disabled - Accepts O_DIRECT flag, but silently ignores it and treats the request as a buffered IO request. standard - Follows the alignment restrictions outlined above for write/read IO requests when the O_DIRECT flag is used. always - Treats every write/read IO request as though it passed O_DIRECT and will do O_DIRECT if the alignment restrictions are met otherwise will redirect through the ARC. This property will not allow a request to fail. 
There is also a module parameter zfs_dio_enabled that can be used to force all reads and writes through the ARC. By setting this module parameter to 0, it mimics as if the direct dataset property is set to disabled. Signed-off-by: Brian Atkinson Co-authored-by: Mark Maybee Co-authored-by: Matt Macy Co-authored-by: Brian Behlendorf --- cmd/zpool/zpool_main.c | 30 +- cmd/ztest.c | 46 +- config/kernel-get-user-pages.m4 | 179 + config/kernel-vfs-direct_IO.m4 | 4 +- config/kernel-vfs-iov_iter.m4 | 49 + config/kernel.m4 | 4 + include/os/freebsd/spl/sys/mod_os.h | 3 + include/os/freebsd/spl/sys/mutex.h | 1 + include/os/freebsd/spl/sys/param.h | 1 + include/os/freebsd/spl/sys/uio.h | 31 +- include/os/freebsd/zfs/sys/abd_os.h | 9 + include/os/linux/kernel/linux/kmap_compat.h | 47 + include/os/linux/spl/sys/uio.h | 47 +- include/os/linux/zfs/sys/abd_os.h | 3 + include/os/linux/zfs/sys/zpl.h | 1 - include/sys/abd.h | 7 + include/sys/abd_impl.h | 13 +- include/sys/arc.h | 3 +- include/sys/dbuf.h | 63 +- include/sys/dmu.h | 17 +- include/sys/dmu_impl.h | 34 +- include/sys/dmu_objset.h | 1 + include/sys/fm/fs/zfs.h | 2 + include/sys/fs/zfs.h | 11 + include/sys/spa.h | 12 + include/sys/uio_impl.h | 37 + include/sys/vdev_impl.h | 11 + include/sys/zfs_racct.h | 7 +- include/sys/zfs_vnops.h | 9 + include/sys/zfs_znode.h | 2 +- include/sys/zio.h | 2 + include/sys/zio_impl.h | 7 +- lib/libspl/include/sys/uio.h | 26 + lib/libzfs/libzfs.abi | 6877 +++++------------ lib/libzpool/Makefile.am | 1 + lib/libzpool/abd_os.c | 64 + man/man4/zfs.4 | 25 + man/man7/zfsprops.7 | 38 + man/man8/zpool-events.8 | 14 +- man/man8/zpool-status.8 | 11 +- module/Kbuild.in | 2 + module/Makefile.bsd | 1 + module/os/freebsd/spl/spl_uio.c | 203 + module/os/freebsd/zfs/abd_os.c | 178 +- module/os/freebsd/zfs/sysctl_os.c | 29 + module/os/freebsd/zfs/zfs_racct.c | 8 +- module/os/freebsd/zfs/zfs_vnops_os.c | 143 +- module/os/freebsd/zfs/zvol_os.c | 1 + module/os/linux/zfs/abd_os.c | 214 +- 
module/os/linux/zfs/vdev_os.c | 49 + module/os/linux/zfs/zfs_racct.c | 29 +- module/os/linux/zfs/zfs_uio.c | 296 +- module/os/linux/zfs/zfs_vfsops.c | 1 + module/os/linux/zfs/zfs_vnops_os.c | 66 +- module/os/linux/zfs/zpl_file.c | 405 +- module/zcommon/zfs_prop.c | 11 + module/zfs/abd.c | 94 +- module/zfs/arc.c | 2 +- module/zfs/dataset_kstats.c | 6 +- module/zfs/dbuf.c | 375 +- module/zfs/dmu.c | 151 +- module/zfs/dmu_direct.c | 437 ++ module/zfs/dmu_objset.c | 19 + module/zfs/spa_stats.c | 46 + module/zfs/vdev.c | 28 + module/zfs/vdev_label.c | 4 + module/zfs/zfs_fm.c | 2 + module/zfs/zfs_log.c | 4 +- module/zfs/zfs_vnops.c | 380 +- module/zfs/zio.c | 118 +- tests/runfiles/common.run | 10 +- tests/runfiles/freebsd.run | 4 + tests/runfiles/linux.run | 4 + tests/zfs-tests/cmd/.gitignore | 1 + tests/zfs-tests/cmd/Makefile.am | 2 + tests/zfs-tests/cmd/manipulate_user_buffer.c | 260 + tests/zfs-tests/cmd/stride_dd.c | 238 +- tests/zfs-tests/include/commands.cfg | 1 + tests/zfs-tests/include/libtest.shlib | 12 + tests/zfs-tests/include/tunables.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 22 + .../tests/functional/cache/cache_012_pos.ksh | 2 +- .../compression/l2arc_compressed_arc.ksh | 2 +- .../compression/l2arc_encrypted.ksh | 2 +- .../l2arc_encrypted_no_compressed_arc.ksh | 2 +- .../tests/functional/direct/cleanup.ksh | 31 + .../zfs-tests/tests/functional/direct/dio.cfg | 26 + .../tests/functional/direct/dio.kshlib | 331 + .../functional/direct/dio_aligned_block.ksh | 116 + .../functional/direct/dio_async_always.ksh | 69 + .../direct/dio_async_fio_ioengines.ksh | 107 + .../functional/direct/dio_compression.ksh | 66 + .../tests/functional/direct/dio_dedup.ksh | 63 + .../functional/direct/dio_encryption.ksh | 64 + .../functional/direct/dio_grow_block.ksh | 87 + .../functional/direct/dio_max_recordsize.ksh | 72 + .../tests/functional/direct/dio_mixed.ksh | 108 + .../tests/functional/direct/dio_mmap.ksh | 93 + .../functional/direct/dio_overwrites.ksh | 71 + 
.../tests/functional/direct/dio_property.ksh | 126 + .../tests/functional/direct/dio_random.ksh | 83 + .../functional/direct/dio_recordsize.ksh | 76 + .../functional/direct/dio_unaligned_block.ksh | 79 + .../direct/dio_unaligned_filesize.ksh | 92 + .../direct/dio_write_stable_pages.ksh | 103 + .../functional/direct/dio_write_verify.ksh | 222 + .../tests/functional/direct/setup.ksh | 32 + tests/zfs-tests/tests/functional/io/setup.ksh | 2 +- .../tests/functional/l2arc/l2arc.cfg | 2 +- .../tests/functional/rsend/rsend.kshlib | 7 - .../functional/slog/slog_replay_fs_001.ksh | 12 + .../tests/functional/trim/trim_l2arc.ksh | 2 +- 112 files changed, 8420 insertions(+), 5336 deletions(-) create mode 100644 config/kernel-get-user-pages.m4 create mode 100644 module/os/linux/zfs/vdev_os.c create mode 100644 module/zfs/dmu_direct.c create mode 100644 tests/zfs-tests/cmd/manipulate_user_buffer.c create mode 100755 tests/zfs-tests/tests/functional/direct/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/direct/dio.cfg create mode 100644 tests/zfs-tests/tests/functional/direct/dio.kshlib create mode 100755 tests/zfs-tests/tests/functional/direct/dio_aligned_block.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_async_always.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_async_fio_ioengines.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_compression.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_dedup.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_encryption.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_grow_block.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_max_recordsize.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_mixed.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_mmap.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_overwrites.ksh create mode 100755 
tests/zfs-tests/tests/functional/direct/dio_property.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_random.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_recordsize.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh create mode 100755 tests/zfs-tests/tests/functional/direct/setup.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index ce859226c2..10e65fdc15 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -521,7 +521,7 @@ get_usage(zpool_help_t idx) return (gettext("\tstatus [--power] [-j [--json-int, " "--json-flat-vdevs, ...\n" "\t --json-pool-key-guid]] [-c [script1,script2,...]] " - "[-DegiLpPstvx] ...\n" + "[-dDegiLpPstvx] ...\n" "\t [-T d|u] [pool] [interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" @@ -2601,6 +2601,7 @@ typedef struct status_cbdata { boolean_t cb_print_unhealthy; boolean_t cb_print_status; boolean_t cb_print_slow_ios; + boolean_t cb_print_dio_verify; boolean_t cb_print_vdev_init; boolean_t cb_print_vdev_trim; vdev_cmd_data_list_t *vcdl; @@ -2878,7 +2879,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, uint_t c, i, vsc, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; - char rbuf[6], wbuf[6], cbuf[6]; + char rbuf[6], wbuf[6], cbuf[6], dbuf[6]; char *vname; uint64_t notpresent; spare_cbdata_t spare_cb; @@ -2996,6 +2997,17 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, printf(" %5s", "-"); } } + if (VDEV_STAT_VALID(vs_dio_verify_errors, vsc) && + cb->cb_print_dio_verify) { + zfs_nicenum(vs->vs_dio_verify_errors, dbuf, + sizeof (dbuf)); + + if (cb->cb_literal) + printf(" %5llu", + 
(u_longlong_t)vs->vs_dio_verify_errors); + else + printf(" %5s", dbuf); + } } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, @@ -10872,6 +10884,10 @@ status_callback(zpool_handle_t *zhp, void *data) printf_color(ANSI_BOLD, " %5s", gettext("POWER")); } + if (cbp->cb_print_dio_verify) { + printf_color(ANSI_BOLD, " %5s", gettext("DIO")); + } + if (cbp->vcdl != NULL) print_cmd_columns(cbp->vcdl, 0); @@ -10920,10 +10936,11 @@ status_callback(zpool_handle_t *zhp, void *data) } /* - * zpool status [-c [script1,script2,...]] [-DegiLpPstvx] [--power] [-T d|u] ... - * [pool] [interval [count]] + * zpool status [-c [script1,script2,...]] [-dDegiLpPstvx] [--power] ... + * [-T d|u] [pool] [interval [count]] * * -c CMD For each vdev, run command CMD + * -d Display Direct I/O write verify errors * -D Display dedup status (undocumented) * -e Display only unhealthy vdevs * -g Display guid for individual vdev name. @@ -10966,7 +10983,7 @@ zpool_do_status(int argc, char **argv) }; /* check options */ - while ((c = getopt_long(argc, argv, "c:jDegiLpPstT:vx", long_options, + while ((c = getopt_long(argc, argv, "c:jdDegiLpPstT:vx", long_options, NULL)) != -1) { switch (c) { case 'c': @@ -10993,6 +11010,9 @@ zpool_do_status(int argc, char **argv) } cmd = optarg; break; + case 'd': + cb.cb_print_dio_verify = B_TRUE; + break; case 'D': if (++cb.cb_dedup_stats > 2) cb.cb_dedup_stats = 2; diff --git a/cmd/ztest.c b/cmd/ztest.c index a7843d3388..1cf7efe3aa 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -2262,6 +2262,13 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) if (ztest_random(4) != 0) { int prefetch = ztest_random(2) ? DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; + + /* + * We will randomly set when to do O_DIRECT on a read. 
+ */ + if (ztest_random(4) == 0) + prefetch |= DMU_DIRECTIO; + ztest_block_tag_t rbt; VERIFY(dmu_read(os, lr->lr_foid, offset, @@ -2813,6 +2820,13 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) enum ztest_io_type io_type; uint64_t blocksize; void *data; + uint32_t dmu_read_flags = DMU_READ_NO_PREFETCH; + + /* + * We will randomly set when to do O_DIRECT on a read. + */ + if (ztest_random(4) == 0) + dmu_read_flags |= DMU_DIRECTIO; VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); blocksize = doi.doi_data_block_size; @@ -2878,7 +2892,7 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) (void) pthread_rwlock_unlock(&ztest_name_lock); VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, - DMU_READ_NO_PREFETCH)); + dmu_read_flags)); (void) ztest_write(zd, object, offset, blocksize, data); break; @@ -5045,6 +5059,13 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) uint64_t stride = 123456789ULL; uint64_t width = 40; int free_percent = 5; + uint32_t dmu_read_flags = DMU_READ_PREFETCH; + + /* + * We will randomly set when to do O_DIRECT on a read. + */ + if (ztest_random(4) == 0) + dmu_read_flags |= DMU_DIRECTIO; /* * This test uses two objects, packobj and bigobj, that are always @@ -5123,10 +5144,10 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) * Read the current contents of our objects. 
*/ error = dmu_read(os, packobj, packoff, packsize, packbuf, - DMU_READ_PREFETCH); + dmu_read_flags); ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, - DMU_READ_PREFETCH); + dmu_read_flags); ASSERT0(error); /* @@ -5244,9 +5265,9 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_read(os, packobj, packoff, - packsize, packcheck, DMU_READ_PREFETCH)); + packsize, packcheck, dmu_read_flags)); VERIFY0(dmu_read(os, bigobj, bigoff, - bigsize, bigcheck, DMU_READ_PREFETCH)); + bigsize, bigcheck, dmu_read_flags)); ASSERT0(memcmp(packbuf, packcheck, packsize)); ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); @@ -5336,6 +5357,13 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) dmu_buf_t *bonus_db; arc_buf_t **bigbuf_arcbufs; dmu_object_info_t doi; + uint32_t dmu_read_flags = DMU_READ_PREFETCH; + + /* + * We will randomly set when to do O_DIRECT on a read. + */ + if (ztest_random(4) == 0) + dmu_read_flags |= DMU_DIRECTIO; size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(size, UMEM_NOFAIL); @@ -5466,10 +5494,10 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) */ if (i != 0 || ztest_random(2) != 0) { error = dmu_read(os, packobj, packoff, - packsize, packbuf, DMU_READ_PREFETCH); + packsize, packbuf, dmu_read_flags); ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, - bigbuf, DMU_READ_PREFETCH); + bigbuf, dmu_read_flags); ASSERT0(error); } compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, @@ -5529,9 +5557,9 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_read(os, packobj, packoff, - packsize, packcheck, DMU_READ_PREFETCH)); + packsize, packcheck, dmu_read_flags)); VERIFY0(dmu_read(os, bigobj, bigoff, - bigsize, bigcheck, DMU_READ_PREFETCH)); + bigsize, bigcheck, dmu_read_flags)); ASSERT0(memcmp(packbuf, packcheck, packsize)); ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); diff --git 
a/config/kernel-get-user-pages.m4 b/config/kernel-get-user-pages.m4 new file mode 100644 index 0000000000..f9d02b66a1 --- /dev/null +++ b/config/kernel-get-user-pages.m4 @@ -0,0 +1,179 @@ +dnl # +dnl # get_user_pages_unlocked() function was not available till 4.0. +dnl # In earlier kernels (< 4.0) get_user_pages() is available(). +dnl # +dnl # 4.0 API change, +dnl # long get_user_pages_unlocked(struct task_struct *tsk, +dnl # struct mm_struct *mm, unsigned long start, unsigned long nr_pages, +dnl # int write, int force, struct page **pages) +dnl # +dnl # 4.8 API change, +dnl # long get_user_pages_unlocked(unsigned long start, +dnl # unsigned long nr_pages, int write, int force, struct page **page) +dnl # +dnl # 4.9 API change, +dnl # long get_user_pages_unlocked(usigned long start, int nr_pages, +dnl # struct page **pages, unsigned int gup_flags) +dnl # + +dnl# +dnl# Check available get_user_pages/_unlocked interfaces. +dnl# +AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_USER_PAGES], [ + ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_gup_flags], [ + #include + ], [ + unsigned long start = 0; + unsigned long nr_pages = 1; + unsigned int gup_flags = 0; + struct page **pages = NULL; + long ret __attribute__ ((unused)); + + ret = get_user_pages_unlocked(start, nr_pages, pages, + gup_flags); + ]) + + ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_write_flag], [ + #include + ], [ + unsigned long start = 0; + unsigned long nr_pages = 1; + int write = 0; + int force = 0; + long ret __attribute__ ((unused)); + struct page **pages = NULL; + + ret = get_user_pages_unlocked(start, nr_pages, write, force, + pages); + ]) + + ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_task_struct], [ + #include + ], [ + struct task_struct *tsk = NULL; + struct mm_struct *mm = NULL; + unsigned long start = 0; + unsigned long nr_pages = 1; + int write = 0; + int force = 0; + struct page **pages = NULL; + long ret __attribute__ ((unused)); + + ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, write, + force, 
pages); + ]) + + ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_task_struct_gup_flags], [ + #include + ], [ + struct task_struct *tsk = NULL; + struct mm_struct *mm = NULL; + unsigned long start = 0; + unsigned long nr_pages = 1; + struct page **pages = NULL; + unsigned int gup_flags = 0; + long ret __attribute__ ((unused)); + + ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, + pages, gup_flags); + ]) + + ZFS_LINUX_TEST_SRC([get_user_pages_task_struct], [ + #include + ], [ + struct task_struct *tsk = NULL; + struct mm_struct *mm = NULL; + struct vm_area_struct **vmas = NULL; + unsigned long start = 0; + unsigned long nr_pages = 1; + int write = 0; + int force = 0; + struct page **pages = NULL; + int ret __attribute__ ((unused)); + + ret = get_user_pages(tsk, mm, start, nr_pages, write, + force, pages, vmas); + ]) +]) + +dnl # +dnl # Supported get_user_pages/_unlocked interfaces checked newest to oldest. +dnl # We first check for get_user_pages_unlocked as that is available in +dnl # newer kernels. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_GET_USER_PAGES], [ + dnl # + dnl # Current API (as of 4.9) of get_user_pages_unlocked + dnl # + AC_MSG_CHECKING([whether get_user_pages_unlocked() takes gup flags]) + ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_gup_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS, 1, + [get_user_pages_unlocked() takes gup flags]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # 4.8 API change, get_user_pages_unlocked + dnl # + AC_MSG_CHECKING( + [whether get_user_pages_unlocked() takes write flag]) + ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_write_flag], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG, 1, + [get_user_pages_unlocked() takes write flag]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # 4.0-4.3, 4.5-4.7 API, get_user_pages_unlocked + dnl # + AC_MSG_CHECKING( + [whether get_user_pages_unlocked() takes task_struct]) + ZFS_LINUX_TEST_RESULT( + [get_user_pages_unlocked_task_struct], [ + AC_MSG_RESULT(yes) + AC_DEFINE( + HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT, 1, + [get_user_pages_unlocked() takes task_struct]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # 4.4 API, get_user_pages_unlocked + dnl # + AC_MSG_CHECKING( + [whether get_user_pages_unlocked() takes task_struct, gup_flags]) + ZFS_LINUX_TEST_RESULT( + [get_user_pages_unlocked_task_struct_gup_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE( + HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT_GUP_FLAGS, 1, + [get_user_pages_unlocked() takes task_struct, gup_flags]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # get_user_pages + dnl # + AC_MSG_CHECKING( + [whether get_user_pages() takes struct task_struct]) + ZFS_LINUX_TEST_RESULT( + [get_user_pages_task_struct], [ + AC_MSG_RESULT(yes) + AC_DEFINE( + HAVE_GET_USER_PAGES_TASK_STRUCT, 1, + [get_user_pages() takes task_struct]) + ], [ + dnl # + dnl # If we cannot map the user's + dnl # pages in then we cannot do + dnl # Direct I/O + dnl # + ZFS_LINUX_TEST_ERROR([Direct I/O]) + ]) + ]) + ]) + ]) + 
]) +]) diff --git a/config/kernel-vfs-direct_IO.m4 b/config/kernel-vfs-direct_IO.m4 index 7b7b91f979..715e824b7a 100644 --- a/config/kernel-vfs-direct_IO.m4 +++ b/config/kernel-vfs-direct_IO.m4 @@ -1,5 +1,5 @@ dnl # -dnl # Check for direct IO interfaces. +dnl # Check for Direct I/O interfaces. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO], [ ZFS_LINUX_TEST_SRC([direct_io_iter], [ @@ -100,7 +100,7 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_DIRECT_IO], [ AC_DEFINE(HAVE_VFS_DIRECT_IO_IOVEC, 1, [aops->direct_IO() uses iovec]) ],[ - ZFS_LINUX_TEST_ERROR([direct IO]) + ZFS_LINUX_TEST_ERROR([Direct I/O]) AC_MSG_RESULT([no]) ]) ]) diff --git a/config/kernel-vfs-iov_iter.m4 b/config/kernel-vfs-iov_iter.m4 index ff560ff3ee..e7ced02970 100644 --- a/config/kernel-vfs-iov_iter.m4 +++ b/config/kernel-vfs-iov_iter.m4 @@ -85,6 +85,34 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ bytes = copy_from_iter((void *)&buf, size, &iter); ]) + ZFS_LINUX_TEST_SRC([iov_iter_get_pages2], [ + #include + ], [ + struct iov_iter iter = { 0 }; + struct page **pages = NULL; + size_t maxsize = 4096; + unsigned maxpages = 1; + size_t start; + size_t ret __attribute__ ((unused)); + + ret = iov_iter_get_pages2(&iter, pages, maxsize, maxpages, + &start); + ]) + + ZFS_LINUX_TEST_SRC([iov_iter_get_pages], [ + #include + ], [ + struct iov_iter iter = { 0 }; + struct page **pages = NULL; + size_t maxsize = 4096; + unsigned maxpages = 1; + size_t start; + size_t ret __attribute__ ((unused)); + + ret = iov_iter_get_pages(&iter, pages, maxsize, maxpages, + &start); + ]) + ZFS_LINUX_TEST_SRC([iov_iter_type], [ #include #include @@ -184,6 +212,27 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ enable_vfs_iov_iter="no" ]) + dnl # + dnl # Kernel 6.0 changed iov_iter_get_pages() to iov_iter_page_pages2(). 
+ dnl # + AC_MSG_CHECKING([whether iov_iter_get_pages2() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_get_pages2], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_GET_PAGES2, 1, + [iov_iter_get_pages2() is available]) + ], [ + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether iov_iter_get_pages() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_get_pages], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_GET_PAGES, 1, + [iov_iter_get_pages() is available]) + ], [ + AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + ]) + dnl # dnl # This checks for iov_iter_type() in linux/uio.h. It is not dnl # required, however, and the module will compiled without it diff --git a/config/kernel.m4 b/config/kernel.m4 index 4d471358d2..83da863fce 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -79,6 +79,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_SHOW_OPTIONS ZFS_AC_KERNEL_SRC_FILE_INODE ZFS_AC_KERNEL_SRC_FILE_DENTRY + ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_FSYNC ZFS_AC_KERNEL_SRC_AIO_FSYNC ZFS_AC_KERNEL_SRC_EVICT_INODE @@ -111,6 +112,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_VFS_GETATTR ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_SRC_VFS_ITERATE + ZFS_AC_KERNEL_SRC_GET_USER_PAGES ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO ZFS_AC_KERNEL_SRC_VFS_READPAGES ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS @@ -234,6 +236,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_SHOW_OPTIONS ZFS_AC_KERNEL_FILE_INODE ZFS_AC_KERNEL_FILE_DENTRY + ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_FSYNC ZFS_AC_KERNEL_AIO_FSYNC ZFS_AC_KERNEL_EVICT_INODE @@ -266,6 +269,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_VFS_GETATTR ZFS_AC_KERNEL_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_VFS_ITERATE + ZFS_AC_KERNEL_GET_USER_PAGES ZFS_AC_KERNEL_VFS_DIRECT_IO ZFS_AC_KERNEL_VFS_READPAGES ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index df7be6fc13..01a660434f 100644 --- 
a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -100,6 +100,9 @@ #define spa_taskq_write_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, spa_taskq_write_param, "A" +#define param_set_direct_write_verify_pct_args(var) \ + CTLTYPE_UINT, NULL, 0, param_set_direct_write_verify_pct, "IU" + #define fletcher_4_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A" diff --git a/include/os/freebsd/spl/sys/mutex.h b/include/os/freebsd/spl/sys/mutex.h index 8cfe56c753..bbff9fe803 100644 --- a/include/os/freebsd/spl/sys/mutex.h +++ b/include/os/freebsd/spl/sys/mutex.h @@ -70,4 +70,5 @@ typedef enum { #define mutex_exit(lock) sx_xunlock(lock) #define mutex_owned(lock) sx_xlocked(lock) #define mutex_owner(lock) sx_xholder(lock) + #endif /* _OPENSOLARIS_SYS_MUTEX_H_ */ diff --git a/include/os/freebsd/spl/sys/param.h b/include/os/freebsd/spl/sys/param.h index 92724e332d..96440dce03 100644 --- a/include/os/freebsd/spl/sys/param.h +++ b/include/os/freebsd/spl/sys/param.h @@ -33,6 +33,7 @@ #include #include_next #define PAGESIZE PAGE_SIZE +#define PAGESHIFT PAGE_SHIFT #define ptob(x) ((uint64_t)(x) << PAGE_SHIFT) #ifdef _KERNEL #include diff --git a/include/os/freebsd/spl/sys/uio.h b/include/os/freebsd/spl/sys/uio.h index b9d41903ea..2bd5bdb80d 100644 --- a/include/os/freebsd/spl/sys/uio.h +++ b/include/os/freebsd/spl/sys/uio.h @@ -34,13 +34,30 @@ #include_next #include #include +#include + +/* + * uio_extflg: extended flags + */ +#define UIO_DIRECT 0x0001 /* Direct I/O requset */ typedef struct iovec iovec_t; typedef enum uio_seg zfs_uio_seg_t; typedef enum uio_rw zfs_uio_rw_t; +/* + * This structure is used when doing Direct I/O. 
+ */ +typedef struct { + vm_page_t *pages; + int npages; +} zfs_uio_dio_t; + typedef struct zfs_uio { struct uio *uio; + offset_t uio_soffset; + uint16_t uio_extflg; + zfs_uio_dio_t uio_dio; } zfs_uio_t; #define GET_UIO_STRUCT(u) (u)->uio @@ -52,6 +69,7 @@ typedef struct zfs_uio { #define zfs_uio_iovbase(u, idx) GET_UIO_STRUCT(u)->uio_iov[(idx)].iov_base #define zfs_uio_td(u) GET_UIO_STRUCT(u)->uio_td #define zfs_uio_rw(u) GET_UIO_STRUCT(u)->uio_rw +#define zfs_uio_soffset(u) (u)->uio_soffset #define zfs_uio_fault_disable(u, set) #define zfs_uio_prefaultpages(size, u) (0) @@ -61,6 +79,13 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) zfs_uio_offset(uio) = off; } +static inline void +zfs_uio_setsoffset(zfs_uio_t *uio, offset_t off) +{ + ASSERT3U(zfs_uio_offset(uio), ==, off); + zfs_uio_soffset(uio) = off; +} + static inline void zfs_uio_advance(zfs_uio_t *uio, ssize_t size) { @@ -71,7 +96,11 @@ zfs_uio_advance(zfs_uio_t *uio, ssize_t size) static __inline void zfs_uio_init(zfs_uio_t *uio, struct uio *uio_s) { - GET_UIO_STRUCT(uio) = uio_s; + memset(uio, 0, sizeof (zfs_uio_t)); + if (uio_s != NULL) { + GET_UIO_STRUCT(uio) = uio_s; + zfs_uio_soffset(uio) = uio_s->uio_offset; + } } int zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio); diff --git a/include/os/freebsd/zfs/sys/abd_os.h b/include/os/freebsd/zfs/sys/abd_os.h index 57122ee83e..c7895a5e43 100644 --- a/include/os/freebsd/zfs/sys/abd_os.h +++ b/include/os/freebsd/zfs/sys/abd_os.h @@ -26,10 +26,15 @@ #ifndef _ABD_OS_H #define _ABD_OS_H +#include +#include + #ifdef __cplusplus extern "C" { #endif +struct abd; + struct abd_scatter { uint_t abd_offset; void *abd_chunks[1]; /* actually variable-length */ @@ -37,8 +42,12 @@ struct abd_scatter { struct abd_linear { void *abd_buf; + struct sf_buf *sf; /* for LINEAR_PAGE FreeBSD */ }; +__attribute__((malloc)) +struct abd *abd_alloc_from_pages(vm_page_t *, unsigned long, uint64_t); + #ifdef __cplusplus } #endif diff --git 
a/include/os/linux/kernel/linux/kmap_compat.h b/include/os/linux/kernel/linux/kmap_compat.h index fb59c5f026..432c0e9913 100644 --- a/include/os/linux/kernel/linux/kmap_compat.h +++ b/include/os/linux/kernel/linux/kmap_compat.h @@ -38,6 +38,8 @@ #define zfs_kmap_local(page) kmap_atomic(page) #define zfs_kunmap_local(addr) kunmap_atomic(addr) #endif +#define zfs_kmap(page) kmap(page) +#define zfs_kunmap(page) kunmap(page) /* 5.0 API change - no more 'type' argument for access_ok() */ #ifdef HAVE_ACCESS_OK_TYPE @@ -46,4 +48,49 @@ #define zfs_access_ok(type, addr, size) access_ok(addr, size) #endif +/* + * read returning FOLL_WRITE is due to the fact that we are stating + * that the kernel will have write access to the user pages. So, when + * a Direct I/O read request is issued, the kernel must write to the user + * pages. + * + * get_user_pages_unlocked was not available to 4.0, so we also check + * for get_user_pages on older kernels. + */ +/* 4.9 API change - for and read flag is passed as gup flags */ +#if defined(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages_unlocked(addr, numpages, pages, read ? FOLL_WRITE : 0) + +/* 4.8 API change - no longer takes struct task_struct as arguement */ +#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages_unlocked(addr, numpages, read, 0, pages) + +/* 4.0-4.3, 4.5-4.7 API */ +#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages_unlocked(current, current->mm, addr, numpages, read, 0, \ + pages) + +/* 4.4 API */ +#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT_GUP_FLAGS) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages_unlocked(current, current->mm, addr, numpages, pages, \ + read ? 
FOLL_WRITE : 0) + +/* Using get_user_pages if kernel is < 4.0 */ +#elif defined(HAVE_GET_USER_PAGES_TASK_STRUCT) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages(current, current->mm, addr, numpages, read, 0, pages, \ + NULL) +#else +/* + * This case is unreachable. We must be able to use either + * get_user_pages_unlocked() or get_user_pages() to map user pages into + * the kernel. + */ +#error "Unknown Direct I/O interface" +#endif + #endif /* _ZFS_KMAP_H */ diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index 5e6ea8d3c2..1f0a7fa68d 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -33,6 +33,12 @@ #include #include #include +#include + +/* + * uio_extflg: extended flags + */ +#define UIO_DIRECT 0x0001 /* Direct I/O request */ #if defined(HAVE_VFS_IOV_ITER) && defined(HAVE_FAULT_IN_IOV_ITER_READABLE) #define iov_iter_fault_in_readable(a, b) fault_in_iov_iter_readable(a, b) @@ -54,6 +60,14 @@ typedef enum zfs_uio_seg { #endif } zfs_uio_seg_t; +/* + * This structures is used when doing Direct I/O. 
+ */ +typedef struct { + struct page **pages; /* Mapped pages */ + int npages; /* Number of mapped pages */ +} zfs_uio_dio_t; + typedef struct zfs_uio { union { const struct iovec *uio_iov; @@ -62,15 +76,16 @@ typedef struct zfs_uio { struct iov_iter *uio_iter; #endif }; - int uio_iovcnt; - offset_t uio_loffset; - zfs_uio_seg_t uio_segflg; + int uio_iovcnt; /* Number of iovecs */ + offset_t uio_soffset; /* Starting logical offset */ + offset_t uio_loffset; /* Current logical offset */ + zfs_uio_seg_t uio_segflg; /* Segment type */ boolean_t uio_fault_disable; - uint16_t uio_fmode; - uint16_t uio_extflg; - ssize_t uio_resid; - - size_t uio_skip; + uint16_t uio_fmode; /* Access mode (unused) */ + uint16_t uio_extflg; /* Extra flags (UIO_DIRECT) */ + ssize_t uio_resid; /* Residual unprocessed bytes */ + size_t uio_skip; /* Skipped bytes in current iovec */ + zfs_uio_dio_t uio_dio; /* Direct I/O user pages */ struct request *rq; } zfs_uio_t; @@ -83,6 +98,7 @@ typedef struct zfs_uio { #define zfs_uio_iovlen(u, idx) (u)->uio_iov[(idx)].iov_len #define zfs_uio_iovbase(u, idx) (u)->uio_iov[(idx)].iov_base #define zfs_uio_fault_disable(u, set) (u)->uio_fault_disable = set +#define zfs_uio_soffset(u) (u)->uio_soffset #define zfs_uio_rlimit_fsize(z, u) (0) #define zfs_uio_fault_move(p, n, rw, u) zfs_uiomove((p), (n), (rw), (u)) @@ -94,6 +110,13 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) uio->uio_loffset = off; } +static inline void +zfs_uio_setsoffset(zfs_uio_t *uio, offset_t off) +{ + ASSERT3U(zfs_uio_offset(uio), ==, off); + zfs_uio_soffset(uio) = off; +} + static inline void zfs_uio_advance(zfs_uio_t *uio, ssize_t size) { @@ -117,6 +140,8 @@ zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov, uio->uio_extflg = 0; uio->uio_resid = resid; uio->uio_skip = skip; + uio->uio_soffset = uio->uio_loffset; + memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t)); } static inline void @@ -146,6 +171,8 @@ zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq) 
} uio->rq = rq; + uio->uio_soffset = uio->uio_loffset; + memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t)); } #if defined(HAVE_VFS_IOV_ITER) @@ -162,8 +189,10 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, uio->uio_extflg = 0; uio->uio_resid = resid; uio->uio_skip = skip; + uio->uio_soffset = uio->uio_loffset; + memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t)); } -#endif +#endif /* HAVE_VFS_IOV_ITER */ #if defined(HAVE_ITER_IOV) #define zfs_uio_iter_iov(iter) iter_iov((iter)) diff --git a/include/os/linux/zfs/sys/abd_os.h b/include/os/linux/zfs/sys/abd_os.h index ce4f5a2bdf..606e8bf682 100644 --- a/include/os/linux/zfs/sys/abd_os.h +++ b/include/os/linux/zfs/sys/abd_os.h @@ -55,6 +55,9 @@ int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); +__attribute__((malloc)) +abd_t *abd_alloc_from_pages(struct page **, unsigned long, uint64_t); + #ifdef __cplusplus } #endif diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index 91a4751fff..c8eefe4fe5 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include diff --git a/include/sys/abd.h b/include/sys/abd.h index 567b88c0fc..bd3a7bd7c9 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -46,6 +46,7 @@ typedef enum abd_flags { ABD_FLAG_GANG = 1 << 6, /* mult ABDs chained together */ ABD_FLAG_GANG_FREE = 1 << 7, /* gang ABD is responsible for mem */ ABD_FLAG_ALLOCD = 1 << 8, /* we allocated the abd_t */ + ABD_FLAG_FROM_PAGES = 1 << 9, /* does not own pages */ } abd_flags_t; typedef struct abd { @@ -200,6 +201,12 @@ abd_get_size(abd_t *abd) return (abd->abd_size); } +static inline boolean_t +abd_is_from_pages(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_FROM_PAGES) ? 
B_TRUE : B_FALSE); +} + /* * Module lifecycle * Defined in each specific OS's abd_os.c diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index 1eb25d94ad..7b08798504 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -43,6 +43,9 @@ typedef enum abd_stats_op { /* forward declarations */ struct scatterlist; struct page; +#if defined(__FreeBSD__) && defined(_KERNEL) +struct sf_buf; +#endif struct abd_iter { /* public interface */ @@ -71,6 +74,9 @@ struct abd_iter { size_t iter_offset; /* offset in current sg/abd_buf, */ /* abd_offset included */ struct scatterlist *iter_sg; /* current sg */ +#if defined(__FreeBSD__) && defined(_KERNEL) + struct sf_buf *sf; /* used to map in vm_page_t FreeBSD */ +#endif }; extern abd_t *abd_zero_scatter; @@ -78,6 +84,7 @@ extern abd_t *abd_zero_scatter; abd_t *abd_gang_get_offset(abd_t *, size_t *); abd_t *abd_alloc_struct(size_t); void abd_free_struct(abd_t *); +void abd_init_struct(abd_t *); /* * OS specific functions @@ -108,9 +115,9 @@ void abd_iter_page(struct abd_iter *); #define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) #define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) -#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) -#define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf) -#define ABD_GANG(abd) (abd->abd_u.abd_gang) +#define ABD_SCATTER(abd) ((abd)->abd_u.abd_scatter) +#define ABD_LINEAR_BUF(abd) ((abd)->abd_u.abd_linear.abd_buf) +#define ABD_GANG(abd) ((abd)->abd_u.abd_gang) #ifdef __cplusplus } diff --git a/include/sys/arc.h b/include/sys/arc.h index c92b3eee61..883c07b4ff 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -120,7 +120,7 @@ typedef enum arc_flags /* * Private ARC flags. These flags are private ARC only flags that - * will show up in b_flags in the arc_hdr_buf_t. These flags should + * will show up in b_flags in the arc_buf_hdr_t. These flags should * only be set by ARC code. 
*/ ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ @@ -179,7 +179,6 @@ typedef enum arc_flags ARC_FLAG_COMPRESS_4 = 1 << 28, ARC_FLAG_COMPRESS_5 = 1 << 29, ARC_FLAG_COMPRESS_6 = 1 << 30 - } arc_flags_t; typedef enum arc_buf_flags { diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 8b03b1f895..5ce00bc025 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -61,17 +61,17 @@ extern "C" { /* * The simplified state transition diagram for dbufs looks like: * - * +--> READ --+ - * | | - * | V - * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) - * ^ | ^ ^ - * | | | | - * | +--> FILL --+ | - * | | | - * | | | - * | +------> NOFILL -----+ - * | | + * +-------> READ ------+ + * | | + * | V + * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) + * ^ | ^ ^ + * | | | | + * | +-------> FILL ------+ | + * | | | | + * | | | | + * | +------> NOFILL -----+-----> UNCACHED + * | | (Direct I/O) * +---------------+ * * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range @@ -336,6 +336,14 @@ typedef struct dmu_buf_impl { /* The buffer was partially read. More reads may follow. */ uint8_t db_partial_read; + + /* + * This block is being held under a writer rangelock of a Direct I/O + * write that is waiting for previous buffered writes to synced out + * due to mixed buffered and O_DIRECT operations. This is needed to + * check whether to grab the rangelock in zfs_get_data(). 
+ */ + uint8_t db_mixed_io_dio_wait; } dmu_buf_impl_t; #define DBUF_HASH_MUTEX(h, idx) \ @@ -393,6 +401,11 @@ dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx); boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dmu_buf_direct_mixed_io_wait(dmu_buf_impl_t *db, uint64_t txg, + boolean_t read); +void dmu_buf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +blkptr_t *dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db); +int dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, @@ -465,6 +478,32 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg) return (NULL); } +/* + * All Direct I/O writes happen in open context so the first dirty record will + * always be associated with the write. After a Direct I/O write completes the + * dirty records dr_overriden state will bet DR_OVERRIDDEN and the dr_data will + * get set to NULL. + */ +static inline dbuf_dirty_record_t * +dbuf_get_dirty_direct(dmu_buf_impl_t *db) +{ + return (list_head(&db->db_dirty_records)); +} + +static inline boolean_t +dbuf_dirty_is_direct_write(dmu_buf_impl_t *db, dbuf_dirty_record_t *dr) +{ + boolean_t ret = B_FALSE; + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (dr != NULL && db->db_level == 0 && !dr->dt.dl.dr_brtwrite && + dr->dt.dl.dr_override_state == DR_OVERRIDDEN && + dr->dt.dl.dr_data == NULL) { + ret = B_TRUE; + } + return (ret); +} + #define DBUF_GET_BUFC_TYPE(_db) \ (dbuf_is_metadata(_db) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) @@ -473,7 +512,7 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg) (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) -boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db); +boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *db_bp); #ifdef ZFS_DEBUG diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 928f5f2b4f..216d7d2885 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -525,6 +525,7 @@ void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, #define WP_NOFILL 0x1 #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 +#define WP_DIRECT_WR 0x8 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, struct zio_prop *zp); @@ -575,6 +576,8 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp); * * The object number must be a valid, allocated object number. */ +int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, + const void *tag, dmu_buf_t **dbp); int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **, int flags); int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, @@ -589,6 +592,7 @@ int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, dmu_buf_t ***dbpp, uint32_t flags); int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp); + /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. 
@@ -873,16 +877,20 @@ int dmu_free_long_object(objset_t *os, uint64_t object); #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ #define DMU_READ_NO_DECRYPT 2 /* don't decrypt */ +#define DMU_DIRECTIO 4 /* use Direct I/O */ + int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags); + void *buf, uint32_t flags); int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx); -void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); +int dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx); +int dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx, uint32_t flags); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - dmu_tx_t *tx); + dmu_tx_t *tx); #ifdef _KERNEL int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size); @@ -1080,6 +1088,7 @@ typedef struct zgd { struct blkptr *zgd_bp; dmu_buf_t *zgd_db; struct zfs_locked_range *zgd_lr; + boolean_t zgd_grabbed_rangelock; void *zgd_private; } zgd_t; diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index 83ae2b76ba..8317072f62 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -35,6 +35,10 @@ #include #include #include +#include +#include +#include +#include #ifdef __cplusplus extern "C" { @@ -134,7 +138,7 @@ extern "C" { * db_data_pending * db_dirtied * db_link - * db_dirty_node (??) 
+ * dbuf_dirty_records * db_dirtycnt * db_d.* * db.* @@ -150,8 +154,10 @@ extern "C" { * dbuf_find: none (db_holds) * dbuf_hash_insert: none (db_holds) * dmu_buf_read_array_impl: none (db_state, db_changed) - * dmu_sync: none (db_dirty_node, db_d) + * dmu_sync: none (db_dirty_records, db_d) * dnode_reallocate: none (db) + * dmu_write_direct: none (db_dirty_records, db_d) + * dmu_write_direct_done: none (db_dirty_records, db_d) * * dn_mtx (leaf) * protects: @@ -234,8 +240,9 @@ extern "C" { * dnode_new_blkid */ -struct objset; struct dmu_pool; +struct dmu_buf; +struct zgd; typedef struct dmu_sendstatus { list_node_t dss_link; @@ -245,9 +252,30 @@ typedef struct dmu_sendstatus { uint64_t dss_blocks; /* blocks visited during the sending process */ } dmu_sendstatus_t; +/* + * dmu_sync_{ready/done} args + */ +typedef struct { + dbuf_dirty_record_t *dsa_dr; + void (*dsa_done)(struct zgd *, int); + struct zgd *dsa_zgd; + dmu_tx_t *dsa_tx; +} dmu_sync_arg_t; + +void dmu_sync_done(zio_t *, arc_buf_t *buf, void *varg); +void dmu_sync_ready(zio_t *, arc_buf_t *buf, void *varg); + void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); +int dmu_write_direct(zio_t *, dmu_buf_impl_t *, abd_t *, dmu_tx_t *); +int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t flags); +int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t, dmu_tx_t *); +#if defined(_KERNEL) +int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t); +int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_tx_t *); +#endif + #ifdef __cplusplus } #endif diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index a9123e862a..587dac738b 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -134,6 +134,7 @@ struct objset { zfs_cache_type_t os_secondary_cache; zfs_prefetch_type_t os_prefetch; zfs_sync_type_t os_sync; + zfs_direct_t os_direct; zfs_redundant_metadata_type_t 
os_redundant_metadata; uint64_t os_recordsize; /* diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index c746600cd2..55b150c044 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -42,6 +42,7 @@ extern "C" { #define FM_EREPORT_ZFS_DATA "data" #define FM_EREPORT_ZFS_DELAY "delay" #define FM_EREPORT_ZFS_DEADMAN "deadman" +#define FM_EREPORT_ZFS_DIO_VERIFY "dio_verify" #define FM_EREPORT_ZFS_POOL "zpool" #define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" #define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" @@ -84,6 +85,7 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T "vdev_io_t" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N "vdev_slow_io_n" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T "vdev_slow_io_t" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS "dio_verify_errors" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index fc4f22cd53..3852fa0317 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -193,6 +193,7 @@ typedef enum { ZFS_PROP_SNAPSHOTS_CHANGED, ZFS_PROP_PREFETCH, ZFS_PROP_VOLTHREADING, + ZFS_PROP_DIRECT, ZFS_NUM_PROPS } zfs_prop_t; @@ -533,6 +534,12 @@ typedef enum { ZFS_VOLMODE_NONE = 3 } zfs_volmode_t; +typedef enum { + ZFS_DIRECT_DISABLED = 0, + ZFS_DIRECT_STANDARD, + ZFS_DIRECT_ALWAYS +} zfs_direct_t; + typedef enum zfs_keystatus { ZFS_KEYSTATUS_NONE = 0, ZFS_KEYSTATUS_UNAVAILABLE, @@ -790,6 +797,9 @@ typedef struct zpool_load_policy { /* Number of slow IOs */ #define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios" +/* Number of Direct I/O write verify errors */ +#define ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS "vdev_dio_verify_errors" + /* vdev enclosure sysfs path */ #define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path" @@ -1262,6 +1272,7 @@ typedef struct vdev_stat { uint64_t vs_physical_ashift; /* 
vdev_physical_ashift */ uint64_t vs_noalloc; /* allocations halted? */ uint64_t vs_pspace; /* physical capacity */ + uint64_t vs_dio_verify_errors; /* DIO write verify errors */ } vdev_stat_t; #define VDEV_STAT_VALID(field, uint64_t_field_count) \ diff --git a/include/sys/spa.h b/include/sys/spa.h index 93f381affd..fb5ca6ec57 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -949,6 +949,14 @@ typedef struct spa_iostats { kstat_named_t simple_trim_bytes_skipped; kstat_named_t simple_trim_extents_failed; kstat_named_t simple_trim_bytes_failed; + kstat_named_t arc_read_count; + kstat_named_t arc_read_bytes; + kstat_named_t arc_write_count; + kstat_named_t arc_write_bytes; + kstat_named_t direct_read_count; + kstat_named_t direct_read_bytes; + kstat_named_t direct_write_count; + kstat_named_t direct_write_bytes; } spa_iostats_t; extern void spa_stats_init(spa_t *spa); @@ -972,6 +980,10 @@ extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type, uint64_t extents_written, uint64_t bytes_written, uint64_t extents_skipped, uint64_t bytes_skipped, uint64_t extents_failed, uint64_t bytes_failed); +extern void spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, + uint32_t flags); +extern void spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, + uint32_t flags); extern void spa_import_progress_add(spa_t *spa); extern void spa_import_progress_remove(uint64_t spa_guid); extern int spa_import_progress_set_mmp_check(uint64_t pool_guid, diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h index aa34edda5f..9911645ad2 100644 --- a/include/sys/uio_impl.h +++ b/include/sys/uio_impl.h @@ -40,10 +40,47 @@ #define _SYS_UIO_IMPL_H #include +#include extern int zfs_uiomove(void *, size_t, zfs_uio_rw_t, zfs_uio_t *); extern int zfs_uiocopy(void *, size_t, zfs_uio_rw_t, zfs_uio_t *, size_t *); extern void zfs_uioskip(zfs_uio_t *, size_t); +extern void zfs_uio_free_dio_pages(zfs_uio_t *, zfs_uio_rw_t); +extern int 
zfs_uio_get_dio_pages_alloc(zfs_uio_t *, zfs_uio_rw_t); +extern boolean_t zfs_uio_page_aligned(zfs_uio_t *); + +static inline boolean_t +zfs_dio_page_aligned(void *buf) +{ + return ((((unsigned long)(buf) & (PAGESIZE - 1)) == 0) ? + B_TRUE : B_FALSE); +} + +static inline boolean_t +zfs_dio_offset_aligned(uint64_t offset, uint64_t blksz) +{ + return (IS_P2ALIGNED(offset, blksz)); +} + +static inline boolean_t +zfs_dio_size_aligned(uint64_t size, uint64_t blksz) +{ + return (IS_P2ALIGNED(size, blksz)); +} + +static inline boolean_t +zfs_dio_aligned(uint64_t offset, uint64_t size, uint64_t blksz) +{ + return (zfs_dio_offset_aligned(offset, blksz) && + zfs_dio_size_aligned(size, blksz)); +} + +static inline boolean_t +zfs_uio_aligned(zfs_uio_t *uio, uint64_t blksz) +{ + return (zfs_dio_aligned(zfs_uio_offset(uio), zfs_uio_resid(uio), + blksz)); +} static inline void zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 57ff31e89e..08ed38d5c2 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -448,9 +448,14 @@ struct vdev { /* * We rate limit ZIO delay, deadman, and checksum events, since they * can flood ZED with tons of events when a drive is acting up. + * + * We also rate limit Direct I/O write verify errors, since a user might + * be continually manipulating a buffer that can flood ZED with tons of + * events. 
*/ zfs_ratelimit_t vdev_delay_rl; zfs_ratelimit_t vdev_deadman_rl; + zfs_ratelimit_t vdev_dio_verify_rl; zfs_ratelimit_t vdev_checksum_rl; /* @@ -649,6 +654,12 @@ extern uint_t zfs_vdev_max_auto_ashift; int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS); int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS); +/* + * VDEV checksum verification percentage for Direct I/O writes + */ +extern uint_t zfs_vdev_direct_write_verify_pct; +int param_set_direct_write_verify_pct(ZFS_MODULE_PARAM_ARGS); + #ifdef __cplusplus } #endif diff --git a/include/sys/zfs_racct.h b/include/sys/zfs_racct.h index 0e8bd04c1a..ff84cccb09 100644 --- a/include/sys/zfs_racct.h +++ b/include/sys/zfs_racct.h @@ -26,12 +26,13 @@ #ifndef _SYS_ZFS_RACCT_H #define _SYS_ZFS_RACCT_H -#include +#include +#include /* * Platform-dependent resource accounting hooks */ -void zfs_racct_read(uint64_t size, uint64_t iops); -void zfs_racct_write(uint64_t size, uint64_t iops); +void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags); +void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags); #endif /* _SYS_ZFS_RACCT_H */ diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index e60b99bed1..8de71448e4 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -29,6 +29,12 @@ extern int zfs_bclone_enabled; +typedef enum zfs_direct_enabled { + ZFS_DIRECT_IO_ERR, + ZFS_DIRECT_IO_DISABLED, + ZFS_DIRECT_IO_ENABLED +} zfs_direct_enabled_t; + extern int zfs_fsync(znode_t *, int, cred_t *); extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *); extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *); @@ -46,6 +52,9 @@ extern int mappedread(znode_t *, int, zfs_uio_t *); extern int mappedread_sf(znode_t *, int, zfs_uio_t *); extern void update_pages(znode_t *, int64_t, int, objset_t *); +extern zfs_direct_enabled_t zfs_check_direct_enabled(znode_t *, int, int *); +extern int zfs_setup_direct(znode_t *, zfs_uio_t *, zfs_uio_rw_t, int *); + /* * 
Platform code that asynchronously drops zp's inode / vnode_t. * diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index d71144807f..c852c4758a 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -308,7 +308,7 @@ extern void zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, const char *dname, znode_t *szp, znode_t *wzp); extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t len, boolean_t commit, - zil_callback_t callback, void *callback_data); + boolean_t o_direct, zil_callback_t callback, void *callback_data); extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, uint64_t off, uint64_t len); extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, diff --git a/include/sys/zio.h b/include/sys/zio.h index 446b64ccd8..e6a881e0fb 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -222,6 +222,7 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_NOPWRITE (1ULL << 28) #define ZIO_FLAG_REEXECUTED (1ULL << 29) #define ZIO_FLAG_DELEGATED (1ULL << 30) +#define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 31) #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) @@ -352,6 +353,7 @@ typedef struct zio_prop { boolean_t zp_brtwrite; boolean_t zp_encrypt; boolean_t zp_byteorder; + boolean_t zp_direct_write; uint8_t zp_salt[ZIO_DATA_SALT_LEN]; uint8_t zp_iv[ZIO_DATA_IV_LEN]; uint8_t zp_mac[ZIO_DATA_MAC_LEN]; diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index 2b026d4867..d6549d2ace 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -157,8 +157,9 @@ enum zio_stage { ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--XT */ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R----- */ + ZIO_STAGE_DIO_CHECKSUM_VERIFY = 1 << 25, /* -W---- */ - ZIO_STAGE_DONE = 1 << 25 /* RWFCXT */ + ZIO_STAGE_DONE = 1 << 26 /* RWFCXT */ }; #define ZIO_ROOT_PIPELINE \ @@ -224,6 +225,10 @@ enum zio_stage { 
ZIO_STAGE_DVA_THROTTLE | \ ZIO_STAGE_DVA_ALLOCATE) +#define ZIO_DIRECT_WRITE_PIPELINE \ + ZIO_WRITE_PIPELINE & \ + (~ZIO_STAGE_ISSUE_ASYNC) + #define ZIO_DDT_CHILD_WRITE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ ZIO_VDEV_IO_STAGES | \ diff --git a/lib/libspl/include/sys/uio.h b/lib/libspl/include/sys/uio.h index 665bfc4230..b107333d6f 100644 --- a/lib/libspl/include/sys/uio.h +++ b/lib/libspl/include/sys/uio.h @@ -82,6 +82,32 @@ typedef struct zfs_uio { #define zfs_uio_iovlen(uio, idx) (uio)->uio_iov[(idx)].iov_len #define zfs_uio_iovbase(uio, idx) (uio)->uio_iov[(idx)].iov_base +static inline boolean_t +zfs_dio_page_aligned(void *buf) +{ + return ((((unsigned long)(buf) & (PAGESIZE - 1)) == 0) ? + B_TRUE : B_FALSE); +} + +static inline boolean_t +zfs_dio_offset_aligned(uint64_t offset, uint64_t blksz) +{ + return (IS_P2ALIGNED(offset, blksz)); +} + +static inline boolean_t +zfs_dio_size_aligned(uint64_t size, uint64_t blksz) +{ + return (IS_P2ALIGNED(size, blksz)); +} + +static inline boolean_t +zfs_dio_aligned(uint64_t offset, uint64_t size, uint64_t blksz) +{ + return (zfs_dio_offset_aligned(offset, blksz) && + zfs_dio_size_aligned(size, blksz)); +} + static inline void zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len) { diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 88dd8b3c67..3eac5f504a 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -2,13 +2,17 @@ + + - + + + @@ -626,61 +630,192 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - + + + + - - - - - + + + + + + + + + + + + + + + + + + + + + + + - - - - - - + + + + + - - - - - - - - - + + + - - - + + + + + + + + + + + + + + + + - - - - + + + + + + + + + @@ -725,35 +860,11 @@ - + - - - - - - - - - - - - - - - - - - - - - - - - @@ -765,11 +876,6 @@ - - - - - @@ -780,41 +886,40 @@ - - 
- - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + @@ -867,6 +972,11 @@ + + + + + @@ -887,6 +997,11 @@ + + + + + @@ -1089,6 +1204,11 @@ + + + + + @@ -1112,20 +1232,16 @@ + + + - - - - - - - - - + + @@ -1133,10 +1249,6 @@ - - - - @@ -1247,48 +1359,266 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1307,48 +1637,27 @@ - - - + + + + - + + + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - + @@ -1363,10 +1672,10 @@ - + - + @@ -1384,11 +1693,30 @@ - + + + + + + + + + - + + + + + + + + + + + + @@ -1410,185 +1738,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1642,12 +1791,61 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1664,7 +1862,6 @@ - @@ -1672,28 +1869,16 @@ - - - - - - - - - - - - - + @@ -1770,122 +1955,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1904,23 +1973,70 @@ - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1969,61 +2085,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -2159,656 +2220,27 @@ + - - - - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -2839,47 +2271,16 @@ - - - - - - - + + + - - - - - - - - - - - - - - - - + + + + - - - - - - - - - - - - - - - - + @@ -2887,713 +2288,9 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -3638,28 +2335,8 @@ - - - - - - - - - - - - - - - - - - - - @@ -3697,12 +2374,113 @@ - + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + @@ -3720,36 +2498,22 @@ + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - @@ -3764,672 +2528,19 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - + + + - + - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + @@ -4443,10 +2554,28 @@ + + + + + + + + + + + + + + + + + + @@ -4483,6 +2612,17 @@ + + + + + + + + + + + @@ -4524,6 +2664,22 @@ + + + + + + + + + + + + + + + + @@ -4561,6 +2717,10 @@ + + + + @@ -4573,6 +2733,12 @@ + + + + + + @@ -4752,90 +2918,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -4845,125 +2927,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -4972,32 +2935,8 @@ - - - - - - - - - - - - - - - - - - - - - - - - @@ -5021,106 +2960,12 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -5147,39 +2992,19 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + @@ -5189,12 +3014,28 @@ + + + + + + + + + + + + + + + + @@ -5233,6 +3074,13 @@ + + + + + + + @@ -5240,68 +3088,22 @@ + + + + + + + + + + + + + + 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -5314,251 +3116,31 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + - - - + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + @@ -5568,15 +3150,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -5610,7 +3219,6 @@ - @@ -5638,43 +3246,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -5684,6 +3255,51 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -5743,21 +3359,6 @@ - - - - - - - - - - - - - - - @@ -5811,15 +3412,6 @@ - - - - - - - - - @@ -5837,6 +3429,7 @@ +<<<<<<< HEAD @@ -5945,6 +3538,8 @@ +======= +>>>>>>> ccf1a36dc (Adding Direct IO Support) @@ -5965,6 +3560,7 @@ +<<<<<<< HEAD @@ -6392,10 +3988,22 @@ +======= + + + + +>>>>>>> ccf1a36dc (Adding Direct IO Support) + + + + + + @@ -6409,6 +4017,15 @@ + + + + + + + + + @@ -6436,6 +4053,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + @@ -6821,39 +4463,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -6957,786 +4566,10 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -7791,22 +4624,66 @@ - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -7815,24 +4692,10 @@ - - - - - - - - - - - - - - - + @@ -7901,106 +4764,10 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +<<<<<<< HEAD @@ -8223,6 +4990,8 @@ +======= +>>>>>>> ccf1a36dc (Adding Direct IO Support) @@ -8235,6 +5004,12 @@ + + + + + + @@ -8267,6 +5042,10 @@ + + + + @@ -8274,6 +5053,18 @@ + + + + + + + + + + + + @@ -8315,6 +5106,12 @@ + + + + + + @@ -8342,6 +5139,19 @@ +<<<<<<< HEAD +======= + + + + + + + + + + +>>>>>>> ccf1a36dc (Adding Direct IO Support) @@ -8383,25 +5193,14 @@ + + + + + - - - - - - - - - - - - - - - - @@ -8421,205 +5220,274 @@ - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + - - - + + + - - - - - + + + + + + - - + + + + + + + + + + + + + - + - - + + - - + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -8628,37 +5496,19 @@ - - - - - - - - + + + + - - - + + + - - - - - - - - - + + - - - - - - - @@ -8671,129 +5521,29 @@ + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + @@ -8806,6 +5556,11 @@ + + + + + @@ -8816,49 +5571,55 @@ - - - - - - - - - - - + + - - - - - - - - - + + + + + + + +<<<<<<< HEAD +======= + + + + + +>>>>>>> ccf1a36dc (Adding Direct IO Support) + + + + + + + + + + + + - - - @@ -8916,88 +5677,19 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + @@ -9010,19 +5702,8 @@ - - - - - - - - - - - @@ -9031,15 +5712,6 @@ - - - - - - - - - @@ -9051,6 +5723,12 @@ + + + + + + @@ -9063,6 +5741,12 @@ + + + + + + @@ -9106,18 +5790,20 @@ - - - - - - - + + + + + + + + + @@ -9129,6 +5815,19 @@ + + + + + + + + + + + + + @@ -9139,6 +5838,18 @@ + + + + + + + + + + + + @@ -9146,6 +5857,16 @@ + + + + + + + + + + @@ -9166,10 +5887,30 @@ + + + 
+ + + + + + + + + + + + + + + + + @@ -9184,6 +5925,54 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -9226,24 +6015,6 @@ - - - - - - - - - - - - - - - - - - @@ -9256,36 +6027,27 @@ - - - + - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + @@ -9297,23 +6059,32 @@ + + + - - - - - - + + + + + + + + + + + + + + + + + - - - - - @@ -9323,6 +6094,11 @@ + + + + + @@ -9337,9 +6113,6 @@ - - - @@ -9405,16 +6178,6 @@ - - - - - - - - - - @@ -9432,6 +6195,9 @@ + + + @@ -9447,6 +6213,12 @@ + + + + + + @@ -9527,21 +6299,12 @@ - - + - - - - - - - - @@ -9583,6 +6346,12 @@ + + + + + + @@ -9590,6 +6359,24 @@ + + + + + + + + + + + + + + + + + + @@ -9675,6 +6462,24 @@ + + + + + + + + + + + + + + + + + + @@ -9686,6 +6491,22 @@ + + + + + + + + + + + + + + + + @@ -9704,100 +6525,124 @@ + + + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + + - - + - - - - - - - - - - - - - - - - + + + - - - - - - - - - - + + + - - - - - - - - - - - + + + - - - - - - - - - - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + @@ -9805,15 +6650,64 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -9832,12 +6726,60 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -9858,12 +6800,53 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -9883,8 +6866,122 @@ - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 81949bf9e5..d9552613cb 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -86,6 
+86,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/ddt_zap.c \ module/zfs/dmu.c \ module/zfs/dmu_diff.c \ + module/zfs/dmu_direct.c \ module/zfs/dmu_object.c \ module/zfs/dmu_objset.c \ module/zfs/dmu_recv.c \ diff --git a/lib/libzpool/abd_os.c b/lib/libzpool/abd_os.c index 5a91605b2f..8531b8f40a 100644 --- a/lib/libzpool/abd_os.c +++ b/lib/libzpool/abd_os.c @@ -363,3 +363,67 @@ void abd_cache_reap_now(void) { } + +/* + * Borrow a raw buffer from an ABD without copying the contents of the ABD + * into the buffer. If the ABD is scattered, this will alloate a raw buffer + * whose contents are undefined. To copy over the existing data in the ABD, use + * abd_borrow_buf_copy() instead. + */ +void * +abd_borrow_buf(abd_t *abd, size_t n) +{ + void *buf; + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, 0); + if (abd_is_linear(abd)) { + buf = abd_to_buf(abd); + } else { + buf = zio_buf_alloc(n); + } +#ifdef ZFS_DEBUG + (void) zfs_refcount_add_many(&abd->abd_children, n, buf); +#endif + return (buf); +} + +void * +abd_borrow_buf_copy(abd_t *abd, size_t n) +{ + void *buf = abd_borrow_buf(abd, n); + if (!abd_is_linear(abd)) { + abd_copy_to_buf(buf, abd, n); + } + return (buf); +} + +/* + * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will + * no change the contents of the ABD and will ASSERT that you didn't modify + * the buffer since it was borrowed. If you want any changes you made to buf to + * be copied back to abd, use abd_return_buf_copy() instead. 
+ */ +void +abd_return_buf(abd_t *abd, void *buf, size_t n) +{ + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); +#ifdef ZFS_DEBUG + (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); +#endif + if (abd_is_linear(abd)) { + ASSERT3P(buf, ==, abd_to_buf(abd)); + } else { + ASSERT0(abd_cmp_buf(abd, buf, n)); + zio_buf_free(buf, n); + } +} + +void +abd_return_buf_copy(abd_t *abd, void *buf, size_t n) +{ + if (!abd_is_linear(abd)) { + abd_copy_from_buf(abd, buf, n); + } + abd_return_buf(abd, buf, n); +} diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 20bb95c1ae..ab0ab5e716 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -416,6 +416,28 @@ May be increased up to .Sy ASHIFT_MAX Po 16 Pc , but this may negatively impact pool space efficiency. . +.It Sy zfs_vdev_direct_write_verify_pct Ns = Ns Sy Linux 2 | FreeBSD 0 Pq uint +If non-zero, then a Direct I/O write's checksum will be verified every +percentage (pct) of Direct I/O writes that are issued to a top-level VDEV +before it is committed and the block pointer is updated. +In the event the checksum is not valid then the I/O operation will be +redirected through the ARC. +This module parameter can be used to detect if the +contents of the users buffer have changed in the process of doing a Direct I/O +write. +It can also help to identify if reported checksum errors are tied to Direct I/O +writes. +Each verify error causes a +.Sy dio_verify +zevent. +Direct Write I/O checkum verify errors can be seen with +.Nm zpool Cm status Fl d . +The default value for this is 2 percent on Linux, but is 0 for +.Fx +because user pages can be placed under write protection in +.Fx +before the Direct I/O write is issued. +. .It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq uint Minimum ashift used when creating new top-level vdevs. . @@ -1093,6 +1115,9 @@ This will smoothly handle between ten times and a tenth of this number. 
.Pp .Sy zfs_delay_scale No \(mu Sy zfs_dirty_data_max Em must No be smaller than Sy 2^64 . . +.It Sy zfs_dio_write_verify_events_per_second Ns = Ns Sy 20 Ns /s Pq uint +Rate limit Direct I/O write verify events to this many per second. +. .It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int Disables requirement for IVset GUIDs to be present and match when doing a raw receive of encrypted datasets. diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7 index f7026119b7..cf3acf3622 100644 --- a/man/man7/zfsprops.7 +++ b/man/man7/zfsprops.7 @@ -1039,6 +1039,44 @@ See the section of .Xr zfsconcepts 7 . .It Xo +.Sy direct Ns = Ns Sy disabled Ns | Ns Sy standard Ns | Ns Sy always +.Xc +Controls the behavior of Direct I/O requests +.Pq e.g. Dv O_DIRECT . +The +.Sy standard +behavior for Direct I/O requests is to bypass the ARC when possible. +These requests will not be cached and performance will be limited by the +raw speed of the underlying disks +.Pq Dv this is the default . +.Sy always +causes every properly aligned read or write to be treated as a direct request. +.Sy disabled +causes the O_DIRECT flag to be silently ignored and all direct requests will +be handled by the ARC. +This is the default behavior for OpenZFS 2.1 and prior releases. +.Pp +Bypassing the ARC requires that a direct request be correctly aligned. +For write requests the starting offset and size of the request must be +.Sy recordsize Ns +-aligned, if not then the unaligned portion of the request will be silently +redirected through the ARC. +For read requests there is no +.Sy recordsize +alignment restriction on either the starting offset or size. +All direct requests must use a page-aligned memory buffer and the request +size must be a multiple of the page size or an error is returned. +.Pp +Concurrently mixing buffered and direct requests to overlapping regions of +a file can decrease performance. +However, the resulting file will always be coherent. 
+For example, a direct read after a buffered write will return the data +from the buffered write. +Furthermore, if an application uses +.Xr mmap 2 +based file access then in order to maintain coherency all direct requests +are converted to buffered requests while the file is mapped. +.It Xo .Sy dnodesize Ns = Ns Sy legacy Ns | Ns Sy auto Ns | Ns Sy 1k Ns | Ns .Sy 2k Ns | Ns Sy 4k Ns | Ns Sy 8k Ns | Ns Sy 16k .Xc diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index ef20ef4e00..77d44bd8ad 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -98,6 +98,17 @@ This can be an indicator of problems with the underlying storage device. The number of delay events is ratelimited by the .Sy zfs_slow_io_events_per_second module parameter. +.It Sy dio_verify +Issued when there was a checksum verify error after a Direct I/O write has been +issued and is redirected through the ARC. +This event can only take place if the module parameter +.Sy zfs_vdev_direct_write_verify_pct +is not set to zero. +See +.Xr zfs 4 +for more details on the +.Sy zfs_vdev_direct_write_verify_pct +module paramter. .It Sy config Issued every time a vdev change have been done to the pool. .It Sy zpool @@ -408,8 +419,9 @@ ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--XT ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--XT ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R----- +ZIO_STAGE_DIO_CHECKSUM_VERIFY:0x02000000:-W---- -ZIO_STAGE_DONE:0x02000000:RWFCXT +ZIO_STAGE_DONE:0x04000000:RWFCXT .TE . .Sh I/O FLAGS diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index b40faeb997..923b99de30 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm zpool .Cm status -.Op Fl DegiLpPstvx +.Op Fl dDegiLpPstvx .Op Fl T Sy u Ns | Ns Sy d .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … .Oo Ar pool Oc Ns … @@ -81,6 +81,15 @@ to display vdevs in flat hierarchy instead of nested vdev objects. 
Specify .Sy --json-pool-key-guid to set pool GUID as key for pool objects instead of pool names. +.It Fl d +Display the number of Direct I/O write checksum verify errors that have occured +on a top-level VDEV. +See +.Sx zfs_vdev_direct_write_verify_pct +in +.Xr zfs 4 +for details about the conditions that can cause Direct I/O write checksum +verify failures to occur. .It Fl D Display a histogram of deduplication statistics, showing the allocated .Pq physically present on disk diff --git a/module/Kbuild.in b/module/Kbuild.in index a119198dbf..4f266f62d6 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -326,6 +326,7 @@ ZFS_OBJS := \ ddt_stats.o \ ddt_zap.o \ dmu.o \ + dmu_direct.o \ dmu_diff.o \ dmu_object.o \ dmu_objset.o \ @@ -445,6 +446,7 @@ ZFS_OBJS_OS := \ vdev_disk.o \ vdev_file.o \ vdev_label_os.o \ + vdev_os.o \ zfs_acl.o \ zfs_ctldir.o \ zfs_debug.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 534f325713..20a6201805 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -256,6 +256,7 @@ SRCS+= abd.c \ ddt_stats.c \ ddt_zap.c \ dmu.c \ + dmu_direct.c \ dmu_diff.c \ dmu_object.c \ dmu_objset.c \ diff --git a/module/os/freebsd/spl/spl_uio.c b/module/os/freebsd/spl/spl_uio.c index 17886cbeb5..f47952db22 100644 --- a/module/os/freebsd/spl/spl_uio.c +++ b/module/os/freebsd/spl/spl_uio.c @@ -44,6 +44,10 @@ #include #include #include +#include +#include +#include +#include static void zfs_freeuio(struct uio *uio) @@ -115,3 +119,202 @@ zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio) ASSERT3U(zfs_uio_rw(uio), ==, dir); return (vn_io_fault_uiomove(p, n, GET_UIO_STRUCT(uio))); } + +/* + * Check if the uio is page-aligned in memory. 
+ */ +boolean_t +zfs_uio_page_aligned(zfs_uio_t *uio) +{ + const struct iovec *iov = GET_UIO_STRUCT(uio)->uio_iov; + + for (int i = zfs_uio_iovcnt(uio); i > 0; iov++, i--) { + unsigned long addr = (unsigned long)iov->iov_base; + size_t size = iov->iov_len; + if ((addr & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void +zfs_uio_set_pages_to_stable(zfs_uio_t *uio) +{ + ASSERT3P(uio->uio_dio.pages, !=, NULL); + ASSERT3U(uio->uio_dio.npages, >, 0); + + for (int i = 0; i < uio->uio_dio.npages; i++) { + vm_page_t page = uio->uio_dio.pages[i]; + ASSERT3P(page, !=, NULL); + + MPASS(page == PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(page))); + vm_page_busy_acquire(page, VM_ALLOC_SBUSY); + pmap_remove_write(page); + } +} + +static void +zfs_uio_release_stable_pages(zfs_uio_t *uio) +{ + ASSERT3P(uio->uio_dio.pages, !=, NULL); + for (int i = 0; i < uio->uio_dio.npages; i++) { + vm_page_t page = uio->uio_dio.pages[i]; + + ASSERT3P(page, !=, NULL); + vm_page_sunbusy(page); + } +} + +/* + * If the operation is marked as read, then we are stating the pages will be + * written to and must be given write access. + */ +static int +zfs_uio_hold_pages(unsigned long start, size_t len, unsigned long nr_pages, + zfs_uio_rw_t rw, vm_page_t *pages) +{ + vm_map_t map; + vm_prot_t prot; + int count; + + map = &curthread->td_proc->p_vmspace->vm_map; + ASSERT3S(len, >, 0); + + prot = rw == UIO_READ ? 
(VM_PROT_READ | VM_PROT_WRITE) : VM_PROT_READ; + count = vm_fault_quick_hold_pages(map, start, len, prot, pages, + nr_pages); + + return (count); +} + +void +zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + ASSERT(uio->uio_extflg & UIO_DIRECT); + ASSERT3P(uio->uio_dio.pages, !=, NULL); + ASSERT(zfs_uio_rw(uio) == rw); + + if (rw == UIO_WRITE) + zfs_uio_release_stable_pages(uio); + + vm_page_unhold_pages(&uio->uio_dio.pages[0], + uio->uio_dio.npages); + + kmem_free(uio->uio_dio.pages, + uio->uio_dio.npages * sizeof (vm_page_t)); +} + +static long +zfs_uio_get_user_pages(unsigned long start, unsigned long nr_pages, + size_t len, zfs_uio_rw_t rw, vm_page_t *pages) +{ + int count; + + count = zfs_uio_hold_pages(start, len, nr_pages, rw, pages); + + if (count != nr_pages) { + if (count > 0) + vm_page_unhold_pages(pages, count); + return (count); + } + + ASSERT3U(count, ==, nr_pages); + + return (count); +} + +static size_t +zfs_uio_iov_step(struct iovec v, zfs_uio_t *uio, int *numpages) +{ + unsigned long addr = (unsigned long)(v.iov_base); + size_t len = v.iov_len; + int n = DIV_ROUND_UP(len, PAGE_SIZE); + + int res = zfs_uio_get_user_pages( + P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n, len, + zfs_uio_rw(uio), &uio->uio_dio.pages[uio->uio_dio.npages]); + if (res != n) { + *numpages = -1; + return (SET_ERROR(EFAULT)); + } + + ASSERT3S(len, ==, res * PAGE_SIZE); + *numpages = res; + return (len); +} + +static int +zfs_uio_get_dio_pages_impl(zfs_uio_t *uio) +{ + const struct iovec *iovp = GET_UIO_STRUCT(uio)->uio_iov; + size_t wanted; + size_t maxsize = zfs_uio_resid(uio); + + wanted = maxsize; + + for (int i = 0; i < zfs_uio_iovcnt(uio); i++) { + struct iovec iov; + int numpages = 0; + + if (iovp->iov_len == 0) { + iovp++; + continue; + } + iov.iov_len = MIN(maxsize, iovp->iov_len); + iov.iov_base = iovp->iov_base; + size_t left = zfs_uio_iov_step(iov, uio, &numpages); + + if (numpages == -1) + return (left); + + ASSERT3U(left, ==, iov.iov_len); + 
uio->uio_dio.npages += numpages; + maxsize -= iov.iov_len; + wanted -= left; + iovp++; + } + + ASSERT0(wanted); + + return (0); +} + +/* + * This function maps user pages into the kernel. In the event that the user + * pages were not mapped successfully an error value is reutrned. + * + * On success, 0 is returned. + */ +int +zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + int error = 0; + size_t npages = DIV_ROUND_UP(zfs_uio_resid(uio), PAGE_SIZE); + size_t size = npages * sizeof (vm_page_t); + + ASSERT(zfs_uio_rw(uio) == rw); + + uio->uio_dio.pages = kmem_alloc(size, KM_SLEEP); + + error = zfs_uio_get_dio_pages_impl(uio); + + if (error) { + kmem_free(uio->uio_dio.pages, size); + return (error); + } + + /* + * Since we will be writing the user pages we must make sure that + * they are stable. That way the contents of the pages can not change + * while we are doing: compression, checksumming, encryption, parity + * calculations or deduplication. + */ + if (zfs_uio_rw(uio) == UIO_WRITE) + zfs_uio_set_pages_to_stable(uio); + + uio->uio_extflg |= UIO_DIRECT; + + return (0); +} diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index f24ea3dc76..c7a1859f90 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -32,6 +32,7 @@ #include #include #include +#include typedef struct abd_stats { kstat_named_t abdstat_struct_size; @@ -135,9 +136,17 @@ abd_size_alloc_linear(size_t size) void abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) { - uint_t n = abd_scatter_chunkcnt(abd); + uint_t n; + + if (abd_is_from_pages(abd)) + n = abd_chunkcnt_for_bytes(abd->abd_size); + else + n = abd_scatter_chunkcnt(abd); ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); int waste = (n << PAGE_SHIFT) - abd->abd_size; + ASSERT3U(n, >, 0); + ASSERT3S(waste, >=, 0); + IMPLY(abd_is_linear_page(abd), waste < PAGE_SIZE); if (op == ABDSTAT_INCR) { ABDSTAT_BUMP(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, 
abd->abd_size); @@ -198,10 +207,16 @@ abd_free_chunks(abd_t *abd) { uint_t i, n; - n = abd_scatter_chunkcnt(abd); - for (i = 0; i < n; i++) { - kmem_cache_free(abd_chunk_cache, - ABD_SCATTER(abd).abd_chunks[i]); + /* + * Scatter ABDs may be constructed by abd_alloc_from_pages() from + * an array of pages. In which case they should not be freed. + */ + if (!abd_is_from_pages(abd)) { + n = abd_scatter_chunkcnt(abd); + for (i = 0; i < n; i++) { + kmem_cache_free(abd_chunk_cache, + ABD_SCATTER(abd).abd_chunks[i]); + } } } @@ -342,11 +357,8 @@ abd_fini(void) void abd_free_linear_page(abd_t *abd) { - /* - * FreeBSD does not have scatter linear pages - * so there is an error. - */ - VERIFY(0); + ASSERT3P(abd->abd_u.abd_linear.sf, !=, NULL); + zfs_unmap_page(abd->abd_u.abd_linear.sf); } /* @@ -365,6 +377,26 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata) return (abd_alloc_linear(size, is_metadata)); } +static abd_t * +abd_get_offset_from_pages(abd_t *abd, abd_t *sabd, size_t chunkcnt, + size_t new_offset) +{ + ASSERT(abd_is_from_pages(sabd)); + + /* + * Set the child child chunks to point at the parent chunks as + * the chunks are just pages and we don't want to copy them. 
+ */ + size_t parent_offset = new_offset / PAGE_SIZE; + ASSERT3U(parent_offset, <, abd_scatter_chunkcnt(sabd)); + for (int i = 0; i < chunkcnt; i++) + ABD_SCATTER(abd).abd_chunks[i] = + ABD_SCATTER(sabd).abd_chunks[parent_offset + i]; + + abd->abd_flags |= ABD_FLAG_FROM_PAGES; + return (abd); +} + abd_t * abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, size_t size) @@ -399,6 +431,11 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, ABD_SCATTER(abd).abd_offset = new_offset & PAGE_MASK; + if (abd_is_from_pages(sabd)) { + return (abd_get_offset_from_pages(abd, sabd, chunkcnt, + new_offset)); + } + /* Copy the scatterlist starting at the correct offset */ (void) memcpy(&ABD_SCATTER(abd).abd_chunks, &ABD_SCATTER(sabd).abd_chunks[new_offset >> PAGE_SHIFT], @@ -407,6 +444,47 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, return (abd); } +/* + * Allocate a scatter ABD structure from user pages. + */ +abd_t * +abd_alloc_from_pages(vm_page_t *pages, unsigned long offset, uint64_t size) +{ + VERIFY3U(size, <=, DMU_MAX_ACCESS); + ASSERT3U(offset, <, PAGE_SIZE); + ASSERT3P(pages, !=, NULL); + + abd_t *abd = abd_alloc_struct(size); + abd->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_FROM_PAGES; + abd->abd_size = size; + + if (size < PAGE_SIZE) { + /* + * We do not have a full page so we will just use a linear ABD. + * We have to make sure to take into account the offset though. + * In all other cases our offset will be 0 as we are always + * PAGE_SIZE aligned. + */ + ASSERT3U(offset + size, <=, PAGE_SIZE); + abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE; + ABD_LINEAR_BUF(abd) = (char *)zfs_map_page(pages[0], + &abd->abd_u.abd_linear.sf) + offset; + } else { + ABD_SCATTER(abd).abd_offset = offset; + ASSERT0(ABD_SCATTER(abd).abd_offset); + + /* + * Setting the ABD's abd_chunks to point to the user pages. 
+ */ + for (int i = 0; i < abd_chunkcnt_for_bytes(size); i++) + ABD_SCATTER(abd).abd_chunks[i] = pages[i]; + } + + abd_update_scatter_stats(abd, ABDSTAT_INCR); + + return (abd); +} + /* * Initialize the abd_iter. */ @@ -468,6 +546,16 @@ abd_iter_map(struct abd_iter *aiter) if (abd_is_linear(abd)) { aiter->iter_mapsize = abd->abd_size - offset; paddr = ABD_LINEAR_BUF(abd); + } else if (abd_is_from_pages(abd)) { + aiter->sf = NULL; + offset += ABD_SCATTER(abd).abd_offset; + size_t index = offset / PAGE_SIZE; + offset &= PAGE_MASK; + aiter->iter_mapsize = MIN(PAGE_SIZE - offset, + abd->abd_size - aiter->iter_pos); + paddr = zfs_map_page( + ABD_SCATTER(aiter->iter_abd).abd_chunks[index], + &aiter->sf); } else { offset += ABD_SCATTER(abd).abd_offset; paddr = ABD_SCATTER(abd).abd_chunks[offset >> PAGE_SHIFT]; @@ -490,6 +578,12 @@ abd_iter_unmap(struct abd_iter *aiter) ASSERT3U(aiter->iter_mapsize, >, 0); } + if (abd_is_from_pages(aiter->iter_abd) && + !abd_is_linear_page(aiter->iter_abd)) { + ASSERT3P(aiter->sf, !=, NULL); + zfs_unmap_page(aiter->sf); + } + aiter->iter_mapaddr = NULL; aiter->iter_mapsize = 0; } @@ -499,3 +593,67 @@ abd_cache_reap_now(void) { kmem_cache_reap_soon(abd_chunk_cache); } + +/* + * Borrow a raw buffer from an ABD without copying the contents of the ABD + * into the buffer. If the ABD is scattered, this will alloate a raw buffer + * whose contents are undefined. To copy over the existing data in the ABD, use + * abd_borrow_buf_copy() instead. 
+ */ +void * +abd_borrow_buf(abd_t *abd, size_t n) +{ + void *buf; + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, 0); + if (abd_is_linear(abd)) { + buf = abd_to_buf(abd); + } else { + buf = zio_buf_alloc(n); + } +#ifdef ZFS_DEBUG + (void) zfs_refcount_add_many(&abd->abd_children, n, buf); +#endif + return (buf); +} + +void * +abd_borrow_buf_copy(abd_t *abd, size_t n) +{ + void *buf = abd_borrow_buf(abd, n); + if (!abd_is_linear(abd)) { + abd_copy_to_buf(buf, abd, n); + } + return (buf); +} + +/* + * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will + * no change the contents of the ABD and will ASSERT that you didn't modify + * the buffer since it was borrowed. If you want any changes you made to buf to + * be copied back to abd, use abd_return_buf_copy() instead. + */ +void +abd_return_buf(abd_t *abd, void *buf, size_t n) +{ + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); +#ifdef ZFS_DEBUG + (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); +#endif + if (abd_is_linear(abd)) { + ASSERT3P(buf, ==, abd_to_buf(abd)); + } else { + ASSERT0(abd_cmp_buf(abd, buf, n)); + zio_buf_free(buf, n); + } +} + +void +abd_return_buf_copy(abd_t *abd, void *buf, size_t n) +{ + if (!abd_is_linear(abd)) { + abd_copy_from_buf(abd, buf, n); + } + abd_return_buf(abd, buf, n); +} diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index c84cb7407a..d58fd241c5 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -831,6 +831,35 @@ SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, " new top-level vdevs. 
(LEGACY)"); /* END CSTYLED */ +int +param_set_direct_write_verify_pct(SYSCTL_HANDLER_ARGS) +{ + int val; + int err; + + val = zfs_vdev_direct_write_verify_pct; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (SET_ERROR(err)); + + if (val > 100 || val < 0) + return (SET_ERROR(EINVAL)); + + zfs_vdev_direct_write_verify_pct = val; + + return (0); +} + +/* BEGIN CSTYLED */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, direct_write_verify_pct, + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + &zfs_vdev_direct_write_verify_pct, + sizeof (zfs_vdev_direct_write_verify_pct), + param_set_direct_write_verify_pct, "IU", + "Percentage of Direct I/O writes per top-level VDEV for checksum" + " verification to be performed"); +/* END CSTYLED */ + /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. diff --git a/module/os/freebsd/zfs/zfs_racct.c b/module/os/freebsd/zfs/zfs_racct.c index 883255bc19..2989a9af92 100644 --- a/module/os/freebsd/zfs/zfs_racct.c +++ b/module/os/freebsd/zfs/zfs_racct.c @@ -27,7 +27,7 @@ #include void -zfs_racct_read(uint64_t size, uint64_t iops) +zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) { curthread->td_ru.ru_inblock += iops; #ifdef RACCT @@ -40,10 +40,12 @@ zfs_racct_read(uint64_t size, uint64_t iops) #else (void) size; #endif /* RACCT */ + + spa_iostats_read_add(spa, size, iops, flags); } void -zfs_racct_write(uint64_t size, uint64_t iops) +zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) { curthread->td_ru.ru_oublock += iops; #ifdef RACCT @@ -56,4 +58,6 @@ zfs_racct_write(uint64_t size, uint64_t iops) #else (void) size; #endif /* RACCT */ + + spa_iostats_write_add(spa, size, iops, flags); } diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 01b964f98f..d13db17516 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ 
-4131,7 +4131,7 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, * but that would make the locking messier */ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, - len, commit, NULL, NULL); + len, commit, B_FALSE, NULL, NULL); zfs_vmobject_wlock(object); for (i = 0; i < ncount; i++) { @@ -4266,12 +4266,37 @@ ioflags(int ioflags) flags |= O_APPEND; if (ioflags & IO_NDELAY) flags |= O_NONBLOCK; + if (ioflags & IO_DIRECT) + flags |= O_DIRECT; if (ioflags & IO_SYNC) flags |= O_SYNC; return (flags); } +static int +zfs_freebsd_read_direct(znode_t *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, + int ioflag, cred_t *cr) +{ + int ret; + int flags = ioflag; + + ASSERT3U(rw, ==, UIO_READ); + + /* On error, return to fallback to the buffred path */ + ret = zfs_setup_direct(zp, uio, rw, &flags); + if (ret) + return (ret); + + ASSERT(uio->uio_extflg & UIO_DIRECT); + + ret = zfs_read(zp, uio, flags, cr); + + zfs_uio_free_dio_pages(uio, rw); + + return (ret); +} + #ifndef _SYS_SYSPROTO_H_ struct vop_read_args { struct vnode *a_vp; @@ -4285,9 +4310,87 @@ static int zfs_freebsd_read(struct vop_read_args *ap) { zfs_uio_t uio; + int error = 0; + znode_t *zp = VTOZ(ap->a_vp); + int ioflag = ioflags(ap->a_ioflag); + zfs_uio_init(&uio, ap->a_uio); - return (zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), - ap->a_cred)); + + zfs_direct_enabled_t direct = + zfs_check_direct_enabled(zp, ioflag, &error); + + if (direct == ZFS_DIRECT_IO_ERR) { + return (error); + } else if (direct == ZFS_DIRECT_IO_ENABLED) { + error = + zfs_freebsd_read_direct(zp, &uio, UIO_READ, ioflag, + ap->a_cred); + /* + * XXX We occasionally get an EFAULT for Direct I/O reads on + * FreeBSD 13. This still needs to be resolved. The EFAULT comes + * from: + * zfs_uio_get__dio_pages_alloc() -> + * zfs_uio_get_dio_pages_impl() -> + * zfs_uio_iov_step() -> + * zfs_uio_get_user_pages(). + * We return EFAULT from zfs_uio_iov_step(). 
When a Direct I/O + * read fails to map in the user pages (returning EFAULT) the + * Direct I/O request is broken up into two separate IO requests + * and issued separately using Direct I/O. + */ +#ifdef ZFS_DEBUG + if (error == EFAULT) { +#if 0 + printf("%s(%d): Direct I/O read returning EFAULT " + "uio = %p, zfs_uio_offset(uio) = %lu " + "zfs_uio_resid(uio) = %lu\n", + __FUNCTION__, __LINE__, &uio, zfs_uio_offset(&uio), + zfs_uio_resid(&uio)); +#endif + } + +#endif + + /* + * On error we will return unless the error is EAGAIN, which + * just tells us to fallback to the buffered path. + */ + if (error != EAGAIN) + return (error); + else + ioflag &= ~O_DIRECT; + } + + + ASSERT(direct == ZFS_DIRECT_IO_DISABLED || + (direct == ZFS_DIRECT_IO_ENABLED && error == EAGAIN)); + + error = zfs_read(zp, &uio, ioflag, ap->a_cred); + + return (error); +} + +static int +zfs_freebsd_write_direct(znode_t *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, + int ioflag, cred_t *cr) +{ + int ret; + int flags = ioflag; + + ASSERT3U(rw, ==, UIO_WRITE); + + /* On error, return to fallback to the buffred path */ + ret = zfs_setup_direct(zp, uio, rw, &flags); + if (ret) + return (ret); + + ASSERT(uio->uio_extflg & UIO_DIRECT); + + ret = zfs_write(zp, uio, flags, cr); + + zfs_uio_free_dio_pages(uio, rw); + + return (ret); } #ifndef _SYS_SYSPROTO_H_ @@ -4303,9 +4406,39 @@ static int zfs_freebsd_write(struct vop_write_args *ap) { zfs_uio_t uio; + int error = 0; + znode_t *zp = VTOZ(ap->a_vp); + int ioflag = ioflags(ap->a_ioflag); + zfs_uio_init(&uio, ap->a_uio); - return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), - ap->a_cred)); + + zfs_direct_enabled_t direct = + zfs_check_direct_enabled(zp, ioflag, &error); + + if (direct == ZFS_DIRECT_IO_ERR) { + return (error); + } else if (direct == ZFS_DIRECT_IO_ENABLED) { + error = + zfs_freebsd_write_direct(zp, &uio, UIO_WRITE, ioflag, + ap->a_cred); + + /* + * On error we will return unless the error is EAGAIN, which + * just tells us to fallback to 
the buffered path. + */ + if (error != EAGAIN) + return (error); + else + ioflag &= ~O_DIRECT; + + } + + ASSERT(direct == ZFS_DIRECT_IO_DISABLED || + (direct == ZFS_DIRECT_IO_ENABLED && error == EAGAIN)); + + error = zfs_write(zp, &uio, ioflag, ap->a_cred); + + return (error); } /* diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index ddb20b0314..c3be4730d4 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -922,6 +922,7 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag) if (commit) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); + return (error); } diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 60287ccdda..72b5a628ee 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -186,6 +186,7 @@ static int zfs_abd_scatter_min_size = 512 * 3; abd_t *abd_zero_scatter = NULL; struct page; + /* * abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will * point to ZERO_PAGE if it is available or it will be an allocated zero'd @@ -453,14 +454,21 @@ abd_free_chunks(abd_t *abd) if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); - abd_for_each_sg(abd, sg, nr_pages, i) { - page = sg_page(sg); - abd_unmark_zfs_page(page); - order = compound_order(page); - __free_pages(page, order); - ASSERT3U(sg->length, <=, PAGE_SIZE << order); - ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); + /* + * Scatter ABDs may be constructed by abd_alloc_from_pages() from + * an array of pages. In which case they should not be freed. 
+ */ + if (!abd_is_from_pages(abd)) { + abd_for_each_sg(abd, sg, nr_pages, i) { + page = sg_page(sg); + abd_unmark_zfs_page(page); + order = compound_order(page); + __free_pages(page, order); + ASSERT3U(sg->length, <=, PAGE_SIZE << order); + ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); + } } + abd_free_sg_table(abd); } @@ -551,17 +559,19 @@ abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) void abd_verify_scatter(abd_t *abd) { - size_t n; - int i = 0; - struct scatterlist *sg = NULL; - ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); ASSERT3U(ABD_SCATTER(abd).abd_offset, <, ABD_SCATTER(abd).abd_sgl->length); - n = ABD_SCATTER(abd).abd_nents; + +#ifdef ZFS_DEBUG + struct scatterlist *sg = NULL; + size_t n = ABD_SCATTER(abd).abd_nents; + int i = 0; + abd_for_each_sg(abd, sg, n, i) { ASSERT3P(sg_page(sg), !=, NULL); } +#endif } static void @@ -687,6 +697,11 @@ abd_free_linear_page(abd_t *abd) { /* Transform it back into a scatter ABD for freeing */ struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl; + + /* When backed by user page unmap it */ + if (abd_is_from_pages(abd)) + zfs_kunmap(sg_page(sg)); + abd->abd_flags &= ~ABD_FLAG_LINEAR; abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; ABD_SCATTER(abd).abd_nents = 1; @@ -697,6 +712,69 @@ abd_free_linear_page(abd_t *abd) abd_update_scatter_stats(abd, ABDSTAT_DECR); } +/* + * Allocate a scatter ABD structure from user pages. The pages must be + * pinned with get_user_pages, or similiar, but need not be mapped via + * the kmap interfaces. + */ +abd_t * +abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size) +{ + uint_t npages = DIV_ROUND_UP(size, PAGE_SIZE); + struct sg_table table; + + VERIFY3U(size, <=, DMU_MAX_ACCESS); + ASSERT3U(offset, <, PAGE_SIZE); + ASSERT3P(pages, !=, NULL); + + /* + * Even if this buf is filesystem metadata, we only track that we + * own the underlying data buffer, which is not true in this case. + * Therefore, we don't ever use ABD_FLAG_META here. 
+ */ + abd_t *abd = abd_alloc_struct(0); + abd->abd_flags |= ABD_FLAG_FROM_PAGES | ABD_FLAG_OWNER; + abd->abd_size = size; + + while (sg_alloc_table_from_pages(&table, pages, npages, offset, + size, __GFP_NOWARN | GFP_NOIO) != 0) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + if (size < PAGE_SIZE) { + /* + * Since there is only one entry, this ABD can be represented + * as a linear buffer. All single-page (4K) ABD's constructed + * from a user page can be represented this way as long as the + * page is mapped to a virtual address. This allows us to + * apply an offset in to the mapped page. + * + * Note that kmap() must be used, not kmap_atomic(), because + * the mapping needs to bet set up on all CPUs. Using kmap() + * also enables the user of highmem pages when required. + */ + ASSERT3U(offset + size, <=, PAGE_SIZE); + abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE; + abd->abd_u.abd_linear.abd_sgl = table.sgl; + zfs_kmap(sg_page(table.sgl)); + ABD_LINEAR_BUF(abd) = sg_virt(table.sgl); + } else { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + + ABD_SCATTER(abd).abd_offset = offset; + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = table.nents; + + ASSERT0(ABD_SCATTER(abd).abd_offset); + } + + abd_update_scatter_stats(abd, ABDSTAT_INCR); + + return (abd); +} + /* * If we're going to use this ABD for doing I/O using the block layer, the * consumer of the ABD data doesn't care if it's scattered or not, and we don't @@ -746,6 +824,9 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, ABD_SCATTER(abd).abd_offset = new_offset; ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; + if (abd_is_from_pages(sabd)) + abd->abd_flags |= ABD_FLAG_FROM_PAGES; + return (abd); } @@ -873,6 +954,115 @@ abd_cache_reap_now(void) { } +/* + * Borrow a raw buffer from an ABD without copying the contents of the ABD + * into the buffer. 
If the ABD is scattered, this will allocate a raw buffer + * whose contents are undefined. To copy over the existing data in the ABD, use + * abd_borrow_buf_copy() instead. + */ +void * +abd_borrow_buf(abd_t *abd, size_t n) +{ + void *buf; + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, 0); + /* + * In the event the ABD is composed of a single user page from Direct + * I/O we can not directly return the raw buffer. This is a consequence + * of not being able to write protect the page and the contents of the + * page can be changed at any time by the user. + */ + if (abd_is_from_pages(abd)) { + buf = zio_buf_alloc(n); + } else if (abd_is_linear(abd)) { + buf = abd_to_buf(abd); + } else { + buf = zio_buf_alloc(n); + } + +#ifdef ZFS_DEBUG + (void) zfs_refcount_add_many(&abd->abd_children, n, buf); +#endif + return (buf); +} + +void * +abd_borrow_buf_copy(abd_t *abd, size_t n) +{ + void *buf = abd_borrow_buf(abd, n); + + /* + * In the event the ABD is composed of a single user page from Direct + * I/O we must make sure to copy the data over into the newly allocated + * buffer. This is a consequence of the fact that we can not write + * protect the user page and there is a risk the contents of the page + * could be changed by the user at any moment. + */ + if (!abd_is_linear(abd) || abd_is_from_pages(abd)) { + abd_copy_to_buf(buf, abd, n); + } + return (buf); +} + +/* + * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will + * not change the contents of the ABD. If you want any changes you made to + * buf to be copied back to abd, use abd_return_buf_copy() instead. If the + * ABD is not constructed from user pages for Direct I/O then an ASSERT + * checks to make sure the contents of the buffer have not changed since it was + * borrowed. 
We can not ASSERT that the contents of the buffer have not changed + * if it is composed of user pages because the pages can not be placed under + * write protection and the user could have possibly changed the contents in + * the pages at any time. + */ +void +abd_return_buf(abd_t *abd, void *buf, size_t n) +{ + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); +#ifdef ZFS_DEBUG + (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); +#endif + if (abd_is_from_pages(abd)) { + zio_buf_free(buf, n); + } else if (abd_is_linear(abd)) { + ASSERT3P(buf, ==, abd_to_buf(abd)); + } else if (abd_is_gang(abd)) { +#ifdef ZFS_DEBUG + /* + * We have to be careful with gang ABD's that we do not ASSERT0 + * for any ABD's that contain user pages from Direct I/O. In + * order to handle this, we just iterate through the gang ABD + * and only verify ABDs that are not from user pages. + */ + void *cmp_buf = buf; + + for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain); + cabd != NULL; + cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { + if (!abd_is_from_pages(cabd)) { + ASSERT0(abd_cmp_buf(cabd, cmp_buf, + cabd->abd_size)); + } + cmp_buf = (char *)cmp_buf + cabd->abd_size; + } +#endif + zio_buf_free(buf, n); + } else { + ASSERT0(abd_cmp_buf(abd, buf, n)); + zio_buf_free(buf, n); + } +} + +void +abd_return_buf_copy(abd_t *abd, void *buf, size_t n) +{ + if (!abd_is_linear(abd) || abd_is_from_pages(abd)) { + abd_copy_from_buf(abd, buf, n); + } + abd_return_buf(abd, buf, n); +} + /* * This is abd_iter_page(), the function underneath abd_iterate_page_func(). 
* It yields the next page struct and data offset and size within it, without diff --git a/module/os/linux/zfs/vdev_os.c b/module/os/linux/zfs/vdev_os.c new file mode 100644 index 0000000000..3bd7296da9 --- /dev/null +++ b/module/os/linux/zfs/vdev_os.c @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2022 by Triad National Security, LLC. 
+ */ + +#include + +#ifdef _KERNEL + +int +param_set_direct_write_verify_pct(const char *buf, zfs_kernel_param_t *kp) +{ + uint_t val; + int error; + + error = kstrtouint(buf, 0, &val); + if (error < 0) + return (SET_ERROR(error)); + + if (val > 100) + return (SET_ERROR(-EINVAL)); + + error = param_set_uint(buf, kp); + if (error < 0) + return (SET_ERROR(error)); + + return (0); +} + +#endif /* _KERNEL */ diff --git a/module/os/linux/zfs/zfs_racct.c b/module/os/linux/zfs/zfs_racct.c index ce623ef9d1..ce197caa45 100644 --- a/module/os/linux/zfs/zfs_racct.c +++ b/module/os/linux/zfs/zfs_racct.c @@ -25,14 +25,35 @@ #include +#ifdef _KERNEL +#include + void -zfs_racct_read(uint64_t size, uint64_t iops) +zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) { - (void) size, (void) iops; + task_io_account_read(size); + spa_iostats_read_add(spa, size, iops, flags); } void -zfs_racct_write(uint64_t size, uint64_t iops) +zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) { - (void) size, (void) iops; + task_io_account_write(size); + spa_iostats_write_add(spa, size, iops, flags); } + +#else + +void +zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) +{ + (void) spa, (void) size, (void) iops, (void) flags; +} + +void +zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) +{ + (void) spa, (void) size, (void) iops, (void) flags; +} + +#endif /* _KERNEL */ diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index a99a1ba882..75ce3b0d8f 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -41,12 +41,19 @@ #ifdef _KERNEL +#include +#include +#include #include #include #include #include +#include +#include #include #include +#include +#include /* * Move "n" bytes at byte address "p"; "rw" indicates the direction @@ -327,8 +334,13 @@ EXPORT_SYMBOL(zfs_uiomove); int zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio) { - if (uio->uio_segflg == 
UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC) { - /* There's never a need to fault in kernel pages */ + if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC || + (uio->uio_extflg & UIO_DIRECT)) { + /* + * There's never a need to fault in kernel pages or Direct I/O + * write pages. Direct I/O write pages have been pinned in so + * there is never a time for these pages a fault will occur. + */ return (0); #if defined(HAVE_VFS_IOV_ITER) } else if (uio->uio_segflg == UIO_ITER) { @@ -437,9 +449,289 @@ zfs_uioskip(zfs_uio_t *uio, size_t n) uio->uio_iovcnt--; } } + uio->uio_loffset += n; uio->uio_resid -= n; } EXPORT_SYMBOL(zfs_uioskip); +/* + * Check if the uio is page-aligned in memory. + */ +boolean_t +zfs_uio_page_aligned(zfs_uio_t *uio) +{ + boolean_t aligned = B_TRUE; + + if (uio->uio_segflg == UIO_USERSPACE || + uio->uio_segflg == UIO_SYSSPACE) { + const struct iovec *iov = uio->uio_iov; + size_t skip = uio->uio_skip; + + for (int i = uio->uio_iovcnt; i > 0; iov++, i--) { + unsigned long addr = + (unsigned long)(iov->iov_base + skip); + size_t size = iov->iov_len - skip; + if ((addr & (PAGE_SIZE - 1)) || + (size & (PAGE_SIZE - 1))) { + aligned = B_FALSE; + break; + } + skip = 0; + } +#if defined(HAVE_VFS_IOV_ITER) + } else if (uio->uio_segflg == UIO_ITER) { + unsigned long alignment = + iov_iter_alignment(uio->uio_iter); + aligned = IS_P2ALIGNED(alignment, PAGE_SIZE); +#endif + } else { + /* Currently not supported */ + aligned = B_FALSE; + } + + return (aligned); +} + + +#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64) +#define ZFS_MARKEED_PAGE 0x0 +#define IS_ZFS_MARKED_PAGE(_p) 0 +#define zfs_mark_page(_p) +#define zfs_unmark_page(_p) +#define IS_ZERO_PAGE(_p) 0 + +#else +/* + * Mark pages to know if they were allocated to replace ZERO_PAGE() for + * Direct I/O writes. 
+ */ +#define ZFS_MARKED_PAGE 0x5a465350414745 /* ASCII: ZFSPAGE */ +#define IS_ZFS_MARKED_PAGE(_p) \ + (page_private(_p) == (unsigned long)ZFS_MARKED_PAGE) +#define IS_ZERO_PAGE(_p) ((_p) == ZERO_PAGE(0)) + +static inline void +zfs_mark_page(struct page *page) +{ + ASSERT3P(page, !=, NULL); + get_page(page); + SetPagePrivate(page); + set_page_private(page, ZFS_MARKED_PAGE); +} + +static inline void +zfs_unmark_page(struct page *page) +{ + ASSERT3P(page, !=, NULL); + set_page_private(page, 0UL); + ClearPagePrivate(page); + put_page(page); +} +#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */ + +static void +zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio) +{ + ASSERT3P(uio->uio_dio.pages, !=, NULL); + + for (int i = 0; i < uio->uio_dio.npages; i++) { + struct page *p = uio->uio_dio.pages[i]; + lock_page(p); + + if (IS_ZERO_PAGE(p)) { + /* + * If the user page points the kernels ZERO_PAGE() a + * new zero filled page will just be allocated so the + * contents of the page can not be changed by the user + * while a Direct I/O write is taking place. + */ + gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO | + __GFP_ZERO | GFP_KERNEL; + + ASSERT0(IS_ZFS_MARKED_PAGE(p)); + unlock_page(p); + put_page(p); + + p = __page_cache_alloc(gfp_zero_page); + zfs_mark_page(p); + } else { + unlock_page(p); + } + } +} + +void +zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + + ASSERT(uio->uio_extflg & UIO_DIRECT); + ASSERT3P(uio->uio_dio.pages, !=, NULL); + + for (int i = 0; i < uio->uio_dio.npages; i++) { + struct page *p = uio->uio_dio.pages[i]; + + if (IS_ZFS_MARKED_PAGE(p)) { + zfs_unmark_page(p); + __free_page(p); + continue; + } + + put_page(p); + } + + vmem_free(uio->uio_dio.pages, + uio->uio_dio.npages * sizeof (struct page *)); +} + +/* + * zfs_uio_iov_step() is just a modified version of the STEP function of Linux's + * iov_iter_get_pages(). 
+ */ +static size_t +zfs_uio_iov_step(struct iovec v, zfs_uio_rw_t rw, zfs_uio_t *uio, int *numpages) +{ + unsigned long addr = (unsigned long)(v.iov_base); + size_t len = v.iov_len; + int n = DIV_ROUND_UP(len, PAGE_SIZE); + + int res = zfs_get_user_pages( + P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n, rw == UIO_READ, + &uio->uio_dio.pages[uio->uio_dio.npages]); + if (res < 0) { + *numpages = -1; + return (-res); + } else if (len != (res * PAGE_SIZE)) { + *numpages = -1; + return (len); + } + + ASSERT3S(len, ==, res * PAGE_SIZE); + *numpages = res; + return (len); +} + +static int +zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + const struct iovec *iovp = uio->uio_iov; + size_t skip = uio->uio_skip; + size_t wanted, maxsize; + + ASSERT(uio->uio_segflg != UIO_SYSSPACE); + wanted = maxsize = uio->uio_resid - skip; + + for (int i = 0; i < uio->uio_iovcnt; i++) { + struct iovec iov; + int numpages = 0; + + if (iovp->iov_len == 0) { + iovp++; + skip = 0; + continue; + } + iov.iov_len = MIN(maxsize, iovp->iov_len - skip); + iov.iov_base = iovp->iov_base + skip; + ssize_t left = zfs_uio_iov_step(iov, rw, uio, &numpages); + + if (numpages == -1) { + return (left); + } + + ASSERT3U(left, ==, iov.iov_len); + uio->uio_dio.npages += numpages; + maxsize -= iov.iov_len; + wanted -= left; + skip = 0; + iovp++; + } + + ASSERT0(wanted); + return (0); +} + +#if defined(HAVE_VFS_IOV_ITER) +static int +zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + size_t skip = uio->uio_skip; + size_t wanted = uio->uio_resid - uio->uio_skip; + size_t rollback = 0; + size_t cnt; + size_t maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE); + + while (wanted) { +#if defined(HAVE_IOV_ITER_GET_PAGES2) + cnt = iov_iter_get_pages2(uio->uio_iter, + &uio->uio_dio.pages[uio->uio_dio.npages], + wanted, maxpages, &skip); +#else + cnt = iov_iter_get_pages(uio->uio_iter, + &uio->uio_dio.pages[uio->uio_dio.npages], + wanted, maxpages, &skip); +#endif + if (cnt < 0) { + 
iov_iter_revert(uio->uio_iter, rollback); + return (SET_ERROR(-cnt)); + } + uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE); + rollback += cnt; + wanted -= cnt; + skip = 0; +#if !defined(HAVE_IOV_ITER_GET_PAGES2) + /* + * iov_iter_get_pages2() advances the iov_iter on success. + */ + iov_iter_advance(uio->uio_iter, cnt); +#endif + + } + ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip); + iov_iter_revert(uio->uio_iter, rollback); + + return (0); +} +#endif /* HAVE_VFS_IOV_ITER */ + +/* + * This function maps user pages into the kernel. In the event that the user + * pages were not mapped successfully an error value is returned. + * + * On success, 0 is returned. + */ +int +zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + int error = 0; + size_t npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE); + size_t size = npages * sizeof (struct page *); + + if (uio->uio_segflg == UIO_USERSPACE) { + uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); + error = zfs_uio_get_dio_pages_iov(uio, rw); + ASSERT3S(uio->uio_dio.npages, ==, npages); +#if defined(HAVE_VFS_IOV_ITER) + } else if (uio->uio_segflg == UIO_ITER) { + uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); + error = zfs_uio_get_dio_pages_iov_iter(uio, rw); + ASSERT3S(uio->uio_dio.npages, ==, npages); +#endif + } else { + return (SET_ERROR(EOPNOTSUPP)); + } + + if (error) { + vmem_free(uio->uio_dio.pages, size); + return (error); + } + + if (rw == UIO_WRITE) { + zfs_uio_dio_check_for_zero_page(uio); + } + + uio->uio_extflg |= UIO_DIRECT; + + return (0); +} + #endif /* _KERNEL */ diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index a52f08868d..22a4ad1ef5 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -59,6 +59,7 @@ #include #include #include +#include #include "zfs_comutil.h" enum { diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 9803c7fecb..a1c55b81dd 100644 --- 
a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -228,7 +228,8 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) #if defined(_KERNEL) -static int zfs_fillpage(struct inode *ip, struct page *pp); +static int zfs_fillpage(struct inode *ip, struct page *pp, + boolean_t rangelock_held); /* * When a file is memory mapped, we must keep the IO data synchronized @@ -296,13 +297,14 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { + /* * If filemap_fault() retries there exists a window * where the page will be unlocked and not up to date. * In this case we must try and fill the page. */ if (unlikely(!PageUptodate(pp))) { - error = zfs_fillpage(ip, pp); + error = zfs_fillpage(ip, pp, B_TRUE); if (error) { unlock_page(pp); put_page(pp); @@ -3866,7 +3868,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, } zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit, - for_sync ? zfs_putpage_sync_commit_cb : + B_FALSE, for_sync ? zfs_putpage_sync_commit_cb : zfs_putpage_async_commit_cb, pp); dmu_tx_commit(tx); @@ -4007,20 +4009,68 @@ zfs_inactive(struct inode *ip) * Fill pages with data from the disk. */ static int -zfs_fillpage(struct inode *ip, struct page *pp) +zfs_fillpage(struct inode *ip, struct page *pp, boolean_t rangelock_held) { + znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t i_size = i_size_read(ip); u_offset_t io_off = page_offset(pp); size_t io_len = PAGE_SIZE; + zfs_locked_range_t *lr = NULL; ASSERT3U(io_off, <, i_size); if (io_off + io_len > i_size) io_len = i_size - io_off; + /* + * It is important to hold the rangelock here because it is possible + * a Direct I/O write might be taking place at the same time that a + * page is being faulted in through filemap_fault(). With a Direct I/O + * write, db->db_data will be set to NULL either in: + * 1. 
dmu_write_direct() -> dmu_buf_will_not_fill() -> + * dmu_buf_will_fill() -> dbuf_noread() -> dbuf_clear_data() + * 2. dmu_write_direct_done() + * If the rangelock is not held, then there is a race between faulting + * in a page and writing out a Direct I/O write. Without the rangelock + * a NULL pointer dereference can occur in dmu_read_impl() for + * db->db_data during the memcpy operation. + * + * Another important note here is we have to check to make sure the + * rangelock is not already held from mappedread() -> zfs_fillpage(). + * filemap_fault() will first add the page to the inode address_space + * mapping and then will drop the page lock. This leaves open a window + * for mappedread() to begin. In this case the page lock and rangelock + * are both held and it might have to call here if the page is not + * up to date. In this case the rangelock can not be held twice or a + * deadlock can happen. So the rangelock only needs to be acquired if + * zfs_fillpage() is being called by zfs_getpage(). + * + * Finally it is also important to drop the page lock before grabbing + * the rangelock to avoid another deadlock between here and + * zfs_write() -> update_pages(). update_pages() holds both the + * rangelock and the page lock. + */ + if (rangelock_held == B_FALSE) { + /* + * First try grabbing the rangelock. If that can not be done + * the page lock must be dropped before grabbing the rangelock + * to avoid a deadlock with update_pages(). See comment above. 
+ */ + lr = zfs_rangelock_tryenter(&zp->z_rangelock, io_off, io_len, + RL_READER); + if (lr == NULL) { + get_page(pp); + unlock_page(pp); + lr = zfs_rangelock_enter(&zp->z_rangelock, io_off, + io_len, RL_READER); + lock_page(pp); + put_page(pp); + } + } + void *va = kmap(pp); - int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off, + int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off, io_len, va, DMU_READ_PREFETCH); if (io_len != PAGE_SIZE) memset((char *)va + io_len, 0, PAGE_SIZE - io_len); @@ -4038,6 +4088,10 @@ zfs_fillpage(struct inode *ip, struct page *pp) SetPageUptodate(pp); } + + if (rangelock_held == B_FALSE) + zfs_rangelock_exit(lr); + return (error); } @@ -4062,7 +4116,7 @@ zfs_getpage(struct inode *ip, struct page *pp) if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - error = zfs_fillpage(ip, pp); + error = zfs_fillpage(ip, pp, B_FALSE); if (error == 0) dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 9dec52215c..d3fd4340e7 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -309,7 +309,7 @@ zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, } static ssize_t -zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) +zpl_iter_read_buffered(struct kiocb *kiocb, struct iov_iter *to) { cred_t *cr = CRED(); fstrans_cookie_t cookie; @@ -322,8 +322,9 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) crhold(cr); cookie = spl_fstrans_mark(); + int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, - filp->f_flags | zfs_io_flags(kiocb), cr); + flags, cr); spl_fstrans_unmark(cookie); crfree(cr); @@ -339,6 +340,72 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) return (read); } +static ssize_t +zpl_iter_read_direct(struct kiocb *kiocb, struct iov_iter *to) +{ + cred_t *cr = CRED(); + struct file *filp = 
kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + ssize_t count = iov_iter_count(to); + int flags = filp->f_flags | zfs_io_flags(kiocb); + zfs_uio_t uio; + ssize_t ret; + + zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0); + + /* On error, return to fallback to the buffered path. */ + ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_READ, &flags); + if (ret) + return (-ret); + + ASSERT(uio.uio_extflg & UIO_DIRECT); + + crhold(cr); + fstrans_cookie_t cookie = spl_fstrans_mark(); + + int error = -zfs_read(ITOZ(ip), &uio, flags, cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + zfs_uio_free_dio_pages(&uio, UIO_READ); + + if (error < 0) + return (error); + + ssize_t read = count - uio.uio_resid; + kiocb->ki_pos += read; + + zpl_file_accessed(filp); + + return (read); +} + +static ssize_t +zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) +{ + struct inode *ip = kiocb->ki_filp->f_mapping->host; + struct file *filp = kiocb->ki_filp; + int flags = filp->f_flags | zfs_io_flags(kiocb); + int error = 0; + + zfs_direct_enabled_t direct = + zfs_check_direct_enabled(ITOZ(ip), flags, &error); + + if (direct == ZFS_DIRECT_IO_ERR) { + return (-error); + } else if (direct == ZFS_DIRECT_IO_ENABLED) { + ssize_t read = zpl_iter_read_direct(kiocb, to); + + if (read >= 0 || read != -EAGAIN) + return (read); + + /* Otherwise fallback to buffered read */ + } + + return (zpl_iter_read_buffered(kiocb, to)); +} + static inline ssize_t zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from, size_t *countp) @@ -365,27 +432,22 @@ zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from, } static ssize_t -zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) +zpl_iter_write_buffered(struct kiocb *kiocb, struct iov_iter *from) { cred_t *cr = CRED(); - fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; struct inode *ip = filp->f_mapping->host; + size_t wrote; + size_t count = iov_iter_count(from); + zfs_uio_t uio; - size_t count = 0; - 
ssize_t ret; - - ret = zpl_generic_write_checks(kiocb, from, &count); - if (ret) - return (ret); - zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); crhold(cr); - cookie = spl_fstrans_mark(); + fstrans_cookie_t cookie = spl_fstrans_mark(); - int error = -zfs_write(ITOZ(ip), &uio, - filp->f_flags | zfs_io_flags(kiocb), cr); + int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; + int error = -zfs_write(ITOZ(ip), &uio, flags, cr); spl_fstrans_unmark(cookie); crfree(cr); @@ -393,16 +455,95 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) if (error < 0) return (error); - ssize_t wrote = count - uio.uio_resid; + wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; + + if (wrote > 0) + iov_iter_advance(from, wrote); + + return (wrote); +} + +static ssize_t +zpl_iter_write_direct(struct kiocb *kiocb, struct iov_iter *from) +{ + cred_t *cr = CRED(); + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + size_t wrote; + int flags = filp->f_flags | zfs_io_flags(kiocb); + size_t count = iov_iter_count(from); + + zfs_uio_t uio; + zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); + + /* On error, return to fallback to the buffered path. 
*/ + ssize_t ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_WRITE, &flags); + if (ret) + return (-ret); + + ASSERT(uio.uio_extflg & UIO_DIRECT); + + crhold(cr); + fstrans_cookie_t cookie = spl_fstrans_mark(); + + int error = -zfs_write(ITOZ(ip), &uio, flags, cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + zfs_uio_free_dio_pages(&uio, UIO_WRITE); + + if (error < 0) + return (error); + + wrote = count - uio.uio_resid; kiocb->ki_pos += wrote; return (wrote); } +static ssize_t +zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) +{ + struct inode *ip = kiocb->ki_filp->f_mapping->host; + struct file *filp = kiocb->ki_filp; + int flags = filp->f_flags | zfs_io_flags(kiocb); + size_t count = 0; + int error = 0; + + ssize_t ret = zpl_generic_write_checks(kiocb, from, &count); + if (ret) + return (ret); + + loff_t offset = kiocb->ki_pos; + + zfs_direct_enabled_t direct = + zfs_check_direct_enabled(ITOZ(ip), flags, &error); + + if (direct == ZFS_DIRECT_IO_ERR) { + return (-error); + } else if (direct == ZFS_DIRECT_IO_ENABLED) { + ssize_t wrote = zpl_iter_write_direct(kiocb, from); + + if (wrote >= 0 || wrote != -EAGAIN) { + return (wrote); + } + + /* + * If we are falling back to a buffered write, then the + * file position should not be updated at this point. 
+ */ + ASSERT3U(offset, ==, kiocb->ki_pos); + } + + return (zpl_iter_write_buffered(kiocb, from)); +} + #else /* !HAVE_VFS_RW_ITERATE */ static ssize_t -zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, +zpl_aio_read_buffered(struct kiocb *kiocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { cred_t *cr = CRED(); @@ -422,8 +563,9 @@ zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, crhold(cr); cookie = spl_fstrans_mark(); + int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, - filp->f_flags | zfs_io_flags(kiocb), cr); + flags, cr); spl_fstrans_unmark(cookie); crfree(cr); @@ -440,7 +582,87 @@ zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, } static ssize_t -zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, +zpl_aio_read_direct(struct kiocb *kiocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + int flags = filp->f_flags | zfs_io_flags(kiocb); + size_t count; + ssize_t ret; + + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); + if (ret) + return (ret); + + zfs_uio_t uio; + zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, + count, 0); + + /* On error, return to fallback to the buffered path */ + ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_READ, &flags); + if (ret) + return (-ret); + + ASSERT(uio.uio_extflg & UIO_DIRECT); + + crhold(cr); + cookie = spl_fstrans_mark(); + + int error = -zfs_read(ITOZ(ip), &uio, flags, cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + zfs_uio_free_dio_pages(&uio, UIO_READ); + + if (error < 0) + return (error); + + ssize_t read = count - uio.uio_resid; + kiocb->ki_pos += read; + + zpl_file_accessed(filp); + + return (read); +} + +static ssize_t +zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, + unsigned long nr_segs, 
loff_t pos) +{ + struct inode *ip = kiocb->ki_filp->f_mapping->host; + struct file *filp = kiocb->ki_filp; + int flags = filp->f_flags | zfs_io_flags(kiocb); + size_t count; + ssize_t ret; + int error = 0; + + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); + if (ret) + return (ret); + + zfs_direct_enabled_t direct = + zfs_check_direct_enabled(ITOZ(ip), flags, &error); + + if (direct == ZFS_DIRECT_IO_ERR) { + return (-error); + } else if (direct == ZFS_DIRECT_IO_ENABLED) { + ssize_t read = zpl_aio_read_direct(kiocb, iov, nr_segs, pos); + + if (read >= 0 || read != -EAGAIN) + return (read); + + /* Otherwise fallback to buffered read */ + } + + return (zpl_aio_read_buffered(kiocb, iov, nr_segs, pos)); +} + +static ssize_t +zpl_aio_write_buffered(struct kiocb *kiocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { cred_t *cr = CRED(); @@ -454,12 +676,6 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, if (ret) return (ret); - ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode)); - if (ret) - return (ret); - - kiocb->ki_pos = pos; - zfs_uio_t uio; zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, count, 0); @@ -467,8 +683,8 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, crhold(cr); cookie = spl_fstrans_mark(); - int error = -zfs_write(ITOZ(ip), &uio, - filp->f_flags | zfs_io_flags(kiocb), cr); + int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; + int error = -zfs_write(ITOZ(ip), &uio, flags, cr); spl_fstrans_unmark(cookie); crfree(cr); @@ -481,39 +697,135 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, return (wrote); } + +static ssize_t +zpl_aio_write_direct(struct kiocb *kiocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + int flags = filp->f_flags | zfs_io_flags(kiocb); + size_t count; + 
ssize_t ret; + + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); + if (ret) + return (ret); + + zfs_uio_t uio; + zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, + count, 0); + + /* On error, return to fallback to the buffered path. */ + ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_WRITE, &flags); + if (ret) + return (-ret); + + ASSERT(uio.uio_extflg & UIO_DIRECT); + + crhold(cr); + cookie = spl_fstrans_mark(); + + int error = -zfs_write(ITOZ(ip), &uio, flags, cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + zfs_uio_free_dio_pages(&uio, UIO_WRITE); + + if (error < 0) + return (error); + + ssize_t wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; + + return (wrote); +} + +static ssize_t +zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + int flags = filp->f_flags | zfs_io_flags(kiocb); + size_t ocount; + size_t count; + ssize_t ret; + int error = 0; + + ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (ret) + return (ret); + + count = ocount; + + ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode)); + if (ret) + return (ret); + + kiocb->ki_pos = pos; + + zfs_direct_enabled_t direct = + zfs_check_direct_enabled(ITOZ(ip), flags, &error); + + if (direct == ZFS_DIRECT_IO_ERR) { + return (-error); + } else if (direct == ZFS_DIRECT_IO_ENABLED) { + ssize_t wrote = zpl_aio_write_direct(kiocb, iov, nr_segs, pos); + + if (wrote >= 0 || wrote != -EAGAIN) { + return (wrote); + } + + /* + * If we are falling back to a buffered write, then the + * file position should not be updated at this point. 
+ */ + ASSERT3U(pos, ==, kiocb->ki_pos); + } + + return (zpl_aio_write_buffered(kiocb, iov, nr_segs, pos)); +} + #endif /* HAVE_VFS_RW_ITERATE */ -#if defined(HAVE_VFS_RW_ITERATE) static ssize_t -zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter) +zpl_direct_IO_impl(void) { - if (rw == WRITE) - return (zpl_iter_write(kiocb, iter)); - else - return (zpl_iter_read(kiocb, iter)); + /* + * All O_DIRECT requests should be handled by + * zpl_{iter/aio}_{write/read}(). There is no way kernel generic code + * should call the direct_IO address_space_operations function. We set + * this code path to be fatal if it is executed. + */ + VERIFY(0); + return (0); } + +#if defined(HAVE_VFS_RW_ITERATE) #if defined(HAVE_VFS_DIRECT_IO_ITER) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) { - return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); + return (zpl_direct_IO_impl()); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { - ASSERT3S(pos, ==, kiocb->ki_pos); - return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); + return (zpl_direct_IO_impl()); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { - ASSERT3S(pos, ==, kiocb->ki_pos); - return (zpl_direct_IO_impl(rw, kiocb, iter)); + return (zpl_direct_IO_impl()); } #else -#error "Unknown direct IO interface" +#error "Unknown Direct I/O interface" #endif #else /* HAVE_VFS_RW_ITERATE */ @@ -523,26 +835,16 @@ static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { - if (rw == WRITE) - return (zpl_aio_write(kiocb, iov, nr_segs, pos)); - else - return (zpl_aio_read(kiocb, iov, nr_segs, pos)); + return (zpl_direct_IO_impl()); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, struct 
iov_iter *iter, loff_t pos) { - const struct iovec *iovp = iov_iter_iovec(iter); - unsigned long nr_segs = iter->nr_segs; - - ASSERT3S(pos, ==, kiocb->ki_pos); - if (rw == WRITE) - return (zpl_aio_write(kiocb, iovp, nr_segs, pos)); - else - return (zpl_aio_read(kiocb, iovp, nr_segs, pos)); + return (zpl_direct_IO_impl()); } #else -#error "Unknown direct IO interface" +#error "Unknown Direct I/O interface" #endif #endif /* HAVE_VFS_RW_ITERATE */ @@ -627,6 +929,7 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start, (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags); spl_fstrans_unmark(cookie); + if (error) return (error); diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 764993b45e..10ac13a898 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -395,6 +395,13 @@ zfs_prop_init(void) { NULL } }; + static const zprop_index_t direct_table[] = { + { "disabled", ZFS_DIRECT_DISABLED }, + { "standard", ZFS_DIRECT_STANDARD }, + { "always", ZFS_DIRECT_ALWAYS }, + { NULL } + }; + struct zfs_mod_supported_features *sfeatures = zfs_mod_list_supported(ZFS_SYSFS_DATASET_PROPERTIES); @@ -479,6 +486,10 @@ zfs_prop_init(void) ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "default | full | geom | dev | none", "VOLMODE", volmode_table, sfeatures); + zprop_register_index(ZFS_PROP_DIRECT, "direct", + ZFS_DIRECT_STANDARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "disabled | standard | always", "DIRECT", direct_table, + sfeatures); /* inherit index (boolean) properties */ zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, diff --git a/module/zfs/abd.c b/module/zfs/abd.c index c8c4d2270f..529deeecfd 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -89,8 +89,8 @@ * functions. * * As an additional feature, linear and scatter ABD's can be stitched together - * by using the gang ABD type (abd_alloc_gang_abd()). 
This allows for - * multiple ABDs to be viewed as a singular ABD. + * by using the gang ABD type (abd_alloc_gang()). This allows for multiple ABDs + * to be viewed as a singular ABD. * * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to * B_FALSE. @@ -109,11 +109,15 @@ void abd_verify(abd_t *abd) { #ifdef ZFS_DEBUG - ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); + if (abd_is_from_pages(abd)) { + ASSERT3U(abd->abd_size, <=, DMU_MAX_ACCESS); + } else { + ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); + } ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG | - ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD)); + ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD | ABD_FLAG_FROM_PAGES)); IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { @@ -136,7 +140,7 @@ abd_verify(abd_t *abd) #endif } -static void +void abd_init_struct(abd_t *abd) { list_link_init(&abd->abd_gang_link); @@ -238,6 +242,7 @@ abd_free_linear(abd_t *abd) abd_free_linear_page(abd); return; } + if (abd->abd_flags & ABD_FLAG_META) { zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); } else { @@ -520,6 +525,21 @@ abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size) */ abd->abd_flags |= ABD_FLAG_LINEAR; + /* + * User pages from Direct I/O requests may be in a single page + * (ABD_FLAG_LINEAR_PAGE), and we must make sure to still flag + * that here for abd. This is required because we have to be + * careful when borrowing the buffer from the ABD because we + * can not place user pages under write protection on Linux. + * See the comments in abd_os.c for abd_borrow_buf(), + * abd_borrow_buf_copy(), abd_return_buf() and + * abd_return_buf_copy(). 
+ */ + if (abd_is_from_pages(sabd)) { + abd->abd_flags |= ABD_FLAG_FROM_PAGES | + ABD_FLAG_LINEAR_PAGE; + } + ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off; } else if (abd_is_gang(sabd)) { size_t left = size; @@ -648,70 +668,6 @@ abd_to_buf(abd_t *abd) return (ABD_LINEAR_BUF(abd)); } -/* - * Borrow a raw buffer from an ABD without copying the contents of the ABD - * into the buffer. If the ABD is scattered, this will allocate a raw buffer - * whose contents are undefined. To copy over the existing data in the ABD, use - * abd_borrow_buf_copy() instead. - */ -void * -abd_borrow_buf(abd_t *abd, size_t n) -{ - void *buf; - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - buf = abd_to_buf(abd); - } else { - buf = zio_buf_alloc(n); - } -#ifdef ZFS_DEBUG - (void) zfs_refcount_add_many(&abd->abd_children, n, buf); -#endif - return (buf); -} - -void * -abd_borrow_buf_copy(abd_t *abd, size_t n) -{ - void *buf = abd_borrow_buf(abd, n); - if (!abd_is_linear(abd)) { - abd_copy_to_buf(buf, abd, n); - } - return (buf); -} - -/* - * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will - * not change the contents of the ABD and will ASSERT that you didn't modify - * the buffer since it was borrowed. If you want any changes you made to buf to - * be copied back to abd, use abd_return_buf_copy() instead. 
- */ -void -abd_return_buf(abd_t *abd, void *buf, size_t n) -{ - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); -#ifdef ZFS_DEBUG - (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); -#endif - if (abd_is_linear(abd)) { - ASSERT3P(buf, ==, abd_to_buf(abd)); - } else { - ASSERT0(abd_cmp_buf(abd, buf, n)); - zio_buf_free(buf, n); - } -} - -void -abd_return_buf_copy(abd_t *abd, void *buf, size_t n) -{ - if (!abd_is_linear(abd)) { - abd_copy_from_buf(abd, buf, n); - } - abd_return_buf(abd, buf, n); -} - void abd_release_ownership_of_buf(abd_t *abd) { diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 714a30e863..b5bcd367b2 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5961,7 +5961,7 @@ top: ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); - zfs_racct_read(size, 1); + zfs_racct_read(spa, size, 1, 0); } /* Check if the spa even has l2 configured */ diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c index 914260e742..27a04c2af0 100644 --- a/module/zfs/dataset_kstats.c +++ b/module/zfs/dataset_kstats.c @@ -217,8 +217,7 @@ dataset_kstats_rename(dataset_kstats_t *dk, const char *name) } void -dataset_kstats_update_write_kstats(dataset_kstats_t *dk, - int64_t nwritten) +dataset_kstats_update_write_kstats(dataset_kstats_t *dk, int64_t nwritten) { ASSERT3S(nwritten, >=, 0); @@ -230,8 +229,7 @@ dataset_kstats_update_write_kstats(dataset_kstats_t *dk, } void -dataset_kstats_update_read_kstats(dataset_kstats_t *dk, - int64_t nread) +dataset_kstats_update_read_kstats(dataset_kstats_t *dk, int64_t nread) { ASSERT3S(nread, >=, 0); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 099883ba26..cf66c38f0c 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -82,6 +82,13 @@ typedef struct dbuf_stats { */ kstat_named_t cache_levels[DN_MAX_LEVELS]; kstat_named_t cache_levels_bytes[DN_MAX_LEVELS]; + /* + * Statistics for Direct I/O. 
+ */ + kstat_named_t direct_mixed_io_read_wait; + kstat_named_t direct_mixed_io_write_wait; + kstat_named_t direct_sync_wait; + kstat_named_t direct_undirty; /* * Statistics about the dbuf hash table. */ @@ -130,6 +137,10 @@ dbuf_stats_t dbuf_stats = { { "cache_total_evicts", KSTAT_DATA_UINT64 }, { { "cache_levels_N", KSTAT_DATA_UINT64 } }, { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } }, + { "direct_mixed_io_read_wait", KSTAT_DATA_UINT64 }, + { "direct_mixed_io_write_wait", KSTAT_DATA_UINT64 }, + { "direct_sync_wait", KSTAT_DATA_UINT64 }, + { "direct_undirty", KSTAT_DATA_UINT64 }, { "hash_hits", KSTAT_DATA_UINT64 }, { "hash_misses", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, @@ -151,6 +162,10 @@ struct { wmsum_t cache_total_evicts; wmsum_t cache_levels[DN_MAX_LEVELS]; wmsum_t cache_levels_bytes[DN_MAX_LEVELS]; + wmsum_t direct_mixed_io_read_wait; + wmsum_t direct_mixed_io_write_wait; + wmsum_t direct_sync_wait; + wmsum_t direct_undirty; wmsum_t hash_hits; wmsum_t hash_misses; wmsum_t hash_collisions; @@ -628,7 +643,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db) * L2ARC. */ boolean_t -dbuf_is_l2cacheable(dmu_buf_impl_t *db) +dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *bp) { if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL || (db->db_objset->os_secondary_cache == @@ -636,10 +651,17 @@ dbuf_is_l2cacheable(dmu_buf_impl_t *db) if (l2arc_exclude_special == 0) return (B_TRUE); - blkptr_t *bp = db->db_blkptr; - if (bp == NULL || BP_IS_HOLE(bp)) + /* + * bp must be checked in the event it was passed from + * dbuf_read_impl() as the result of the BP being set from + * a Direct I/O write in dbuf_read(). See comments in + * dbuf_read(). + */ + blkptr_t *db_bp = bp == NULL ? 
db->db_blkptr : bp; + + if (db_bp == NULL || BP_IS_HOLE(db_bp)) return (B_FALSE); - uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); + uint64_t vdev = DVA_GET_VDEV(db_bp->blk_dva); vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev; vdev_t *vd = NULL; @@ -889,6 +911,14 @@ dbuf_kstat_update(kstat_t *ksp, int rw) ds->cache_levels_bytes[i].value.ui64 = wmsum_value(&dbuf_sums.cache_levels_bytes[i]); } + ds->direct_mixed_io_read_wait.value.ui64 = + wmsum_value(&dbuf_sums.direct_mixed_io_read_wait); + ds->direct_mixed_io_write_wait.value.ui64 = + wmsum_value(&dbuf_sums.direct_mixed_io_write_wait); + ds->direct_sync_wait.value.ui64 = + wmsum_value(&dbuf_sums.direct_sync_wait); + ds->direct_undirty.value.ui64 = + wmsum_value(&dbuf_sums.direct_undirty); ds->hash_hits.value.ui64 = wmsum_value(&dbuf_sums.hash_hits); ds->hash_misses.value.ui64 = @@ -991,6 +1021,10 @@ dbuf_init(void) wmsum_init(&dbuf_sums.cache_levels[i], 0); wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0); } + wmsum_init(&dbuf_sums.direct_mixed_io_read_wait, 0); + wmsum_init(&dbuf_sums.direct_mixed_io_write_wait, 0); + wmsum_init(&dbuf_sums.direct_sync_wait, 0); + wmsum_init(&dbuf_sums.direct_undirty, 0); wmsum_init(&dbuf_sums.hash_hits, 0); wmsum_init(&dbuf_sums.hash_misses, 0); wmsum_init(&dbuf_sums.hash_collisions, 0); @@ -1063,6 +1097,10 @@ dbuf_fini(void) wmsum_fini(&dbuf_sums.cache_levels[i]); wmsum_fini(&dbuf_sums.cache_levels_bytes[i]); } + wmsum_fini(&dbuf_sums.direct_mixed_io_read_wait); + wmsum_fini(&dbuf_sums.direct_mixed_io_write_wait); + wmsum_fini(&dbuf_sums.direct_sync_wait); + wmsum_fini(&dbuf_sums.direct_undirty); wmsum_fini(&dbuf_sums.hash_hits); wmsum_fini(&dbuf_sums.hash_misses); wmsum_fini(&dbuf_sums.hash_collisions); @@ -1233,8 +1271,9 @@ dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); - ASSERT3P(db->db_buf, ==, NULL); - db->db.db_data = NULL; + /* Direct I/O writes may have data */ + if (db->db_buf == NULL) + db->db.db_data = NULL; if (db->db_state 
!= DB_NOFILL) { db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "clear data"); @@ -1246,8 +1285,19 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(buf != NULL); + dbuf_dirty_record_t *dr_dio = NULL; db->db_buf = buf; + dr_dio = dbuf_get_dirty_direct(db); + + /* + * If there is a Direct I/O, set its data too. Then its state + * will be the same as if we did a ZIL dmu_sync(). + */ + if (dbuf_dirty_is_direct_write(db, dr_dio)) { + dr_dio->dt.dl.dr_data = db->db_buf; + } + ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; } @@ -1380,6 +1430,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, mutex_enter(&db->db_mtx); ASSERT3U(db->db_state, ==, DB_READ); + /* * All reads are synchronous, so we must have a hold on the dbuf */ @@ -1570,12 +1621,12 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) */ static int dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, - db_lock_type_t dblt, const void *tag) + db_lock_type_t dblt, blkptr_t *bp, const void *tag) { zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; - blkptr_t bp, *bpp = NULL; + blkptr_t *bpp = bp; ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -1604,16 +1655,10 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, err = EIO; goto early_unlock; } - bp = dr->dt.dl.dr_overridden_by; - bpp = &bp; + bpp = &dr->dt.dl.dr_overridden_by; } } - if (bpp == NULL && db->db_blkptr != NULL) { - bp = *db->db_blkptr; - bpp = &bp; - } - err = dbuf_read_hole(db, dn, bpp); if (err == 0) goto early_unlock; @@ -1653,7 +1698,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, if (!DBUF_IS_CACHEABLE(db)) aflags |= ARC_FLAG_UNCACHED; - else if (dbuf_is_l2cacheable(db)) + else if (dbuf_is_l2cacheable(db, bpp)) aflags |= ARC_FLAG_L2CACHE; dbuf_add_ref(db, NULL); @@ -1661,17 +1706,19 @@ 
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, zio_flags = (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED; - if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) + if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bpp)) zio_flags |= ZIO_FLAG_RAW; + /* - * The zio layer will copy the provided blkptr later, but we have our - * own copy so that we can release the parent's rwlock. We have to - * do that so that if dbuf_read_done is called synchronously (on + * The zio layer will copy the provided blkptr later, but we need to + * do this now so that we can release the parent's rwlock. We have to + * do that now so that if dbuf_read_done is called synchronously (on * an l1 cache hit) we don't acquire the db_mtx while holding the * parent's rwlock, which would be a lock ordering violation. */ + blkptr_t copy = *bpp; dmu_buf_unlock_parent(db, dblt, tag); - return (arc_read(zio, db->db_objset->os_spa, bpp, + return (arc_read(zio, db->db_objset->os_spa, ©, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb)); @@ -1841,16 +1888,24 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) } mutex_exit(&db->db_mtx); } else { + blkptr_t *bp = NULL; ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + + /* + * If a Direct I/O write has occurred we will use the updated + * block pointer. + */ + bp = dmu_buf_get_bp_from_dbuf(db); + if (pio == NULL && (db->db_state == DB_NOFILL || - (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) { + (bp != NULL && !BP_IS_HOLE(bp)))) { spa_t *spa = dn->dn_objset->os_spa; pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } - err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG); + err = dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG); /* dbuf_read_impl drops db_mtx and parent's rwlock. 
*/ miss = (db->db_state != DB_CACHED); } @@ -1918,6 +1973,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) uint64_t txg = dr->dr_txg; ASSERT(MUTEX_HELD(&db->db_mtx)); + /* * This assert is valid because dmu_sync() expects to be called by * a zilog's get_data while holding a range lock. This call only @@ -1946,6 +2002,9 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) dr->dt.dl.dr_has_raw_params = B_FALSE; /* + * In the event that Direct I/O was used, we do not + * need to release the buffer from the ARC. + * * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are * modifying the buffer, so they will immediately do @@ -2084,6 +2143,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) */ dmu_buf_will_dirty(&db->db, tx); + VERIFY3P(db->db_buf, !=, NULL); + /* create the data buffer for the new block */ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); @@ -2144,11 +2205,26 @@ dbuf_redirty(dbuf_dirty_record_t *dr) */ dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT && - db->db_state != DB_NOFILL) { - /* Already released on initial dirty, so just thaw. */ + db->db_state != DB_NOFILL && db->db_buf != NULL) { + /* + * Already released on initial dirty, + * so just thaw. + */ ASSERT(arc_released(db->db_buf)); arc_buf_thaw(db->db_buf); } + /* + * If initial dirty was via Direct I/O, may not have a dr_data. + * + * If the dirty record was associated with cloned block then + * the call above to dbuf_unoverride() will have reset + * dr->dt.dl.dr_data and it will not be NULL here. + */ + if (dr->dt.dl.dr_data == NULL) { + ASSERT3B(dbuf_dirty_is_direct_write(db, dr), ==, + B_TRUE); + dr->dt.dl.dr_data = db->db_buf; + } } } @@ -2525,13 +2601,17 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr) /* * Undirty a buffer in the transaction group referenced by the given - * transaction. Return whether this evicted the dbuf. + * transaction. Return whether this evicted the dbuf. 
*/ boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - uint64_t txg = tx->tx_txg; + uint64_t txg; boolean_t brtwrite; + dbuf_dirty_record_t *dr; + + txg = tx->tx_txg; + dr = dbuf_find_dirty_eq(db, txg); ASSERT(txg != 0); @@ -2551,7 +2631,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * If this buffer is not dirty, we're done. */ - dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) return (B_FALSE); ASSERT(dr->dr_dbuf == db); @@ -2598,10 +2677,15 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (db->db_state != DB_NOFILL && !brtwrite) { dbuf_unoverride(dr); - ASSERT(db->db_buf != NULL); - ASSERT(dr->dt.dl.dr_data != NULL); - if (dr->dt.dl.dr_data != db->db_buf) + /* + * In the Direct I/O case, the buffer is still dirty, but it + * may be UNCACHED, so we do not need to destroy an ARC buffer. + */ + if (dr->dt.dl.dr_data && dr->dt.dl.dr_data != db->db_buf) { + ASSERT(db->db_buf != NULL); + ASSERT(dr->dt.dl.dr_data != NULL); arc_buf_destroy(dr->dt.dl.dr_data, db); + } } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -2610,8 +2694,12 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_dirtycnt -= 1; if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { + /* + * In the Direct I/O case our db_buf will be NULL as we are not + * caching in the ARC. + */ ASSERT(db->db_state == DB_NOFILL || brtwrite || - arc_released(db->db_buf)); + db->db_buf == NULL || arc_released(db->db_buf)); dbuf_destroy(db); return (B_TRUE); } @@ -2701,6 +2789,167 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) return (dr != NULL); } +void +dmu_buf_direct_mixed_io_wait(dmu_buf_impl_t *db, uint64_t txg, boolean_t read) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (read == B_TRUE) { + /* + * If a buffered read is in process, a Direct I/O read will + * wait for the buffered I/O to complete. 
+ */ + ASSERT3U(txg, ==, 0); + while (db->db_state == DB_READ) { + DBUF_STAT_BUMP(direct_mixed_io_read_wait); + cv_wait(&db->db_changed, &db->db_mtx); + } + } else { + /* + * There must be an ARC buf associated with this Direct I/O + * write otherwise there is no reason to wait for previous + * dirty records to sync out. + * + * The db_state will temporarily be set to DB_CACHED so that + * any synchronous writes issued through the ZIL will + * still be handled properly. In particular, the call to + * dbuf_read() in dmu_sync_late_arrival() must account for the + * data still being in the ARC. After waiting here for previous + * TXGs to sync out, dmu_write_direct_done() will update the + * db_state. + */ + ASSERT3P(db->db_buf, !=, NULL); + ASSERT3U(txg, >, 0); + db->db_mixed_io_dio_wait = TRUE; + db->db_state = DB_CACHED; + while (dbuf_find_dirty_lte(db, txg) != NULL) { + DBUF_STAT_BUMP(direct_mixed_io_write_wait); + cv_wait(&db->db_changed, &db->db_mtx); + } + db->db_mixed_io_dio_wait = FALSE; + } +} + +/* + * Direct I/O writes may need to undirty the open-context dirty record + * associated with it in the event of an I/O error. + */ +void +dmu_buf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + /* + * Direct I/O writes always happen in open-context. + */ + ASSERT(!dmu_tx_is_syncing(tx)); + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_state == DB_NOFILL || db->db_state == DB_UNCACHED); + + + /* + * In the event of an I/O error we will handle the metaslab clean up in + * zio_done(). Also, the dirty record's dr_overridden_by BP is not + * currently set as that is done in dmu_sync_done(). Since the db_state + * is still set to DB_NOFILL, dbuf_unoverride() will not be called in + * dbuf_undirty() and the dirty record's BP will not be added to the SPA's + * spa_free_bplist via zio_free(). + * + * This function can also be called in the event that a Direct I/O + * write is overwriting a previous Direct I/O to the same block for + * this TXG. 
 It is important to go ahead and free up the space + * accounting in this case through dbuf_undirty() -> dbuf_unoverride() + * -> zio_free(). This is necessary because the space accounting for + * determining if a write can occur in zfs_write() happens through + * dmu_tx_assign(). This can cause an issue with Direct I/O writes in + * the case of overwrites, because all DVA allocations are being done + * in open-context. Constantly allowing Direct I/O overwrites to the + * same blocks can exhaust the pool's available space leading to ENOSPC + * errors at the DVA allocation part of the ZIO pipeline, which will + * eventually suspend the pool. By cleaning up space accounting now + * the ENOSPC pool suspend can be avoided. + * + * Since we are undirtying the record for the Direct I/O in + * open-context we must have a hold on the db, so it should never be + * evicted after calling dbuf_undirty(). + */ + VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE); + + DBUF_STAT_BUMP(direct_undirty); +} + +/* + * Normally the db_blkptr points to the most recent on-disk content for the + * dbuf (and anything newer will be cached in the dbuf). However, a recent + * Direct I/O write could leave newer content on disk and the dbuf uncached. + * In this case we must return the (as yet unsynced) pointer to the latest + * on-disk content. 
+ */ +blkptr_t * +dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_level != 0) + return (db->db_blkptr); + + blkptr_t *bp = db->db_blkptr; + + dbuf_dirty_record_t *dr_dio = dbuf_get_dirty_direct(db); + if (dr_dio && dr_dio->dt.dl.dr_override_state == DR_OVERRIDDEN && + dr_dio->dt.dl.dr_data == NULL) { + ASSERT(db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); + /* We have a Direct I/O write or cloned block, use its BP */ + bp = &dr_dio->dt.dl.dr_overridden_by; + } + + return (bp); +} + +/* + * Direct I/O reads can read directly from the ARC, but the data has + * to be untransformed in order to copy it over into user pages. 
+ */ + if (err == 0 && db->db_buf != NULL && + (arc_is_encrypted(db->db_buf) || + arc_is_unauthenticated(db->db_buf) || + arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { + zbookmark_phys_t zb; + + SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), + db->db.db_object, db->db_level, db->db_blkid); + dbuf_fix_old_data(db, spa_syncing_txg(spa)); + err = arc_untransform(db->db_buf, spa, &zb, B_FALSE); + dbuf_set_data(db, db->db_buf); + } + DB_DNODE_EXIT(db); + DBUF_STAT_BUMP(hash_hits); + + return (err); +} + void dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) { @@ -3283,6 +3532,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_user_immediate_evict = FALSE; db->db_freed_in_flight = FALSE; db->db_pending_evict = FALSE; + db->db_mixed_io_dio_wait = FALSE; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); @@ -4080,7 +4330,6 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting) } else { mutex_exit(&db->db_mtx); } - } #pragma weak dmu_buf_refcount = dbuf_refcount @@ -4539,13 +4788,32 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); + /* - * To be synced, we must be dirtied. But we - * might have been freed after the dirty. + * It is possible a buffered read has come in after a Direct I/O + * write and is currently transitioning the db_state from DB_READ + * in dbuf_read_impl() to another state in dbuf_read_done(). We + * have to wait in order for the dbuf state to change from DB_READ + * before syncing the dirty record of the Direct I/O write. 
+ */ + if (db->db_state == DB_READ && !dr->dt.dl.dr_brtwrite) { + ASSERT3P(*datap, ==, NULL); + ASSERT3P(db->db_buf, ==, NULL); + ASSERT3P(db->db.db_data, ==, NULL); + ASSERT3U(dr->dt.dl.dr_override_state, ==, DR_OVERRIDDEN); + while (db->db_state == DB_READ) { + DBUF_STAT_BUMP(direct_sync_wait); + cv_wait(&db->db_changed, &db->db_mtx); + } + } + + /* + * To be synced, we must be dirtied. But we might have been freed + * after the dirty. */ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ - ASSERT(db->db.db_data == NULL); + ASSERT3P(db->db.db_data, ==, NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); @@ -4608,8 +4876,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dbuf_check_blkptr(dn, db); /* - * If this buffer is in the middle of an immediate write, - * wait for the synchronous IO to complete. + * If this buffer is in the middle of an immediate write, wait for the + * synchronous IO to complete. + * + * This is also valid even with Direct I/O writes setting a dirty + * records override state into DR_IN_DMU_SYNC, because all + * Direct I/O writes happen in open-context. 
*/ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); @@ -4913,8 +5185,11 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); + + /* no dr_data if this is a NO_FILL or Direct I/O */ if (dr->dt.dl.dr_data != NULL && dr->dt.dl.dr_data != db->db_buf) { + ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE); arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { @@ -4976,7 +5251,9 @@ dbuf_write_override_done(zio_t *zio) if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_IS_HOLE(obp)) dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); - arc_release(dr->dt.dl.dr_data, db); + + if (dr->dt.dl.dr_data && dr->dt.dl.dr_data != db->db_buf) + arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); @@ -5180,10 +5457,17 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * The BP for this block has been provided by open context - * (by dmu_sync() or dmu_buf_write_embedded()). + * (by dmu_sync(), dmu_write_direct(), + * or dmu_buf_write_embedded()). */ - abd_t *contents = (data != NULL) ? 
- abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; + blkptr_t *bp = &dr->dt.dl.dr_overridden_by; + abd_t *contents = NULL; + if (data) { + ASSERT(BP_IS_HOLE(bp) || + arc_buf_lsize(data) == BP_GET_LSIZE(bp)); + contents = abd_get_from_buf(data->b_data, + arc_buf_size(data)); + } dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, @@ -5192,9 +5476,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, - dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite, - dr->dt.dl.dr_brtwrite); + zio_write_override(dr->dr_zio, bp, dr->dt.dl.dr_copies, + dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite); mutex_exit(&db->db_mtx); } else if (data == NULL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || @@ -5219,7 +5502,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dr_zio = arc_write(pio, os->os_spa, txg, &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db), - dbuf_is_l2cacheable(db), &zp, dbuf_write_ready, + dbuf_is_l2cacheable(db, NULL), &zp, dbuf_write_ready, children_ready_cb, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index b3eda8ea50..ba47be9c9e 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -609,8 +609,16 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, dbp[i] = &db->db; } - if (!read) - zfs_racct_write(length, nblks); + /* + * If we are doing O_DIRECT we still hold the dbufs, even for reads, + * but we do not issue any reads here. We do not want to account for + * writes in this case. + * + * O_DIRECT write/read accounting takes place in + * dmu_{write/read}_abd(). 
+ */ + if (!read && ((flags & DMU_DIRECTIO) == 0)) + zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags); if (zs) dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE); @@ -897,7 +905,7 @@ dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri) /* * Get the next "chunk" of file data to free. We traverse the file from - * the end so that the file gets shorter over time (if we crashes in the + * the end so that the file gets shorter over time (if we crash in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * @@ -1168,7 +1176,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, /* * Deal with odd block sizes, where there can't be data past the first - * block. If we ever do the tail block optimization, we will need to + * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { @@ -1178,6 +1186,18 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, size = newsz; } + if (size == 0) + return (0); + + /* Allow Direct I/O when requested and properly aligned */ + if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned(buf) && + zfs_dio_aligned(offset, size, SPA_MINBLOCKSIZE)) { + abd_t *data = abd_get_from_buf(buf, size); + err = dmu_read_abd(dn, offset, size, data, flags); + abd_free(data); + return (err); + } + while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; @@ -1286,22 +1306,41 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, } /* - * Note: Lustre is an external consumer of this interface. + * This interface is not used internally by ZFS but is provided for + * use by Lustre which is built on the DMU interfaces. 
*/ -void -dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) +int +dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx, uint32_t flags) { dmu_buf_t **dbp; int numbufs; + int error; if (size == 0) - return; + return (0); + + /* Allow Direct I/O when requested and properly aligned */ + if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) && + zfs_dio_aligned(offset, size, dn->dn_datablksz)) { + abd_t *data = abd_get_from_buf((void *)buf, size); + error = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx); + abd_free(data); + return (error); + } VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); + return (0); +} + +int +dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + return (dmu_write_by_dnode_flags(dn, offset, size, buf, tx, 0)); } void @@ -1365,6 +1404,9 @@ dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size) dmu_buf_t **dbp; int numbufs, i, err; + if (uio->uio_extflg & UIO_DIRECT) + return (dmu_read_uio_direct(dn, uio, size)); + /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. @@ -1453,23 +1495,52 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) dmu_buf_t **dbp; int numbufs; int err = 0; - int i; + uint64_t write_size; - err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, +top: + write_size = size; + + /* + * We only allow Direct I/O writes to happen if we are block + * sized aligned. Otherwise, we pass the write off to the ARC. 
+ */ + if ((uio->uio_extflg & UIO_DIRECT) && + (write_size >= dn->dn_datablksz)) { + if (zfs_dio_aligned(zfs_uio_offset(uio), write_size, + dn->dn_datablksz)) { + return (dmu_write_uio_direct(dn, uio, size, tx)); + } else if (write_size > dn->dn_datablksz && + zfs_dio_offset_aligned(zfs_uio_offset(uio), + dn->dn_datablksz)) { + err = dmu_write_uio_direct(dn, uio, dn->dn_datablksz, + tx); + if (err == 0) { + size -= dn->dn_datablksz; + goto top; + } else { + return (err); + } + } else { + write_size = + P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz); + } + } + + err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); - for (i = 0; i < numbufs; i++) { + for (int i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; - ASSERT(size > 0); + ASSERT(write_size > 0); offset_t off = zfs_uio_offset(uio); bufoff = off - db->db_offset; - tocpy = MIN(db->db_size - bufoff, size); + tocpy = MIN(db->db_size - bufoff, write_size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); @@ -1489,10 +1560,18 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) if (err) break; + write_size -= tocpy; size -= tocpy; } + IMPLY(err == 0, write_size == 0); + dmu_buf_rele_array(dbp, numbufs, FTAG); + + if ((uio->uio_extflg & UIO_DIRECT) && size > 0) { + goto top; + } + return (err); } @@ -1731,7 +1810,7 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, * same size as the dbuf. 
*/ if (offset == db->db.db_offset && blksz == db->db.db_size) { - zfs_racct_write(blksz, 1); + zfs_racct_write(os->os_spa, blksz, 1, 0); dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { @@ -1761,23 +1840,22 @@ dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, return (err); } -typedef struct { - dbuf_dirty_record_t *dsa_dr; - dmu_sync_cb_t *dsa_done; - zgd_t *dsa_zgd; - dmu_tx_t *dsa_tx; -} dmu_sync_arg_t; - -static void +void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { (void) buf; dmu_sync_arg_t *dsa = varg; - dmu_buf_t *db = dsa->dsa_zgd->zgd_db; - blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { + dbuf_dirty_record_t *dr = dsa->dsa_dr; + blkptr_t *bp = zio->io_bp; + if (BP_IS_HOLE(bp)) { + dmu_buf_t *db = NULL; + if (dr) + db = &(dr->dr_dbuf->db); + else + db = dsa->dsa_zgd->zgd_db; /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. @@ -1796,7 +1874,7 @@ dmu_sync_late_arrival_ready(zio_t *zio) dmu_sync_ready(zio, NULL, zio->io_private); } -static void +void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { (void) buf; @@ -1809,7 +1887,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) * Record the vdev(s) backing this blkptr so they can be flushed after * the writes for the lwb have completed. 
*/ - if (zio->io_error == 0) { + if (zgd && zio->io_error == 0) { zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); } @@ -1848,10 +1926,12 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } + cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); - dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + if (dsa->dsa_done) + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } @@ -2120,9 +2200,10 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, - dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), - &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); + dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), + dbuf_is_l2cacheable(db, NULL), &zp, dmu_sync_ready, NULL, + dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, + &zb)); return (0); } @@ -2385,6 +2466,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) zp->zp_nopwrite = nopwrite; zp->zp_encrypt = encrypt; zp->zp_byteorder = ZFS_HOST_BYTEORDER; + zp->zp_direct_write = (wp & WP_DIRECT_WR) ? 
B_TRUE : B_FALSE; memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN); @@ -2817,8 +2899,15 @@ EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); EXPORT_SYMBOL(dmu_read); EXPORT_SYMBOL(dmu_read_by_dnode); +EXPORT_SYMBOL(dmu_read_uio); +EXPORT_SYMBOL(dmu_read_uio_dbuf); +EXPORT_SYMBOL(dmu_read_uio_dnode); EXPORT_SYMBOL(dmu_write); EXPORT_SYMBOL(dmu_write_by_dnode); +EXPORT_SYMBOL(dmu_write_by_dnode_flags); +EXPORT_SYMBOL(dmu_write_uio); +EXPORT_SYMBOL(dmu_write_uio_dbuf); +EXPORT_SYMBOL(dmu_write_uio_dnode); EXPORT_SYMBOL(dmu_prealloc); EXPORT_SYMBOL(dmu_object_info); EXPORT_SYMBOL(dmu_object_info_from_dnode); diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c new file mode 100644 index 0000000000..0ff3e0e55e --- /dev/null +++ b/module/zfs/dmu_direct.c @@ -0,0 +1,437 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + + +#include +#include +#include +#include +#include +#include +#include +#include + +static abd_t * +make_abd_for_dbuf(dmu_buf_impl_t *db, abd_t *data, uint64_t offset, + uint64_t size) +{ + size_t buf_size = db->db.db_size; + abd_t *pre_buf = NULL, *post_buf = NULL, *mbuf = NULL; + size_t buf_off = 0; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (offset > db->db.db_offset) { + size_t pre_size = offset - db->db.db_offset; + pre_buf = abd_alloc_for_io(pre_size, B_TRUE); + buf_size -= pre_size; + buf_off = 0; + } else { + buf_off = db->db.db_offset - offset; + size -= buf_off; + } + + if (size < buf_size) { + size_t post_size = buf_size - size; + post_buf = abd_alloc_for_io(post_size, B_TRUE); + buf_size -= post_size; + } + + ASSERT3U(buf_size, >, 0); + abd_t *buf = abd_get_offset_size(data, buf_off, buf_size); + + if (pre_buf || post_buf) { + mbuf = abd_alloc_gang(); + if (pre_buf) + abd_gang_add(mbuf, pre_buf, B_TRUE); + abd_gang_add(mbuf, buf, B_TRUE); + if (post_buf) + abd_gang_add(mbuf, post_buf, B_TRUE); + } else { + mbuf = buf; + } + + return (mbuf); +} + +static void +dmu_read_abd_done(zio_t *zio) +{ + abd_free(zio->io_abd); +} + +static void +dmu_write_direct_ready(zio_t *zio) +{ + dmu_sync_ready(zio, NULL, zio->io_private); +} + +static void +dmu_write_direct_done(zio_t *zio) +{ + dmu_sync_arg_t *dsa = zio->io_private; + dbuf_dirty_record_t *dr = dsa->dsa_dr; + dmu_buf_impl_t *db = dr->dr_dbuf; + uint64_t txg = dsa->dsa_tx->tx_txg; + + abd_free(zio->io_abd); + mutex_enter(&db->db_mtx); + + if (zio->io_error == 0) { + /* + * After a successful Direct I/O write any stale contents in + * the ARC must be cleaned up in order to force all future + * reads down to the VDEVs. 
+		 *
+		 * If a previous write operation to this dbuf was buffered
+		 * (in the ARC) we have to wait for the previous dirty records
+		 * associated with this dbuf to be synced out if they are in
+		 * the quiesce or sync phase for their TXG. This is done to
+		 * guarantee we are not racing to destroy the ARC buf that
+		 * is associated with the dbuf between this done callback and
+		 * spa_sync(). Outside of using a heavy handed approach of
+		 * locking down the spa_syncing_txg while it is being updated,
+		 * there is no way to synchronize when a dirty record's TXG
+		 * has moved over to the sync phase.
+		 *
+		 * In order to make sure all TXG's are consistent we must
+		 * do this stall if there is an associated ARC buf with this
+		 * dbuf. It is because of this that a user should not really
+		 * be mixing buffered and Direct I/O writes. If they choose to
+		 * do so, there is an associated performance penalty for that
+		 * as we will not give up consistency with a TXG over
+		 * performance.
+		 */
+		if (db->db_buf) {
+			dmu_buf_direct_mixed_io_wait(db, txg - 1, B_FALSE);
+			ASSERT3P(db->db_buf, ==, dr->dt.dl.dr_data);
+			arc_buf_destroy(db->db_buf, db);
+			db->db_buf = NULL;
+			dr->dt.dl.dr_data = NULL;
+			db->db.db_data = NULL;
+			ASSERT3U(db->db_dirtycnt, ==, 1);
+		}
+
+		/*
+		 * The current contents of the dbuf are now stale.
+		 */
+		ASSERT3P(dr->dt.dl.dr_data, ==, NULL);
+		ASSERT3P(db->db.db_data, ==, NULL);
+		db->db_state = DB_UNCACHED;
+	} else {
+		if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
+			ASSERT3U(zio->io_error, ==, EAGAIN);
+
+		/*
+		 * If there is a valid ARC buffer associated with this dirty
+		 * record we will stall just like on a successful Direct I/O
+		 * write to make sure all TXG's are consistent. See comment
+		 * above.
+		 */
+		if (db->db_buf) {
+			ASSERT3P(db->db_buf, ==, dr->dt.dl.dr_data);
+			dmu_buf_direct_mixed_io_wait(db, txg - 1, B_FALSE);
+			dmu_buf_undirty(db, dsa->dsa_tx);
+			db->db_state = DB_CACHED;
+		} else {
+			ASSERT3P(dr->dt.dl.dr_data, ==, NULL);
+			dmu_buf_undirty(db, dsa->dsa_tx);
+			db->db_state = DB_UNCACHED;
+		}
+
+		ASSERT0(db->db_dirtycnt);
+	}
+
+	mutex_exit(&db->db_mtx);
+	dmu_sync_done(zio, NULL, zio->io_private);
+	kmem_free(zio->io_bp, sizeof (blkptr_t));
+	zio->io_bp = NULL;
+}
+
+int
+dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
+{
+	objset_t *os = db->db_objset;
+	dsl_dataset_t *ds = dmu_objset_ds(os);
+	zbookmark_phys_t zb;
+	dbuf_dirty_record_t *dr_head;
+
+	SET_BOOKMARK(&zb, ds->ds_object,
+	    db->db.db_object, db->db_level, db->db_blkid);
+
+	DB_DNODE_ENTER(db);
+	zio_prop_t zp;
+	dmu_write_policy(os, DB_DNODE(db), db->db_level,
+	    WP_DMU_SYNC | WP_DIRECT_WR, &zp);
+	DB_DNODE_EXIT(db);
+
+	/*
+	 * If we are going to overwrite a previous Direct I/O write that is
+	 * part of the current TXG, then we can go ahead and undirty it now.
+	 * Part of it being undirtied will be allowing for previously
+	 * allocated space in the dr_overridden_bp BP's DVAs to be freed.
+	 * This avoids ENOSPC errors from possibly occurring when trying to
+	 * allocate new metaslabs in open-context for Direct I/O writes.
+	 */
+	mutex_enter(&db->db_mtx);
+	dr_head = dbuf_find_dirty_eq(db, dmu_tx_get_txg(tx));
+	if (dbuf_dirty_is_direct_write(db, dr_head)) {
+		dmu_buf_undirty(db, tx);
+	}
+	mutex_exit(&db->db_mtx);
+
+	/*
+	 * Dirty this dbuf with DB_NOFILL since we will not have any data
+	 * associated with the dbuf.
+ */ + dmu_buf_will_not_fill(&db->db, tx); + + mutex_enter(&db->db_mtx); + + uint64_t txg = dmu_tx_get_txg(tx); + ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa)); + ASSERT3U(txg, >, spa_syncing_txg(os->os_spa)); + + dr_head = dbuf_get_dirty_direct(db); + ASSERT3U(dr_head->dr_txg, ==, txg); + dr_head->dr_accounted = db->db.db_size; + + blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); + if (db->db_blkptr != NULL) { + /* + * Fill in bp with the current block pointer so that + * the nopwrite code can check if we're writing the same + * data that's already on disk. + */ + *bp = *db->db_blkptr; + } else { + memset(bp, 0, sizeof (blkptr_t)); + } + + /* + * Disable nopwrite if the current block pointer could change + * before this TXG syncs. + */ + if (list_next(&db->db_dirty_records, dr_head) != NULL) + zp.zp_nopwrite = B_FALSE; + + ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN); + dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC; + + mutex_exit(&db->db_mtx); + + dmu_objset_willuse_space(os, dr_head->dr_accounted, tx); + + dmu_sync_arg_t *dsa = kmem_zalloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = dr_head; + dsa->dsa_tx = tx; + + zio_t *zio = zio_write(pio, os->os_spa, txg, bp, data, + db->db.db_size, db->db.db_size, &zp, + dmu_write_direct_ready, NULL, dmu_write_direct_done, dsa, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb); + + if (pio == NULL) + return (zio_wait(zio)); + + zio_nowait(zio); + + return (0); +} + +int +dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size, + abd_t *data, uint32_t flags, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + spa_t *spa = dn->dn_objset->os_spa; + int numbufs, err; + + ASSERT(flags & DMU_DIRECTIO); + + err = dmu_buf_hold_array_by_dnode(dn, offset, + size, B_FALSE, FTAG, &numbufs, &dbp, flags); + if (err) + return (err); + + zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + for (int i = 0; i < numbufs && err == 0; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + + abd_t *abd 
= abd_get_offset_size(data,
+		    db->db.db_offset - offset, dn->dn_datablksz);
+
+		zfs_racct_write(spa, db->db.db_size, 1, flags);
+		err = dmu_write_direct(pio, db, abd, tx);
+		ASSERT0(err);
+	}
+
+	err = zio_wait(pio);
+
+	/*
+	 * The dbuf must be held until the Direct I/O write has completed in
+	 * the event there were any errors and dmu_buf_undirty() was called.
+	 */
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+	return (err);
+}
+
+int
+dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
+    abd_t *data, uint32_t flags)
+{
+	objset_t *os = dn->dn_objset;
+	spa_t *spa = os->os_spa;
+	dmu_buf_t **dbp;
+	int numbufs, err;
+
+	ASSERT(flags & DMU_DIRECTIO);
+
+	err = dmu_buf_hold_array_by_dnode(dn, offset,
+	    size, B_FALSE, FTAG, &numbufs, &dbp, flags);
+	if (err)
+		return (err);
+
+	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+
+	for (int i = 0; i < numbufs; i++) {
+		dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+		abd_t *mbuf;
+		zbookmark_phys_t zb;
+
+		mutex_enter(&db->db_mtx);
+
+		SET_BOOKMARK(&zb, dmu_objset_ds(os)->ds_object,
+		    db->db.db_object, db->db_level, db->db_blkid);
+
+		/*
+		 * If there is another buffered read for this dbuf, we will
+		 * wait for that to complete first.
+		 */
+		dmu_buf_direct_mixed_io_wait(db, 0, B_TRUE);
+
+		blkptr_t *bp = dmu_buf_get_bp_from_dbuf(db);
+
+		/*
+		 * There is no need to read if this is a hole or the data is
+		 * cached. This will not be considered a direct read for IO
+		 * accounting in the same way that an ARC hit is not counted.
+		 */
+		if (bp == NULL || BP_IS_HOLE(bp) || db->db_state == DB_CACHED) {
+			size_t aoff = offset < db->db.db_offset ?
+			    db->db.db_offset - offset : 0;
+			size_t boff = offset > db->db.db_offset ?
+			    offset - db->db.db_offset : 0;
+			size_t len = MIN(size - aoff, db->db.db_size - boff);
+
+			if (db->db_state == DB_CACHED) {
+				/*
+				 * We need to untransform the ARC buf data
+				 * before we copy it over.
+ */ + err = dmu_buf_untransform_direct(db, spa); + ASSERT0(err); + abd_copy_from_buf_off(data, + (char *)db->db.db_data + boff, aoff, len); + } else { + abd_zero_off(data, aoff, len); + } + + mutex_exit(&db->db_mtx); + continue; + } + + mbuf = make_abd_for_dbuf(db, data, offset, size); + ASSERT3P(mbuf, !=, NULL); + + /* + * The dbuf mutex (db_mtx) must be held when creating the ZIO + * for the read. The BP returned from + * dmu_buf_get_bp_from_dbuf() could be from a previous Direct + * I/O write that is in the dbuf's dirty record. When + * zio_read() is called, zio_create() will make a copy of the + * BP. However, if zio_read() is called without the mutex + * being held then the dirty record from the dbuf could be + * freed in dbuf_write_done() resulting in garbage being set + * for the zio BP. + */ + zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size, + dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL, &zb); + mutex_exit(&db->db_mtx); + + zfs_racct_read(spa, db->db.db_size, 1, flags); + zio_nowait(cio); + } + + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (zio_wait(rio)); +} + +#ifdef _KERNEL +int +dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size) +{ + offset_t offset = zfs_uio_offset(uio); + offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT; + int err; + + ASSERT(uio->uio_extflg & UIO_DIRECT); + ASSERT3U(page_index, <, uio->uio_dio.npages); + + abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index], + offset & (PAGESIZE - 1), size); + err = dmu_read_abd(dn, offset, size, data, DMU_DIRECTIO); + abd_free(data); + + if (err == 0) + zfs_uioskip(uio, size); + + return (err); +} + +int +dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) +{ + offset_t offset = zfs_uio_offset(uio); + offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT; + int err; + + ASSERT(uio->uio_extflg & UIO_DIRECT); + ASSERT3U(page_index, <, uio->uio_dio.npages); + + abd_t *data = 
abd_alloc_from_pages(&uio->uio_dio.pages[page_index], + offset & (PAGESIZE - 1), size); + err = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx); + abd_free(data); + + if (err == 0) + zfs_uioskip(uio, size); + + return (err); +} +#endif /* _KERNEL */ + +EXPORT_SYMBOL(dmu_read_uio_direct); +EXPORT_SYMBOL(dmu_write_uio_direct); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 8f4fefa4f4..f030fba226 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -350,6 +350,20 @@ smallblk_changed_cb(void *arg, uint64_t newval) os->os_zpl_special_smallblock = newval; } +static void +direct_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_DIRECT_DISABLED || newval == ZFS_DIRECT_STANDARD || + newval == ZFS_DIRECT_ALWAYS); + + os->os_direct = newval; +} + static void logbias_changed_cb(void *arg, uint64_t newval) { @@ -633,6 +647,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ZFS_PROP_SPECIAL_SMALL_BLOCKS), smallblk_changed_cb, os); } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DIRECT), + direct_changed_cb, os); + } } if (err != 0) { arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c index 17ed2a620b..45a2f06263 100644 --- a/module/zfs/spa_stats.c +++ b/module/zfs/spa_stats.c @@ -895,6 +895,14 @@ static const spa_iostats_t spa_iostats_template = { { "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 }, { "simple_trim_extents_failed", KSTAT_DATA_UINT64 }, { "simple_trim_bytes_failed", KSTAT_DATA_UINT64 }, + { "arc_read_count", KSTAT_DATA_UINT64 }, + { "arc_read_bytes", KSTAT_DATA_UINT64 }, + { "arc_write_count", KSTAT_DATA_UINT64 }, + { "arc_write_bytes", KSTAT_DATA_UINT64 }, + { "direct_read_count", KSTAT_DATA_UINT64 }, + { "direct_read_bytes", KSTAT_DATA_UINT64 }, + { "direct_write_count", KSTAT_DATA_UINT64 }, + 
{ "direct_write_bytes",	KSTAT_DATA_UINT64 },
 };
 
 #define	SPA_IOSTATS_ADD(stat, val) \
@@ -938,6 +946,44 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type,
 	}
 }
 
+void
+spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+	kstat_t *ksp = shk->kstat;
+
+	if (ksp == NULL)
+		return;
+
+	spa_iostats_t *iostats = ksp->ks_data;
+	if (flags & DMU_DIRECTIO) {
+		SPA_IOSTATS_ADD(direct_read_count, iops);
+		SPA_IOSTATS_ADD(direct_read_bytes, size);
+	} else {
+		SPA_IOSTATS_ADD(arc_read_count, iops);
+		SPA_IOSTATS_ADD(arc_read_bytes, size);
+	}
+}
+
+void
+spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+	kstat_t *ksp = shk->kstat;
+
+	if (ksp == NULL)
+		return;
+
+	spa_iostats_t *iostats = ksp->ks_data;
+	if (flags & DMU_DIRECTIO) {
+		SPA_IOSTATS_ADD(direct_write_count, iops);
+		SPA_IOSTATS_ADD(direct_write_bytes, size);
+	} else {
+		SPA_IOSTATS_ADD(arc_write_count, iops);
+		SPA_IOSTATS_ADD(arc_write_bytes, size);
+	}
+}
+
 static int
 spa_iostats_update(kstat_t *ksp, int rw)
 {
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 6ae0a14127..fa3eceb697 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -117,6 +117,11 @@ static unsigned int zfs_slow_io_events_per_second = 20;
  */
 static unsigned int zfs_deadman_events_per_second = 1;
 
+/*
+ * Rate limit direct write IO verify failures to this many per second.
+ */
+static unsigned int zfs_dio_write_verify_events_per_second = 20;
+
 /*
  * Rate limit checksum events after this many checksum errors per second.
  */
@@ -153,6 +158,17 @@ int zfs_nocacheflush = 0;
 uint_t zfs_vdev_max_auto_ashift = 14;
 uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
 
+/*
+ * VDEV checksum verification percentage for Direct I/O writes. This is
+ * necessary for Linux, because user pages can not be placed under write
+ * protection during Direct I/O writes.
+ */ +#if !defined(__FreeBSD__) +uint_t zfs_vdev_direct_write_verify_pct = 2; +#else +uint_t zfs_vdev_direct_write_verify_pct = 0; +#endif + void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) { @@ -673,6 +689,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 1); zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second, 1); + zfs_ratelimit_init(&vd->vdev_dio_verify_rl, + &zfs_dio_write_verify_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); @@ -1182,6 +1200,7 @@ vdev_free(vdev_t *vd) zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_deadman_rl); + zfs_ratelimit_fini(&vd->vdev_dio_verify_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) @@ -4475,6 +4494,7 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; + vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) @@ -6503,6 +6523,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW, "Rate limit hung IO (deadman) events to this many per second"); +ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW, + "Rate Direct I/O write verify events to this many per second"); + /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " @@ -6530,4 +6553,9 @@ ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); + +ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, direct_write_verify_pct, + param_set_direct_write_verify_pct, param_get_uint, ZMOD_RW, + "Percentage of Direct I/O 
writes per top-level VDEV for checksum " + "verification to be performed"); /* END CSTYLED */ diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 47346dd5ac..9d12bc2eb0 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -387,6 +387,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) /* IO delays */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios); + /* Direct I/O write verify errors */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS, + vs->vs_dio_verify_errors); + /* Add extended stats nvlist to main nvlist */ fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index f7cecc9af8..25b05abd36 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -595,6 +595,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_UINT64, vs->vs_checksum_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS, DATA_TYPE_UINT64, vs->vs_slow_ios, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS, + DATA_TYPE_UINT64, vs->vs_dio_verify_errors, NULL); } diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 399f5a0117..8d0aebbec1 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -607,7 +607,7 @@ static int64_t zfs_immediate_write_sz = 32768; void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, boolean_t commit, - zil_callback_t callback, void *callback_data) + boolean_t o_direct, zil_callback_t callback, void *callback_data) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); uint32_t blocksize = zp->z_blksz; @@ -622,7 +622,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, return; } - if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) + if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct) write_state = WR_INDIRECT; else if (!spa_has_slogs(zilog->zl_spa) && resid >= zfs_immediate_write_sz) diff --git a/module/zfs/zfs_vnops.c 
b/module/zfs/zfs_vnops.c
index f3db953eab..fb6f9475d3 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -202,6 +202,99 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
 	return (error);
 }
 
+zfs_direct_enabled_t
+zfs_check_direct_enabled(znode_t *zp, int ioflags, int *error)
+{
+	zfs_direct_enabled_t is_direct = ZFS_DIRECT_IO_DISABLED;
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+
+	if ((*error = zfs_enter(zfsvfs, FTAG)) != 0)
+		return (ZFS_DIRECT_IO_ERR);
+
+	if (ioflags & O_DIRECT &&
+	    zfsvfs->z_os->os_direct != ZFS_DIRECT_DISABLED) {
+		is_direct = ZFS_DIRECT_IO_ENABLED;
+	} else if (zfsvfs->z_os->os_direct == ZFS_DIRECT_ALWAYS) {
+		is_direct = ZFS_DIRECT_IO_ENABLED;
+	}
+
+	zfs_exit(zfsvfs, FTAG);
+
+	return (is_direct);
+}
+
+/*
+ * Determine if Direct I/O has been requested (either via the O_DIRECT flag or
+ * the "direct" dataset property). When inherited by the property only apply
+ * the O_DIRECT flag to correctly aligned IO requests. The rationale for this
+ * is it allows the property to be safely set on a dataset without forcing
+ * all of the applications to be aware of the alignment restrictions. When
+ * O_DIRECT is explicitly requested by an application return EINVAL if the
+ * request is unaligned. In all cases, if the range for this request has
+ * been mmap'ed then we will perform buffered I/O to keep the mapped region
+ * synchronized with the ARC.
+ *
+ * It is possible that a file's pages could be mmap'ed after it is checked
+ * here. If so, that is handled accordingly in zfs_read() and zfs_write().
See
+ * comments in the following two areas for how this is handled:
+ * zfs_read() -> mappedread()
+ * zfs_write() -> update_pages()
+ */
+int
+zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw,
+    int *ioflagp)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	objset_t *os = zfsvfs->z_os;
+	int ioflag = *ioflagp;
+	int error = 0;
+
+	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+		return (error);
+
+	if (os->os_direct == ZFS_DIRECT_DISABLED) {
+		error = EAGAIN;
+		goto out;
+
+	} else if (os->os_direct == ZFS_DIRECT_ALWAYS &&
+	    zfs_uio_page_aligned(uio) &&
+	    zfs_uio_aligned(uio, SPA_MINBLOCKSIZE)) {
+		if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) ||
+		    (rw == UIO_READ)) {
+			ioflag |= O_DIRECT;
+		}
+	}
+
+	if (ioflag & O_DIRECT) {
+		if (!zfs_uio_page_aligned(uio) ||
+		    !zfs_uio_aligned(uio, SPA_MINBLOCKSIZE)) {
+			error = SET_ERROR(EINVAL);
+			goto out;
+		}
+
+		if (zn_has_cached_data(zp, zfs_uio_offset(uio),
+		    zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
+			error = SET_ERROR(EAGAIN);
+			goto out;
+		}
+
+		error = zfs_uio_get_dio_pages_alloc(uio, rw);
+		if (error)
+			goto out;
+	} else {
+		error = EAGAIN;
+		goto out;
+	}
+
+	IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT);
+	ASSERT0(error);
+
+	*ioflagp = ioflag;
+out:
+	zfs_exit(zfsvfs, FTAG);
+	return (error);
+}
+
 /*
  * Read bytes from specified file into supplied buffer.
  *
@@ -291,20 +384,61 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 #if defined(__linux__)
 	ssize_t start_offset = zfs_uio_offset(uio);
 #endif
+	ssize_t chunk_size = zfs_vnops_read_chunk_size;
 	ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
 	ssize_t start_resid = n;
+	ssize_t dio_remaining_resid = 0;
+
+	if (uio->uio_extflg & UIO_DIRECT) {
+		/*
+		 * All pages for an O_DIRECT request have already been mapped
+		 * so there's no compelling reason to handle this uio in
+		 * smaller chunks.
+ */ + chunk_size = DMU_MAX_ACCESS; + + /* + * In the event that the O_DIRECT request is reading the entire + * file, it is possible the file's length is not page-size + * aligned. However, lower layers expect that the Direct I/O + * request is page-aligned. In this case, as much of the file + * that can be read using Direct I/O happens and the remaining + * amount will be read through the ARC. + * + * This is still consistent with the semantics of Direct I/O in + * ZFS as at a minimum the I/O request must be page-aligned. + */ + dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t); + if (dio_remaining_resid != 0) + n -= dio_remaining_resid; + while (n > 0) { - ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - - P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size)); + ssize_t nbytes = MIN(n, chunk_size - + P2PHASE(zfs_uio_offset(uio), chunk_size)); #ifdef UIO_NOCOPY if (zfs_uio_segflg(uio) == UIO_NOCOPY) error = mappedread_sf(zp, nbytes, uio); else #endif if (zn_has_cached_data(zp, zfs_uio_offset(uio), - zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) { + zfs_uio_offset(uio) + nbytes - 1)) { + /* + * It is possible that a file's pages have been mmap'ed + * since our check for Direct I/O reads and the read + * being issued. In this case, we will use the ARC to + * keep it synchronized with the page cache. In order + * to do this we temporarily remove the UIO_DIRECT + * flag. 
+ */ + boolean_t uio_direct_mmap = B_FALSE; + if (uio->uio_extflg & UIO_DIRECT) { + uio->uio_extflg &= ~UIO_DIRECT; + uio_direct_mmap = B_TRUE; + } error = mappedread(zp, nbytes, uio); + if (uio_direct_mmap) + uio->uio_extflg |= UIO_DIRECT; } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); @@ -332,9 +466,30 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) n -= nbytes; } - int64_t nread = start_resid - n; + int64_t nread = start_resid; + if (error == 0 && (uio->uio_extflg & UIO_DIRECT) && + dio_remaining_resid != 0) { + /* + * Temporarily remove the UIO_DIRECT flag from the UIO so the + * remainder of the file can be read using the ARC. + */ + uio->uio_extflg &= ~UIO_DIRECT; + + if (zn_has_cached_data(zp, zfs_uio_offset(uio), + zfs_uio_offset(uio) + dio_remaining_resid - 1)) { + error = mappedread(zp, dio_remaining_resid, uio); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, + dio_remaining_resid); + } + uio->uio_extflg |= UIO_DIRECT; + + if (error != 0) + n -= dio_remaining_resid; + } + nread -= n; + dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); - task_io_account_read(nread); out: zfs_rangelock_exit(lr); @@ -422,6 +577,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) int error = 0, error1; ssize_t start_resid = zfs_uio_resid(uio); uint64_t clear_setid_bits_txg = 0; + boolean_t o_direct_defer = B_FALSE; /* * Fasttrack empty write @@ -504,6 +660,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) woff = zp->z_size; } zfs_uio_setoffset(uio, woff); + /* + * We need to update the starting offset as well because it is + * set previously in the ZPL (Linux) and VNOPS (FreeBSD) + * layers. 
+ */ + zfs_uio_setsoffset(uio, woff); } else { /* * Note that if the file block size will change as a result of @@ -539,6 +701,33 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); const uint64_t projid = zp->z_projid; + /* + * In the event we are increasing the file block size + * (lr_length == UINT64_MAX), we will direct the write to the ARC. + * Because zfs_grow_blocksize() will read from the ARC in order to + * grow the dbuf, we avoid doing Direct I/O here as that would cause + * data written to disk to be overwritten by data in the ARC during + * the sync phase. Besides writing data twice to disk, we also + * want to avoid consistency concerns between data in the ARC and + * on disk while growing the file's blocksize. + * + * We will only temporarily remove Direct I/O and put it back after + * we have grown the blocksize. We do this in the event a request + * is larger than max_blksz, so further requests to + * dmu_write_uio_dbuf() will still issue the requests using Direct + * IO. + * + * As an example: + * The first block to file is being written as a 4k request with + * a recordsize of 1K. The first 1K issued in the loop below will go + * through the ARC; however, the following 3 1K requests will + * use Direct I/O. + */ + if (uio->uio_extflg & UIO_DIRECT && lr->lr_length == UINT64_MAX) { + uio->uio_extflg &= ~UIO_DIRECT; + o_direct_defer = B_TRUE; + } + /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small @@ -580,6 +769,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) ssize_t nbytes = n; if (n >= blksz && woff >= zp->z_size && P2PHASE(woff, blksz) == 0 && + !(uio->uio_extflg & UIO_DIRECT) && (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { /* * This write covers a full block. 
"Borrow" a buffer @@ -705,9 +895,18 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } + + /* + * There is a a window where a file's pages can be mmap'ed after + * the Direct I/O write has started. In this case we will still + * call update_pages() to make sure there is consistency + * between the ARC and the page cache. This is unfortunate + * situation as the data will be read back into the ARC after + * the Direct I/O write has completed, but this is the pentalty + * for writing to a mmap'ed region of the file using O_DIRECT. + */ if (tx_bytes && - zn_has_cached_data(zp, woff, woff + tx_bytes - 1) && - !(ioflag & O_DIRECT)) { + zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) { update_pages(zp, woff, tx_bytes, zfsvfs->z_os); } @@ -756,10 +955,21 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * the TX_WRITE records logged here. */ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit, - NULL, NULL); + uio->uio_extflg & UIO_DIRECT ? B_TRUE : B_FALSE, NULL, + NULL); dmu_tx_commit(tx); + /* + * Direct I/O was deferred in order to grow the first block. + * At this point it can be re-enabled for subsequent writes. 
+ */ + if (o_direct_defer) { + ASSERT(ioflag & O_DIRECT); + uio->uio_extflg |= UIO_DIRECT; + o_direct_defer = B_FALSE; + } + if (error != 0) break; ASSERT3S(tx_bytes, ==, nbytes); @@ -767,6 +977,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) pfbytes -= nbytes; } + if (o_direct_defer) { + ASSERT(ioflag & O_DIRECT); + uio->uio_extflg |= UIO_DIRECT; + o_direct_defer = B_FALSE; + } + zfs_znode_update_vfs(zp); zfs_rangelock_exit(lr); @@ -784,9 +1000,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) if (commit) zil_commit(zilog, zp->z_id); - const int64_t nwritten = start_resid - zfs_uio_resid(uio); + int64_t nwritten = start_resid - zfs_uio_resid(uio); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); - task_io_account_write(nwritten); zfs_exit(zfsvfs, FTAG); return (0); @@ -846,7 +1061,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; - dmu_buf_t *db; zgd_t *zgd; int error = 0; uint64_t zp_gen; @@ -882,6 +1096,32 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, zgd->zgd_lwb = lwb; zgd->zgd_private = zp; + dmu_buf_t *dbp; + error = dmu_buf_hold_noread(os, object, offset, zgd, &dbp); + zgd->zgd_db = dbp; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp; + + if (error) { + zfs_get_done(zgd, error); + return (error); + } + + /* + * If a Direct I/O write is waiting for previous dirty records to sync + * out in dmu_buf_direct_mixed_io_wait(), then the rangelock is already + * held across the entire block by the O_DIRECT write. + * + * The dirty record for this TXG will also be used to identify if this + * log record is associated with a Direct I/O write. 
+ */ + mutex_enter(&db->db_mtx); + boolean_t rangelock_held = db->db_mixed_io_dio_wait; + zgd->zgd_grabbed_rangelock = !(rangelock_held); + dbuf_dirty_record_t *dr = + dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg); + boolean_t direct_write = dbuf_dirty_is_direct_write(db, dr); + mutex_exit(&db->db_mtx); + /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the @@ -890,8 +1130,10 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); + if (zgd->zgd_grabbed_rangelock) { + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); + } /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); @@ -908,18 +1150,29 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ - for (;;) { - uint64_t blkoff; - size = zp->z_blksz; - blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; - offset -= blkoff; - zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - if (zp->z_blksz == size) - break; - offset += blkoff; - zfs_rangelock_exit(zgd->zgd_lr); + if (zgd->zgd_grabbed_rangelock) { + for (;;) { + uint64_t blkoff; + size = zp->z_blksz; + blkoff = ISP2(size) ? P2PHASE(offset, size) : + offset; + offset -= blkoff; + zgd->zgd_lr = zfs_rangelock_enter( + &zp->z_rangelock, offset, size, RL_READER); + if (zp->z_blksz == size) + break; + offset += blkoff; + zfs_rangelock_exit(zgd->zgd_lr); + } + ASSERT3U(dbp->db_size, ==, size); + ASSERT3U(dbp->db_offset, ==, offset); + } else { + /* + * A Direct I/O write always covers an entire block. 
+ */ + ASSERT3U(dbp->db_size, ==, zp->z_blksz); } + /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); @@ -929,44 +1182,48 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, zil_fault_io = 0; } #endif + if (error) { + zfs_get_done(zgd, error); + return (error); + } + + /* + * All Direct I/O writes will have already completed and the + * block pointer can be immediately stored in the log record. + */ + if (direct_write) { + lr->lr_blkptr = dr->dt.dl.dr_overridden_by; + zfs_get_done(zgd, 0); + return (0); + } + + blkptr_t *bp = &lr->lr_blkptr; + zgd->zgd_bp = bp; + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zfs_get_done, zgd); + ASSERT(error || lr->lr_length <= size); + + /* + * On success, we need to wait for the write I/O + * initiated by dmu_sync() to complete before we can + * release this dbuf. We will finish everything up + * in the zfs_get_done() callback. + */ if (error == 0) - error = dmu_buf_hold_noread(os, object, offset, zgd, - &db); - - if (error == 0) { - blkptr_t *bp = &lr->lr_blkptr; - - zgd->zgd_db = db; - zgd->zgd_bp = bp; - - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == size); - - error = dmu_sync(zio, lr->lr_common.lrc_txg, - zfs_get_done, zgd); - ASSERT(error || lr->lr_length <= size); + return (0); + if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; /* - * On success, we need to wait for the write I/O - * initiated by dmu_sync() to complete before we can - * release this dbuf. We will finish everything up - * in the zfs_get_done() callback. + * TX_WRITE2 relies on the data previously + * written by the TX_WRITE that caused + * EALREADY. We zero out the BP because + * it is the old, currently-on-disk BP. */ - if (error == 0) - return (0); - - if (error == EALREADY) { - lr->lr_common.lrc_txtype = TX_WRITE2; - /* - * TX_WRITE2 relies on the data previously - * written by the TX_WRITE that caused - * EALREADY. 
We zero out the BP because - * it is the old, currently-on-disk BP. - */ - zgd->zgd_bp = NULL; - BP_ZERO(bp); - error = 0; - } + zgd->zgd_bp = NULL; + BP_ZERO(bp); + error = 0; } } @@ -982,10 +1239,11 @@ zfs_get_done(zgd_t *zgd, int error) (void) error; znode_t *zp = zgd->zgd_private; - if (zgd->zgd_db) - dmu_buf_rele(zgd->zgd_db, zgd); + ASSERT3P(zgd->zgd_db, !=, NULL); + dmu_buf_rele(zgd->zgd_db, zgd); - zfs_rangelock_exit(zgd->zgd_lr); + if (zgd->zgd_grabbed_rangelock) + zfs_rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the diff --git a/module/zfs/zio.c b/module/zfs/zio.c index e4ccd144f0..f4ada08a91 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -803,6 +803,12 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); + if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { + ASSERT3U(*errorp, ==, EAGAIN); + ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); + pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + } + (*countp)--; if (*countp == 0 && pio->io_stall == countp) { @@ -1282,20 +1288,14 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; + enum zio_stage pipeline = zp->zp_direct_write == B_TRUE ? + ZIO_DIRECT_WRITE_PIPELINE : (flags & ZIO_FLAG_DDT_CHILD) ? + ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE; - ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && - zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && - zp->zp_compress >= ZIO_COMPRESS_OFF && - zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && - DMU_OT_IS_VALID(zp->zp_type) && - zp->zp_level < 32 && - zp->zp_copies > 0 && - zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, - ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
- ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); + ZIO_STAGE_OPEN, pipeline); zio->io_ready = ready; zio->io_children_ready = children_ready; @@ -1572,6 +1572,19 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; + } else if (type == ZIO_TYPE_WRITE && + pio->io_prop.zp_direct_write == B_TRUE) { + /* + * By default we only will verify checksums for Direct I/O + * writes for Linux. FreeBSD is able to place user pages under + * write protection before issuing them to the ZIO pipeline. + * + * Checksum validation errors will only be reported through + * the top-level VDEV, which is set by this child ZIO. + */ + ASSERT3P(bp, !=, NULL); + ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); + pipeline |= ZIO_STAGE_DIO_CHECKSUM_VERIFY; } if (vd->vdev_ops->vdev_op_leaf) { @@ -3100,6 +3113,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zp.zp_nopwrite = B_FALSE; zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_byteorder = gio->io_prop.zp_byteorder; + zp.zp_direct_write = B_FALSE; memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); @@ -4505,6 +4519,20 @@ zio_vdev_io_assess(zio_t *zio) zio->io_vsd = NULL; } + /* + * If a Direct I/O write checksum verify error has occurred then this + * I/O should not attempt to be issued again. Instead the EAGAIN will + * be returned and this write will attempt to be issued through the + * ARC instead. 
+ */ + if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL); + ASSERT3U(zio->io_error, ==, EAGAIN); + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + return (zio); + } + + if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); @@ -4818,6 +4846,58 @@ zio_checksum_verify(zio_t *zio) return (zio); } +static zio_t * +zio_dio_checksum_verify(zio_t *zio) +{ + zio_t *pio = zio_unique_parent(zio); + + ASSERT3P(zio->io_vd, !=, NULL); + ASSERT3P(zio->io_bp, !=, NULL); + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3B(pio->io_prop.zp_direct_write, ==, B_TRUE); + ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); + + if (zfs_vdev_direct_write_verify_pct == 0 || zio->io_error != 0) + goto out; + + /* + * A Direct I/O write checksum verification will only be + * performed based on the top-level VDEV percentage for checks. + */ + uint32_t rand = random_in_range(100); + int error; + + if (rand < zfs_vdev_direct_write_verify_pct) { + if ((error = zio_checksum_error(zio, NULL)) != 0) { + zio->io_error = error; + if (error == ECKSUM) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_dio_verify_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + zio->io_error = SET_ERROR(EAGAIN); + zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + + /* + * The EAGAIN error must be propagated up to the + * logical parent ZIO in zio_notify_parent() so + * it can be returned to dmu_write_abd(). + */ + zio->io_flags &= ~ZIO_FLAG_DONT_PROPAGATE; + + (void) zfs_ereport_post( + FM_EREPORT_ZFS_DIO_VERIFY, + zio->io_spa, zio->io_vd, &zio->io_bookmark, + zio, 0); + } + } + } + +out: + return (zio); +} + + /* * Called by RAID-Z to ensure we don't compute the checksum twice. */ @@ -5148,7 +5228,8 @@ zio_done(zio_t *zio) * device is currently unavailable. 
*/ if (zio->io_error != ECKSUM && zio->io_vd != NULL && - !vdev_is_dead(zio->io_vd)) { + !vdev_is_dead(zio->io_vd) && + !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); if (ret != EALREADY) { @@ -5162,7 +5243,8 @@ zio_done(zio_t *zio) } if ((zio->io_error == EIO || !(zio->io_flags & - (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && + (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DIO_CHKSUM_ERR))) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the @@ -5184,7 +5266,8 @@ zio_done(zio_t *zio) ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && - !(zio->io_flags & ZIO_FLAG_CANFAIL)) { + !(zio->io_flags & ZIO_FLAG_CANFAIL) && + !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else @@ -5234,6 +5317,14 @@ zio_done(zio_t *zio) zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND; if (zio->io_reexecute) { + /* + * A Direct I/O write that has a checksum verify error should + * not attempt to reexecute. Instead, EAGAIN should just be + * propagated back up so the write can be attempt to be issued + * through the ARC. + */ + ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)); + /* * This is a logical I/O that wants to reexecute. 
* @@ -5394,6 +5485,7 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, + zio_dio_checksum_verify, zio_done }; diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 088e46ce57..f89a4b3e0a 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -693,6 +693,14 @@ tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', 'zfs_unallow_007_neg', 'zfs_unallow_008_neg'] tags = ['functional', 'delegate'] +[tests/functional/direct] +tests = ['dio_aligned_block', 'dio_async_always', 'dio_async_fio_ioengines', + 'dio_compression', 'dio_dedup', 'dio_encryption', 'dio_grow_block', + 'dio_max_recordsize', 'dio_mixed', 'dio_mmap', 'dio_overwrites', + 'dio_property', 'dio_random', 'dio_recordsize', 'dio_unaligned_block', + 'dio_unaligned_filesize'] +tags = ['functional', 'direct'] + [tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] tags = ['functional', 'exec'] @@ -735,7 +743,7 @@ pre = tags = ['functional', 'inheritance'] [tests/functional/io] -tests = ['sync', 'psync', 'posixaio', 'mmap'] +tests = ['mmap', 'posixaio', 'psync', 'sync'] tags = ['functional', 'io'] [tests/functional/inuse] diff --git a/tests/runfiles/freebsd.run b/tests/runfiles/freebsd.run index 13696d6458..e1ae0c6b77 100644 --- a/tests/runfiles/freebsd.run +++ b/tests/runfiles/freebsd.run @@ -30,3 +30,7 @@ tags = ['functional', 'cli_root', 'zfs_jail'] tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive', 'pam_short_password'] tags = ['functional', 'pam'] + +[tests/functional/direct:FreeBSD] +tests = ['dio_write_stable_pages'] +tags = ['functional', 'direct'] diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 5817e64900..4613c895b0 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -102,6 +102,10 @@ tags = ['functional', 'compression'] tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos'] tags = ['functional', 
'devices'] +[tests/functional/direct:Linux] +tests = ['dio_write_verify'] +tags = ['functional', 'direct'] + [tests/functional/events:Linux] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill', 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config', diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore index 0ed0a69eb0..e9e3b8f73e 100644 --- a/tests/zfs-tests/cmd/.gitignore +++ b/tests/zfs-tests/cmd/.gitignore @@ -16,6 +16,7 @@ /getversion /largest_file /libzfs_input_check +/manipulate_user_buffer /mkbusy /mkfile /mkfiles diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index a8df06c2e9..5250e72f9f 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -60,6 +60,8 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/libzfs_input_check libzfs_core.la \ libnvpair.la +scripts_zfs_tests_bin_PROGRAMS += %D%/manipulate_user_buffer +%C%_manipulate_user_buffer_LDADD = -lpthread scripts_zfs_tests_bin_PROGRAMS += %D%/mkbusy %D%/mkfile %D%/mkfiles %D%/mktree %C%_mkfile_LDADD = $(LTLIBINTL) diff --git a/tests/zfs-tests/cmd/manipulate_user_buffer.c b/tests/zfs-tests/cmd/manipulate_user_buffer.c new file mode 100644 index 0000000000..c195a197ad --- /dev/null +++ b/tests/zfs-tests/cmd/manipulate_user_buffer.c @@ -0,0 +1,260 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2022 by Triad National Security, LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef MIN +#define MIN(a, b) ((a) < (b)) ? (a) : (b) +#endif + +static char *outputfile = NULL; +static int blocksize = 131072; /* 128K */ +static int numblocks = 100; +static char *execname = NULL; +static int print_usage = 0; +static int randompattern = 0; +static int ofd; +char *buf = NULL; + +typedef struct { + int entire_file_written; +} pthread_args_t; + +static void +usage(void) +{ + (void) fprintf(stderr, + "usage %s -o outputfile [-b blocksize] [-n numblocks]\n" + " [-p randpattern] [-h help]\n" + "\n" + "Testing whether checksum verify works correctly for O_DIRECT.\n" + "when manipulating the contents of a userspace buffer.\n" + "\n" + " outputfile: File to write to.\n" + " blocksize: Size of each block to write (must be at \n" + " least >= 512).\n" + " numblocks: Total number of blocksized blocks to write.\n" + " randpattern: Fill data buffer with random data. 
Default \n" + " behavior is to fill the buffer with the \n" + " known data pattern (0xdeadbeef).\n" + " help: Print usage information and exit.\n" + "\n" + " Required parameters:\n" + " outputfile\n" + "\n" + " Default Values:\n" + " blocksize -> 131072\n" + " numblocks -> 100\n" + " randpattern -> false\n", + execname); + (void) exit(1); +} + +static void +parse_options(int argc, char *argv[]) +{ + int c; + int errflag = 0; + extern char *optarg; + extern int optind, optopt; + execname = argv[0]; + + while ((c = getopt(argc, argv, "b:hn:o:p")) != -1) { + switch (c) { + case 'b': + blocksize = atoi(optarg); + break; + + case 'h': + print_usage = 1; + break; + + case 'n': + numblocks = atoi(optarg); + break; + + case 'o': + outputfile = optarg; + break; + + case 'p': + randompattern = 1; + break; + + case ':': + (void) fprintf(stderr, + "Option -%c requires an opertand\n", + optopt); + errflag++; + break; + case '?': + default: + (void) fprintf(stderr, + "Unrecognized option: -%c\n", optopt); + errflag++; + break; + } + } + + if (errflag || print_usage == 1) + (void) usage(); + + if (blocksize < 512 || outputfile == NULL || numblocks <= 0) { + (void) fprintf(stderr, + "Required paramater(s) missing or invalid.\n"); + (void) usage(); + } +} + +/* + * Write blocksize * numblocks to the file using O_DIRECT. + */ +static void * +write_thread(void *arg) +{ + size_t offset = 0; + int total_data = blocksize * numblocks; + int left = total_data; + ssize_t wrote = 0; + pthread_args_t *args = (pthread_args_t *)arg; + + while (!args->entire_file_written) { + wrote = pwrite(ofd, buf, blocksize, offset); + if (wrote != blocksize) { + perror("write"); + exit(2); + } + + offset = ((offset + blocksize) % total_data); + left -= blocksize; + + if (left == 0) + args->entire_file_written = 1; + } + + pthread_exit(NULL); +} + +/* + * Update the buffers contents with random data. 
+ */ +static void * +manipulate_buf_thread(void *arg) +{ + size_t rand_offset; + char rand_char; + pthread_args_t *args = (pthread_args_t *)arg; + + while (!args->entire_file_written) { + rand_offset = (rand() % blocksize); + rand_char = (rand() % (126 - 33) + 33); + buf[rand_offset] = rand_char; + } + + pthread_exit(NULL); +} + +int +main(int argc, char *argv[]) +{ + const char *datapattern = "0xdeadbeef"; + int ofd_flags = O_WRONLY | O_CREAT | O_DIRECT; + mode_t mode = S_IRUSR | S_IWUSR; + pthread_t write_thr; + pthread_t manipul_thr; + int left = blocksize; + int offset = 0; + int rc; + pthread_args_t args = { 0 }; + + parse_options(argc, argv); + + ofd = open(outputfile, ofd_flags, mode); + if (ofd == -1) { + (void) fprintf(stderr, "%s, %s\n", execname, outputfile); + perror("open"); + exit(2); + } + + int err = posix_memalign((void **)&buf, sysconf(_SC_PAGE_SIZE), + blocksize); + if (err != 0) { + (void) fprintf(stderr, + "%s: %s\n", execname, strerror(err)); + exit(2); + } + + if (!randompattern) { + /* Putting known data pattern in buffer */ + while (left) { + size_t amt = MIN(strlen(datapattern), left); + memcpy(&buf[offset], datapattern, amt); + offset += amt; + left -= amt; + } + } else { + /* Putting random data in buffer */ + for (int i = 0; i < blocksize; i++) + buf[i] = rand(); + } + + /* + * Writing using O_DIRECT while manipulating the buffer contents until + * the entire file is written. 
+ */ + if ((rc = pthread_create(&manipul_thr, NULL, manipulate_buf_thread, + &args))) { + fprintf(stderr, "error: pthreads_create, manipul_thr, " + "rc: %d\n", rc); + exit(2); + } + + if ((rc = pthread_create(&write_thr, NULL, write_thread, &args))) { + fprintf(stderr, "error: pthreads_create, write_thr, " + "rc: %d\n", rc); + exit(2); + } + + pthread_join(write_thr, NULL); + pthread_join(manipul_thr, NULL); + + assert(args.entire_file_written == 1); + + (void) close(ofd); + + free(buf); + + return (0); +} diff --git a/tests/zfs-tests/cmd/stride_dd.c b/tests/zfs-tests/cmd/stride_dd.c index a20b261316..19aab1c97f 100644 --- a/tests/zfs-tests/cmd/stride_dd.c +++ b/tests/zfs-tests/cmd/stride_dd.c @@ -21,12 +21,19 @@ #include #include +static int alignment = 0; static int bsize = 0; static int count = 0; static char *ifile = NULL; static char *ofile = NULL; -static off_t stride = 0; +static off_t stride = 1; static off_t seek = 0; +static int seekbytes = 0; +static int if_o_direct = 0; +static int of_o_direct = 0; +static int skip = 0; +static int skipbytes = 0; +static int entire_file = 0; static const char *execname = "stride_dd"; static void usage(void); @@ -36,8 +43,10 @@ static void usage(void) { (void) fprintf(stderr, - "usage: %s -i inputfile -o outputfile -b blocksize -c count \n" - " -s stride [ -k seekblocks]\n" + "usage: %s -i inputfile -o outputfile -b blocksize [-c count]\n" + " [-s stride] [-k seekblocks] [-K seekbytes]\n" + " [-a alignment] [-d if_o_direct] [-D of_o_direct]\n" + " [-p skipblocks] [-P skipbytes] [-e entire_file]\n" "\n" "Simplified version of dd that supports the stride option.\n" "A stride of n means that for each block written, n - 1 blocks\n" @@ -45,16 +54,47 @@ usage(void) "means that blocks are read and written consecutively.\n" "All numeric parameters must be integers.\n" "\n" - " inputfile: File to read from\n" - " outputfile: File to write to\n" - " blocksize: Size of each block to read/write\n" - " count: Number of blocks to 
read/write\n" - " stride: Read/write a block then skip (stride - 1) blocks\n" - " seekblocks: Number of blocks to skip at start of output\n", + " inputfile: File to read from\n" + " outputfile: File to write to\n" + " blocksize: Size of each block to read/write\n" + " count: Number of blocks to read/write (Required" + " unless -e is used)\n" + " stride: Read/write a block then skip (stride - 1) blocks" + "\n" + " seekblocks: Number of blocks to skip at start of output\n" + " seekbytes: Treat seekblocks as byte count\n" + " alignment: Alignment passed to posix_memalign() (default" + " PAGE_SIZE)\n" + " if_o_direct: Use O_DIRECT with inputfile (default no O_DIRECT)" + "\n" + " of_o_direct: Use O_DIRECT with outputfile (default no " + " O_DIRECT)\n" + " skipblocks: Number of blocks to skip at start of input " + " (default 0)\n" + " skipbytes: Treat skipblocks as byte count\n" + " entire_file: When used the entire inputfile will be read and" + " count will be ignored\n", execname); (void) exit(1); } +/* + * posix_memalign() only allows for alignments which are postive, powers of two + * and a multiple of sizeof (void *). 
+ */ +static int +invalid_alignment(int alignment) +{ + if ((alignment < 0) || (alignment & (alignment - 1)) || + ((alignment % sizeof (void *)))) { + (void) fprintf(stderr, + "Alignment must be a postive, power of two, and multiple " + "of sizeof (void *).\n"); + return (1); + } + return (0); +} + static void parse_options(int argc, char *argv[]) { @@ -62,12 +102,17 @@ parse_options(int argc, char *argv[]) int errflag = 0; execname = argv[0]; + alignment = sysconf(_SC_PAGE_SIZE); extern char *optarg; extern int optind, optopt; - while ((c = getopt(argc, argv, ":b:c:i:o:s:k:")) != -1) { + while ((c = getopt(argc, argv, "a:b:c:deDi:o:s:k:Kp:P")) != -1) { switch (c) { + case 'a': + alignment = atoi(optarg); + break; + case 'b': bsize = atoi(optarg); break; @@ -76,6 +121,18 @@ parse_options(int argc, char *argv[]) count = atoi(optarg); break; + case 'd': + if_o_direct = 1; + break; + + case 'e': + entire_file = 1; + break; + + case 'D': + of_o_direct = 1; + break; + case 'i': ifile = optarg; break; @@ -92,6 +149,18 @@ parse_options(int argc, char *argv[]) seek = atoi(optarg); break; + case 'K': + seekbytes = 1; + break; + + case 'p': + skip = atoi(optarg); + break; + + case 'P': + skipbytes = 1; + break; + case ':': (void) fprintf(stderr, "Option -%c requires an operand\n", optopt); @@ -111,64 +180,60 @@ parse_options(int argc, char *argv[]) } } - if (bsize <= 0 || count <= 0 || stride <= 0 || ifile == NULL || - ofile == NULL || seek < 0) { + if (bsize <= 0 || stride <= 0 || ifile == NULL || ofile == NULL || + seek < 0 || invalid_alignment(alignment) || skip < 0) { + (void) fprintf(stderr, + "Required parameter(s) missing or invalid.\n"); + (void) usage(); + } + + if (count <= 0 && entire_file == 0) { (void) fprintf(stderr, "Required parameter(s) missing or invalid.\n"); (void) usage(); } } -int -main(int argc, char *argv[]) +static void +read_entire_file(int ifd, int ofd, void *buf) { - int i; - int ifd; - int ofd; - void *buf; int c; - parse_options(argc, argv); - - 
ifd = open(ifile, O_RDONLY); - if (ifd == -1) { - (void) fprintf(stderr, "%s: %s: ", execname, ifile); - perror("open"); - exit(2); - } - - ofd = open(ofile, O_WRONLY | O_CREAT, 0666); - if (ofd == -1) { - (void) fprintf(stderr, "%s: %s: ", execname, ofile); - perror("open"); - exit(2); - } - - /* - * We use valloc because some character block devices expect a - * page-aligned buffer. - */ - int err = posix_memalign(&buf, 4096, bsize); - if (err != 0) { - (void) fprintf(stderr, - "%s: %s\n", execname, strerror(err)); - exit(2); - } - - if (seek > 0) { - if (lseek(ofd, seek * bsize, SEEK_CUR) == -1) { - perror("output lseek"); + do { + c = read(ifd, buf, bsize); + if (c < 0) { + perror("read"); exit(2); + } else if (c != 0) { + c = write(ofd, buf, bsize); + if (c < 0) { + perror("write"); + exit(2); + } + } - } + + if (stride > 1) { + if (lseek(ifd, (stride - 1) * bsize, SEEK_CUR) == -1) { + perror("input lseek"); + exit(2); + } + if (lseek(ofd, (stride - 1) * bsize, SEEK_CUR) == -1) { + perror("output lseek"); + exit(2); + } + } + } while (c != 0); +} + +static void +read_on_count(int ifd, int ofd, void *buf) +{ + int i; + int c; for (i = 0; i < count; i++) { c = read(ifd, buf, bsize); - if (c != bsize) { - - perror("read"); - exit(2); - } if (c != bsize) { if (c < 0) { perror("read"); @@ -205,6 +270,71 @@ main(int argc, char *argv[]) } } } +} + +int +main(int argc, char *argv[]) +{ + int ifd; + int ofd; + int ifd_flags = O_RDONLY; + int ofd_flags = O_WRONLY | O_CREAT; + void *buf; + + parse_options(argc, argv); + + if (if_o_direct) + ifd_flags |= O_DIRECT; + + if (of_o_direct) + ofd_flags |= O_DIRECT; + + ifd = open(ifile, ifd_flags); + if (ifd == -1) { + (void) fprintf(stderr, "%s: %s: ", execname, ifile); + perror("open"); + exit(2); + } + + ofd = open(ofile, ofd_flags, 0666); + if (ofd == -1) { + (void) fprintf(stderr, "%s: %s: ", execname, ofile); + perror("open"); + exit(2); + } + + /* + * We use valloc because some character block devices expect a + * 
page-aligned buffer. + */ + int err = posix_memalign(&buf, alignment, bsize); + if (err != 0) { + (void) fprintf(stderr, + "%s: %s\n", execname, strerror(err)); + exit(2); + } + + if (skip > 0) { + int skipamt = skipbytes == 1 ? skip : skip * bsize; + if (lseek(ifd, skipamt, SEEK_CUR) == -1) { + perror("input lseek"); + exit(2); + } + } + + if (seek > 0) { + int seekamt = seekbytes == 1 ? seek : seek * bsize; + if (lseek(ofd, seekamt, SEEK_CUR) == -1) { + perror("output lseek"); + exit(2); + } + } + + if (entire_file == 1) + read_entire_file(ifd, ofd, buf); + else + read_on_count(ifd, ofd, buf); + free(buf); (void) close(ofd); diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 19770138bf..934aca6f91 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -200,6 +200,7 @@ export ZFSTEST_FILES='badsend getversion largest_file libzfs_input_check + manipulate_user_buffer mkbusy mkfile mkfiles diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index a2f42999a3..1c467ca65d 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -3474,6 +3474,18 @@ function md5digest esac } +# +# Compare the MD5 digest of two files. +# +function cmp_md5s { + typeset file1=$1 + typeset file2=$2 + + typeset sum1=$(md5digest $file1) + typeset sum2=$(md5digest $file2) + test "$sum1" = "$sum2" +} + # # Compute SHA256 digest for given file or stdin if no file given. 
# Note: file path must not contain spaces diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 96943421f8..d3c4a7d940 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -93,6 +93,7 @@ VDEV_FILE_LOGICAL_ASHIFT vdev.file.logical_ashift vdev_file_logical_ashift VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift VDEV_MAX_AUTO_ASHIFT vdev.max_auto_ashift zfs_vdev_max_auto_ashift VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count +VDEV_DIRECT_WR_VERIFY_PCT vdev.direct_write_verify_pct zfs_vdev_direct_write_verify_pct VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index bbeabc6dfb..053a2c09f6 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -265,6 +265,8 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/delegate/delegate_common.kshlib \ functional/devices/devices.cfg \ functional/devices/devices_common.kshlib \ + functional/direct/dio.cfg \ + functional/direct/dio.kshlib \ functional/events/events.cfg \ functional/events/events_common.kshlib \ functional/fault/fault.cfg \ @@ -1458,6 +1460,26 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/devices/devices_002_neg.ksh \ functional/devices/devices_003_pos.ksh \ functional/devices/setup.ksh \ + functional/direct/dio_aligned_block.ksh \ + functional/direct/dio_async_always.ksh \ + functional/direct/dio_async_fio_ioengines.ksh \ + functional/direct/dio_compression.ksh \ + functional/direct/dio_dedup.ksh \ + functional/direct/dio_encryption.ksh \ + functional/direct/dio_grow_block.ksh \ + functional/direct/dio_max_recordsize.ksh \ + functional/direct/dio_mixed.ksh \ + functional/direct/dio_mmap.ksh \ + functional/direct/dio_overwrites.ksh \ + functional/direct/dio_property.ksh \ 
+ functional/direct/dio_random.ksh \ + functional/direct/dio_recordsize.ksh \ + functional/direct/dio_unaligned_block.ksh \ + functional/direct/dio_unaligned_filesize.ksh \ + functional/direct/dio_write_verify.ksh \ + functional/direct/dio_write_stable_pages.ksh \ + functional/direct/setup.ksh \ + functional/direct/cleanup.ksh \ functional/dos_attributes/cleanup.ksh \ functional/dos_attributes/read_dos_attrs_001.ksh \ functional/dos_attributes/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh index 945db71bf1..20498440be 100755 --- a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh +++ b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh @@ -75,7 +75,7 @@ export PERF_COMPPERCENT=66 export PERF_COMPCHUNK=0 export BLOCKSIZE=128K export SYNC_TYPE=0 -export DIRECT=1 +export DIRECT=0 export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) )) log_must set_tunable32 L2ARC_WRITE_MAX $(( $VCACHE_SZ * 2 )) diff --git a/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh b/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh index 57f6b6a024..1d3cbfc79e 100755 --- a/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh +++ b/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh @@ -36,7 +36,7 @@ export PERF_COMPPERCENT=66 export PERF_COMPCHUNK=0 export BLOCKSIZE=128K export SYNC_TYPE=0 -export DIRECT=1 +export DIRECT=0 # # DESCRIPTION: diff --git a/tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh index f7b8a4b950..460c95bb60 100755 --- a/tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh @@ -37,7 +37,7 @@ export PERF_COMPPERCENT=66 export PERF_COMPCHUNK=0 export BLOCKSIZE=128K export SYNC_TYPE=0 -export DIRECT=1 +export DIRECT=0 # # DESCRIPTION: diff 
--git a/tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh index 0838b2c93e..2f352e2af5 100755 --- a/tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh +++ b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh @@ -37,7 +37,7 @@ export PERF_COMPPERCENT=66 export PERF_COMPCHUNK=0 export BLOCKSIZE=128K export SYNC_TYPE=0 -export DIRECT=1 +export DIRECT=0 # # DESCRIPTION: diff --git a/tests/zfs-tests/tests/functional/direct/cleanup.ksh b/tests/zfs-tests/tests/functional/direct/cleanup.ksh new file mode 100755 index 0000000000..382e9b1734 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/cleanup.ksh @@ -0,0 +1,31 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. 
$STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/direct/dio.cfg b/tests/zfs-tests/tests/functional/direct/dio.cfg new file mode 100644 index 0000000000..6472610d7b --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio.cfg @@ -0,0 +1,26 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# + +DIO_VDEV1=$TEST_BASE_DIR/file1 +DIO_VDEV2=$TEST_BASE_DIR/file2 +DIO_VDEV3=$TEST_BASE_DIR/file3 +DIO_VDEVS="$DIO_VDEV1 $DIO_VDEV2 $DIO_VDEV3" + +DIO_FILESIZE=4M +DIO_BS=128K diff --git a/tests/zfs-tests/tests/functional/direct/dio.kshlib b/tests/zfs-tests/tests/functional/direct/dio.kshlib new file mode 100644 index 0000000000..3a70cf2939 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio.kshlib @@ -0,0 +1,331 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/direct/dio.cfg + +function dio_cleanup +{ + if poolexists $TESTPOOL1; then + destroy_pool $TESTPOOL1 + fi + + rm -f $DIO_VDEVS +} + +# +# Generate an IO workload using fio and then verify the resulting data. +# +function dio_and_verify # mode file-size block-size directory ioengine extra-args +{ + typeset mode=$1 + typeset size=$2 + typeset bs=$3 + typeset mntpnt=$4 + typeset ioengine=$5 + typeset extra_args=$6 + + # Invoke an fio workload via Direct I/O and verify with Direct I/O. + log_must fio --directory=$mntpnt --name=direct-$mode \ + --rw=$mode --size=$size --bs=$bs --direct=1 --numjobs=1 \ + --verify=sha1 --ioengine=$ioengine --fallocate=none \ + --group_reporting --minimal --do_verify=1 $extra_args + + # Now just read back the file without Direct I/O into the ARC as an + # additional verification step. + log_must fio --directory=$mntpnt --name=direct-$mode \ + --rw=read --size=$size --bs=$bs --direct=0 --numjobs=1 \ + --ioengine=$ioengine --group_reporting --minimal + + log_must rm -f "$mntpnt/direct-*" +} + +# +# Get zpool status -d checksum verify failures +# +function get_zpool_status_chksum_verify_failures # pool_name vdev_type +{ + typeset pool=$1 + typeset vdev_type=$2 + + if [[ "$vdev_type" == "stripe" ]]; then + val=$(zpool status -dp $pool | \ + awk '{s+=$6} END {print s}' ) + elif [[ "$vdev_type" == "mirror" || "$vdev_type" == "raidz" || + "$vdev_type" == "draid" ]]; then + val=$(zpool status -dp $pool | \ + awk -v d="$vdev_type" '$0 ~ d {print $6}' ) + else + log_fail "Unsupported VDEV type in \ + get_zpool_status_chksum_verify_failures(): $vdev_type" + fi + echo "$val" +} + +# +# Get ZED dio_verify events +# +function get_zed_dio_verify_events # pool +{ + typeset pool=$1 + + val=$(zpool events $pool | grep -c dio_verify) + + echo "$val" +} + +# +# Checking for checksum verify write failures with: +# zpool status -d +# zpool events +# After getting the counts, this will clear out the ZPool errors and events +# +function 
check_dio_write_chksum_verify_failures # pool vdev_type expect_errors +{ + typeset pool=$1 + typeset vdev_type=$2 + typeset expect_errors=$3 + typeset note_str="expecting none" + + if [[ $expect_errors -ne 0 ]]; then + note_str="expecting some" + fi + + log_note "Checking for Direct I/O write checksum verify errors \ + $note_str on ZPool: $pool" + + status_failures=$(get_zpool_status_chksum_verify_failures $pool $vdev_type) + zed_dio_verify_events=$(get_zed_dio_verify_events $pool) + + if [[ $expect_errors -ne 0 ]]; then + if [[ $status_failures -eq 0 || + $zed_dio_verify_events -eq 0 ]]; then + zpool status -dp $pool + zpool events $pool + log_fail "Checksum verifies in zpool status -d \ + $status_failures. ZED dio_verify events \ + $zed_dio_verify_events. Neither should be 0." + fi + else + if [[ $status_failures -ne 0 || + $zed_dio_verify_events -ne 0 ]]; then + zpool status -dp $pool + zpool events $pool + log_fail "Checksum verifies in zpool status -d \ + $status_failures. ZED dio_verify events \ + $zed_dio_verify_events. Both should be zero." + fi + fi + + log_must zpool clear $pool + log_must zpool events -c + +} + +# +# Get the value of a counter from +# Linux: /proc/spl/kstat/zfs/$pool/iostats file. +# FreeBSD: kstat.zfs.$pool.msic.iostats.$stat +# +function get_iostats_stat # pool stat +{ + typeset pool=$1 + typeset stat=$2 + + if is_linux; then + iostats_file=/proc/spl/kstat/zfs/$pool/iostats + val=$(grep -m1 "$stat" $iostats_file | awk '{ print $3 }') + else + val=$(sysctl -n kstat.zfs.$pool.misc.iostats.$stat) + fi + if [[ -z "$val" ]]; then + log_fail "Unable to read $stat counter" + fi + + echo "$val" +} + +# +# Evict any buffered blocks by overwritting them using an O_DIRECT request. +# +function evict_blocks +{ + typeset pool=$1 + typeset file=$2 + typeset size=$3 + + log_must stride_dd -i /dev/urandom -o $file -b $size -c 1 -D +} + +# +# Perform FIO Direct I/O writes to a file with the given arguments. 
+# Then verify the minimum expected number of blocks were written as +# Direct I/O. +# +function verify_dio_write_count #pool bs size mntpnt +{ + typeset pool=$1 + typeset bs=$2 + typeset size=$3 + typeset mntpnt=$4 + typeset dio_wr_expected=$(((size / bs) -1)) + + log_note "Checking for $dio_wr_expected Direct I/O writes" + + prev_dio_wr=$(get_iostats_stat $pool direct_write_count) + dio_and_verify write $size $bs $mntpnt "sync" + curr_dio_wr=$(get_iostats_stat $pool direct_write_count) + dio_wr_actual=$((curr_dio_wr - prev_dio_wr)) + + if [[ $dio_wr_actual -lt $dio_wr_expected ]]; then + if is_linux; then + cat /proc/spl/kstat/zfs/$pool/iostats + else + sysctl kstat.zfs.$pool.misc.iostats + fi + log_fail "Direct writes $dio_wr_actual of $dio_wr_expected" + fi +} + +# +# Perform a stride_dd write command to the file with the given arguments. +# Then verify the minimum expected number of blocks were written as either +# buffered IO (by the ARC), or Direct I/O to the application (dd). +# +function check_write # pool file bs count seek flags buf_wr dio_wr +{ + typeset pool=$1 + typeset file=$2 + typeset bs=$3 + typeset count=$4 + typeset seek=$5 + typeset flags=$6 + typeset buf_wr_expect=$7 + typeset dio_wr_expect=$8 + + log_note "Checking $count * $bs write(s) at offset $seek, $flags" + + prev_buf_wr=$(get_iostats_stat $pool arc_write_count) + prev_dio_wr=$(get_iostats_stat $pool direct_write_count) + + log_must stride_dd -i /dev/urandom -o $file -b $bs -c $count \ + -k $seek $flags + + curr_buf_wr=$(get_iostats_stat $pool arc_write_count) + buf_wr_actual=$((curr_buf_wr - prev_buf_wr)) + + curr_dio_wr=$(get_iostats_stat $pool direct_write_count) + dio_wr_actual=$((curr_dio_wr - prev_dio_wr)) + + if [[ $buf_wr_actual -lt $buf_wr_expect ]]; then + if is_linux; then + cat /proc/spl/kstat/zfs/$pool/iostats + else + sysctl kstat.zfs.$pool.misc.iostats + fi + log_fail "Buffered writes $buf_wr_actual of $buf_wr_expect" + fi + + if [[ $dio_wr_actual -lt $dio_wr_expect ]]; 
then + if is_linux; then + cat /proc/spl/kstat/zfs/$pool/iostats + else + sysctl kstat.zfs.$pool.misc.iostats + fi + log_fail "Direct writes $dio_wr_actual of $dio_wr_expect" + fi +} + +# +# Perform a stride_dd read command to the file with the given arguments. +# Then verify the minimum expected number of blocks were read as either +# buffered IO (by the ARC), or Direct I/O to the application (dd). +# +function check_read # pool file bs count skip flags buf_rd dio_rd +{ + typeset pool=$1 + typeset file=$2 + typeset bs=$3 + typeset count=$4 + typeset skip=$5 + typeset flags=$6 + typeset buf_rd_expect=$7 + typeset dio_rd_expect=$8 + + log_note "Checking $count * $bs read(s) at offset $skip, $flags" + + prev_buf_rd=$(get_iostats_stat $pool arc_read_count) + prev_dio_rd=$(get_iostats_stat $pool direct_read_count) + + log_must stride_dd -i $file -o /dev/null -b $bs -c $count \ + -p $skip $flags + + curr_buf_rd=$(get_iostats_stat $pool arc_read_count) + buf_rd_actual=$((curr_buf_rd - prev_buf_rd)) + + curr_dio_rd=$(get_iostats_stat $pool direct_read_count) + dio_rd_actual=$((curr_dio_rd - prev_dio_rd)) + + if [[ $buf_rd_actual -lt $buf_rd_expect ]]; then + if is_linux; then + cat /proc/spl/kstat/zfs/$pool/iostats + else + sysctl kstat.zfs.$pool.misc.iostats + fi + log_fail "Buffered reads $buf_rd_actual of $buf_rd_expect" + fi + + if [[ $dio_rd_actual -lt $dio_rd_expect ]]; then + if is_linux; then + cat /proc/spl/kstat/zfs/$pool/iostats + else + sysctl kstat.zfs.$pool.misc.iostats + fi + log_fail "Direct reads $dio_rd_actual of $dio_rd_expect" + fi +} + +function get_file_size +{ + typeset filename="$1" + + if is_linux; then + filesize=$(stat -c %s $filename) + else + filesize=$(stat -s $filename | awk '{print $8}' | grep -o '[0-9]\+') + fi + + echo $filesize +} + +function do_truncate_reduce +{ + typeset filename=$1 + typeset size=$2 + + filesize=$(get_file_size $filename) + eval "echo original filesize: $filesize" + if is_linux; then + truncate $filename -s 
$((filesize - size)) + else + truncate -s -$size $filename + fi + filesize=$(get_file_size $filename) + eval "echo new filesize after truncate: $filesize" +} diff --git a/tests/zfs-tests/tests/functional/direct/dio_aligned_block.ksh b/tests/zfs-tests/tests/functional/direct/dio_aligned_block.ksh new file mode 100755 index 0000000000..4aac5edd8e --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_aligned_block.ksh @@ -0,0 +1,116 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify the number direct/buffered requests for (un)aligned access +# +# STRATEGY: +# 1. Create a multi-block file +# 2. Perform various (un)aligned accesses and verify the result. 
+# + +verify_runnable "global" + +function cleanup +{ + zfs set recordsize=$rs $TESTPOOL/$TESTFS + log_must rm -f $tmp_file + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_onexit cleanup + +log_assert "Verify the number direct/buffered requests for unaligned access" + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +rs=$(get_prop recordsize $TESTPOOL/$TESTFS) +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +tmp_file=$mntpnt/tmp_file +file_size=$((rs * 8)) + +log_must stride_dd -i /dev/urandom -o $tmp_file -b $file_size -c 1 + +# N recordsize aligned writes which do not span blocks +check_write $TESTPOOL $tmp_file $rs 1 0 "-D" 0 1 +check_write $TESTPOOL $tmp_file $rs 2 0 "-D" 0 2 +check_write $TESTPOOL $tmp_file $rs 4 0 "-D" 0 4 +check_write $TESTPOOL $tmp_file $rs 8 0 "-D" 0 8 + +# 1 recordsize aligned write which spans multiple blocks at various offsets +check_write $TESTPOOL $tmp_file $((rs * 2)) 1 0 "-D" 0 2 +check_write $TESTPOOL $tmp_file $((rs * 2)) 1 1 "-D" 0 2 +check_write $TESTPOOL $tmp_file $((rs * 2)) 1 2 "-D" 0 2 +check_write $TESTPOOL $tmp_file $((rs * 2)) 1 3 "-D" 0 2 +check_write $TESTPOOL $tmp_file $((rs * 4)) 1 0 "-D" 0 4 +check_write $TESTPOOL $tmp_file $((rs * 4)) 1 1 "-D" 0 4 +check_write $TESTPOOL $tmp_file $((rs * 8)) 1 0 "-D" 0 8 + +# sub-blocksize unaligned writes which do not span blocks. 
+check_write $TESTPOOL $tmp_file $((rs / 2)) 1 0 "-D" 1 0 +check_write $TESTPOOL $tmp_file $((rs / 2)) 1 1 "-D" 1 0 +check_write $TESTPOOL $tmp_file $((rs / 2)) 1 2 "-D" 1 0 +check_write $TESTPOOL $tmp_file $((rs / 2)) 1 3 "-D" 1 0 + +# large unaligned writes which span multiple blocks +check_write $TESTPOOL $tmp_file $((rs * 2)) 1 $((rs / 2)) "-D -K" 2 1 +check_write $TESTPOOL $tmp_file $((rs * 4)) 2 $((rs / 4)) "-D -K" 4 6 + +# evict any cached blocks by overwriting with O_DIRECT +evict_blocks $TESTPOOL $tmp_file $file_size + +# recordsize aligned reads which do not span blocks +check_read $TESTPOOL $tmp_file $rs 1 0 "-d" 0 1 +check_read $TESTPOOL $tmp_file $rs 2 0 "-d" 0 2 +check_read $TESTPOOL $tmp_file $rs 4 0 "-d" 0 4 +check_read $TESTPOOL $tmp_file $rs 8 0 "-d" 0 8 + +# 1 recordsize aligned read which spans multiple blocks at various offsets +check_read $TESTPOOL $tmp_file $((rs * 2)) 1 0 "-d" 0 2 +check_read $TESTPOOL $tmp_file $((rs * 2)) 1 1 "-d" 0 2 +check_read $TESTPOOL $tmp_file $((rs * 2)) 1 2 "-d" 0 2 +check_read $TESTPOOL $tmp_file $((rs * 2)) 1 3 "-d" 0 2 +check_read $TESTPOOL $tmp_file $((rs * 4)) 1 0 "-d" 0 4 +check_read $TESTPOOL $tmp_file $((rs * 4)) 1 1 "-d" 0 4 +check_read $TESTPOOL $tmp_file $((rs * 8)) 1 0 "-d" 0 8 + +# sub-blocksize unaligned reads which do not span blocks. 
+check_read $TESTPOOL $tmp_file $((rs / 2)) 1 0 "-d" 0 1 +check_read $TESTPOOL $tmp_file $((rs / 2)) 1 1 "-d" 0 1 +check_read $TESTPOOL $tmp_file $((rs / 2)) 1 2 "-d" 0 1 +check_read $TESTPOOL $tmp_file $((rs / 2)) 1 3 "-d" 0 1 + +# large unaligned reads which span multiple blocks +check_read $TESTPOOL $tmp_file $((rs * 2)) 1 $((rs / 2)) "-d -P" 0 3 +check_read $TESTPOOL $tmp_file $((rs * 4)) 1 $((rs / 4)) "-d -P" 0 5 + +log_pass "Verify the number direct/buffered requests for (un)aligned access" diff --git a/tests/zfs-tests/tests/functional/direct/dio_async_always.ksh b/tests/zfs-tests/tests/functional/direct/dio_async_always.ksh new file mode 100755 index 0000000000..3f26715fc3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_async_always.ksh @@ -0,0 +1,69 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify small async Direct I/O requests +# +# STRATEGY: +# 1. Use fio to issue small read/write requests. 
Writes are +# smaller than the block size and thus will be buffered, +# reads satisfy the minimum alignment and will be direct. +# + +verify_runnable "global" + +function cleanup +{ + zfs set direct=standard $TESTPOOL/$TESTFS + rm $tmp_file + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify direct=always mixed small async requests" + +log_onexit cleanup + +log_must zfs set direct=always $TESTPOOL/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +tmp_file=$mntpnt/tmp_file +page_size=$(getconf PAGESIZE) +file_size=1G +runtime=10 + +log_must truncate -s $file_size $tmp_file + +log_must fio --filename=$tmp_file --name=always-randrw \ + --rw=randwrite --bs=$page_size --size=$file_size --numjobs=1 \ + --ioengine=posixaio --fallocate=none --iodepth=4 --verify=sha1 \ + --group_reporting --minimal --runtime=$runtime --time_based + +log_pass "Verify direct=always mixed small async requests" diff --git a/tests/zfs-tests/tests/functional/direct/dio_async_fio_ioengines.ksh b/tests/zfs-tests/tests/functional/direct/dio_async_fio_ioengines.ksh new file mode 100755 index 0000000000..82d7d8250f --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_async_fio_ioengines.ksh @@ -0,0 +1,107 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022 by Triad National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/properties.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify FIO async engines work using Direct I/O. +# +# STRATEGY: +# 1. Select a FIO async ioengine +# 2. Start sequential Direct I/O and verify with buffered I/O +# 3. Start mixed Direct I/O and verify with buffered I/O +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f "$mntpnt/direct-*" + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +function check_fio_ioengine +{ + fio --ioengine=io_uring --parse-only > /dev/null 2>&1 + return $? +} + +log_assert "Verify FIO async ioengines work using Direct I/O." 
+ +log_onexit cleanup + +typeset -a async_ioengine_args=("--iodepth=4" "--iodepth=4 --thread") + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +fio_async_ioengines="posixaio" + +if is_linux; then + fio_async_ioengines+=" libaio" + if $(grep -q "CONFIG_IO_URING=y" /boot/config-$(uname -r)); then + if [ -e /etc/os-release ] ; then + source /etc/os-release + if [ $PLATFORM_ID = "platform:el9" ] ; then + log_note "io_uring disabled on RHEL 9 " \ + "variants: fails with " \ + "'Operation not permitted'" + elif $(check_fio_ioengine -eq 0); then + fio_async_ioengines+=" io_uring" + else + log_note "io_uring not supported by fio and " \ + "will not be tested" + fi + else + if $(check_fio_ioengine); then + fio_async_ioengines+=" io_uring" + + else + log_note "io_uring not supported by fio and " \ + "will not be tested" + fi + fi + else + log_note "io_uring not supported by kernel will not " \ + "be tested" + + fi +fi + +for ioengine in $fio_async_ioengines; do + for ioengine_args in "${async_ioengine_args[@]}"; do + for op in "rw" "randrw" "write"; do + log_note "Checking Direct I/O with FIO async ioengine" \ + " $ioengine with args $ioengine_args --rw=$op" + dio_and_verify $op $DIO_FILESIZE $DIO_BS $mntpnt "$ioengine" \ + "$ioengine_args" + done + done +done + +log_pass "Verfied FIO async ioengines work using Direct I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_compression.ksh b/tests/zfs-tests/tests/functional/direct/dio_compression.ksh new file mode 100755 index 0000000000..5be93d104d --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_compression.ksh @@ -0,0 +1,66 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/properties.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify compression works using Direct I/O. +# +# STRATEGY: +# 1. Select a random compression algorithm +# 2. Start sequential Direct I/O and verify with buffered I/O +# 3. Start mixed Direct I/O and verify with buffered I/O +# 4. Repeat from 2 for all compression algorithms +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f "$mntpnt/direct-*" + log_must zfs set compression=off $TESTPOOL/$TESTFS + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify compression works using Direct I/O." 
+ +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +compress_args="--buffer_compress_percentage=50" + +for comp in "${compress_prop_vals[@]:1}"; do + log_must zfs set compression=$comp $TESTPOOL/$TESTFS + for op in "rw" "randrw" "write"; do + dio_and_verify $op $DIO_FILESIZE $DIO_BS $mntpnt "sync" $compress_args + done +done + +log_pass "Verified compression works using Direct I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_dedup.ksh b/tests/zfs-tests/tests/functional/direct/dio_dedup.ksh new file mode 100755 index 0000000000..c703fcc05f --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_dedup.ksh @@ -0,0 +1,63 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/properties.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify deduplication works using Direct I/O. +# +# STRATEGY: +# 1. Enable dedup +# 2. Start sequential Direct I/O and verify with buffered I/O +# 3. 
Start mixed Direct I/O and verify with buffered I/O +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f "$mntpnt/direct-*" + log_must zfs set dedup=off $TESTPOOL/$TESTFS + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify deduplication works using Direct I/O." + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +dedup_args="--dedupe_percentage=50" + +log_must zfs set dedup=on $TESTPOOL/$TESTFS +for op in "rw" "randrw" "write"; do + dio_and_verify $op $DIO_FILESIZE $DIO_BS $mntpnt "sync" $dedup_args +done + +log_pass "Verified deduplication works using Direct I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_encryption.ksh b/tests/zfs-tests/tests/functional/direct/dio_encryption.ksh new file mode 100755 index 0000000000..843b570d2d --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_encryption.ksh @@ -0,0 +1,64 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +.
$STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify encryption works using Direct I/O. +# +# STRATEGY: +# 1. Create multidisk pool. +# 2. Start some mixed readwrite Direct I/O. +# 3. Verify the results are as expected using buffered I/O. +# + +verify_runnable "global" + +log_assert "Verify encryption works using Direct I/O." + +log_onexit dio_cleanup + +log_must truncate -s $MINVDEVSIZE $DIO_VDEVS + +create_pool $TESTPOOL1 $DIO_VDEVS +log_must eval "echo 'password' | zfs create -o encryption=on \ + -o keyformat=passphrase -o keylocation=prompt -o compression=off \ + $TESTPOOL1/$TESTFS1" + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS1) + +for bs in "4k" "128k" "1m"; do + for op in "rw" "randrw" "write"; do + dio_and_verify $op $DIO_FILESIZE $bs $mntpnt "sync" + done +done + +check_dio_write_chksum_verify_failures $TESTPOOL1 "stripe" 0 + +log_pass "Verified encryption works using Direct I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_grow_block.ksh b/tests/zfs-tests/tests/functional/direct/dio_grow_block.ksh new file mode 100755 index 0000000000..c54d079366 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_grow_block.ksh @@ -0,0 +1,87 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify the number direct/buffered requests when growing a file +# +# STRATEGY: +# + +verify_runnable "global" + +function cleanup +{ + zfs set recordsize=$rs $TESTPOOL/$TESTFS + log_must rm -f $tmp_file + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify the number direct/buffered requests when growing a file" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +tmp_file=$mntpnt/tmp_file + +rs=$(get_prop recordsize $TESTPOOL/$TESTFS) +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +# +# Verify the expected number of buffered and Direct I/O's when growing +# the first block of a file up to the maximum recordsize. +# +for bs in "8192" "16384" "32768" "65536" "131072"; do + + # When O_DIRECT is set the first write to a new file, or when the + # block size needs to be grown, it will be done as a buffered write. + check_write $TESTPOOL $tmp_file $bs 1 0 "-D" 1 0 + + # Overwriting the first block of an existing file with O_DIRECT will + # be a buffered write if less than the block size. + check_write $TESTPOOL $tmp_file 4096 1 0 "-D" 1 0 + check_write $TESTPOOL $tmp_file 4096 1 1 "-D" 1 0 + + # Overwriting the first block of an existing file with O_DIRECT will + # be a direct write as long as the block size matches. + check_write $TESTPOOL $tmp_file $bs 1 0 "-D" 0 1 + + # Evict any blocks which may be buffered before the read tests. 
+ evict_blocks $TESTPOOL $tmp_file $bs + + # Reading the first block of an existing file with O_DIRECT will + # be a direct read for part or all of the block size. + check_read $TESTPOOL $tmp_file $bs 1 0 "-d" 0 1 + check_read $TESTPOOL $tmp_file 4096 1 0 "-d" 0 1 + check_read $TESTPOOL $tmp_file 4096 1 1 "-d" 0 1 +done + +log_pass "Verify the number direct/buffered requests when growing a file" diff --git a/tests/zfs-tests/tests/functional/direct/dio_max_recordsize.ksh b/tests/zfs-tests/tests/functional/direct/dio_max_recordsize.ksh new file mode 100755 index 0000000000..87900443ed --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_max_recordsize.ksh @@ -0,0 +1,72 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022 by Triad National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify max recordsizes are supported for Direct I/O. +# +# STRATEGY: +# 1. Create a pool from each vdev type with varying recordsizes. +# 2. Start sequential Direct I/O and verify with buffered I/O. 
+# + +verify_runnable "global" + +log_assert "Verify max recordsizes are supported for Direct I/O." + +log_onexit dio_cleanup + +log_must truncate -s $MINVDEVSIZE $DIO_VDEVS + +for type in "" "mirror" "raidz" "draid"; do + for recsize in "2097152" "8388608" "16777216"; do + create_pool $TESTPOOL1 $type $DIO_VDEVS + log_must eval "zfs create \ + -o recordsize=$recsize -o compression=off \ + $TESTPOOL1/$TESTFS1" + + mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS1) + + verify_dio_write_count $TESTPOOL1 $recsize $((4 * recsize)) \ + $mntpnt + + if [[ "$type" == "" ]]; then + check_dio_write_chksum_verify_failures $TESTPOOL1 \ + "stripe" 0 + else + check_dio_write_chksum_verify_failures $TESTPOOL1 \ + "$type" 0 + fi + + destroy_pool $TESTPOOL1 + done +done + +log_pass "Verified max recordsizes are supported for Direct I/O." diff --git a/tests/zfs-tests/tests/functional/direct/dio_mixed.ksh b/tests/zfs-tests/tests/functional/direct/dio_mixed.ksh new file mode 100755 index 0000000000..38c6159537 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_mixed.ksh @@ -0,0 +1,108 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/properties.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify mixed buffered and Direct I/O are coherent. +# +# STRATEGY: +# 1. Verify interleaved buffered and Direct I/O +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f $src_file $new_file $tmp_file + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify mixed buffered and Direct I/O are coherent." + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +src_file=$mntpnt/src_file +new_file=$mntpnt/new_file +tmp_file=$mntpnt/tmp_file +page_size=$(getconf PAGESIZE) +file_size=1048576 + +log_must stride_dd -i /dev/urandom -o $src_file -b $file_size -c 1 + +# +# Using mixed input and output block sizes verify that buffered and +# Direct I/O can be interleaved and the result with always be coherent. +# +for ibs in "512" "$page_size" "131072"; do + for obs in "512" "$page_size" "131072"; do + iblocks=$(($file_size / $ibs)) + oblocks=$(($file_size / $obs)) + iflags="" + oflags="" + + # Only allow Direct I/O when it is at least page sized. + if [[ $ibs -ge $page_size ]]; then + iflags="-d" + fi + + if [[ $obs -ge $page_size ]]; then + oflags="-D" + fi + + # Verify buffered write followed by a direct read. + log_must stride_dd -i $src_file -o $new_file -b $obs \ + -c $oblocks + log_must stride_dd -i $new_file -o $tmp_file -b $ibs \ + -c $iblocks $iflags + log_must cmp_md5s $new_file $tmp_file + log_must rm -f $new_file $tmp_file + + # Verify direct write followed by a buffered read. 
+ log_must stride_dd -i $src_file -o $new_file -b $obs \ + -c $oblocks $oflags + log_must stride_dd -i $new_file -o $tmp_file -b $ibs \ + -c $iblocks + log_must cmp_md5s $new_file $tmp_file + log_must rm -f $new_file $tmp_file + + # Verify direct write followed by a direct read. + log_must stride_dd -i $src_file -o $new_file -b $obs \ + -c $oblocks $oflags + log_must stride_dd -i $new_file -o $tmp_file -b $ibs \ + -c $iblocks $iflags + log_must cmp_md5s $new_file $tmp_file + log_must rm -f $new_file $tmp_file + done +done + +log_pass "Verify mixed buffered and Direct I/O are coherent." diff --git a/tests/zfs-tests/tests/functional/direct/dio_mmap.ksh b/tests/zfs-tests/tests/functional/direct/dio_mmap.ksh new file mode 100755 index 0000000000..27d03e0412 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_mmap.ksh @@ -0,0 +1,93 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify mixed Direct I/O and mmap I/O. +# +# STRATEGY: +# 1. 
Create an empty file. +# 2. Start a background Direct I/O random read/write fio to the +# file. +# 3. Start a background mmap random read/write fio to the file. +# + +verify_runnable "global" + +function cleanup +{ + zfs set recordsize=$rs $TESTPOOL/$TESTFS + log_must rm -f "$tmp_file" + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify mixed Direct I/O and mmap I/O" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +tmp_file=$mntpnt/file +bs=$((128 * 1024)) +blocks=64 +size=$((bs * blocks)) +runtime=60 + +rs=$(get_prop recordsize $TESTPOOL/$TESTFS) +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +log_must stride_dd -i /dev/zero -o $tmp_file -b $bs -c $blocks + +# Direct I/O writes +log_must eval "fio --filename=$tmp_file --name=direct-write \ + --rw=randwrite --size=$size --bs=$bs --direct=1 --numjobs=1 \ + --ioengine=sync --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +# Direct I/O reads +log_must eval "fio --filename=$tmp_file --name=direct-read \ + --rw=randread --size=$size --bs=$bs --direct=1 --numjobs=1 \ + --ioengine=sync --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +# mmap I/O writes +log_must eval "fio --filename=$tmp_file --name=mmap-write \ + --rw=randwrite --size=$size --bs=$bs --numjobs=1 \ + --ioengine=mmap --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +# mmap I/O reads +log_must eval "fio --filename=$tmp_file --name=mmap-read \ + --rw=randread --size=$size --bs=$bs --numjobs=1 \ + --ioengine=mmap --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +wait + +log_pass "Verfied mixed Direct I/O and mmap I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_overwrites.ksh b/tests/zfs-tests/tests/functional/direct/dio_overwrites.ksh new file mode 100755 index 
0000000000..3854766ed8 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_overwrites.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Triad National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify Direct I/O overwrite. +# +# STRATEGY: +# 1. Create an empty file. +# 2. Start a Direct I/O random write fio to the file. 
+# + +verify_runnable "global" + +function cleanup +{ + zfs set recordsize=$rs $TESTPOOL/$TESTFS + log_must rm -f "$tmp_file" + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify Direct I/O overwrites" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +tmp_file=$mntpnt/file +bs=$((128 * 1024)) +blocks=64 +size=$((bs * blocks)) +runtime=60 + +rs=$(get_prop recordsize $TESTPOOL/$TESTFS) +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +log_must stride_dd -i /dev/zero -o $tmp_file -b $bs -c $blocks + +# Direct I/O overwrites +log_must eval "fio --filename=$tmp_file --name=direct-write \ + --rw=randwrite --size=$size --bs=$bs --direct=1 --numjobs=1 \ + --ioengine=sync --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap" + +log_pass "Verfied Direct I/O overwrites" diff --git a/tests/zfs-tests/tests/functional/direct/dio_property.ksh b/tests/zfs-tests/tests/functional/direct/dio_property.ksh new file mode 100755 index 0000000000..4fbcfec068 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_property.ksh @@ -0,0 +1,126 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify the direct=always|disabled|standard property +# +# STRATEGY: +# 1. Verify direct=always behavior +# 2. Verify direct=disabled behavior +# 3. Verify direct=standard behavior +# + +verify_runnable "global" + +function cleanup +{ + zfs set direct=standard $TESTPOOL/$TESTFS + log_must rm -f $tmp_file + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify the direct=always|disabled|standard property" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +rs=$(get_prop recordsize $TESTPOOL/$TESTFS) + +tmp_file=$mntpnt/tmp_file +page_size=$(getconf PAGESIZE) +file_size=1048576 +count=8 + +# +# Check when "direct=always" any aligned IO is done as direct. +# Note that "flag=direct" is not set in the following calls to dd(1). 
+# +log_must zfs set direct=always $TESTPOOL/$TESTFS + +log_note "Aligned writes (buffered, then all direct)" +check_write $TESTPOOL $tmp_file $rs $count 0 "" 1 $((count - 1)) + +log_note "Aligned overwrites" +check_write $TESTPOOL $tmp_file $rs $count 0 "" 0 $count + +log_note "Sub-recordsize unaligned overwrites" +check_write $TESTPOOL $tmp_file $((rs / 2)) $((2 * count)) 0 "" $((2 * count)) 0 + +log_note "Sub-page size aligned overwrites" +check_write $TESTPOOL $tmp_file 512 $count 0 "" $count 0 +evict_blocks $TESTPOOL $tmp_file $file_size + +log_note "Aligned reads" +check_read $TESTPOOL $tmp_file $rs $count 0 "" 0 $count + +log_note "Sub-recordsize unaligned reads" +check_read $TESTPOOL $tmp_file $((rs / 2)) $((count * 2)) 0 "" 0 $((2 * count)) + +log_note "Sub-page size aligned reads (one read then ARC hits)" +check_read $TESTPOOL $tmp_file 512 $count 0 "" 1 0 + +log_must rm -f $tmp_file + + +# +# Check when "direct=disabled" there are never any direct requests. +# Note that "flag=direct" is always set in the following calls to dd(1). +# +log_must zfs set direct=disabled $TESTPOOL/$TESTFS + +log_note "Aligned writes (all buffered with an extra for create)" +check_write $TESTPOOL $tmp_file $rs $count 0 "-D" $count 0 + +log_note "Aligned overwrites" +check_write $TESTPOOL $tmp_file $rs $count 0 "-D" $count 0 + +log_note "Aligned reads (all ARC hits)" +check_read $TESTPOOL $tmp_file $rs $count 0 "-d" 0 0 + +log_must rm -f $tmp_file + + +# +# Check when "direct=standard" only requested Direct I/O occur. 
+# +log_must zfs set direct=standard $TESTPOOL/$TESTFS + +log_note "Aligned writes/overwrites (buffered / direct)" +check_write $TESTPOOL $tmp_file $rs $count 0 "" $count 0 +check_write $TESTPOOL $tmp_file $rs $count 0 "-D" 0 $count + +log_note "Aligned reads (buffered / direct)" +evict_blocks $TESTPOOL $tmp_file $file_size +check_read $TESTPOOL $tmp_file $rs $count 0 "" $count 0 +evict_blocks $TESTPOOL $tmp_file $file_size +check_read $TESTPOOL $tmp_file $rs $count 0 "-d" 0 $count + +log_pass "Verify the direct=always|disabled|standard property" diff --git a/tests/zfs-tests/tests/functional/direct/dio_random.ksh b/tests/zfs-tests/tests/functional/direct/dio_random.ksh new file mode 100755 index 0000000000..42c18d4261 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_random.ksh @@ -0,0 +1,83 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify mixed Direct I/O and buffered I/O. 
A workload of random +# but correctly aligned direct read/writes is mixed with a +# concurrent workload of entirely unaligned buffered read/writes. +# +# STRATEGY: +# 1. Create an empty file. +# 2. Start a background fio randomly issuing direct read/writes. +# 3. Start a background fio randomly issuing buffered read/writes. +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f "$tmp_file" + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify randomly sized mixed Direct I/O and buffered I/O" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +tmp_file=$mntpnt/file +bs=$((1024 * 1024)) +blocks=32 +size=$((bs * blocks)) +runtime=10 +page_size=$(getconf PAGESIZE) + +log_must stride_dd -i /dev/zero -o $tmp_file -b $bs -c $blocks + +# Direct random read/write page-aligned IO of varying sizes with +# occasional calls to fsync(2), mixed with... +log_must eval "fio --filename=$tmp_file --name=direct-rwrand \ + --rw=randrw --size=$size --offset_align=$(getconf PAGESIZE) \ + --bsrange=$page_size-1m --direct=1 --fsync=32 --numjobs=2 \ + --ioengine=sync --fallocate=none --verify=sha1 \ + --group_reporting --minimal --runtime=$runtime --time_based &" + +# Buffered random read/write entirely unaligned IO of varying sizes +# occasional calls to fsync(2). 
+log_must eval "fio --filename=$tmp_file --name=buffered-write \ + --rw=randrw --size=$size --offset_align=512 --bs_unaligned=1 \ + --bsrange=$page_size-1m --direct=0 --fsync=32 --numjobs=2 \ + --ioengine=sync --fallocate=none --verify=sha1 \ + --group_reporting --minimal --runtime=$runtime --time_based &" + +wait + +log_pass "Verfied randomly sized mixed Direct I/O and buffered I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_recordsize.ksh b/tests/zfs-tests/tests/functional/direct/dio_recordsize.ksh new file mode 100755 index 0000000000..e1087e5ac3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_recordsize.ksh @@ -0,0 +1,76 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify different recordsizes are supported for Direct I/O. +# +# STRATEGY: +# 1. Create a pool from each vdev type with varying recordsizes. +# 2. Start sequential Direct I/O and verify with buffered I/O. +# 3. Start mixed Direct I/O and verify with buffered I/O. 
+# + +verify_runnable "global" + +log_assert "Verify different recordsizes are supported for Direct I/O." + +log_onexit dio_cleanup + +log_must truncate -s $MINVDEVSIZE $DIO_VDEVS + +for type in "" "mirror" "raidz" "draid"; do + for recsize in "1024" "4096" "128k"; do + create_pool $TESTPOOL1 $type $DIO_VDEVS + log_must eval "zfs create \ + -o recordsize=$recsize -o compression=off \ + $TESTPOOL1/$TESTFS1" + + mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS1) + + for bs in "4k" "128k"; do + for op in "rw" "randrw" "write"; do + dio_and_verify $op $DIO_FILESIZE $bs $mntpnt "sync" + done + done + + if [[ "$type" == "" ]]; then + check_dio_write_chksum_verify_failures $TESTPOOL1 \ + "stripe" 0 + else + check_dio_write_chksum_verify_failures $TESTPOOL1 \ + "$type" 0 + fi + + destroy_pool $TESTPOOL1 + done +done + +log_pass "Verified different recordsizes are supported for Direct I/O." diff --git a/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh b/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh new file mode 100755 index 0000000000..9f50187149 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh @@ -0,0 +1,79 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify failure for (un)aligned O_DIRECT +# +# STRATEGY: +# 1. Create a multi-block file +# 2. Perform (un)aligned write/read verify the result. +# + +verify_runnable "global" + +function cleanup +{ + zfs set recordsize=$rs $TESTPOOL/$TESTFS + zfs set direct=standard $TESTPOOL/$TESTFS + log_must rm -f $tmp_file + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_onexit cleanup + +log_assert "Verify direct requests for (un)aligned access" + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +rs=$(get_prop recordsize $TESTPOOL/$TESTFS) +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +tmp_file=$mntpnt/tmp_file +file_size=$((rs * 8)) + +log_must stride_dd -i /dev/urandom -o $tmp_file -b $file_size -c 1 + +log_must zfs set direct=standard $TESTPOOL/$TESTFS +# sub-pagesize direct writes/read will always fail if direct=standard. +log_mustnot stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8 -D +log_mustnot stride_dd -i $tmp_file -o /dev/null -b 512 -c 8 -d + +log_must zfs set direct=always $TESTPOOL/$TESTFS +# sub-pagesize direct writes/read will always pass if direct=always. +log_must stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8 +log_must stride_dd -i $tmp_file -o /dev/null -b 512 -c 8 + +log_must zfs set direct=disabled $TESTPOOL/$TESTFS +# sub-pagesize direct writes/read will always pass if direct=disabled. 
+log_must stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8 -D +log_must stride_dd -i $tmp_file -o /dev/null -b 512 -c 8 -d + +log_pass "Verify direct requests for (un)aligned access" diff --git a/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh b/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh new file mode 100755 index 0000000000..571767d3b1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh @@ -0,0 +1,92 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022 by Triad National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify Direct I/O reads can read an entire file that is not +# page-aligned in length. When a file is not page-aligned in total +# length, as much as can be read using O_DIRECT is done so and +# the rest is read using the ARC. O_DIRECT requires page-size alignment. +# +# STRATEGY: +# 1. Write a file that is page-aligned (buffered) +# 2. Truncate the file to be 512 bytes less +# 3. Export then import the Zpool flushing out the ARC +# 4.
Read back the file using O_DIRECT +# 5. Verify the file is read back with both Direct I/O and buffered I/O +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f "$filename" + log_must zfs set recordsize=$rs $TESTPOOL/$TESTFS + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify Direct I/O reads can read an entire file that is not \ + page-aligned" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +rs=$(get_prop recordsize $TESTPOOL/$TESTFS) +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +bs=$((128 * 1024)) # bs=recordsize (128k) +filename="$mntpnt/testfile.iso" + +log_must stride_dd -i /dev/urandom -o $filename -b $bs -c 2 +# Truncating file so the total length is no longer page-size aligned +log_must do_truncate_reduce $filename 512 + +# Exporting the Zpool to make sure all future reads happen from the ARC +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL + +# Reading the file back using Direct I/O +prev_dio_read=$(get_iostats_stat $TESTPOOL direct_read_count) +prev_arc_read=$(get_iostats_stat $TESTPOOL arc_read_count) +log_must stride_dd -i $filename -o /dev/null -b $bs -e -d +curr_dio_read=$(get_iostats_stat $TESTPOOL direct_read_count) +curr_arc_read=$(get_iostats_stat $TESTPOOL arc_read_count) +total_dio_read=$((curr_dio_read - prev_dio_read)) +total_arc_read=$((curr_arc_read - prev_arc_read)) + +# We should see both Direct I/O reads and an ARC read to read the entire file +# that is not page-size aligned +if [[ $total_dio_read -lt 2 ]] || [[ $total_arc_read -lt 1 ]]; then + log_fail "Expect 2 reads from Direct I/O and 1 from the ARC but \ + Direct I/O: $total_dio_read ARC: $total_arc_read" +fi + +log_pass "Verified Direct I/O read can read a non-page-aligned length file" diff --git a/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh b/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh new file mode 100755 index 0000000000..5a5a5cf7ad ---
/dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh @@ -0,0 +1,103 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022 by Triad National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify stable pages work for O_DIRECT writes. +# +# STRATEGY: +# 1. Start a Direct I/O write workload while manipulating the user +# buffer. +# 2. Verify we can Read the contents of the file using buffered reads. +# 3. Verify there is no checksum errors reported from zpool status. +# 4. Repeat steps 1 and 2 for 3 iterations. +# 5. Repeat 1-3 but with compression disabled. +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f "$mntpnt/direct-write.iso" + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify stable pages work for Direct I/O writes." 
+ +if is_linux; then + log_unsupported "Linux does not support stable pages for O_DIRECT \ + writes" +fi + +log_onexit cleanup + +ITERATIONS=3 +NUMBLOCKS=300 +BS=$((128 * 1024)) #128k +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +for compress in "on" "off"; +do + log_must zfs set compression=$compress $TESTPOOL/$TESTFS + + for i in $(seq 1 $ITERATIONS); do + log_note "Verifying stable pages for Direct I/O writes \ + iteration $i of $ITERATIONS" + + prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + + # Manipulate the user's buffer while running O_DIRECT write + # workload with the buffer. + log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \ + -n $NUMBLOCKS -b $BS + + # Reading back the contents of the file + log_must stride_dd -i $mntpnt/direct-write.iso -o /dev/null \ + -b $BS -c $NUMBLOCKS + + curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + total_dio_wr=$((curr_dio_wr - prev_dio_wr)) + + log_note "Making sure we have Direct I/O writes logged" + if [[ $total_dio_wr -lt 1 ]]; then + log_fail "No Direct I/O writes $total_dio_wr" + fi + + # Making sure there are no data errors for the zpool + log_note "Making sure there are no checksum errors with the ZPool" + log_must check_pool_status $TESTPOOL "errors" \ + "No known data errors" + + log_must rm -f "$mntpnt/direct-write.iso" + done +done + +log_pass "Verified stable pages work for Direct I/O writes." diff --git a/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh b/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh new file mode 100755 index 0000000000..a7e9dc0cde --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh @@ -0,0 +1,222 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022 by Triad National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify checksum verify works for Direct I/O writes. +# +# STRATEGY: +# 1. Set the module parameter zfs_vdev_direct_write_verify_pct to 30. +# 2. Check that manipulating the user buffer while Direct I/O writes are +# taking place does not cause any panics with compression turned on. +# 3. Start a Direct I/O write workload while manipulating the user buffer +# without compression. +# 4. Verify there are Direct I/O write verify failures using +# zpool status -d and checking for zevents. We also make sure there +# are reported data errors when reading the file back. +# 5. Repeat steps 3 and 4 for 3 iterations. +# 6. Set zfs_vdev_direct_write_verify_pct set to 1 and repeat 3. +# 7. Verify there are Direct I/O write verify failures using +# zpool status -d and checking for zevents. We also make sure there +# there are no reported data errors when reading the file back because +# with us checking every Direct I/O write and on checksum validation +# failure those writes will not be committed to a VDEV. 
+# + +verify_runnable "global" + +function cleanup +{ + # Clearing out DIO counts for Zpool + log_must zpool clear $TESTPOOL + # Clearing out dio_verify from event logs + log_must zpool events -c + log_must set_tunable32 VDEV_DIRECT_WR_VERIFY_PCT 2 +} + +log_assert "Verify checksum verify works for Direct I/O writes." + +if is_freebsd; then + log_unsupported "FeeBSD is capable of stable pages for O_DIRECT writes" +fi + +log_onexit cleanup + +ITERATIONS=3 +NUMBLOCKS=300 +VERIFY_PCT=30 +BS=$((128 * 1024)) # 128k +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +# Get a list of vdevs in our pool +set -A array $(get_disklist_fullpath $TESTPOOL) + +# Get the first vdev +firstvdev=${array[0]} + +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS +log_must set_tunable32 VDEV_DIRECT_WR_VERIFY_PCT $VERIFY_PCT + +# First we will verify there are no panics while manipulating the contents of +# the user buffer during Direct I/O writes with compression. The contents +# will always be copied out of the ABD and there should never be any ABD ASSERT +# failures +log_note "Verifying no panics for Direct I/O writes with compression" +log_must zfs set compression=on $TESTPOOL/$TESTFS +prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) +log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" -n $NUMBLOCKS \ + -b $BS +curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) +total_dio_wr=$((curr_dio_wr - prev_dio_wr)) + +log_note "Making sure we have Direct I/O writes logged" +if [[ $total_dio_wr -lt 1 ]]; then + log_fail "No Direct I/O writes $total_dio_wr" +fi + +log_must rm -f "$mntpnt/direct-write.iso" +# Clearing out DIO counts for Zpool +log_must zpool clear $TESTPOOL +# Clearing out dio_verify from event logs +log_must zpool events -c + + + +# Next we will verify there are checksum errors for Direct I/O writes while +# manipulating the contents of the user pages. 
+log_must zfs set compression=off $TESTPOOL/$TESTFS + +for i in $(seq 1 $ITERATIONS); do + log_note "Verifying 30% of Direct I/O write checksums iteration \ + $i of $ITERATIONS with \ + zfs_vdev_direct_write_verify_pct=$VERIFY_PCT" + + prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + prev_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) + log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \ + -n $NUMBLOCKS -b $BS + + # Reading file back to verify checksum errors + filesize=$(get_file_size "$mntpnt/direct-write.iso") + num_blocks=$((filesize / BS)) + log_mustnot stride_dd -i "$mntpnt/direct-write.iso" -o /dev/null -b $BS \ + -c $num_blocks + + # Getting new Direct I/O and ARC write counts. + curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + curr_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) + total_dio_wr=$((curr_dio_wr - prev_dio_wr)) + total_arc_wr=$((curr_arc_wr - prev_arc_wr)) + + # Verifying there are checksum errors + log_note "Making sure there are checksum errors for the ZPool" + cksum=$(zpool status -P -v $TESTPOOL | awk -v v="$firstvdev" '$0 ~ v \ + {print $5}') + if [[ $cksum -eq 0 ]]; then + zpool status -P -v $TESTPOOL + log_fail "No checksum failures for ZPool $TESTPOOL" + fi + + # Getting checksum verify failures + verify_failures=$(get_zpool_status_chksum_verify_failures $TESTPOOL "raidz") + + log_note "Making sure we have Direct I/O writes logged" + if [[ $total_dio_wr -lt 1 ]]; then + log_fail "No Direct I/O writes $total_dio_wr" + fi + log_note "Making sure we have Direct I/O write checksum verifies with ZPool" + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 1 + + # In the event of checksum verify error, the write will be redirected + # through the ARC. We check here that we have ARC writes. 
+ log_note "Making sure we have ARC writes have taken place in the event \ + a Direct I/O checksum verify failures occurred" + if [[ $total_arc_wr -lt $verify_failures ]]; then + log_fail "ARC writes $total_arc_wr < $verify_failures" + fi + + log_must rm -f "$mntpnt/direct-write.iso" +done + +log_must zpool status -v $TESTPOOL +log_must zpool sync $TESTPOOL + +# Finally we will verfiy that with checking every Direct I/O write we have no +# errors at all. +VERIFY_PCT=100 +log_must set_tunable32 VDEV_DIRECT_WR_VERIFY_PCT $VERIFY_PCT + +for i in $(seq 1 $ITERATIONS); do + log_note "Verifying every Direct I/O write checksums iteration $i of \ + $ITERATIONS with zfs_vdev_direct_write_verify_pct=$VERIFY_PCT" + + prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + prev_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) + log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \ + -n $NUMBLOCKS -b $BS + + # Reading file back to verify there no are checksum errors + filesize=$(get_file_size "$mntpnt/direct-write.iso") + num_blocks=$((filesize / BS)) + log_must stride_dd -i "$mntpnt/direct-write.iso" -o /dev/null -b $BS \ + -c $num_blocks + + # Getting new Direct I/O and ARC Write counts. 
+ curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + curr_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) + total_dio_wr=$((curr_dio_wr - prev_dio_wr)) + total_arc_wr=$((curr_arc_wr - prev_arc_wr)) + + log_note "Making sure there are no checksum errors with the ZPool" + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + + # Geting checksum verify failures + verify_failures=$(get_zpool_status_chksum_verify_failures $TESTPOOL "raidz") + + log_note "Making sure we have Direct I/O writes logged" + if [[ $total_dio_wr -lt 1 ]]; then + log_fail "No Direct I/O writes $total_dio_wr" + fi + + log_note "Making sure we have Direct I/O write checksum verifies with ZPool" + check_dio_write_chksum_verify_failures "$TESTPOOL" "raidz" 1 + + # In the event of checksum verify error, the write will be redirected + # through the ARC. We check here that we have ARC writes. + log_note "Making sure we have ARC writes have taken place in the event \ + a Direct I/O checksum verify failures occurred" + if [[ $total_arc_wr -lt $verify_failures ]]; then + log_fail "ARC writes $total_arc_wr < $verify_failures" + fi + + log_must rm -f "$mntpnt/direct-write.iso" +done + +log_pass "Verified checksum verify works for Direct I/O writes." diff --git a/tests/zfs-tests/tests/functional/direct/setup.ksh b/tests/zfs-tests/tests/functional/direct/setup.ksh new file mode 100755 index 0000000000..5ce95dddf4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/setup.ksh @@ -0,0 +1,32 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +verify_runnable "global" + +default_raidz_setup_noexit "$DISKS" +log_must zfs set compression=off $TESTPOOL/$TESTFS +log_pass diff --git a/tests/zfs-tests/tests/functional/io/setup.ksh b/tests/zfs-tests/tests/functional/io/setup.ksh index 82aaf5bc91..29d2671158 100755 --- a/tests/zfs-tests/tests/functional/io/setup.ksh +++ b/tests/zfs-tests/tests/functional/io/setup.ksh @@ -27,5 +27,5 @@ . $STF_SUITE/include/libtest.shlib verify_runnable "global" -default_setup "$DISKS" +default_raidz_setup "$DISKS" log_must zfs set compression=on $TESTPOOL/$TESTFS diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc.cfg b/tests/zfs-tests/tests/functional/l2arc/l2arc.cfg index 0302392f4c..f79123e5b2 100644 --- a/tests/zfs-tests/tests/functional/l2arc/l2arc.cfg +++ b/tests/zfs-tests/tests/functional/l2arc/l2arc.cfg @@ -35,4 +35,4 @@ export PERF_COMPPERCENT=66 export PERF_COMPCHUNK=0 export BLOCKSIZE=128K export SYNC_TYPE=0 -export DIRECT=1 +export DIRECT=0 diff --git a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib index 26e7c2cc25..80badd2733 100644 --- a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib +++ b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib @@ -155,13 +155,6 @@ function cleanup_pool fi } -function cmp_md5s { - typeset file1=$1 - typeset file2=$2 - - [ "$(md5digest $file1)" = "$(md5digest $file2)" ] -} - # # Detect if the given two filesystems have same sub-datasets # diff --git 
a/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh index 8f3585a599..deb963f258 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh @@ -77,6 +77,14 @@ log_must zfs create $TESTPOOL/$TESTFS log_must dd if=/dev/zero of=/$TESTPOOL/$TESTFS/sync \ conv=fdatasync,fsync bs=1 count=1 +# +# Create a small file for the O_DIRECT test before freezing the pool. This +# allows us to overwrite it after the pool is frozen and avoid the case +# where O_DIRECT is disabled because the first block must be grown. +# +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/direct \ + oflag=sync,direct bs=4k count=1 + # # 2. Freeze TESTFS # @@ -140,6 +148,10 @@ log_must truncate -s 0 /$TESTPOOL/$TESTFS/truncated_file log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/large \ oflag=sync bs=128k count=64 +# TX_WRITE (O_DIRECT) +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/direct \ + oflag=sync,direct bs=4k count=1 + # Write zeros, which compress to holes, in the middle of a file log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/holes.1 \ oflag=sync bs=128k count=8 diff --git a/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh index a93d0b3cc8..62563e0dd4 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh @@ -77,7 +77,7 @@ export PERF_COMPCHUNK=0 export RUNTIME=30 export BLOCKSIZE=128K export SYNC_TYPE=0 -export DIRECT=1 +export DIRECT=0 # Write to the pool. log_must fio $FIO_SCRIPTS/mkfiles.fio From 7c8b7fe0f35daae05fbc088f5ace8824dd316493 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Wed, 19 Jun 2024 17:05:04 -0600 Subject: [PATCH 2/7] Fixing race condition with rangelocks There existed a race condition between when a Direct I/O write could complete and if a sync operation was issued. 
This was due to the fact that a Direct I/O would sleep waiting on previous TXG's to sync out their dirty records assosciated with a dbuf if there was an ARC buffer associated with the dbuf. This was necessay to safely destroy the ARC buffer in case previous dirty records dr_data as pointed at that the db_buf. The main issue with this approach is a Direct I/o write holds the rangelock across the entire block, so when a sync on that same block was issued and tried to grab the rangelock as reader, it would be blocked indefinitely because the Direct I/O that was now sleeping was holding that same rangelock as writer. This led to a complete deadlock. This commit fixes this issue and removes the wait in dmu_write_direct_done(). The way this is now handled is the ARC buffer is destroyed, if there an associated one with dbuf, before ever issuing the Direct I/O write. This implemenation heavily borrows from the block cloning implementation. A new function dmu_buf_wil_clone_or_dio() is called in both dmu_write_direct() and dmu_brt_clone() that does the following: 1. Undirties a dirty record for that db if there one currently associated with the current TXG. 2. Destroys the ARC buffer if the previous dirty record dr_data does not point at the dbufs ARC buffer (db_buf). 3. Sets the dbufs data pointers to NULL. 4. Redirties the dbuf using db_state = DB_NOFILL. As part of this commit, the dmu_write_direct_done() function was also cleaned up. Now dmu_sync_done() is called before undirtying the dbuf dirty record associated with a failed Direct I/O write. This is correct logic and how it always should have been. The additional benefits of these modifications is there is no longer a stall in a Direct I/O write if the user is mixing bufferd and O_DIRECT together. Also it unifies the block cloning and Direct I/O write path as they both need to call dbuf_fix_old_data() before destroying the ARC buffer. As part of this commit, there is also just general code cleanup. 
Various dbuf stats were removed because they are not necesary any longer. Additionally, useless functions were removed to make the code paths cleaner for Direct I/O. Below is the race condtion stack trace that was being consistently observed in the CI runs for the dio_random test case that prompted these changes: trace: [ 7795.294473] sd 0:0:0:0: [sda] Synchronizing SCSI cache [ 9954.769075] INFO: task z_wr_int:1051869 blocked for more than 120 seconds. [ 9954.770512] Tainted: P OE -------- - - 4.18.0-553.5.1.el8_10.x86_64 #1 [ 9954.772159] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 9954.773848] task:z_wr_int state:D stack:0 pid:1051869 ppid:2 flags:0x80004080 [ 9954.775512] Call Trace: [ 9954.776406] __schedule+0x2d1/0x870 [ 9954.777386] ? free_one_page+0x204/0x530 [ 9954.778466] schedule+0x55/0xf0 [ 9954.779355] cv_wait_common+0x16d/0x280 [spl] [ 9954.780491] ? finish_wait+0x80/0x80 [ 9954.781450] dmu_buf_direct_mixed_io_wait+0x84/0x1a0 [zfs] [ 9954.782889] dmu_write_direct_done+0x90/0x3b0 [zfs] [ 9954.784255] zio_done+0x373/0x1d50 [zfs] [ 9954.785410] zio_execute+0xee/0x210 [zfs] [ 9954.786588] taskq_thread+0x205/0x3f0 [spl] [ 9954.787673] ? wake_up_q+0x60/0x60 [ 9954.788571] ? zio_execute_stack_check.constprop.1+0x10/0x10 [zfs] [ 9954.790079] ? taskq_lowest_id+0xc0/0xc0 [spl] [ 9954.791199] kthread+0x134/0x150 [ 9954.792082] ? set_kthread_struct+0x50/0x50 [ 9954.793189] ret_from_fork+0x35/0x40 [ 9954.794108] INFO: task txg_sync:1051894 blocked for more than 120 seconds. [ 9954.795535] Tainted: P OE -------- - - 4.18.0-553.5.1.el8_10.x86_64 #1 [ 9954.797103] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 9954.798669] task:txg_sync state:D stack:0 pid:1051894 ppid:2 flags:0x80004080 [ 9954.800267] Call Trace: [ 9954.801096] __schedule+0x2d1/0x870 [ 9954.801972] ? __wake_up_common+0x7a/0x190 [ 9954.802963] schedule+0x55/0xf0 [ 9954.803884] schedule_timeout+0x19f/0x320 [ 9954.804837] ? 
__next_timer_interrupt+0xf0/0xf0 [ 9954.805932] ? taskq_dispatch+0xab/0x280 [spl] [ 9954.806959] io_schedule_timeout+0x19/0x40 [ 9954.807989] __cv_timedwait_common+0x19e/0x2c0 [spl] [ 9954.809110] ? finish_wait+0x80/0x80 [ 9954.810068] __cv_timedwait_io+0x15/0x20 [spl] [ 9954.811103] zio_wait+0x1ad/0x4f0 [zfs] [ 9954.812255] dsl_pool_sync+0xcb/0x6c0 [zfs] [ 9954.813442] ? spa_errlog_sync+0x2f0/0x3d0 [zfs] [ 9954.814648] spa_sync_iterate_to_convergence+0xcb/0x310 [zfs] [ 9954.816023] spa_sync+0x362/0x8f0 [zfs] [ 9954.817110] txg_sync_thread+0x27a/0x3b0 [zfs] [ 9954.818267] ? txg_dispatch_callbacks+0xf0/0xf0 [zfs] [ 9954.819510] ? spl_assert.constprop.0+0x20/0x20 [spl] [ 9954.820643] thread_generic_wrapper+0x63/0x90 [spl] [ 9954.821709] kthread+0x134/0x150 [ 9954.822590] ? set_kthread_struct+0x50/0x50 [ 9954.823584] ret_from_fork+0x35/0x40 [ 9954.824444] INFO: task fio:1055501 blocked for more than 120 seconds. [ 9954.825781] Tainted: P OE -------- - - 4.18.0-553.5.1.el8_10.x86_64 #1 [ 9954.827315] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 9954.828871] task:fio state:D stack:0 pid:1055501 ppid:1055490 flags:0x00004080 [ 9954.830463] Call Trace: [ 9954.831280] __schedule+0x2d1/0x870 [ 9954.832159] ? dbuf_hold_copy+0xec/0x230 [zfs] [ 9954.833396] schedule+0x55/0xf0 [ 9954.834286] cv_wait_common+0x16d/0x280 [spl] [ 9954.835291] ? finish_wait+0x80/0x80 [ 9954.836235] zfs_rangelock_enter_reader+0xa1/0x1f0 [zfs] [ 9954.837543] zfs_rangelock_enter_impl+0xbf/0x1b0 [zfs] [ 9954.838838] zfs_get_data+0x566/0x810 [zfs] [ 9954.840034] zil_lwb_commit+0x194/0x3f0 [zfs] [ 9954.841154] zil_lwb_write_issue+0x68/0xb90 [zfs] [ 9954.842367] ? __list_add+0x12/0x30 [zfs] [ 9954.843496] ? __raw_spin_unlock+0x5/0x10 [zfs] [ 9954.844665] ? 
zil_alloc_lwb+0x217/0x360 [zfs] [ 9954.845852] zil_commit_waiter_timeout+0x1f3/0x570 [zfs] [ 9954.847203] zil_commit_waiter+0x1d2/0x3b0 [zfs] [ 9954.848380] zil_commit_impl+0x6d/0xd0 [zfs] [ 9954.849550] zfs_fsync+0x66/0x90 [zfs] [ 9954.850640] zpl_fsync+0xe5/0x140 [zfs] [ 9954.851729] do_fsync+0x38/0x70 [ 9954.852585] __x64_sys_fsync+0x10/0x20 [ 9954.853486] do_syscall_64+0x5b/0x1b0 [ 9954.854416] entry_SYSCALL_64_after_hwframe+0x61/0xc6 [ 9954.855466] RIP: 0033:0x7eff236bb057 [ 9954.856388] Code: Unable to access opcode bytes at RIP 0x7eff236bb02d. [ 9954.857651] RSP: 002b:00007ffffb8e5320 EFLAGS: 00000293 ORIG_RAX: 000000000000004a [ 9954.859141] RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00007eff236bb057 [ 9954.860496] RDX: 0000000000000000 RSI: 000055e4d1f13ac0 RDI: 0000000000000006 [ 9954.861945] RBP: 00007efeb8ed8000 R08: 0000000000000000 R09: 0000000000000000 [ 9954.863327] R10: 0000000000056000 R11: 0000000000000293 R12: 0000000000000003 [ 9954.864765] R13: 000055e4d1f13ac0 R14: 0000000000000000 R15: 000055e4d1f13ae8 [ 9954.866149] INFO: task fio:1055502 blocked for more than 120 seconds. [ 9954.867490] Tainted: P OE -------- - - 4.18.0-553.5.1.el8_10.x86_64 #1 [ 9954.869029] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 9954.870571] task:fio state:D stack:0 pid:1055502 ppid:1055490 flags:0x00004080 [ 9954.872162] Call Trace: [ 9954.872947] __schedule+0x2d1/0x870 [ 9954.873844] schedule+0x55/0xf0 [ 9954.874716] schedule_timeout+0x19f/0x320 [ 9954.875645] ? __next_timer_interrupt+0xf0/0xf0 [ 9954.876722] io_schedule_timeout+0x19/0x40 [ 9954.877677] __cv_timedwait_common+0x19e/0x2c0 [spl] [ 9954.878822] ? 
finish_wait+0x80/0x80 [ 9954.879694] __cv_timedwait_io+0x15/0x20 [spl] [ 9954.880763] zio_wait+0x1ad/0x4f0 [zfs] [ 9954.881865] dmu_write_abd+0x174/0x1c0 [zfs] [ 9954.883074] dmu_write_uio_direct+0x79/0x100 [zfs] [ 9954.884285] dmu_write_uio_dnode+0xb2/0x320 [zfs] [ 9954.885507] dmu_write_uio_dbuf+0x47/0x60 [zfs] [ 9954.886687] zfs_write+0x581/0xe20 [zfs] [ 9954.887822] ? iov_iter_get_pages+0xe9/0x390 [ 9954.888862] ? trylock_page+0xd/0x20 [zfs] [ 9954.890005] ? __raw_spin_unlock+0x5/0x10 [zfs] [ 9954.891217] ? zfs_setup_direct+0x7e/0x1b0 [zfs] [ 9954.892391] zpl_iter_write_direct+0xd4/0x170 [zfs] [ 9954.893663] ? rrw_exit+0xc6/0x200 [zfs] [ 9954.894764] zpl_iter_write+0xd5/0x110 [zfs] [ 9954.895911] new_sync_write+0x112/0x160 [ 9954.896881] vfs_write+0xa5/0x1b0 [ 9954.897701] ksys_write+0x4f/0xb0 [ 9954.898569] do_syscall_64+0x5b/0x1b0 [ 9954.899417] entry_SYSCALL_64_after_hwframe+0x61/0xc6 [ 9954.900515] RIP: 0033:0x7eff236baa47 [ 9954.901363] Code: Unable to access opcode bytes at RIP 0x7eff236baa1d. [ 9954.902673] RSP: 002b:00007ffffb8e5330 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 [ 9954.904099] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007eff236baa47 [ 9954.905535] RDX: 00000000000e4000 RSI: 00007efeb7dd4000 RDI: 0000000000000005 [ 9954.906902] RBP: 00007efeb7dd4000 R08: 0000000000000000 R09: 0000000000000000 [ 9954.908339] R10: 0000000000000000 R11: 0000000000000293 R12: 00000000000e4000 [ 9954.909705] R13: 000055e4d1f13ac0 R14: 00000000000e4000 R15: 000055e4d1f13ae8 [ 9954.911129] INFO: task fio:1055504 blocked for more than 120 seconds. [ 9954.912381] Tainted: P OE -------- - - 4.18.0-553.5.1.el8_10.x86_64 #1 [ 9954.913978] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 9954.915434] task:fio state:D stack:0 pid:1055504 ppid:1055493 flags:0x00000080 [ 9954.917082] Call Trace: [ 9954.917773] __schedule+0x2d1/0x870 [ 9954.918648] ? 
zilog_dirty+0x4f/0xc0 [zfs] [ 9954.919831] schedule+0x55/0xf0 [ 9954.920717] cv_wait_common+0x16d/0x280 [spl] [ 9954.921704] ? finish_wait+0x80/0x80 [ 9954.922639] zfs_rangelock_enter_writer+0x46/0x1c0 [zfs] [ 9954.923940] zfs_rangelock_enter_impl+0x12a/0x1b0 [zfs] [ 9954.925306] zfs_write+0x703/0xe20 [zfs] [ 9954.926406] zpl_iter_write_buffered+0xb2/0x120 [zfs] [ 9954.927687] ? rrw_exit+0xc6/0x200 [zfs] [ 9954.928821] zpl_iter_write+0xbe/0x110 [zfs] [ 9954.930028] new_sync_write+0x112/0x160 [ 9954.930913] vfs_write+0xa5/0x1b0 [ 9954.931758] ksys_write+0x4f/0xb0 [ 9954.932666] do_syscall_64+0x5b/0x1b0 [ 9954.933544] entry_SYSCALL_64_after_hwframe+0x61/0xc6 [ 9954.934689] RIP: 0033:0x7fcaee8f0a47 [ 9954.935551] Code: Unable to access opcode bytes at RIP 0x7fcaee8f0a1d. [ 9954.936893] RSP: 002b:00007fff56b2c240 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 [ 9954.938327] RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00007fcaee8f0a47 [ 9954.939777] RDX: 000000000001d000 RSI: 00007fca8300b010 RDI: 0000000000000006 [ 9954.941187] RBP: 00007fca8300b010 R08: 0000000000000000 R09: 0000000000000000 [ 9954.942655] R10: 0000000000000000 R11: 0000000000000293 R12: 000000000001d000 [ 9954.944062] R13: 0000557a2006bac0 R14: 000000000001d000 R15: 0000557a2006bae8 [ 9954.945525] INFO: task fio:1055505 blocked for more than 120 seconds. [ 9954.946819] Tainted: P OE -------- - - 4.18.0-553.5.1.el8_10.x86_64 #1 [ 9954.948466] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 9954.949959] task:fio state:D stack:0 pid:1055505 ppid:1055493 flags:0x00004080 [ 9954.951653] Call Trace: [ 9954.952417] __schedule+0x2d1/0x870 [ 9954.953393] ? finish_wait+0x3e/0x80 [ 9954.954315] schedule+0x55/0xf0 [ 9954.955212] cv_wait_common+0x16d/0x280 [spl] [ 9954.956211] ? 
finish_wait+0x80/0x80 [ 9954.957159] zil_commit_waiter+0xfa/0x3b0 [zfs] [ 9954.958343] zil_commit_impl+0x6d/0xd0 [zfs] [ 9954.959524] zfs_fsync+0x66/0x90 [zfs] [ 9954.960626] zpl_fsync+0xe5/0x140 [zfs] [ 9954.961763] do_fsync+0x38/0x70 [ 9954.962638] __x64_sys_fsync+0x10/0x20 [ 9954.963520] do_syscall_64+0x5b/0x1b0 [ 9954.964470] entry_SYSCALL_64_after_hwframe+0x61/0xc6 [ 9954.965567] RIP: 0033:0x7fcaee8f1057 [ 9954.966490] Code: Unable to access opcode bytes at RIP 0x7fcaee8f102d. [ 9954.967752] RSP: 002b:00007fff56b2c230 EFLAGS: 00000293 ORIG_RAX: 000000000000004a [ 9954.969260] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007fcaee8f1057 [ 9954.970628] RDX: 0000000000000000 RSI: 0000557a2006bac0 RDI: 0000000000000005 [ 9954.972092] RBP: 00007fca84152a18 R08: 0000000000000000 R09: 0000000000000000 [ 9954.973484] R10: 0000000000035000 R11: 0000000000000293 R12: 0000000000000003 [ 9954.974958] R13: 0000557a2006bac0 R14: 0000000000000000 R15: 0000557a2006bae8 [10077.648150] INFO: task z_wr_int:1051869 blocked for more than 120 seconds. [10077.649541] Tainted: P OE -------- - - 4.18.0-553.5.1.el8_10.x86_64 #1 [10077.651116] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [10077.652782] task:z_wr_int state:D stack:0 pid:1051869 ppid:2 flags:0x80004080 [10077.654420] Call Trace: [10077.655267] __schedule+0x2d1/0x870 [10077.656179] ? free_one_page+0x204/0x530 [10077.657192] schedule+0x55/0xf0 [10077.658004] cv_wait_common+0x16d/0x280 [spl] [10077.659018] ? finish_wait+0x80/0x80 [10077.660013] dmu_buf_direct_mixed_io_wait+0x84/0x1a0 [zfs] [10077.661396] dmu_write_direct_done+0x90/0x3b0 [zfs] [10077.662617] zio_done+0x373/0x1d50 [zfs] [10077.663783] zio_execute+0xee/0x210 [zfs] [10077.664921] taskq_thread+0x205/0x3f0 [spl] [10077.665982] ? wake_up_q+0x60/0x60 [10077.666842] ? zio_execute_stack_check.constprop.1+0x10/0x10 [zfs] [10077.668295] ? taskq_lowest_id+0xc0/0xc0 [spl] [10077.669360] kthread+0x134/0x150 [10077.670191] ? 
set_kthread_struct+0x50/0x50 [10077.671209] ret_from_fork+0x35/0x40 [10077.672076] INFO: task txg_sync:1051894 blocked for more than 120 seconds. [10077.673467] Tainted: P OE -------- - - 4.18.0-553.5.1.el8_10.x86_64 #1 [10077.675112] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [10077.676612] task:txg_sync state:D stack:0 pid:1051894 ppid:2 flags:0x80004080 [10077.678288] Call Trace: [10077.679024] __schedule+0x2d1/0x870 [10077.679948] ? __wake_up_common+0x7a/0x190 [10077.681042] schedule+0x55/0xf0 [10077.681899] schedule_timeout+0x19f/0x320 [10077.682951] ? __next_timer_interrupt+0xf0/0xf0 [10077.684005] ? taskq_dispatch+0xab/0x280 [spl] [10077.685085] io_schedule_timeout+0x19/0x40 [10077.686080] __cv_timedwait_common+0x19e/0x2c0 [spl] [10077.687227] ? finish_wait+0x80/0x80 [10077.688123] __cv_timedwait_io+0x15/0x20 [spl] [10077.689206] zio_wait+0x1ad/0x4f0 [zfs] [10077.690300] dsl_pool_sync+0xcb/0x6c0 [zfs] [10077.691435] ? spa_errlog_sync+0x2f0/0x3d0 [zfs] [10077.692636] spa_sync_iterate_to_convergence+0xcb/0x310 [zfs] [10077.693997] spa_sync+0x362/0x8f0 [zfs] [10077.695112] txg_sync_thread+0x27a/0x3b0 [zfs] [10077.696239] ? txg_dispatch_callbacks+0xf0/0xf0 [zfs] [10077.697512] ? spl_assert.constprop.0+0x20/0x20 [spl] [10077.698639] thread_generic_wrapper+0x63/0x90 [spl] [10077.699687] kthread+0x134/0x150 [10077.700567] ? set_kthread_struct+0x50/0x50 [10077.701502] ret_from_fork+0x35/0x40 [10077.702430] INFO: task fio:1055501 blocked for more than 120 seconds. [10077.703697] Tainted: P OE -------- - - 4.18.0-553.5.1.el8_10.x86_64 #1 [10077.705309] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [10077.706780] task:fio state:D stack:0 pid:1055501 ppid:1055490 flags:0x00004080 [10077.708479] Call Trace: [10077.709231] __schedule+0x2d1/0x870 [10077.710190] ? dbuf_hold_copy+0xec/0x230 [zfs] [10077.711368] schedule+0x55/0xf0 [10077.712286] cv_wait_common+0x16d/0x280 [spl] [10077.713316] ? 
finish_wait+0x80/0x80 [10077.714262] zfs_rangelock_enter_reader+0xa1/0x1f0 [zfs] [10077.715566] zfs_rangelock_enter_impl+0xbf/0x1b0 [zfs] [10077.716878] zfs_get_data+0x566/0x810 [zfs] [10077.718032] zil_lwb_commit+0x194/0x3f0 [zfs] [10077.719234] zil_lwb_write_issue+0x68/0xb90 [zfs] [10077.720413] ? __list_add+0x12/0x30 [zfs] [10077.721525] ? __raw_spin_unlock+0x5/0x10 [zfs] [10077.722708] ? zil_alloc_lwb+0x217/0x360 [zfs] [10077.723931] zil_commit_waiter_timeout+0x1f3/0x570 [zfs] [10077.725273] zil_commit_waiter+0x1d2/0x3b0 [zfs] [10077.726438] zil_commit_impl+0x6d/0xd0 [zfs] [10077.727586] zfs_fsync+0x66/0x90 [zfs] [10077.728675] zpl_fsync+0xe5/0x140 [zfs] [10077.729755] do_fsync+0x38/0x70 [10077.730607] __x64_sys_fsync+0x10/0x20 [10077.731482] do_syscall_64+0x5b/0x1b0 [10077.732415] entry_SYSCALL_64_after_hwframe+0x61/0xc6 [10077.733487] RIP: 0033:0x7eff236bb057 [10077.734399] Code: Unable to access opcode bytes at RIP 0x7eff236bb02d. [10077.735657] RSP: 002b:00007ffffb8e5320 EFLAGS: 00000293 ORIG_RAX: 000000000000004a [10077.737163] RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00007eff236bb057 [10077.738526] RDX: 0000000000000000 RSI: 000055e4d1f13ac0 RDI: 0000000000000006 [10077.739966] RBP: 00007efeb8ed8000 R08: 0000000000000000 R09: 0000000000000000 [10077.741336] R10: 0000000000056000 R11: 0000000000000293 R12: 0000000000000003 [10077.742773] R13: 000055e4d1f13ac0 R14: 0000000000000000 R15: 000055e4d1f13ae8 [10077.744168] INFO: task fio:1055502 blocked for more than 120 seconds. [10077.745505] Tainted: P OE -------- - - 4.18.0-553.5.1.el8_10.x86_64 #1 [10077.747073] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [10077.748642] task:fio state:D stack:0 pid:1055502 ppid:1055490 flags:0x00004080 [10077.750233] Call Trace: [10077.751011] __schedule+0x2d1/0x870 [10077.751915] schedule+0x55/0xf0 [10077.752811] schedule_timeout+0x19f/0x320 [10077.753762] ? 
__next_timer_interrupt+0xf0/0xf0 [10077.754824] io_schedule_timeout+0x19/0x40 [10077.755782] __cv_timedwait_common+0x19e/0x2c0 [spl] [10077.756922] ? finish_wait+0x80/0x80 [10077.757788] __cv_timedwait_io+0x15/0x20 [spl] [10077.758845] zio_wait+0x1ad/0x4f0 [zfs] [10077.759941] dmu_write_abd+0x174/0x1c0 [zfs] [10077.761144] dmu_write_uio_direct+0x79/0x100 [zfs] [10077.762327] dmu_write_uio_dnode+0xb2/0x320 [zfs] [10077.763523] dmu_write_uio_dbuf+0x47/0x60 [zfs] [10077.764749] zfs_write+0x581/0xe20 [zfs] [10077.765825] ? iov_iter_get_pages+0xe9/0x390 [10077.766842] ? trylock_page+0xd/0x20 [zfs] [10077.767956] ? __raw_spin_unlock+0x5/0x10 [zfs] [10077.769189] ? zfs_setup_direct+0x7e/0x1b0 [zfs] [10077.770343] zpl_iter_write_direct+0xd4/0x170 [zfs] [10077.771570] ? rrw_exit+0xc6/0x200 [zfs] [10077.772674] zpl_iter_write+0xd5/0x110 [zfs] [10077.773834] new_sync_write+0x112/0x160 [10077.774805] vfs_write+0xa5/0x1b0 [10077.775634] ksys_write+0x4f/0xb0 [10077.776526] do_syscall_64+0x5b/0x1b0 [10077.777386] entry_SYSCALL_64_after_hwframe+0x61/0xc6 [10077.778488] RIP: 0033:0x7eff236baa47 [10077.779339] Code: Unable to access opcode bytes at RIP 0x7eff236baa1d. 
[10077.780655] RSP: 002b:00007ffffb8e5330 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 [10077.782056] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007eff236baa47 [10077.783507] RDX: 00000000000e4000 RSI: 00007efeb7dd4000 RDI: 0000000000000005 [10077.784890] RBP: 00007efeb7dd4000 R08: 0000000000000000 R09: 0000000000000000 [10077.786303] R10: 0000000000000000 R11: 0000000000000293 R12: 00000000000e4000 [10077.787637] R13: 000055e4d1f13ac0 R14: 00000000000e4000 R15: 000055e4d1f13ae8 Signed-off-by: Brian Atkinson --- include/sys/dbuf.h | 13 +-- include/sys/dmu.h | 1 - module/zfs/dbuf.c | 210 +++++++++++----------------------------- module/zfs/dmu.c | 2 +- module/zfs/dmu_direct.c | 103 +++++--------------- module/zfs/zfs_vnops.c | 169 ++++++++++++++------------------ 6 files changed, 156 insertions(+), 342 deletions(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 5ce00bc025..7a2ba8ea0a 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -336,14 +336,6 @@ typedef struct dmu_buf_impl { /* The buffer was partially read. More reads may follow. */ uint8_t db_partial_read; - - /* - * This block is being held under a writer rangelock of a Direct I/O - * write that is waiting for previous buffered writes to synced out - * due to mixed buffered and O_DIRECT operations. This is needed to - * check whether to grab the rangelock in zfs_get_data(). 
- */ - uint8_t db_mixed_io_dio_wait; } dmu_buf_impl_t; #define DBUF_HASH_MUTEX(h, idx) \ @@ -392,7 +384,7 @@ dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, uint64_t blkid, uint64_t *hash_out); int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); -void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_will_clone_or_dio(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail); boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed); @@ -401,9 +393,6 @@ dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx); boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -void dmu_buf_direct_mixed_io_wait(dmu_buf_impl_t *db, uint64_t txg, - boolean_t read); -void dmu_buf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); blkptr_t *dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db); int dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 216d7d2885..38ce279808 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -1088,7 +1088,6 @@ typedef struct zgd { struct blkptr *zgd_bp; dmu_buf_t *zgd_db; struct zfs_locked_range *zgd_lr; - boolean_t zgd_grabbed_rangelock; void *zgd_private; } zgd_t; diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index cf66c38f0c..77f7664fb2 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -82,13 +82,6 @@ typedef struct dbuf_stats { */ kstat_named_t cache_levels[DN_MAX_LEVELS]; kstat_named_t cache_levels_bytes[DN_MAX_LEVELS]; - /* - * Statistics for Direct I/O. - */ - kstat_named_t direct_mixed_io_read_wait; - kstat_named_t direct_mixed_io_write_wait; - kstat_named_t direct_sync_wait; - kstat_named_t direct_undirty; /* * Statistics about the dbuf hash table. 
*/ @@ -137,10 +130,6 @@ dbuf_stats_t dbuf_stats = { { "cache_total_evicts", KSTAT_DATA_UINT64 }, { { "cache_levels_N", KSTAT_DATA_UINT64 } }, { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } }, - { "direct_mixed_io_read_wait", KSTAT_DATA_UINT64 }, - { "direct_mixed_io_write_wait", KSTAT_DATA_UINT64 }, - { "direct_sync_wait", KSTAT_DATA_UINT64 }, - { "direct_undirty", KSTAT_DATA_UINT64 }, { "hash_hits", KSTAT_DATA_UINT64 }, { "hash_misses", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, @@ -162,10 +151,6 @@ struct { wmsum_t cache_total_evicts; wmsum_t cache_levels[DN_MAX_LEVELS]; wmsum_t cache_levels_bytes[DN_MAX_LEVELS]; - wmsum_t direct_mixed_io_read_wait; - wmsum_t direct_mixed_io_write_wait; - wmsum_t direct_sync_wait; - wmsum_t direct_undirty; wmsum_t hash_hits; wmsum_t hash_misses; wmsum_t hash_collisions; @@ -911,14 +896,6 @@ dbuf_kstat_update(kstat_t *ksp, int rw) ds->cache_levels_bytes[i].value.ui64 = wmsum_value(&dbuf_sums.cache_levels_bytes[i]); } - ds->direct_mixed_io_read_wait.value.ui64 = - wmsum_value(&dbuf_sums.direct_mixed_io_read_wait); - ds->direct_mixed_io_write_wait.value.ui64 = - wmsum_value(&dbuf_sums.direct_mixed_io_write_wait); - ds->direct_sync_wait.value.ui64 = - wmsum_value(&dbuf_sums.direct_sync_wait); - ds->direct_undirty.value.ui64 = - wmsum_value(&dbuf_sums.direct_undirty); ds->hash_hits.value.ui64 = wmsum_value(&dbuf_sums.hash_hits); ds->hash_misses.value.ui64 = @@ -1021,10 +998,6 @@ dbuf_init(void) wmsum_init(&dbuf_sums.cache_levels[i], 0); wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0); } - wmsum_init(&dbuf_sums.direct_mixed_io_read_wait, 0); - wmsum_init(&dbuf_sums.direct_mixed_io_write_wait, 0); - wmsum_init(&dbuf_sums.direct_sync_wait, 0); - wmsum_init(&dbuf_sums.direct_undirty, 0); wmsum_init(&dbuf_sums.hash_hits, 0); wmsum_init(&dbuf_sums.hash_misses, 0); wmsum_init(&dbuf_sums.hash_collisions, 0); @@ -1097,10 +1070,6 @@ dbuf_fini(void) wmsum_fini(&dbuf_sums.cache_levels[i]); 
wmsum_fini(&dbuf_sums.cache_levels_bytes[i]); } - wmsum_fini(&dbuf_sums.direct_mixed_io_read_wait); - wmsum_fini(&dbuf_sums.direct_mixed_io_write_wait); - wmsum_fini(&dbuf_sums.direct_sync_wait); - wmsum_fini(&dbuf_sums.direct_undirty); wmsum_fini(&dbuf_sums.hash_hits); wmsum_fini(&dbuf_sums.hash_misses); wmsum_fini(&dbuf_sums.hash_collisions); @@ -1271,9 +1240,8 @@ dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); - /* Direct I/O writes may have data */ - if (db->db_buf == NULL) - db->db.db_data = NULL; + ASSERT3P(db->db_buf, ==, NULL); + db->db.db_data = NULL; if (db->db_state != DB_NOFILL) { db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "clear data"); @@ -2789,93 +2757,6 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) return (dr != NULL); } -void -dmu_buf_direct_mixed_io_wait(dmu_buf_impl_t *db, uint64_t txg, boolean_t read) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (read == B_TRUE) { - /* - * If a buffered read is in process, a Direct I/O read will - * wait for the buffered I/O to complete. - */ - ASSERT3U(txg, ==, 0); - while (db->db_state == DB_READ) { - DBUF_STAT_BUMP(direct_mixed_io_read_wait); - cv_wait(&db->db_changed, &db->db_mtx); - } - } else { - /* - * There must be an ARC buf associated with this Direct I/O - * write otherwise there is no reason to wait for previous - * dirty records to sync out. - * - * The db_state will temporarily be set to DB_CACHED so that - * that any synchronous writes issued through the ZIL will - * still be handled properly. In particular, the call to - * dbuf_read() in dmu_sync_late_arrival() must account for the - * data still being in the ARC. After waiting here for previous - * TXGs to sync out, dmu_write_direct_done() will update the - * db_state. 
- */ - ASSERT3P(db->db_buf, !=, NULL); - ASSERT3U(txg, >, 0); - db->db_mixed_io_dio_wait = TRUE; - db->db_state = DB_CACHED; - while (dbuf_find_dirty_lte(db, txg) != NULL) { - DBUF_STAT_BUMP(direct_mixed_io_write_wait); - cv_wait(&db->db_changed, &db->db_mtx); - } - db->db_mixed_io_dio_wait = FALSE; - } -} - -/* - * Direct I/O writes may need to undirty the open-context dirty record - * associated with it in the event of an I/O error. - */ -void -dmu_buf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - /* - * Direct I/O writes always happen in open-context. - */ - ASSERT(!dmu_tx_is_syncing(tx)); - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_state == DB_NOFILL || db->db_state == DB_UNCACHED); - - - /* - * In the event of an I/O error we will handle the metaslab clean up in - * zio_done(). Also, the dirty record's dr_overridden_by BP is not - * currently set as that is done in dmu_sync_done(). Since the db_state - * is still set to DB_NOFILL, dbuf_unoverride() will not be called in - * dbuf_undirty() and the dirty record's BP will not be added the SPA's - * spa_free_bplist via zio_free(). - * - * This function can also be called in the event that a Direct I/O - * write is overwriting a previous Direct I/O to the same block for - * this TXG. It is important to go ahead and free up the space - * accounting in this case through dbuf_undirty() -> dbuf_unoverride() - * -> zio_free(). This is necessary because the space accounting for - * determining if a write can occur in zfs_write() happens through - * dmu_tx_assign(). This can cause an issue with Direct I/O writes in - * the case of overwrites, because all DVA allocations are being done - * in open-context. Constanstly allowing Direct I/O overwrites to the - * same blocks can exhaust the pools available space leading to ENOSPC - * errors at the DVA allcoation part of the ZIO pipeline, which will - * eventually suspend the pool. By cleaning up space accounting now - * the ENOSPC pool suspend can be avoided. 
- * - * Since we are undirtying the record for the Direct I/O in - * open-context we must have a hold on the db, so it should never be - * evicted after calling dbuf_undirty(). - */ - VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE); - - DBUF_STAT_BUMP(direct_undirty); -} - /* * Normally the db_blkptr points to the most recent on-disk content for the * dbuf (and anything newer will be cached in the dbuf). However, a recent @@ -2951,7 +2832,7 @@ dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa) } void -dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) +dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT0(db->db_level); @@ -2959,14 +2840,41 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); /* - * Block cloning: We are going to clone into this block, so undirty - * modifications done to this block so far in this txg. This includes - * writes and clones into this block. + * Block clones and Direct I/O writes always happen in open-context. */ + ASSERT(!dmu_tx_is_syncing(tx)); + ASSERT0(db->db_level); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); + mutex_enter(&db->db_mtx); DBUF_VERIFY(db); - VERIFY(!dbuf_undirty(db, tx)); + + /* + * We are going to clone or issue a Direct I/O write on this block, so + * undirty modifications done to this block so far in this txg. This + * includes writes and clones into this block. + * + * If there dirty record associated with this txg from a previous Direct + * I/O write then space accounting cleanup takes place. It is important + * to go ahead free up the space accounting through dbuf_undirty() -> + * dbuf_unoverride() -> zio_free(). Space accountiung for determining + * if a write can occur in zfs_write() happens through dmu_tx_assign(). 
+ * This can cuase an issue with Direct I/O writes in the case of + * overwriting the same block, because all DVA allocations are being + * done in open-context. Constantly allowing Direct I/O overwrites to + * the same block can exhaust the pools available space leading to + * ENOSPC errors at the DVA allocation part of the ZIO pipeline, which + * will eventually suspend the pool. By cleaning up sapce acccounting + * now, the ENOSPC error can be avoided. + * + * Since we are undirtying the record in open-context, we must have a + * hold on the db, so it should never be evicted after calling + * dbuf_undirty(). + */ + VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE); ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg)); + if (db->db_buf != NULL) { /* * If there is an associated ARC buffer with this dbuf we can @@ -2977,6 +2885,11 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) arc_buf_destroy(db->db_buf, db); + /* + * Setting the dbuf's data pointers to NULL will force all + * future reads down to the devices to get the most up to date + * version of the data after a Direct I/O write has completed. 
+ */ db->db_buf = NULL; dbuf_clear_data(db); } @@ -2985,7 +2898,8 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) ASSERT3P(db->db.db_data, ==, NULL); db->db_state = DB_NOFILL; - DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone"); + DTRACE_SET_STATE(db, + "allocating NOFILL buffer for clone or direct I/O write"); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); @@ -3532,7 +3446,6 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_user_immediate_evict = FALSE; db->db_freed_in_flight = FALSE; db->db_pending_evict = FALSE; - db->db_mixed_io_dio_wait = FALSE; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); @@ -4788,25 +4701,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); - - /* - * It is possible a buffered read has come in after a Direct I/O - * write and is currently transistioning the db_state from DB_READ - * in dbuf_read_impl() to another state in dbuf_read_done(). We - * have to wait in order for the dbuf state to change from DB_READ - * before syncing the dirty record of the Direct I/O write. - */ - if (db->db_state == DB_READ && !dr->dt.dl.dr_brtwrite) { - ASSERT3P(*datap, ==, NULL); - ASSERT3P(db->db_buf, ==, NULL); - ASSERT3P(db->db.db_data, ==, NULL); - ASSERT3U(dr->dt.dl.dr_override_state, ==, DR_OVERRIDDEN); - while (db->db_state == DB_READ) { - DBUF_STAT_BUMP(direct_sync_wait); - cv_wait(&db->db_changed, &db->db_mtx); - } - } - /* * To be synced, we must be dirtied. But we might have been freed * after the dirty. @@ -4819,13 +4713,21 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else if (db->db_state == DB_READ) { /* - * This buffer has a clone we need to write, and an in-flight - * read on the BP we're about to clone. Its safe to issue the - * write here because the read has already been issued and the - * contents won't change. 
+ * This buffer was either cloned or had a Direct I/O write + * occur and has an in-flgiht read on the BP. It is safe to + * issue the write here, because the read has already been + * issued and the contents won't change. + * + * We can verify the case of both the clone and Direct I/O + * write by making sure the first dirty record for the dbuf + * has no ARC buffer associated with it. */ - ASSERT(dr->dt.dl.dr_brtwrite && - dr->dt.dl.dr_override_state == DR_OVERRIDDEN); + dbuf_dirty_record_t *dr_head = + list_head(&db->db_dirty_records); + ASSERT3P(db->db_buf, ==, NULL); + ASSERT3P(db->db.db_data, ==, NULL); + ASSERT3P(dr_head->dt.dl.dr_data, ==, NULL); + ASSERT3U(dr_head->dt.dl.dr_override_state, ==, DR_OVERRIDDEN); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } @@ -5522,7 +5424,7 @@ EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); EXPORT_SYMBOL(dmu_buf_is_dirty); -EXPORT_SYMBOL(dmu_buf_will_clone); +EXPORT_SYMBOL(dmu_buf_will_clone_or_dio); EXPORT_SYMBOL(dmu_buf_will_not_fill); EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index ba47be9c9e..db08b18431 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2676,7 +2676,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT(db->db_blkid != DMU_SPILL_BLKID); ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); - dmu_buf_will_clone(dbuf, tx); + dmu_buf_will_clone_or_dio(dbuf, tx); mutex_enter(&db->db_mtx); diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c index 0ff3e0e55e..bf47335302 100644 --- a/module/zfs/dmu_direct.c +++ b/module/zfs/dmu_direct.c @@ -90,78 +90,35 @@ dmu_write_direct_done(zio_t *zio) dmu_sync_arg_t *dsa = zio->io_private; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; - uint64_t txg = dsa->dsa_tx->tx_txg; abd_free(zio->io_abd); + mutex_enter(&db->db_mtx); + 
ASSERT3P(db->db_buf, ==, NULL); + ASSERT3P(dr->dt.dl.dr_data, ==, NULL); + ASSERT3P(db->db.db_data, ==, NULL); + db->db_state = DB_UNCACHED; + mutex_exit(&db->db_mtx); - if (zio->io_error == 0) { - /* - * After a successful Direct I/O write any stale contents in - * the ARC must be cleaned up in order to force all future - * reads down to the VDEVs. - * - * If a previous write operation to this dbuf was buffered - * (in the ARC) we have to wait for the previous dirty records - * associated with this dbuf to be synced out if they are in - * the quiesce or sync phase for their TXG. This is done to - * guarantee we are not racing to destroy the ARC buf that - * is associated with the dbuf between this done callback and - * spa_sync(). Outside of using a heavy handed approach of - * locking down the spa_syncing_txg while it is being updated, - * there is no way to synchronize when a dirty record's TXG - * has moved over to the sync phase. - * - * In order to make sure all TXG's are consistent we must - * do this stall if there is an associated ARC buf with this - * dbuf. It is because of this that a user should not really - * be mixing buffered and Direct I/O writes. If they choose to - * do so, there is an associated performance penalty for that - * as we will not give up consistency with a TXG over - * performance. - */ - if (db->db_buf) { - dmu_buf_direct_mixed_io_wait(db, txg - 1, B_FALSE); - ASSERT3P(db->db_buf, ==, dr->dt.dl.dr_data); - arc_buf_destroy(db->db_buf, db); - db->db_buf = NULL; - dr->dt.dl.dr_data = NULL; - db->db.db_data = NULL; - ASSERT3U(db->db_dirtycnt, ==, 1); - } + dmu_sync_done(zio, NULL, zio->io_private); - /* - * The current contents of the dbuf are now stale. 
- */ - ASSERT3P(dr->dt.dl.dr_data, ==, NULL); - ASSERT3P(db->db.db_data, ==, NULL); - db->db_state = DB_UNCACHED; - } else { + if (zio->io_error != 0) { if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) ASSERT3U(zio->io_error, ==, EAGAIN); /* - * If there is a valid ARC buffer assocatied with this dirty - * record we will stall just like on a successful Direct I/O - * write to make sure all TXG's are consistent. See comment - * above. + * In the event of an I/O error the metaslab cleanup is taken + * care of in zio_done(). + * + * Since we are undirtying the record in open-context, we must + * have a hold on the db, so it should never be evicted after + * calling dbuf_undirty(). */ - if (db->db_buf) { - ASSERT3P(db->db_buf, ==, dr->dt.dl.dr_data); - dmu_buf_direct_mixed_io_wait(db, txg - 1, B_FALSE); - dmu_buf_undirty(db, dsa->dsa_tx); - db->db_state = DB_CACHED; - } else { - ASSERT3P(dr->dt.dl.dr_data, ==, NULL); - dmu_buf_undirty(db, dsa->dsa_tx); - db->db_state = DB_UNCACHED; - } - - ASSERT0(db->db_dirtycnt); + mutex_enter(&db->db_mtx); + VERIFY3B(dbuf_undirty(db, dsa->dsa_tx), ==, B_FALSE); + mutex_exit(&db->db_mtx); } - mutex_exit(&db->db_mtx); - dmu_sync_done(zio, NULL, zio->io_private); kmem_free(zio->io_bp, sizeof (blkptr_t)); zio->io_bp = NULL; } @@ -183,26 +140,11 @@ dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx) WP_DMU_SYNC | WP_DIRECT_WR, &zp); DB_DNODE_EXIT(db); - /* - * If we going to overwrite a previous Direct I/O write that is part of - * the current TXG, then we can can go ahead and undirty it now. Part - * of it being undirtied will be allowing for previously allocated - * space in the dr_overridden_bp BP's DVAs to be freed. This avoids - * ENOSPC errors from possibly occuring when trying to allocate new - * metaslabs in open-context for Direct I/O writes. 
- */ - mutex_enter(&db->db_mtx); - dr_head = dbuf_find_dirty_eq(db, dmu_tx_get_txg(tx)); - if (dbuf_dirty_is_direct_write(db, dr_head)) { - dmu_buf_undirty(db, tx); - } - mutex_exit(&db->db_mtx); - /* * Dirty this dbuf with DB_NOFILL since we will not have any data * associated with the dbuf. */ - dmu_buf_will_not_fill(&db->db, tx); + dmu_buf_will_clone_or_dio(&db->db, tx); mutex_enter(&db->db_mtx); @@ -289,7 +231,7 @@ dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size, /* * The dbuf must be held until the Direct I/O write has completed in - * the event there was any errors and dmu_buf_undirty() was called. + * the event there was any errors and dbuf_undirty() was called. */ dmu_buf_rele_array(dbp, numbufs, FTAG); @@ -325,10 +267,11 @@ dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size, db->db.db_object, db->db_level, db->db_blkid); /* - * If there is another buffered read for this dbuf, we will - * wait for that to complete first. + * If there is another read for this dbuf, we will wait for + * that to complete first before checking the db_state below. */ - dmu_buf_direct_mixed_io_wait(db, 0, B_TRUE); + while (db->db_state == DB_READ) + cv_wait(&db->db_changed, &db->db_mtx); blkptr_t *bp = dmu_buf_get_bp_from_dbuf(db); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index fb6f9475d3..2460805582 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1096,32 +1096,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, zgd->zgd_lwb = lwb; zgd->zgd_private = zp; - dmu_buf_t *dbp; - error = dmu_buf_hold_noread(os, object, offset, zgd, &dbp); - zgd->zgd_db = dbp; - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp; - - if (error) { - zfs_get_done(zgd, error); - return (error); - } - - /* - * If a Direct I/O write is waiting for previous dirty records to sync - * out in dmu_buf_direct_mixed_io_wait(), then the ranglock is already - * held across the entire block by the O_DIRECT write. 
- * - * The dirty record for this TXG will also be used to identify if this - * log record is associated with a Direct I/O write. - */ - mutex_enter(&db->db_mtx); - boolean_t rangelock_held = db->db_mixed_io_dio_wait; - zgd->zgd_grabbed_rangelock = !(rangelock_held); - dbuf_dirty_record_t *dr = - dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg); - boolean_t direct_write = dbuf_dirty_is_direct_write(db, dr); - mutex_exit(&db->db_mtx); - /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the @@ -1130,10 +1104,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ - if (zgd->zgd_grabbed_rangelock) { - zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - } + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, + size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); @@ -1150,29 +1122,19 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ - if (zgd->zgd_grabbed_rangelock) { - for (;;) { - uint64_t blkoff; - size = zp->z_blksz; - blkoff = ISP2(size) ? P2PHASE(offset, size) : - offset; - offset -= blkoff; - zgd->zgd_lr = zfs_rangelock_enter( - &zp->z_rangelock, offset, size, RL_READER); - if (zp->z_blksz == size) - break; - offset += blkoff; - zfs_rangelock_exit(zgd->zgd_lr); - } - ASSERT3U(dbp->db_size, ==, size); - ASSERT3U(dbp->db_offset, ==, offset); - } else { - /* - * A Direct I/O write always covers an entire block. - */ - ASSERT3U(dbp->db_size, ==, zp->z_blksz); + for (;;) { + uint64_t blkoff; + size = zp->z_blksz; + blkoff = ISP2(size) ? 
P2PHASE(offset, size) : + offset; + offset -= blkoff; + zgd->zgd_lr = zfs_rangelock_enter( + &zp->z_rangelock, offset, size, RL_READER); + if (zp->z_blksz == size) + break; + offset += blkoff; + zfs_rangelock_exit(zgd->zgd_lr); } - /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); @@ -1182,48 +1144,69 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, zil_fault_io = 0; } #endif - if (error) { - zfs_get_done(zgd, error); - return (error); - } - /* - * All Direct I/O writes will have already completed and the - * block pointer can be immediately stored in the log record. - */ - if (direct_write) { - lr->lr_blkptr = dr->dt.dl.dr_overridden_by; - zfs_get_done(zgd, 0); - return (0); - } - - blkptr_t *bp = &lr->lr_blkptr; - zgd->zgd_bp = bp; - - error = dmu_sync(zio, lr->lr_common.lrc_txg, - zfs_get_done, zgd); - ASSERT(error || lr->lr_length <= size); - - /* - * On success, we need to wait for the write I/O - * initiated by dmu_sync() to complete before we can - * release this dbuf. We will finish everything up - * in the zfs_get_done() callback. - */ + dmu_buf_t *dbp; if (error == 0) - return (0); + error = dmu_buf_hold_noread(os, object, offset, zgd, + &dbp); + + if (error == 0) { + zgd->zgd_db = dbp; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp; + mutex_enter(&db->db_mtx); + dbuf_dirty_record_t *dr = + dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg); + boolean_t direct_write = + dbuf_dirty_is_direct_write(db, dr); + mutex_exit(&db->db_mtx); - if (error == EALREADY) { - lr->lr_common.lrc_txtype = TX_WRITE2; /* - * TX_WRITE2 relies on the data previously - * written by the TX_WRITE that caused - * EALREADY. We zero out the BP because - * it is the old, currently-on-disk BP. + * All Direct I/O writes will have already completed and + * the block pointer can be immediately stored in the + * log record. 
*/ - zgd->zgd_bp = NULL; - BP_ZERO(bp); - error = 0; + if (direct_write) { + /* + * A Direct I/O write always covers an entire + * block. + */ + ASSERT3U(dbp->db_size, ==, zp->z_blksz); + lr->lr_blkptr = dr->dt.dl.dr_overridden_by; + zfs_get_done(zgd, 0); + return (0); + } + + blkptr_t *bp = &lr->lr_blkptr; + zgd->zgd_bp = bp; + + ASSERT3U(dbp->db_offset, ==, offset); + ASSERT3U(dbp->db_size, ==, size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zfs_get_done, zgd); + ASSERT(error || lr->lr_length <= size); + + /* + * On success, we need to wait for the write I/O + * initiated by dmu_sync() to complete before we can + * release this dbuf. We will finish everything up + * in the zfs_get_done() callback. + */ + if (error == 0) + return (0); + + if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; + /* + * TX_WRITE2 relies on the data previously + * written by the TX_WRITE that caused + * EALREADY. We zero out the BP because + * it is the old, currently-on-disk BP. + */ + zgd->zgd_bp = NULL; + BP_ZERO(bp); + error = 0; + } } } @@ -1232,18 +1215,16 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, return (error); } - static void zfs_get_done(zgd_t *zgd, int error) { (void) error; znode_t *zp = zgd->zgd_private; - ASSERT3P(zgd->zgd_db, !=, NULL); - dmu_buf_rele(zgd->zgd_db, zgd); + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); - if (zgd->zgd_grabbed_rangelock) - zfs_rangelock_exit(zgd->zgd_lr); + zfs_rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the From 506bc54006c06b1dfcc143cbe9b926d578656892 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Tue, 25 Jun 2024 15:23:45 -0600 Subject: [PATCH 3/7] Updating based on PR Feedback(1) Updating code based on PR code comments. I adjusted the following parts of code based on comments: 1. Revert dbuf_undirty() to original logic and got rid of uncessary code change. 2. Cleanup in abd_impl.h 3. Cleanup in abd.h 4. 
Got rid of duplicate declaration of dmu_buf_hold_noread() in dmu.h. 5. Cleaned up comment for db_mtx in dmu_impl.h 6. Updated zfsprops.7 man page to state correct ZFS version 7. Updated the cast in zfs_uio_page_aligned() calls to use uintptr_t. 8. Cleaned up comment in FreeBSD uio code. 9. Removed unnecessary format changes in comments in Linux abd code. 10. Updated ZFS VFS hook for direct_IO to use PANIC(). 11. Updated comment above dbuf_undirty to use double space again. 12. Converted module parameter zfs_vdev_direct_write_verify_pct to be OS independent and in doing so this removed the unnecessary check for bounds. 13. Updated the casting in zfs_dio_page_aligned() to uintptr_t and added kernel guard. 14. Updated zfs_dio_size_aligned() to use modulo math because dn->dn_datablksz is not required to be a power of 2. 15. Removed abd scatter stats update calls from all ABD_FLAG_FROM_PAGES. 16. Updated check in abd_alloc_from_pages() for the linear page. This way a single page that is even 4K can be represented as an ABD_FLAG_LINEAR_PAGE. 17. Fixing types for UIO code. In FreeBSD the vm code expects and returns int's for values. In Linux the interfaces return a long value in get_user_pages_unlocked() and the rest of the IOV interfaces return int's. Stuck with the worst case and used long for npages in Linux. Updated the uio npage struct to correspond to the correct types so that type checking is consistent in the UIO code. 18. Updated comments about what zfs_uio_get_dio_pages_alloc() is doing. 19. Updated error handling in zfs_uio_get_dio_pages_alloc() for Linux.
Signed-off-by: Brian Atkinson --- include/os/freebsd/spl/sys/mod_os.h | 3 -- include/os/freebsd/zfs/sys/abd_os.h | 2 ++ include/os/linux/spl/sys/uio.h | 2 +- include/sys/abd_impl.h | 3 +- include/sys/dmu.h | 2 -- include/sys/dmu_impl.h | 2 +- include/sys/uio_impl.h | 6 ++-- include/sys/vdev_impl.h | 1 - lib/libspl/include/sys/uio.h | 2 +- man/man7/zfsprops.7 | 2 +- module/Kbuild.in | 1 - module/os/freebsd/spl/spl_uio.c | 40 +++++++++++----------- module/os/freebsd/zfs/abd_os.c | 21 ++++-------- module/os/freebsd/zfs/sysctl_os.c | 29 ---------------- module/os/linux/zfs/abd_os.c | 7 +--- module/os/linux/zfs/vdev_os.c | 49 --------------------------- module/os/linux/zfs/zfs_uio.c | 52 ++++++++++++++--------------- module/os/linux/zfs/zpl_file.c | 2 +- module/zfs/dbuf.c | 9 ++--- module/zfs/vdev.c | 9 +++-- 20 files changed, 73 insertions(+), 171 deletions(-) delete mode 100644 module/os/linux/zfs/vdev_os.c diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index 01a660434f..df7be6fc13 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -100,9 +100,6 @@ #define spa_taskq_write_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, spa_taskq_write_param, "A" -#define param_set_direct_write_verify_pct_args(var) \ - CTLTYPE_UINT, NULL, 0, param_set_direct_write_verify_pct, "IU" - #define fletcher_4_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A" diff --git a/include/os/freebsd/zfs/sys/abd_os.h b/include/os/freebsd/zfs/sys/abd_os.h index c7895a5e43..be825b3b8a 100644 --- a/include/os/freebsd/zfs/sys/abd_os.h +++ b/include/os/freebsd/zfs/sys/abd_os.h @@ -42,7 +42,9 @@ struct abd_scatter { struct abd_linear { void *abd_buf; +#if defined(_KERNEL) struct sf_buf *sf; /* for LINEAR_PAGE FreeBSD */ +#endif }; __attribute__((malloc)) diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index 1f0a7fa68d..5d483685eb 100644 --- a/include/os/linux/spl/sys/uio.h +++ 
b/include/os/linux/spl/sys/uio.h @@ -65,7 +65,7 @@ typedef enum zfs_uio_seg { */ typedef struct { struct page **pages; /* Mapped pages */ - int npages; /* Number of mapped pages */ + long npages; /* Number of mapped pages */ } zfs_uio_dio_t; typedef struct zfs_uio { diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index 7b08798504..35a64f8621 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -73,9 +73,10 @@ struct abd_iter { size_t iter_pos; size_t iter_offset; /* offset in current sg/abd_buf, */ /* abd_offset included */ - struct scatterlist *iter_sg; /* current sg */ #if defined(__FreeBSD__) && defined(_KERNEL) struct sf_buf *sf; /* used to map in vm_page_t FreeBSD */ +#else + struct scatterlist *iter_sg; /* current sg */ #endif }; diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 38ce279808..22cbd7fc73 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -576,8 +576,6 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp); * * The object number must be a valid, allocated object number. 
*/ -int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, - const void *tag, dmu_buf_t **dbp); int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **, int flags); int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index 8317072f62..4eaa399407 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -138,7 +138,7 @@ extern "C" { * db_data_pending * db_dirtied * db_link - * dbuf_dirty_records + * db_dirty_records * db_dirtycnt * db_d.* * db.* diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h index 9911645ad2..3e2ac08e3a 100644 --- a/include/sys/uio_impl.h +++ b/include/sys/uio_impl.h @@ -49,12 +49,14 @@ extern void zfs_uio_free_dio_pages(zfs_uio_t *, zfs_uio_rw_t); extern int zfs_uio_get_dio_pages_alloc(zfs_uio_t *, zfs_uio_rw_t); extern boolean_t zfs_uio_page_aligned(zfs_uio_t *); +#ifdef _KERNEL static inline boolean_t zfs_dio_page_aligned(void *buf) { - return ((((unsigned long)(buf) & (PAGESIZE - 1)) == 0) ? + return ((((uintptr_t)(buf) & (PAGESIZE - 1)) == 0) ? 
B_TRUE : B_FALSE); } +#endif static inline boolean_t zfs_dio_offset_aligned(uint64_t offset, uint64_t blksz) @@ -65,7 +67,7 @@ zfs_dio_offset_aligned(uint64_t offset, uint64_t blksz) static inline boolean_t zfs_dio_size_aligned(uint64_t size, uint64_t blksz) { - return (IS_P2ALIGNED(size, blksz)); + return ((size % blksz) == 0); } static inline boolean_t diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 08ed38d5c2..54bdff611f 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -658,7 +658,6 @@ int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS); * VDEV checksum verification precentage for Direct I/O writes */ extern uint_t zfs_vdev_direct_write_verify_pct; -int param_set_direct_write_verify_pct(ZFS_MODULE_PARAM_ARGS); #ifdef __cplusplus } diff --git a/lib/libspl/include/sys/uio.h b/lib/libspl/include/sys/uio.h index b107333d6f..2cb0107d58 100644 --- a/lib/libspl/include/sys/uio.h +++ b/lib/libspl/include/sys/uio.h @@ -98,7 +98,7 @@ zfs_dio_offset_aligned(uint64_t offset, uint64_t blksz) static inline boolean_t zfs_dio_size_aligned(uint64_t size, uint64_t blksz) { - return (IS_P2ALIGNED(size, blksz)); + return ((size % blksz) == 0); } static inline boolean_t diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7 index cf3acf3622..fcfdebfe27 100644 --- a/man/man7/zfsprops.7 +++ b/man/man7/zfsprops.7 @@ -1054,7 +1054,7 @@ causes every properly aligned read or write to be treated as a direct request. .Sy disabled causes the O_DIRECT flag to be silently ignored and all direct requests will be handled by the ARC. -This is the default behavior for OpenZFS 2.1 and prior releases. +This is the default behavior for OpenZFS 2.2 and prior releases. .Pp Bypassing the ARC requires that a direct request be correctly aligned. 
For write requests the starting offset and size of the request must be diff --git a/module/Kbuild.in b/module/Kbuild.in index 4f266f62d6..0e62881baf 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -446,7 +446,6 @@ ZFS_OBJS_OS := \ vdev_disk.o \ vdev_file.o \ vdev_label_os.o \ - vdev_os.o \ zfs_acl.o \ zfs_ctldir.o \ zfs_debug.o \ diff --git a/module/os/freebsd/spl/spl_uio.c b/module/os/freebsd/spl/spl_uio.c index f47952db22..e752675d10 100644 --- a/module/os/freebsd/spl/spl_uio.c +++ b/module/os/freebsd/spl/spl_uio.c @@ -129,7 +129,7 @@ zfs_uio_page_aligned(zfs_uio_t *uio) const struct iovec *iov = GET_UIO_STRUCT(uio)->uio_iov; for (int i = zfs_uio_iovcnt(uio); i > 0; iov++, i--) { - unsigned long addr = (unsigned long)iov->iov_base; + uintptr_t addr = (uintptr_t)iov->iov_base; size_t size = iov->iov_len; if ((addr & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { return (B_FALSE); @@ -143,7 +143,7 @@ static void zfs_uio_set_pages_to_stable(zfs_uio_t *uio) { ASSERT3P(uio->uio_dio.pages, !=, NULL); - ASSERT3U(uio->uio_dio.npages, >, 0); + ASSERT3S(uio->uio_dio.npages, >, 0); for (int i = 0; i < uio->uio_dio.npages; i++) { vm_page_t page = uio->uio_dio.pages[i]; @@ -172,7 +172,7 @@ zfs_uio_release_stable_pages(zfs_uio_t *uio) * written to and must be given write access. 
*/ static int -zfs_uio_hold_pages(unsigned long start, size_t len, unsigned long nr_pages, +zfs_uio_hold_pages(unsigned long start, size_t len, int nr_pages, zfs_uio_rw_t rw, vm_page_t *pages) { vm_map_t map; @@ -206,8 +206,8 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) uio->uio_dio.npages * sizeof (vm_page_t)); } -static long -zfs_uio_get_user_pages(unsigned long start, unsigned long nr_pages, +static int +zfs_uio_get_user_pages(unsigned long start, int nr_pages, size_t len, zfs_uio_rw_t rw, vm_page_t *pages) { int count; @@ -220,12 +220,12 @@ zfs_uio_get_user_pages(unsigned long start, unsigned long nr_pages, return (count); } - ASSERT3U(count, ==, nr_pages); + ASSERT3S(count, ==, nr_pages); return (count); } -static size_t +static int zfs_uio_iov_step(struct iovec v, zfs_uio_t *uio, int *numpages) { unsigned long addr = (unsigned long)(v.iov_base); @@ -235,14 +235,13 @@ zfs_uio_iov_step(struct iovec v, zfs_uio_t *uio, int *numpages) int res = zfs_uio_get_user_pages( P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n, len, zfs_uio_rw(uio), &uio->uio_dio.pages[uio->uio_dio.npages]); - if (res != n) { - *numpages = -1; - return (SET_ERROR(EFAULT)); - } - ASSERT3S(len, ==, res * PAGE_SIZE); + if (res != n) + return (SET_ERROR(EFAULT)); + + ASSERT3U(len, ==, res * PAGE_SIZE); *numpages = res; - return (len); + return (0); } static int @@ -264,12 +263,11 @@ zfs_uio_get_dio_pages_impl(zfs_uio_t *uio) } iov.iov_len = MIN(maxsize, iovp->iov_len); iov.iov_base = iovp->iov_base; - size_t left = zfs_uio_iov_step(iov, uio, &numpages); + int error = zfs_uio_iov_step(iov, uio, &numpages); - if (numpages == -1) - return (left); + if (error) + return (error); - ASSERT3U(left, ==, iov.iov_len); uio->uio_dio.npages += numpages; maxsize -= iov.iov_len; wanted -= left; @@ -282,8 +280,8 @@ zfs_uio_get_dio_pages_impl(zfs_uio_t *uio) } /* - * This function maps user pages into the kernel. 
In the event that the user - * pages were not mapped successfully an error value is reutrned. + * This function holds user pages into the kernel. In the event that the user + * pages are not successfully held an error value is returned. * * On success, 0 is returned. */ @@ -291,7 +289,7 @@ int zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) { int error = 0; - size_t npages = DIV_ROUND_UP(zfs_uio_resid(uio), PAGE_SIZE); + int npages = DIV_ROUND_UP(zfs_uio_resid(uio), PAGE_SIZE); size_t size = npages * sizeof (vm_page_t); ASSERT(zfs_uio_rw(uio) == rw); @@ -305,6 +303,8 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) return (error); } + ASSERT3S(uio->uio_dio.npages, >, 0); + /* * Since we will be writing the user pages we must make sure that * they are stable. That way the contents of the pages can not change diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index c7a1859f90..f20dc5d8c3 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -138,15 +138,9 @@ abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) { uint_t n; - if (abd_is_from_pages(abd)) - n = abd_chunkcnt_for_bytes(abd->abd_size); - else - n = abd_scatter_chunkcnt(abd); + n = abd_scatter_chunkcnt(abd); ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); int waste = (n << PAGE_SHIFT) - abd->abd_size; - ASSERT3U(n, >, 0); - ASSERT3S(waste, >=, 0); - IMPLY(abd_is_linear_page(abd), waste < PAGE_SIZE); if (op == ABDSTAT_INCR) { ABDSTAT_BUMP(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); @@ -458,14 +452,13 @@ abd_alloc_from_pages(vm_page_t *pages, unsigned long offset, uint64_t size) abd->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_FROM_PAGES; abd->abd_size = size; - if (size < PAGE_SIZE) { + if ((offset + size) <= PAGE_SIZE) { /* - * We do not have a full page so we will just use a linear ABD. - * We have to make sure to take into account the offset though. 
- * In all other cases our offset will be 0 as we are always - * PAGE_SIZE aligned. + * There is only a single page worth of data, so we will just + * use a linear ABD. We have to make sure to take into account + * the offset though. In all other cases our offset will be 0 + * as we are always PAGE_SIZE aligned. */ - ASSERT3U(offset + size, <=, PAGE_SIZE); abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE; ABD_LINEAR_BUF(abd) = (char *)zfs_map_page(pages[0], &abd->abd_u.abd_linear.sf) + offset; @@ -480,8 +473,6 @@ abd_alloc_from_pages(vm_page_t *pages, unsigned long offset, uint64_t size) ABD_SCATTER(abd).abd_chunks[i] = pages[i]; } - abd_update_scatter_stats(abd, ABDSTAT_INCR); - return (abd); } diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index d58fd241c5..c84cb7407a 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -831,35 +831,6 @@ SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, " new top-level vdevs. (LEGACY)"); /* END CSTYLED */ -int -param_set_direct_write_verify_pct(SYSCTL_HANDLER_ARGS) -{ - int val; - int err; - - val = zfs_vdev_direct_write_verify_pct; - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (SET_ERROR(err)); - - if (val > 100 || val < 0) - return (SET_ERROR(EINVAL)); - - zfs_vdev_direct_write_verify_pct = val; - - return (0); -} - -/* BEGIN CSTYLED */ -SYSCTL_PROC(_vfs_zfs, OID_AUTO, direct_write_verify_pct, - CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - &zfs_vdev_direct_write_verify_pct, - sizeof (zfs_vdev_direct_write_verify_pct), - param_set_direct_write_verify_pct, "IU", - "Percentage of Direct I/O writes per top-level VDEV for checksum" - " verification to be performed"); -/* END CSTYLED */ - /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. 
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 72b5a628ee..dae4107e03 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -708,8 +708,6 @@ abd_free_linear_page(abd_t *abd) ABD_SCATTER(abd).abd_offset = 0; ABD_SCATTER(abd).abd_sgl = sg; abd_free_chunks(abd); - - abd_update_scatter_stats(abd, ABDSTAT_DECR); } /* @@ -742,7 +740,7 @@ abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size) schedule_timeout_interruptible(1); } - if (size < PAGE_SIZE) { + if ((offset + size) <= PAGE_SIZE) { /* * Since there is only one entry, this ABD can be represented * as a linear buffer. All single-page (4K) ABD's constructed @@ -754,7 +752,6 @@ abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size) * the mapping needs to bet set up on all CPUs. Using kmap() * also enables the user of highmem pages when required. */ - ASSERT3U(offset + size, <=, PAGE_SIZE); abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE; abd->abd_u.abd_linear.abd_sgl = table.sgl; zfs_kmap(sg_page(table.sgl)); @@ -770,8 +767,6 @@ abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size) ASSERT0(ABD_SCATTER(abd).abd_offset); } - abd_update_scatter_stats(abd, ABDSTAT_INCR); - return (abd); } diff --git a/module/os/linux/zfs/vdev_os.c b/module/os/linux/zfs/vdev_os.c deleted file mode 100644 index 3bd7296da9..0000000000 --- a/module/os/linux/zfs/vdev_os.c +++ /dev/null @@ -1,49 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2022 by Triad National Security, LLC. - */ - -#include - -#ifdef _KERNEL - -int -param_set_direct_write_verify_pct(const char *buf, zfs_kernel_param_t *kp) -{ - uint_t val; - int error; - - error = kstrtouint(buf, 0, &val); - if (error < 0) - return (SET_ERROR(error)); - - if (val > 100) - return (SET_ERROR(-EINVAL)); - - error = param_set_uint(buf, kp); - if (error < 0) - return (SET_ERROR(error)); - - return (0); -} - -#endif /* _KERNEL */ diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index 75ce3b0d8f..43ff81a22b 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -469,8 +469,7 @@ zfs_uio_page_aligned(zfs_uio_t *uio) size_t skip = uio->uio_skip; for (int i = uio->uio_iovcnt; i > 0; iov++, i--) { - unsigned long addr = - (unsigned long)(iov->iov_base + skip); + uintptr_t addr = (uintptr_t)(iov->iov_base + skip); size_t size = iov->iov_len - skip; if ((addr & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { @@ -535,7 +534,7 @@ zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio) { ASSERT3P(uio->uio_dio.pages, !=, NULL); - for (int i = 0; i < uio->uio_dio.npages; i++) { + for (long i = 0; i < uio->uio_dio.npages; i++) { struct page *p = uio->uio_dio.pages[i]; lock_page(p); @@ -568,7 +567,7 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) ASSERT(uio->uio_extflg & UIO_DIRECT); ASSERT3P(uio->uio_dio.pages, !=, NULL); - for (int i = 0; i < uio->uio_dio.npages; i++) { + for (long i = 0; i < uio->uio_dio.npages; i++) { struct page *p = uio->uio_dio.pages[i]; if (IS_ZFS_MARKED_PAGE(p)) { @@ -588,27 +587,26 @@ 
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) * zfs_uio_iov_step() is just a modified version of the STEP function of Linux's * iov_iter_get_pages(). */ -static size_t -zfs_uio_iov_step(struct iovec v, zfs_uio_rw_t rw, zfs_uio_t *uio, int *numpages) +static int +zfs_uio_iov_step(struct iovec v, zfs_uio_rw_t rw, zfs_uio_t *uio, + long *numpages) { unsigned long addr = (unsigned long)(v.iov_base); size_t len = v.iov_len; - int n = DIV_ROUND_UP(len, PAGE_SIZE); + unsigned long n = DIV_ROUND_UP(len, PAGE_SIZE); - int res = zfs_get_user_pages( + long res = zfs_get_user_pages( P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n, rw == UIO_READ, &uio->uio_dio.pages[uio->uio_dio.npages]); if (res < 0) { - *numpages = -1; return (-res); } else if (len != (res * PAGE_SIZE)) { - *numpages = -1; - return (len); + return (EFAULT); } ASSERT3S(len, ==, res * PAGE_SIZE); *numpages = res; - return (len); + return (0); } static int @@ -623,7 +621,7 @@ zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw) for (int i = 0; i < uio->uio_iovcnt; i++) { struct iovec iov; - int numpages = 0; + long numpages = 0; if (iovp->iov_len == 0) { iovp++; @@ -632,13 +630,11 @@ zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw) } iov.iov_len = MIN(maxsize, iovp->iov_len - skip); iov.iov_base = iovp->iov_base + skip; - ssize_t left = zfs_uio_iov_step(iov, rw, uio, &numpages); + int error = zfs_uio_iov_step(iov, rw, uio, &numpages); - if (numpages == -1) { - return (left); - } + if (error) + return (SET_ERROR(error)); - ASSERT3U(left, ==, iov.iov_len); uio->uio_dio.npages += numpages; maxsize -= iov.iov_len; wanted -= left; @@ -656,9 +652,9 @@ zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) { size_t skip = uio->uio_skip; size_t wanted = uio->uio_resid - uio->uio_skip; - size_t rollback = 0; - size_t cnt; - size_t maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE); + ssize_t rollback = 0; + ssize_t cnt; + unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE); while (wanted) { 
#if defined(HAVE_IOV_ITER_GET_PAGES2) @@ -694,8 +690,8 @@ zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) #endif /* HAVE_VFS_IOV_ITER */ /* - * This function maps user pages into the kernel. In the event that the user - * pages were not mapped successfully an error value is returned. + * This function pins user pages. In the event that the user pages were not + * successfully pinned an error value is returned. * * On success, 0 is returned. */ @@ -703,26 +699,30 @@ int zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) { int error = 0; - size_t npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE); + long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE); size_t size = npages * sizeof (struct page *); if (uio->uio_segflg == UIO_USERSPACE) { uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); error = zfs_uio_get_dio_pages_iov(uio, rw); - ASSERT3S(uio->uio_dio.npages, ==, npages); #if defined(HAVE_VFS_IOV_ITER) } else if (uio->uio_segflg == UIO_ITER) { uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); error = zfs_uio_get_dio_pages_iov_iter(uio, rw); - ASSERT3S(uio->uio_dio.npages, ==, npages); #endif } else { return (SET_ERROR(EOPNOTSUPP)); } + ASSERT3S(uio->uio_dio.npages, >=, 0); + if (error) { + for (long i = 0; i < uio->uio_dio.npages; i++) + put_page(uio->uio_dio.pages[i]); vmem_free(uio->uio_dio.pages, size); return (error); + } else { + ASSERT3S(uio->uio_dio.npages, ==, npages); } if (rw == UIO_WRITE) { diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index d3fd4340e7..21bff54e9a 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -801,7 +801,7 @@ zpl_direct_IO_impl(void) * should call the direct_IO address_space_operations function. We set * this code path to be fatal if it is executed. 
*/ - VERIFY(0); + PANIC(0); return (0); } diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 77f7664fb2..dccbc0115b 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2569,17 +2569,13 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr) /* * Undirty a buffer in the transaction group referenced by the given - * transaction. Return whether this evicted the dbuf. + * transaction. Return whether this evicted the dbuf. */ boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - uint64_t txg; + uint64_t txg = tx->tx_txg; boolean_t brtwrite; - dbuf_dirty_record_t *dr; - - txg = tx->tx_txg; - dr = dbuf_find_dirty_eq(db, txg); ASSERT(txg != 0); @@ -2599,6 +2595,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * If this buffer is not dirty, we're done. */ + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) return (B_FALSE); ASSERT(dr->dr_dbuf == db); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index fa3eceb697..56af7c1298 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -6527,6 +6527,10 @@ ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW, "Rate Direct I/O write verify events to this many per second"); /* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify_pct, UINT, ZMOD_RW, + "Percentage of Direct I/O writes per top-level VDEV for checksum " + "verification to be performed"); + ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " "(do not set below ZED threshold)."); @@ -6553,9 +6557,4 @@ ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); - -ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, direct_write_verify_pct, - param_set_direct_write_verify_pct, param_get_uint, ZMOD_RW, - "Percentage of Direct I/O writes per top-level VDEV 
for checksum " - "verification to be performed"); /* END CSTYLED */ From b1ee3636750038b80dc142473cae3b7b5d888ac1 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Tue, 2 Jul 2024 16:09:45 -0600 Subject: [PATCH 4/7] Updating based on PR Feedback(2) Updating code base on PR code comments. I adjusted the following parts of the code base on the comments: 1. Updated zfs_check_direct_enabled() so it now just returns an error. This removed the need for the added enum and cleaned up the code. 2. Moved acquiring the rangelock from zfs_fillpage() out to zfs_getpage(). This cleans up the code and gets rid of the need to pass a boolean into zfs_fillpage() to conditionally gra the rangelock. 3. Cleaned up the code in both zfs_uio_get_dio_pages() and zfs_uio_get_dio_pages_iov(). There was no need to have wanted and maxsize as they were the same thing. Also, since the previous commit cleaned up the call to zfs_uio_iov_step() the code is much cleaner over all. 4. Removed dbuf_get_dirty_direct() function. 5. Unified dbuf_read() to account for both block clones and direct I/O writes. This removes redundant code from dbuf_read_impl() for grabbingthe BP. 6. Removed zfs_map_page() and zfs_unmap_page() declarations from Linux headers as those were never called. 
Signed-off-by: Brian Atkinson --- include/os/linux/zfs/sys/zfs_znode_impl.h | 6 -- include/sys/dbuf.h | 14 +-- include/sys/zfs_vnops.h | 8 +- module/os/freebsd/spl/spl_uio.c | 12 +-- module/os/freebsd/zfs/zfs_vnops_os.c | 22 ++-- module/os/linux/zfs/zfs_uio.c | 11 +- module/os/linux/zfs/zfs_vnops_os.c | 98 ++++++++---------- module/os/linux/zfs/zpl_file.c | 42 ++++---- module/zfs/dbuf.c | 118 +++++++++++----------- module/zfs/dmu_direct.c | 28 +++-- module/zfs/zfs_vnops.c | 19 ++-- 11 files changed, 166 insertions(+), 212 deletions(-) diff --git a/include/os/linux/zfs/sys/zfs_znode_impl.h b/include/os/linux/zfs/sys/zfs_znode_impl.h index 0be2c445ab..e028865189 100644 --- a/include/os/linux/zfs/sys/zfs_znode_impl.h +++ b/include/os/linux/zfs/sys/zfs_znode_impl.h @@ -184,12 +184,6 @@ extern int zfs_inode_alloc(struct super_block *, struct inode **ip); extern void zfs_inode_destroy(struct inode *); extern void zfs_mark_inode_dirty(struct inode *); extern boolean_t zfs_relatime_need_update(const struct inode *); - -#if defined(HAVE_UIO_RW) -extern caddr_t zfs_map_page(page_t *, enum seg_rw); -extern void zfs_unmap_page(page_t *, caddr_t); -#endif /* HAVE_UIO_RW */ - extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE]; #ifdef __cplusplus diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 7a2ba8ea0a..24112a9121 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -393,7 +393,7 @@ dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx); boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -blkptr_t *dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db); +int dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp); int dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, @@ -467,18 +467,6 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t 
txg) return (NULL); } -/* - * All Direct I/O writes happen in open context so the first dirty record will - * always be associated with the write. After a Direct I/O write completes the - * dirty records dr_overriden state will bet DR_OVERRIDDEN and the dr_data will - * get set to NULL. - */ -static inline dbuf_dirty_record_t * -dbuf_get_dirty_direct(dmu_buf_impl_t *db) -{ - return (list_head(&db->db_dirty_records)); -} - static inline boolean_t dbuf_dirty_is_direct_write(dmu_buf_impl_t *db, dbuf_dirty_record_t *dr) { diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index 8de71448e4..4fd9525138 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -29,12 +29,6 @@ extern int zfs_bclone_enabled; -typedef enum zfs_direct_enabled { - ZFS_DIRECT_IO_ERR, - ZFS_DIRECT_IO_DISABLED, - ZFS_DIRECT_IO_ENABLED -} zfs_direct_enabled_t; - extern int zfs_fsync(znode_t *, int, cred_t *); extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *); extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *); @@ -52,7 +46,7 @@ extern int mappedread(znode_t *, int, zfs_uio_t *); extern int mappedread_sf(znode_t *, int, zfs_uio_t *); extern void update_pages(znode_t *, int64_t, int, objset_t *); -extern zfs_direct_enabled_t zfs_check_direct_enabled(znode_t *, int, int *); +extern int zfs_check_direct_enabled(znode_t *, int, boolean_t *); extern int zfs_setup_direct(znode_t *, zfs_uio_t *, zfs_uio_rw_t, int *); /* diff --git a/module/os/freebsd/spl/spl_uio.c b/module/os/freebsd/spl/spl_uio.c index e752675d10..dabbc88aba 100644 --- a/module/os/freebsd/spl/spl_uio.c +++ b/module/os/freebsd/spl/spl_uio.c @@ -248,10 +248,7 @@ static int zfs_uio_get_dio_pages_impl(zfs_uio_t *uio) { const struct iovec *iovp = GET_UIO_STRUCT(uio)->uio_iov; - size_t wanted; - size_t maxsize = zfs_uio_resid(uio); - - wanted = maxsize; + size_t len = zfs_uio_resid(uio); for (int i = 0; i < zfs_uio_iovcnt(uio); i++) { struct iovec iov; @@ -261,7 +258,7 @@ 
zfs_uio_get_dio_pages_impl(zfs_uio_t *uio) iovp++; continue; } - iov.iov_len = MIN(maxsize, iovp->iov_len); + iov.iov_len = MIN(len, iovp->iov_len); iov.iov_base = iovp->iov_base; int error = zfs_uio_iov_step(iov, uio, &numpages); @@ -269,12 +266,11 @@ zfs_uio_get_dio_pages_impl(zfs_uio_t *uio) return (error); uio->uio_dio.npages += numpages; - maxsize -= iov.iov_len; - wanted -= left; + len -= iov.iov_len; iovp++; } - ASSERT0(wanted); + ASSERT0(len); return (0); } diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index d13db17516..91986cd433 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -4313,15 +4313,15 @@ zfs_freebsd_read(struct vop_read_args *ap) int error = 0; znode_t *zp = VTOZ(ap->a_vp); int ioflag = ioflags(ap->a_ioflag); + boolean_t is_direct; zfs_uio_init(&uio, ap->a_uio); - zfs_direct_enabled_t direct = - zfs_check_direct_enabled(zp, ioflag, &error); + error = zfs_check_direct_enabled(zp, ioflag, &is_direct); - if (direct == ZFS_DIRECT_IO_ERR) { + if (error) { return (error); - } else if (direct == ZFS_DIRECT_IO_ENABLED) { + } else if (is_direct) { error = zfs_freebsd_read_direct(zp, &uio, UIO_READ, ioflag, ap->a_cred); @@ -4362,9 +4362,6 @@ zfs_freebsd_read(struct vop_read_args *ap) } - ASSERT(direct == ZFS_DIRECT_IO_DISABLED || - (direct == ZFS_DIRECT_IO_ENABLED && error == EAGAIN)); - error = zfs_read(zp, &uio, ioflag, ap->a_cred); return (error); @@ -4409,15 +4406,15 @@ zfs_freebsd_write(struct vop_write_args *ap) int error = 0; znode_t *zp = VTOZ(ap->a_vp); int ioflag = ioflags(ap->a_ioflag); + boolean_t is_direct; zfs_uio_init(&uio, ap->a_uio); - zfs_direct_enabled_t direct = - zfs_check_direct_enabled(zp, ioflag, &error); + error = zfs_check_direct_enabled(zp, ioflag, &is_direct); - if (direct == ZFS_DIRECT_IO_ERR) { + if (error) { return (error); - } else if (direct == ZFS_DIRECT_IO_ENABLED) { + } else if (is_direct) { error = zfs_freebsd_write_direct(zp, 
&uio, UIO_WRITE, ioflag, ap->a_cred); @@ -4433,9 +4430,6 @@ zfs_freebsd_write(struct vop_write_args *ap) } - ASSERT(direct == ZFS_DIRECT_IO_DISABLED || - (direct == ZFS_DIRECT_IO_ENABLED && error == EAGAIN)); - error = zfs_write(zp, &uio, ioflag, ap->a_cred); return (error); diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index 43ff81a22b..9bb73811a6 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -614,10 +614,9 @@ zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw) { const struct iovec *iovp = uio->uio_iov; size_t skip = uio->uio_skip; - size_t wanted, maxsize; + size_t len = uio->uio_resid - skip; ASSERT(uio->uio_segflg != UIO_SYSSPACE); - wanted = maxsize = uio->uio_resid - skip; for (int i = 0; i < uio->uio_iovcnt; i++) { struct iovec iov; @@ -628,7 +627,7 @@ zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw) skip = 0; continue; } - iov.iov_len = MIN(maxsize, iovp->iov_len - skip); + iov.iov_len = MIN(len, iovp->iov_len - skip); iov.iov_base = iovp->iov_base + skip; int error = zfs_uio_iov_step(iov, rw, uio, &numpages); @@ -636,13 +635,13 @@ zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw) return (SET_ERROR(error)); uio->uio_dio.npages += numpages; - maxsize -= iov.iov_len; - wanted -= left; + len -= iov.iov_len; skip = 0; iovp++; } - ASSERT0(wanted); + ASSERT0(len); + return (0); } diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index a1c55b81dd..77e59a3ba2 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -228,8 +228,7 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) #if defined(_KERNEL) -static int zfs_fillpage(struct inode *ip, struct page *pp, - boolean_t rangelock_held); +static int zfs_fillpage(struct inode *ip, struct page *pp); /* * When a file is memory mapped, we must keep the IO data synchronized @@ -304,7 +303,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) * In this case 
we must try and fill the page. */ if (unlikely(!PageUptodate(pp))) { - error = zfs_fillpage(ip, pp, B_TRUE); + error = zfs_fillpage(ip, pp); if (error) { unlock_page(pp); put_page(pp); @@ -4009,66 +4008,19 @@ zfs_inactive(struct inode *ip) * Fill pages with data from the disk. */ static int -zfs_fillpage(struct inode *ip, struct page *pp, boolean_t rangelock_held) +zfs_fillpage(struct inode *ip, struct page *pp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t i_size = i_size_read(ip); u_offset_t io_off = page_offset(pp); size_t io_len = PAGE_SIZE; - zfs_locked_range_t *lr = NULL; ASSERT3U(io_off, <, i_size); if (io_off + io_len > i_size) io_len = i_size - io_off; - /* - * It is important to hold the rangelock here because it is possible - * a Direct I/O write might be taking place at the same time that a - * page is being faulted in through filemap_fault(). With a Direct I/O - * write, db->db_data will be set to NULL either in: - * 1. dmu_write_direct() -> dmu_buf_will_not_fill() -> - * dmu_buf_will_fill() -> dbuf_noread() -> dbuf_clear_data() - * 2. dmu_write_direct_done() - * If the rangelock is not held, then there is a race between faulting - * in a page and writing out a Direct I/O write. Without the rangelock - * a NULL pointer dereference can occur in dmu_read_impl() for - * db->db_data during the mempcy operation. - * - * Another important note here is we have to check to make sure the - * rangelock is not already held from mappedread() -> zfs_fillpage(). - * filemap_fault() will first add the page to the inode address_space - * mapping and then will drop the page lock. This leaves open a window - * for mappedread() to begin. In this case he page lock and rangelock, - * are both held and it might have to call here if the page is not - * up to date. In this case the rangelock can not be held twice or a - * deadlock can happen. So the rangelock only needs to be aquired if - * zfs_fillpage() is being called by zfs_getpage(). 
- * - * Finally it is also important to drop the page lock before grabbing - * the rangelock to avoid another deadlock between here and - * zfs_write() -> update_pages(). update_pages() holds both the - * rangelock and the page lock. - */ - if (rangelock_held == B_FALSE) { - /* - * First try grabbing the rangelock. If that can not be done - * the page lock must be dropped before grabbing the rangelock - * to avoid a deadlock with update_pages(). See comment above. - */ - lr = zfs_rangelock_tryenter(&zp->z_rangelock, io_off, io_len, - RL_READER); - if (lr == NULL) { - get_page(pp); - unlock_page(pp); - lr = zfs_rangelock_enter(&zp->z_rangelock, io_off, - io_len, RL_READER); - lock_page(pp); - put_page(pp); - } - } - void *va = kmap(pp); int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off, io_len, va, DMU_READ_PREFETCH); @@ -4088,10 +4040,6 @@ zfs_fillpage(struct inode *ip, struct page *pp, boolean_t rangelock_held) SetPageUptodate(pp); } - - if (rangelock_held == B_FALSE) - zfs_rangelock_exit(lr); - return (error); } @@ -4112,11 +4060,49 @@ zfs_getpage(struct inode *ip, struct page *pp) zfsvfs_t *zfsvfs = ITOZSB(ip); znode_t *zp = ITOZ(ip); int error; + loff_t i_size = i_size_read(ip); + u_offset_t io_off = page_offset(pp); + size_t io_len = PAGE_SIZE; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - error = zfs_fillpage(ip, pp, B_FALSE); + ASSERT3U(io_off, <, i_size); + + if (io_off + io_len > i_size) + io_len = i_size - io_off; + + /* + * It is important to hold the rangelock here because it is possible + * a Direct I/O write or block clone might be taking place at the same + * time that a page is being faulted in through filemap_fault(). With + * Direct I/O writes and block cloning db->db_data will be set to NULL + * with dbuf_clear_data() in dmu_buif_will_clone_or_dio(). If the + * rangelock is not held, then there is a race between faulting in a + * page and writing out a Direct I/O write or block cloning. 
Without + * the rangelock a NULL pointer dereference can occur in + * dmu_read_impl() for db->db_data during the mempcy operation when + * zfs_fillpage() calls dmu_read(). + */ + zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock, + io_off, io_len, RL_READER); + if (lr == NULL) { + /* + * It is important to drop the page lock before grabbing the + * rangelock to avoid another deadlock between here and + * zfs_write() -> update_pages(). update_pages() holds both the + * rangelock and the page lock. + */ + get_page(pp); + unlock_page(pp); + lr = zfs_rangelock_enter(&zp->z_rangelock, io_off, + io_len, RL_READER); + lock_page(pp); + put_page(pp); + } + error = zfs_fillpage(ip, pp); + zfs_rangelock_exit(lr); + if (error == 0) dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 21bff54e9a..8781e71847 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -387,14 +387,13 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) struct inode *ip = kiocb->ki_filp->f_mapping->host; struct file *filp = kiocb->ki_filp; int flags = filp->f_flags | zfs_io_flags(kiocb); - int error = 0; + boolean_t is_direct; - zfs_direct_enabled_t direct = - zfs_check_direct_enabled(ITOZ(ip), flags, &error); + int error = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct); - if (direct == ZFS_DIRECT_IO_ERR) { + if (error) { return (-error); - } else if (direct == ZFS_DIRECT_IO_ENABLED) { + } else if (is_direct) { ssize_t read = zpl_iter_read_direct(kiocb, to); if (read >= 0 || read != -EAGAIN) @@ -510,7 +509,7 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) struct file *filp = kiocb->ki_filp; int flags = filp->f_flags | zfs_io_flags(kiocb); size_t count = 0; - int error = 0; + boolean_t is_direct; ssize_t ret = zpl_generic_write_checks(kiocb, from, &count); if (ret) @@ -518,12 +517,11 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter 
*from) loff_t offset = kiocb->ki_pos; - zfs_direct_enabled_t direct = - zfs_check_direct_enabled(ITOZ(ip), flags, &error); + ret = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct); - if (direct == ZFS_DIRECT_IO_ERR) { - return (-error); - } else if (direct == ZFS_DIRECT_IO_ENABLED) { + if (ret) { + return (-ret); + } else if (is_direct) { ssize_t wrote = zpl_iter_write_direct(kiocb, from); if (wrote >= 0 || wrote != -EAGAIN) { @@ -638,18 +636,17 @@ zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, int flags = filp->f_flags | zfs_io_flags(kiocb); size_t count; ssize_t ret; - int error = 0; + boolean_t is_direct; ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (ret) return (ret); - zfs_direct_enabled_t direct = - zfs_check_direct_enabled(ITOZ(ip), flags, &error); + ret = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct); - if (direct == ZFS_DIRECT_IO_ERR) { - return (-error); - } else if (direct == ZFS_DIRECT_IO_ENABLED) { + if (ret) { + return (-ret); + } else if (is_direct) { ssize_t read = zpl_aio_read_direct(kiocb, iov, nr_segs, pos); if (read >= 0 || read != -EAGAIN) @@ -754,7 +751,7 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, size_t ocount; size_t count; ssize_t ret; - int error = 0; + boolean_t is_direct; ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); if (ret) @@ -768,12 +765,11 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, kiocb->ki_pos = pos; - zfs_direct_enabled_t direct = - zfs_check_direct_enabled(ITOZ(ip), flags, &error); + ret = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct); - if (direct == ZFS_DIRECT_IO_ERR) { - return (-error); - } else if (direct == ZFS_DIRECT_IO_ENABLED) { + if (ret) { + return (-ret); + } else if (is_direct) { ssize_t wrote = zpl_aio_write_direct(kiocb, iov, nr_segs, pos); if (wrote >= 0 || wrote != -EAGAIN) { diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index dccbc0115b..333ae7fc6c 100644 --- a/module/zfs/dbuf.c +++ 
b/module/zfs/dbuf.c @@ -1253,17 +1253,16 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(buf != NULL); - dbuf_dirty_record_t *dr_dio = NULL; db->db_buf = buf; - dr_dio = dbuf_get_dirty_direct(db); /* * If there is a Direct I/O, set its data too. Then its state * will be the same as if we did a ZIL dmu_sync(). */ - if (dbuf_dirty_is_direct_write(db, dr_dio)) { - dr_dio->dt.dl.dr_data = db->db_buf; + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); + if (dbuf_dirty_is_direct_write(db, dr)) { + dr->dt.dl.dr_data = db->db_buf; } ASSERT(buf->b_data != NULL); @@ -1594,7 +1593,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; - blkptr_t *bpp = bp; ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -1608,37 +1606,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, goto early_unlock; } - /* - * If we have a pending block clone, we don't want to read the - * underlying block, but the content of the block being cloned, - * pointed by the dirty record, so we have the most recent data. - * If there is no dirty record, then we hit a race in a sync - * process when the dirty record is already removed, while the - * dbuf is not yet destroyed. Such case is equivalent to uncached. - */ - if (db->db_state == DB_NOFILL) { - dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); - if (dr != NULL) { - if (!dr->dt.dl.dr_brtwrite) { - err = EIO; - goto early_unlock; - } - bpp = &dr->dt.dl.dr_overridden_by; - } - } - - err = dbuf_read_hole(db, dn, bpp); + err = dbuf_read_hole(db, dn, bp); if (err == 0) goto early_unlock; - ASSERT(bpp != NULL); + ASSERT(bp != NULL); /* * Any attempt to read a redacted block should result in an error. This * will never happen under normal conditions, but can be useful for * debugging purposes. 
*/ - if (BP_IS_REDACTED(bpp)) { + if (BP_IS_REDACTED(bp)) { ASSERT(dsl_dataset_feature_is_active( db->db_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); @@ -1653,9 +1632,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, * All bps of an encrypted os should have the encryption bit set. * If this is not true it indicates tampering and we report an error. */ - if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) { + if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) { spa_log_error(db->db_objset->os_spa, &zb, - BP_GET_LOGICAL_BIRTH(bpp)); + BP_GET_LOGICAL_BIRTH(bp)); err = SET_ERROR(EIO); goto early_unlock; } @@ -1666,7 +1645,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, if (!DBUF_IS_CACHEABLE(db)) aflags |= ARC_FLAG_UNCACHED; - else if (dbuf_is_l2cacheable(db, bpp)) + else if (dbuf_is_l2cacheable(db, bp)) aflags |= ARC_FLAG_L2CACHE; dbuf_add_ref(db, NULL); @@ -1674,7 +1653,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, zio_flags = (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED; - if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bpp)) + if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp)) zio_flags |= ZIO_FLAG_RAW; /* @@ -1684,7 +1663,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, * an l1 cache hit) we don't acquire the db_mtx while holding the * parent's rwlock, which would be a lock ordering violation. 
*/ - blkptr_t copy = *bpp; + blkptr_t copy = *bp; dmu_buf_unlock_parent(db, dblt, tag); return (arc_read(zio, db->db_objset->os_spa, ©, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, @@ -1856,24 +1835,33 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) } mutex_exit(&db->db_mtx); } else { - blkptr_t *bp = NULL; ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + blkptr_t *bp; /* - * If a Direct I/O write has occurred we will use the updated - * block pointer. + * If a block clone or Direct I/O write has occurred we will + * get the dirty records overridden BP so we get the most + * recent data.. */ - bp = dmu_buf_get_bp_from_dbuf(db); + err = dmu_buf_get_bp_from_dbuf(db, &bp); - if (pio == NULL && (db->db_state == DB_NOFILL || - (bp != NULL && !BP_IS_HOLE(bp)))) { - spa_t *spa = dn->dn_objset->os_spa; - pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); - need_wait = B_TRUE; + if (!err) { + if (pio == NULL && (db->db_state == DB_NOFILL || + (bp != NULL && !BP_IS_HOLE(bp)))) { + spa_t *spa = dn->dn_objset->os_spa; + pio = + zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + need_wait = B_TRUE; + } + + err = + dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG); + } else { + mutex_exit(&db->db_mtx); + dmu_buf_unlock_parent(db, dblt, FTAG); } - err = dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG); /* dbuf_read_impl drops db_mtx and parent's rwlock. */ miss = (db->db_state != DB_CACHED); } @@ -2756,31 +2744,39 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) /* * Normally the db_blkptr points to the most recent on-disk content for the - * dbuf (and anything newer will be cached in the dbuf). However, a recent - * Direct I/O write could leave newer content on disk and the dbuf uncached. - * In this case we must return the (as yet unsynced) pointer to the lastest - * on-disk content. + * dbuf (and anything newer will be cached in the dbuf). 
However, a pending + * block clone or not yet synced Direct I/O write will have a dirty record BP + * pointing to the most recent data. */ -blkptr_t * -dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db) +int +dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp) { ASSERT(MUTEX_HELD(&db->db_mtx)); + int error = 0; - if (db->db_level != 0) - return (db->db_blkptr); - - blkptr_t *bp = db->db_blkptr; - - dbuf_dirty_record_t *dr_dio = dbuf_get_dirty_direct(db); - if (dr_dio && dr_dio->dt.dl.dr_override_state == DR_OVERRIDDEN && - dr_dio->dt.dl.dr_data == NULL) { - ASSERT(db->db_state == DB_UNCACHED || - db->db_state == DB_NOFILL); - /* We have a Direct I/O write or cloned block, use it's BP */ - bp = &dr_dio->dt.dl.dr_overridden_by; + if (db->db_level != 0) { + *bp = db->db_blkptr; + return (0); } - return (bp); + *bp = db->db_blkptr; + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); + if (dr) { + if (db->db_state == DB_NOFILL) { + /* Block clone */ + if (!dr->dt.dl.dr_brtwrite) + error = EIO; + else + *bp = &dr->dt.dl.dr_overridden_by; + } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN && + dr->dt.dl.dr_data == NULL) { + ASSERT(db->db_state == DB_UNCACHED); + /* Direct I/O write */ + *bp = &dr->dt.dl.dr_overridden_by; + } + } + + return (error); } /* diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c index bf47335302..465b6e5772 100644 --- a/module/zfs/dmu_direct.c +++ b/module/zfs/dmu_direct.c @@ -152,7 +152,7 @@ dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx) ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa)); ASSERT3U(txg, >, spa_syncing_txg(os->os_spa)); - dr_head = dbuf_get_dirty_direct(db); + dr_head = list_head(&db->db_dirty_records); ASSERT3U(dr_head->dr_txg, ==, txg); dr_head->dr_accounted = db->db.db_size; @@ -260,6 +260,7 @@ dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size, dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; abd_t *mbuf; zbookmark_phys_t zb; + blkptr_t *bp; 
mutex_enter(&db->db_mtx); @@ -273,7 +274,11 @@ dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size, while (db->db_state == DB_READ) cv_wait(&db->db_changed, &db->db_mtx); - blkptr_t *bp = dmu_buf_get_bp_from_dbuf(db); + err = dmu_buf_get_bp_from_dbuf(db, &bp); + if (err) { + mutex_exit(&db->db_mtx); + goto error; + } /* * There is no need to read if this is a hole or the data is @@ -310,13 +315,13 @@ dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size, /* * The dbuf mutex (db_mtx) must be held when creating the ZIO * for the read. The BP returned from - * dmu_buf_get_bp_from_dbuf() could be from a previous Direct - * I/O write that is in the dbuf's dirty record. When - * zio_read() is called, zio_create() will make a copy of the - * BP. However, if zio_read() is called without the mutex - * being held then the dirty record from the dbuf could be - * freed in dbuf_write_done() resulting in garbage being set - * for the zio BP. + * dmu_buf_get_bp_from_dbuf() could be from a pending block + * clone or a yet to be synced Direct I/O write that is in the + * dbuf's dirty record. When zio_read() is called, zio_create() + * will make a copy of the BP. However, if zio_read() is called + * without the mutex being held then the dirty record from the + * dbuf could be freed in dbuf_write_done() resulting in garbage + * being set for the zio BP. 
*/ zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size, dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ, @@ -330,6 +335,11 @@ dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); return (zio_wait(rio)); + +error: + dmu_buf_rele_array(dbp, numbufs, FTAG); + (void) zio_wait(rio); + return (err); } #ifdef _KERNEL diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 2460805582..5fd6996215 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -202,25 +202,26 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) return (error); } -zfs_direct_enabled_t -zfs_check_direct_enabled(znode_t *zp, int ioflags, int *error) -{ - zfs_direct_enabled_t is_direct = ZFS_DIRECT_IO_DISABLED; +int +zfs_check_direct_enabled(znode_t *zp, int ioflags, boolean_t *is_direct) +{; zfsvfs_t *zfsvfs = ZTOZSB(zp); + *is_direct = B_FALSE; + int error; - if ((*error = zfs_enter(zfsvfs, FTAG)) != 0) - return (ZFS_DIRECT_IO_ERR); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); if (ioflags & O_DIRECT && zfsvfs->z_os->os_direct != ZFS_DIRECT_DISABLED) { - is_direct = ZFS_DIRECT_IO_ENABLED; + *is_direct = B_TRUE; } else if (zfsvfs->z_os->os_direct == ZFS_DIRECT_ALWAYS) { - is_direct = ZFS_DIRECT_IO_ENABLED; + *is_direct = B_TRUE; } zfs_exit(zfsvfs, FTAG); - return (is_direct); + return (0); } /* From 6e0ffaf6275b7115ee1f5543a5b55b8cc1b1a7c0 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Tue, 16 Jul 2024 10:55:59 -0600 Subject: [PATCH 5/7] Updating based on PR Feedback(3) 1. Unified the block cloning and Direct I/O code paths further. As part of this unification, it is important to outline that Direct I/O writes transition the db_state to DB_UNCACHED. This is used so that dbuf_unoverride() is called when dbuf_undirty() is called. This is needed to cleanup space accounting in a TXG. When a dbuf is redirtied through dbuf_redirty(), then dbuf_unoverride() is also called to clean up space accounting. 
This is a bit of a different approach that block cloning, which always calls dbuf_undirty(). 2. As part of uniying the two, Direct I/O also performs the same check in dmu_buf_will_fill() so that on failure the previous contents of the dbuf are set correctly. 3. General just code cleanup removing checks that are no longer necessary. Signed-off-by: Brian Atkinson --- include/sys/dbuf.h | 15 +---- module/zfs/dbuf.c | 120 +++++++++++++++------------------------- module/zfs/dmu_direct.c | 9 ++- module/zfs/zfs_vnops.c | 5 +- 4 files changed, 55 insertions(+), 94 deletions(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 24112a9121..56741cd2a5 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -176,6 +176,7 @@ typedef struct dbuf_dirty_record { uint8_t dr_copies; boolean_t dr_nopwrite; boolean_t dr_brtwrite; + boolean_t dr_diowrite; boolean_t dr_has_raw_params; /* @@ -467,20 +468,6 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg) return (NULL); } -static inline boolean_t -dbuf_dirty_is_direct_write(dmu_buf_impl_t *db, dbuf_dirty_record_t *dr) -{ - boolean_t ret = B_FALSE; - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (dr != NULL && db->db_level == 0 && !dr->dt.dl.dr_brtwrite && - dr->dt.dl.dr_override_state == DR_OVERRIDDEN && - dr->dt.dl.dr_data == NULL) { - ret = B_TRUE; - } - return (ret); -} - #define DBUF_GET_BUFC_TYPE(_db) \ (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 333ae7fc6c..0fe157e5c1 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1255,16 +1255,6 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) ASSERT(buf != NULL); db->db_buf = buf; - - /* - * If there is a Direct I/O, set its data too. Then its state - * will be the same as if we did a ZIL dmu_sync(). 
- */ - dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); - if (dbuf_dirty_is_direct_write(db, dr)) { - dr->dt.dl.dr_data = db->db_buf; - } - ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; } @@ -1843,7 +1833,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) /* * If a block clone or Direct I/O write has occurred we will * get the dirty records overridden BP so we get the most - * recent data.. + * recent data. */ err = dmu_buf_get_bp_from_dbuf(db, &bp); @@ -1948,13 +1938,14 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); - if (dr->dt.dl.dr_brtwrite) { + if (dr->dt.dl.dr_brtwrite || dr->dt.dl.dr_diowrite) { ASSERT0P(dr->dt.dl.dr_data); dr->dt.dl.dr_data = db->db_buf; } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; dr->dt.dl.dr_brtwrite = B_FALSE; + dr->dt.dl.dr_diowrite = B_FALSE; dr->dt.dl.dr_has_raw_params = B_FALSE; /* @@ -2161,26 +2152,11 @@ dbuf_redirty(dbuf_dirty_record_t *dr) */ dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT && - db->db_state != DB_NOFILL && db->db_buf != NULL) { - /* - * Already released on initial dirty, - * so just thaw. - */ + db->db_state != DB_NOFILL) { + /* Already released on initial dirty, so just thaw. */ ASSERT(arc_released(db->db_buf)); arc_buf_thaw(db->db_buf); } - /* - * If initial dirty was via Direct I/O, may not have a dr_data. - * - * If the dirty record was associated with cloned block then - * the call above to dbuf_unoverride() will have reset - * dr->dt.dl.dr_data and it will not be NULL here. 
- */ - if (dr->dt.dl.dr_data == NULL) { - ASSERT3B(dbuf_dirty_is_direct_write(db, dr), ==, - B_TRUE); - dr->dt.dl.dr_data = db->db_buf; - } } } @@ -2564,6 +2540,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { uint64_t txg = tx->tx_txg; boolean_t brtwrite; + boolean_t diowrite; ASSERT(txg != 0); @@ -2589,7 +2566,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(dr->dr_dbuf == db); brtwrite = dr->dt.dl.dr_brtwrite; + diowrite = dr->dt.dl.dr_diowrite; if (brtwrite) { + ASSERT3B(diowrite, ==, B_FALSE); /* * We are freeing a block that we cloned in the same * transaction group. @@ -2630,11 +2609,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (db->db_state != DB_NOFILL && !brtwrite) { dbuf_unoverride(dr); - /* - * In the Direct I/O case, the buffer is still dirty, but it - * may be UNCACHED, so we do not need to destroy an ARC buffer. - */ - if (dr->dt.dl.dr_data && dr->dt.dl.dr_data != db->db_buf) { + if (dr->dt.dl.dr_data != db->db_buf) { ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); arc_buf_destroy(dr->dt.dl.dr_data, db); @@ -2647,12 +2622,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_dirtycnt -= 1; if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - /* - * In the Direct I/O case our db_buf will be NULL as we are not - * caching in the ARC. - */ - ASSERT(db->db_state == DB_NOFILL || brtwrite || - db->db_buf == NULL || arc_released(db->db_buf)); + ASSERT(db->db_state == DB_NOFILL || brtwrite || diowrite || + arc_released(db->db_buf)); dbuf_destroy(db); return (B_TRUE); } @@ -2711,8 +2682,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we * want to make sure dbuf_read() will read the pending cloned block and * not the uderlying block that is being replaced. dbuf_undirty() will - * do dbuf_unoverride(), so we will end up with cloned block content, - * without overridden BP. 
+ * do brt_pending_remove() before removing the dirty record. */ (void) dbuf_read(db, NULL, flags); if (undirty) { @@ -2761,19 +2731,16 @@ dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp) *bp = db->db_blkptr; dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); - if (dr) { - if (db->db_state == DB_NOFILL) { - /* Block clone */ - if (!dr->dt.dl.dr_brtwrite) - error = EIO; - else - *bp = &dr->dt.dl.dr_overridden_by; - } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN && - dr->dt.dl.dr_data == NULL) { - ASSERT(db->db_state == DB_UNCACHED); - /* Direct I/O write */ + if (dr && db->db_state == DB_NOFILL) { + /* Block clone */ + if (!dr->dt.dl.dr_brtwrite) + error = EIO; + else + *bp = &dr->dt.dl.dr_overridden_by; + } else if (dr && db->db_state == DB_UNCACHED) { + /* Direct I/O write */ + if (dr->dt.dl.dr_diowrite) *bp = &dr->dt.dl.dr_overridden_by; - } } return (error); @@ -2929,21 +2896,28 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail) dmu_tx_private_ok(tx)); mutex_enter(&db->db_mtx); - if (db->db_state == DB_NOFILL) { + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); + if (db->db_state == DB_NOFILL || + (db->db_state == DB_UNCACHED && dr && dr->dt.dl.dr_diowrite)) { /* - * Block cloning: We will be completely overwriting a block - * cloned in this transaction group, so let's undirty the - * pending clone and mark the block as uncached. This will be - * as if the clone was never done. But if the fill can fail - * we should have a way to return back to the cloned data. + * If the fill can fail we should have a way to return back to + * the cloned or Direct I/O write data. 
*/ - if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) { + if (canfail && dr) { mutex_exit(&db->db_mtx); dmu_buf_will_dirty(db_fake, tx); return; } - VERIFY(!dbuf_undirty(db, tx)); - db->db_state = DB_UNCACHED; + /* + * Block cloning: We will be completely overwriting a block + * cloned in this transaction group, so let's undirty the + * pending clone and mark the block as uncached. This will be + * as if the clone was never done. + */ + if (dr && dr->dt.dl.dr_brtwrite) { + VERIFY(!dbuf_undirty(db, tx)); + db->db_state = DB_UNCACHED; + } } mutex_exit(&db->db_mtx); @@ -5085,6 +5059,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (dr->dt.dl.dr_data != NULL && dr->dt.dl.dr_data != db->db_buf) { ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE); + ASSERT3B(dr->dt.dl.dr_diowrite, ==, B_FALSE); arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { @@ -5146,9 +5121,7 @@ dbuf_write_override_done(zio_t *zio) if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_IS_HOLE(obp)) dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); - - if (dr->dt.dl.dr_data && dr->dt.dl.dr_data != db->db_buf) - arc_release(dr->dt.dl.dr_data, db); + arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); @@ -5355,14 +5328,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) * (by dmu_sync(), dmu_write_direct(), * or dmu_buf_write_embedded()). */ - blkptr_t *bp = &dr->dt.dl.dr_overridden_by; - abd_t *contents = NULL; - if (data) { - ASSERT(BP_IS_HOLE(bp) || - arc_buf_lsize(data) == BP_GET_LSIZE(bp)); - contents = abd_get_from_buf(data->b_data, - arc_buf_size(data)); - } + abd_t *contents = (data != NULL) ? 
+ abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, @@ -5371,8 +5338,9 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - zio_write_override(dr->dr_zio, bp, dr->dt.dl.dr_copies, - dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite); + zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, + dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite, + dr->dt.dl.dr_brtwrite); mutex_exit(&db->db_mtx); } else if (data == NULL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c index 465b6e5772..3d5fb23907 100644 --- a/module/zfs/dmu_direct.c +++ b/module/zfs/dmu_direct.c @@ -107,8 +107,12 @@ dmu_write_direct_done(zio_t *zio) ASSERT3U(zio->io_error, ==, EAGAIN); /* - * In the event of an I/O error the metaslab cleanup is taken - * care of in zio_done(). + * In the event of an I/O error this block has been freed in + * zio_done() through zio_dva_unallocate(). Calling + * dmu_sync_done() above set dr_override_state to + * DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls + * dbuf_unoverride(), it will skip doing zio_free() to free + * this block as that was already taken care of. 
 * * Since we are undirtying the record in open-context, we must * have a hold on the db, so it should never be evicted after @@ -154,6 +158,7 @@ dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx) dr_head = list_head(&db->db_dirty_records); ASSERT3U(dr_head->dr_txg, ==, txg); + dr_head->dt.dl.dr_diowrite = B_TRUE; dr_head->dr_accounted = db->db.db_size; blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 5fd6996215..4cf03abc5a 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1154,11 +1154,12 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, if (error == 0) { zgd->zgd_db = dbp; dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp; + boolean_t direct_write = B_FALSE; mutex_enter(&db->db_mtx); dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg); - boolean_t direct_write = - dbuf_dirty_is_direct_write(db, dr); + if (dr != NULL && dr->dt.dl.dr_diowrite) + direct_write = B_TRUE; mutex_exit(&db->db_mtx); /* From 71ce314930e037b94e3b73ee9f991e60d46aa781 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 22 Aug 2024 11:58:50 -0600 Subject: [PATCH 6/7] Updating based on PR Feedback(4) 1. When testing out installing a VM with virtual manager on Linux and a dataset with direct=always, there was an ASSERT failure in abd_alloc_from_pages(). Originally zfs_setup_direct() did an alignment check of the UIO using SPA_MINBLOCKSIZE with zfs_uio_aligned(). The idea behind this was maybe the page alignment restriction could be changed to use ashift as the alignment check in the future. However, this idea never came to be. The alignment restrictions for Direct I/O are based on PAGE_SIZE. Updating the check in zfs_setup_direct() for the UIO to use PAGE_SIZE fixed the issue. 2. Updated other alignment check in dmu_read_impl() to also use PAGE_SIZE. 3.
As a consequence of updating the UIO alignment checks the ZTS test case dio_unaligned_filesize began to fail. This is because there was no way to detect reading past the end of the file before issuing EINVAL in the ZPL and VOPs layers in FreeBSD. This was resolved by moving zfs_setup_direct() into zfs_write() and zfs_read(). This allows for other error checking to take place before checking any Direct I/O limitations. Updating the call site of zfs_setup_direct() did require a few changes to the logic in that function. In particular Direct I/O can just be avoided altogether depending on the checks in zfs_setup_direct() and there is no reason to return EAGAIN at all. 4. After moving zfs_setup_direct() into zfs_write() and zfs_read(), there was no reason to call zfs_check_direct_enabled() in the ZPL layer in Linux or in the VNOPS layer of FreeBSD. This function was completely removed. This allowed for much of the code in both those layers to return to their original code. 5. Updated the checksum verify module parameter for Direct I/O writes to only be a boolean and return EIO in the event a checksum verify failure occurs. By default, this module parameter is set to 1 for Linux and 0 for FreeBSD. The module parameter has been changed to zfs_vdev_direct_write_verify. There are still counters on the top-level VDEV for checksum verify failures, but this could be removed. It would still be good to leave the ZED event dio_verify for checksum failures as a notification that an application was manipulating the contents of a buffer after issuing that buffer for I/O using Direct I/O. As part of this change, man pages were updated, the ZTS test case dio_write_verify was updated, and all comments relating to the module parameter were updated as well. 6. Updated comments in dio_property ZTS test to properly reflect that stride_dd is being called with check_write and check_read.
Signed-off-by: Brian Atkinson --- include/sys/vdev_impl.h | 4 +- include/sys/zfs_vnops.h | 3 - man/man4/zfs.4 | 10 +- man/man8/zpool-events.8 | 6 +- man/man8/zpool-status.8 | 2 +- module/os/freebsd/zfs/zfs_vnops_os.c | 146 +------ module/os/linux/zfs/zpl_file.c | 410 +++--------------- module/zfs/dmu.c | 2 +- module/zfs/dmu_direct.c | 2 +- module/zfs/vdev.c | 16 +- module/zfs/zfs_vnops.c | 143 +++--- module/zfs/zio.c | 60 ++- tests/zfs-tests/cmd/manipulate_user_buffer.c | 44 +- tests/zfs-tests/cmd/stride_dd.c | 1 - tests/zfs-tests/include/tunables.cfg | 2 +- .../functional/direct/dio_aligned_block.ksh | 1 - .../functional/direct/dio_async_always.ksh | 1 - .../direct/dio_async_fio_ioengines.ksh | 1 - .../functional/direct/dio_compression.ksh | 1 - .../tests/functional/direct/dio_dedup.ksh | 1 - .../functional/direct/dio_encryption.ksh | 2 - .../functional/direct/dio_grow_block.ksh | 1 - .../functional/direct/dio_max_recordsize.ksh | 8 - .../tests/functional/direct/dio_mixed.ksh | 1 - .../tests/functional/direct/dio_mmap.ksh | 1 - .../functional/direct/dio_overwrites.ksh | 1 - .../tests/functional/direct/dio_property.ksh | 7 +- .../tests/functional/direct/dio_random.ksh | 1 - .../functional/direct/dio_recordsize.ksh | 8 - .../functional/direct/dio_unaligned_block.ksh | 1 - .../direct/dio_unaligned_filesize.ksh | 1 - .../direct/dio_write_stable_pages.ksh | 2 +- .../functional/direct/dio_write_verify.ksh | 70 +-- 33 files changed, 252 insertions(+), 708 deletions(-) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 54bdff611f..abd66b8abc 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -655,9 +655,9 @@ int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS); int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS); /* - * VDEV checksum verification precentage for Direct I/O writes + * VDEV checksum verification for Direct I/O writes */ -extern uint_t zfs_vdev_direct_write_verify_pct; +extern uint_t zfs_vdev_direct_write_verify; #ifdef 
__cplusplus } diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index 4fd9525138..e60b99bed1 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -46,9 +46,6 @@ extern int mappedread(znode_t *, int, zfs_uio_t *); extern int mappedread_sf(znode_t *, int, zfs_uio_t *); extern void update_pages(znode_t *, int64_t, int, objset_t *); -extern int zfs_check_direct_enabled(znode_t *, int, boolean_t *); -extern int zfs_setup_direct(znode_t *, zfs_uio_t *, zfs_uio_rw_t, int *); - /* * Platform code that asynchronously drops zp's inode / vnode_t. * diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index ab0ab5e716..5f3ad01a94 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -416,12 +416,10 @@ May be increased up to .Sy ASHIFT_MAX Po 16 Pc , but this may negatively impact pool space efficiency. . -.It Sy zfs_vdev_direct_write_verify_pct Ns = Ns Sy Linux 2 | FreeBSD 0 Pq uint +.It Sy zfs_vdev_direct_write_verify Ns = Ns Sy Linux 1 | FreeBSD 0 Pq uint If non-zero, then a Direct I/O write's checksum will be verified every -percentage (pct) of Direct I/O writes that are issued to a top-level VDEV -before it is committed and the block pointer is updated. -In the event the checksum is not valid then the I/O operation will be -redirected through the ARC. +time the write is issued and before it is committed to the block pointer. +In the event the checksum is not valid then the I/O operation will return EIO. This module parameter can be used to detect if the contents of the users buffer have changed in the process of doing a Direct I/O write. @@ -432,7 +430,7 @@ Each verify error causes a zevent. Direct Write I/O checkum verify errors can be seen with .Nm zpool Cm status Fl d . 
-The default value for this is 2 percent on Linux, but is 0 for +The default value for this is 1 on Linux, but is 0 for .Fx because user pages can be placed under write protection in .Fx diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index 77d44bd8ad..234612baea 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -100,14 +100,14 @@ The number of delay events is ratelimited by the module parameter. .It Sy dio_verify Issued when there was a checksum verify error after a Direct I/O write has been -issued and is redirected through the ARC. +issued. This event can only take place if the module parameter -.Sy zfs_vdev_direct_write_verify_pct +.Sy zfs_vdev_direct_write_verify is not set to zero. See .Xr zfs 4 for more details on the -.Sy zfs_vdev_direct_write_verify_pct +.Sy zfs_vdev_direct_write_verify module paramter. .It Sy config Issued every time a vdev change have been done to the pool. diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 923b99de30..868fc4414d 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -85,7 +85,7 @@ to set pool GUID as key for pool objects instead of pool names. Display the number of Direct I/O write checksum verify errors that have occured on a top-level VDEV. 
See -.Sx zfs_vdev_direct_write_verify_pct +.Sx zfs_vdev_direct_write_verify in .Xr zfs 4 for details about the conditions that can cause Direct I/O write checksum diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 91986cd433..5dbca10a3e 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -4274,29 +4274,6 @@ ioflags(int ioflags) return (flags); } -static int -zfs_freebsd_read_direct(znode_t *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, - int ioflag, cred_t *cr) -{ - int ret; - int flags = ioflag; - - ASSERT3U(rw, ==, UIO_READ); - - /* On error, return to fallback to the buffred path */ - ret = zfs_setup_direct(zp, uio, rw, &flags); - if (ret) - return (ret); - - ASSERT(uio->uio_extflg & UIO_DIRECT); - - ret = zfs_read(zp, uio, flags, cr); - - zfs_uio_free_dio_pages(uio, rw); - - return (ret); -} - #ifndef _SYS_SYSPROTO_H_ struct vop_read_args { struct vnode *a_vp; @@ -4311,85 +4288,37 @@ zfs_freebsd_read(struct vop_read_args *ap) { zfs_uio_t uio; int error = 0; - znode_t *zp = VTOZ(ap->a_vp); - int ioflag = ioflags(ap->a_ioflag); - boolean_t is_direct; - zfs_uio_init(&uio, ap->a_uio); - - error = zfs_check_direct_enabled(zp, ioflag, &is_direct); - - if (error) { - return (error); - } else if (is_direct) { - error = - zfs_freebsd_read_direct(zp, &uio, UIO_READ, ioflag, - ap->a_cred); - /* - * XXX We occasionally get an EFAULT for Direct I/O reads on - * FreeBSD 13. This still needs to be resolved. The EFAULT comes - * from: - * zfs_uio_get__dio_pages_alloc() -> - * zfs_uio_get_dio_pages_impl() -> - * zfs_uio_iov_step() -> - * zfs_uio_get_user_pages(). - * We return EFAULT from zfs_uio_iov_step(). When a Direct I/O - * read fails to map in the user pages (returning EFAULT) the - * Direct I/O request is broken up into two separate IO requests - * and issued separately using Direct I/O. 
- */ + error = zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), + ap->a_cred); + /* + * XXX We occasionally get an EFAULT for Direct I/O reads on + * FreeBSD 13. This still needs to be resolved. The EFAULT comes + * from: + * zfs_uio_get__dio_pages_alloc() -> + * zfs_uio_get_dio_pages_impl() -> + * zfs_uio_iov_step() -> + * zfs_uio_get_user_pages(). + * We return EFAULT from zfs_uio_iov_step(). When a Direct I/O + * read fails to map in the user pages (returning EFAULT) the + * Direct I/O request is broken up into two separate IO requests + * and issued separately using Direct I/O. + */ #ifdef ZFS_DEBUG - if (error == EFAULT) { + if (error == EFAULT && uio.uio_extflg & UIO_DIRECT) { #if 0 - printf("%s(%d): Direct I/O read returning EFAULT " - "uio = %p, zfs_uio_offset(uio) = %lu " - "zfs_uio_resid(uio) = %lu\n", - __FUNCTION__, __LINE__, &uio, zfs_uio_offset(&uio), - zfs_uio_resid(&uio)); + printf("%s(%d): Direct I/O read returning EFAULT " + "uio = %p, zfs_uio_offset(uio) = %lu " + "zfs_uio_resid(uio) = %lu\n", + __FUNCTION__, __LINE__, &uio, zfs_uio_offset(&uio), + zfs_uio_resid(&uio)); #endif - } - -#endif - - /* - * On error we will return unless the error is EAGAIN, which - * just tells us to fallback to the buffered path. 
- */ - if (error != EAGAIN) - return (error); - else - ioflag &= ~O_DIRECT; } - - error = zfs_read(zp, &uio, ioflag, ap->a_cred); - +#endif return (error); } -static int -zfs_freebsd_write_direct(znode_t *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, - int ioflag, cred_t *cr) -{ - int ret; - int flags = ioflag; - - ASSERT3U(rw, ==, UIO_WRITE); - - /* On error, return to fallback to the buffred path */ - ret = zfs_setup_direct(zp, uio, rw, &flags); - if (ret) - return (ret); - - ASSERT(uio->uio_extflg & UIO_DIRECT); - - ret = zfs_write(zp, uio, flags, cr); - - zfs_uio_free_dio_pages(uio, rw); - - return (ret); -} - #ifndef _SYS_SYSPROTO_H_ struct vop_write_args { struct vnode *a_vp; @@ -4403,36 +4332,9 @@ static int zfs_freebsd_write(struct vop_write_args *ap) { zfs_uio_t uio; - int error = 0; - znode_t *zp = VTOZ(ap->a_vp); - int ioflag = ioflags(ap->a_ioflag); - boolean_t is_direct; - zfs_uio_init(&uio, ap->a_uio); - - error = zfs_check_direct_enabled(zp, ioflag, &is_direct); - - if (error) { - return (error); - } else if (is_direct) { - error = - zfs_freebsd_write_direct(zp, &uio, UIO_WRITE, ioflag, - ap->a_cred); - - /* - * On error we will return unless the error is EAGAIN, which - * just tells us to fallback to the buffered path. 
- */ - if (error != EAGAIN) - return (error); - else - ioflag &= ~O_DIRECT; - - } - - error = zfs_write(zp, &uio, ioflag, ap->a_cred); - - return (error); + return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), + ap->a_cred)); } /* diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 8781e71847..62f772afef 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -309,7 +309,7 @@ zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, } static ssize_t -zpl_iter_read_buffered(struct kiocb *kiocb, struct iov_iter *to) +zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) { cred_t *cr = CRED(); fstrans_cookie_t cookie; @@ -322,15 +322,14 @@ zpl_iter_read_buffered(struct kiocb *kiocb, struct iov_iter *to) crhold(cr); cookie = spl_fstrans_mark(); - int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; - int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, - flags, cr); + ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); - if (error < 0) - return (error); + if (ret < 0) + return (ret); ssize_t read = count - uio.uio_resid; kiocb->ki_pos += read; @@ -340,71 +339,6 @@ zpl_iter_read_buffered(struct kiocb *kiocb, struct iov_iter *to) return (read); } -static ssize_t -zpl_iter_read_direct(struct kiocb *kiocb, struct iov_iter *to) -{ - cred_t *cr = CRED(); - struct file *filp = kiocb->ki_filp; - struct inode *ip = filp->f_mapping->host; - ssize_t count = iov_iter_count(to); - int flags = filp->f_flags | zfs_io_flags(kiocb); - zfs_uio_t uio; - ssize_t ret; - - zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0); - - /* On error, return to fallback to the buffered path. 
*/ - ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_READ, &flags); - if (ret) - return (-ret); - - ASSERT(uio.uio_extflg & UIO_DIRECT); - - crhold(cr); - fstrans_cookie_t cookie = spl_fstrans_mark(); - - int error = -zfs_read(ITOZ(ip), &uio, flags, cr); - - spl_fstrans_unmark(cookie); - crfree(cr); - - zfs_uio_free_dio_pages(&uio, UIO_READ); - - if (error < 0) - return (error); - - ssize_t read = count - uio.uio_resid; - kiocb->ki_pos += read; - - zpl_file_accessed(filp); - - return (read); -} - -static ssize_t -zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) -{ - struct inode *ip = kiocb->ki_filp->f_mapping->host; - struct file *filp = kiocb->ki_filp; - int flags = filp->f_flags | zfs_io_flags(kiocb); - boolean_t is_direct; - - int error = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct); - - if (error) { - return (-error); - } else if (is_direct) { - ssize_t read = zpl_iter_read_direct(kiocb, to); - - if (read >= 0 || read != -EAGAIN) - return (read); - - /* Otherwise fallback to buffered read */ - } - - return (zpl_iter_read_buffered(kiocb, to)); -} - static inline ssize_t zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from, size_t *countp) @@ -430,249 +364,57 @@ zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from, return (0); } -static ssize_t -zpl_iter_write_buffered(struct kiocb *kiocb, struct iov_iter *from) -{ - cred_t *cr = CRED(); - struct file *filp = kiocb->ki_filp; - struct inode *ip = filp->f_mapping->host; - size_t wrote; - size_t count = iov_iter_count(from); - - zfs_uio_t uio; - zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); - - crhold(cr); - fstrans_cookie_t cookie = spl_fstrans_mark(); - - int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; - int error = -zfs_write(ITOZ(ip), &uio, flags, cr); - - spl_fstrans_unmark(cookie); - crfree(cr); - - if (error < 0) - return (error); - - wrote = count - uio.uio_resid; - kiocb->ki_pos += wrote; - - if (wrote > 0) - 
iov_iter_advance(from, wrote); - - return (wrote); -} - -static ssize_t -zpl_iter_write_direct(struct kiocb *kiocb, struct iov_iter *from) -{ - cred_t *cr = CRED(); - struct file *filp = kiocb->ki_filp; - struct inode *ip = filp->f_mapping->host; - size_t wrote; - int flags = filp->f_flags | zfs_io_flags(kiocb); - size_t count = iov_iter_count(from); - - zfs_uio_t uio; - zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); - - /* On error, return to fallback to the buffered path. */ - ssize_t ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_WRITE, &flags); - if (ret) - return (-ret); - - ASSERT(uio.uio_extflg & UIO_DIRECT); - - crhold(cr); - fstrans_cookie_t cookie = spl_fstrans_mark(); - - int error = -zfs_write(ITOZ(ip), &uio, flags, cr); - - spl_fstrans_unmark(cookie); - crfree(cr); - - zfs_uio_free_dio_pages(&uio, UIO_WRITE); - - if (error < 0) - return (error); - - wrote = count - uio.uio_resid; - kiocb->ki_pos += wrote; - - return (wrote); -} - static ssize_t zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) { - struct inode *ip = kiocb->ki_filp->f_mapping->host; + cred_t *cr = CRED(); + fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; - int flags = filp->f_flags | zfs_io_flags(kiocb); + struct inode *ip = filp->f_mapping->host; + zfs_uio_t uio; size_t count = 0; - boolean_t is_direct; + ssize_t ret; - ssize_t ret = zpl_generic_write_checks(kiocb, from, &count); + ret = zpl_generic_write_checks(kiocb, from, &count); if (ret) return (ret); - loff_t offset = kiocb->ki_pos; + zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); - ret = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct); + crhold(cr); + cookie = spl_fstrans_mark(); - if (ret) { - return (-ret); - } else if (is_direct) { - ssize_t wrote = zpl_iter_write_direct(kiocb, from); + ret = -zfs_write(ITOZ(ip), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); - if (wrote >= 0 || wrote != -EAGAIN) { - return (wrote); - } + 
spl_fstrans_unmark(cookie); + crfree(cr); - /* - * If we are falling back to a buffered write, then the - * file position should not be updated at this point. - */ - ASSERT3U(offset, ==, kiocb->ki_pos); - } + if (ret < 0) + return (ret); - return (zpl_iter_write_buffered(kiocb, from)); + ssize_t wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; + + return (wrote); } #else /* !HAVE_VFS_RW_ITERATE */ -static ssize_t -zpl_aio_read_buffered(struct kiocb *kiocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - cred_t *cr = CRED(); - fstrans_cookie_t cookie; - struct file *filp = kiocb->ki_filp; - size_t count; - ssize_t ret; - - ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (ret) - return (ret); - - zfs_uio_t uio; - zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, - count, 0); - - crhold(cr); - cookie = spl_fstrans_mark(); - - int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; - int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, - flags, cr); - - spl_fstrans_unmark(cookie); - crfree(cr); - - if (error < 0) - return (error); - - ssize_t read = count - uio.uio_resid; - kiocb->ki_pos += read; - - zpl_file_accessed(filp); - - return (read); -} - -static ssize_t -zpl_aio_read_direct(struct kiocb *kiocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - cred_t *cr = CRED(); - fstrans_cookie_t cookie; - struct file *filp = kiocb->ki_filp; - struct inode *ip = filp->f_mapping->host; - int flags = filp->f_flags | zfs_io_flags(kiocb); - size_t count; - ssize_t ret; - - ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (ret) - return (ret); - - zfs_uio_t uio; - zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, - count, 0); - - /* On error, return to fallback to the buffered path */ - ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_READ, &flags); - if (ret) - return (-ret); - - ASSERT(uio.uio_extflg & UIO_DIRECT); - - crhold(cr); - cookie = 
spl_fstrans_mark(); - - int error = -zfs_read(ITOZ(ip), &uio, flags, cr); - - spl_fstrans_unmark(cookie); - crfree(cr); - - zfs_uio_free_dio_pages(&uio, UIO_READ); - - if (error < 0) - return (error); - - ssize_t read = count - uio.uio_resid; - kiocb->ki_pos += read; - - zpl_file_accessed(filp); - - return (read); -} - static ssize_t zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct inode *ip = kiocb->ki_filp->f_mapping->host; + cred_t *cr = CRED(); + fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; - int flags = filp->f_flags | zfs_io_flags(kiocb); size_t count; ssize_t ret; - boolean_t is_direct; ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (ret) return (ret); - ret = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct); - - if (ret) { - return (-ret); - } else if (is_direct) { - ssize_t read = zpl_aio_read_direct(kiocb, iov, nr_segs, pos); - - if (read >= 0 || read != -EAGAIN) - return (read); - - /* Otherwise fallback to buffered read */ - } - - return (zpl_aio_read_buffered(kiocb, iov, nr_segs, pos)); -} - -static ssize_t -zpl_aio_write_buffered(struct kiocb *kiocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - cred_t *cr = CRED(); - fstrans_cookie_t cookie; - struct file *filp = kiocb->ki_filp; - struct inode *ip = filp->f_mapping->host; - size_t count; - ssize_t ret; - - ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); - if (ret) - return (ret); - zfs_uio_t uio; zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, count, 0); @@ -680,110 +422,64 @@ zpl_aio_write_buffered(struct kiocb *kiocb, const struct iovec *iov, crhold(cr); cookie = spl_fstrans_mark(); - int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; - int error = -zfs_write(ITOZ(ip), &uio, flags, cr); + ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); - if 
(error < 0) - return (error); - - ssize_t wrote = count - uio.uio_resid; - kiocb->ki_pos += wrote; - - return (wrote); -} - -static ssize_t -zpl_aio_write_direct(struct kiocb *kiocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - cred_t *cr = CRED(); - fstrans_cookie_t cookie; - struct file *filp = kiocb->ki_filp; - struct inode *ip = filp->f_mapping->host; - int flags = filp->f_flags | zfs_io_flags(kiocb); - size_t count; - ssize_t ret; - - ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); - if (ret) + if (ret < 0) return (ret); - zfs_uio_t uio; - zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, - count, 0); + ssize_t read = count - uio.uio_resid; + kiocb->ki_pos += read; - /* On error, return to fallback to the buffered path. */ - ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_WRITE, &flags); - if (ret) - return (-ret); + zpl_file_accessed(filp); - ASSERT(uio.uio_extflg & UIO_DIRECT); - - crhold(cr); - cookie = spl_fstrans_mark(); - - int error = -zfs_write(ITOZ(ip), &uio, flags, cr); - - spl_fstrans_unmark(cookie); - crfree(cr); - - zfs_uio_free_dio_pages(&uio, UIO_WRITE); - - if (error < 0) - return (error); - - ssize_t wrote = count - uio.uio_resid; - kiocb->ki_pos += wrote; - - return (wrote); + return (read); } static ssize_t zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { + cred_t *cr = CRED(); + fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; struct inode *ip = filp->f_mapping->host; - int flags = filp->f_flags | zfs_io_flags(kiocb); - size_t ocount; size_t count; ssize_t ret; - boolean_t is_direct; - ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); if (ret) return (ret); - count = ocount; - - ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode)); + ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode)); if (ret) return (ret); 
kiocb->ki_pos = pos; - ret = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct); + zfs_uio_t uio; + zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, + count, 0); - if (ret) { - return (-ret); - } else if (is_direct) { - ssize_t wrote = zpl_aio_write_direct(kiocb, iov, nr_segs, pos); + crhold(cr); + cookie = spl_fstrans_mark(); - if (wrote >= 0 || wrote != -EAGAIN) { - return (wrote); - } + ret = -zfs_write(ITOZ(ip), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); - /* - * If we are falling back to a buffered write, then the - * file position should not be updated at this point. - */ - ASSERT3U(pos, ==, kiocb->ki_pos); - } + spl_fstrans_unmark(cookie); + crfree(cr); - return (zpl_aio_write_buffered(kiocb, iov, nr_segs, pos)); + if (ret < 0) + return (ret); + + ssize_t wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; + + return (wrote); } #endif /* HAVE_VFS_RW_ITERATE */ diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index db08b18431..ea7731b8d8 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1191,7 +1191,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, /* Allow Direct I/O when requested and properly aligned */ if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned(buf) && - zfs_dio_aligned(offset, size, SPA_MINBLOCKSIZE)) { + zfs_dio_aligned(offset, size, PAGESIZE)) { abd_t *data = abd_get_from_buf(buf, size); err = dmu_read_abd(dn, offset, size, data, flags); abd_free(data); diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c index 3d5fb23907..91a7fd8df4 100644 --- a/module/zfs/dmu_direct.c +++ b/module/zfs/dmu_direct.c @@ -104,7 +104,7 @@ dmu_write_direct_done(zio_t *zio) if (zio->io_error != 0) { if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) - ASSERT3U(zio->io_error, ==, EAGAIN); + ASSERT3U(zio->io_error, ==, EIO); /* * In the event of an I/O error this block has been freed in diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 56af7c1298..9305bd894d 100644 --- a/module/zfs/vdev.c +++ 
b/module/zfs/vdev.c @@ -159,14 +159,14 @@ uint_t zfs_vdev_max_auto_ashift = 14; uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; /* - * VDEV checksum verification percentage for Direct I/O writes. This is - * neccessary for Linux, because user pages can not be placed under write - * protection during Direct I/O writes. + * VDEV checksum verification for Direct I/O writes. This is necessary for + * Linux, because anonymous pages can not be placed under write protection + * during Direct I/O writes. */ #if !defined(__FreeBSD__) -uint_t zfs_vdev_direct_write_verify_pct = 2; +uint_t zfs_vdev_direct_write_verify = 1; #else -uint_t zfs_vdev_direct_write_verify_pct = 0; +uint_t zfs_vdev_direct_write_verify = 0; #endif void @@ -6527,9 +6527,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW, "Rate Direct I/O write verify events to this many per second"); /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify_pct, UINT, ZMOD_RW, - "Percentage of Direct I/O writes per top-level VDEV for checksum " - "verification to be performed"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW, + "Direct I/O writes will perform for checksum verification before " + "committing write"); ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 4cf03abc5a..bf81073a16 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -202,28 +202,6 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) return (error); } -int -zfs_check_direct_enabled(znode_t *zp, int ioflags, boolean_t *is_direct) -{; - zfsvfs_t *zfsvfs = ZTOZSB(zp); - *is_direct = B_FALSE; - int error; - - if ((error = zfs_enter(zfsvfs, FTAG)) != 0) - return (error); - - if (ioflags & O_DIRECT && - zfsvfs->z_os->os_direct != ZFS_DIRECT_DISABLED) { - *is_direct = B_TRUE; - } else if 
(zfsvfs->z_os->os_direct == ZFS_DIRECT_ALWAYS) { - *is_direct = B_TRUE; - } - - zfs_exit(zfsvfs, FTAG); - - return (0); -} - /* * Determine if Direct I/O has been requested (either via the O_DIRECT flag or * the "direct" dataset property). When inherited by the property only apply @@ -236,12 +214,11 @@ zfs_check_direct_enabled(znode_t *zp, int ioflags, boolean_t *is_direct) * synhronized with the ARC. * * It is possible that a file's pages could be mmap'ed after it is checked - * here. If so, that is handled according in zfs_read() and zfs_write(). See - * comments in the following two areas for how this handled: - * zfs_read() -> mappedread() + * here. If so, that is handled accordingly in zfs_write(). See comments in the + * following area for how this is handled: * zfs_write() -> update_pages() */ -int +static int zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, int *ioflagp) { @@ -250,49 +227,49 @@ zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, int ioflag = *ioflagp; int error = 0; - if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) - return (error); - - if (os->os_direct == ZFS_DIRECT_DISABLED) { - error = EAGAIN; + if (os->os_direct == ZFS_DIRECT_DISABLED || + zn_has_cached_data(zp, zfs_uio_offset(uio), + zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) { + /* + * Direct I/O is disabled or the region is mmap'ed. In either + * case the I/O request will just be directed through the ARC. + */ + ioflag &= ~O_DIRECT; goto out; - } else if (os->os_direct == ZFS_DIRECT_ALWAYS && zfs_uio_page_aligned(uio) && - zfs_uio_aligned(uio, SPA_MINBLOCKSIZE)) { + zfs_uio_aligned(uio, PAGE_SIZE)) { if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) || (rw == UIO_READ)) { ioflag |= O_DIRECT; } + } else if (os->os_direct == ZFS_DIRECT_ALWAYS && (ioflag & O_DIRECT)) { + /* + * Direct I/O was requested through the direct=always, but it + * is not properly PAGE_SIZE aligned. The request will be + * directed through the ARC. 
+ */ + ioflag &= ~O_DIRECT; } if (ioflag & O_DIRECT) { if (!zfs_uio_page_aligned(uio) || - !zfs_uio_aligned(uio, SPA_MINBLOCKSIZE)) { + !zfs_uio_aligned(uio, PAGE_SIZE)) { error = SET_ERROR(EINVAL); goto out; } - if (zn_has_cached_data(zp, zfs_uio_offset(uio), - zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) { - error = SET_ERROR(EAGAIN); + error = zfs_uio_get_dio_pages_alloc(uio, rw); + if (error) { goto out; } - - error = zfs_uio_get_dio_pages_alloc(uio, rw); - if (error) - goto out; - } else { - error = EAGAIN; - goto out; } IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT); ASSERT0(error); - *ioflagp = ioflag; out: - zfs_exit(zfsvfs, FTAG); + *ioflagp = ioflag; return (error); } @@ -380,8 +357,16 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) error = 0; goto out; } - ASSERT(zfs_uio_offset(uio) < zp->z_size); + + /* + * Setting up Direct I/O if requested. + */ + error = zfs_setup_direct(zp, uio, UIO_READ, &ioflag); + if (error) { + goto out; + } + #if defined(__linux__) ssize_t start_offset = zfs_uio_offset(uio); #endif @@ -424,22 +409,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) #endif if (zn_has_cached_data(zp, zfs_uio_offset(uio), zfs_uio_offset(uio) + nbytes - 1)) { - /* - * It is possible that a files pages have been mmap'ed - * since our check for Direct I/O reads and the read - * being issued. In this case, we will use the ARC to - * keep it synchronized with the page cache. In order - * to do this we temporarily remove the UIO_DIRECT - * flag. 
- */ - boolean_t uio_direct_mmap = B_FALSE; - if (uio->uio_extflg & UIO_DIRECT) { - uio->uio_extflg &= ~UIO_DIRECT; - uio_direct_mmap = B_TRUE; - } error = mappedread(zp, nbytes, uio); - if (uio_direct_mmap) - uio->uio_extflg |= UIO_DIRECT; } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); @@ -494,6 +464,12 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) out: zfs_rangelock_exit(lr); + /* + * Cleanup for Direct I/O if requested. + */ + if (uio->uio_extflg & UIO_DIRECT) + zfs_uio_free_dio_pages(uio, UIO_READ); + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); zfs_exit(zfsvfs, FTAG); return (error); @@ -631,6 +607,15 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) return (SET_ERROR(EINVAL)); } + /* + * Setting up Direct I/O if requested. + */ + error = zfs_setup_direct(zp, uio, UIO_WRITE, &ioflag); + if (error) { + zfs_exit(zfsvfs, FTAG); + return (SET_ERROR(error)); + } + /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. @@ -641,6 +626,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) return (SET_ERROR(EFAULT)); } + /* * If in append mode, set the io offset pointer to eof. */ @@ -676,6 +662,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } + if (zn_rlimit_fsize_uio(zp, uio)) { zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); @@ -896,15 +883,27 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } - /* - * There is a a window where a file's pages can be mmap'ed after - * the Direct I/O write has started. In this case we will still - * call update_pages() to make sure there is consistency - * between the ARC and the page cache. This is unfortunate + * There is a window where a file's pages can be mmap'ed after + * zfs_setup_direct() is called. 
This is due to the fact that + the rangelock in this function is acquired after calling + zfs_setup_direct(). This is done so that + zfs_uio_prefaultpages() does not attempt to fault in pages + on Linux for Direct I/O requests. This is not necessary as + the pages are pinned in memory and can not be faulted out. + Ideally, the rangelock would be held before calling + zfs_setup_direct() and zfs_uio_prefaultpages(); however, + this can lead to a deadlock as zfs_getpage() also acquires + the rangelock as a RL_WRITER and prefaulting the pages can + lead to zfs_getpage() being called. + + In the case of the pages being mapped after + zfs_setup_direct() is called, the call to update_pages() + will still be made to make sure there is consistency between + the ARC and the Linux page cache. This is an unfortunate * situation as the data will be read back into the ARC after - the Direct I/O write has completed, but this is the pentalty - for writing to a mmap'ed region of the file using O_DIRECT. + the Direct I/O write has completed, but this is the penalty + for writing to a mmap'ed region of a file using Direct I/O. */ if (tx_bytes && zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) { @@ -987,6 +986,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) zfs_znode_update_vfs(zp); zfs_rangelock_exit(lr); + /* + * Cleanup for Direct I/O if requested. + */ + if (uio->uio_extflg & UIO_DIRECT) + zfs_uio_free_dio_pages(uio, UIO_WRITE); + /* * If we're in replay mode, or we made no progress, or the * uio data is inaccessible return an error. 
Otherwise, it's diff --git a/module/zfs/zio.c b/module/zfs/zio.c index f4ada08a91..ae58f7704e 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -804,7 +804,7 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, ASSERT3U(*countp, >, 0); if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { - ASSERT3U(*errorp, ==, EAGAIN); + ASSERT3U(*errorp, ==, EIO); ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; } @@ -4521,13 +4521,12 @@ zio_vdev_io_assess(zio_t *zio) /* * If a Direct I/O write checksum verify error has occurred then this - * I/O should not attempt to be issued again. Instead the EAGAIN will - * be returned and this write will attempt to be issued through the - * ARC instead. + * I/O should not attempt to be issued again. Instead the EIO will + * be returned. */ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL); - ASSERT3U(zio->io_error, ==, EAGAIN); + ASSERT3U(zio->io_error, ==, EIO); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } @@ -4850,6 +4849,7 @@ static zio_t * zio_dio_checksum_verify(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); + int error; ASSERT3P(zio->io_vd, !=, NULL); ASSERT3P(zio->io_bp, !=, NULL); @@ -4858,38 +4858,28 @@ zio_dio_checksum_verify(zio_t *zio) ASSERT3B(pio->io_prop.zp_direct_write, ==, B_TRUE); ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); - if (zfs_vdev_direct_write_verify_pct == 0 || zio->io_error != 0) + if (zfs_vdev_direct_write_verify == 0 || zio->io_error != 0) goto out; - /* - * A Direct I/O write checksum verification will only be - * performed based on the top-level VDEV percentage for checks. 
- */ - uint32_t rand = random_in_range(100); - int error; + if ((error = zio_checksum_error(zio, NULL)) != 0) { + zio->io_error = error; + if (error == ECKSUM) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_dio_verify_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + zio->io_error = SET_ERROR(EIO); + zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; - if (rand < zfs_vdev_direct_write_verify_pct) { - if ((error = zio_checksum_error(zio, NULL)) != 0) { - zio->io_error = error; - if (error == ECKSUM) { - mutex_enter(&zio->io_vd->vdev_stat_lock); - zio->io_vd->vdev_stat.vs_dio_verify_errors++; - mutex_exit(&zio->io_vd->vdev_stat_lock); - zio->io_error = SET_ERROR(EAGAIN); - zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + /* + * The EIO error must be propagated up to the logical + * parent ZIO in zio_notify_parent() so it can be + * returned to dmu_write_abd(). + */ + zio->io_flags &= ~ZIO_FLAG_DONT_PROPAGATE; - /* - * The EAGAIN error must be propagated up to the - * logical parent ZIO in zio_notify_parent() so - * it can be returned to dmu_write_abd(). 
- */ - zio->io_flags &= ~ZIO_FLAG_DONT_PROPAGATE; - - (void) zfs_ereport_post( - FM_EREPORT_ZFS_DIO_VERIFY, - zio->io_spa, zio->io_vd, &zio->io_bookmark, - zio, 0); - } + (void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY, + zio->io_spa, zio->io_vd, &zio->io_bookmark, + zio, 0); } } @@ -5243,8 +5233,8 @@ zio_done(zio_t *zio) } if ((zio->io_error == EIO || !(zio->io_flags & - (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DIO_CHKSUM_ERR))) && + (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && + !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the diff --git a/tests/zfs-tests/cmd/manipulate_user_buffer.c b/tests/zfs-tests/cmd/manipulate_user_buffer.c index c195a197ad..7daa7d3084 100644 --- a/tests/zfs-tests/cmd/manipulate_user_buffer.c +++ b/tests/zfs-tests/cmd/manipulate_user_buffer.c @@ -41,6 +41,7 @@ static char *outputfile = NULL; static int blocksize = 131072; /* 128K */ +static int wr_err_expected = 0; static int numblocks = 100; static char *execname = NULL; static int print_usage = 0; @@ -56,28 +57,33 @@ static void usage(void) { (void) fprintf(stderr, - "usage %s -o outputfile [-b blocksize] [-n numblocks]\n" - " [-p randpattern] [-h help]\n" + "usage %s -o outputfile [-b blocksize] [-e wr_error_expected]\n" + " [-n numblocks] [-p randpattern] [-h help]\n" "\n" "Testing whether checksum verify works correctly for O_DIRECT.\n" "when manipulating the contents of a userspace buffer.\n" "\n" - " outputfile: File to write to.\n" - " blocksize: Size of each block to write (must be at \n" - " least >= 512).\n" - " numblocks: Total number of blocksized blocks to write.\n" - " randpattern: Fill data buffer with random data. 
Default \n" - " behavior is to fill the buffer with the \n" - " known data pattern (0xdeadbeef).\n" - " help: Print usage information and exit.\n" + " outputfile: File to write to.\n" + " blocksize: Size of each block to write (must be at \n" + " least >= 512).\n" + " wr_err_expected: Whether pwrite() is expected to return EIO\n" + " while manipulating the contents of the\n" + " buffer.\n" + " numblocks: Total number of blocksized blocks to\n" + " write.\n" + " randpattern: Fill data buffer with random data. Default\n" + " behavior is to fill the buffer with the \n" + " known data pattern (0xdeadbeef).\n" + " help: Print usage information and exit.\n" "\n" " Required parameters:\n" " outputfile\n" "\n" " Default Values:\n" - " blocksize -> 131072\n" - " numblocks -> 100\n" - " randpattern -> false\n", + " blocksize -> 131072\n" + " wr_err_expexted -> false\n" + " numblocks -> 100\n" + " randpattern -> false\n", execname); (void) exit(1); } @@ -91,12 +97,16 @@ parse_options(int argc, char *argv[]) extern int optind, optopt; execname = argv[0]; - while ((c = getopt(argc, argv, "b:hn:o:p")) != -1) { + while ((c = getopt(argc, argv, "b:ehn:o:p")) != -1) { switch (c) { case 'b': blocksize = atoi(optarg); break; + case 'e': + wr_err_expected = 1; + break; + case 'h': print_usage = 1; break; @@ -153,8 +163,10 @@ write_thread(void *arg) while (!args->entire_file_written) { wrote = pwrite(ofd, buf, blocksize, offset); if (wrote != blocksize) { - perror("write"); - exit(2); + if (wr_err_expected) + assert(errno == EIO); + else + exit(2); } offset = ((offset + blocksize) % total_data); diff --git a/tests/zfs-tests/cmd/stride_dd.c b/tests/zfs-tests/cmd/stride_dd.c index 19aab1c97f..e1e45794cf 100644 --- a/tests/zfs-tests/cmd/stride_dd.c +++ b/tests/zfs-tests/cmd/stride_dd.c @@ -212,7 +212,6 @@ read_entire_file(int ifd, int ofd, void *buf) } } - if (stride > 1) { if (lseek(ifd, (stride - 1) * bsize, SEEK_CUR) == -1) { perror("input lseek"); diff --git 
a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index d3c4a7d940..b41f54ba35 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -93,7 +93,7 @@ VDEV_FILE_LOGICAL_ASHIFT vdev.file.logical_ashift vdev_file_logical_ashift VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift VDEV_MAX_AUTO_ASHIFT vdev.max_auto_ashift zfs_vdev_max_auto_ashift VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count -VDEV_DIRECT_WR_VERIFY_PCT vdev.direct_write_verify_pct zfs_vdev_direct_write_verify_pct +VDEV_DIRECT_WR_VERIFY vdev.direct_write_verify zfs_vdev_direct_write_verify VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode diff --git a/tests/zfs-tests/tests/functional/direct/dio_aligned_block.ksh b/tests/zfs-tests/tests/functional/direct/dio_aligned_block.ksh index 4aac5edd8e..e26fbdfc25 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_aligned_block.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_aligned_block.ksh @@ -43,7 +43,6 @@ function cleanup { zfs set recordsize=$rs $TESTPOOL/$TESTFS log_must rm -f $tmp_file - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/direct/dio_async_always.ksh b/tests/zfs-tests/tests/functional/direct/dio_async_always.ksh index 3f26715fc3..27fd66ccd2 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_async_always.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_async_always.ksh @@ -44,7 +44,6 @@ function cleanup { zfs set direct=standard $TESTPOOL/$TESTFS rm $tmp_file - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify direct=always mixed small async requests" diff --git a/tests/zfs-tests/tests/functional/direct/dio_async_fio_ioengines.ksh b/tests/zfs-tests/tests/functional/direct/dio_async_fio_ioengines.ksh index 82d7d8250f..5492a5a905 
100755 --- a/tests/zfs-tests/tests/functional/direct/dio_async_fio_ioengines.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_async_fio_ioengines.ksh @@ -44,7 +44,6 @@ verify_runnable "global" function cleanup { log_must rm -f "$mntpnt/direct-*" - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } function check_fio_ioengine diff --git a/tests/zfs-tests/tests/functional/direct/dio_compression.ksh b/tests/zfs-tests/tests/functional/direct/dio_compression.ksh index 5be93d104d..5463715d7b 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_compression.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_compression.ksh @@ -46,7 +46,6 @@ function cleanup { log_must rm -f "$mntpnt/direct-*" log_must zfs set compression=off $TESTPOOL/$TESTFS - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify compression works using Direct I/O." diff --git a/tests/zfs-tests/tests/functional/direct/dio_dedup.ksh b/tests/zfs-tests/tests/functional/direct/dio_dedup.ksh index c703fcc05f..9de94dee6c 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_dedup.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_dedup.ksh @@ -45,7 +45,6 @@ function cleanup { log_must rm -f "$mntpnt/direct-*" log_must zfs set dedup=off $TESTPOOL/$TESTFS - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify deduplication works using Direct I/O." 
diff --git a/tests/zfs-tests/tests/functional/direct/dio_encryption.ksh b/tests/zfs-tests/tests/functional/direct/dio_encryption.ksh index 843b570d2d..b6faa11970 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_encryption.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_encryption.ksh @@ -59,6 +59,4 @@ for bs in "4k" "128k" "1m"; do done done -check_dio_write_chksum_verify_failures $TESTPOOL1 "stripe" 0 - log_pass "Verified encryption works using Direct I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_grow_block.ksh b/tests/zfs-tests/tests/functional/direct/dio_grow_block.ksh index c54d079366..12b2f21275 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_grow_block.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_grow_block.ksh @@ -41,7 +41,6 @@ function cleanup { zfs set recordsize=$rs $TESTPOOL/$TESTFS log_must rm -f $tmp_file - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify the number direct/buffered requests when growing a file" diff --git a/tests/zfs-tests/tests/functional/direct/dio_max_recordsize.ksh b/tests/zfs-tests/tests/functional/direct/dio_max_recordsize.ksh index 87900443ed..2c0ce832b1 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_max_recordsize.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_max_recordsize.ksh @@ -57,14 +57,6 @@ for type in "" "mirror" "raidz" "draid"; do; verify_dio_write_count $TESTPOOL1 $recsize $((4 * recsize)) \ $mntpnt - if [[ "$type" == "" ]]; then - check_dio_write_chksum_verify_failures $TESTPOOL1 \ - "stripe" 0 - else - check_dio_write_chksum_verify_failures $TESTPOOL1 \ - "$type" 0 - fi - destroy_pool $TESTPOOL1 done done diff --git a/tests/zfs-tests/tests/functional/direct/dio_mixed.ksh b/tests/zfs-tests/tests/functional/direct/dio_mixed.ksh index 38c6159537..6f217d91d5 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_mixed.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_mixed.ksh @@ -42,7 +42,6 @@ verify_runnable "global" 
function cleanup { log_must rm -f $src_file $new_file $tmp_file - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify mixed buffered and Direct I/O are coherent." diff --git a/tests/zfs-tests/tests/functional/direct/dio_mmap.ksh b/tests/zfs-tests/tests/functional/direct/dio_mmap.ksh index 27d03e0412..fbd6afd7b3 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_mmap.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_mmap.ksh @@ -45,7 +45,6 @@ function cleanup { zfs set recordsize=$rs $TESTPOOL/$TESTFS log_must rm -f "$tmp_file" - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify mixed Direct I/O and mmap I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_overwrites.ksh b/tests/zfs-tests/tests/functional/direct/dio_overwrites.ksh index 3854766ed8..04973fc886 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_overwrites.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_overwrites.ksh @@ -43,7 +43,6 @@ function cleanup { zfs set recordsize=$rs $TESTPOOL/$TESTFS log_must rm -f "$tmp_file" - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify Direct I/O overwrites" diff --git a/tests/zfs-tests/tests/functional/direct/dio_property.ksh b/tests/zfs-tests/tests/functional/direct/dio_property.ksh index 4fbcfec068..9e18f0bf78 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_property.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_property.ksh @@ -44,7 +44,6 @@ function cleanup { zfs set direct=standard $TESTPOOL/$TESTFS log_must rm -f $tmp_file - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify the direct=always|disabled|standard property" @@ -61,7 +60,8 @@ count=8 # # Check when "direct=always" any aligned IO is done as direct. -# Note that "flag=direct" is not set in the following calls to dd(1). +# Note that the "-D" and "-d" flags are not set in the following calls to +# stride_dd. 
# log_must zfs set direct=always $TESTPOOL/$TESTFS @@ -92,7 +92,8 @@ log_must rm -f $tmp_file # # Check when "direct=disabled" there are never any direct requests. -# Note that "flag=direct" is always set in the following calls to dd(1). +# Note that the "-D" and "-d" flags are always set in the following calls to +# stride_dd. # log_must zfs set direct=disabled $TESTPOOL/$TESTFS diff --git a/tests/zfs-tests/tests/functional/direct/dio_random.ksh b/tests/zfs-tests/tests/functional/direct/dio_random.ksh index 42c18d4261..abe8d5c0dc 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_random.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_random.ksh @@ -45,7 +45,6 @@ verify_runnable "global" function cleanup { log_must rm -f "$tmp_file" - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify randomly sized mixed Direct I/O and buffered I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_recordsize.ksh b/tests/zfs-tests/tests/functional/direct/dio_recordsize.ksh index e1087e5ac3..def4682213 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_recordsize.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_recordsize.ksh @@ -61,14 +61,6 @@ for type in "" "mirror" "raidz" "draid"; do done done - if [[ "$type" == "" ]]; then - check_dio_write_chksum_verify_failures $TESTPOOL1 \ - "stripe" 0 - else - check_dio_write_chksum_verify_failures $TESTPOOL1 \ - "$type" 0 - fi - destroy_pool $TESTPOOL1 done done diff --git a/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh b/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh index 9f50187149..309d35ea0e 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh @@ -44,7 +44,6 @@ function cleanup zfs set recordsize=$rs $TESTPOOL/$TESTFS zfs set direct=standard $TESTPOOL/$TESTFS log_must rm -f $tmp_file - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } 
log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh b/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh index 571767d3b1..8bb363f1a9 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh @@ -49,7 +49,6 @@ function cleanup { log_must rm -f "$filename" log_must set recordsize=$rs $TESTPOOL/$TESTFS - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify Direct I/O reads can read an entire file that is not \ diff --git a/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh b/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh index 5a5a5cf7ad..efc9ee6391 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh @@ -77,7 +77,7 @@ do # Manipulate the user's buffer while running O_DIRECT write # workload with the buffer. log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \ - -n $NUMBLOCKS -b $BS + -n $NUMBLOCKS -b $BS # Reading back the contents of the file log_must stride_dd -i $mntpnt/direct-write.iso -o /dev/null \ diff --git a/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh b/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh index a7e9dc0cde..536459a35e 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh @@ -33,7 +33,7 @@ # Verify checksum verify works for Direct I/O writes. # # STRATEGY: -# 1. Set the module parameter zfs_vdev_direct_write_verify_pct to 30. +# 1. Set the module parameter zfs_vdev_direct_write_verify to 0. # 2. Check that manipulating the user buffer while Direct I/O writes are # taking place does not cause any panics with compression turned on. # 3. 
Start a Direct I/O write workload while manipulating the user buffer @@ -42,7 +42,7 @@ # zpool status -d and checking for zevents. We also make sure there # are reported data errors when reading the file back. # 5. Repeat steps 3 and 4 for 3 iterations. -# 6. Set zfs_vdev_direct_write_verify_pct set to 1 and repeat 3. +# 6. Set zfs_vdev_direct_write_verify set to 1 and repeat 3. # 7. Verify there are Direct I/O write verify failures using # zpool status -d and checking for zevents. We also make sure there # there are no reported data errors when reading the file back because @@ -58,22 +58,22 @@ function cleanup log_must zpool clear $TESTPOOL # Clearing out dio_verify from event logs log_must zpool events -c - log_must set_tunable32 VDEV_DIRECT_WR_VERIFY_PCT 2 + log_must set_tunable32 VDEV_DIRECT_WR_VERIFY $DIO_WR_VERIFY_TUNABLE } log_assert "Verify checksum verify works for Direct I/O writes." if is_freebsd; then - log_unsupported "FeeBSD is capable of stable pages for O_DIRECT writes" + log_unsupported "FreeBSD is capable of stable pages for O_DIRECT writes" fi log_onexit cleanup ITERATIONS=3 NUMBLOCKS=300 -VERIFY_PCT=30 BS=$((128 * 1024)) # 128k mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +typeset DIO_WR_VERIFY_TUNABLE=$(get_tunable VDEV_DIRECT_WR_VERIFY) # Get a list of vdevs in our pool set -A array $(get_disklist_fullpath $TESTPOOL) @@ -82,7 +82,7 @@ set -A array $(get_disklist_fullpath $TESTPOOL) firstvdev=${array[0]} log_must zfs set recordsize=128k $TESTPOOL/$TESTFS -log_must set_tunable32 VDEV_DIRECT_WR_VERIFY_PCT $VERIFY_PCT +log_must set_tunable32 VDEV_DIRECT_WR_VERIFY 0 # First we will verify there are no panics while manipulating the contents of # the user buffer during Direct I/O writes with compression. 
The contents @@ -101,25 +101,21 @@ if [[ $total_dio_wr -lt 1 ]]; then log_fail "No Direct I/O writes $total_dio_wr" fi -log_must rm -f "$mntpnt/direct-write.iso" # Clearing out DIO counts for Zpool log_must zpool clear $TESTPOOL # Clearing out dio_verify from event logs log_must zpool events -c - - +log_must rm -f "$mntpnt/direct-write.iso" # Next we will verify there are checksum errors for Direct I/O writes while # manipulating the contents of the user pages. log_must zfs set compression=off $TESTPOOL/$TESTFS for i in $(seq 1 $ITERATIONS); do - log_note "Verifying 30% of Direct I/O write checksums iteration \ - $i of $ITERATIONS with \ - zfs_vdev_direct_write_verify_pct=$VERIFY_PCT" + log_note "Verifying Direct I/O write checksums iteration \ + $i of $ITERATIONS with zfs_vdev_direct_write_verify=0" prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) - prev_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \ -n $NUMBLOCKS -b $BS @@ -131,9 +127,7 @@ for i in $(seq 1 $ITERATIONS); do # Getting new Direct I/O and ARC write counts. 
curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) - curr_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) total_dio_wr=$((curr_dio_wr - prev_dio_wr)) - total_arc_wr=$((curr_arc_wr - prev_arc_wr)) # Verifying there are checksum errors log_note "Making sure there are checksum errors for the ZPool" @@ -144,23 +138,13 @@ for i in $(seq 1 $ITERATIONS); do log_fail "No checksum failures for ZPool $TESTPOOL" fi - # Getting checksum verify failures - verify_failures=$(get_zpool_status_chksum_verify_failures $TESTPOOL "raidz") - log_note "Making sure we have Direct I/O writes logged" if [[ $total_dio_wr -lt 1 ]]; then log_fail "No Direct I/O writes $total_dio_wr" fi - log_note "Making sure we have Direct I/O write checksum verifies with ZPool" - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 1 - - # In the event of checksum verify error, the write will be redirected - # through the ARC. We check here that we have ARC writes. - log_note "Making sure we have ARC writes have taken place in the event \ - a Direct I/O checksum verify failures occurred" - if [[ $total_arc_wr -lt $verify_failures ]]; then - log_fail "ARC writes $total_arc_wr < $verify_failures" - fi + log_note "Making sure we have no Direct I/O write checksum verifies \ + with ZPool" + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 log_must rm -f "$mntpnt/direct-write.iso" done @@ -168,19 +152,22 @@ done log_must zpool status -v $TESTPOOL log_must zpool sync $TESTPOOL + + # Finally we will verfiy that with checking every Direct I/O write we have no # errors at all. 
-VERIFY_PCT=100 -log_must set_tunable32 VDEV_DIRECT_WR_VERIFY_PCT $VERIFY_PCT +# Create the file before trying to manipulate the contents +log_must file_write -o create -f "$mntpnt/direct-write.iso" -b $BS \ + -c $NUMBLOCKS -w +log_must set_tunable32 VDEV_DIRECT_WR_VERIFY 1 for i in $(seq 1 $ITERATIONS); do log_note "Verifying every Direct I/O write checksums iteration $i of \ - $ITERATIONS with zfs_vdev_direct_write_verify_pct=$VERIFY_PCT" + $ITERATIONS with zfs_vdev_direct_write_verify=1" prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) - prev_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \ - -n $NUMBLOCKS -b $BS + -n $NUMBLOCKS -b $BS -e # Reading file back to verify there no are checksum errors filesize=$(get_file_size "$mntpnt/direct-write.iso") @@ -190,16 +177,11 @@ for i in $(seq 1 $ITERATIONS); do # Getting new Direct I/O and ARC Write counts. curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) - curr_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) total_dio_wr=$((curr_dio_wr - prev_dio_wr)) - total_arc_wr=$((curr_arc_wr - prev_arc_wr)) log_note "Making sure there are no checksum errors with the ZPool" log_must check_pool_status $TESTPOOL "errors" "No known data errors" - # Geting checksum verify failures - verify_failures=$(get_zpool_status_chksum_verify_failures $TESTPOOL "raidz") - log_note "Making sure we have Direct I/O writes logged" if [[ $total_dio_wr -lt 1 ]]; then log_fail "No Direct I/O writes $total_dio_wr" @@ -207,16 +189,8 @@ for i in $(seq 1 $ITERATIONS); do log_note "Making sure we have Direct I/O write checksum verifies with ZPool" check_dio_write_chksum_verify_failures "$TESTPOOL" "raidz" 1 - - # In the event of checksum verify error, the write will be redirected - # through the ARC. We check here that we have ARC writes. 
- log_note "Making sure we have ARC writes have taken place in the event \ - a Direct I/O checksum verify failures occurred" - if [[ $total_arc_wr -lt $verify_failures ]]; then - log_fail "ARC writes $total_arc_wr < $verify_failures" - fi - - log_must rm -f "$mntpnt/direct-write.iso" done +log_must rm -f "$mntpnt/direct-write.iso" + log_pass "Verified checksum verify works for Direct I/O writes." From 72f674a22bf802d2ecc3390e63dd03507f1ba7f4 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Wed, 28 Aug 2024 14:38:23 -0600 Subject: [PATCH 7/7] Updating based on PR Feedback(5) 1. Added new module parameter zfs_dio_enabled which allows for all reads and writes to pass through the ARC. This module parameter can be set to 0 by default in OpenZFS 2.3 release if necessary. 2. Updated ZTS direct tests to account for the new zfs_dio_enabled module parameter. 3. Updated libzfs.abi to account for changes. Signed-off-by: Brian Atkinson --- lib/libzfs/libzfs.abi | 6919 ++++++++++++----- man/man4/zfs.4 | 8 + module/zfs/zfs_vnops.c | 13 +- tests/zfs-tests/include/tunables.cfg | 1 + .../tests/functional/direct/cleanup.ksh | 8 +- .../tests/functional/direct/setup.ksh | 5 + 6 files changed, 4975 insertions(+), 1979 deletions(-) diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 3eac5f504a..e647e7aa88 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -7,12 +7,9 @@ - - + - - @@ -630,192 +627,61 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + - - - - - - - + + + + + + - - - + + + + + + + + + - - - - - - - - - - - - - - - - + + + - - - + + + + - - - - - - @@ -860,11 +726,54 @@ + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -876,6 +785,11 @@ + + + + + @@ -886,40 +800,46 @@ 
+ + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - @@ -972,11 +892,6 @@ - - - - - @@ -997,11 +912,6 @@ - - - - - @@ -1204,11 +1114,6 @@ - - - - - @@ -1232,16 +1137,20 @@ - - - - - + + + + + + + + + @@ -1249,6 +1158,10 @@ + + + + @@ -1359,266 +1272,59 @@ - - - - - + + + + + + + + + + + - - - - + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + - - - - + + + - - - - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + - + + + + + @@ -1637,27 +1343,48 @@ + + + - - - - - - - + - + - + - + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1672,10 +1399,10 @@ - + - + @@ -1693,30 +1420,11 @@ - - - - - - - - - + - - - - + - - - - - - - - @@ -1738,6 +1446,185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1791,61 +1678,12 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1862,6 +1700,7 @@ + @@ -1869,16 +1708,29 @@ + + + + + + + + + + + + + + - @@ -1955,6 +1807,123 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1973,15 +1942,23 @@ + + + + + + + + @@ -1998,15 +1975,6 @@ - - - - - - - - - 
@@ -2033,6 +2001,15 @@ + + + + + + + + + @@ -2220,27 +2197,670 @@ - + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -2271,16 +2891,47 @@ - - - + + + + + + + - - - - + + + + + + + + + + + + + + + + - + + + + + + + + + + + + + + + + @@ -2288,9 +2939,715 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -2335,8 +3692,28 @@ + + + + + + + + + + + + + + + + + + + + @@ -2374,113 +3751,12 @@ - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - + @@ -2498,22 +3774,36 @@ - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + @@ -2528,19 +3818,669 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + + + + + + + + + + + + + + + + + + + + - + - + + + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -2554,28 +4494,10 @@ - - - - - - - - - - - - - - - - - - @@ -2612,17 +4534,6 @@ - - - - - - - - - - - @@ -2664,22 +4575,6 @@ - - - - - - - - - - - - - - - - @@ -2717,10 +4612,6 @@ - - - - @@ -2733,12 +4624,6 @@ - - - - - - @@ -2918,6 +4803,90 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -2927,6 +4896,125 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -2935,8 +5023,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + @@ -2960,12 +5072,111 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -2992,19 +5203,39 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - @@ -3014,28 +5245,12 @@ - - - - - - - - - - - - - - - - @@ -3074,13 +5289,6 @@ - - - - - - - @@ -3088,22 +5296,68 @@ - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + @@ -3116,31 +5370,245 @@ - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - + + + - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -3150,42 +5618,15 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -3219,6 +5660,7 @@ + @@ -3246,6 +5688,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -3255,51 +5734,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -3359,6 +5793,21 @@ + + + + + + + + + + + + + + + @@ -3412,6 +5861,15 @@ + + + + + + + + + @@ -3429,7 +5887,6 @@ -<<<<<<< HEAD @@ -3538,8 +5995,6 @@ -======= ->>>>>>> ccf1a36dc (Adding Direct IO Support) @@ -3560,7 +6015,6 @@ -<<<<<<< HEAD @@ -3798,6 +6252,10 @@ + + + + @@ -3936,6 +6394,11 @@ + + + + + @@ -3948,17 +6411,6 @@ - - - - - - - - - - - @@ -3969,6 +6421,11 @@ + + + + + @@ -3988,22 +6445,10 @@ -======= - - - - ->>>>>>> ccf1a36dc (Adding Direct IO Support) - - - - - - @@ -4017,15 +6462,6 @@ - - - - - - - - - @@ -4053,31 +6489,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - @@ -4463,6 +6874,39 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -4566,10 +7010,792 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -4624,66 +7850,22 @@ + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -4692,10 +7874,24 @@ + + + + + + + + + + + + + + + - @@ -4764,10 +7960,106 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + + + + + + + + -<<<<<<< HEAD @@ -4792,10 +8084,6 @@ - - - - @@ -4863,6 +8151,11 @@ + + + + + @@ -4880,9 +8173,26 @@ - + + + + + + + + + + + + + + + + + + + - @@ -4919,6 +8229,12 @@ + + + + + + @@ -4958,31 +8274,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - @@ -4990,8 +8281,6 @@ -======= ->>>>>>> ccf1a36dc (Adding Direct IO Support) @@ -5004,12 +8293,6 @@ - - - - - - @@ -5042,10 +8325,6 @@ - - - - @@ -5053,18 +8332,6 @@ - - - - - - - - - - - 
- @@ -5106,12 +8373,6 @@ - - - - - - @@ -5139,19 +8400,6 @@ -<<<<<<< HEAD -======= - - - - - - - - - - ->>>>>>> ccf1a36dc (Adding Direct IO Support) @@ -5193,14 +8441,25 @@ - - - - - + + + + + + + + + + + + + + + + @@ -5220,274 +8479,210 @@ - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - + + - + - - + + - - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - @@ -5496,19 +8691,36 @@ - - - - + + + + + + + + - - - + + + - - + + + + + + + + + + + + + + + @@ -5521,29 +8733,129 @@ - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - @@ -5556,11 +8868,6 @@ - - - - - @@ -5571,55 +8878,49 @@ - - - - + + + + + + + + + + + + + + + + + + + + - - - - - -<<<<<<< HEAD -======= - - - - - ->>>>>>> ccf1a36dc (Adding Direct IO Support) - - - - - - - - - - - - + + + @@ -5677,19 +8978,88 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - @@ -5702,8 +9072,19 @@ + + + + + + + + + + + @@ -5712,6 +9093,15 @@ + + + + + + + + + @@ -5723,12 
+9113,6 @@ - - - - - - @@ -5741,12 +9125,6 @@ - - - - - - @@ -5790,20 +9168,18 @@ - + + + + + + + - - - - - - - - @@ -5815,19 +9191,6 @@ - - - - - - - - - - - - - @@ -5838,18 +9201,6 @@ - - - - - - - - - - - - @@ -5857,16 +9208,6 @@ - - - - - - - - - - @@ -5887,30 +9228,10 @@ - - - - - - - - - - - - - - - - - - - - @@ -5925,54 +9246,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -6015,6 +9288,24 @@ + + + + + + + + + + + + + + + + + + @@ -6027,27 +9318,36 @@ - + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - @@ -6059,32 +9359,23 @@ - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + @@ -6094,11 +9385,6 @@ - - - - - @@ -6113,6 +9399,9 @@ + + + @@ -6178,6 +9467,16 @@ + + + + + + + + + + @@ -6195,9 +9494,6 @@ - - - @@ -6213,12 +9509,6 @@ - - - - - - @@ -6299,12 +9589,21 @@ + + - + + + + + + + + @@ -6346,12 +9645,6 @@ - - - - - - @@ -6359,24 +9652,6 @@ - - - - - - - - - - - - - - - - - - @@ -6462,24 +9737,6 @@ - - - - - - - - - - - - - - - - - - @@ -6491,22 +9748,6 @@ - - - - - - - - - - - - - - - - @@ -6525,124 +9766,100 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - + + + - + + - - - + + + + + + + + + + + + + + + + - - - + + + + + + + + + + - - - + + + + + + + + + + + - - - + + + + + + + + + + + - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -6650,64 +9867,15 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -6726,60 +9894,12 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -6800,53 +9920,12 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -6866,122 +9945,8 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 5f3ad01a94..98564c384c 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -291,6 +291,14 @@ Default dnode block size as a power of 2. .It Sy zfs_default_ibs Ns = Ns Sy 17 Po 128 KiB Pc Pq int Default dnode indirect block size as a power of 2. . +.It Sy zfs_dio_enabled Ns = Ns Sy 0 Ns | Ns 1 Pq int +Enable Direct I/O. +If this setting is 0, then all I/O requests will be directed through the ARC +acting as though the dataset property +.Sy direct +was set to +.Sy disabled . +. .It Sy zfs_history_output_max Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 When attempting to log an output nvlist of an ioctl in the on-disk history, the output will not be stored if it is larger than this size (in bytes). diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index bf81073a16..af07c96af9 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -74,6 +74,14 @@ int zfs_bclone_enabled = 1; */ static int zfs_bclone_wait_dirty = 0; +/* + * Enable Direct I/O. If this setting is 0, then all I/O requests will be + * directed through the ARC acting as though the dataset property direct was + * set to disabled. + */ +static int zfs_dio_enabled = 1; + + /* * Maximum bytes to read per chunk in zfs_read(). 
*/ @@ -227,7 +235,7 @@ zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, int ioflag = *ioflagp; int error = 0; - if (os->os_direct == ZFS_DIRECT_DISABLED || + if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED || zn_has_cached_data(zp, zfs_uio_offset(uio), zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) { /* @@ -1805,3 +1813,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW, "Wait for dirty blocks when cloning"); + +ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW, + "Enable Direct I/O"); diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index b41f54ba35..9f436eb402 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -101,6 +101,7 @@ VOL_RECURSIVE vol.recursive UNSUPPORTED VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq BCLONE_ENABLED bclone_enabled zfs_bclone_enabled BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty +DIO_ENABLED dio_enabled zfs_dio_enabled XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max diff --git a/tests/zfs-tests/tests/functional/direct/cleanup.ksh b/tests/zfs-tests/tests/functional/direct/cleanup.ksh index 382e9b1734..75fe97f923 100755 --- a/tests/zfs-tests/tests/functional/direct/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/direct/cleanup.ksh @@ -28,4 +28,10 @@ verify_runnable "global" -default_cleanup +default_cleanup_noexit + +if tunable_exists DIO_ENABLED ; then + log_must restore_tunable DIO_ENABLED +fi + +log_pass diff --git a/tests/zfs-tests/tests/functional/direct/setup.ksh b/tests/zfs-tests/tests/functional/direct/setup.ksh index 5ce95dddf4..f66d6531c1 100755 --- a/tests/zfs-tests/tests/functional/direct/setup.ksh +++ b/tests/zfs-tests/tests/functional/direct/setup.ksh @@ -27,6 +27,11 @@ . 
$STF_SUITE/include/libtest.shlib verify_runnable "global" +if tunable_exists DIO_ENABLED ; then + log_must save_tunable DIO_ENABLED + log_must set_tunable32 DIO_ENABLED 1 +fi + default_raidz_setup_noexit "$DISKS" log_must zfs set compression=off $TESTPOOL/$TESTFS log_pass