From 71ce314930e037b94e3b73ee9f991e60d46aa781 Mon Sep 17 00:00:00 2001
From: Brian Atkinson
Date: Thu, 22 Aug 2024 11:58:50 -0600
Subject: [PATCH] Updating based on PR Feedback(4)

1. When testing a VM install with virtual manager on Linux against a dataset
   with direct=always, there was an ASSERT failure in abd_alloc_from_pages().
   Originally zfs_setup_direct() did an alignment check of the UIO using
   SPA_MINBLOCKSIZE with zfs_uio_aligned(). The idea behind this was that the
   page alignment restriction could perhaps be changed to use ashift as the
   alignment check in the future. However, this idea never came to be. The
   alignment restrictions for Direct I/O are based on PAGE_SIZE. Updating the
   UIO check in zfs_setup_direct() to use PAGE_SIZE fixed the issue.
2. Updated the other alignment check in dmu_read_impl() to also use
   PAGE_SIZE.
3. As a consequence of updating the UIO alignment checks, the ZTS test case
   dio_unaligned_filesize began to fail. This is because there was no way to
   detect reading past the end of the file before issuing EINVAL in the ZPL
   layer on Linux and the VOPs layer on FreeBSD. This was resolved by moving
   zfs_setup_direct() into zfs_write() and zfs_read(), which allows the other
   error checking to take place before checking any Direct I/O limitations.
   Updating the call site of zfs_setup_direct() required a few changes to the
   logic in that function. In particular, Direct I/O can now simply be
   avoided altogether depending on the checks in zfs_setup_direct(), so there
   is no reason to return EAGAIN at all.
4. After moving zfs_setup_direct() into zfs_write() and zfs_read(), there was
   no reason to call zfs_check_direct_enabled() in the ZPL layer in Linux or
   in the VNOPS layer of FreeBSD. This function was completely removed, which
   allowed much of the code in both of those layers to return to its original
   form.
5. Updated the checksum verify module parameter for Direct I/O writes to be a
   boolean only, returning EIO in the event a checksum verify failure occurs.
   By default, this module parameter is set to 1 for Linux and 0 for FreeBSD.
   The module parameter has been renamed to zfs_vdev_direct_write_verify.
   There are still counters on the top-level VDEV for checksum verify
   failures, but these could be removed. It would still be good to leave the
   ZED event dio_verify for checksum failures as a notification that an
   application was manipulating the contents of a buffer after issuing it for
   I/O using Direct I/O. As part of this change, the man pages were updated,
   the ZTS test case dio_write_verify was updated, and all comments relating
   to the module parameter were updated as well.
6. Updated comments in the dio_property ZTS test to properly reflect that
   stride_dd is being called with check_write and check_read.
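As a side note for reviewers, the PAGE_SIZE rule from items 1 and 2 can be summarized with a small userspace sketch (illustrative only; the helper name and the use of sysconf() are assumptions of this example, not the kernel code):

    #include <stdint.h>
    #include <unistd.h>

    /*
     * A request qualifies for Direct I/O only if the buffer address, the
     * file offset, and the length are all multiples of PAGE_SIZE. The old
     * SPA_MINBLOCKSIZE (512) check was too permissive: a 512-byte-aligned
     * but sub-page-aligned UIO could reach abd_alloc_from_pages() and trip
     * its ASSERT.
     */
    static int
    dio_request_page_aligned(const void *buf, uint64_t offset, uint64_t size)
    {
    	uint64_t pgsz = (uint64_t)sysconf(_SC_PAGESIZE);

    	return (((uintptr_t)buf % pgsz) == 0 &&
    	    (offset % pgsz) == 0 && (size % pgsz) == 0);
    }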
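The ordering change in item 3 can be sketched as follows: generic validation such as the EOF check must run before any Direct I/O alignment test, so a read past the end of the file returns cleanly instead of EINVAL. This is hedged pseudologic with made-up types and a hard-coded 4096-byte page size; the real checks live in zfs_read()/zfs_write():

    #include <sys/types.h>
    #include <errno.h>

    struct io_req {
    	off_t	offset;
    	size_t	resid;
    	int	direct;		/* O_DIRECT requested or direct=always */
    };

    static ssize_t
    read_sketch(off_t file_size, struct io_req *req)
    {
    	/*
    	 * Generic checks come first: reading at or past EOF is a
    	 * zero-byte read, never an alignment EINVAL.
    	 */
    	if (req->offset >= file_size)
    		return (0);

    	/*
    	 * Only then is Direct I/O eligibility evaluated; an unaligned
    	 * O_DIRECT request may fail here with EINVAL or be directed
    	 * through the ARC, depending on the dataset's direct= setting.
    	 */
    	if (req->direct && ((req->offset % 4096) != 0 ||
    	    (req->resid % 4096) != 0))
    		return (-EINVAL);

    	/* ... issue the read ... */
    	return ((ssize_t)req->resid);
    }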
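Item 5's semantics — an on/off switch rather than a sampling percentage, failing the write with EIO instead of redirecting it through the ARC — could be sketched like this (the helper and the checksum_matches() stand-in are hypothetical, not the zio_dio_checksum_verify() source):

    #include <errno.h>
    #include <stddef.h>

    /* Hypothetical stand-ins for the tunable and the checksum comparison. */
    extern unsigned int zfs_vdev_direct_write_verify;	/* boolean: 0 or 1 */
    extern int checksum_matches(const void *buf, size_t len);

    static int
    dio_write_verify_sketch(const void *buf, size_t len)
    {
    	/* Verification disabled: accept the write as issued. */
    	if (zfs_vdev_direct_write_verify == 0)
    		return (0);

    	/*
    	 * The user buffer changed while the write was in flight: fail
    	 * with EIO (and post a dio_verify zevent) rather than
    	 * re-issuing the write through the ARC as the old
    	 * percentage-based code did.
    	 */
    	if (!checksum_matches(buf, len))
    		return (EIO);

    	return (0);
    }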
Signed-off-by: Brian Atkinson --- include/sys/vdev_impl.h | 4 +- include/sys/zfs_vnops.h | 3 - man/man4/zfs.4 | 10 +- man/man8/zpool-events.8 | 6 +- man/man8/zpool-status.8 | 2 +- module/os/freebsd/zfs/zfs_vnops_os.c | 146 +------ module/os/linux/zfs/zpl_file.c | 410 +++--------------- module/zfs/dmu.c | 2 +- module/zfs/dmu_direct.c | 2 +- module/zfs/vdev.c | 16 +- module/zfs/zfs_vnops.c | 143 +++--- module/zfs/zio.c | 60 ++- tests/zfs-tests/cmd/manipulate_user_buffer.c | 44 +- tests/zfs-tests/cmd/stride_dd.c | 1 - tests/zfs-tests/include/tunables.cfg | 2 +- .../functional/direct/dio_aligned_block.ksh | 1 - .../functional/direct/dio_async_always.ksh | 1 - .../direct/dio_async_fio_ioengines.ksh | 1 - .../functional/direct/dio_compression.ksh | 1 - .../tests/functional/direct/dio_dedup.ksh | 1 - .../functional/direct/dio_encryption.ksh | 2 - .../functional/direct/dio_grow_block.ksh | 1 - .../functional/direct/dio_max_recordsize.ksh | 8 - .../tests/functional/direct/dio_mixed.ksh | 1 - .../tests/functional/direct/dio_mmap.ksh | 1 - .../functional/direct/dio_overwrites.ksh | 1 - .../tests/functional/direct/dio_property.ksh | 7 +- .../tests/functional/direct/dio_random.ksh | 1 - .../functional/direct/dio_recordsize.ksh | 8 - .../functional/direct/dio_unaligned_block.ksh | 1 - .../direct/dio_unaligned_filesize.ksh | 1 - .../direct/dio_write_stable_pages.ksh | 2 +- .../functional/direct/dio_write_verify.ksh | 70 +-- 33 files changed, 252 insertions(+), 708 deletions(-) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 54bdff611f..abd66b8abc 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -655,9 +655,9 @@ int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS); int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS); /* - * VDEV checksum verification precentage for Direct I/O writes + * VDEV checksum verification for Direct I/O writes */ -extern uint_t zfs_vdev_direct_write_verify_pct; +extern uint_t zfs_vdev_direct_write_verify; #ifdef __cplusplus } diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index 4fd9525138..e60b99bed1 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -46,9 +46,6 @@ extern int mappedread(znode_t *, int, zfs_uio_t *); extern int mappedread_sf(znode_t *, int, zfs_uio_t *); extern void update_pages(znode_t *, int64_t, int, objset_t *); -extern int zfs_check_direct_enabled(znode_t *, int, boolean_t *); -extern int zfs_setup_direct(znode_t *, zfs_uio_t *, zfs_uio_rw_t, int *); - /* * Platform code that asynchronously drops zp's inode / vnode_t. * diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index ab0ab5e716..5f3ad01a94 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -416,12 +416,10 @@ May be increased up to .Sy ASHIFT_MAX Po 16 Pc , but this may negatively impact pool space efficiency. . -.It Sy zfs_vdev_direct_write_verify_pct Ns = Ns Sy Linux 2 | FreeBSD 0 Pq uint +.It Sy zfs_vdev_direct_write_verify Ns = Ns Sy Linux 1 | FreeBSD 0 Pq uint If non-zero, then a Direct I/O write's checksum will be verified every -percentage (pct) of Direct I/O writes that are issued to a top-level VDEV -before it is committed and the block pointer is updated. -In the event the checksum is not valid then the I/O operation will be -redirected through the ARC. +time the write is issued and before it is committed to the block pointer. +In the event the checksum is not valid then the I/O operation will return EIO.
This module parameter can be used to detect if the contents of the users buffer have changed in the process of doing a Direct I/O write. @@ -432,7 +430,7 @@ Each verify error causes a zevent. Direct Write I/O checkum verify errors can be seen with .Nm zpool Cm status Fl d . -The default value for this is 2 percent on Linux, but is 0 for +The default value for this is 1 on Linux, but is 0 for .Fx because user pages can be placed under write protection in .Fx diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index 77d44bd8ad..234612baea 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -100,14 +100,14 @@ The number of delay events is ratelimited by the module parameter. .It Sy dio_verify Issued when there was a checksum verify error after a Direct I/O write has been -issued and is redirected through the ARC. +issued. This event can only take place if the module parameter -.Sy zfs_vdev_direct_write_verify_pct +.Sy zfs_vdev_direct_write_verify is not set to zero. See .Xr zfs 4 for more details on the -.Sy zfs_vdev_direct_write_verify_pct +.Sy zfs_vdev_direct_write_verify module paramter. .It Sy config Issued every time a vdev change have been done to the pool. diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 923b99de30..868fc4414d 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -85,7 +85,7 @@ to set pool GUID as key for pool objects instead of pool names. Display the number of Direct I/O write checksum verify errors that have occured on a top-level VDEV. See -.Sx zfs_vdev_direct_write_verify_pct +.Sx zfs_vdev_direct_write_verify in .Xr zfs 4 for details about the conditions that can cause Direct I/O write checksum diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 91986cd433..5dbca10a3e 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -4274,29 +4274,6 @@ ioflags(int ioflags) return (flags); } -static int -zfs_freebsd_read_direct(znode_t *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, - int ioflag, cred_t *cr) -{ - int ret; - int flags = ioflag; - - ASSERT3U(rw, ==, UIO_READ); - - /* On error, return to fallback to the buffred path */ - ret = zfs_setup_direct(zp, uio, rw, &flags); - if (ret) - return (ret); - - ASSERT(uio->uio_extflg & UIO_DIRECT); - - ret = zfs_read(zp, uio, flags, cr); - - zfs_uio_free_dio_pages(uio, rw); - - return (ret); -} - #ifndef _SYS_SYSPROTO_H_ struct vop_read_args { struct vnode *a_vp; @@ -4311,85 +4288,37 @@ zfs_freebsd_read(struct vop_read_args *ap) { zfs_uio_t uio; int error = 0; - znode_t *zp = VTOZ(ap->a_vp); - int ioflag = ioflags(ap->a_ioflag); - boolean_t is_direct; - zfs_uio_init(&uio, ap->a_uio); - - error = zfs_check_direct_enabled(zp, ioflag, &is_direct); - - if (error) { - return (error); - } else if (is_direct) { - error = - zfs_freebsd_read_direct(zp, &uio, UIO_READ, ioflag, - ap->a_cred); - /* - * XXX We occasionally get an EFAULT for Direct I/O reads on - * FreeBSD 13. This still needs to be resolved. The EFAULT comes - * from: - * zfs_uio_get__dio_pages_alloc() -> - * zfs_uio_get_dio_pages_impl() -> - * zfs_uio_iov_step() -> - * zfs_uio_get_user_pages(). - * We return EFAULT from zfs_uio_iov_step(). When a Direct I/O - * read fails to map in the user pages (returning EFAULT) the - * Direct I/O request is broken up into two separate IO requests - * and issued separately using Direct I/O. 
- */ + error = zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), + ap->a_cred); + /* + * XXX We occasionally get an EFAULT for Direct I/O reads on + * FreeBSD 13. This still needs to be resolved. The EFAULT comes + * from: + * zfs_uio_get__dio_pages_alloc() -> + * zfs_uio_get_dio_pages_impl() -> + * zfs_uio_iov_step() -> + * zfs_uio_get_user_pages(). + * We return EFAULT from zfs_uio_iov_step(). When a Direct I/O + * read fails to map in the user pages (returning EFAULT) the + * Direct I/O request is broken up into two separate IO requests + * and issued separately using Direct I/O. + */ #ifdef ZFS_DEBUG - if (error == EFAULT) { + if (error == EFAULT && uio.uio_extflg & UIO_DIRECT) { #if 0 - printf("%s(%d): Direct I/O read returning EFAULT " - "uio = %p, zfs_uio_offset(uio) = %lu " - "zfs_uio_resid(uio) = %lu\n", - __FUNCTION__, __LINE__, &uio, zfs_uio_offset(&uio), - zfs_uio_resid(&uio)); + printf("%s(%d): Direct I/O read returning EFAULT " + "uio = %p, zfs_uio_offset(uio) = %lu " + "zfs_uio_resid(uio) = %lu\n", + __FUNCTION__, __LINE__, &uio, zfs_uio_offset(&uio), + zfs_uio_resid(&uio)); #endif - } - -#endif - - /* - * On error we will return unless the error is EAGAIN, which - * just tells us to fallback to the buffered path. - */ - if (error != EAGAIN) - return (error); - else - ioflag &= ~O_DIRECT; } - - error = zfs_read(zp, &uio, ioflag, ap->a_cred); - +#endif return (error); } -static int -zfs_freebsd_write_direct(znode_t *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, - int ioflag, cred_t *cr) -{ - int ret; - int flags = ioflag; - - ASSERT3U(rw, ==, UIO_WRITE); - - /* On error, return to fallback to the buffred path */ - ret = zfs_setup_direct(zp, uio, rw, &flags); - if (ret) - return (ret); - - ASSERT(uio->uio_extflg & UIO_DIRECT); - - ret = zfs_write(zp, uio, flags, cr); - - zfs_uio_free_dio_pages(uio, rw); - - return (ret); -} - #ifndef _SYS_SYSPROTO_H_ struct vop_write_args { struct vnode *a_vp; @@ -4403,36 +4332,9 @@ static int zfs_freebsd_write(struct vop_write_args *ap) { zfs_uio_t uio; - int error = 0; - znode_t *zp = VTOZ(ap->a_vp); - int ioflag = ioflags(ap->a_ioflag); - boolean_t is_direct; - zfs_uio_init(&uio, ap->a_uio); - - error = zfs_check_direct_enabled(zp, ioflag, &is_direct); - - if (error) { - return (error); - } else if (is_direct) { - error = - zfs_freebsd_write_direct(zp, &uio, UIO_WRITE, ioflag, - ap->a_cred); - - /* - * On error we will return unless the error is EAGAIN, which - * just tells us to fallback to the buffered path. 
- */ - if (error != EAGAIN) - return (error); - else - ioflag &= ~O_DIRECT; - - } - - error = zfs_write(zp, &uio, ioflag, ap->a_cred); - - return (error); + return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), + ap->a_cred)); } /* diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 8781e71847..62f772afef 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -309,7 +309,7 @@ zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, } static ssize_t -zpl_iter_read_buffered(struct kiocb *kiocb, struct iov_iter *to) +zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) { cred_t *cr = CRED(); fstrans_cookie_t cookie; @@ -322,15 +322,14 @@ zpl_iter_read_buffered(struct kiocb *kiocb, struct iov_iter *to) crhold(cr); cookie = spl_fstrans_mark(); - int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; - int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, - flags, cr); + ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); - if (error < 0) - return (error); + if (ret < 0) + return (ret); ssize_t read = count - uio.uio_resid; kiocb->ki_pos += read; @@ -340,71 +339,6 @@ zpl_iter_read_buffered(struct kiocb *kiocb, struct iov_iter *to) return (read); } -static ssize_t -zpl_iter_read_direct(struct kiocb *kiocb, struct iov_iter *to) -{ - cred_t *cr = CRED(); - struct file *filp = kiocb->ki_filp; - struct inode *ip = filp->f_mapping->host; - ssize_t count = iov_iter_count(to); - int flags = filp->f_flags | zfs_io_flags(kiocb); - zfs_uio_t uio; - ssize_t ret; - - zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0); - - /* On error, return to fallback to the buffered path. */ - ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_READ, &flags); - if (ret) - return (-ret); - - ASSERT(uio.uio_extflg & UIO_DIRECT); - - crhold(cr); - fstrans_cookie_t cookie = spl_fstrans_mark(); - - int error = -zfs_read(ITOZ(ip), &uio, flags, cr); - - spl_fstrans_unmark(cookie); - crfree(cr); - - zfs_uio_free_dio_pages(&uio, UIO_READ); - - if (error < 0) - return (error); - - ssize_t read = count - uio.uio_resid; - kiocb->ki_pos += read; - - zpl_file_accessed(filp); - - return (read); -} - -static ssize_t -zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) -{ - struct inode *ip = kiocb->ki_filp->f_mapping->host; - struct file *filp = kiocb->ki_filp; - int flags = filp->f_flags | zfs_io_flags(kiocb); - boolean_t is_direct; - - int error = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct); - - if (error) { - return (-error); - } else if (is_direct) { - ssize_t read = zpl_iter_read_direct(kiocb, to); - - if (read >= 0 || read != -EAGAIN) - return (read); - - /* Otherwise fallback to buffered read */ - } - - return (zpl_iter_read_buffered(kiocb, to)); -} - static inline ssize_t zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from, size_t *countp) @@ -430,249 +364,57 @@ zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from, return (0); } -static ssize_t -zpl_iter_write_buffered(struct kiocb *kiocb, struct iov_iter *from) -{ - cred_t *cr = CRED(); - struct file *filp = kiocb->ki_filp; - struct inode *ip = filp->f_mapping->host; - size_t wrote; - size_t count = iov_iter_count(from); - - zfs_uio_t uio; - zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); - - crhold(cr); - fstrans_cookie_t cookie = spl_fstrans_mark(); - - int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; - int error = 
-zfs_write(ITOZ(ip), &uio, flags, cr); - - spl_fstrans_unmark(cookie); - crfree(cr); - - if (error < 0) - return (error); - - wrote = count - uio.uio_resid; - kiocb->ki_pos += wrote; - - if (wrote > 0) - iov_iter_advance(from, wrote); - - return (wrote); -} - -static ssize_t -zpl_iter_write_direct(struct kiocb *kiocb, struct iov_iter *from) -{ - cred_t *cr = CRED(); - struct file *filp = kiocb->ki_filp; - struct inode *ip = filp->f_mapping->host; - size_t wrote; - int flags = filp->f_flags | zfs_io_flags(kiocb); - size_t count = iov_iter_count(from); - - zfs_uio_t uio; - zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); - - /* On error, return to fallback to the buffered path. */ - ssize_t ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_WRITE, &flags); - if (ret) - return (-ret); - - ASSERT(uio.uio_extflg & UIO_DIRECT); - - crhold(cr); - fstrans_cookie_t cookie = spl_fstrans_mark(); - - int error = -zfs_write(ITOZ(ip), &uio, flags, cr); - - spl_fstrans_unmark(cookie); - crfree(cr); - - zfs_uio_free_dio_pages(&uio, UIO_WRITE); - - if (error < 0) - return (error); - - wrote = count - uio.uio_resid; - kiocb->ki_pos += wrote; - - return (wrote); -} - static ssize_t zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) { - struct inode *ip = kiocb->ki_filp->f_mapping->host; + cred_t *cr = CRED(); + fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; - int flags = filp->f_flags | zfs_io_flags(kiocb); + struct inode *ip = filp->f_mapping->host; + zfs_uio_t uio; size_t count = 0; - boolean_t is_direct; + ssize_t ret; - ssize_t ret = zpl_generic_write_checks(kiocb, from, &count); + ret = zpl_generic_write_checks(kiocb, from, &count); if (ret) return (ret); - loff_t offset = kiocb->ki_pos; + zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); - ret = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct); + crhold(cr); + cookie = spl_fstrans_mark(); - if (ret) { - return (-ret); - } else if (is_direct) { - ssize_t wrote = zpl_iter_write_direct(kiocb, from); + ret = -zfs_write(ITOZ(ip), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); - if (wrote >= 0 || wrote != -EAGAIN) { - return (wrote); - } + spl_fstrans_unmark(cookie); + crfree(cr); - /* - * If we are falling back to a buffered write, then the - * file position should not be updated at this point. 
- */ - ASSERT3U(offset, ==, kiocb->ki_pos); - } + if (ret < 0) + return (ret); - return (zpl_iter_write_buffered(kiocb, from)); + ssize_t wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; + + return (wrote); } #else /* !HAVE_VFS_RW_ITERATE */ -static ssize_t -zpl_aio_read_buffered(struct kiocb *kiocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - cred_t *cr = CRED(); - fstrans_cookie_t cookie; - struct file *filp = kiocb->ki_filp; - size_t count; - ssize_t ret; - - ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (ret) - return (ret); - - zfs_uio_t uio; - zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, - count, 0); - - crhold(cr); - cookie = spl_fstrans_mark(); - - int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; - int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, - flags, cr); - - spl_fstrans_unmark(cookie); - crfree(cr); - - if (error < 0) - return (error); - - ssize_t read = count - uio.uio_resid; - kiocb->ki_pos += read; - - zpl_file_accessed(filp); - - return (read); -} - -static ssize_t -zpl_aio_read_direct(struct kiocb *kiocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - cred_t *cr = CRED(); - fstrans_cookie_t cookie; - struct file *filp = kiocb->ki_filp; - struct inode *ip = filp->f_mapping->host; - int flags = filp->f_flags | zfs_io_flags(kiocb); - size_t count; - ssize_t ret; - - ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (ret) - return (ret); - - zfs_uio_t uio; - zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, - count, 0); - - /* On error, return to fallback to the buffered path */ - ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_READ, &flags); - if (ret) - return (-ret); - - ASSERT(uio.uio_extflg & UIO_DIRECT); - - crhold(cr); - cookie = spl_fstrans_mark(); - - int error = -zfs_read(ITOZ(ip), &uio, flags, cr); - - spl_fstrans_unmark(cookie); - crfree(cr); - - zfs_uio_free_dio_pages(&uio, UIO_READ); - - if (error < 0) - return (error); - - ssize_t read = count - uio.uio_resid; - kiocb->ki_pos += read; - - zpl_file_accessed(filp); - - return (read); -} - static ssize_t zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct inode *ip = kiocb->ki_filp->f_mapping->host; + cred_t *cr = CRED(); + fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; - int flags = filp->f_flags | zfs_io_flags(kiocb); size_t count; ssize_t ret; - boolean_t is_direct; ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (ret) return (ret); - ret = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct); - - if (ret) { - return (-ret); - } else if (is_direct) { - ssize_t read = zpl_aio_read_direct(kiocb, iov, nr_segs, pos); - - if (read >= 0 || read != -EAGAIN) - return (read); - - /* Otherwise fallback to buffered read */ - } - - return (zpl_aio_read_buffered(kiocb, iov, nr_segs, pos)); -} - -static ssize_t -zpl_aio_write_buffered(struct kiocb *kiocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - cred_t *cr = CRED(); - fstrans_cookie_t cookie; - struct file *filp = kiocb->ki_filp; - struct inode *ip = filp->f_mapping->host; - size_t count; - ssize_t ret; - - ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); - if (ret) - return (ret); - zfs_uio_t uio; zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, count, 0); @@ -680,110 +422,64 @@ zpl_aio_write_buffered(struct kiocb *kiocb, const struct iovec *iov, crhold(cr); cookie = 
spl_fstrans_mark(); - int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; - int error = -zfs_write(ITOZ(ip), &uio, flags, cr); + ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); - if (error < 0) - return (error); - - ssize_t wrote = count - uio.uio_resid; - kiocb->ki_pos += wrote; - - return (wrote); -} - -static ssize_t -zpl_aio_write_direct(struct kiocb *kiocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - cred_t *cr = CRED(); - fstrans_cookie_t cookie; - struct file *filp = kiocb->ki_filp; - struct inode *ip = filp->f_mapping->host; - int flags = filp->f_flags | zfs_io_flags(kiocb); - size_t count; - ssize_t ret; - - ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); - if (ret) + if (ret < 0) return (ret); - zfs_uio_t uio; - zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, - count, 0); + ssize_t read = count - uio.uio_resid; + kiocb->ki_pos += read; - /* On error, return to fallback to the buffered path. */ - ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_WRITE, &flags); - if (ret) - return (-ret); + zpl_file_accessed(filp); - ASSERT(uio.uio_extflg & UIO_DIRECT); - - crhold(cr); - cookie = spl_fstrans_mark(); - - int error = -zfs_write(ITOZ(ip), &uio, flags, cr); - - spl_fstrans_unmark(cookie); - crfree(cr); - - zfs_uio_free_dio_pages(&uio, UIO_WRITE); - - if (error < 0) - return (error); - - ssize_t wrote = count - uio.uio_resid; - kiocb->ki_pos += wrote; - - return (wrote); + return (read); } static ssize_t zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { + cred_t *cr = CRED(); + fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; struct inode *ip = filp->f_mapping->host; - int flags = filp->f_flags | zfs_io_flags(kiocb); - size_t ocount; size_t count; ssize_t ret; - boolean_t is_direct; - ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); if (ret) return (ret); - count = ocount; - - ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode)); + ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode)); if (ret) return (ret); kiocb->ki_pos = pos; - ret = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct); + zfs_uio_t uio; + zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, + count, 0); - if (ret) { - return (-ret); - } else if (is_direct) { - ssize_t wrote = zpl_aio_write_direct(kiocb, iov, nr_segs, pos); + crhold(cr); + cookie = spl_fstrans_mark(); - if (wrote >= 0 || wrote != -EAGAIN) { - return (wrote); - } + ret = -zfs_write(ITOZ(ip), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); - /* - * If we are falling back to a buffered write, then the - * file position should not be updated at this point.
- */ - ASSERT3U(pos, ==, kiocb->ki_pos); - } + spl_fstrans_unmark(cookie); + crfree(cr); - return (zpl_aio_write_buffered(kiocb, iov, nr_segs, pos)); + if (ret < 0) + return (ret); + + ssize_t wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; + + return (wrote); } #endif /* HAVE_VFS_RW_ITERATE */ diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index db08b18431..ea7731b8d8 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1191,7 +1191,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, /* Allow Direct I/O when requested and properly aligned */ if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned(buf) && - zfs_dio_aligned(offset, size, SPA_MINBLOCKSIZE)) { + zfs_dio_aligned(offset, size, PAGESIZE)) { abd_t *data = abd_get_from_buf(buf, size); err = dmu_read_abd(dn, offset, size, data, flags); abd_free(data); diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c index 3d5fb23907..91a7fd8df4 100644 --- a/module/zfs/dmu_direct.c +++ b/module/zfs/dmu_direct.c @@ -104,7 +104,7 @@ dmu_write_direct_done(zio_t *zio) if (zio->io_error != 0) { if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) - ASSERT3U(zio->io_error, ==, EAGAIN); + ASSERT3U(zio->io_error, ==, EIO); /* * In the event of an I/O error this block has been freed in diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 56af7c1298..9305bd894d 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -159,14 +159,14 @@ uint_t zfs_vdev_max_auto_ashift = 14; uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; /* - * VDEV checksum verification percentage for Direct I/O writes. This is - * neccessary for Linux, because user pages can not be placed under write - * protection during Direct I/O writes. + * VDEV checksum verification for Direct I/O writes. This is necessary for + * Linux, because anonymous pages can not be placed under write protection + * during Direct I/O writes.
*/ #if !defined(__FreeBSD__) -uint_t zfs_vdev_direct_write_verify_pct = 2; +uint_t zfs_vdev_direct_write_verify = 1; #else -uint_t zfs_vdev_direct_write_verify_pct = 0; +uint_t zfs_vdev_direct_write_verify = 0; #endif void @@ -6527,9 +6527,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW, "Rate Direct I/O write verify events to this many per second"); /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify_pct, UINT, ZMOD_RW, - "Percentage of Direct I/O writes per top-level VDEV for checksum " - "verification to be performed"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW, + "Direct I/O writes will perform checksum verification before " + "committing the write"); ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 4cf03abc5a..bf81073a16 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -202,28 +202,6 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) return (error); } -int -zfs_check_direct_enabled(znode_t *zp, int ioflags, boolean_t *is_direct) -{; - zfsvfs_t *zfsvfs = ZTOZSB(zp); - *is_direct = B_FALSE; - int error; - - if ((error = zfs_enter(zfsvfs, FTAG)) != 0) - return (error); - - if (ioflags & O_DIRECT && - zfsvfs->z_os->os_direct != ZFS_DIRECT_DISABLED) { - *is_direct = B_TRUE; - } else if (zfsvfs->z_os->os_direct == ZFS_DIRECT_ALWAYS) { - *is_direct = B_TRUE; - } - - zfs_exit(zfsvfs, FTAG); - - return (0); -} - /* * Determine if Direct I/O has been requested (either via the O_DIRECT flag or * the "direct" dataset property). When inherited by the property only apply @@ -236,12 +214,11 @@ zfs_check_direct_enabled(znode_t *zp, int ioflags, boolean_t *is_direct) * synhronized with the ARC. * * It is possible that a file's pages could be mmap'ed after it is checked - * here. If so, that is handled according in zfs_read() and zfs_write(). See - * comments in the following two areas for how this handled: - * zfs_read() -> mappedread() + * here. If so, that is handled accordingly in zfs_write(). See comments in the + * following area for how this is handled: * zfs_write() -> update_pages() */ -int +static int zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, int *ioflagp) { @@ -250,49 +227,49 @@ zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, int ioflag = *ioflagp; int error = 0; - if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) - return (error); - - if (os->os_direct == ZFS_DIRECT_DISABLED) { - error = EAGAIN; + if (os->os_direct == ZFS_DIRECT_DISABLED || + zn_has_cached_data(zp, zfs_uio_offset(uio), + zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) { + /* + * Direct I/O is disabled or the region is mmap'ed. In either + * case the I/O request will just be directed through the ARC. + */ + ioflag &= ~O_DIRECT; goto out; - } else if (os->os_direct == ZFS_DIRECT_ALWAYS && zfs_uio_page_aligned(uio) && - zfs_uio_aligned(uio, SPA_MINBLOCKSIZE)) { + zfs_uio_aligned(uio, PAGE_SIZE)) { if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) || (rw == UIO_READ)) { ioflag |= O_DIRECT; } + } else if (os->os_direct == ZFS_DIRECT_ALWAYS && (ioflag & O_DIRECT)) { + /* + * Direct I/O was requested through direct=always, but it + * is not properly PAGE_SIZE aligned. The request will be + * directed through the ARC.
+ */ + ioflag &= ~O_DIRECT; } if (ioflag & O_DIRECT) { if (!zfs_uio_page_aligned(uio) || - !zfs_uio_aligned(uio, SPA_MINBLOCKSIZE)) { + !zfs_uio_aligned(uio, PAGE_SIZE)) { error = SET_ERROR(EINVAL); goto out; } - if (zn_has_cached_data(zp, zfs_uio_offset(uio), - zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) { - error = SET_ERROR(EAGAIN); + error = zfs_uio_get_dio_pages_alloc(uio, rw); + if (error) { goto out; } - - error = zfs_uio_get_dio_pages_alloc(uio, rw); - if (error) - goto out; - } else { - error = EAGAIN; - goto out; } IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT); ASSERT0(error); - *ioflagp = ioflag; out: - zfs_exit(zfsvfs, FTAG); + *ioflagp = ioflag; return (error); } @@ -380,8 +357,16 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) error = 0; goto out; } - ASSERT(zfs_uio_offset(uio) < zp->z_size); + + /* + * Setting up Direct I/O if requested. + */ + error = zfs_setup_direct(zp, uio, UIO_READ, &ioflag); + if (error) { + goto out; + } + #if defined(__linux__) ssize_t start_offset = zfs_uio_offset(uio); #endif @@ -424,22 +409,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) #endif if (zn_has_cached_data(zp, zfs_uio_offset(uio), zfs_uio_offset(uio) + nbytes - 1)) { - /* - * It is possible that a files pages have been mmap'ed - * since our check for Direct I/O reads and the read - * being issued. In this case, we will use the ARC to - * keep it synchronized with the page cache. In order - * to do this we temporarily remove the UIO_DIRECT - * flag. - */ - boolean_t uio_direct_mmap = B_FALSE; - if (uio->uio_extflg & UIO_DIRECT) { - uio->uio_extflg &= ~UIO_DIRECT; - uio_direct_mmap = B_TRUE; - } error = mappedread(zp, nbytes, uio); - if (uio_direct_mmap) - uio->uio_extflg |= UIO_DIRECT; } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); @@ -494,6 +464,12 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) out: zfs_rangelock_exit(lr); + /* + * Cleanup for Direct I/O if requested. + */ + if (uio->uio_extflg & UIO_DIRECT) + zfs_uio_free_dio_pages(uio, UIO_READ); + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); zfs_exit(zfsvfs, FTAG); return (error); @@ -631,6 +607,15 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) return (SET_ERROR(EINVAL)); } + /* + * Setting up Direct I/O if requested. + */ + error = zfs_setup_direct(zp, uio, UIO_WRITE, &ioflag); + if (error) { + zfs_exit(zfsvfs, FTAG); + return (SET_ERROR(error)); + } + /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. @@ -641,6 +626,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) return (SET_ERROR(EFAULT)); } + /* * If in append mode, set the io offset pointer to eof. */ @@ -676,6 +662,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } + if (zn_rlimit_fsize_uio(zp, uio)) { zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); @@ -896,15 +883,27 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } - /* - * There is a a window where a file's pages can be mmap'ed after - * the Direct I/O write has started. In this case we will still - * call update_pages() to make sure there is consistency - * between the ARC and the page cache. This is unfortunate + * There is a window where a file's pages can be mmap'ed after + * zfs_setup_direct() is called. This is due to the fact that + * the rangelock in this function is acquired after calling + * zfs_setup_direct(). 
This is done so that + * zfs_uio_prefaultpages() does not attempt to fault in pages + * on Linux for Direct I/O requests. This is not necessary as + * the pages are pinned in memory and can not be faulted out. + * Ideally, the rangelock would be held before calling + * zfs_setup_direct() and zfs_uio_prefaultpages(); however, + * this can lead to a deadlock as zfs_getpage() also acquires + * the rangelock as a RL_WRITER and prefaulting the pages can + * lead to zfs_getpage() being called. + * + * In the case of the pages being mapped after + * zfs_setup_direct() is called, the call to update_pages() + * will still be made to make sure there is consistency between + * the ARC and the Linux page cache. This is an unfortunate * situation as the data will be read back into the ARC after - the Direct I/O write has completed, but this is the pentalty - for writing to a mmap'ed region of the file using O_DIRECT. + the Direct I/O write has completed, but this is the penalty + for writing to a mmap'ed region of a file using Direct I/O. */ if (tx_bytes && zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) { @@ -987,6 +986,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) zfs_znode_update_vfs(zp); zfs_rangelock_exit(lr); + /* + * Cleanup for Direct I/O if requested. + */ + if (uio->uio_extflg & UIO_DIRECT) + zfs_uio_free_dio_pages(uio, UIO_WRITE); + /* * If we're in replay mode, or we made no progress, or the * uio data is inaccessible return an error. Otherwise, it's diff --git a/module/zfs/zio.c b/module/zfs/zio.c index f4ada08a91..ae58f7704e 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -804,7 +804,7 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, ASSERT3U(*countp, >, 0); if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { - ASSERT3U(*errorp, ==, EAGAIN); + ASSERT3U(*errorp, ==, EIO); ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; } @@ -4521,13 +4521,12 @@ zio_vdev_io_assess(zio_t *zio) /* * If a Direct I/O write checksum verify error has occurred then this - * I/O should not attempt to be issued again. Instead the EAGAIN will - * be returned and this write will attempt to be issued through the - * ARC instead. + * I/O should not attempt to be issued again. Instead the EIO will + * be returned. */ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL); - ASSERT3U(zio->io_error, ==, EAGAIN); + ASSERT3U(zio->io_error, ==, EIO); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } @@ -4850,6 +4849,7 @@ static zio_t * zio_dio_checksum_verify(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); + int error; ASSERT3P(zio->io_vd, !=, NULL); ASSERT3P(zio->io_bp, !=, NULL); @@ -4858,38 +4858,28 @@ zio_dio_checksum_verify(zio_t *zio) ASSERT3B(pio->io_prop.zp_direct_write, ==, B_TRUE); ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); - if (zfs_vdev_direct_write_verify_pct == 0 || zio->io_error != 0) + if (zfs_vdev_direct_write_verify == 0 || zio->io_error != 0) goto out; - /* - * A Direct I/O write checksum verification will only be - * performed based on the top-level VDEV percentage for checks.
- */ - uint32_t rand = random_in_range(100); - int error; + if ((error = zio_checksum_error(zio, NULL)) != 0) { + zio->io_error = error; + if (error == ECKSUM) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_dio_verify_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + zio->io_error = SET_ERROR(EIO); + zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; - if (rand < zfs_vdev_direct_write_verify_pct) { - if ((error = zio_checksum_error(zio, NULL)) != 0) { - zio->io_error = error; - if (error == ECKSUM) { - mutex_enter(&zio->io_vd->vdev_stat_lock); - zio->io_vd->vdev_stat.vs_dio_verify_errors++; - mutex_exit(&zio->io_vd->vdev_stat_lock); - zio->io_error = SET_ERROR(EAGAIN); - zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + /* + * The EIO error must be propagated up to the logical + * parent ZIO in zio_notify_parent() so it can be + * returned to dmu_write_abd(). + */ + zio->io_flags &= ~ZIO_FLAG_DONT_PROPAGATE; - /* - * The EAGAIN error must be propagated up to the - * logical parent ZIO in zio_notify_parent() so - * it can be returned to dmu_write_abd(). - */ - zio->io_flags &= ~ZIO_FLAG_DONT_PROPAGATE; - - (void) zfs_ereport_post( - FM_EREPORT_ZFS_DIO_VERIFY, - zio->io_spa, zio->io_vd, &zio->io_bookmark, - zio, 0); - } + (void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY, + zio->io_spa, zio->io_vd, &zio->io_bookmark, + zio, 0); } } @@ -5243,8 +5233,8 @@ zio_done(zio_t *zio) } if ((zio->io_error == EIO || !(zio->io_flags & - (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DIO_CHKSUM_ERR))) && + (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && + !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the diff --git a/tests/zfs-tests/cmd/manipulate_user_buffer.c b/tests/zfs-tests/cmd/manipulate_user_buffer.c index c195a197ad..7daa7d3084 100644 --- a/tests/zfs-tests/cmd/manipulate_user_buffer.c +++ b/tests/zfs-tests/cmd/manipulate_user_buffer.c @@ -41,6 +41,7 @@ static char *outputfile = NULL; static int blocksize = 131072; /* 128K */ +static int wr_err_expected = 0; static int numblocks = 100; static char *execname = NULL; static int print_usage = 0; @@ -56,28 +57,33 @@ static void usage(void) { (void) fprintf(stderr, - "usage %s -o outputfile [-b blocksize] [-n numblocks]\n" - " [-p randpattern] [-h help]\n" + "usage %s -o outputfile [-b blocksize] [-e wr_error_expected]\n" + " [-n numblocks] [-p randpattern] [-h help]\n" "\n" "Testing whether checksum verify works correctly for O_DIRECT.\n" "when manipulating the contents of a userspace buffer.\n" "\n" - " outputfile: File to write to.\n" - " blocksize: Size of each block to write (must be at \n" - " least >= 512).\n" - " numblocks: Total number of blocksized blocks to write.\n" - " randpattern: Fill data buffer with random data. Default \n" - " behavior is to fill the buffer with the \n" - " known data pattern (0xdeadbeef).\n" - " help: Print usage information and exit.\n" + " outputfile: File to write to.\n" + " blocksize: Size of each block to write (must be at \n" + " least >= 512).\n" + " wr_err_expected: Whether pwrite() is expected to return EIO\n" + " while manipulating the contents of the\n" + " buffer.\n" + " numblocks: Total number of blocksized blocks to\n" + " write.\n" + " randpattern: Fill data buffer with random data. 
Default\n" + " behavior is to fill the buffer with the \n" + " known data pattern (0xdeadbeef).\n" + " help: Print usage information and exit.\n" "\n" " Required parameters:\n" " outputfile\n" "\n" " Default Values:\n" - " blocksize -> 131072\n" - " numblocks -> 100\n" - " randpattern -> false\n", + " blocksize -> 131072\n" + " wr_err_expected -> false\n" + " numblocks -> 100\n" + " randpattern -> false\n", execname); (void) exit(1); } @@ -91,12 +97,16 @@ parse_options(int argc, char *argv[]) extern int optind, optopt; execname = argv[0]; - while ((c = getopt(argc, argv, "b:hn:o:p")) != -1) { + while ((c = getopt(argc, argv, "b:ehn:o:p")) != -1) { switch (c) { case 'b': blocksize = atoi(optarg); break; + case 'e': + wr_err_expected = 1; + break; + case 'h': print_usage = 1; break; @@ -153,8 +163,10 @@ write_thread(void *arg) while (!args->entire_file_written) { wrote = pwrite(ofd, buf, blocksize, offset); if (wrote != blocksize) { - perror("write"); - exit(2); + if (wr_err_expected) + assert(errno == EIO); + else + exit(2); } offset = ((offset + blocksize) % total_data); diff --git a/tests/zfs-tests/cmd/stride_dd.c b/tests/zfs-tests/cmd/stride_dd.c index 19aab1c97f..e1e45794cf 100644 --- a/tests/zfs-tests/cmd/stride_dd.c +++ b/tests/zfs-tests/cmd/stride_dd.c @@ -212,7 +212,6 @@ read_entire_file(int ifd, int ofd, void *buf) } } - if (stride > 1) { if (lseek(ifd, (stride - 1) * bsize, SEEK_CUR) == -1) { perror("input lseek"); diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index d3c4a7d940..b41f54ba35 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -93,7 +93,7 @@ VDEV_FILE_LOGICAL_ASHIFT vdev.file.logical_ashift vdev_file_logical_ashift VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift VDEV_MAX_AUTO_ASHIFT vdev.max_auto_ashift zfs_vdev_max_auto_ashift VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count -VDEV_DIRECT_WR_VERIFY_PCT vdev.direct_write_verify_pct zfs_vdev_direct_write_verify_pct +VDEV_DIRECT_WR_VERIFY vdev.direct_write_verify zfs_vdev_direct_write_verify VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode diff --git a/tests/zfs-tests/tests/functional/direct/dio_aligned_block.ksh b/tests/zfs-tests/tests/functional/direct/dio_aligned_block.ksh index 4aac5edd8e..e26fbdfc25 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_aligned_block.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_aligned_block.ksh @@ -43,7 +43,6 @@ function cleanup { zfs set recordsize=$rs $TESTPOOL/$TESTFS log_must rm -f $tmp_file - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/direct/dio_async_always.ksh b/tests/zfs-tests/tests/functional/direct/dio_async_always.ksh index 3f26715fc3..27fd66ccd2 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_async_always.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_async_always.ksh @@ -44,7 +44,6 @@ function cleanup { zfs set direct=standard $TESTPOOL/$TESTFS rm $tmp_file - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify direct=always mixed small async requests" diff --git a/tests/zfs-tests/tests/functional/direct/dio_async_fio_ioengines.ksh b/tests/zfs-tests/tests/functional/direct/dio_async_fio_ioengines.ksh index 82d7d8250f..5492a5a905 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_async_fio_ioengines.ksh +++
b/tests/zfs-tests/tests/functional/direct/dio_async_fio_ioengines.ksh @@ -44,7 +44,6 @@ verify_runnable "global" function cleanup { log_must rm -f "$mntpnt/direct-*" - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } function check_fio_ioengine diff --git a/tests/zfs-tests/tests/functional/direct/dio_compression.ksh b/tests/zfs-tests/tests/functional/direct/dio_compression.ksh index 5be93d104d..5463715d7b 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_compression.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_compression.ksh @@ -46,7 +46,6 @@ function cleanup { log_must rm -f "$mntpnt/direct-*" log_must zfs set compression=off $TESTPOOL/$TESTFS - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify compression works using Direct I/O." diff --git a/tests/zfs-tests/tests/functional/direct/dio_dedup.ksh b/tests/zfs-tests/tests/functional/direct/dio_dedup.ksh index c703fcc05f..9de94dee6c 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_dedup.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_dedup.ksh @@ -45,7 +45,6 @@ function cleanup { log_must rm -f "$mntpnt/direct-*" log_must zfs set dedup=off $TESTPOOL/$TESTFS - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify deduplication works using Direct I/O." diff --git a/tests/zfs-tests/tests/functional/direct/dio_encryption.ksh b/tests/zfs-tests/tests/functional/direct/dio_encryption.ksh index 843b570d2d..b6faa11970 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_encryption.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_encryption.ksh @@ -59,6 +59,4 @@ for bs in "4k" "128k" "1m"; do done done -check_dio_write_chksum_verify_failures $TESTPOOL1 "stripe" 0 - log_pass "Verified encryption works using Direct I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_grow_block.ksh b/tests/zfs-tests/tests/functional/direct/dio_grow_block.ksh index c54d079366..12b2f21275 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_grow_block.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_grow_block.ksh @@ -41,7 +41,6 @@ function cleanup { zfs set recordsize=$rs $TESTPOOL/$TESTFS log_must rm -f $tmp_file - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify the number direct/buffered requests when growing a file" diff --git a/tests/zfs-tests/tests/functional/direct/dio_max_recordsize.ksh b/tests/zfs-tests/tests/functional/direct/dio_max_recordsize.ksh index 87900443ed..2c0ce832b1 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_max_recordsize.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_max_recordsize.ksh @@ -57,14 +57,6 @@ for type in "" "mirror" "raidz" "draid"; do; verify_dio_write_count $TESTPOOL1 $recsize $((4 * recsize)) \ $mntpnt - if [[ "$type" == "" ]]; then - check_dio_write_chksum_verify_failures $TESTPOOL1 \ - "stripe" 0 - else - check_dio_write_chksum_verify_failures $TESTPOOL1 \ - "$type" 0 - fi - destroy_pool $TESTPOOL1 done done diff --git a/tests/zfs-tests/tests/functional/direct/dio_mixed.ksh b/tests/zfs-tests/tests/functional/direct/dio_mixed.ksh index 38c6159537..6f217d91d5 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_mixed.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_mixed.ksh @@ -42,7 +42,6 @@ verify_runnable "global" function cleanup { log_must rm -f $src_file $new_file $tmp_file - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify mixed buffered and Direct I/O are coherent." 
diff --git a/tests/zfs-tests/tests/functional/direct/dio_mmap.ksh b/tests/zfs-tests/tests/functional/direct/dio_mmap.ksh index 27d03e0412..fbd6afd7b3 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_mmap.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_mmap.ksh @@ -45,7 +45,6 @@ function cleanup { zfs set recordsize=$rs $TESTPOOL/$TESTFS log_must rm -f "$tmp_file" - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify mixed Direct I/O and mmap I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_overwrites.ksh b/tests/zfs-tests/tests/functional/direct/dio_overwrites.ksh index 3854766ed8..04973fc886 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_overwrites.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_overwrites.ksh @@ -43,7 +43,6 @@ function cleanup { zfs set recordsize=$rs $TESTPOOL/$TESTFS log_must rm -f "$tmp_file" - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify Direct I/O overwrites" diff --git a/tests/zfs-tests/tests/functional/direct/dio_property.ksh b/tests/zfs-tests/tests/functional/direct/dio_property.ksh index 4fbcfec068..9e18f0bf78 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_property.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_property.ksh @@ -44,7 +44,6 @@ function cleanup { zfs set direct=standard $TESTPOOL/$TESTFS log_must rm -f $tmp_file - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify the direct=always|disabled|standard property" @@ -61,7 +60,8 @@ count=8 # # Check when "direct=always" any aligned IO is done as direct. -# Note that "flag=direct" is not set in the following calls to dd(1). +# Note that the "-D" and "-d" flags are not set in the following calls to +# stride_dd. # log_must zfs set direct=always $TESTPOOL/$TESTFS @@ -92,7 +92,8 @@ log_must rm -f $tmp_file # # Check when "direct=disabled" there are never any direct requests. -# Note that "flag=direct" is always set in the following calls to dd(1). +# Note that the "-D" and "-d" flags are always set in the following calls to +# stride_dd. 
# log_must zfs set direct=disabled $TESTPOOL/$TESTFS diff --git a/tests/zfs-tests/tests/functional/direct/dio_random.ksh b/tests/zfs-tests/tests/functional/direct/dio_random.ksh index 42c18d4261..abe8d5c0dc 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_random.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_random.ksh @@ -45,7 +45,6 @@ verify_runnable "global" function cleanup { log_must rm -f "$tmp_file" - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify randomly sized mixed Direct I/O and buffered I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_recordsize.ksh b/tests/zfs-tests/tests/functional/direct/dio_recordsize.ksh index e1087e5ac3..def4682213 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_recordsize.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_recordsize.ksh @@ -61,14 +61,6 @@ for type in "" "mirror" "raidz" "draid"; do done done - if [[ "$type" == "" ]]; then - check_dio_write_chksum_verify_failures $TESTPOOL1 \ - "stripe" 0 - else - check_dio_write_chksum_verify_failures $TESTPOOL1 \ - "$type" 0 - fi - destroy_pool $TESTPOOL1 done done diff --git a/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh b/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh index 9f50187149..309d35ea0e 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh @@ -44,7 +44,6 @@ function cleanup zfs set recordsize=$rs $TESTPOOL/$TESTFS zfs set direct=standard $TESTPOOL/$TESTFS log_must rm -f $tmp_file - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh b/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh index 571767d3b1..8bb363f1a9 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh @@ -49,7 +49,6 @@ function cleanup { log_must rm -f "$filename" log_must set recordsize=$rs $TESTPOOL/$TESTFS - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 } log_assert "Verify Direct I/O reads can read an entire file that is not \ diff --git a/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh b/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh index 5a5a5cf7ad..efc9ee6391 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh @@ -77,7 +77,7 @@ do # Manipulate the user's buffer while running O_DIRECT write # workload with the buffer. log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \ - -n $NUMBLOCKS -b $BS + -n $NUMBLOCKS -b $BS # Reading back the contents of the file log_must stride_dd -i $mntpnt/direct-write.iso -o /dev/null \ diff --git a/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh b/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh index a7e9dc0cde..536459a35e 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh @@ -33,7 +33,7 @@ # Verify checksum verify works for Direct I/O writes. # # STRATEGY: -# 1. Set the module parameter zfs_vdev_direct_write_verify_pct to 30. +# 1. Set the module parameter zfs_vdev_direct_write_verify to 0. # 2. 
Check that manipulating the user buffer while Direct I/O writes are # taking place does not cause any panics with compression turned on. # 3. Start a Direct I/O write workload while manipulating the user buffer @@ -42,7 +42,7 @@ # zpool status -d and checking for zevents. We also make sure there # are reported data errors when reading the file back. # 5. Repeat steps 3 and 4 for 3 iterations. -# 6. Set zfs_vdev_direct_write_verify_pct set to 1 and repeat 3. +# 6. Set zfs_vdev_direct_write_verify set to 1 and repeat 3. # 7. Verify there are Direct I/O write verify failures using # zpool status -d and checking for zevents. We also make sure there # there are no reported data errors when reading the file back because @@ -58,22 +58,22 @@ function cleanup log_must zpool clear $TESTPOOL # Clearing out dio_verify from event logs log_must zpool events -c - log_must set_tunable32 VDEV_DIRECT_WR_VERIFY_PCT 2 + log_must set_tunable32 VDEV_DIRECT_WR_VERIFY $DIO_WR_VERIFY_TUNABLE } log_assert "Verify checksum verify works for Direct I/O writes." if is_freebsd; then - log_unsupported "FeeBSD is capable of stable pages for O_DIRECT writes" + log_unsupported "FreeBSD is capable of stable pages for O_DIRECT writes" fi log_onexit cleanup ITERATIONS=3 NUMBLOCKS=300 -VERIFY_PCT=30 BS=$((128 * 1024)) # 128k mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +typeset DIO_WR_VERIFY_TUNABLE=$(get_tunable VDEV_DIRECT_WR_VERIFY) # Get a list of vdevs in our pool set -A array $(get_disklist_fullpath $TESTPOOL) @@ -82,7 +82,7 @@ set -A array $(get_disklist_fullpath $TESTPOOL) firstvdev=${array[0]} log_must zfs set recordsize=128k $TESTPOOL/$TESTFS -log_must set_tunable32 VDEV_DIRECT_WR_VERIFY_PCT $VERIFY_PCT +log_must set_tunable32 VDEV_DIRECT_WR_VERIFY 0 # First we will verify there are no panics while manipulating the contents of # the user buffer during Direct I/O writes with compression. The contents @@ -101,25 +101,21 @@ if [[ $total_dio_wr -lt 1 ]]; then log_fail "No Direct I/O writes $total_dio_wr" fi -log_must rm -f "$mntpnt/direct-write.iso" # Clearing out DIO counts for Zpool log_must zpool clear $TESTPOOL # Clearing out dio_verify from event logs log_must zpool events -c - - +log_must rm -f "$mntpnt/direct-write.iso" # Next we will verify there are checksum errors for Direct I/O writes while # manipulating the contents of the user pages. log_must zfs set compression=off $TESTPOOL/$TESTFS for i in $(seq 1 $ITERATIONS); do - log_note "Verifying 30% of Direct I/O write checksums iteration \ - $i of $ITERATIONS with \ - zfs_vdev_direct_write_verify_pct=$VERIFY_PCT" + log_note "Verifying Direct I/O write checksums iteration \ + $i of $ITERATIONS with zfs_vdev_direct_write_verify=0" prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) - prev_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \ -n $NUMBLOCKS -b $BS @@ -131,9 +127,7 @@ for i in $(seq 1 $ITERATIONS); do # Getting new Direct I/O and ARC write counts. 
curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) - curr_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) total_dio_wr=$((curr_dio_wr - prev_dio_wr)) - total_arc_wr=$((curr_arc_wr - prev_arc_wr)) # Verifying there are checksum errors log_note "Making sure there are checksum errors for the ZPool" @@ -144,23 +138,13 @@ for i in $(seq 1 $ITERATIONS); do log_fail "No checksum failures for ZPool $TESTPOOL" fi - # Getting checksum verify failures - verify_failures=$(get_zpool_status_chksum_verify_failures $TESTPOOL "raidz") - log_note "Making sure we have Direct I/O writes logged" if [[ $total_dio_wr -lt 1 ]]; then log_fail "No Direct I/O writes $total_dio_wr" fi - log_note "Making sure we have Direct I/O write checksum verifies with ZPool" - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 1 - - # In the event of checksum verify error, the write will be redirected - # through the ARC. We check here that we have ARC writes. - log_note "Making sure we have ARC writes have taken place in the event \ - a Direct I/O checksum verify failures occurred" - if [[ $total_arc_wr -lt $verify_failures ]]; then - log_fail "ARC writes $total_arc_wr < $verify_failures" - fi + log_note "Making sure we have no Direct I/O write checksum verifies \ + with ZPool" + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 log_must rm -f "$mntpnt/direct-write.iso" done @@ -168,19 +152,22 @@ done log_must zpool status -v $TESTPOOL log_must zpool sync $TESTPOOL + + # Finally we will verfiy that with checking every Direct I/O write we have no # errors at all. -VERIFY_PCT=100 -log_must set_tunable32 VDEV_DIRECT_WR_VERIFY_PCT $VERIFY_PCT +# Create the file before trying to manipulate the contents +log_must file_write -o create -f "$mntpnt/direct-write.iso" -b $BS \ + -c $NUMBLOCKS -w +log_must set_tunable32 VDEV_DIRECT_WR_VERIFY 1 for i in $(seq 1 $ITERATIONS); do log_note "Verifying every Direct I/O write checksums iteration $i of \ - $ITERATIONS with zfs_vdev_direct_write_verify_pct=$VERIFY_PCT" + $ITERATIONS with zfs_vdev_direct_write_verify=1" prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) - prev_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \ - -n $NUMBLOCKS -b $BS + -n $NUMBLOCKS -b $BS -e # Reading file back to verify there no are checksum errors filesize=$(get_file_size "$mntpnt/direct-write.iso") @@ -190,16 +177,11 @@ for i in $(seq 1 $ITERATIONS); do # Getting new Direct I/O and ARC Write counts. curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) - curr_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) total_dio_wr=$((curr_dio_wr - prev_dio_wr)) - total_arc_wr=$((curr_arc_wr - prev_arc_wr)) log_note "Making sure there are no checksum errors with the ZPool" log_must check_pool_status $TESTPOOL "errors" "No known data errors" - # Geting checksum verify failures - verify_failures=$(get_zpool_status_chksum_verify_failures $TESTPOOL "raidz") - log_note "Making sure we have Direct I/O writes logged" if [[ $total_dio_wr -lt 1 ]]; then log_fail "No Direct I/O writes $total_dio_wr" @@ -207,16 +189,8 @@ for i in $(seq 1 $ITERATIONS); do log_note "Making sure we have Direct I/O write checksum verifies with ZPool" check_dio_write_chksum_verify_failures "$TESTPOOL" "raidz" 1 - - # In the event of checksum verify error, the write will be redirected - # through the ARC. We check here that we have ARC writes. 
- log_note "Making sure we have ARC writes have taken place in the event \ - a Direct I/O checksum verify failures occurred" - if [[ $total_arc_wr -lt $verify_failures ]]; then - log_fail "ARC writes $total_arc_wr < $verify_failures" - fi - - log_must rm -f "$mntpnt/direct-write.iso" done +log_must rm -f "$mntpnt/direct-write.iso" + log_pass "Verified checksum verify works for Direct I/O writes."