diff --git a/include/os/linux/zfs/sys/zfs_vnops_os.h b/include/os/linux/zfs/sys/zfs_vnops_os.h index 47f91e4a6c..129bc43a89 100644 --- a/include/os/linux/zfs/sys/zfs_vnops_os.h +++ b/include/os/linux/zfs/sys/zfs_vnops_os.h @@ -68,7 +68,7 @@ extern void zfs_inactive(struct inode *ip); extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr); extern int zfs_fid(struct inode *ip, fid_t *fidp); -extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages); +extern int zfs_getpage(struct inode *ip, struct page *pp); extern int zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc); extern int zfs_dirty_inode(struct inode *ip, int flags); diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index ae0401e60e..27e1ab420b 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -244,43 +244,46 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) } #if defined(_KERNEL) + +static int zfs_fillpage(struct inode *ip, struct page *pp); + /* * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Write: If we find a memory mapped page, we write to *both* - * the page and the dmu buffer. + * between the DMU cache and the memory mapped pages. Update all mapped + * pages with the contents of the coresponding dmu buffer. */ void update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { - struct inode *ip = ZTOI(zp); - struct address_space *mp = ip->i_mapping; - struct page *pp; - uint64_t nbytes; - int64_t off; - void *pb; + struct address_space *mp = ZTOI(zp)->i_mapping; + int64_t off = start & (PAGE_SIZE - 1); - off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { - nbytes = MIN(PAGE_SIZE - off, len); + uint64_t nbytes = MIN(PAGE_SIZE - off, len); - pp = find_lock_page(mp, start >> PAGE_SHIFT); + struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); - pb = kmap(pp); - (void) dmu_read(os, zp->z_id, start + off, nbytes, - pb + off, DMU_READ_PREFETCH); + void *pb = kmap(pp); + int error = dmu_read(os, zp->z_id, start + off, + nbytes, pb + off, DMU_READ_PREFETCH); kunmap(pp); - if (mapping_writably_mapped(mp)) - flush_dcache_page(pp); + if (error) { + SetPageError(pp); + ClearPageUptodate(pp); + } else { + ClearPageError(pp); + SetPageUptodate(pp); + + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + mark_page_accessed(pp); + } - mark_page_accessed(pp); - SetPageUptodate(pp); - ClearPageError(pp); unlock_page(pp); put_page(pp); } @@ -291,38 +294,44 @@ update_pages(znode_t *zp, int64_t start, int len, objset_t *os) } /* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Read: We "read" preferentially from memory mapped pages, - * else we default from the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. + * When a file is memory mapped, we must keep the I/O data synchronized + * between the DMU cache and the memory mapped pages. Preferentially read + * from memory mapped pages, otherwise fallback to reading through the dmu. */ int mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; - struct page *pp; - int64_t start, off; - uint64_t bytes; + int64_t start = uio->uio_loffset; + int64_t off = start & (PAGE_SIZE - 1); int len = nbytes; int error = 0; - void *pb; - start = uio->uio_loffset; - off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { - bytes = MIN(PAGE_SIZE - off, len); + uint64_t bytes = MIN(PAGE_SIZE - off, len); - pp = find_lock_page(mp, start >> PAGE_SHIFT); + struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { - ASSERT(PageUptodate(pp)); + /* + * If filemap_fault() retries there exists a window + * where the page will be unlocked and not up to date. + * In this case we must try and fill the page. + */ + if (unlikely(!PageUptodate(pp))) { + error = zfs_fillpage(ip, pp); + if (error) { + unlock_page(pp); + put_page(pp); + return (error); + } + } + + ASSERT(PageUptodate(pp) || PageDirty(pp)); + unlock_page(pp); - pb = kmap(pp); + void *pb = kmap(pp); error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); kunmap(pp); @@ -338,9 +347,11 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) len -= bytes; off = 0; + if (error) break; } + return (error); } #endif /* _KERNEL */ @@ -3766,55 +3777,41 @@ zfs_inactive(struct inode *ip) * Fill pages with data from the disk. */ static int -zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) +zfs_fillpage(struct inode *ip, struct page *pp) { - znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); - objset_t *os; - struct page *cur_pp; - u_offset_t io_off, total; - size_t io_len; - loff_t i_size; - unsigned page_idx; - int err; - - os = zfsvfs->z_os; - io_len = nr_pages << PAGE_SHIFT; - i_size = i_size_read(ip); - io_off = page_offset(pl[0]); + loff_t i_size = i_size_read(ip); + u_offset_t io_off = page_offset(pp); + size_t io_len = PAGE_SIZE; if (io_off + io_len > i_size) io_len = i_size - io_off; - /* - * Iterate over list of pages and read each page individually. - */ - page_idx = 0; - for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { - caddr_t va; + void *va = kmap(pp); + int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off, + PAGE_SIZE, va, DMU_READ_PREFETCH); + kunmap(pp); - cur_pp = pl[page_idx++]; - va = kmap(cur_pp); - err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, - DMU_READ_PREFETCH); - kunmap(cur_pp); - if (err) { - /* convert checksum errors into IO errors */ - if (err == ECKSUM) - err = SET_ERROR(EIO); - return (err); - } + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + + SetPageError(pp); + ClearPageUptodate(pp); + } else { + ClearPageError(pp); + SetPageUptodate(pp); } - return (0); + return (error); } /* - * Uses zfs_fillpage to read data from the file and fill the pages. + * Uses zfs_fillpage to read data from the file and fill the page. * * IN: ip - inode of file to get data from. - * pl - list of pages to read - * nr_pages - number of pages to read + * pp - page to read * * RETURN: 0 on success, error code on failure. * @@ -3823,24 +3820,22 @@ zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) */ /* ARGSUSED */ int -zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) +zfs_getpage(struct inode *ip, struct page *pp) { - znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); - int err; - - if (pl == NULL) - return (0); + znode_t *zp = ITOZ(ip); + int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - err = zfs_fillpage(ip, pl, nr_pages); - - dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE); + error = zfs_fillpage(ip, pp); + if (error == 0) + dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); ZFS_EXIT(zfsvfs); - return (err); + + return (error); } /* diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 38d2bd1470..effd7dae13 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -609,29 +609,16 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) static inline int zpl_readpage_common(struct page *pp) { - struct inode *ip; - struct page *pl[1]; - int error = 0; fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); - ip = pp->mapping->host; - pl[0] = pp; cookie = spl_fstrans_mark(); - error = -zfs_getpage(ip, pl, 1); + int error = -zfs_getpage(pp->mapping->host, pp); spl_fstrans_unmark(cookie); - if (error) { - SetPageError(pp); - ClearPageUptodate(pp); - } else { - ClearPageError(pp); - SetPageUptodate(pp); - flush_dcache_page(pp); - } - unlock_page(pp); + return (error); } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 554cf96f88..083f0ef69a 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -669,7 +669,8 @@ tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', tags = ['functional', 'migration'] [tests/functional/mmap] -tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos'] +tests = ['mmap_mixed', 'mmap_read_001_pos', 'mmap_seek_001_pos', + 'mmap_write_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mount] diff --git a/tests/zfs-tests/cmd/mmap_sync.c b/tests/zfs-tests/cmd/mmap_sync.c new file mode 100644 index 0000000000..226e71be2f --- /dev/null +++ b/tests/zfs-tests/cmd/mmap_sync.c @@ -0,0 +1,152 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void +cleanup(char *file) +{ + (void) remove(file); +} + +int +main(int argc, char *argv[]) +{ + char *testdir = getenv("TESTDIR"); + if (!testdir) { + fprintf(stderr, "environment variable TESTDIR not set\n"); + return (1); + } + + struct stat st; + umask(0); + if (stat(testdir, &st) != 0 && + mkdir(testdir, 0777) != 0) { + perror("mkdir"); + return (1); + } + + if (argc > 3) { + fprintf(stderr, "usage: %s " + "[run time in mins] " + "[max msync time in ms]\n", argv[0]); + return (1); + } + + int run_time_mins = 1; + if (argc >= 2) { + run_time_mins = atoi(argv[1]); + } + + int max_msync_time_ms = 1000; + if (argc >= 3) { + max_msync_time_ms = atoi(argv[2]); + } + + char filepath[512]; + filepath[0] = '\0'; + char *file = &filepath[0]; + + (void) snprintf(file, 512, "%s/msync_file", testdir); + + const int LEN = 8; + cleanup(file); + + int fd = open(file, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR | + S_IRGRP | S_IROTH); + + if (fd == -1) { + (void) fprintf(stderr, "%s: %s: ", argv[0], file); + perror("open"); + return (1); + } + + if (ftruncate(fd, LEN) != 0) { + perror("ftruncate"); + cleanup(file); + return (1); + } + + void *ptr = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + + if (ptr == MAP_FAILED) { + perror("mmap"); + cleanup(file); + return (1); + } + + struct timeval tstart; + gettimeofday(&tstart, NULL); + + long long x = 0LL; + + for (;;) { + *((long long *)ptr) = x; + x++; + + struct timeval t1, t2; + gettimeofday(&t1, NULL); + if (msync(ptr, LEN, MS_SYNC|MS_INVALIDATE) != 0) { + perror("msync"); + cleanup(file); + return (1); + } + + gettimeofday(&t2, NULL); + + double elapsed = (t2.tv_sec - t1.tv_sec) * 1000.0; + elapsed += ((t2.tv_usec - t1.tv_usec) / 1000.0); + if (elapsed > max_msync_time_ms) { + fprintf(stderr, "slow msync: %f ms\n", elapsed); + if (munmap(ptr, LEN) != 0) + perror("munmap"); + cleanup(file); + return (1); + } + + double elapsed_start = (t2.tv_sec - tstart.tv_sec) * 1000.0; + elapsed_start += ((t2.tv_usec - tstart.tv_usec) / 1000.0); + if (elapsed_start > run_time_mins * 60 * 1000) { + break; + } + } + + if (munmap(ptr, LEN) != 0) { + perror("munmap"); + cleanup(file); + return (1); + } + + if (close(fd) != 0) { + perror("close"); + } + + cleanup(file); + return (0); +} diff --git a/tests/zfs-tests/tests/functional/mmap/Makefile.am b/tests/zfs-tests/tests/functional/mmap/Makefile.am index b26791ee7c..301007f63f 100644 --- a/tests/zfs-tests/tests/functional/mmap/Makefile.am +++ b/tests/zfs-tests/tests/functional/mmap/Makefile.am @@ -2,6 +2,7 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/mmap dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ + mmap_mixed.ksh \ mmap_read_001_pos.ksh \ mmap_write_001_pos.ksh \ mmap_libaio_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_mixed.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_mixed.ksh new file mode 100755 index 0000000000..6c8246d48a --- /dev/null +++ b/tests/zfs-tests/tests/functional/mmap/mmap_mixed.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/mmap/mmap.cfg + +# +# DESCRIPTION: +# Verify mixed buffered and mmap IO. +# +# STRATEGY: +# 1. Create an empty file. +# 2. Start a background buffered read/write fio to the file. +# 3. Start a background mmap read/write fio to the file. +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f "$tmp_file" +} + +log_assert "Verify mixed buffered and mmap IO" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +tmp_file=$mntpnt/file +bs=$((128 * 1024)) +blocks=64 +size=$((bs * blocks)) +runtime=60 + +log_must dd if=/dev/zero of=$tmp_file bs=$bs count=$blocks + +# Buffered IO writes +log_must eval "fio --filename=$tmp_file --name=buffer-write \ + --rw=randwrite --size=$size --bs=$bs --direct=0 --numjobs=1 \ + --ioengine=sync --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +# Buffered IO reads +log_must eval "fio --filename=$tmp_file --name=buffer-read \ + --rw=randread --size=$size --bs=$bs --direct=0 --numjobs=1 \ + --ioengine=sync --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +# mmap IO writes +log_must eval "fio --filename=$tmp_file --name=mmap-write \ + --rw=randwrite --size=$size --bs=$bs --numjobs=1 \ + --ioengine=mmap --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +# mmap IO reads +log_must eval "fio --filename=$tmp_file --name=mmap-read \ + --rw=randread --size=$size --bs=$bs --numjobs=1 \ + --ioengine=mmap --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +log_must wait + +log_pass "Verfied mixed buffered and mmap IO"