Fix lseek(SEEK_DATA/SEEK_HOLE) mmap consistency

When using lseek(2) to report data/holes memory mapped regions of
the file were ignored.  This could result in incorrect results.
To handle this zfs_holey_common() was updated to asynchronously
writeback any dirty mmap(2) regions prior to reporting holes.

Additionally, while not strictly required, the dn_struct_rwlock is
now held over the dirty check to prevent the dnode structure from
changing.  This ensures that a clean dnode can't be dirtied before
the data/hole is located.  The range lock is now also taken to
ensure the call cannot race with zfs_write().

Furthermore, the code was refactored to provide a dnode_is_dirty()
helper function which checks the dnode for any dirty records to
determine its dirtiness.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #11900
Closes #12724
This commit is contained in:
Brian Behlendorf 2021-11-07 13:27:44 -08:00 committed by Tony Hutter
parent 5bf81fea2f
commit 664d487a5d
18 changed files with 305 additions and 32 deletions

View File

@ -221,6 +221,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/cmd/mktree/Makefile tests/zfs-tests/cmd/mktree/Makefile
tests/zfs-tests/cmd/mmap_exec/Makefile tests/zfs-tests/cmd/mmap_exec/Makefile
tests/zfs-tests/cmd/mmap_libaio/Makefile tests/zfs-tests/cmd/mmap_libaio/Makefile
tests/zfs-tests/cmd/mmap_seek/Makefile
tests/zfs-tests/cmd/mmapwrite/Makefile tests/zfs-tests/cmd/mmapwrite/Makefile
tests/zfs-tests/cmd/nvlist_to_lua/Makefile tests/zfs-tests/cmd/nvlist_to_lua/Makefile
tests/zfs-tests/cmd/randfree_file/Makefile tests/zfs-tests/cmd/randfree_file/Makefile

View File

@ -59,6 +59,8 @@ enum symfollow { NO_FOLLOW = NOFOLLOW };
#include <sys/file.h> #include <sys/file.h>
#include <sys/filedesc.h> #include <sys/filedesc.h>
#include <sys/syscallsubr.h> #include <sys/syscallsubr.h>
#include <sys/vm.h>
#include <vm/vm_object.h>
typedef struct vop_vector vnodeops_t; typedef struct vop_vector vnodeops_t;
#define VOP_FID VOP_VPTOFH #define VOP_FID VOP_VPTOFH
@ -83,6 +85,22 @@ vn_is_readonly(vnode_t *vp)
#define vn_has_cached_data(vp) \ #define vn_has_cached_data(vp) \
((vp)->v_object != NULL && \ ((vp)->v_object != NULL && \
(vp)->v_object->resident_page_count > 0) (vp)->v_object->resident_page_count > 0)
static __inline void
vn_flush_cached_data(vnode_t *vp, boolean_t sync)
{
#if __FreeBSD_version > 1300054
if (vm_object_mightbedirty(vp->v_object)) {
#else
if (vp->v_object->flags & OBJ_MIGHTBEDIRTY) {
#endif
int flags = sync ? OBJPC_SYNC : 0;
zfs_vmobject_wlock(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, flags);
zfs_vmobject_wunlock(vp->v_object);
}
}
#define vn_exists(vp) do { } while (0) #define vn_exists(vp) do { } while (0)
#define vn_invalid(vp) do { } while (0) #define vn_invalid(vp) do { } while (0)
#define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0) #define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0)

View File

@ -118,7 +118,8 @@ extern minor_t zfsdev_minor_alloc(void);
#define Z_ISLNK(type) ((type) == VLNK) #define Z_ISLNK(type) ((type) == VLNK)
#define Z_ISDIR(type) ((type) == VDIR) #define Z_ISDIR(type) ((type) == VDIR)
#define zn_has_cached_data(zp) vn_has_cached_data(ZTOV(zp)) #define zn_has_cached_data(zp) vn_has_cached_data(ZTOV(zp))
#define zn_flush_cached_data(zp, sync) vn_flush_cached_data(ZTOV(zp), sync)
#define zn_rlimit_fsize(zp, uio) \ #define zn_rlimit_fsize(zp, uio) \
vn_rlimit_fsize(ZTOV(zp), GET_UIO_STRUCT(uio), zfs_uio_td(uio)) vn_rlimit_fsize(ZTOV(zp), GET_UIO_STRUCT(uio), zfs_uio_td(uio))

View File

@ -71,6 +71,7 @@ extern "C" {
#define Z_ISDIR(type) S_ISDIR(type) #define Z_ISDIR(type) S_ISDIR(type)
#define zn_has_cached_data(zp) ((zp)->z_is_mapped) #define zn_has_cached_data(zp) ((zp)->z_is_mapped)
#define zn_flush_cached_data(zp, sync) write_inode_now(ZTOI(zp), sync)
#define zn_rlimit_fsize(zp, uio) (0) #define zn_rlimit_fsize(zp, uio) (0)
/* /*

View File

@ -425,6 +425,7 @@ boolean_t dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref);
void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting); void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
int dnode_try_claim(objset_t *os, uint64_t object, int slots); int dnode_try_claim(objset_t *os, uint64_t object, int slots);
boolean_t dnode_is_dirty(dnode_t *dn);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag); void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag);
void dnode_sync(dnode_t *dn, dmu_tx_t *tx); void dnode_sync(dnode_t *dn, dmu_tx_t *tx);

View File

@ -1574,7 +1574,7 @@ Allow no-operation writes.
The occurrence of nopwrites will further depend on other pool properties The occurrence of nopwrites will further depend on other pool properties
.Pq i.a. the checksumming and compression algorithms . .Pq i.a. the checksumming and compression algorithms .
. .
.It Sy zfs_dmu_offset_next_sync Ns = Ns Sy 0 Ns | ns 1 Pq int .It Sy zfs_dmu_offset_next_sync Ns = Ns Sy 0 Ns | Ns 1 Pq int
Enable forcing TXG sync to find holes. Enable forcing TXG sync to find holes.
When enabled forces ZFS to act like prior versions when When enabled forces ZFS to act like prior versions when
.Sy SEEK_HOLE No or Sy SEEK_DATA .Sy SEEK_HOLE No or Sy SEEK_DATA

View File

@ -2095,42 +2095,41 @@ int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{ {
dnode_t *dn; dnode_t *dn;
int i, err; int err;
boolean_t clean = B_TRUE;
restart:
err = dnode_hold(os, object, FTAG, &dn); err = dnode_hold(os, object, FTAG, &dn);
if (err) if (err)
return (err); return (err);
/* rw_enter(&dn->dn_struct_rwlock, RW_READER);
* Check if dnode is dirty
*/ if (dnode_is_dirty(dn)) {
for (i = 0; i < TXG_SIZE; i++) { /*
if (multilist_link_active(&dn->dn_dirty_link[i])) { * If the zfs_dmu_offset_next_sync module option is enabled
clean = B_FALSE; * then strict hole reporting has been requested. Dirty
break; * dnodes must be synced to disk to accurately report all
* holes. When disabled (the default) dirty dnodes are
* reported to not have any holes which is always safe.
*
* When called by zfs_holey_common() the zp->z_rangelock
* is held to prevent zfs_write() and mmap writeback from
* re-dirtying the dnode after txg_wait_synced().
*/
if (zfs_dmu_offset_next_sync) {
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
txg_wait_synced(dmu_objset_pool(os), 0);
goto restart;
} }
}
/*
* If compatibility option is on, sync any current changes before
* we go trundling through the block pointers.
*/
if (!clean && zfs_dmu_offset_next_sync) {
clean = B_TRUE;
dnode_rele(dn, FTAG);
txg_wait_synced(dmu_objset_pool(os), 0);
err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
}
if (clean)
err = dnode_next_offset(dn,
(hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
else
err = SET_ERROR(EBUSY); err = SET_ERROR(EBUSY);
} else {
err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK |
(hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
}
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG); dnode_rele(dn, FTAG);
return (err); return (err);

View File

@ -1648,6 +1648,26 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots)
slots, NULL, NULL)); slots, NULL, NULL));
} }
/*
* Checks if the dnode contains any uncommitted dirty records.
*/
boolean_t
dnode_is_dirty(dnode_t *dn)
{
mutex_enter(&dn->dn_mtx);
for (int i = 0; i < TXG_SIZE; i++) {
if (list_head(&dn->dn_dirty_records[i]) != NULL) {
mutex_exit(&dn->dn_mtx);
return (B_TRUE);
}
}
mutex_exit(&dn->dn_mtx);
return (B_FALSE);
}
void void
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
{ {

View File

@ -85,6 +85,7 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
static int static int
zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
{ {
zfs_locked_range_t *lr;
uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t noff = (uint64_t)*off; /* new offset */
uint64_t file_sz; uint64_t file_sz;
int error; int error;
@ -100,12 +101,18 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
else else
hole = B_FALSE; hole = B_FALSE;
/* Flush any mmap()'d data to disk */
if (zn_has_cached_data(zp))
zn_flush_cached_data(zp, B_FALSE);
lr = zfs_rangelock_enter(&zp->z_rangelock, 0, file_sz, RL_READER);
error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
zfs_rangelock_exit(lr);
if (error == ESRCH) if (error == ESRCH)
return (SET_ERROR(ENXIO)); return (SET_ERROR(ENXIO));
/* file was dirty, so fall back to using generic logic */ /* File was dirty, so fall back to using generic logic */
if (error == EBUSY) { if (error == EBUSY) {
if (hole) if (hole)
*off = file_sz; *off = file_sz;

View File

@ -669,7 +669,7 @@ tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos',
tags = ['functional', 'migration'] tags = ['functional', 'migration']
[tests/functional/mmap] [tests/functional/mmap]
tests = ['mmap_write_001_pos', 'mmap_read_001_pos'] tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos']
tags = ['functional', 'mmap'] tags = ['functional', 'mmap']
[tests/functional/mount] [tests/functional/mount]

View File

@ -19,6 +19,7 @@ SUBDIRS = \
mktree \ mktree \
mmap_exec \ mmap_exec \
mmap_libaio \ mmap_libaio \
mmap_seek \
mmapwrite \ mmapwrite \
nvlist_to_lua \ nvlist_to_lua \
randwritecomp \ randwritecomp \

View File

@ -0,0 +1 @@
/mmap_seek

View File

@ -0,0 +1,6 @@
include $(top_srcdir)/config/Rules.am
pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin
pkgexec_PROGRAMS = mmap_seek
mmap_seek_SOURCES = mmap_seek.c

View File

@ -0,0 +1,147 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
*/
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <errno.h>
static void
seek_data(int fd, off_t offset, off_t expected)
{
off_t data_offset = lseek(fd, offset, SEEK_DATA);
if (data_offset != expected) {
fprintf(stderr, "lseek(fd, %d, SEEK_DATA) = %d (expected %d)\n",
(int)offset, (int)data_offset, (int)expected);
exit(2);
}
}
static void
seek_hole(int fd, off_t offset, off_t expected)
{
off_t hole_offset = lseek(fd, offset, SEEK_HOLE);
if (hole_offset != expected) {
fprintf(stderr, "lseek(fd, %d, SEEK_HOLE) = %d (expected %d)\n",
(int)offset, (int)hole_offset, (int)expected);
exit(2);
}
}
int
main(int argc, char **argv)
{
char *execname = argv[0];
char *file_path = argv[1];
char *buf = NULL;
int err;
if (argc != 4) {
(void) printf("usage: %s <file name> <file size> "
"<block size>\n", argv[0]);
exit(1);
}
int fd = open(file_path, O_RDWR | O_CREAT, 0666);
if (fd == -1) {
(void) fprintf(stderr, "%s: %s: ", execname, file_path);
perror("open");
exit(2);
}
off_t file_size = atoi(argv[2]);
off_t block_size = atoi(argv[3]);
if (block_size * 2 > file_size) {
(void) fprintf(stderr, "file size must be at least "
"double the block size\n");
exit(2);
}
err = ftruncate(fd, file_size);
if (err == -1) {
perror("ftruncate");
exit(2);
}
if ((buf = mmap(NULL, file_size, PROT_READ | PROT_WRITE,
MAP_SHARED, fd, 0)) == MAP_FAILED) {
perror("mmap");
exit(2);
}
/* Verify the file is sparse and reports no data. */
seek_data(fd, 0, -1);
/* Verify the file is reported as a hole. */
seek_hole(fd, 0, 0);
/* Verify search beyond end of file is an error. */
seek_data(fd, 2 * file_size, -1);
seek_hole(fd, 2 * file_size, -1);
/* Dirty the first byte. */
memset(buf, 'a', 1);
seek_data(fd, 0, 0);
seek_data(fd, block_size, -1);
seek_hole(fd, 0, block_size);
seek_hole(fd, block_size, block_size);
/* Dirty the first half of the file. */
memset(buf, 'b', file_size / 2);
seek_data(fd, 0, 0);
seek_data(fd, block_size, block_size);
seek_hole(fd, 0, P2ROUNDUP(file_size / 2, block_size));
seek_hole(fd, block_size, P2ROUNDUP(file_size / 2, block_size));
/* Dirty the whole file. */
memset(buf, 'c', file_size);
seek_data(fd, 0, 0);
seek_data(fd, file_size * 3 / 4,
P2ROUNDUP(file_size * 3 / 4, block_size));
seek_hole(fd, 0, file_size);
seek_hole(fd, file_size / 2, file_size);
/* Punch a hole (required compression be enabled). */
memset(buf + block_size, 0, block_size);
seek_data(fd, 0, 0);
seek_data(fd, block_size, 2 * block_size);
seek_hole(fd, 0, block_size);
seek_hole(fd, block_size, block_size);
seek_hole(fd, 2 * block_size, file_size);
err = munmap(buf, file_size);
if (err == -1) {
perror("munmap");
exit(2);
}
close(fd);
return (0);
}

View File

@ -209,6 +209,7 @@ export ZFSTEST_FILES='badsend
mktree mktree
mmap_exec mmap_exec
mmap_libaio mmap_libaio
mmap_seek
mmapwrite mmapwrite
nvlist_to_lua nvlist_to_lua
randfree_file randfree_file

View File

@ -33,6 +33,7 @@ DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode
DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms
DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms
DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check
DMU_OFFSET_NEXT_SYNC dmu_offset_next_sync zfs_dmu_offset_next_sync
INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size
INITIALIZE_VALUE initialize_value zfs_initialize_value INITIALIZE_VALUE initialize_value zfs_initialize_value
KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export

View File

@ -4,7 +4,8 @@ dist_pkgdata_SCRIPTS = \
cleanup.ksh \ cleanup.ksh \
mmap_read_001_pos.ksh \ mmap_read_001_pos.ksh \
mmap_write_001_pos.ksh \ mmap_write_001_pos.ksh \
mmap_libaio_001_pos.ksh mmap_libaio_001_pos.ksh \
mmap_seek_001_pos.ksh
dist_pkgdata_DATA = \ dist_pkgdata_DATA = \
mmap.cfg mmap.cfg

View File

@ -0,0 +1,67 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/mmap/mmap.cfg
#
# DESCRIPTION:
# lseek() data/holes for an mmap()'d file.
#
# STRATEGY:
# 1. Enable compression and hole reporting for dirty files.
# 2. Call mmap_seek binary test case for various record sizes.
#
verify_runnable "global"
function cleanup
{
log_must zfs set compression=off $TESTPOOL/$TESTFS
log_must zfs set recordsize=128k $TESTPOOL/$TESTFS
log_must rm -f $TESTDIR/test-mmap-file
log_must set_tunable64 DMU_OFFSET_NEXT_SYNC $dmu_offset_next_sync
}
log_assert "lseek() data/holes for an mmap()'d file."
log_onexit cleanup
# Enable hole reporting for dirty files.
typeset dmu_offset_next_sync=$(get_tunable DMU_OFFSET_NEXT_SYNC)
log_must set_tunable64 DMU_OFFSET_NEXT_SYNC 1
# Compression must be enabled to convert zero'd blocks to holes.
# This behavior is checked by the mmap_seek test.
log_must zfs set compression=on $TESTPOOL/$TESTFS
for bs in 4096 8192 16384 32768 65536 131072; do
log_must zfs set recordsize=$bs $TESTPOOL/$TESTFS
log_must mmap_seek $TESTDIR/test-mmap-file $((1024*1024)) $bs
log_must rm $TESTDIR/test-mmap-file
done
log_pass "lseek() data/holes for an mmap()'d file succeeded."