Fix lseek(SEEK_DATA/SEEK_HOLE) mmap consistency
When using lseek(2) to report data/holes memory mapped regions of the file were ignored. This could result in incorrect results. To handle this zfs_holey_common() was updated to asynchronously writeback any dirty mmap(2) regions prior to reporting holes. Additionally, while not strictly required, the dn_struct_rwlock is now held over the dirty check to prevent the dnode structure from changing. This ensures that a clean dnode can't be dirtied before the data/hole is located. The range lock is now also taken to ensure the call cannot race with zfs_write(). Furthermore, the code was refactored to provide a dnode_is_dirty() helper function which checks the dnode for any dirty records to determine its dirtiness. Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Rich Ercolani <rincebrain@gmail.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #11900 Closes #12724
This commit is contained in:
parent
4ba1a6227a
commit
a524f8d6af
|
@ -221,6 +221,7 @@ AC_CONFIG_FILES([
|
|||
tests/zfs-tests/cmd/mktree/Makefile
|
||||
tests/zfs-tests/cmd/mmap_exec/Makefile
|
||||
tests/zfs-tests/cmd/mmap_libaio/Makefile
|
||||
tests/zfs-tests/cmd/mmap_seek/Makefile
|
||||
tests/zfs-tests/cmd/mmapwrite/Makefile
|
||||
tests/zfs-tests/cmd/nvlist_to_lua/Makefile
|
||||
tests/zfs-tests/cmd/randfree_file/Makefile
|
||||
|
|
|
@ -59,6 +59,8 @@ enum symfollow { NO_FOLLOW = NOFOLLOW };
|
|||
#include <sys/file.h>
|
||||
#include <sys/filedesc.h>
|
||||
#include <sys/syscallsubr.h>
|
||||
#include <sys/vm.h>
|
||||
#include <vm/vm_object.h>
|
||||
|
||||
typedef struct vop_vector vnodeops_t;
|
||||
#define VOP_FID VOP_VPTOFH
|
||||
|
@ -88,6 +90,22 @@ vn_is_readonly(vnode_t *vp)
|
|||
#define vn_has_cached_data(vp) \
|
||||
((vp)->v_object != NULL && \
|
||||
(vp)->v_object->resident_page_count > 0)
|
||||
|
||||
static __inline void
|
||||
vn_flush_cached_data(vnode_t *vp, boolean_t sync)
|
||||
{
|
||||
#if __FreeBSD_version > 1300054
|
||||
if (vm_object_mightbedirty(vp->v_object)) {
|
||||
#else
|
||||
if (vp->v_object->flags & OBJ_MIGHTBEDIRTY) {
|
||||
#endif
|
||||
int flags = sync ? OBJPC_SYNC : 0;
|
||||
zfs_vmobject_wlock(vp->v_object);
|
||||
vm_object_page_clean(vp->v_object, 0, 0, flags);
|
||||
zfs_vmobject_wunlock(vp->v_object);
|
||||
}
|
||||
}
|
||||
|
||||
#define vn_exists(vp) do { } while (0)
|
||||
#define vn_invalid(vp) do { } while (0)
|
||||
#define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0)
|
||||
|
|
|
@ -117,7 +117,8 @@ extern minor_t zfsdev_minor_alloc(void);
|
|||
#define Z_ISLNK(type) ((type) == VLNK)
|
||||
#define Z_ISDIR(type) ((type) == VDIR)
|
||||
|
||||
#define zn_has_cached_data(zp) vn_has_cached_data(ZTOV(zp))
|
||||
#define zn_has_cached_data(zp) vn_has_cached_data(ZTOV(zp))
|
||||
#define zn_flush_cached_data(zp, sync) vn_flush_cached_data(ZTOV(zp), sync)
|
||||
#define zn_rlimit_fsize(zp, uio) \
|
||||
vn_rlimit_fsize(ZTOV(zp), GET_UIO_STRUCT(uio), zfs_uio_td(uio))
|
||||
|
||||
|
|
|
@ -71,6 +71,7 @@ extern "C" {
|
|||
#define Z_ISDIR(type) S_ISDIR(type)
|
||||
|
||||
#define zn_has_cached_data(zp) ((zp)->z_is_mapped)
|
||||
#define zn_flush_cached_data(zp, sync) write_inode_now(ZTOI(zp), sync)
|
||||
#define zn_rlimit_fsize(zp, uio) (0)
|
||||
|
||||
/*
|
||||
|
|
|
@ -425,6 +425,7 @@ boolean_t dnode_add_ref(dnode_t *dn, void *ref);
|
|||
void dnode_rele(dnode_t *dn, void *ref);
|
||||
void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
|
||||
int dnode_try_claim(objset_t *os, uint64_t object, int slots);
|
||||
boolean_t dnode_is_dirty(dnode_t *dn);
|
||||
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
|
||||
void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag);
|
||||
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
|
||||
|
|
|
@ -2045,42 +2045,41 @@ int
|
|||
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
|
||||
{
|
||||
dnode_t *dn;
|
||||
int i, err;
|
||||
boolean_t clean = B_TRUE;
|
||||
int err;
|
||||
|
||||
restart:
|
||||
err = dnode_hold(os, object, FTAG, &dn);
|
||||
if (err)
|
||||
return (err);
|
||||
|
||||
/*
|
||||
* Check if dnode is dirty
|
||||
*/
|
||||
for (i = 0; i < TXG_SIZE; i++) {
|
||||
if (multilist_link_active(&dn->dn_dirty_link[i])) {
|
||||
clean = B_FALSE;
|
||||
break;
|
||||
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
||||
|
||||
if (dnode_is_dirty(dn)) {
|
||||
/*
|
||||
* If the zfs_dmu_offset_next_sync module option is enabled
|
||||
* then strict hole reporting has been requested. Dirty
|
||||
* dnodes must be synced to disk to accurately report all
|
||||
* holes. When disabled (the default) dirty dnodes are
|
||||
* reported to not have any holes which is always safe.
|
||||
*
|
||||
* When called by zfs_holey_common() the zp->z_rangelock
|
||||
* is held to prevent zfs_write() and mmap writeback from
|
||||
* re-dirtying the dnode after txg_wait_synced().
|
||||
*/
|
||||
if (zfs_dmu_offset_next_sync) {
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
dnode_rele(dn, FTAG);
|
||||
txg_wait_synced(dmu_objset_pool(os), 0);
|
||||
goto restart;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If compatibility option is on, sync any current changes before
|
||||
* we go trundling through the block pointers.
|
||||
*/
|
||||
if (!clean && zfs_dmu_offset_next_sync) {
|
||||
clean = B_TRUE;
|
||||
dnode_rele(dn, FTAG);
|
||||
txg_wait_synced(dmu_objset_pool(os), 0);
|
||||
err = dnode_hold(os, object, FTAG, &dn);
|
||||
if (err)
|
||||
return (err);
|
||||
}
|
||||
|
||||
if (clean)
|
||||
err = dnode_next_offset(dn,
|
||||
(hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
|
||||
else
|
||||
err = SET_ERROR(EBUSY);
|
||||
} else {
|
||||
err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK |
|
||||
(hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
|
||||
}
|
||||
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
dnode_rele(dn, FTAG);
|
||||
|
||||
return (err);
|
||||
|
|
|
@ -1653,6 +1653,26 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots)
|
|||
slots, NULL, NULL));
|
||||
}
|
||||
|
||||
/*
|
||||
* Checks if the dnode contains any uncommitted dirty records.
|
||||
*/
|
||||
boolean_t
|
||||
dnode_is_dirty(dnode_t *dn)
|
||||
{
|
||||
mutex_enter(&dn->dn_mtx);
|
||||
|
||||
for (int i = 0; i < TXG_SIZE; i++) {
|
||||
if (list_head(&dn->dn_dirty_records[i]) != NULL) {
|
||||
mutex_exit(&dn->dn_mtx);
|
||||
return (B_TRUE);
|
||||
}
|
||||
}
|
||||
|
||||
mutex_exit(&dn->dn_mtx);
|
||||
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
void
|
||||
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
|
||||
{
|
||||
|
|
|
@ -85,6 +85,7 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
|
|||
static int
|
||||
zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
|
||||
{
|
||||
zfs_locked_range_t *lr;
|
||||
uint64_t noff = (uint64_t)*off; /* new offset */
|
||||
uint64_t file_sz;
|
||||
int error;
|
||||
|
@ -100,12 +101,18 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
|
|||
else
|
||||
hole = B_FALSE;
|
||||
|
||||
/* Flush any mmap()'d data to disk */
|
||||
if (zn_has_cached_data(zp))
|
||||
zn_flush_cached_data(zp, B_FALSE);
|
||||
|
||||
lr = zfs_rangelock_enter(&zp->z_rangelock, 0, file_sz, RL_READER);
|
||||
error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
|
||||
zfs_rangelock_exit(lr);
|
||||
|
||||
if (error == ESRCH)
|
||||
return (SET_ERROR(ENXIO));
|
||||
|
||||
/* file was dirty, so fall back to using generic logic */
|
||||
/* File was dirty, so fall back to using generic logic */
|
||||
if (error == EBUSY) {
|
||||
if (hole)
|
||||
*off = file_sz;
|
||||
|
|
|
@ -658,7 +658,7 @@ tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos',
|
|||
tags = ['functional', 'migration']
|
||||
|
||||
[tests/functional/mmap]
|
||||
tests = ['mmap_write_001_pos', 'mmap_read_001_pos']
|
||||
tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos']
|
||||
tags = ['functional', 'mmap']
|
||||
|
||||
[tests/functional/mount]
|
||||
|
|
|
@ -18,6 +18,7 @@ SUBDIRS = \
|
|||
mktree \
|
||||
mmap_exec \
|
||||
mmap_libaio \
|
||||
mmap_seek \
|
||||
mmapwrite \
|
||||
nvlist_to_lua \
|
||||
randwritecomp \
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
/mmap_seek
|
|
@ -0,0 +1,6 @@
|
|||
include $(top_srcdir)/config/Rules.am
|
||||
|
||||
pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin
|
||||
|
||||
pkgexec_PROGRAMS = mmap_seek
|
||||
mmap_seek_SOURCES = mmap_seek.c
|
|
@ -0,0 +1,147 @@
|
|||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/mman.h>
|
||||
#include <errno.h>
|
||||
|
||||
static void
|
||||
seek_data(int fd, off_t offset, off_t expected)
|
||||
{
|
||||
off_t data_offset = lseek(fd, offset, SEEK_DATA);
|
||||
if (data_offset != expected) {
|
||||
fprintf(stderr, "lseek(fd, %d, SEEK_DATA) = %d (expected %d)\n",
|
||||
(int)offset, (int)data_offset, (int)expected);
|
||||
exit(2);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
seek_hole(int fd, off_t offset, off_t expected)
|
||||
{
|
||||
off_t hole_offset = lseek(fd, offset, SEEK_HOLE);
|
||||
if (hole_offset != expected) {
|
||||
fprintf(stderr, "lseek(fd, %d, SEEK_HOLE) = %d (expected %d)\n",
|
||||
(int)offset, (int)hole_offset, (int)expected);
|
||||
exit(2);
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
char *execname = argv[0];
|
||||
char *file_path = argv[1];
|
||||
char *buf = NULL;
|
||||
int err;
|
||||
|
||||
if (argc != 4) {
|
||||
(void) printf("usage: %s <file name> <file size> "
|
||||
"<block size>\n", argv[0]);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int fd = open(file_path, O_RDWR | O_CREAT, 0666);
|
||||
if (fd == -1) {
|
||||
(void) fprintf(stderr, "%s: %s: ", execname, file_path);
|
||||
perror("open");
|
||||
exit(2);
|
||||
}
|
||||
|
||||
off_t file_size = atoi(argv[2]);
|
||||
off_t block_size = atoi(argv[3]);
|
||||
|
||||
if (block_size * 2 > file_size) {
|
||||
(void) fprintf(stderr, "file size must be at least "
|
||||
"double the block size\n");
|
||||
exit(2);
|
||||
}
|
||||
|
||||
err = ftruncate(fd, file_size);
|
||||
if (err == -1) {
|
||||
perror("ftruncate");
|
||||
exit(2);
|
||||
}
|
||||
|
||||
if ((buf = mmap(NULL, file_size, PROT_READ | PROT_WRITE,
|
||||
MAP_SHARED, fd, 0)) == MAP_FAILED) {
|
||||
perror("mmap");
|
||||
exit(2);
|
||||
}
|
||||
|
||||
/* Verify the file is sparse and reports no data. */
|
||||
seek_data(fd, 0, -1);
|
||||
|
||||
/* Verify the file is reported as a hole. */
|
||||
seek_hole(fd, 0, 0);
|
||||
|
||||
/* Verify search beyond end of file is an error. */
|
||||
seek_data(fd, 2 * file_size, -1);
|
||||
seek_hole(fd, 2 * file_size, -1);
|
||||
|
||||
/* Dirty the first byte. */
|
||||
memset(buf, 'a', 1);
|
||||
seek_data(fd, 0, 0);
|
||||
seek_data(fd, block_size, -1);
|
||||
seek_hole(fd, 0, block_size);
|
||||
seek_hole(fd, block_size, block_size);
|
||||
|
||||
/* Dirty the first half of the file. */
|
||||
memset(buf, 'b', file_size / 2);
|
||||
seek_data(fd, 0, 0);
|
||||
seek_data(fd, block_size, block_size);
|
||||
seek_hole(fd, 0, P2ROUNDUP(file_size / 2, block_size));
|
||||
seek_hole(fd, block_size, P2ROUNDUP(file_size / 2, block_size));
|
||||
|
||||
/* Dirty the whole file. */
|
||||
memset(buf, 'c', file_size);
|
||||
seek_data(fd, 0, 0);
|
||||
seek_data(fd, file_size * 3 / 4,
|
||||
P2ROUNDUP(file_size * 3 / 4, block_size));
|
||||
seek_hole(fd, 0, file_size);
|
||||
seek_hole(fd, file_size / 2, file_size);
|
||||
|
||||
/* Punch a hole (required compression be enabled). */
|
||||
memset(buf + block_size, 0, block_size);
|
||||
seek_data(fd, 0, 0);
|
||||
seek_data(fd, block_size, 2 * block_size);
|
||||
seek_hole(fd, 0, block_size);
|
||||
seek_hole(fd, block_size, block_size);
|
||||
seek_hole(fd, 2 * block_size, file_size);
|
||||
|
||||
err = munmap(buf, file_size);
|
||||
if (err == -1) {
|
||||
perror("munmap");
|
||||
exit(2);
|
||||
}
|
||||
|
||||
close(fd);
|
||||
|
||||
return (0);
|
||||
}
|
|
@ -208,6 +208,7 @@ export ZFSTEST_FILES='badsend
|
|||
mktree
|
||||
mmap_exec
|
||||
mmap_libaio
|
||||
mmap_seek
|
||||
mmapwrite
|
||||
nvlist_to_lua
|
||||
randfree_file
|
||||
|
|
|
@ -33,6 +33,7 @@ DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode
|
|||
DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms
|
||||
DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms
|
||||
DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check
|
||||
DMU_OFFSET_NEXT_SYNC dmu_offset_next_sync zfs_dmu_offset_next_sync
|
||||
INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size
|
||||
INITIALIZE_VALUE initialize_value zfs_initialize_value
|
||||
KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export
|
||||
|
|
|
@ -4,7 +4,8 @@ dist_pkgdata_SCRIPTS = \
|
|||
cleanup.ksh \
|
||||
mmap_read_001_pos.ksh \
|
||||
mmap_write_001_pos.ksh \
|
||||
mmap_libaio_001_pos.ksh
|
||||
mmap_libaio_001_pos.ksh \
|
||||
mmap_seek_001_pos.ksh
|
||||
|
||||
dist_pkgdata_DATA = \
|
||||
mmap.cfg
|
||||
|
|
|
@ -0,0 +1,67 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or http://www.opensolaris.org/os/licensing.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/mmap/mmap.cfg
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# lseek() data/holes for an mmap()'d file.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Enable compression and hole reporting for dirty files.
|
||||
# 2. Call mmap_seek binary test case for various record sizes.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
function cleanup
|
||||
{
|
||||
log_must zfs set compression=off $TESTPOOL/$TESTFS
|
||||
log_must zfs set recordsize=128k $TESTPOOL/$TESTFS
|
||||
log_must rm -f $TESTDIR/test-mmap-file
|
||||
log_must set_tunable64 DMU_OFFSET_NEXT_SYNC $dmu_offset_next_sync
|
||||
}
|
||||
|
||||
log_assert "lseek() data/holes for an mmap()'d file."
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
# Enable hole reporting for dirty files.
|
||||
typeset dmu_offset_next_sync=$(get_tunable DMU_OFFSET_NEXT_SYNC)
|
||||
log_must set_tunable64 DMU_OFFSET_NEXT_SYNC 1
|
||||
|
||||
# Compression must be enabled to convert zero'd blocks to holes.
|
||||
# This behavior is checked by the mmap_seek test.
|
||||
log_must zfs set compression=on $TESTPOOL/$TESTFS
|
||||
|
||||
for bs in 4096 8192 16384 32768 65536 131072; do
|
||||
log_must zfs set recordsize=$bs $TESTPOOL/$TESTFS
|
||||
log_must mmap_seek $TESTDIR/test-mmap-file $((1024*1024)) $bs
|
||||
log_must rm $TESTDIR/test-mmap-file
|
||||
done
|
||||
|
||||
log_pass "lseek() data/holes for an mmap()'d file succeeded."
|
Loading…
Reference in New Issue