Add Linux posix_fadvise support
The purpose of this PR is to accept fadvise ioctl from userland to do read-ahead on demand. It could dramatically improve sequential read performance, especially when primarycache is set to metadata or zfs_prefetch_disable is 1. If the file is mmapped, generic_fadvise is also called for page cache read-ahead in addition to dmu_prefetch. Only POSIX_FADV_WILLNEED and POSIX_FADV_SEQUENTIAL are currently supported in this PR. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Finix Yan <yancw@info2soft.com> Closes #13694
This commit is contained in:
parent
380b08098e
commit
320f0c6022
|
@ -0,0 +1,23 @@
|
|||
dnl #
dnl # Linux 4.19 API
dnl #
dnl # Source fragment for the "file_fadvise" test: a file_operations
dnl # initializer that names the .fadvise member.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_FADVISE], [
	ZFS_LINUX_TEST_SRC([file_fadvise], [
		#include <linux/fs.h>

		static const struct file_operations
		    fops __attribute__ ((unused)) = {
			.fadvise = NULL,
		};
	], [])
])
|
||||
|
||||
dnl #
dnl # Evaluate the "file_fadvise" test and define HAVE_FILE_FADVISE when
dnl # it succeeds.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_FADVISE], [
	AC_MSG_CHECKING([whether fops->fadvise() exists])
	ZFS_LINUX_TEST_RESULT([file_fadvise], [
		AC_MSG_RESULT([yes])
		AC_DEFINE([HAVE_FILE_FADVISE], [1], [fops->fadvise() exists])
	], [
		AC_MSG_RESULT([no])
	])
])
|
|
@ -0,0 +1,27 @@
|
|||
dnl #
dnl # 5.3 API change
dnl # generic_fadvise() has existed since the 4.19 kernel, but it was
dnl # not exported until Linux 5.3.
dnl #
dnl # Source fragment for the "generic_fadvise" test: a single call to
dnl # generic_fadvise() with placeholder arguments.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FADVISE], [
	ZFS_LINUX_TEST_SRC([generic_fadvise], [
		#include <linux/fs.h>
	], [
		struct file *filp __attribute__ ((unused)) = NULL;
		loff_t start __attribute__ ((unused)) = 0;
		loff_t length __attribute__ ((unused)) = 0;
		int advice __attribute__ ((unused)) = 0;

		generic_fadvise(filp, start, length, advice);
	])
])
|
||||
|
||||
dnl #
dnl # Evaluate the "generic_fadvise" test through
dnl # ZFS_LINUX_TEST_RESULT_SYMBOL (symbol generic_fadvise, source file
dnl # mm/fadvise.c) and define HAVE_GENERIC_FADVISE when it succeeds.
dnl #
dnl # The AC_DEFINE description becomes the comment emitted next to the
dnl # define in the generated config header, so it states what the macro
dnl # means instead of the placeholder "yes".
dnl #
AC_DEFUN([ZFS_AC_KERNEL_GENERIC_FADVISE], [
	AC_MSG_CHECKING([whether generic_fadvise() is available])
	ZFS_LINUX_TEST_RESULT_SYMBOL([generic_fadvise],
	    [generic_fadvise], [mm/fadvise.c], [
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_GENERIC_FADVISE, 1,
		    [generic_fadvise() is available])
	],[
		AC_MSG_RESULT(no)
	])
])
|
|
@ -42,6 +42,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
|||
ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE
|
||||
ZFS_AC_KERNEL_SRC_PDE_DATA
|
||||
ZFS_AC_KERNEL_SRC_FALLOCATE
|
||||
ZFS_AC_KERNEL_SRC_FADVISE
|
||||
ZFS_AC_KERNEL_SRC_GENERIC_FADVISE
|
||||
ZFS_AC_KERNEL_SRC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
|
||||
ZFS_AC_KERNEL_SRC_RWSEM
|
||||
ZFS_AC_KERNEL_SRC_SCHED
|
||||
|
@ -161,6 +163,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
|||
ZFS_AC_KERNEL_OBJTOOL
|
||||
ZFS_AC_KERNEL_PDE_DATA
|
||||
ZFS_AC_KERNEL_FALLOCATE
|
||||
ZFS_AC_KERNEL_FADVISE
|
||||
ZFS_AC_KERNEL_GENERIC_FADVISE
|
||||
ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
|
||||
ZFS_AC_KERNEL_RWSEM
|
||||
ZFS_AC_KERNEL_SCHED
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
#ifdef CONFIG_COMPAT
|
||||
#include <linux/compat.h>
|
||||
#endif
|
||||
#include <linux/fs.h>
|
||||
#include <sys/file.h>
|
||||
#include <sys/dmu_objset.h>
|
||||
#include <sys/zfs_znode.h>
|
||||
|
@ -37,6 +38,9 @@
|
|||
defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO)
|
||||
#include <linux/pagemap.h>
|
||||
#endif
|
||||
#ifdef HAVE_FILE_FADVISE
|
||||
#include <linux/fadvise.h>
|
||||
#endif
|
||||
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
|
||||
#include <linux/writeback.h>
|
||||
#endif
|
||||
|
@ -906,6 +910,61 @@ zpl_ioctl_getversion(struct file *filp, void __user *arg)
|
|||
return (copy_to_user(arg, &generation, sizeof (generation)));
|
||||
}
|
||||
|
||||
#ifdef HAVE_FILE_FADVISE
/*
 * fops->fadvise() handler.
 *
 * POSIX_FADV_SEQUENTIAL and POSIX_FADV_WILLNEED request a dmu_prefetch()
 * of the given range; when generic_fadvise() is available and the znode
 * has cached data, generic_fadvise() is called first as well.  All other
 * POSIX advice values are accepted and ignored.
 *
 * filp   - file the advice applies to
 * offset - start of the range, must not be negative
 * len    - length of the range, must not be negative; 0 selects
 *          everything from offset up to the current file size
 * advice - one of the POSIX_FADV_* values
 *
 * Returns 0 on success, -ESPIPE for a FIFO, -EINVAL for a negative
 * offset or length or an unknown advice value, or the error returned
 * by generic_fadvise().
 */
static int
zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
{
	struct inode *ip = file_inode(filp);
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	objset_t *os = zfsvfs->z_os;
	int error = 0;

	if (S_ISFIFO(ip->i_mode))
		return (-ESPIPE);

	if (offset < 0 || len < 0)
		return (-EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	switch (advice) {
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_WILLNEED:
#ifdef HAVE_GENERIC_FADVISE
		if (zn_has_cached_data(zp))
			error = generic_fadvise(filp, offset, len, advice);
#endif
		/*
		 * A zero length stands for "up to the end of the file".
		 * When the offset is already at or past the end of the
		 * file there is nothing to prefetch, and subtracting
		 * would yield a zero or negative length, which this
		 * function otherwise rejects; skip the prefetch instead.
		 */
		if (len == 0) {
			loff_t size = i_size_read(ip);

			if (offset >= size)
				break;
			len = size - offset;
		}

		/*
		 * Pass on the caller's size directly, but note that
		 * dmu_prefetch_max will effectively cap it.  If there
		 * really is a larger sequential access pattern, perhaps
		 * dmu_zfetch will detect it.
		 */
		dmu_prefetch(os, zp->z_id, 0, offset, len,
		    ZIO_PRIORITY_ASYNC_READ);
		break;
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_RANDOM:
	case POSIX_FADV_DONTNEED:
	case POSIX_FADV_NOREUSE:
		/* ignored for now */
		break;
	default:
		error = -EINVAL;
		break;
	}

	ZFS_EXIT(zfsvfs);

	return (error);
}
#endif /* HAVE_FILE_FADVISE */
|
||||
|
||||
#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
|
||||
#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)
|
||||
|
||||
|
@ -1259,6 +1318,9 @@ const struct file_operations zpl_file_operations = {
|
|||
.aio_fsync = zpl_aio_fsync,
|
||||
#endif
|
||||
.fallocate = zpl_fallocate,
|
||||
#ifdef HAVE_FILE_FADVISE
|
||||
.fadvise = zpl_fadvise,
|
||||
#endif
|
||||
.unlocked_ioctl = zpl_ioctl,
|
||||
#ifdef CONFIG_COMPAT
|
||||
.compat_ioctl = zpl_compat_ioctl,
|
||||
|
|
|
@ -89,6 +89,10 @@ tags = ['functional', 'devices']
|
|||
tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill']
|
||||
tags = ['functional', 'events']
|
||||
|
||||
[tests/functional/fadvise:Linux]
|
||||
tests = ['fadvise_sequential']
|
||||
tags = ['functional', 'fadvise']
|
||||
|
||||
[tests/functional/fallocate:Linux]
|
||||
tests = ['fallocate_prealloc', 'fallocate_zero-range']
|
||||
tags = ['functional', 'fallocate']
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
/devname2devid
|
||||
/dir_rd_update
|
||||
/draid
|
||||
/file_fadvise
|
||||
/file_append
|
||||
/file_check
|
||||
/file_trunc
|
||||
|
|
|
@ -128,4 +128,7 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/read_dos_attributes %D%/write_dos_attribu
|
|||
|
||||
scripts_zfs_tests_bin_PROGRAMS += %D%/randfree_file
|
||||
%C%_randfree_file_SOURCES = %D%/file/randfree_file.c
|
||||
|
||||
scripts_zfs_tests_bin_PROGRAMS += %D%/file_fadvise
|
||||
%C%_file_fadvise_SOURCES = %D%/file/file_fadvise.c
|
||||
endif
|
||||
|
|
|
@ -0,0 +1,97 @@
|
|||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or https://opensource.org/licenses/CDDL-1.0.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2022 by Information2 Software, Inc. All rights reserved.
|
||||
*/
|
||||
|
||||
#include "file_common.h"
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <string.h>
|
||||
|
||||
/*
 * Test helper that calls posix_fadvise() on a file to prefetch data.
 */
static const char *execname = "file_fadvise";

/*
 * Write the command-line synopsis to stderr.
 */
static void
usage(void)
{
	(void) fprintf(stderr, "usage: %s -f filename -a advise \n",
	    execname);
}
|
||||
|
||||
int
|
||||
main(int argc, char *argv[])
|
||||
{
|
||||
char *filename = NULL;
|
||||
int advise = 0;
|
||||
int fd, ch;
|
||||
int err = 0;
|
||||
|
||||
while ((ch = getopt(argc, argv, "a:f:")) != EOF) {
|
||||
switch (ch) {
|
||||
case 'a':
|
||||
advise = atoll(optarg);
|
||||
break;
|
||||
case 'f':
|
||||
filename = optarg;
|
||||
break;
|
||||
case '?':
|
||||
(void) printf("unknown arg %c\n", optopt);
|
||||
usage();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!filename) {
|
||||
(void) printf("Filename not specified (-f <file>)\n");
|
||||
err++;
|
||||
}
|
||||
|
||||
if (advise < POSIX_FADV_NORMAL || advise > POSIX_FADV_NOREUSE) {
|
||||
(void) printf("advise is invalid\n");
|
||||
err++;
|
||||
}
|
||||
|
||||
if (err) {
|
||||
usage(); /* no return */
|
||||
return (1);
|
||||
}
|
||||
|
||||
if ((fd = open(filename, O_RDWR, 0666)) < 0) {
|
||||
perror("open");
|
||||
return (1);
|
||||
}
|
||||
|
||||
posix_fadvise(fd, 0, 0, advise);
|
||||
|
||||
close(fd);
|
||||
|
||||
return (0);
|
||||
}
|
|
@ -184,6 +184,7 @@ export ZFSTEST_FILES='badsend
|
|||
devname2devid
|
||||
dir_rd_update
|
||||
draid
|
||||
file_fadvise
|
||||
file_append
|
||||
file_check
|
||||
file_trunc
|
||||
|
|
|
@ -1370,6 +1370,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
|||
functional/exec/exec_001_pos.ksh \
|
||||
functional/exec/exec_002_neg.ksh \
|
||||
functional/exec/setup.ksh \
|
||||
functional/fadvise/cleanup.ksh \
|
||||
functional/fadvise/fadvise_sequential.ksh \
|
||||
functional/fadvise/setup.ksh \
|
||||
functional/fallocate/cleanup.ksh \
|
||||
functional/fallocate/fallocate_prealloc.ksh \
|
||||
functional/fallocate/fallocate_punch-hole.ksh \
|
||||
|
|
|
@ -76,7 +76,7 @@ while [[ $j -lt ${#CHECKSUM_TYPES[*]} ]]; do
|
|||
log_must zpool export $TESTPOOL
|
||||
log_must zpool import $TESTPOOL
|
||||
|
||||
log_mustnot eval "cat $TESTDIR/test_$type >/dev/null"
|
||||
log_mustnot eval "dd if=$TESTDIR/test_$type of=/dev/null bs=$WRITESZ count=$NWRITES"
|
||||
|
||||
cksum=$(zpool status -P -v $TESTPOOL | grep "$firstvdev" | \
|
||||
awk '{print $5}')
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
|
||||
#
|
||||
# Portions Copyright (c) 2022 Information2 Software, Inc.
|
||||
#
|
||||
|
||||
# Pull in the common test-suite helpers.
. $STF_SUITE/include/libtest.shlib

# Tear down what setup.ksh created for the fadvise tests.
default_cleanup
|
|
@ -0,0 +1,80 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Portions Copyright (c) 2022 Information2 Software, Inc.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/include/math.shlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Test posix_fadvise.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Set primarycache to metadata in order to disable prefetch
|
||||
# 2. Write some data to file
|
||||
# 3. get data_size field from arcstat
|
||||
# 4. call file_fadvise with POSIX_FADV_SEQUENTIAL
|
||||
# 5. get data_size field from arcstat again
|
||||
# 6. latter data_size should be bigger than former one
|
||||
#
|
||||
|
||||
# NOTE: if HAVE_FILE_FADVISE is not defined former data_size
|
||||
# should be less than or equal to the latter one
|
||||
|
||||
verify_runnable "global"

FILE=$TESTDIR/$TESTFILE0
BLKSZ=$(get_prop recordsize $TESTPOOL)

# Restore primarycache on the pool and remove the files under $TESTDIR.
function cleanup
{
	log_must zfs set primarycache=all $TESTPOOL
	[[ -e $TESTDIR ]] && log_must rm -Rf $TESTDIR/*
}

# Print the third column of the arcstats line whose first column is $1.
function getstat
{
	awk -v name="$1" '$1 == name {print $3; exit}' \
	    /proc/spl/kstat/zfs/arcstats
}

log_assert "Ensure fadvise prefetch data"
log_onexit cleanup

# 1. Set primarycache to metadata.
log_must zfs set primarycache=metadata $TESTPOOL

# 2. Write 1000 records to the file and sync the pool.
log_must file_write -o create -f $FILE -b $BLKSZ -c 1000
sync_pool $TESTPOOL

# 3. Sample data_size before the advice is given.
data_size1=$(getstat data_size)

# 4. Issue the advice (2 is passed for POSIX_FADV_SEQUENTIAL), then wait.
log_must file_fadvise -f $FILE -a 2
sleep 10

# 5. Sample data_size again.
data_size2=$(getstat data_size)
log_note "original data_size is $data_size1, final data_size is $data_size2"

# 6. The first sample must not exceed the second one.
log_must [ $data_size1 -le $data_size2 ]

log_pass "Ensure data could be prefetched"
|
|
@ -0,0 +1,30 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
|
||||
#
|
||||
# Portions Copyright (c) 2022 Information2 Software, Inc.
|
||||
#
|
||||
|
||||
# Pull in the common test-suite helpers.
. $STF_SUITE/include/libtest.shlib

# Use the first entry of the space-separated $DISKS list for the setup.
DISK=${DISKS%% *}
default_setup_noexit $DISK

log_pass
|
|
@ -73,7 +73,7 @@ for type in "mirror" "raidz" "raidz2"; do
|
|||
|
||||
# 4. Inject CHECKSUM ERRORS on read with a zinject error handler
|
||||
log_must zinject -d $FAULT_FILE -e corrupt -f 50 -T read $TESTPOOL
|
||||
log_must cp $TESTFILE /dev/null
|
||||
log_must dd if=$TESTFILE of=/dev/null bs=1M count=64
|
||||
|
||||
# 5. Verify the ZED kicks in a hot spare and expected pool/device status
|
||||
log_note "Wait for ZED to auto-spare"
|
||||
|
|
Loading…
Reference in New Issue