Add Linux posix_fadvise support

The purpose of this PR is to accept fadvise requests from userland
to do read-ahead on demand.

It could dramatically improve sequential read performance especially
when primarycache is set to metadata or zfs_prefetch_disable is 1.

If the file is mmapped, generic_fadvise is also called for page cache
read-ahead in addition to dmu_prefetch.

Only POSIX_FADV_WILLNEED and POSIX_FADV_SEQUENTIAL are supported in
this PR currently.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Finix Yan <yancw@info2soft.com>
Closes #13694
This commit is contained in:
Finix1979 2022-09-09 01:29:41 +08:00 committed by GitHub
parent 380b08098e
commit 320f0c6022
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 365 additions and 2 deletions

23
config/kernel-fadvise.m4 Normal file
View File

@ -0,0 +1,23 @@
dnl #
dnl # Linux 4.19 API
dnl #
dnl # Test source for the fadvise member of struct file_operations.
dnl # The snippet declares a file_operations initializer that sets
dnl # .fadvise; the test is registered under the name file_fadvise,
dnl # which ZFS_AC_KERNEL_FADVISE later uses to look up the result.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_FADVISE], [
ZFS_LINUX_TEST_SRC([file_fadvise], [
#include <linux/fs.h>
static const struct file_operations
fops __attribute__ ((unused)) = {
.fadvise = NULL,
};
],[])
])
dnl #
dnl # Report the outcome of the file_fadvise test and define
dnl # HAVE_FILE_FADVISE when the test succeeded.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_FADVISE], [
AC_MSG_CHECKING([whether fops->fadvise() exists])
ZFS_LINUX_TEST_RESULT([file_fadvise], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_FILE_FADVISE, 1, [fops->fadvise() exists])
],[
AC_MSG_RESULT(no)
])
])

View File

@ -0,0 +1,27 @@
dnl #
dnl # 5.3 API change
dnl # The generic_fadvise() function is present since 4.19 kernel
dnl # but it was not exported until Linux 5.3.
dnl #
dnl # Test source that calls generic_fadvise(fp, offset, len, advise)
dnl # with a struct file pointer, two loff_t values and an int.  The
dnl # test is registered under the name generic_fadvise, which
dnl # ZFS_AC_KERNEL_GENERIC_FADVISE later uses to look up the result.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FADVISE], [
ZFS_LINUX_TEST_SRC([generic_fadvise], [
#include <linux/fs.h>
], [
struct file *fp __attribute__ ((unused)) = NULL;
loff_t offset __attribute__ ((unused)) = 0;
loff_t len __attribute__ ((unused)) = 0;
int advise __attribute__ ((unused)) = 0;
generic_fadvise(fp, offset, len, advise);
])
])
dnl #
dnl # Report the outcome of the generic_fadvise test (checked against the
dnl # generic_fadvise symbol from mm/fadvise.c) and define
dnl # HAVE_GENERIC_FADVISE when the function is usable.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_GENERIC_FADVISE], [
	AC_MSG_CHECKING([whether generic_fadvise() is available])
	ZFS_LINUX_TEST_RESULT_SYMBOL([generic_fadvise],
	[generic_fadvise], [mm/fadvise.c], [
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_GENERIC_FADVISE, 1,
		    [generic_fadvise() is available])
	],[
		AC_MSG_RESULT(no)
	])
])

View File

@ -42,6 +42,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE
ZFS_AC_KERNEL_SRC_PDE_DATA ZFS_AC_KERNEL_SRC_PDE_DATA
ZFS_AC_KERNEL_SRC_FALLOCATE ZFS_AC_KERNEL_SRC_FALLOCATE
ZFS_AC_KERNEL_SRC_FADVISE
ZFS_AC_KERNEL_SRC_GENERIC_FADVISE
ZFS_AC_KERNEL_SRC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE ZFS_AC_KERNEL_SRC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
ZFS_AC_KERNEL_SRC_RWSEM ZFS_AC_KERNEL_SRC_RWSEM
ZFS_AC_KERNEL_SRC_SCHED ZFS_AC_KERNEL_SRC_SCHED
@ -161,6 +163,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_OBJTOOL ZFS_AC_KERNEL_OBJTOOL
ZFS_AC_KERNEL_PDE_DATA ZFS_AC_KERNEL_PDE_DATA
ZFS_AC_KERNEL_FALLOCATE ZFS_AC_KERNEL_FALLOCATE
ZFS_AC_KERNEL_FADVISE
ZFS_AC_KERNEL_GENERIC_FADVISE
ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
ZFS_AC_KERNEL_RWSEM ZFS_AC_KERNEL_RWSEM
ZFS_AC_KERNEL_SCHED ZFS_AC_KERNEL_SCHED

View File

@ -27,6 +27,7 @@
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
#include <linux/compat.h> #include <linux/compat.h>
#endif #endif
#include <linux/fs.h>
#include <sys/file.h> #include <sys/file.h>
#include <sys/dmu_objset.h> #include <sys/dmu_objset.h>
#include <sys/zfs_znode.h> #include <sys/zfs_znode.h>
@ -37,6 +38,9 @@
defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO) defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO)
#include <linux/pagemap.h> #include <linux/pagemap.h>
#endif #endif
#ifdef HAVE_FILE_FADVISE
#include <linux/fadvise.h>
#endif
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
#include <linux/writeback.h> #include <linux/writeback.h>
#endif #endif
@ -906,6 +910,61 @@ zpl_ioctl_getversion(struct file *filp, void __user *arg)
return (copy_to_user(arg, &generation, sizeof (generation))); return (copy_to_user(arg, &generation, sizeof (generation)));
} }
#ifdef HAVE_FILE_FADVISE
/*
 * fops->fadvise() handler: act on posix_fadvise() advice for a file.
 *
 * filp   - file the advice applies to
 * offset - start of the advised range; must not be negative
 * len    - length of the advised range; must not be negative, and 0 is
 *          treated as "from offset to the end of the file"
 * advice - one of the POSIX_FADV_* values
 *
 * POSIX_FADV_SEQUENTIAL and POSIX_FADV_WILLNEED issue a dmu_prefetch()
 * for the range (and, when generic_fadvise() is available and the znode
 * has cached data, also forward the advice to generic_fadvise()).  The
 * remaining known advice values are accepted and ignored.
 *
 * Returns 0 on success, -ESPIPE for a FIFO, -EINVAL for a negative
 * offset/len or an unknown advice value, or the error reported by
 * generic_fadvise().
 */
static int
zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
{
	struct inode *ip = file_inode(filp);
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	objset_t *os = zfsvfs->z_os;
	int error = 0;

	if (S_ISFIFO(ip->i_mode))
		return (-ESPIPE);

	if (offset < 0 || len < 0)
		return (-EINVAL);

	/*
	 * NOTE(review): these macros may return early from this function;
	 * confirm the errno they return has the (negative) sign the VFS
	 * expects from a file operation.
	 */
	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	switch (advice) {
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_WILLNEED:
#ifdef HAVE_GENERIC_FADVISE
		if (zn_has_cached_data(zp))
			error = generic_fadvise(filp, offset, len, advice);
#endif
		/*
		 * Pass on the caller's size directly, but note that
		 * dmu_prefetch_max will effectively cap it.  If there
		 * really is a larger sequential access pattern, perhaps
		 * dmu_zfetch will detect it.
		 */
		if (len == 0) {
			loff_t isize = i_size_read(ip);

			/*
			 * Nothing to prefetch at or beyond EOF.  Without
			 * this check isize - offset would go negative and
			 * be handed to dmu_prefetch() as a length.
			 */
			if (offset >= isize)
				break;
			len = isize - offset;
		}

		dmu_prefetch(os, zp->z_id, 0, offset, len,
		    ZIO_PRIORITY_ASYNC_READ);
		break;
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_RANDOM:
	case POSIX_FADV_DONTNEED:
	case POSIX_FADV_NOREUSE:
		/* ignored for now */
		break;
	default:
		error = -EINVAL;
		break;
	}

	ZFS_EXIT(zfsvfs);

	return (error);
}
#endif /* HAVE_FILE_FADVISE */
#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL) #define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL) #define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)
@ -1259,6 +1318,9 @@ const struct file_operations zpl_file_operations = {
.aio_fsync = zpl_aio_fsync, .aio_fsync = zpl_aio_fsync,
#endif #endif
.fallocate = zpl_fallocate, .fallocate = zpl_fallocate,
#ifdef HAVE_FILE_FADVISE
.fadvise = zpl_fadvise,
#endif
.unlocked_ioctl = zpl_ioctl, .unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
.compat_ioctl = zpl_compat_ioctl, .compat_ioctl = zpl_compat_ioctl,

View File

@ -89,6 +89,10 @@ tags = ['functional', 'devices']
tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill'] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill']
tags = ['functional', 'events'] tags = ['functional', 'events']
[tests/functional/fadvise:Linux]
tests = ['fadvise_sequential']
tags = ['functional', 'fadvise']
[tests/functional/fallocate:Linux] [tests/functional/fallocate:Linux]
tests = ['fallocate_prealloc', 'fallocate_zero-range'] tests = ['fallocate_prealloc', 'fallocate_zero-range']
tags = ['functional', 'fallocate'] tags = ['functional', 'fallocate']

View File

@ -4,6 +4,7 @@
/devname2devid /devname2devid
/dir_rd_update /dir_rd_update
/draid /draid
/file_fadvise
/file_append /file_append
/file_check /file_check
/file_trunc /file_trunc

View File

@ -128,4 +128,7 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/read_dos_attributes %D%/write_dos_attribu
scripts_zfs_tests_bin_PROGRAMS += %D%/randfree_file scripts_zfs_tests_bin_PROGRAMS += %D%/randfree_file
%C%_randfree_file_SOURCES = %D%/file/randfree_file.c %C%_randfree_file_SOURCES = %D%/file/randfree_file.c
scripts_zfs_tests_bin_PROGRAMS += %D%/file_fadvise
%C%_file_fadvise_SOURCES = %D%/file/file_fadvise.c
endif endif

View File

@ -0,0 +1,97 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2022 by Information2 Software, Inc. All rights reserved.
*/
#include "file_common.h"
#include <sys/types.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/*
* Call fadvise to prefetch data
*/
static const char *execname = "file_fadvise";
/*
 * Write the command-line synopsis for this program to stderr.
 */
static void
usage(void)
{
	(void) fprintf(stderr, "usage: %s -f filename -a advise \n",
	    execname);
}
/*
 * Parse "-f filename -a advise", open the file and apply the requested
 * posix_fadvise() advice to the whole file (offset 0, length 0).
 *
 * The advice must be a decimal integer in the range
 * POSIX_FADV_NORMAL..POSIX_FADV_NOREUSE; it defaults to 0 when -a is not
 * given.
 *
 * Returns 0 on success, and 1 on a usage error or when open() or
 * posix_fadvise() fails.
 */
int
main(int argc, char *argv[])
{
	char *filename = NULL;
	char *end = NULL;
	long advise = 0;
	int advise_valid = 1;
	int fd, ch, rc;
	int err = 0;

	while ((ch = getopt(argc, argv, "a:f:")) != -1) {
		switch (ch) {
		case 'a':
			/* Reject empty, non-numeric and out-of-range input. */
			errno = 0;
			advise = strtol(optarg, &end, 10);
			if (errno != 0 || end == optarg || *end != '\0')
				advise_valid = 0;
			break;
		case 'f':
			filename = optarg;
			break;
		case '?':
			(void) printf("unknown arg %c\n", optopt);
			err++;
			break;
		default:
			err++;
			break;
		}
	}

	if (!filename) {
		(void) printf("Filename not specified (-f <file>)\n");
		err++;
	}

	if (!advise_valid || advise < POSIX_FADV_NORMAL ||
	    advise > POSIX_FADV_NOREUSE) {
		(void) printf("advise is invalid\n");
		err++;
	}

	if (err) {
		usage();
		return (1);
	}

	if ((fd = open(filename, O_RDWR)) < 0) {
		perror("open");
		return (1);
	}

	/* posix_fadvise() returns the error number; it does not set errno. */
	rc = posix_fadvise(fd, 0, 0, (int)advise);
	if (rc != 0) {
		(void) fprintf(stderr, "posix_fadvise: %s\n", strerror(rc));
		(void) close(fd);
		return (1);
	}

	(void) close(fd);

	return (0);
}

View File

@ -184,6 +184,7 @@ export ZFSTEST_FILES='badsend
devname2devid devname2devid
dir_rd_update dir_rd_update
draid draid
file_fadvise
file_append file_append
file_check file_check
file_trunc file_trunc

View File

@ -1370,6 +1370,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/exec/exec_001_pos.ksh \ functional/exec/exec_001_pos.ksh \
functional/exec/exec_002_neg.ksh \ functional/exec/exec_002_neg.ksh \
functional/exec/setup.ksh \ functional/exec/setup.ksh \
functional/fadvise/cleanup.ksh \
functional/fadvise/fadvise_sequential.ksh \
functional/fadvise/setup.ksh \
functional/fallocate/cleanup.ksh \ functional/fallocate/cleanup.ksh \
functional/fallocate/fallocate_prealloc.ksh \ functional/fallocate/fallocate_prealloc.ksh \
functional/fallocate/fallocate_punch-hole.ksh \ functional/fallocate/fallocate_punch-hole.ksh \

View File

@ -76,7 +76,7 @@ while [[ $j -lt ${#CHECKSUM_TYPES[*]} ]]; do
log_must zpool export $TESTPOOL log_must zpool export $TESTPOOL
log_must zpool import $TESTPOOL log_must zpool import $TESTPOOL
log_mustnot eval "cat $TESTDIR/test_$type >/dev/null" log_mustnot eval "dd if=$TESTDIR/test_$type of=/dev/null bs=$WRITESZ count=$NWRITES"
cksum=$(zpool status -P -v $TESTPOOL | grep "$firstvdev" | \ cksum=$(zpool status -P -v $TESTPOOL | grep "$firstvdev" | \
awk '{print $5}') awk '{print $5}')

View File

@ -0,0 +1,28 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Portions Copyright (c) 2022 Information2 Software, Inc.
#
# Load the shared test library, then run its default_cleanup helper.
# NOTE(review): default_cleanup is defined outside this file; presumably it
# tears down what setup.ksh created -- confirm against libtest.shlib.
. $STF_SUITE/include/libtest.shlib
default_cleanup

View File

@ -0,0 +1,80 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Portions Copyright (c) 2022 Information2 Software, Inc.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/include/math.shlib
#
# DESCRIPTION:
# Test posix_fadvise.
#
# STRATEGY:
# 1. Set primarycache to metadata in order to disable prefetch
# 2. Write some data to file
# 3. get data_size field from arcstat
# 4. call file_fadvise with POSIX_FADV_SEQUENTIAL
# 5. get data_size field from arcstat again
# 6. latter data_size should be bigger than former one
#
# NOTE: if HAVE_FILE_FADVISE is not defined, the former data_size
# should be less than or equal to the latter one
verify_runnable "global"
# Path of the file the test writes and then advises on.
FILE=$TESTDIR/$TESTFILE0
# Block size for the write, taken from the recordsize property of $TESTPOOL.
BLKSZ=$(get_prop recordsize $TESTPOOL)
# Exit handler: set primarycache back to "all" on $TESTPOOL and, if
# $TESTDIR exists, remove everything inside it.
function cleanup
{
log_must zfs set primarycache=all $TESTPOOL
[[ -e $TESTDIR ]] && log_must rm -Rf $TESTDIR/*
}
# Print the third column of the first line in the ARC kstat file whose
# first column equals the name given as $1.
function getstat
{
	typeset stat_name="$1"

	awk -v c="$stat_name" '$1 == c {print $3; exit}' /proc/spl/kstat/zfs/arcstats
}
log_assert "Ensure fadvise prefetch data"
log_onexit cleanup
# Strategy step 1: switch primarycache to metadata on $TESTPOOL.
log_must zfs set primarycache=metadata $TESTPOOL
# Step 2: create the test file (presumably 1000 blocks of $BLKSZ bytes --
# confirm against file_write's options) and sync the pool.
log_must file_write -o create -f $FILE -b $BLKSZ -c 1000
sync_pool $TESTPOOL
# Step 3: record the ARC data_size before issuing the advice.
data_size1=$(getstat data_size)
# Step 4: advice value 2 (POSIX_FADV_SEQUENTIAL per the strategy above).
log_must file_fadvise -f $FILE -a 2
# Presumably gives the prefetch time to finish before sampling again.
sleep 10
# Step 5: record the ARC data_size after the advice.
data_size2=$(getstat data_size)
log_note "original data_size is $data_size1, final data_size is $data_size2"
# Step 6: passes as long as data_size did not shrink (-le, not -lt).
log_must [ $data_size1 -le $data_size2 ]
log_pass "Ensure data could be prefetched"

View File

@ -0,0 +1,30 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Portions Copyright (c) 2022 Information2 Software, Inc.
#
. $STF_SUITE/include/libtest.shlib
# Use only the first space-separated entry of $DISKS.
DISK=${DISKS%% *}
# NOTE(review): default_setup_noexit is defined outside this file; presumably
# it creates the test pool on $DISK without exiting -- confirm against
# libtest.shlib.
default_setup_noexit $DISK
log_pass

View File

@ -73,7 +73,7 @@ for type in "mirror" "raidz" "raidz2"; do
# 4. Inject CHECKSUM ERRORS on read with a zinject error handler # 4. Inject CHECKSUM ERRORS on read with a zinject error handler
log_must zinject -d $FAULT_FILE -e corrupt -f 50 -T read $TESTPOOL log_must zinject -d $FAULT_FILE -e corrupt -f 50 -T read $TESTPOOL
log_must cp $TESTFILE /dev/null log_must dd if=$TESTFILE of=/dev/null bs=1M count=64
# 5. Verify the ZED kicks in a hot spare and expected pool/device status # 5. Verify the ZED kicks in a hot spare and expected pool/device status
log_note "Wait for ZED to auto-spare" log_note "Wait for ZED to auto-spare"