linux: add basic fallocate(mode=0/2) compatibility

Implement semi-compatible functionality for mode=0 (preallocation)
and mode=FALLOC_FL_KEEP_SIZE (preallocation beyond EOF) for ZPL.

Since ZFS does COW and snapshots, preallocating blocks for a file
cannot guarantee that writes to the file will not run out of space.
Even if the first overwrite was guaranteed, it would not handle any
later overwrite of blocks due to COW, so strict compliance is futile.
Instead, make a best-effort check that at least enough free space is
currently available in the pool (with a bit of margin), then create
a sparse file of the requested size and continue on with life.

This does not handle all cases (e.g. several fallocate() calls before
writing into the files when the filesystem is nearly full), which
would require a more complex mechanism to be implemented, probably
based on a modified version of dmu_prealloc(), but is usable as-is.

A new module option zfs_fallocate_reserve_percent is used to control
the reserve margin for any single fallocate call.  By default, this
is 110% of the requested preallocation size, so an additional 10% of
available space is reserved for overhead to allow the application a
good chance of finishing the write when the fallocate() succeeds.
If the heuristics of this basic fallocate implementation are not
desirable, the old non-functional behavior of returning EOPNOTSUPP
for calls can be restored by setting zfs_fallocate_reserve_percent=0.

The parameter of zfs_statvfs() is changed to take an inode instead
of a dentry, since no dentry is available in zfs_fallocate_common().

A few tests from @behlendorf cover basic fallocate functionality.

Reviewed-by: Richard Laager <rlaager@wiktel.com>
Reviewed-by: Arshad Hussain <arshad.super@gmail.com>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Andreas Dilger <adilger@dilger.ca>
Issue #326
Closes #10408
This commit is contained in:
adilger 2020-06-18 12:22:11 -06:00 committed by GitHub
parent d553fb9b9e
commit f734301d22
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 317 additions and 23 deletions

View File

@ -325,6 +325,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/devices/Makefile
tests/zfs-tests/tests/functional/events/Makefile
tests/zfs-tests/tests/functional/exec/Makefile
tests/zfs-tests/tests/functional/fallocate/Makefile
tests/zfs-tests/tests/functional/fault/Makefile
tests/zfs-tests/tests/functional/features/Makefile
tests/zfs-tests/tests/functional/features/async_destroy/Makefile

View File

@ -210,7 +210,7 @@ extern int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent);
extern void zfs_preumount(struct super_block *sb);
extern int zfs_umount(struct super_block *sb);
extern int zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm);
extern int zfs_statvfs(struct dentry *dentry, struct kstatfs *statp);
extern int zfs_statvfs(struct inode *ip, struct kstatfs *statp);
extern int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp);
extern int zfs_prune(struct super_block *sb, unsigned long nr_to_scan,
int *objects);

View File

@ -1658,6 +1658,25 @@ as a percentage of \fBzfs_dirty_data_max\fR. This should be less than
Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR.
.RE
.sp
.ne 2
.na
\fBzfs_fallocate_reserve_percent\fR (uint)
.ad
.RS 12n
Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be
preallocated for a file in order to guarantee that later writes will not
run out of space. Instead, fallocate() space preallocation only checks
that sufficient space is currently available in the pool or the user's
project quota allocation, and then creates a sparse file of the requested
size. The requested space is multiplied by \fBzfs_fallocate_reserve_percent\fR
to allow additional space for indirect blocks and other internal metadata.
Setting this value to 0 disables fallocate(2) space preallocation, so that
such calls fail with EOPNOTSUPP as they did before this option existed.
.sp
Default value: \fB110\fR%
.RE
.sp
.ne 2
.na

View File

@ -1088,9 +1088,9 @@ objs:
}
int
zfs_statvfs(struct dentry *dentry, struct kstatfs *statp)
zfs_statvfs(struct inode *ip, struct kstatfs *statp)
{
zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
zfsvfs_t *zfsvfs = ITOZSB(ip);
uint64_t refdbytes, availbytes, usedobjs, availobjs;
int err = 0;
@ -1148,7 +1148,7 @@ zfs_statvfs(struct dentry *dentry, struct kstatfs *statp)
if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
dmu_objset_projectquota_present(zfsvfs->z_os)) {
znode_t *zp = ITOZ(dentry->d_inode);
znode_t *zp = ITOZ(ip);
if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid &&
zpl_is_valid_projid(zp->z_projid))

View File

@ -34,6 +34,11 @@
#include <sys/zfs_vnops.h>
#include <sys/zfs_project.h>
/*
* When using fallocate(2) to preallocate space, inflate the requested
* capacity check by 10% to account for the required metadata blocks.
*/
unsigned int zfs_fallocate_reserve_percent = 110;
static int
zpl_open(struct inode *ip, struct file *filp)
@ -721,20 +726,23 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc)
}
/*
* The only flag combination which matches the behavior of zfs_space()
* is FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
* The flag combination which matches the behavior of zfs_space() is
* FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
* flag was introduced in the 2.6.38 kernel.
*
* The original mode=0 (allocate space) behavior can be reasonably emulated
* by checking if enough space exists and creating a sparse file, as real
* persistent space reservation is not possible due to COW, snapshots, etc.
*/
static long
zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
{
cred_t *cr = CRED();
flock64_t bf;
loff_t olen;
fstrans_cookie_t cookie;
int error;
int error = 0;
if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
if ((mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) != 0)
return (-EOPNOTSUPP);
if (offset < 0 || len <= 0)
@ -743,21 +751,54 @@ zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
spl_inode_lock(ip);
olen = i_size_read(ip);
if (offset > olen) {
spl_inode_unlock(ip);
return (0);
}
if (offset + len > olen)
len = olen - offset;
bf.l_type = F_WRLCK;
bf.l_whence = SEEK_SET;
bf.l_start = offset;
bf.l_len = len;
bf.l_pid = 0;
crhold(cr);
cookie = spl_fstrans_mark();
error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
if (mode & FALLOC_FL_PUNCH_HOLE) {
flock64_t bf;
if (offset > olen)
goto out_unmark;
if (offset + len > olen)
len = olen - offset;
bf.l_type = F_WRLCK;
bf.l_whence = SEEK_SET;
bf.l_start = offset;
bf.l_len = len;
bf.l_pid = 0;
error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
} else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
unsigned int percent = zfs_fallocate_reserve_percent;
struct kstatfs statfs;
/* Legacy mode, disable fallocate compatibility. */
if (percent == 0) {
error = -EOPNOTSUPP;
goto out_unmark;
}
/*
* Use zfs_statvfs() instead of dmu_objset_space() since it
* also checks project quota limits, which are relevant here.
*/
error = zfs_statvfs(ip, &statfs);
if (error)
goto out_unmark;
/*
* Shrink available space a bit to account for overhead/races.
* We know the product previously fit into availbytes from
* dmu_objset_space(), so the smaller product will also fit.
*/
if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
error = -ENOSPC;
goto out_unmark;
}
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
}
out_unmark:
spl_fstrans_unmark(cookie);
spl_inode_unlock(ip);
@ -1030,3 +1071,9 @@ const struct file_operations zpl_dir_file_operations = {
.compat_ioctl = zpl_compat_ioctl,
#endif
};
/* BEGIN CSTYLED */
module_param(zfs_fallocate_reserve_percent, uint, 0644);
MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
"Percentage of length to use for the available capacity check");
/* END CSTYLED */

View File

@ -138,7 +138,7 @@ zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
int error;
cookie = spl_fstrans_mark();
error = -zfs_statvfs(dentry, statp);
error = -zfs_statvfs(dentry->d_inode, statp);
spl_fstrans_unmark(cookie);
ASSERT3S(error, <=, 0);

View File

@ -94,6 +94,10 @@ tags = ['functional', 'devices']
tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter']
tags = ['functional', 'events']
[tests/functional/fallocate:Linux]
tests = ['fallocate_prealloc', 'fallocate_punch-hole']
tags = ['functional', 'fallocate']
[tests/functional/fault:Linux]
tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_replace_001_pos',
'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_multiple',

View File

@ -22,6 +22,7 @@ SUBDIRS = \
devices \
events \
exec \
fallocate \
fault \
features \
grow \

View File

@ -0,0 +1,6 @@
pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/fallocate
dist_pkgdata_SCRIPTS = \
setup.ksh \
cleanup.ksh \
fallocate_prealloc.ksh \
fallocate_punch-hole.ksh

View File

@ -0,0 +1,27 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
# Suite-level cleanup for the fallocate tests: everything is delegated to the
# default_cleanup helper (presumably provided by libtest.shlib sourced above).
default_cleanup

View File

@ -0,0 +1,63 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Test fallocate(2) preallocation.
#
# STRATEGY:
# 1. Verify mode 0 fallocate is supported.
# 2. Verify default 10% reserve space is honored by setting a quota.
#
# NOTE(review): verify_runnable "global" presumably limits where this test may
# run -- confirm its exact meaning in libtest.shlib.
verify_runnable "global"
# File that every fallocate call in this test targets.
FILE=$TESTDIR/$TESTFILE0
#
# Cleanup handler: remove the quota from $TESTPOOL, then empty $TESTDIR.
#
function cleanup
{
	log_must zfs set quota=none $TESTPOOL

	# Nothing to delete once the test directory is gone; the non-zero
	# status of that existence check is what the caller gets back.
	if [[ ! -e $TESTDIR ]]; then
		return 1
	fi
	log_must rm -Rf $TESTDIR/*
}
log_assert "Ensure sparse files can be preallocated"
# Hand cleanup() to log_onexit (presumably invoked when the test exits).
log_onexit cleanup
# Pre-allocate a sparse 1GB file.
log_must fallocate -l $((1024 * 1024 * 1024)) $FILE
# Empty the test directory again before the quota-limited checks below.
log_must rm -Rf $TESTDIR/*
# Verify that an additional ~10% reserve space is required.
# With a 100M quota on $TESTPOOL, the 150M and 110M requests are expected to
# be refused, while the 90M request is expected to succeed.
log_must zfs set quota=100M $TESTPOOL
log_mustnot fallocate -l $((150 * 1024 * 1024)) $FILE
log_mustnot fallocate -l $((110 * 1024 * 1024)) $FILE
log_must fallocate -l $((90 * 1024 * 1024)) $FILE
log_pass "Ensure sparse files can be preallocated"

View File

@ -0,0 +1,97 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Test `fallocate --punch-hole`
#
# STRATEGY:
# 1. Create a dense file
# 2. Punch an assortment of holes in the file and verify the result.
#
# NOTE(review): verify_runnable "global" presumably limits where this test may
# run -- confirm its exact meaning in libtest.shlib.
verify_runnable "global"
# File that is written densely and then has holes punched into it.
FILE=$TESTDIR/$TESTFILE0
# Block size used for the writes and hole offsets below, read from the
# recordsize property of $TESTPOOL.
BLKSZ=$(get_prop recordsize $TESTPOOL)
#
# Cleanup handler: delete the file under test if $TESTDIR still exists.
#
function cleanup
{
	# Nothing to delete once the test directory is gone; the non-zero
	# status of that existence check is what the caller gets back.
	if [[ ! -e $TESTDIR ]]; then
		return 1
	fi
	log_must rm -f $FILE
}
#
# Fail the test unless the disk usage of $FILE equals the expected value.
#
# $1 - expected disk usage, in bytes
#
function check_disk_size
{
	typeset expected_size=$1
	typeset disk_size

	# Measure the file under test itself ($FILE), and ask du for bytes
	# because callers pass byte counts; du's default output is in blocks.
	disk_size=$(du --block-size=1 "$FILE" | awk '{print $1}')

	# An empty result means du failed (e.g. missing file). Without this
	# check the numeric comparison below would error out and the function
	# would return success without having verified anything.
	if [[ -z $disk_size ]]; then
		log_fail "Unable to determine disk usage of $FILE"
	fi

	# NOTE(review): du reports all space charged to the file; if the
	# filesystem charges more than the data blocks, exact equality may be
	# too strict -- confirm the expected values used by the callers.
	if [ $disk_size -ne $expected_size ]; then
		log_fail "Incorrect size: $disk_size != $expected_size"
	fi
}
#
# Fail the test unless the apparent size of $FILE, as reported by the
# stat_size helper, equals the expected value.
#
# $1 - expected apparent size
#
function check_apparent_size
{
	typeset expected_size=$1
	# Keep the measurement local so the caller's own apparent_size
	# variable is not overwritten.
	typeset apparent_size

	# stat_size needs the file to inspect; pass the file under test.
	apparent_size=$(stat_size $FILE)
	if [ $apparent_size -ne $expected_size ]; then
		log_fail "Incorrect size: $apparent_size != $expected_size"
	fi
}
log_assert "Ensure holes can be punched in files making them sparse"
log_onexit cleanup
# Expected sizes below are expressed in multiples of $BLKSZ, the same block
# size used for the writes and hole offsets, so they stay consistent with the
# pool's recordsize instead of assuming 128K.
# Create a dense file of 8 blocks and check it is the correct size.
log_must file_write -o create -f $FILE -b $BLKSZ -c 8
log_must check_disk_size $((BLKSZ * 8))
# Punch a hole for the first full block.
log_must fallocate --punch-hole --offset 0 --length $BLKSZ $FILE
log_must check_disk_size $((BLKSZ * 7))
# Partially punch a hole in the second block; no additional block is
# expected to be released.
log_must fallocate --punch-hole --offset $BLKSZ --length $((BLKSZ / 2)) $FILE
log_must check_disk_size $((BLKSZ * 7))
# Punch a hole which overlaps the third and fourth block; both are only
# partially covered, so no additional block is expected to be released.
log_must fallocate --punch-hole --offset $(((BLKSZ * 2) + (BLKSZ / 2))) \
--length $((BLKSZ)) $FILE
log_must check_disk_size $((BLKSZ * 7))
# Punch a hole from the fifth block past the end of file. The apparent
# file size should not change since --keep-size is implied.
apparent_size=$(stat_size $FILE)
log_must fallocate --punch-hole --offset $((BLKSZ * 4)) \
--length $((BLKSZ * 10)) $FILE
log_must check_disk_size $((BLKSZ * 4))
log_must check_apparent_size $apparent_size
log_pass "Ensure holes can be punched in files making them sparse"

View File

@ -0,0 +1,29 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
# Keep only the first entry of the space-separated $DISKS list.
DISK=${DISKS%% *}
# NOTE(review): default_setup presumably builds the test pool and filesystem
# on $DISK -- confirm in libtest.shlib.
default_setup $DISK