linux: add basic fallocate(mode=0/2) compatibility
Implement semi-compatible functionality for mode=0 (preallocation) and mode=FALLOC_FL_KEEP_SIZE (preallocation beyond EOF) for ZPL. Since ZFS does COW and snapshots, preallocating blocks for a file cannot guarantee that writes to the file will not run out of space. Even if the first overwrite was guaranteed, it would not handle any later overwrite of blocks due to COW, so strict compliance is futile. Instead, make a best-effort check that at least enough free space is currently available in the pool (with a bit of margin), then create a sparse file of the requested size and continue on with life. This does not handle all cases (e.g. several fallocate() calls before writing into the files when the filesystem is nearly full), which would require a more complex mechanism to be implemented, probably based on a modified version of dmu_prealloc(), but is usable as-is. A new module option zfs_fallocate_reserve_percent is used to control the reserve margin for any single fallocate call. By default, this is 110% of the requested preallocation size, so an additional 10% of available space is reserved for overhead to allow the application a good chance of finishing the write when the fallocate() succeeds. If the heuristics of this basic fallocate implementation are not desirable, the old non-functional behavior of returning EOPNOTSUPP for preallocation calls can be restored by setting zfs_fallocate_reserve_percent=0. The parameter of zfs_statvfs() is changed to take an inode instead of a dentry, since no dentry is available in zpl_fallocate_common(). A few tests from @behlendorf cover basic fallocate functionality. Reviewed-by: Richard Laager <rlaager@wiktel.com> Reviewed-by: Arshad Hussain <arshad.super@gmail.com> Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Andreas Dilger <adilger@dilger.ca> Issue #326 Closes #10408
This commit is contained in:
parent
d553fb9b9e
commit
f734301d22
|
@ -325,6 +325,7 @@ AC_CONFIG_FILES([
|
|||
tests/zfs-tests/tests/functional/devices/Makefile
|
||||
tests/zfs-tests/tests/functional/events/Makefile
|
||||
tests/zfs-tests/tests/functional/exec/Makefile
|
||||
tests/zfs-tests/tests/functional/fallocate/Makefile
|
||||
tests/zfs-tests/tests/functional/fault/Makefile
|
||||
tests/zfs-tests/tests/functional/features/Makefile
|
||||
tests/zfs-tests/tests/functional/features/async_destroy/Makefile
|
||||
|
|
|
@ -210,7 +210,7 @@ extern int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent);
|
|||
extern void zfs_preumount(struct super_block *sb);
|
||||
extern int zfs_umount(struct super_block *sb);
|
||||
extern int zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm);
|
||||
extern int zfs_statvfs(struct dentry *dentry, struct kstatfs *statp);
|
||||
extern int zfs_statvfs(struct inode *ip, struct kstatfs *statp);
|
||||
extern int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp);
|
||||
extern int zfs_prune(struct super_block *sb, unsigned long nr_to_scan,
|
||||
int *objects);
|
||||
|
|
|
@ -1658,6 +1658,25 @@ as a percentage of \fBzfs_dirty_data_max\fR. This should be less than
|
|||
Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR.
|
||||
.RE
|
||||
|
||||
.sp
|
||||
.ne 2
|
||||
.na
|
||||
\fBzfs_fallocate_reserve_percent\fR (uint)
|
||||
.ad
|
||||
.RS 12n
|
||||
Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be
|
||||
preallocated for a file in order to guarantee that later writes will not
|
||||
run out of space. Instead, fallocate() space preallocation only checks
|
||||
that sufficient space is currently available in the pool or the user's
|
||||
project quota allocation, and then creates a sparse file of the requested
|
||||
size. The requested space is multiplied by \fBzfs_fallocate_reserve_percent\fR
|
||||
to allow additional space for indirect blocks and other internal metadata.
|
||||
Setting this value to 0 disables fallocate() space preallocation, which then
|
||||
fails with EOPNOTSUPP as it did previously; hole punching is not affected.
|
||||
.sp
|
||||
Default value: \fB110\fR%
|
||||
.RE
|
||||
|
||||
.sp
|
||||
.ne 2
|
||||
.na
|
||||
|
|
|
@ -1088,9 +1088,9 @@ objs:
|
|||
}
|
||||
|
||||
int
|
||||
zfs_statvfs(struct dentry *dentry, struct kstatfs *statp)
|
||||
zfs_statvfs(struct inode *ip, struct kstatfs *statp)
|
||||
{
|
||||
zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
|
||||
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
||||
uint64_t refdbytes, availbytes, usedobjs, availobjs;
|
||||
int err = 0;
|
||||
|
||||
|
@ -1148,7 +1148,7 @@ zfs_statvfs(struct dentry *dentry, struct kstatfs *statp)
|
|||
|
||||
if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
|
||||
dmu_objset_projectquota_present(zfsvfs->z_os)) {
|
||||
znode_t *zp = ITOZ(dentry->d_inode);
|
||||
znode_t *zp = ITOZ(ip);
|
||||
|
||||
if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid &&
|
||||
zpl_is_valid_projid(zp->z_projid))
|
||||
|
|
|
@ -34,6 +34,11 @@
|
|||
#include <sys/zfs_vnops.h>
|
||||
#include <sys/zfs_project.h>
|
||||
|
||||
/*
|
||||
* When using fallocate(2) to preallocate space, inflate the requested
|
||||
* capacity check by 10% to account for the required metadata blocks.
|
||||
*/
|
||||
unsigned int zfs_fallocate_reserve_percent = 110;
|
||||
|
||||
static int
|
||||
zpl_open(struct inode *ip, struct file *filp)
|
||||
|
@ -721,20 +726,23 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc)
|
|||
}
|
||||
|
||||
/*
|
||||
* The only flag combination which matches the behavior of zfs_space()
|
||||
* is FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
|
||||
* The flag combination which matches the behavior of zfs_space() is
|
||||
* FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
|
||||
* flag was introduced in the 2.6.38 kernel.
|
||||
*
|
||||
* The original mode=0 (allocate space) behavior can be reasonably emulated
|
||||
* by checking if enough space exists and creating a sparse file, as real
|
||||
* persistent space reservation is not possible due to COW, snapshots, etc.
|
||||
*/
|
||||
static long
|
||||
zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
|
||||
{
|
||||
cred_t *cr = CRED();
|
||||
flock64_t bf;
|
||||
loff_t olen;
|
||||
fstrans_cookie_t cookie;
|
||||
int error;
|
||||
int error = 0;
|
||||
|
||||
if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
|
||||
if ((mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) != 0)
|
||||
return (-EOPNOTSUPP);
|
||||
|
||||
if (offset < 0 || len <= 0)
|
||||
|
@ -743,21 +751,54 @@ zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
|
|||
spl_inode_lock(ip);
|
||||
olen = i_size_read(ip);
|
||||
|
||||
if (offset > olen) {
|
||||
spl_inode_unlock(ip);
|
||||
return (0);
|
||||
}
|
||||
if (offset + len > olen)
|
||||
len = olen - offset;
|
||||
bf.l_type = F_WRLCK;
|
||||
bf.l_whence = SEEK_SET;
|
||||
bf.l_start = offset;
|
||||
bf.l_len = len;
|
||||
bf.l_pid = 0;
|
||||
|
||||
crhold(cr);
|
||||
cookie = spl_fstrans_mark();
|
||||
error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
|
||||
if (mode & FALLOC_FL_PUNCH_HOLE) {
|
||||
flock64_t bf;
|
||||
|
||||
if (offset > olen)
|
||||
goto out_unmark;
|
||||
|
||||
if (offset + len > olen)
|
||||
len = olen - offset;
|
||||
bf.l_type = F_WRLCK;
|
||||
bf.l_whence = SEEK_SET;
|
||||
bf.l_start = offset;
|
||||
bf.l_len = len;
|
||||
bf.l_pid = 0;
|
||||
|
||||
error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
|
||||
} else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
|
||||
unsigned int percent = zfs_fallocate_reserve_percent;
|
||||
struct kstatfs statfs;
|
||||
|
||||
/* Legacy mode, disable fallocate compatibility. */
|
||||
if (percent == 0) {
|
||||
error = -EOPNOTSUPP;
|
||||
goto out_unmark;
|
||||
}
|
||||
|
||||
/*
|
||||
* Use zfs_statvfs() instead of dmu_objset_space() since it
|
||||
* also checks project quota limits, which are relevant here.
|
||||
*/
|
||||
error = zfs_statvfs(ip, &statfs);
|
||||
if (error)
|
||||
goto out_unmark;
|
||||
|
||||
/*
|
||||
* Shrink available space a bit to account for overhead/races.
|
||||
* We know the product previously fit into availbytes from
|
||||
* dmu_objset_space(), so the smaller product will also fit.
|
||||
*/
|
||||
if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
|
||||
error = -ENOSPC;
|
||||
goto out_unmark;
|
||||
}
|
||||
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
|
||||
error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
|
||||
}
|
||||
out_unmark:
|
||||
spl_fstrans_unmark(cookie);
|
||||
spl_inode_unlock(ip);
|
||||
|
||||
|
@ -1030,3 +1071,9 @@ const struct file_operations zpl_dir_file_operations = {
|
|||
.compat_ioctl = zpl_compat_ioctl,
|
||||
#endif
|
||||
};
|
||||
|
||||
/* BEGIN CSTYLED */
|
||||
module_param(zfs_fallocate_reserve_percent, uint, 0644);
|
||||
MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
|
||||
"Percentage of length to use for the available capacity check");
|
||||
/* END CSTYLED */
|
||||
|
|
|
@ -138,7 +138,7 @@ zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
|
|||
int error;
|
||||
|
||||
cookie = spl_fstrans_mark();
|
||||
error = -zfs_statvfs(dentry, statp);
|
||||
error = -zfs_statvfs(dentry->d_inode, statp);
|
||||
spl_fstrans_unmark(cookie);
|
||||
ASSERT3S(error, <=, 0);
|
||||
|
||||
|
|
|
@ -94,6 +94,10 @@ tags = ['functional', 'devices']
|
|||
tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter']
|
||||
tags = ['functional', 'events']
|
||||
|
||||
[tests/functional/fallocate:Linux]
|
||||
tests = ['fallocate_prealloc', 'fallocate_punch-hole']
|
||||
tags = ['functional', 'fallocate']
|
||||
|
||||
[tests/functional/fault:Linux]
|
||||
tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_replace_001_pos',
|
||||
'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_multiple',
|
||||
|
|
|
@ -22,6 +22,7 @@ SUBDIRS = \
|
|||
devices \
|
||||
events \
|
||||
exec \
|
||||
fallocate \
|
||||
fault \
|
||||
features \
|
||||
grow \
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/fallocate
|
||||
dist_pkgdata_SCRIPTS = \
|
||||
setup.ksh \
|
||||
cleanup.ksh \
|
||||
fallocate_prealloc.ksh \
|
||||
fallocate_punch-hole.ksh
|
|
@ -0,0 +1,27 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or http://www.opensolaris.org/os/licensing.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
default_cleanup
|
|
@ -0,0 +1,63 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or http://www.opensolaris.org/os/licensing.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Test fallocate(2) preallocation.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Verify mode 0 fallocate is supported.
|
||||
# 2. Verify default 10% reserve space is honored by setting a quota.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
FILE=$TESTDIR/$TESTFILE0

#
# Drop the quota set by this test and empty the test directory.
#
function cleanup
{
	log_must zfs set quota=none $TESTPOOL

	[[ -e $TESTDIR ]] && log_must rm -Rf $TESTDIR/*
}

log_assert "Ensure sparse files can be preallocated"

log_onexit cleanup

# Pre-allocate a sparse 1GB file.
log_must fallocate -l $((1024 * 1024 * 1024)) $FILE
log_must rm -Rf $TESTDIR/*

# Verify that an additional ~10% reserve space is required.
log_must zfs set quota=100M $TESTPOOL

# These requests are larger than the quota itself and must always fail.
log_mustnot fallocate -l $((150 * 1024 * 1024)) $FILE
log_mustnot fallocate -l $((110 * 1024 * 1024)) $FILE

# 95M fits under the 100M quota, but 95M plus the default 10% reserve
# (104.5M) does not, so only the reserve margin can reject this request.
log_mustnot fallocate -l $((95 * 1024 * 1024)) $FILE

# 90M plus the 10% reserve (99M) still fits under the quota.
log_must fallocate -l $((90 * 1024 * 1024)) $FILE

log_pass "Ensure sparse files can be preallocated"
|
|
@ -0,0 +1,97 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or http://www.opensolaris.org/os/licensing.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Test `fallocate --punch-hole`
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create a dense file
|
||||
# 2. Punch an assortment of holes in the file and verify the result.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
FILE=$TESTDIR/$TESTFILE0
BLKSZ=$(get_prop recordsize $TESTPOOL)

#
# Remove the test file, provided the test directory still exists.
#
function cleanup
{
	[[ -e $TESTDIR ]] && log_must rm -f $FILE
}

#
# Verify the space consumed on disk by $FILE.
#
# $1 - expected size of the allocated data blocks, in bytes
#
# The usage reported by du(1) is converted to bytes before comparing.  It
# may exceed the data size by a small amount of metadata, so anything from
# the expected size up to (but excluding) an extra half block is accepted;
# that is still tight enough to tell N blocks from N-1 or N+1 blocks.
#
# NOTE(review): this assumes the data blocks are stored uncompressed and
# that usage is only up to date after a txg sync -- confirm for the pool
# configuration used by the test suite.
#
function check_disk_size
{
	typeset expected_size=$1
	typeset slack=$((BLKSZ / 2))
	typeset disk_kb disk_size

	[[ -e $FILE ]] || log_fail "$FILE does not exist"

	log_must zpool sync $TESTPOOL

	disk_kb=$(du -k $FILE | awk '{print $1}')
	[[ -n $disk_kb ]] || log_fail "Unable to read disk usage of $FILE"
	disk_size=$((disk_kb * 1024))

	if ((disk_size < expected_size || disk_size - expected_size >= slack)); then
		log_fail "Incorrect size: $disk_size != $expected_size"
	fi
}

#
# Verify the apparent (logical) size of $FILE.
#
# $1 - expected apparent size, in bytes
#
function check_apparent_size
{
	typeset expected_size=$1
	typeset actual_size

	actual_size=$(stat_size $FILE)
	[[ -n $actual_size ]] || log_fail "Unable to read size of $FILE"

	if ((actual_size != expected_size)); then
		log_fail "Incorrect size: $actual_size != $expected_size"
	fi
}

log_assert "Ensure holes can be punched in files making them sparse"

log_onexit cleanup

# Create a dense file of eight blocks and check it is the correct size.
log_must file_write -o create -f $FILE -b $BLKSZ -c 8
log_must check_disk_size $((BLKSZ * 8))

# Punch a hole for the first full block.
log_must fallocate --punch-hole --offset 0 --length $BLKSZ $FILE
log_must check_disk_size $((BLKSZ * 7))

# Partially punch a hole in the second block.  The block is expected to
# stay allocated, so the disk usage does not change.
log_must fallocate --punch-hole --offset $BLKSZ --length $((BLKSZ / 2)) $FILE
log_must check_disk_size $((BLKSZ * 7))

# Punch a hole which overlaps the third and fourth block.  Neither block
# is fully covered, so the disk usage does not change.
log_must fallocate --punch-hole --offset $(((BLKSZ * 2) + (BLKSZ / 2))) \
    --length $((BLKSZ)) $FILE
log_must check_disk_size $((BLKSZ * 7))

# Punch a hole from the fifth block past the end of file.  The apparent
# file size should not change since --keep-size is implied.  Only the
# second, third and fourth blocks remain allocated afterwards.
apparent_size=$(stat_size $FILE)
log_must fallocate --punch-hole --offset $((BLKSZ * 4)) \
    --length $((BLKSZ * 10)) $FILE
log_must check_disk_size $((BLKSZ * 3))
log_must check_apparent_size $apparent_size

log_pass "Ensure holes can be punched in files making them sparse"
|
|
@ -0,0 +1,29 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or http://www.opensolaris.org/os/licensing.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
|
||||
#
|
||||
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
DISK=${DISKS%% *}
|
||||
default_setup $DISK
|
Loading…
Reference in New Issue