port async unlinked drain from illumos-nexenta

This patch adds an asynchronous implementation of the existing synchronous
zfs_unlinked_drain() function. This function is called at mount time and
is responsible for freeing znodes that were unlinked but had not yet been
freed. Mounting of the dataset no longer has to be held up until the
unlinked list is fully drained, as was done previously. Because the unlinked
set can be processed asynchronously, this results in a better user
experience when mounting a dataset with entries in the unlinked set.

Reviewed by: Jorgen Lundman <lundman@lundman.net>
Reviewed by: Tom Caputi <tcaputi@datto.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Alek Pinchuk <apinchuk@datto.com>
Closes #8142
This commit is contained in:
Alek P 2019-02-12 10:41:15 -08:00 committed by Brian Behlendorf
parent 425d3237ee
commit dcec0a12c8
13 changed files with 300 additions and 10 deletions

View File

@ -21,6 +21,7 @@
/*
* Copyright (c) 2018 by Delphix. All rights reserved.
* Copyright (c) 2018 Datto Inc.
*/
#ifndef _SYS_DATASET_KSTATS_H
@ -35,6 +36,8 @@ typedef struct dataset_aggsum_stats_t {
aggsum_t das_nwritten;
aggsum_t das_reads;
aggsum_t das_nread;
aggsum_t das_nunlinks;
aggsum_t das_nunlinked;
} dataset_aggsum_stats_t;
typedef struct dataset_kstat_values {
@ -43,6 +46,16 @@ typedef struct dataset_kstat_values {
kstat_named_t dkv_nwritten;
kstat_named_t dkv_reads;
kstat_named_t dkv_nread;
/*
* nunlinks is initialized to the unlinked set size on mount and
* is incremented whenever a new entry is added to the unlinked set
*/
kstat_named_t dkv_nunlinks;
/*
* nunlinked is initialized to zero on mount and is incremented when an
* entry is removed from the unlinked set
*/
kstat_named_t dkv_nunlinked;
} dataset_kstat_values_t;
typedef struct dataset_kstats {
@ -56,4 +69,7 @@ void dataset_kstats_destroy(dataset_kstats_t *);
void dataset_kstats_update_write_kstats(dataset_kstats_t *, int64_t);
void dataset_kstats_update_read_kstats(dataset_kstats_t *, int64_t);
void dataset_kstats_update_nunlinks_kstat(dataset_kstats_t *, int64_t);
void dataset_kstats_update_nunlinked_kstat(dataset_kstats_t *, int64_t);
#endif /* _SYS_DATASET_KSTATS_H */

View File

@ -96,6 +96,7 @@ typedef struct dsl_pool {
struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj;
struct taskq *dp_iput_taskq;
struct taskq *dp_unlinked_drain_taskq;
/* No lock needed - sync context only */
blkptr_t dp_meta_rootbp;
@ -176,6 +177,7 @@ boolean_t dsl_pool_config_held(dsl_pool_t *dp);
boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);
taskq_t *dsl_pool_iput_taskq(dsl_pool_t *dp);
taskq_t *dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp);
int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
const char *tag, uint64_t now, dmu_tx_t *tx);

View File

@ -64,6 +64,7 @@ extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
extern boolean_t zfs_dirempty(znode_t *);
extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);
extern void zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs);
extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr);
extern int zfs_get_xattrdir(znode_t *, struct inode **, cred_t *, int);
extern int zfs_make_xattrdir(znode_t *, vattr_t *, struct inode **, cred_t *);

View File

@ -117,6 +117,8 @@ struct zfsvfs {
boolean_t z_replay; /* set during ZIL replay */
boolean_t z_use_sa; /* version allow system attributes */
boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */
boolean_t z_draining; /* is true when drain is active */
boolean_t z_drain_cancel; /* signal the unlinked drain to stop */
uint64_t z_version; /* ZPL version */
uint64_t z_shares_dir; /* hidden shares dir */
dataset_kstats_t z_kstat; /* fs kstats */
@ -132,6 +134,7 @@ struct zfsvfs {
uint64_t z_hold_size; /* znode hold array size */
avl_tree_t *z_hold_trees; /* znode hold trees */
kmutex_t *z_hold_locks; /* znode hold locks */
taskqid_t z_drain_task; /* task id for the unlink drain task */
};
#define ZSB_XATTR 0x0001 /* Enable user xattrs */

View File

@ -1149,6 +1149,21 @@ Rate limit delay zevents (which report slow I/Os) to this many per second.
Default value: 20
.RE
.sp
.ne 2
.na
\fBzfs_unlink_suspend_progress\fR (uint)
.ad
.RS 12n
When enabled, files will not be asynchronously removed from the list of pending
unlinks and the space they consume will be leaked. Once this option has been
disabled and the dataset is remounted, the pending unlinks will be processed
and the freed space returned to the pool.
This option is used by the test suite to facilitate testing.
.sp
Uses \fB0\fR (default) to allow progress and \fB1\fR to pause progress.
.RE
.sp
.ne 2
.na

View File

@ -21,6 +21,7 @@
/*
* Copyright (c) 2018 by Delphix. All rights reserved.
* Copyright (c) 2018 Datto Inc.
*/
#include <sys/dataset_kstats.h>
@ -34,6 +35,8 @@ static dataset_kstat_values_t empty_dataset_kstats = {
{ "nwritten", KSTAT_DATA_UINT64 },
{ "reads", KSTAT_DATA_UINT64 },
{ "nread", KSTAT_DATA_UINT64 },
{ "nunlinks", KSTAT_DATA_UINT64 },
{ "nunlinked", KSTAT_DATA_UINT64 },
};
static int
@ -54,6 +57,10 @@ dataset_kstats_update(kstat_t *ksp, int rw)
aggsum_value(&dk->dk_aggsums.das_reads);
dkv->dkv_nread.value.ui64 =
aggsum_value(&dk->dk_aggsums.das_nread);
dkv->dkv_nunlinks.value.ui64 =
aggsum_value(&dk->dk_aggsums.das_nunlinks);
dkv->dkv_nunlinked.value.ui64 =
aggsum_value(&dk->dk_aggsums.das_nunlinked);
return (0);
}
@ -136,6 +143,8 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
aggsum_init(&dk->dk_aggsums.das_nwritten, 0);
aggsum_init(&dk->dk_aggsums.das_reads, 0);
aggsum_init(&dk->dk_aggsums.das_nread, 0);
aggsum_init(&dk->dk_aggsums.das_nunlinks, 0);
aggsum_init(&dk->dk_aggsums.das_nunlinked, 0);
}
void
@ -156,6 +165,8 @@ dataset_kstats_destroy(dataset_kstats_t *dk)
aggsum_fini(&dk->dk_aggsums.das_nwritten);
aggsum_fini(&dk->dk_aggsums.das_reads);
aggsum_fini(&dk->dk_aggsums.das_nread);
aggsum_fini(&dk->dk_aggsums.das_nunlinks);
aggsum_fini(&dk->dk_aggsums.das_nunlinked);
}
void
@ -183,3 +194,21 @@ dataset_kstats_update_read_kstats(dataset_kstats_t *dk,
aggsum_add(&dk->dk_aggsums.das_reads, 1);
aggsum_add(&dk->dk_aggsums.das_nread, nread);
}
/*
 * Add delta to the das_nunlinks aggregate sum of the given dataset kstats.
 * Nothing is recorded when dk_kstats is NULL.
 */
void
dataset_kstats_update_nunlinks_kstat(dataset_kstats_t *dk, int64_t delta)
{
	if (dk->dk_kstats != NULL)
		aggsum_add(&dk->dk_aggsums.das_nunlinks, delta);
}
/*
 * Add delta to the das_nunlinked aggregate sum of the given dataset kstats.
 * Nothing is recorded when dk_kstats is NULL.
 */
void
dataset_kstats_update_nunlinked_kstat(dataset_kstats_t *dk, int64_t delta)
{
	if (dk->dk_kstats != NULL)
		aggsum_add(&dk->dk_aggsums.das_nunlinked, delta);
}

View File

@ -223,6 +223,9 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
dp->dp_iput_taskq = taskq_create("z_iput", max_ncpus, defclsyspri,
max_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain",
max_ncpus, defclsyspri, max_ncpus, INT_MAX,
TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
return (dp);
}
@ -413,6 +416,7 @@ dsl_pool_close(dsl_pool_t *dp)
rrw_destroy(&dp->dp_config_rwlock);
mutex_destroy(&dp->dp_lock);
cv_destroy(&dp->dp_spaceavail_cv);
taskq_destroy(dp->dp_unlinked_drain_taskq);
taskq_destroy(dp->dp_iput_taskq);
if (dp->dp_blkstats != NULL) {
mutex_destroy(&dp->dp_blkstats->zab_lock);
@ -1097,6 +1101,12 @@ dsl_pool_iput_taskq(dsl_pool_t *dp)
return (dp->dp_iput_taskq);
}
/*
 * Accessor for the pool's unlinked drain taskq (dp_unlinked_drain_taskq).
 */
taskq_t *
dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp)
{
	taskq_t *drain_tq = dp->dp_unlinked_drain_taskq;

	return (drain_tq);
}
/*
* Walk through the pool-wide zap object of temporary snapshot user holds
* and release them.

View File

@ -458,26 +458,31 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
VERIFY3U(0, ==,
zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
}
/*
* Clean up any znodes that had no links when we either crashed or
* (force) umounted the file system.
*/
void
zfs_unlinked_drain(zfsvfs_t *zfsvfs)
static void
zfs_unlinked_drain_task(void *arg)
{
zfsvfs_t *zfsvfs = arg;
zap_cursor_t zc;
zap_attribute_t zap;
dmu_object_info_t doi;
znode_t *zp;
int error;
ASSERT3B(zfsvfs->z_draining, ==, B_TRUE);
/*
* Iterate over the contents of the unlinked set.
*/
for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
zap_cursor_retrieve(&zc, &zap) == 0;
zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel;
zap_cursor_advance(&zc)) {
/*
@ -507,9 +512,61 @@ zfs_unlinked_drain(zfsvfs_t *zfsvfs)
continue;
zp->z_unlinked = B_TRUE;
/*
* iput() is Linux's equivalent to illumos' VN_RELE(). It will
* decrement the inode's ref count and may cause the inode to be
* synchronously freed. We interrupt freeing of this inode, by
* checking the return value of dmu_objset_zfs_unmounting() in
* dmu_free_long_range(), when an unmount is requested.
*/
iput(ZTOI(zp));
ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
}
zap_cursor_fini(&zc);
zfsvfs->z_draining = B_FALSE;
zfsvfs->z_drain_task = TASKQID_INVALID;
}
/*
 * Start draining the unlinked set of the given file system.
 *
 * Marks the drain as active (z_draining), clears the cancel flag and then
 * dispatches zfs_unlinked_drain_task() on the pool's unlinked drain taskq.
 * If the dispatch does not return a valid task id, the drain is executed
 * synchronously in the caller's context instead.
 */
void
zfs_unlinked_drain(zfsvfs_t *zfsvfs)
{
	taskq_t *drain_tq;

	ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
	ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);

	/* The drain task asserts z_draining, so set it before dispatching. */
	zfsvfs->z_draining = B_TRUE;
	zfsvfs->z_drain_cancel = B_FALSE;

	drain_tq = dsl_pool_unlinked_drain_taskq(
	    dmu_objset_pool(zfsvfs->z_os));
	zfsvfs->z_drain_task = taskq_dispatch(drain_tq,
	    zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP);

	if (zfsvfs->z_drain_task != TASKQID_INVALID)
		return;

	/* Async dispatch failed: fall back to draining synchronously. */
	zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
	zfs_unlinked_drain_task(zfsvfs);
}
/*
 * Wait for the unlinked drain taskq task to stop. This will interrupt the
 * unlinked set processing if it is in progress.
 *
 * When no drain is active (z_draining is false) this is a no-op. Otherwise
 * z_drain_cancel is raised, the task is cancelled by id on the pool's
 * unlinked drain taskq, and the drain bookkeeping is reset.
 * NOTE(review): presumably taskq_cancel_id() blocks until a task that is
 * already running has finished - confirm against the taskq implementation.
 */
void
zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
{
	ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);

	if (!zfsvfs->z_draining)
		return;

	/* Ask a running drain loop to stop before cancelling the task. */
	zfsvfs->z_drain_cancel = B_TRUE;
	taskq_cancel_id(
	    dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)),
	    zfsvfs->z_drain_task);

	zfsvfs->z_drain_task = TASKQID_INVALID;
	zfsvfs->z_draining = B_FALSE;
}
/*
@ -684,6 +741,8 @@ zfs_rmnode(znode_t *zp)
VERIFY3U(0, ==,
zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
zfs_znode_delete(zp, tx);
dmu_tx_commit(tx);

View File

@ -1178,6 +1178,10 @@ zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
return (error);
}
zfsvfs->z_drain_task = TASKQID_INVALID;
zfsvfs->z_draining = B_FALSE;
zfsvfs->z_drain_cancel = B_TRUE;
*zfvp = zfsvfs;
return (0);
}
@ -1200,14 +1204,27 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
* operations out since we closed the ZIL.
*/
if (mounting) {
ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
/*
* During replay we remove the read only flag to
* allow replays to succeed.
*/
if (readonly != 0)
if (readonly != 0) {
readonly_changed_cb(zfsvfs, B_FALSE);
else
} else {
zap_stats_t zs;
if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
&zs) == 0) {
dataset_kstats_update_nunlinks_kstat(
&zfsvfs->z_kstat, zs.zs_num_entries);
}
dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
"num_entries in unlinked set: %llu",
zs.zs_num_entries);
zfs_unlinked_drain(zfsvfs);
}
/*
* Parse and replay the intent log.
@ -1250,9 +1267,6 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
/* restore readonly bit */
if (readonly != 0)
readonly_changed_cb(zfsvfs, B_TRUE);
ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
}
/*
@ -1633,6 +1647,8 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
znode_t *zp;
zfs_unlinked_drain_stop_wait(zfsvfs);
/*
* If someone has not already unmounted this file system,
* drain the iput_taskq to ensure all active references to the
@ -1884,6 +1900,7 @@ zfs_preumount(struct super_block *sb)
/* zfsvfs is NULL when zfs_domount fails during mount */
if (zfsvfs) {
zfs_unlinked_drain_stop_wait(zfsvfs);
zfsctl_destroy(sb->s_fs_info);
/*
* Wait for iput_async before entering evict_inodes in
@ -2159,6 +2176,15 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
}
mutex_exit(&zfsvfs->z_znodes_lock);
if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) {
/*
* zfs_suspend_fs() could have interrupted freeing
* of dnodes. We need to restart this freeing so
* that we don't "leak" the space.
*/
zfs_unlinked_drain(zfsvfs);
}
bail:
/* release the VFS ops */
rw_exit(&zfsvfs->z_teardown_inactive_lock);

View File

@ -91,6 +91,12 @@ static kmem_cache_t *znode_cache = NULL;
static kmem_cache_t *znode_hold_cache = NULL;
unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
/*
* This is used by the test suite so that it can delay znodes from being
* freed in order to inspect the unlinked set.
*/
int zfs_unlink_suspend_progress = 0;
/*
* This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
* z_rangelock. It will modify the offset and length of the lock to reflect
@ -1339,7 +1345,7 @@ zfs_zinactive(znode_t *zp)
*/
if (zp->z_unlinked) {
ASSERT(!zfsvfs->z_issnap);
if (!zfs_is_readonly(zfsvfs)) {
if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
mutex_exit(&zp->z_lock);
zfs_znode_hold_exit(zfsvfs, zh);
zfs_rmnode(zp);
@ -2214,4 +2220,7 @@ EXPORT_SYMBOL(zfs_obj_to_path);
/* CSTYLED */
module_param(zfs_object_mutex_size, uint, 0644);
MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
module_param(zfs_unlink_suspend_progress, int, 0644);
MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
"(debug - leaks space into the unlinked set)");
#endif

View File

@ -644,7 +644,7 @@ tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval',
tags = ['functional', 'mmp']
[tests/functional/mount]
tests = ['umount_001', 'umountall_001']
tests = ['umount_001', 'umount_unlinked_drain', 'umountall_001']
tags = ['functional', 'mount']
[tests/functional/mv_files]

View File

@ -3,4 +3,5 @@ dist_pkgdata_SCRIPTS = \
setup.ksh \
cleanup.ksh \
umount_001.ksh \
umount_unlinked_drain.ksh \
umountall_001.ksh

View File

@ -0,0 +1,119 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright 2018 Datto Inc.
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Test async unlinked drain to ensure mounting is not held up when there are
# entries in the unlinked set. We also try to test that the list is able to be
# filled up and drained at the same time.
#
# STRATEGY:
# 1. Use zfs_unlink_suspend_progress tunable to disable freeing to build up
# the unlinked set
# 2. Make sure mount happens even when there are entries in the unlinked set
# 3. Drain and build up the unlinked list at the same time to test for races
#
#
# Restore the saved zfs_unlink_suspend_progress value, then make sure each of
# the three test file systems is mounted, has no file-* entries left, and has
# xattr=on again.
#
function cleanup
{
	log_must set_tunable32 zfs_unlink_suspend_progress $default_unlink_sp

	for fs in 1 2 3; do
		if ! mounted $TESTDIR.$fs; then
			zfs mount $TESTPOOL/$TESTFS.$fs
		fi
		rm -f $TESTDIR.$fs/file-*
		zfs set xattr=on $TESTPOOL/$TESTFS.$fs
	done
}
#
# Poll the objset kstats of a dataset until the unlinked set size
# (nunlinks - nunlinked) has repeated MAX_ITERS times in a row, then compare
# it with the expected size.
#
# $1 - expected unlinked set size
# $2 - pool name (directory under /proc/spl/kstat/zfs)
# $3 - dataset name used to find the matching objset kstat file
#
# Returns 0 if the stable size equals $1; otherwise logs a note and returns 1.
#
function unlinked_size_is
{
	MAX_ITERS=5 # iteration to do before we consider reported number stable
	iters=0
	last_usize=0

	until [[ $iters -gt $MAX_ITERS ]]; do
		kstat_file=$(grep -nrwl /proc/spl/kstat/zfs/$2/objset-0x* -e $3)
		nunlinks=$(awk '/nunlinks/ {print $3}' $kstat_file)
		nunlinked=$(awk '/nunlinked/ {print $3}' $kstat_file)
		usize=$(($nunlinks - $nunlinked))

		# Stable for long enough and matching the expectation: done.
		if [[ $iters == $MAX_ITERS && $usize == $1 ]]; then
			return 0
		fi

		# Count consecutive identical samples; restart on any change.
		if [[ $usize != $last_usize ]]; then
			iters=0
		else
			(( iters++ ))
		fi
		last_usize=$usize
	done

	log_note "Unexpected unlinked set size: $last_usize, expected $1"
	return 1
}
# NOTE(review): UNLINK_SP_PARAM is not referenced anywhere else in the
# visible script; the tunable is read and written through get_tunable and
# set_tunable32 instead.
UNLINK_SP_PARAM=/sys/module/zfs/parameters/zfs_unlink_suspend_progress
# Remember the current tunable value so that cleanup can restore it.
default_unlink_sp=$(get_tunable zfs_unlink_suspend_progress)
log_onexit cleanup
log_assert "Unlinked list drain does not hold up mounting of fs"
# Run the whole scenario on each of the three test file systems, once for
# every xattr setting (on, sa, off).
for fs in 1 2 3; do
set -A xattrs on sa off
for xa in ${xattrs[@]}; do
# setup fs and ensure all deleted files got into unlinked set
log_must mounted $TESTDIR.$fs
log_must zfs set xattr=$xa $TESTPOOL/$TESTFS.$fs
# Populate the file system with 175 files named file-<n>: plain 128k files
# when xattrs are off, otherwise via xattrtest.
# NOTE(review): presumably xattrtest -f 175 creates file-1..file-175 with
# xattrs attached and -k keeps them - confirm against xattrtest usage.
if [[ $xa == off ]]; then
for fn in $(seq 1 175); do
log_must mkfile 128k $TESTDIR.$fs/file-$fn
done
else
log_must xattrtest -f 175 -x 3 -r -k -p $TESTDIR.$fs
fi
# Suspend freeing of unlinked znodes; the unlinked set must start out empty.
log_must set_tunable32 zfs_unlink_suspend_progress 1
log_must unlinked_size_is 0 $TESTPOOL $TESTPOOL/$TESTFS.$fs
# build up unlinked set
for fn in $(seq 1 100); do
log_must eval "rm $TESTDIR.$fs/file-$fn &"
done
log_must unlinked_size_is 100 $TESTPOOL $TESTPOOL/$TESTFS.$fs
# test that we can mount fs without emptying the unlinked list
log_must zfs umount $TESTPOOL/$TESTFS.$fs
log_must unmounted $TESTDIR.$fs
log_must zfs mount $TESTPOOL/$TESTFS.$fs
log_must mounted $TESTDIR.$fs
log_must unlinked_size_is 100 $TESTPOOL $TESTPOOL/$TESTFS.$fs
# confirm we can drain and add to unlinked set at the same time
# Re-enable freeing and remount, then remove the remaining 75 files in the
# background; the unlinked set is expected to end up empty.
log_must set_tunable32 zfs_unlink_suspend_progress 0
log_must zfs umount $TESTPOOL/$TESTFS.$fs
log_must zfs mount $TESTPOOL/$TESTFS.$fs
for fn in $(seq 101 175); do
log_must eval "rm $TESTDIR.$fs/file-$fn &"
done
log_must unlinked_size_is 0 $TESTPOOL $TESTPOOL/$TESTFS.$fs
done
done
log_pass "Confirmed unlinked list drain does not hold up mounting of fs"