Fast Dedup: ZAP Shrinking

This allows ZAPs to shrink. When there are two empty sibling leaves,
one of them is collapsed and its storage space is reused.
This improves performance on directories that at one time contained
a large number of files, but where many or all of those files have
since been deleted.

This applies to all other types of ZAPs as well.

Sponsored-by: iXsystems, Inc.
Sponsored-by: Klara, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Alexander Stetsenko <alex.stetsenko@klarasystems.com>
Closes #15888
This commit is contained in:
Allan Jude 2024-04-24 17:51:21 -04:00 committed by GitHub
parent 67d13998b3
commit 5044c4e3ff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 488 additions and 12 deletions

View File

@ -16,7 +16,7 @@
.\" own identifying information: .\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner] .\" Portions Copyright [yyyy] [name of copyright owner]
.\" .\"
.Dd January 9, 2024 .Dd February 14, 2024
.Dt ZFS 4 .Dt ZFS 4
.Os .Os
. .
@ -564,9 +564,8 @@ However, this is limited by
Maximum micro ZAP size. Maximum micro ZAP size.
A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size. A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size.
. .
.It Sy zfetch_hole_shift Ns = Ns Sy 2 Pq uint .It Sy zap_shrink_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
Log2 fraction of holes in speculative prefetch stream allowed for it to If set, adjacent empty ZAP blocks will be collapsed, reducing disk space.
proceed.
. .
.It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
Min bytes to prefetch per stream. Min bytes to prefetch per stream.

View File

@ -22,6 +22,8 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2023 Alexander Stetsenko <alex.stetsenko@gmail.com>
* Copyright (c) 2023, Klara Inc.
*/ */
/* /*
@ -41,6 +43,7 @@
#include <sys/spa.h> #include <sys/spa.h>
#include <sys/dmu.h> #include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/zfs_znode.h> #include <sys/zfs_znode.h>
#include <sys/fs/zfs.h> #include <sys/fs/zfs.h>
@ -78,9 +81,16 @@
*/ */
static int zap_iterate_prefetch = B_TRUE; static int zap_iterate_prefetch = B_TRUE;
/*
* Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be
* collapsed into a single block.
*/
int zap_shrink_enabled = B_TRUE;
int fzap_default_block_shift = 14; /* 16k blocksize */ int fzap_default_block_shift = 14; /* 16k blocksize */
static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx);
void void
fzap_byteswap(void *vbuf, size_t size) fzap_byteswap(void *vbuf, size_t size)
@ -586,6 +596,72 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
} }
} }
/*
 * Point every ptrtbl entry in the range [idx, idx + nptrs) at block `blk`.
 *
 * The range is first probed with zap_idx_to_blk() so that an i/o error is
 * reported before any entry has been modified; only then are the entries
 * rewritten one at a time with zap_set_idx_to_blk().
 *
 * The caller must hold zap_rwlock as WRITER and pass a non-NULL tx.
 *
 * Returns 0 on success, otherwise the first error returned by
 * zap_idx_to_blk() or zap_set_idx_to_blk().
 */
static int
zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk,
    dmu_tx_t *tx)
{
	int bs = FZAP_BLOCK_SHIFT(zap);
	/*
	 * Stride of the probe loop below.
	 * NOTE(review): the original comment calls this "entries per block",
	 * but (bs >> 3) divides the block *shift* by 8; the number of 8-byte
	 * entries in a block would be (1 << (bs - 3)). The smaller stride only
	 * makes the probe more conservative, so the value is left unchanged --
	 * confirm the intent before widening it.
	 */
	int epb = bs >> 3;
	int err = 0;

	ASSERT(tx != NULL);
	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	/*
	 * Check for i/o errors. The looked-up value itself is not needed, so
	 * it goes into a scratch variable that does not shadow `blk`.
	 */
	for (uint64_t i = 0; i < nptrs; i += epb) {
		uint64_t cur_blk;

		err = zap_idx_to_blk(zap, idx + i, &cur_blk);
		if (err != 0) {
			return (err);
		}
	}

	for (uint64_t i = 0; i < nptrs; i++) {
		err = zap_set_idx_to_blk(zap, idx + i, blk, tx);
		ASSERT0(err); /* we checked for i/o errors above */
		if (err != 0)
			break;
	}

	return (err);
}
/*
 * Build a 64-bit hash whose top pref_len bits are `pref` and whose remaining
 * bits are zero. pref_len must be non-zero: a shift by 64 is undefined.
 */
#define	ZAP_PREFIX_HASH(pref, pref_len)	((pref) << (64 - (pref_len)))

/*
 * Each leaf has a single range of entries (block pointers) in the ZAP ptrtbl.
 * If two leaves are siblings, their ranges are adjacent and contain the same
 * number of entries. In order to find out if a leaf has a sibling, we need to
 * check the range corresponding to the sibling leaf. There is no need to check
 * all entries in the range, we only need to check the first and the last one.
 *
 * `prefix` and `prefix_len` describe the sibling's expected prefix. The
 * caller must hold zap_rwlock (reader or writer).
 *
 * Returns the block id shared by the first and last entry of the range, or 0
 * if either lookup fails or the two entries point at different blocks.
 */
static uint64_t
check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len)
{
	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));

	uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len);
	uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
	uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len;
	/* 64-bit shift: a plain `1 << n` is an int shift and overflows. */
	uint64_t nptrs = (1ULL << pref_diff);
	uint64_t first;
	uint64_t last;

	ASSERT3U(idx + nptrs, <=,
	    (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));

	if (zap_idx_to_blk(zap, idx, &first) != 0)
		return (0);

	if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0)
		return (0);

	if (first != last)
		return (0);

	return (first);
}
static int static int
zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
{ {
@ -958,6 +1034,10 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
if (err == 0) { if (err == 0) {
zap_entry_remove(&zeh); zap_entry_remove(&zeh);
zap_increment_num_entries(zn->zn_zap, -1, tx); zap_increment_num_entries(zn->zn_zap, -1, tx);
if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 &&
zap_shrink_enabled)
return (zap_shrink(zn, l, tx));
} }
zap_put_leaf(l); zap_put_leaf(l);
return (err); return (err);
@ -1222,14 +1302,20 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
ZIO_PRIORITY_ASYNC_READ); ZIO_PRIORITY_ASYNC_READ);
} }
if (zc->zc_leaf && if (zc->zc_leaf) {
rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
/*
* The leaf was either shrunk or split.
*/
if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) ||
(ZAP_HASH_IDX(zc->zc_hash, (ZAP_HASH_IDX(zc->zc_hash,
zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
zap_put_leaf(zc->zc_leaf); zap_put_leaf(zc->zc_leaf);
zc->zc_leaf = NULL; zc->zc_leaf = NULL;
} }
}
again: again:
if (zc->zc_leaf == NULL) { if (zc->zc_leaf == NULL) {
@ -1237,8 +1323,6 @@ again:
&zc->zc_leaf); &zc->zc_leaf);
if (err != 0) if (err != 0)
return (err); return (err);
} else {
rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
} }
l = zc->zc_leaf; l = zc->zc_leaf;
@ -1367,6 +1451,242 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
} }
} }
/*
 * Find last allocated block and update freeblk.
 *
 * Walks every ptrtbl entry looking for the highest block id in use. When the
 * ptrtbl is external, the search starts from the last block of the ptrtbl
 * itself, so zap_freeblk never ends up below it. zap_freeblk is then set to
 * one past the highest block found.
 *
 * If any ptrtbl lookup fails the function returns without touching
 * zap_freeblk. The caller must hold zap_rwlock as WRITER.
 */
static void
zap_trunc(zap_t *zap)
{
	uint64_t nentries;
	uint64_t lastblk;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) {
		/* External ptrtbl */
		nentries = (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift);
		lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk +
		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1;
	} else {
		/* Embedded ptrtbl */
		nentries = (1ULL << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
		lastblk = 0;
	}

	for (uint64_t idx = 0; idx < nentries; idx++) {
		uint64_t blk;

		/* Best effort: on a lookup error leave zap_freeblk as is. */
		if (zap_idx_to_blk(zap, idx, &blk) != 0)
			return;
		if (blk > lastblk)
			lastblk = blk;
	}

	ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk);

	zap_f_phys(zap)->zap_freeblk = lastblk + 1;
}
/*
 * ZAP shrinking algorithm.
 *
 * We shrink ZAP recursively removing empty leaves. We can remove an empty leaf
 * only if it has a sibling. Sibling leaves have the same prefix length and
 * their prefixes differ only by the least significant (sibling) bit. We require
 * both siblings to be empty. This eliminates a need to rehash the non-empty
 * remaining leaf. When we have removed one of two empty siblings, we set ptrtbl
 * entries of the removed leaf to point to the remaining leaf. Prefix length
 * of the remaining leaf is decremented. As a result, it has a new prefix and it
 * might have a new sibling. So, we repeat the process.
 *
 * Steps:
 * 1. Check if a sibling leaf (sl) exists and it is empty.
 * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1.
 * 3. Release the sibling (sl) to dereference it again with WRITER lock.
 * 4. Upgrade zapdir lock to WRITER (once).
 * 5. Dereference released leaves again.
 * 6. If it is needed, recheck whether both leaves are still siblings and empty.
 * 7. Set ptrtbl pointers of the removed leaf (slbit 1) to point to blkid of
 *    the remaining leaf (slbit 0).
 * 8. Free disk block of the removed leaf (dmu_free_range).
 * 9. Decrement prefix_len of the remaining leaf.
 * 10. Repeat the steps.
 *
 * On entry `l` is an empty leaf whose prefix matches zn->zn_hash, and the
 * caller holds zap_rwlock. The leaf reference is consumed: on every path the
 * leaf still held at the end is released with zap_put_leaf().
 *
 * Returns 0 when shrinking completed or simply stopped because no further
 * empty sibling pair exists, otherwise the error returned by
 * zap_deref_leaf() or zap_set_idx_range_to_blk().
 */
static int
zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
{
	zap_t *zap = zn->zn_zap;
	int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
	uint64_t hash = zn->zn_hash;
	uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
	uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
	boolean_t trunc = B_FALSE;
	int err = 0;

	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0);
	ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
	ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix);

	boolean_t writer = B_FALSE;

	/*
	 * To avoid deadlock always deref leaves in the same order -
	 * sibling 0 first, then sibling 1.
	 */
	while (prefix_len) {
		zap_leaf_t *sl;
		uint64_t sl_prefix = prefix ^ 1;
		uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len);
		int slbit = prefix & 1;

		ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0);

		/*
		 * Check if there is a sibling by reading ptrtbl ptrs.
		 */
		if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0)
			break;

		/*
		 * sibling 1, unlock it - we haven't yet dereferenced sibling 0.
		 */
		if (slbit == 1) {
			zap_put_leaf(l);
			l = NULL;
		}

		/*
		 * Dereference sibling leaf and check if it is empty.
		 */
		if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER,
		    &sl)) != 0)
			break;

		ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix);

		/*
		 * Check if we have a sibling and it is empty.
		 */
		if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len ||
		    zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) {
			zap_put_leaf(sl);
			break;
		}

		zap_put_leaf(sl);

		/*
		 * If there are two empty siblings, we have work to do, so
		 * we need to lock ZAP ptrtbl as WRITER.
		 */
		if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) {
			/* We failed to upgrade */
			if (l != NULL) {
				zap_put_leaf(l);
				l = NULL;
			}

			/*
			 * Usually, the right way to upgrade from a READER lock
			 * to a WRITER lock is to call zap_unlockdir() and
			 * zap_lockdir(), but we do not have a tag. Instead,
			 * we do it in more sophisticated way.
			 */
			rw_exit(&zap->zap_rwlock);
			rw_enter(&zap->zap_rwlock, RW_WRITER);
			dmu_buf_will_dirty(zap->zap_dbuf, tx);

			/* The lock was dropped, so re-read the ptrtbl shift. */
			zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
			writer = B_TRUE;
		}

		/*
		 * Here we have WRITER lock for ptrtbl.
		 * Now, we need a WRITER lock for both siblings leaves.
		 * Also, we have to recheck if the leaves are still siblings
		 * and still empty.
		 */
		if (l == NULL) {
			/* sibling 0 */
			if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash),
			    tx, RW_WRITER, &l)) != 0)
				break;

			/*
			 * The leaf isn't empty anymore or
			 * it was shrunk/split while our locks were down.
			 */
			if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 ||
			    zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len)
				break;
		}

		/* sibling 1 */
		if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx,
		    RW_WRITER, &sl)) != 0)
			break;

		/*
		 * The leaf isn't empty anymore or
		 * it was shrunk/split while our locks were down.
		 */
		if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 ||
		    zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) {
			zap_put_leaf(sl);
			break;
		}

		/*
		 * If we have gotten here, we have a leaf to collapse.
		 *
		 * The ptrtbl range is derived from zt_shift here rather than
		 * at the top of the loop: zt_shift is re-read when the ptrtbl
		 * lock is re-acquired above, and a difference computed before
		 * that point would describe the wrong range.
		 */
		int64_t prefix_diff = zt_shift - prefix_len;
		uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff;
		uint64_t nptrs = (1ULL << prefix_diff);
		uint64_t sl_blkid = sl->l_blkid;

		/*
		 * Set ptrtbl entries to point to the sibling 0 blkid
		 */
		if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid,
		    tx)) != 0) {
			zap_put_leaf(sl);
			break;
		}

		/*
		 * Free sibling 1 disk block.
		 */
		int bs = FZAP_BLOCK_SHIFT(zap);
		/* Freeing the last allocated block: recompute freeblk later. */
		if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1)
			trunc = B_TRUE;

		(void) dmu_free_range(zap->zap_objset, zap->zap_object,
		    sl_blkid << bs, 1 << bs, tx);
		zap_put_leaf(sl);

		zap_f_phys(zap)->zap_num_leafs--;

		/*
		 * Update prefix and prefix_len.
		 */
		zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1;
		zap_leaf_phys(l)->l_hdr.lh_prefix_len--;

		prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
		prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
	}

	if (trunc)
		zap_trunc(zap);

	if (l != NULL)
		zap_put_leaf(l);

	return (err);
}
/* CSTYLED */ /* CSTYLED */
ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW,
"When iterating ZAP object, prefetch it"); "When iterating ZAP object, prefetch it");
/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW,
"Enable ZAP shrinking");

View File

@ -643,6 +643,10 @@ tags = ['functional', 'compression']
tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress'] tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
tags = ['functional', 'cp_files'] tags = ['functional', 'cp_files']
[tests/functional/zap_shrink]
tests = ['zap_shrink_001_pos']
tags = ['functional', 'zap_shrink']
[tests/functional/crtime] [tests/functional/crtime]
tests = ['crtime_001_pos' ] tests = ['crtime_001_pos' ]
tags = ['functional', 'crtime'] tags = ['functional', 'crtime']

View File

@ -2074,6 +2074,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/xattr/xattr_012_pos.ksh \ functional/xattr/xattr_012_pos.ksh \
functional/xattr/xattr_013_pos.ksh \ functional/xattr/xattr_013_pos.ksh \
functional/xattr/xattr_compat.ksh \ functional/xattr/xattr_compat.ksh \
functional/zap_shrink/cleanup.ksh \
functional/zap_shrink/zap_shrink_001_pos.ksh \
functional/zap_shrink/setup.ksh \
functional/zpool_influxdb/cleanup.ksh \ functional/zpool_influxdb/cleanup.ksh \
functional/zpool_influxdb/setup.ksh \ functional/zpool_influxdb/setup.ksh \
functional/zpool_influxdb/zpool_influxdb.ksh \ functional/zpool_influxdb/zpool_influxdb.ksh \

View File

@ -0,0 +1,34 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2013 by Delphix. All rights reserved.
#
# Pull in the shared test-suite helpers used below.
. $STF_SUITE/include/libtest.shlib
# Group cleanup hook for the zap_shrink tests.
# NOTE(review): default_cleanup is defined in libtest.shlib; presumably it
# tears down what default_setup created -- confirm against the library.
default_cleanup

View File

@ -0,0 +1,35 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2013 by Delphix. All rights reserved.
#
# Pull in the shared test-suite helpers used below.
. $STF_SUITE/include/libtest.shlib
# Use only the first entry of the space-separated $DISKS list.
DISK=${DISKS%% *}
# Group setup hook for the zap_shrink tests.
# NOTE(review): default_setup is defined in libtest.shlib; presumably it
# creates the test pool and filesystem on $DISK -- confirm against the library.
default_setup $DISK

View File

@ -0,0 +1,81 @@
#! /bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2024, Klara Inc.
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Create a large number of files in a directory. Then remove all files and
# check that the directory zap was shrunk. Use zdb to check that the zap
# object contains only one leaf block, both right after the removal and
# again after exporting and re-importing the pool.
#
verify_runnable "global"
# DIR: directory (relative to $TESTDIR) whose ZAP is grown and then shrunk.
# NR_FILES: total number of files created; BATCH: files per touch/rm call.
# CWD: directory to return to after each cd into the test directory.
DIR=largedir
NR_FILES=100000
BATCH=1000
CWD=$PWD
log_assert "Create a large number of files ($NR_FILES) in a directory. " \
"Make sure that the directory ZAP object was shrunk."
log_must mkdir $TESTDIR/$DIR
cd $TESTDIR/$DIR
# To avoid overflowing the argument list, create the NR_FILES files in
# batches of BATCH. Files are named 1..NR_FILES.
for i in $(seq $(($NR_FILES/$BATCH))); do
touch $(seq $((($i-1)*$BATCH+1)) $(($i*$BATCH)));
done
cd $CWD
# All files must be present before the removal phase starts.
log_must test $NR_FILES -eq $(ls -U $TESTDIR/$DIR | wc -l)
# Remove all files in the $DIR directory, using the same batching as above.
cd $TESTDIR/$DIR
for i in $(seq $(($NR_FILES/$BATCH))); do
rm $(seq $((($i-1)*$BATCH+1)) $(($i*$BATCH)))
done
cd $CWD
# Sync the pool before inspecting the object with zdb.
sync_pool $TESTPOOL
log_must test 0 -eq $(ls -U $TESTDIR/$DIR | wc -l)
# Check that the ZAP was shrunk: zdb must report exactly one leaf block.
zapobj=$(zdb -v -O $TESTPOOL/$TESTFS $DIR)
nleafs=$(echo "$zapobj" | grep "Leaf blocks:" | awk -F\: '{print($2);}')
log_must test 1 -eq $nleafs
log_must zpool export $TESTPOOL
log_must zpool import $TESTPOOL
# Repeat the check after export/import: the object must still report
# exactly one leaf block.
zapobj=$(zdb -v -O $TESTPOOL/$TESTFS $DIR)
nleafs=$(echo "$zapobj" | grep "Leaf blocks:" | awk -F\: '{print($2);}')
log_must test 1 -eq $nleafs
log_pass