Add fast path for zfs_ioc_space_snaps() handling of empty_bpobj

When there are many snapshots, calls to zfs_ioc_space_snaps() (e.g. from
`zfs destroy -nv pool/fs@snap1%snap10000`) can be very slow, resulting
in poor performance because we are holding the dp_config_rwlock the
entire time, blocking spa_sync() from continuing.  With around ten
thousand snapshots, we've seen up to 500 seconds in this ioctl,
iterating over up to 50,000,000 bpobjs, ~99% of which are the empty
bpobj.

By creating a fast path for zfs_ioc_space_snaps() handling of the
empty_bpobj, we can achieve a ~5x performance improvement of this ioctl
(when there are many snapshots, and the deadlist is mostly
empty_bpobj's).

Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-58348
Closes #8744
Matthew Ahrens, 2019-08-20 11:34:52 -07:00 (committed by Brian Behlendorf)
commit 325d288c5d (parent 3beb0a7694)
3 changed files with 180 additions and 34 deletions
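Before the diffs, a minimal self-contained sketch of the idea: deadlist entries that reference the pool-wide shared empty_bpobj contribute zero space, so a sparse cache that omits them (and memoizes each remaining entry's space stats) lets range queries skip ~99% of the entries. The types and names below (entry_t, EMPTY_BPOBJ, space_slow, build_cache) are illustrative stand-ins, not the ZFS API:

#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins for deadlist state; names are illustrative only. */
typedef struct entry {
	uint64_t mintxg;	/* first txg covered by this entry */
	uint64_t bpobj_id;	/* object holding the dead block pointers */
	uint64_t bytes;		/* space charged to this entry */
} entry_t;

#define	EMPTY_BPOBJ	1	/* shared "no dead blocks" object */

/*
 * Slow path: visit every entry, even the ~99% that reference the
 * shared empty_bpobj and therefore contribute zero bytes.
 */
static uint64_t
space_slow(const entry_t *e, int n, uint64_t mintxg, uint64_t maxtxg)
{
	uint64_t sum = 0;
	for (int i = 0; i < n; i++) {
		if (e[i].mintxg >= mintxg && e[i].mintxg < maxtxg)
			sum += e[i].bytes; /* real code opens a bpobj here */
	}
	return (sum);
}

/*
 * Fast path: build a sparse cache that omits empty_bpobj entries, so
 * later range queries touch only entries that can contribute space.
 */
static int
build_cache(const entry_t *e, int n, entry_t *cache)
{
	int m = 0;
	for (int i = 0; i < n; i++) {
		if (e[i].bpobj_id != EMPTY_BPOBJ)
			cache[m++] = e[i];	/* space stats cached once */
	}
	return (m);
}

int
main(void)
{
	entry_t entries[] = {
		{ 10, EMPTY_BPOBJ, 0 },
		{ 20, 7, 4096 },
		{ 30, EMPTY_BPOBJ, 0 },
		{ 40, 9, 8192 },
	};
	entry_t cache[4];
	int m = build_cache(entries, 4, cache);

	/* Both paths agree; the cached one scans 2 entries instead of 4. */
	printf("slow=%llu fast=%llu\n",
	    (unsigned long long)space_slow(entries, 4, 0, UINT64_MAX),
	    (unsigned long long)space_slow(cache, m, 0, UINT64_MAX));
	return (0);
}

The real change below follows the same shape: dl_cache is an AVL tree of dsl_deadlist_cache_entry_t built by skipping dp_empty_bpobj, with each entry's bpobj_space() result stored once at load time.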

include/sys/dsl_deadlist.h

@@ -48,8 +48,10 @@ typedef struct dsl_deadlist_phys {
 typedef struct dsl_deadlist {
 	objset_t *dl_os;
 	uint64_t dl_object;
-	avl_tree_t dl_tree;
+	avl_tree_t dl_tree; /* contains dsl_deadlist_entry_t */
+	avl_tree_t dl_cache; /* contains dsl_deadlist_cache_entry_t */
 	boolean_t dl_havetree;
+	boolean_t dl_havecache;
 	struct dmu_buf *dl_dbuf;
 	dsl_deadlist_phys_t *dl_phys;
 	kmutex_t dl_lock;
@@ -59,6 +61,15 @@ typedef struct dsl_deadlist {
 	boolean_t dl_oldfmt;
 } dsl_deadlist_t;
 
+typedef struct dsl_deadlist_cache_entry {
+	avl_node_t dlce_node;
+	uint64_t dlce_mintxg;
+	uint64_t dlce_bpobj;
+	uint64_t dlce_bytes;
+	uint64_t dlce_comp;
+	uint64_t dlce_uncomp;
+} dsl_deadlist_cache_entry_t;
+
 typedef struct dsl_deadlist_entry {
 	avl_node_t dle_node;
 	uint64_t dle_mintxg;
@@ -108,6 +119,7 @@ int dsl_process_sub_livelist(bpobj_t *bpobj, struct bplist *to_free,
     zthr_t *t, uint64_t *size);
 void dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
     dmu_tx_t *tx);
+void dsl_deadlist_discard_tree(dsl_deadlist_t *dl);
 
 #ifdef	__cplusplus
 }

module/zfs/dsl_deadlist.c

@@ -112,16 +112,24 @@ unsigned long zfs_livelist_max_entries = 500000;
  */
 int zfs_livelist_min_percent_shared = 75;
 
 static int
 dsl_deadlist_compare(const void *arg1, const void *arg2)
 {
-	const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1;
-	const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2;
+	const dsl_deadlist_entry_t *dle1 = arg1;
+	const dsl_deadlist_entry_t *dle2 = arg2;
 
 	return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
 }
 
+static int
+dsl_deadlist_cache_compare(const void *arg1, const void *arg2)
+{
+	const dsl_deadlist_cache_entry_t *dlce1 = arg1;
+	const dsl_deadlist_cache_entry_t *dlce2 = arg2;
+
+	return (AVL_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg));
+}
+
 static void
 dsl_deadlist_load_tree(dsl_deadlist_t *dl)
 {
@@ -131,6 +139,23 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl)
 	ASSERT(MUTEX_HELD(&dl->dl_lock));
 	ASSERT(!dl->dl_oldfmt);
 
+	if (dl->dl_havecache) {
+		/*
+		 * After loading the tree, the caller may modify the tree,
+		 * e.g. to add or remove nodes, or to make a node no longer
+		 * refer to the empty_bpobj.  These changes would make the
+		 * dl_cache incorrect.  Therefore we discard the cache here,
+		 * so that it can't become incorrect.
+		 */
+		dsl_deadlist_cache_entry_t *dlce;
+		void *cookie = NULL;
+		while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
+		    != NULL) {
+			kmem_free(dlce, sizeof (*dlce));
+		}
+		avl_destroy(&dl->dl_cache);
+		dl->dl_havecache = B_FALSE;
+	}
 	if (dl->dl_havetree)
 		return;
@@ -142,14 +167,114 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl)
 	    zap_cursor_advance(&zc)) {
 		dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
 		dle->dle_mintxg = zfs_strtonum(za.za_name, NULL);
-		VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os,
-		    za.za_first_integer));
+
+		/*
+		 * Prefetch all the bpobj's so that we do that i/o
+		 * in parallel.  Then open them all in a second pass.
+		 */
+		dle->dle_bpobj.bpo_object = za.za_first_integer;
+		dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object,
+		    0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+
 		avl_add(&dl->dl_tree, dle);
 	}
 	zap_cursor_fini(&zc);
+
+	for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree);
+	    dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) {
+		VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os,
+		    dle->dle_bpobj.bpo_object));
+	}
+
 	dl->dl_havetree = B_TRUE;
 }
 
+/*
+ * Load only the non-empty bpobj's into the dl_cache.  The cache is an analog
+ * of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP.  It
+ * is used only for gathering space statistics.  The dl_cache has two
+ * advantages over the dl_tree:
+ *
+ * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's
+ * mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj
+ * many times and to inquire about its (zero) space stats many times.
+ *
+ * 2. The dl_cache uses less memory than the dl_tree.  We only need to load
+ * the dl_tree of snapshots when deleting a snapshot, after which we free
+ * the dl_tree with dsl_deadlist_discard_tree().
+ */
+static void
+dsl_deadlist_load_cache(dsl_deadlist_t *dl)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+
+	ASSERT(MUTEX_HELD(&dl->dl_lock));
+	ASSERT(!dl->dl_oldfmt);
+
+	if (dl->dl_havecache)
+		return;
+
+	uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;
+
+	avl_create(&dl->dl_cache, dsl_deadlist_cache_compare,
+	    sizeof (dsl_deadlist_cache_entry_t),
+	    offsetof(dsl_deadlist_cache_entry_t, dlce_node));
+	for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		if (za.za_first_integer == empty_bpobj)
+			continue;
+		dsl_deadlist_cache_entry_t *dlce =
+		    kmem_zalloc(sizeof (*dlce), KM_SLEEP);
+		dlce->dlce_mintxg = zfs_strtonum(za.za_name, NULL);
+
+		/*
+		 * Prefetch all the bpobj's so that we do that i/o
+		 * in parallel.  Then open them all in a second pass.
+		 */
+		dlce->dlce_bpobj = za.za_first_integer;
+		dmu_prefetch(dl->dl_os, dlce->dlce_bpobj,
+		    0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+		avl_add(&dl->dl_cache, dlce);
+	}
+	zap_cursor_fini(&zc);
+
+	for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache);
+	    dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
+		bpobj_t bpo;
+		VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj));
+		VERIFY0(bpobj_space(&bpo,
+		    &dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp));
+		bpobj_close(&bpo);
+	}
+
+	dl->dl_havecache = B_TRUE;
+}
+
+/*
+ * Discard the tree to save memory.
+ */
+void
+dsl_deadlist_discard_tree(dsl_deadlist_t *dl)
+{
+	mutex_enter(&dl->dl_lock);
+
+	if (!dl->dl_havetree) {
+		mutex_exit(&dl->dl_lock);
+		return;
+	}
+
+	dsl_deadlist_entry_t *dle;
+	void *cookie = NULL;
+	while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) {
+		bpobj_close(&dle->dle_bpobj);
+		kmem_free(dle, sizeof (*dle));
+	}
+	avl_destroy(&dl->dl_tree);
+	dl->dl_havetree = B_FALSE;
+
+	mutex_exit(&dl->dl_lock);
+}
+
 void
 dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args)
 {
@@ -190,6 +315,7 @@ dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
 	dl->dl_oldfmt = B_FALSE;
 	dl->dl_phys = dl->dl_dbuf->db_data;
 	dl->dl_havetree = B_FALSE;
+	dl->dl_havecache = B_FALSE;
 }
 
 boolean_t
@@ -201,9 +327,6 @@ dsl_deadlist_is_open(dsl_deadlist_t *dl)
 void
 dsl_deadlist_close(dsl_deadlist_t *dl)
 {
-	void *cookie = NULL;
-	dsl_deadlist_entry_t *dle;
-
 	ASSERT(dsl_deadlist_is_open(dl));
 
 	mutex_destroy(&dl->dl_lock);
@@ -216,6 +339,8 @@ dsl_deadlist_close(dsl_deadlist_t *dl)
 	}
 
 	if (dl->dl_havetree) {
+		dsl_deadlist_entry_t *dle;
+		void *cookie = NULL;
 		while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
 		    != NULL) {
 			bpobj_close(&dle->dle_bpobj);
@@ -223,6 +348,15 @@ dsl_deadlist_close(dsl_deadlist_t *dl)
 		}
 		avl_destroy(&dl->dl_tree);
 	}
+	if (dl->dl_havecache) {
+		dsl_deadlist_cache_entry_t *dlce;
+		void *cookie = NULL;
+		while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
+		    != NULL) {
+			kmem_free(dlce, sizeof (*dlce));
+		}
+		avl_destroy(&dl->dl_cache);
+	}
 	dmu_buf_rele(dl->dl_dbuf, dl);
 	dl->dl_dbuf = NULL;
 	dl->dl_phys = NULL;
@@ -440,6 +574,7 @@ dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 	avl_remove(&dl->dl_tree, dle);
 	VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx));
 	VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
+	dmu_buf_will_dirty(dl->dl_dbuf, tx);
 	dl->dl_phys->dl_used -= used;
 	dl->dl_phys->dl_comp -= comp;
 	dl->dl_phys->dl_uncomp -= uncomp;
@@ -468,6 +603,7 @@ dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
 	mutex_enter(&dl->dl_lock);
 	VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx));
 	VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
+	dmu_buf_will_dirty(dl->dl_dbuf, tx);
 	dl->dl_phys->dl_used -= used;
 	dl->dl_phys->dl_comp -= comp;
 	dl->dl_phys->dl_uncomp -= uncomp;
@@ -603,8 +739,8 @@ void
 dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
-	dsl_deadlist_entry_t *dle;
-	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_cache_entry_t *dlce;
+	dsl_deadlist_cache_entry_t dlce_tofind;
 	avl_index_t where;
 
 	if (dl->dl_oldfmt) {
@@ -616,34 +752,25 @@ dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
 	*usedp = *compp = *uncompp = 0;
 
 	mutex_enter(&dl->dl_lock);
-	dsl_deadlist_load_tree(dl);
+	dsl_deadlist_load_cache(dl);
 
-	dle_tofind.dle_mintxg = mintxg;
-	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	dlce_tofind.dlce_mintxg = mintxg;
+	dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where);
+
 	/*
-	 * If we don't find this mintxg, there shouldn't be anything
-	 * after it either.
+	 * If this mintxg doesn't exist, it may be an empty_bpobj which
+	 * is omitted from the sparse tree.  Start at the next non-empty
+	 * entry.
 	 */
-	ASSERT(dle != NULL ||
-	    avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
+	if (dlce == NULL)
+		dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER);
 
-	for (; dle && dle->dle_mintxg < maxtxg;
-	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
-		uint64_t used, comp, uncomp;
-
-		VERIFY0(bpobj_space(&dle->dle_bpobj,
-		    &used, &comp, &uncomp));
-
-		*usedp += used;
-		*compp += comp;
-		*uncompp += uncomp;
+	for (; dlce && dlce->dlce_mintxg < maxtxg;
+	    dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
+		*usedp += dlce->dlce_bytes;
+		*compp += dlce->dlce_comp;
+		*uncompp += dlce->dlce_uncomp;
 	}
 
-	/*
-	 * This assertion ensures that the maxtxg is a key in the deadlist
-	 * (unless it's UINT64_MAX).
-	 */
-	ASSERT(maxtxg == UINT64_MAX ||
-	    (dle != NULL && dle->dle_mintxg == maxtxg));
 	mutex_exit(&dl->dl_lock);
 }

module/zfs/dsl_destroy.c

@@ -413,6 +413,13 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
 		/* Merge our deadlist into next's and free it. */
 		dsl_deadlist_merge(&ds_next->ds_deadlist,
 		    dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+
+		/*
+		 * We are done with the deadlist tree (generated/used
+		 * by dsl_deadlist_move_bpobj() and dsl_deadlist_merge()).
+		 * Discard it to save memory.
+		 */
+		dsl_deadlist_discard_tree(&ds_next->ds_deadlist);
 	}
 	dsl_deadlist_close(&ds->ds_deadlist);
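Taken together, the three files establish a simple invariant: dl_cache serves read-only space queries, dl_tree serves mutations, loading the tree invalidates the cache (since edits would stale it), and the tree is discarded once the destroy path is done with it. A toy model of that invariant, with illustrative names rather than the ZFS types:

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the dl_tree/dl_cache flags; names are illustrative. */
struct deadlist { bool havetree, havecache; };

static void load_cache(struct deadlist *dl) { dl->havecache = true; }

static void
load_tree(struct deadlist *dl)
{
	/* Mutators go through the tree; a stale cache must be dropped. */
	dl->havecache = false;
	dl->havetree = true;
}

static void discard_tree(struct deadlist *dl) { dl->havetree = false; }

int
main(void)
{
	struct deadlist dl = { false, false };
	load_cache(&dl);	/* space query builds the sparse cache */
	load_tree(&dl);		/* snapshot deletion needs the full tree... */
	discard_tree(&dl);	/* ...and frees it when the merge is done */
	printf("havetree=%d havecache=%d\n", dl.havetree, dl.havecache);
	return (0);
}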