Prefetch on deadlists merge
During snapshot deletion, ZFS may issue several reads for each deadlist in order to merge it into the next snapshot's or the pool's bpobj. The number of deadlists grows with the number of snapshots, so on HDD pools the merge may take significant time, during which the sync thread is blocked. This patch introduces prescient prefetch of the required blocks for up to 128 deadlists ahead. Tests show that the time required to delete a dataset with 720 snapshots of a randomly overwritten file on a wide HDD pool drops from 75-85 to 22-28 seconds.

Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Issue #14276
Closes #14402
This commit is contained in:
parent
c85ac731a0
commit
dc5c8006f6
|
@ -87,6 +87,7 @@ int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func,
|
||||||
void *arg, int64_t start);
|
void *arg, int64_t start);
|
||||||
|
|
||||||
void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
|
void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
|
||||||
|
void bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj);
|
||||||
void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
|
void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
|
||||||
dmu_tx_t *tx);
|
dmu_tx_t *tx);
|
||||||
|
|
||||||
|
|
|
@ -663,14 +663,13 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
|
||||||
}
|
}
|
||||||
|
|
||||||
VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
|
VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
|
||||||
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
|
|
||||||
|
|
||||||
if (bpobj_is_empty(&subbpo)) {
|
if (bpobj_is_empty(&subbpo)) {
|
||||||
/* No point in having an empty subobj. */
|
/* No point in having an empty subobj. */
|
||||||
bpobj_close(&subbpo);
|
bpobj_close(&subbpo);
|
||||||
bpobj_free(bpo->bpo_os, subobj, tx);
|
bpobj_free(bpo->bpo_os, subobj, tx);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
|
||||||
|
|
||||||
mutex_enter(&bpo->bpo_lock);
|
mutex_enter(&bpo->bpo_lock);
|
||||||
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
|
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
|
||||||
|
@ -780,6 +779,68 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Prefetch metadata required for bpobj_enqueue_subobj().
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj)
|
||||||
|
{
|
||||||
|
dmu_object_info_t doi;
|
||||||
|
bpobj_t subbpo;
|
||||||
|
uint64_t subsubobjs;
|
||||||
|
boolean_t copy_subsub = B_TRUE;
|
||||||
|
boolean_t copy_bps = B_TRUE;
|
||||||
|
|
||||||
|
ASSERT(bpobj_is_open(bpo));
|
||||||
|
ASSERT(subobj != 0);
|
||||||
|
|
||||||
|
if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0)
|
||||||
|
return;
|
||||||
|
if (bpobj_is_empty(&subbpo)) {
|
||||||
|
bpobj_close(&subbpo);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
subsubobjs = subbpo.bpo_phys->bpo_subobjs;
|
||||||
|
bpobj_close(&subbpo);
|
||||||
|
|
||||||
|
if (subsubobjs != 0) {
|
||||||
|
if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0)
|
||||||
|
return;
|
||||||
|
if (doi.doi_max_offset > doi.doi_data_block_size)
|
||||||
|
copy_subsub = B_FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0)
|
||||||
|
return;
|
||||||
|
if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub)
|
||||||
|
copy_bps = B_FALSE;
|
||||||
|
|
||||||
|
if (copy_subsub && subsubobjs != 0) {
|
||||||
|
if (bpo->bpo_phys->bpo_subobjs) {
|
||||||
|
dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
|
||||||
|
bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
|
||||||
|
ZIO_PRIORITY_ASYNC_READ);
|
||||||
|
}
|
||||||
|
dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1,
|
||||||
|
ZIO_PRIORITY_ASYNC_READ);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (copy_bps) {
|
||||||
|
dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
|
||||||
|
bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1,
|
||||||
|
ZIO_PRIORITY_ASYNC_READ);
|
||||||
|
dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1,
|
||||||
|
ZIO_PRIORITY_ASYNC_READ);
|
||||||
|
} else if (bpo->bpo_phys->bpo_subobjs) {
|
||||||
|
dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
|
||||||
|
bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
|
||||||
|
ZIO_PRIORITY_ASYNC_READ);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
|
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
|
||||||
dmu_tx_t *tx)
|
dmu_tx_t *tx)
|
||||||
|
|
|
@ -438,6 +438,18 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Prefetch metadata required for dle_enqueue_subobj().
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
|
||||||
|
uint64_t obj)
|
||||||
|
{
|
||||||
|
if (dle->dle_bpobj.bpo_object !=
|
||||||
|
dmu_objset_pool(dl->dl_os)->dp_empty_bpobj)
|
||||||
|
bpobj_prefetch_subobj(&dle->dle_bpobj, obj);
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
|
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
|
||||||
dmu_tx_t *tx)
|
dmu_tx_t *tx)
|
||||||
|
@ -810,6 +822,27 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
|
||||||
dle_enqueue_subobj(dl, dle, obj, tx);
|
dle_enqueue_subobj(dl, dle, obj, tx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Prefetch metadata required for dsl_deadlist_insert_bpobj().
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth)
|
||||||
|
{
|
||||||
|
dsl_deadlist_entry_t dle_tofind;
|
||||||
|
dsl_deadlist_entry_t *dle;
|
||||||
|
avl_index_t where;
|
||||||
|
|
||||||
|
ASSERT(MUTEX_HELD(&dl->dl_lock));
|
||||||
|
|
||||||
|
dsl_deadlist_load_tree(dl);
|
||||||
|
|
||||||
|
dle_tofind.dle_mintxg = birth;
|
||||||
|
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
|
||||||
|
if (dle == NULL)
|
||||||
|
dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
|
||||||
|
dle_prefetch_subobj(dl, dle, obj);
|
||||||
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||||
dmu_tx_t *tx)
|
dmu_tx_t *tx)
|
||||||
|
@ -826,12 +859,12 @@ dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||||
void
|
void
|
||||||
dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
|
dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
|
||||||
{
|
{
|
||||||
zap_cursor_t zc;
|
zap_cursor_t zc, pzc;
|
||||||
zap_attribute_t za;
|
zap_attribute_t za, pza;
|
||||||
dmu_buf_t *bonus;
|
dmu_buf_t *bonus;
|
||||||
dsl_deadlist_phys_t *dlp;
|
dsl_deadlist_phys_t *dlp;
|
||||||
dmu_object_info_t doi;
|
dmu_object_info_t doi;
|
||||||
int error;
|
int error, perror, i;
|
||||||
|
|
||||||
VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));
|
VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));
|
||||||
if (doi.doi_type == DMU_OT_BPOBJ) {
|
if (doi.doi_type == DMU_OT_BPOBJ) {
|
||||||
|
@ -843,15 +876,32 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
|
||||||
}
|
}
|
||||||
|
|
||||||
mutex_enter(&dl->dl_lock);
|
mutex_enter(&dl->dl_lock);
|
||||||
|
/*
|
||||||
|
* Prefetch up to 128 deadlists first and then more as we progress.
|
||||||
|
* The limit is a balance between ARC use and diminishing returns.
|
||||||
|
*/
|
||||||
|
for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0;
|
||||||
|
(perror = zap_cursor_retrieve(&pzc, &pza)) == 0 && i < 128;
|
||||||
|
zap_cursor_advance(&pzc), i++) {
|
||||||
|
dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer,
|
||||||
|
zfs_strtonum(pza.za_name, NULL));
|
||||||
|
}
|
||||||
for (zap_cursor_init(&zc, dl->dl_os, obj);
|
for (zap_cursor_init(&zc, dl->dl_os, obj);
|
||||||
(error = zap_cursor_retrieve(&zc, &za)) == 0;
|
(error = zap_cursor_retrieve(&zc, &za)) == 0;
|
||||||
zap_cursor_advance(&zc)) {
|
zap_cursor_advance(&zc)) {
|
||||||
uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
|
uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
|
||||||
dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
|
dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
|
||||||
VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
|
VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
|
||||||
|
if (perror == 0) {
|
||||||
|
dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer,
|
||||||
|
zfs_strtonum(pza.za_name, NULL));
|
||||||
|
zap_cursor_advance(&pzc);
|
||||||
|
perror = zap_cursor_retrieve(&pzc, &pza);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
VERIFY3U(error, ==, ENOENT);
|
VERIFY3U(error, ==, ENOENT);
|
||||||
zap_cursor_fini(&zc);
|
zap_cursor_fini(&zc);
|
||||||
|
zap_cursor_fini(&pzc);
|
||||||
|
|
||||||
VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
|
VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
|
||||||
dlp = bonus->db_data;
|
dlp = bonus->db_data;
|
||||||
|
@ -869,8 +919,9 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
|
||||||
dmu_tx_t *tx)
|
dmu_tx_t *tx)
|
||||||
{
|
{
|
||||||
dsl_deadlist_entry_t dle_tofind;
|
dsl_deadlist_entry_t dle_tofind;
|
||||||
dsl_deadlist_entry_t *dle;
|
dsl_deadlist_entry_t *dle, *pdle;
|
||||||
avl_index_t where;
|
avl_index_t where;
|
||||||
|
int i;
|
||||||
|
|
||||||
ASSERT(!dl->dl_oldfmt);
|
ASSERT(!dl->dl_oldfmt);
|
||||||
|
|
||||||
|
@ -882,11 +933,23 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
|
||||||
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
|
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
|
||||||
if (dle == NULL)
|
if (dle == NULL)
|
||||||
dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
|
dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
|
||||||
|
/*
|
||||||
|
* Prefetch up to 128 deadlists first and then more as we progress.
|
||||||
|
* The limit is a balance between ARC use and diminishing returns.
|
||||||
|
*/
|
||||||
|
for (pdle = dle, i = 0; pdle && i < 128; ) {
|
||||||
|
bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
|
||||||
|
pdle = AVL_NEXT(&dl->dl_tree, pdle);
|
||||||
|
}
|
||||||
while (dle) {
|
while (dle) {
|
||||||
uint64_t used, comp, uncomp;
|
uint64_t used, comp, uncomp;
|
||||||
dsl_deadlist_entry_t *dle_next;
|
dsl_deadlist_entry_t *dle_next;
|
||||||
|
|
||||||
bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
|
bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
|
||||||
|
if (pdle) {
|
||||||
|
bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
|
||||||
|
pdle = AVL_NEXT(&dl->dl_tree, pdle);
|
||||||
|
}
|
||||||
|
|
||||||
VERIFY0(bpobj_space(&dle->dle_bpobj,
|
VERIFY0(bpobj_space(&dle->dle_bpobj,
|
||||||
&used, &comp, &uncomp));
|
&used, &comp, &uncomp));
|
||||||
|
|
Loading…
Reference in New Issue