diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h
index 16e403526c..2bca0a82e5 100644
--- a/include/sys/bpobj.h
+++ b/include/sys/bpobj.h
@@ -87,6 +87,7 @@ int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func,
     void *arg, int64_t start);
 
 void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
+void bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj);
 void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx);
 
diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c
index 68f534c6b1..a8e9309d28 100644
--- a/module/zfs/bpobj.c
+++ b/module/zfs/bpobj.c
@@ -663,14 +663,13 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
 	}
 
 	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
-	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
-
 	if (bpobj_is_empty(&subbpo)) {
 		/* No point in having an empty subobj. */
 		bpobj_close(&subbpo);
 		bpobj_free(bpo->bpo_os, subobj, tx);
 		return;
 	}
+	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
 
 	mutex_enter(&bpo->bpo_lock);
 	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
@@ -780,6 +779,73 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
 
 }
 
+/*
+ * Prefetch metadata required for bpobj_enqueue_subobj().
+ *
+ * This is best effort: nothing is prefetched and no error is reported if
+ * the subobj is the pool's empty bpobj, is itself empty, or cannot be
+ * opened or inspected.  What gets prefetched depends on whether the subobj
+ * and its bpo_subobjs object each fit within a single data block.
+ */
+void
+bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj)
+{
+	dmu_object_info_t doi;
+	bpobj_t subbpo;
+	uint64_t subsubobjs;
+	boolean_t copy_subsub = B_TRUE;
+	boolean_t copy_bps = B_TRUE;
+
+	ASSERT(bpobj_is_open(bpo));
+	ASSERT(subobj != 0);
+
+	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj)
+		return;
+
+	if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0)
+		return;
+	if (bpobj_is_empty(&subbpo)) {
+		bpobj_close(&subbpo);
+		return;
+	}
+	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
+	bpobj_close(&subbpo);
+
+	if (subsubobjs != 0) {
+		if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0)
+			return;
+		if (doi.doi_max_offset > doi.doi_data_block_size)
+			copy_subsub = B_FALSE;
+	}
+
+	if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0)
+		return;
+	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub)
+		copy_bps = B_FALSE;
+
+	if (copy_subsub && subsubobjs != 0) {
+		if (bpo->bpo_phys->bpo_subobjs) {
+			dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
+			    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
+			    ZIO_PRIORITY_ASYNC_READ);
+		}
+		dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1,
+		    ZIO_PRIORITY_ASYNC_READ);
+	}
+
+	if (copy_bps) {
+		dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
+		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1,
+		    ZIO_PRIORITY_ASYNC_READ);
+		dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1,
+		    ZIO_PRIORITY_ASYNC_READ);
+	} else if (bpo->bpo_phys->bpo_subobjs) {
+		dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
+		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
+		    ZIO_PRIORITY_ASYNC_READ);
+	}
+}
+
 void
 bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c
index 7681b735ec..1b2d8b92f2 100644
--- a/module/zfs/dsl_deadlist.c
+++ b/module/zfs/dsl_deadlist.c
@@ -438,6 +438,19 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
 	}
 }
 
+/*
+ * Prefetch metadata required for dle_enqueue_subobj().
+ * Nothing is prefetched when the entry's bpobj is the pool's empty bpobj.
+ */
+static void
+dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+    uint64_t obj)
+{
+	if (dle->dle_bpobj.bpo_object !=
+	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj)
+		bpobj_prefetch_subobj(&dle->dle_bpobj, obj);
+}
+
 void
 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
@@ -809,6 +822,29 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
 	dle_enqueue_subobj(dl, dle, obj, tx);
 }
 
+/*
+ * Prefetch metadata required for dsl_deadlist_insert_bpobj().
+ * The caller must hold dl->dl_lock.  The entry is looked up by birth,
+ * falling back to the nearest entry before it.
+ */
+static void
+dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle;
+	avl_index_t where;
+
+	ASSERT(MUTEX_HELD(&dl->dl_lock));
+
+	dsl_deadlist_load_tree(dl);
+
+	dle_tofind.dle_mintxg = birth;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	if (dle == NULL)
+		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+	dle_prefetch_subobj(dl, dle, obj);
+}
+
 static int
 dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
@@ -825,12 +861,12 @@ dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
 void
 dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
 {
-	zap_cursor_t zc;
-	zap_attribute_t za;
+	zap_cursor_t zc, pzc;
+	zap_attribute_t za, pza;
 	dmu_buf_t *bonus;
 	dsl_deadlist_phys_t *dlp;
 	dmu_object_info_t doi;
-	int error;
+	int error, perror, i;
 
 	VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));
 	if (doi.doi_type == DMU_OT_BPOBJ) {
@@ -842,15 +878,33 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
 	}
 
 	mutex_enter(&dl->dl_lock);
+	/*
+	 * Prefetch up to 128 deadlists first and then more as we progress.
+	 * The limit is a balance between ARC use and diminishing returns.
+	 */
+	for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0;
+	    (perror = zap_cursor_retrieve(&pzc, &pza)) == 0 && i < 128;
+	    zap_cursor_advance(&pzc), i++) {
+		dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer,
+		    zfs_strtonum(pza.za_name, NULL));
+	}
 	for (zap_cursor_init(&zc, dl->dl_os, obj);
 	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
 		dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
 		VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
+		/* Keep the prefetch cursor running ahead of the merge. */
+		if (perror == 0) {
+			dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer,
+			    zfs_strtonum(pza.za_name, NULL));
+			zap_cursor_advance(&pzc);
+			perror = zap_cursor_retrieve(&pzc, &pza);
+		}
 	}
 	VERIFY3U(error, ==, ENOENT);
 	zap_cursor_fini(&zc);
+	zap_cursor_fini(&pzc);
 
 	VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
 	dlp = bonus->db_data;
@@ -868,8 +922,9 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
     dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t dle_tofind;
-	dsl_deadlist_entry_t *dle;
+	dsl_deadlist_entry_t *dle, *pdle;
 	avl_index_t where;
+	int i;
 
 	ASSERT(!dl->dl_oldfmt);
 
@@ -881,11 +936,24 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
+	/*
+	 * Prefetch up to 128 deadlists first and then more as we progress.
+	 * The limit is a balance between ARC use and diminishing returns.
+	 */
+	for (pdle = dle, i = 0; pdle && i < 128; i++) {
+		bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
+		pdle = AVL_NEXT(&dl->dl_tree, pdle);
+	}
 	while (dle) {
 		uint64_t used, comp, uncomp;
 		dsl_deadlist_entry_t *dle_next;
 
 		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
+		/* Keep prefetching ahead of the entry being enqueued. */
+		if (pdle) {
+			bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
+			pdle = AVL_NEXT(&dl->dl_tree, pdle);
+		}
 
 		VERIFY0(bpobj_space(&dle->dle_bpobj,
 		    &used, &comp, &uncomp));