From 9553c533a635bf39697b0b54f4751580f6d2590d Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Wed, 31 Oct 2018 09:58:17 -0700 Subject: [PATCH] bpobj_enqueue_subobj() should copy small subobj's When we delete a snapshot, we consolidate some bpobj's together because we no longer need to keep their entries in separate buckets. This is done in constant time by including the "sub" bpobj by reference in the parent bpobj. After many snapshots have been deleted, we may have many sub-bpobj's. Usually, most sub-bpobj's don't contain many BP's. Compared to this small payload, the sub-bpobj is relatively heavyweight since it is a object in the MOS. A common scenario on a long-lived pool is for the vast majority of MOS objects to be small sub-bpobj's. To improve this situation, when consolidating bpobj's together, bpobj_enqueue_subobj() can copy the contents of small bpobj's into the parent, and then delete the enqueued bpobj, rather than including it by reference. Since this copying is limited in size (to one block), the consolidation is still constant time, though with a larger constant due to reading in the one block of the enqueued bpobj. This idea and mechanism are similar to how we handle "sub-subobj's". When including a sub-bpobj by reference, if the sub-bpobj itself has less than a block of sub-sub-bpobj's, the list of sub-sub-bpobj's is copied to the parent bpobj's list of sub-bpobj's. Reviewed-by: Serapheim Dimitropoulos Reviewed-by: Brian Behlendorf Reviewed-by: Paul Zuchowski Signed-off-by: Matthew Ahrens Closes #8053 Issue #7908 --- module/zfs/bpobj.c | 218 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 177 insertions(+), 41 deletions(-) diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 66ab84df58..94bd1fd8f0 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -377,11 +377,98 @@ bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); } +/* + * Logically add subobj's contents to the parent bpobj. + * + * In the most general case, this is accomplished in constant time by adding + * a reference to subobj. This case is used when enqueuing a large subobj: + * +--------------+ +--------------+ + * | bpobj |----------------------->| subobj list | + * +----+----+----+----+----+ +-----+-----+--+--+ + * | bp | bp | bp | bp | bp | | obj | obj | obj | + * +----+----+----+----+----+ +-----+-----+-----+ + * + * +--------------+ +--------------+ + * | sub-bpobj |----------------------> | subsubobj | + * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+ + * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj | + * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+ + * + * Result: sub-bpobj added to parent's subobj list. + * +--------------+ +--------------+ + * | bpobj |----------------------->| subobj list | + * +----+----+----+----+----+ +-----+-----+--+--+-----+ + * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | + * +----+----+----+----+----+ +-----+-----+-----+--|--+ + * | + * /-----------------------------------------------------/ + * v + * +--------------+ +--------------+ + * | sub-bpobj |----------------------> | subsubobj | + * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+ + * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj | + * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+ + * + * + * In a common case, the subobj is small: its bp's and its list of subobj's + * are each stored in a single block. In this case we copy the subobj's + * contents to the parent: + * +--------------+ +--------------+ + * | bpobj |----------------------->| subobj list | + * +----+----+----+----+----+ +-----+-----+--+--+ + * | bp | bp | bp | bp | bp | | obj | obj | obj | + * +----+----+----+----+----+ +-----+-----+-----+ + * ^ ^ + * +--------------+ | +--------------+ | + * | sub-bpobj |---------^------------> | subsubobj | ^ + * +----+----+----+ | +-----+-----+--+ | + * | BP | BP |-->-->-->-->-/ | OBJ | OBJ |-->-/ + * +----+----+ +-----+-----+ + * + * Result: subobj destroyed, contents copied to parent: + * +--------------+ +--------------+ + * | bpobj |----------------------->| subobj list | + * +----+----+----+----+----+----+----+ +-----+-----+--+--+-----+-----+ + * | bp | bp | bp | bp | bp | BP | BP | | obj | obj | obj | OBJ | OBJ | + * +----+----+----+----+----+----+----+ +-----+-----+-----+-----+-----+ + * + * + * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's + * but retain the sub-bpobj: + * +--------------+ +--------------+ + * | bpobj |----------------------->| subobj list | + * +----+----+----+----+----+ +-----+-----+--+--+ + * | bp | bp | bp | bp | bp | | obj | obj | obj | + * +----+----+----+----+----+ +-----+-----+-----+ + * ^ + * +--------------+ +--------------+ | + * | sub-bpobj |----------------------> | subsubobj | ^ + * +----+----+----+----+---------+----+ +-----+-----+--+ | + * | bp | bp | bp | bp | ... | bp | | OBJ | OBJ |-->-/ + * +----+----+----+----+---------+----+ +-----+-----+ + * + * Result: sub-sub-bpobjs and subobj added to parent's subobj list. + * +--------------+ +--------------+ + * | bpobj |-------------------->| subobj list | + * +----+----+----+----+----+ +-----+-----+--+--+-----+-----+------+ + * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | OBJ | OBJ* | + * +----+----+----+----+----+ +-----+-----+-----+-----+-----+--|---+ + * | + * /--------------------------------------------------------------/ + * v + * +--------------+ + * | sub-bpobj | + * +----+----+----+----+---------+----+ + * | bp | bp | bp | bp | ... | bp | + * +----+----+----+----+---------+----+ + */ void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) { bpobj_t subbpo; uint64_t used, comp, uncomp, subsubobjs; + boolean_t copy_subsub = B_TRUE; + boolean_t copy_bps = B_TRUE; ASSERT(bpobj_is_open(bpo)); ASSERT(subobj != 0); @@ -406,61 +493,110 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) mutex_enter(&bpo->bpo_lock); dmu_buf_will_dirty(bpo->bpo_dbuf, tx); - if (bpo->bpo_phys->bpo_subobjs == 0) { - bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, - DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, - DMU_OT_NONE, 0, tx); + + dmu_object_info_t doi; + + if (bpo->bpo_phys->bpo_subobjs != 0) { + ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + &doi)); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); } - ASSERTV(dmu_object_info_t doi); - ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi)); - ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); - - dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - sizeof (subobj), &subobj, tx); - bpo->bpo_phys->bpo_num_subobjs++; - /* * If subobj has only one block of subobjs, then move subobj's - * subobjs to bpo's subobj list directly. This reduces - * recursion in bpobj_iterate due to nested subobjs. + * subobjs to bpo's subobj list directly. This reduces recursion in + * bpobj_iterate due to nested subobjs. */ subsubobjs = subbpo.bpo_phys->bpo_subobjs; if (subsubobjs != 0) { - dmu_object_info_t doi; - - VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); - if (doi.doi_max_offset == doi.doi_data_block_size) { - dmu_buf_t *subdb; - uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; - - VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs, - 0, FTAG, &subdb, 0)); - /* - * Make sure that we are not asking dmu_write() - * to write more data than we have in our buffer. - */ - VERIFY3U(subdb->db_size, >=, - numsubsub * sizeof (subobj)); - dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - numsubsub * sizeof (subobj), subdb->db_data, tx); - dmu_buf_rele(subdb, FTAG); - bpo->bpo_phys->bpo_num_subobjs += numsubsub; - - dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); - subbpo.bpo_phys->bpo_subobjs = 0; - VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os, - subsubobjs, tx)); + VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); + if (doi.doi_max_offset > doi.doi_data_block_size) { + copy_subsub = B_FALSE; } } + + /* + * If, in addition to having only one block of subobj's, subobj has + * only one block of bp's, then move subobj's bp's to bpo's bp list + * directly. This reduces recursion in bpobj_iterate due to nested + * subobjs. + */ + VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi)); + if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) { + copy_bps = B_FALSE; + } + + if (copy_subsub && subsubobjs != 0) { + dmu_buf_t *subdb; + uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; + + VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs, + 0, FTAG, &subdb, 0)); + /* + * Make sure that we are not asking dmu_write() + * to write more data than we have in our buffer. + */ + VERIFY3U(subdb->db_size, >=, + numsubsub * sizeof (subobj)); + if (bpo->bpo_phys->bpo_subobjs == 0) { + bpo->bpo_phys->bpo_subobjs = + dmu_object_alloc(bpo->bpo_os, + DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, + DMU_OT_NONE, 0, tx); + } + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + numsubsub * sizeof (subobj), subdb->db_data, tx); + dmu_buf_rele(subdb, FTAG); + bpo->bpo_phys->bpo_num_subobjs += numsubsub; + + dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); + subbpo.bpo_phys->bpo_subobjs = 0; + VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx)); + } + + if (copy_bps) { + dmu_buf_t *bps; + uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs; + + ASSERT(copy_subsub); + VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj, + 0, FTAG, &bps, 0)); + + /* + * Make sure that we are not asking dmu_write() + * to write more data than we have in our buffer. + */ + VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t)); + dmu_write(bpo->bpo_os, bpo->bpo_object, + bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), + numbps * sizeof (blkptr_t), + bps->db_data, tx); + dmu_buf_rele(bps, FTAG); + bpo->bpo_phys->bpo_num_blkptrs += numbps; + + bpobj_close(&subbpo); + VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx)); + } else { + bpobj_close(&subbpo); + if (bpo->bpo_phys->bpo_subobjs == 0) { + bpo->bpo_phys->bpo_subobjs = + dmu_object_alloc(bpo->bpo_os, + DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, + DMU_OT_NONE, 0, tx); + } + + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + sizeof (subobj), &subobj, tx); + bpo->bpo_phys->bpo_num_subobjs++; + } + bpo->bpo_phys->bpo_bytes += used; bpo->bpo_phys->bpo_comp += comp; bpo->bpo_phys->bpo_uncomp += uncomp; mutex_exit(&bpo->bpo_lock); - bpobj_close(&subbpo); } void