From d63f5d7e50b65c76d9a8b79db0b66ebb6a49742c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 18 Jun 2024 14:11:11 +1000 Subject: [PATCH] zdb: rework DDT block count and leak check to just count the blocks The upcoming dedup features break the long held assumption that all blocks on disk with a 'D' dedup bit will always be present in the DDT, or will have the same set of DVA allocations on disk as in the DDT. If the DDT is no longer a complete picture of all the dedup blocks that will be and should be on disk, then it does us no good to walk and prime it up front, since it won't necessarily match up with every block we'll see anyway. Instead, we rework things here to be more like the BRT checks. When we see a dedup'd block, we look it up in the DDT, consume a refcount, and for the second-or-later instances, count them as duplicates. The DDT and BRT are moved ahead of the space accounting. This will become important for the "flat" feature, which may need to count a modified version of the block. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Co-authored-by: Allan Jude Co-authored-by: Don Brady Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15892 --- cmd/zdb/zdb.c | 315 ++++++++++++++++++++++++++++------------------ include/sys/ddt.h | 2 +- module/zfs/ddt.c | 8 +- module/zfs/zio.c | 4 +- 4 files changed, 200 insertions(+), 129 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index dec70c60ce..fcf0e47797 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -33,7 +33,7 @@ * under sponsorship from the FreeBSD Foundation. * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome - * Copyright (c) 2023, Klara Inc. + * Copyright (c) 2023, 2024, Klara Inc. * Copyright (c) 2023, Rob Norris */ @@ -3287,9 +3287,46 @@ fuid_table_destroy(void) } } +/* + * Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on + * a live pool are normally cleaned up during ddt_sync(). 
We can't do that (and + * wouldn't want to anyway), but if we don't clean up, the presence of stuff on + * ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves. + * + * Note that this is not a particularly efficient way to do this, but + * ddt_remove() is the only public method that can do the work we need, and it + * requires the right locks and etc to do the job. This is only ever called + * during zdb shutdown so efficiency is not especially important. + */ +static void +zdb_ddt_cleanup(spa_t *spa) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + ddt_enter(ddt); + ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next; + while (dde) { + next = AVL_NEXT(&ddt->ddt_tree, dde); + memset(&dde->dde_lead_zio, 0, + sizeof (dde->dde_lead_zio)); + ddt_remove(ddt, dde); + dde = next; + } + ddt_exit(ddt); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } +} + static void zdb_exit(int reason) { + if (spa != NULL) + zdb_ddt_cleanup(spa); + if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { @@ -5633,7 +5670,6 @@ static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { - uint64_t refcnt = 0; int i; ASSERT(type < ZDB_OT_TOTAL); @@ -5641,8 +5677,144 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; + /* + * This flag controls if we will issue a claim for the block while + * counting it, to ensure that all blocks are referenced in space maps. + * We don't issue claims if we're not doing leak tracking, because it's + * expensive if the user isn't interested. We also don't claim the + * second or later occurrences of cloned or dedup'd blocks, because we + * already claimed them the first time. 
+ */ + boolean_t do_claim = !dump_opt['L']; + spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); + if (BP_GET_DEDUP(bp)) { + /* + * Dedup'd blocks are special. We need to count them, so we can + * later uncount them when reporting leaked space, and we must + * only claim them once. + * + * We use the existing dedup system to track what we've seen. + * The first time we see a block, we do a ddt_lookup() to see + * if it exists in the DDT. If we're doing leak tracking, we + * claim the block at this time. + * + * Each time we see a block, we reduce the refcount in the + * entry by one, and add to the size and count of dedup'd + * blocks to report at the end. + */ + + ddt_t *ddt = ddt_select(zcb->zcb_spa, bp); + + ddt_enter(ddt); + + /* + * Find the block. This will create the entry in memory, but + * we'll know if that happened by its refcount. + */ + ddt_entry_t *dde = ddt_lookup(ddt, bp); + + /* + * ddt_lookup() can only return NULL if this block didn't exist + * in the DDT and creating it would take the DDT over its + * quota. Since we got the block from disk, it must exist in + * the DDT, so this can't happen. + */ + VERIFY3P(dde, !=, NULL); + + /* Get the phys for this variant */ + ddt_phys_t *ddp = ddt_phys_select(dde, bp); + VERIFY3P(ddp, !=, NULL); + + /* + * This entry may have multiple sets of DVAs. We must claim + * each set the first time we see them in a real block on disk, + * or count them on subsequent occurrences. We don't have a + * convenient way to track the first time we see each variant, + * so we repurpose dde_lead_zio[] as a per-phys "seen" flag. We + * can do this safely in zdb because it never writes, so it + * will never have a writing zio for this block in that + * pointer. + */ + + /* + * Work out which dde_phys index was used, get the seen flag, + * and update it if necessary. 
+ */ + uint_t idx = + ((uint_t)((uintptr_t)ddp - (uintptr_t)dde->dde_phys)) / + sizeof (ddt_phys_t); + VERIFY3P(ddp, ==, &dde->dde_phys[idx]); + boolean_t seen = (boolean_t)(uintptr_t)dde->dde_lead_zio[idx]; + if (!seen) + dde->dde_lead_zio[idx] = (zio_t *)(uintptr_t)B_TRUE; + + /* Consume a reference for this block. */ + VERIFY3U(ddt_phys_total_refcnt(dde), >, 0); + ddt_phys_decref(ddp); + + if (seen) { + /* + * The second or later time we see this block, + * it's a duplicate and we count it. + */ + zcb->zcb_dedup_asize += BP_GET_ASIZE(bp); + zcb->zcb_dedup_blocks++; + + /* Already claimed, don't do it again. */ + do_claim = B_FALSE; + } + + ddt_exit(ddt); + } else if (zcb->zcb_brt_is_active && + brt_maybe_exists(zcb->zcb_spa, bp)) { + /* + * Cloned blocks are special. We need to count them, so we can + * later uncount them when reporting leaked space, and we must + * only claim them once. + * + * To do this, we keep our own in-memory BRT. For each block + * we haven't seen before, we look it up in the real BRT and + * if it's there, we note it and its refcount then proceed as + * normal. If we see the block again, we count it as a clone + * and then give it no further consideration. + */ + zdb_brt_entry_t zbre_search, *zbre; + avl_index_t where; + + zbre_search.zbre_dva = bp->blk_dva[0]; + zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); + if (zbre == NULL) { + /* Not seen before; track it */ + uint64_t refcnt = + brt_entry_get_refcount(zcb->zcb_spa, bp); + if (refcnt > 0) { + zbre = umem_zalloc(sizeof (zdb_brt_entry_t), + UMEM_NOFAIL); + zbre->zbre_dva = bp->blk_dva[0]; + zbre->zbre_refcount = refcnt; + avl_insert(&zcb->zcb_brt, zbre, where); + } + } else { + /* + * Second or later occurrence, count it and take a + * refcount. 
+ */ + zcb->zcb_clone_asize += BP_GET_ASIZE(bp); + zcb->zcb_clone_blocks++; + + zbre->zbre_refcount--; + if (zbre->zbre_refcount == 0) { + avl_remove(&zcb->zcb_brt, zbre); + umem_free(zbre, sizeof (zdb_brt_entry_t)); + } + + /* Already claimed, don't do it again. */ + do_claim = B_FALSE; + } + } + for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? type : ZDB_OT_TOTAL; @@ -5745,71 +5917,12 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); zcb->zcb_asize_total += BP_GET_ASIZE(bp); - if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) { - /* - * Cloned blocks are special. We need to count them, so we can - * later uncount them when reporting leaked space, and we must - * only claim them them once. - * - * To do this, we keep our own in-memory BRT. For each block - * we haven't seen before, we look it up in the real BRT and - * if its there, we note it and its refcount then proceed as - * normal. If we see the block again, we count it as a clone - * and then give it no further consideration. 
- */ - zdb_brt_entry_t zbre_search, *zbre; - avl_index_t where; - - zbre_search.zbre_dva = bp->blk_dva[0]; - zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); - if (zbre != NULL) { - zcb->zcb_clone_asize += BP_GET_ASIZE(bp); - zcb->zcb_clone_blocks++; - - zbre->zbre_refcount--; - if (zbre->zbre_refcount == 0) { - avl_remove(&zcb->zcb_brt, zbre); - umem_free(zbre, sizeof (zdb_brt_entry_t)); - } - return; - } - - uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp); - if (crefcnt > 0) { - zbre = umem_zalloc(sizeof (zdb_brt_entry_t), - UMEM_NOFAIL); - zbre->zbre_dva = bp->blk_dva[0]; - zbre->zbre_refcount = crefcnt; - avl_insert(&zcb->zcb_brt, zbre, where); - } - } - - if (dump_opt['L']) + if (!do_claim) return; - if (BP_GET_DEDUP(bp)) { - ddt_t *ddt; - ddt_entry_t *dde; - - ddt = ddt_select(zcb->zcb_spa, bp); - ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_FALSE); - - if (dde == NULL) { - refcnt = 0; - } else { - ddt_phys_t *ddp = ddt_phys_select(dde, bp); - ddt_phys_decref(ddp); - refcnt = ddp->ddp_refcnt; - if (ddt_phys_total_refcnt(dde) == 0) - ddt_remove(ddt, dde); - } - ddt_exit(ddt); - } - - VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, - refcnt ? 
0 : spa_min_claim_txg(zcb->zcb_spa), - bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); + VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa, + spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL, + ZIO_FLAG_CANFAIL))); } static void @@ -6120,49 +6233,6 @@ zdb_load_obsolete_counts(vdev_t *vd) return (counts); } -static void -zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) -{ - ddt_bookmark_t ddb = {0}; - ddt_entry_t dde; - int error; - int p; - - ASSERT(!dump_opt['L']); - - while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { - blkptr_t blk; - ddt_phys_t *ddp = dde.dde_phys; - - if (ddb.ddb_class == DDT_CLASS_UNIQUE) - return; - - ASSERT(ddt_phys_total_refcnt(&dde) > 1); - ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; - VERIFY(ddt); - - for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0) - continue; - ddt_bp_create(ddb.ddb_checksum, - &dde.dde_key, ddp, &blk); - if (p == DDT_PHYS_DITTO) { - zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); - } else { - zcb->zcb_dedup_asize += - BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); - zcb->zcb_dedup_blocks++; - } - } - - ddt_enter(ddt); - VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); - ddt_exit(ddt); - } - - ASSERT(error == ENOENT); -} - typedef struct checkpoint_sm_exclude_entry_arg { vdev_t *cseea_vd; uint64_t cseea_checkpoint_size; @@ -6546,10 +6616,6 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, increment_indirect_mapping_cb, zcb, NULL); } - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - zdb_ddt_leak_init(spa, zcb); - spa_config_exit(spa, SCL_CONFIG, FTAG); } static boolean_t @@ -6814,6 +6880,8 @@ dump_block_stats(spa_t *spa) int e, c, err; bp_embedded_type_t i; + ddt_prefetch_all(spa); + zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { @@ -6938,7 +7006,6 @@ dump_block_stats(spa_t *spa) (u_longlong_t)total_alloc, (dump_opt['L']) ? 
"unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); - leaks = B_TRUE; } if (tzb->zb_count == 0) { @@ -8022,16 +8089,21 @@ dump_mos_leaks(spa_t *spa) mos_leak_vdev(spa->spa_root_vdev); - for (uint64_t class = 0; class < DDT_CLASSES; class++) { - for (uint64_t type = 0; type < DDT_TYPES; type++) { - for (uint64_t cksum = 0; - cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { - ddt_t *ddt = spa->spa_ddt[cksum]; - if (!ddt) - continue; + for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + + /* DDT store objects */ + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; + class++) { mos_obj_refd(ddt->ddt_object[type][class]); } } + + /* FDT container */ + mos_obj_refd(ddt->ddt_dir_object); } if (spa->spa_brt != NULL) { @@ -9624,6 +9696,9 @@ retry_lookup: } fini: + if (spa != NULL) + zdb_ddt_cleanup(spa); + if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 02d0cf5daa..20bae8ce0f 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -253,7 +253,7 @@ extern void ddt_enter(ddt_t *ddt); extern void ddt_exit(ddt_t *ddt); extern void ddt_init(void); extern void ddt_fini(void); -extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); +extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp); extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); extern void ddt_prefetch_all(spa_t *spa); diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 7e2010c423..84d7800cbc 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -715,7 +715,7 @@ ddt_prefetch_all(spa_t *spa) static int ddt_configure(ddt_t *ddt, boolean_t new); ddt_entry_t * -ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) +ddt_lookup(ddt_t *ddt, const blkptr_t *bp) { spa_t *spa = ddt->ddt_spa; ddt_key_t search; @@ -767,10 +767,6 @@ 
ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) return (dde); } - /* Not found. */ - if (!add) - return (NULL); - /* Time to make a new entry. */ dde = ddt_alloc(&search); avl_insert(&ddt->ddt_tree, dde, where); @@ -1502,7 +1498,7 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ddt = ddt_select(spa, bp); ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_TRUE); + dde = ddt_lookup(ddt, bp); /* Can be NULL if the entry for this block was pruned. */ if (dde == NULL) { diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 6d08d4bd16..5810e811a3 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3518,7 +3518,7 @@ zio_ddt_write(zio_t *zio) ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_TRUE); + dde = ddt_lookup(ddt, bp); if (dde == NULL) { /* DDT size is over its quota so no new entries */ zp->zp_dedup = B_FALSE; @@ -3598,7 +3598,7 @@ zio_ddt_free(zio_t *zio) ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); - freedde = dde = ddt_lookup(ddt, bp, B_TRUE); + freedde = dde = ddt_lookup(ddt, bp); if (dde) { ddp = ddt_phys_select(dde, bp); if (ddp)