From 27e9cb5f8022bef72553cbe12f7ec292535e4c0b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 15 Jun 2023 17:19:41 +1000 Subject: [PATCH] ddt: cleanup the stats & histogram code Both the API and the code were kinda mangled and I was really struggling to follow it. The worst offender was the old ddt_stat_add(); after fixing it up the rest of the changes are mostly knock-on effects and targets of opportunity. Note that the old ddt_stat_add() was safe against overflows - it could produce crazy numbers, but the compiler wouldn't do anything stupid. The assertions in ddt_stat_sub() go a lot of the way to protecting against this; getting in a position where overflows are a problem is definitely a programming error. Also expanding ddt_stat_add() and ddt_histogram_empty() produces less efficient assembly. I'm not bothered about this right now though; these should not be hot functions, and if they are we'll optimise them later. If we have to go back to the old form, we'll comment it like crazy. Finally, I've removed the assertion that the bucket will never be negative, as it will soon be possible to have entries with zero refcounts: an entry for a block that is no longer on the pool, but is on the log waiting to be synced out. It might be better to have a separate bucket for these, since they're still using real space on disk, but ultimately these stats are driving UI, and for now I've chosen to keep them matching how they've looked in the past, as well as match the operator's mental model - pool usage is managed elsewhere. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. 
Closes #15895 --- cmd/zdb/zdb.c | 22 ++++----- include/sys/ddt.h | 8 ++- include/sys/ddt_impl.h | 4 -- module/zfs/ddt.c | 24 +++++++-- module/zfs/ddt_stats.c | 107 +++++++++++++++++++++++++++++------------ 5 files changed, 114 insertions(+), 51 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 142f55b299..250052adfb 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -7357,29 +7357,27 @@ dump_simulated_ddt(spa_t *spa) spa_config_exit(spa, SCL_CONFIG, FTAG); while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { - ddt_stat_t dds; uint64_t refcnt = zdde->zdde_ref_blocks; ASSERT(refcnt != 0); - dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; - dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; - dds.dds_psize = zdde->zdde_ref_psize / refcnt; - dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; + ddt_stat_t *dds = &ddh_total.ddh_stat[highbit64(refcnt) - 1]; - dds.dds_ref_blocks = zdde->zdde_ref_blocks; - dds.dds_ref_lsize = zdde->zdde_ref_lsize; - dds.dds_ref_psize = zdde->zdde_ref_psize; - dds.dds_ref_dsize = zdde->zdde_ref_dsize; + dds->dds_blocks += zdde->zdde_ref_blocks / refcnt; + dds->dds_lsize += zdde->zdde_ref_lsize / refcnt; + dds->dds_psize += zdde->zdde_ref_psize / refcnt; + dds->dds_dsize += zdde->zdde_ref_dsize / refcnt; - ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], - &dds, 0); + dds->dds_ref_blocks += zdde->zdde_ref_blocks; + dds->dds_ref_lsize += zdde->zdde_ref_lsize; + dds->dds_ref_psize += zdde->zdde_ref_psize; + dds->dds_ref_dsize += zdde->zdde_ref_dsize; umem_free(zdde, sizeof (*zdde)); } avl_destroy(&t); - ddt_histogram_stat(&dds_total, &ddh_total); + ddt_histogram_total(&dds_total, &ddh_total); (void) printf("Simulated DDT histogram:\n"); diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 11e09eef3b..2dd18526db 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -318,9 +318,15 @@ extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp, extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, 
boolean_t encrypted); +extern void ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh, + const ddt_lightweight_entry_t *ddlwe); +extern void ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh, + const ddt_lightweight_entry_t *ddlwe); + extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); -extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); +extern void ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh); extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); + extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); extern uint64_t ddt_get_ddt_dsize(spa_t *spa); extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index c4e681fb11..ce4bc559dd 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -77,8 +77,6 @@ typedef struct { extern const ddt_ops_t ddt_zap_ops; -extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg); - /* * These are only exposed so that zdb can access them. Try not to use them * outside of the DDT implementation proper, and if you do, consider moving @@ -95,8 +93,6 @@ extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde); extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); -extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); - extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, char *name); extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 59526394bd..f3b3473261 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -992,7 +992,18 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) /* Flag cleanup required */ dde->dde_flags |= DDE_FLAG_OVERQUOTA; } else if (error == 0) { - ddt_stat_update(ddt, dde, -1ULL); + /* + * The histograms only track inactive (stored) blocks. 
+ * We've just put an entry onto the live list, so we need to + * remove its counts. When it's synced back, it'll be re-added + * to the right one. + */ + ddt_histogram_t *ddh = + &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_histogram_sub_entry(ddt, ddh, &ddlwe); } /* Entry loaded, everyone can proceed now */ @@ -1527,11 +1538,18 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) if (total_refcnt != 0) { dde->dde_type = ntype; dde->dde_class = nclass; - ddt_stat_update(ddt, dde, 0); + if (!ddt_object_exists(ddt, ntype, nclass)) ddt_object_create(ddt, ntype, nclass, tx); VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx)); + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + + ddt_histogram_t *ddh = + &ddt->ddt_histogram[ntype][nclass]; + ddt_histogram_add_entry(ddt, ddh, &ddlwe); + /* * If the class changes, the order that we scan this bp * changes. If it decreases, we could miss it, so @@ -1540,8 +1558,6 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) * traversing.) 
*/ if (nclass < oclass) { - ddt_lightweight_entry_t ddlwe; - DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt, &ddlwe, tx); } diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 6da77bbca5..9316200f21 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -33,24 +33,24 @@ #include static void -ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) +ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, + ddt_stat_t *dds) { spa_t *spa = ddt->ddt_spa; - ddt_key_t *ddk = &dde->dde_key; - uint64_t lsize = DDK_GET_LSIZE(ddk); - uint64_t psize = DDK_GET_PSIZE(ddk); + uint64_t lsize = DDK_GET_LSIZE(&ddlwe->ddlwe_key); + uint64_t psize = DDK_GET_PSIZE(&ddlwe->ddlwe_key); memset(dds, 0, sizeof (*dds)); - for (int p = 0; p < DDT_NPHYS(ddt); p++) { - const ddt_univ_phys_t *ddp = dde->dde_phys; + for (int p = 0; p < ddlwe->ddlwe_nphys; p++) { + const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); if (ddt_phys_birth(ddp, v) == 0) continue; int ndvas = ddt_phys_dva_count(ddp, v, - DDK_GET_CRYPT(&dde->dde_key)); + DDK_GET_CRYPT(&ddlwe->ddlwe_key)); const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ? 
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva; @@ -72,61 +72,108 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) } } -void -ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) +static void +ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src) { - const uint64_t *s = (const uint64_t *)src; - uint64_t *d = (uint64_t *)dst; - uint64_t *d_end = (uint64_t *)(dst + 1); + dst->dds_blocks += src->dds_blocks; + dst->dds_lsize += src->dds_lsize; + dst->dds_psize += src->dds_psize; + dst->dds_dsize += src->dds_dsize; + dst->dds_ref_blocks += src->dds_ref_blocks; + dst->dds_ref_lsize += src->dds_ref_lsize; + dst->dds_ref_psize += src->dds_ref_psize; + dst->dds_ref_dsize += src->dds_ref_dsize; +} - ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ +static void +ddt_stat_sub(ddt_stat_t *dst, const ddt_stat_t *src) +{ + /* This caught more during development than you might expect... */ + ASSERT3U(dst->dds_blocks, >=, src->dds_blocks); + ASSERT3U(dst->dds_lsize, >=, src->dds_lsize); + ASSERT3U(dst->dds_psize, >=, src->dds_psize); + ASSERT3U(dst->dds_dsize, >=, src->dds_dsize); + ASSERT3U(dst->dds_ref_blocks, >=, src->dds_ref_blocks); + ASSERT3U(dst->dds_ref_lsize, >=, src->dds_ref_lsize); + ASSERT3U(dst->dds_ref_psize, >=, src->dds_ref_psize); + ASSERT3U(dst->dds_ref_dsize, >=, src->dds_ref_dsize); - for (int i = 0; i < d_end - d; i++) - d[i] += (s[i] ^ neg) - neg; + dst->dds_blocks -= src->dds_blocks; + dst->dds_lsize -= src->dds_lsize; + dst->dds_psize -= src->dds_psize; + dst->dds_dsize -= src->dds_dsize; + dst->dds_ref_blocks -= src->dds_ref_blocks; + dst->dds_ref_lsize -= src->dds_ref_lsize; + dst->dds_ref_psize -= src->dds_ref_psize; + dst->dds_ref_dsize -= src->dds_ref_dsize; } void -ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) +ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh, + const ddt_lightweight_entry_t *ddlwe) { ddt_stat_t dds; - ddt_histogram_t *ddh; int bucket; - ddt_stat_generate(ddt, dde, &dds); + 
ddt_stat_generate(ddt, ddlwe, &dds); bucket = highbit64(dds.dds_ref_blocks) - 1; - ASSERT3U(bucket, >=, 0); + if (bucket < 0) + return; - ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + ddt_stat_add(&ddh->ddh_stat[bucket], &dds); +} - ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); +void +ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh, + const ddt_lightweight_entry_t *ddlwe) +{ + ddt_stat_t dds; + int bucket; + + ddt_stat_generate(ddt, ddlwe, &dds); + + bucket = highbit64(dds.dds_ref_blocks) - 1; + if (bucket < 0) + return; + + ddt_stat_sub(&ddh->ddh_stat[bucket], &dds); } void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) { for (int h = 0; h < 64; h++) - ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); + ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h]); } void -ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) +ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh) { memset(dds, 0, sizeof (*dds)); for (int h = 0; h < 64; h++) - ddt_stat_add(dds, &ddh->ddh_stat[h], 0); + ddt_stat_add(dds, &ddh->ddh_stat[h]); } boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh) { - const uint64_t *s = (const uint64_t *)ddh; - const uint64_t *s_end = (const uint64_t *)(ddh + 1); + for (int h = 0; h < 64; h++) { + const ddt_stat_t *dds = &ddh->ddh_stat[h]; - while (s < s_end) - if (*s++ != 0) - return (B_FALSE); + if (dds->dds_blocks == 0 && + dds->dds_lsize == 0 && + dds->dds_psize == 0 && + dds->dds_dsize == 0 && + dds->dds_ref_blocks == 0 && + dds->dds_ref_lsize == 0 && + dds->dds_ref_psize == 0 && + dds->dds_ref_dsize == 0) + continue; + + return (B_FALSE); + } return (B_TRUE); } @@ -222,7 +269,7 @@ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); ddt_get_dedup_histogram(spa, ddh_total); - ddt_histogram_stat(dds_total, ddh_total); + ddt_histogram_total(dds_total, ddh_total); kmem_free(ddh_total, sizeof (ddt_histogram_t)); }