diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index 2fc798725e..a7920e6580 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -294,6 +294,8 @@ typedef struct {
 	int32_t ddt_log_flush_rate;	/* rolling log flush rate */
 	int32_t ddt_log_flush_time_rate; /* avg time spent flushing */
 
+	uint64_t ddt_flush_force_txg;	/* flush hard before this txg */
+
 	enum zio_checksum ddt_checksum;	/* checksum algorithm in use */
 	spa_t *ddt_spa;			/* pool this ddt is on */
 	objset_t *ddt_os;		/* ddt objset (always MOS) */
@@ -393,6 +395,9 @@ extern void ddt_create(spa_t *spa);
 extern int ddt_load(spa_t *spa);
 extern void ddt_unload(spa_t *spa);
 extern void ddt_sync(spa_t *spa, uint64_t txg);
+
+extern void ddt_walk_init(spa_t *spa, uint64_t txg);
+extern boolean_t ddt_walk_ready(spa_t *spa);
 extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
     ddt_lightweight_entry_t *ddlwe);
 
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index ce5c4efb51..051005f137 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -183,6 +183,12 @@
  * position on the object even if the object changes, the pool is exported, or
  * OpenZFS is upgraded.
  *
+ * If the "fast_dedup" feature is enabled and the table has a log, the scan
+ * cannot begin until entries on the log are flushed, as the on-disk log has
+ * no concept of a "stable position". Instead, the log flushing process will
+ * enter a more aggressive mode, flushing out as much as is necessary, as
+ * quickly as possible, so that the scan can begin.
+ *
  * ## Interaction with block cloning
  *
  * If block cloning and dedup are both enabled on a pool, BRT will look for the
@@ -1746,6 +1752,16 @@ ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
 		ddt->ddt_flush_min = MAX(
 		    ddt->ddt_log_ingest_rate,
 		    zfs_dedup_log_flush_entries_min);
+
+		/*
+		 * If we've been asked to flush everything in a hurry,
+		 * try to dump as much as possible on this txg. In
+		 * this case we're only limited by time, not amount.
+		 */
+		if (ddt->ddt_flush_force_txg > 0)
+			ddt->ddt_flush_min =
+			    MAX(ddt->ddt_flush_min, avl_numnodes(
+			    &ddt->ddt_log_flushing->ddl_tree));
 	} else {
 		/* We already decided we're done for this txg */
 		return (B_FALSE);
@@ -1856,6 +1872,40 @@ ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
 	return (ddt->ddt_flush_pass == 0);
 }
 
+static inline void
+ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg)
+{
+	/*
+	 * If we're not forcing flush, and not being asked to start, then
+	 * there's nothing more to do.
+	 */
+	if (txg == 0) {
+		/* Update requested; are we currently forcing flush? */
+		if (ddt->ddt_flush_force_txg == 0)
+			return;
+		txg = ddt->ddt_flush_force_txg;
+	}
+
+	/*
+	 * If either of the logs has unflushed entries before the
+	 * wanted txg, set the force txg; otherwise clear it.
+	 */
+
+	if ((!avl_is_empty(&ddt->ddt_log_active->ddl_tree) &&
+	    ddt->ddt_log_active->ddl_first_txg <= txg) ||
+	    (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
+	    ddt->ddt_log_flushing->ddl_first_txg <= txg)) {
+		ddt->ddt_flush_force_txg = txg;
+		return;
+	}
+
+	/*
+	 * Nothing to flush behind the given txg, so we can clear force flush
+	 * state.
+	 */
+	ddt->ddt_flush_force_txg = 0;
+}
+
 static void
 ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
 {
@@ -1881,6 +1931,9 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
 		(void) ddt_log_swap(ddt, tx);
 	}
 
+	/* If force flush is no longer necessary, turn it off. */
+	ddt_flush_force_update_txg(ddt, 0);
+
 	/*
 	 * Update flush rate. This is an exponential weighted moving average of
 	 * the number of entries flushed over recent txgs.
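The keep/clear rule in ddt_flush_force_update_txg() above is compact but subtle, so here is a minimal standalone model of its semantics before the remaining ddt.c hunks. This is plain C, not ZFS code: log_t, table_t and force_update_txg() are hypothetical stand-ins for ddt_log_t, ddt_t and the real function. A txg argument of 0 means "re-evaluate the current force state"; the force txg persists exactly as long as either log still holds entries born at or before the target.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical stand-in for ddt_log_t: just the fields the policy reads. */
typedef struct {
	bool		empty;		/* models avl_is_empty(&ddl_tree) */
	uint64_t	first_txg;	/* models ddl_first_txg */
} log_t;

/* Hypothetical stand-in for ddt_t: both logs plus the force state. */
typedef struct {
	log_t		active;
	log_t		flushing;
	uint64_t	flush_force_txg;	/* 0 = not forcing */
} table_t;

static void
force_update_txg(table_t *t, uint64_t txg)
{
	if (txg == 0) {
		if (t->flush_force_txg == 0)
			return;			/* not forcing; nothing to do */
		txg = t->flush_force_txg;	/* re-check existing target */
	}

	/* Keep forcing while either log has entries at or before txg. */
	if ((!t->active.empty && t->active.first_txg <= txg) ||
	    (!t->flushing.empty && t->flushing.first_txg <= txg)) {
		t->flush_force_txg = txg;
		return;
	}

	t->flush_force_txg = 0;			/* all caught up; stop forcing */
}

int
main(void)
{
	table_t t = {
		.active = { .empty = false, .first_txg = 100 },
		.flushing = { .empty = false, .first_txg = 90 },
		.flush_force_txg = 0,
	};

	force_update_txg(&t, 120);	/* scan wants everything <= txg 120 */
	printf("force txg = %llu\n", (unsigned long long)t.flush_force_txg);

	t.flushing.empty = true;	/* flushing log drained... */
	force_update_txg(&t, 0);	/* ...but the active log still has work */
	printf("force txg = %llu\n", (unsigned long long)t.flush_force_txg);

	t.active.empty = true;		/* both logs drained */
	force_update_txg(&t, 0);	/* force state clears */
	printf("force txg = %llu\n", (unsigned long long)t.flush_force_txg);
	return (0);
}

This prints 120, 120, 0: once requested, the force state survives re-evaluation until both logs are drained past the target txg, which is why ddt_sync_flush_log() can safely call it with 0 every txg.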
@@ -2049,6 +2102,38 @@ ddt_sync(spa_t *spa, uint64_t txg)
 
 	dmu_tx_commit(tx);
 }
 
+void
+ddt_walk_init(spa_t *spa, uint64_t txg)
+{
+	if (txg == 0)
+		txg = spa_syncing_txg(spa);
+
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
+			continue;
+
+		ddt_enter(ddt);
+		ddt_flush_force_update_txg(ddt, txg);
+		ddt_exit(ddt);
+	}
+}
+
+boolean_t
+ddt_walk_ready(spa_t *spa)
+{
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
+			continue;
+
+		if (ddt->ddt_flush_force_txg > 0)
+			return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
+
 int
 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
 {
@@ -2058,6 +2143,10 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
 		ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
 		if (ddt == NULL)
 			continue;
+
+		if (ddt->ddt_flush_force_txg > 0)
+			return (EAGAIN);
+
 		int error = ENOENT;
 
 		if (ddt_object_exists(ddt, ddb->ddb_type, ddb->ddb_class)) {
diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c
index 7e7ff9e5b8..a367d0cd02 100644
--- a/module/zfs/ddt_log.c
+++ b/module/zfs/ddt_log.c
@@ -435,7 +435,8 @@ ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
 	/*
 	 * Swap policy. We swap the logs (and so begin flushing) when the
 	 * active tree grows too large, or when we haven't swapped it in
-	 * some amount of time.
+	 * some amount of time, or if something has requested the logs be
+	 * flushed ASAP (see ddt_walk_init()).
 	 */
 
 	/*
@@ -452,7 +453,10 @@ ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
 	    (ddt->ddt_log_active->ddl_first_txg +
 	    MAX(1, zfs_dedup_log_txg_max));
 
-	if (!(too_large || too_old))
+	const boolean_t force =
+	    ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;
+
+	if (!(too_large || too_old || force))
 		return (B_FALSE);
 
 	ddt_log_t *swap = ddt->ddt_log_active;
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index daf1bd5d63..9d040e1463 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -630,6 +630,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 		zap_cursor_fini(&zc);
 	}
 
+	ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
+
 	spa_scan_stat_init(spa);
 	vdev_scan_stat_init(spa->spa_root_vdev);
 
@@ -951,6 +953,8 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 
 	memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
 
+	ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
+
 	dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
 
 	spa_history_log_internal(spa, "scan setup", tx,
@@ -1636,7 +1640,8 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 	    txg_sync_waiting(scn->scn_dp) ||
 	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa) ||
-	    (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
+	    (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) ||
+	    !ddt_walk_ready(scn->scn_dp->dp_spa)) {
 		if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
 			dprintf("suspending at first available bookmark "
 			    "%llx/%llx/%llx/%llx\n",
@@ -3029,9 +3034,21 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
 			break;
 	}
 
-	zfs_dbgmsg("scanned %llu ddt entries on %s with class_max = %u; "
-	    "suspending=%u", (longlong_t)n, scn->scn_dp->dp_spa->spa_name,
-	    (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
+	if (error == EAGAIN) {
+		dsl_scan_check_suspend(scn, NULL);
+		error = 0;
+
+		zfs_dbgmsg("waiting for ddt to become ready for scan "
+		    "on %s with class_max = %u; suspending=%u",
+		    scn->scn_dp->dp_spa->spa_name,
+		    (int)scn->scn_phys.scn_ddt_class_max,
+		    (int)scn->scn_suspending);
+	} else
+		zfs_dbgmsg("scanned %llu ddt entries on %s with "
+		    "class_max = %u; suspending=%u", (longlong_t)n,
+		    scn->scn_dp->dp_spa->spa_name,
+		    (int)scn->scn_phys.scn_ddt_class_max,
+		    (int)scn->scn_suspending);
 
 	ASSERT(error == 0 || error == ENOENT);
 	ASSERT(error != ENOENT ||
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh
index a96a38ff17..474f41eae8 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh
@@ -95,6 +95,10 @@ while (( i < 16384 )); do
 	done
 	((i += 1))
 done
+
+# Force the DDT logs to disk with a scrub so they can be prefetched
+log_must zpool scrub -w $TESTPOOL
+
 log_note "Dataset generation completed."
 
 typeset -A generated
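End to end, the scan-side handshake works like this: dsl_scan_setup_sync() and dsl_scan_init() call ddt_walk_init() to request the force flush, ddt_walk() returns EAGAIN while any logged table is still force-flushing, and dsl_scan_ddt() treats that as suspend-and-retry rather than a failure. The loop below is a minimal standalone model of that handshake, not ZFS code: pending_log_entries, walk_ready() and walk_one() are hypothetical stand-ins for the log contents, ddt_walk_ready() and ddt_walk(). As in the real code (see the ASSERT in the hunk above), ENOENT stands for "walk complete".

#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

/* Hypothetical model: entries left to force-flush before the walk may run. */
static int pending_log_entries = 5;

static bool
walk_ready(void)
{
	return (pending_log_entries == 0);
}

/* Stand-in for ddt_walk(): refuses to hand out entries until ready. */
static int
walk_one(void)
{
	if (!walk_ready())
		return (EAGAIN);
	return (ENOENT);	/* pretend the table itself is empty */
}

int
main(void)
{
	for (int txg = 1; ; txg++) {
		int error = walk_one();

		if (error == EAGAIN) {
			/* Flush aggressively, then suspend until next txg. */
			pending_log_entries -= 2;
			if (pending_log_entries < 0)
				pending_log_entries = 0;
			printf("txg %d: ddt not ready, suspending "
			    "(%d log entries left)\n", txg,
			    pending_log_entries);
			continue;
		}

		printf("txg %d: ddt walk complete (error %d)\n", txg, error);
		break;
	}
	return (0);
}

The model makes the design choice visible: rather than walking an unstable log, the scan simply yields for a few txgs while the aggressive flush drains it, then proceeds over the stable on-disk tree. The zpool_prefetch test change follows the same logic from the other direction, using a scrub to force the logs to disk before measuring prefetch.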