diff --git a/cmd/ztest.c b/cmd/ztest.c index f77a37c215..3775e2ef25 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -8495,17 +8495,24 @@ print_time(hrtime_t t, char *timebuf) } static nvlist_t * -make_random_props(void) +make_random_pool_props(void) { nvlist_t *props; props = fnvlist_alloc(); - if (ztest_random(2) == 0) - return (props); + /* Twenty percent of the time enable ZPOOL_PROP_DEDUP_TABLE_QUOTA */ + if (ztest_random(5) == 0) { + fnvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_DEDUP_TABLE_QUOTA), + 2 * 1024 * 1024); + } - fnvlist_add_uint64(props, - zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); + /* Fifty percent of the time enable ZPOOL_PROP_AUTOREPLACE */ + if (ztest_random(2) == 0) { + fnvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); + } return (props); } @@ -8537,7 +8544,7 @@ ztest_init(ztest_shared_t *zs) zs->zs_mirrors = ztest_opts.zo_mirrors; nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); - props = make_random_props(); + props = make_random_pool_props(); /* * We don't expect the pool to suspend unless maxfaults == 0, diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 726f1a3902..e0129eda5c 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -151,7 +151,8 @@ enum ddt_phys_type { */ /* State flags for dde_flags */ -#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ +#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ +#define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */ typedef struct { /* key must be first for ddt_key_compare */ @@ -170,6 +171,7 @@ typedef struct { uint8_t dde_flags; /* load state flags */ kcondvar_t dde_cv; /* signaled when load completes */ + uint64_t dde_waiters; /* count of waiters on dde_cv */ avl_node_t dde_node; /* ddt_tree node */ } ddt_entry_t; @@ -228,6 +230,7 @@ extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); +extern uint64_t ddt_get_ddt_dsize(spa_t *spa); extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index b572c22a29..fb461c2f7a 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -258,6 +258,8 @@ typedef enum { ZPOOL_PROP_BCLONEUSED, ZPOOL_PROP_BCLONESAVED, ZPOOL_PROP_BCLONERATIO, + ZPOOL_PROP_DEDUP_TABLE_SIZE, + ZPOOL_PROP_DEDUP_TABLE_QUOTA, ZPOOL_NUM_PROPS } zpool_prop_t; diff --git a/include/sys/spa.h b/include/sys/spa.h index f50cb5e04e..df41002ed0 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1051,6 +1051,7 @@ extern metaslab_class_t *spa_special_class(spa_t *spa); extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); +extern boolean_t spa_special_has_ddt(spa_t *spa); extern void spa_evicting_os_register(spa_t *, objset_t *os); extern void spa_evicting_os_deregister(spa_t *, objset_t *os); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 5605a35b86..47f3493274 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -465,6 +465,9 @@ struct spa { boolean_t spa_waiters_cancel; /* waiters should return */ char 
*spa_compatibility; /* compatibility file(s) */ + uint64_t spa_dedup_table_quota; /* property DDT maximum size */ + uint64_t spa_dedup_dsize; /* cached on-disk size of DDT */ + uint64_t spa_dedup_class_full_txg; /* txg dedup class was full */ /* * spa_refcount & spa_config_lock must be the last elements diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index a75f5bbb47..aee6e59c6b 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -2921,7 +2921,9 @@ - + + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index d5e934045f..9896948a22 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -332,6 +332,24 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, intval = zpool_get_prop_int(zhp, prop, &src); switch (prop) { + case ZPOOL_PROP_DEDUP_TABLE_QUOTA: + /* + * If dedup quota is 0, we translate this into 'none' + * (unless literal is set). And if it is UINT64_MAX + * we translate that as 'auto' (limit to the size of + * the dedicated dedup VDEV). Otherwise, fall through + * into the regular number formatting. + */ + if (intval == 0) { + (void) strlcpy(buf, literal ? "0" : "none", + len); + break; + } else if (intval == UINT64_MAX) { + (void) strlcpy(buf, "auto", len); + break; + } + zfs_fallthrough; + case ZPOOL_PROP_SIZE: case ZPOOL_PROP_ALLOCATED: case ZPOOL_PROP_FREE: @@ -342,6 +360,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, case ZPOOL_PROP_MAXDNODESIZE: case ZPOOL_PROP_BCLONESAVED: case ZPOOL_PROP_BCLONEUSED: + case ZPOOL_PROP_DEDUP_TABLE_SIZE: if (literal) (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 73ae0950cc..b865af71a1 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -1691,6 +1691,16 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop, "use 'none' to disable quota/refquota")); goto error; } + /* + * Pool dedup table quota; force use of 'none' instead of 0 + */ + if ((type & ZFS_TYPE_POOL) && *ivalp == 0 && + (!isnone && !isauto) && + prop == ZPOOL_PROP_DEDUP_TABLE_QUOTA) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "use 'none' to disable dedup table quota")); + goto error; + } /* * Special handling for "*_limit=none". In this case it's not @@ -1732,6 +1742,10 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop, } *ivalp = UINT64_MAX; break; + case ZPOOL_PROP_DEDUP_TABLE_QUOTA: + ASSERT(type & ZFS_TYPE_POOL); + *ivalp = UINT64_MAX; + break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'auto' is invalid value for '%s'"), diff --git a/man/man7/zpoolprops.7 b/man/man7/zpoolprops.7 index 5428ab8d30..ff21e5300c 100644 --- a/man/man7/zpoolprops.7 +++ b/man/man7/zpoolprops.7 @@ -28,7 +28,7 @@ .\" Copyright (c) 2021, Colm Buckley .\" Copyright (c) 2023, Klara Inc. .\" -.Dd January 2, 2024 +.Dd January 14, 2024 .Dt ZPOOLPROPS 7 .Os . @@ -73,6 +73,8 @@ The amount of storage used by cloned blocks. Percentage of pool space used. This property can also be referred to by its shortened column name, .Sy cap . +.It Sy dedup_table_size +Total on-disk size of the deduplication table. .It Sy expandsize Amount of uninitialized space within the pool or device that can be used to increase the total capacity of the pool. @@ -348,6 +350,27 @@ See and .Xr zpool-upgrade 8 for more information on the operation of compatibility feature sets.
+.It Sy dedup_table_quota Ns = Ns Ar number Ns | Ns Sy none Ns | Ns Sy auto +This property sets a limit on the on-disk size of the pool's dedup table. +Entries will not be added to the dedup table once this size is reached; +if a dedup table already exists and is larger than this size, its entries +will not be removed as part of setting this property. +Existing entries will still have their reference counts updated. +.Pp +The actual size limit of the table may be above or below the quota, +depending on the actual on-disk size of the entries (which may be +approximated for purposes of calculating the quota). +That is, setting a quota size of 1M may result in the maximum size being +slightly below, or slightly above, that value. +Set to +.Sy none +to disable. +In automatic mode, which is the default, the size of a dedicated dedup vdev +is used as the quota limit. +.Pp +The +.Sy dedup_table_quota +property works for both legacy and fast dedup tables. .It Sy dedupditto Ns = Ns Ar number This property is deprecated and no longer has any effect. .It Sy delegation Ns = Ns Sy on Ns | Ns Sy off diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index b367c95b83..1838c937b7 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -23,7 +23,7 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2021, Colm Buckley - * Copyright (c) 2021, Klara Inc. + * Copyright (c) 2021, 2023, Klara Inc. */ #include @@ -125,6 +125,9 @@ zpool_prop_init(void) zprop_register_number(ZPOOL_PROP_BCLONERATIO, "bcloneratio", 0, PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if cloned>", "BCLONE_RATIO", B_FALSE, sfeatures); + zprop_register_number(ZPOOL_PROP_DEDUP_TABLE_SIZE, "dedup_table_size", + 0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "DDTSIZE", B_FALSE, + sfeatures); /* default number properties */ zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, @@ -133,6 +136,9 @@ zpool_prop_init(void) zprop_register_number(ZPOOL_PROP_ASHIFT, "ashift", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "<ashift, 9-16, or 0=default>", "ASHIFT", B_FALSE, sfeatures); + zprop_register_number(ZPOOL_PROP_DEDUP_TABLE_QUOTA, "dedup_table_quota", + UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_POOL, "<size>", "DDTQUOTA", + B_FALSE, sfeatures); /* default index (boolean) properties */ zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 4c53cb0a2f..ca73f1a314 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -101,6 +101,22 @@ * object and (if necessary), removed from an old one. ddt_tree is cleared and * the next txg can start. + * + * ## Dedup quota + * + * A maximum size for all DDTs on the pool can be set with the + * dedup_table_quota property. This is determined in ddt_over_quota() and + * enforced during ddt_lookup(). If the pool is at or over its quota limit, + * ddt_lookup() will only return entries for existing blocks, as updates to + * those are still possible. New entries will not be created; instead, + * ddt_lookup() will return NULL. In response, the DDT write stage + * (zio_ddt_write()) will remove the D bit on the block and reissue the IO as + * a regular write. The block will not be deduplicated. + * + * Note that this is based on the on-disk size of the dedup store. Reclaiming + * this space after deleting entries relies on the ZAP "shrinking" behaviour, + * without which no space would be recovered and the DDT would continue to be + * considered "over quota". See zap_shrink_enabled.
+ * * ## Repair IO * * If a read on a dedup block fails, but there are other copies of the block in @@ -152,6 +168,13 @@ static kmem_cache_t *ddt_entry_cache; */ int zfs_dedup_prefetch = 0; +/* + * If the dedup class cannot satisfy a DDT allocation, treat as over quota + * for this many TXGs. + */ +uint_t dedup_class_wait_txgs = 5; + + static const ddt_ops_t *const ddt_ops[DDT_TYPES] = { &ddt_zap_ops, }; @@ -554,8 +577,6 @@ ddt_alloc(const ddt_key_t *ddk) static void ddt_free(ddt_entry_t *dde) { - ASSERT(dde->dde_flags & DDE_FLAG_LOADED); - for (int p = 0; p < DDT_PHYS_TYPES; p++) ASSERT3P(dde->dde_lead_zio[p], ==, NULL); @@ -575,9 +596,66 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde) ddt_free(dde); } +static boolean_t +ddt_special_over_quota(spa_t *spa, metaslab_class_t *mc) +{ + if (mc != NULL && metaslab_class_get_space(mc) > 0) { + /* Over quota if allocating outside of this special class */ + if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg + + dedup_class_wait_txgs) { + /* Waiting for some deferred frees to be processed */ + return (B_TRUE); + } + + /* + * We're considered over quota when we hit 85% full, or for + * larger drives, when there is less than 8GB free. + */ + uint64_t allocated = metaslab_class_get_alloc(mc); + uint64_t capacity = metaslab_class_get_space(mc); + uint64_t limit = MAX(capacity * 85 / 100, + (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0); + + return (allocated >= limit); + } + return (B_FALSE); +} + +/* + * Check if the DDT is over its quota. This can be due to a few conditions: + * 1. 'dedup_table_quota' property is not 0 (none) and the dedup dsize + * exceeds this limit + * + * 2. 'dedup_table_quota' property is set to automatic and + * a. the dedup or special allocation class could not satisfy a DDT + * allocation in a recent transaction + * b. the dedup or special allocation class has exceeded its 85% limit + */ +static boolean_t +ddt_over_quota(spa_t *spa) +{ + if (spa->spa_dedup_table_quota == 0) + return (B_FALSE); + + if (spa->spa_dedup_table_quota != UINT64_MAX) + return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota); + + /* + * For automatic quota, table size is limited by dedup or special class + */ + if (ddt_special_over_quota(spa, spa_dedup_class(spa))) + return (B_TRUE); + else if (spa_special_has_ddt(spa) && + ddt_special_over_quota(spa, spa_special_class(spa))) + return (B_TRUE); + + return (B_FALSE); +} + ddt_entry_t * ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) { + spa_t *spa = ddt->ddt_spa; ddt_key_t search; ddt_entry_t *dde; ddt_type_t type; @@ -592,13 +670,28 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) /* Find an existing live entry */ dde = avl_find(&ddt->ddt_tree, &search, &where); if (dde != NULL) { - /* Found it. If it's already loaded, we can just return it. */ + /* If we went over quota, act like we didn't find it */ + if (dde->dde_flags & DDE_FLAG_OVERQUOTA) + return (NULL); + + /* If it's already loaded, we can just return it. */ if (dde->dde_flags & DDE_FLAG_LOADED) return (dde); /* Someone else is loading it, wait for it. 
*/ + dde->dde_waiters++; while (!(dde->dde_flags & DDE_FLAG_LOADED)) cv_wait(&dde->dde_cv, &ddt->ddt_lock); + dde->dde_waiters--; + + /* Loaded but over quota, forget we were ever here */ + if (dde->dde_flags & DDE_FLAG_OVERQUOTA) { + if (dde->dde_waiters == 0) { + avl_remove(&ddt->ddt_tree, dde); + ddt_free(dde); + } + return (NULL); + } return (dde); } @@ -639,14 +732,27 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) dde->dde_type = type; /* will be DDT_TYPES if no entry found */ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ - if (error == 0) + if (dde->dde_type == DDT_TYPES && + dde->dde_class == DDT_CLASSES && + ddt_over_quota(spa)) { + /* Over quota. If no one is waiting, clean up right now. */ + if (dde->dde_waiters == 0) { + avl_remove(&ddt->ddt_tree, dde); + ddt_free(dde); + return (NULL); + } + + /* Flag cleanup required */ + dde->dde_flags |= DDE_FLAG_OVERQUOTA; + } else if (error == 0) { ddt_stat_update(ddt, dde, -1ULL); + } /* Entry loaded, everyone can proceed now */ dde->dde_flags |= DDE_FLAG_LOADED; cv_broadcast(&dde->dde_cv); - return (dde); + return (dde->dde_flags & DDE_FLAG_OVERQUOTA ? NULL : dde); } void @@ -775,6 +881,7 @@ ddt_load(spa_t *spa) memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); spa->spa_dedup_dspace = ~0ULL; + spa->spa_dedup_dsize = ~0ULL; } return (0); @@ -1032,6 +1139,7 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); spa->spa_dedup_dspace = ~0ULL; + spa->spa_dedup_dsize = ~0ULL; } void @@ -1123,7 +1231,13 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); - ASSERT3P(dde, !=, NULL); + + /* Can be NULL if the entry for this block was pruned. */ + if (dde == NULL) { + ddt_exit(ddt); + spa_config_exit(spa, SCL_ZIO, FTAG); + return (B_FALSE); + } if (dde->dde_type < DDT_TYPES) { ddt_phys_t *ddp; diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index af5365a1d1..39b4edfc0f 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -129,7 +129,8 @@ ddt_histogram_empty(const ddt_histogram_t *ddh) void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) { - /* Sum the statistics we cached in ddt_object_sync(). */ + memset(ddo_total, 0, sizeof (*ddo_total)); + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (!ddt) @@ -138,8 +139,32 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + dmu_object_info_t doi; + uint64_t cnt; + int err; + + /* + * These stats were originally calculated + * during ddt_object_load(). + */ + + err = ddt_object_info(ddt, type, class, &doi); + if (err != 0) + continue; + + err = ddt_object_count(ddt, type, class, &cnt); + if (err != 0) + continue; + ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; + + ddo->ddo_count = cnt; + ddo->ddo_dspace = + doi.doi_physical_blocks_512 << 9; + ddo->ddo_mspace = doi.doi_fill_count * + doi.doi_data_block_size; + ddo_total->ddo_count += ddo->ddo_count; ddo_total->ddo_dspace += ddo->ddo_dspace; ddo_total->ddo_mspace += ddo->ddo_mspace; @@ -147,11 +172,24 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) } } - /* ... and compute the averages. 
*/ - if (ddo_total->ddo_count != 0) { - ddo_total->ddo_dspace /= ddo_total->ddo_count; - ddo_total->ddo_mspace /= ddo_total->ddo_count; - } + /* + * This returns raw counts (not averages). One of the consumers, + * print_dedup_stats(), historically has expected raw counts. + */ + + spa->spa_dedup_dsize = ddo_total->ddo_dspace; +} + +uint64_t +ddt_get_ddt_dsize(spa_t *spa) +{ + ddt_object_t ddo_total; + + /* recalculate after each txg sync */ + if (spa->spa_dedup_dsize == ~0ULL) + ddt_get_dedup_object_stats(spa, &ddo_total); + + return (spa->spa_dedup_dsize); } void diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 638572996c..1095c0af37 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -406,6 +406,9 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL, brt_get_ratio(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL, + ddt_get_ddt_dsize(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); @@ -672,6 +675,10 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) error = SET_ERROR(EINVAL); break; + case ZPOOL_PROP_DEDUP_TABLE_QUOTA: + error = nvpair_value_uint64(elem, &intval); + break; + case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: @@ -4732,6 +4739,8 @@ spa_ld_get_props(spa_t *spa) spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); + spa_prop_find(spa, ZPOOL_PROP_DEDUP_TABLE_QUOTA, + &spa->spa_dedup_table_quota); spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); spa->spa_autoreplace = (autoreplace != 0); @@ -6588,6 +6597,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); + spa->spa_dedup_table_quota = + zpool_prop_default_numeric(ZPOOL_PROP_DEDUP_TABLE_QUOTA); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); @@ -9631,6 +9642,9 @@ spa_sync_props(void *arg, dmu_tx_t *tx) case ZPOOL_PROP_MULTIHOST: spa->spa_multihost = intval; break; + case ZPOOL_PROP_DEDUP_TABLE_QUOTA: + spa->spa_dedup_table_quota = intval; + break; default: break; } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index d1d41bbe72..439e56f0d0 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1996,6 +1996,13 @@ spa_dedup_class(spa_t *spa) return (spa->spa_dedup_class); } +boolean_t +spa_special_has_ddt(spa_t *spa) +{ + return (zfs_ddt_data_is_special && + spa->spa_special_class->mc_groups != 0); +} + /* * Locate an appropriate allocation class */ diff --git a/module/zfs/zio.c b/module/zfs/zio.c index d68d5ababe..bc5a3c9b70 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3503,6 +3503,15 @@ zio_ddt_write(zio_t *zio) ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); + if (dde == NULL) { + /* DDT size is over its quota so no new entries */ + zp->zp_dedup = B_FALSE; + BP_SET_DEDUP(bp, B_FALSE); + if (zio->io_bp_override == NULL) + zio->io_pipeline = ZIO_WRITE_PIPELINE; + ddt_exit(ddt); + return (zio); + } ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { @@ -3727,6 +3736,26 @@ zio_dva_allocate(zio_t *zio) * Fallback to normal class when an alloc class is full 
*/ if (error == ENOSPC && mc != spa_normal_class(spa)) { + /* + * When the dedup or special class is spilling into the normal + * class, there can still be significant space available due + * to deferred frees that are in-flight. We track the txg when + * this occurred and back off adding new DDT entries for a few + * txgs to allow the free blocks to be processed. + */ + if ((mc == spa_dedup_class(spa) || (spa_special_has_ddt(spa) && + mc == spa_special_class(spa))) && + spa->spa_dedup_class_full_txg != zio->io_txg) { + spa->spa_dedup_class_full_txg = zio->io_txg; + zfs_dbgmsg("%s[%d]: %s class spilling, req size %d, " + "%llu allocated of %llu", + spa_name(spa), (int)zio->io_txg, + mc == spa_dedup_class(spa) ? "dedup" : "special", + (int)zio->io_size, + (u_longlong_t)metaslab_class_get_alloc(mc), + (u_longlong_t)metaslab_class_get_space(mc)); + } + /* * If throttling, transfer reservation over to normal class. * The io_allocator slot can remain the same even though we diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index ac2c541a91..d48b243eef 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -662,6 +662,12 @@ pre = post = tags = ['functional', 'deadman'] +[tests/functional/dedup] +tests = ['dedup_quota'] +pre = +post = +tags = ['functional', 'dedup'] + [tests/functional/delegate] tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', 'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos', diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 721cf27f48..b4d7c4f72b 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -28,6 +28,7 @@ CONDENSE_INDIRECT_COMMIT_ENTRY_DELAY_MS condense.indirect_commit_entry_delay_ms CONDENSE_INDIRECT_OBSOLETE_PCT condense.indirect_obsolete_pct zfs_condense_indirect_obsolete_pct CONDENSE_MIN_MAPPING_BYTES condense.min_mapping_bytes zfs_condense_min_mapping_bytes DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift +DDT_ZAP_DEFAULT_BS ddt_zap_default_bs ddt_zap_default_bs DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 00f306122d..a55c86bd4d 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1415,6 +1415,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/deadman/deadman_ratelimit.ksh \ functional/deadman/deadman_sync.ksh \ functional/deadman/deadman_zio.ksh \ + functional/dedup/cleanup.ksh \ + functional/dedup/setup.ksh \ + functional/dedup/dedup_quota.ksh \ functional/delegate/cleanup.ksh \ functional/delegate/setup.ksh \ functional/delegate/zfs_allow_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 6ebce94591..e8a94ce209 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -47,6 +47,8 @@ typeset -a properties=( "listsnapshots" "autoexpand" "dedupratio" + "dedup_table_quota" + "dedup_table_size" "free" "allocated" "readonly" diff --git a/tests/zfs-tests/tests/functional/dedup/cleanup.ksh b/tests/zfs-tests/tests/functional/dedup/cleanup.ksh new file mode 100755 index 0000000000..b3c4c04d77 --- /dev/null +++ 
b/tests/zfs-tests/tests/functional/dedup/cleanup.ksh @@ -0,0 +1,29 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh new file mode 100755 index 0000000000..5b83a1ca39 --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh @@ -0,0 +1,223 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +# DESCRIPTION: +# Verify that new entries are not added to the DDT when dedup_table_quota has +# been exceeded. +# +# STRATEGY: +# 1. Create a pool with dedup=on +# 2. Set threshold for on-disk DDT via dedup_table_quota +# 3. Verify the threshold is exceeded after zpool sync +# 4. Verify no new entries are added after subsequent syncs +# 5. Remove all but one entry from DDT +# 6. Verify new entries are added to DDT +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/events/events_common.kshlib + +verify_runnable "both" + +log_assert "DDT quota is enforced" + +MOUNTDIR="$TEST_BASE_DIR/dedup_mount" +FILEPATH="$MOUNTDIR/dedup_file" +VDEV_GENERAL="$TEST_BASE_DIR/vdevfile.general.$$" +VDEV_DEDUP="$TEST_BASE_DIR/vdevfile.dedup.$$" +POOL="dedup_pool" + +save_tunable TXG_TIMEOUT + +function cleanup +{ + if poolexists $POOL ; then + destroy_pool $POOL + fi + log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR + log_must restore_tunable TXG_TIMEOUT +} + + +function do_clean +{ + log_must destroy_pool $POOL + log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR +} + +function do_setup +{ + log_must truncate -s 5G $VDEV_GENERAL + # Use 'xattr=sa' to prevent selinux xattrs from influencing our accounting + log_must zpool create -o ashift=12 -f -O xattr=sa -m $MOUNTDIR $POOL $VDEV_GENERAL + log_must zfs set dedup=on $POOL + log_must set_tunable32 TXG_TIMEOUT 600 +} + +function dedup_table_size +{ + get_pool_prop dedup_table_size $POOL +} + +function dedup_table_quota +{ + get_pool_prop dedup_table_quota $POOL +} + +function ddt_entries +{ + typeset -i entries=$(zpool status -D $POOL | \ + grep "dedup: DDT entries" | awk '{print $4}') + + echo ${entries} +} + +# Write numbered files to create DDT entries; a single argument N creates +# entries 1..N, two arguments create entries $1..$2 inclusive. +function ddt_add_entry +{ + typeset -i first=$1 + typeset -i last=${2:-0} + + if (( last == 0 )); then + last=$first + first=1 + fi + + for i in {$first..$last}; do + echo "$i" > $MOUNTDIR/dedup-$i.txt + done + log_must sync_pool $POOL + + log_note range $first - $last + log_note ddt_add_entry got $(ddt_entries) +} + +# Create 6000 entries over four syncs +function ddt_nolimit +{ + do_setup + + log_note base ddt entries is $(ddt_entries) + + ddt_add_entry 1 + ddt_add_entry 100 + ddt_add_entry 101 5000 + ddt_add_entry 5001 6000 + + log_must test $(ddt_entries) -eq 6000 + + do_clean +} + +function ddt_limit +{ + do_setup + + log_note base ddt entries is $(ddt_entries) + + log_must zpool set dedup_table_quota=32768 $POOL + ddt_add_entry 100 + + # It's possible to exceed dedup_table_quota in a single transaction, so + # ensure that the threshold has been exceeded + cursize=$(dedup_table_size) + log_must test $cursize -gt $(dedup_table_quota) + + # count the entries we have + log_must test $(ddt_entries) -ge 100 + + # attempt to add new entries + ddt_add_entry 101 200 + log_must test $(ddt_entries) -eq 100 + log_must test $cursize -eq $(dedup_table_size) + + # remove all but one entry + for i in {2..100}; do + rm $MOUNTDIR/dedup-$i.txt + done + log_must sync_pool $POOL + + log_must test $(ddt_entries) -eq 1 + log_must test $cursize -gt $(dedup_table_size) + cursize=$(dedup_table_size) + + log_must zpool set dedup_table_quota=none $POOL + + # create more entries + zpool status -D $POOL + ddt_add_entry 101 200 + log_must sync_pool $POOL + + log_must test $(ddt_entries) -eq 101 + log_must test $cursize -lt $(dedup_table_size) + + do_clean +} + +function ddt_dedup_vdev_limit +{ + do_setup + + # add a dedicated dedup/special VDEV and enable an automatic quota + if (( RANDOM % 2 == 0 )) ; then + class="special" + else + class="dedup" + fi + log_must truncate -s 200M $VDEV_DEDUP + log_must zpool add $POOL $class $VDEV_DEDUP + log_must zpool set dedup_table_quota=auto $POOL + + log_must zfs set recordsize=1K $POOL + log_must zfs set compression=zstd $POOL + + # Generate a working set to fill up the dedup/special allocation class + log_must fio --directory=$MOUNTDIR --name=dedup-filler-1 \ + --rw=read --bs=1m --numjobs=2 --iodepth=8 \ + --size=512M --end_fsync=1 --ioengine=posixaio --runtime=1 \
--group_reporting --fallocate=none --output-format=terse \ + --dedupe_percentage=0 + log_must sync_pool $POOL + + zpool status -D $POOL + zpool list -v $POOL + echo DDT size $(dedup_table_size), with $(ddt_entries) entries + + # + # With no DDT quota in place, the above workload will produce over + # 800,000 entries by using space in the normal class. With a quota, + # it will be well below 500,000 entries. + # + log_must test $(ddt_entries) -le 500000 + + do_clean +} + +log_onexit cleanup + +ddt_limit +ddt_nolimit +ddt_dedup_vdev_limit + +log_pass "DDT quota is enforced" diff --git a/tests/zfs-tests/tests/functional/dedup/setup.ksh b/tests/zfs-tests/tests/functional/dedup/setup.ksh new file mode 100755 index 0000000000..3c0830401f --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/setup.ksh @@ -0,0 +1,31 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +DISK=${DISKS%% *} + +default_setup $DISK
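
A few notes on the mechanisms above, each with a small standalone sketch. These sketches are reviewer aids that model the patch's logic under stated assumptions; all type and function names in them are stand-ins, not ZFS code.

First, the auto quota arithmetic in ddt_special_over_quota(): the dedup/special class is treated as full at 85% of capacity, or when less than 8 GiB (1LL << 33 bytes) remains, whichever limit is higher. A minimal model of just that expression:

/*
 * Standalone model of the "auto" quota threshold in
 * ddt_special_over_quota().  Build with: cc -o ddt_limit ddt_limit.c
 */
#include <stdint.h>
#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

/* Mirrors the limit expression from the patch. */
static uint64_t
auto_quota_limit(uint64_t capacity)
{
	return (MAX(capacity * 85 / 100,
	    (capacity > (1ULL << 33)) ? capacity - (1ULL << 33) : 0));
}

int
main(void)
{
	const uint64_t GiB = 1ULL << 30;
	uint64_t sizes[] = { 16 * GiB, 53 * GiB, 200 * GiB, 2048 * GiB };

	for (int i = 0; i < 4; i++) {
		printf("capacity %4llu GiB -> over quota at %llu GiB\n",
		    (unsigned long long)(sizes[i] / GiB),
		    (unsigned long long)(auto_quota_limit(sizes[i]) / GiB));
	}
	return (0);
}

For a 200 GiB class this yields MAX(170 GiB, 192 GiB) = 192 GiB allocated; the "8 GiB free" rule takes over once the class exceeds 2^33 / 0.15 bytes, roughly 53 GiB.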
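
Second, the back-off shared by zio_dva_allocate() and ddt_special_over_quota(): when a dedup/special class allocation spills into the normal class, spa_dedup_class_full_txg records the txg, and the quota check keeps answering "over quota" for dedup_class_wait_txgs further txgs so in-flight deferred frees can return space. A sketch of just that predicate, with pool_t standing in for spa_t:

/*
 * Standalone model of the DDT allocation back-off window.
 */
#include <stdint.h>
#include <stdio.h>

static unsigned int dedup_class_wait_txgs = 5;	/* default from the patch */

typedef struct {
	uint64_t class_full_txg;	/* models spa_dedup_class_full_txg */
} pool_t;

/* Allocation path: remember the txg in which the class spilled. */
static void
note_class_full(pool_t *p, uint64_t txg)
{
	p->class_full_txg = txg;
}

/* Quota path: the test at the head of ddt_special_over_quota(). */
static int
in_backoff(const pool_t *p, uint64_t syncing_txg)
{
	return (syncing_txg <= p->class_full_txg + dedup_class_wait_txgs);
}

int
main(void)
{
	pool_t p = { .class_full_txg = 0 };

	note_class_full(&p, 100);	/* spill observed in txg 100 */
	for (uint64_t txg = 100; txg <= 107; txg++)
		printf("txg %llu: treated as over quota: %d\n",
		    (unsigned long long)txg, in_backoff(&p, txg));
	return (0);
}

With the default of 5, txgs 100 through 105 refuse new DDT entries; normal behaviour resumes at txg 106.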
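
Third, the spa_dedup_dsize cache: ddt_load() and ddt_sync_table() both invalidate it by storing ~0ULL, and ddt_get_ddt_dsize() lazily recomputes it on the next query via ddt_get_dedup_object_stats(). A minimal model of that invalidate-on-sync, compute-on-read pattern (recompute_dsize() stands in for the per-object stats walk):

/*
 * Standalone model of the cached DDT on-disk size.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct {
	uint64_t dedup_dsize;	/* cached size; ~0ULL means stale */
} pool_t;

/* Stand-in for the object walk in ddt_get_dedup_object_stats(). */
static uint64_t
recompute_dsize(void)
{
	return (123456789);
}

static uint64_t
get_ddt_dsize(pool_t *p)
{
	if (p->dedup_dsize == ~0ULL)	/* stale, recompute once */
		p->dedup_dsize = recompute_dsize();
	return (p->dedup_dsize);
}

static void
sync_table(pool_t *p)
{
	/* ... DDT changes written out here ... */
	p->dedup_dsize = ~0ULL;		/* invalidate the cache */
}

int
main(void)
{
	pool_t p = { .dedup_dsize = ~0ULL };

	printf("dsize after load: %llu\n",
	    (unsigned long long)get_ddt_dsize(&p));	/* recomputes */
	sync_table(&p);
	printf("dsize after sync: %llu\n",
	    (unsigned long long)get_ddt_dsize(&p));	/* recomputes again */
	return (0);
}

This keeps ddt_over_quota() cheap: at most one stats walk per txg, no matter how many lookups consult the quota.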
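
Finally, the value mapping for dedup_table_quota in libzfs: 0 is displayed as "none" (unless literal output is requested) and UINT64_MAX as "auto", while parsing accepts "none" and "auto" back into those sentinels. The sketch below models the display side only; it prints plain integers where the real code falls through to libzfs's usual number formatting:

/*
 * Standalone model of dedup_table_quota rendering (after the
 * zpool_get_prop() change in libzfs_pool.c).  format_quota() is a
 * stand-in, not a libzfs function.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static void
format_quota(uint64_t val, int literal, char *buf, size_t len)
{
	if (val == 0)
		(void) snprintf(buf, len, "%s", literal ? "0" : "none");
	else if (val == UINT64_MAX)
		(void) snprintf(buf, len, "auto");
	else
		(void) snprintf(buf, len, "%" PRIu64, val);
}

int
main(void)
{
	uint64_t vals[] = { 0, 2 * 1024 * 1024, UINT64_MAX };
	char buf[32];

	for (int i = 0; i < 3; i++) {
		format_quota(vals[i], 0, buf, sizeof (buf));
		printf("%20" PRIu64 " -> %s\n", vals[i], buf);
	}
	return (0);
}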