diff --git a/cmd/ztest.c b/cmd/ztest.c
index f77a37c215..3775e2ef25 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -8495,17 +8495,24 @@ print_time(hrtime_t t, char *timebuf)
}
static nvlist_t *
-make_random_props(void)
+make_random_pool_props(void)
{
nvlist_t *props;
props = fnvlist_alloc();
- if (ztest_random(2) == 0)
- return (props);
+ /* Twenty percent of the time enable ZPOOL_PROP_DEDUP_TABLE_QUOTA */
+ if (ztest_random(5) == 0) {
+ fnvlist_add_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_DEDUP_TABLE_QUOTA),
+ 2 * 1024 * 1024);
+ }
- fnvlist_add_uint64(props,
- zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1);
+ /* Fifty percent of the time enable ZPOOL_PROP_AUTOREPLACE */
+ if (ztest_random(2) == 0) {
+ fnvlist_add_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1);
+ }
return (props);
}
@@ -8537,7 +8544,7 @@ ztest_init(ztest_shared_t *zs)
zs->zs_mirrors = ztest_opts.zo_mirrors;
nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1);
- props = make_random_props();
+ props = make_random_pool_props();
/*
* We don't expect the pool to suspend unless maxfaults == 0,
diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index 726f1a3902..e0129eda5c 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -151,7 +151,8 @@ enum ddt_phys_type {
*/
/* State flags for dde_flags */
-#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */
+#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */
+#define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */
typedef struct {
/* key must be first for ddt_key_compare */
@@ -170,6 +171,7 @@ typedef struct {
uint8_t dde_flags; /* load state flags */
kcondvar_t dde_cv; /* signaled when load completes */
+ uint64_t dde_waiters; /* count of waiters on dde_cv */
avl_node_t dde_node; /* ddt_tree node */
} ddt_entry_t;
@@ -228,6 +230,7 @@ extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
+extern uint64_t ddt_get_ddt_dsize(spa_t *spa);
extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total);
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index b572c22a29..fb461c2f7a 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -258,6 +258,8 @@ typedef enum {
ZPOOL_PROP_BCLONEUSED,
ZPOOL_PROP_BCLONESAVED,
ZPOOL_PROP_BCLONERATIO,
+ ZPOOL_PROP_DEDUP_TABLE_SIZE,
+ ZPOOL_PROP_DEDUP_TABLE_QUOTA,
ZPOOL_NUM_PROPS
} zpool_prop_t;
diff --git a/include/sys/spa.h b/include/sys/spa.h
index f50cb5e04e..df41002ed0 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1051,6 +1051,7 @@ extern metaslab_class_t *spa_special_class(spa_t *spa);
extern metaslab_class_t *spa_dedup_class(spa_t *spa);
extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size,
dmu_object_type_t objtype, uint_t level, uint_t special_smallblk);
+extern boolean_t spa_special_has_ddt(spa_t *spa);
extern void spa_evicting_os_register(spa_t *, objset_t *os);
extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 5605a35b86..47f3493274 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -465,6 +465,9 @@ struct spa {
boolean_t spa_waiters_cancel; /* waiters should return */
char *spa_compatibility; /* compatibility file(s) */
+ uint64_t spa_dedup_table_quota; /* property DDT maximum size */
+ uint64_t spa_dedup_dsize; /* cached on-disk size of DDT */
+ uint64_t spa_dedup_class_full_txg; /* txg dedup class was full */
/*
* spa_refcount & spa_config_lock must be the last elements
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index a75f5bbb47..aee6e59c6b 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -2921,7 +2921,9 @@
-
+
+
+
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index d5e934045f..9896948a22 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -332,6 +332,24 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
intval = zpool_get_prop_int(zhp, prop, &src);
switch (prop) {
+ case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
+ /*
+ * If dedup quota is 0, we translate this into 'none'
+ * (unless literal is set). And if it is UINT64_MAX
+ * we translate that as 'automatic' (limit to size of
+		 * the dedicated dedup VDEV). Otherwise, fall through
+		 * into the regular number formatting.
+ */
+ if (intval == 0) {
+ (void) strlcpy(buf, literal ? "0" : "none",
+ len);
+ break;
+ } else if (intval == UINT64_MAX) {
+ (void) strlcpy(buf, "auto", len);
+ break;
+ }
+ zfs_fallthrough;
+
case ZPOOL_PROP_SIZE:
case ZPOOL_PROP_ALLOCATED:
case ZPOOL_PROP_FREE:
@@ -342,6 +360,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
case ZPOOL_PROP_MAXDNODESIZE:
case ZPOOL_PROP_BCLONESAVED:
case ZPOOL_PROP_BCLONEUSED:
+ case ZPOOL_PROP_DEDUP_TABLE_SIZE:
if (literal)
(void) snprintf(buf, len, "%llu",
(u_longlong_t)intval);
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index 73ae0950cc..b865af71a1 100644
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@@ -1691,6 +1691,16 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop,
"use 'none' to disable quota/refquota"));
goto error;
}
+ /*
+ * Pool dedup table quota; force use of 'none' instead of 0
+ */
+ if ((type & ZFS_TYPE_POOL) && *ivalp == 0 &&
+ (!isnone && !isauto) &&
+ prop == ZPOOL_PROP_DEDUP_TABLE_QUOTA) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "use 'none' to disable ddt table quota"));
+ goto error;
+ }
/*
* Special handling for "*_limit=none". In this case it's not
@@ -1732,6 +1742,10 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop,
}
*ivalp = UINT64_MAX;
break;
+ case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
+ ASSERT(type & ZFS_TYPE_POOL);
+ *ivalp = UINT64_MAX;
+ break;
default:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'auto' is invalid value for '%s'"),
diff --git a/man/man7/zpoolprops.7 b/man/man7/zpoolprops.7
index 5428ab8d30..ff21e5300c 100644
--- a/man/man7/zpoolprops.7
+++ b/man/man7/zpoolprops.7
@@ -28,7 +28,7 @@
.\" Copyright (c) 2021, Colm Buckley
.\" Copyright (c) 2023, Klara Inc.
.\"
-.Dd January 2, 2024
+.Dd January 14, 2024
.Dt ZPOOLPROPS 7
.Os
.
@@ -73,6 +73,8 @@ The amount of storage used by cloned blocks.
Percentage of pool space used.
This property can also be referred to by its shortened column name,
.Sy cap .
+.It Sy dedup_table_size
+Total on-disk size of the deduplication table.
.It Sy expandsize
Amount of uninitialized space within the pool or device that can be used to
increase the total capacity of the pool.
@@ -348,6 +350,27 @@ See
and
.Xr zpool-upgrade 8
for more information on the operation of compatibility feature sets.
+.It Sy dedup_table_quota Ns = Ns Ar number Ns | Ns Sy none Ns | Ns Sy auto
+This property sets a limit on the on-disk size of the pool's dedup table.
+Entries will not be added to the dedup table once this size is reached;
+if a dedup table already exists and is larger than this size, its entries
+will not be removed as part of setting this property.
+Existing entries will still have their reference counts updated.
+.Pp
+The actual size limit of the table may be above or below the quota,
+depending on the actual on-disk size of the entries (which may be
+approximated for purposes of calculating the quota).
+That is, setting a quota size of 1M may result in the maximum size being
+slightly below, or slightly above, that value.
+Set to
+.Sy none
+to disable.
+In automatic mode, which is the default, the size of a dedicated dedup vdev
+is used as the quota limit.
+.Pp
+The
+.Sy dedup_table_quota
+property works for both legacy and fast dedup tables.
.It Sy dedupditto Ns = Ns Ar number
This property is deprecated and no longer has any effect.
.It Sy delegation Ns = Ns Sy on Ns | Ns Sy off
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index b367c95b83..1838c937b7 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -23,7 +23,7 @@
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2021, Colm Buckley
- * Copyright (c) 2021, Klara Inc.
+ * Copyright (c) 2021, 2023, Klara Inc.
*/
#include
@@ -125,6 +125,9 @@ zpool_prop_init(void)
zprop_register_number(ZPOOL_PROP_BCLONERATIO, "bcloneratio", 0,
PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if cloned>",
"BCLONE_RATIO", B_FALSE, sfeatures);
+ zprop_register_number(ZPOOL_PROP_DEDUP_TABLE_SIZE, "dedup_table_size",
+ 0, PROP_READONLY, ZFS_TYPE_POOL, "", "DDTSIZE", B_FALSE,
+ sfeatures);
/* default number properties */
zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
@@ -133,6 +136,9 @@ zpool_prop_init(void)
zprop_register_number(ZPOOL_PROP_ASHIFT, "ashift", 0, PROP_DEFAULT,
ZFS_TYPE_POOL, "", "ASHIFT", B_FALSE,
sfeatures);
+ zprop_register_number(ZPOOL_PROP_DEDUP_TABLE_QUOTA, "dedup_table_quota",
+ UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_POOL, "", "DDTQUOTA",
+ B_FALSE, sfeatures);
/* default index (boolean) properties */
zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1,
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 4c53cb0a2f..ca73f1a314 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -101,6 +101,22 @@
* object and (if necessary), removed from an old one. ddt_tree is cleared and
* the next txg can start.
*
+ * ## Dedup quota
+ *
+ * A maximum size for all DDTs on the pool can be set with the
+ * dedup_table_quota property. This is determined in ddt_over_quota() and
+ * enforced during ddt_lookup(). If the pool is at or over its quota limit,
+ * ddt_lookup() will only return entries for existing blocks, as updates are
+ * still possible. New entries will not be created; instead, ddt_lookup() will
+ * return NULL. In response, the DDT write stage (zio_ddt_write()) will remove
+ * the D bit on the block and reissue the IO as a regular write. The block will
+ * not be deduplicated.
+ *
+ * Note that this is based on the on-disk size of the dedup store. Reclaiming
+ * this space after deleting entries relies on the ZAP "shrinking" behaviour,
+ * without which, no space would be recovered and the DDT would continue to be
+ * considered "over quota". See zap_shrink_enabled.
+ *
* ## Repair IO
*
* If a read on a dedup block fails, but there are other copies of the block in
@@ -152,6 +168,13 @@ static kmem_cache_t *ddt_entry_cache;
*/
int zfs_dedup_prefetch = 0;
+/*
+ * If the dedup class cannot satisfy a DDT allocation, treat as over quota
+ * for this many TXGs.
+ */
+uint_t dedup_class_wait_txgs = 5;
+
+
static const ddt_ops_t *const ddt_ops[DDT_TYPES] = {
&ddt_zap_ops,
};
@@ -554,8 +577,6 @@ ddt_alloc(const ddt_key_t *ddk)
static void
ddt_free(ddt_entry_t *dde)
{
- ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
-
for (int p = 0; p < DDT_PHYS_TYPES; p++)
ASSERT3P(dde->dde_lead_zio[p], ==, NULL);
@@ -575,9 +596,66 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
ddt_free(dde);
}
+static boolean_t
+ddt_special_over_quota(spa_t *spa, metaslab_class_t *mc)
+{
+ if (mc != NULL && metaslab_class_get_space(mc) > 0) {
+ /* Over quota if allocating outside of this special class */
+ if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
+ dedup_class_wait_txgs) {
+ /* Waiting for some deferred frees to be processed */
+ return (B_TRUE);
+ }
+
+ /*
+ * We're considered over quota when we hit 85% full, or for
+ * larger drives, when there is less than 8GB free.
+ */
+ uint64_t allocated = metaslab_class_get_alloc(mc);
+ uint64_t capacity = metaslab_class_get_space(mc);
+ uint64_t limit = MAX(capacity * 85 / 100,
+ (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);
+
+ return (allocated >= limit);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Check if the DDT is over its quota. This can be due to a few conditions:
+ * 1. 'dedup_table_quota' property is not 0 (none) and the dedup dsize
+ * exceeds this limit
+ *
+ * 2. 'dedup_table_quota' property is set to automatic and
+ * a. the dedup or special allocation class could not satisfy a DDT
+ * allocation in a recent transaction
+ * b. the dedup or special allocation class has exceeded its 85% limit
+ */
+static boolean_t
+ddt_over_quota(spa_t *spa)
+{
+ if (spa->spa_dedup_table_quota == 0)
+ return (B_FALSE);
+
+ if (spa->spa_dedup_table_quota != UINT64_MAX)
+ return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota);
+
+ /*
+ * For automatic quota, table size is limited by dedup or special class
+ */
+ if (ddt_special_over_quota(spa, spa_dedup_class(spa)))
+ return (B_TRUE);
+ else if (spa_special_has_ddt(spa) &&
+ ddt_special_over_quota(spa, spa_special_class(spa)))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
ddt_entry_t *
ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
{
+ spa_t *spa = ddt->ddt_spa;
ddt_key_t search;
ddt_entry_t *dde;
ddt_type_t type;
@@ -592,13 +670,28 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
/* Find an existing live entry */
dde = avl_find(&ddt->ddt_tree, &search, &where);
if (dde != NULL) {
- /* Found it. If it's already loaded, we can just return it. */
+ /* If we went over quota, act like we didn't find it */
+ if (dde->dde_flags & DDE_FLAG_OVERQUOTA)
+ return (NULL);
+
+ /* If it's already loaded, we can just return it. */
if (dde->dde_flags & DDE_FLAG_LOADED)
return (dde);
/* Someone else is loading it, wait for it. */
+ dde->dde_waiters++;
while (!(dde->dde_flags & DDE_FLAG_LOADED))
cv_wait(&dde->dde_cv, &ddt->ddt_lock);
+ dde->dde_waiters--;
+
+ /* Loaded but over quota, forget we were ever here */
+ if (dde->dde_flags & DDE_FLAG_OVERQUOTA) {
+ if (dde->dde_waiters == 0) {
+ avl_remove(&ddt->ddt_tree, dde);
+ ddt_free(dde);
+ }
+ return (NULL);
+ }
return (dde);
}
@@ -639,14 +732,27 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
dde->dde_type = type; /* will be DDT_TYPES if no entry found */
dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
- if (error == 0)
+ if (dde->dde_type == DDT_TYPES &&
+ dde->dde_class == DDT_CLASSES &&
+ ddt_over_quota(spa)) {
+ /* Over quota. If no one is waiting, clean up right now. */
+ if (dde->dde_waiters == 0) {
+ avl_remove(&ddt->ddt_tree, dde);
+ ddt_free(dde);
+ return (NULL);
+ }
+
+ /* Flag cleanup required */
+ dde->dde_flags |= DDE_FLAG_OVERQUOTA;
+ } else if (error == 0) {
ddt_stat_update(ddt, dde, -1ULL);
+ }
/* Entry loaded, everyone can proceed now */
dde->dde_flags |= DDE_FLAG_LOADED;
cv_broadcast(&dde->dde_cv);
- return (dde);
+ return (dde->dde_flags & DDE_FLAG_OVERQUOTA ? NULL : dde);
}
void
@@ -775,6 +881,7 @@ ddt_load(spa_t *spa)
memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
sizeof (ddt->ddt_histogram));
spa->spa_dedup_dspace = ~0ULL;
+ spa->spa_dedup_dsize = ~0ULL;
}
return (0);
@@ -1032,6 +1139,7 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
sizeof (ddt->ddt_histogram));
spa->spa_dedup_dspace = ~0ULL;
+ spa->spa_dedup_dsize = ~0ULL;
}
void
@@ -1123,7 +1231,13 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_TRUE);
- ASSERT3P(dde, !=, NULL);
+
+ /* Can be NULL if the entry for this block was pruned. */
+ if (dde == NULL) {
+ ddt_exit(ddt);
+ spa_config_exit(spa, SCL_ZIO, FTAG);
+ return (B_FALSE);
+ }
if (dde->dde_type < DDT_TYPES) {
ddt_phys_t *ddp;
diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c
index af5365a1d1..39b4edfc0f 100644
--- a/module/zfs/ddt_stats.c
+++ b/module/zfs/ddt_stats.c
@@ -129,7 +129,8 @@ ddt_histogram_empty(const ddt_histogram_t *ddh)
void
ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
{
- /* Sum the statistics we cached in ddt_object_sync(). */
+ memset(ddo_total, 0, sizeof (*ddo_total));
+
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (!ddt)
@@ -138,8 +139,32 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
for (ddt_class_t class = 0; class < DDT_CLASSES;
class++) {
+ dmu_object_info_t doi;
+ uint64_t cnt;
+ int err;
+
+ /*
+ * These stats were originally calculated
+ * during ddt_object_load().
+ */
+
+ err = ddt_object_info(ddt, type, class, &doi);
+ if (err != 0)
+ continue;
+
+ err = ddt_object_count(ddt, type, class, &cnt);
+ if (err != 0)
+ continue;
+
ddt_object_t *ddo =
&ddt->ddt_object_stats[type][class];
+
+ ddo->ddo_count = cnt;
+ ddo->ddo_dspace =
+ doi.doi_physical_blocks_512 << 9;
+ ddo->ddo_mspace = doi.doi_fill_count *
+ doi.doi_data_block_size;
+
ddo_total->ddo_count += ddo->ddo_count;
ddo_total->ddo_dspace += ddo->ddo_dspace;
ddo_total->ddo_mspace += ddo->ddo_mspace;
@@ -147,11 +172,24 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
}
}
- /* ... and compute the averages. */
- if (ddo_total->ddo_count != 0) {
- ddo_total->ddo_dspace /= ddo_total->ddo_count;
- ddo_total->ddo_mspace /= ddo_total->ddo_count;
- }
+ /*
+ * This returns raw counts (not averages). One of the consumers,
+ * print_dedup_stats(), historically has expected raw counts.
+ */
+
+ spa->spa_dedup_dsize = ddo_total->ddo_dspace;
+}
+
+uint64_t
+ddt_get_ddt_dsize(spa_t *spa)
+{
+ ddt_object_t ddo_total;
+
+ /* recalculate after each txg sync */
+ if (spa->spa_dedup_dsize == ~0ULL)
+ ddt_get_dedup_object_stats(spa, &ddo_total);
+
+ return (spa->spa_dedup_dsize);
}
void
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 638572996c..1095c0af37 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -406,6 +406,9 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL,
brt_get_ratio(spa), src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL,
+ ddt_get_ddt_dsize(spa), src);
+
spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
rvd->vdev_state, src);
@@ -672,6 +675,10 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
error = SET_ERROR(EINVAL);
break;
+ case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
+ error = nvpair_value_uint64(elem, &intval);
+ break;
+
case ZPOOL_PROP_DELEGATION:
case ZPOOL_PROP_AUTOREPLACE:
case ZPOOL_PROP_LISTSNAPS:
@@ -4732,6 +4739,8 @@ spa_ld_get_props(spa_t *spa)
spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
+ spa_prop_find(spa, ZPOOL_PROP_DEDUP_TABLE_QUOTA,
+ &spa->spa_dedup_table_quota);
spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
spa->spa_autoreplace = (autoreplace != 0);
@@ -6588,6 +6597,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
+ spa->spa_dedup_table_quota =
+ zpool_prop_default_numeric(ZPOOL_PROP_DEDUP_TABLE_QUOTA);
if (props != NULL) {
spa_configfile_set(spa, props, B_FALSE);
@@ -9631,6 +9642,9 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
case ZPOOL_PROP_MULTIHOST:
spa->spa_multihost = intval;
break;
+ case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
+ spa->spa_dedup_table_quota = intval;
+ break;
default:
break;
}
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index d1d41bbe72..439e56f0d0 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1996,6 +1996,13 @@ spa_dedup_class(spa_t *spa)
return (spa->spa_dedup_class);
}
+boolean_t
+spa_special_has_ddt(spa_t *spa)
+{
+ return (zfs_ddt_data_is_special &&
+ spa->spa_special_class->mc_groups != 0);
+}
+
/*
* Locate an appropriate allocation class
*/
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index d68d5ababe..bc5a3c9b70 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3503,6 +3503,15 @@ zio_ddt_write(zio_t *zio)
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_TRUE);
+ if (dde == NULL) {
+ /* DDT size is over its quota so no new entries */
+ zp->zp_dedup = B_FALSE;
+ BP_SET_DEDUP(bp, B_FALSE);
+ if (zio->io_bp_override == NULL)
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ ddt_exit(ddt);
+ return (zio);
+ }
ddp = &dde->dde_phys[p];
if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
@@ -3727,6 +3736,26 @@ zio_dva_allocate(zio_t *zio)
* Fallback to normal class when an alloc class is full
*/
if (error == ENOSPC && mc != spa_normal_class(spa)) {
+ /*
+ * When the dedup or special class is spilling into the normal
+ * class, there can still be significant space available due
+ * to deferred frees that are in-flight. We track the txg when
+ * this occurred and back off adding new DDT entries for a few
+ * txgs to allow the free blocks to be processed.
+ */
+ if ((mc == spa_dedup_class(spa) || (spa_special_has_ddt(spa) &&
+ mc == spa_special_class(spa))) &&
+ spa->spa_dedup_class_full_txg != zio->io_txg) {
+ spa->spa_dedup_class_full_txg = zio->io_txg;
+ zfs_dbgmsg("%s[%d]: %s class spilling, req size %d, "
+ "%llu allocated of %llu",
+ spa_name(spa), (int)zio->io_txg,
+ mc == spa_dedup_class(spa) ? "dedup" : "special",
+ (int)zio->io_size,
+ (u_longlong_t)metaslab_class_get_alloc(mc),
+ (u_longlong_t)metaslab_class_get_space(mc));
+ }
+
/*
* If throttling, transfer reservation over to normal class.
* The io_allocator slot can remain the same even though we
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index ac2c541a91..d48b243eef 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -662,6 +662,12 @@ pre =
post =
tags = ['functional', 'deadman']
+[tests/functional/dedup]
+tests = ['dedup_quota']
+pre =
+post =
+tags = ['functional', 'dedup']
+
[tests/functional/delegate]
tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos',
'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos',
diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg
index 721cf27f48..b4d7c4f72b 100644
--- a/tests/zfs-tests/include/tunables.cfg
+++ b/tests/zfs-tests/include/tunables.cfg
@@ -28,6 +28,7 @@ CONDENSE_INDIRECT_COMMIT_ENTRY_DELAY_MS condense.indirect_commit_entry_delay_ms
CONDENSE_INDIRECT_OBSOLETE_PCT condense.indirect_obsolete_pct zfs_condense_indirect_obsolete_pct
CONDENSE_MIN_MAPPING_BYTES condense.min_mapping_bytes zfs_condense_min_mapping_bytes
DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift
+DDT_ZAP_DEFAULT_BS ddt_zap_default_bs ddt_zap_default_bs
DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms
DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second
DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 00f306122d..a55c86bd4d 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1415,6 +1415,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/deadman/deadman_ratelimit.ksh \
functional/deadman/deadman_sync.ksh \
functional/deadman/deadman_zio.ksh \
+ functional/dedup/cleanup.ksh \
+ functional/dedup/setup.ksh \
+ functional/dedup/dedup_quota.ksh \
functional/delegate/cleanup.ksh \
functional/delegate/setup.ksh \
functional/delegate/zfs_allow_001_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index 6ebce94591..e8a94ce209 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -47,6 +47,8 @@ typeset -a properties=(
"listsnapshots"
"autoexpand"
"dedupratio"
+ "dedup_table_quota"
+ "dedup_table_size"
"free"
"allocated"
"readonly"
diff --git a/tests/zfs-tests/tests/functional/dedup/cleanup.ksh b/tests/zfs-tests/tests/functional/dedup/cleanup.ksh
new file mode 100755
index 0000000000..b3c4c04d77
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/dedup/cleanup.ksh
@@ -0,0 +1,29 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh
new file mode 100755
index 0000000000..5b83a1ca39
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh
@@ -0,0 +1,223 @@
+#!/bin/ksh -p
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2023, Klara Inc.
+#
+
+# DESCRIPTION:
+# Verify that new entries are not added to the DDT when dedup_table_quota has
+# been exceeded.
+#
+# STRATEGY:
+# 1. Create a pool with dedup=on
+# 2. Set threshold for on-disk DDT via dedup_table_quota
+# 3. Verify the threshold is exceeded after zpool sync
+# 4. Verify no new entries are added after subsequent sync's
+# 5. Remove all but one entry from DDT
+# 6. Verify new entries are added to DDT
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/events/events_common.kshlib
+
+verify_runnable "both"
+
+log_assert "DDT quota is enforced"
+
+MOUNTDIR="$TEST_BASE_DIR/dedup_mount"
+FILEPATH="$MOUNTDIR/dedup_file"
+VDEV_GENERAL="$TEST_BASE_DIR/vdevfile.general.$$"
+VDEV_DEDUP="$TEST_BASE_DIR/vdevfile.dedup.$$"
+POOL="dedup_pool"
+
+save_tunable TXG_TIMEOUT
+
+function cleanup
+{
+ if poolexists $POOL ; then
+ destroy_pool $POOL
+ fi
+ log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR
+ log_must restore_tunable TXG_TIMEOUT
+}
+
+
+function do_clean
+{
+ log_must destroy_pool $POOL
+ log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR
+}
+
+function do_setup
+{
+ log_must truncate -s 5G $VDEV_GENERAL
+ # Use 'xattr=sa' to prevent selinux xattrs influencing our accounting
+ log_must zpool create -o ashift=12 -f -O xattr=sa -m $MOUNTDIR $POOL $VDEV_GENERAL
+ log_must zfs set dedup=on $POOL
+ log_must set_tunable32 TXG_TIMEOUT 600
+}
+
+function dedup_table_size
+{
+ get_pool_prop dedup_table_size $POOL
+}
+
+function dedup_table_quota
+{
+ get_pool_prop dedup_table_quota $POOL
+}
+
+function ddt_entries
+{
+ typeset -i entries=$(zpool status -D $POOL | \
+ grep "dedup: DDT entries" | awk '{print $4}')
+
+ echo ${entries}
+}
+
+function ddt_add_entry
+{
+ count=$1
+ offset=$2
+ expand=$3
+
+ if [ -z "$offset" ]; then
+ offset=1
+ fi
+
+ for i in {$offset..$count}; do
+ echo "$i" > $MOUNTDIR/dedup-$i.txt
+ done
+ log_must sync_pool $POOL
+
+ log_note range $offset - $(( count + offset - 1 ))
+ log_note ddt_add_entry got $(ddt_entries)
+}
+
+# Create 6000 entries over three syncs
+function ddt_nolimit
+{
+ do_setup
+
+ log_note base ddt entries is $(ddt_entries)
+
+ ddt_add_entry 1
+ ddt_add_entry 100
+ ddt_add_entry 101 5000
+ ddt_add_entry 5001 6000
+
+ log_must test $(ddt_entries) -eq 6000
+
+ do_clean
+}
+
+function ddt_limit
+{
+ do_setup
+
+ log_note base ddt entries is $(ddt_entries)
+
+ log_must zpool set dedup_table_quota=32768 $POOL
+ ddt_add_entry 100
+
+ # it's possible to exceed dedup_table_quota over a single transaction,
+ # ensure that the threshold has been exceeded
+ cursize=$(dedup_table_size)
+ log_must test $cursize -gt $(dedup_table_quota)
+
+ # count the entries we have
+ log_must test $(ddt_entries) -ge 100
+
+ # attempt to add new entries
+ ddt_add_entry 101 200
+ log_must test $(ddt_entries) -eq 100
+ log_must test $cursize -eq $(dedup_table_size)
+
+ # remove all but one entry
+ for i in {2..100}; do
+ rm $MOUNTDIR/dedup-$i.txt
+ done
+ log_must sync_pool $POOL
+
+ log_must test $(ddt_entries) -eq 1
+ log_must test $cursize -gt $(dedup_table_size)
+ cursize=$(dedup_table_size)
+
+ log_must zpool set dedup_table_quota=none $POOL
+
+ # create more entries
+ zpool status -D $POOL
+ ddt_add_entry 101 200
+ log_must sync_pool $POOL
+
+ log_must test $(ddt_entries) -eq 101
+ log_must test $cursize -lt $(dedup_table_size)
+
+ do_clean
+}
+
+function ddt_dedup_vdev_limit
+{
+ do_setup
+
+ # add a dedicated dedup/special VDEV and enable an automatic quota
+ if (( RANDOM % 2 == 0 )) ; then
+ class="special"
+ else
+ class="dedup"
+ fi
+ log_must truncate -s 200M $VDEV_DEDUP
+ log_must zpool add $POOL $class $VDEV_DEDUP
+ log_must zpool set dedup_table_quota=auto $POOL
+
+ log_must zfs set recordsize=1K $POOL
+ log_must zfs set compression=zstd $POOL
+
+ # Generate a working set to fill up the dedup/special allocation class
+ log_must fio --directory=$MOUNTDIR --name=dedup-filler-1 \
+ --rw=read --bs=1m --numjobs=2 --iodepth=8 \
+ --size=512M --end_fsync=1 --ioengine=posixaio --runtime=1 \
+ --group_reporting --fallocate=none --output-format=terse \
+ --dedupe_percentage=0
+ log_must sync_pool $POOL
+
+ zpool status -D $POOL
+ zpool list -v $POOL
+ echo DDT size $(dedup_table_size), with $(ddt_entries) entries
+
+ #
+ # With no DDT quota in place, the above workload will produce over
+ # 800,000 entries by using space in the normal class. With a quota,
+ # it will be well below 500,000 entries.
+ #
+ log_must test $(ddt_entries) -le 500000
+
+ do_clean
+}
+
+log_onexit cleanup
+
+ddt_limit
+ddt_nolimit
+ddt_dedup_vdev_limit
+
+log_pass "DDT quota is enforced"
diff --git a/tests/zfs-tests/tests/functional/dedup/setup.ksh b/tests/zfs-tests/tests/functional/dedup/setup.ksh
new file mode 100755
index 0000000000..3c0830401f
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/dedup/setup.ksh
@@ -0,0 +1,31 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+DISK=${DISKS%% *}
+
+default_setup $DISK