ddt: dedup table quota enforcement

This adds two new pool properties:
- dedup_table_size, the total size of all DDTs on the pool; and
- dedup_table_quota, the maximum possible size of all DDTs in the pool

When set, quota will be enforced by checking when a new entry is about
to be created. If the pool is over its dedup quota, the entry won't be
created, and the corresponding write will be converted to a regular
non-dedup write. Note that existing entries can be updated (ie their
refcounts changed), as that reuses the space rather than requiring more.

dedup_table_quota can be set to 'auto', which will set it based on the
size of the devices backing the "dedup" allocation device. This makes it
possible to limit the DDTs to the size of a dedup vdev only, such that
when the device fills, no new blocks are deduplicated.

Sponsored-by: iXsystems, Inc.
Sponsored-by: Klara Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Co-authored-by: Rob Wing <rob.wing@klarasystems.com>
Co-authored-by: Sean Eric Fagan <sean.fagan@klarasystems.com>
Closes #15889
This commit is contained in:
Allan Jude 2024-07-25 12:47:36 -04:00 committed by GitHub
parent 82f281ad99
commit c7ada64bb6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
22 changed files with 599 additions and 22 deletions

View File

@ -8495,17 +8495,24 @@ print_time(hrtime_t t, char *timebuf)
} }
static nvlist_t * static nvlist_t *
make_random_props(void) make_random_pool_props(void)
{ {
nvlist_t *props; nvlist_t *props;
props = fnvlist_alloc(); props = fnvlist_alloc();
if (ztest_random(2) == 0) /* Twenty percent of the time enable ZPOOL_PROP_DEDUP_TABLE_QUOTA */
return (props); if (ztest_random(5) == 0) {
fnvlist_add_uint64(props,
zpool_prop_to_name(ZPOOL_PROP_DEDUP_TABLE_QUOTA),
2 * 1024 * 1024);
}
fnvlist_add_uint64(props, /* Fifty percent of the time enable ZPOOL_PROP_AUTOREPLACE */
zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); if (ztest_random(2) == 0) {
fnvlist_add_uint64(props,
zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1);
}
return (props); return (props);
} }
@ -8537,7 +8544,7 @@ ztest_init(ztest_shared_t *zs)
zs->zs_mirrors = ztest_opts.zo_mirrors; zs->zs_mirrors = ztest_opts.zo_mirrors;
nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1);
props = make_random_props(); props = make_random_pool_props();
/* /*
* We don't expect the pool to suspend unless maxfaults == 0, * We don't expect the pool to suspend unless maxfaults == 0,

View File

@ -151,7 +151,8 @@ enum ddt_phys_type {
*/ */
/* State flags for dde_flags */ /* State flags for dde_flags */
#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ #define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */
#define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */
typedef struct { typedef struct {
/* key must be first for ddt_key_compare */ /* key must be first for ddt_key_compare */
@ -170,6 +171,7 @@ typedef struct {
uint8_t dde_flags; /* load state flags */ uint8_t dde_flags; /* load state flags */
kcondvar_t dde_cv; /* signaled when load completes */ kcondvar_t dde_cv; /* signaled when load completes */
uint64_t dde_waiters; /* count of waiters on dde_cv */
avl_node_t dde_node; /* ddt_tree node */ avl_node_t dde_node; /* ddt_tree node */
} ddt_entry_t; } ddt_entry_t;
@ -228,6 +230,7 @@ extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
extern uint64_t ddt_get_ddt_dsize(spa_t *spa);
extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total);

View File

@ -258,6 +258,8 @@ typedef enum {
ZPOOL_PROP_BCLONEUSED, ZPOOL_PROP_BCLONEUSED,
ZPOOL_PROP_BCLONESAVED, ZPOOL_PROP_BCLONESAVED,
ZPOOL_PROP_BCLONERATIO, ZPOOL_PROP_BCLONERATIO,
ZPOOL_PROP_DEDUP_TABLE_SIZE,
ZPOOL_PROP_DEDUP_TABLE_QUOTA,
ZPOOL_NUM_PROPS ZPOOL_NUM_PROPS
} zpool_prop_t; } zpool_prop_t;

View File

@ -1051,6 +1051,7 @@ extern metaslab_class_t *spa_special_class(spa_t *spa);
extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_dedup_class(spa_t *spa);
extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size,
dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); dmu_object_type_t objtype, uint_t level, uint_t special_smallblk);
extern boolean_t spa_special_has_ddt(spa_t *spa);
extern void spa_evicting_os_register(spa_t *, objset_t *os); extern void spa_evicting_os_register(spa_t *, objset_t *os);
extern void spa_evicting_os_deregister(spa_t *, objset_t *os); extern void spa_evicting_os_deregister(spa_t *, objset_t *os);

View File

@ -465,6 +465,9 @@ struct spa {
boolean_t spa_waiters_cancel; /* waiters should return */ boolean_t spa_waiters_cancel; /* waiters should return */
char *spa_compatibility; /* compatibility file(s) */ char *spa_compatibility; /* compatibility file(s) */
uint64_t spa_dedup_table_quota; /* property DDT maximum size */
uint64_t spa_dedup_dsize; /* cached on-disk size of DDT */
uint64_t spa_dedup_class_full_txg; /* txg dedup class was full */
/* /*
* spa_refcount & spa_config_lock must be the last elements * spa_refcount & spa_config_lock must be the last elements

View File

@ -2921,7 +2921,9 @@
<enumerator name='ZPOOL_PROP_BCLONEUSED' value='33'/> <enumerator name='ZPOOL_PROP_BCLONEUSED' value='33'/>
<enumerator name='ZPOOL_PROP_BCLONESAVED' value='34'/> <enumerator name='ZPOOL_PROP_BCLONESAVED' value='34'/>
<enumerator name='ZPOOL_PROP_BCLONERATIO' value='35'/> <enumerator name='ZPOOL_PROP_BCLONERATIO' value='35'/>
<enumerator name='ZPOOL_NUM_PROPS' value='36'/> <enumerator name='ZPOOL_PROP_DEDUP_TABLE_SIZE' value='36'/>
<enumerator name='ZPOOL_PROP_DEDUP_TABLE_QUOTA' value='37'/>
<enumerator name='ZPOOL_NUM_PROPS' value='38'/>
</enum-decl> </enum-decl>
<typedef-decl name='zpool_prop_t' type-id='af1ba157' id='5d0c23fb'/> <typedef-decl name='zpool_prop_t' type-id='af1ba157' id='5d0c23fb'/>
<typedef-decl name='regoff_t' type-id='95e97e5e' id='54a2a2a8'/> <typedef-decl name='regoff_t' type-id='95e97e5e' id='54a2a2a8'/>

View File

@ -332,6 +332,24 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
intval = zpool_get_prop_int(zhp, prop, &src); intval = zpool_get_prop_int(zhp, prop, &src);
switch (prop) { switch (prop) {
case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
/*
* If dedup quota is 0, we translate this into 'none'
* (unless literal is set). And if it is UINT64_MAX
* we translate that as 'automatic' (limit to size of
the dedicated dedup VDEV). Otherwise, fall through
into the regular number formatting.
*/
if (intval == 0) {
(void) strlcpy(buf, literal ? "0" : "none",
len);
break;
} else if (intval == UINT64_MAX) {
(void) strlcpy(buf, "auto", len);
break;
}
zfs_fallthrough;
case ZPOOL_PROP_SIZE: case ZPOOL_PROP_SIZE:
case ZPOOL_PROP_ALLOCATED: case ZPOOL_PROP_ALLOCATED:
case ZPOOL_PROP_FREE: case ZPOOL_PROP_FREE:
@ -342,6 +360,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
case ZPOOL_PROP_MAXDNODESIZE: case ZPOOL_PROP_MAXDNODESIZE:
case ZPOOL_PROP_BCLONESAVED: case ZPOOL_PROP_BCLONESAVED:
case ZPOOL_PROP_BCLONEUSED: case ZPOOL_PROP_BCLONEUSED:
case ZPOOL_PROP_DEDUP_TABLE_SIZE:
if (literal) if (literal)
(void) snprintf(buf, len, "%llu", (void) snprintf(buf, len, "%llu",
(u_longlong_t)intval); (u_longlong_t)intval);

View File

@ -1691,6 +1691,16 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop,
"use 'none' to disable quota/refquota")); "use 'none' to disable quota/refquota"));
goto error; goto error;
} }
/*
* Pool dedup table quota; force use of 'none' instead of 0
*/
if ((type & ZFS_TYPE_POOL) && *ivalp == 0 &&
(!isnone && !isauto) &&
prop == ZPOOL_PROP_DEDUP_TABLE_QUOTA) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"use 'none' to disable ddt table quota"));
goto error;
}
/* /*
* Special handling for "*_limit=none". In this case it's not * Special handling for "*_limit=none". In this case it's not
@ -1732,6 +1742,10 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop,
} }
*ivalp = UINT64_MAX; *ivalp = UINT64_MAX;
break; break;
case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
ASSERT(type & ZFS_TYPE_POOL);
*ivalp = UINT64_MAX;
break;
default: default:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'auto' is invalid value for '%s'"), "'auto' is invalid value for '%s'"),

View File

@ -28,7 +28,7 @@
.\" Copyright (c) 2021, Colm Buckley <colm@tuatha.org> .\" Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
.\" Copyright (c) 2023, Klara Inc. .\" Copyright (c) 2023, Klara Inc.
.\" .\"
.Dd January 2, 2024 .Dd January 14, 2024
.Dt ZPOOLPROPS 7 .Dt ZPOOLPROPS 7
.Os .Os
. .
@ -73,6 +73,8 @@ The amount of storage used by cloned blocks.
Percentage of pool space used. Percentage of pool space used.
This property can also be referred to by its shortened column name, This property can also be referred to by its shortened column name,
.Sy cap . .Sy cap .
.It Sy dedup_table_size
Total on-disk size of the deduplication table.
.It Sy expandsize .It Sy expandsize
Amount of uninitialized space within the pool or device that can be used to Amount of uninitialized space within the pool or device that can be used to
increase the total capacity of the pool. increase the total capacity of the pool.
@ -348,6 +350,27 @@ See
and and
.Xr zpool-upgrade 8 .Xr zpool-upgrade 8
for more information on the operation of compatibility feature sets. for more information on the operation of compatibility feature sets.
.It Sy dedup_table_quota Ns = Ns Ar number Ns | Ns Sy none Ns | Ns Sy auto
This property sets a limit on the on-disk size of the pool's dedup table.
Entries will not be added to the dedup table once this size is reached;
if a dedup table already exists and is larger than this size, its entries
will not be removed as part of setting this property.
Existing entries will still have their reference counts updated.
.Pp
The actual size limit of the table may be above or below the quota,
depending on the actual on-disk size of the entries (which may be
approximated for purposes of calculating the quota).
That is, setting a quota size of 1M may result in the maximum size being
slightly below, or slightly above, that value.
Set to
.Sy none
to disable.
In automatic mode, which is the default, the size of a dedicated dedup vdev
is used as the quota limit.
.Pp
The
.Sy dedup_table_quota
property works for both legacy and fast dedup tables.
.It Sy dedupditto Ns = Ns Ar number .It Sy dedupditto Ns = Ns Ar number
This property is deprecated and no longer has any effect. This property is deprecated and no longer has any effect.
.It Sy delegation Ns = Ns Sy on Ns | Ns Sy off .It Sy delegation Ns = Ns Sy on Ns | Ns Sy off

View File

@ -23,7 +23,7 @@
* Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org> * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
* Copyright (c) 2021, Klara Inc. * Copyright (c) 2021, 2023, Klara Inc.
*/ */
#include <sys/zio.h> #include <sys/zio.h>
@ -125,6 +125,9 @@ zpool_prop_init(void)
zprop_register_number(ZPOOL_PROP_BCLONERATIO, "bcloneratio", 0, zprop_register_number(ZPOOL_PROP_BCLONERATIO, "bcloneratio", 0,
PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if cloned>", PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if cloned>",
"BCLONE_RATIO", B_FALSE, sfeatures); "BCLONE_RATIO", B_FALSE, sfeatures);
zprop_register_number(ZPOOL_PROP_DEDUP_TABLE_SIZE, "dedup_table_size",
0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "DDTSIZE", B_FALSE,
sfeatures);
/* default number properties */ /* default number properties */
zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
@ -133,6 +136,9 @@ zpool_prop_init(void)
zprop_register_number(ZPOOL_PROP_ASHIFT, "ashift", 0, PROP_DEFAULT, zprop_register_number(ZPOOL_PROP_ASHIFT, "ashift", 0, PROP_DEFAULT,
ZFS_TYPE_POOL, "<ashift, 9-16, or 0=default>", "ASHIFT", B_FALSE, ZFS_TYPE_POOL, "<ashift, 9-16, or 0=default>", "ASHIFT", B_FALSE,
sfeatures); sfeatures);
zprop_register_number(ZPOOL_PROP_DEDUP_TABLE_QUOTA, "dedup_table_quota",
UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_POOL, "<size>", "DDTQUOTA",
B_FALSE, sfeatures);
/* default index (boolean) properties */ /* default index (boolean) properties */
zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1,

View File

@ -101,6 +101,22 @@
* object and (if necessary), removed from an old one. ddt_tree is cleared and * object and (if necessary), removed from an old one. ddt_tree is cleared and
* the next txg can start. * the next txg can start.
* *
* ## Dedup quota
*
* A maximum size for all DDTs on the pool can be set with the
* dedup_table_quota property. This is determined in ddt_over_quota() and
* enforced during ddt_lookup(). If the pool is at or over its quota limit,
* ddt_lookup() will only return entries for existing blocks, as updates are
* still possible. New entries will not be created; instead, ddt_lookup() will
* return NULL. In response, the DDT write stage (zio_ddt_write()) will remove
* the D bit on the block and reissue the IO as a regular write. The block will
* not be deduplicated.
*
* Note that this is based on the on-disk size of the dedup store. Reclaiming
* this space after deleting entries relies on the ZAP "shrinking" behaviour,
* without which, no space would be recovered and the DDT would continue to be
* considered "over quota". See zap_shrink_enabled.
*
* ## Repair IO * ## Repair IO
* *
* If a read on a dedup block fails, but there are other copies of the block in * If a read on a dedup block fails, but there are other copies of the block in
@ -152,6 +168,13 @@ static kmem_cache_t *ddt_entry_cache;
*/ */
int zfs_dedup_prefetch = 0; int zfs_dedup_prefetch = 0;
/*
* If the dedup class cannot satisfy a DDT allocation, treat as over quota
* for this many TXGs.
*/
uint_t dedup_class_wait_txgs = 5;
static const ddt_ops_t *const ddt_ops[DDT_TYPES] = { static const ddt_ops_t *const ddt_ops[DDT_TYPES] = {
&ddt_zap_ops, &ddt_zap_ops,
}; };
@ -554,8 +577,6 @@ ddt_alloc(const ddt_key_t *ddk)
static void static void
ddt_free(ddt_entry_t *dde) ddt_free(ddt_entry_t *dde)
{ {
ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
for (int p = 0; p < DDT_PHYS_TYPES; p++) for (int p = 0; p < DDT_PHYS_TYPES; p++)
ASSERT3P(dde->dde_lead_zio[p], ==, NULL); ASSERT3P(dde->dde_lead_zio[p], ==, NULL);
@ -575,9 +596,66 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
ddt_free(dde); ddt_free(dde);
} }
/*
 * Decide whether the given allocation class should be treated as full for
 * DDT purposes. Returns B_FALSE when the class is absent or has no space
 * configured; otherwise the class is over quota while we are still inside
 * the back-off window after a spill, or once usage crosses the size limit.
 */
static boolean_t
ddt_special_over_quota(spa_t *spa, metaslab_class_t *mc)
{
	/* No class, or a class with no capacity: nothing to limit */
	if (mc == NULL || metaslab_class_get_space(mc) == 0)
		return (B_FALSE);

	/*
	 * A recent DDT allocation spilled out of this class; keep
	 * reporting "full" until deferred frees have had a few txgs
	 * to be processed.
	 */
	if (spa_syncing_txg(spa) <=
	    spa->spa_dedup_class_full_txg + dedup_class_wait_txgs)
		return (B_TRUE);

	/*
	 * Consider the class over quota at 85% full, or for larger
	 * devices, when less than 8GB remains free.
	 */
	uint64_t space = metaslab_class_get_space(mc);
	uint64_t floor8g = (space > (1LL << 33)) ? space - (1LL << 33) : 0;
	uint64_t threshold = MAX(space * 85 / 100, floor8g);

	return (metaslab_class_get_alloc(mc) >= threshold);
}
/*
 * Check if the DDT is over its quota. This can be due to a few conditions:
 *   1. 'dedup_table_quota' property is not 0 (none) and the dedup dsize
 *      exceeds this limit
 *
 *   2. 'dedup_table_quota' property is set to automatic and
 *      a. the dedup or special allocation class could not satisfy a DDT
 *         allocation in a recent transaction
 *      b. the dedup or special allocation class has exceeded its 85% limit
 */
static boolean_t
ddt_over_quota(spa_t *spa)
{
	uint64_t quota = spa->spa_dedup_table_quota;

	/* A quota of 0 means "none": never over quota */
	if (quota == 0)
		return (B_FALSE);

	/* An explicit byte limit: compare against the on-disk DDT size */
	if (quota != UINT64_MAX)
		return (ddt_get_ddt_dsize(spa) > quota);

	/*
	 * Automatic quota: the table is limited by the capacity of the
	 * dedup class, or of the special class when DDTs live there.
	 */
	if (ddt_special_over_quota(spa, spa_dedup_class(spa)))
		return (B_TRUE);

	if (spa_special_has_ddt(spa) &&
	    ddt_special_over_quota(spa, spa_special_class(spa)))
		return (B_TRUE);

	return (B_FALSE);
}
ddt_entry_t * ddt_entry_t *
ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
{ {
spa_t *spa = ddt->ddt_spa;
ddt_key_t search; ddt_key_t search;
ddt_entry_t *dde; ddt_entry_t *dde;
ddt_type_t type; ddt_type_t type;
@ -592,13 +670,28 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
/* Find an existing live entry */ /* Find an existing live entry */
dde = avl_find(&ddt->ddt_tree, &search, &where); dde = avl_find(&ddt->ddt_tree, &search, &where);
if (dde != NULL) { if (dde != NULL) {
/* Found it. If it's already loaded, we can just return it. */ /* If we went over quota, act like we didn't find it */
if (dde->dde_flags & DDE_FLAG_OVERQUOTA)
return (NULL);
/* If it's already loaded, we can just return it. */
if (dde->dde_flags & DDE_FLAG_LOADED) if (dde->dde_flags & DDE_FLAG_LOADED)
return (dde); return (dde);
/* Someone else is loading it, wait for it. */ /* Someone else is loading it, wait for it. */
dde->dde_waiters++;
while (!(dde->dde_flags & DDE_FLAG_LOADED)) while (!(dde->dde_flags & DDE_FLAG_LOADED))
cv_wait(&dde->dde_cv, &ddt->ddt_lock); cv_wait(&dde->dde_cv, &ddt->ddt_lock);
dde->dde_waiters--;
/* Loaded but over quota, forget we were ever here */
if (dde->dde_flags & DDE_FLAG_OVERQUOTA) {
if (dde->dde_waiters == 0) {
avl_remove(&ddt->ddt_tree, dde);
ddt_free(dde);
}
return (NULL);
}
return (dde); return (dde);
} }
@ -639,14 +732,27 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
dde->dde_type = type; /* will be DDT_TYPES if no entry found */ dde->dde_type = type; /* will be DDT_TYPES if no entry found */
dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
if (error == 0) if (dde->dde_type == DDT_TYPES &&
dde->dde_class == DDT_CLASSES &&
ddt_over_quota(spa)) {
/* Over quota. If no one is waiting, clean up right now. */
if (dde->dde_waiters == 0) {
avl_remove(&ddt->ddt_tree, dde);
ddt_free(dde);
return (NULL);
}
/* Flag cleanup required */
dde->dde_flags |= DDE_FLAG_OVERQUOTA;
} else if (error == 0) {
ddt_stat_update(ddt, dde, -1ULL); ddt_stat_update(ddt, dde, -1ULL);
}
/* Entry loaded, everyone can proceed now */ /* Entry loaded, everyone can proceed now */
dde->dde_flags |= DDE_FLAG_LOADED; dde->dde_flags |= DDE_FLAG_LOADED;
cv_broadcast(&dde->dde_cv); cv_broadcast(&dde->dde_cv);
return (dde); return (dde->dde_flags & DDE_FLAG_OVERQUOTA ? NULL : dde);
} }
void void
@ -775,6 +881,7 @@ ddt_load(spa_t *spa)
memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
sizeof (ddt->ddt_histogram)); sizeof (ddt->ddt_histogram));
spa->spa_dedup_dspace = ~0ULL; spa->spa_dedup_dspace = ~0ULL;
spa->spa_dedup_dsize = ~0ULL;
} }
return (0); return (0);
@ -1032,6 +1139,7 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
sizeof (ddt->ddt_histogram)); sizeof (ddt->ddt_histogram));
spa->spa_dedup_dspace = ~0ULL; spa->spa_dedup_dspace = ~0ULL;
spa->spa_dedup_dsize = ~0ULL;
} }
void void
@ -1123,7 +1231,13 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
ddt_enter(ddt); ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_TRUE); dde = ddt_lookup(ddt, bp, B_TRUE);
ASSERT3P(dde, !=, NULL);
/* Can be NULL if the entry for this block was pruned. */
if (dde == NULL) {
ddt_exit(ddt);
spa_config_exit(spa, SCL_ZIO, FTAG);
return (B_FALSE);
}
if (dde->dde_type < DDT_TYPES) { if (dde->dde_type < DDT_TYPES) {
ddt_phys_t *ddp; ddt_phys_t *ddp;

View File

@ -129,7 +129,8 @@ ddt_histogram_empty(const ddt_histogram_t *ddh)
void void
ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
{ {
/* Sum the statistics we cached in ddt_object_sync(). */ memset(ddo_total, 0, sizeof (*ddo_total));
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c]; ddt_t *ddt = spa->spa_ddt[c];
if (!ddt) if (!ddt)
@ -138,8 +139,32 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
for (ddt_class_t class = 0; class < DDT_CLASSES; for (ddt_class_t class = 0; class < DDT_CLASSES;
class++) { class++) {
dmu_object_info_t doi;
uint64_t cnt;
int err;
/*
* These stats were originally calculated
* during ddt_object_load().
*/
err = ddt_object_info(ddt, type, class, &doi);
if (err != 0)
continue;
err = ddt_object_count(ddt, type, class, &cnt);
if (err != 0)
continue;
ddt_object_t *ddo = ddt_object_t *ddo =
&ddt->ddt_object_stats[type][class]; &ddt->ddt_object_stats[type][class];
ddo->ddo_count = cnt;
ddo->ddo_dspace =
doi.doi_physical_blocks_512 << 9;
ddo->ddo_mspace = doi.doi_fill_count *
doi.doi_data_block_size;
ddo_total->ddo_count += ddo->ddo_count; ddo_total->ddo_count += ddo->ddo_count;
ddo_total->ddo_dspace += ddo->ddo_dspace; ddo_total->ddo_dspace += ddo->ddo_dspace;
ddo_total->ddo_mspace += ddo->ddo_mspace; ddo_total->ddo_mspace += ddo->ddo_mspace;
@ -147,11 +172,24 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
} }
} }
/* ... and compute the averages. */ /*
if (ddo_total->ddo_count != 0) { * This returns raw counts (not averages). One of the consumers,
ddo_total->ddo_dspace /= ddo_total->ddo_count; * print_dedup_stats(), historically has expected raw counts.
ddo_total->ddo_mspace /= ddo_total->ddo_count; */
}
spa->spa_dedup_dsize = ddo_total->ddo_dspace;
}
/*
 * Return the cached total on-disk size of all DDTs in the pool,
 * recomputing it first if the cache has been invalidated.
 */
uint64_t
ddt_get_ddt_dsize(spa_t *spa)
{
	ddt_object_t ddo_total;
	/*
	 * spa_dedup_dsize is reset to ~0ULL after each txg sync;
	 * ddt_get_dedup_object_stats() refreshes it as a side effect
	 * (the ddo_total it fills in is discarded here).
	 */
	if (spa->spa_dedup_dsize == ~0ULL)
		ddt_get_dedup_object_stats(spa, &ddo_total);
	return (spa->spa_dedup_dsize);
}
void void

View File

@ -406,6 +406,9 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL, spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL,
brt_get_ratio(spa), src); brt_get_ratio(spa), src);
spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL,
ddt_get_ddt_dsize(spa), src);
spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
rvd->vdev_state, src); rvd->vdev_state, src);
@ -672,6 +675,10 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
error = SET_ERROR(EINVAL); error = SET_ERROR(EINVAL);
break; break;
case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
error = nvpair_value_uint64(elem, &intval);
break;
case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_DELEGATION:
case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_AUTOREPLACE:
case ZPOOL_PROP_LISTSNAPS: case ZPOOL_PROP_LISTSNAPS:
@ -4732,6 +4739,8 @@ spa_ld_get_props(spa_t *spa)
spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
spa_prop_find(spa, ZPOOL_PROP_DEDUP_TABLE_QUOTA,
&spa->spa_dedup_table_quota);
spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
spa->spa_autoreplace = (autoreplace != 0); spa->spa_autoreplace = (autoreplace != 0);
@ -6588,6 +6597,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
spa->spa_dedup_table_quota =
zpool_prop_default_numeric(ZPOOL_PROP_DEDUP_TABLE_QUOTA);
if (props != NULL) { if (props != NULL) {
spa_configfile_set(spa, props, B_FALSE); spa_configfile_set(spa, props, B_FALSE);
@ -9631,6 +9642,9 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
case ZPOOL_PROP_MULTIHOST: case ZPOOL_PROP_MULTIHOST:
spa->spa_multihost = intval; spa->spa_multihost = intval;
break; break;
case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
spa->spa_dedup_table_quota = intval;
break;
default: default:
break; break;
} }

View File

@ -1996,6 +1996,13 @@ spa_dedup_class(spa_t *spa)
return (spa->spa_dedup_class); return (spa->spa_dedup_class);
} }
/*
 * Report whether DDT data may be stored in the special allocation class:
 * the zfs_ddt_data_is_special tunable must allow it, and the special
 * class must actually have vdevs (metaslab groups) assigned.
 */
boolean_t
spa_special_has_ddt(spa_t *spa)
{
	if (!zfs_ddt_data_is_special)
		return (B_FALSE);
	return (spa->spa_special_class->mc_groups != 0);
}
/* /*
* Locate an appropriate allocation class * Locate an appropriate allocation class
*/ */

View File

@ -3503,6 +3503,15 @@ zio_ddt_write(zio_t *zio)
ddt_enter(ddt); ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_TRUE); dde = ddt_lookup(ddt, bp, B_TRUE);
if (dde == NULL) {
/* DDT size is over its quota so no new entries */
zp->zp_dedup = B_FALSE;
BP_SET_DEDUP(bp, B_FALSE);
if (zio->io_bp_override == NULL)
zio->io_pipeline = ZIO_WRITE_PIPELINE;
ddt_exit(ddt);
return (zio);
}
ddp = &dde->dde_phys[p]; ddp = &dde->dde_phys[p];
if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
@ -3727,6 +3736,26 @@ zio_dva_allocate(zio_t *zio)
* Fallback to normal class when an alloc class is full * Fallback to normal class when an alloc class is full
*/ */
if (error == ENOSPC && mc != spa_normal_class(spa)) { if (error == ENOSPC && mc != spa_normal_class(spa)) {
/*
* When the dedup or special class is spilling into the normal
* class, there can still be significant space available due
* to deferred frees that are in-flight. We track the txg when
* this occurred and back off adding new DDT entries for a few
* txgs to allow the free blocks to be processed.
*/
if ((mc == spa_dedup_class(spa) || (spa_special_has_ddt(spa) &&
mc == spa_special_class(spa))) &&
spa->spa_dedup_class_full_txg != zio->io_txg) {
spa->spa_dedup_class_full_txg = zio->io_txg;
zfs_dbgmsg("%s[%d]: %s class spilling, req size %d, "
"%llu allocated of %llu",
spa_name(spa), (int)zio->io_txg,
mc == spa_dedup_class(spa) ? "dedup" : "special",
(int)zio->io_size,
(u_longlong_t)metaslab_class_get_alloc(mc),
(u_longlong_t)metaslab_class_get_space(mc));
}
/* /*
* If throttling, transfer reservation over to normal class. * If throttling, transfer reservation over to normal class.
* The io_allocator slot can remain the same even though we * The io_allocator slot can remain the same even though we

View File

@ -662,6 +662,12 @@ pre =
post = post =
tags = ['functional', 'deadman'] tags = ['functional', 'deadman']
[tests/functional/dedup]
tests = ['dedup_quota']
pre =
post =
tags = ['functional', 'dedup']
[tests/functional/delegate] [tests/functional/delegate]
tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos',
'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos', 'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos',

View File

@ -28,6 +28,7 @@ CONDENSE_INDIRECT_COMMIT_ENTRY_DELAY_MS condense.indirect_commit_entry_delay_ms
CONDENSE_INDIRECT_OBSOLETE_PCT condense.indirect_obsolete_pct zfs_condense_indirect_obsolete_pct CONDENSE_INDIRECT_OBSOLETE_PCT condense.indirect_obsolete_pct zfs_condense_indirect_obsolete_pct
CONDENSE_MIN_MAPPING_BYTES condense.min_mapping_bytes zfs_condense_min_mapping_bytes CONDENSE_MIN_MAPPING_BYTES condense.min_mapping_bytes zfs_condense_min_mapping_bytes
DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift
DDT_ZAP_DEFAULT_BS ddt_zap_default_bs ddt_zap_default_bs
DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms
DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second
DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode

View File

@ -1415,6 +1415,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/deadman/deadman_ratelimit.ksh \ functional/deadman/deadman_ratelimit.ksh \
functional/deadman/deadman_sync.ksh \ functional/deadman/deadman_sync.ksh \
functional/deadman/deadman_zio.ksh \ functional/deadman/deadman_zio.ksh \
functional/dedup/cleanup.ksh \
functional/dedup/setup.ksh \
functional/dedup/dedup_quota.ksh \
functional/delegate/cleanup.ksh \ functional/delegate/cleanup.ksh \
functional/delegate/setup.ksh \ functional/delegate/setup.ksh \
functional/delegate/zfs_allow_001_pos.ksh \ functional/delegate/zfs_allow_001_pos.ksh \

View File

@ -47,6 +47,8 @@ typeset -a properties=(
"listsnapshots" "listsnapshots"
"autoexpand" "autoexpand"
"dedupratio" "dedupratio"
"dedup_table_quota"
"dedup_table_size"
"free" "free"
"allocated" "allocated"
"readonly" "readonly"

View File

@ -0,0 +1,29 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
default_cleanup

View File

@ -0,0 +1,223 @@
#!/bin/ksh -p
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Klara Inc.
#
# DESCRIPTION:
# Verify that new entries are not added to the DDT when dedup_table_quota has
# been exceeded.
#
# STRATEGY:
# 1. Create a pool with dedup=on
# 2. Set threshold for on-disk DDT via dedup_table_quota
# 3. Verify the threshold is exceeded after zpool sync
# 4. Verify no new entries are added after subsequent sync's
# 5. Remove all but one entry from DDT
# 6. Verify new entries are added to DDT
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/events/events_common.kshlib
verify_runnable "both"
log_assert "DDT quota is enforced"
# Per-test scratch locations; $$ keeps backing files unique per run.
MOUNTDIR="$TEST_BASE_DIR/dedup_mount"
FILEPATH="$MOUNTDIR/dedup_file"
VDEV_GENERAL="$TEST_BASE_DIR/vdevfile.general.$$"
VDEV_DEDUP="$TEST_BASE_DIR/vdevfile.dedup.$$"
POOL="dedup_pool"
# Saved so cleanup() can restore it; do_setup() raises it to 600s so that
# txg syncs happen only when the test calls sync_pool explicitly.
save_tunable TXG_TIMEOUT
function cleanup
{
	# Final teardown (log_onexit): drop the pool if a test left it
	# behind, remove backing files and mountpoint, and restore the
	# saved TXG timeout tunable.
	poolexists $POOL && destroy_pool $POOL
	log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR
	log_must restore_tunable TXG_TIMEOUT
}
# Per-test teardown: destroy the pool and remove its backing files and
# mountpoint. Unlike cleanup(), the pool is expected to exist here.
function do_clean
{
log_must destroy_pool $POOL
log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR
}
# Per-test setup: create a fresh dedup=on pool on a 5G file vdev and
# suppress automatic txg syncs so DDT growth happens only on sync_pool.
function do_setup
{
log_must truncate -s 5G $VDEV_GENERAL
# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting
log_must zpool create -o ashift=12 -f -O xattr=sa -m $MOUNTDIR $POOL $VDEV_GENERAL
log_must zfs set dedup=on $POOL
# 600s timeout: txgs effectively sync only when the test asks.
log_must set_tunable32 TXG_TIMEOUT 600
}
# Print the pool's current dedup_table_size property (on-disk DDT bytes).
function dedup_table_size
{
get_pool_prop dedup_table_size $POOL
}
# Print the pool's current dedup_table_quota property (max DDT bytes).
function dedup_table_quota
{
get_pool_prop dedup_table_quota $POOL
}
# Print the number of on-disk DDT entries reported by 'zpool status -D'.
function ddt_entries
{
	typeset -i n
	n=$(zpool status -D $POOL | awk '/dedup: DDT entries/ {print $4}')
	echo $n
}
# Create one small unique file per index in the inclusive range spanned by
# $1 and $2 (in either order), then force a txg sync so any new DDT entries
# land on disk. Each file's content is its index, so every file is a unique
# dedup candidate.
#
# $1 - one end of the index range
# $2 - other end of the index range (defaults to 1)
#
# Note: the original used 'for i in {$offset..$count}', which depends on
# the shell brace-expanding variables (bash does not) and silently iterated
# downward for calls like 'ddt_add_entry 101 5000'. The arithmetic loop
# below creates the identical file set regardless of argument order.
function ddt_add_entry
{
	typeset -i a=$1
	typeset -i b=${2:-1}
	typeset -i lo hi i

	if (( a <= b )); then
		lo=$a hi=$b
	else
		lo=$b hi=$a
	fi

	for (( i = lo; i <= hi; i++ )); do
		echo "$i" > $MOUNTDIR/dedup-$i.txt
	done
	log_must sync_pool $POOL

	log_note range $lo - $hi
	log_note ddt_add_entry got $(ddt_entries)
}
# Baseline (no quota): create 6000 DDT entries over four syncs and verify
# they are all admitted when dedup_table_quota is unset.
function ddt_nolimit
{
do_setup
log_note base ddt entries is $(ddt_entries)
# Arguments are the two ends of an index range (see ddt_add_entry), so
# these four calls create files 1, 2-100, 101-5000, and 5001-6000.
ddt_add_entry 1
ddt_add_entry 100
ddt_add_entry 101 5000
ddt_add_entry 5001 6000
log_must test $(ddt_entries) -eq 6000
do_clean
}
# Core quota test: set a tiny explicit dedup_table_quota, confirm the first
# sync may overshoot it, then confirm no further entries are admitted until
# space is freed or the quota is lifted.
function ddt_limit
{
do_setup
log_note base ddt entries is $(ddt_entries)
# 32 KiB quota: small enough that ~100 entries exceed it in one txg.
log_must zpool set dedup_table_quota=32768 $POOL
ddt_add_entry 100
# it's possible to exceed dedup_table_quota over a single transaction,
# ensure that the threshold has been exceeded
cursize=$(dedup_table_size)
log_must test $cursize -gt $(dedup_table_quota)
# count the entries we have
log_must test $(ddt_entries) -ge 100
# attempt to add new entries
# (files 101-200; over quota, so none should become DDT entries)
ddt_add_entry 101 200
log_must test $(ddt_entries) -eq 100
log_must test $cursize -eq $(dedup_table_size)
# remove all but one entry
for i in {2..100}; do
rm $MOUNTDIR/dedup-$i.txt
done
log_must sync_pool $POOL
log_must test $(ddt_entries) -eq 1
# Table must have shrunk after the removals.
log_must test $cursize -gt $(dedup_table_size)
cursize=$(dedup_table_size)
# Lift the quota; new entries must be admitted again.
log_must zpool set dedup_table_quota=none $POOL
# create more entries
zpool status -D $POOL
ddt_add_entry 101 200
log_must sync_pool $POOL
# 1 surviving entry + 100 new ones (files 101-200).
log_must test $(ddt_entries) -eq 101
log_must test $cursize -lt $(dedup_table_size)
do_clean
}
# 'auto' quota test: with a dedicated dedup or special vdev and
# dedup_table_quota=auto, DDT growth must be capped by the size of that
# vdev (200M here) rather than spilling into the normal class.
function ddt_dedup_vdev_limit
{
do_setup
# add a dedicated dedup/special VDEV and enable an automatic quota
# (randomly pick the class; 'auto' should cap against either)
if (( RANDOM % 2 == 0 )) ; then
class="special"
else
class="dedup"
fi
log_must truncate -s 200M $VDEV_DEDUP
log_must zpool add $POOL $class $VDEV_DEDUP
log_must zpool set dedup_table_quota=auto $POOL
# Small records maximize the entry count per byte written.
log_must zfs set recordsize=1K $POOL
log_must zfs set compression=zstd $POOL
# Generate a working set to fill up the dedup/special allocation class
# NOTE(review): --rw=read with --fallocate=none relies on fio laying out
# (writing) the nonexistent target files first — confirm if fio defaults
# change.
log_must fio --directory=$MOUNTDIR --name=dedup-filler-1 \
--rw=read --bs=1m --numjobs=2 --iodepth=8 \
--size=512M --end_fsync=1 --ioengine=posixaio --runtime=1 \
--group_reporting --fallocate=none --output-format=terse \
--dedupe_percentage=0
log_must sync_pool $POOL
zpool status -D $POOL
zpool list -v $POOL
echo DDT size $(dedup_table_size), with $(ddt_entries) entries
#
# With no DDT quota in place, the above workload will produce over
# 800,000 entries by using space in the normal class. With a quota,
# it will be well below 500,000 entries.
#
log_must test $(ddt_entries) -le 500000
do_clean
}
# Register final teardown, then run the three scenarios; each one builds
# and destroys its own pool via do_setup/do_clean.
log_onexit cleanup
ddt_limit
ddt_nolimit
ddt_dedup_vdev_limit
log_pass "DDT quota is enforced"

View File

@ -0,0 +1,31 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
# Build the default test pool on the first configured disk; torn down by
# this directory's cleanup.ksh.
DISK=${DISKS%% *}
default_setup $DISK