commit cfa06493fa
Author: Umer Saleem <usaleem@ixsystems.com>
Date:   2024-09-03 19:09:32 +05:00

Merge branch 'openzfs/master' into NAS-130821-2

Signed-off-by: Umer Saleem <usaleem@ixsystems.com>

112 changed files with 6355 additions and 1567 deletions
META

@@ -6,5 +6,5 @@ Release: 1
Release-Tags: relext
License: CDDL
Author: OpenZFS
-Linux-Maximum: 6.9
+Linux-Maximum: 6.10
Linux-Minimum: 3.10


@@ -24,7 +24,7 @@ zfs_ids_to_path_LDADD = \
	libzfs.la
-zhack_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+zhack_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
sbin_PROGRAMS += zhack
CPPCHECKTARGETS += zhack
@@ -39,7 +39,7 @@ zhack_LDADD = \
ztest_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS)
-ztest_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+ztest_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
sbin_PROGRAMS += ztest
CPPCHECKTARGETS += ztest


@@ -269,8 +269,7 @@ main(int argc, char **argv)
		return (MOUNT_USAGE);
	}
-	if (!zfsutil || sloppy ||
-	    libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
+	if (sloppy || libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
		zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt);
	}
@@ -337,7 +336,7 @@ main(int argc, char **argv)
	    dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt);
	if (!fake) {
-		if (zfsutil && !sloppy &&
+		if (!remount && !sloppy &&
		    !libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
			error = zfs_mount_at(zhp, mntopts, mntflags, mntpoint);
			if (error) {


@@ -1,5 +1,5 @@
raidz_test_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS)
-raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
bin_PROGRAMS += raidz_test
CPPCHECKTARGETS += raidz_test


@@ -1,4 +1,4 @@
-zdb_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+zdb_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
zdb_CFLAGS = $(AM_CFLAGS) $(LIBCRYPTO_CFLAGS)
sbin_PROGRAMS += zdb


@@ -33,7 +33,7 @@
 * under sponsorship from the FreeBSD Foundation.
 * Copyright (c) 2021 Allan Jude
 * Copyright (c) 2021 Toomas Soome <tsoome@me.com>
- * Copyright (c) 2023, Klara Inc.
+ * Copyright (c) 2023, 2024, Klara Inc.
 * Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
 */
@@ -1914,23 +1914,25 @@ dump_log_spacemaps(spa_t *spa)
}
static void
-dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
+    uint64_t index)
{
-	const ddt_phys_t *ddp = dde->dde_phys;
-	const ddt_key_t *ddk = &dde->dde_key;
-	const char *types[4] = { "ditto", "single", "double", "triple" };
+	const ddt_key_t *ddk = &ddlwe->ddlwe_key;
	char blkbuf[BP_SPRINTF_LEN];
	blkptr_t blk;
	int p;
-	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
-		if (ddp->ddp_phys_birth == 0)
+	for (p = 0; p < DDT_NPHYS(ddt); p++) {
+		const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
+		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+		if (ddt_phys_birth(ddp, v) == 0)
			continue;
-		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
-		(void) printf("index %llx refcnt %llu %s %s\n",
-		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
-		    types[p], blkbuf);
+		(void) printf("index %llx refcnt %llu phys %d %s\n",
+		    (u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v),
+		    p, blkbuf);
	}
}
@@ -1956,11 +1958,37 @@ dump_dedup_ratio(const ddt_stat_t *dds)
	    dedup, compress, copies, dedup * compress / copies);
}
static void
dump_ddt_log(ddt_t *ddt)
{
for (int n = 0; n < 2; n++) {
ddt_log_t *ddl = &ddt->ddt_log[n];
uint64_t count = avl_numnodes(&ddl->ddl_tree);
if (count == 0)
continue;
printf(DMU_POOL_DDT_LOG ": %lu log entries\n",
zio_checksum_table[ddt->ddt_checksum].ci_name, n, count);
if (dump_opt['D'] < 4)
continue;
ddt_lightweight_entry_t ddlwe;
uint64_t index = 0;
for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) {
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
dump_ddt_entry(ddt, &ddlwe, index++);
}
}
}
static void
dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
{
	char name[DDT_NAMELEN];
-	ddt_entry_t dde;
+	ddt_lightweight_entry_t ddlwe;
	uint64_t walk = 0;
	dmu_object_info_t doi;
	uint64_t count, dspace, mspace;
@@ -2001,8 +2029,8 @@ dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
	(void) printf("%s contents:\n\n", name);
-	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
-		dump_dde(ddt, &dde, walk);
+	while ((error = ddt_object_walk(ddt, type, class, &walk, &ddlwe)) == 0)
+		dump_ddt_entry(ddt, &ddlwe, walk);
	ASSERT3U(error, ==, ENOENT);
@@ -2025,6 +2053,7 @@ dump_all_ddts(spa_t *spa)
				dump_ddt(ddt, type, class);
			}
		}
+		dump_ddt_log(ddt);
	}
	ddt_get_dedup_stats(spa, &dds_total);
@@ -3287,9 +3316,45 @@ fuid_table_destroy(void)
	}
}
/*
* Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on
* a live pool are normally cleaned up during ddt_sync(). We can't do that (and
* wouldn't want to anyway), but if we don't clean up the presence of stuff on
* ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves.
*
* Note that this is not a particularly efficient way to do this, but
* ddt_remove() is the only public method that can do the work we need, and it
* requires the right locks and etc to do the job. This is only ever called
* during zdb shutdown so efficiency is not especially important.
*/
static void
zdb_ddt_cleanup(spa_t *spa)
{
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (!ddt)
continue;
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
ddt_enter(ddt);
ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next;
while (dde) {
next = AVL_NEXT(&ddt->ddt_tree, dde);
dde->dde_io = NULL;
ddt_remove(ddt, dde);
dde = next;
}
ddt_exit(ddt);
spa_config_exit(spa, SCL_CONFIG, FTAG);
}
}
static void
zdb_exit(int reason)
{
+	if (spa != NULL)
+		zdb_ddt_cleanup(spa);
	if (os != NULL) {
		close_objset(os, FTAG);
	} else if (spa != NULL) {
@@ -4592,7 +4657,6 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
	l2arc_log_blk_phys_t this_lb;
	uint64_t asize;
	l2arc_log_blkptr_t lbps[2];
-	abd_t *abd;
	zio_cksum_t cksum;
	int failed = 0;
	l2arc_dev_t dev;
@@ -4646,20 +4710,25 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
	switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
	case ZIO_COMPRESS_OFF:
		break;
-	default:
-		abd = abd_alloc_for_io(asize, B_TRUE);
+	default: {
+		abd_t *abd = abd_alloc_linear(asize, B_TRUE);
		abd_copy_from_buf_off(abd, &this_lb, 0, asize);
-		if (zio_decompress_data(L2BLK_GET_COMPRESS(
-		    (&lbps[0])->lbp_prop), abd, &this_lb,
-		    asize, sizeof (this_lb), NULL) != 0) {
+		abd_t dabd;
+		abd_get_from_buf_struct(&dabd, &this_lb,
+		    sizeof (this_lb));
+		int err = zio_decompress_data(L2BLK_GET_COMPRESS(
+		    (&lbps[0])->lbp_prop), abd, &dabd,
+		    asize, sizeof (this_lb), NULL);
+		abd_free(&dabd);
+		abd_free(abd);
+		if (err != 0) {
			(void) printf("L2ARC block decompression "
			    "failed\n");
-			abd_free(abd);
			goto out;
		}
-		abd_free(abd);
		break;
	}
+	}
	if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
		byteswap_uint64_array(&this_lb, sizeof (this_lb));
@@ -5633,7 +5702,6 @@ static void
zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
    dmu_object_type_t type)
{
-	uint64_t refcnt = 0;
	int i;
	ASSERT(type < ZDB_OT_TOTAL);
@@ -5641,8 +5709,161 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
		return;
/*
* This flag controls if we will issue a claim for the block while
* counting it, to ensure that all blocks are referenced in space maps.
* We don't issue claims if we're not doing leak tracking, because it's
* expensive if the user isn't interested. We also don't claim the
* second or later occurences of cloned or dedup'd blocks, because we
* already claimed them the first time.
*/
boolean_t do_claim = !dump_opt['L'];
	spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
blkptr_t tempbp;
if (BP_GET_DEDUP(bp)) {
/*
* Dedup'd blocks are special. We need to count them, so we can
* later uncount them when reporting leaked space, and we must
* only claim them once.
*
* We use the existing dedup system to track what we've seen.
* The first time we see a block, we do a ddt_lookup() to see
* if it exists in the DDT. If we're doing leak tracking, we
* claim the block at this time.
*
* Each time we see a block, we reduce the refcount in the
* entry by one, and add to the size and count of dedup'd
* blocks to report at the end.
*/
ddt_t *ddt = ddt_select(zcb->zcb_spa, bp);
ddt_enter(ddt);
/*
* Find the block. This will create the entry in memory, but
* we'll know if that happened by its refcount.
*/
ddt_entry_t *dde = ddt_lookup(ddt, bp);
/*
* ddt_lookup() can only return NULL if this block didn't exist
* in the DDT and creating it would take the DDT over its
* quota. Since we got the block from disk, it must exist in
* the DDT, so this can't happen.
*/
VERIFY3P(dde, !=, NULL);
/* Get the phys for this variant */
ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
/*
* This entry may have multiple sets of DVAs. We must claim
* each set the first time we see them in a real block on disk,
* or count them on subsequent occurences. We don't have a
* convenient way to track the first time we see each variant,
* so we repurpose dde_io as a set of "seen" flag bits. We can
* do this safely in zdb because it never writes, so it will
* never have a writing zio for this block in that pointer.
*/
boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v));
if (!seen)
dde->dde_io =
(void *)(((uintptr_t)dde->dde_io) | (1 << v));
/* Consume a reference for this block. */
VERIFY3U(ddt_phys_total_refcnt(ddt, dde->dde_phys), >, 0);
ddt_phys_decref(dde->dde_phys, v);
/*
* If this entry has a single flat phys, it may have been
* extended with additional DVAs at some time in its life.
* This block might be from before it was fully extended, and
* so have fewer DVAs.
*
* If this is the first time we've seen this block, and we
* claimed it as-is, then we would miss the claim on some
* number of DVAs, which would then be seen as leaked.
*
* In all cases, if we've had fewer DVAs, then the asize would
* be too small, and would lead to the pool apparently using
* more space than allocated.
*
* To handle this, we copy the canonical set of DVAs from the
* entry back to the block pointer before we claim it.
*/
if (v == DDT_PHYS_FLAT) {
ASSERT3U(BP_GET_BIRTH(bp), ==,
ddt_phys_birth(dde->dde_phys, v));
tempbp = *bp;
ddt_bp_fill(dde->dde_phys, v, &tempbp,
BP_GET_BIRTH(bp));
bp = &tempbp;
}
if (seen) {
/*
* The second or later time we see this block,
* it's a duplicate and we count it.
*/
zcb->zcb_dedup_asize += BP_GET_ASIZE(bp);
zcb->zcb_dedup_blocks++;
/* Already claimed, don't do it again. */
do_claim = B_FALSE;
}
ddt_exit(ddt);
} else if (zcb->zcb_brt_is_active &&
brt_maybe_exists(zcb->zcb_spa, bp)) {
/*
* Cloned blocks are special. We need to count them, so we can
* later uncount them when reporting leaked space, and we must
* only claim them once.
*
* To do this, we keep our own in-memory BRT. For each block
* we haven't seen before, we look it up in the real BRT and
* if its there, we note it and its refcount then proceed as
* normal. If we see the block again, we count it as a clone
* and then give it no further consideration.
*/
zdb_brt_entry_t zbre_search, *zbre;
avl_index_t where;
zbre_search.zbre_dva = bp->blk_dva[0];
zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
if (zbre == NULL) {
/* Not seen before; track it */
uint64_t refcnt =
brt_entry_get_refcount(zcb->zcb_spa, bp);
if (refcnt > 0) {
zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
UMEM_NOFAIL);
zbre->zbre_dva = bp->blk_dva[0];
zbre->zbre_refcount = refcnt;
avl_insert(&zcb->zcb_brt, zbre, where);
}
} else {
/*
* Second or later occurrence, count it and take a
* refcount.
*/
zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
zcb->zcb_clone_blocks++;
zbre->zbre_refcount--;
if (zbre->zbre_refcount == 0) {
avl_remove(&zcb->zcb_brt, zbre);
umem_free(zbre, sizeof (zdb_brt_entry_t));
}
/* Already claimed, don't do it again. */
do_claim = B_FALSE;
}
}
	for (i = 0; i < 4; i++) {
		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
		int t = (i & 1) ? type : ZDB_OT_TOTAL;
@@ -5745,71 +5966,12 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
	zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
	zcb->zcb_asize_total += BP_GET_ASIZE(bp);
-	if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) {
-		/*
-		 * Cloned blocks are special. We need to count them, so we can
-		 * later uncount them when reporting leaked space, and we must
-		 * only claim them them once.
-		 *
-		 * To do this, we keep our own in-memory BRT. For each block
-		 * we haven't seen before, we look it up in the real BRT and
-		 * if its there, we note it and its refcount then proceed as
-		 * normal. If we see the block again, we count it as a clone
-		 * and then give it no further consideration.
-		 */
-		zdb_brt_entry_t zbre_search, *zbre;
-		avl_index_t where;
-		zbre_search.zbre_dva = bp->blk_dva[0];
-		zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
-		if (zbre != NULL) {
-			zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
-			zcb->zcb_clone_blocks++;
-			zbre->zbre_refcount--;
-			if (zbre->zbre_refcount == 0) {
-				avl_remove(&zcb->zcb_brt, zbre);
-				umem_free(zbre, sizeof (zdb_brt_entry_t));
-			}
-			return;
-		}
-		uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp);
-		if (crefcnt > 0) {
-			zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
-			    UMEM_NOFAIL);
-			zbre->zbre_dva = bp->blk_dva[0];
-			zbre->zbre_refcount = crefcnt;
-			avl_insert(&zcb->zcb_brt, zbre, where);
-		}
-	}
-	if (dump_opt['L'])
+	if (!do_claim)
		return;
-	if (BP_GET_DEDUP(bp)) {
-		ddt_t *ddt;
-		ddt_entry_t *dde;
-		ddt = ddt_select(zcb->zcb_spa, bp);
-		ddt_enter(ddt);
-		dde = ddt_lookup(ddt, bp, B_FALSE);
-		if (dde == NULL) {
-			refcnt = 0;
-		} else {
-			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
-			ddt_phys_decref(ddp);
-			refcnt = ddp->ddp_refcnt;
-			if (ddt_phys_total_refcnt(dde) == 0)
-				ddt_remove(ddt, dde);
-		}
-		ddt_exit(ddt);
-	}
-	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
-	    refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
-	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
+	VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa,
+	    spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL,
+	    ZIO_FLAG_CANFAIL)));
}
static void
@@ -6120,49 +6282,6 @@ zdb_load_obsolete_counts(vdev_t *vd)
	return (counts);
}
static void
zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
ddt_bookmark_t ddb = {0};
ddt_entry_t dde;
int error;
int p;
ASSERT(!dump_opt['L']);
while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
blkptr_t blk;
ddt_phys_t *ddp = dde.dde_phys;
if (ddb.ddb_class == DDT_CLASS_UNIQUE)
return;
ASSERT(ddt_phys_total_refcnt(&dde) > 1);
ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
VERIFY(ddt);
for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0)
continue;
ddt_bp_create(ddb.ddb_checksum,
&dde.dde_key, ddp, &blk);
if (p == DDT_PHYS_DITTO) {
zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
} else {
zcb->zcb_dedup_asize +=
BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
zcb->zcb_dedup_blocks++;
}
}
ddt_enter(ddt);
VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
ddt_exit(ddt);
}
ASSERT(error == ENOENT);
}
typedef struct checkpoint_sm_exclude_entry_arg {
	vdev_t *cseea_vd;
	uint64_t cseea_checkpoint_size;
@@ -6546,10 +6665,6 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
		(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
		    increment_indirect_mapping_cb, zcb, NULL);
	}
-	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-	zdb_ddt_leak_init(spa, zcb);
-	spa_config_exit(spa, SCL_CONFIG, FTAG);
}
static boolean_t
@@ -6814,6 +6929,8 @@ dump_block_stats(spa_t *spa)
	int e, c, err;
	bp_embedded_type_t i;
+	ddt_prefetch_all(spa);
	zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);
	if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
@@ -6938,7 +7055,6 @@ dump_block_stats(spa_t *spa)
	    (u_longlong_t)total_alloc,
	    (dump_opt['L']) ? "unreachable" : "leaked",
	    (longlong_t)(total_alloc - total_found));
-	leaks = B_TRUE;
}
if (tzb->zb_count == 0) {
@@ -7272,29 +7388,27 @@ dump_simulated_ddt(spa_t *spa)
	spa_config_exit(spa, SCL_CONFIG, FTAG);
	while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
-		ddt_stat_t dds;
		uint64_t refcnt = zdde->zdde_ref_blocks;
		ASSERT(refcnt != 0);
-		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
-		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
-		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
-		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
-		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
-		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
-		dds.dds_ref_psize = zdde->zdde_ref_psize;
-		dds.dds_ref_dsize = zdde->zdde_ref_dsize;
-		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
-		    &dds, 0);
+		ddt_stat_t *dds = &ddh_total.ddh_stat[highbit64(refcnt) - 1];
+		dds->dds_blocks += zdde->zdde_ref_blocks / refcnt;
+		dds->dds_lsize += zdde->zdde_ref_lsize / refcnt;
+		dds->dds_psize += zdde->zdde_ref_psize / refcnt;
+		dds->dds_dsize += zdde->zdde_ref_dsize / refcnt;
+		dds->dds_ref_blocks += zdde->zdde_ref_blocks;
+		dds->dds_ref_lsize += zdde->zdde_ref_lsize;
+		dds->dds_ref_psize += zdde->zdde_ref_psize;
+		dds->dds_ref_dsize += zdde->zdde_ref_dsize;
		umem_free(zdde, sizeof (*zdde));
	}
	avl_destroy(&t);
-	ddt_histogram_stat(&dds_total, &ddh_total);
+	ddt_histogram_total(&dds_total, &ddh_total);
	(void) printf("Simulated DDT histogram:\n");
@@ -8022,16 +8136,25 @@ dump_mos_leaks(spa_t *spa)
	mos_leak_vdev(spa->spa_root_vdev);
-	for (uint64_t class = 0; class < DDT_CLASSES; class++) {
-		for (uint64_t type = 0; type < DDT_TYPES; type++) {
-			for (uint64_t cksum = 0;
-			    cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
-				ddt_t *ddt = spa->spa_ddt[cksum];
-				if (!ddt)
-					continue;
+	for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		if (!ddt)
+			continue;
+		/* DDT store objects */
+		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+			for (ddt_class_t class = 0; class < DDT_CLASSES;
+			    class++) {
				mos_obj_refd(ddt->ddt_object[type][class]);
			}
		}
+		/* FDT container */
+		mos_obj_refd(ddt->ddt_dir_object);
+		/* FDT log objects */
+		mos_obj_refd(ddt->ddt_log[0].ddl_object);
+		mos_obj_refd(ddt->ddt_log[1].ddl_object);
	}
	if (spa->spa_brt != NULL) {
@@ -8499,13 +8622,22 @@ try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize,
	memset(lbuf, 0x00, lsize);
	memset(lbuf2, 0xff, lsize);
+	abd_t labd, labd2;
+	abd_get_from_buf_struct(&labd, lbuf, lsize);
+	abd_get_from_buf_struct(&labd2, lbuf2, lsize);
+	boolean_t ret = B_FALSE;
	if (zio_decompress_data(cfunc, pabd,
-	    lbuf, psize, lsize, NULL) == 0 &&
+	    &labd, psize, lsize, NULL) == 0 &&
	    zio_decompress_data(cfunc, pabd,
-	    lbuf2, psize, lsize, NULL) == 0 &&
+	    &labd2, psize, lsize, NULL) == 0 &&
	    memcmp(lbuf, lbuf2, lsize) == 0)
-		return (B_TRUE);
-	return (B_FALSE);
+		ret = B_TRUE;
+	abd_free(&labd2);
+	abd_free(&labd);
+	return (ret);
}
static uint64_t
@@ -9624,6 +9756,9 @@ retry_lookup:
	}
fini:
+	if (spa != NULL)
+		zdb_ddt_cleanup(spa);
	if (os != NULL) {
		close_objset(os, FTAG);
	} else if (spa != NULL) {


@@ -844,7 +844,6 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
	const char *failmode = NULL;
	boolean_t checkremove = B_FALSE;
	uint32_t pri = 0;
-	int32_t flags = 0;
	/*
	 * If this is a checksum or I/O error, then toss it into the
@@ -922,18 +921,28 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
		}
	} else if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
+		uint64_t flags = 0;
+		int32_t flags32 = 0;
		/*
		 * We ignore ereports for checksum errors generated by
		 * scrub/resilver I/O to avoid potentially further
		 * degrading the pool while it's being repaired.
+		 *
+		 * Note that FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS used to
+		 * be int32. To allow newer zed to work on older
+		 * kernels, if we don't find the flags, we look for
+		 * the older ones too.
		 */
		if (((nvlist_lookup_uint32(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) &&
		    (pri == ZIO_PRIORITY_SCRUB ||
		    pri == ZIO_PRIORITY_REBUILD)) ||
-		    ((nvlist_lookup_int32(nvl,
+		    ((nvlist_lookup_uint64(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) &&
-		    (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
+		    (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) ||
+		    ((nvlist_lookup_int32(nvl,
+		    FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags32) == 0) &&
+		    (flags32 & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
			fmd_hdl_debug(hdl, "ignoring '%s' for "
			    "scrub/resilver I/O", class);
			return;


@@ -537,7 +537,7 @@ get_usage(zpool_help_t idx)
		    "\t [-o property=value] <pool> <newpool> "
		    "[<device> ...]\n"));
	case HELP_REGUID:
-		return (gettext("\treguid <pool>\n"));
+		return (gettext("\treguid [-g guid] <pool>\n"));
	case HELP_SYNC:
		return (gettext("\tsync [pool] ...\n"));
	case HELP_VERSION:
@@ -2025,7 +2025,7 @@ zpool_do_create(int argc, char **argv)
			char *end;
			u_longlong_t ver;
-			ver = strtoull(propval, &end, 10);
+			ver = strtoull(propval, &end, 0);
			if (*end == '\0' &&
			    ver < SPA_VERSION_FEATURES) {
				enable_pool_features = B_FALSE;
@@ -8232,19 +8232,32 @@ zpool_do_clear(int argc, char **argv)
}
/*
- * zpool reguid <pool>
+ * zpool reguid [-g <guid>] <pool>
 */
int
zpool_do_reguid(int argc, char **argv)
{
+	uint64_t guid;
+	uint64_t *guidp = NULL;
	int c;
+	char *endptr;
	char *poolname;
	zpool_handle_t *zhp;
	int ret = 0;
	/* check options */
-	while ((c = getopt(argc, argv, "")) != -1) {
+	while ((c = getopt(argc, argv, "g:")) != -1) {
		switch (c) {
+		case 'g':
+			errno = 0;
+			guid = strtoull(optarg, &endptr, 10);
+			if (errno != 0 || *endptr != '\0') {
+				(void) fprintf(stderr,
+				    gettext("invalid GUID: %s\n"), optarg);
+				usage(B_FALSE);
+			}
+			guidp = &guid;
+			break;
		case '?':
			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
			    optopt);
@@ -8270,7 +8283,7 @@ zpool_do_reguid(int argc, char **argv)
	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
		return (1);
-	ret = zpool_reguid(zhp);
+	ret = zpool_set_guid(zhp, guidp);
	zpool_close(zhp);
	return (ret);
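For orientation, a minimal sketch of how a libzfs consumer could exercise the new call. zpool_open(), zpool_close() and zpool_set_guid() are the interfaces used or added above; the pool name, GUID value and error handling are purely illustrative.

#include <libzfs.h>

/*
 * Illustrative only: give pool "tank" an explicit GUID. Passing NULL
 * instead of &guid requests a randomly generated GUID, matching plain
 * "zpool reguid" without -g.
 */
static int
example_set_guid(libzfs_handle_t *g_zfs)
{
	zpool_handle_t *zhp = zpool_open(g_zfs, "tank");
	if (zhp == NULL)
		return (1);

	uint64_t guid = 0x1234567890abcdefULL;	/* hypothetical value */
	int ret = zpool_set_guid(zhp, &guid);

	zpool_close(zhp);
	return (ret);
}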


@@ -1,3 +1,5 @@
+zstream_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
sbin_PROGRAMS += zstream
CPPCHECKTARGETS += zstream


@@ -22,6 +22,8 @@
/*
 * Copyright 2022 Axcient. All rights reserved.
 * Use is subject to license terms.
+ *
+ * Copyright (c) 2024, Klara, Inc.
 */
#include <err.h>
@@ -257,83 +259,73 @@ zstream_do_decompress(int argc, char *argv[])
ENTRY e = {.key = key};
p = hsearch(e, FIND);
-if (p != NULL) {
+if (p == NULL) {
zio_decompress_func_t *xfunc = NULL;
switch ((enum zio_compress)(intptr_t)p->data) {
case ZIO_COMPRESS_OFF:
xfunc = NULL;
break;
case ZIO_COMPRESS_LZJB:
xfunc = lzjb_decompress;
break;
case ZIO_COMPRESS_GZIP_1:
xfunc = gzip_decompress;
break;
case ZIO_COMPRESS_ZLE:
xfunc = zle_decompress;
break;
case ZIO_COMPRESS_LZ4:
xfunc = lz4_decompress_zfs;
break;
case ZIO_COMPRESS_ZSTD:
xfunc = zfs_zstd_decompress;
break;
default:
assert(B_FALSE);
}
/*
* Read and decompress the block
*/
char *lzbuf = safe_calloc(payload_size);
(void) sfread(lzbuf, payload_size, stdin);
if (xfunc == NULL) {
memcpy(buf, lzbuf, payload_size);
drrw->drr_compressiontype =
ZIO_COMPRESS_OFF;
if (verbose)
fprintf(stderr, "Resetting "
"compression type to off "
"for ino %llu offset "
"%llu\n",
(u_longlong_t)
drrw->drr_object,
(u_longlong_t)
drrw->drr_offset);
} else if (0 != xfunc(lzbuf, buf,
payload_size, payload_size, 0)) {
/*
* The block must not be compressed,
* at least not with this compression
* type, possibly because it gets
* written multiple times in this
* stream.
*/
warnx("decompression failed for "
"ino %llu offset %llu",
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
memcpy(buf, lzbuf, payload_size);
} else if (verbose) {
drrw->drr_compressiontype =
ZIO_COMPRESS_OFF;
fprintf(stderr, "successfully "
"decompressed ino %llu "
"offset %llu\n",
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
} else {
drrw->drr_compressiontype =
ZIO_COMPRESS_OFF;
}
free(lzbuf);
-} else {
/*
 * Read the contents of the block unaltered
 */
(void) sfread(buf, payload_size, stdin);
+break;
}
/*
* Read and decompress the block
*/
enum zio_compress c =
(enum zio_compress)(intptr_t)p->data;
if (c == ZIO_COMPRESS_OFF) {
(void) sfread(buf, payload_size, stdin);
drrw->drr_compressiontype = 0;
drrw->drr_compressed_size = 0;
if (verbose)
fprintf(stderr,
"Resetting compression type to "
"off for ino %llu offset %llu\n",
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
break;
}
uint64_t lsize = drrw->drr_logical_size;
ASSERT3U(payload_size, <=, lsize);
char *lzbuf = safe_calloc(payload_size);
(void) sfread(lzbuf, payload_size, stdin);
abd_t sabd, dabd;
abd_get_from_buf_struct(&sabd, lzbuf, payload_size);
abd_get_from_buf_struct(&dabd, buf, lsize);
int err = zio_decompress_data(c, &sabd, &dabd,
payload_size, lsize, NULL);
abd_free(&dabd);
abd_free(&sabd);
if (err == 0) {
drrw->drr_compressiontype = 0;
drrw->drr_compressed_size = 0;
payload_size = lsize;
if (verbose) {
fprintf(stderr,
"successfully decompressed "
"ino %llu offset %llu\n",
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
}
} else {
/*
* The block must not be compressed, at least
* not with this compression type, possibly
* because it gets written multiple times in
* this stream.
*/
warnx("decompression failed for "
"ino %llu offset %llu",
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
memcpy(buf, lzbuf, payload_size);
}
free(lzbuf);
break;
}


@@ -22,10 +22,9 @@
/*
 * Copyright 2022 Axcient. All rights reserved.
 * Use is subject to license terms.
- */
-/*
+ *
 * Copyright (c) 2022 by Delphix. All rights reserved.
+ * Copyright (c) 2024, Klara, Inc.
 */
#include <err.h>
@@ -72,7 +71,7 @@ zstream_do_recompress(int argc, char *argv[])
dmu_replay_record_t *drr = &thedrr;
zio_cksum_t stream_cksum;
int c;
-int level = -1;
+int level = 0;
while ((c = getopt(argc, argv, "l:")) != -1) {
switch (c) {
@@ -97,34 +96,22 @@ zstream_do_recompress(int argc, char *argv[])
if (argc != 1)
	zstream_usage();
int type = 0;
zio_compress_info_t *cinfo = NULL;
if (0 == strcmp(argv[0], "off")) {
type = ZIO_COMPRESS_OFF;
cinfo = &zio_compress_table[type];
} else if (0 == strcmp(argv[0], "inherit") ||
0 == strcmp(argv[0], "empty") ||
0 == strcmp(argv[0], "on")) {
// Fall through to invalid compression type case
} else {
for (int i = 0; i < ZIO_COMPRESS_FUNCTIONS; i++) {
if (0 == strcmp(zio_compress_table[i].ci_name,
argv[0])) {
cinfo = &zio_compress_table[i];
type = i;
break;
}
}
}
if (cinfo == NULL) {
fprintf(stderr, "Invalid compression type %s.\n",
argv[0]);
exit(2);
}
-if (cinfo->ci_compress == NULL) {
-	type = 0;
-	cinfo = &zio_compress_table[0];
+enum zio_compress ctype;
+if (strcmp(argv[0], "off") == 0) {
+	ctype = ZIO_COMPRESS_OFF;
+} else {
+	for (ctype = 0; ctype < ZIO_COMPRESS_FUNCTIONS; ctype++) {
+		if (strcmp(argv[0],
+		    zio_compress_table[ctype].ci_name) == 0)
+			break;
+	}
+	if (ctype == ZIO_COMPRESS_FUNCTIONS ||
+	    zio_compress_table[ctype].ci_compress == NULL) {
+		fprintf(stderr, "Invalid compression type %s.\n",
+		    argv[0]);
+		exit(2);
+	}
}
if (isatty(STDIN_FILENO)) {
@@ -135,6 +122,7 @@ zstream_do_recompress(int argc, char *argv[])
	exit(1);
}
+abd_init();
fletcher_4_init();
zio_init();
zstd_init();
@@ -247,63 +235,78 @@ zstream_do_recompress(int argc, char *argv[])
	(void) sfread(buf, payload_size, stdin);
	break;
}
-if (drrw->drr_compressiontype >=
-    ZIO_COMPRESS_FUNCTIONS) {
+enum zio_compress dtype = drrw->drr_compressiontype;
+if (dtype >= ZIO_COMPRESS_FUNCTIONS) {
	fprintf(stderr, "Invalid compression type in "
-	    "stream: %d\n", drrw->drr_compressiontype);
+	    "stream: %d\n", dtype);
	exit(3);
}
-zio_compress_info_t *dinfo =
-    &zio_compress_table[drrw->drr_compressiontype];
+if (zio_compress_table[dtype].ci_decompress == NULL)
+	dtype = ZIO_COMPRESS_OFF;
/* Set up buffers to minimize memcpys */
char *cbuf, *dbuf;
-if (cinfo->ci_compress == NULL)
+if (ctype == ZIO_COMPRESS_OFF)
	dbuf = buf;
else
	dbuf = safe_calloc(bufsz);
-if (dinfo->ci_decompress == NULL)
+if (dtype == ZIO_COMPRESS_OFF)
	cbuf = dbuf;
else
	cbuf = safe_calloc(payload_size);
/* Read and decompress the payload */
(void) sfread(cbuf, payload_size, stdin);
-if (dinfo->ci_decompress != NULL) {
-	if (0 != dinfo->ci_decompress(cbuf, dbuf,
-	    payload_size, MIN(bufsz,
-	    drrw->drr_logical_size), dinfo->ci_level)) {
+if (dtype != ZIO_COMPRESS_OFF) {
+	abd_t cabd, dabd;
+	abd_get_from_buf_struct(&cabd,
+	    cbuf, payload_size);
+	abd_get_from_buf_struct(&dabd, dbuf,
+	    MIN(bufsz, drrw->drr_logical_size));
+	if (zio_decompress_data(dtype, &cabd, &dabd,
+	    payload_size, abd_get_size(&dabd),
+	    NULL) != 0) {
		warnx("decompression type %d failed "
		    "for ino %llu offset %llu",
-		    type,
+		    dtype,
		    (u_longlong_t)drrw->drr_object,
		    (u_longlong_t)drrw->drr_offset);
		exit(4);
	}
	payload_size = drrw->drr_logical_size;
+	abd_free(&dabd);
+	abd_free(&cabd);
	free(cbuf);
}
/* Recompress the payload */
-if (cinfo->ci_compress != NULL) {
-	payload_size = P2ROUNDUP(cinfo->ci_compress(
-	    dbuf, buf, drrw->drr_logical_size,
-	    MIN(payload_size, bufsz), (level == -1 ?
-	    cinfo->ci_level : level)),
-	    SPA_MINBLOCKSIZE);
-	if (payload_size != drrw->drr_logical_size) {
-		drrw->drr_compressiontype = type;
-		drrw->drr_compressed_size =
-		    payload_size;
-	} else {
+if (ctype != ZIO_COMPRESS_OFF) {
+	abd_t dabd, abd;
+	abd_get_from_buf_struct(&dabd,
+	    dbuf, drrw->drr_logical_size);
+	abd_t *pabd =
+	    abd_get_from_buf_struct(&abd, buf, bufsz);
+	size_t csize = zio_compress_data(ctype, &dabd,
+	    &pabd, drrw->drr_logical_size, level);
+	size_t rounded =
+	    P2ROUNDUP(csize, SPA_MINBLOCKSIZE);
+	if (rounded >= drrw->drr_logical_size) {
		memcpy(buf, dbuf, payload_size);
		drrw->drr_compressiontype = 0;
		drrw->drr_compressed_size = 0;
+	} else {
+		abd_zero_off(pabd, csize,
+		    rounded - csize);
+		drrw->drr_compressiontype = ctype;
+		drrw->drr_compressed_size =
+		    payload_size = rounded;
	}
+	abd_free(&abd);
+	abd_free(&dabd);
	free(dbuf);
} else {
-	drrw->drr_compressiontype = type;
+	drrw->drr_compressiontype = 0;
	drrw->drr_compressed_size = 0;
}
break;
@@ -371,6 +374,7 @@ zstream_do_recompress(int argc, char *argv[])
	fletcher_4_fini();
	zio_fini();
	zstd_fini();
+	abd_fini();
	return (0);
}


@@ -6746,7 +6746,7 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id)
	load = spa_load_guid(spa);
	(void) pthread_rwlock_wrlock(&ztest_name_lock);
-	error = spa_change_guid(spa);
+	error = spa_change_guid(spa, NULL);
	zs->zs_guid = spa_guid(spa);
	(void) pthread_rwlock_unlock(&ztest_name_lock);


@@ -10,7 +10,8 @@ AM_CPPFLAGS = \
	-I$(top_srcdir)/include \
	-I$(top_srcdir)/module/icp/include \
	-I$(top_srcdir)/lib/libspl/include \
-	-I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@
+	-I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@ \
+	-I$(top_srcdir)/lib/libzpool/include
AM_LIBTOOLFLAGS = --silent
@@ -85,4 +86,7 @@ KERNEL_CFLAGS = $(FRAME_LARGER_THAN)
LIBRARY_CFLAGS = -no-suppress
# Forcibly enable asserts/debugging for libzpool &al.
-FORCEDEBUG_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG
+# Since ZFS_DEBUG can change shared data structures, all libzpool users must
+# be compiled with the same flags.
+# See https://github.com/openzfs/zfs/issues/16476
+LIBZPOOL_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG


@@ -25,6 +25,8 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_PLUG], [
dnl #
dnl # 2.6.32 - 4.11: statically allocated bdi in request_queue
dnl # 4.12: dynamically allocated bdi in request_queue
+dnl # 6.11: bdi no longer available through request_queue, so get it from
+dnl # the gendisk attached to the queue
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI], [
	ZFS_LINUX_TEST_SRC([blk_queue_bdi], [
@@ -47,6 +49,30 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_BDI], [
	])
])
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI], [
ZFS_LINUX_TEST_SRC([blk_queue_disk_bdi], [
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
], [
struct request_queue q;
struct gendisk disk;
struct backing_dev_info bdi __attribute__ ((unused));
q.disk = &disk;
q.disk->bdi = &bdi;
])
])
AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI], [
AC_MSG_CHECKING([whether backing_dev_info is available through queue gendisk])
ZFS_LINUX_TEST_RESULT([blk_queue_disk_bdi], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_QUEUE_DISK_BDI, 1,
[backing_dev_info is available through queue gendisk])
],[
AC_MSG_RESULT(no)
])
])
dnl #
dnl # 5.9: added blk_queue_update_readahead(),
dnl # 5.15: renamed to disk_update_readahead()
@@ -407,6 +433,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
	ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG
	ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI
+	ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI
	ZFS_AC_KERNEL_SRC_BLK_QUEUE_UPDATE_READAHEAD
	ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISCARD
	ZFS_AC_KERNEL_SRC_BLK_QUEUE_SECURE_ERASE
@@ -421,6 +448,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
	ZFS_AC_KERNEL_BLK_QUEUE_PLUG
	ZFS_AC_KERNEL_BLK_QUEUE_BDI
+	ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI
	ZFS_AC_KERNEL_BLK_QUEUE_UPDATE_READAHEAD
	ZFS_AC_KERNEL_BLK_QUEUE_DISCARD
	ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE


@@ -58,6 +58,13 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [
		disk = blk_alloc_disk(lim, NUMA_NO_NODE);
	])
ZFS_LINUX_TEST_SRC([blkdev_queue_limits_features], [
#include <linux/blkdev.h>
],[
struct queue_limits *lim = NULL;
lim->features = 0;
])
	ZFS_LINUX_TEST_SRC([blk_cleanup_disk], [
		#include <linux/blkdev.h>
	],[
@@ -114,6 +121,20 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [
		AC_MSG_RESULT(yes)
		AC_DEFINE([HAVE_BLK_ALLOC_DISK_2ARG], 1, [blk_alloc_disk() exists and takes 2 args])
dnl #
dnl # Linux 6.11 API change:
dnl # struct queue_limits gains a 'features' field,
dnl # used to set flushing options
dnl #
AC_MSG_CHECKING([whether struct queue_limits has a features field])
ZFS_LINUX_TEST_RESULT([blkdev_queue_limits_features], [
AC_MSG_RESULT(yes)
AC_DEFINE([HAVE_BLKDEV_QUEUE_LIMITS_FEATURES], 1,
[struct queue_limits has a features field])
], [
AC_MSG_RESULT(no)
])
		dnl #
		dnl # 5.20 API change,
		dnl # Removed blk_cleanup_disk(), put_disk() should be used.


@@ -1,17 +0,0 @@
AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
ZFS_LINUX_TEST_SRC([page_size], [
#include <linux/mm.h>
],[
unsigned long s;
s = page_size(NULL);
])
])
AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
AC_MSG_CHECKING([whether page_size() is available])
ZFS_LINUX_TEST_RESULT([page_size], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
],[
AC_MSG_RESULT(no)
])
])


@@ -0,0 +1,36 @@
AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
ZFS_LINUX_TEST_SRC([page_size], [
#include <linux/mm.h>
],[
unsigned long s;
s = page_size(NULL);
])
])
AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
AC_MSG_CHECKING([whether page_size() is available])
ZFS_LINUX_TEST_RESULT([page_size], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
],[
AC_MSG_RESULT(no)
])
])
AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING], [
ZFS_LINUX_TEST_SRC([page_mapping], [
#include <linux/pagemap.h>
],[
struct page *p = NULL;
struct address_space *m = page_mapping(NULL);
])
])
AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_MAPPING], [
AC_MSG_CHECKING([whether page_mapping() is available])
ZFS_LINUX_TEST_RESULT([page_mapping], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_MM_PAGE_MAPPING, 1, [page_mapping() is available])
],[
AC_MSG_RESULT(no)
])
])


@@ -25,3 +25,62 @@ AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [
		AC_MSG_RESULT([no])
	])
])
dnl #
dnl # Linux 6.11 register_sysctl() enforces that sysctl tables no longer
dnl # supply a sentinel end-of-table element. 6.6 introduces
dnl # register_sysctl_sz() to enable callers to choose, so we use it if
dnl # available for backward compatibility.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ], [
ZFS_LINUX_TEST_SRC([has_register_sysctl_sz], [
#include <linux/sysctl.h>
],[
struct ctl_table test_table[] __attribute__((unused)) = {0};
register_sysctl_sz("", test_table, 0);
])
])
AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ], [
AC_MSG_CHECKING([whether register_sysctl_sz exists])
ZFS_LINUX_TEST_RESULT([has_register_sysctl_sz], [
AC_MSG_RESULT([yes])
AC_DEFINE(HAVE_REGISTER_SYSCTL_SZ, 1,
[register_sysctl_sz exists])
],[
AC_MSG_RESULT([no])
])
])
dnl #
dnl # Linux 6.11 makes const the ctl_table arg of proc_handler
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST], [
ZFS_LINUX_TEST_SRC([has_proc_handler_ctl_table_const], [
#include <linux/sysctl.h>
static int test_handler(
const struct ctl_table *ctl __attribute((unused)),
int write __attribute((unused)),
void *buffer __attribute((unused)),
size_t *lenp __attribute((unused)),
loff_t *ppos __attribute((unused)))
{
return (0);
}
], [
proc_handler *ph __attribute((unused)) =
&test_handler;
])
])
AC_DEFUN([ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST], [
AC_MSG_CHECKING([whether proc_handler ctl_table arg is const])
ZFS_LINUX_TEST_RESULT([has_proc_handler_ctl_table_const], [
AC_MSG_RESULT([yes])
AC_DEFINE(HAVE_PROC_HANDLER_CTL_TABLE_CONST, 1,
[proc_handler ctl_table arg is const])
], [
AC_MSG_RESULT([no])
])
])
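For context, a hedged sketch of the kind of consumer these two checks make possible; the ZFS_CONST_CTL macro and the handler below are invented here for illustration and are not part of this commit.

#include <linux/sysctl.h>

/* Hypothetical compat shim keyed off the new configure results. */
#ifdef HAVE_PROC_HANDLER_CTL_TABLE_CONST
#define	ZFS_CONST_CTL	const
#else
#define	ZFS_CONST_CTL
#endif

static int
example_handler(ZFS_CONST_CTL struct ctl_table *ctl, int write,
    void *buffer, size_t *lenp, loff_t *ppos)
{
	return (0);	/* illustrative no-op handler */
}

Registration would branch the same way: register_sysctl_sz() with an explicit element count where HAVE_REGISTER_SYSCTL_SZ is defined, and a sentinel-terminated table with register_sysctl() otherwise.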


@@ -167,9 +167,12 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
	ZFS_AC_KERNEL_SRC_WRITEPAGE_T
	ZFS_AC_KERNEL_SRC_RECLAIMED
	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
+	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ
+	ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST
	ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
	ZFS_AC_KERNEL_SRC_SYNC_BDEV
	ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
+	ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING
	case "$host_cpu" in
		powerpc*)
			ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@@ -319,9 +322,12 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
	ZFS_AC_KERNEL_WRITEPAGE_T
	ZFS_AC_KERNEL_RECLAIMED
	ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
+	ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ
+	ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST
	ZFS_AC_KERNEL_COPY_SPLICE_READ
	ZFS_AC_KERNEL_SYNC_BDEV
	ZFS_AC_KERNEL_MM_PAGE_SIZE
+	ZFS_AC_KERNEL_MM_PAGE_MAPPING
	case "$host_cpu" in
		powerpc*)
			ZFS_AC_KERNEL_CPU_HAS_FEATURE


@@ -300,6 +300,7 @@ _LIBZFS_H int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *,
_LIBZFS_H int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
_LIBZFS_H int zpool_reguid(zpool_handle_t *);
+_LIBZFS_H int zpool_set_guid(zpool_handle_t *, const uint64_t *);
_LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *);
_LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *);


@@ -77,6 +77,8 @@ noinst_HEADERS = \
	%D%/spl/sys/zmod.h \
	%D%/spl/sys/zone.h \
	\
+	%D%/zfs/sys/abd_os.h \
+	%D%/zfs/sys/abd_impl_os.h \
	%D%/zfs/sys/arc_os.h \
	%D%/zfs/sys/freebsd_crypto.h \
	%D%/zfs/sys/freebsd_event.h \


@@ -0,0 +1,41 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#ifndef _ABD_IMPL_OS_H
#define _ABD_IMPL_OS_H
#ifdef __cplusplus
extern "C" {
#endif
#define abd_enter_critical(flags) critical_enter()
#define abd_exit_critical(flags) critical_exit()
#ifdef __cplusplus
}
#endif
#endif /* _ABD_IMPL_OS_H */


@@ -0,0 +1,46 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
*/
#ifndef _ABD_OS_H
#define _ABD_OS_H
#ifdef __cplusplus
extern "C" {
#endif
struct abd_scatter {
uint_t abd_offset;
void *abd_chunks[1]; /* actually variable-length */
};
struct abd_linear {
void *abd_buf;
};
#ifdef __cplusplus
}
#endif
#endif /* _ABD_H */


@@ -20,6 +20,8 @@ kernel_linux_HEADERS = \
kernel_sysdir = $(kerneldir)/sys
kernel_sys_HEADERS = \
+	%D%/zfs/sys/abd_os.h \
+	%D%/zfs/sys/abd_impl_os.h \
	%D%/zfs/sys/policy.h \
	%D%/zfs/sys/trace_acl.h \
	%D%/zfs/sys/trace_arc.h \


@@ -57,6 +57,11 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
#endif

/*
+ * 6.11 API
+ * Setting the flush flags directly is no longer possible; flush flags are set
+ * on the queue_limits structure and passed to blk_disk_alloc(). In this case
+ * we remove this function entirely.
+ *
 * 4.7 API,
 * The blk_queue_write_cache() interface has replaced blk_queue_flush()
 * interface. However, the new interface is GPL-only thus we implement
@@ -68,31 +73,33 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
 * new one is GPL-only. Thus if the GPL-only version is detected we
 * implement our own trivial helper.
 */
+#if !defined(HAVE_BLK_ALLOC_DISK_2ARG) || \
+	!defined(HAVE_BLKDEV_QUEUE_LIMITS_FEATURES)
static inline void
-blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua)
+blk_queue_set_write_cache(struct request_queue *q, bool on)
{
#if defined(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY)
-	if (wc)
+	if (on) {
		blk_queue_flag_set(QUEUE_FLAG_WC, q);
-	else
-		blk_queue_flag_clear(QUEUE_FLAG_WC, q);
-	if (fua)
		blk_queue_flag_set(QUEUE_FLAG_FUA, q);
-	else
+	} else {
+		blk_queue_flag_clear(QUEUE_FLAG_WC, q);
		blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
+	}
#elif defined(HAVE_BLK_QUEUE_WRITE_CACHE)
-	blk_queue_write_cache(q, wc, fua);
+	blk_queue_write_cache(q, on, on);
#elif defined(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY)
-	if (wc)
-		q->flush_flags |= REQ_FLUSH;
-	if (fua)
-		q->flush_flags |= REQ_FUA;
+	if (on)
+		q->flush_flags |= REQ_FLUSH | REQ_FUA;
+	else
+		q->flush_flags &= ~(REQ_FLUSH | REQ_FUA);
#elif defined(HAVE_BLK_QUEUE_FLUSH)
-	blk_queue_flush(q, (wc ? REQ_FLUSH : 0) | (fua ? REQ_FUA : 0));
+	blk_queue_flush(q, on ? (REQ_FLUSH | REQ_FUA) : 0);
#else
#error "Unsupported kernel"
#endif
}
+#endif /* !HAVE_BLK_ALLOC_DISK_2ARG || !HAVE_BLKDEV_QUEUE_LIMITS_FEATURES */

/*
 * Detect if a device has a write cache. Used to set the intial value for the
@@ -126,8 +133,10 @@ blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages)
{
#if !defined(HAVE_BLK_QUEUE_UPDATE_READAHEAD) && \
	!defined(HAVE_DISK_UPDATE_READAHEAD)
-#ifdef HAVE_BLK_QUEUE_BDI_DYNAMIC
+#if defined(HAVE_BLK_QUEUE_BDI_DYNAMIC)
	q->backing_dev_info->ra_pages = ra_pages;
+#elif defined(HAVE_BLK_QUEUE_DISK_BDI)
+	q->disk->bdi->ra_pages = ra_pages;
#else
	q->backing_dev_info.ra_pages = ra_pages;
#endif


@@ -21,16 +21,23 @@
/*
 * Copyright (c) 2023, 2024, Klara Inc.
+ * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
 */

#ifndef _ZFS_MM_COMPAT_H
#define _ZFS_MM_COMPAT_H

#include <linux/mm.h>
+#include <linux/pagemap.h>

/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
#ifndef HAVE_MM_PAGE_SIZE
#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
#endif

+/* 6.11 removed page_mapping(). A simple wrapper around folio_mapping() works */
+#ifndef HAVE_MM_PAGE_MAPPING
+#define page_mapping(p) folio_mapping(page_folio(p))
+#endif
+
#endif /* _ZFS_MM_COMPAT_H */


@@ -20,6 +20,10 @@
 * You should have received a copy of the GNU General Public License along
 * with the SPL. If not, see <http://www.gnu.org/licenses/>.
 */
+/*
+ * Copyright (c) 2024, Klara Inc.
+ * Copyright (c) 2024, Syneto
+ */

#ifndef _SPL_TASKQ_H
#define _SPL_TASKQ_H
@@ -33,6 +37,9 @@
#include <sys/thread.h>
#include <sys/rwlock.h>
#include <sys/wait.h>
+#include <sys/wmsum.h>
+
+typedef struct kstat_s kstat_t;

#define TASKQ_NAMELEN 31
@@ -74,6 +81,32 @@ typedef enum tq_lock_role {
typedef unsigned long taskqid_t;
typedef void (task_func_t)(void *);
typedef struct taskq_sums {
/* gauges (inc/dec counters, current value) */
wmsum_t tqs_threads_active; /* threads running a task */
wmsum_t tqs_threads_idle; /* threads waiting for work */
wmsum_t tqs_threads_total; /* total threads */
wmsum_t tqs_tasks_pending; /* tasks waiting to execute */
wmsum_t tqs_tasks_priority; /* hi-pri tasks waiting */
wmsum_t tqs_tasks_total; /* total waiting tasks */
wmsum_t tqs_tasks_delayed; /* tasks deferred to future */
wmsum_t tqs_entries_free; /* task entries on free list */
/* counters (inc only, since taskq creation) */
wmsum_t tqs_threads_created; /* threads created */
wmsum_t tqs_threads_destroyed; /* threads destroyed */
wmsum_t tqs_tasks_dispatched; /* tasks dispatched */
wmsum_t tqs_tasks_dispatched_delayed; /* tasks delayed to future */
wmsum_t tqs_tasks_executed_normal; /* normal pri tasks executed */
wmsum_t tqs_tasks_executed_priority; /* high pri tasks executed */
wmsum_t tqs_tasks_executed; /* total tasks executed */
wmsum_t tqs_tasks_delayed_requeued; /* delayed tasks requeued */
wmsum_t tqs_tasks_cancelled; /* tasks cancelled before run */
wmsum_t tqs_thread_wakeups; /* total thread wakeups */
wmsum_t tqs_thread_wakeups_nowork; /* thread woken but no tasks */
wmsum_t tqs_thread_sleeps; /* total thread sleeps */
} taskq_sums_t;
typedef struct taskq {
	spinlock_t tq_lock; /* protects taskq_t */
	char *tq_name; /* taskq name */
@@ -105,6 +138,8 @@ typedef struct taskq {
	struct hlist_node tq_hp_cb_node;
	boolean_t tq_hp_support;
	unsigned long lastspawnstop; /* when to purge dynamic */
+	taskq_sums_t tq_sums;
+	kstat_t *tq_ksp;
} taskq_t;

typedef struct taskq_ent {
@@ -123,6 +158,13 @@ typedef struct taskq_ent {
#define TQENT_FLAG_PREALLOC 0x1
#define TQENT_FLAG_CANCEL 0x2
/* bits 2-3 are which list tqent is on */
#define TQENT_LIST_NONE 0x0
#define TQENT_LIST_PENDING 0x4
#define TQENT_LIST_PRIORITY 0x8
#define TQENT_LIST_DELAY 0xc
#define TQENT_LIST_MASK 0xc
typedef struct taskq_thread {
	struct list_head tqt_thread_list;
	struct list_head tqt_active_list;
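To make the new flag layout concrete, a small illustration (assumed, not from this commit) of how the TQENT_LIST_* bits could be read and updated on a task entry's flag word:

/* Illustrative helpers only; the flag word itself lives in taskq_ent_t. */
static inline unsigned long
tqent_list_get(unsigned long flags)
{
	/* one of TQENT_LIST_NONE, _PENDING, _PRIORITY or _DELAY */
	return (flags & TQENT_LIST_MASK);
}

static inline unsigned long
tqent_list_set(unsigned long flags, unsigned long list)
{
	return ((flags & ~TQENT_LIST_MASK) | list);
}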


@@ -0,0 +1,41 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#ifndef _ABD_IMPL_OS_H
#define _ABD_IMPL_OS_H
#ifdef __cplusplus
extern "C" {
#endif
#define abd_enter_critical(flags) local_irq_save(flags)
#define abd_exit_critical(flags) local_irq_restore(flags)
#ifdef __cplusplus
}
#endif
#endif /* _ABD_IMPL_OS_H */
View File
@ -0,0 +1,62 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
*/
#ifndef _ABD_OS_H
#define _ABD_OS_H
#ifdef __cplusplus
extern "C" {
#endif
struct abd_scatter {
uint_t abd_offset;
uint_t abd_nents;
struct scatterlist *abd_sgl;
};
struct abd_linear {
void *abd_buf;
struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
};
typedef struct abd abd_t;
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
void *);
/*
* Linux ABD bio functions
* Note: these are only needed to support vdev_classic. See comment in
* vdev_disk.c.
*/
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
#ifdef __cplusplus
}
#endif
#endif /* _ABD_H */
View File
@ -30,6 +30,7 @@
#include <sys/debug.h> #include <sys/debug.h>
#include <sys/zfs_refcount.h> #include <sys/zfs_refcount.h>
#include <sys/uio.h> #include <sys/uio.h>
#include <sys/abd_os.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -44,8 +45,7 @@ typedef enum abd_flags {
ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */ ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */
ABD_FLAG_GANG = 1 << 6, /* mult ABDs chained together */ ABD_FLAG_GANG = 1 << 6, /* mult ABDs chained together */
ABD_FLAG_GANG_FREE = 1 << 7, /* gang ABD is responsible for mem */ ABD_FLAG_GANG_FREE = 1 << 7, /* gang ABD is responsible for mem */
ABD_FLAG_ZEROS = 1 << 8, /* ABD for zero-filled buffer */ ABD_FLAG_ALLOCD = 1 << 8, /* we allocated the abd_t */
ABD_FLAG_ALLOCD = 1 << 9, /* we allocated the abd_t */
} abd_flags_t; } abd_flags_t;
typedef struct abd { typedef struct abd {
@ -58,19 +58,8 @@ typedef struct abd {
#endif #endif
kmutex_t abd_mtx; kmutex_t abd_mtx;
union { union {
struct abd_scatter { struct abd_scatter abd_scatter;
uint_t abd_offset; struct abd_linear abd_linear;
#if defined(__FreeBSD__) && defined(_KERNEL)
void *abd_chunks[1]; /* actually variable-length */
#else
uint_t abd_nents;
struct scatterlist *abd_sgl;
#endif
} abd_scatter;
struct abd_linear {
void *abd_buf;
struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
} abd_linear;
struct abd_gang { struct abd_gang {
list_t abd_gang_chain; list_t abd_gang_chain;
} abd_gang; } abd_gang;
@ -79,9 +68,6 @@ typedef struct abd {
typedef int abd_iter_func_t(void *buf, size_t len, void *priv); typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
#if defined(__linux__) && defined(_KERNEL)
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
#endif
extern int zfs_abd_scatter_enabled; extern int zfs_abd_scatter_enabled;
@ -107,6 +93,7 @@ abd_t *abd_get_offset_size(abd_t *, size_t, size_t);
abd_t *abd_get_offset_struct(abd_t *, abd_t *, size_t, size_t); abd_t *abd_get_offset_struct(abd_t *, abd_t *, size_t, size_t);
abd_t *abd_get_zeros(size_t); abd_t *abd_get_zeros(size_t);
abd_t *abd_get_from_buf(void *, size_t); abd_t *abd_get_from_buf(void *, size_t);
abd_t *abd_get_from_buf_struct(abd_t *, void *, size_t);
void abd_cache_reap_now(void); void abd_cache_reap_now(void);
/* /*
@ -128,10 +115,6 @@ void abd_release_ownership_of_buf(abd_t *);
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
abd_iter_func2_t *, void *); abd_iter_func2_t *, void *);
#if defined(__linux__) && defined(_KERNEL)
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
void *);
#endif
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
@ -225,16 +208,6 @@ abd_get_size(abd_t *abd)
void abd_init(void); void abd_init(void);
void abd_fini(void); void abd_fini(void);
/*
* Linux ABD bio functions
* Note: these are only needed to support vdev_classic. See comment in
* vdev_disk.c.
*/
#if defined(__linux__) && defined(_KERNEL)
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
#endif
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
View File
@ -28,6 +28,7 @@
#define _ABD_IMPL_H #define _ABD_IMPL_H
#include <sys/abd.h> #include <sys/abd.h>
#include <sys/abd_impl_os.h>
#include <sys/wmsum.h> #include <sys/wmsum.h>
#ifdef __cplusplus #ifdef __cplusplus
@ -111,19 +112,6 @@ void abd_iter_page(struct abd_iter *);
#define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf) #define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf)
#define ABD_GANG(abd) (abd->abd_u.abd_gang) #define ABD_GANG(abd) (abd->abd_u.abd_gang)
#if defined(_KERNEL)
#if defined(__FreeBSD__)
#define abd_enter_critical(flags) critical_enter()
#define abd_exit_critical(flags) critical_exit()
#else
#define abd_enter_critical(flags) local_irq_save(flags)
#define abd_exit_critical(flags) local_irq_restore(flags)
#endif
#else /* !_KERNEL */
#define abd_enter_critical(flags) ((void)0)
#define abd_exit_critical(flags) ((void)0)
#endif
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
View File
@ -39,6 +39,13 @@ extern "C" {
struct abd; struct abd;
/*
* DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
*/
#define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */
#define DDT_FLAG_LOG (1 << 1) /* dedup log (journal) */
#define DDT_FLAG_MASK (DDT_FLAG_FLAT|DDT_FLAG_LOG)
/* /*
* DDT on-disk storage object types. Each one corresponds to specific * DDT on-disk storage object types. Each one corresponds to specific
* implementation, see ddt_ops_t. The value itself is not stored on disk. * implementation, see ddt_ops_t. The value itself is not stored on disk.
@ -120,30 +127,80 @@ typedef struct {
* characteristics of the stored block, such as its location on disk (DVAs), * characteristics of the stored block, such as its location on disk (DVAs),
* birth txg and ref count. * birth txg and ref count.
* *
* Note that an entry has an array of four ddt_phys_t, one for each number of * The "traditional" entry has an array of four, one for each number of DVAs
* DVAs (copies= property) and another for additional "ditto" copies. Most * (copies= property) and another for additional "ditto" copies. Users of the
* users of ddt_phys_t will handle indexing into or counting the phys they * traditional struct will specify the variant (index) of the one they want.
* want. *
* The newer "flat" entry has only a single form that is specified using the
* DDT_PHYS_FLAT variant.
*
* Since the value size varies, use one of the size macros when interfacing
* with the ddt zap.
*/ */
typedef struct {
dva_t ddp_dva[SPA_DVAS_PER_BP]; #define DDT_PHYS_MAX (4)
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth;
} ddt_phys_t;
/* /*
* Named indexes into the ddt_phys_t array in each entry. * Note - this can be used in a flexible array and allocated for
* a specific size (ddp_trad or ddp_flat). So be careful not to
* copy using "=" assignment but instead use ddt_phys_copy().
*/
typedef union {
/*
* Traditional physical payload value for DDT zap (256 bytes)
*/
struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth;
} ddp_trad[DDT_PHYS_MAX];
/*
* Flat physical payload value for DDT zap (72 bytes)
*/
struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth; /* txg based from BP */
uint64_t ddp_class_start; /* in realtime seconds */
} ddp_flat;
} ddt_univ_phys_t;
/*
* This enum denotes which variant of a ddt_univ_phys_t to target. For
* a traditional DDT entry, it represents the indexes into the ddp_trad
* array. Any consumer of a ddt_univ_phys_t needs to know which variant
* is being targeted.
* *
* Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However, * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However,
* we maintain the ability to free existing dedup-ditto blocks. * we maintain the ability to free existing dedup-ditto blocks.
*/ */
enum ddt_phys_type {
typedef enum {
DDT_PHYS_DITTO = 0, DDT_PHYS_DITTO = 0,
DDT_PHYS_SINGLE = 1, DDT_PHYS_SINGLE = 1,
DDT_PHYS_DOUBLE = 2, DDT_PHYS_DOUBLE = 2,
DDT_PHYS_TRIPLE = 3, DDT_PHYS_TRIPLE = 3,
DDT_PHYS_TYPES DDT_PHYS_FLAT = 4,
}; DDT_PHYS_NONE = 5
} ddt_phys_variant_t;
#define DDT_PHYS_VARIANT(ddt, p) \
(ASSERT((p) < DDT_PHYS_NONE), \
((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p)))
#define DDT_TRAD_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_trad)
#define DDT_FLAT_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_flat)
#define _DDT_PHYS_SWITCH(ddt, flat, trad) \
(((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad))
#define DDT_PHYS_SIZE(ddt) _DDT_PHYS_SWITCH(ddt, \
DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE)
#define DDT_NPHYS(ddt) _DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX)
#define DDT_PHYS_FOR_COPIES(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, p)
#define DDT_PHYS_IS_DITTO(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, (p == 0))
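An illustrative sketch (not part of this change) of how the macros above fit together: iterate every phys slot with DDT_NPHYS(), map the slot index to a variant with DDT_PHYS_VARIANT(), and pass that variant to the accessors. It assumes dde_phys and ddt_phys_refcnt(), which are declared later in this header.

/* Sum the reference counts across all phys slots of an entry (sketch). */
static inline uint64_t
example_entry_refcnt(const ddt_t *ddt, const ddt_entry_t *dde)
{
	uint64_t refcnt = 0;

	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
		refcnt += ddt_phys_refcnt(dde->dde_phys, v);
	}
	return (refcnt);
}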
/* /*
* A "live" entry, holding changes to an entry made this txg, and other data to * A "live" entry, holding changes to an entry made this txg, and other data to
@ -153,17 +210,27 @@ enum ddt_phys_type {
/* State flags for dde_flags */ /* State flags for dde_flags */
#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ #define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */
#define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */ #define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */
#define DDE_FLAG_LOGGED (1 << 2) /* loaded from log */
/*
* Additional data to support entry update or repair. This is fixed size
* because it's relatively rarely used.
*/
typedef struct {
/* copy of data after a repair read, to be rewritten */
abd_t *dde_repair_abd;
/* original phys contents before update, for error handling */
ddt_univ_phys_t dde_orig_phys;
/* in-flight update IOs */
zio_t *dde_lead_zio[DDT_PHYS_MAX];
} ddt_entry_io_t;
typedef struct { typedef struct {
/* key must be first for ddt_key_compare */ /* key must be first for ddt_key_compare */
ddt_key_t dde_key; /* ddt_tree key */ ddt_key_t dde_key; /* ddt_tree key */
ddt_phys_t dde_phys[DDT_PHYS_TYPES]; /* on-disk data */ avl_node_t dde_node; /* ddt_tree_node */
/* in-flight update IOs */
zio_t *dde_lead_zio[DDT_PHYS_TYPES];
/* copy of data after a repair read, to be rewritten */
struct abd *dde_repair_abd;
/* storage type and class the entry was loaded from */ /* storage type and class the entry was loaded from */
ddt_type_t dde_type; ddt_type_t dde_type;
@ -173,9 +240,35 @@ typedef struct {
kcondvar_t dde_cv; /* signaled when load completes */ kcondvar_t dde_cv; /* signaled when load completes */
uint64_t dde_waiters; /* count of waiters on dde_cv */ uint64_t dde_waiters; /* count of waiters on dde_cv */
avl_node_t dde_node; /* ddt_tree node */ ddt_entry_io_t *dde_io; /* IO support, when required */
ddt_univ_phys_t dde_phys[]; /* flexible -- allocated size varies */
} ddt_entry_t; } ddt_entry_t;
/*
* A lightweight entry is for short-lived or transient uses, like iterating or
* inspecting, when you don't care where it came from.
*/
typedef struct {
ddt_key_t ddlwe_key;
ddt_type_t ddlwe_type;
ddt_class_t ddlwe_class;
ddt_univ_phys_t ddlwe_phys;
} ddt_lightweight_entry_t;
/*
* In-core DDT log. A separate struct to make it easier to switch between the
* appending and flushing logs.
*/
typedef struct {
avl_tree_t ddl_tree; /* logged entries */
uint32_t ddl_flags; /* flags for this log */
uint64_t ddl_object; /* log object id */
uint64_t ddl_length; /* on-disk log size */
uint64_t ddl_first_txg; /* txg log became active */
ddt_key_t ddl_checkpoint; /* last checkpoint */
} ddt_log_t;
/* /*
* In-core DDT object. This covers all entries and stats for a the whole pool * In-core DDT object. This covers all entries and stats for a the whole pool
* for a given checksum type. * for a given checksum type.
@ -184,23 +277,49 @@ typedef struct {
kmutex_t ddt_lock; /* protects changes to all fields */ kmutex_t ddt_lock; /* protects changes to all fields */
avl_tree_t ddt_tree; /* "live" (changed) entries this txg */ avl_tree_t ddt_tree; /* "live" (changed) entries this txg */
avl_tree_t ddt_log_tree; /* logged entries */
avl_tree_t ddt_repair_tree; /* entries being repaired */ avl_tree_t ddt_repair_tree; /* entries being repaired */
enum zio_checksum ddt_checksum; /* checksum algorithm in use */ ddt_log_t ddt_log[2]; /* active/flushing logs */
spa_t *ddt_spa; /* pool this ddt is on */ ddt_log_t *ddt_log_active; /* pointers into ddt_log */
objset_t *ddt_os; /* ddt objset (always MOS) */ ddt_log_t *ddt_log_flushing; /* swapped when flush starts */
hrtime_t ddt_flush_start; /* log flush start this txg */
uint32_t ddt_flush_pass; /* log flush pass this txg */
int32_t ddt_flush_count; /* entries flushed this txg */
int32_t ddt_flush_min; /* min rem entries to flush */
int32_t ddt_log_ingest_rate; /* rolling log ingest rate */
int32_t ddt_log_flush_rate; /* rolling log flush rate */
int32_t ddt_log_flush_time_rate; /* avg time spent flushing */
uint64_t ddt_flush_force_txg; /* flush hard before this txg */
kstat_t *ddt_ksp; /* kstats context */
enum zio_checksum ddt_checksum; /* checksum algorithm in use */
spa_t *ddt_spa; /* pool this ddt is on */
objset_t *ddt_os; /* ddt objset (always MOS) */
uint64_t ddt_dir_object; /* MOS dir holding ddt objects */
uint64_t ddt_version; /* DDT version */
uint64_t ddt_flags; /* FDT option flags */
/* per-type/per-class entry store objects */ /* per-type/per-class entry store objects */
uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
/* object ids for whole-ddt and per-type/per-class stats */ /* object ids for stored, logged and per-type/per-class stats */
uint64_t ddt_stat_object; uint64_t ddt_stat_object;
ddt_object_t ddt_log_stats;
ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];
/* type/class stats by power-2-sized referenced blocks */ /* type/class stats by power-2-sized referenced blocks */
ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
/* log stats power-2-sized referenced blocks */
ddt_histogram_t ddt_log_histogram;
} ddt_t; } ddt_t;
/* /*
@ -215,20 +334,36 @@ typedef struct {
uint64_t ddb_cursor; uint64_t ddb_cursor;
} ddt_bookmark_t; } ddt_bookmark_t;
extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
uint64_t txg); blkptr_t *bp, uint64_t txg);
extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
const ddt_phys_t *ddp, blkptr_t *bp); const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp);
extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
extern void ddt_phys_clear(ddt_phys_t *ddp); const blkptr_t *bp);
extern void ddt_phys_addref(ddt_phys_t *ddp); extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
extern void ddt_phys_decref(ddt_phys_t *ddp); ddt_phys_variant_t v);
extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
const ddt_entry_t *dde, const blkptr_t *bp);
extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
boolean_t encrypted);
extern void ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh,
const ddt_lightweight_entry_t *ddlwe);
extern void ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh,
const ddt_lightweight_entry_t *ddlwe);
extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); extern void ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh);
extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
extern uint64_t ddt_get_ddt_dsize(spa_t *spa); extern uint64_t ddt_get_ddt_dsize(spa_t *spa);
extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
@ -243,7 +378,7 @@ extern void ddt_enter(ddt_t *ddt);
extern void ddt_exit(ddt_t *ddt); extern void ddt_exit(ddt_t *ddt);
extern void ddt_init(void); extern void ddt_init(void);
extern void ddt_fini(void); extern void ddt_fini(void);
extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp);
extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
extern void ddt_prefetch_all(spa_t *spa); extern void ddt_prefetch_all(spa_t *spa);
@ -251,6 +386,8 @@ extern void ddt_prefetch_all(spa_t *spa);
extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class, extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class,
const blkptr_t *bp); const blkptr_t *bp);
extern void ddt_alloc_entry_io(ddt_entry_t *dde);
extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
@ -260,7 +397,11 @@ extern void ddt_create(spa_t *spa);
extern int ddt_load(spa_t *spa); extern int ddt_load(spa_t *spa);
extern void ddt_unload(spa_t *spa); extern void ddt_unload(spa_t *spa);
extern void ddt_sync(spa_t *spa, uint64_t txg); extern void ddt_sync(spa_t *spa, uint64_t txg);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
extern void ddt_walk_init(spa_t *spa, uint64_t txg);
extern boolean_t ddt_walk_ready(spa_t *spa);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp); extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);
View File
@ -28,11 +28,129 @@
#define _SYS_DDT_IMPL_H #define _SYS_DDT_IMPL_H
#include <sys/ddt.h> #include <sys/ddt.h>
#include <sys/bitops.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
/* DDT version numbers */
#define DDT_VERSION_LEGACY (0)
#define DDT_VERSION_FDT (1)
/* Names of interesting objects in the DDT root dir */
#define DDT_DIR_VERSION "version"
#define DDT_DIR_FLAGS "flags"
/* Fill a lightweight entry from a live entry. */
#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \
memset((ddlwe), 0, sizeof (*ddlwe)); \
(ddlwe)->ddlwe_key = (dde)->dde_key; \
(ddlwe)->ddlwe_type = (dde)->dde_type; \
(ddlwe)->ddlwe_class = (dde)->dde_class; \
memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \
} while (0)
#define DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do { \
memset((ddlwe), 0, sizeof (*ddlwe)); \
(ddlwe)->ddlwe_key = (ddle)->ddle_key; \
(ddlwe)->ddlwe_type = (ddle)->ddle_type; \
(ddlwe)->ddlwe_class = (ddle)->ddle_class; \
memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \
} while (0)
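Illustrative use (an assumption, not shown in this change): a caller snapshots a live entry into a stack-allocated lightweight copy before working on it outside the entry's lifetime.

static inline void
example_snapshot_entry(const ddt_t *ddt, const ddt_entry_t *dde,
    ddt_lightweight_entry_t *ddlwe)
{
	/* Copies key, type, class and phys; ddlwe is independent of dde. */
	DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe);
}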
/*
* An entry on the log tree. These are "frozen", and a record of what's in
* the on-disk log. They can't be used in place, but can be "loaded" back into
* the live tree.
*/
typedef struct {
ddt_key_t ddle_key; /* ddt_log_tree key */
avl_node_t ddle_node; /* ddt_log_tree node */
ddt_type_t ddle_type; /* storage type */
ddt_class_t ddle_class; /* storage class */
/* extra allocation for flat/trad phys */
ddt_univ_phys_t ddle_phys[];
} ddt_log_entry_t;
/* On-disk log record types. */
typedef enum {
DLR_INVALID = 0, /* end of block marker */
DLR_ENTRY = 1, /* an entry to add or replace in the log tree */
} ddt_log_record_type_t;
/* On-disk log record header. */
typedef struct {
/*
* dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to
* access it.
*
* bits 0-7: record type (ddt_log_record_type_t)
* bits 8-15: length of record header+payload
* bits 16-47: reserved, all zero
* bits 48-55: if type==DLR_ENTRY, storage type (ddt_type)
* otherwise all zero
* bits 56-63: if type==DLR_ENTRY, storage class (ddt_class)
* otherwise all zero
*/
uint64_t dlr_info;
uint8_t dlr_payload[];
} ddt_log_record_t;
#define DLR_GET_TYPE(dlr) BF64_GET((dlr)->dlr_info, 0, 8)
#define DLR_SET_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 0, 8, v)
#define DLR_GET_RECLEN(dlr) BF64_GET((dlr)->dlr_info, 8, 16)
#define DLR_SET_RECLEN(dlr, v) BF64_SET((dlr)->dlr_info, 8, 16, v)
#define DLR_GET_ENTRY_TYPE(dlr) BF64_GET((dlr)->dlr_info, 48, 8)
#define DLR_SET_ENTRY_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 48, 8, v)
#define DLR_GET_ENTRY_CLASS(dlr) BF64_GET((dlr)->dlr_info, 56, 8)
#define DLR_SET_ENTRY_CLASS(dlr, v) BF64_SET((dlr)->dlr_info, 56, 8, v)
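A hedged sketch (the helper below is hypothetical) of how a DLR_ENTRY record header would be packed with the accessors above, matching the bit layout documented in the dlr_info comment:

static inline void
example_fill_dlr_entry(ddt_log_record_t *dlr, uint16_t reclen,
    ddt_type_t type, ddt_class_t clazz)
{
	dlr->dlr_info = 0;
	DLR_SET_TYPE(dlr, DLR_ENTRY);
	DLR_SET_RECLEN(dlr, reclen);		/* header + payload bytes */
	DLR_SET_ENTRY_TYPE(dlr, type);		/* storage type (ddt_type_t) */
	DLR_SET_ENTRY_CLASS(dlr, clazz);	/* storage class (ddt_class_t) */
}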
/* Payload for DLR_ENTRY. */
typedef struct {
ddt_key_t dlre_key;
ddt_univ_phys_t dlre_phys[];
} ddt_log_record_entry_t;
/* Log flags (ddl_flags, dlh_flags) */
#define DDL_FLAG_FLUSHING (1 << 0) /* this log is being flushed */
#define DDL_FLAG_CHECKPOINT (1 << 1) /* header has a checkpoint */
/* On-disk log header, stored in the bonus buffer. */
typedef struct {
/*
* dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to
* access it.
*
* bits 0-7: log version
* bits 8-15: log flags
* bits 16-63: reserved, all zero
*/
uint64_t dlh_info;
uint64_t dlh_length; /* log size in bytes */
uint64_t dlh_first_txg; /* txg this log went active */
ddt_key_t dlh_checkpoint; /* last checkpoint */
} ddt_log_header_t;
#define DLH_GET_VERSION(dlh) BF64_GET((dlh)->dlh_info, 0, 8)
#define DLH_SET_VERSION(dlh, v) BF64_SET((dlh)->dlh_info, 0, 8, v)
#define DLH_GET_FLAGS(dlh) BF64_GET((dlh)->dlh_info, 8, 8)
#define DLH_SET_FLAGS(dlh, v) BF64_SET((dlh)->dlh_info, 8, 8, v)
/* DDT log update state */
typedef struct {
dmu_tx_t *dlu_tx; /* tx the update is being applied to */
dnode_t *dlu_dn; /* log object dnode */
dmu_buf_t **dlu_dbp; /* array of block buffer pointers */
int dlu_ndbp; /* number of block buffer pointers */
uint16_t dlu_reclen; /* cached length of record */
uint64_t dlu_block; /* block for next entry */
uint64_t dlu_offset; /* offset for next entry */
} ddt_log_update_t;
/* /*
* Ops vector to access a specific DDT object type. * Ops vector to access a specific DDT object type.
*/ */
@ -42,25 +160,50 @@ typedef struct {
boolean_t prehash); boolean_t prehash);
int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
int (*ddt_op_lookup)(objset_t *os, uint64_t object, int (*ddt_op_lookup)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); const ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_contains)(objset_t *os, uint64_t object, int (*ddt_op_contains)(objset_t *os, uint64_t object,
const ddt_key_t *ddk); const ddt_key_t *ddk);
void (*ddt_op_prefetch)(objset_t *os, uint64_t object, void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
const ddt_key_t *ddk); const ddt_key_t *ddk);
void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object); void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object);
int (*ddt_op_update)(objset_t *os, uint64_t object, int (*ddt_op_update)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize, const ddt_key_t *ddk, const void *phys, size_t psize,
dmu_tx_t *tx); dmu_tx_t *tx);
int (*ddt_op_remove)(objset_t *os, uint64_t object, int (*ddt_op_remove)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, dmu_tx_t *tx); const ddt_key_t *ddk, dmu_tx_t *tx);
int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk, int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk,
ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
} ddt_ops_t; } ddt_ops_t;
extern const ddt_ops_t ddt_zap_ops; extern const ddt_ops_t ddt_zap_ops;
extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg); /* Dedup log API */
extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx,
ddt_log_update_t *dlu);
extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde,
ddt_log_update_t *dlu);
extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu);
extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl,
ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl,
const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe);
extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
dmu_tx_t *tx);
extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx);
extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx);
extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx);
extern int ddt_log_load(ddt_t *ddt);
extern void ddt_log_alloc(ddt_t *ddt);
extern void ddt_log_free(ddt_t *ddt);
extern void ddt_log_init(void);
extern void ddt_log_fini(void);
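A hedged sketch of the intended append sequence (presumably driven from the DDT sync path; the array argument here is only illustrative): open an update with ddt_log_begin(), append each entry, then commit.

static void
example_log_append(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, size_t nentries,
    dmu_tx_t *tx)
{
	ddt_log_update_t dlu;

	ddt_log_begin(ddt, nentries, tx, &dlu);
	for (size_t i = 0; i < nentries; i++)
		ddt_log_entry(ddt, &ddlwe[i], &dlu);
	ddt_log_commit(ddt, &dlu);
}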
/* /*
* These are only exposed so that zdb can access them. Try not to use them * These are only exposed so that zdb can access them. Try not to use them
@ -74,16 +217,15 @@ extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg);
*/ */
#define DDT_NAMELEN 32 #define DDT_NAMELEN 32
extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt,
const ddt_univ_phys_t *ddp);
extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);
extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
char *name); char *name);
extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
uint64_t *walk, ddt_entry_t *dde); uint64_t *walk, ddt_lightweight_entry_t *ddlwe);
extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
uint64_t *count); uint64_t *count);
extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
View File
@ -375,7 +375,9 @@ typedef struct dmu_buf {
#define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_L2CACHE "l2cache"
#define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_TMP_USERREFS "tmp_userrefs"
#define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT "DDT-%s-%s-%s"
#define DMU_POOL_DDT_LOG "DDT-log-%s-%u"
#define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_DDT_STATS "DDT-statistics"
#define DMU_POOL_DDT_DIR "DDT-%s"
#define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_CREATION_VERSION "creation_version"
#define DMU_POOL_SCAN "scan" #define DMU_POOL_SCAN "scan"
#define DMU_POOL_ERRORSCRUB "error_scrub" #define DMU_POOL_ERRORSCRUB "error_scrub"
View File
@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
ddt_entry_t *dde, dmu_tx_t *tx); ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
View File
@ -1710,6 +1710,11 @@ typedef enum {
#define ZPOOL_INITIALIZE_COMMAND "initialize_command" #define ZPOOL_INITIALIZE_COMMAND "initialize_command"
#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" #define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs"
/*
* The following are names used when invoking ZFS_IOC_POOL_REGUID.
*/
#define ZPOOL_REGUID_GUID "guid"
/* /*
* The following are names used when invoking ZFS_IOC_POOL_TRIM. * The following are names used when invoking ZFS_IOC_POOL_TRIM.
*/ */
View File
@ -572,7 +572,7 @@ typedef struct blkptr {
#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
BP_GET_PSIZE(bp)) BP_GET_PSIZE(bp))
#define BP_ZERO(bp) \ #define BP_ZERO_DVAS(bp) \
{ \ { \
(bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[0] = 0; \
(bp)->blk_dva[0].dva_word[1] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \
@ -580,6 +580,11 @@ typedef struct blkptr {
(bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[1].dva_word[1] = 0; \
(bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \
(bp)->blk_dva[2].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \
}
#define BP_ZERO(bp) \
{ \
BP_ZERO_DVAS(bp); \
(bp)->blk_prop = 0; \ (bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \ (bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \ (bp)->blk_pad[1] = 0; \
@ -1087,7 +1092,7 @@ extern void spa_strfree(char *);
extern uint64_t spa_generate_guid(spa_t *spa); extern uint64_t spa_generate_guid(spa_t *spa);
extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
extern void spa_freeze(spa_t *spa); extern void spa_freeze(spa_t *spa);
extern int spa_change_guid(spa_t *spa); extern int spa_change_guid(spa_t *spa, const uint64_t *guidp);
extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_upgrade(spa_t *spa, uint64_t version);
extern void spa_evict_all(void); extern void spa_evict_all(void);
extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
View File
@ -22,7 +22,7 @@
/* /*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2019, Allan Jude * Copyright (c) 2019, Allan Jude
* Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, 2024, Klara, Inc.
* Use is subject to license terms. * Use is subject to license terms.
* Copyright (c) 2015, 2016 by Delphix. All rights reserved. * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
*/ */
@ -122,25 +122,15 @@ enum zio_zstd_levels {
struct zio_prop; struct zio_prop;
/* Common signature for all zio compress functions. */ /* Common signature for all zio compress functions. */
typedef size_t zio_compress_func_t(void *src, void *dst, typedef size_t zio_compress_func_t(abd_t *src, abd_t *dst,
size_t s_len, size_t d_len, int); size_t s_len, size_t d_len, int);
/* Common signature for all zio decompress functions. */ /* Common signature for all zio decompress functions. */
typedef int zio_decompress_func_t(void *src, void *dst, typedef int zio_decompress_func_t(abd_t *src, abd_t *dst,
size_t s_len, size_t d_len, int); size_t s_len, size_t d_len, int);
/* Common signature for all zio decompress and get level functions. */ /* Common signature for all zio decompress and get level functions. */
typedef int zio_decompresslevel_func_t(void *src, void *dst, typedef int zio_decompresslevel_func_t(abd_t *src, abd_t *dst,
size_t s_len, size_t d_len, uint8_t *level); size_t s_len, size_t d_len, uint8_t *level);
/* Common signature for all zio get-compression-level functions. */
typedef int zio_getlevel_func_t(void *src, size_t s_len, uint8_t *level);
/*
* Common signature for all zio decompress functions using an ABD as input.
* This is helpful if you have both compressed ARC and scatter ABDs enabled,
* but is not a requirement for all compression algorithms.
*/
typedef int zio_decompress_abd_func_t(abd_t *src, void *dst,
size_t s_len, size_t d_len, int);
/* /*
* Information about each compression function. * Information about each compression function.
*/ */
@ -163,34 +153,66 @@ extern void lz4_fini(void);
/* /*
* Compression routines. * Compression routines.
*/ */
extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len, extern size_t zfs_lzjb_compress(abd_t *src, abd_t *dst, size_t s_len,
int level); size_t d_len, int level);
extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len, extern int zfs_lzjb_decompress(abd_t *src, abd_t *dst, size_t s_len,
int level); size_t d_len, int level);
extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, extern size_t zfs_gzip_compress(abd_t *src, abd_t *dst, size_t s_len,
int level); size_t d_len, int level);
extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, extern int zfs_gzip_decompress(abd_t *src, abd_t *dst, size_t s_len,
int level); size_t d_len, int level);
extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len, extern size_t zfs_zle_compress(abd_t *src, abd_t *dst, size_t s_len,
int level); size_t d_len, int level);
extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len, extern int zfs_zle_decompress(abd_t *src, abd_t *dst, size_t s_len,
int level); size_t d_len, int level);
extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len, extern size_t zfs_lz4_compress(abd_t *src, abd_t *dst, size_t s_len,
int level); size_t d_len, int level);
extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, extern int zfs_lz4_decompress(abd_t *src, abd_t *dst, size_t s_len,
int level); size_t d_len, int level);
/* /*
* Compress and decompress data if necessary. * Compress and decompress data if necessary.
*/ */
extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void **dst, extern size_t zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst,
size_t s_len, uint8_t level); size_t s_len, uint8_t level);
extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, extern int zio_decompress_data(enum zio_compress c, abd_t *src, abd_t *abd,
size_t s_len, size_t d_len, uint8_t *level);
extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
size_t s_len, size_t d_len, uint8_t *level); size_t s_len, size_t d_len, uint8_t *level);
extern int zio_compress_to_feature(enum zio_compress comp); extern int zio_compress_to_feature(enum zio_compress comp);
#define ZFS_COMPRESS_WRAP_DECL(name) \
size_t \
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n) \
{ \
void *s_buf = abd_borrow_buf_copy(src, s_len); \
void *d_buf = abd_borrow_buf(dst, d_len); \
size_t c_len = name##_buf(s_buf, d_buf, s_len, d_len, n); \
abd_return_buf(src, s_buf, s_len); \
abd_return_buf_copy(dst, d_buf, d_len); \
return (c_len); \
}
#define ZFS_DECOMPRESS_WRAP_DECL(name) \
int \
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n) \
{ \
void *s_buf = abd_borrow_buf_copy(src, s_len); \
void *d_buf = abd_borrow_buf(dst, d_len); \
int err = name##_buf(s_buf, d_buf, s_len, d_len, n); \
abd_return_buf(src, s_buf, s_len); \
abd_return_buf_copy(dst, d_buf, d_len); \
return (err); \
}
#define ZFS_DECOMPRESS_LEVEL_WRAP_DECL(name) \
int \
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, uint8_t *n) \
{ \
void *s_buf = abd_borrow_buf_copy(src, s_len); \
void *d_buf = abd_borrow_buf(dst, d_len); \
int err = name##_buf(s_buf, d_buf, s_len, d_len, n); \
abd_return_buf(src, s_buf, s_len); \
abd_return_buf_copy(dst, d_buf, d_len); \
return (err); \
}
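A hedged sketch (names are placeholders, not an existing compressor) of how these wrappers are meant to be used: an algorithm supplies a buffer-to-buffer worker carrying the _buf suffix, and the macro emits the matching ABD-facing entry point with the signature declared in this header.

/* Buffer-to-buffer worker; stores data uncompressed as a trivial example. */
static size_t
example_compress_buf(void *src, void *dst, size_t s_len, size_t d_len, int level)
{
	(void) level;
	if (d_len < s_len)
		return (s_len);		/* no room; caller treats as incompressible */
	memcpy(dst, src, s_len);
	return (s_len);
}

/* Emits: size_t example_compress(abd_t *, abd_t *, size_t, size_t, int) */
ZFS_COMPRESS_WRAP_DECL(example_compress)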
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
View File
@ -90,14 +90,12 @@ typedef struct zfs_zstd_meta {
int zstd_init(void); int zstd_init(void);
void zstd_fini(void); void zstd_fini(void);
size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t zfs_zstd_compress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
size_t zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len,
size_t d_len, int level); size_t d_len, int level);
int zfs_zstd_get_level(void *s_start, size_t s_len, uint8_t *level); int zfs_zstd_get_level(void *s_start, size_t s_len, uint8_t *level);
int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, int zfs_zstd_decompress_level(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, uint8_t *level); size_t d_len, uint8_t *level);
int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, int zfs_zstd_decompress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int n); size_t d_len, int n);
void zfs_zstd_cache_reap_now(void); void zfs_zstd_cache_reap_now(void);
View File
@ -82,6 +82,7 @@ typedef enum spa_feature {
SPA_FEATURE_AVZ_V2, SPA_FEATURE_AVZ_V2,
SPA_FEATURE_REDACTION_LIST_SPILL, SPA_FEATURE_REDACTION_LIST_SPILL,
SPA_FEATURE_RAIDZ_EXPANSION, SPA_FEATURE_RAIDZ_EXPANSION,
SPA_FEATURE_FAST_DEDUP,
SPA_FEATURES SPA_FEATURES
} spa_feature_t; } spa_feature_t;
View File
@ -556,6 +556,7 @@
<elf-symbol name='zpool_scan' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_scan' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_search_import' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_search_import' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_bootenv' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_set_bootenv' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_guid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_prop' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_set_prop' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_vdev_prop' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_set_vdev_prop' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_skip_pool' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_skip_pool' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -616,7 +617,7 @@
<elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2296' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='spa_feature_table' size='2352' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -6006,7 +6007,8 @@
<enumerator name='SPA_FEATURE_AVZ_V2' value='38'/> <enumerator name='SPA_FEATURE_AVZ_V2' value='38'/>
<enumerator name='SPA_FEATURE_REDACTION_LIST_SPILL' value='39'/> <enumerator name='SPA_FEATURE_REDACTION_LIST_SPILL' value='39'/>
<enumerator name='SPA_FEATURE_RAIDZ_EXPANSION' value='40'/> <enumerator name='SPA_FEATURE_RAIDZ_EXPANSION' value='40'/>
<enumerator name='SPA_FEATURES' value='41'/> <enumerator name='SPA_FEATURE_FAST_DEDUP' value='41'/>
<enumerator name='SPA_FEATURES' value='42'/>
</enum-decl> </enum-decl>
<typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/> <typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
<qualified-type-def type-id='80f4b756' const='yes' id='b99c00c9'/> <qualified-type-def type-id='80f4b756' const='yes' id='b99c00c9'/>
@ -6638,6 +6640,11 @@
<parameter type-id='9c313c2d' name='guid'/> <parameter type-id='9c313c2d' name='guid'/>
<return type-id='95e97e5e'/> <return type-id='95e97e5e'/>
</function-decl> </function-decl>
<function-decl name='zpool_set_guid' mangled-name='zpool_set_guid' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_set_guid'>
<parameter type-id='4c81de99' name='zhp'/>
<parameter type-id='713a56f5' name='guid'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_reguid' mangled-name='zpool_reguid' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_reguid'> <function-decl name='zpool_reguid' mangled-name='zpool_reguid' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_reguid'>
<parameter type-id='4c81de99' name='zhp'/> <parameter type-id='4c81de99' name='zhp'/>
<return type-id='95e97e5e'/> <return type-id='95e97e5e'/>
@ -9131,8 +9138,8 @@
</function-decl> </function-decl>
</abi-instr> </abi-instr>
<abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'> <abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18368' id='b93e4d14'> <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18816' id='b937914f'>
<subrange length='41' type-id='7359adad' id='cb834f44'/> <subrange length='42' type-id='7359adad' id='cb7c937f'/>
</array-type-def> </array-type-def>
<enum-decl name='zfeature_flags' id='6db816a4'> <enum-decl name='zfeature_flags' id='6db816a4'>
<underlying-type type-id='9cac1fee'/> <underlying-type type-id='9cac1fee'/>
@ -9209,7 +9216,7 @@
<pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/> <pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/>
<qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/> <qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/>
<pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/> <pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/>
<var-decl name='spa_feature_table' type-id='b93e4d14' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/> <var-decl name='spa_feature_table' type-id='b937914f' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/> <var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/>
<function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'> <function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/> <parameter type-id='80f4b756'/>
View File
@ -3735,6 +3735,13 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk,
(void) zpool_standard_error(hdl, errno, errbuf); (void) zpool_standard_error(hdl, errno, errbuf);
} }
break; break;
case ZFS_ERR_ASHIFT_MISMATCH:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"The new device cannot have a higher alignment requirement "
"than the top-level vdev."));
(void) zfs_error(hdl, EZFS_BADTARGET, errbuf);
break;
default: default:
(void) zpool_standard_error(hdl, errno, errbuf); (void) zpool_standard_error(hdl, errno, errbuf);
} }
@ -4305,22 +4312,55 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
/* /*
* Change the GUID for a pool. * Change the GUID for a pool.
*
* Similar to zpool_reguid(), but may take a GUID.
*
* If the guid argument is NULL, then no GUID is passed in the nvlist to the
* ioctl().
*/ */
int int
zpool_reguid(zpool_handle_t *zhp) zpool_set_guid(zpool_handle_t *zhp, const uint64_t *guid)
{ {
char errbuf[ERRBUFLEN]; char errbuf[ERRBUFLEN];
libzfs_handle_t *hdl = zhp->zpool_hdl; libzfs_handle_t *hdl = zhp->zpool_hdl;
nvlist_t *nvl = NULL;
zfs_cmd_t zc = {"\0"}; zfs_cmd_t zc = {"\0"};
int error = -1;
if (guid != NULL) {
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
return (no_memory(hdl));
if (nvlist_add_uint64(nvl, ZPOOL_REGUID_GUID, *guid) != 0) {
nvlist_free(nvl);
return (no_memory(hdl));
}
zcmd_write_src_nvlist(hdl, &zc, nvl);
}
(void) snprintf(errbuf, sizeof (errbuf), (void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name); dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0) error = zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc);
return (0); if (error) {
return (zpool_standard_error(hdl, errno, errbuf));
}
if (guid != NULL) {
zcmd_free_nvlists(&zc);
nvlist_free(nvl);
}
return (0);
}
return (zpool_standard_error(hdl, errno, errbuf)); /*
* Change the GUID for a pool.
*/
int
zpool_reguid(zpool_handle_t *zhp)
{
return (zpool_set_guid(zhp, NULL));
} }
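A brief caller-side sketch (assuming an already-open pool handle): pass a specific GUID through the new nvlist path, or NULL to keep the previous behaviour where the kernel picks one.

static int
example_set_pool_guid(zpool_handle_t *zhp, boolean_t pick_random)
{
	uint64_t guid = 0x1234abcd5678ef01ULL;	/* caller-chosen value */

	if (pick_random)
		return (zpool_reguid(zhp));	/* kernel generates the GUID */
	return (zpool_set_guid(zhp, &guid));
}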
/* /*
View File
@ -1,7 +1,9 @@
include $(srcdir)/%D%/include/Makefile.am
libzpool_la_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) $(LIBRARY_CFLAGS) libzpool_la_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) $(LIBRARY_CFLAGS)
libzpool_la_CFLAGS += $(ZLIB_CFLAGS) libzpool_la_CFLAGS += $(ZLIB_CFLAGS)
libzpool_la_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) libzpool_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
libzpool_la_CPPFLAGS += -I$(srcdir)/include/os/@ac_system_l@/zfs libzpool_la_CPPFLAGS += -I$(srcdir)/include/os/@ac_system_l@/zfs
libzpool_la_CPPFLAGS += -DLIB_ZPOOL_BUILD libzpool_la_CPPFLAGS += -DLIB_ZPOOL_BUILD
@ -9,6 +11,7 @@ lib_LTLIBRARIES += libzpool.la
CPPCHECKTARGETS += libzpool.la CPPCHECKTARGETS += libzpool.la
dist_libzpool_la_SOURCES = \ dist_libzpool_la_SOURCES = \
%D%/abd_os.c \
%D%/kernel.c \ %D%/kernel.c \
%D%/taskq.c \ %D%/taskq.c \
%D%/util.c %D%/util.c
@ -39,7 +42,6 @@ nodist_libzpool_la_SOURCES = \
module/lua/lvm.c \ module/lua/lvm.c \
module/lua/lzio.c \ module/lua/lzio.c \
\ \
module/os/linux/zfs/abd_os.c \
module/os/linux/zfs/arc_os.c \ module/os/linux/zfs/arc_os.c \
module/os/linux/zfs/trace.c \ module/os/linux/zfs/trace.c \
module/os/linux/zfs/vdev_file.c \ module/os/linux/zfs/vdev_file.c \
@ -79,6 +81,7 @@ nodist_libzpool_la_SOURCES = \
module/zfs/dbuf.c \ module/zfs/dbuf.c \
module/zfs/dbuf_stats.c \ module/zfs/dbuf_stats.c \
module/zfs/ddt.c \ module/zfs/ddt.c \
module/zfs/ddt_log.c \
module/zfs/ddt_stats.c \ module/zfs/ddt_stats.c \
module/zfs/ddt_zap.c \ module/zfs/ddt_zap.c \
module/zfs/dmu.c \ module/zfs/dmu.c \
lib/libzpool/abd_os.c Normal file
View File
@ -0,0 +1,365 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
/*
* We're simulating scatter/gather with 4K allocations, since that's more like
* what a typical kernel does.
*/
#define ABD_PAGESIZE (4096)
#define ABD_PAGESHIFT (12)
#define ABD_PAGEMASK (ABD_PAGESIZE-1)
/*
* See rationale in module/os/linux/zfs/abd_os.c, but in userspace this is
* mostly useful to get a mix of linear and scatter ABDs for testing.
*/
#define ABD_SCATTER_MIN_SIZE (512 * 3)
abd_t *abd_zero_scatter = NULL;
static uint_t
abd_iovcnt_for_bytes(size_t size)
{
/*
* Each iovec points to a 4K page. There's no real reason to do this
* in userspace, but our whole point here is to make it feel a bit
* more like a real paged memory model.
*/
return (P2ROUNDUP(size, ABD_PAGESIZE) / ABD_PAGESIZE);
}
abd_t *
abd_alloc_struct_impl(size_t size)
{
/*
* Zero-sized means it will be used for a linear or gang abd, so just
* allocate the abd itself and return.
*/
if (size == 0)
return (umem_alloc(sizeof (abd_t), UMEM_NOFAIL));
/*
* Allocating for a scatter abd, so compute how many ABD_PAGESIZE
* iovecs we will need to hold this size. Append that allocation to the
* end. Note that struct abd_scatter includes abd_iov[1], so we
* allocate one less iovec than we need.
*
* Note we're not allocating the pages proper, just the iovec pointers.
* That's down in abd_alloc_chunks. We _could_ do it here in a single
* allocation, but it's fiddly and harder to read for no real gain.
*/
uint_t n = abd_iovcnt_for_bytes(size);
abd_t *abd = umem_alloc(sizeof (abd_t) + (n-1) * sizeof (struct iovec),
UMEM_NOFAIL);
ABD_SCATTER(abd).abd_offset = 0;
ABD_SCATTER(abd).abd_iovcnt = n;
return (abd);
}
void
abd_free_struct_impl(abd_t *abd)
{
/* For scatter, compute the extra amount we need to free */
uint_t iovcnt =
abd_is_linear(abd) || abd_is_gang(abd) ?
0 : (ABD_SCATTER(abd).abd_iovcnt - 1);
umem_free(abd, sizeof (abd_t) + iovcnt * sizeof (struct iovec));
}
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
/*
* We've already allocated the iovec array; ensure that the wanted size
* actually matches, otherwise the caller has made a mistake somewhere.
*/
uint_t n = ABD_SCATTER(abd).abd_iovcnt;
ASSERT3U(n, ==, abd_iovcnt_for_bytes(size));
/*
* Allocate an ABD_PAGESIZE region for each iovec.
*/
struct iovec *iov = ABD_SCATTER(abd).abd_iov;
for (int i = 0; i < n; i++) {
iov[i].iov_base =
umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL);
iov[i].iov_len = ABD_PAGESIZE;
}
}
void
abd_free_chunks(abd_t *abd)
{
uint_t n = ABD_SCATTER(abd).abd_iovcnt;
struct iovec *iov = ABD_SCATTER(abd).abd_iov;
for (int i = 0; i < n; i++)
umem_free_aligned(iov[i].iov_base, ABD_PAGESIZE);
}
boolean_t
abd_size_alloc_linear(size_t size)
{
return (size < ABD_SCATTER_MIN_SIZE);
}
void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
int waste = P2ROUNDUP(abd->abd_size, ABD_PAGESIZE) - abd->abd_size;
if (op == ABDSTAT_INCR) {
arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
} else {
arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
}
}
void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
{
(void) abd;
(void) op;
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
}
void
abd_verify_scatter(abd_t *abd)
{
#ifdef ZFS_DEBUG
/*
* scatter abds shall have:
* - at least one iovec
* - all iov_base point somewhere
* - all iov_len are ABD_PAGESIZE
* - offset set within the abd pages somewhere
*/
uint_t n = ABD_SCATTER(abd).abd_iovcnt;
ASSERT3U(n, >, 0);
uint_t len = 0;
for (int i = 0; i < n; i++) {
ASSERT3P(ABD_SCATTER(abd).abd_iov[i].iov_base, !=, NULL);
ASSERT3U(ABD_SCATTER(abd).abd_iov[i].iov_len, ==, ABD_PAGESIZE);
len += ABD_PAGESIZE;
}
ASSERT3U(ABD_SCATTER(abd).abd_offset, <, len);
#endif
}
void
abd_init(void)
{
/*
* Create the "zero" scatter abd. This is always the size of the
* largest possible block, but only actually has a single allocated
* page, which all iovecs in the abd point to.
*/
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
void *zero =
umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL);
memset(zero, 0, ABD_PAGESIZE);
uint_t n = abd_iovcnt_for_bytes(SPA_MAXBLOCKSIZE);
struct iovec *iov = ABD_SCATTER(abd_zero_scatter).abd_iov;
for (int i = 0; i < n; i++) {
iov[i].iov_base = zero;
iov[i].iov_len = ABD_PAGESIZE;
}
}
void
abd_fini(void)
{
umem_free_aligned(
ABD_SCATTER(abd_zero_scatter).abd_iov[0].iov_base, ABD_PAGESIZE);
abd_free_struct(abd_zero_scatter);
abd_zero_scatter = NULL;
}
void
abd_free_linear_page(abd_t *abd)
{
/*
* LINEAR_PAGE is specific to the Linux kernel; we never set this
* flag, so this will never be called.
*/
(void) abd;
PANIC("unreachable");
}
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
return (abd_alloc(size, is_metadata));
}
abd_t *
abd_get_offset_scatter(abd_t *dabd, abd_t *sabd, size_t off, size_t size)
{
/*
* Create a new scatter dabd by borrowing data pages from sabd to cover
* off+size.
*
* sabd is an existing scatter abd with a set of iovecs, each covering
 * an ABD_PAGESIZE (4K) allocation. Its "zero" is at abd_offset.
*
* [........][........][........][........]
* ^- sabd_offset
*
* We want to produce a new abd, referencing those allocations at the
* given offset.
*
* [........][........][........][........]
* ^- dabd_offset = sabd_offset + off
* ^- dabd_offset + size
*
* In this example, dabd needs three iovecs. The first iovec is offset
* 0, so the final dabd_offset is masked back into the first iovec.
*
* [........][........][........]
* ^- dabd_offset
*/
size_t soff = ABD_SCATTER(sabd).abd_offset + off;
size_t doff = soff & ABD_PAGEMASK;
size_t iovcnt = abd_iovcnt_for_bytes(doff + size);
/*
* If the passed-in abd has enough allocated iovecs already, reuse it.
* Otherwise, make a new one. The caller will free the original if the
* one it gets back is not the same.
*
* Note that it's ok if we reuse an abd with more iovecs than we need.
* abd_size has the usable amount of data, and the abd does not own the
* pages referenced by the iovecs. At worst, they're holding dangling
* pointers that we'll never use anyway.
*/
if (dabd == NULL || ABD_SCATTER(dabd).abd_iovcnt < iovcnt)
dabd = abd_alloc_struct(iovcnt << ABD_PAGESHIFT);
/* Set offset into first page in view */
ABD_SCATTER(dabd).abd_offset = doff;
/* Copy the wanted iovecs from the source to the dest */
memcpy(&ABD_SCATTER(dabd).abd_iov[0],
&ABD_SCATTER(sabd).abd_iov[soff >> ABD_PAGESHIFT],
iovcnt * sizeof (struct iovec));
return (dabd);
}
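/*
 * Illustrative, standalone sketch (not part of this file): the offset
 * arithmetic performed by abd_get_offset_scatter() above, assuming a 4K
 * ABD_PAGESIZE. The concrete offsets and sizes are made up.
 */
#include <stdio.h>
#include <stddef.h>

#define	SKETCH_PAGESIZE		4096u
#define	SKETCH_PAGESHIFT	12u
#define	SKETCH_PAGEMASK		(SKETCH_PAGESIZE - 1)
#define	SKETCH_P2ROUNDUP(x, a)	((((x) - 1) | ((a) - 1)) + 1)

int
main(void)
{
	size_t sabd_offset = 512;	/* source view starts 512B into page 0 */
	size_t off = 10000;		/* requested offset into the view */
	size_t size = 6000;		/* requested length */

	size_t soff = sabd_offset + off;		/* 10512 */
	size_t doff = soff & SKETCH_PAGEMASK;		/* 2320: new abd_offset */
	size_t first = soff >> SKETCH_PAGESHIFT;	/* 2: first iovec copied */
	size_t iovcnt = SKETCH_P2ROUNDUP(doff + size, SKETCH_PAGESIZE) /
	    SKETCH_PAGESIZE;				/* 3 iovecs borrowed */

	printf("copy %zu iovecs starting at source iovec %zu, "
	    "dest abd_offset=%zu\n", iovcnt, first, doff);
	return (0);
}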
void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
ASSERT(!abd_is_gang(abd));
abd_verify(abd);
memset(aiter, 0, sizeof (struct abd_iter));
aiter->iter_abd = abd;
}
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
return (aiter->iter_pos == aiter->iter_abd->abd_size);
}
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
if (abd_iter_at_end(aiter))
return;
aiter->iter_pos += amount;
ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
}
void
abd_iter_map(struct abd_iter *aiter)
{
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
if (abd_iter_at_end(aiter))
return;
if (abd_is_linear(aiter->iter_abd)) {
aiter->iter_mapaddr =
ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
aiter->iter_mapsize =
aiter->iter_abd->abd_size - aiter->iter_pos;
return;
}
/*
 * For scatter, we index into the appropriate iovec, and map the smaller
 * of what remains in the abd and what remains in the current page.
*/
size_t poff = aiter->iter_pos + ABD_SCATTER(aiter->iter_abd).abd_offset;
ASSERT3U(poff >> ABD_PAGESHIFT, <=,
ABD_SCATTER(aiter->iter_abd).abd_iovcnt);
struct iovec *iov = &ABD_SCATTER(aiter->iter_abd).
abd_iov[poff >> ABD_PAGESHIFT];
aiter->iter_mapsize = MIN(ABD_PAGESIZE - (poff & ABD_PAGEMASK),
aiter->iter_abd->abd_size - aiter->iter_pos);
ASSERT3U(aiter->iter_mapsize, <=, ABD_PAGESIZE);
aiter->iter_mapaddr = iov->iov_base + (poff & ABD_PAGEMASK);
}
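/*
 * Illustrative, standalone sketch (not part of this file): how the scatter
 * branch of abd_iter_map() above clamps each mapping to the end of the
 * current 4K page. The offset and size are hypothetical.
 */
#include <stdio.h>
#include <stddef.h>

#define	SKETCH_PAGESIZE	4096u
#define	SKETCH_PAGEMASK	(SKETCH_PAGESIZE - 1)
#define	SKETCH_MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	size_t abd_offset = 2320;	/* view starts mid-page (see above) */
	size_t abd_size = 6000;		/* usable bytes in the abd */

	/* Walk the abd the way an iterating caller would. */
	for (size_t pos = 0; pos < abd_size; ) {
		size_t poff = pos + abd_offset;
		size_t mapsize = SKETCH_MIN(
		    SKETCH_PAGESIZE - (poff & SKETCH_PAGEMASK),
		    abd_size - pos);
		printf("pos=%zu: map %zu bytes of iovec %zu\n",
		    pos, mapsize, poff / SKETCH_PAGESIZE);
		pos += mapsize;
	}
	return (0);
}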
void
abd_iter_unmap(struct abd_iter *aiter)
{
if (abd_iter_at_end(aiter))
return;
ASSERT3P(aiter->iter_mapaddr, !=, NULL);
ASSERT3U(aiter->iter_mapsize, >, 0);
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
}
void
abd_cache_reap_now(void)
{
}

View File

@ -0,0 +1,4 @@
libzpooldir = $(includedir)/libzpool
libzpool_HEADERS = \
%D%/sys/abd_os.h \
%D%/sys/abd_impl_os.h

View File

@ -0,0 +1,41 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#ifndef _ABD_IMPL_OS_H
#define _ABD_IMPL_OS_H
#ifdef __cplusplus
extern "C" {
#endif
#define abd_enter_critical(flags) ((void)0)
#define abd_exit_critical(flags) ((void)0)
#ifdef __cplusplus
}
#endif
#endif /* _ABD_IMPL_OS_H */

View File

@ -0,0 +1,47 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
*/
#ifndef _ABD_OS_H
#define _ABD_OS_H
#ifdef __cplusplus
extern "C" {
#endif
struct abd_scatter {
uint_t abd_offset;
uint_t abd_iovcnt;
struct iovec abd_iov[1]; /* actually variable-length */
};
struct abd_linear {
void *abd_buf;
};
#ifdef __cplusplus
}
#endif
#endif /* _ABD_OS_H */

View File

@ -175,17 +175,6 @@ Increasing this value will
result in a slower thread creation rate which may be preferable for some result in a slower thread creation rate which may be preferable for some
configurations. configurations.
. .
.It Sy spl_max_show_tasks Ns = Ns Sy 512 Pq uint
The maximum number of tasks per pending list in each taskq shown in
.Pa /proc/spl/taskq{,-all} .
Write
.Sy 0
to turn off the limit.
The proc file will walk the lists with lock held,
reading it could cause a lock-up if the list grow too large
without limiting the output.
"(truncated)" will be shown if the list is larger than the limit.
.
.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 5000 Pq uint .It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 5000 Pq uint
Minimum idle threads exit interval for dynamic taskqs. Minimum idle threads exit interval for dynamic taskqs.
Smaller values allow idle threads exit more often and potentially be Smaller values allow idle threads exit more often and potentially be

View File

@ -77,6 +77,17 @@ the array is dynamically sized based on total system memory.
dnode slots allocated in a single operation as a power of 2. dnode slots allocated in a single operation as a power of 2.
The default value minimizes lock contention for the bulk operation performed. The default value minimizes lock contention for the bulk operation performed.
. .
.It Sy dmu_ddt_copies Ns = Ns Sy 3 Pq uint
Controls the number of copies stored for DeDup Table
.Pq DDT
objects.
Reducing the number of copies to 1 from the previous default of 3
can reduce the write inflation caused by deduplication.
This assumes redundancy for this data is provided by the vdev layer.
If the DDT is damaged, space may be leaked
.Pq not freed
when the DDT can not report the correct reference count.
.
.It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint .It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint
Limit the amount we can prefetch with one call to this amount in bytes. Limit the amount we can prefetch with one call to this amount in bytes.
This helps to limit the amount of memory that can be used by prefetching. This helps to limit the amount of memory that can be used by prefetching.
@ -121,20 +132,26 @@ Controls whether buffers present on special vdevs are eligible for caching
into L2ARC. into L2ARC.
If set to 1, exclude dbufs on special vdevs from being cached to L2ARC. If set to 1, exclude dbufs on special vdevs from being cached to L2ARC.
. .
.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int .It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Ns | Ns 2 Pq int
Controls whether only MFU metadata and data are cached from ARC into L2ARC. Controls whether only MFU metadata and data are cached from ARC into L2ARC.
This may be desired to avoid wasting space on L2ARC when reading/writing large This may be desired to avoid wasting space on L2ARC when reading/writing large
amounts of data that are not expected to be accessed more than once. amounts of data that are not expected to be accessed more than once.
.Pp .Pp
The default is off, The default is 0,
meaning both MRU and MFU data and metadata are cached. meaning both MRU and MFU data and metadata are cached.
When turning off this feature, some MRU buffers will still be present When turning off this feature (setting it to 0), some MRU buffers will
in ARC and eventually cached on L2ARC. still be present in ARC and eventually cached on L2ARC.
.No If Sy l2arc_noprefetch Ns = Ns Sy 0 , .No If Sy l2arc_noprefetch Ns = Ns Sy 0 ,
some prefetched buffers will be cached to L2ARC, and those might later some prefetched buffers will be cached to L2ARC, and those might later
transition to MRU, in which case the transition to MRU, in which case the
.Sy l2arc_mru_asize No arcstat will not be Sy 0 . .Sy l2arc_mru_asize No arcstat will not be Sy 0 .
.Pp .Pp
Setting it to 1 means to L2 cache only MFU data and metadata.
.Pp
Setting it to 2 means to L2 cache all metadata (MRU+MFU) but
only MFU data (i.e. MRU data are not cached). This can be the right setting
to cache as much metadata as possible even with high data turnover.
.Pp
Regardless of Regardless of
.Sy l2arc_noprefetch , .Sy l2arc_noprefetch ,
some MFU buffers might be evicted from ARC, some MFU buffers might be evicted from ARC,
@ -821,6 +838,7 @@ This is a limit on how many pages the ARC shrinker makes available for
eviction in response to one page allocation attempt. eviction in response to one page allocation attempt.
Note that in practice, the kernel's shrinker can ask us to evict Note that in practice, the kernel's shrinker can ask us to evict
up to about four times this for one allocation attempt. up to about four times this for one allocation attempt.
To reduce OOM risk, this limit is applied for kswapd reclaims only.
.Pp .Pp
The default limit of The default limit of
.Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages .Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages
@ -974,6 +992,88 @@ milliseconds until the operation completes.
.It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int
Enable prefetching dedup-ed blocks which are going to be freed. Enable prefetching dedup-ed blocks which are going to be freed.
. .
.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint
Maximum number of dedup log flush passes (iterations) each transaction.
.Pp
At the start of each transaction, OpenZFS will estimate how many entries it
needs to flush out to keep up with the change rate, taking the amount and time
taken to flush on previous txgs into account (see
.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
It will spread this amount into a number of passes.
At each pass, it will use the amount already flushed and the total time taken
by flushing and by other IO to recompute how much it should do for the remainder
of the txg.
.Pp
Reducing the max number of passes will make flushing more aggressive, flushing
out more entries on each pass.
This can be faster, but also more likely to compete with other IO.
Increasing the max number of passes will put fewer entries onto each pass,
keeping the overhead of dedup changes to a minimum but possibly causing a large
number of changes to be dumped on the last pass, which can blow out the txg
sync time beyond
.Sy zfs_txg_timeout .
.
.It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint
Minimum time to spend on dedup log flush each transaction.
.Pp
At least this long will be spent flushing dedup log entries each transaction,
up to
.Sy zfs_txg_timeout .
This occurs even if doing so would delay the transaction, that is, other IO
completes under this time.
.
.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint
Flush at least this many entries each transaction.
.Pp
OpenZFS will estimate how many entries it needs to flush each transaction to
keep up with the ingest rate (see
.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
This sets the minimum for that estimate.
Raising it can force OpenZFS to flush more aggressively, keeping the log small
and so reducing pool import times, but can make it less able to back off if
log flushing would compete with other IO too much.
.
.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint
Number of transactions to use to compute the flow rate.
.Pp
OpenZFS will estimate how many entries it needs to flush each transaction by
monitoring the number of entries changed (ingest rate), number of entries
flushed (flush rate) and time spent flushing (flush time rate) and combining
these into an overall "flow rate".
It will use an exponential weighted moving average over some number of recent
transactions to compute these rates.
This sets the number of transactions to compute these averages over.
Setting it higher can help to smooth out the flow rate in the face of spiky
workloads, but will take longer for the flow rate to adjust to a sustained
change in the ingest rate.
.
.It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint
Maximum number of transactions before starting to flush dedup logs.
.Pp
OpenZFS maintains two dedup logs, one receiving new changes, one flushing.
If there is nothing to flush, it will accumulate changes for no more than this
many transactions before switching the logs and starting to flush entries out.
.
.It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64
Max memory to use for dedup logs.
.Pp
OpenZFS will spend no more than this much memory on maintaining the in-memory
dedup log.
Flushing will begin when around half this amount is being spent on logs.
The default value of
.Sy 0
will cause it to be set by
.Sy zfs_dedup_log_mem_max_percent
instead.
.
.It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint
Max memory to use for dedup logs, as a percentage of total memory.
.Pp
If
.Sy zfs_dedup_log_mem_max
is not set, it will be initialised as a percentage of the total memory in the
system.
.
.It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint .It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint
Start to delay each transaction once there is this amount of dirty data, Start to delay each transaction once there is this amount of dirty data,
expressed as a percentage of expressed as a percentage of

View File

@ -17,8 +17,9 @@
.\" Copyright (c) 2019, Klara Inc. .\" Copyright (c) 2019, Klara Inc.
.\" Copyright (c) 2019, Allan Jude .\" Copyright (c) 2019, Allan Jude
.\" Copyright (c) 2021, Colm Buckley <colm@tuatha.org> .\" Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
.\" Copyright (c) 2023, Klara Inc.
.\" .\"
.Dd June 23, 2022 .Dd February 14, 2024
.Dt ZPOOL-FEATURES 7 .Dt ZPOOL-FEATURES 7
.Os .Os
. .
@ -550,6 +551,20 @@ when an encrypted dataset is created and will be returned to the
.Sy enabled .Sy enabled
state when all datasets that use this feature are destroyed. state when all datasets that use this feature are destroyed.
. .
.feature com.klarasystems fast_dedup yes
This feature allows more advanced deduplication features to be enabled on new
dedup tables.
.Pp
This feature will be
.Sy active
when the first deduplicated block is written after a new dedup table is created
(i.e. after a new pool creation, or a new checksum used on a dataset with
.Sy dedup
enabled).
It will be returned to the
.Sy enabled
state when all deduplicated blocks using it are freed.
.
.feature com.delphix extensible_dataset no .feature com.delphix extensible_dataset no
This feature allows more flexible use of internal ZFS data structures, This feature allows more flexible use of internal ZFS data structures,
and exists for other features to depend on. and exists for other features to depend on.

View File

@ -25,8 +25,10 @@
.\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright (c) 2018 George Melikov. All Rights Reserved.
.\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\" Copyright (c) 2024, Klara Inc.
.\" Copyright (c) 2024, Mateusz Piotrowski
.\" .\"
.Dd May 31, 2021 .Dd June 21, 2023
.Dt ZPOOL-REGUID 8 .Dt ZPOOL-REGUID 8
.Os .Os
. .
@ -36,6 +38,7 @@
.Sh SYNOPSIS .Sh SYNOPSIS
.Nm zpool .Nm zpool
.Cm reguid .Cm reguid
.Op Fl g Ar guid
.Ar pool .Ar pool
. .
.Sh DESCRIPTION .Sh DESCRIPTION
@ -43,6 +46,15 @@ Generates a new unique identifier for the pool.
You must ensure that all devices in this pool are online and healthy before You must ensure that all devices in this pool are online and healthy before
performing this action. performing this action.
. .
.Bl -tag -width Ds
.It Fl g Ar guid
Set the pool GUID to the provided value.
The GUID can be any 64-bit value accepted by
.Xr strtoull 3
in base 10.
.Nm
will return an error if the provided GUID is already in use.
.El
.Sh SEE ALSO .Sh SEE ALSO
.Xr zpool-export 8 , .Xr zpool-export 8 ,
.Xr zpool-import 8 .Xr zpool-import 8

View File

@ -16,8 +16,8 @@ src = @abs_srcdir@
obj = @abs_builddir@ obj = @abs_builddir@
else else
zfs_include = $(srctree)/include/zfs zfs_include = $(srctree)/include/zfs
icp_include = $(srctree)/$(src)/icp/include icp_include = $(src)/icp/include
zstd_include = $(srctree)/$(src)/zstd/include zstd_include = $(src)/zstd/include
ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h
endif endif
@ -323,6 +323,7 @@ ZFS_OBJS := \
dbuf.o \ dbuf.o \
dbuf_stats.o \ dbuf_stats.o \
ddt.o \ ddt.o \
ddt_log.o \
ddt_stats.o \ ddt_stats.o \
ddt_zap.o \ ddt_zap.o \
dmu.o \ dmu.o \

View File

@ -252,6 +252,7 @@ SRCS+= abd.c \
dbuf.c \ dbuf.c \
dbuf_stats.c \ dbuf_stats.c \
ddt.c \ ddt.c \
ddt_log.c \
ddt_stats.c \ ddt_stats.c \
ddt_zap.c \ ddt_zap.c \
dmu.c \ dmu.c \
@ -426,6 +427,7 @@ CFLAGS.gcc+= -Wno-pointer-to-int-cast
CFLAGS.abd.c= -Wno-cast-qual CFLAGS.abd.c= -Wno-cast-qual
CFLAGS.ddt.c= -Wno-cast-qual CFLAGS.ddt.c= -Wno-cast-qual
CFLAGS.ddt_log.c= -Wno-cast-qual -Wno-pointer-arith
CFLAGS.ddt_zap.c= -Wno-cast-qual CFLAGS.ddt_zap.c= -Wno-cast-qual
CFLAGS.dmu.c= -Wno-cast-qual CFLAGS.dmu.c= -Wno-cast-qual
CFLAGS.dmu_traverse.c= -Wno-cast-qual CFLAGS.dmu_traverse.c= -Wno-cast-qual

View File

@ -95,14 +95,12 @@ struct {
*/ */
static size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1; static size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1;
#if defined(_KERNEL)
SYSCTL_DECL(_vfs_zfs); SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
&zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN, SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN,
&zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations."); &zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations.");
#endif
kmem_cache_t *abd_chunk_cache; kmem_cache_t *abd_chunk_cache;
static kstat_t *abd_ksp; static kstat_t *abd_ksp;
@ -250,7 +248,7 @@ abd_alloc_zero_scatter(void)
n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS; abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
ABD_SCATTER(abd_zero_scatter).abd_offset = 0; ABD_SCATTER(abd_zero_scatter).abd_offset = 0;

View File

@ -6125,7 +6125,9 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
error == EOPNOTSUPP) error == EOPNOTSUPP)
goto bad_locked_fallback; goto bad_locked_fallback;
*ap->a_lenp = (size_t)len; *ap->a_lenp = (size_t)len;
#ifdef MAC
out_locked: out_locked:
#endif
if (invp != outvp) if (invp != outvp)
VOP_UNLOCK(invp); VOP_UNLOCK(invp);
VOP_UNLOCK(outvp); VOP_UNLOCK(outvp);

View File

@ -868,16 +868,16 @@ spl_init(void)
if ((rc = spl_tsd_init())) if ((rc = spl_tsd_init()))
goto out2; goto out2;
if ((rc = spl_taskq_init())) if ((rc = spl_proc_init()))
goto out3; goto out3;
if ((rc = spl_kmem_cache_init())) if ((rc = spl_kstat_init()))
goto out4; goto out4;
if ((rc = spl_proc_init())) if ((rc = spl_taskq_init()))
goto out5; goto out5;
if ((rc = spl_kstat_init())) if ((rc = spl_kmem_cache_init()))
goto out6; goto out6;
if ((rc = spl_zlib_init())) if ((rc = spl_zlib_init()))
@ -891,13 +891,13 @@ spl_init(void)
out8: out8:
spl_zlib_fini(); spl_zlib_fini();
out7: out7:
spl_kstat_fini();
out6:
spl_proc_fini();
out5:
spl_kmem_cache_fini(); spl_kmem_cache_fini();
out4: out6:
spl_taskq_fini(); spl_taskq_fini();
out5:
spl_kstat_fini();
out4:
spl_proc_fini();
out3: out3:
spl_tsd_fini(); spl_tsd_fini();
out2: out2:
@ -913,10 +913,10 @@ spl_fini(void)
{ {
spl_zone_fini(); spl_zone_fini();
spl_zlib_fini(); spl_zlib_fini();
spl_kstat_fini();
spl_proc_fini();
spl_kmem_cache_fini(); spl_kmem_cache_fini();
spl_taskq_fini(); spl_taskq_fini();
spl_kstat_fini();
spl_proc_fini();
spl_tsd_fini(); spl_tsd_fini();
spl_kvmem_fini(); spl_kvmem_fini();
spl_random_fini(); spl_random_fini();

View File

@ -22,13 +22,15 @@
* *
* Solaris Porting Layer (SPL) Proc Implementation. * Solaris Porting Layer (SPL) Proc Implementation.
*/ */
/*
* Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
*/
#include <sys/systeminfo.h> #include <sys/systeminfo.h>
#include <sys/kstat.h> #include <sys/kstat.h>
#include <sys/kmem.h> #include <sys/kmem.h>
#include <sys/kmem_cache.h> #include <sys/kmem_cache.h>
#include <sys/vmem.h> #include <sys/vmem.h>
#include <sys/taskq.h>
#include <sys/proc.h> #include <sys/proc.h>
#include <linux/ctype.h> #include <linux/ctype.h>
#include <linux/kmod.h> #include <linux/kmod.h>
@ -43,6 +45,12 @@ typedef struct ctl_table __no_const spl_ctl_table;
typedef struct ctl_table spl_ctl_table; typedef struct ctl_table spl_ctl_table;
#endif #endif
#ifdef HAVE_PROC_HANDLER_CTL_TABLE_CONST
#define CONST_CTL_TABLE const struct ctl_table
#else
#define CONST_CTL_TABLE struct ctl_table
#endif
static unsigned long table_min = 0; static unsigned long table_min = 0;
static unsigned long table_max = ~0; static unsigned long table_max = ~0;
@ -54,13 +62,11 @@ static struct ctl_table_header *spl_kstat = NULL;
static struct proc_dir_entry *proc_spl = NULL; static struct proc_dir_entry *proc_spl = NULL;
static struct proc_dir_entry *proc_spl_kmem = NULL; static struct proc_dir_entry *proc_spl_kmem = NULL;
static struct proc_dir_entry *proc_spl_kmem_slab = NULL; static struct proc_dir_entry *proc_spl_kmem_slab = NULL;
static struct proc_dir_entry *proc_spl_taskq_all = NULL;
static struct proc_dir_entry *proc_spl_taskq = NULL;
struct proc_dir_entry *proc_spl_kstat = NULL; struct proc_dir_entry *proc_spl_kstat = NULL;
#ifdef DEBUG_KMEM #ifdef DEBUG_KMEM
static int static int
proc_domemused(struct ctl_table *table, int write, proc_domemused(CONST_CTL_TABLE *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos) void __user *buffer, size_t *lenp, loff_t *ppos)
{ {
int rc = 0; int rc = 0;
@ -88,7 +94,7 @@ proc_domemused(struct ctl_table *table, int write,
#endif /* DEBUG_KMEM */ #endif /* DEBUG_KMEM */
static int static int
proc_doslab(struct ctl_table *table, int write, proc_doslab(CONST_CTL_TABLE *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos) void __user *buffer, size_t *lenp, loff_t *ppos)
{ {
int rc = 0; int rc = 0;
@ -135,7 +141,7 @@ proc_doslab(struct ctl_table *table, int write,
} }
static int static int
proc_dohostid(struct ctl_table *table, int write, proc_dohostid(CONST_CTL_TABLE *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos) void __user *buffer, size_t *lenp, loff_t *ppos)
{ {
char *end, str[32]; char *end, str[32];
@ -168,195 +174,6 @@ proc_dohostid(struct ctl_table *table, int write,
return (0); return (0);
} }
static void
taskq_seq_show_headers(struct seq_file *f)
{
seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n",
"taskq", "act", "nthr", "spwn", "maxt", "pri",
"mina", "maxa", "cura", "flags");
}
/* indices into the lheads array below */
#define LHEAD_PEND 0
#define LHEAD_PRIO 1
#define LHEAD_DELAY 2
#define LHEAD_WAIT 3
#define LHEAD_ACTIVE 4
#define LHEAD_SIZE 5
static unsigned int spl_max_show_tasks = 512;
/* CSTYLED */
module_param(spl_max_show_tasks, uint, 0644);
MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc");
static int
taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag)
{
taskq_t *tq = p;
taskq_thread_t *tqt = NULL;
spl_wait_queue_entry_t *wq;
struct task_struct *tsk;
taskq_ent_t *tqe;
char name[100];
struct list_head *lheads[LHEAD_SIZE], *lh;
static char *list_names[LHEAD_SIZE] =
{"pend", "prio", "delay", "wait", "active" };
int i, j, have_lheads = 0;
unsigned long wflags, flags;
spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags);
/* get the various lists and check whether they're empty */
lheads[LHEAD_PEND] = &tq->tq_pend_list;
lheads[LHEAD_PRIO] = &tq->tq_prio_list;
lheads[LHEAD_DELAY] = &tq->tq_delay_list;
#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head;
#else
lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list;
#endif
lheads[LHEAD_ACTIVE] = &tq->tq_active_list;
for (i = 0; i < LHEAD_SIZE; ++i) {
if (list_empty(lheads[i]))
lheads[i] = NULL;
else
++have_lheads;
}
/* early return in non-"all" mode if lists are all empty */
if (!allflag && !have_lheads) {
spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
spin_unlock_irqrestore(&tq->tq_lock, flags);
return (0);
}
/* unlock the waitq quickly */
if (!lheads[LHEAD_WAIT])
spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
/* show the base taskq contents */
snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance);
seq_printf(f, "%-25s ", name);
seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n",
tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn,
tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc,
tq->tq_nalloc, tq->tq_flags);
/* show the active list */
if (lheads[LHEAD_ACTIVE]) {
j = 0;
list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) {
if (j == 0)
seq_printf(f, "\t%s:",
list_names[LHEAD_ACTIVE]);
else if (j == 2) {
seq_printf(f, "\n\t ");
j = 0;
}
seq_printf(f, " [%d]%pf(%ps)",
tqt->tqt_thread->pid,
tqt->tqt_task->tqent_func,
tqt->tqt_task->tqent_arg);
++j;
}
seq_printf(f, "\n");
}
for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i)
if (lheads[i]) {
j = 0;
list_for_each(lh, lheads[i]) {
if (spl_max_show_tasks != 0 &&
j >= spl_max_show_tasks) {
seq_printf(f, "\n\t(truncated)");
break;
}
/* show the wait waitq list */
if (i == LHEAD_WAIT) {
#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
wq = list_entry(lh,
spl_wait_queue_entry_t, entry);
#else
wq = list_entry(lh,
spl_wait_queue_entry_t, task_list);
#endif
if (j == 0)
seq_printf(f, "\t%s:",
list_names[i]);
else if (j % 8 == 0)
seq_printf(f, "\n\t ");
tsk = wq->private;
seq_printf(f, " %d", tsk->pid);
/* pend, prio and delay lists */
} else {
tqe = list_entry(lh, taskq_ent_t,
tqent_list);
if (j == 0)
seq_printf(f, "\t%s:",
list_names[i]);
else if (j % 2 == 0)
seq_printf(f, "\n\t ");
seq_printf(f, " %pf(%ps)",
tqe->tqent_func,
tqe->tqent_arg);
}
++j;
}
seq_printf(f, "\n");
}
if (lheads[LHEAD_WAIT])
spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
spin_unlock_irqrestore(&tq->tq_lock, flags);
return (0);
}
static int
taskq_all_seq_show(struct seq_file *f, void *p)
{
return (taskq_seq_show_impl(f, p, B_TRUE));
}
static int
taskq_seq_show(struct seq_file *f, void *p)
{
return (taskq_seq_show_impl(f, p, B_FALSE));
}
static void *
taskq_seq_start(struct seq_file *f, loff_t *pos)
{
struct list_head *p;
loff_t n = *pos;
down_read(&tq_list_sem);
if (!n)
taskq_seq_show_headers(f);
p = tq_list.next;
while (n--) {
p = p->next;
if (p == &tq_list)
return (NULL);
}
return (list_entry(p, taskq_t, tq_taskqs));
}
static void *
taskq_seq_next(struct seq_file *f, void *p, loff_t *pos)
{
taskq_t *tq = p;
++*pos;
return ((tq->tq_taskqs.next == &tq_list) ?
NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs));
}
static void static void
slab_seq_show_headers(struct seq_file *f) slab_seq_show_headers(struct seq_file *f)
{ {
@ -492,66 +309,6 @@ static const kstat_proc_op_t proc_slab_operations = {
#endif #endif
}; };
static void
taskq_seq_stop(struct seq_file *f, void *v)
{
up_read(&tq_list_sem);
}
static const struct seq_operations taskq_all_seq_ops = {
.show = taskq_all_seq_show,
.start = taskq_seq_start,
.next = taskq_seq_next,
.stop = taskq_seq_stop,
};
static const struct seq_operations taskq_seq_ops = {
.show = taskq_seq_show,
.start = taskq_seq_start,
.next = taskq_seq_next,
.stop = taskq_seq_stop,
};
static int
proc_taskq_all_open(struct inode *inode, struct file *filp)
{
return (seq_open(filp, &taskq_all_seq_ops));
}
static int
proc_taskq_open(struct inode *inode, struct file *filp)
{
return (seq_open(filp, &taskq_seq_ops));
}
static const kstat_proc_op_t proc_taskq_all_operations = {
#ifdef HAVE_PROC_OPS_STRUCT
.proc_open = proc_taskq_all_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
.proc_release = seq_release,
#else
.open = proc_taskq_all_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
#endif
};
static const kstat_proc_op_t proc_taskq_operations = {
#ifdef HAVE_PROC_OPS_STRUCT
.proc_open = proc_taskq_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
.proc_release = seq_release,
#else
.open = proc_taskq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
#endif
};
static struct ctl_table spl_kmem_table[] = { static struct ctl_table spl_kmem_table[] = {
#ifdef DEBUG_KMEM #ifdef DEBUG_KMEM
{ {
@ -668,8 +425,6 @@ static void spl_proc_cleanup(void)
remove_proc_entry("kstat", proc_spl); remove_proc_entry("kstat", proc_spl);
remove_proc_entry("slab", proc_spl_kmem); remove_proc_entry("slab", proc_spl_kmem);
remove_proc_entry("kmem", proc_spl); remove_proc_entry("kmem", proc_spl);
remove_proc_entry("taskq-all", proc_spl);
remove_proc_entry("taskq", proc_spl);
remove_proc_entry("spl", NULL); remove_proc_entry("spl", NULL);
#ifndef HAVE_REGISTER_SYSCTL_TABLE #ifndef HAVE_REGISTER_SYSCTL_TABLE
@ -688,6 +443,37 @@ static void spl_proc_cleanup(void)
} }
} }
#ifndef HAVE_REGISTER_SYSCTL_TABLE
/*
* Traditionally, struct ctl_table arrays have been terminated by an "empty"
* sentinel element (specifically, one with .procname == NULL).
*
* Linux 6.6 began migrating away from this, adding register_sysctl_sz() so
* that callers could provide the size directly, and redefining
* register_sysctl() to just call register_sysctl_sz() with the array size. It
* retained support for the terminating element so that existing callers would
* continue to work.
*
* Linux 6.11 removed support for the terminating element, instead interpreting
* it as a real malformed element, and rejecting it.
*
 * In order to continue supporting older kernels, we retain the terminating
* sentinel element for our sysctl tables, but instead detect availability of
* register_sysctl_sz(). If it exists, we pass it the array size -1, stopping
* the kernel from trying to process the terminator. For pre-6.6 kernels that
* don't have register_sysctl_sz(), we just use register_sysctl(), which can
* handle the terminating element as it always has.
*/
#ifdef HAVE_REGISTER_SYSCTL_SZ
#define spl_proc_register_sysctl(p, t) \
register_sysctl_sz(p, t, ARRAY_SIZE(t)-1)
#else
#define spl_proc_register_sysctl(p, t) \
register_sysctl(p, t)
#endif
#endif
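/*
 * Illustrative, standalone userspace sketch (not part of this file): the
 * effect of passing ARRAY_SIZE(t)-1 for a sentinel-terminated table, as the
 * spl_proc_register_sysctl() wrapper above does. The table type and entry
 * names here are stand-ins; only the counting matters.
 */
#include <stdio.h>

struct sketch_ctl {
	const char *procname;	/* NULL marks the terminating sentinel */
};

#define	SKETCH_ARRAY_SIZE(a)	(sizeof (a) / sizeof ((a)[0]))

int
main(void)
{
	static struct sketch_ctl table[] = {
		{ "example_one" },
		{ "example_two" },
		{ NULL },	/* sentinel kept for pre-6.6 kernels */
	};

	/* Post-6.6 path: pass an explicit size that excludes the sentinel. */
	printf("register_sysctl_sz would see %zu entries\n",
	    SKETCH_ARRAY_SIZE(table) - 1);

	/* Pre-6.6 path: the whole table is passed; the kernel stops at NULL. */
	printf("register_sysctl would walk until procname == NULL\n");
	return (0);
}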
int int
spl_proc_init(void) spl_proc_init(void)
{ {
@ -698,16 +484,17 @@ spl_proc_init(void)
if (spl_header == NULL) if (spl_header == NULL)
return (-EUNATCH); return (-EUNATCH);
#else #else
spl_header = register_sysctl("kernel/spl", spl_table); spl_header = spl_proc_register_sysctl("kernel/spl", spl_table);
if (spl_header == NULL) if (spl_header == NULL)
return (-EUNATCH); return (-EUNATCH);
spl_kmem = register_sysctl("kernel/spl/kmem", spl_kmem_table); spl_kmem = spl_proc_register_sysctl("kernel/spl/kmem", spl_kmem_table);
if (spl_kmem == NULL) { if (spl_kmem == NULL) {
rc = -EUNATCH; rc = -EUNATCH;
goto out; goto out;
} }
spl_kstat = register_sysctl("kernel/spl/kstat", spl_kstat_table); spl_kstat = spl_proc_register_sysctl("kernel/spl/kstat",
spl_kstat_table);
if (spl_kstat == NULL) { if (spl_kstat == NULL) {
rc = -EUNATCH; rc = -EUNATCH;
goto out; goto out;
@ -720,20 +507,6 @@ spl_proc_init(void)
goto out; goto out;
} }
proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl,
&proc_taskq_all_operations, NULL);
if (proc_spl_taskq_all == NULL) {
rc = -EUNATCH;
goto out;
}
proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl,
&proc_taskq_operations, NULL);
if (proc_spl_taskq == NULL) {
rc = -EUNATCH;
goto out;
}
proc_spl_kmem = proc_mkdir("kmem", proc_spl); proc_spl_kmem = proc_mkdir("kmem", proc_spl);
if (proc_spl_kmem == NULL) { if (proc_spl_kmem == NULL) {
rc = -EUNATCH; rc = -EUNATCH;

View File

@ -22,16 +22,98 @@
* *
* Solaris Porting Layer (SPL) Task Queue Implementation. * Solaris Porting Layer (SPL) Task Queue Implementation.
*/ */
/*
* Copyright (c) 2024, Klara Inc.
* Copyright (c) 2024, Syneto
*/
#include <sys/timer.h> #include <sys/timer.h>
#include <sys/taskq.h> #include <sys/taskq.h>
#include <sys/kmem.h> #include <sys/kmem.h>
#include <sys/tsd.h> #include <sys/tsd.h>
#include <sys/trace_spl.h> #include <sys/trace_spl.h>
#include <sys/time.h>
#include <sys/atomic.h>
#include <sys/kstat.h>
#ifdef HAVE_CPU_HOTPLUG #ifdef HAVE_CPU_HOTPLUG
#include <linux/cpuhotplug.h> #include <linux/cpuhotplug.h>
#endif #endif
typedef struct taskq_kstats {
/* static values, for completeness */
kstat_named_t tqks_threads_max;
kstat_named_t tqks_entry_pool_min;
kstat_named_t tqks_entry_pool_max;
/* gauges (inc/dec counters, current value) */
kstat_named_t tqks_threads_active;
kstat_named_t tqks_threads_idle;
kstat_named_t tqks_threads_total;
kstat_named_t tqks_tasks_pending;
kstat_named_t tqks_tasks_priority;
kstat_named_t tqks_tasks_total;
kstat_named_t tqks_tasks_delayed;
kstat_named_t tqks_entries_free;
/* counters (inc only, since taskq creation) */
kstat_named_t tqks_threads_created;
kstat_named_t tqks_threads_destroyed;
kstat_named_t tqks_tasks_dispatched;
kstat_named_t tqks_tasks_dispatched_delayed;
kstat_named_t tqks_tasks_executed_normal;
kstat_named_t tqks_tasks_executed_priority;
kstat_named_t tqks_tasks_executed;
kstat_named_t tqks_tasks_delayed_requeued;
kstat_named_t tqks_tasks_cancelled;
kstat_named_t tqks_thread_wakeups;
kstat_named_t tqks_thread_wakeups_nowork;
kstat_named_t tqks_thread_sleeps;
} taskq_kstats_t;
static taskq_kstats_t taskq_kstats_template = {
{ "threads_max", KSTAT_DATA_UINT64 },
{ "entry_pool_min", KSTAT_DATA_UINT64 },
{ "entry_pool_max", KSTAT_DATA_UINT64 },
{ "threads_active", KSTAT_DATA_UINT64 },
{ "threads_idle", KSTAT_DATA_UINT64 },
{ "threads_total", KSTAT_DATA_UINT64 },
{ "tasks_pending", KSTAT_DATA_UINT64 },
{ "tasks_priority", KSTAT_DATA_UINT64 },
{ "tasks_total", KSTAT_DATA_UINT64 },
{ "tasks_delayed", KSTAT_DATA_UINT64 },
{ "entries_free", KSTAT_DATA_UINT64 },
{ "threads_created", KSTAT_DATA_UINT64 },
{ "threads_destroyed", KSTAT_DATA_UINT64 },
{ "tasks_dispatched", KSTAT_DATA_UINT64 },
{ "tasks_dispatched_delayed", KSTAT_DATA_UINT64 },
{ "tasks_executed_normal", KSTAT_DATA_UINT64 },
{ "tasks_executed_priority", KSTAT_DATA_UINT64 },
{ "tasks_executed", KSTAT_DATA_UINT64 },
{ "tasks_delayed_requeued", KSTAT_DATA_UINT64 },
{ "tasks_cancelled", KSTAT_DATA_UINT64 },
{ "thread_wakeups", KSTAT_DATA_UINT64 },
{ "thread_wakeups_nowork", KSTAT_DATA_UINT64 },
{ "thread_sleeps", KSTAT_DATA_UINT64 },
};
#define TQSTAT_INC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, 1)
#define TQSTAT_DEC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, -1)
#define _TQSTAT_MOD_LIST(mod, tq, t) do { \
switch (t->tqent_flags & TQENT_LIST_MASK) { \
case TQENT_LIST_NONE: ASSERT(list_empty(&t->tqent_list)); break;\
case TQENT_LIST_PENDING: mod(tq, tasks_pending); break; \
case TQENT_LIST_PRIORITY: mod(tq, tasks_priority); break; \
case TQENT_LIST_DELAY: mod(tq, tasks_delayed); break; \
} \
} while (0)
#define TQSTAT_INC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_INC, tq, t)
#define TQSTAT_DEC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_DEC, tq, t)
#define TQENT_SET_LIST(t, l) \
t->tqent_flags = (t->tqent_flags & ~TQENT_LIST_MASK) | l;
static int spl_taskq_thread_bind = 0; static int spl_taskq_thread_bind = 0;
module_param(spl_taskq_thread_bind, int, 0644); module_param(spl_taskq_thread_bind, int, 0644);
MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
@ -134,6 +216,7 @@ retry:
ASSERT(!timer_pending(&t->tqent_timer)); ASSERT(!timer_pending(&t->tqent_timer));
list_del_init(&t->tqent_list); list_del_init(&t->tqent_list);
TQSTAT_DEC(tq, entries_free);
return (t); return (t);
} }
@ -204,12 +287,11 @@ task_done(taskq_t *tq, taskq_ent_t *t)
{ {
ASSERT(tq); ASSERT(tq);
ASSERT(t); ASSERT(t);
ASSERT(list_empty(&t->tqent_list));
/* Wake tasks blocked in taskq_wait_id() */ /* Wake tasks blocked in taskq_wait_id() */
wake_up_all(&t->tqent_waitq); wake_up_all(&t->tqent_waitq);
list_del_init(&t->tqent_list);
if (tq->tq_nalloc <= tq->tq_minalloc) { if (tq->tq_nalloc <= tq->tq_minalloc) {
t->tqent_id = TASKQID_INVALID; t->tqent_id = TASKQID_INVALID;
t->tqent_func = NULL; t->tqent_func = NULL;
@ -217,6 +299,7 @@ task_done(taskq_t *tq, taskq_ent_t *t)
t->tqent_flags = 0; t->tqent_flags = 0;
list_add_tail(&t->tqent_list, &tq->tq_free_list); list_add_tail(&t->tqent_list, &tq->tq_free_list);
TQSTAT_INC(tq, entries_free);
} else { } else {
task_free(tq, t); task_free(tq, t);
} }
@ -263,6 +346,8 @@ task_expire_impl(taskq_ent_t *t)
spin_unlock_irqrestore(&tq->tq_lock, flags); spin_unlock_irqrestore(&tq->tq_lock, flags);
wake_up(&tq->tq_work_waitq); wake_up(&tq->tq_work_waitq);
TQSTAT_INC(tq, tasks_delayed_requeued);
} }
static void static void
@ -534,7 +619,11 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id)
t = taskq_find(tq, id); t = taskq_find(tq, id);
if (t && t != ERR_PTR(-EBUSY)) { if (t && t != ERR_PTR(-EBUSY)) {
list_del_init(&t->tqent_list); list_del_init(&t->tqent_list);
TQSTAT_DEC_LIST(tq, t);
TQSTAT_DEC(tq, tasks_total);
t->tqent_flags |= TQENT_FLAG_CANCEL; t->tqent_flags |= TQENT_FLAG_CANCEL;
TQSTAT_INC(tq, tasks_cancelled);
/* /*
* When canceling the lowest outstanding task id we * When canceling the lowest outstanding task id we
@ -604,13 +693,19 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
spin_lock(&t->tqent_lock); spin_lock(&t->tqent_lock);
/* Queue to the front of the list to enforce TQ_NOQUEUE semantics */ /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
if (flags & TQ_NOQUEUE) if (flags & TQ_NOQUEUE) {
TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
list_add(&t->tqent_list, &tq->tq_prio_list); list_add(&t->tqent_list, &tq->tq_prio_list);
/* Queue to the priority list instead of the pending list */ /* Queue to the priority list instead of the pending list */
else if (flags & TQ_FRONT) } else if (flags & TQ_FRONT) {
TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
list_add_tail(&t->tqent_list, &tq->tq_prio_list); list_add_tail(&t->tqent_list, &tq->tq_prio_list);
else } else {
TQENT_SET_LIST(t, TQENT_LIST_PENDING);
list_add_tail(&t->tqent_list, &tq->tq_pend_list); list_add_tail(&t->tqent_list, &tq->tq_pend_list);
}
TQSTAT_INC_LIST(tq, t);
TQSTAT_INC(tq, tasks_total);
t->tqent_id = rc = tq->tq_next_id; t->tqent_id = rc = tq->tq_next_id;
tq->tq_next_id++; tq->tq_next_id++;
@ -629,6 +724,8 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
wake_up(&tq->tq_work_waitq); wake_up(&tq->tq_work_waitq);
TQSTAT_INC(tq, tasks_dispatched);
/* Spawn additional taskq threads if required. */ /* Spawn additional taskq threads if required. */
if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq); (void) taskq_thread_spawn(tq);
@ -662,6 +759,9 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
/* Queue to the delay list for subsequent execution */ /* Queue to the delay list for subsequent execution */
list_add_tail(&t->tqent_list, &tq->tq_delay_list); list_add_tail(&t->tqent_list, &tq->tq_delay_list);
TQENT_SET_LIST(t, TQENT_LIST_DELAY);
TQSTAT_INC_LIST(tq, t);
TQSTAT_INC(tq, tasks_total);
t->tqent_id = rc = tq->tq_next_id; t->tqent_id = rc = tq->tq_next_id;
tq->tq_next_id++; tq->tq_next_id++;
@ -676,6 +776,8 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
spin_unlock(&t->tqent_lock); spin_unlock(&t->tqent_lock);
TQSTAT_INC(tq, tasks_dispatched_delayed);
/* Spawn additional taskq threads if required. */ /* Spawn additional taskq threads if required. */
if (tq->tq_nactive == tq->tq_nthreads) if (tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq); (void) taskq_thread_spawn(tq);
@ -724,10 +826,15 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
t->tqent_flags |= TQENT_FLAG_PREALLOC; t->tqent_flags |= TQENT_FLAG_PREALLOC;
/* Queue to the priority list instead of the pending list */ /* Queue to the priority list instead of the pending list */
if (flags & TQ_FRONT) if (flags & TQ_FRONT) {
TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
list_add_tail(&t->tqent_list, &tq->tq_prio_list); list_add_tail(&t->tqent_list, &tq->tq_prio_list);
else } else {
TQENT_SET_LIST(t, TQENT_LIST_PENDING);
list_add_tail(&t->tqent_list, &tq->tq_pend_list); list_add_tail(&t->tqent_list, &tq->tq_pend_list);
}
TQSTAT_INC_LIST(tq, t);
TQSTAT_INC(tq, tasks_total);
t->tqent_id = tq->tq_next_id; t->tqent_id = tq->tq_next_id;
tq->tq_next_id++; tq->tq_next_id++;
@ -742,6 +849,8 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
wake_up(&tq->tq_work_waitq); wake_up(&tq->tq_work_waitq);
TQSTAT_INC(tq, tasks_dispatched);
/* Spawn additional taskq threads if required. */ /* Spawn additional taskq threads if required. */
if (tq->tq_nactive == tq->tq_nthreads) if (tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq); (void) taskq_thread_spawn(tq);
@ -908,6 +1017,8 @@ taskq_thread(void *args)
wake_up(&tq->tq_wait_waitq); wake_up(&tq->tq_wait_waitq);
set_current_state(TASK_INTERRUPTIBLE); set_current_state(TASK_INTERRUPTIBLE);
TQSTAT_INC(tq, threads_total);
while (!kthread_should_stop()) { while (!kthread_should_stop()) {
if (list_empty(&tq->tq_pend_list) && if (list_empty(&tq->tq_pend_list) &&
@ -919,9 +1030,15 @@ taskq_thread(void *args)
add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
spin_unlock_irqrestore(&tq->tq_lock, flags); spin_unlock_irqrestore(&tq->tq_lock, flags);
TQSTAT_INC(tq, thread_sleeps);
TQSTAT_INC(tq, threads_idle);
schedule(); schedule();
seq_tasks = 0; seq_tasks = 0;
TQSTAT_DEC(tq, threads_idle);
TQSTAT_INC(tq, thread_wakeups);
spin_lock_irqsave_nested(&tq->tq_lock, flags, spin_lock_irqsave_nested(&tq->tq_lock, flags,
tq->tq_lock_class); tq->tq_lock_class);
remove_wait_queue(&tq->tq_work_waitq, &wait); remove_wait_queue(&tq->tq_work_waitq, &wait);
@ -931,6 +1048,8 @@ taskq_thread(void *args)
if ((t = taskq_next_ent(tq)) != NULL) { if ((t = taskq_next_ent(tq)) != NULL) {
list_del_init(&t->tqent_list); list_del_init(&t->tqent_list);
TQSTAT_DEC_LIST(tq, t);
TQSTAT_DEC(tq, tasks_total);
/* /*
* A TQENT_FLAG_PREALLOC task may be reused or freed * A TQENT_FLAG_PREALLOC task may be reused or freed
@ -955,6 +1074,7 @@ taskq_thread(void *args)
tq->tq_nactive++; tq->tq_nactive++;
spin_unlock_irqrestore(&tq->tq_lock, flags); spin_unlock_irqrestore(&tq->tq_lock, flags);
TQSTAT_INC(tq, threads_active);
DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t); DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t);
/* Perform the requested task */ /* Perform the requested task */
@ -962,8 +1082,17 @@ taskq_thread(void *args)
DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t); DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t);
TQSTAT_DEC(tq, threads_active);
if ((t->tqent_flags & TQENT_LIST_MASK) ==
TQENT_LIST_PENDING)
TQSTAT_INC(tq, tasks_executed_normal);
else
TQSTAT_INC(tq, tasks_executed_priority);
TQSTAT_INC(tq, tasks_executed);
spin_lock_irqsave_nested(&tq->tq_lock, flags, spin_lock_irqsave_nested(&tq->tq_lock, flags,
tq->tq_lock_class); tq->tq_lock_class);
tq->tq_nactive--; tq->tq_nactive--;
list_del_init(&tqt->tqt_active_list); list_del_init(&tqt->tqt_active_list);
tqt->tqt_task = NULL; tqt->tqt_task = NULL;
@ -989,7 +1118,8 @@ taskq_thread(void *args)
tqt->tqt_id = TASKQID_INVALID; tqt->tqt_id = TASKQID_INVALID;
tqt->tqt_flags = 0; tqt->tqt_flags = 0;
wake_up_all(&tq->tq_wait_waitq); wake_up_all(&tq->tq_wait_waitq);
} } else
TQSTAT_INC(tq, thread_wakeups_nowork);
set_current_state(TASK_INTERRUPTIBLE); set_current_state(TASK_INTERRUPTIBLE);
@ -998,6 +1128,10 @@ taskq_thread(void *args)
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
tq->tq_nthreads--; tq->tq_nthreads--;
list_del_init(&tqt->tqt_thread_list); list_del_init(&tqt->tqt_thread_list);
TQSTAT_DEC(tq, threads_total);
TQSTAT_INC(tq, threads_destroyed);
error: error:
kmem_free(tqt, sizeof (taskq_thread_t)); kmem_free(tqt, sizeof (taskq_thread_t));
spin_unlock_irqrestore(&tq->tq_lock, flags); spin_unlock_irqrestore(&tq->tq_lock, flags);
@ -1037,9 +1171,156 @@ taskq_thread_create(taskq_t *tq)
wake_up_process(tqt->tqt_thread); wake_up_process(tqt->tqt_thread);
TQSTAT_INC(tq, threads_created);
return (tqt); return (tqt);
} }
static void
taskq_stats_init(taskq_t *tq)
{
taskq_sums_t *tqs = &tq->tq_sums;
wmsum_init(&tqs->tqs_threads_active, 0);
wmsum_init(&tqs->tqs_threads_idle, 0);
wmsum_init(&tqs->tqs_threads_total, 0);
wmsum_init(&tqs->tqs_tasks_pending, 0);
wmsum_init(&tqs->tqs_tasks_priority, 0);
wmsum_init(&tqs->tqs_tasks_total, 0);
wmsum_init(&tqs->tqs_tasks_delayed, 0);
wmsum_init(&tqs->tqs_entries_free, 0);
wmsum_init(&tqs->tqs_threads_created, 0);
wmsum_init(&tqs->tqs_threads_destroyed, 0);
wmsum_init(&tqs->tqs_tasks_dispatched, 0);
wmsum_init(&tqs->tqs_tasks_dispatched_delayed, 0);
wmsum_init(&tqs->tqs_tasks_executed_normal, 0);
wmsum_init(&tqs->tqs_tasks_executed_priority, 0);
wmsum_init(&tqs->tqs_tasks_executed, 0);
wmsum_init(&tqs->tqs_tasks_delayed_requeued, 0);
wmsum_init(&tqs->tqs_tasks_cancelled, 0);
wmsum_init(&tqs->tqs_thread_wakeups, 0);
wmsum_init(&tqs->tqs_thread_wakeups_nowork, 0);
wmsum_init(&tqs->tqs_thread_sleeps, 0);
}
static void
taskq_stats_fini(taskq_t *tq)
{
taskq_sums_t *tqs = &tq->tq_sums;
wmsum_fini(&tqs->tqs_threads_active);
wmsum_fini(&tqs->tqs_threads_idle);
wmsum_fini(&tqs->tqs_threads_total);
wmsum_fini(&tqs->tqs_tasks_pending);
wmsum_fini(&tqs->tqs_tasks_priority);
wmsum_fini(&tqs->tqs_tasks_total);
wmsum_fini(&tqs->tqs_tasks_delayed);
wmsum_fini(&tqs->tqs_entries_free);
wmsum_fini(&tqs->tqs_threads_created);
wmsum_fini(&tqs->tqs_threads_destroyed);
wmsum_fini(&tqs->tqs_tasks_dispatched);
wmsum_fini(&tqs->tqs_tasks_dispatched_delayed);
wmsum_fini(&tqs->tqs_tasks_executed_normal);
wmsum_fini(&tqs->tqs_tasks_executed_priority);
wmsum_fini(&tqs->tqs_tasks_executed);
wmsum_fini(&tqs->tqs_tasks_delayed_requeued);
wmsum_fini(&tqs->tqs_tasks_cancelled);
wmsum_fini(&tqs->tqs_thread_wakeups);
wmsum_fini(&tqs->tqs_thread_wakeups_nowork);
wmsum_fini(&tqs->tqs_thread_sleeps);
}
static int
taskq_kstats_update(kstat_t *ksp, int rw)
{
if (rw == KSTAT_WRITE)
return (EACCES);
taskq_t *tq = ksp->ks_private;
taskq_kstats_t *tqks = ksp->ks_data;
tqks->tqks_threads_max.value.ui64 = tq->tq_maxthreads;
tqks->tqks_entry_pool_min.value.ui64 = tq->tq_minalloc;
tqks->tqks_entry_pool_max.value.ui64 = tq->tq_maxalloc;
taskq_sums_t *tqs = &tq->tq_sums;
tqks->tqks_threads_active.value.ui64 =
wmsum_value(&tqs->tqs_threads_active);
tqks->tqks_threads_idle.value.ui64 =
wmsum_value(&tqs->tqs_threads_idle);
tqks->tqks_threads_total.value.ui64 =
wmsum_value(&tqs->tqs_threads_total);
tqks->tqks_tasks_pending.value.ui64 =
wmsum_value(&tqs->tqs_tasks_pending);
tqks->tqks_tasks_priority.value.ui64 =
wmsum_value(&tqs->tqs_tasks_priority);
tqks->tqks_tasks_total.value.ui64 =
wmsum_value(&tqs->tqs_tasks_total);
tqks->tqks_tasks_delayed.value.ui64 =
wmsum_value(&tqs->tqs_tasks_delayed);
tqks->tqks_entries_free.value.ui64 =
wmsum_value(&tqs->tqs_entries_free);
tqks->tqks_threads_created.value.ui64 =
wmsum_value(&tqs->tqs_threads_created);
tqks->tqks_threads_destroyed.value.ui64 =
wmsum_value(&tqs->tqs_threads_destroyed);
tqks->tqks_tasks_dispatched.value.ui64 =
wmsum_value(&tqs->tqs_tasks_dispatched);
tqks->tqks_tasks_dispatched_delayed.value.ui64 =
wmsum_value(&tqs->tqs_tasks_dispatched_delayed);
tqks->tqks_tasks_executed_normal.value.ui64 =
wmsum_value(&tqs->tqs_tasks_executed_normal);
tqks->tqks_tasks_executed_priority.value.ui64 =
wmsum_value(&tqs->tqs_tasks_executed_priority);
tqks->tqks_tasks_executed.value.ui64 =
wmsum_value(&tqs->tqs_tasks_executed);
tqks->tqks_tasks_delayed_requeued.value.ui64 =
wmsum_value(&tqs->tqs_tasks_delayed_requeued);
tqks->tqks_tasks_cancelled.value.ui64 =
wmsum_value(&tqs->tqs_tasks_cancelled);
tqks->tqks_thread_wakeups.value.ui64 =
wmsum_value(&tqs->tqs_thread_wakeups);
tqks->tqks_thread_wakeups_nowork.value.ui64 =
wmsum_value(&tqs->tqs_thread_wakeups_nowork);
tqks->tqks_thread_sleeps.value.ui64 =
wmsum_value(&tqs->tqs_thread_sleeps);
return (0);
}
static void
taskq_kstats_init(taskq_t *tq)
{
char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
snprintf(name, sizeof (name), "%s.%d", tq->tq_name, tq->tq_instance);
kstat_t *ksp = kstat_create("taskq", 0, name, "misc",
KSTAT_TYPE_NAMED, sizeof (taskq_kstats_t) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);
if (ksp == NULL)
return;
ksp->ks_private = tq;
ksp->ks_update = taskq_kstats_update;
ksp->ks_data = kmem_alloc(sizeof (taskq_kstats_t), KM_SLEEP);
memcpy(ksp->ks_data, &taskq_kstats_template, sizeof (taskq_kstats_t));
kstat_install(ksp);
tq->tq_ksp = ksp;
}
static void
taskq_kstats_fini(taskq_t *tq)
{
if (tq->tq_ksp == NULL)
return;
kmem_free(tq->tq_ksp->ks_data, sizeof (taskq_kstats_t));
kstat_delete(tq->tq_ksp);
tq->tq_ksp = NULL;
}
taskq_t * taskq_t *
taskq_create(const char *name, int threads_arg, pri_t pri, taskq_create(const char *name, int threads_arg, pri_t pri,
int minalloc, int maxalloc, uint_t flags) int minalloc, int maxalloc, uint_t flags)
@ -1104,6 +1385,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
init_waitqueue_head(&tq->tq_wait_waitq); init_waitqueue_head(&tq->tq_wait_waitq);
tq->tq_lock_class = TQ_LOCK_GENERAL; tq->tq_lock_class = TQ_LOCK_GENERAL;
INIT_LIST_HEAD(&tq->tq_taskqs); INIT_LIST_HEAD(&tq->tq_taskqs);
taskq_stats_init(tq);
if (flags & TASKQ_PREPOPULATE) { if (flags & TASKQ_PREPOPULATE) {
spin_lock_irqsave_nested(&tq->tq_lock, irqflags, spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
@ -1137,14 +1419,17 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
if (rc) { if (rc) {
taskq_destroy(tq); taskq_destroy(tq);
tq = NULL; return (NULL);
} else {
down_write(&tq_list_sem);
tq->tq_instance = taskq_find_by_name(name) + 1;
list_add_tail(&tq->tq_taskqs, &tq_list);
up_write(&tq_list_sem);
} }
down_write(&tq_list_sem);
tq->tq_instance = taskq_find_by_name(name) + 1;
list_add_tail(&tq->tq_taskqs, &tq_list);
up_write(&tq_list_sem);
/* Install kstats late, because the name includes tq_instance */
taskq_kstats_init(tq);
return (tq); return (tq);
} }
EXPORT_SYMBOL(taskq_create); EXPORT_SYMBOL(taskq_create);
@ -1177,6 +1462,8 @@ taskq_destroy(taskq_t *tq)
taskq_wait(tq); taskq_wait(tq);
taskq_kstats_fini(tq);
/* remove taskq from global list used by the kstats */ /* remove taskq from global list used by the kstats */
down_write(&tq_list_sem); down_write(&tq_list_sem);
list_del(&tq->tq_taskqs); list_del(&tq->tq_taskqs);
@ -1230,6 +1517,7 @@ taskq_destroy(taskq_t *tq)
spin_unlock_irqrestore(&tq->tq_lock, flags); spin_unlock_irqrestore(&tq->tq_lock, flags);
taskq_stats_fini(tq);
kmem_strfree(tq->tq_name); kmem_strfree(tq->tq_name);
kmem_free(tq, sizeof (taskq_t)); kmem_free(tq, sizeof (taskq_t));
} }
@ -1271,6 +1559,100 @@ taskq_create_synced(const char *name, int nthreads, pri_t pri,
} }
EXPORT_SYMBOL(taskq_create_synced); EXPORT_SYMBOL(taskq_create_synced);
static kstat_t *taskq_summary_ksp = NULL;
static int
spl_taskq_kstat_headers(char *buf, size_t size)
{
size_t n = snprintf(buf, size,
"%-20s | %-17s | %-23s\n"
"%-20s | %-17s | %-23s\n"
"%-20s | %-17s | %-23s\n",
"", "threads", "tasks on queue",
"taskq name", "tot [act idl] max", " pend [ norm high] dly",
"--------------------", "-----------------",
"-----------------------");
return (n >= size ? ENOMEM : 0);
}
static int
spl_taskq_kstat_data(char *buf, size_t size, void *data)
{
struct list_head *tql = NULL;
taskq_t *tq;
char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
char threads[25];
char tasks[30];
size_t n;
int err = 0;
down_read(&tq_list_sem);
list_for_each_prev(tql, &tq_list) {
tq = list_entry(tql, taskq_t, tq_taskqs);
mutex_enter(tq->tq_ksp->ks_lock);
taskq_kstats_update(tq->tq_ksp, KSTAT_READ);
taskq_kstats_t *tqks = tq->tq_ksp->ks_data;
snprintf(name, sizeof (name), "%s.%d", tq->tq_name,
tq->tq_instance);
snprintf(threads, sizeof (threads), "%3llu [%3llu %3llu] %3llu",
tqks->tqks_threads_total.value.ui64,
tqks->tqks_threads_active.value.ui64,
tqks->tqks_threads_idle.value.ui64,
tqks->tqks_threads_max.value.ui64);
snprintf(tasks, sizeof (tasks), "%5llu [%5llu %5llu] %3llu",
tqks->tqks_tasks_total.value.ui64,
tqks->tqks_tasks_pending.value.ui64,
tqks->tqks_tasks_priority.value.ui64,
tqks->tqks_tasks_delayed.value.ui64);
mutex_exit(tq->tq_ksp->ks_lock);
n = snprintf(buf, size, "%-20s | %-17s | %-23s\n",
name, threads, tasks);
if (n >= size) {
err = ENOMEM;
break;
}
buf = &buf[n];
size -= n;
}
up_read(&tq_list_sem);
return (err);
}
static void
spl_taskq_kstat_init(void)
{
kstat_t *ksp = kstat_create("taskq", 0, "summary", "misc",
KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
if (ksp == NULL)
return;
ksp->ks_data = (void *)(uintptr_t)1;
ksp->ks_ndata = 1;
kstat_set_raw_ops(ksp, spl_taskq_kstat_headers,
spl_taskq_kstat_data, NULL);
kstat_install(ksp);
taskq_summary_ksp = ksp;
}
static void
spl_taskq_kstat_fini(void)
{
if (taskq_summary_ksp == NULL)
return;
kstat_delete(taskq_summary_ksp);
taskq_summary_ksp = NULL;
}
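/*
 * Illustrative, standalone userspace sketch (not part of this file): with
 * the kstats registered above, the per-taskq and summary data would
 * typically surface under /proc/spl/kstat/taskq/ on Linux. This just dumps
 * the summary file if present; the path is an assumption based on the
 * usual SPL kstat layout, not something defined in this change.
 */
#include <stdio.h>

int
main(void)
{
	const char *path = "/proc/spl/kstat/taskq/summary";
	char line[256];
	FILE *f = fopen(path, "r");

	if (f == NULL) {
		perror(path);
		return (1);
	}
	while (fgets(line, sizeof (line), f) != NULL)
		fputs(line, stdout);
	fclose(f);
	return (0);
}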
static unsigned int spl_taskq_kick = 0; static unsigned int spl_taskq_kick = 0;
/* /*
@ -1451,12 +1833,16 @@ spl_taskq_init(void)
*/ */
dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC; dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;
spl_taskq_kstat_init();
return (0); return (0);
} }
void void
spl_taskq_fini(void) spl_taskq_fini(void)
{ {
spl_taskq_kstat_fini();
taskq_destroy(dynamic_taskq); taskq_destroy(dynamic_taskq);
dynamic_taskq = NULL; dynamic_taskq = NULL;

View File

@ -186,6 +186,13 @@ issig(void)
schedule(); schedule();
#endif #endif
/*
* Dequeued SIGSTOP/SIGTSTP.
 * Check if the process has other signals pending.
*/
if (signal_pending(current))
return (1);
return (0); return (0);
} }

View File

@ -58,22 +58,16 @@
#include <sys/arc.h> #include <sys/arc.h>
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/zfs_znode.h> #include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h> #include <linux/kmap_compat.h>
#include <linux/mm_compat.h> #include <linux/mm_compat.h>
#include <linux/scatterlist.h> #include <linux/scatterlist.h>
#include <linux/version.h> #include <linux/version.h>
#endif
#ifdef _KERNEL
#if defined(MAX_ORDER) #if defined(MAX_ORDER)
#define ABD_MAX_ORDER (MAX_ORDER) #define ABD_MAX_ORDER (MAX_ORDER)
#elif defined(MAX_PAGE_ORDER) #elif defined(MAX_PAGE_ORDER)
#define ABD_MAX_ORDER (MAX_PAGE_ORDER) #define ABD_MAX_ORDER (MAX_PAGE_ORDER)
#endif #endif
#else
#define ABD_MAX_ORDER (1)
#endif
typedef struct abd_stats { typedef struct abd_stats {
kstat_named_t abdstat_struct_size; kstat_named_t abdstat_struct_size;
@ -193,11 +187,9 @@ abd_t *abd_zero_scatter = NULL;
struct page; struct page;
/* /*
* _KERNEL - Will point to ZERO_PAGE if it is available or it will be * abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will
* an allocated zero'd PAGESIZE buffer. * point to ZERO_PAGE if it is available or it will be an allocated zero'd
* Userspace - Will be an allocated zero'ed PAGESIZE buffer. * PAGESIZE buffer.
*
* abd_zero_page is assigned to each of the pages of abd_zero_scatter.
*/ */
static struct page *abd_zero_page = NULL; static struct page *abd_zero_page = NULL;
@ -232,7 +224,6 @@ abd_free_struct_impl(abd_t *abd)
ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t)); ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
} }
#ifdef _KERNEL
static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1; static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;
/* /*
@ -509,7 +500,7 @@ abd_alloc_zero_scatter(void)
ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl; ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS; abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK;
abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
sg_set_page(sg, abd_zero_page, PAGESIZE, 0); sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
@ -520,134 +511,6 @@ abd_alloc_zero_scatter(void)
ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
} }
#else /* _KERNEL */
#ifndef PAGE_SHIFT
#define PAGE_SHIFT (highbit64(PAGESIZE)-1)
#endif
#define zfs_kmap_local(chunk) ((void *)chunk)
#define zfs_kunmap_local(addr) do { (void)(addr); } while (0)
#define local_irq_save(flags) do { (void)(flags); } while (0)
#define local_irq_restore(flags) do { (void)(flags); } while (0)
#define nth_page(pg, i) \
((struct page *)((void *)(pg) + (i) * PAGESIZE))
struct scatterlist {
struct page *page;
int length;
int end;
};
static void
sg_init_table(struct scatterlist *sg, int nr)
{
memset(sg, 0, nr * sizeof (struct scatterlist));
sg[nr - 1].end = 1;
}
/*
* This must be called if any of the sg_table allocation functions
* are called.
*/
static void
abd_free_sg_table(abd_t *abd)
{
int nents = ABD_SCATTER(abd).abd_nents;
vmem_free(ABD_SCATTER(abd).abd_sgl,
nents * sizeof (struct scatterlist));
}
#define for_each_sg(sgl, sg, nr, i) \
for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
static inline void
sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
unsigned int offset)
{
/* currently we don't use offset */
ASSERT(offset == 0);
sg->page = page;
sg->length = len;
}
static inline struct page *
sg_page(struct scatterlist *sg)
{
return (sg->page);
}
static inline struct scatterlist *
sg_next(struct scatterlist *sg)
{
if (sg->end)
return (NULL);
return (sg + 1);
}
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
unsigned nr_pages = abd_chunkcnt_for_bytes(size);
struct scatterlist *sg;
int i;
ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
sizeof (struct scatterlist), KM_SLEEP);
sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
abd_for_each_sg(abd, sg, nr_pages, i) {
struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
sg_set_page(sg, p, PAGESIZE, 0);
}
ABD_SCATTER(abd).abd_nents = nr_pages;
}
void
abd_free_chunks(abd_t *abd)
{
int i, n = ABD_SCATTER(abd).abd_nents;
struct scatterlist *sg;
abd_for_each_sg(abd, sg, n, i) {
struct page *p = nth_page(sg_page(sg), 0);
umem_free_aligned(p, PAGESIZE);
}
abd_free_sg_table(abd);
}
static void
abd_alloc_zero_scatter(void)
{
unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
struct scatterlist *sg;
int i;
abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
memset(abd_zero_page, 0, PAGESIZE);
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
sizeof (struct scatterlist), KM_SLEEP);
sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);
abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
}
ABDSTAT_BUMP(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}
#endif /* _KERNEL */
boolean_t boolean_t
abd_size_alloc_linear(size_t size) abd_size_alloc_linear(size_t size)
{ {
@ -712,14 +575,10 @@ abd_free_zero_scatter(void)
abd_free_struct(abd_zero_scatter); abd_free_struct(abd_zero_scatter);
abd_zero_scatter = NULL; abd_zero_scatter = NULL;
ASSERT3P(abd_zero_page, !=, NULL); ASSERT3P(abd_zero_page, !=, NULL);
#if defined(_KERNEL)
#if defined(HAVE_ZERO_PAGE_GPL_ONLY) #if defined(HAVE_ZERO_PAGE_GPL_ONLY)
abd_unmark_zfs_page(abd_zero_page); abd_unmark_zfs_page(abd_zero_page);
__free_page(abd_zero_page); __free_page(abd_zero_page);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */ #endif /* HAVE_ZERO_PAGE_GPL_ONLY */
#else
umem_free_aligned(abd_zero_page, PAGESIZE);
#endif /* _KERNEL */
} }
static int static int
@ -1014,8 +873,6 @@ abd_cache_reap_now(void)
{ {
} }
#if defined(_KERNEL)
/* /*
* This is abd_iter_page(), the function underneath abd_iterate_page_func(). * This is abd_iter_page(), the function underneath abd_iterate_page_func().
* It yields the next page struct and data offset and size within it, without * It yields the next page struct and data offset and size within it, without
@ -1297,5 +1154,3 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
module_param(zfs_abd_scatter_max_order, uint, 0644); module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order, MODULE_PARM_DESC(zfs_abd_scatter_max_order,
"Maximum order allocation used for a scatter ABD."); "Maximum order allocation used for a scatter ABD.");
#endif /* _KERNEL */

View File

@ -201,9 +201,9 @@ arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
* See also the comment above zfs_arc_shrinker_limit. * See also the comment above zfs_arc_shrinker_limit.
*/ */
int64_t can_free = btop(arc_evictable_memory()); int64_t can_free = btop(arc_evictable_memory());
int64_t limit = zfs_arc_shrinker_limit != 0 ? if (current_is_kswapd() && zfs_arc_shrinker_limit)
zfs_arc_shrinker_limit : INT64_MAX; can_free = MIN(can_free, zfs_arc_shrinker_limit);
return (MIN(can_free, limit)); return (can_free);
} }
static unsigned long static unsigned long

View File

@ -1101,8 +1101,8 @@ zfsctl_snapshot_mount(struct path *path, int flags)
zfsvfs_t *snap_zfsvfs; zfsvfs_t *snap_zfsvfs;
zfs_snapentry_t *se; zfs_snapentry_t *se;
char *full_name, *full_path; char *full_name, *full_path;
char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL, char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n",
NULL }; NULL, NULL, NULL };
char *envp[] = { NULL }; char *envp[] = { NULL };
int error; int error;
struct path spath; struct path spath;
@ -1153,8 +1153,8 @@ zfsctl_snapshot_mount(struct path *path, int flags)
* value from call_usermodehelper() will be (exitcode << 8 + signal). * value from call_usermodehelper() will be (exitcode << 8 + signal).
*/ */
dprintf("mount; name=%s path=%s\n", full_name, full_path); dprintf("mount; name=%s path=%s\n", full_name, full_path);
argv[5] = full_name; argv[6] = full_name;
argv[6] = full_path; argv[7] = full_path;
error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
if (error) { if (error) {
if (!(error & MOUNT_BUSY << 8)) { if (!(error & MOUNT_BUSY << 8)) {

View File

@ -69,6 +69,7 @@
#include <sys/zpl.h> #include <sys/zpl.h>
#include <sys/zil.h> #include <sys/zil.h>
#include <sys/sa_impl.h> #include <sys/sa_impl.h>
#include <linux/mm_compat.h>
/* /*
* Programming rules. * Programming rules.
@ -1820,24 +1821,36 @@ zfs_setattr_dir(znode_t *dzp)
&gid, sizeof (gid)); &gid, sizeof (gid));
} }
if (zp->z_projid != dzp->z_projid) {
uint64_t projid = dzp->z_projid;
if (zp->z_projid != projid) {
if (!(zp->z_pflags & ZFS_PROJID)) { if (!(zp->z_pflags & ZFS_PROJID)) {
zp->z_pflags |= ZFS_PROJID; err = sa_add_projid(zp->z_sa_hdl, tx, projid);
SA_ADD_BULK_ATTR(bulk, count, if (unlikely(err == EEXIST)) {
SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, err = 0;
sizeof (zp->z_pflags)); } else if (err != 0) {
goto sa_add_projid_err;
} else {
projid = ZFS_INVALID_PROJID;
}
} }
zp->z_projid = dzp->z_projid; if (projid != ZFS_INVALID_PROJID) {
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), zp->z_projid = projid;
NULL, &zp->z_projid, sizeof (zp->z_projid)); SA_ADD_BULK_ATTR(bulk, count,
SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
sizeof (zp->z_projid));
}
} }
sa_add_projid_err:
mutex_exit(&dzp->z_lock); mutex_exit(&dzp->z_lock);
if (likely(count > 0)) { if (likely(count > 0)) {
err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
dmu_tx_commit(tx); dmu_tx_commit(tx);
} else if (projid == ZFS_INVALID_PROJID) {
dmu_tx_commit(tx);
} else { } else {
dmu_tx_abort(tx); dmu_tx_abort(tx);
} }

View File

@ -295,6 +295,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{ {
struct super_block *s; struct super_block *s;
objset_t *os; objset_t *os;
boolean_t issnap = B_FALSE;
int err; int err;
err = dmu_objset_hold(zm->mnt_osname, FTAG, &os); err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
@ -326,6 +327,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
if (zpl_enter(zfsvfs, FTAG) == 0) { if (zpl_enter(zfsvfs, FTAG) == 0) {
if (os != zfsvfs->z_os) if (os != zfsvfs->z_os)
err = -SET_ERROR(EBUSY); err = -SET_ERROR(EBUSY);
issnap = zfsvfs->z_issnap;
zpl_exit(zfsvfs, FTAG); zpl_exit(zfsvfs, FTAG);
} else { } else {
err = -SET_ERROR(EBUSY); err = -SET_ERROR(EBUSY);
@ -349,7 +351,11 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
return (ERR_PTR(err)); return (ERR_PTR(err));
} }
s->s_flags |= SB_ACTIVE; s->s_flags |= SB_ACTIVE;
} else if ((flags ^ s->s_flags) & SB_RDONLY) { } else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
/*
* Skip the read-only check for snapshots, since a snapshot is always
* read-only regardless of whether mount passed the ro flag.
*/
deactivate_locked_super(s); deactivate_locked_super(s);
return (ERR_PTR(-EBUSY)); return (ERR_PTR(-EBUSY));
} }

View File

@ -20,6 +20,7 @@
*/ */
/* /*
* Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
* Copyright (c) 2024, Klara, Inc. * Copyright (c) 2024, Klara, Inc.
*/ */
@ -1089,11 +1090,42 @@ static const struct block_device_operations zvol_ops = {
#endif #endif
}; };
/*
* Since 6.9, Linux has been removing queue limit setters in favour of an
* initial queue_limits struct applied when the device is open. Since 6.11,
* queue_limits is being extended to allow more things to be applied when the
* device is open. Setters are also being removed for this.
*
* For OpenZFS, this means that depending on kernel version, some options may
* be set up before the device is open, and some applied to an open device
* (queue) after the fact.
*
* We manage this complexity by having our own limits struct,
* zvol_queue_limits_t, in which we carry any queue config that we're
* interested in setting. This structure is the same on all kernels.
*
* These limits are then applied to the queue at device open time by the most
* appropriate method for the kernel.
*
* zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
* blk_alloc_disk() exists). This converts our limits struct to a proper Linux
* struct queue_limits, and passes it in. Any fields added in later kernels are
* (obviously) not set up here.
*
* zvol_queue_limits_apply() is called on all kernel versions after the queue
* is created, and applies any remaining config. Before 6.9 that will be
* everything, via setter methods. After 6.9 that will be whatever couldn't be
* put into struct queue_limits. (This implies that zvol_queue_limits_apply()
* will always be a no-op on the latest kernel we support).
*/
typedef struct zvol_queue_limits { typedef struct zvol_queue_limits {
unsigned int zql_max_hw_sectors; unsigned int zql_max_hw_sectors;
unsigned short zql_max_segments; unsigned short zql_max_segments;
unsigned int zql_max_segment_size; unsigned int zql_max_segment_size;
unsigned int zql_io_opt; unsigned int zql_io_opt;
unsigned int zql_physical_block_size;
unsigned int zql_max_discard_sectors;
unsigned int zql_discard_granularity;
} zvol_queue_limits_t; } zvol_queue_limits_t;
static void static void
@ -1162,6 +1194,11 @@ zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
} }
limits->zql_io_opt = zv->zv_volblocksize; limits->zql_io_opt = zv->zv_volblocksize;
limits->zql_physical_block_size = zv->zv_volblocksize;
limits->zql_max_discard_sectors =
(zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
limits->zql_discard_granularity = zv->zv_volblocksize;
} }
#ifdef HAVE_BLK_ALLOC_DISK_2ARG #ifdef HAVE_BLK_ALLOC_DISK_2ARG
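As a rough worked example of the new discard limits (assuming the default zvol_max_discard_blocks of 16384 and a 16 KiB volblocksize; both vary per system and dataset): zql_max_discard_sectors = (16384 * 16384) >> 9 = 524288 sectors, i.e. at most 256 MiB per discard request, with zql_discard_granularity equal to one 16 KiB volume block.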
@ -1174,18 +1211,35 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits,
qlimits->max_segments = limits->zql_max_segments; qlimits->max_segments = limits->zql_max_segments;
qlimits->max_segment_size = limits->zql_max_segment_size; qlimits->max_segment_size = limits->zql_max_segment_size;
qlimits->io_opt = limits->zql_io_opt; qlimits->io_opt = limits->zql_io_opt;
qlimits->physical_block_size = limits->zql_physical_block_size;
qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
qlimits->discard_granularity = limits->zql_discard_granularity;
#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
qlimits->features =
BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;
#endif
} }
#else #endif
static void static void
zvol_queue_limits_apply(zvol_queue_limits_t *limits, zvol_queue_limits_apply(zvol_queue_limits_t *limits,
struct request_queue *queue) struct request_queue *queue)
{ {
#ifndef HAVE_BLK_ALLOC_DISK_2ARG
blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors); blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
blk_queue_max_segments(queue, limits->zql_max_segments); blk_queue_max_segments(queue, limits->zql_max_segments);
blk_queue_max_segment_size(queue, limits->zql_max_segment_size); blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
blk_queue_io_opt(queue, limits->zql_io_opt); blk_queue_io_opt(queue, limits->zql_io_opt);
} blk_queue_physical_block_size(queue, limits->zql_physical_block_size);
blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);
blk_queue_discard_granularity(queue, limits->zql_discard_granularity);
#endif #endif
#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
blk_queue_set_write_cache(queue, B_TRUE);
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);
#endif
}
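/*
 * Rough call-flow sketch for the above (names follow the surrounding
 * code; the exact sequence is an assumption for illustration only):
 *
 *	zvol_queue_limits_t lim;
 *	zvol_queue_limits_init(&lim, zv, use_blk_mq);
 *
 * #ifdef HAVE_BLK_ALLOC_DISK_2ARG
 *	struct queue_limits qlim;
 *	zvol_queue_limits_convert(&lim, &qlim);
 *	disk = blk_alloc_disk(&qlim, NUMA_NO_NODE);	(6.9+: set at alloc)
 * #else
 *	disk = blk_alloc_disk(NUMA_NO_NODE);		(older kernels)
 * #endif
 *	zvol_queue_limits_apply(&lim, disk->queue);	(whatever remains)
 */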
static int static int
zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
@ -1198,7 +1252,6 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_disk->minors = ZVOL_MINORS;
zso->zvo_queue = zso->zvo_disk->queue; zso->zvo_queue = zso->zvo_disk->queue;
zvol_queue_limits_apply(limits, zso->zvo_queue);
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG) #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
struct queue_limits qlimits; struct queue_limits qlimits;
zvol_queue_limits_convert(limits, &qlimits); zvol_queue_limits_convert(limits, &qlimits);
@ -1211,6 +1264,7 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
zso->zvo_disk = disk; zso->zvo_disk = disk;
zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_disk->minors = ZVOL_MINORS;
zso->zvo_queue = zso->zvo_disk->queue; zso->zvo_queue = zso->zvo_disk->queue;
#else #else
zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
if (zso->zvo_queue == NULL) if (zso->zvo_queue == NULL)
@ -1223,7 +1277,6 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
} }
zso->zvo_disk->queue = zso->zvo_queue; zso->zvo_disk->queue = zso->zvo_queue;
zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif /* HAVE_BLK_ALLOC_DISK */ #endif /* HAVE_BLK_ALLOC_DISK */
#else #else
zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
@ -1237,8 +1290,10 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
} }
zso->zvo_disk->queue = zso->zvo_queue; zso->zvo_disk->queue = zso->zvo_queue;
zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
zvol_queue_limits_apply(limits, zso->zvo_queue);
return (0); return (0);
} }
@ -1260,7 +1315,6 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
return (1); return (1);
} }
zso->zvo_queue = zso->zvo_disk->queue; zso->zvo_queue = zso->zvo_disk->queue;
zvol_queue_limits_apply(limits, zso->zvo_queue);
zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_disk->minors = ZVOL_MINORS;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG) #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
struct queue_limits qlimits; struct queue_limits qlimits;
@ -1291,10 +1345,11 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
/* Our queue is now created, assign it to our disk */ /* Our queue is now created, assign it to our disk */
zso->zvo_disk->queue = zso->zvo_queue; zso->zvo_disk->queue = zso->zvo_queue;
zvol_queue_limits_apply(limits, zso->zvo_queue); #endif
zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif #endif
#endif
return (0); return (0);
} }
@ -1303,7 +1358,7 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
* request queue and generic disk structures for the block device. * request queue and generic disk structures for the block device.
*/ */
static zvol_state_t * static zvol_state_t *
zvol_alloc(dev_t dev, const char *name) zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
{ {
zvol_state_t *zv; zvol_state_t *zv;
struct zvol_state_os *zso; struct zvol_state_os *zso;
@ -1323,6 +1378,7 @@ zvol_alloc(dev_t dev, const char *name)
zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
zv->zv_zso = zso; zv->zv_zso = zso;
zv->zv_volmode = volmode; zv->zv_volmode = volmode;
zv->zv_volblocksize = volblocksize;
list_link_init(&zv->zv_next); list_link_init(&zv->zv_next);
mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
@ -1360,8 +1416,6 @@ zvol_alloc(dev_t dev, const char *name)
if (ret != 0) if (ret != 0)
goto out_kmem; goto out_kmem;
blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);
/* Limit read-ahead to a single page to prevent over-prefetching. */ /* Limit read-ahead to a single page to prevent over-prefetching. */
blk_queue_set_read_ahead(zso->zvo_queue, 1); blk_queue_set_read_ahead(zso->zvo_queue, 1);
@ -1370,9 +1424,6 @@ zvol_alloc(dev_t dev, const char *name)
blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
} }
/* Enable /proc/diskstats */
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);
zso->zvo_queue->queuedata = zv; zso->zvo_queue->queuedata = zv;
zso->zvo_dev = dev; zso->zvo_dev = dev;
zv->zv_open_count = 0; zv->zv_open_count = 0;
@ -1617,7 +1668,8 @@ zvol_os_create_minor(const char *name)
if (error) if (error)
goto out_dmu_objset_disown; goto out_dmu_objset_disown;
zv = zvol_alloc(MKDEV(zvol_major, minor), name); zv = zvol_alloc(MKDEV(zvol_major, minor), name,
doi->doi_data_block_size);
if (zv == NULL) { if (zv == NULL) {
error = SET_ERROR(EAGAIN); error = SET_ERROR(EAGAIN);
goto out_dmu_objset_disown; goto out_dmu_objset_disown;
@ -1627,7 +1679,6 @@ zvol_os_create_minor(const char *name)
if (dmu_objset_is_snapshot(os)) if (dmu_objset_is_snapshot(os))
zv->zv_flags |= ZVOL_RDONLY; zv->zv_flags |= ZVOL_RDONLY;
zv->zv_volblocksize = doi->doi_data_block_size;
zv->zv_volsize = volsize; zv->zv_volsize = volsize;
zv->zv_objset = os; zv->zv_objset = os;
@ -1639,14 +1690,6 @@ zvol_os_create_minor(const char *name)
set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
zv->zv_volblocksize);
blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
(zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
zv->zv_volblocksize);
#ifdef QUEUE_FLAG_DISCARD #ifdef QUEUE_FLAG_DISCARD
blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
#endif #endif

View File

@ -754,6 +754,12 @@ zpool_feature_init(void)
"Support for raidz expansion", "Support for raidz expansion",
ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
zfeature_register(SPA_FEATURE_FAST_DEDUP,
"com.klarasystems:fast_dedup", "fast_dedup",
"Support for advanced deduplication",
ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL,
sfeatures);
zfs_mod_list_supported_free(sfeatures); zfs_mod_list_supported_free(sfeatures);
} }
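Once the pool's software supports it, the new flag is managed like any other feature; for example (illustrative commands, pool name assumed):
zpool set feature@fast_dedup=enabled tank
zpool get feature@fast_dedup tank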

View File

@ -113,7 +113,7 @@ abd_verify(abd_t *abd)
ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG | ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD)); ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD));
IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd)) { if (abd_is_linear(abd)) {
@ -603,13 +603,11 @@ abd_get_zeros(size_t size)
} }
/* /*
* Allocate a linear ABD structure for buf. * Create a linear ABD for an existing buf.
*/ */
abd_t * static abd_t *
abd_get_from_buf(void *buf, size_t size) abd_get_from_buf_impl(abd_t *abd, void *buf, size_t size)
{ {
abd_t *abd = abd_alloc_struct(0);
VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
/* /*
@ -625,6 +623,20 @@ abd_get_from_buf(void *buf, size_t size)
return (abd); return (abd);
} }
abd_t *
abd_get_from_buf(void *buf, size_t size)
{
abd_t *abd = abd_alloc_struct(0);
return (abd_get_from_buf_impl(abd, buf, size));
}
abd_t *
abd_get_from_buf_struct(abd_t *abd, void *buf, size_t size)
{
abd_init_struct(abd);
return (abd_get_from_buf_impl(abd, buf, size));
}
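/*
 * Minimal usage sketch: wrap an existing buffer in a caller-provided
 * abd_t, use it, then release it with abd_free(), as the ARC and
 * embedded-BP callers later in this change do:
 *
 *	abd_t dabd;
 *	abd_get_from_buf_struct(&dabd, buf, len);
 *	... pass &dabd to zio_decompress_data() or similar ...
 *	abd_free(&dabd);
 */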
/* /*
* Get the raw buffer associated with a linear ABD. * Get the raw buffer associated with a linear ABD.
*/ */

View File

@ -1767,12 +1767,12 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
uint64_t csize; uint64_t csize;
uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr);
uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr);
void *tmpbuf = NULL;
abd_t *abd = hdr->b_l1hdr.b_pabd; abd_t *abd = hdr->b_l1hdr.b_pabd;
boolean_t free_abd = B_FALSE;
ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
ASSERT(HDR_AUTHENTICATED(hdr)); ASSERT(HDR_AUTHENTICATED(hdr));
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT3P(abd, !=, NULL);
/* /*
* The MAC is calculated on the compressed data that is stored on disk. * The MAC is calculated on the compressed data that is stored on disk.
@ -1784,14 +1784,13 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
*/ */
if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
!HDR_COMPRESSION_ENABLED(hdr)) { !HDR_COMPRESSION_ENABLED(hdr)) {
abd = NULL;
csize = zio_compress_data(HDR_GET_COMPRESS(hdr), csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pabd, &tmpbuf, lsize, hdr->b_complevel); hdr->b_l1hdr.b_pabd, &abd, lsize, hdr->b_complevel);
ASSERT3P(tmpbuf, !=, NULL); ASSERT3P(abd, !=, NULL);
ASSERT3U(csize, <=, psize); ASSERT3U(csize, <=, psize);
abd = abd_get_from_buf(tmpbuf, lsize);
abd_take_ownership_of_buf(abd, B_TRUE);
abd_zero_off(abd, csize, psize - csize); abd_zero_off(abd, csize, psize - csize);
free_abd = B_TRUE;
} }
/* /*
@ -1810,16 +1809,10 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
if (ret == 0) if (ret == 0)
arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
else if (ret != ENOENT) else if (ret == ENOENT)
goto error; ret = 0;
if (tmpbuf != NULL) if (free_abd)
abd_free(abd);
return (0);
error:
if (tmpbuf != NULL)
abd_free(abd); abd_free(abd);
return (ret); return (ret);
@ -1836,7 +1829,6 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
{ {
int ret; int ret;
abd_t *cabd = NULL; abd_t *cabd = NULL;
void *tmp = NULL;
boolean_t no_crypt = B_FALSE; boolean_t no_crypt = B_FALSE;
boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
@ -1871,17 +1863,14 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
* linear buffer and wrapping it in an abd later. * linear buffer and wrapping it in an abd later.
*/ */
cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0); cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0);
tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
HDR_GET_LSIZE(hdr), &hdr->b_complevel); HDR_GET_LSIZE(hdr), &hdr->b_complevel);
if (ret != 0) { if (ret != 0) {
abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
goto error; goto error;
} }
abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
arc_hdr_size(hdr), hdr); arc_hdr_size(hdr), hdr);
hdr->b_l1hdr.b_pabd = cabd; hdr->b_l1hdr.b_pabd = cabd;
@ -2123,10 +2112,14 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
/* Skip byteswapping and checksumming (already done) */ /* Skip byteswapping and checksumming (already done) */
return (0); return (0);
} else { } else {
abd_t dabd;
abd_get_from_buf_struct(&dabd, buf->b_data,
HDR_GET_LSIZE(hdr));
error = zio_decompress_data(HDR_GET_COMPRESS(hdr), error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pabd, buf->b_data, hdr->b_l1hdr.b_pabd, &dabd,
HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr),
&hdr->b_complevel); &hdr->b_complevel);
abd_free(&dabd);
/* /*
* Absent hardware errors or software bugs, this should * Absent hardware errors or software bugs, this should
@ -8531,18 +8524,15 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
!HDR_COMPRESSION_ENABLED(hdr)) { !HDR_COMPRESSION_ENABLED(hdr)) {
abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
ARC_HDR_USE_RESERVE); ARC_HDR_USE_RESERVE);
void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
HDR_GET_LSIZE(hdr), &hdr->b_complevel); HDR_GET_LSIZE(hdr), &hdr->b_complevel);
if (ret != 0) { if (ret != 0) {
abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
goto error; goto error;
} }
abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
arc_hdr_size(hdr), hdr); arc_hdr_size(hdr), hdr);
hdr->b_l1hdr.b_pabd = cabd; hdr->b_l1hdr.b_pabd = cabd;
@ -9037,9 +9027,8 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
} }
if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
size_t bufsize = MAX(size, asize); cabd = abd_alloc_for_io(MAX(size, asize), ismd);
void *buf = zio_buf_alloc(bufsize); uint64_t csize = zio_compress_data(compress, to_write, &cabd,
uint64_t csize = zio_compress_data(compress, to_write, &buf,
size, hdr->b_complevel); size, hdr->b_complevel);
if (csize > psize) { if (csize > psize) {
/* /*
@ -9047,13 +9036,12 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
* psize. Even if it fits into asize, it does not * psize. Even if it fits into asize, it does not
* matter, since checksum will never match on read. * matter, since checksum will never match on read.
*/ */
zio_buf_free(buf, bufsize); abd_free(cabd);
return (SET_ERROR(EIO)); return (SET_ERROR(EIO));
} }
if (asize > csize) if (asize > csize)
memset((char *)buf + csize, 0, asize - csize); abd_zero_off(cabd, csize, asize - csize);
to_write = cabd = abd_get_from_buf(buf, bufsize); to_write = cabd;
abd_take_ownership_of_buf(cabd, B_TRUE);
} }
if (HDR_ENCRYPTED(hdr)) { if (HDR_ENCRYPTED(hdr)) {
@ -9158,12 +9146,17 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
*/ */
for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) { for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
/* /*
* If pass == 1 or 3, we cache MRU metadata and data * pass == 0: MFU meta
* respectively. * pass == 1: MRU meta
* pass == 2: MFU data
* pass == 3: MRU data
*/ */
if (l2arc_mfuonly) { if (l2arc_mfuonly == 1) {
if (pass == 1 || pass == 3) if (pass == 1 || pass == 3)
continue; continue;
} else if (l2arc_mfuonly > 1) {
if (pass == 3)
continue;
} }
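/*
 * In other words: l2arc_mfuonly=0 runs all four passes; 1 keeps only
 * passes 0 and 2 (MFU metadata and data); values above 1 keep passes
 * 0-2, i.e. all metadata plus MFU data.
 */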
uint64_t passed_sz = 0; uint64_t passed_sz = 0;
@ -10179,7 +10172,6 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
{ {
int err = 0; int err = 0;
zio_cksum_t cksum; zio_cksum_t cksum;
abd_t *abd = NULL;
uint64_t asize; uint64_t asize;
ASSERT(this_lbp != NULL && next_lbp != NULL); ASSERT(this_lbp != NULL && next_lbp != NULL);
@ -10241,16 +10233,22 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
case ZIO_COMPRESS_OFF: case ZIO_COMPRESS_OFF:
break; break;
case ZIO_COMPRESS_LZ4: case ZIO_COMPRESS_LZ4: {
abd = abd_alloc_for_io(asize, B_TRUE); abd_t *abd = abd_alloc_linear(asize, B_TRUE);
abd_copy_from_buf_off(abd, this_lb, 0, asize); abd_copy_from_buf_off(abd, this_lb, 0, asize);
if ((err = zio_decompress_data( abd_t dabd;
abd_get_from_buf_struct(&dabd, this_lb, sizeof (*this_lb));
err = zio_decompress_data(
L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) { abd, &dabd, asize, sizeof (*this_lb), NULL);
abd_free(&dabd);
abd_free(abd);
if (err != 0) {
err = SET_ERROR(EINVAL); err = SET_ERROR(EINVAL);
goto cleanup; goto cleanup;
} }
break; break;
}
default: default:
err = SET_ERROR(EINVAL); err = SET_ERROR(EINVAL);
goto cleanup; goto cleanup;
@ -10267,8 +10265,6 @@ cleanup:
l2arc_log_blk_fetch_abort(*next_io); l2arc_log_blk_fetch_abort(*next_io);
*next_io = NULL; *next_io = NULL;
} }
if (abd != NULL)
abd_free(abd);
return (err); return (err);
} }
@ -10504,7 +10500,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
uint64_t psize, asize; uint64_t psize, asize;
zio_t *wzio; zio_t *wzio;
l2arc_lb_abd_buf_t *abd_buf; l2arc_lb_abd_buf_t *abd_buf;
uint8_t *tmpbuf = NULL; abd_t *abd = NULL;
l2arc_lb_ptr_buf_t *lb_ptr_buf; l2arc_lb_ptr_buf_t *lb_ptr_buf;
VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
@ -10527,7 +10523,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
/* try to compress the buffer */ /* try to compress the buffer */
psize = zio_compress_data(ZIO_COMPRESS_LZ4, psize = zio_compress_data(ZIO_COMPRESS_LZ4,
abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0); abd_buf->abd, &abd, sizeof (*lb), 0);
/* a log block is never entirely zero */ /* a log block is never entirely zero */
ASSERT(psize != 0); ASSERT(psize != 0);
@ -10553,27 +10549,26 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
ZIO_CHECKSUM_FLETCHER_4); ZIO_CHECKSUM_FLETCHER_4);
if (asize < sizeof (*lb)) { if (asize < sizeof (*lb)) {
/* compression succeeded */ /* compression succeeded */
memset(tmpbuf + psize, 0, asize - psize); abd_zero_off(abd, psize, asize - psize);
L2BLK_SET_COMPRESS( L2BLK_SET_COMPRESS(
(&l2dhdr->dh_start_lbps[0])->lbp_prop, (&l2dhdr->dh_start_lbps[0])->lbp_prop,
ZIO_COMPRESS_LZ4); ZIO_COMPRESS_LZ4);
} else { } else {
/* compression failed */ /* compression failed */
memcpy(tmpbuf, lb, sizeof (*lb)); abd_copy_from_buf_off(abd, lb, 0, sizeof (*lb));
L2BLK_SET_COMPRESS( L2BLK_SET_COMPRESS(
(&l2dhdr->dh_start_lbps[0])->lbp_prop, (&l2dhdr->dh_start_lbps[0])->lbp_prop,
ZIO_COMPRESS_OFF); ZIO_COMPRESS_OFF);
} }
/* checksum what we're about to write */ /* checksum what we're about to write */
fletcher_4_native(tmpbuf, asize, NULL, abd_fletcher_4_native(abd, asize, NULL,
&l2dhdr->dh_start_lbps[0].lbp_cksum); &l2dhdr->dh_start_lbps[0].lbp_cksum);
abd_free(abd_buf->abd); abd_free(abd_buf->abd);
/* perform the write itself */ /* perform the write itself */
abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); abd_buf->abd = abd;
abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);

View File

@ -142,8 +142,13 @@ decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen)
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
uint8_t dstbuf[BPE_PAYLOAD_SIZE]; uint8_t dstbuf[BPE_PAYLOAD_SIZE];
decode_embedded_bp_compressed(bp, dstbuf); decode_embedded_bp_compressed(bp, dstbuf);
VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp), abd_t cabd, dabd;
dstbuf, buf, psize, buflen, NULL)); abd_get_from_buf_struct(&cabd, dstbuf, psize);
abd_get_from_buf_struct(&dabd, buf, buflen);
VERIFY0(zio_decompress_data(BP_GET_COMPRESS(bp), &cabd,
&dabd, psize, buflen, NULL));
abd_free(&dabd);
abd_free(&cabd);
} else { } else {
ASSERT3U(lsize, ==, psize); ASSERT3U(lsize, ==, psize);
decode_embedded_bp_compressed(bp, buf); decode_embedded_bp_compressed(bp, buf);

View File

@ -40,6 +40,9 @@ static dataset_kstat_values_t empty_dataset_kstats = {
{ {
{ "zil_commit_count", KSTAT_DATA_UINT64 }, { "zil_commit_count", KSTAT_DATA_UINT64 },
{ "zil_commit_writer_count", KSTAT_DATA_UINT64 }, { "zil_commit_writer_count", KSTAT_DATA_UINT64 },
{ "zil_commit_error_count", KSTAT_DATA_UINT64 },
{ "zil_commit_stall_count", KSTAT_DATA_UINT64 },
{ "zil_commit_suspend_count", KSTAT_DATA_UINT64 },
{ "zil_itx_count", KSTAT_DATA_UINT64 }, { "zil_itx_count", KSTAT_DATA_UINT64 },
{ "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_count", KSTAT_DATA_UINT64 },
{ "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 },
@ -201,6 +204,9 @@ dataset_kstats_destroy(dataset_kstats_t *dk)
void void
dataset_kstats_rename(dataset_kstats_t *dk, const char *name) dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
{ {
if (dk->dk_kstats == NULL)
return;
dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
char *ds_name; char *ds_name;

File diff suppressed because it is too large

764
module/zfs/ddt_log.c Normal file
View File

@ -0,0 +1,764 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2023, Klara Inc.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/ddt.h>
#include <sys/dmu_tx.h>
#include <sys/dmu.h>
#include <sys/ddt_impl.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
/*
* No more than this many txgs before swapping logs.
*/
uint_t zfs_dedup_log_txg_max = 8;
/*
* Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
* load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
*/
uint64_t zfs_dedup_log_mem_max = 0;
uint_t zfs_dedup_log_mem_max_percent = 1;
static kmem_cache_t *ddt_log_entry_flat_cache;
static kmem_cache_t *ddt_log_entry_trad_cache;
#define DDT_LOG_ENTRY_FLAT_SIZE \
(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
#define DDT_LOG_ENTRY_TRAD_SIZE \
(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)
#define DDT_LOG_ENTRY_SIZE(ddt) \
_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)
void
ddt_log_init(void)
{
ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
/*
* Max memory for log AVL entries. At least 1M, because we need
* something (that's ~3800 entries per tree). They can say 100% if they
* want; it just means they're at the mercy of the txg flush limit.
*/
if (zfs_dedup_log_mem_max == 0) {
zfs_dedup_log_mem_max_percent =
MIN(zfs_dedup_log_mem_max_percent, 100);
zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
zfs_dedup_log_mem_max_percent / 100;
}
zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
}
void
ddt_log_fini(void)
{
kmem_cache_destroy(ddt_log_entry_trad_cache);
kmem_cache_destroy(ddt_log_entry_flat_cache);
}
static void
ddt_log_name(ddt_t *ddt, char *name, uint_t n)
{
snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
zio_checksum_table[ddt->ddt_checksum].ci_name, n);
}
static void
ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
{
dmu_buf_t *db;
VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
DLH_SET_VERSION(hdr, 1);
DLH_SET_FLAGS(hdr, ddl->ddl_flags);
hdr->dlh_length = ddl->ddl_length;
hdr->dlh_first_txg = ddl->ddl_first_txg;
hdr->dlh_checkpoint = ddl->ddl_checkpoint;
dmu_buf_rele(db, FTAG);
}
static void
ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
ASSERT3U(ddt->ddt_dir_object, >, 0);
ASSERT3U(ddl->ddl_object, ==, 0);
char name[DDT_NAMELEN];
ddt_log_name(ddt, name, n);
ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
sizeof (uint64_t), 1, &ddl->ddl_object, tx));
ddl->ddl_length = 0;
ddl->ddl_first_txg = tx->tx_txg;
ddt_log_update_header(ddt, ddl, tx);
}
static void
ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
{
ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
}
static void
ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
ASSERT3U(ddt->ddt_dir_object, >, 0);
if (ddl->ddl_object == 0)
return;
ASSERT0(ddl->ddl_length);
char name[DDT_NAMELEN];
ddt_log_name(ddt, name, n);
VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));
ddl->ddl_object = 0;
}
void
ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
{
ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
}
static void
ddt_log_update_stats(ddt_t *ddt)
{
/*
* Log object stats. We count the number of live entries in the log
* tree, even if there are more than on disk, and even if the same
* entry is on both append and flush trees, because that's more what
* the user expects to see. This does mean the on-disk size is not
* really correlated with the number of entries, but I don't think
* that's reasonable to expect anyway.
*/
dmu_object_info_t doi;
uint64_t nblocks;
dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi);
nblocks = doi.doi_physical_blocks_512;
dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi);
nblocks += doi.doi_physical_blocks_512;
ddt_object_t *ddo = &ddt->ddt_log_stats;
ddo->ddo_count =
avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
ddo->ddo_dspace = nblocks << 9;
}
void
ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
{
ASSERT3U(nentries, >, 0);
ASSERT3P(dlu->dlu_dbp, ==, NULL);
if (ddt->ddt_log_active->ddl_object == 0)
ddt_log_create(ddt, tx);
/*
* We want to store as many entries as we can in a block, but never
* split an entry across block boundaries.
*/
size_t reclen = P2ALIGN_TYPED(
sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
ASSERT3U(reclen, <=, UINT16_MAX);
dlu->dlu_reclen = reclen;
VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
&dlu->dlu_dn));
dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);
uint64_t nblocks = howmany(nentries,
dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
uint64_t offset = ddt->ddt_log_active->ddl_length;
uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;
VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
DMU_READ_NO_PREFETCH));
dlu->dlu_tx = tx;
dlu->dlu_block = dlu->dlu_offset = 0;
}
static ddt_log_entry_t *
ddt_log_alloc_entry(ddt_t *ddt)
{
ddt_log_entry_t *ddle;
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
} else {
ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
}
return (ddle);
}
static void
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
/* Create the log tree entry from a live or stored entry */
avl_index_t where;
ddt_log_entry_t *ddle =
avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
if (ddle == NULL) {
ddle = ddt_log_alloc_entry(ddt);
ddle->ddle_key = ddlwe->ddlwe_key;
avl_insert(&ddl->ddl_tree, ddle, where);
}
ddle->ddle_type = ddlwe->ddlwe_type;
ddle->ddle_class = ddlwe->ddlwe_class;
memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
}
void
ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
{
ASSERT3U(dlu->dlu_dbp, !=, NULL);
ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
/* Get our block */
ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];
/*
* If this would take us past the end of the block, finish it and
* move to the next one.
*/
if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
ASSERT3U(dlu->dlu_offset, >, 0);
dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
dlu->dlu_block++;
dlu->dlu_offset = 0;
ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
db = dlu->dlu_dbp[dlu->dlu_block];
}
/*
* If this is the first time touching the block, inform the DMU that
* we will fill it, and zero it out.
*/
if (dlu->dlu_offset == 0) {
dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE);
memset(db->db_data, 0, db->db_size);
}
/* Create the log record directly in the buffer */
ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
DLR_SET_TYPE(dlr, DLR_ENTRY);
DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);
ddt_log_record_entry_t *dlre =
(ddt_log_record_entry_t *)&dlr->dlr_payload;
dlre->dlre_key = ddlwe->ddlwe_key;
memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
/* Advance offset for next record. */
dlu->dlu_offset += dlu->dlu_reclen;
}
void
ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
{
ASSERT3U(dlu->dlu_dbp, !=, NULL);
ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
ASSERT3U(dlu->dlu_offset, >, 0);
/*
* Close out the last block. Whatever we haven't used will be zeroed,
* which matches DLR_INVALID, so we can detect this during load.
*/
dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);
dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);
ddt->ddt_log_active->ddl_length +=
dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
dnode_rele(dlu->dlu_dn, FTAG);
ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);
memset(dlu, 0, sizeof (ddt_log_update_t));
ddt_log_update_stats(ddt);
}
boolean_t
ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
if (ddle == NULL)
return (B_FALSE);
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
avl_remove(&ddl->ddl_tree, ddle);
kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
return (B_TRUE);
}
boolean_t
ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk,
ddt_lightweight_entry_t *ddlwe)
{
ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
if (ddle == NULL)
return (B_FALSE);
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
avl_remove(&ddl->ddl_tree, ddle);
kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
return (B_TRUE);
}
void
ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
ddt_log_t *ddl = ddt->ddt_log_flushing;
ASSERT3U(ddl->ddl_object, !=, 0);
#ifdef ZFS_DEBUG
/*
* There should not be any entries on the log tree before the given
* checkpoint. Assert that this is the case.
*/
ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
if (ddle != NULL)
VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
>, 0);
#endif
ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
ddl->ddl_checkpoint = ddlwe->ddlwe_key;
ddt_log_update_header(ddt, ddl, tx);
ddt_log_update_stats(ddt);
}
void
ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
{
ddt_log_t *ddl = ddt->ddt_log_flushing;
if (ddl->ddl_object == 0)
return;
ASSERT(avl_is_empty(&ddl->ddl_tree));
/* Eject the entire object */
dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);
ddl->ddl_length = 0;
ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
ddt_log_update_header(ddt, ddl, tx);
ddt_log_update_stats(ddt);
}
boolean_t
ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
{
/* Swap the logs. The old flushing one must be empty */
VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));
/*
* If there are still blocks on the flushing log, truncate it first.
* This can happen if there were entries on the flushing log that were
* removed in memory via ddt_lookup(); their vestigial remains are
* on disk.
*/
if (ddt->ddt_log_flushing->ddl_length > 0)
ddt_log_truncate(ddt, tx);
/*
* Swap policy. We swap the logs (and so begin flushing) when the
* active tree grows too large, or when we haven't swapped it in
* some amount of time, or if something has requested the logs be
* flushed ASAP (see ddt_walk_init()).
*/
/*
* The log tree is too large if the memory usage of its entries is over
* half of the memory limit. This effectively gives each log tree half
* the available memory.
*/
const boolean_t too_large =
(avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);
const boolean_t too_old =
tx->tx_txg >=
(ddt->ddt_log_active->ddl_first_txg +
MAX(1, zfs_dedup_log_txg_max));
const boolean_t force =
ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;
if (!(too_large || too_old || force))
return (B_FALSE);
ddt_log_t *swap = ddt->ddt_log_active;
ddt->ddt_log_active = ddt->ddt_log_flushing;
ddt->ddt_log_flushing = swap;
ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
ddt->ddt_log_active->ddl_flags &=
~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);
ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;
ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);
ddt_log_update_stats(ddt);
return (B_TRUE);
}
static inline void
ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
const ddt_key_t *checkpoint)
{
ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);
ddt_log_record_entry_t *dlre =
(ddt_log_record_entry_t *)dlr->dlr_payload;
if (checkpoint != NULL &&
ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
/* Skip pre-checkpoint entries; they're already flushed. */
return;
}
ddt_lightweight_entry_t ddlwe;
ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);
ddlwe.ddlwe_key = dlre->dlre_key;
memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));
ddt_log_update_entry(ddt, ddl, &ddlwe);
}
static void
ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
{
void *cookie = NULL;
ddt_log_entry_t *ddle;
IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
while ((ddle =
avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
}
ASSERT(avl_is_empty(&ddl->ddl_tree));
}
static int
ddt_log_load_one(ddt_t *ddt, uint_t n)
{
ASSERT3U(n, <, 2);
ddt_log_t *ddl = &ddt->ddt_log[n];
char name[DDT_NAMELEN];
ddt_log_name(ddt, name, n);
uint64_t obj;
int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
sizeof (uint64_t), 1, &obj);
if (err == ENOENT)
return (0);
if (err != 0)
return (err);
dnode_t *dn;
err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
if (err != 0)
return (err);
ddt_log_header_t hdr;
dmu_buf_t *db;
err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
if (err != 0) {
dnode_rele(dn, FTAG);
return (err);
}
memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
dmu_buf_rele(db, FTAG);
if (DLH_GET_VERSION(&hdr) != 1) {
dnode_rele(dn, FTAG);
zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
"unknown version=%llu", spa_name(ddt->ddt_spa), name,
(u_longlong_t)DLH_GET_VERSION(&hdr));
return (SET_ERROR(EINVAL));
}
ddt_key_t *checkpoint = NULL;
if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
/*
* If the log has a checkpoint, then we can ignore any entries
* that have already been flushed.
*/
ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
checkpoint = &hdr.dlh_checkpoint;
}
if (hdr.dlh_length > 0) {
dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
ZIO_PRIORITY_SYNC_READ);
for (uint64_t offset = 0; offset < hdr.dlh_length;
offset += dn->dn_datablksz) {
err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
DMU_READ_PREFETCH);
if (err != 0) {
dnode_rele(dn, FTAG);
ddt_log_empty(ddt, ddl);
return (err);
}
uint64_t boffset = 0;
while (boffset < db->db_size) {
ddt_log_record_t *dlr =
(ddt_log_record_t *)(db->db_data + boffset);
/* Partially-filled block, skip the rest */
if (DLR_GET_TYPE(dlr) == DLR_INVALID)
break;
switch (DLR_GET_TYPE(dlr)) {
case DLR_ENTRY:
ddt_log_load_entry(ddt, ddl, dlr,
checkpoint);
break;
default:
dmu_buf_rele(db, FTAG);
dnode_rele(dn, FTAG);
ddt_log_empty(ddt, ddl);
return (SET_ERROR(EINVAL));
}
boffset += DLR_GET_RECLEN(dlr);
}
dmu_buf_rele(db, FTAG);
}
}
dnode_rele(dn, FTAG);
ddl->ddl_object = obj;
ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
ddl->ddl_length = hdr.dlh_length;
ddl->ddl_first_txg = hdr.dlh_first_txg;
if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
ddt->ddt_log_flushing = ddl;
else
ddt->ddt_log_active = ddl;
return (0);
}
int
ddt_log_load(ddt_t *ddt)
{
int err;
if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
/*
* The DDT is going to be freed again in a moment, so there's
* no point loading the log; it'll just slow down import.
*/
return (0);
}
ASSERT0(ddt->ddt_log[0].ddl_object);
ASSERT0(ddt->ddt_log[1].ddl_object);
if (ddt->ddt_dir_object == 0) {
/*
* If we're configured but the containing dir doesn't exist
* yet, then the log object can't possibly exist either.
*/
ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
return (SET_ERROR(ENOENT));
}
if ((err = ddt_log_load_one(ddt, 0)) != 0)
return (err);
if ((err = ddt_log_load_one(ddt, 1)) != 0)
return (err);
VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);
/*
* We have two finalisation tasks:
*
* - rebuild the histogram. We do this at the end rather than while
* we're loading so we don't need to uncount and recount entries that
* appear multiple times in the log.
*
* - remove entries from the flushing tree that are on both trees. This
* happens when ddt_lookup() rehydrates an entry from the flushing
* tree, as ddt_log_take_key() removes the entry from the in-memory
* tree but doesn't remove it from disk.
*/
/*
* We don't technically need a config lock here, since there shouldn't
* be pool config changes during DDT load. dva_get_dsize_sync() via
* ddt_stat_generate() is expecting it though, and it won't hurt
* anything, so we take it.
*/
spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);
avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
ddt_log_entry_t *ae = avl_first(al);
ddt_log_entry_t *fe = avl_first(fl);
while (ae != NULL || fe != NULL) {
ddt_log_entry_t *ddle;
if (ae == NULL) {
/* active exhausted, take flushing */
ddle = fe;
fe = AVL_NEXT(fl, fe);
} else if (fe == NULL) {
/* flushing exhausted, take active */
ddle = ae;
ae = AVL_NEXT(al, ae);
} else {
/* compare active and flushing */
int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
if (c < 0) {
/* active behind, take and advance */
ddle = ae;
ae = AVL_NEXT(al, ae);
} else if (c > 0) {
/* flushing behind, take and advance */
ddle = fe;
fe = AVL_NEXT(fl, fe);
} else {
/* match. remove from flushing, take active */
ddle = fe;
fe = AVL_NEXT(fl, fe);
avl_remove(fl, ddle);
ddle = ae;
ae = AVL_NEXT(al, ae);
}
}
ddt_lightweight_entry_t ddlwe;
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
}
spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);
ddt_log_update_stats(ddt);
return (0);
}
void
ddt_log_alloc(ddt_t *ddt)
{
ASSERT3P(ddt->ddt_log_active, ==, NULL);
ASSERT3P(ddt->ddt_log_flushing, ==, NULL);
avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
ddt->ddt_log_active = &ddt->ddt_log[0];
ddt->ddt_log_flushing = &ddt->ddt_log[1];
ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
}
void
ddt_log_free(ddt_t *ddt)
{
ddt_log_empty(ddt, &ddt->ddt_log[0]);
ddt_log_empty(ddt, &ddt->ddt_log[1]);
avl_destroy(&ddt->ddt_log[0].ddl_tree);
avl_destroy(&ddt->ddt_log[1].ddl_tree);
}
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
"Max transactions before starting to flush dedup logs");
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
"Max memory for dedup logs");
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
"Max memory for dedup logs, as % of total memory");

View File

@ -33,27 +33,32 @@
#include <sys/ddt_impl.h> #include <sys/ddt_impl.h>
static void static void
ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
ddt_stat_t *dds)
{ {
spa_t *spa = ddt->ddt_spa; spa_t *spa = ddt->ddt_spa;
ddt_phys_t *ddp = dde->dde_phys; uint64_t lsize = DDK_GET_LSIZE(&ddlwe->ddlwe_key);
ddt_key_t *ddk = &dde->dde_key; uint64_t psize = DDK_GET_PSIZE(&ddlwe->ddlwe_key);
uint64_t lsize = DDK_GET_LSIZE(ddk);
uint64_t psize = DDK_GET_PSIZE(ddk);
memset(dds, 0, sizeof (*dds)); memset(dds, 0, sizeof (*dds));
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { for (int p = 0; p < DDT_NPHYS(ddt); p++) {
uint64_t dsize = 0; const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
uint64_t refcnt = ddp->ddp_refcnt; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
if (ddp->ddp_phys_birth == 0) if (ddt_phys_birth(ddp, v) == 0)
continue; continue;
int ndvas = DDK_GET_CRYPT(&dde->dde_key) ? int ndvas = ddt_phys_dva_count(ddp, v,
SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; DDK_GET_CRYPT(&ddlwe->ddlwe_key));
const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva;
uint64_t dsize = 0;
for (int d = 0; d < ndvas; d++) for (int d = 0; d < ndvas; d++)
dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); dsize += dva_get_dsize_sync(spa, &dvas[d]);
uint64_t refcnt = ddt_phys_refcnt(ddp, v);
dds->dds_blocks += 1; dds->dds_blocks += 1;
dds->dds_lsize += lsize; dds->dds_lsize += lsize;
@ -67,61 +72,108 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
} }
} }
void static void
ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src)
{ {
const uint64_t *s = (const uint64_t *)src; dst->dds_blocks += src->dds_blocks;
uint64_t *d = (uint64_t *)dst; dst->dds_lsize += src->dds_lsize;
uint64_t *d_end = (uint64_t *)(dst + 1); dst->dds_psize += src->dds_psize;
dst->dds_dsize += src->dds_dsize;
dst->dds_ref_blocks += src->dds_ref_blocks;
dst->dds_ref_lsize += src->dds_ref_lsize;
dst->dds_ref_psize += src->dds_ref_psize;
dst->dds_ref_dsize += src->dds_ref_dsize;
}
ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ static void
ddt_stat_sub(ddt_stat_t *dst, const ddt_stat_t *src)
{
/* This caught more during development than you might expect... */
ASSERT3U(dst->dds_blocks, >=, src->dds_blocks);
ASSERT3U(dst->dds_lsize, >=, src->dds_lsize);
ASSERT3U(dst->dds_psize, >=, src->dds_psize);
ASSERT3U(dst->dds_dsize, >=, src->dds_dsize);
ASSERT3U(dst->dds_ref_blocks, >=, src->dds_ref_blocks);
ASSERT3U(dst->dds_ref_lsize, >=, src->dds_ref_lsize);
ASSERT3U(dst->dds_ref_psize, >=, src->dds_ref_psize);
ASSERT3U(dst->dds_ref_dsize, >=, src->dds_ref_dsize);
for (int i = 0; i < d_end - d; i++) dst->dds_blocks -= src->dds_blocks;
d[i] += (s[i] ^ neg) - neg; dst->dds_lsize -= src->dds_lsize;
dst->dds_psize -= src->dds_psize;
dst->dds_dsize -= src->dds_dsize;
dst->dds_ref_blocks -= src->dds_ref_blocks;
dst->dds_ref_lsize -= src->dds_ref_lsize;
dst->dds_ref_psize -= src->dds_ref_psize;
dst->dds_ref_dsize -= src->dds_ref_dsize;
} }
void void
ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh,
const ddt_lightweight_entry_t *ddlwe)
{ {
ddt_stat_t dds; ddt_stat_t dds;
ddt_histogram_t *ddh;
int bucket; int bucket;
ddt_stat_generate(ddt, dde, &dds); ddt_stat_generate(ddt, ddlwe, &dds);
bucket = highbit64(dds.dds_ref_blocks) - 1; bucket = highbit64(dds.dds_ref_blocks) - 1;
ASSERT3U(bucket, >=, 0); if (bucket < 0)
return;
ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; ddt_stat_add(&ddh->ddh_stat[bucket], &dds);
}
ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); void
ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh,
const ddt_lightweight_entry_t *ddlwe)
{
ddt_stat_t dds;
int bucket;
ddt_stat_generate(ddt, ddlwe, &dds);
bucket = highbit64(dds.dds_ref_blocks) - 1;
if (bucket < 0)
return;
ddt_stat_sub(&ddh->ddh_stat[bucket], &dds);
} }
void void
ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
{ {
for (int h = 0; h < 64; h++) for (int h = 0; h < 64; h++)
ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h]);
} }
void void
ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh)
{ {
memset(dds, 0, sizeof (*dds)); memset(dds, 0, sizeof (*dds));
for (int h = 0; h < 64; h++) for (int h = 0; h < 64; h++)
ddt_stat_add(dds, &ddh->ddh_stat[h], 0); ddt_stat_add(dds, &ddh->ddh_stat[h]);
} }
boolean_t boolean_t
ddt_histogram_empty(const ddt_histogram_t *ddh) ddt_histogram_empty(const ddt_histogram_t *ddh)
{ {
const uint64_t *s = (const uint64_t *)ddh; for (int h = 0; h < 64; h++) {
const uint64_t *s_end = (const uint64_t *)(ddh + 1); const ddt_stat_t *dds = &ddh->ddh_stat[h];
while (s < s_end) if (dds->dds_blocks == 0 &&
if (*s++ != 0) dds->dds_lsize == 0 &&
return (B_FALSE); dds->dds_psize == 0 &&
dds->dds_dsize == 0 &&
dds->dds_ref_blocks == 0 &&
dds->dds_ref_lsize == 0 &&
dds->dds_ref_psize == 0 &&
dds->dds_ref_dsize == 0)
continue;
return (B_FALSE);
}
return (B_TRUE); return (B_TRUE);
} }
@ -170,6 +222,11 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
ddo_total->ddo_mspace += ddo->ddo_mspace; ddo_total->ddo_mspace += ddo->ddo_mspace;
} }
} }
ddt_object_t *ddo = &ddt->ddt_log_stats;
ddo_total->ddo_count += ddo->ddo_count;
ddo_total->ddo_dspace += ddo->ddo_dspace;
ddo_total->ddo_mspace += ddo->ddo_mspace;
} }
/* /*
@ -207,6 +264,8 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
&ddt->ddt_histogram_cache[type][class]); &ddt->ddt_histogram_cache[type][class]);
} }
} }
ddt_histogram_add(ddh, &ddt->ddt_log_histogram);
} }
} }
@ -217,7 +276,7 @@ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
ddt_get_dedup_histogram(spa, ddh_total); ddt_get_dedup_histogram(spa, ddh_total);
ddt_histogram_stat(dds_total, ddh_total); ddt_histogram_total(dds_total, ddh_total);
kmem_free(ddh_total, sizeof (ddt_histogram_t)); kmem_free(ddh_total, sizeof (ddt_histogram_t));
} }
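
For reference, the bucket selection used by ddt_histogram_add_entry() and ddt_histogram_sub_entry() above groups entries by the power of two of their reference count: bucket = highbit64(dds_ref_blocks) - 1, and a reference count of zero (bucket < 0) is now silently skipped rather than asserted. A small stand-alone sketch follows; the local highbit64() is a simplified re-implementation for illustration only.

#include <stdio.h>
#include <stdint.h>

/* highbit64(): index of the highest set bit, 1-based; 0 for x == 0. */
static int
highbit64(uint64_t x)
{
	int h = 0;

	while (x != 0) {
		h++;
		x >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t refcnts[] = { 0, 1, 2, 3, 4, 7, 8, 100 };

	for (int i = 0; i < 8; i++) {
		int bucket = highbit64(refcnts[i]) - 1;
		if (bucket < 0) {
			printf("refcnt %llu: skipped\n",
			    (unsigned long long)refcnts[i]);
			continue;
		}
		printf("refcnt %llu -> bucket %d\n",
		    (unsigned long long)refcnts[i], bucket);
	}
	return (0);
}

So reference counts 2 and 3 share bucket 1, 4 through 7 share bucket 2, and so on, which is what the per-bucket histogram rows report.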

View File

@ -22,6 +22,7 @@
/* /*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018 by Delphix. All rights reserved. * Copyright (c) 2018 by Delphix. All rights reserved.
* Copyright (c) 2023, Klara Inc.
*/ */
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
@ -51,8 +52,13 @@ ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len)
ASSERT3U(d_len, >=, s_len + 1); /* no compression plus version byte */ ASSERT3U(d_len, >=, s_len + 1); /* no compression plus version byte */
c_len = ci->ci_compress((void *)src, dst, s_len, d_len - 1, /* Call compress function directly to avoid hole detection. */
ci->ci_level); abd_t sabd, dabd;
abd_get_from_buf_struct(&sabd, (void *)src, s_len);
abd_get_from_buf_struct(&dabd, dst, d_len);
c_len = ci->ci_compress(&sabd, &dabd, s_len, d_len - 1, ci->ci_level);
abd_free(&dabd);
abd_free(&sabd);
if (c_len == s_len) { if (c_len == s_len) {
cpfunc = ZIO_COMPRESS_OFF; cpfunc = ZIO_COMPRESS_OFF;
@ -71,12 +77,18 @@ ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
{ {
uchar_t version = *src++; uchar_t version = *src++;
int cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK; int cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK;
zio_compress_info_t *ci = &zio_compress_table[cpfunc];
if (ci->ci_decompress != NULL) if (zio_compress_table[cpfunc].ci_decompress == NULL) {
(void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
else
memcpy(dst, src, d_len); memcpy(dst, src, d_len);
return;
}
abd_t sabd, dabd;
abd_get_from_buf_struct(&sabd, src, s_len);
abd_get_from_buf_struct(&dabd, dst, d_len);
VERIFY0(zio_decompress_data(cpfunc, &sabd, &dabd, s_len, d_len, NULL));
abd_free(&dabd);
abd_free(&sabd);
if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) != if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) !=
(ZFS_HOST_BYTEORDER != 0)) (ZFS_HOST_BYTEORDER != 0))
@ -108,7 +120,7 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
static int static int
ddt_zap_lookup(objset_t *os, uint64_t object, ddt_zap_lookup(objset_t *os, uint64_t object,
const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize) const ddt_key_t *ddk, void *phys, size_t psize)
{ {
uchar_t *cbuf; uchar_t *cbuf;
uint64_t one, csize; uint64_t one, csize;
@ -155,7 +167,7 @@ ddt_zap_prefetch_all(objset_t *os, uint64_t object)
static int static int
ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk, ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk,
const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx) const void *phys, size_t psize, dmu_tx_t *tx)
{ {
const size_t cbuf_size = psize + 1; const size_t cbuf_size = psize + 1;
@ -181,7 +193,7 @@ ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk,
static int static int
ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk, ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk,
ddt_phys_t *phys, size_t psize) void *phys, size_t psize)
{ {
zap_cursor_t zc; zap_cursor_t zc;
zap_attribute_t za; zap_attribute_t za;

View File

@ -95,6 +95,12 @@ uint_t dmu_prefetch_max = 8 * 1024 * 1024;
uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
#endif #endif
/*
* Override copies= for dedup state objects. 0 means the traditional behaviour
* (i.e. the default for the containing objset, i.e. 3 for the MOS).
*/
uint_t dmu_ddt_copies = 0;
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
{DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" },
@ -2272,6 +2278,28 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
case ZFS_REDUNDANT_METADATA_NONE: case ZFS_REDUNDANT_METADATA_NONE:
break; break;
} }
if (dmu_ddt_copies > 0) {
/*
* If this tuneable is set, and this is a write for a
* dedup entry store (zap or log), then we treat it
* something like ZFS_REDUNDANT_METADATA_MOST on a
* regular dataset: this many copies, and one more for
* "higher" indirect blocks. This specific exception is
* necessary because dedup objects are stored in the
* MOS, which always has the highest possible copies.
*/
dmu_object_type_t stype =
dn ? dn->dn_storage_type : DMU_OT_NONE;
if (stype == DMU_OT_NONE)
stype = type;
if (stype == DMU_OT_DDT_ZAP) {
copies = dmu_ddt_copies;
if (level >=
zfs_redundant_metadata_most_ditto_level)
copies++;
}
}
} else if (wp & WP_NOFILL) { } else if (wp & WP_NOFILL) {
ASSERT(level == 0); ASSERT(level == 0);
@ -2824,3 +2852,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
/* CSTYLED */ /* CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW,
"Limit one prefetch call to this size"); "Limit one prefetch call to this size");
/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_ddt_copies, UINT, ZMOD_RW,
"Override copies= for dedup objects");

View File

@ -1391,7 +1391,7 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
abd_t *dabd = abd_alloc_linear( abd_t *dabd = abd_alloc_linear(
drrw->drr_logical_size, B_FALSE); drrw->drr_logical_size, B_FALSE);
err = zio_decompress_data(drrw->drr_compressiontype, err = zio_decompress_data(drrw->drr_compressiontype,
abd, abd_to_buf(dabd), abd_get_size(abd), abd, dabd, abd_get_size(abd),
abd_get_size(dabd), NULL); abd_get_size(dabd), NULL);
if (err != 0) { if (err != 0) {
@ -1407,9 +1407,8 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
/* Recompress the data */ /* Recompress the data */
abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp), abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp),
B_FALSE); B_FALSE);
void *buf = abd_to_buf(cabd);
uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp), uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp),
abd, &buf, abd_get_size(abd), abd, &cabd, abd_get_size(abd),
rwa->os->os_complevel); rwa->os->os_complevel);
abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize); abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize);
/* Swap in newly compressed data into the abd */ /* Swap in newly compressed data into the abd */
@ -2221,7 +2220,7 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
err = zio_decompress_data( err = zio_decompress_data(
drrw->drr_compressiontype, drrw->drr_compressiontype,
abd, abd_to_buf(decomp_abd), abd, decomp_abd,
abd_get_size(abd), abd_get_size(abd),
abd_get_size(decomp_abd), NULL); abd_get_size(decomp_abd), NULL);

View File

@ -2425,8 +2425,14 @@ get_receive_resume_token_impl(dsl_dataset_t *ds)
fnvlist_free(token_nv); fnvlist_free(token_nv);
compressed = kmem_alloc(packed_size, KM_SLEEP); compressed = kmem_alloc(packed_size, KM_SLEEP);
compressed_size = gzip_compress(packed, compressed, /* Call compress function directly to avoid hole detection. */
abd_t pabd, cabd;
abd_get_from_buf_struct(&pabd, packed, packed_size);
abd_get_from_buf_struct(&cabd, compressed, packed_size);
compressed_size = zfs_gzip_compress(&pabd, &cabd,
packed_size, packed_size, 6); packed_size, packed_size, 6);
abd_free(&cabd);
abd_free(&pabd);
zio_cksum_t cksum; zio_cksum_t cksum;
fletcher_4_native_varsize(compressed, compressed_size, &cksum); fletcher_4_native_varsize(compressed, compressed_size, &cksum);

View File

@ -630,6 +630,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
zap_cursor_fini(&zc); zap_cursor_fini(&zc);
} }
ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
spa_scan_stat_init(spa); spa_scan_stat_init(spa);
vdev_scan_stat_init(spa->spa_root_vdev); vdev_scan_stat_init(spa->spa_root_vdev);
@ -951,6 +953,8 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
spa_history_log_internal(spa, "scan setup", tx, spa_history_log_internal(spa, "scan setup", tx,
@ -1636,7 +1640,8 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
txg_sync_waiting(scn->scn_dp) || txg_sync_waiting(scn->scn_dp) ||
NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
spa_shutting_down(scn->scn_dp->dp_spa) || spa_shutting_down(scn->scn_dp->dp_spa) ||
(zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) { (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) ||
!ddt_walk_ready(scn->scn_dp->dp_spa)) {
if (zb && zb->zb_level == ZB_ROOT_LEVEL) { if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
dprintf("suspending at first available bookmark " dprintf("suspending at first available bookmark "
"%llx/%llx/%llx/%llx\n", "%llx/%llx/%llx/%llx\n",
@ -2929,11 +2934,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
void void
dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
ddt_entry_t *dde, dmu_tx_t *tx) ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{ {
(void) tx; (void) tx;
const ddt_key_t *ddk = &dde->dde_key; const ddt_key_t *ddk = &ddlwe->ddlwe_key;
ddt_phys_t *ddp = dde->dde_phys;
blkptr_t bp; blkptr_t bp;
zbookmark_phys_t zb = { 0 }; zbookmark_phys_t zb = { 0 };
@ -2954,11 +2958,13 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
if (scn->scn_done_txg != 0) if (scn->scn_done_txg != 0)
return; return;
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { for (int p = 0; p < DDT_NPHYS(ddt); p++) {
if (ddp->ddp_phys_birth == 0 || ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v);
if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg)
continue; continue;
ddt_bp_create(checksum, ddk, ddp, &bp); ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp);
scn->scn_visited_this_txg++; scn->scn_visited_this_txg++;
scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
@ -3002,11 +3008,11 @@ static void
dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
{ {
ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
ddt_entry_t dde = {{{{0}}}}; ddt_lightweight_entry_t ddlwe = {0};
int error; int error;
uint64_t n = 0; uint64_t n = 0;
while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &ddlwe)) == 0) {
ddt_t *ddt; ddt_t *ddt;
if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
@ -3021,16 +3027,28 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
ASSERT(avl_first(&ddt->ddt_tree) == NULL); ASSERT(avl_first(&ddt->ddt_tree) == NULL);
dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx);
n++; n++;
if (dsl_scan_check_suspend(scn, NULL)) if (dsl_scan_check_suspend(scn, NULL))
break; break;
} }
zfs_dbgmsg("scanned %llu ddt entries on %s with class_max = %u; " if (error == EAGAIN) {
"suspending=%u", (longlong_t)n, scn->scn_dp->dp_spa->spa_name, dsl_scan_check_suspend(scn, NULL);
(int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending); error = 0;
zfs_dbgmsg("waiting for ddt to become ready for scan "
"on %s with class_max = %u; suspending=%u",
scn->scn_dp->dp_spa->spa_name,
(int)scn->scn_phys.scn_ddt_class_max,
(int)scn->scn_suspending);
} else
zfs_dbgmsg("scanned %llu ddt entries on %s with "
"class_max = %u; suspending=%u", (longlong_t)n,
scn->scn_dp->dp_spa->spa_name,
(int)scn->scn_phys.scn_ddt_class_max,
(int)scn->scn_suspending);
ASSERT(error == 0 || error == ENOENT); ASSERT(error == 0 || error == ENOENT);
ASSERT(error != ENOENT || ASSERT(error != ENOENT ||

View File

@ -47,8 +47,9 @@ typedef uLongf zlen_t;
#endif #endif
size_t static size_t
gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) zfs_gzip_compress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{ {
int ret; int ret;
zlen_t dstlen = d_len; zlen_t dstlen = d_len;
@ -82,8 +83,9 @@ gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
return ((size_t)dstlen); return ((size_t)dstlen);
} }
int static int
gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) zfs_gzip_decompress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{ {
(void) n; (void) n;
zlen_t dstlen = d_len; zlen_t dstlen = d_len;
@ -103,3 +105,6 @@ gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
return (0); return (0);
} }
ZFS_COMPRESS_WRAP_DECL(zfs_gzip_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_gzip_decompress)

View File

@ -52,8 +52,8 @@ int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
static kmem_cache_t *lz4_cache; static kmem_cache_t *lz4_cache;
size_t static size_t
lz4_compress_zfs(void *s_start, void *d_start, size_t s_len, zfs_lz4_compress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n) size_t d_len, int n)
{ {
(void) n; (void) n;
@ -80,8 +80,8 @@ lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
return (bufsiz + sizeof (bufsiz)); return (bufsiz + sizeof (bufsiz));
} }
int static int
lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, zfs_lz4_decompress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n) size_t d_len, int n)
{ {
(void) n; (void) n;
@ -100,6 +100,9 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
d_start, bufsiz, d_len) < 0); d_start, bufsiz, d_len) < 0);
} }
ZFS_COMPRESS_WRAP_DECL(zfs_lz4_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_lz4_decompress)
/* /*
* LZ4 API Description: * LZ4 API Description:
* *

View File

@ -45,8 +45,9 @@
#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) #define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
#define LEMPEL_SIZE 1024 #define LEMPEL_SIZE 1024
size_t static size_t
lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) zfs_lzjb_compress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{ {
(void) n; (void) n;
uchar_t *src = s_start; uchar_t *src = s_start;
@ -100,8 +101,9 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
return (dst - (uchar_t *)d_start); return (dst - (uchar_t *)d_start);
} }
int static int
lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) zfs_lzjb_decompress_buf(void *s_start, void *d_start,
size_t s_len, size_t d_len, int n)
{ {
(void) s_len, (void) n; (void) s_len, (void) n;
uchar_t *src = s_start; uchar_t *src = s_start;
@ -130,3 +132,6 @@ lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
} }
return (0); return (0);
} }
ZFS_COMPRESS_WRAP_DECL(zfs_lzjb_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_lzjb_decompress)

View File

@ -1040,16 +1040,34 @@ spa_change_guid_sync(void *arg, dmu_tx_t *tx)
* online when we do this, or else any vdevs that weren't present * online when we do this, or else any vdevs that weren't present
* would be orphaned from our pool. We are also going to issue a * would be orphaned from our pool. We are also going to issue a
* sysevent to update any watchers. * sysevent to update any watchers.
*
* The GUID of the pool will be changed to the value pointed to by guidp.
* The GUID may not be set to the reserved value of 0.
* The new GUID will be generated if guidp is NULL.
*/ */
int int
spa_change_guid(spa_t *spa) spa_change_guid(spa_t *spa, const uint64_t *guidp)
{ {
int error;
uint64_t guid; uint64_t guid;
int error;
mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock); mutex_enter(&spa_namespace_lock);
guid = spa_generate_guid(NULL);
if (guidp != NULL) {
guid = *guidp;
if (guid == 0) {
error = SET_ERROR(EINVAL);
goto out;
}
if (spa_guid_exists(guid, 0)) {
error = SET_ERROR(EEXIST);
goto out;
}
} else {
guid = spa_generate_guid(NULL);
}
error = dsl_sync_task(spa->spa_name, spa_change_guid_check, error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
@ -1068,6 +1086,7 @@ spa_change_guid(spa_t *spa)
spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
} }
out:
mutex_exit(&spa_namespace_lock); mutex_exit(&spa_namespace_lock);
mutex_exit(&spa->spa_vdev_top_lock); mutex_exit(&spa->spa_vdev_top_lock);
@ -7602,8 +7621,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
* The new device cannot have a higher alignment requirement * The new device cannot have a higher alignment requirement
* than the top-level vdev. * than the top-level vdev.
*/ */
if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); return (spa_vdev_exit(spa, newrootvd, txg,
ZFS_ERR_ASHIFT_MISMATCH));
}
/* /*
* RAIDZ-expansion-specific checks. * RAIDZ-expansion-specific checks.

View File

@ -645,7 +645,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
DATA_TYPE_INT32, zio->io_error, NULL); DATA_TYPE_INT32, zio->io_error, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
DATA_TYPE_INT32, zio->io_flags, NULL); DATA_TYPE_UINT64, zio->io_flags, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE, fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
DATA_TYPE_UINT32, zio->io_stage, NULL); DATA_TYPE_UINT32, zio->io_stage, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE, fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,

View File

@ -1794,17 +1794,45 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc)
return (error); return (error);
} }
/*
* inputs:
* zc_nvlist_src nvlist optionally containing ZPOOL_REGUID_GUID
* zc_nvlist_src_size size of the nvlist
*/
static int static int
zfs_ioc_pool_reguid(zfs_cmd_t *zc) zfs_ioc_pool_reguid(zfs_cmd_t *zc)
{ {
uint64_t *guidp = NULL;
nvlist_t *props = NULL;
spa_t *spa; spa_t *spa;
uint64_t guid;
int error; int error;
if (zc->zc_nvlist_src_size != 0) {
error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
zc->zc_iflags, &props);
if (error != 0)
return (error);
error = nvlist_lookup_uint64(props, ZPOOL_REGUID_GUID, &guid);
if (error == 0)
guidp = &guid;
else if (error == ENOENT)
guidp = NULL;
else
goto out;
}
error = spa_open(zc->zc_name, &spa, FTAG); error = spa_open(zc->zc_name, &spa, FTAG);
if (error == 0) { if (error == 0) {
error = spa_change_guid(spa); error = spa_change_guid(spa, guidp);
spa_close(spa, FTAG); spa_close(spa, FTAG);
} }
out:
if (props != NULL)
nvlist_free(props);
return (error); return (error);
} }
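
A userland caller that wants a specific GUID would pack it into the nvlist handed to the ioctl as zc_nvlist_src; passing no nvlist keeps the old behaviour of generating a random GUID. The sketch below only builds that nvlist with libnvpair. It assumes the OpenZFS development headers and library are available, and the fallback string for ZPOOL_REGUID_GUID is an assumption for illustration only.

#include <stdio.h>
#include <libnvpair.h>

#ifndef ZPOOL_REGUID_GUID
#define	ZPOOL_REGUID_GUID	"guid"	/* assumed key name, for illustration */
#endif

int
main(void)
{
	nvlist_t *nvl = NULL;
	uint64_t guid = 0x1234567890abcdefULL;	/* must not be 0 */

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
		return (1);
	if (nvlist_add_uint64(nvl, ZPOOL_REGUID_GUID, guid) != 0) {
		nvlist_free(nvl);
		return (1);
	}

	/*
	 * The packed nvlist would then be handed to the kernel as
	 * zc_nvlist_src (e.g. via the ZFS_IOC_POOL_REGUID ioctl);
	 * that plumbing is omitted here.
	 */
	printf("nvlist ready with requested guid %llx\n",
	    (unsigned long long)guid);
	nvlist_free(nvl);
	return (0);
}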

View File

@ -299,10 +299,13 @@ zio_fini(void)
* ========================================================================== * ==========================================================================
*/ */
#ifdef ZFS_DEBUG #if defined(ZFS_DEBUG) && defined(_KERNEL)
static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b; #define ZFS_ZIO_BUF_CANARY 1
#endif #endif
#ifdef ZFS_ZIO_BUF_CANARY
static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
/* /*
* Use empty space after the buffer to detect overflows. * Use empty space after the buffer to detect overflows.
* *
@ -314,7 +317,6 @@ static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
static void static void
zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
{ {
#ifdef ZFS_DEBUG
size_t off = P2ROUNDUP(size, sizeof (ulong_t)); size_t off = P2ROUNDUP(size, sizeof (ulong_t));
ulong_t *canary = p + off / sizeof (ulong_t); ulong_t *canary = p + off / sizeof (ulong_t);
size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;
@ -323,13 +325,11 @@ zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
asize = (c + 2) << SPA_MINBLOCKSHIFT; asize = (c + 2) << SPA_MINBLOCKSHIFT;
for (; off < asize; canary++, off += sizeof (ulong_t)) for (; off < asize; canary++, off += sizeof (ulong_t))
*canary = zio_buf_canary; *canary = zio_buf_canary;
#endif
} }
static void static void
zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
{ {
#ifdef ZFS_DEBUG
size_t off = P2ROUNDUP(size, sizeof (ulong_t)); size_t off = P2ROUNDUP(size, sizeof (ulong_t));
ulong_t *canary = p + off / sizeof (ulong_t); ulong_t *canary = p + off / sizeof (ulong_t);
size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;
@ -343,8 +343,8 @@ zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
*canary, zio_buf_canary); *canary, zio_buf_canary);
} }
} }
#endif
} }
#endif
/* /*
* Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
@ -363,7 +363,9 @@ zio_buf_alloc(size_t size)
#endif #endif
void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE); void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE);
#ifdef ZFS_ZIO_BUF_CANARY
zio_buf_put_canary(p, size, zio_buf_cache, c); zio_buf_put_canary(p, size, zio_buf_cache, c);
#endif
return (p); return (p);
} }
@ -381,7 +383,9 @@ zio_data_buf_alloc(size_t size)
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE); void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE);
#ifdef ZFS_ZIO_BUF_CANARY
zio_buf_put_canary(p, size, zio_data_buf_cache, c); zio_buf_put_canary(p, size, zio_data_buf_cache, c);
#endif
return (p); return (p);
} }
@ -395,7 +399,9 @@ zio_buf_free(void *buf, size_t size)
atomic_add_64(&zio_buf_cache_frees[c], 1); atomic_add_64(&zio_buf_cache_frees[c], 1);
#endif #endif
#ifdef ZFS_ZIO_BUF_CANARY
zio_buf_check_canary(buf, size, zio_buf_cache, c); zio_buf_check_canary(buf, size, zio_buf_cache, c);
#endif
kmem_cache_free(zio_buf_cache[c], buf); kmem_cache_free(zio_buf_cache[c], buf);
} }
@ -406,7 +412,9 @@ zio_data_buf_free(void *buf, size_t size)
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
#ifdef ZFS_ZIO_BUF_CANARY
zio_buf_check_canary(buf, size, zio_data_buf_cache, c); zio_buf_check_canary(buf, size, zio_data_buf_cache, c);
#endif
kmem_cache_free(zio_data_buf_cache[c], buf); kmem_cache_free(zio_data_buf_cache[c], buf);
} }
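
The canary scheme being gated here works by over-allocating: the slab backing a zio buffer is larger than the requested size, so the unused tail is filled with a known pattern at allocation time and verified at free time to catch overflows. A minimal user-space sketch of the same idea, assuming only libc; the sizes and the helper macro are illustrative.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define	CANARY		0xdeadc0dedead210bULL
#define	ROUNDUP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

int
main(void)
{
	size_t size = 100;	/* requested buffer size */
	size_t asize = 512;	/* size of the slab actually backing it */
	size_t off = ROUNDUP(size, sizeof (unsigned long long));
	unsigned char *buf = malloc(asize);

	if (buf == NULL)
		return (1);

	/* "put canary": fill the unused tail with the pattern */
	for (size_t o = off; o < asize; o += sizeof (unsigned long long)) {
		unsigned long long c = CANARY;
		memcpy(buf + o, &c, sizeof (c));
	}

	buf[size + 8] = 0x42;	/* simulated overflow past the buffer */

	/* "check canary": any mismatch means something wrote past size */
	for (size_t o = off; o < asize; o += sizeof (unsigned long long)) {
		unsigned long long c;
		memcpy(&c, buf + o, sizeof (c));
		if (c != CANARY)
			printf("overflow detected at offset %zu\n", o);
	}
	free(buf);
	return (0);
}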
@ -479,11 +487,9 @@ static void
zio_decompress(zio_t *zio, abd_t *data, uint64_t size) zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
{ {
if (zio->io_error == 0) { if (zio->io_error == 0) {
void *tmp = abd_borrow_buf(data, size);
int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
zio->io_abd, tmp, zio->io_size, size, zio->io_abd, data, zio->io_size, size,
&zio->io_prop.zp_complevel); &zio->io_prop.zp_complevel);
abd_return_buf_copy(data, tmp, size);
if (zio_injection_enabled && ret == 0) if (zio_injection_enabled && ret == 0)
ret = zio_handle_fault_injection(zio, EINVAL); ret = zio_handle_fault_injection(zio, EINVAL);
@ -530,17 +536,18 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
* from the indirect block. We decompress it now and * from the indirect block. We decompress it now and
* throw away the result after we are finished. * throw away the result after we are finished.
*/ */
tmp = zio_buf_alloc(lsize); abd_t *abd = abd_alloc_linear(lsize, B_TRUE);
ret = zio_decompress_data(BP_GET_COMPRESS(bp), ret = zio_decompress_data(BP_GET_COMPRESS(bp),
zio->io_abd, tmp, zio->io_size, lsize, zio->io_abd, abd, zio->io_size, lsize,
&zio->io_prop.zp_complevel); &zio->io_prop.zp_complevel);
if (ret != 0) { if (ret != 0) {
abd_free(abd);
ret = SET_ERROR(EIO); ret = SET_ERROR(EIO);
goto error; goto error;
} }
ret = zio_crypt_do_indirect_mac_checksum(B_FALSE, ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac); abd, lsize, BP_SHOULD_BYTESWAP(bp), mac);
zio_buf_free(tmp, lsize); abd_free(abd);
} else { } else {
ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
@ -1858,30 +1865,32 @@ zio_write_compress(zio_t *zio)
/* If it's a compressed write that is not raw, compress the buffer. */ /* If it's a compressed write that is not raw, compress the buffer. */
if (compress != ZIO_COMPRESS_OFF && if (compress != ZIO_COMPRESS_OFF &&
!(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
void *cbuf = NULL; abd_t *cabd = NULL;
if (abd_cmp_zero(zio->io_abd, lsize) == 0) if (abd_cmp_zero(zio->io_abd, lsize) == 0)
psize = 0; psize = 0;
else if (compress == ZIO_COMPRESS_EMPTY) else if (compress == ZIO_COMPRESS_EMPTY)
psize = lsize; psize = lsize;
else else
psize = zio_compress_data(compress, zio->io_abd, &cbuf, psize = zio_compress_data(compress, zio->io_abd, &cabd,
lsize, zp->zp_complevel); lsize, zp->zp_complevel);
if (psize == 0) { if (psize == 0) {
compress = ZIO_COMPRESS_OFF; compress = ZIO_COMPRESS_OFF;
} else if (psize >= lsize) { } else if (psize >= lsize) {
compress = ZIO_COMPRESS_OFF; compress = ZIO_COMPRESS_OFF;
if (cbuf != NULL) if (cabd != NULL)
zio_buf_free(cbuf, lsize); abd_free(cabd);
} else if (!zp->zp_dedup && !zp->zp_encrypt && } else if (!zp->zp_dedup && !zp->zp_encrypt &&
psize <= BPE_PAYLOAD_SIZE && psize <= BPE_PAYLOAD_SIZE &&
zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
void *cbuf = abd_borrow_buf_copy(cabd, lsize);
encode_embedded_bp_compressed(bp, encode_embedded_bp_compressed(bp,
cbuf, compress, lsize, psize); cbuf, compress, lsize, psize);
BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_TYPE(bp, zio->io_prop.zp_type);
BP_SET_LEVEL(bp, zio->io_prop.zp_level); BP_SET_LEVEL(bp, zio->io_prop.zp_level);
zio_buf_free(cbuf, lsize); abd_return_buf(cabd, cbuf, lsize);
abd_free(cabd);
BP_SET_LOGICAL_BIRTH(bp, zio->io_txg); BP_SET_LOGICAL_BIRTH(bp, zio->io_txg);
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
ASSERT(spa_feature_is_active(spa, ASSERT(spa_feature_is_active(spa,
@ -1900,14 +1909,12 @@ zio_write_compress(zio_t *zio)
psize); psize);
if (rounded >= lsize) { if (rounded >= lsize) {
compress = ZIO_COMPRESS_OFF; compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize); abd_free(cabd);
psize = lsize; psize = lsize;
} else { } else {
abd_t *cdata = abd_get_from_buf(cbuf, lsize); abd_zero_off(cabd, psize, rounded - psize);
abd_take_ownership_of_buf(cdata, B_TRUE);
abd_zero_off(cdata, psize, rounded - psize);
psize = rounded; psize = rounded;
zio_push_transform(zio, cdata, zio_push_transform(zio, cabd,
psize, lsize, NULL); psize, lsize, NULL);
} }
} }
@ -3254,17 +3261,21 @@ static void
zio_ddt_child_read_done(zio_t *zio) zio_ddt_child_read_done(zio_t *zio)
{ {
blkptr_t *bp = zio->io_bp; blkptr_t *bp = zio->io_bp;
ddt_t *ddt;
ddt_entry_t *dde = zio->io_private; ddt_entry_t *dde = zio->io_private;
ddt_phys_t *ddp;
zio_t *pio = zio_unique_parent(zio); zio_t *pio = zio_unique_parent(zio);
mutex_enter(&pio->io_lock); mutex_enter(&pio->io_lock);
ddp = ddt_phys_select(dde, bp); ddt = ddt_select(zio->io_spa, bp);
if (zio->io_error == 0)
ddt_phys_clear(ddp); /* this ddp doesn't need repair */
if (zio->io_error == 0 && dde->dde_repair_abd == NULL) if (zio->io_error == 0) {
dde->dde_repair_abd = zio->io_abd; ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
/* this phys variant doesn't need repair */
ddt_phys_clear(dde->dde_phys, v);
}
if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL)
dde->dde_io->dde_repair_abd = zio->io_abd;
else else
abd_free(zio->io_abd); abd_free(zio->io_abd);
mutex_exit(&pio->io_lock); mutex_exit(&pio->io_lock);
@ -3282,21 +3293,25 @@ zio_ddt_read_start(zio_t *zio)
if (zio->io_child_error[ZIO_CHILD_DDT]) { if (zio->io_child_error[ZIO_CHILD_DDT]) {
ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_t *ddt = ddt_select(zio->io_spa, bp);
ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp);
ddt_phys_t *ddp = dde->dde_phys; ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp);
ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); ddt_univ_phys_t *ddp = dde->dde_phys;
blkptr_t blk; blkptr_t blk;
ASSERT(zio->io_vsd == NULL); ASSERT(zio->io_vsd == NULL);
zio->io_vsd = dde; zio->io_vsd = dde;
if (ddp_self == NULL) if (v_self == DDT_PHYS_NONE)
return (zio); return (zio);
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { /* issue I/O for the other copies */
if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) for (int p = 0; p < DDT_NPHYS(ddt); p++) {
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
if (ddt_phys_birth(ddp, v) == 0 || v == v_self)
continue; continue;
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
&blk); ddt_bp_create(ddt->ddt_checksum, &dde->dde_key,
ddp, v, &blk);
zio_nowait(zio_read(zio, zio->io_spa, &blk, zio_nowait(zio_read(zio, zio->io_spa, &blk,
abd_alloc_for_io(zio->io_size, B_TRUE), abd_alloc_for_io(zio->io_size, B_TRUE),
zio->io_size, zio_ddt_child_read_done, dde, zio->io_size, zio_ddt_child_read_done, dde,
@ -3338,8 +3353,8 @@ zio_ddt_read_done(zio_t *zio)
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
return (NULL); return (NULL);
} }
if (dde->dde_repair_abd != NULL) { if (dde->dde_io->dde_repair_abd != NULL) {
abd_copy(zio->io_abd, dde->dde_repair_abd, abd_copy(zio->io_abd, dde->dde_io->dde_repair_abd,
zio->io_size); zio->io_size);
zio->io_child_error[ZIO_CHILD_DDT] = 0; zio->io_child_error[ZIO_CHILD_DDT] = 0;
} }
@ -3372,28 +3387,36 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
* loaded). * loaded).
*/ */
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { for (int p = 0; p < DDT_NPHYS(ddt); p++) {
zio_t *lio = dde->dde_lead_zio[p]; if (DDT_PHYS_IS_DITTO(ddt, p))
continue;
if (lio != NULL && do_raw) { if (dde->dde_io == NULL)
continue;
zio_t *lio = dde->dde_io->dde_lead_zio[p];
if (lio == NULL)
continue;
if (do_raw)
return (lio->io_size != zio->io_size || return (lio->io_size != zio->io_size ||
abd_cmp(zio->io_abd, lio->io_abd) != 0); abd_cmp(zio->io_abd, lio->io_abd) != 0);
} else if (lio != NULL) {
return (lio->io_orig_size != zio->io_orig_size || return (lio->io_orig_size != zio->io_orig_size ||
abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
}
} }
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { for (int p = 0; p < DDT_NPHYS(ddt); p++) {
ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v);
if (ddp->ddp_phys_birth != 0 && do_raw) { if (phys_birth != 0 && do_raw) {
blkptr_t blk = *zio->io_bp; blkptr_t blk = *zio->io_bp;
uint64_t psize; uint64_t psize;
abd_t *tmpabd; abd_t *tmpabd;
int error; int error;
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
psize = BP_GET_PSIZE(&blk); psize = BP_GET_PSIZE(&blk);
if (psize != zio->io_size) if (psize != zio->io_size)
@ -3416,13 +3439,13 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
abd_free(tmpabd); abd_free(tmpabd);
ddt_enter(ddt); ddt_enter(ddt);
return (error != 0); return (error != 0);
} else if (ddp->ddp_phys_birth != 0) { } else if (phys_birth != 0) {
arc_buf_t *abuf = NULL; arc_buf_t *abuf = NULL;
arc_flags_t aflags = ARC_FLAG_WAIT; arc_flags_t aflags = ARC_FLAG_WAIT;
blkptr_t blk = *zio->io_bp; blkptr_t blk = *zio->io_bp;
int error; int error;
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
if (BP_GET_LSIZE(&blk) != zio->io_orig_size) if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
return (B_TRUE); return (B_TRUE);
@ -3450,50 +3473,87 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
} }
static void static void
zio_ddt_child_write_ready(zio_t *zio) zio_ddt_child_write_done(zio_t *zio)
{ {
int p = zio->io_prop.zp_copies;
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
ddt_entry_t *dde = zio->io_private; ddt_entry_t *dde = zio->io_private;
ddt_phys_t *ddp = &dde->dde_phys[p];
zio_t *pio;
if (zio->io_error) zio_link_t *zl = NULL;
return; ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
ddt_univ_phys_t *ddp = dde->dde_phys;
ddt_enter(ddt); ddt_enter(ddt);
ASSERT(dde->dde_lead_zio[p] == zio); /* we're the lead, so once we're done there's no one else outstanding */
if (dde->dde_io->dde_lead_zio[p] == zio)
dde->dde_io->dde_lead_zio[p] = NULL;
ddt_phys_fill(ddp, zio->io_bp); ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys;
zio_link_t *zl = NULL; if (zio->io_error != 0) {
while ((pio = zio_walk_parents(zio, &zl)) != NULL) /*
ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); * The write failed, so we're about to abort the entire IO
* chain. We need to revert the entry back to what it was at
* the last time it was successfully extended.
*/
ddt_phys_copy(ddp, orig, v);
ddt_phys_clear(orig, v);
ddt_exit(ddt);
return;
}
/*
* We've successfully added new DVAs to the entry. Clear the saved
* state or, if there's still outstanding IO, remember it so we can
* revert to a known good state if that IO fails.
*/
if (dde->dde_io->dde_lead_zio[p] == NULL)
ddt_phys_clear(orig, v);
else
ddt_phys_copy(orig, ddp, v);
/*
* Add references for all dedup writes that were waiting on the
* physical one, skipping any other physical writes that are waiting.
*/
zio_t *pio;
zl = NULL;
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
ddt_phys_addref(ddp, v);
}
ddt_exit(ddt); ddt_exit(ddt);
} }
static void static void
zio_ddt_child_write_done(zio_t *zio) zio_ddt_child_write_ready(zio_t *zio)
{ {
int p = zio->io_prop.zp_copies;
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
ddt_entry_t *dde = zio->io_private; ddt_entry_t *dde = zio->io_private;
ddt_phys_t *ddp = &dde->dde_phys[p];
zio_link_t *zl = NULL;
ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
if (zio->io_error != 0)
return;
ddt_enter(ddt); ddt_enter(ddt);
ASSERT(ddp->ddp_refcnt == 0); ddt_phys_extend(dde->dde_phys, v, zio->io_bp);
ASSERT(dde->dde_lead_zio[p] == zio);
dde->dde_lead_zio[p] = NULL;
if (zio->io_error == 0) { zio_t *pio;
zio_link_t *zl = NULL; zl = NULL;
while (zio_walk_parents(zio, &zl) != NULL) while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
ddt_phys_addref(ddp); if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
} else { ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg);
ddt_phys_clear(ddp);
} }
ddt_exit(ddt); ddt_exit(ddt);
@ -3506,11 +3566,8 @@ zio_ddt_write(zio_t *zio)
blkptr_t *bp = zio->io_bp; blkptr_t *bp = zio->io_bp;
uint64_t txg = zio->io_txg; uint64_t txg = zio->io_txg;
zio_prop_t *zp = &zio->io_prop; zio_prop_t *zp = &zio->io_prop;
int p = zp->zp_copies;
zio_t *cio = NULL;
ddt_t *ddt = ddt_select(spa, bp); ddt_t *ddt = ddt_select(spa, bp);
ddt_entry_t *dde; ddt_entry_t *dde;
ddt_phys_t *ddp;
ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_DEDUP(bp));
ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
@ -3518,7 +3575,7 @@ zio_ddt_write(zio_t *zio)
ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
ddt_enter(ddt); ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_TRUE); dde = ddt_lookup(ddt, bp);
if (dde == NULL) { if (dde == NULL) {
/* DDT size is over its quota so no new entries */ /* DDT size is over its quota so no new entries */
zp->zp_dedup = B_FALSE; zp->zp_dedup = B_FALSE;
@ -3528,7 +3585,6 @@ zio_ddt_write(zio_t *zio)
ddt_exit(ddt); ddt_exit(ddt);
return (zio); return (zio);
} }
ddp = &dde->dde_phys[p];
if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
/* /*
@ -3553,29 +3609,227 @@ zio_ddt_write(zio_t *zio)
return (zio); return (zio);
} }
if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies);
if (ddp->ddp_phys_birth != 0) ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
ddt_bp_fill(ddp, bp, txg); ddt_univ_phys_t *ddp = dde->dde_phys;
if (dde->dde_lead_zio[p] != NULL)
zio_add_child(zio, dde->dde_lead_zio[p]);
else
ddt_phys_addref(ddp);
} else if (zio->io_bp_override) {
ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
ddt_phys_fill(ddp, bp);
ddt_phys_addref(ddp);
} else {
cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, zp,
zio_ddt_child_write_ready, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); /*
dde->dde_lead_zio[p] = cio; * In the common cases, at this point we have a regular BP with no
* allocated DVAs, and the corresponding DDT entry for its checksum.
* Our goal is to fill the BP with enough DVAs to satisfy its copies=
* requirement.
*
* One of three things needs to happen to fulfill this:
*
* - if the DDT entry has enough DVAs to satisfy the BP, we just copy
* them out of the entry and return;
*
* - if the DDT entry has no DVAs (i.e. it's brand new), then we have to
* issue the write as normal so that DVAs can be allocated and the
* data land on disk. We then copy the DVAs into the DDT entry on
* return.
*
* - if the DDT entry has some DVAs, but too few, we have to issue the
* write, adjusted to allocate fewer copies. When it returns, we
* add the new DVAs to the DDT entry, and update the BP to have the
* full amount it originally requested.
*
* In all cases, if there's already a writing IO in flight, we need to
* defer the action until after the write is done. If our action is to
* write, we need to adjust our request for additional DVAs to match
* what will be in the DDT entry after it completes. In this way every
* IO can be guaranteed to receive enough DVAs simply by joining the
* end of the chain and letting the sequence play out.
*/
/*
* Number of DVAs in the DDT entry. If the BP is encrypted we ignore
* the third one as normal.
*/
int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp));
IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0);
/* Number of DVAs requested by the IO. */
uint8_t need_dvas = zp->zp_copies;
/*
* What we do next depends on whether or not there's IO outstanding that
* will update this entry.
*/
if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) {
/*
* No IO outstanding, so we only need to worry about ourselves.
*/
/*
* Override BPs bring their own DVAs and their own problems.
*/
if (zio->io_bp_override) {
/*
* For a brand-new entry, all the work has been done
* for us, and we can just fill it out from the provided
* block and leave.
*/
if (have_dvas == 0) {
ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
ddt_phys_extend(ddp, v, bp);
ddt_phys_addref(ddp, v);
ddt_exit(ddt);
return (zio);
}
/*
* If we already have this entry, then we want to treat
* it like a regular write. To do this we just wipe
* out the provided DVAs and proceed like a regular
* write.
*
* Even if there are some DVAs in the entry, we still
* have to clear them out. We can't use them to fill
* out the dedup entry, as they are all referenced
* together by a bp already on disk, and will be freed
* as a group.
*/
BP_ZERO_DVAS(bp);
BP_SET_BIRTH(bp, 0, 0);
}
/*
* If there are enough DVAs in the entry to service our request,
* then we can just use them as-is.
*/
if (have_dvas >= need_dvas) {
ddt_bp_fill(ddp, v, bp, txg);
ddt_phys_addref(ddp, v);
ddt_exit(ddt);
return (zio);
}
/*
* Otherwise, we have to issue IO to fill the entry up to the
* amount we need.
*/
need_dvas -= have_dvas;
} else {
/*
* There's a write in-flight. If there's already enough DVAs on
* the entry, then either there were already enough to start
* with, or the in-flight IO is between READY and DONE, and so
* has extended the entry with new DVAs. Either way, we don't
* need to do anything, we can just slot in behind it.
*/
if (zio->io_bp_override) {
/*
* If there's a write out, then we're soon going to
* have our own copies of this block, so clear out the
* override block and treat it as a regular dedup
* write. See comment above.
*/
BP_ZERO_DVAS(bp);
BP_SET_BIRTH(bp, 0, 0);
}
if (have_dvas >= need_dvas) {
/*
* A minor point: there might already be enough
* committed DVAs in the entry to service our request,
* but we don't know which are completed and which are
* allocated but not yet written. In this case, should
* the IO for the new DVAs fail, we will be on the end
* of the IO chain and will also receive an error, even
* though our request could have been serviced.
*
* This is an extremely rare case, as it requires the
* original block to be copied with a request for a
* larger number of DVAs, then copied again requesting
* the same (or already fulfilled) number of DVAs while
* the first request is active, and then that first
* request errors. In return, the logic required to
* catch and handle it is complex. For now, I'm just
* not going to bother with it.
*/
/*
* We always fill the bp here as we may have arrived
* after the in-flight write has passed READY, and so
* missed out.
*/
ddt_bp_fill(ddp, v, bp, txg);
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
ddt_exit(ddt);
return (zio);
}
/*
* There's not enough in the entry yet, so we need to look at
* the write in-flight and see how many DVAs it will have once
* it completes.
*
* The in-flight write has potentially had its copies request
* reduced (if we're filling out an existing entry), so we need
* to reach in and get the original write to find out what it is
* expecting.
*
* Note that the parent of the lead zio will always have the
* highest zp_copies of any zio in the chain, because ones that
* can be serviced without additional IO are always added to
* the back of the chain.
*/
zio_link_t *zl = NULL;
zio_t *pio =
zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl);
ASSERT(pio);
uint8_t parent_dvas = pio->io_prop.zp_copies;
if (parent_dvas >= need_dvas) {
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
ddt_exit(ddt);
return (zio);
}
/*
* Still not enough, so we will need to issue to get the
* shortfall.
*/
need_dvas -= parent_dvas;
} }
/*
* We need to write. We will create a new write with the copies
* property adjusted to match the number of DVAs we need to
* grow the DDT entry by to satisfy the request.
*/
zio_prop_t czp = *zp;
czp.zp_copies = need_dvas;
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, &czp,
zio_ddt_child_write_ready, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
/*
* We are the new lead zio, because our parent has the highest
* zp_copies that has been requested for this entry so far.
*/
ddt_alloc_entry_io(dde);
if (dde->dde_io->dde_lead_zio[p] == NULL) {
/*
* First time out, take a copy of the stable entry to revert
* to if there's an error (see zio_ddt_child_write_done())
*/
ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v);
} else {
/*
* Make the existing chain our child, because it cannot
* complete until we have.
*/
zio_add_child(cio, dde->dde_io->dde_lead_zio[p]);
}
dde->dde_io->dde_lead_zio[p] = cio;
ddt_exit(ddt); ddt_exit(ddt);
zio_nowait(cio); zio_nowait(cio);
@ -3591,18 +3845,17 @@ zio_ddt_free(zio_t *zio)
spa_t *spa = zio->io_spa; spa_t *spa = zio->io_spa;
blkptr_t *bp = zio->io_bp; blkptr_t *bp = zio->io_bp;
ddt_t *ddt = ddt_select(spa, bp); ddt_t *ddt = ddt_select(spa, bp);
ddt_entry_t *dde; ddt_entry_t *dde = NULL;
ddt_phys_t *ddp;
ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_DEDUP(bp));
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ddt_enter(ddt); ddt_enter(ddt);
freedde = dde = ddt_lookup(ddt, bp, B_TRUE); freedde = dde = ddt_lookup(ddt, bp);
if (dde) { if (dde) {
ddp = ddt_phys_select(dde, bp); ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
if (ddp) if (v != DDT_PHYS_NONE)
ddt_phys_decref(ddp); ddt_phys_decref(dde->dde_phys, v);
} }
ddt_exit(ddt); ddt_exit(ddt);

View File

@ -29,7 +29,7 @@
/* /*
* Copyright (c) 2013, 2018 by Delphix. All rights reserved. * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
* Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, 2024, Klara, Inc.
* Copyright (c) 2019, Allan Jude * Copyright (c) 2019, Allan Jude
*/ */
@ -48,26 +48,42 @@ static unsigned long zio_decompress_fail_fraction = 0;
/* /*
* Compression vectors. * Compression vectors.
*
* NOTE: DO NOT CHANGE THE NAMES OF THESE COMPRESSION FUNCTIONS.
* THEY ARE USED AS ZAP KEY NAMES BY FAST DEDUP AND THEREFORE
* PART OF THE ON-DISK FORMAT.
*/ */
zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
{"inherit", 0, NULL, NULL, NULL}, {"inherit", 0, NULL, NULL, NULL},
{"on", 0, NULL, NULL, NULL}, {"on", 0, NULL, NULL, NULL},
{"uncompressed", 0, NULL, NULL, NULL}, {"uncompressed", 0, NULL, NULL, NULL},
{"lzjb", 0, lzjb_compress, lzjb_decompress, NULL}, {"lzjb", 0,
{"empty", 0, NULL, NULL, NULL}, zfs_lzjb_compress, zfs_lzjb_decompress, NULL},
{"gzip-1", 1, gzip_compress, gzip_decompress, NULL}, {"empty", 0, NULL, NULL, NULL},
{"gzip-2", 2, gzip_compress, gzip_decompress, NULL}, {"gzip-1", 1,
{"gzip-3", 3, gzip_compress, gzip_decompress, NULL}, zfs_gzip_compress, zfs_gzip_decompress, NULL},
{"gzip-4", 4, gzip_compress, gzip_decompress, NULL}, {"gzip-2", 2,
{"gzip-5", 5, gzip_compress, gzip_decompress, NULL}, zfs_gzip_compress, zfs_gzip_decompress, NULL},
{"gzip-6", 6, gzip_compress, gzip_decompress, NULL}, {"gzip-3", 3,
{"gzip-7", 7, gzip_compress, gzip_decompress, NULL}, zfs_gzip_compress, zfs_gzip_decompress, NULL},
{"gzip-8", 8, gzip_compress, gzip_decompress, NULL}, {"gzip-4", 4,
{"gzip-9", 9, gzip_compress, gzip_decompress, NULL}, zfs_gzip_compress, zfs_gzip_decompress, NULL},
{"zle", 64, zle_compress, zle_decompress, NULL}, {"gzip-5", 5,
{"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL}, zfs_gzip_compress, zfs_gzip_decompress, NULL},
{"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress_wrap, {"gzip-6", 6,
zfs_zstd_decompress, zfs_zstd_decompress_level}, zfs_gzip_compress, zfs_gzip_decompress, NULL},
{"gzip-7", 7,
zfs_gzip_compress, zfs_gzip_decompress, NULL},
{"gzip-8", 8,
zfs_gzip_compress, zfs_gzip_decompress, NULL},
{"gzip-9", 9,
zfs_gzip_compress, zfs_gzip_decompress, NULL},
{"zle", 64,
zfs_zle_compress, zfs_zle_decompress, NULL},
{"lz4", 0,
zfs_lz4_compress, zfs_lz4_decompress, NULL},
{"zstd", ZIO_ZSTD_LEVEL_DEFAULT,
zfs_zstd_compress, zfs_zstd_decompress, zfs_zstd_decompress_level},
}; };
uint8_t uint8_t
@ -112,20 +128,16 @@ zio_compress_select(spa_t *spa, enum zio_compress child,
} }
size_t size_t
zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len, zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst, size_t s_len,
uint8_t level) uint8_t level)
{ {
size_t c_len, d_len; size_t c_len, d_len;
uint8_t complevel; uint8_t complevel;
zio_compress_info_t *ci = &zio_compress_table[c]; zio_compress_info_t *ci = &zio_compress_table[c];
ASSERT3U(c, <, ZIO_COMPRESS_FUNCTIONS);
ASSERT3U(ci->ci_compress, !=, NULL); ASSERT3U(ci->ci_compress, !=, NULL);
ASSERT3U(s_len, >, 0); ASSERT3U(s_len, >, 0);
/* Compress at least 12.5% */
d_len = s_len - (s_len >> 3);
complevel = ci->ci_level; complevel = ci->ci_level;
if (c == ZIO_COMPRESS_ZSTD) { if (c == ZIO_COMPRESS_ZSTD) {
@ -142,12 +154,12 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len,
} }
if (*dst == NULL) if (*dst == NULL)
*dst = zio_buf_alloc(s_len); *dst = abd_alloc_sametype(src, s_len);
/* No compression algorithms can read from ABDs directly */ /* Compress at least 12.5%, but limit to the size of the dest abd. */
void *tmp = abd_borrow_buf_copy(src, s_len); d_len = MIN(s_len - (s_len >> 3), abd_get_size(*dst));
c_len = ci->ci_compress(tmp, *dst, s_len, d_len, complevel);
abd_return_buf(src, tmp, s_len); c_len = ci->ci_compress(src, *dst, s_len, d_len, complevel);
if (c_len > d_len) if (c_len > d_len)
return (s_len); return (s_len);
@ -157,26 +169,18 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len,
} }
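For context, a minimal standalone sketch (not part of the change) of how the relocated "compress at least 12.5%" rule now combines with the destination ABD size; the 128 KiB figures are example values only:

#include <stdio.h>
#include <stddef.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	size_t s_len = 128 * 1024;	/* logical block size */
	size_t dst_size = 128 * 1024;	/* what abd_get_size(*dst) would report */

	/* Must save at least 12.5% and must fit in the destination ABD. */
	size_t d_len = MIN(s_len - (s_len >> 3), dst_size);

	/* Prints 114688 (112 KiB); any c_len above this is stored uncompressed. */
	printf("max useful compressed size: %zu\n", d_len);
	return (0);
}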
int int
zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, zio_decompress_data(enum zio_compress c, abd_t *src, abd_t *dst,
size_t s_len, size_t d_len, uint8_t *level) size_t s_len, size_t d_len, uint8_t *level)
{ {
zio_compress_info_t *ci = &zio_compress_table[c]; zio_compress_info_t *ci = &zio_compress_table[c];
if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
return (SET_ERROR(EINVAL)); return (SET_ERROR(EINVAL));
int err;
if (ci->ci_decompress_level != NULL && level != NULL) if (ci->ci_decompress_level != NULL && level != NULL)
return (ci->ci_decompress_level(src, dst, s_len, d_len, level)); err = ci->ci_decompress_level(src, dst, s_len, d_len, level);
else
return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); err = ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
}
int
zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
size_t s_len, size_t d_len, uint8_t *level)
{
void *tmp = abd_borrow_buf_copy(src, s_len);
int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len, level);
abd_return_buf(src, tmp, s_len);
/* /*
* Decompression shouldn't fail, because we've already verified * Decompression shouldn't fail, because we've already verified
@ -185,9 +189,9 @@ zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
*/ */
if (zio_decompress_fail_fraction != 0 && if (zio_decompress_fail_fraction != 0 &&
random_in_range(zio_decompress_fail_fraction) == 0) random_in_range(zio_decompress_fail_fraction) == 0)
ret = SET_ERROR(EINVAL); err = SET_ERROR(EINVAL);
return (ret); return (err);
} }
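As a hedged caller sketch (not from the patch), the buffer-to-ABD transition looks roughly like this; cbuf, dbuf and the lengths are assumed to be supplied by the caller, and only calls that appear elsewhere in this change are used:

#include <sys/abd.h>
#include <sys/zio_compress.h>

static int
decompress_raw_buffers(void *cbuf, size_t c_len, void *dbuf, size_t d_len)
{
	abd_t cabd, dabd;
	int err;

	/* Wrap the raw buffers in stack ABDs, as the zstd early-abort path does. */
	abd_get_from_buf_struct(&cabd, cbuf, c_len);
	abd_get_from_buf_struct(&dabd, dbuf, d_len);

	/* NULL level: the stored zstd level is not needed back. */
	err = zio_decompress_data(ZIO_COMPRESS_LZ4, &cabd, &dabd,
	    c_len, d_len, NULL);

	abd_free(&dabd);
	abd_free(&cabd);
	return (err);
}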
int int

View File

@ -34,8 +34,9 @@
#include <sys/sysmacros.h> #include <sys/sysmacros.h>
#include <sys/zio_compress.h> #include <sys/zio_compress.h>
size_t static size_t
zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) zfs_zle_compress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{ {
uchar_t *src = s_start; uchar_t *src = s_start;
uchar_t *dst = d_start; uchar_t *dst = d_start;
@ -64,8 +65,9 @@ zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
return (src == s_end ? dst - (uchar_t *)d_start : s_len); return (src == s_end ? dst - (uchar_t *)d_start : s_len);
} }
int static int
zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) zfs_zle_decompress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{ {
uchar_t *src = s_start; uchar_t *src = s_start;
uchar_t *dst = d_start; uchar_t *dst = d_start;
@ -89,3 +91,6 @@ zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
} }
return (dst == d_end ? 0 : -1); return (dst == d_end ? 0 : -1);
} }
ZFS_COMPRESS_WRAP_DECL(zfs_zle_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zle_decompress)
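To illustrate the idea behind these routines, here is a self-contained toy zero-run encoder; it is not the on-disk ZLE format (which biases run lengths by the level parameter, 64 in the compression table above) and exists only to show the principle:

#include <stdio.h>
#include <string.h>

/* Toy encoding: high bit set = run of that many zero bytes, else literals. */
static size_t
toy_zle_compress(const unsigned char *src, size_t s_len,
    unsigned char *dst, size_t d_len)
{
	size_t si = 0, di = 0;

	while (si < s_len) {
		size_t run = 1;

		if (src[si] == 0) {
			while (si + run < s_len && src[si + run] == 0 &&
			    run < 127)
				run++;
			if (di + 1 > d_len)
				return (s_len);		/* incompressible */
			dst[di++] = 0x80 | (unsigned char)run;
		} else {
			while (si + run < s_len && src[si + run] != 0 &&
			    run < 127)
				run++;
			if (di + 1 + run > d_len)
				return (s_len);
			dst[di++] = (unsigned char)run;
			memcpy(&dst[di], &src[si], run);
			di += run;
		}
		si += run;
	}
	return (di);
}

int
main(void)
{
	unsigned char in[64] = { 0 }, out[64];

	memcpy(in, "zfs", 3);	/* 3 literal bytes followed by 61 zero bytes */
	printf("compressed %zu -> %zu bytes\n", sizeof (in),
	    toy_zle_compress(in, sizeof (in), out, sizeof (out)));
	return (0);
}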

View File

@ -429,68 +429,9 @@ zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
return (1); return (1);
} }
size_t
zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
int level)
{
int16_t zstd_level;
if (zstd_enum_to_level(level, &zstd_level)) {
ZSTDSTAT_BUMP(zstd_stat_com_inval);
return (s_len);
}
/*
* A zstd early abort heuristic.
*
* - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
* 128k), don't try any of this, just go.
* (because experimentally that was a reasonable cutoff for a perf win
* with tiny ratio change)
* - First, we try LZ4 compression, and if it doesn't early abort, we
* jump directly to whatever compression level we intended to try.
* - Second, we try zstd-1 - if that errors out (usually, but not
* exclusively, if it would overflow), we give up early.
*
* If it works, instead we go on and compress anyway.
*
* Why two passes? LZ4 alone gets you a lot of the way, but on highly
* compressible data, it was losing up to 8.5% of the compressed
* savings versus no early abort, and all the zstd-fast levels are
* worse indications on their own than LZ4, and don't improve the LZ4
* pass noticeably if stacked like this.
*/
size_t actual_abort_size = zstd_abort_size;
if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
s_len >= actual_abort_size) {
int pass_len = 1;
pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0);
if (pass_len < d_len) {
ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
goto keep_trying;
}
ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);
pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
ZIO_ZSTD_LEVEL_1);
if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
return (s_len);
}
ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
} else {
ZSTDSTAT_BUMP(zstd_stat_passignored);
if (s_len < actual_abort_size) {
ZSTDSTAT_BUMP(zstd_stat_passignored_size);
}
}
keep_trying:
return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));
}
/* Compress block using zstd */ /* Compress block using zstd */
size_t static size_t
zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,
int level) int level)
{ {
size_t c_len; size_t c_len;
@ -594,9 +535,73 @@ zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
return (c_len + sizeof (*hdr)); return (c_len + sizeof (*hdr));
} }
static size_t
zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len,
int level)
{
int16_t zstd_level;
if (zstd_enum_to_level(level, &zstd_level)) {
ZSTDSTAT_BUMP(zstd_stat_com_inval);
return (s_len);
}
/*
* A zstd early abort heuristic.
*
* - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
* 128k), don't try any of this, just go.
* (because experimentally that was a reasonable cutoff for a perf win
* with tiny ratio change)
* - First, we try LZ4 compression, and if it doesn't early abort, we
* jump directly to whatever compression level we intended to try.
* - Second, we try zstd-1 - if that errors out (usually, but not
* exclusively, if it would overflow), we give up early.
*
* If it works, instead we go on and compress anyway.
*
* Why two passes? LZ4 alone gets you a lot of the way, but on highly
* compressible data, it was losing up to 8.5% of the compressed
* savings versus no early abort, and all the zstd-fast levels are
* worse indications on their own than LZ4, and don't improve the LZ4
* pass noticeably if stacked like this.
*/
size_t actual_abort_size = zstd_abort_size;
if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
s_len >= actual_abort_size) {
int pass_len = 1;
abd_t sabd, dabd;
abd_get_from_buf_struct(&sabd, s_start, s_len);
abd_get_from_buf_struct(&dabd, d_start, d_len);
pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
abd_free(&dabd);
abd_free(&sabd);
if (pass_len < d_len) {
ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
goto keep_trying;
}
ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);
pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len,
d_len, ZIO_ZSTD_LEVEL_1);
if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
return (s_len);
}
ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
} else {
ZSTDSTAT_BUMP(zstd_stat_passignored);
if (s_len < actual_abort_size) {
ZSTDSTAT_BUMP(zstd_stat_passignored_size);
}
}
keep_trying:
return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level));
}
/* Decompress block using zstd and return its stored level */ /* Decompress block using zstd and return its stored level */
int static int
zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, uint8_t *level) size_t d_len, uint8_t *level)
{ {
ZSTD_DCtx *dctx; ZSTD_DCtx *dctx;
@ -671,15 +676,20 @@ zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
} }
/* Decompress datablock using zstd */ /* Decompress datablock using zstd */
int static int
zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
int level __maybe_unused) size_t d_len, int level __maybe_unused)
{ {
return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len, return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
NULL)); NULL));
} }
ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress)
ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level)
/* Allocator for zstd compression context using mempool_allocator */ /* Allocator for zstd compression context using mempool_allocator */
static void * static void *
zstd_alloc(void *opaque __maybe_unused, size_t size) zstd_alloc(void *opaque __maybe_unused, size_t size)

View File

@ -145,6 +145,24 @@ for kernel_version in %{?kernel_versions}; do
%{?kernel_cc} \ %{?kernel_cc} \
%{?kernel_ld} \ %{?kernel_ld} \
%{?kernel_llvm} %{?kernel_llvm}
# Pre-6.10 kernel builds didn't need to copy over the source files to the
# build directory. However, we do need to do it post-6.10 due to
# these commits:
#
# b1992c3772e6 kbuild: use $(src) instead of $(srctree)/$(src) for source
# directory
#
# 9a0ebe5011f4 kbuild: use $(obj)/ instead of $(src)/ for common pattern
# rules
#
# Note that kmodtool actually copies over the source into the build
# directory, so what we're doing here is normal. For efficiency reasons
# though we just use hardlinks instead of copying.
#
# See https://github.com/openzfs/zfs/issues/16439 for more info.
cp -lR ../%{module}-%{version}/module/* module/
make %{?_smp_mflags} make %{?_smp_mflags}
cd .. cd ..
done done

View File

@ -514,6 +514,10 @@ tags = ['functional', 'cli_root', 'zpool_offline']
tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] tests = ['zpool_online_001_pos', 'zpool_online_002_neg']
tags = ['functional', 'cli_root', 'zpool_online'] tags = ['functional', 'cli_root', 'zpool_online']
[tests/functional/cli_root/zpool_reguid]
tests = ['zpool_reguid_001_pos', 'zpool_reguid_002_neg']
tags = ['functional', 'cli_root', 'zpool_reguid']
[tests/functional/cli_root/zpool_remove] [tests/functional/cli_root/zpool_remove]
tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos',
'zpool_remove_003_pos'] 'zpool_remove_003_pos']
@ -672,7 +676,9 @@ post =
tags = ['functional', 'deadman'] tags = ['functional', 'deadman']
[tests/functional/dedup] [tests/functional/dedup]
tests = ['dedup_quota'] tests = ['dedup_legacy_create', 'dedup_fdt_create', 'dedup_fdt_import',
'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade',
'dedup_legacy_fdt_mixed', 'dedup_quota']
pre = pre =
post = post =
tags = ['functional', 'dedup'] tags = ['functional', 'dedup']

View File

@ -24,7 +24,7 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/badsend
scripts_zfs_tests_bin_PROGRAMS += %D%/btree_test scripts_zfs_tests_bin_PROGRAMS += %D%/btree_test
%C%_btree_test_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) %C%_btree_test_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
%C%_btree_test_LDADD = \ %C%_btree_test_LDADD = \
libzpool.la \ libzpool.la \
libzfs_core.la libzfs_core.la

View File

@ -31,6 +31,7 @@ DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift
DDT_ZAP_DEFAULT_BS dedup.ddt_zap_default_bs ddt_zap_default_bs DDT_ZAP_DEFAULT_BS dedup.ddt_zap_default_bs ddt_zap_default_bs
DDT_ZAP_DEFAULT_IBS dedup.ddt_zap_default_ibs ddt_zap_default_ibs DDT_ZAP_DEFAULT_IBS dedup.ddt_zap_default_ibs ddt_zap_default_ibs
DDT_DATA_IS_SPECIAL ddt_data_is_special zfs_ddt_data_is_special DDT_DATA_IS_SPECIAL ddt_data_is_special zfs_ddt_data_is_special
DEDUP_LOG_TXG_MAX dedup.log_txg_max zfs_dedup_log_txg_max
DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms
DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second
DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode

View File

@ -1424,6 +1424,12 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/deadman/deadman_zio.ksh \ functional/deadman/deadman_zio.ksh \
functional/dedup/cleanup.ksh \ functional/dedup/cleanup.ksh \
functional/dedup/setup.ksh \ functional/dedup/setup.ksh \
functional/dedup/dedup_fdt_create.ksh \
functional/dedup/dedup_fdt_import.ksh \
functional/dedup/dedup_legacy_create.ksh \
functional/dedup/dedup_legacy_import.ksh \
functional/dedup/dedup_legacy_fdt_upgrade.ksh \
functional/dedup/dedup_legacy_fdt_mixed.ksh \
functional/dedup/dedup_quota.ksh \ functional/dedup/dedup_quota.ksh \
functional/delegate/cleanup.ksh \ functional/delegate/cleanup.ksh \
functional/delegate/setup.ksh \ functional/delegate/setup.ksh \

View File

@ -55,7 +55,7 @@ function display_status
((ret |= $?)) ((ret |= $?))
typeset mntpnt=$(get_prop mountpoint $pool) typeset mntpnt=$(get_prop mountpoint $pool)
dd if=/dev/random of=$mntpnt/testfile.$$ & dd if=/dev/urandom of=$mntpnt/testfile.$$ &
typeset pid=$! typeset pid=$!
zpool iostat -v 1 3 > /dev/null zpool iostat -v 1 3 > /dev/null

View File

@ -54,7 +54,7 @@ log_must truncate -s 1G $VDEV
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV
log_must dd if=/dev/random of=/$TESTPOOL/file1 bs=1 count=1000 log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1 count=1000
ulimit -f 2 ulimit -f 2
log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 all log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 all

View File

@ -109,5 +109,6 @@ if is_linux || is_freebsd; then
"feature@block_cloning" "feature@block_cloning"
"feature@vdev_zaps_v2" "feature@vdev_zaps_v2"
"feature@raidz_expansion" "feature@raidz_expansion"
"feature@fast_dedup"
) )
fi fi

View File

@ -95,6 +95,10 @@ while (( i < 16384 )); do
done done
((i += 1)) ((i += 1))
done done
# Force the DDT logs to disk with a scrub so they can be prefetched
log_must zpool scrub -w $TESTPOOL
log_note "Dataset generation completed." log_note "Dataset generation completed."
typeset -A generated typeset -A generated

View File

@ -0,0 +1,6 @@
pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_reguid
dist_pkgdata_SCRIPTS = \
setup.ksh \
cleanup.ksh \
zpool_reguid_001_pos.ksh \
zpool_reguid_002_neg.ksh

View File

@ -0,0 +1,32 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
. $STF_SUITE/include/libtest.shlib
verify_runnable "global"
default_cleanup

View File

@ -0,0 +1,34 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
. $STF_SUITE/include/libtest.shlib
verify_runnable "global"
DISK=${DISKS%% *}
default_setup $DISK

Some files were not shown because too many files have changed in this diff.