Merge branch 'openzfs/master' into NAS-130821-2
Signed-off-by: Umer Saleem <usaleem@ixsystems.com>
commit cfa06493fa
META

@@ -6,5 +6,5 @@ Release: 1
 Release-Tags: relext
 License: CDDL
 Author: OpenZFS
-Linux-Maximum: 6.9
+Linux-Maximum: 6.10
 Linux-Minimum: 3.10

@@ -24,7 +24,7 @@ zfs_ids_to_path_LDADD = \
 	libzfs.la
 
-zhack_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+zhack_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
 
 sbin_PROGRAMS += zhack
 CPPCHECKTARGETS += zhack
 
@@ -39,7 +39,7 @@ zhack_LDADD = \
 
 
 ztest_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS)
-ztest_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+ztest_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
 
 sbin_PROGRAMS += ztest
 CPPCHECKTARGETS += ztest

@@ -269,8 +269,7 @@ main(int argc, char **argv)
 			return (MOUNT_USAGE);
 	}
 
-	if (!zfsutil || sloppy ||
-	    libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
+	if (sloppy || libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
 		zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt);
 	}
 
@@ -337,7 +336,7 @@ main(int argc, char **argv)
 	    dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt);
 
 	if (!fake) {
-		if (zfsutil && !sloppy &&
+		if (!remount && !sloppy &&
 		    !libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
 			error = zfs_mount_at(zhp, mntopts, mntflags, mntpoint);
 			if (error) {

@@ -1,5 +1,5 @@
 raidz_test_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS)
-raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
 
 bin_PROGRAMS += raidz_test
 CPPCHECKTARGETS += raidz_test

@@ -1,4 +1,4 @@
-zdb_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
+zdb_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
 zdb_CFLAGS = $(AM_CFLAGS) $(LIBCRYPTO_CFLAGS)
 
 sbin_PROGRAMS += zdb

cmd/zdb/zdb.c

@@ -33,7 +33,7 @@
  * under sponsorship from the FreeBSD Foundation.
  * Copyright (c) 2021 Allan Jude
  * Copyright (c) 2021 Toomas Soome <tsoome@me.com>
- * Copyright (c) 2023, Klara Inc.
+ * Copyright (c) 2023, 2024, Klara Inc.
  * Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
  */
 
@@ -1914,23 +1914,25 @@ dump_log_spacemaps(spa_t *spa)
 }
 
 static void
-dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
+    uint64_t index)
 {
-	const ddt_phys_t *ddp = dde->dde_phys;
-	const ddt_key_t *ddk = &dde->dde_key;
-	const char *types[4] = { "ditto", "single", "double", "triple" };
+	const ddt_key_t *ddk = &ddlwe->ddlwe_key;
 	char blkbuf[BP_SPRINTF_LEN];
 	blkptr_t blk;
 	int p;
 
-	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
-		if (ddp->ddp_phys_birth == 0)
+	for (p = 0; p < DDT_NPHYS(ddt); p++) {
+		const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
+		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+
+		if (ddt_phys_birth(ddp, v) == 0)
 			continue;
-		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
-		(void) printf("index %llx refcnt %llu %s %s\n",
-		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
-		    types[p], blkbuf);
+		(void) printf("index %llx refcnt %llu phys %d %s\n",
+		    (u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v),
+		    p, blkbuf);
 	}
 }
 
@@ -1956,11 +1958,37 @@ dump_dedup_ratio(const ddt_stat_t *dds)
 	    dedup, compress, copies, dedup * compress / copies);
 }
 
+static void
+dump_ddt_log(ddt_t *ddt)
+{
+	for (int n = 0; n < 2; n++) {
+		ddt_log_t *ddl = &ddt->ddt_log[n];
+
+		uint64_t count = avl_numnodes(&ddl->ddl_tree);
+		if (count == 0)
+			continue;
+
+		printf(DMU_POOL_DDT_LOG ": %lu log entries\n",
+		    zio_checksum_table[ddt->ddt_checksum].ci_name, n, count);
+
+		if (dump_opt['D'] < 4)
+			continue;
+
+		ddt_lightweight_entry_t ddlwe;
+		uint64_t index = 0;
+		for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
+		    ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) {
+			DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
+			dump_ddt_entry(ddt, &ddlwe, index++);
+		}
+	}
+}
+
 static void
 dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
 {
 	char name[DDT_NAMELEN];
-	ddt_entry_t dde;
+	ddt_lightweight_entry_t ddlwe;
 	uint64_t walk = 0;
 	dmu_object_info_t doi;
 	uint64_t count, dspace, mspace;
 
@@ -2001,8 +2029,8 @@ dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
 
 	(void) printf("%s contents:\n\n", name);
 
-	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
-		dump_dde(ddt, &dde, walk);
+	while ((error = ddt_object_walk(ddt, type, class, &walk, &ddlwe)) == 0)
+		dump_ddt_entry(ddt, &ddlwe, walk);
 
 	ASSERT3U(error, ==, ENOENT);
 
@@ -2025,6 +2053,7 @@ dump_all_ddts(spa_t *spa)
 				dump_ddt(ddt, type, class);
 			}
 		}
+		dump_ddt_log(ddt);
 	}
 
 	ddt_get_dedup_stats(spa, &dds_total);
 
@@ -3287,9 +3316,45 @@ fuid_table_destroy(void)
 	}
 }
 
+/*
+ * Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on
+ * a live pool are normally cleaned up during ddt_sync(). We can't do that (and
+ * wouldn't want to anyway), but if we don't clean up the presence of stuff on
+ * ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves.
+ *
+ * Note that this is not a particularly efficient way to do this, but
+ * ddt_remove() is the only public method that can do the work we need, and it
+ * requires the right locks and etc to do the job. This is only ever called
+ * during zdb shutdown so efficiency is not especially important.
+ */
+static void
+zdb_ddt_cleanup(spa_t *spa)
+{
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		if (!ddt)
+			continue;
+
+		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+		ddt_enter(ddt);
+		ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next;
+		while (dde) {
+			next = AVL_NEXT(&ddt->ddt_tree, dde);
+			dde->dde_io = NULL;
+			ddt_remove(ddt, dde);
+			dde = next;
+		}
+		ddt_exit(ddt);
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
+	}
+}
+
 static void
 zdb_exit(int reason)
 {
+	if (spa != NULL)
+		zdb_ddt_cleanup(spa);
+
 	if (os != NULL) {
 		close_objset(os, FTAG);
 	} else if (spa != NULL) {
 
@@ -4592,7 +4657,6 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
 	l2arc_log_blk_phys_t this_lb;
 	uint64_t asize;
 	l2arc_log_blkptr_t lbps[2];
-	abd_t *abd;
 	zio_cksum_t cksum;
 	int failed = 0;
 	l2arc_dev_t dev;
 
@@ -4646,20 +4710,25 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
 		switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
 		case ZIO_COMPRESS_OFF:
 			break;
-		default:
-			abd = abd_alloc_for_io(asize, B_TRUE);
+		default: {
+			abd_t *abd = abd_alloc_linear(asize, B_TRUE);
 			abd_copy_from_buf_off(abd, &this_lb, 0, asize);
-			if (zio_decompress_data(L2BLK_GET_COMPRESS(
-			    (&lbps[0])->lbp_prop), abd, &this_lb,
-			    asize, sizeof (this_lb), NULL) != 0) {
+			abd_t dabd;
+			abd_get_from_buf_struct(&dabd, &this_lb,
+			    sizeof (this_lb));
+			int err = zio_decompress_data(L2BLK_GET_COMPRESS(
+			    (&lbps[0])->lbp_prop), abd, &dabd,
+			    asize, sizeof (this_lb), NULL);
+			abd_free(&dabd);
+			abd_free(abd);
+			if (err != 0) {
 				(void) printf("L2ARC block decompression "
 				    "failed\n");
-				abd_free(abd);
 				goto out;
 			}
-			abd_free(abd);
 			break;
+		}
 		}
 
 		if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
 			byteswap_uint64_array(&this_lb, sizeof (this_lb));
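
The pattern in this hunk recurs throughout the commit: abd_get_from_buf_struct() wraps an existing buffer in a caller-owned abd_t without allocating a new one, and abd_free() on such a wrapper releases only the wrapper state, never the underlying buffer. A minimal sketch of the idiom; decompress_into() is a hypothetical helper, not part of the patch:

static int
decompress_into(enum zio_compress c, void *src, void *dst,
    size_t psize, size_t lsize)
{
	/* Stack-allocated ABD wrappers around existing buffers. */
	abd_t sabd, dabd;
	abd_get_from_buf_struct(&sabd, src, psize);
	abd_get_from_buf_struct(&dabd, dst, lsize);

	/* zio_decompress_data() now takes ABDs on both ends. */
	int err = zio_decompress_data(c, &sabd, &dabd, psize, lsize, NULL);

	abd_free(&dabd);	/* releases the wrappers, not src/dst */
	abd_free(&sabd);
	return (err);
}
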
@@ -5633,7 +5702,6 @@ static void
 zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
     dmu_object_type_t type)
 {
-	uint64_t refcnt = 0;
 	int i;
 
 	ASSERT(type < ZDB_OT_TOTAL);
 
@@ -5641,8 +5709,161 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
 		return;
 
+	/*
+	 * This flag controls if we will issue a claim for the block while
+	 * counting it, to ensure that all blocks are referenced in space maps.
+	 * We don't issue claims if we're not doing leak tracking, because it's
+	 * expensive if the user isn't interested. We also don't claim the
+	 * second or later occurrences of cloned or dedup'd blocks, because we
+	 * already claimed them the first time.
+	 */
+	boolean_t do_claim = !dump_opt['L'];
+
+	spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
+
+	blkptr_t tempbp;
+	if (BP_GET_DEDUP(bp)) {
+		/*
+		 * Dedup'd blocks are special. We need to count them, so we can
+		 * later uncount them when reporting leaked space, and we must
+		 * only claim them once.
+		 *
+		 * We use the existing dedup system to track what we've seen.
+		 * The first time we see a block, we do a ddt_lookup() to see
+		 * if it exists in the DDT. If we're doing leak tracking, we
+		 * claim the block at this time.
+		 *
+		 * Each time we see a block, we reduce the refcount in the
+		 * entry by one, and add to the size and count of dedup'd
+		 * blocks to report at the end.
+		 */
+
+		ddt_t *ddt = ddt_select(zcb->zcb_spa, bp);
+
+		ddt_enter(ddt);
+
+		/*
+		 * Find the block. This will create the entry in memory, but
+		 * we'll know if that happened by its refcount.
+		 */
+		ddt_entry_t *dde = ddt_lookup(ddt, bp);
+
+		/*
+		 * ddt_lookup() can only return NULL if this block didn't exist
+		 * in the DDT and creating it would take the DDT over its
+		 * quota. Since we got the block from disk, it must exist in
+		 * the DDT, so this can't happen.
+		 */
+		VERIFY3P(dde, !=, NULL);
+
+		/* Get the phys for this variant */
+		ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
+
+		/*
+		 * This entry may have multiple sets of DVAs. We must claim
+		 * each set the first time we see them in a real block on disk,
+		 * or count them on subsequent occurrences. We don't have a
+		 * convenient way to track the first time we see each variant,
+		 * so we repurpose dde_io as a set of "seen" flag bits. We can
+		 * do this safely in zdb because it never writes, so it will
+		 * never have a writing zio for this block in that pointer.
+		 */
+		boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v));
+		if (!seen)
+			dde->dde_io =
+			    (void *)(((uintptr_t)dde->dde_io) | (1 << v));
+
+		/* Consume a reference for this block. */
+		VERIFY3U(ddt_phys_total_refcnt(ddt, dde->dde_phys), >, 0);
+		ddt_phys_decref(dde->dde_phys, v);
+
+		/*
+		 * If this entry has a single flat phys, it may have been
+		 * extended with additional DVAs at some time in its life.
+		 * This block might be from before it was fully extended, and
+		 * so have fewer DVAs.
+		 *
+		 * If this is the first time we've seen this block, and we
+		 * claimed it as-is, then we would miss the claim on some
+		 * number of DVAs, which would then be seen as leaked.
+		 *
+		 * In all cases, if we've had fewer DVAs, then the asize would
+		 * be too small, and would lead to the pool apparently using
+		 * more space than allocated.
+		 *
+		 * To handle this, we copy the canonical set of DVAs from the
+		 * entry back to the block pointer before we claim it.
+		 */
+		if (v == DDT_PHYS_FLAT) {
+			ASSERT3U(BP_GET_BIRTH(bp), ==,
+			    ddt_phys_birth(dde->dde_phys, v));
+			tempbp = *bp;
+			ddt_bp_fill(dde->dde_phys, v, &tempbp,
+			    BP_GET_BIRTH(bp));
+			bp = &tempbp;
+		}
+
+		if (seen) {
+			/*
+			 * The second or later time we see this block,
+			 * it's a duplicate and we count it.
+			 */
+			zcb->zcb_dedup_asize += BP_GET_ASIZE(bp);
+			zcb->zcb_dedup_blocks++;
+
+			/* Already claimed, don't do it again. */
+			do_claim = B_FALSE;
+		}
+
+		ddt_exit(ddt);
+	} else if (zcb->zcb_brt_is_active &&
+	    brt_maybe_exists(zcb->zcb_spa, bp)) {
+		/*
+		 * Cloned blocks are special. We need to count them, so we can
+		 * later uncount them when reporting leaked space, and we must
+		 * only claim them once.
+		 *
+		 * To do this, we keep our own in-memory BRT. For each block
+		 * we haven't seen before, we look it up in the real BRT and
+		 * if it's there, we note it and its refcount then proceed as
+		 * normal. If we see the block again, we count it as a clone
+		 * and then give it no further consideration.
+		 */
+		zdb_brt_entry_t zbre_search, *zbre;
+		avl_index_t where;
+
+		zbre_search.zbre_dva = bp->blk_dva[0];
+		zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
+		if (zbre == NULL) {
+			/* Not seen before; track it */
+			uint64_t refcnt =
+			    brt_entry_get_refcount(zcb->zcb_spa, bp);
+			if (refcnt > 0) {
+				zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
+				    UMEM_NOFAIL);
+				zbre->zbre_dva = bp->blk_dva[0];
+				zbre->zbre_refcount = refcnt;
+				avl_insert(&zcb->zcb_brt, zbre, where);
+			}
+		} else {
+			/*
+			 * Second or later occurrence, count it and take a
+			 * refcount.
+			 */
+			zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
+			zcb->zcb_clone_blocks++;
+
+			zbre->zbre_refcount--;
+			if (zbre->zbre_refcount == 0) {
+				avl_remove(&zcb->zcb_brt, zbre);
+				umem_free(zbre, sizeof (zdb_brt_entry_t));
+			}
+
+			/* Already claimed, don't do it again. */
+			do_claim = B_FALSE;
+		}
+	}
+
 	for (i = 0; i < 4; i++) {
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
 		int t = (i & 1) ? type : ZDB_OT_TOTAL;
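
The hunk above repurposes the otherwise-unused dde_io pointer as a per-variant "seen" bitset, which is safe only because zdb never issues writes and so never stores a real zio there. A standalone sketch of the trick, with hypothetical names:

#include <stdint.h>
#include <stdbool.h>

struct entry {
	void *io;	/* never holds a real pointer in this context */
};

/* Returns whether 'variant' was already marked, marking it if not. */
static bool
mark_seen(struct entry *e, int variant)
{
	uintptr_t bit = (uintptr_t)1 << variant;
	bool seen = ((uintptr_t)e->io & bit) != 0;

	if (!seen)
		e->io = (void *)((uintptr_t)e->io | bit);
	return (seen);
}
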
@@ -5745,71 +5966,12 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 	zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
 	zcb->zcb_asize_total += BP_GET_ASIZE(bp);
 
-	if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) {
-		/*
-		 * Cloned blocks are special. We need to count them, so we can
-		 * later uncount them when reporting leaked space, and we must
-		 * only claim them them once.
-		 *
-		 * To do this, we keep our own in-memory BRT. For each block
-		 * we haven't seen before, we look it up in the real BRT and
-		 * if its there, we note it and its refcount then proceed as
-		 * normal. If we see the block again, we count it as a clone
-		 * and then give it no further consideration.
-		 */
-		zdb_brt_entry_t zbre_search, *zbre;
-		avl_index_t where;
-
-		zbre_search.zbre_dva = bp->blk_dva[0];
-		zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
-		if (zbre != NULL) {
-			zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
-			zcb->zcb_clone_blocks++;
-
-			zbre->zbre_refcount--;
-			if (zbre->zbre_refcount == 0) {
-				avl_remove(&zcb->zcb_brt, zbre);
-				umem_free(zbre, sizeof (zdb_brt_entry_t));
-			}
-			return;
-		}
-
-		uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp);
-		if (crefcnt > 0) {
-			zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
-			    UMEM_NOFAIL);
-			zbre->zbre_dva = bp->blk_dva[0];
-			zbre->zbre_refcount = crefcnt;
-			avl_insert(&zcb->zcb_brt, zbre, where);
-		}
-	}
-
-	if (dump_opt['L'])
+	if (!do_claim)
 		return;
 
-	if (BP_GET_DEDUP(bp)) {
-		ddt_t *ddt;
-		ddt_entry_t *dde;
-
-		ddt = ddt_select(zcb->zcb_spa, bp);
-		ddt_enter(ddt);
-		dde = ddt_lookup(ddt, bp, B_FALSE);
-
-		if (dde == NULL) {
-			refcnt = 0;
-		} else {
-			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
-			ddt_phys_decref(ddp);
-			refcnt = ddp->ddp_refcnt;
-			if (ddt_phys_total_refcnt(dde) == 0)
-				ddt_remove(ddt, dde);
-		}
-		ddt_exit(ddt);
-	}
-
-	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
-	    refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
-	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
+	VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa,
+	    spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL,
+	    ZIO_FLAG_CANFAIL)));
 }
 
 static void
 
@@ -6120,49 +6282,6 @@ zdb_load_obsolete_counts(vdev_t *vd)
 	return (counts);
 }
 
-static void
-zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
-{
-	ddt_bookmark_t ddb = {0};
-	ddt_entry_t dde;
-	int error;
-	int p;
-
-	ASSERT(!dump_opt['L']);
-
-	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
-		blkptr_t blk;
-		ddt_phys_t *ddp = dde.dde_phys;
-
-		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
-			return;
-
-		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
-		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
-		VERIFY(ddt);
-
-		for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
-			if (ddp->ddp_phys_birth == 0)
-				continue;
-			ddt_bp_create(ddb.ddb_checksum,
-			    &dde.dde_key, ddp, &blk);
-			if (p == DDT_PHYS_DITTO) {
-				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
-			} else {
-				zcb->zcb_dedup_asize +=
-				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
-				zcb->zcb_dedup_blocks++;
-			}
-		}
-
-		ddt_enter(ddt);
-		VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
-		ddt_exit(ddt);
-	}
-
-	ASSERT(error == ENOENT);
-}
-
 typedef struct checkpoint_sm_exclude_entry_arg {
 	vdev_t *cseea_vd;
 	uint64_t cseea_checkpoint_size;
 
@@ -6546,10 +6665,6 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 		(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
 		    increment_indirect_mapping_cb, zcb, NULL);
 	}
-
-	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-	zdb_ddt_leak_init(spa, zcb);
-	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 static boolean_t
 
@@ -6814,6 +6929,8 @@ dump_block_stats(spa_t *spa)
 	int e, c, err;
 	bp_embedded_type_t i;
 
+	ddt_prefetch_all(spa);
+
 	zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
 
@@ -6938,7 +7055,6 @@ dump_block_stats(spa_t *spa)
 		    (u_longlong_t)total_alloc,
 		    (dump_opt['L']) ? "unreachable" : "leaked",
 		    (longlong_t)(total_alloc - total_found));
-		leaks = B_TRUE;
 	}
 
 	if (tzb->zb_count == 0) {
 
@@ -7272,29 +7388,27 @@ dump_simulated_ddt(spa_t *spa)
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
-		ddt_stat_t dds;
 		uint64_t refcnt = zdde->zdde_ref_blocks;
 		ASSERT(refcnt != 0);
 
-		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
-		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
-		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
-		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
+		ddt_stat_t *dds = &ddh_total.ddh_stat[highbit64(refcnt) - 1];
 
-		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
-		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
-		dds.dds_ref_psize = zdde->zdde_ref_psize;
-		dds.dds_ref_dsize = zdde->zdde_ref_dsize;
+		dds->dds_blocks += zdde->zdde_ref_blocks / refcnt;
+		dds->dds_lsize += zdde->zdde_ref_lsize / refcnt;
+		dds->dds_psize += zdde->zdde_ref_psize / refcnt;
+		dds->dds_dsize += zdde->zdde_ref_dsize / refcnt;
 
-		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
-		    &dds, 0);
+		dds->dds_ref_blocks += zdde->zdde_ref_blocks;
+		dds->dds_ref_lsize += zdde->zdde_ref_lsize;
+		dds->dds_ref_psize += zdde->zdde_ref_psize;
+		dds->dds_ref_dsize += zdde->zdde_ref_dsize;
 
 		umem_free(zdde, sizeof (*zdde));
 	}
 
 	avl_destroy(&t);
 
-	ddt_histogram_stat(&dds_total, &ddh_total);
+	ddt_histogram_total(&dds_total, &ddh_total);
 
 	(void) printf("Simulated DDT histogram:\n");
 
@@ -8022,16 +8136,25 @@ dump_mos_leaks(spa_t *spa)
 
 	mos_leak_vdev(spa->spa_root_vdev);
 
-	for (uint64_t class = 0; class < DDT_CLASSES; class++) {
-		for (uint64_t type = 0; type < DDT_TYPES; type++) {
-			for (uint64_t cksum = 0;
-			    cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
-				ddt_t *ddt = spa->spa_ddt[cksum];
-				if (!ddt)
-					continue;
+	for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		if (!ddt)
+			continue;
+
+		/* DDT store objects */
+		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+			for (ddt_class_t class = 0; class < DDT_CLASSES;
+			    class++) {
 				mos_obj_refd(ddt->ddt_object[type][class]);
 			}
 		}
+
+		/* FDT container */
+		mos_obj_refd(ddt->ddt_dir_object);
+
+		/* FDT log objects */
+		mos_obj_refd(ddt->ddt_log[0].ddl_object);
+		mos_obj_refd(ddt->ddt_log[1].ddl_object);
 	}
 
 	if (spa->spa_brt != NULL) {
 
@@ -8499,13 +8622,22 @@ try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize,
 	memset(lbuf, 0x00, lsize);
 	memset(lbuf2, 0xff, lsize);
 
+	abd_t labd, labd2;
+	abd_get_from_buf_struct(&labd, lbuf, lsize);
+	abd_get_from_buf_struct(&labd2, lbuf2, lsize);
+
+	boolean_t ret = B_FALSE;
 	if (zio_decompress_data(cfunc, pabd,
-	    lbuf, psize, lsize, NULL) == 0 &&
+	    &labd, psize, lsize, NULL) == 0 &&
 	    zio_decompress_data(cfunc, pabd,
-	    lbuf2, psize, lsize, NULL) == 0 &&
+	    &labd2, psize, lsize, NULL) == 0 &&
 	    memcmp(lbuf, lbuf2, lsize) == 0)
-		return (B_TRUE);
-	return (B_FALSE);
+		ret = B_TRUE;
+
+	abd_free(&labd2);
+	abd_free(&labd);
+
+	return (ret);
 }
 
 static uint64_t
 
@@ -9624,6 +9756,9 @@ retry_lookup:
 	}
 
 fini:
+	if (spa != NULL)
+		zdb_ddt_cleanup(spa);
+
 	if (os != NULL) {
 		close_objset(os, FTAG);
 	} else if (spa != NULL) {

@@ -844,7 +844,6 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 	const char *failmode = NULL;
 	boolean_t checkremove = B_FALSE;
 	uint32_t pri = 0;
-	int32_t flags = 0;
 
 	/*
 	 * If this is a checksum or I/O error, then toss it into the
 
@@ -922,18 +921,28 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 		}
 	} else if (fmd_nvl_class_match(hdl, nvl,
 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
+		uint64_t flags = 0;
+		int32_t flags32 = 0;
 		/*
 		 * We ignore ereports for checksum errors generated by
 		 * scrub/resilver I/O to avoid potentially further
 		 * degrading the pool while it's being repaired.
+		 *
+		 * Note that FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS used to
+		 * be int32. To allow newer zed to work on older
+		 * kernels, if we don't find the flags, we look for
+		 * the older ones too.
 		 */
 		if (((nvlist_lookup_uint32(nvl,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) &&
 		    (pri == ZIO_PRIORITY_SCRUB ||
 		    pri == ZIO_PRIORITY_REBUILD)) ||
-		    ((nvlist_lookup_int32(nvl,
+		    ((nvlist_lookup_uint64(nvl,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) &&
-		    (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
+		    (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) ||
+		    ((nvlist_lookup_int32(nvl,
+		    FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags32) == 0) &&
+		    (flags32 & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
 			fmd_hdl_debug(hdl, "ignoring '%s' for "
 			    "scrub/resilver I/O", class);
 			return;

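The checksum-ereport hunk above probes FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS under both its new uint64 type and its legacy int32 type, so a newer zed keeps working against older kernel modules. A minimal sketch of that fallback, using the libnvpair lookups the patch itself relies on (read_zio_flags() is a hypothetical helper):

#include <libnvpair.h>

static uint64_t
read_zio_flags(nvlist_t *nvl, const char *name)
{
	uint64_t flags = 0;
	int32_t flags32 = 0;

	/* Newer kernels publish the flags as uint64. */
	if (nvlist_lookup_uint64(nvl, name, &flags) == 0)
		return (flags);
	/* Older kernels used int32; widen without sign-extending. */
	if (nvlist_lookup_int32(nvl, name, &flags32) == 0)
		return ((uint64_t)(uint32_t)flags32);
	return (0);
}
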
@@ -537,7 +537,7 @@ get_usage(zpool_help_t idx)
 		    "\t    [-o property=value] <pool> <newpool> "
 		    "[<device> ...]\n"));
 	case HELP_REGUID:
-		return (gettext("\treguid <pool>\n"));
+		return (gettext("\treguid [-g guid] <pool>\n"));
 	case HELP_SYNC:
 		return (gettext("\tsync [pool] ...\n"));
 	case HELP_VERSION:
 
@@ -2025,7 +2025,7 @@ zpool_do_create(int argc, char **argv)
 				char *end;
 				u_longlong_t ver;
 
-				ver = strtoull(propval, &end, 10);
+				ver = strtoull(propval, &end, 0);
 				if (*end == '\0' &&
 				    ver < SPA_VERSION_FEATURES) {
 					enable_pool_features = B_FALSE;
 
@@ -8232,19 +8232,32 @@ zpool_do_clear(int argc, char **argv)
 }
 
 /*
- * zpool reguid <pool>
+ * zpool reguid [-g <guid>] <pool>
 */
 int
 zpool_do_reguid(int argc, char **argv)
 {
+	uint64_t guid;
+	uint64_t *guidp = NULL;
 	int c;
+	char *endptr;
 	char *poolname;
 	zpool_handle_t *zhp;
 	int ret = 0;
 
 	/* check options */
-	while ((c = getopt(argc, argv, "")) != -1) {
+	while ((c = getopt(argc, argv, "g:")) != -1) {
 		switch (c) {
+		case 'g':
+			errno = 0;
+			guid = strtoull(optarg, &endptr, 10);
+			if (errno != 0 || *endptr != '\0') {
+				(void) fprintf(stderr,
+				    gettext("invalid GUID: %s\n"), optarg);
+				usage(B_FALSE);
+			}
+			guidp = &guid;
+			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 
@@ -8270,7 +8283,7 @@ zpool_do_reguid(int argc, char **argv)
 	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
 		return (1);
 
-	ret = zpool_reguid(zhp);
+	ret = zpool_set_guid(zhp, guidp);
 
 	zpool_close(zhp);
 	return (ret);

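zpool_set_guid() is the new libzfs entry point behind `zpool reguid -g`: a NULL pointer keeps the old zpool_reguid() behavior of picking a random GUID, while a non-NULL pointer requests that specific value. A short usage sketch under those assumptions (set_pool_guid() is illustrative; error reporting elided):

#include <libzfs.h>

static int
set_pool_guid(libzfs_handle_t *g_zfs, const char *pool, const uint64_t *guidp)
{
	zpool_handle_t *zhp = zpool_open(g_zfs, pool);
	if (zhp == NULL)
		return (1);

	int ret = zpool_set_guid(zhp, guidp);	/* guidp may be NULL */

	zpool_close(zhp);
	return (ret);
}
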
@@ -1,3 +1,5 @@
+zstream_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
+
 sbin_PROGRAMS += zstream
 CPPCHECKTARGETS += zstream
 

@@ -22,6 +22,8 @@
 /*
  * Copyright 2022 Axcient. All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright (c) 2024, Klara, Inc.
 */
 
 #include <err.h>
 
@@ -257,83 +259,73 @@ zstream_do_decompress(int argc, char *argv[])
 			ENTRY e = {.key = key};
 
 			p = hsearch(e, FIND);
-			if (p != NULL) {
-				zio_decompress_func_t *xfunc = NULL;
-				switch ((enum zio_compress)(intptr_t)p->data) {
-				case ZIO_COMPRESS_OFF:
-					xfunc = NULL;
-					break;
-				case ZIO_COMPRESS_LZJB:
-					xfunc = lzjb_decompress;
-					break;
-				case ZIO_COMPRESS_GZIP_1:
-					xfunc = gzip_decompress;
-					break;
-				case ZIO_COMPRESS_ZLE:
-					xfunc = zle_decompress;
-					break;
-				case ZIO_COMPRESS_LZ4:
-					xfunc = lz4_decompress_zfs;
-					break;
-				case ZIO_COMPRESS_ZSTD:
-					xfunc = zfs_zstd_decompress;
-					break;
-				default:
-					assert(B_FALSE);
-				}
-
-
-				/*
-				 * Read and decompress the block
-				 */
-				char *lzbuf = safe_calloc(payload_size);
-				(void) sfread(lzbuf, payload_size, stdin);
-				if (xfunc == NULL) {
-					memcpy(buf, lzbuf, payload_size);
-					drrw->drr_compressiontype =
-					    ZIO_COMPRESS_OFF;
-					if (verbose)
-						fprintf(stderr, "Resetting "
-						    "compression type to off "
-						    "for ino %llu offset "
-						    "%llu\n",
-						    (u_longlong_t)
-						    drrw->drr_object,
-						    (u_longlong_t)
-						    drrw->drr_offset);
-				} else if (0 != xfunc(lzbuf, buf,
-				    payload_size, payload_size, 0)) {
-					/*
-					 * The block must not be compressed,
-					 * at least not with this compression
-					 * type, possibly because it gets
-					 * written multiple times in this
-					 * stream.
-					 */
-					warnx("decompression failed for "
-					    "ino %llu offset %llu",
-					    (u_longlong_t)drrw->drr_object,
-					    (u_longlong_t)drrw->drr_offset);
-					memcpy(buf, lzbuf, payload_size);
-				} else if (verbose) {
-					drrw->drr_compressiontype =
-					    ZIO_COMPRESS_OFF;
-					fprintf(stderr, "successfully "
-					    "decompressed ino %llu "
-					    "offset %llu\n",
-					    (u_longlong_t)drrw->drr_object,
-					    (u_longlong_t)drrw->drr_offset);
-				} else {
-					drrw->drr_compressiontype =
-					    ZIO_COMPRESS_OFF;
-				}
-				free(lzbuf);
-			} else {
+			if (p == NULL) {
 				/*
 				 * Read the contents of the block unaltered
 				 */
 				(void) sfread(buf, payload_size, stdin);
+				break;
 			}
+
+			/*
+			 * Read and decompress the block
+			 */
+			enum zio_compress c =
+			    (enum zio_compress)(intptr_t)p->data;
+
+			if (c == ZIO_COMPRESS_OFF) {
+				(void) sfread(buf, payload_size, stdin);
+				drrw->drr_compressiontype = 0;
+				drrw->drr_compressed_size = 0;
+				if (verbose)
+					fprintf(stderr,
+					    "Resetting compression type to "
+					    "off for ino %llu offset %llu\n",
+					    (u_longlong_t)drrw->drr_object,
+					    (u_longlong_t)drrw->drr_offset);
+				break;
+			}
+
+			uint64_t lsize = drrw->drr_logical_size;
+			ASSERT3U(payload_size, <=, lsize);
+
+			char *lzbuf = safe_calloc(payload_size);
+			(void) sfread(lzbuf, payload_size, stdin);
+
+			abd_t sabd, dabd;
+			abd_get_from_buf_struct(&sabd, lzbuf, payload_size);
+			abd_get_from_buf_struct(&dabd, buf, lsize);
+			int err = zio_decompress_data(c, &sabd, &dabd,
+			    payload_size, lsize, NULL);
+			abd_free(&dabd);
+			abd_free(&sabd);
+
+			if (err == 0) {
+				drrw->drr_compressiontype = 0;
+				drrw->drr_compressed_size = 0;
+				payload_size = lsize;
+				if (verbose) {
+					fprintf(stderr,
+					    "successfully decompressed "
+					    "ino %llu offset %llu\n",
+					    (u_longlong_t)drrw->drr_object,
+					    (u_longlong_t)drrw->drr_offset);
+				}
+			} else {
+				/*
+				 * The block must not be compressed, at least
+				 * not with this compression type, possibly
+				 * because it gets written multiple times in
+				 * this stream.
+				 */
+				warnx("decompression failed for "
+				    "ino %llu offset %llu",
+				    (u_longlong_t)drrw->drr_object,
+				    (u_longlong_t)drrw->drr_offset);
+				memcpy(buf, lzbuf, payload_size);
+			}
+
+			free(lzbuf);
+			break;
 		}

@@ -22,10 +22,9 @@
 /*
  * Copyright 2022 Axcient. All rights reserved.
  * Use is subject to license terms.
- */
-
-/*
+ *
  * Copyright (c) 2022 by Delphix. All rights reserved.
+ * Copyright (c) 2024, Klara, Inc.
 */
 
 #include <err.h>
 
@@ -72,7 +71,7 @@ zstream_do_recompress(int argc, char *argv[])
 	dmu_replay_record_t *drr = &thedrr;
 	zio_cksum_t stream_cksum;
 	int c;
-	int level = -1;
+	int level = 0;
 
 	while ((c = getopt(argc, argv, "l:")) != -1) {
 		switch (c) {
 
@@ -97,34 +96,22 @@ zstream_do_recompress(int argc, char *argv[])
 
 	if (argc != 1)
 		zstream_usage();
-	int type = 0;
-	zio_compress_info_t *cinfo = NULL;
-	if (0 == strcmp(argv[0], "off")) {
-		type = ZIO_COMPRESS_OFF;
-		cinfo = &zio_compress_table[type];
-	} else if (0 == strcmp(argv[0], "inherit") ||
-	    0 == strcmp(argv[0], "empty") ||
-	    0 == strcmp(argv[0], "on")) {
-		// Fall through to invalid compression type case
-	} else {
-		for (int i = 0; i < ZIO_COMPRESS_FUNCTIONS; i++) {
-			if (0 == strcmp(zio_compress_table[i].ci_name,
-			    argv[0])) {
-				cinfo = &zio_compress_table[i];
-				type = i;
-				break;
-			}
-		}
-	}
-	if (cinfo == NULL) {
-		fprintf(stderr, "Invalid compression type %s.\n",
-		    argv[0]);
-		exit(2);
-	}
-
-	if (cinfo->ci_compress == NULL) {
-		type = 0;
-		cinfo = &zio_compress_table[0];
+
+	enum zio_compress ctype;
+	if (strcmp(argv[0], "off") == 0) {
+		ctype = ZIO_COMPRESS_OFF;
+	} else {
+		for (ctype = 0; ctype < ZIO_COMPRESS_FUNCTIONS; ctype++) {
+			if (strcmp(argv[0],
+			    zio_compress_table[ctype].ci_name) == 0)
+				break;
+		}
+		if (ctype == ZIO_COMPRESS_FUNCTIONS ||
+		    zio_compress_table[ctype].ci_compress == NULL) {
+			fprintf(stderr, "Invalid compression type %s.\n",
+			    argv[0]);
+			exit(2);
+		}
 	}
 
 	if (isatty(STDIN_FILENO)) {
 
@@ -135,6 +122,7 @@ zstream_do_recompress(int argc, char *argv[])
 		exit(1);
 	}
 
+	abd_init();
 	fletcher_4_init();
 	zio_init();
 	zstd_init();
 
@@ -247,63 +235,78 @@ zstream_do_recompress(int argc, char *argv[])
 			(void) sfread(buf, payload_size, stdin);
 			break;
 		}
-			if (drrw->drr_compressiontype >=
-			    ZIO_COMPRESS_FUNCTIONS) {
+			enum zio_compress dtype = drrw->drr_compressiontype;
+			if (dtype >= ZIO_COMPRESS_FUNCTIONS) {
 				fprintf(stderr, "Invalid compression type in "
-				    "stream: %d\n", drrw->drr_compressiontype);
+				    "stream: %d\n", dtype);
 				exit(3);
 			}
-			zio_compress_info_t *dinfo =
-			    &zio_compress_table[drrw->drr_compressiontype];
+			if (zio_compress_table[dtype].ci_decompress == NULL)
+				dtype = ZIO_COMPRESS_OFF;
 
 			/* Set up buffers to minimize memcpys */
 			char *cbuf, *dbuf;
-			if (cinfo->ci_compress == NULL)
+			if (ctype == ZIO_COMPRESS_OFF)
 				dbuf = buf;
 			else
 				dbuf = safe_calloc(bufsz);
 
-			if (dinfo->ci_decompress == NULL)
+			if (dtype == ZIO_COMPRESS_OFF)
 				cbuf = dbuf;
 			else
 				cbuf = safe_calloc(payload_size);
 
 			/* Read and decompress the payload */
 			(void) sfread(cbuf, payload_size, stdin);
-			if (dinfo->ci_decompress != NULL) {
-				if (0 != dinfo->ci_decompress(cbuf, dbuf,
-				    payload_size, MIN(bufsz,
-				    drrw->drr_logical_size), dinfo->ci_level)) {
+			if (dtype != ZIO_COMPRESS_OFF) {
+				abd_t cabd, dabd;
+				abd_get_from_buf_struct(&cabd,
+				    cbuf, payload_size);
+				abd_get_from_buf_struct(&dabd, dbuf,
+				    MIN(bufsz, drrw->drr_logical_size));
+				if (zio_decompress_data(dtype, &cabd, &dabd,
+				    payload_size, abd_get_size(&dabd),
+				    NULL) != 0) {
					warnx("decompression type %d failed "
					    "for ino %llu offset %llu",
-					    type,
+					    dtype,
					    (u_longlong_t)drrw->drr_object,
					    (u_longlong_t)drrw->drr_offset);
					exit(4);
				}
				payload_size = drrw->drr_logical_size;
+				abd_free(&dabd);
+				abd_free(&cabd);
				free(cbuf);
			}
 
 			/* Recompress the payload */
-			if (cinfo->ci_compress != NULL) {
-				payload_size = P2ROUNDUP(cinfo->ci_compress(
-				    dbuf, buf, drrw->drr_logical_size,
-				    MIN(payload_size, bufsz), (level == -1 ?
-				    cinfo->ci_level : level)),
-				    SPA_MINBLOCKSIZE);
-				if (payload_size != drrw->drr_logical_size) {
-					drrw->drr_compressiontype = type;
-					drrw->drr_compressed_size =
-					    payload_size;
-				} else {
+			if (ctype != ZIO_COMPRESS_OFF) {
+				abd_t dabd, abd;
+				abd_get_from_buf_struct(&dabd,
+				    dbuf, drrw->drr_logical_size);
+				abd_t *pabd =
+				    abd_get_from_buf_struct(&abd, buf, bufsz);
+				size_t csize = zio_compress_data(ctype, &dabd,
+				    &pabd, drrw->drr_logical_size, level);
+				size_t rounded =
+				    P2ROUNDUP(csize, SPA_MINBLOCKSIZE);
+				if (rounded >= drrw->drr_logical_size) {
 					memcpy(buf, dbuf, payload_size);
 					drrw->drr_compressiontype = 0;
 					drrw->drr_compressed_size = 0;
+				} else {
+					abd_zero_off(pabd, csize,
+					    rounded - csize);
+					drrw->drr_compressiontype = ctype;
+					drrw->drr_compressed_size =
+					    payload_size = rounded;
 				}
+				abd_free(&abd);
+				abd_free(&dabd);
+				free(dbuf);
 			} else {
-				drrw->drr_compressiontype = type;
+				drrw->drr_compressiontype = 0;
 				drrw->drr_compressed_size = 0;
 			}
 			break;
 
@@ -371,6 +374,7 @@ zstream_do_recompress(int argc, char *argv[])
 	fletcher_4_fini();
 	zio_fini();
 	zstd_fini();
+	abd_fini();
 
 	return (0);
 }

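The recompress hunk above keeps a block compressed only when the compressed size, padded up to SPA_MINBLOCKSIZE with P2ROUNDUP, still beats the logical size; e.g. lsize 4096 with csize 3585 rounds to 4096 and is stored flat. A sketch of that decision (choose_payload_size() is a hypothetical helper):

static size_t
choose_payload_size(size_t csize, size_t lsize, boolean_t *compressed)
{
	size_t rounded = P2ROUNDUP(csize, SPA_MINBLOCKSIZE);

	if (rounded >= lsize) {
		*compressed = B_FALSE;	/* padding ate the benefit */
		return (lsize);
	}
	*compressed = B_TRUE;
	return (rounded);
}
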
@@ -6746,7 +6746,7 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id)
 	load = spa_load_guid(spa);
 
 	(void) pthread_rwlock_wrlock(&ztest_name_lock);
-	error = spa_change_guid(spa);
+	error = spa_change_guid(spa, NULL);
 	zs->zs_guid = spa_guid(spa);
 	(void) pthread_rwlock_unlock(&ztest_name_lock);
 

@@ -10,7 +10,8 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/module/icp/include \
 	-I$(top_srcdir)/lib/libspl/include \
-	-I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@
+	-I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@ \
+	-I$(top_srcdir)/lib/libzpool/include
 
 AM_LIBTOOLFLAGS = --silent
 
@@ -85,4 +86,7 @@ KERNEL_CFLAGS = $(FRAME_LARGER_THAN)
 LIBRARY_CFLAGS = -no-suppress
 
 # Forcibly enable asserts/debugging for libzpool &al.
-FORCEDEBUG_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG
+# Since ZFS_DEBUG can change shared data structures, all libzpool users must
+# be compiled with the same flags.
+# See https://github.com/openzfs/zfs/issues/16476
+LIBZPOOL_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG

@@ -25,6 +25,8 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_PLUG], [
 dnl #
 dnl # 2.6.32 - 4.11: statically allocated bdi in request_queue
 dnl # 4.12: dynamically allocated bdi in request_queue
+dnl # 6.11: bdi no longer available through request_queue, so get it from
+dnl #       the gendisk attached to the queue
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI], [
 	ZFS_LINUX_TEST_SRC([blk_queue_bdi], [
 
@@ -47,6 +49,30 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_BDI], [
 	])
 ])
 
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI], [
+	ZFS_LINUX_TEST_SRC([blk_queue_disk_bdi], [
+		#include <linux/blkdev.h>
+		#include <linux/backing-dev.h>
+	], [
+		struct request_queue q;
+		struct gendisk disk;
+		struct backing_dev_info bdi __attribute__ ((unused));
+		q.disk = &disk;
+		q.disk->bdi = &bdi;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI], [
+	AC_MSG_CHECKING([whether backing_dev_info is available through queue gendisk])
+	ZFS_LINUX_TEST_RESULT([blk_queue_disk_bdi], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLK_QUEUE_DISK_BDI, 1,
+		    [backing_dev_info is available through queue gendisk])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
 dnl #
 dnl # 5.9: added blk_queue_update_readahead(),
 dnl # 5.15: renamed to disk_update_readahead()
 
@@ -407,6 +433,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI
+	ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_UPDATE_READAHEAD
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISCARD
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_SECURE_ERASE
 
@@ -421,6 +448,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
 	ZFS_AC_KERNEL_BLK_QUEUE_PLUG
 	ZFS_AC_KERNEL_BLK_QUEUE_BDI
+	ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI
 	ZFS_AC_KERNEL_BLK_QUEUE_UPDATE_READAHEAD
 	ZFS_AC_KERNEL_BLK_QUEUE_DISCARD
 	ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE

@@ -58,6 +58,13 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [
 		disk = blk_alloc_disk(lim, NUMA_NO_NODE);
 	])
 
+	ZFS_LINUX_TEST_SRC([blkdev_queue_limits_features], [
+		#include <linux/blkdev.h>
+	],[
+		struct queue_limits *lim = NULL;
+		lim->features = 0;
+	])
+
 	ZFS_LINUX_TEST_SRC([blk_cleanup_disk], [
 		#include <linux/blkdev.h>
 	],[
 
@@ -114,6 +121,20 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE([HAVE_BLK_ALLOC_DISK_2ARG], 1, [blk_alloc_disk() exists and takes 2 args])
 
+		dnl #
+		dnl # Linux 6.11 API change:
+		dnl # struct queue_limits gains a 'features' field,
+		dnl # used to set flushing options
+		dnl #
+		AC_MSG_CHECKING([whether struct queue_limits has a features field])
+		ZFS_LINUX_TEST_RESULT([blkdev_queue_limits_features], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE([HAVE_BLKDEV_QUEUE_LIMITS_FEATURES], 1,
+			    [struct queue_limits has a features field])
+		], [
+			AC_MSG_RESULT(no)
+		])
+
 		dnl #
 		dnl # 5.20 API change,
 		dnl # Removed blk_cleanup_disk(), put_disk() should be used.

@@ -1,17 +0,0 @@
-AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
-	ZFS_LINUX_TEST_SRC([page_size], [
-		#include <linux/mm.h>
-	],[
-		unsigned long s;
-		s = page_size(NULL);
-	])
-])
-AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
-	AC_MSG_CHECKING([whether page_size() is available])
-	ZFS_LINUX_TEST_RESULT([page_size], [
-		AC_MSG_RESULT(yes)
-		AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
-	],[
-		AC_MSG_RESULT(no)
-	])
-])

@@ -0,0 +1,36 @@
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
+	ZFS_LINUX_TEST_SRC([page_size], [
+		#include <linux/mm.h>
+	],[
+		unsigned long s;
+		s = page_size(NULL);
+	])
+])
+AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
+	AC_MSG_CHECKING([whether page_size() is available])
+	ZFS_LINUX_TEST_RESULT([page_size], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING], [
+	ZFS_LINUX_TEST_SRC([page_mapping], [
+		#include <linux/pagemap.h>
+	],[
+		struct page *p = NULL;
+		struct address_space *m = page_mapping(NULL);
+	])
+])
+AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_MAPPING], [
+	AC_MSG_CHECKING([whether page_mapping() is available])
+	ZFS_LINUX_TEST_RESULT([page_mapping], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_MM_PAGE_MAPPING, 1, [page_mapping() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])

@@ -25,3 +25,62 @@ AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [
 		AC_MSG_RESULT([no])
 	])
 ])
+
+dnl #
+dnl # Linux 6.11 register_sysctl() enforces that sysctl tables no longer
+dnl # supply a sentinel end-of-table element. 6.6 introduces
+dnl # register_sysctl_sz() to enable callers to choose, so we use it if
+dnl # available for backward compatibility.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ], [
+	ZFS_LINUX_TEST_SRC([has_register_sysctl_sz], [
+		#include <linux/sysctl.h>
+	],[
+		struct ctl_table test_table[] __attribute__((unused)) = {0};
+		register_sysctl_sz("", test_table, 0);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ], [
+	AC_MSG_CHECKING([whether register_sysctl_sz exists])
+	ZFS_LINUX_TEST_RESULT([has_register_sysctl_sz], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_REGISTER_SYSCTL_SZ, 1,
+		    [register_sysctl_sz exists])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
+
+dnl #
+dnl # Linux 6.11 makes const the ctl_table arg of proc_handler
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST], [
+	ZFS_LINUX_TEST_SRC([has_proc_handler_ctl_table_const], [
+		#include <linux/sysctl.h>
+
+		static int test_handler(
+		    const struct ctl_table *ctl __attribute((unused)),
+		    int write __attribute((unused)),
+		    void *buffer __attribute((unused)),
+		    size_t *lenp __attribute((unused)),
+		    loff_t *ppos __attribute((unused)))
+		{
+			return (0);
+		}
+	], [
+		proc_handler *ph __attribute((unused)) =
+		    &test_handler;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST], [
+	AC_MSG_CHECKING([whether proc_handler ctl_table arg is const])
+	ZFS_LINUX_TEST_RESULT([has_proc_handler_ctl_table_const], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_PROC_HANDLER_CTL_TABLE_CONST, 1,
+		    [proc_handler ctl_table arg is const])
+	], [
+		AC_MSG_RESULT([no])
+	])
+])

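The HAVE_REGISTER_SYSCTL_SZ result defined above is typically consumed by a small wrapper: on 6.6+ kernels the table length is passed explicitly (6.11 rejects sentinel-terminated tables), otherwise the older sentinel-based register_sysctl() is used. A sketch under that assumption; the wrapper name is illustrative:

static struct ctl_table_header *
spl_sysctl_register(const char *path, struct ctl_table *table, size_t count)
{
#ifdef HAVE_REGISTER_SYSCTL_SZ
	/* Explicit size: no sentinel entry required. */
	return (register_sysctl_sz(path, table, count));
#else
	/* Older kernels find the end via the sentinel entry. */
	(void) count;
	return (register_sysctl(path, table));
#endif
}
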
@@ -167,9 +167,12 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_WRITEPAGE_T
 	ZFS_AC_KERNEL_SRC_RECLAIMED
 	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
+	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ
+	ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST
 	ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
 	ZFS_AC_KERNEL_SRC_SYNC_BDEV
 	ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
+	ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
 
@@ -319,9 +322,12 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_WRITEPAGE_T
 	ZFS_AC_KERNEL_RECLAIMED
 	ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
+	ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ
+	ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST
 	ZFS_AC_KERNEL_COPY_SPLICE_READ
 	ZFS_AC_KERNEL_SYNC_BDEV
 	ZFS_AC_KERNEL_MM_PAGE_SIZE
+	ZFS_AC_KERNEL_MM_PAGE_MAPPING
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_CPU_HAS_FEATURE

@@ -300,6 +300,7 @@ _LIBZFS_H int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *,
 
 _LIBZFS_H int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
 _LIBZFS_H int zpool_reguid(zpool_handle_t *);
+_LIBZFS_H int zpool_set_guid(zpool_handle_t *, const uint64_t *);
 _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *);
 
 _LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *);

@@ -77,6 +77,8 @@ noinst_HEADERS = \
 	%D%/spl/sys/zmod.h \
 	%D%/spl/sys/zone.h \
 	\
+	%D%/zfs/sys/abd_os.h \
+	%D%/zfs/sys/abd_impl_os.h \
 	%D%/zfs/sys/arc_os.h \
 	%D%/zfs/sys/freebsd_crypto.h \
 	%D%/zfs/sys/freebsd_event.h \

@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2023, 2024, Klara Inc.
+ */
+
+#ifndef _ABD_IMPL_OS_H
+#define	_ABD_IMPL_OS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	abd_enter_critical(flags)	critical_enter()
+#define	abd_exit_critical(flags)	critical_exit()
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _ABD_IMPL_OS_H */

@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ */
+
+#ifndef _ABD_OS_H
+#define	_ABD_OS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct abd_scatter {
+	uint_t	abd_offset;
+	void	*abd_chunks[1]; /* actually variable-length */
+};
+
+struct abd_linear {
+	void	*abd_buf;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _ABD_H */

@@ -20,6 +20,8 @@ kernel_linux_HEADERS = \
 
 kernel_sysdir = $(kerneldir)/sys
 kernel_sys_HEADERS = \
+	%D%/zfs/sys/abd_os.h \
+	%D%/zfs/sys/abd_impl_os.h \
 	%D%/zfs/sys/policy.h \
 	%D%/zfs/sys/trace_acl.h \
 	%D%/zfs/sys/trace_arc.h \

@@ -57,6 +57,11 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
 #endif
 
 /*
+ * 6.11 API
+ * Setting the flush flags directly is no longer possible; flush flags are set
+ * on the queue_limits structure and passed to blk_disk_alloc(). In this case
+ * we remove this function entirely.
+ *
 * 4.7 API,
 * The blk_queue_write_cache() interface has replaced blk_queue_flush()
 * interface. However, the new interface is GPL-only thus we implement
 
@@ -68,31 +73,33 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
 * new one is GPL-only. Thus if the GPL-only version is detected we
 * implement our own trivial helper.
 */
+#if !defined(HAVE_BLK_ALLOC_DISK_2ARG) || \
+	!defined(HAVE_BLKDEV_QUEUE_LIMITS_FEATURES)
 static inline void
-blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua)
+blk_queue_set_write_cache(struct request_queue *q, bool on)
 {
 #if defined(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY)
-	if (wc)
+	if (on) {
 		blk_queue_flag_set(QUEUE_FLAG_WC, q);
-	else
-		blk_queue_flag_clear(QUEUE_FLAG_WC, q);
-	if (fua)
 		blk_queue_flag_set(QUEUE_FLAG_FUA, q);
-	else
+	} else {
+		blk_queue_flag_clear(QUEUE_FLAG_WC, q);
 		blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
+	}
 #elif defined(HAVE_BLK_QUEUE_WRITE_CACHE)
-	blk_queue_write_cache(q, wc, fua);
+	blk_queue_write_cache(q, on, on);
 #elif defined(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY)
-	if (wc)
-		q->flush_flags |= REQ_FLUSH;
-	if (fua)
-		q->flush_flags |= REQ_FUA;
+	if (on)
+		q->flush_flags |= REQ_FLUSH | REQ_FUA;
+	else
+		q->flush_flags &= ~(REQ_FLUSH | REQ_FUA);
 #elif defined(HAVE_BLK_QUEUE_FLUSH)
-	blk_queue_flush(q, (wc ? REQ_FLUSH : 0) | (fua ? REQ_FUA : 0));
+	blk_queue_flush(q, on ? (REQ_FLUSH | REQ_FUA) : 0);
 #else
 #error "Unsupported kernel"
 #endif
 }
+#endif /* !HAVE_BLK_ALLOC_DISK_2ARG || !HAVE_BLKDEV_QUEUE_LIMITS_FEATURES */
 
 /*
 * Detect if a device has a write cache. Used to set the initial value for the
 
@@ -126,8 +133,10 @@ blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages)
 {
 #if !defined(HAVE_BLK_QUEUE_UPDATE_READAHEAD) && \
 	!defined(HAVE_DISK_UPDATE_READAHEAD)
-#ifdef HAVE_BLK_QUEUE_BDI_DYNAMIC
+#if defined(HAVE_BLK_QUEUE_BDI_DYNAMIC)
 	q->backing_dev_info->ra_pages = ra_pages;
+#elif defined(HAVE_BLK_QUEUE_DISK_BDI)
+	q->disk->bdi->ra_pages = ra_pages;
 #else
 	q->backing_dev_info.ra_pages = ra_pages;
 #endif

@@ -21,16 +21,23 @@
 
 /*
 * Copyright (c) 2023, 2024, Klara Inc.
+ * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
 */
 
 #ifndef _ZFS_MM_COMPAT_H
 #define	_ZFS_MM_COMPAT_H
 
 #include <linux/mm.h>
+#include <linux/pagemap.h>
 
 /* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
 #ifndef HAVE_MM_PAGE_SIZE
 #define	page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
 #endif
 
+/* 6.11 removed page_mapping(). A simple wrapper around folio_mapping() works */
+#ifndef HAVE_MM_PAGE_MAPPING
+#define	page_mapping(p) folio_mapping(page_folio(p))
+#endif
+
 #endif /* _ZFS_MM_COMPAT_H */

@ -20,6 +20,10 @@
|
|||
* You should have received a copy of the GNU General Public License along
|
||||
* with the SPL. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2024, Klara Inc.
|
||||
* Copyright (c) 2024, Syneto
|
||||
*/
|
||||
|
||||
#ifndef _SPL_TASKQ_H
|
||||
#define _SPL_TASKQ_H
|
||||
|
@ -33,6 +37,9 @@
|
|||
#include <sys/thread.h>
|
||||
#include <sys/rwlock.h>
|
||||
#include <sys/wait.h>
|
||||
#include <sys/wmsum.h>
|
||||
|
||||
typedef struct kstat_s kstat_t;
|
||||
|
||||
#define TASKQ_NAMELEN 31
|
||||
|
||||
|
@ -74,6 +81,32 @@ typedef enum tq_lock_role {
|
|||
typedef unsigned long taskqid_t;
|
||||
typedef void (task_func_t)(void *);
|
||||
|
||||
typedef struct taskq_sums {
|
||||
/* gauges (inc/dec counters, current value) */
|
||||
wmsum_t tqs_threads_active; /* threads running a task */
|
||||
wmsum_t tqs_threads_idle; /* threads waiting for work */
|
||||
wmsum_t tqs_threads_total; /* total threads */
|
||||
wmsum_t tqs_tasks_pending; /* tasks waiting to execute */
|
||||
wmsum_t tqs_tasks_priority; /* hi-pri tasks waiting */
|
||||
wmsum_t tqs_tasks_total; /* total waiting tasks */
|
||||
wmsum_t tqs_tasks_delayed; /* tasks deferred to future */
|
||||
wmsum_t tqs_entries_free; /* task entries on free list */
|
||||
|
||||
/* counters (inc only, since taskq creation) */
|
||||
wmsum_t tqs_threads_created; /* threads created */
|
||||
wmsum_t tqs_threads_destroyed; /* threads destroyed */
|
||||
wmsum_t tqs_tasks_dispatched; /* tasks dispatched */
|
||||
wmsum_t tqs_tasks_dispatched_delayed; /* tasks delayed to future */
|
||||
wmsum_t tqs_tasks_executed_normal; /* normal pri tasks executed */
|
||||
wmsum_t tqs_tasks_executed_priority; /* high pri tasks executed */
|
||||
wmsum_t tqs_tasks_executed; /* total tasks executed */
|
||||
wmsum_t tqs_tasks_delayed_requeued; /* delayed tasks requeued */
|
||||
wmsum_t tqs_tasks_cancelled; /* tasks cancelled before run */
|
||||
wmsum_t tqs_thread_wakeups; /* total thread wakeups */
|
||||
wmsum_t tqs_thread_wakeups_nowork; /* thread woken but no tasks */
|
||||
wmsum_t tqs_thread_sleeps; /* total thread sleeps */
|
||||
} taskq_sums_t;
|
||||
|
||||
typedef struct taskq {
|
||||
spinlock_t tq_lock; /* protects taskq_t */
|
||||
char *tq_name; /* taskq name */
|
||||
|
@@ -105,6 +138,8 @@ typedef struct taskq {
    struct hlist_node tq_hp_cb_node;
    boolean_t tq_hp_support;
    unsigned long lastspawnstop;	/* when to purge dynamic */
    taskq_sums_t tq_sums;
    kstat_t *tq_ksp;
} taskq_t;

typedef struct taskq_ent {
@@ -123,6 +158,13 @@ typedef struct taskq_ent {
#define TQENT_FLAG_PREALLOC	0x1
#define TQENT_FLAG_CANCEL	0x2

/* bits 2-3 are which list tqent is on */
#define TQENT_LIST_NONE		0x0
#define TQENT_LIST_PENDING	0x4
#define TQENT_LIST_PRIORITY	0x8
#define TQENT_LIST_DELAY	0xc
#define TQENT_LIST_MASK		0xc

typedef struct taskq_thread {
    struct list_head tqt_thread_list;
    struct list_head tqt_active_list;
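Keeping the list identity in bits 2-3 of the entry's flag word lets membership be tested and changed without a separate field. A hedged sketch of the accessors this layout implies (the helper names are hypothetical; the real code may manipulate the bits inline):

/* Illustrative helpers for the TQENT_LIST_* bits; names are hypothetical. */
static inline unsigned long
tqent_list_get(unsigned long flags)
{
    return (flags & TQENT_LIST_MASK);
}

static inline unsigned long
tqent_list_set(unsigned long flags, unsigned long list)
{
    return ((flags & ~TQENT_LIST_MASK) | list);
}
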
@@ -0,0 +1,41 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

#ifndef _ABD_IMPL_OS_H
#define _ABD_IMPL_OS_H

#ifdef __cplusplus
extern "C" {
#endif

#define abd_enter_critical(flags)	local_irq_save(flags)
#define abd_exit_critical(flags)	local_irq_restore(flags)

#ifdef __cplusplus
}
#endif

#endif /* _ABD_IMPL_OS_H */
@@ -0,0 +1,62 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
 */

#ifndef _ABD_OS_H
#define _ABD_OS_H

#ifdef __cplusplus
extern "C" {
#endif

struct abd_scatter {
    uint_t abd_offset;
    uint_t abd_nents;
    struct scatterlist *abd_sgl;
};

struct abd_linear {
    void *abd_buf;
    struct scatterlist *abd_sgl;	/* for LINEAR_PAGE */
};

typedef struct abd abd_t;

typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
    void *);

/*
 * Linux ABD bio functions
 * Note: these are only needed to support vdev_classic. See comment in
 * vdev_disk.c.
 */
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);

#ifdef __cplusplus
}
#endif

#endif /* _ABD_OS_H */
@@ -30,6 +30,7 @@
#include <sys/debug.h>
#include <sys/zfs_refcount.h>
#include <sys/uio.h>
#include <sys/abd_os.h>

#ifdef __cplusplus
extern "C" {

@@ -44,8 +45,7 @@ typedef enum abd_flags {
    ABD_FLAG_LINEAR_PAGE	= 1 << 5, /* linear but allocd from page */
    ABD_FLAG_GANG		= 1 << 6, /* mult ABDs chained together */
    ABD_FLAG_GANG_FREE		= 1 << 7, /* gang ABD is responsible for mem */
    ABD_FLAG_ZEROS		= 1 << 8, /* ABD for zero-filled buffer */
    ABD_FLAG_ALLOCD		= 1 << 9, /* we allocated the abd_t */
    ABD_FLAG_ALLOCD		= 1 << 8, /* we allocated the abd_t */
} abd_flags_t;

typedef struct abd {

@@ -58,19 +58,8 @@ typedef struct abd {
#endif
    kmutex_t abd_mtx;
    union {
        struct abd_scatter {
            uint_t abd_offset;
#if defined(__FreeBSD__) && defined(_KERNEL)
            void *abd_chunks[1]; /* actually variable-length */
#else
            uint_t abd_nents;
            struct scatterlist *abd_sgl;
#endif
        } abd_scatter;
        struct abd_linear {
            void *abd_buf;
            struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
        } abd_linear;
        struct abd_scatter abd_scatter;
        struct abd_linear abd_linear;
        struct abd_gang {
            list_t abd_gang_chain;
        } abd_gang;

@@ -79,9 +68,6 @@ typedef struct abd {

typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
#if defined(__linux__) && defined(_KERNEL)
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
#endif

extern int zfs_abd_scatter_enabled;

@@ -107,6 +93,7 @@ abd_t *abd_get_offset_size(abd_t *, size_t, size_t);
abd_t *abd_get_offset_struct(abd_t *, abd_t *, size_t, size_t);
abd_t *abd_get_zeros(size_t);
abd_t *abd_get_from_buf(void *, size_t);
abd_t *abd_get_from_buf_struct(abd_t *, void *, size_t);
void abd_cache_reap_now(void);

/*

@@ -128,10 +115,6 @@ void abd_release_ownership_of_buf(abd_t *);
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
    abd_iter_func2_t *, void *);
#if defined(__linux__) && defined(_KERNEL)
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
    void *);
#endif
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);

@@ -225,16 +208,6 @@ abd_get_size(abd_t *abd)
void abd_init(void);
void abd_fini(void);

/*
 * Linux ABD bio functions
 * Note: these are only needed to support vdev_classic. See comment in
 * vdev_disk.c.
 */
#if defined(__linux__) && defined(_KERNEL)
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
#endif

#ifdef __cplusplus
}
#endif

@@ -28,6 +28,7 @@
#define _ABD_IMPL_H

#include <sys/abd.h>
#include <sys/abd_impl_os.h>
#include <sys/wmsum.h>

#ifdef __cplusplus

@@ -111,19 +112,6 @@ void abd_iter_page(struct abd_iter *);
#define ABD_LINEAR_BUF(abd)	(abd->abd_u.abd_linear.abd_buf)
#define ABD_GANG(abd)		(abd->abd_u.abd_gang)

#if defined(_KERNEL)
#if defined(__FreeBSD__)
#define abd_enter_critical(flags)	critical_enter()
#define abd_exit_critical(flags)	critical_exit()
#else
#define abd_enter_critical(flags)	local_irq_save(flags)
#define abd_exit_critical(flags)	local_irq_restore(flags)
#endif
#else /* !_KERNEL */
#define abd_enter_critical(flags)	((void)0)
#define abd_exit_critical(flags)	((void)0)
#endif

#ifdef __cplusplus
}
#endif

@@ -39,6 +39,13 @@ extern "C" {

struct abd;

/*
 * DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
 */
#define DDT_FLAG_FLAT	(1 << 0)	/* single extensible phys */
#define DDT_FLAG_LOG	(1 << 1)	/* dedup log (journal) */
#define DDT_FLAG_MASK	(DDT_FLAG_FLAT|DDT_FLAG_LOG)

/*
 * DDT on-disk storage object types. Each one corresponds to a specific
 * implementation, see ddt_ops_t. The value itself is not stored on disk.

@@ -120,30 +127,80 @@ typedef struct {
 * characteristics of the stored block, such as its location on disk (DVAs),
 * birth txg and ref count.
 *
 * Note that an entry has an array of four ddt_phys_t, one for each number of
 * DVAs (copies= property) and another for additional "ditto" copies. Most
 * users of ddt_phys_t will handle indexing into or counting the phys they
 * want.
 * The "traditional" entry has an array of four, one for each number of DVAs
 * (copies= property) and another for additional "ditto" copies. Users of the
 * traditional struct will specify the variant (index) of the one they want.
 *
 * The newer "flat" entry has only a single form that is specified using the
 * DDT_PHYS_FLAT variant.
 *
 * Since the value size varies, use one of the size macros when interfacing
 * with the ddt zap.
 */
typedef struct {
    dva_t ddp_dva[SPA_DVAS_PER_BP];
    uint64_t ddp_refcnt;
    uint64_t ddp_phys_birth;
} ddt_phys_t;

#define DDT_PHYS_MAX	(4)

/*
 * Named indexes into the ddt_phys_t array in each entry.
 * Note - this can be used in a flexible array and allocated for
 * a specific size (ddp_trad or ddp_flat). So be careful not to
 * copy using "=" assignment but instead use ddt_phys_copy().
 */
typedef union {
    /*
     * Traditional physical payload value for DDT zap (256 bytes)
     */
    struct {
        dva_t ddp_dva[SPA_DVAS_PER_BP];
        uint64_t ddp_refcnt;
        uint64_t ddp_phys_birth;
    } ddp_trad[DDT_PHYS_MAX];

    /*
     * Flat physical payload value for DDT zap (72 bytes)
     */
    struct {
        dva_t ddp_dva[SPA_DVAS_PER_BP];
        uint64_t ddp_refcnt;
        uint64_t ddp_phys_birth;	/* txg based from BP */
        uint64_t ddp_class_start;	/* in realtime seconds */
    } ddp_flat;
} ddt_univ_phys_t;

/*
 * This enum denotes which variant of a ddt_univ_phys_t to target. For
 * a traditional DDT entry, it represents the indexes into the ddp_trad
 * array. Any consumer of a ddt_univ_phys_t needs to know which variant
 * is being targeted.
 *
 * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However,
 * we maintain the ability to free existing dedup-ditto blocks.
 */
enum ddt_phys_type {

typedef enum {
    DDT_PHYS_DITTO = 0,
    DDT_PHYS_SINGLE = 1,
    DDT_PHYS_DOUBLE = 2,
    DDT_PHYS_TRIPLE = 3,
    DDT_PHYS_TYPES
};
    DDT_PHYS_FLAT = 4,
    DDT_PHYS_NONE = 5
} ddt_phys_variant_t;

#define DDT_PHYS_VARIANT(ddt, p)	\
    (ASSERT((p) < DDT_PHYS_NONE),	\
    ((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p)))

#define DDT_TRAD_PHYS_SIZE	sizeof (((ddt_univ_phys_t *)0)->ddp_trad)
#define DDT_FLAT_PHYS_SIZE	sizeof (((ddt_univ_phys_t *)0)->ddp_flat)

#define _DDT_PHYS_SWITCH(ddt, flat, trad)	\
    (((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad))

#define DDT_PHYS_SIZE(ddt)	_DDT_PHYS_SWITCH(ddt,	\
    DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE)

#define DDT_NPHYS(ddt)			_DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX)
#define DDT_PHYS_FOR_COPIES(ddt, p)	_DDT_PHYS_SWITCH(ddt, 0, p)
#define DDT_PHYS_IS_DITTO(ddt, p)	_DDT_PHYS_SWITCH(ddt, 0, (p == 0))

/*
 * A "live" entry, holding changes to an entry made this txg, and other data to
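The warning against "=" assignment exists because a ddt_univ_phys_t frequently lives as a flexible array member allocated for only one variant, so copying the whole union would read or write past the allocation. A hedged sketch of the intended pattern, sizing with DDT_PHYS_SIZE() and copying through ddt_phys_copy(); the allocation shown is illustrative, not the actual ddt.c code, and src_phys stands for any existing ddt_univ_phys_t pointer:

/* Illustrative only: a variant-sized phys tail, copied safely. */
ddt_entry_t *dde = kmem_zalloc(sizeof (ddt_entry_t) +
    DDT_PHYS_SIZE(ddt), KM_SLEEP);

ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
ddt_phys_copy(dde->dde_phys, src_phys, v);	/* never "=" on the union */
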
@@ -153,17 +210,27 @@ enum ddt_phys_type {
/* State flags for dde_flags */
#define DDE_FLAG_LOADED		(1 << 0)	/* entry ready for use */
#define DDE_FLAG_OVERQUOTA	(1 << 1)	/* entry unusable, no space */
#define DDE_FLAG_LOGGED		(1 << 2)	/* loaded from log */

/*
 * Additional data to support entry update or repair. This is fixed size
 * because it's relatively rarely used.
 */
typedef struct {
    /* copy of data after a repair read, to be rewritten */
    abd_t *dde_repair_abd;

    /* original phys contents before update, for error handling */
    ddt_univ_phys_t dde_orig_phys;

    /* in-flight update IOs */
    zio_t *dde_lead_zio[DDT_PHYS_MAX];
} ddt_entry_io_t;

typedef struct {
    /* key must be first for ddt_key_compare */
    ddt_key_t dde_key;				/* ddt_tree key */
    ddt_phys_t dde_phys[DDT_PHYS_TYPES];	/* on-disk data */

    /* in-flight update IOs */
    zio_t *dde_lead_zio[DDT_PHYS_TYPES];

    /* copy of data after a repair read, to be rewritten */
    struct abd *dde_repair_abd;
    ddt_key_t dde_key;				/* ddt_tree key */
    avl_node_t dde_node;			/* ddt_tree_node */

    /* storage type and class the entry was loaded from */
    ddt_type_t dde_type;
@@ -173,9 +240,35 @@ typedef struct {
    kcondvar_t dde_cv;		/* signaled when load completes */
    uint64_t dde_waiters;	/* count of waiters on dde_cv */

    avl_node_t dde_node;	/* ddt_tree node */
    ddt_entry_io_t *dde_io;	/* IO support, when required */

    ddt_univ_phys_t dde_phys[];	/* flexible -- allocated size varies */
} ddt_entry_t;

/*
 * A lightweight entry is for short-lived or transient uses, like iterating or
 * inspecting, when you don't care where it came from.
 */
typedef struct {
    ddt_key_t ddlwe_key;
    ddt_type_t ddlwe_type;
    ddt_class_t ddlwe_class;
    ddt_univ_phys_t ddlwe_phys;
} ddt_lightweight_entry_t;

/*
 * In-core DDT log. A separate struct to make it easier to switch between the
 * appending and flushing logs.
 */
typedef struct {
    avl_tree_t ddl_tree;	/* logged entries */
    uint32_t ddl_flags;		/* flags for this log */
    uint64_t ddl_object;	/* log object id */
    uint64_t ddl_length;	/* on-disk log size */
    uint64_t ddl_first_txg;	/* txg log became active */
    ddt_key_t ddl_checkpoint;	/* last checkpoint */
} ddt_log_t;

/*
 * In-core DDT object. This covers all entries and stats for the whole pool
 * for a given checksum type.
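Because a lightweight entry is a plain fixed-size value (its phys member is the full union rather than a flexible tail), it can live on the stack, which is exactly what iteration wants. A sketch of the walk pattern implied by the ddt_walk() prototype declared further down (error handling elided, a spa_t pointer assumed in scope):

/* Illustrative: visit every dedup entry with stack-allocated state. */
ddt_bookmark_t ddb = { 0 };
ddt_lightweight_entry_t ddlwe;

while (ddt_walk(spa, &ddb, &ddlwe) == 0) {
    /* inspect ddlwe.ddlwe_key, ddlwe.ddlwe_phys, ... */
}
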
@@ -184,23 +277,49 @@ typedef struct {
    kmutex_t ddt_lock;			/* protects changes to all fields */

    avl_tree_t ddt_tree;		/* "live" (changed) entries this txg */
    avl_tree_t ddt_log_tree;		/* logged entries */

    avl_tree_t ddt_repair_tree;		/* entries being repaired */

    enum zio_checksum ddt_checksum;	/* checksum algorithm in use */
    spa_t *ddt_spa;			/* pool this ddt is on */
    objset_t *ddt_os;			/* ddt objset (always MOS) */
    ddt_log_t ddt_log[2];		/* active/flushing logs */
    ddt_log_t *ddt_log_active;		/* pointers into ddt_log */
    ddt_log_t *ddt_log_flushing;	/* swapped when flush starts */

    hrtime_t ddt_flush_start;		/* log flush start this txg */
    uint32_t ddt_flush_pass;		/* log flush pass this txg */

    int32_t ddt_flush_count;		/* entries flushed this txg */
    int32_t ddt_flush_min;		/* min rem entries to flush */
    int32_t ddt_log_ingest_rate;	/* rolling log ingest rate */
    int32_t ddt_log_flush_rate;		/* rolling log flush rate */
    int32_t ddt_log_flush_time_rate;	/* avg time spent flushing */

    uint64_t ddt_flush_force_txg;	/* flush hard before this txg */

    kstat_t *ddt_ksp;			/* kstats context */

    enum zio_checksum ddt_checksum;	/* checksum algorithm in use */
    spa_t *ddt_spa;			/* pool this ddt is on */
    objset_t *ddt_os;			/* ddt objset (always MOS) */

    uint64_t ddt_dir_object;		/* MOS dir holding ddt objects */
    uint64_t ddt_version;		/* DDT version */
    uint64_t ddt_flags;			/* FDT option flags */

    /* per-type/per-class entry store objects */
    uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];

    /* object ids for whole-ddt and per-type/per-class stats */
    /* object ids for stored, logged and per-type/per-class stats */
    uint64_t ddt_stat_object;
    ddt_object_t ddt_log_stats;
    ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];

    /* type/class stats by power-2-sized referenced blocks */
    ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
    ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];

    /* log stats power-2-sized referenced blocks */
    ddt_histogram_t ddt_log_histogram;
} ddt_t;

/*
@@ -215,20 +334,36 @@ typedef struct {
    uint64_t ddb_cursor;
} ddt_bookmark_t;

extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
    uint64_t txg);
extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
    blkptr_t *bp, uint64_t txg);
extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
    const ddt_phys_t *ddp, blkptr_t *bp);
    const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp);

extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
extern void ddt_phys_clear(ddt_phys_t *ddp);
extern void ddt_phys_addref(ddt_phys_t *ddp);
extern void ddt_phys_decref(ddt_phys_t *ddp);
extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
    const blkptr_t *bp);
extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
    ddt_phys_variant_t v);
extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp,
    ddt_phys_variant_t v);
extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
    const ddt_entry_t *dde, const blkptr_t *bp);
extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
    ddt_phys_variant_t v);
extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
    boolean_t encrypted);

extern void ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh,
    const ddt_lightweight_entry_t *ddlwe);
extern void ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh,
    const ddt_lightweight_entry_t *ddlwe);

extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
extern void ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh);
extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);

extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
extern uint64_t ddt_get_ddt_dsize(spa_t *spa);
extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
@@ -243,7 +378,7 @@ extern void ddt_enter(ddt_t *ddt);
extern void ddt_exit(ddt_t *ddt);
extern void ddt_init(void);
extern void ddt_fini(void);
extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp);
extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
extern void ddt_prefetch_all(spa_t *spa);

@@ -251,6 +386,8 @@ extern void ddt_prefetch_all(spa_t *spa);
extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class,
    const blkptr_t *bp);

extern void ddt_alloc_entry_io(ddt_entry_t *dde);

extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);

@@ -260,7 +397,11 @@ extern void ddt_create(spa_t *spa);
extern int ddt_load(spa_t *spa);
extern void ddt_unload(spa_t *spa);
extern void ddt_sync(spa_t *spa, uint64_t txg);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);

extern void ddt_walk_init(spa_t *spa, uint64_t txg);
extern boolean_t ddt_walk_ready(spa_t *spa);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
    ddt_lightweight_entry_t *ddlwe);

extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);

@@ -28,11 +28,129 @@
#define _SYS_DDT_IMPL_H

#include <sys/ddt.h>
#include <sys/bitops.h>

#ifdef __cplusplus
extern "C" {
#endif

/* DDT version numbers */
#define DDT_VERSION_LEGACY	(0)
#define DDT_VERSION_FDT		(1)

/* Names of interesting objects in the DDT root dir */
#define DDT_DIR_VERSION		"version"
#define DDT_DIR_FLAGS		"flags"

/* Fill a lightweight entry from a live entry. */
#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do {			\
	memset((ddlwe), 0, sizeof (*ddlwe));				\
	(ddlwe)->ddlwe_key = (dde)->dde_key;				\
	(ddlwe)->ddlwe_type = (dde)->dde_type;				\
	(ddlwe)->ddlwe_class = (dde)->dde_class;			\
	memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \
} while (0)

#define DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do {		\
	memset((ddlwe), 0, sizeof (*ddlwe));				\
	(ddlwe)->ddlwe_key = (ddle)->ddle_key;				\
	(ddlwe)->ddlwe_type = (ddle)->ddle_type;			\
	(ddlwe)->ddlwe_class = (ddle)->ddle_class;			\
	memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \
} while (0)

/*
 * An entry on the log tree. These are "frozen", and a record of what's in
 * the on-disk log. They can't be used in place, but can be "loaded" back into
 * the live tree.
 */
typedef struct {
    ddt_key_t ddle_key;		/* ddt_log_tree key */
    avl_node_t ddle_node;	/* ddt_log_tree node */

    ddt_type_t ddle_type;	/* storage type */
    ddt_class_t ddle_class;	/* storage class */

    /* extra allocation for flat/trad phys */
    ddt_univ_phys_t ddle_phys[];
} ddt_log_entry_t;

/* On-disk log record types. */
typedef enum {
    DLR_INVALID = 0,	/* end of block marker */
    DLR_ENTRY = 1,	/* an entry to add or replace in the log tree */
} ddt_log_record_type_t;

/* On-disk log record header. */
typedef struct {
    /*
     * dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to
     * access it.
     *
     * bits 0-7: record type (ddt_log_record_type_t)
     * bits 8-15: length of record header+payload
     * bits 16-47: reserved, all zero
     * bits 48-55: if type==DLR_ENTRY, storage type (ddt_type)
     *             otherwise all zero
     * bits 56-63: if type==DLR_ENTRY, storage class (ddt_class)
     *             otherwise all zero
     */
    uint64_t dlr_info;
    uint8_t dlr_payload[];
} ddt_log_record_t;

#define DLR_GET_TYPE(dlr)		BF64_GET((dlr)->dlr_info, 0, 8)
#define DLR_SET_TYPE(dlr, v)		BF64_SET((dlr)->dlr_info, 0, 8, v)
#define DLR_GET_RECLEN(dlr)		BF64_GET((dlr)->dlr_info, 8, 16)
#define DLR_SET_RECLEN(dlr, v)		BF64_SET((dlr)->dlr_info, 8, 16, v)
#define DLR_GET_ENTRY_TYPE(dlr)		BF64_GET((dlr)->dlr_info, 48, 8)
#define DLR_SET_ENTRY_TYPE(dlr, v)	BF64_SET((dlr)->dlr_info, 48, 8, v)
#define DLR_GET_ENTRY_CLASS(dlr)	BF64_GET((dlr)->dlr_info, 56, 8)
#define DLR_SET_ENTRY_CLASS(dlr, v)	BF64_SET((dlr)->dlr_info, 56, 8, v)

/* Payload for DLR_ENTRY. */
typedef struct {
    ddt_key_t dlre_key;
    ddt_univ_phys_t dlre_phys[];
} ddt_log_record_entry_t;

/* Log flags (ddl_flags, dlh_flags) */
#define DDL_FLAG_FLUSHING	(1 << 0)	/* this log is being flushed */
#define DDL_FLAG_CHECKPOINT	(1 << 1)	/* header has a checkpoint */

/* On-disk log header, stored in the bonus buffer. */
typedef struct {
    /*
     * dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to
     * access it.
     *
     * bits 0-7: log version
     * bits 8-15: log flags
     * bits 16-63: reserved, all zero
     */
    uint64_t dlh_info;

    uint64_t dlh_length;	/* log size in bytes */
    uint64_t dlh_first_txg;	/* txg this log went active */
    ddt_key_t dlh_checkpoint;	/* last checkpoint */
} ddt_log_header_t;

#define DLH_GET_VERSION(dlh)	BF64_GET((dlh)->dlh_info, 0, 8)
#define DLH_SET_VERSION(dlh, v)	BF64_SET((dlh)->dlh_info, 0, 8, v)
#define DLH_GET_FLAGS(dlh)	BF64_GET((dlh)->dlh_info, 8, 8)
#define DLH_SET_FLAGS(dlh, v)	BF64_SET((dlh)->dlh_info, 8, 8, v)

/* DDT log update state */
typedef struct {
    dmu_tx_t *dlu_tx;		/* tx the update is being applied to */
    dnode_t *dlu_dn;		/* log object dnode */
    dmu_buf_t **dlu_dbp;	/* array of block buffer pointers */
    int dlu_ndbp;		/* number of block buffer pointers */
    uint16_t dlu_reclen;	/* cached length of record */
    uint64_t dlu_block;		/* block for next entry */
    uint64_t dlu_offset;	/* offset for next entry */
} ddt_log_update_t;

/*
 * Ops vector to access a specific DDT object type.
 */

@@ -42,25 +160,50 @@ typedef struct {
    boolean_t prehash);
    int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
    int (*ddt_op_lookup)(objset_t *os, uint64_t object,
        const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
        const ddt_key_t *ddk, void *phys, size_t psize);
    int (*ddt_op_contains)(objset_t *os, uint64_t object,
        const ddt_key_t *ddk);
    void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
        const ddt_key_t *ddk);
    void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object);
    int (*ddt_op_update)(objset_t *os, uint64_t object,
        const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize,
        const ddt_key_t *ddk, const void *phys, size_t psize,
        dmu_tx_t *tx);
    int (*ddt_op_remove)(objset_t *os, uint64_t object,
        const ddt_key_t *ddk, dmu_tx_t *tx);
    int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk,
        ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
        ddt_key_t *ddk, void *phys, size_t psize);
    int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
} ddt_ops_t;

extern const ddt_ops_t ddt_zap_ops;

extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg);
/* Dedup log API */
extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx,
    ddt_log_update_t *dlu);
extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde,
    ddt_log_update_t *dlu);
extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu);

extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl,
    ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl,
    const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe);

extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
    dmu_tx_t *tx);
extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx);

extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx);

extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx);

extern int ddt_log_load(ddt_t *ddt);
extern void ddt_log_alloc(ddt_t *ddt);
extern void ddt_log_free(ddt_t *ddt);

extern void ddt_log_init(void);
extern void ddt_log_fini(void);

/*
 * These are only exposed so that zdb can access them. Try not to use them

@@ -74,16 +217,15 @@ extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg);
 */
#define DDT_NAMELEN	32

extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);
extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt,
    const ddt_univ_phys_t *ddp);

extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);

extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);

extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
    char *name);
extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
    uint64_t *walk, ddt_entry_t *dde);
    uint64_t *walk, ddt_lightweight_entry_t *ddlwe);
extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
    uint64_t *count);
extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
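The packed dlr_info and dlh_info words above are driven entirely through the BF64_GET/BF64_SET bitfield macros, so writing a record header is mechanical. A hedged sketch of packing and unpacking a DLR_ENTRY header; the buffer, the payload size, and the DDT_TYPE_DEFAULT/DDT_CLASS_UNIQUE enum values are assumptions made for illustration:

/* Illustrative: pack a DLR_ENTRY record header, then read it back. */
uint64_t buf[32] = { 0 };
ddt_log_record_t *dlr = (ddt_log_record_t *)buf;
size_t payload = sizeof (ddt_log_record_entry_t);	/* assumed payload */

DLR_SET_TYPE(dlr, DLR_ENTRY);
DLR_SET_RECLEN(dlr, sizeof (ddt_log_record_t) + payload);
DLR_SET_ENTRY_TYPE(dlr, DDT_TYPE_DEFAULT);	/* assumed enum value */
DLR_SET_ENTRY_CLASS(dlr, DDT_CLASS_UNIQUE);	/* assumed enum value */

ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);
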
@@ -375,7 +375,9 @@ typedef struct dmu_buf {
#define DMU_POOL_L2CACHE		"l2cache"
#define DMU_POOL_TMP_USERREFS		"tmp_userrefs"
#define DMU_POOL_DDT			"DDT-%s-%s-%s"
#define DMU_POOL_DDT_LOG		"DDT-log-%s-%u"
#define DMU_POOL_DDT_STATS		"DDT-statistics"
#define DMU_POOL_DDT_DIR		"DDT-%s"
#define DMU_POOL_CREATION_VERSION	"creation_version"
#define DMU_POOL_SCAN			"scan"
#define DMU_POOL_ERRORSCRUB		"error_scrub"

@@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
    ddt_entry_t *dde, dmu_tx_t *tx);
    ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,

@@ -1710,6 +1710,11 @@ typedef enum {
#define ZPOOL_INITIALIZE_COMMAND	"initialize_command"
#define ZPOOL_INITIALIZE_VDEVS		"initialize_vdevs"

/*
 * The following are names used when invoking ZFS_IOC_POOL_REGUID.
 */
#define ZPOOL_REGUID_GUID	"guid"

/*
 * The following are names used when invoking ZFS_IOC_POOL_TRIM.
 */

@@ -572,7 +572,7 @@ typedef struct blkptr {
#define BP_IS_RAIDZ(bp)		(DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
				BP_GET_PSIZE(bp))

#define BP_ZERO(bp)				\
#define BP_ZERO_DVAS(bp)			\
{						\
	(bp)->blk_dva[0].dva_word[0] = 0;	\
	(bp)->blk_dva[0].dva_word[1] = 0;	\
@@ -580,6 +580,11 @@ typedef struct blkptr {
	(bp)->blk_dva[1].dva_word[1] = 0;	\
	(bp)->blk_dva[2].dva_word[0] = 0;	\
	(bp)->blk_dva[2].dva_word[1] = 0;	\
}

#define BP_ZERO(bp)				\
{						\
	BP_ZERO_DVAS(bp);			\
	(bp)->blk_prop = 0;			\
	(bp)->blk_pad[0] = 0;			\
	(bp)->blk_pad[1] = 0;			\
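Splitting the macro gives callers a way to drop a block pointer's addresses while leaving the rest of it intact: BP_ZERO_DVAS clears only the three DVA words, where BP_ZERO goes on to clear the properties, padding, and the rest of the structure too. A hedged sketch of the distinction:

/* Illustrative: what survives each macro. */
blkptr_t bp;

BP_ZERO(&bp);		/* fully cleared: DVAs, blk_prop, pads, etc. */
/* ... populate bp ... */
BP_ZERO_DVAS(&bp);	/* only dva_word[0..1] of all three DVAs cleared;
			   blk_prop and the remaining fields are preserved */
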
@@ -1087,7 +1092,7 @@ extern void spa_strfree(char *);
extern uint64_t spa_generate_guid(spa_t *spa);
extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
extern void spa_freeze(spa_t *spa);
extern int spa_change_guid(spa_t *spa);
extern int spa_change_guid(spa_t *spa, const uint64_t *guidp);
extern void spa_upgrade(spa_t *spa, uint64_t version);
extern void spa_evict_all(void);
extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,

@@ -22,7 +22,7 @@
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2019, Allan Jude
 * Copyright (c) 2019, Klara Inc.
 * Copyright (c) 2019, 2024, Klara, Inc.
 * Use is subject to license terms.
 * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
 */
@@ -122,25 +122,15 @@ enum zio_zstd_levels {
struct zio_prop;

/* Common signature for all zio compress functions. */
typedef size_t zio_compress_func_t(void *src, void *dst,
typedef size_t zio_compress_func_t(abd_t *src, abd_t *dst,
    size_t s_len, size_t d_len, int);
/* Common signature for all zio decompress functions. */
typedef int zio_decompress_func_t(void *src, void *dst,
typedef int zio_decompress_func_t(abd_t *src, abd_t *dst,
    size_t s_len, size_t d_len, int);
/* Common signature for all zio decompress and get level functions. */
typedef int zio_decompresslevel_func_t(void *src, void *dst,
typedef int zio_decompresslevel_func_t(abd_t *src, abd_t *dst,
    size_t s_len, size_t d_len, uint8_t *level);
/* Common signature for all zio get-compression-level functions. */
typedef int zio_getlevel_func_t(void *src, size_t s_len, uint8_t *level);

/*
 * Common signature for all zio decompress functions using an ABD as input.
 * This is helpful if you have both compressed ARC and scatter ABDs enabled,
 * but is not a requirement for all compression algorithms.
 */
typedef int zio_decompress_abd_func_t(abd_t *src, void *dst,
    size_t s_len, size_t d_len, int);
/*
 * Information about each compression function.
 */
@@ -163,34 +153,66 @@ extern void lz4_fini(void);
/*
 * Compression routines.
 */
extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
    int level);
extern size_t zfs_lzjb_compress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
extern int zfs_lzjb_decompress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
extern size_t zfs_gzip_compress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
extern int zfs_gzip_decompress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
extern size_t zfs_zle_compress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
extern int zfs_zle_decompress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
extern size_t zfs_lz4_compress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
extern int zfs_lz4_decompress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);

/*
 * Compress and decompress data if necessary.
 */
extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void **dst,
extern size_t zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst,
    size_t s_len, uint8_t level);
extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
    size_t s_len, size_t d_len, uint8_t *level);
extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
extern int zio_decompress_data(enum zio_compress c, abd_t *src, abd_t *abd,
    size_t s_len, size_t d_len, uint8_t *level);
extern int zio_compress_to_feature(enum zio_compress comp);

#define ZFS_COMPRESS_WRAP_DECL(name)					\
size_t									\
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n)		\
{									\
	void *s_buf = abd_borrow_buf_copy(src, s_len);			\
	void *d_buf = abd_borrow_buf(dst, d_len);			\
	size_t c_len = name##_buf(s_buf, d_buf, s_len, d_len, n);	\
	abd_return_buf(src, s_buf, s_len);				\
	abd_return_buf_copy(dst, d_buf, d_len);				\
	return (c_len);							\
}
#define ZFS_DECOMPRESS_WRAP_DECL(name)					\
int									\
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n)		\
{									\
	void *s_buf = abd_borrow_buf_copy(src, s_len);			\
	void *d_buf = abd_borrow_buf(dst, d_len);			\
	int err = name##_buf(s_buf, d_buf, s_len, d_len, n);		\
	abd_return_buf(src, s_buf, s_len);				\
	abd_return_buf_copy(dst, d_buf, d_len);				\
	return (err);							\
}
#define ZFS_DECOMPRESS_LEVEL_WRAP_DECL(name)				\
int									\
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, uint8_t *n)	\
{									\
	void *s_buf = abd_borrow_buf_copy(src, s_len);			\
	void *d_buf = abd_borrow_buf(dst, d_len);			\
	int err = name##_buf(s_buf, d_buf, s_len, d_len, n);		\
	abd_return_buf(src, s_buf, s_len);				\
	abd_return_buf_copy(dst, d_buf, d_len);				\
	return (err);							\
}

#ifdef __cplusplus
}
#endif

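Each wrapper macro above bridges a legacy buffer-based codec to the new ABD-typed signature: borrow linear views of the source and destination ABDs, call the name##_buf variant, and return the buffers (copying out only on the destination side). Assuming the name##_buf counterparts exist as the macros' naming convention implies, a codec's .c file would declare its entry points like this (hedged sketch):

/*
 * Illustrative: expands to zfs_zle_compress()/zfs_zle_decompress()
 * definitions that wrap zfs_zle_compress_buf()/zfs_zle_decompress_buf().
 */
ZFS_COMPRESS_WRAP_DECL(zfs_zle_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zle_decompress)
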
@@ -90,14 +90,12 @@ typedef struct zfs_zstd_meta {
int zstd_init(void);
void zstd_fini(void);

size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level);
size_t zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len,
size_t zfs_zstd_compress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int level);
int zfs_zstd_get_level(void *s_start, size_t s_len, uint8_t *level);
int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
int zfs_zstd_decompress_level(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, uint8_t *level);
int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
int zfs_zstd_decompress(abd_t *src, abd_t *dst, size_t s_len,
    size_t d_len, int n);
void zfs_zstd_cache_reap_now(void);

@@ -82,6 +82,7 @@ typedef enum spa_feature {
    SPA_FEATURE_AVZ_V2,
    SPA_FEATURE_REDACTION_LIST_SPILL,
    SPA_FEATURE_RAIDZ_EXPANSION,
    SPA_FEATURE_FAST_DEDUP,
    SPA_FEATURES
} spa_feature_t;

@@ -556,6 +556,7 @@
<elf-symbol name='zpool_scan' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_search_import' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_bootenv' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_guid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_prop' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_vdev_prop' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_skip_pool' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>

@@ -616,7 +617,7 @@
<elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2296' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2352' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>

@@ -6006,7 +6007,8 @@
<enumerator name='SPA_FEATURE_AVZ_V2' value='38'/>
<enumerator name='SPA_FEATURE_REDACTION_LIST_SPILL' value='39'/>
<enumerator name='SPA_FEATURE_RAIDZ_EXPANSION' value='40'/>
<enumerator name='SPA_FEATURES' value='41'/>
<enumerator name='SPA_FEATURE_FAST_DEDUP' value='41'/>
<enumerator name='SPA_FEATURES' value='42'/>
</enum-decl>
<typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
<qualified-type-def type-id='80f4b756' const='yes' id='b99c00c9'/>

@@ -6638,6 +6640,11 @@
<parameter type-id='9c313c2d' name='guid'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_set_guid' mangled-name='zpool_set_guid' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_set_guid'>
<parameter type-id='4c81de99' name='zhp'/>
<parameter type-id='713a56f5' name='guid'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_reguid' mangled-name='zpool_reguid' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_reguid'>
<parameter type-id='4c81de99' name='zhp'/>
<return type-id='95e97e5e'/>

@@ -9131,8 +9138,8 @@
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18368' id='b93e4d14'>
<subrange length='41' type-id='7359adad' id='cb834f44'/>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18816' id='b937914f'>
<subrange length='42' type-id='7359adad' id='cb7c937f'/>
</array-type-def>
<enum-decl name='zfeature_flags' id='6db816a4'>
<underlying-type type-id='9cac1fee'/>

@@ -9209,7 +9216,7 @@
<pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/>
<qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/>
<pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/>
<var-decl name='spa_feature_table' type-id='b93e4d14' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='spa_feature_table' type-id='b937914f' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/>
<function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>

@@ -3735,6 +3735,13 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk,
            (void) zpool_standard_error(hdl, errno, errbuf);
        }
        break;

    case ZFS_ERR_ASHIFT_MISMATCH:
        zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
            "The new device cannot have a higher alignment requirement "
            "than the top-level vdev."));
        (void) zfs_error(hdl, EZFS_BADTARGET, errbuf);
        break;
    default:
        (void) zpool_standard_error(hdl, errno, errbuf);
    }
@@ -4305,22 +4312,55 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)

/*
 * Change the GUID for a pool.
 *
 * Similar to zpool_reguid(), but may take a GUID.
 *
 * If the guid argument is NULL, then no GUID is passed in the nvlist to the
 * ioctl().
 */
int
zpool_reguid(zpool_handle_t *zhp)
zpool_set_guid(zpool_handle_t *zhp, const uint64_t *guid)
{
    char errbuf[ERRBUFLEN];
    libzfs_handle_t *hdl = zhp->zpool_hdl;
    nvlist_t *nvl = NULL;
    zfs_cmd_t zc = {"\0"};
    int error = -1;

    if (guid != NULL) {
        if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
            return (no_memory(hdl));

        if (nvlist_add_uint64(nvl, ZPOOL_REGUID_GUID, *guid) != 0) {
            nvlist_free(nvl);
            return (no_memory(hdl));
        }

        zcmd_write_src_nvlist(hdl, &zc, nvl);
    }

    (void) snprintf(errbuf, sizeof (errbuf),
        dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name);

    (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
    if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0)
        return (0);
    error = zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc);
    if (error) {
        return (zpool_standard_error(hdl, errno, errbuf));
    }
    if (guid != NULL) {
        zcmd_free_nvlists(&zc);
        nvlist_free(nvl);
    }
    return (0);
}

    return (zpool_standard_error(hdl, errno, errbuf));
/*
 * Change the GUID for a pool.
 */
int
zpool_reguid(zpool_handle_t *zhp)
{
    return (zpool_set_guid(zhp, NULL));
}

/*
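The refactor above leaves zpool_reguid() as a thin shim over zpool_set_guid(): a NULL guid asks the kernel to generate one, while a non-NULL pointer travels to the ioctl via the ZPOOL_REGUID_GUID nvlist entry. A hedged usage sketch (zhp setup assumed, value illustrative):

/* Illustrative: explicit pool GUID vs. kernel-generated. */
uint64_t guid = 0x1234567890abcdefULL;

if (zpool_set_guid(zhp, &guid) != 0)	/* caller-chosen GUID */
    return (1);

if (zpool_reguid(zhp) != 0)		/* random, kernel-generated GUID */
    return (1);
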
@@ -1,7 +1,9 @@
include $(srcdir)/%D%/include/Makefile.am

libzpool_la_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) $(LIBRARY_CFLAGS)
libzpool_la_CFLAGS += $(ZLIB_CFLAGS)

libzpool_la_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
libzpool_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
libzpool_la_CPPFLAGS += -I$(srcdir)/include/os/@ac_system_l@/zfs
libzpool_la_CPPFLAGS += -DLIB_ZPOOL_BUILD

@@ -9,6 +11,7 @@ lib_LTLIBRARIES += libzpool.la
CPPCHECKTARGETS += libzpool.la

dist_libzpool_la_SOURCES = \
	%D%/abd_os.c \
	%D%/kernel.c \
	%D%/taskq.c \
	%D%/util.c

@@ -39,7 +42,6 @@ nodist_libzpool_la_SOURCES = \
	module/lua/lvm.c \
	module/lua/lzio.c \
	\
	module/os/linux/zfs/abd_os.c \
	module/os/linux/zfs/arc_os.c \
	module/os/linux/zfs/trace.c \
	module/os/linux/zfs/vdev_file.c \

@@ -79,6 +81,7 @@ nodist_libzpool_la_SOURCES = \
	module/zfs/dbuf.c \
	module/zfs/dbuf_stats.c \
	module/zfs/ddt.c \
	module/zfs/ddt_log.c \
	module/zfs/ddt_stats.c \
	module/zfs/ddt_zap.c \
	module/zfs/dmu.c \

@@ -0,0 +1,365 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>

/*
 * We're simulating scatter/gather with 4K allocations, since that's more like
 * what a typical kernel does.
 */
#define ABD_PAGESIZE	(4096)
#define ABD_PAGESHIFT	(12)
#define ABD_PAGEMASK	(ABD_PAGESIZE-1)

/*
 * See rationale in module/os/linux/zfs/abd_os.c, but in userspace this is
 * mostly useful to get a mix of linear and scatter ABDs for testing.
 */
#define ABD_SCATTER_MIN_SIZE	(512 * 3)

abd_t *abd_zero_scatter = NULL;

static uint_t
abd_iovcnt_for_bytes(size_t size)
{
    /*
     * Each iovec points to a 4K page. There's no real reason to do this
     * in userspace, but our whole point here is to make it feel a bit
     * more like a real paged memory model.
     */
    return (P2ROUNDUP(size, ABD_PAGESIZE) / ABD_PAGESIZE);
}

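Since P2ROUNDUP rounds the size up to the next ABD_PAGESIZE boundary, the iovec count is simply the byte count divided by 4K, rounded up. A few illustrative values:

/* Assuming ABD_PAGESIZE == 4096: */
abd_iovcnt_for_bytes(1);	/* -> 1, one partial page */
abd_iovcnt_for_bytes(4096);	/* -> 1, exactly one page */
abd_iovcnt_for_bytes(4097);	/* -> 2 */
abd_iovcnt_for_bytes(131072);	/* -> 32, a 128K block */
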
abd_t *
|
||||
abd_alloc_struct_impl(size_t size)
|
||||
{
|
||||
/*
|
||||
* Zero-sized means it will be used for a linear or gang abd, so just
|
||||
* allocate the abd itself and return.
|
||||
*/
|
||||
if (size == 0)
|
||||
return (umem_alloc(sizeof (abd_t), UMEM_NOFAIL));
|
||||
|
||||
/*
|
||||
* Allocating for a scatter abd, so compute how many ABD_PAGESIZE
|
||||
* iovecs we will need to hold this size. Append that allocation to the
|
||||
* end. Note that struct abd_scatter has includes abd_iov[1], so we
|
||||
* allocate one less iovec than we need.
|
||||
*
|
||||
* Note we're not allocating the pages proper, just the iovec pointers.
|
||||
* That's down in abd_alloc_chunks. We _could_ do it here in a single
|
||||
* allocation, but it's fiddly and harder to read for no real gain.
|
||||
*/
|
||||
uint_t n = abd_iovcnt_for_bytes(size);
|
||||
abd_t *abd = umem_alloc(sizeof (abd_t) + (n-1) * sizeof (struct iovec),
|
||||
UMEM_NOFAIL);
|
||||
ABD_SCATTER(abd).abd_offset = 0;
|
||||
ABD_SCATTER(abd).abd_iovcnt = n;
|
||||
return (abd);
|
||||
}
|
||||
|
||||
void
|
||||
abd_free_struct_impl(abd_t *abd)
|
||||
{
|
||||
/* For scatter, compute the extra amount we need to free */
|
||||
uint_t iovcnt =
|
||||
abd_is_linear(abd) || abd_is_gang(abd) ?
|
||||
0 : (ABD_SCATTER(abd).abd_iovcnt - 1);
|
||||
umem_free(abd, sizeof (abd_t) + iovcnt * sizeof (struct iovec));
|
||||
}
|
||||
|
||||
void
|
||||
abd_alloc_chunks(abd_t *abd, size_t size)
|
||||
{
|
||||
/*
|
||||
* We've already allocated the iovec array; ensure that the wanted size
|
||||
* actually matches, otherwise the caller has made a mistake somewhere.
|
||||
*/
|
||||
uint_t n = ABD_SCATTER(abd).abd_iovcnt;
|
||||
ASSERT3U(n, ==, abd_iovcnt_for_bytes(size));
|
||||
|
||||
/*
|
||||
* Allocate a ABD_PAGESIZE region for each iovec.
|
||||
*/
|
||||
struct iovec *iov = ABD_SCATTER(abd).abd_iov;
|
||||
for (int i = 0; i < n; i++) {
|
||||
iov[i].iov_base =
|
||||
umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL);
|
||||
iov[i].iov_len = ABD_PAGESIZE;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
abd_free_chunks(abd_t *abd)
|
||||
{
|
||||
uint_t n = ABD_SCATTER(abd).abd_iovcnt;
|
||||
struct iovec *iov = ABD_SCATTER(abd).abd_iov;
|
||||
for (int i = 0; i < n; i++)
|
||||
umem_free_aligned(iov[i].iov_base, ABD_PAGESIZE);
|
||||
}
|
||||
|
||||
boolean_t
|
||||
abd_size_alloc_linear(size_t size)
|
||||
{
|
||||
return (size < ABD_SCATTER_MIN_SIZE);
|
||||
}
|
||||
|
||||
void
|
||||
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
|
||||
{
|
||||
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
|
||||
int waste = P2ROUNDUP(abd->abd_size, ABD_PAGESIZE) - abd->abd_size;
|
||||
if (op == ABDSTAT_INCR) {
|
||||
arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
|
||||
} else {
|
||||
arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
|
||||
{
|
||||
(void) abd;
|
||||
(void) op;
|
||||
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
|
||||
}
|
||||
|
||||
void
|
||||
abd_verify_scatter(abd_t *abd)
|
||||
{
|
||||
#ifdef ZFS_DEBUG
|
||||
/*
|
||||
* scatter abds shall have:
|
||||
* - at least one iovec
|
||||
* - all iov_base point somewhere
|
||||
* - all iov_len are ABD_PAGESIZE
|
||||
* - offset set within the abd pages somewhere
|
||||
*/
|
||||
uint_t n = ABD_SCATTER(abd).abd_iovcnt;
|
||||
ASSERT3U(n, >, 0);
|
||||
|
||||
uint_t len = 0;
|
||||
for (int i = 0; i < n; i++) {
|
||||
ASSERT3P(ABD_SCATTER(abd).abd_iov[i].iov_base, !=, NULL);
|
||||
ASSERT3U(ABD_SCATTER(abd).abd_iov[i].iov_len, ==, ABD_PAGESIZE);
|
||||
len += ABD_PAGESIZE;
|
||||
}
|
||||
|
||||
ASSERT3U(ABD_SCATTER(abd).abd_offset, <, len);
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
abd_init(void)
|
||||
{
|
||||
/*
|
||||
* Create the "zero" scatter abd. This is always the size of the
|
||||
* largest possible block, but only actually has a single allocated
|
||||
* page, which all iovecs in the abd point to.
|
||||
*/
|
||||
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
|
||||
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
|
||||
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
|
||||
|
||||
void *zero =
|
||||
umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL);
|
||||
memset(zero, 0, ABD_PAGESIZE);
|
||||
|
||||
uint_t n = abd_iovcnt_for_bytes(SPA_MAXBLOCKSIZE);
|
||||
struct iovec *iov = ABD_SCATTER(abd_zero_scatter).abd_iov;
|
||||
for (int i = 0; i < n; i++) {
|
||||
iov[i].iov_base = zero;
|
||||
iov[i].iov_len = ABD_PAGESIZE;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
abd_fini(void)
|
||||
{
|
||||
umem_free_aligned(
|
||||
ABD_SCATTER(abd_zero_scatter).abd_iov[0].iov_base, ABD_PAGESIZE);
|
||||
abd_free_struct(abd_zero_scatter);
|
||||
abd_zero_scatter = NULL;
|
||||
}
|
||||
|
||||
void
|
||||
abd_free_linear_page(abd_t *abd)
|
||||
{
|
||||
/*
|
||||
* LINEAR_PAGE is specific to the Linux kernel; we never set this
|
||||
* flag, so this will never be called.
|
||||
*/
|
||||
(void) abd;
|
||||
PANIC("unreachable");
|
||||
}
|
||||
|
||||
abd_t *
|
||||
abd_alloc_for_io(size_t size, boolean_t is_metadata)
|
||||
{
|
||||
return (abd_alloc(size, is_metadata));
|
||||
}
|
||||
|
||||
abd_t *
|
||||
abd_get_offset_scatter(abd_t *dabd, abd_t *sabd, size_t off, size_t size)
|
||||
{
|
||||
|
||||
/*
|
||||
* Create a new scatter dabd by borrowing data pages from sabd to cover
|
||||
* off+size.
|
||||
*
|
||||
* sabd is an existing scatter abd with a set of iovecs, each covering
|
||||
* an ABD_PAGESIZE (4K) allocation. It's "zero" is at abd_offset.
|
||||
*
|
||||
* [........][........][........][........]
|
||||
* ^- sabd_offset
|
||||
*
|
||||
* We want to produce a new abd, referencing those allocations at the
|
||||
* given offset.
|
||||
*
|
||||
* [........][........][........][........]
|
||||
* ^- dabd_offset = sabd_offset + off
|
||||
* ^- dabd_offset + size
|
||||
*
|
||||
* In this example, dabd needs three iovecs. The first iovec is offset
|
||||
* 0, so the final dabd_offset is masked back into the first iovec.
|
||||
*
|
||||
* [........][........][........]
|
||||
* ^- dabd_offset
|
||||
*/
	size_t soff = ABD_SCATTER(sabd).abd_offset + off;
	size_t doff = soff & ABD_PAGEMASK;
	size_t iovcnt = abd_iovcnt_for_bytes(doff + size);

	/*
	 * If the passed-in abd has enough allocated iovecs already, reuse it.
	 * Otherwise, make a new one. The caller will free the original if the
	 * one it gets back is not the same.
	 *
	 * Note that it's ok if we reuse an abd with more iovecs than we need.
	 * abd_size has the usable amount of data, and the abd does not own the
	 * pages referenced by the iovecs. At worst, they're holding dangling
	 * pointers that we'll never use anyway.
	 */
	if (dabd == NULL || ABD_SCATTER(dabd).abd_iovcnt < iovcnt)
		dabd = abd_alloc_struct(iovcnt << ABD_PAGESHIFT);

	/* Set offset into first page in view */
	ABD_SCATTER(dabd).abd_offset = doff;

	/* Copy the wanted iovecs from the source to the dest */
	memcpy(&ABD_SCATTER(dabd).abd_iov[0],
	    &ABD_SCATTER(sabd).abd_iov[soff >> ABD_PAGESHIFT],
	    iovcnt * sizeof (struct iovec));

	return (dabd);
}
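
/*
 * A minimal caller-side sketch (not part of this change) of the reuse
 * contract above: if a different struct comes back, the original is the
 * caller's to free. Real callers, in the common abd code, also maintain
 * abd flags not shown here; borrow_view is a hypothetical helper.
 */
static abd_t *
borrow_view(abd_t *reuse, abd_t *sabd, size_t off, size_t size)
{
	abd_t *view = abd_get_offset_scatter(reuse, sabd, off, size);

	if (reuse != NULL && view != reuse)
		abd_free_struct(reuse);

	view->abd_size = size;
	return (view);
}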

void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	ASSERT(!abd_is_gang(abd));
	abd_verify(abd);
	memset(aiter, 0, sizeof (struct abd_iter));
	aiter->iter_abd = abd;
}

boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
	return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	if (abd_iter_at_end(aiter))
		return;

	aiter->iter_pos += amount;
	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
}

void
abd_iter_map(struct abd_iter *aiter)
{
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	if (abd_iter_at_end(aiter))
		return;

	if (abd_is_linear(aiter->iter_abd)) {
		aiter->iter_mapaddr =
		    ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
		aiter->iter_mapsize =
		    aiter->iter_abd->abd_size - aiter->iter_pos;
		return;
	}

	/*
	 * For scatter, we index into the appropriate iovec, and return the
	 * smaller of the amount requested, or up to the end of the page.
	 */
	size_t poff = aiter->iter_pos + ABD_SCATTER(aiter->iter_abd).abd_offset;

	ASSERT3U(poff >> ABD_PAGESHIFT, <=,
	    ABD_SCATTER(aiter->iter_abd).abd_iovcnt);
	struct iovec *iov = &ABD_SCATTER(aiter->iter_abd).
	    abd_iov[poff >> ABD_PAGESHIFT];

	aiter->iter_mapsize = MIN(ABD_PAGESIZE - (poff & ABD_PAGEMASK),
	    aiter->iter_abd->abd_size - aiter->iter_pos);
	ASSERT3U(aiter->iter_mapsize, <=, ABD_PAGESIZE);

	aiter->iter_mapaddr = iov->iov_base + (poff & ABD_PAGEMASK);
}

void
abd_iter_unmap(struct abd_iter *aiter)
{
	if (abd_iter_at_end(aiter))
		return;

	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
	ASSERT3U(aiter->iter_mapsize, >, 0);

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

void
abd_cache_reap_now(void)
{
}
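
/*
 * A minimal consumer sketch (not part of this change) of the iteration
 * pattern the functions above support: map a chunk, use it, unmap, then
 * advance by the mapped length.
 */
static uint64_t
abd_sum_bytes(abd_t *abd)
{
	struct abd_iter aiter;
	uint64_t sum = 0;

	abd_iter_init(&aiter, abd);
	while (!abd_iter_at_end(&aiter)) {
		abd_iter_map(&aiter);

		const uint8_t *p = aiter.iter_mapaddr;
		size_t n = aiter.iter_mapsize;
		for (size_t i = 0; i < n; i++)
			sum += p[i];

		abd_iter_unmap(&aiter);
		abd_iter_advance(&aiter, n);
	}
	return (sum);
}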

@ -0,0 +1,4 @@
libzpooldir = $(includedir)/libzpool
libzpool_HEADERS = \
	%D%/sys/abd_os.h \
	%D%/sys/abd_impl_os.h

@ -0,0 +1,41 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

#ifndef _ABD_IMPL_OS_H
#define _ABD_IMPL_OS_H

#ifdef __cplusplus
extern "C" {
#endif

#define abd_enter_critical(flags)	((void)0)
#define abd_exit_critical(flags)	((void)0)

#ifdef __cplusplus
}
#endif

#endif /* _ABD_IMPL_OS_H */

@ -0,0 +1,47 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
 */

#ifndef _ABD_OS_H
#define _ABD_OS_H

#ifdef __cplusplus
extern "C" {
#endif

struct abd_scatter {
	uint_t abd_offset;
	uint_t abd_iovcnt;
	struct iovec abd_iov[1]; /* actually variable-length */
};

struct abd_linear {
	void *abd_buf;
};

#ifdef __cplusplus
}
#endif

#endif /* _ABD_OS_H */

@ -175,17 +175,6 @@ Increasing this value will
result in a slower thread creation rate which may be preferable for some
configurations.
.
.It Sy spl_max_show_tasks Ns = Ns Sy 512 Pq uint
The maximum number of tasks per pending list in each taskq shown in
.Pa /proc/spl/taskq{,-all} .
Write
.Sy 0
to turn off the limit.
The proc file will walk the lists with the lock held,
so reading it could cause a lock-up if the list grows too large
without limiting the output.
"(truncated)" will be shown if the list is larger than the limit.
.
.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 5000 Pq uint
Minimum idle threads exit interval for dynamic taskqs.
Smaller values allow idle threads to exit more often and potentially be

man/man4/zfs.4

@ -77,6 +77,17 @@ the array is dynamically sized based on total system memory.
dnode slots allocated in a single operation as a power of 2.
The default value minimizes lock contention for the bulk operation performed.
.
.It Sy dmu_ddt_copies Ns = Ns Sy 3 Pq uint
Controls the number of copies stored for DeDup Table
.Pq DDT
objects.
Reducing the number of copies to 1 from the previous default of 3
can reduce the write inflation caused by deduplication.
This assumes redundancy for this data is provided by the vdev layer.
If the DDT is damaged, space may be leaked
.Pq not freed
when the DDT can not report the correct reference count.
.
.It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint
Limit the amount we can prefetch with one call to this amount in bytes.
This helps to limit the amount of memory that can be used by prefetching.

@ -121,20 +132,26 @@ Controls whether buffers present on special vdevs are eligible for caching
into L2ARC.
If set to 1, exclude dbufs on special vdevs from being cached to L2ARC.
.
.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int
.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Ns | Ns 2 Pq int
Controls whether only MFU metadata and data are cached from ARC into L2ARC.
This may be desired to avoid wasting space on L2ARC when reading/writing large
amounts of data that are not expected to be accessed more than once.
.Pp
The default is off,
The default is 0,
meaning both MRU and MFU data and metadata are cached.
When turning off this feature, some MRU buffers will still be present
in ARC and eventually cached on L2ARC.
When turning off this feature (setting it to 0), some MRU buffers will
still be present in ARC and eventually cached on L2ARC.
.No If Sy l2arc_noprefetch Ns = Ns Sy 0 ,
some prefetched buffers will be cached to L2ARC, and those might later
transition to MRU, in which case the
.Sy l2arc_mru_asize No arcstat will not be Sy 0 .
.Pp
Setting it to 1 means to L2 cache only MFU data and metadata.
.Pp
Setting it to 2 means to L2 cache all metadata (MRU+MFU) but
only MFU data (i.e., MRU data are not cached).
This can be the right setting to cache as much metadata as possible
even with high data turnover.
.Pp
Regardless of
.Sy l2arc_noprefetch ,
some MFU buffers might be evicted from ARC,

@ -821,6 +838,7 @@ This is a limit on how many pages the ARC shrinker makes available for
eviction in response to one page allocation attempt.
Note that in practice, the kernel's shrinker can ask us to evict
up to about four times this for one allocation attempt.
To reduce OOM risk, this limit is applied for kswapd reclaims only.
.Pp
The default limit of
.Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages

@ -974,6 +992,88 @@ milliseconds until the operation completes.
.It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int
Enable prefetching dedup-ed blocks which are going to be freed.
.
.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint
Maximum number of dedup log flush passes (iterations) each transaction.
.Pp
At the start of each transaction, OpenZFS will estimate how many entries it
needs to flush out to keep up with the change rate, taking the amount and time
taken to flush on previous txgs into account (see
.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
It will spread this amount into a number of passes.
At each pass, it will use the amount already flushed and the total time taken
by flushing and by other IO to recompute how much it should do for the
remainder of the txg.
.Pp
Reducing the max number of passes will make flushing more aggressive, flushing
out more entries on each pass.
This can be faster, but also more likely to compete with other IO.
Increasing the max number of passes will put fewer entries onto each pass,
keeping the overhead of dedup changes to a minimum but possibly causing a large
number of changes to be dumped on the last pass, which can blow out the txg
sync time beyond
.Sy zfs_txg_timeout .
.
.It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint
Minimum time to spend on dedup log flush each transaction.
.Pp
At least this long will be spent flushing dedup log entries each transaction,
up to
.Sy zfs_txg_timeout .
This occurs even if doing so would delay the transaction, that is, even if
other IO completes under this time.
.
.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint
Flush at least this many entries each transaction.
.Pp
OpenZFS will estimate how many entries it needs to flush each transaction to
keep up with the ingest rate (see
.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
This sets the minimum for that estimate.
Raising it can force OpenZFS to flush more aggressively, keeping the log small
and so reducing pool import times, but can make it less able to back off if
log flushing would compete with other IO too much.
.
.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint
Number of transactions to use to compute the flow rate.
.Pp
OpenZFS will estimate how many entries it needs to flush each transaction by
monitoring the number of entries changed (ingest rate), number of entries
flushed (flush rate) and time spent flushing (flush time rate) and combining
these into an overall "flow rate".
It will use an exponential weighted moving average over some number of recent
transactions to compute these rates.
This sets the number of transactions to compute these averages over.
Setting it higher can help to smooth out the flow rate in the face of spiky
workloads, but will take longer for the flow rate to adjust to a sustained
change in the ingest rate.
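As a rough sketch of the averaging described above (illustrative only, not
the exact implementation), each new per-txg sample nudges the running rate
by one part in
.Sy zfs_dedup_log_flush_flow_rate_txgs
of the difference:
.Bd -literal -offset indent
/* n_txgs = zfs_dedup_log_flush_flow_rate_txgs */
avg += (sample - avg) / n_txgs;
.Ed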
.
.It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint
Max transactions before starting to flush dedup logs.
.Pp
OpenZFS maintains two dedup logs, one receiving new changes, one flushing.
If there is nothing to flush, it will accumulate changes for no more than this
many transactions before switching the logs and starting to flush entries out.
.
.It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64
Max memory to use for dedup logs.
.Pp
OpenZFS will spend no more than this much memory on maintaining the in-memory
dedup log.
Flushing will begin when around half this amount is being spent on logs.
The default value of
.Sy 0
will cause it to be set by
.Sy zfs_dedup_log_mem_max_percent
instead.
.
.It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint
Max memory to use for dedup logs, as a percentage of total memory.
.Pp
If
.Sy zfs_dedup_log_mem_max
is not set, it will be initialised as a percentage of the total memory in the
system.
.
.It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint
Start to delay each transaction once there is this amount of dirty data,
expressed as a percentage of
@ -17,8 +17,9 @@
.\" Copyright (c) 2019, Klara Inc.
.\" Copyright (c) 2019, Allan Jude
.\" Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
.\" Copyright (c) 2023, Klara Inc.
.\"
.Dd June 23, 2022
.Dd February 14, 2024
.Dt ZPOOL-FEATURES 7
.Os
.

@ -550,6 +551,20 @@ when an encrypted dataset is created and will be returned to the
.Sy enabled
state when all datasets that use this feature are destroyed.
.
.feature com.klarasystems fast_dedup yes
This feature allows more advanced deduplication features to be enabled on new
dedup tables.
.Pp
This feature will be
.Sy active
when the first deduplicated block is written after a new dedup table is created
(i.e., after a new pool creation, or a new checksum used on a dataset with
.Sy dedup
enabled).
It will be returned to the
.Sy enabled
state when all deduplicated blocks using it are freed.
.
.feature com.delphix extensible_dataset no
This feature allows more flexible use of internal ZFS data structures,
and exists for other features to depend on.
@ -25,8 +25,10 @@
.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\" Copyright (c) 2024, Klara Inc.
.\" Copyright (c) 2024, Mateusz Piotrowski
.\"
.Dd May 31, 2021
.Dd June 21, 2023
.Dt ZPOOL-REGUID 8
.Os
.

@ -36,6 +38,7 @@
.Sh SYNOPSIS
.Nm zpool
.Cm reguid
.Op Fl g Ar guid
.Ar pool
.
.Sh DESCRIPTION

@ -43,6 +46,15 @@ Generates a new unique identifier for the pool.
You must ensure that all devices in this pool are online and healthy before
performing this action.
.
.Bl -tag -width Ds
.It Fl g Ar guid
Set the pool GUID to the provided value.
The GUID can be any 64-bit value accepted by
.Xr strtoull 3
in base 10.
.Nm
will return an error if the provided GUID is already in use.
.El
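.Pp
For example, to set an explicitly chosen GUID (the value below is purely
illustrative) on a pool named tank:
.Dl # zpool reguid -g 9365361050919357513 tank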
.Sh SEE ALSO
.Xr zpool-export 8 ,
.Xr zpool-import 8
@ -16,8 +16,8 @@ src = @abs_srcdir@
obj = @abs_builddir@
else
zfs_include = $(srctree)/include/zfs
icp_include = $(srctree)/$(src)/icp/include
zstd_include = $(srctree)/$(src)/zstd/include
icp_include = $(src)/icp/include
zstd_include = $(src)/zstd/include
ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h
endif

@ -323,6 +323,7 @@ ZFS_OBJS := \
	dbuf.o \
	dbuf_stats.o \
	ddt.o \
	ddt_log.o \
	ddt_stats.o \
	ddt_zap.o \
	dmu.o \

@ -252,6 +252,7 @@ SRCS+= abd.c \
	dbuf.c \
	dbuf_stats.c \
	ddt.c \
	ddt_log.c \
	ddt_stats.c \
	ddt_zap.c \
	dmu.c \

@ -426,6 +427,7 @@ CFLAGS.gcc+= -Wno-pointer-to-int-cast

CFLAGS.abd.c= -Wno-cast-qual
CFLAGS.ddt.c= -Wno-cast-qual
CFLAGS.ddt_log.c= -Wno-cast-qual -Wno-pointer-arith
CFLAGS.ddt_zap.c= -Wno-cast-qual
CFLAGS.dmu.c= -Wno-cast-qual
CFLAGS.dmu_traverse.c= -Wno-cast-qual
@ -95,14 +95,12 @@ struct {
 */
static size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1;

#if defined(_KERNEL)
SYSCTL_DECL(_vfs_zfs);

SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
    &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN,
    &zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations.");
#endif

kmem_cache_t *abd_chunk_cache;
static kstat_t *abd_ksp;

@ -250,7 +248,7 @@ abd_alloc_zero_scatter(void)

	n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;

	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
|
|||
error == EOPNOTSUPP)
|
||||
goto bad_locked_fallback;
|
||||
*ap->a_lenp = (size_t)len;
|
||||
#ifdef MAC
|
||||
out_locked:
|
||||
#endif
|
||||
if (invp != outvp)
|
||||
VOP_UNLOCK(invp);
|
||||
VOP_UNLOCK(outvp);
|
||||
|
|
|
@ -868,16 +868,16 @@ spl_init(void)
	if ((rc = spl_tsd_init()))
		goto out2;

	if ((rc = spl_taskq_init()))
	if ((rc = spl_proc_init()))
		goto out3;

	if ((rc = spl_kmem_cache_init()))
	if ((rc = spl_kstat_init()))
		goto out4;

	if ((rc = spl_proc_init()))
	if ((rc = spl_taskq_init()))
		goto out5;

	if ((rc = spl_kstat_init()))
	if ((rc = spl_kmem_cache_init()))
		goto out6;

	if ((rc = spl_zlib_init()))

@ -891,13 +891,13 @@ spl_init(void)
out8:
	spl_zlib_fini();
out7:
	spl_kstat_fini();
out6:
	spl_proc_fini();
out5:
	spl_kmem_cache_fini();
out4:
out6:
	spl_taskq_fini();
out5:
	spl_kstat_fini();
out4:
	spl_proc_fini();
out3:
	spl_tsd_fini();
out2:

@ -913,10 +913,10 @@ spl_fini(void)
{
	spl_zone_fini();
	spl_zlib_fini();
	spl_kstat_fini();
	spl_proc_fini();
	spl_kmem_cache_fini();
	spl_taskq_fini();
	spl_kstat_fini();
	spl_proc_fini();
	spl_tsd_fini();
	spl_kvmem_fini();
	spl_random_fini();
@ -22,13 +22,15 @@
 *
 * Solaris Porting Layer (SPL) Proc Implementation.
 */
/*
 * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
 */

#include <sys/systeminfo.h>
#include <sys/kstat.h>
#include <sys/kmem.h>
#include <sys/kmem_cache.h>
#include <sys/vmem.h>
#include <sys/taskq.h>
#include <sys/proc.h>
#include <linux/ctype.h>
#include <linux/kmod.h>

@ -43,6 +45,12 @@ typedef struct ctl_table __no_const spl_ctl_table;
typedef struct ctl_table spl_ctl_table;
#endif

#ifdef HAVE_PROC_HANDLER_CTL_TABLE_CONST
#define CONST_CTL_TABLE const struct ctl_table
#else
#define CONST_CTL_TABLE struct ctl_table
#endif

static unsigned long table_min = 0;
static unsigned long table_max = ~0;

@ -54,13 +62,11 @@ static struct ctl_table_header *spl_kstat = NULL;
static struct proc_dir_entry *proc_spl = NULL;
static struct proc_dir_entry *proc_spl_kmem = NULL;
static struct proc_dir_entry *proc_spl_kmem_slab = NULL;
static struct proc_dir_entry *proc_spl_taskq_all = NULL;
static struct proc_dir_entry *proc_spl_taskq = NULL;
struct proc_dir_entry *proc_spl_kstat = NULL;

#ifdef DEBUG_KMEM
static int
proc_domemused(struct ctl_table *table, int write,
proc_domemused(CONST_CTL_TABLE *table, int write,
    void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int rc = 0;

@ -88,7 +94,7 @@ proc_domemused(struct ctl_table *table, int write,
#endif /* DEBUG_KMEM */

static int
proc_doslab(struct ctl_table *table, int write,
proc_doslab(CONST_CTL_TABLE *table, int write,
    void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int rc = 0;

@ -135,7 +141,7 @@ proc_doslab(struct ctl_table *table, int write,
}

static int
proc_dohostid(struct ctl_table *table, int write,
proc_dohostid(CONST_CTL_TABLE *table, int write,
    void __user *buffer, size_t *lenp, loff_t *ppos)
{
	char *end, str[32];

@ -168,195 +174,6 @@ proc_dohostid(struct ctl_table *table, int write,
	return (0);
}

static void
taskq_seq_show_headers(struct seq_file *f)
{
	seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n",
	    "taskq", "act", "nthr", "spwn", "maxt", "pri",
	    "mina", "maxa", "cura", "flags");
}

/* indices into the lheads array below */
#define LHEAD_PEND 0
#define LHEAD_PRIO 1
#define LHEAD_DELAY 2
#define LHEAD_WAIT 3
#define LHEAD_ACTIVE 4
#define LHEAD_SIZE 5

static unsigned int spl_max_show_tasks = 512;
/* CSTYLED */
module_param(spl_max_show_tasks, uint, 0644);
MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc");

static int
taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag)
{
	taskq_t *tq = p;
	taskq_thread_t *tqt = NULL;
	spl_wait_queue_entry_t *wq;
	struct task_struct *tsk;
	taskq_ent_t *tqe;
	char name[100];
	struct list_head *lheads[LHEAD_SIZE], *lh;
	static char *list_names[LHEAD_SIZE] =
	    {"pend", "prio", "delay", "wait", "active" };
	int i, j, have_lheads = 0;
	unsigned long wflags, flags;

	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
	spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags);

	/* get the various lists and check whether they're empty */
	lheads[LHEAD_PEND] = &tq->tq_pend_list;
	lheads[LHEAD_PRIO] = &tq->tq_prio_list;
	lheads[LHEAD_DELAY] = &tq->tq_delay_list;
#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
	lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head;
#else
	lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list;
#endif
	lheads[LHEAD_ACTIVE] = &tq->tq_active_list;

	for (i = 0; i < LHEAD_SIZE; ++i) {
		if (list_empty(lheads[i]))
			lheads[i] = NULL;
		else
			++have_lheads;
	}

	/* early return in non-"all" mode if lists are all empty */
	if (!allflag && !have_lheads) {
		spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
		spin_unlock_irqrestore(&tq->tq_lock, flags);
		return (0);
	}

	/* unlock the waitq quickly */
	if (!lheads[LHEAD_WAIT])
		spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);

	/* show the base taskq contents */
	snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance);
	seq_printf(f, "%-25s ", name);
	seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n",
	    tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn,
	    tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc,
	    tq->tq_nalloc, tq->tq_flags);

	/* show the active list */
	if (lheads[LHEAD_ACTIVE]) {
		j = 0;
		list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) {
			if (j == 0)
				seq_printf(f, "\t%s:",
				    list_names[LHEAD_ACTIVE]);
			else if (j == 2) {
				seq_printf(f, "\n\t ");
				j = 0;
			}
			seq_printf(f, " [%d]%pf(%ps)",
			    tqt->tqt_thread->pid,
			    tqt->tqt_task->tqent_func,
			    tqt->tqt_task->tqent_arg);
			++j;
		}
		seq_printf(f, "\n");
	}

	for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i)
		if (lheads[i]) {
			j = 0;
			list_for_each(lh, lheads[i]) {
				if (spl_max_show_tasks != 0 &&
				    j >= spl_max_show_tasks) {
					seq_printf(f, "\n\t(truncated)");
					break;
				}
				/* show the wait waitq list */
				if (i == LHEAD_WAIT) {
#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
					wq = list_entry(lh,
					    spl_wait_queue_entry_t, entry);
#else
					wq = list_entry(lh,
					    spl_wait_queue_entry_t, task_list);
#endif
					if (j == 0)
						seq_printf(f, "\t%s:",
						    list_names[i]);
					else if (j % 8 == 0)
						seq_printf(f, "\n\t ");

					tsk = wq->private;
					seq_printf(f, " %d", tsk->pid);
				/* pend, prio and delay lists */
				} else {
					tqe = list_entry(lh, taskq_ent_t,
					    tqent_list);
					if (j == 0)
						seq_printf(f, "\t%s:",
						    list_names[i]);
					else if (j % 2 == 0)
						seq_printf(f, "\n\t ");

					seq_printf(f, " %pf(%ps)",
					    tqe->tqent_func,
					    tqe->tqent_arg);
				}
				++j;
			}
			seq_printf(f, "\n");
		}
	if (lheads[LHEAD_WAIT])
		spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
	spin_unlock_irqrestore(&tq->tq_lock, flags);

	return (0);
}

static int
taskq_all_seq_show(struct seq_file *f, void *p)
{
	return (taskq_seq_show_impl(f, p, B_TRUE));
}

static int
taskq_seq_show(struct seq_file *f, void *p)
{
	return (taskq_seq_show_impl(f, p, B_FALSE));
}

static void *
taskq_seq_start(struct seq_file *f, loff_t *pos)
{
	struct list_head *p;
	loff_t n = *pos;

	down_read(&tq_list_sem);
	if (!n)
		taskq_seq_show_headers(f);

	p = tq_list.next;
	while (n--) {
		p = p->next;
		if (p == &tq_list)
			return (NULL);
	}

	return (list_entry(p, taskq_t, tq_taskqs));
}

static void *
taskq_seq_next(struct seq_file *f, void *p, loff_t *pos)
{
	taskq_t *tq = p;

	++*pos;
	return ((tq->tq_taskqs.next == &tq_list) ?
	    NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs));
}

static void
slab_seq_show_headers(struct seq_file *f)
{

@ -492,66 +309,6 @@ static const kstat_proc_op_t proc_slab_operations = {
#endif
};

static void
taskq_seq_stop(struct seq_file *f, void *v)
{
	up_read(&tq_list_sem);
}

static const struct seq_operations taskq_all_seq_ops = {
	.show = taskq_all_seq_show,
	.start = taskq_seq_start,
	.next = taskq_seq_next,
	.stop = taskq_seq_stop,
};

static const struct seq_operations taskq_seq_ops = {
	.show = taskq_seq_show,
	.start = taskq_seq_start,
	.next = taskq_seq_next,
	.stop = taskq_seq_stop,
};

static int
proc_taskq_all_open(struct inode *inode, struct file *filp)
{
	return (seq_open(filp, &taskq_all_seq_ops));
}

static int
proc_taskq_open(struct inode *inode, struct file *filp)
{
	return (seq_open(filp, &taskq_seq_ops));
}

static const kstat_proc_op_t proc_taskq_all_operations = {
#ifdef HAVE_PROC_OPS_STRUCT
	.proc_open = proc_taskq_all_open,
	.proc_read = seq_read,
	.proc_lseek = seq_lseek,
	.proc_release = seq_release,
#else
	.open = proc_taskq_all_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
#endif
};

static const kstat_proc_op_t proc_taskq_operations = {
#ifdef HAVE_PROC_OPS_STRUCT
	.proc_open = proc_taskq_open,
	.proc_read = seq_read,
	.proc_lseek = seq_lseek,
	.proc_release = seq_release,
#else
	.open = proc_taskq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
#endif
};

static struct ctl_table spl_kmem_table[] = {
#ifdef DEBUG_KMEM
	{

@ -668,8 +425,6 @@ static void spl_proc_cleanup(void)
	remove_proc_entry("kstat", proc_spl);
	remove_proc_entry("slab", proc_spl_kmem);
	remove_proc_entry("kmem", proc_spl);
	remove_proc_entry("taskq-all", proc_spl);
	remove_proc_entry("taskq", proc_spl);
	remove_proc_entry("spl", NULL);

#ifndef HAVE_REGISTER_SYSCTL_TABLE

@ -688,6 +443,37 @@ static void spl_proc_cleanup(void)
	}
}

#ifndef HAVE_REGISTER_SYSCTL_TABLE

/*
 * Traditionally, struct ctl_table arrays have been terminated by an "empty"
 * sentinel element (specifically, one with .procname == NULL).
 *
 * Linux 6.6 began migrating away from this, adding register_sysctl_sz() so
 * that callers could provide the size directly, and redefining
 * register_sysctl() to just call register_sysctl_sz() with the array size. It
 * retained support for the terminating element so that existing callers would
 * continue to work.
 *
 * Linux 6.11 removed support for the terminating element, instead interpreting
 * it as a real malformed element, and rejecting it.
 *
 * In order to continue supporting older kernels, we retain the terminating
 * sentinel element for our sysctl tables, but instead detect availability of
 * register_sysctl_sz(). If it exists, we pass it the array size -1, stopping
 * the kernel from trying to process the terminator. For pre-6.6 kernels that
 * don't have register_sysctl_sz(), we just use register_sysctl(), which can
 * handle the terminating element as it always has.
 */
#ifdef HAVE_REGISTER_SYSCTL_SZ
#define spl_proc_register_sysctl(p, t) \
	register_sysctl_sz(p, t, ARRAY_SIZE(t)-1)
#else
#define spl_proc_register_sysctl(p, t) \
	register_sysctl(p, t)
#endif
#endif
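
/*
 * Illustrative only, not part of this change: a table shaped the
 * traditional way. With register_sysctl_sz() available, ARRAY_SIZE(t)-1
 * registers only the one real entry, and the sentinel below is never
 * inspected by the kernel.
 */
static struct ctl_table example_table[] = {
	{
		.procname = "example",
		.mode = 0444,
	},
	{ },	/* terminating sentinel, skipped via ARRAY_SIZE-1 */
};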

int
spl_proc_init(void)
{

@ -698,16 +484,17 @@ spl_proc_init(void)
	if (spl_header == NULL)
		return (-EUNATCH);
#else
	spl_header = register_sysctl("kernel/spl", spl_table);
	spl_header = spl_proc_register_sysctl("kernel/spl", spl_table);
	if (spl_header == NULL)
		return (-EUNATCH);

	spl_kmem = register_sysctl("kernel/spl/kmem", spl_kmem_table);
	spl_kmem = spl_proc_register_sysctl("kernel/spl/kmem", spl_kmem_table);
	if (spl_kmem == NULL) {
		rc = -EUNATCH;
		goto out;
	}
	spl_kstat = register_sysctl("kernel/spl/kstat", spl_kstat_table);
	spl_kstat = spl_proc_register_sysctl("kernel/spl/kstat",
	    spl_kstat_table);
	if (spl_kstat == NULL) {
		rc = -EUNATCH;
		goto out;

@ -720,20 +507,6 @@ spl_proc_init(void)
		goto out;
	}

	proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl,
	    &proc_taskq_all_operations, NULL);
	if (proc_spl_taskq_all == NULL) {
		rc = -EUNATCH;
		goto out;
	}

	proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl,
	    &proc_taskq_operations, NULL);
	if (proc_spl_taskq == NULL) {
		rc = -EUNATCH;
		goto out;
	}

	proc_spl_kmem = proc_mkdir("kmem", proc_spl);
	if (proc_spl_kmem == NULL) {
		rc = -EUNATCH;
@ -22,16 +22,98 @@
 *
 * Solaris Porting Layer (SPL) Task Queue Implementation.
 */
/*
 * Copyright (c) 2024, Klara Inc.
 * Copyright (c) 2024, Syneto
 */

#include <sys/timer.h>
#include <sys/taskq.h>
#include <sys/kmem.h>
#include <sys/tsd.h>
#include <sys/trace_spl.h>
#include <sys/time.h>
#include <sys/atomic.h>
#include <sys/kstat.h>
#ifdef HAVE_CPU_HOTPLUG
#include <linux/cpuhotplug.h>
#endif

typedef struct taskq_kstats {
	/* static values, for completeness */
	kstat_named_t tqks_threads_max;
	kstat_named_t tqks_entry_pool_min;
	kstat_named_t tqks_entry_pool_max;

	/* gauges (inc/dec counters, current value) */
	kstat_named_t tqks_threads_active;
	kstat_named_t tqks_threads_idle;
	kstat_named_t tqks_threads_total;
	kstat_named_t tqks_tasks_pending;
	kstat_named_t tqks_tasks_priority;
	kstat_named_t tqks_tasks_total;
	kstat_named_t tqks_tasks_delayed;
	kstat_named_t tqks_entries_free;

	/* counters (inc only, since taskq creation) */
	kstat_named_t tqks_threads_created;
	kstat_named_t tqks_threads_destroyed;
	kstat_named_t tqks_tasks_dispatched;
	kstat_named_t tqks_tasks_dispatched_delayed;
	kstat_named_t tqks_tasks_executed_normal;
	kstat_named_t tqks_tasks_executed_priority;
	kstat_named_t tqks_tasks_executed;
	kstat_named_t tqks_tasks_delayed_requeued;
	kstat_named_t tqks_tasks_cancelled;
	kstat_named_t tqks_thread_wakeups;
	kstat_named_t tqks_thread_wakeups_nowork;
	kstat_named_t tqks_thread_sleeps;
} taskq_kstats_t;

static taskq_kstats_t taskq_kstats_template = {
	{ "threads_max", KSTAT_DATA_UINT64 },
	{ "entry_pool_min", KSTAT_DATA_UINT64 },
	{ "entry_pool_max", KSTAT_DATA_UINT64 },
	{ "threads_active", KSTAT_DATA_UINT64 },
	{ "threads_idle", KSTAT_DATA_UINT64 },
	{ "threads_total", KSTAT_DATA_UINT64 },
	{ "tasks_pending", KSTAT_DATA_UINT64 },
	{ "tasks_priority", KSTAT_DATA_UINT64 },
	{ "tasks_total", KSTAT_DATA_UINT64 },
	{ "tasks_delayed", KSTAT_DATA_UINT64 },
	{ "entries_free", KSTAT_DATA_UINT64 },

	{ "threads_created", KSTAT_DATA_UINT64 },
	{ "threads_destroyed", KSTAT_DATA_UINT64 },
	{ "tasks_dispatched", KSTAT_DATA_UINT64 },
	{ "tasks_dispatched_delayed", KSTAT_DATA_UINT64 },
	{ "tasks_executed_normal", KSTAT_DATA_UINT64 },
	{ "tasks_executed_priority", KSTAT_DATA_UINT64 },
	{ "tasks_executed", KSTAT_DATA_UINT64 },
	{ "tasks_delayed_requeued", KSTAT_DATA_UINT64 },
	{ "tasks_cancelled", KSTAT_DATA_UINT64 },
	{ "thread_wakeups", KSTAT_DATA_UINT64 },
	{ "thread_wakeups_nowork", KSTAT_DATA_UINT64 },
	{ "thread_sleeps", KSTAT_DATA_UINT64 },
};
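
/*
 * Each taskq publishes these counters through the "taskq" kstat module;
 * on Linux they surface as /proc/spl/kstat/taskq/<name>.<instance>, e.g.
 * /proc/spl/kstat/taskq/z_null_iss.0 (taskq name illustrative).
 */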

#define TQSTAT_INC(tq, stat)	wmsum_add(&tq->tq_sums.tqs_##stat, 1)
#define TQSTAT_DEC(tq, stat)	wmsum_add(&tq->tq_sums.tqs_##stat, -1)

#define _TQSTAT_MOD_LIST(mod, tq, t) do { \
	switch (t->tqent_flags & TQENT_LIST_MASK) { \
	case TQENT_LIST_NONE: ASSERT(list_empty(&t->tqent_list)); break;\
	case TQENT_LIST_PENDING: mod(tq, tasks_pending); break; \
	case TQENT_LIST_PRIORITY: mod(tq, tasks_priority); break; \
	case TQENT_LIST_DELAY: mod(tq, tasks_delayed); break; \
	} \
} while (0)
#define TQSTAT_INC_LIST(tq, t)	_TQSTAT_MOD_LIST(TQSTAT_INC, tq, t)
#define TQSTAT_DEC_LIST(tq, t)	_TQSTAT_MOD_LIST(TQSTAT_DEC, tq, t)

#define TQENT_SET_LIST(t, l) \
	t->tqent_flags = (t->tqent_flags & ~TQENT_LIST_MASK) | l;

static int spl_taskq_thread_bind = 0;
module_param(spl_taskq_thread_bind, int, 0644);
MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");

@ -134,6 +216,7 @@ retry:
	ASSERT(!timer_pending(&t->tqent_timer));

	list_del_init(&t->tqent_list);
	TQSTAT_DEC(tq, entries_free);
	return (t);
}

@ -204,12 +287,11 @@ task_done(taskq_t *tq, taskq_ent_t *t)
{
	ASSERT(tq);
	ASSERT(t);
	ASSERT(list_empty(&t->tqent_list));

	/* Wake tasks blocked in taskq_wait_id() */
	wake_up_all(&t->tqent_waitq);

	list_del_init(&t->tqent_list);

	if (tq->tq_nalloc <= tq->tq_minalloc) {
		t->tqent_id = TASKQID_INVALID;
		t->tqent_func = NULL;

@ -217,6 +299,7 @@ task_done(taskq_t *tq, taskq_ent_t *t)
		t->tqent_flags = 0;

		list_add_tail(&t->tqent_list, &tq->tq_free_list);
		TQSTAT_INC(tq, entries_free);
	} else {
		task_free(tq, t);
	}

@ -263,6 +346,8 @@ task_expire_impl(taskq_ent_t *t)
	spin_unlock_irqrestore(&tq->tq_lock, flags);

	wake_up(&tq->tq_work_waitq);

	TQSTAT_INC(tq, tasks_delayed_requeued);
}

static void

@ -534,7 +619,11 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id)
	t = taskq_find(tq, id);
	if (t && t != ERR_PTR(-EBUSY)) {
		list_del_init(&t->tqent_list);
		TQSTAT_DEC_LIST(tq, t);
		TQSTAT_DEC(tq, tasks_total);

		t->tqent_flags |= TQENT_FLAG_CANCEL;
		TQSTAT_INC(tq, tasks_cancelled);

		/*
		 * When canceling the lowest outstanding task id we

@ -604,13 +693,19 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
	spin_lock(&t->tqent_lock);

	/* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
	if (flags & TQ_NOQUEUE)
	if (flags & TQ_NOQUEUE) {
		TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
		list_add(&t->tqent_list, &tq->tq_prio_list);
	/* Queue to the priority list instead of the pending list */
	else if (flags & TQ_FRONT)
	} else if (flags & TQ_FRONT) {
		TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
		list_add_tail(&t->tqent_list, &tq->tq_prio_list);
	else
	} else {
		TQENT_SET_LIST(t, TQENT_LIST_PENDING);
		list_add_tail(&t->tqent_list, &tq->tq_pend_list);
	}
	TQSTAT_INC_LIST(tq, t);
	TQSTAT_INC(tq, tasks_total);

	t->tqent_id = rc = tq->tq_next_id;
	tq->tq_next_id++;

@ -629,6 +724,8 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)

	wake_up(&tq->tq_work_waitq);

	TQSTAT_INC(tq, tasks_dispatched);

	/* Spawn additional taskq threads if required. */
	if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
		(void) taskq_thread_spawn(tq);

@ -662,6 +759,9 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,

	/* Queue to the delay list for subsequent execution */
	list_add_tail(&t->tqent_list, &tq->tq_delay_list);
	TQENT_SET_LIST(t, TQENT_LIST_DELAY);
	TQSTAT_INC_LIST(tq, t);
	TQSTAT_INC(tq, tasks_total);

	t->tqent_id = rc = tq->tq_next_id;
	tq->tq_next_id++;

@ -676,6 +776,8 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,

	spin_unlock(&t->tqent_lock);

	TQSTAT_INC(tq, tasks_dispatched_delayed);

	/* Spawn additional taskq threads if required. */
	if (tq->tq_nactive == tq->tq_nthreads)
		(void) taskq_thread_spawn(tq);

@ -724,10 +826,15 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
	t->tqent_flags |= TQENT_FLAG_PREALLOC;

	/* Queue to the priority list instead of the pending list */
	if (flags & TQ_FRONT)
	if (flags & TQ_FRONT) {
		TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
		list_add_tail(&t->tqent_list, &tq->tq_prio_list);
	else
	} else {
		TQENT_SET_LIST(t, TQENT_LIST_PENDING);
		list_add_tail(&t->tqent_list, &tq->tq_pend_list);
	}
	TQSTAT_INC_LIST(tq, t);
	TQSTAT_INC(tq, tasks_total);

	t->tqent_id = tq->tq_next_id;
	tq->tq_next_id++;

@ -742,6 +849,8 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,

	wake_up(&tq->tq_work_waitq);

	TQSTAT_INC(tq, tasks_dispatched);

	/* Spawn additional taskq threads if required. */
	if (tq->tq_nactive == tq->tq_nthreads)
		(void) taskq_thread_spawn(tq);

@ -908,6 +1017,8 @@ taskq_thread(void *args)
	wake_up(&tq->tq_wait_waitq);
	set_current_state(TASK_INTERRUPTIBLE);

	TQSTAT_INC(tq, threads_total);

	while (!kthread_should_stop()) {

		if (list_empty(&tq->tq_pend_list) &&

@ -919,9 +1030,15 @@ taskq_thread(void *args)
			add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
			spin_unlock_irqrestore(&tq->tq_lock, flags);

			TQSTAT_INC(tq, thread_sleeps);
			TQSTAT_INC(tq, threads_idle);

			schedule();
			seq_tasks = 0;

			TQSTAT_DEC(tq, threads_idle);
			TQSTAT_INC(tq, thread_wakeups);

			spin_lock_irqsave_nested(&tq->tq_lock, flags,
			    tq->tq_lock_class);
			remove_wait_queue(&tq->tq_work_waitq, &wait);

@ -931,6 +1048,8 @@ taskq_thread(void *args)

		if ((t = taskq_next_ent(tq)) != NULL) {
			list_del_init(&t->tqent_list);
			TQSTAT_DEC_LIST(tq, t);
			TQSTAT_DEC(tq, tasks_total);

			/*
			 * A TQENT_FLAG_PREALLOC task may be reused or freed

@ -955,6 +1074,7 @@ taskq_thread(void *args)
			tq->tq_nactive++;
			spin_unlock_irqrestore(&tq->tq_lock, flags);

			TQSTAT_INC(tq, threads_active);
			DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t);

			/* Perform the requested task */

@ -962,8 +1082,17 @@ taskq_thread(void *args)

			DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t);

			TQSTAT_DEC(tq, threads_active);
			if ((t->tqent_flags & TQENT_LIST_MASK) ==
			    TQENT_LIST_PENDING)
				TQSTAT_INC(tq, tasks_executed_normal);
			else
				TQSTAT_INC(tq, tasks_executed_priority);
			TQSTAT_INC(tq, tasks_executed);

			spin_lock_irqsave_nested(&tq->tq_lock, flags,
			    tq->tq_lock_class);

			tq->tq_nactive--;
			list_del_init(&tqt->tqt_active_list);
			tqt->tqt_task = NULL;

@ -989,7 +1118,8 @@ taskq_thread(void *args)
				tqt->tqt_id = TASKQID_INVALID;
				tqt->tqt_flags = 0;
				wake_up_all(&tq->tq_wait_waitq);
			}
		} else
			TQSTAT_INC(tq, thread_wakeups_nowork);

		set_current_state(TASK_INTERRUPTIBLE);

@ -998,6 +1128,10 @@ taskq_thread(void *args)
	__set_current_state(TASK_RUNNING);
	tq->tq_nthreads--;
	list_del_init(&tqt->tqt_thread_list);

	TQSTAT_DEC(tq, threads_total);
	TQSTAT_INC(tq, threads_destroyed);

error:
	kmem_free(tqt, sizeof (taskq_thread_t));
	spin_unlock_irqrestore(&tq->tq_lock, flags);

@ -1037,9 +1171,156 @@ taskq_thread_create(taskq_t *tq)

	wake_up_process(tqt->tqt_thread);

	TQSTAT_INC(tq, threads_created);

	return (tqt);
}

static void
taskq_stats_init(taskq_t *tq)
{
	taskq_sums_t *tqs = &tq->tq_sums;
	wmsum_init(&tqs->tqs_threads_active, 0);
	wmsum_init(&tqs->tqs_threads_idle, 0);
	wmsum_init(&tqs->tqs_threads_total, 0);
	wmsum_init(&tqs->tqs_tasks_pending, 0);
	wmsum_init(&tqs->tqs_tasks_priority, 0);
	wmsum_init(&tqs->tqs_tasks_total, 0);
	wmsum_init(&tqs->tqs_tasks_delayed, 0);
	wmsum_init(&tqs->tqs_entries_free, 0);
	wmsum_init(&tqs->tqs_threads_created, 0);
	wmsum_init(&tqs->tqs_threads_destroyed, 0);
	wmsum_init(&tqs->tqs_tasks_dispatched, 0);
	wmsum_init(&tqs->tqs_tasks_dispatched_delayed, 0);
	wmsum_init(&tqs->tqs_tasks_executed_normal, 0);
	wmsum_init(&tqs->tqs_tasks_executed_priority, 0);
	wmsum_init(&tqs->tqs_tasks_executed, 0);
	wmsum_init(&tqs->tqs_tasks_delayed_requeued, 0);
	wmsum_init(&tqs->tqs_tasks_cancelled, 0);
	wmsum_init(&tqs->tqs_thread_wakeups, 0);
	wmsum_init(&tqs->tqs_thread_wakeups_nowork, 0);
	wmsum_init(&tqs->tqs_thread_sleeps, 0);
}

static void
taskq_stats_fini(taskq_t *tq)
{
	taskq_sums_t *tqs = &tq->tq_sums;
	wmsum_fini(&tqs->tqs_threads_active);
	wmsum_fini(&tqs->tqs_threads_idle);
	wmsum_fini(&tqs->tqs_threads_total);
	wmsum_fini(&tqs->tqs_tasks_pending);
	wmsum_fini(&tqs->tqs_tasks_priority);
	wmsum_fini(&tqs->tqs_tasks_total);
	wmsum_fini(&tqs->tqs_tasks_delayed);
	wmsum_fini(&tqs->tqs_entries_free);
	wmsum_fini(&tqs->tqs_threads_created);
	wmsum_fini(&tqs->tqs_threads_destroyed);
	wmsum_fini(&tqs->tqs_tasks_dispatched);
	wmsum_fini(&tqs->tqs_tasks_dispatched_delayed);
	wmsum_fini(&tqs->tqs_tasks_executed_normal);
	wmsum_fini(&tqs->tqs_tasks_executed_priority);
	wmsum_fini(&tqs->tqs_tasks_executed);
	wmsum_fini(&tqs->tqs_tasks_delayed_requeued);
	wmsum_fini(&tqs->tqs_tasks_cancelled);
	wmsum_fini(&tqs->tqs_thread_wakeups);
	wmsum_fini(&tqs->tqs_thread_wakeups_nowork);
	wmsum_fini(&tqs->tqs_thread_sleeps);
}

static int
taskq_kstats_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);

	taskq_t *tq = ksp->ks_private;
	taskq_kstats_t *tqks = ksp->ks_data;

	tqks->tqks_threads_max.value.ui64 = tq->tq_maxthreads;
	tqks->tqks_entry_pool_min.value.ui64 = tq->tq_minalloc;
	tqks->tqks_entry_pool_max.value.ui64 = tq->tq_maxalloc;

	taskq_sums_t *tqs = &tq->tq_sums;

	tqks->tqks_threads_active.value.ui64 =
	    wmsum_value(&tqs->tqs_threads_active);
	tqks->tqks_threads_idle.value.ui64 =
	    wmsum_value(&tqs->tqs_threads_idle);
	tqks->tqks_threads_total.value.ui64 =
	    wmsum_value(&tqs->tqs_threads_total);
	tqks->tqks_tasks_pending.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_pending);
	tqks->tqks_tasks_priority.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_priority);
	tqks->tqks_tasks_total.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_total);
	tqks->tqks_tasks_delayed.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_delayed);
	tqks->tqks_entries_free.value.ui64 =
	    wmsum_value(&tqs->tqs_entries_free);
	tqks->tqks_threads_created.value.ui64 =
	    wmsum_value(&tqs->tqs_threads_created);
	tqks->tqks_threads_destroyed.value.ui64 =
	    wmsum_value(&tqs->tqs_threads_destroyed);
	tqks->tqks_tasks_dispatched.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_dispatched);
	tqks->tqks_tasks_dispatched_delayed.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_dispatched_delayed);
	tqks->tqks_tasks_executed_normal.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_executed_normal);
	tqks->tqks_tasks_executed_priority.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_executed_priority);
	tqks->tqks_tasks_executed.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_executed);
	tqks->tqks_tasks_delayed_requeued.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_delayed_requeued);
	tqks->tqks_tasks_cancelled.value.ui64 =
	    wmsum_value(&tqs->tqs_tasks_cancelled);
	tqks->tqks_thread_wakeups.value.ui64 =
	    wmsum_value(&tqs->tqs_thread_wakeups);
	tqks->tqks_thread_wakeups_nowork.value.ui64 =
	    wmsum_value(&tqs->tqs_thread_wakeups_nowork);
	tqks->tqks_thread_sleeps.value.ui64 =
	    wmsum_value(&tqs->tqs_thread_sleeps);

	return (0);
}

static void
taskq_kstats_init(taskq_t *tq)
{
	char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
	snprintf(name, sizeof (name), "%s.%d", tq->tq_name, tq->tq_instance);

	kstat_t *ksp = kstat_create("taskq", 0, name, "misc",
	    KSTAT_TYPE_NAMED, sizeof (taskq_kstats_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (ksp == NULL)
		return;

	ksp->ks_private = tq;
	ksp->ks_update = taskq_kstats_update;
	ksp->ks_data = kmem_alloc(sizeof (taskq_kstats_t), KM_SLEEP);
	memcpy(ksp->ks_data, &taskq_kstats_template, sizeof (taskq_kstats_t));
	kstat_install(ksp);

	tq->tq_ksp = ksp;
}

static void
taskq_kstats_fini(taskq_t *tq)
{
	if (tq->tq_ksp == NULL)
		return;

	kmem_free(tq->tq_ksp->ks_data, sizeof (taskq_kstats_t));
	kstat_delete(tq->tq_ksp);

	tq->tq_ksp = NULL;
}

taskq_t *
taskq_create(const char *name, int threads_arg, pri_t pri,
    int minalloc, int maxalloc, uint_t flags)

@ -1104,6 +1385,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
	init_waitqueue_head(&tq->tq_wait_waitq);
	tq->tq_lock_class = TQ_LOCK_GENERAL;
	INIT_LIST_HEAD(&tq->tq_taskqs);
	taskq_stats_init(tq);

	if (flags & TASKQ_PREPOPULATE) {
		spin_lock_irqsave_nested(&tq->tq_lock, irqflags,

@ -1137,14 +1419,17 @@ taskq_create(const char *name, int threads_arg, pri_t pri,

	if (rc) {
		taskq_destroy(tq);
		tq = NULL;
	} else {
		down_write(&tq_list_sem);
		tq->tq_instance = taskq_find_by_name(name) + 1;
		list_add_tail(&tq->tq_taskqs, &tq_list);
		up_write(&tq_list_sem);
		return (NULL);
	}

	down_write(&tq_list_sem);
	tq->tq_instance = taskq_find_by_name(name) + 1;
	list_add_tail(&tq->tq_taskqs, &tq_list);
	up_write(&tq_list_sem);

	/* Install kstats late, because the name includes tq_instance */
	taskq_kstats_init(tq);

	return (tq);
}
EXPORT_SYMBOL(taskq_create);

@ -1177,6 +1462,8 @@ taskq_destroy(taskq_t *tq)

	taskq_wait(tq);

	taskq_kstats_fini(tq);

	/* remove taskq from global list used by the kstats */
	down_write(&tq_list_sem);
	list_del(&tq->tq_taskqs);

@ -1230,6 +1517,7 @@ taskq_destroy(taskq_t *tq)

	spin_unlock_irqrestore(&tq->tq_lock, flags);

	taskq_stats_fini(tq);
	kmem_strfree(tq->tq_name);
	kmem_free(tq, sizeof (taskq_t));
}

@ -1271,6 +1559,100 @@ taskq_create_synced(const char *name, int nthreads, pri_t pri,
}
EXPORT_SYMBOL(taskq_create_synced);

static kstat_t *taskq_summary_ksp = NULL;

static int
spl_taskq_kstat_headers(char *buf, size_t size)
{
	size_t n = snprintf(buf, size,
	    "%-20s | %-17s | %-23s\n"
	    "%-20s | %-17s | %-23s\n"
	    "%-20s | %-17s | %-23s\n",
	    "", "threads", "tasks on queue",
	    "taskq name", "tot [act idl] max", " pend [ norm high] dly",
	    "--------------------", "-----------------",
	    "-----------------------");
	return (n >= size ? ENOMEM : 0);
}
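
/*
 * The summary produced reads roughly like this (values illustrative):
 *
 *                      | threads           | tasks on queue
 * taskq name           | tot [act idl] max |  pend [ norm high] dly
 * -------------------- | ----------------- | -----------------------
 * z_wr_iss.0           |   6 [  2   4]  12 |     3 [    3     0]   0
 */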

static int
spl_taskq_kstat_data(char *buf, size_t size, void *data)
{
	struct list_head *tql = NULL;
	taskq_t *tq;
	char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
	char threads[25];
	char tasks[30];
	size_t n;
	int err = 0;

	down_read(&tq_list_sem);
	list_for_each_prev(tql, &tq_list) {
		tq = list_entry(tql, taskq_t, tq_taskqs);

		mutex_enter(tq->tq_ksp->ks_lock);
		taskq_kstats_update(tq->tq_ksp, KSTAT_READ);
		taskq_kstats_t *tqks = tq->tq_ksp->ks_data;

		snprintf(name, sizeof (name), "%s.%d", tq->tq_name,
		    tq->tq_instance);
		snprintf(threads, sizeof (threads), "%3llu [%3llu %3llu] %3llu",
		    tqks->tqks_threads_total.value.ui64,
		    tqks->tqks_threads_active.value.ui64,
		    tqks->tqks_threads_idle.value.ui64,
		    tqks->tqks_threads_max.value.ui64);
		snprintf(tasks, sizeof (tasks), "%5llu [%5llu %5llu] %3llu",
		    tqks->tqks_tasks_total.value.ui64,
		    tqks->tqks_tasks_pending.value.ui64,
		    tqks->tqks_tasks_priority.value.ui64,
		    tqks->tqks_tasks_delayed.value.ui64);

		mutex_exit(tq->tq_ksp->ks_lock);

		n = snprintf(buf, size, "%-20s | %-17s | %-23s\n",
		    name, threads, tasks);
		if (n >= size) {
			err = ENOMEM;
			break;
		}

		buf = &buf[n];
		size -= n;
	}

	up_read(&tq_list_sem);

	return (err);
}

static void
spl_taskq_kstat_init(void)
{
	kstat_t *ksp = kstat_create("taskq", 0, "summary", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);

	if (ksp == NULL)
		return;

	ksp->ks_data = (void *)(uintptr_t)1;
	ksp->ks_ndata = 1;
	kstat_set_raw_ops(ksp, spl_taskq_kstat_headers,
	    spl_taskq_kstat_data, NULL);
	kstat_install(ksp);

	taskq_summary_ksp = ksp;
}

static void
spl_taskq_kstat_fini(void)
{
	if (taskq_summary_ksp == NULL)
		return;

	kstat_delete(taskq_summary_ksp);
	taskq_summary_ksp = NULL;
}

static unsigned int spl_taskq_kick = 0;

/*

@ -1451,12 +1833,16 @@ spl_taskq_init(void)
	 */
	dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;

	spl_taskq_kstat_init();

	return (0);
}

void
spl_taskq_fini(void)
{
	spl_taskq_kstat_fini();

	taskq_destroy(dynamic_taskq);
	dynamic_taskq = NULL;
|
|||
|
||||
schedule();
|
||||
#endif
|
||||
/*
|
||||
* Dequeued SIGSTOP/SIGTSTP.
|
||||
* Check if process has other singal pending.
|
||||
*/
|
||||
if (signal_pending(current))
|
||||
return (1);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
|
|
|
@@ -58,22 +58,16 @@
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#include <linux/version.h>
#endif

#ifdef _KERNEL
#if defined(MAX_ORDER)
#define ABD_MAX_ORDER (MAX_ORDER)
#elif defined(MAX_PAGE_ORDER)
#define ABD_MAX_ORDER (MAX_PAGE_ORDER)
#endif
#else
#define ABD_MAX_ORDER (1)
#endif

typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;

@@ -193,11 +187,9 @@ abd_t *abd_zero_scatter = NULL;

struct page;
/*
 * _KERNEL - Will point to ZERO_PAGE if it is available or it will be
 *           an allocated zero'd PAGESIZE buffer.
 * Userspace - Will be an allocated zero'ed PAGESIZE buffer.
 *
 * abd_zero_page is assigned to each of the pages of abd_zero_scatter.
 * abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will
 * point to ZERO_PAGE if it is available or it will be an allocated zero'd
 * PAGESIZE buffer.
 */
static struct page *abd_zero_page = NULL;

@@ -232,7 +224,6 @@ abd_free_struct_impl(abd_t *abd)
	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
}

#ifdef _KERNEL
static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;

/*

@@ -509,7 +500,7 @@ abd_alloc_zero_scatter(void)
	ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK;

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);

@@ -520,134 +511,6 @@ abd_alloc_zero_scatter(void)
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#else /* _KERNEL */

#ifndef PAGE_SHIFT
#define PAGE_SHIFT (highbit64(PAGESIZE)-1)
#endif

#define zfs_kmap_local(chunk)		((void *)chunk)
#define zfs_kunmap_local(addr)		do { (void)(addr); } while (0)
#define local_irq_save(flags)		do { (void)(flags); } while (0)
#define local_irq_restore(flags)	do { (void)(flags); } while (0)
#define nth_page(pg, i) \
	((struct page *)((void *)(pg) + (i) * PAGESIZE))

struct scatterlist {
	struct page *page;
	int length;
	int end;
};

static void
sg_init_table(struct scatterlist *sg, int nr)
{
	memset(sg, 0, nr * sizeof (struct scatterlist));
	sg[nr - 1].end = 1;
}

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
 */
static void
abd_free_sg_table(abd_t *abd)
{
	int nents = ABD_SCATTER(abd).abd_nents;
	vmem_free(ABD_SCATTER(abd).abd_sgl,
	    nents * sizeof (struct scatterlist));
}

#define for_each_sg(sgl, sg, nr, i)	\
	for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))

static inline void
sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
    unsigned int offset)
{
	/* currently we don't use offset */
	ASSERT(offset == 0);
	sg->page = page;
	sg->length = len;
}

static inline struct page *
sg_page(struct scatterlist *sg)
{
	return (sg->page);
}

static inline struct scatterlist *
sg_next(struct scatterlist *sg)
{
	if (sg->end)
		return (NULL);

	return (sg + 1);
}

void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(size);
	struct scatterlist *sg;
	int i;

	ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);
	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
		sg_set_page(sg, p, PAGESIZE, 0);
	}
	ABD_SCATTER(abd).abd_nents = nr_pages;
}

void
abd_free_chunks(abd_t *abd)
{
	int i, n = ABD_SCATTER(abd).abd_nents;
	struct scatterlist *sg;

	abd_for_each_sg(abd, sg, n, i) {
		struct page *p = nth_page(sg_page(sg), 0);
		umem_free_aligned(p, PAGESIZE);
	}
	abd_free_sg_table(abd);
}

static void
abd_alloc_zero_scatter(void)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	struct scatterlist *sg;
	int i;

	abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
	memset(abd_zero_page, 0, PAGESIZE);
	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);

	sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#endif /* _KERNEL */

boolean_t
abd_size_alloc_linear(size_t size)
{

@@ -712,14 +575,10 @@ abd_free_zero_scatter(void)
	abd_free_struct(abd_zero_scatter);
	abd_zero_scatter = NULL;
	ASSERT3P(abd_zero_page, !=, NULL);
#if defined(_KERNEL)
#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
	abd_unmark_zfs_page(abd_zero_page);
	__free_page(abd_zero_page);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */
#else
	umem_free_aligned(abd_zero_page, PAGESIZE);
#endif /* _KERNEL */
}

static int

@@ -1014,8 +873,6 @@ abd_cache_reap_now(void)
{
}

#if defined(_KERNEL)

/*
 * This is abd_iter_page(), the function underneath abd_iterate_page_func().
 * It yields the next page struct and data offset and size within it, without

@@ -1297,5 +1154,3 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
	"Maximum order allocation used for a scatter ABD.");

#endif /* _KERNEL */

@@ -201,9 +201,9 @@ arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
	 * See also the comment above zfs_arc_shrinker_limit.
	 */
	int64_t can_free = btop(arc_evictable_memory());
	int64_t limit = zfs_arc_shrinker_limit != 0 ?
	    zfs_arc_shrinker_limit : INT64_MAX;
	return (MIN(can_free, limit));
	if (current_is_kswapd() && zfs_arc_shrinker_limit)
		can_free = MIN(can_free, zfs_arc_shrinker_limit);
	return (can_free);
}

static unsigned long

@@ -1101,8 +1101,8 @@ zfsctl_snapshot_mount(struct path *path, int flags)
	zfsvfs_t *snap_zfsvfs;
	zfs_snapentry_t *se;
	char *full_name, *full_path;
	char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL,
	    NULL };
	char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n",
	    NULL, NULL, NULL };
	char *envp[] = { NULL };
	int error;
	struct path spath;

@@ -1153,8 +1153,8 @@ zfsctl_snapshot_mount(struct path *path, int flags)
	 * value from call_usermodehelper() will be (exitcode << 8 + signal).
	 */
	dprintf("mount; name=%s path=%s\n", full_name, full_path);
	argv[5] = full_name;
	argv[6] = full_path;
	argv[6] = full_name;
	argv[7] = full_path;
	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	if (error) {
		if (!(error & MOUNT_BUSY << 8)) {

@@ -69,6 +69,7 @@
#include <sys/zpl.h>
#include <sys/zil.h>
#include <sys/sa_impl.h>
#include <linux/mm_compat.h>

/*
 * Programming rules.

@@ -1820,24 +1821,36 @@ zfs_setattr_dir(znode_t *dzp)
		    &gid, sizeof (gid));
	}

	if (zp->z_projid != dzp->z_projid) {

	uint64_t projid = dzp->z_projid;
	if (zp->z_projid != projid) {
		if (!(zp->z_pflags & ZFS_PROJID)) {
			zp->z_pflags |= ZFS_PROJID;
			SA_ADD_BULK_ATTR(bulk, count,
			    SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
			    sizeof (zp->z_pflags));
			err = sa_add_projid(zp->z_sa_hdl, tx, projid);
			if (unlikely(err == EEXIST)) {
				err = 0;
			} else if (err != 0) {
				goto sa_add_projid_err;
			} else {
				projid = ZFS_INVALID_PROJID;
			}
		}

		zp->z_projid = dzp->z_projid;
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
		    NULL, &zp->z_projid, sizeof (zp->z_projid));
		if (projid != ZFS_INVALID_PROJID) {
			zp->z_projid = projid;
			SA_ADD_BULK_ATTR(bulk, count,
			    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
			    sizeof (zp->z_projid));
		}
	}

sa_add_projid_err:
	mutex_exit(&dzp->z_lock);

	if (likely(count > 0)) {
		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		dmu_tx_commit(tx);
	} else if (projid == ZFS_INVALID_PROJID) {
		dmu_tx_commit(tx);
	} else {
		dmu_tx_abort(tx);
	}

@@ -295,6 +295,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
	struct super_block *s;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);

@@ -326,6 +327,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = -SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = -SET_ERROR(EBUSY);

@@ -349,7 +351,11 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
			return (ERR_PTR(err));
		}
		s->s_flags |= SB_ACTIVE;
	} else if ((flags ^ s->s_flags) & SB_RDONLY) {
	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
		/*
		 * Skip the ro check for snapshots, since a snapshot is
		 * always ro regardless of whether the ro flag was passed
		 * to mount.
		 */
		deactivate_locked_super(s);
		return (ERR_PTR(-EBUSY));
	}

@@ -20,6 +20,7 @@
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
 * Copyright (c) 2024, Klara, Inc.
 */

@@ -1089,11 +1090,42 @@ static const struct block_device_operations zvol_ops = {
#endif
};

/*
 * Since 6.9, Linux has been removing queue limit setters in favour of an
 * initial queue_limits struct applied when the device is open. Since 6.11,
 * queue_limits is being extended to allow more things to be applied when the
 * device is open. Setters are also being removed for this.
 *
 * For OpenZFS, this means that depending on kernel version, some options may
 * be set up before the device is open, and some applied to an open device
 * (queue) after the fact.
 *
 * We manage this complexity by having our own limits struct,
 * zvol_queue_limits_t, in which we carry any queue config that we're
 * interested in setting. This structure is the same on all kernels.
 *
 * These limits are then applied to the queue at device open time by the most
 * appropriate method for the kernel.
 *
 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
 * blk_alloc_disk() exists). This converts our limits struct to a proper Linux
 * struct queue_limits, and passes it in. Any fields added in later kernels are
 * (obviously) not set up here.
 *
 * zvol_queue_limits_apply() is called on all kernel versions after the queue
 * is created, and applies any remaining config. Before 6.9 that will be
 * everything, via setter methods. After 6.9 that will be whatever couldn't be
 * put into struct queue_limits. (This implies that zvol_queue_limits_apply()
 * will always be a no-op on the latest kernel we support).
 */
typedef struct zvol_queue_limits {
	unsigned int zql_max_hw_sectors;
	unsigned short zql_max_segments;
	unsigned int zql_max_segment_size;
	unsigned int zql_io_opt;
	unsigned int zql_physical_block_size;
	unsigned int zql_max_discard_sectors;
	unsigned int zql_discard_granularity;
} zvol_queue_limits_t;

static void

@@ -1162,6 +1194,11 @@ zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
	}

	limits->zql_io_opt = zv->zv_volblocksize;

	limits->zql_physical_block_size = zv->zv_volblocksize;
	limits->zql_max_discard_sectors =
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
	limits->zql_discard_granularity = zv->zv_volblocksize;
}

#ifdef HAVE_BLK_ALLOC_DISK_2ARG

@@ -1174,18 +1211,35 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits,
	qlimits->max_segments = limits->zql_max_segments;
	qlimits->max_segment_size = limits->zql_max_segment_size;
	qlimits->io_opt = limits->zql_io_opt;
	qlimits->physical_block_size = limits->zql_physical_block_size;
	qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
	qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
	qlimits->discard_granularity = limits->zql_discard_granularity;
#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
	qlimits->features =
	    BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;
#endif
}
#else
#endif

static void
zvol_queue_limits_apply(zvol_queue_limits_t *limits,
    struct request_queue *queue)
{
#ifndef HAVE_BLK_ALLOC_DISK_2ARG
	blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
	blk_queue_max_segments(queue, limits->zql_max_segments);
	blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
	blk_queue_io_opt(queue, limits->zql_io_opt);
}
	blk_queue_physical_block_size(queue, limits->zql_physical_block_size);
	blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);
	blk_queue_discard_granularity(queue, limits->zql_discard_granularity);
#endif
#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
	blk_queue_set_write_cache(queue, B_TRUE);
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);
#endif
}

static int
zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
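
Editor's note: a minimal sketch (not part of the diff) of how the pieces
above compose, using only names introduced in this diff; the caller shape is
illustrative rather than the actual allocation paths that follow:

    zvol_queue_limits_t limits;
    zvol_queue_limits_init(&limits, zv, use_blk_mq);

    #ifdef HAVE_BLK_ALLOC_DISK_2ARG
            /* 6.9+: convert and hand the limits to the kernel at allocation */
            struct queue_limits qlimits;
            zvol_queue_limits_convert(&limits, &qlimits);
            struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
    #else
            /* older kernels: allocate first, configure via setters after */
            struct gendisk *disk = blk_alloc_disk(NUMA_NO_NODE);
    #endif

    /* always called; a no-op where struct queue_limits already covered it */
    zvol_queue_limits_apply(&limits, disk->queue);
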
@@ -1198,7 +1252,6 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)

	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct queue_limits qlimits;
	zvol_queue_limits_convert(limits, &qlimits);

@@ -1211,6 +1264,7 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
	zso->zvo_disk = disk;
	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;

#else
	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)

@@ -1223,7 +1277,6 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
	}

	zso->zvo_disk->queue = zso->zvo_queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif /* HAVE_BLK_ALLOC_DISK */
#else
	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);

@@ -1237,8 +1290,10 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
	}

	zso->zvo_disk->queue = zso->zvo_queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */

	zvol_queue_limits_apply(limits, zso->zvo_queue);

	return (0);
}

@@ -1260,7 +1315,6 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
		return (1);
	}
	zso->zvo_queue = zso->zvo_disk->queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);
	zso->zvo_disk->minors = ZVOL_MINORS;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct queue_limits qlimits;

@@ -1291,10 +1345,11 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)

	/* Our queue is now created, assign it to our disk */
	zso->zvo_disk->queue = zso->zvo_queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif

	zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif
#endif

	return (0);
}

@@ -1303,7 +1358,7 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
 * request queue and generic disk structures for the block device.
 */
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name)
zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
{
	zvol_state_t *zv;
	struct zvol_state_os *zso;

@@ -1323,6 +1378,7 @@ zvol_alloc(dev_t dev, const char *name)
	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso = zso;
	zv->zv_volmode = volmode;
	zv->zv_volblocksize = volblocksize;

	list_link_init(&zv->zv_next);
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);

@@ -1360,8 +1416,6 @@ zvol_alloc(dev_t dev, const char *name)
	if (ret != 0)
		goto out_kmem;

	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);

	/* Limit read-ahead to a single page to prevent over-prefetching. */
	blk_queue_set_read_ahead(zso->zvo_queue, 1);

@@ -1370,9 +1424,6 @@ zvol_alloc(dev_t dev, const char *name)
		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
	}

	/* Enable /proc/diskstats */
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);

	zso->zvo_queue->queuedata = zv;
	zso->zvo_dev = dev;
	zv->zv_open_count = 0;

@@ -1617,7 +1668,8 @@ zvol_os_create_minor(const char *name)
	if (error)
		goto out_dmu_objset_disown;

	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
	zv = zvol_alloc(MKDEV(zvol_major, minor), name,
	    doi->doi_data_block_size);
	if (zv == NULL) {
		error = SET_ERROR(EAGAIN);
		goto out_dmu_objset_disown;

@@ -1627,7 +1679,6 @@ zvol_os_create_minor(const char *name)
	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

@@ -1639,14 +1690,6 @@ zvol_os_create_minor(const char *name)

	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);

	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
#ifdef QUEUE_FLAG_DISCARD
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
#endif

@@ -754,6 +754,12 @@ zpool_feature_init(void)
	    "Support for raidz expansion",
	    ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);

	zfeature_register(SPA_FEATURE_FAST_DEDUP,
	    "com.klarasystems:fast_dedup", "fast_dedup",
	    "Support for advanced deduplication",
	    ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL,
	    sfeatures);

	zfs_mod_list_supported_free(sfeatures);
}

@@ -113,7 +113,7 @@ abd_verify(abd_t *abd)
	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
	    ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
	    ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
	    ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD));
	    ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD));
	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
	if (abd_is_linear(abd)) {

@@ -603,13 +603,11 @@ abd_get_zeros(size_t size)
}

/*
 * Allocate a linear ABD structure for buf.
 * Create a linear ABD for an existing buf.
 */
abd_t *
abd_get_from_buf(void *buf, size_t size)
static abd_t *
abd_get_from_buf_impl(abd_t *abd, void *buf, size_t size)
{
	abd_t *abd = abd_alloc_struct(0);

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	/*

@@ -625,6 +623,20 @@ abd_get_from_buf(void *buf, size_t size)
	return (abd);
}

abd_t *
abd_get_from_buf(void *buf, size_t size)
{
	abd_t *abd = abd_alloc_struct(0);
	return (abd_get_from_buf_impl(abd, buf, size));
}

abd_t *
abd_get_from_buf_struct(abd_t *abd, void *buf, size_t size)
{
	abd_init_struct(abd);
	return (abd_get_from_buf_impl(abd, buf, size));
}
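
Editor's note: abd_get_from_buf_struct() exists so a caller can wrap an
existing buffer in a caller-provided (typically stack-allocated) abd_t with
no heap allocation; a sketch of the pattern the arc.c and embedded-bp hunks
below adopt:

    abd_t dabd;
    abd_get_from_buf_struct(&dabd, buf, len);  /* wrap, no allocation */
    /* ... use &dabd, e.g. as the output of zio_decompress_data() ... */
    abd_free(&dabd);                           /* tears down the wrapper only */
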

/*
 * Get the raw buffer associated with a linear ABD.
 */

@@ -1767,12 +1767,12 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
	uint64_t csize;
	uint64_t lsize = HDR_GET_LSIZE(hdr);
	uint64_t psize = HDR_GET_PSIZE(hdr);
	void *tmpbuf = NULL;
	abd_t *abd = hdr->b_l1hdr.b_pabd;
	boolean_t free_abd = B_FALSE;

	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
	ASSERT(HDR_AUTHENTICATED(hdr));
	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
	ASSERT3P(abd, !=, NULL);

	/*
	 * The MAC is calculated on the compressed data that is stored on disk.

@@ -1784,14 +1784,13 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
	 */
	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
	    !HDR_COMPRESSION_ENABLED(hdr)) {

		abd = NULL;
		csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
		    hdr->b_l1hdr.b_pabd, &tmpbuf, lsize, hdr->b_complevel);
		ASSERT3P(tmpbuf, !=, NULL);
		    hdr->b_l1hdr.b_pabd, &abd, lsize, hdr->b_complevel);
		ASSERT3P(abd, !=, NULL);
		ASSERT3U(csize, <=, psize);
		abd = abd_get_from_buf(tmpbuf, lsize);
		abd_take_ownership_of_buf(abd, B_TRUE);
		abd_zero_off(abd, csize, psize - csize);
		free_abd = B_TRUE;
	}

	/*

@@ -1810,16 +1809,10 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)

	if (ret == 0)
		arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
	else if (ret != ENOENT)
		goto error;
	else if (ret == ENOENT)
		ret = 0;

	if (tmpbuf != NULL)
		abd_free(abd);

	return (0);

error:
	if (tmpbuf != NULL)
	if (free_abd)
		abd_free(abd);

	return (ret);

@@ -1836,7 +1829,6 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
{
	int ret;
	abd_t *cabd = NULL;
	void *tmp = NULL;
	boolean_t no_crypt = B_FALSE;
	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);

@@ -1871,17 +1863,14 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
	 * linear buffer and wrapping it in an abd later.
	 */
	cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0);
	tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));

	ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
	    hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
	    hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
	    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
	if (ret != 0) {
		abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
		goto error;
	}

	abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
	arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
	    arc_hdr_size(hdr), hdr);
	hdr->b_l1hdr.b_pabd = cabd;

@@ -2123,10 +2112,14 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
			/* Skip byteswapping and checksumming (already done) */
			return (0);
		} else {
			abd_t dabd;
			abd_get_from_buf_struct(&dabd, buf->b_data,
			    HDR_GET_LSIZE(hdr));
			error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
			    hdr->b_l1hdr.b_pabd, buf->b_data,
			    hdr->b_l1hdr.b_pabd, &dabd,
			    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr),
			    &hdr->b_complevel);
			abd_free(&dabd);

			/*
			 * Absent hardware errors or software bugs, this should

@@ -8531,18 +8524,15 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
	    !HDR_COMPRESSION_ENABLED(hdr)) {
		abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
		    ARC_HDR_USE_RESERVE);
		void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));

		ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
		    hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
		    hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
		    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
		if (ret != 0) {
			abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
			arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
			goto error;
		}

		abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
		    arc_hdr_size(hdr), hdr);
		hdr->b_l1hdr.b_pabd = cabd;

@@ -9037,9 +9027,8 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
	}

	if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
		size_t bufsize = MAX(size, asize);
		void *buf = zio_buf_alloc(bufsize);
		uint64_t csize = zio_compress_data(compress, to_write, &buf,
		cabd = abd_alloc_for_io(MAX(size, asize), ismd);
		uint64_t csize = zio_compress_data(compress, to_write, &cabd,
		    size, hdr->b_complevel);
		if (csize > psize) {
			/*

@@ -9047,13 +9036,12 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
			 * psize. Even if it fits into asize, it does not
			 * matter, since checksum will never match on read.
			 */
			zio_buf_free(buf, bufsize);
			abd_free(cabd);
			return (SET_ERROR(EIO));
		}
		if (asize > csize)
			memset((char *)buf + csize, 0, asize - csize);
		to_write = cabd = abd_get_from_buf(buf, bufsize);
		abd_take_ownership_of_buf(cabd, B_TRUE);
			abd_zero_off(cabd, csize, asize - csize);
		to_write = cabd;
	}

	if (HDR_ENCRYPTED(hdr)) {

@@ -9158,12 +9146,17 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
	 */
	for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
		/*
		 * If pass == 1 or 3, we cache MRU metadata and data
		 * respectively.
		 * pass == 0: MFU meta
		 * pass == 1: MRU meta
		 * pass == 2: MFU data
		 * pass == 3: MRU data
		 */
		if (l2arc_mfuonly) {
		if (l2arc_mfuonly == 1) {
			if (pass == 1 || pass == 3)
				continue;
		} else if (l2arc_mfuonly > 1) {
			if (pass == 3)
				continue;
		}

		uint64_t passed_sz = 0;
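
Editor's note: as restructured above, l2arc_mfuonly=1 skips both MRU passes
(1 and 3), while any value greater than 1 skips only pass 3; that is, MRU
metadata is still cached but MRU data is not.
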
@@ -10179,7 +10172,6 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
{
	int err = 0;
	zio_cksum_t cksum;
	abd_t *abd = NULL;
	uint64_t asize;

	ASSERT(this_lbp != NULL && next_lbp != NULL);

@@ -10241,16 +10233,22 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
	switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
	case ZIO_COMPRESS_OFF:
		break;
	case ZIO_COMPRESS_LZ4:
		abd = abd_alloc_for_io(asize, B_TRUE);
	case ZIO_COMPRESS_LZ4: {
		abd_t *abd = abd_alloc_linear(asize, B_TRUE);
		abd_copy_from_buf_off(abd, this_lb, 0, asize);
		if ((err = zio_decompress_data(
		abd_t dabd;
		abd_get_from_buf_struct(&dabd, this_lb, sizeof (*this_lb));
		err = zio_decompress_data(
		    L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
		    abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) {
		    abd, &dabd, asize, sizeof (*this_lb), NULL);
		abd_free(&dabd);
		abd_free(abd);
		if (err != 0) {
			err = SET_ERROR(EINVAL);
			goto cleanup;
		}
		break;
	}
	default:
		err = SET_ERROR(EINVAL);
		goto cleanup;

@@ -10267,8 +10265,6 @@ cleanup:
		l2arc_log_blk_fetch_abort(*next_io);
		*next_io = NULL;
	}
	if (abd != NULL)
		abd_free(abd);
	return (err);
}

@@ -10504,7 +10500,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
	uint64_t psize, asize;
	zio_t *wzio;
	l2arc_lb_abd_buf_t *abd_buf;
	uint8_t *tmpbuf = NULL;
	abd_t *abd = NULL;
	l2arc_lb_ptr_buf_t *lb_ptr_buf;

	VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);

@@ -10527,7 +10523,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)

	/* try to compress the buffer */
	psize = zio_compress_data(ZIO_COMPRESS_LZ4,
	    abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0);
	    abd_buf->abd, &abd, sizeof (*lb), 0);

	/* a log block is never entirely zero */
	ASSERT(psize != 0);

@@ -10553,27 +10549,26 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
	    ZIO_CHECKSUM_FLETCHER_4);
	if (asize < sizeof (*lb)) {
		/* compression succeeded */
		memset(tmpbuf + psize, 0, asize - psize);
		abd_zero_off(abd, psize, asize - psize);
		L2BLK_SET_COMPRESS(
		    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
		    ZIO_COMPRESS_LZ4);
	} else {
		/* compression failed */
		memcpy(tmpbuf, lb, sizeof (*lb));
		abd_copy_from_buf_off(abd, lb, 0, sizeof (*lb));
		L2BLK_SET_COMPRESS(
		    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
		    ZIO_COMPRESS_OFF);
	}

	/* checksum what we're about to write */
	fletcher_4_native(tmpbuf, asize, NULL,
	abd_fletcher_4_native(abd, asize, NULL,
	    &l2dhdr->dh_start_lbps[0].lbp_cksum);

	abd_free(abd_buf->abd);

	/* perform the write itself */
	abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));
	abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
	abd_buf->abd = abd;
	wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
	    asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);

@@ -142,8 +142,13 @@ decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen)
	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
		uint8_t dstbuf[BPE_PAYLOAD_SIZE];
		decode_embedded_bp_compressed(bp, dstbuf);
		VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp),
		    dstbuf, buf, psize, buflen, NULL));
		abd_t cabd, dabd;
		abd_get_from_buf_struct(&cabd, dstbuf, psize);
		abd_get_from_buf_struct(&dabd, buf, buflen);
		VERIFY0(zio_decompress_data(BP_GET_COMPRESS(bp), &cabd,
		    &dabd, psize, buflen, NULL));
		abd_free(&dabd);
		abd_free(&cabd);
	} else {
		ASSERT3U(lsize, ==, psize);
		decode_embedded_bp_compressed(bp, buf);

@@ -40,6 +40,9 @@ static dataset_kstat_values_t empty_dataset_kstats = {
	{
		{ "zil_commit_count", KSTAT_DATA_UINT64 },
		{ "zil_commit_writer_count", KSTAT_DATA_UINT64 },
		{ "zil_commit_error_count", KSTAT_DATA_UINT64 },
		{ "zil_commit_stall_count", KSTAT_DATA_UINT64 },
		{ "zil_commit_suspend_count", KSTAT_DATA_UINT64 },
		{ "zil_itx_count", KSTAT_DATA_UINT64 },
		{ "zil_itx_indirect_count", KSTAT_DATA_UINT64 },
		{ "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 },

@@ -201,6 +204,9 @@ dataset_kstats_destroy(dataset_kstats_t *dk)
void
dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
{
	if (dk->dk_kstats == NULL)
		return;

	dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
	char *ds_name;

module/zfs/ddt.c — 1501 lines changed; diff suppressed because it is too large.

@@ -0,0 +1,764 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2023, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/ddt.h>
#include <sys/dmu_tx.h>
#include <sys/dmu.h>
#include <sys/ddt_impl.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>

/*
 * No more than this many txgs before swapping logs.
 */
uint_t zfs_dedup_log_txg_max = 8;

/*
 * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
 * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
 */
uint64_t zfs_dedup_log_mem_max = 0;
uint_t zfs_dedup_log_mem_max_percent = 1;


static kmem_cache_t *ddt_log_entry_flat_cache;
static kmem_cache_t *ddt_log_entry_trad_cache;

#define DDT_LOG_ENTRY_FLAT_SIZE \
	(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
#define DDT_LOG_ENTRY_TRAD_SIZE \
	(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)

#define DDT_LOG_ENTRY_SIZE(ddt) \
	_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)

void
ddt_log_init(void)
{
	ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
	    DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
	ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
	    DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * Max memory for log AVL entries. At least 1M, because we need
	 * something (that's ~3800 entries per tree). They can say 100% if they
	 * want; it just means they're at the mercy of the txg flush limit.
	 */
	if (zfs_dedup_log_mem_max == 0) {
		zfs_dedup_log_mem_max_percent =
		    MIN(zfs_dedup_log_mem_max_percent, 100);
		zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
		    zfs_dedup_log_mem_max_percent / 100;
	}
	zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
}
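
Editor's note: the "~3800 entries per tree" figure in the comment above is
just the 1 MiB floor divided by the in-memory entry size; a sketch assuming
a flat-phys entry of roughly 270 bytes (the exact sizeof is build-dependent):

    uint64_t floor_bytes = 1ULL << 20;              /* 1 MiB minimum */
    uint64_t approx_entries = floor_bytes / 270;    /* ~3880 entries */
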
void
ddt_log_fini(void)
{
	kmem_cache_destroy(ddt_log_entry_trad_cache);
	kmem_cache_destroy(ddt_log_entry_flat_cache);
}

static void
ddt_log_name(ddt_t *ddt, char *name, uint_t n)
{
	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
	    zio_checksum_table[ddt->ddt_checksum].ci_name, n);
}

static void
ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);

	ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
	DLH_SET_VERSION(hdr, 1);
	DLH_SET_FLAGS(hdr, ddl->ddl_flags);
	hdr->dlh_length = ddl->ddl_length;
	hdr->dlh_first_txg = ddl->ddl_first_txg;
	hdr->dlh_checkpoint = ddl->ddl_checkpoint;

	dmu_buf_rele(db, FTAG);
}

static void
ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);
	ASSERT3U(ddl->ddl_object, ==, 0);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
	    sizeof (uint64_t), 1, &ddl->ddl_object, tx));
	ddl->ddl_length = 0;
	ddl->ddl_first_txg = tx->tx_txg;
	ddt_log_update_header(ddt, ddl, tx);
}

static void
ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
	ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
}

static void
ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);

	if (ddl->ddl_object == 0)
		return;

	ASSERT0(ddl->ddl_length);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
	VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));

	ddl->ddl_object = 0;
}

void
ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
	ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
}

static void
ddt_log_update_stats(ddt_t *ddt)
{
	/*
	 * Log object stats. We count the number of live entries in the log
	 * tree, even if there are more than on disk, and even if the same
	 * entry is on both append and flush trees, because that's more what
	 * the user expects to see. This does mean the on-disk size is not
	 * really correlated with the number of entries, but I don't think
	 * that's reasonable to expect anyway.
	 */
	dmu_object_info_t doi;
	uint64_t nblocks;
	dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi);
	nblocks = doi.doi_physical_blocks_512;
	dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi);
	nblocks += doi.doi_physical_blocks_512;

	ddt_object_t *ddo = &ddt->ddt_log_stats;
	ddo->ddo_count =
	    avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
	    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
	ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
	ddo->ddo_dspace = nblocks << 9;
}

void
ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
{
	ASSERT3U(nentries, >, 0);
	ASSERT3P(dlu->dlu_dbp, ==, NULL);

	if (ddt->ddt_log_active->ddl_object == 0)
		ddt_log_create(ddt, tx);

	/*
	 * We want to store as many entries as we can in a block, but never
	 * split an entry across block boundaries.
	 */
	size_t reclen = P2ALIGN_TYPED(
	    sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
	    DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
	ASSERT3U(reclen, <=, UINT16_MAX);
	dlu->dlu_reclen = reclen;

	VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
	    &dlu->dlu_dn));
	dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);

	uint64_t nblocks = howmany(nentries,
	    dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
	uint64_t offset = ddt->ddt_log_active->ddl_length;
	uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;

	VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
	    B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
	    DMU_READ_NO_PREFETCH));

	dlu->dlu_tx = tx;
	dlu->dlu_block = dlu->dlu_offset = 0;
}
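
Editor's note: a worked example of the packing arithmetic above, with
illustrative numbers only (record and block sizes vary by build and pool):

    /*
     * Assume dn_datablksz = 128 KiB and a rounded-up reclen of 112 bytes:
     *   records per block        = 131072 / 112         = 1170
     *   blocks for 10000 entries = howmany(10000, 1170) = 9
     * Entries never straddle a block; the unused tail of each block stays
     * zeroed (DLR_INVALID), which is what the load path keys off.
     */
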
static ddt_log_entry_t *
ddt_log_alloc_entry(ddt_t *ddt)
{
	ddt_log_entry_t *ddle;

	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
		ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
		memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
	} else {
		ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
		memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
	}

	return (ddle);
}

static void
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
	/* Create the log tree entry from a live or stored entry */
	avl_index_t where;
	ddt_log_entry_t *ddle =
	    avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
	if (ddle == NULL) {
		ddle = ddt_log_alloc_entry(ddt);
		ddle->ddle_key = ddlwe->ddlwe_key;
		avl_insert(&ddl->ddl_tree, ddle, where);
	}
	ddle->ddle_type = ddlwe->ddlwe_type;
	ddle->ddle_class = ddlwe->ddlwe_class;
	memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
}

void
ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
{
	ASSERT3U(dlu->dlu_dbp, !=, NULL);

	ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
	ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	/* Get our block */
	ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
	dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];

	/*
	 * If this would take us past the end of the block, finish it and
	 * move to the next one.
	 */
	if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
		ASSERT3U(dlu->dlu_offset, >, 0);
		dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
		dlu->dlu_block++;
		dlu->dlu_offset = 0;
		ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
		db = dlu->dlu_dbp[dlu->dlu_block];
	}

	/*
	 * If this is the first time touching the block, inform the DMU that
	 * we will fill it, and zero it out.
	 */
	if (dlu->dlu_offset == 0) {
		dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE);
		memset(db->db_data, 0, db->db_size);
	}

	/* Create the log record directly in the buffer */
	ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
	DLR_SET_TYPE(dlr, DLR_ENTRY);
	DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
	DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
	DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);

	ddt_log_record_entry_t *dlre =
	    (ddt_log_record_entry_t *)&dlr->dlr_payload;
	dlre->dlre_key = ddlwe->ddlwe_key;
	memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));

	/* Advance offset for next record. */
	dlu->dlu_offset += dlu->dlu_reclen;
}

void
ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
{
	ASSERT3U(dlu->dlu_dbp, !=, NULL);
	ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
	ASSERT3U(dlu->dlu_offset, >, 0);

	/*
	 * Close out the last block. Whatever we haven't used will be zeroed,
	 * which matches DLR_INVALID, so we can detect this during load.
	 */
	dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);

	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);

	ddt->ddt_log_active->ddl_length +=
	    dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
	dnode_rele(dlu->dlu_dn, FTAG);

	ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);

	memset(dlu, 0, sizeof (ddt_log_update_t));

	ddt_log_update_stats(ddt);
}

boolean_t
ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle == NULL)
		return (B_FALSE);

	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);

	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);

	return (B_TRUE);
}

boolean_t
ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk,
    ddt_lightweight_entry_t *ddlwe)
{
	ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
	if (ddle == NULL)
		return (B_FALSE);

	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);

	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);

	return (B_TRUE);
}

void
ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	ASSERT3U(ddl->ddl_object, !=, 0);

#ifdef ZFS_DEBUG
	/*
	 * There should not be any entries on the log tree before the given
	 * checkpoint. Assert that this is the case.
	 */
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle != NULL)
		VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
		    >, 0);
#endif

	ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
	ddl->ddl_checkpoint = ddlwe->ddlwe_key;
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}

void
ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	if (ddl->ddl_object == 0)
		return;

	ASSERT(avl_is_empty(&ddl->ddl_tree));

	/* Eject the entire object */
	dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);

	ddl->ddl_length = 0;
	ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
	memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}

boolean_t
ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
{
	/* Swap the logs. The old flushing one must be empty */
	VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));

	/*
	 * If there are still blocks on the flushing log, truncate it first.
	 * This can happen if there were entries on the flushing log that were
	 * removed in memory via ddt_lookup(); their vestigial remains are
	 * on disk.
	 */
	if (ddt->ddt_log_flushing->ddl_length > 0)
		ddt_log_truncate(ddt, tx);

	/*
	 * Swap policy. We swap the logs (and so begin flushing) when the
	 * active tree grows too large, or when we haven't swapped it in
	 * some amount of time, or if something has requested the logs be
	 * flushed ASAP (see ddt_walk_init()).
	 */

	/*
	 * The log tree is too large if the memory usage of its entries is over
	 * half of the memory limit. This effectively gives each log tree half
	 * the available memory.
	 */
	const boolean_t too_large =
	    (avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
	    DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);

	const boolean_t too_old =
	    tx->tx_txg >=
	    (ddt->ddt_log_active->ddl_first_txg +
	    MAX(1, zfs_dedup_log_txg_max));

	const boolean_t force =
	    ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;

	if (!(too_large || too_old || force))
		return (B_FALSE);

	ddt_log_t *swap = ddt->ddt_log_active;
	ddt->ddt_log_active = ddt->ddt_log_flushing;
	ddt->ddt_log_flushing = swap;

	ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
	ddt->ddt_log_active->ddl_flags &=
	    ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);

	ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;

	ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;

	ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
	ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);

	ddt_log_update_stats(ddt);

	return (B_TRUE);
}
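
Editor's note: rough numbers for the too_large trigger above, under assumed
defaults (zfs_dedup_log_mem_max_percent = 1, 64 GiB of RAM, flat entries of
about 270 bytes):

    /*
     * zfs_dedup_log_mem_max ≈ 64 GiB / 100      ≈ 655 MiB
     * per-tree budget       = 655 MiB / 2       ≈ 327 MiB
     * entries before a swap ≈ 327 MiB / ~270 B  ≈ 1.27M entries
     * (plus the age trigger: at most zfs_dedup_log_txg_max = 8 txgs)
     */
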
|
||||
|
||||
static inline void
|
||||
ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
|
||||
const ddt_key_t *checkpoint)
|
||||
{
|
||||
ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);
|
||||
|
||||
ddt_log_record_entry_t *dlre =
|
||||
(ddt_log_record_entry_t *)dlr->dlr_payload;
|
||||
if (checkpoint != NULL &&
|
||||
ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
|
||||
/* Skip pre-checkpoint entries; they're already flushed. */
|
||||
return;
|
||||
}
|
||||
|
||||
ddt_lightweight_entry_t ddlwe;
|
||||
ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
|
||||
ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);
|
||||
|
||||
ddlwe.ddlwe_key = dlre->dlre_key;
|
||||
memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));
|
||||
|
||||
ddt_log_update_entry(ddt, ddl, &ddlwe);
|
||||
}
|
||||
|
||||
static void
|
||||
ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
|
||||
{
|
||||
void *cookie = NULL;
|
||||
ddt_log_entry_t *ddle;
|
||||
IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
|
||||
while ((ddle =
|
||||
avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
|
||||
kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
|
||||
ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
|
||||
}
|
||||
ASSERT(avl_is_empty(&ddl->ddl_tree));
|
||||
}
|
||||
|
||||
static int
|
||||
ddt_log_load_one(ddt_t *ddt, uint_t n)
|
||||
{
|
||||
ASSERT3U(n, <, 2);
|
||||
|
||||
ddt_log_t *ddl = &ddt->ddt_log[n];
|
||||
|
||||
char name[DDT_NAMELEN];
|
||||
ddt_log_name(ddt, name, n);
|
||||
|
||||
uint64_t obj;
|
||||
int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
|
||||
sizeof (uint64_t), 1, &obj);
|
||||
if (err == ENOENT)
|
||||
return (0);
|
||||
if (err != 0)
|
||||
return (err);
|
||||
|
||||
dnode_t *dn;
|
||||
err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
|
||||
if (err != 0)
|
||||
return (err);
|
||||
|
||||
ddt_log_header_t hdr;
|
||||
dmu_buf_t *db;
|
||||
err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
|
||||
if (err != 0) {
|
||||
dnode_rele(dn, FTAG);
|
||||
return (err);
|
||||
}
|
||||
memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
|
||||
dmu_buf_rele(db, FTAG);
|
||||
|
||||
if (DLH_GET_VERSION(&hdr) != 1) {
|
||||
dnode_rele(dn, FTAG);
|
||||
zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
|
||||
"unknown version=%llu", spa_name(ddt->ddt_spa), name,
|
||||
(u_longlong_t)DLH_GET_VERSION(&hdr));
|
||||
return (SET_ERROR(EINVAL));
|
||||
}
|
||||
|
||||
ddt_key_t *checkpoint = NULL;
|
||||
if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
|
||||
/*
|
||||
* If the log has a checkpoint, then we can ignore any entries
|
||||
* that have already been flushed.
|
||||
*/
|
||||
ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
|
||||
checkpoint = &hdr.dlh_checkpoint;
|
||||
}
|
||||
|
||||
if (hdr.dlh_length > 0) {
|
||||
dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
|
||||
ZIO_PRIORITY_SYNC_READ);
|
||||
|
||||
for (uint64_t offset = 0; offset < hdr.dlh_length;
|
||||
offset += dn->dn_datablksz) {
|
||||
err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
|
||||
DMU_READ_PREFETCH);
|
||||
if (err != 0) {
|
||||
dnode_rele(dn, FTAG);
|
||||
ddt_log_empty(ddt, ddl);
|
||||
return (err);
|
||||
}
|
||||
|
||||
			uint64_t boffset = 0;
			while (boffset < db->db_size) {
				ddt_log_record_t *dlr =
				    (ddt_log_record_t *)(db->db_data + boffset);

				/* Partially-filled block, skip the rest */
				if (DLR_GET_TYPE(dlr) == DLR_INVALID)
					break;

				switch (DLR_GET_TYPE(dlr)) {
				case DLR_ENTRY:
					ddt_log_load_entry(ddt, ddl, dlr,
					    checkpoint);
					break;

				default:
					dmu_buf_rele(db, FTAG);
					dnode_rele(dn, FTAG);
					ddt_log_empty(ddt, ddl);
					return (SET_ERROR(EINVAL));
				}

				boffset += DLR_GET_RECLEN(dlr);
			}

			dmu_buf_rele(db, FTAG);
		}
	}

	dnode_rele(dn, FTAG);

	ddl->ddl_object = obj;
	ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
	ddl->ddl_length = hdr.dlh_length;
	ddl->ddl_first_txg = hdr.dlh_first_txg;

	if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
		ddt->ddt_log_flushing = ddl;
	else
		ddt->ddt_log_active = ddl;

	return (0);
}

int
ddt_log_load(ddt_t *ddt)
{
	int err;

	if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
		/*
		 * The DDT is going to be freed again in a moment, so there's
		 * no point loading the log; it'll just slow down import.
		 */
		return (0);
	}

	ASSERT0(ddt->ddt_log[0].ddl_object);
	ASSERT0(ddt->ddt_log[1].ddl_object);
	if (ddt->ddt_dir_object == 0) {
		/*
		 * If we're configured but the containing dir doesn't exist
		 * yet, then the log object can't possibly exist either.
		 */
		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
		return (SET_ERROR(ENOENT));
	}

	if ((err = ddt_log_load_one(ddt, 0)) != 0)
		return (err);
	if ((err = ddt_log_load_one(ddt, 1)) != 0)
		return (err);

	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);

	/*
	 * We have two finalisation tasks:
	 *
	 * - rebuild the histogram. We do this at the end rather than while
	 *   we're loading so we don't need to uncount and recount entries
	 *   that appear multiple times in the log.
	 *
	 * - remove entries from the flushing tree that are on both trees.
	 *   This happens when ddt_lookup() rehydrates an entry from the
	 *   flushing tree, as ddt_log_take_key() removes the entry from the
	 *   in-memory tree but doesn't remove it from disk.
	 */

	/*
	 * We don't technically need a config lock here, since there shouldn't
	 * be pool config changes during DDT load. dva_get_dsize_sync() via
	 * ddt_stat_generate() is expecting it though, and it won't hurt
	 * anything, so we take it.
	 */
	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);

	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
	ddt_log_entry_t *ae = avl_first(al);
	ddt_log_entry_t *fe = avl_first(fl);
	while (ae != NULL || fe != NULL) {
		ddt_log_entry_t *ddle;
		if (ae == NULL) {
			/* active exhausted, take flushing */
			ddle = fe;
			fe = AVL_NEXT(fl, fe);
		} else if (fe == NULL) {
			/* flushing exhausted, take active */
			ddle = ae;
			ae = AVL_NEXT(al, ae);
		} else {
			/* compare active and flushing */
			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
			if (c < 0) {
				/* active behind, take and advance */
				ddle = ae;
				ae = AVL_NEXT(al, ae);
			} else if (c > 0) {
				/* flushing behind, take and advance */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
			} else {
				/* match. remove from flushing, take active */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
				avl_remove(fl, ddle);

				ddle = ae;
				ae = AVL_NEXT(al, ae);
			}
		}

		ddt_lightweight_entry_t ddlwe;
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
	}

	spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);

	ddt_log_update_stats(ddt);

	return (0);
}
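The merge loop above is a standard two-cursor sweep over two sorted containers: advance whichever side holds the smaller key, and on a tie drop the flushing copy so each entry is counted exactly once. A minimal standalone sketch of the same pattern over plain sorted arrays (toy merge_sweep helper, not OpenZFS code):

#include <stdio.h>

/* Toy model: a and f are sorted; keys present in both are taken once. */
static void
merge_sweep(const int *a, int an, const int *f, int fn)
{
	int ai = 0, fi = 0;
	while (ai < an || fi < fn) {
		int key;
		if (ai == an)			/* active exhausted */
			key = f[fi++];
		else if (fi == fn)		/* flushing exhausted */
			key = a[ai++];
		else if (a[ai] < f[fi])		/* active behind */
			key = a[ai++];
		else if (a[ai] > f[fi])		/* flushing behind */
			key = f[fi++];
		else {				/* match: drop flushing copy */
			fi++;
			key = a[ai++];
		}
		printf("%d\n", key);		/* histogram update goes here */
	}
}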

void
ddt_log_alloc(ddt_t *ddt)
{
	ASSERT3P(ddt->ddt_log_active, ==, NULL);
	ASSERT3P(ddt->ddt_log_flushing, ==, NULL);

	avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
	avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
	ddt->ddt_log_active = &ddt->ddt_log[0];
	ddt->ddt_log_flushing = &ddt->ddt_log[1];
	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
}

void
ddt_log_free(ddt_t *ddt)
{
	ddt_log_empty(ddt, &ddt->ddt_log[0]);
	ddt_log_empty(ddt, &ddt->ddt_log[1]);
	avl_destroy(&ddt->ddt_log[0].ddl_tree);
	avl_destroy(&ddt->ddt_log[1].ddl_tree);
}

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
	"Max transactions before starting to flush dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
	"Max memory for dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
	"Max memory for dedup logs, as % of total memory");

@ -33,27 +33,32 @@
#include <sys/ddt_impl.h>

static void
ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
    ddt_stat_t *dds)
{
	spa_t *spa = ddt->ddt_spa;
	ddt_phys_t *ddp = dde->dde_phys;
	ddt_key_t *ddk = &dde->dde_key;
	uint64_t lsize = DDK_GET_LSIZE(ddk);
	uint64_t psize = DDK_GET_PSIZE(ddk);
	uint64_t lsize = DDK_GET_LSIZE(&ddlwe->ddlwe_key);
	uint64_t psize = DDK_GET_PSIZE(&ddlwe->ddlwe_key);

	memset(dds, 0, sizeof (*dds));

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		uint64_t dsize = 0;
		uint64_t refcnt = ddp->ddp_refcnt;
	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
		const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);

		if (ddp->ddp_phys_birth == 0)
		if (ddt_phys_birth(ddp, v) == 0)
			continue;

		int ndvas = DDK_GET_CRYPT(&dde->dde_key) ?
		    SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;
		int ndvas = ddt_phys_dva_count(ddp, v,
		    DDK_GET_CRYPT(&ddlwe->ddlwe_key));
		const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
		    ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva;

		uint64_t dsize = 0;
		for (int d = 0; d < ndvas; d++)
			dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
			dsize += dva_get_dsize_sync(spa, &dvas[d]);

		uint64_t refcnt = ddt_phys_refcnt(ddp, v);

		dds->dds_blocks += 1;
		dds->dds_lsize += lsize;

@ -67,61 +72,108 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
	}
}

void
ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
static void
ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src)
{
	const uint64_t *s = (const uint64_t *)src;
	uint64_t *d = (uint64_t *)dst;
	uint64_t *d_end = (uint64_t *)(dst + 1);
	dst->dds_blocks += src->dds_blocks;
	dst->dds_lsize += src->dds_lsize;
	dst->dds_psize += src->dds_psize;
	dst->dds_dsize += src->dds_dsize;
	dst->dds_ref_blocks += src->dds_ref_blocks;
	dst->dds_ref_lsize += src->dds_ref_lsize;
	dst->dds_ref_psize += src->dds_ref_psize;
	dst->dds_ref_dsize += src->dds_ref_dsize;
}

	ASSERT(neg == 0 || neg == -1ULL);	/* add or subtract */
static void
ddt_stat_sub(ddt_stat_t *dst, const ddt_stat_t *src)
{
	/* This caught more during development than you might expect... */
	ASSERT3U(dst->dds_blocks, >=, src->dds_blocks);
	ASSERT3U(dst->dds_lsize, >=, src->dds_lsize);
	ASSERT3U(dst->dds_psize, >=, src->dds_psize);
	ASSERT3U(dst->dds_dsize, >=, src->dds_dsize);
	ASSERT3U(dst->dds_ref_blocks, >=, src->dds_ref_blocks);
	ASSERT3U(dst->dds_ref_lsize, >=, src->dds_ref_lsize);
	ASSERT3U(dst->dds_ref_psize, >=, src->dds_ref_psize);
	ASSERT3U(dst->dds_ref_dsize, >=, src->dds_ref_dsize);

	for (int i = 0; i < d_end - d; i++)
		d[i] += (s[i] ^ neg) - neg;
	dst->dds_blocks -= src->dds_blocks;
	dst->dds_lsize -= src->dds_lsize;
	dst->dds_psize -= src->dds_psize;
	dst->dds_dsize -= src->dds_dsize;
	dst->dds_ref_blocks -= src->dds_ref_blocks;
	dst->dds_ref_lsize -= src->dds_ref_lsize;
	dst->dds_ref_psize -= src->dds_ref_psize;
	dst->dds_ref_dsize -= src->dds_ref_dsize;
}
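The removed loop relied on a compact two's-complement identity: with neg == 0, (x ^ 0) - 0 is x (add); with neg == -1ULL, (x ^ ~0) - ~0 is ~x + 1, i.e. -x (subtract). The explicit per-field add/sub replacing it trades that compactness for underflow assertions on every field. A standalone sketch of the old trick, for reference:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Apply s to d field-by-field: add when neg == 0, subtract when neg == ~0. */
static void
stat_apply(uint64_t *d, const uint64_t *s, int n, uint64_t neg)
{
	assert(neg == 0 || neg == -1ULL);
	for (int i = 0; i < n; i++)
		d[i] += (s[i] ^ neg) - neg;
}

int
main(void)
{
	uint64_t d[2] = { 10, 20 }, s[2] = { 3, 4 };
	stat_apply(d, s, 2, -1ULL);		/* subtract */
	printf("%llu %llu\n", (unsigned long long)d[0],
	    (unsigned long long)d[1]);		/* prints: 7 16 */
	return (0);
}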

void
ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh,
    const ddt_lightweight_entry_t *ddlwe)
{
	ddt_stat_t dds;
	ddt_histogram_t *ddh;
	int bucket;

	ddt_stat_generate(ddt, dde, &dds);
	ddt_stat_generate(ddt, ddlwe, &dds);

	bucket = highbit64(dds.dds_ref_blocks) - 1;
	ASSERT3U(bucket, >=, 0);
	if (bucket < 0)
		return;

	ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
	ddt_stat_add(&ddh->ddh_stat[bucket], &dds);
}

	ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
void
ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh,
    const ddt_lightweight_entry_t *ddlwe)
{
	ddt_stat_t dds;
	int bucket;

	ddt_stat_generate(ddt, ddlwe, &dds);

	bucket = highbit64(dds.dds_ref_blocks) - 1;
	if (bucket < 0)
		return;

	ddt_stat_sub(&ddh->ddh_stat[bucket], &dds);
}
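Both functions bucket an entry by highbit64(refcount) - 1, i.e. floor(log2) of its reference count: bucket 0 holds refcount 1, bucket 1 holds 2-3, bucket 2 holds 4-7, and so on. A refcount of 0 yields -1, which is why both bail out early. A toy demonstration of the mapping (refcnt_bucket is a hypothetical name; the loop is a portable stand-in for highbit64()):

#include <stdio.h>

/* Stand-in for highbit64(): 1-based index of the highest set bit. */
static int
refcnt_bucket(unsigned long long refcnt)
{
	int hb = 0;
	while (refcnt != 0) {
		hb++;
		refcnt >>= 1;
	}
	return (hb - 1);	/* -1 when refcnt == 0 */
}

int
main(void)
{
	for (unsigned long long r = 0; r <= 8; r++)
		printf("refcnt %llu -> bucket %d\n", r, refcnt_bucket(r));
	return (0);
}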

void
ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
{
	for (int h = 0; h < 64; h++)
		ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
		ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h]);
}

void
ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh)
{
	memset(dds, 0, sizeof (*dds));

	for (int h = 0; h < 64; h++)
		ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
		ddt_stat_add(dds, &ddh->ddh_stat[h]);
}

boolean_t
ddt_histogram_empty(const ddt_histogram_t *ddh)
{
	const uint64_t *s = (const uint64_t *)ddh;
	const uint64_t *s_end = (const uint64_t *)(ddh + 1);
	for (int h = 0; h < 64; h++) {
		const ddt_stat_t *dds = &ddh->ddh_stat[h];

	while (s < s_end)
		if (*s++ != 0)
			return (B_FALSE);
		if (dds->dds_blocks == 0 &&
		    dds->dds_lsize == 0 &&
		    dds->dds_psize == 0 &&
		    dds->dds_dsize == 0 &&
		    dds->dds_ref_blocks == 0 &&
		    dds->dds_ref_lsize == 0 &&
		    dds->dds_ref_psize == 0 &&
		    dds->dds_ref_dsize == 0)
			continue;

		return (B_FALSE);
	}

	return (B_TRUE);
}

@ -170,6 +222,11 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
				ddo_total->ddo_mspace += ddo->ddo_mspace;
			}
		}

		ddt_object_t *ddo = &ddt->ddt_log_stats;
		ddo_total->ddo_count += ddo->ddo_count;
		ddo_total->ddo_dspace += ddo->ddo_dspace;
		ddo_total->ddo_mspace += ddo->ddo_mspace;
	}

	/*

@ -207,6 +264,8 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
				    &ddt->ddt_histogram_cache[type][class]);
			}
		}

		ddt_histogram_add(ddh, &ddt->ddt_log_histogram);
	}
}

@ -217,7 +276,7 @@ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)

	ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
	ddt_get_dedup_histogram(spa, ddh_total);
	ddt_histogram_stat(dds_total, ddh_total);
	ddt_histogram_total(dds_total, ddh_total);
	kmem_free(ddh_total, sizeof (ddt_histogram_t));
}

@ -22,6 +22,7 @@
/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2018 by Delphix. All rights reserved.
 * Copyright (c) 2023, Klara Inc.
 */

#include <sys/zfs_context.h>

@ -51,8 +52,13 @@ ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len)

	ASSERT3U(d_len, >=, s_len + 1);	/* no compression plus version byte */

	c_len = ci->ci_compress((void *)src, dst, s_len, d_len - 1,
	    ci->ci_level);
	/* Call compress function directly to avoid hole detection. */
	abd_t sabd, dabd;
	abd_get_from_buf_struct(&sabd, (void *)src, s_len);
	abd_get_from_buf_struct(&dabd, dst, d_len);
	c_len = ci->ci_compress(&sabd, &dabd, s_len, d_len - 1, ci->ci_level);
	abd_free(&dabd);
	abd_free(&sabd);

	if (c_len == s_len) {
		cpfunc = ZIO_COMPRESS_OFF;

@ -71,12 +77,18 @@ ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
{
	uchar_t version = *src++;
	int cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK;
	zio_compress_info_t *ci = &zio_compress_table[cpfunc];

	if (ci->ci_decompress != NULL)
		(void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
	else
	if (zio_compress_table[cpfunc].ci_decompress == NULL) {
		memcpy(dst, src, d_len);
		return;
	}

	abd_t sabd, dabd;
	abd_get_from_buf_struct(&sabd, src, s_len);
	abd_get_from_buf_struct(&dabd, dst, d_len);
	VERIFY0(zio_decompress_data(cpfunc, &sabd, &dabd, s_len, d_len, NULL));
	abd_free(&dabd);
	abd_free(&sabd);

	if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) !=
(ZFS_HOST_BYTEORDER != 0))
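A recurring pattern in this change: wrap existing plain buffers in caller-owned abd_t structs via abd_get_from_buf_struct(), hand them to the abd-based (de)compress entry points, then free only the wrappers. A minimal sketch of the same shape, with a hypothetical wrap_and_decompress helper whose calls mirror the ones in this diff (kernel context assumed, not independently compilable):

/* Hypothetical helper; signatures follow the calls shown above. */
static int
wrap_and_decompress(enum zio_compress c, void *src, void *dst,
    size_t s_len, size_t d_len)
{
	abd_t sabd, dabd;
	int err;

	abd_get_from_buf_struct(&sabd, src, s_len);	/* no allocation */
	abd_get_from_buf_struct(&dabd, dst, d_len);
	err = zio_decompress_data(c, &sabd, &dabd, s_len, d_len, NULL);
	abd_free(&dabd);	/* releases the wrapper, not the buffer */
	abd_free(&sabd);
	return (err);
}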

@ -108,7 +120,7 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)

static int
ddt_zap_lookup(objset_t *os, uint64_t object,
    const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize)
    const ddt_key_t *ddk, void *phys, size_t psize)
{
	uchar_t *cbuf;
	uint64_t one, csize;

@ -155,7 +167,7 @@ ddt_zap_prefetch_all(objset_t *os, uint64_t object)

static int
ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk,
    const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx)
    const void *phys, size_t psize, dmu_tx_t *tx)
{
	const size_t cbuf_size = psize + 1;

@ -181,7 +193,7 @@ ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk,

static int
ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk,
    ddt_phys_t *phys, size_t psize)
    void *phys, size_t psize)
{
	zap_cursor_t zc;
	zap_attribute_t za;

@ -95,6 +95,12 @@ uint_t dmu_prefetch_max = 8 * 1024 * 1024;
uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
#endif

/*
 * Override copies= for dedup state objects. 0 means the traditional
 * behaviour (i.e. the default for the containing objset, i.e. 3 for the MOS).
 */
uint_t dmu_ddt_copies = 0;

const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "unallocated"		},
	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "object directory"	},

@ -2272,6 +2278,28 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
		case ZFS_REDUNDANT_METADATA_NONE:
			break;
		}

		if (dmu_ddt_copies > 0) {
			/*
			 * If this tuneable is set, and this is a write for a
			 * dedup entry store (zap or log), then we treat it
			 * much like ZFS_REDUNDANT_METADATA_MOST on a
			 * regular dataset: this many copies, and one more for
			 * "higher" indirect blocks. This specific exception is
			 * necessary because dedup objects are stored in the
			 * MOS, which always has the highest possible copies.
			 */
			dmu_object_type_t stype =
			    dn ? dn->dn_storage_type : DMU_OT_NONE;
			if (stype == DMU_OT_NONE)
				stype = type;
			if (stype == DMU_OT_DDT_ZAP) {
				copies = dmu_ddt_copies;
				if (level >=
				    zfs_redundant_metadata_most_ditto_level)
					copies++;
			}
		}
	} else if (wp & WP_NOFILL) {
ASSERT(level == 0);
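A worked sketch of the copies computation in that branch, under assumed example values dmu_ddt_copies = 2 and zfs_redundant_metadata_most_ditto_level = 2 (both hypothetical settings): DDT ZAP levels 0-1 would get 2 copies, level 2 and above would get 3.

/* Illustrative only; mirrors the DMU_OT_DDT_ZAP branch above. */
static int
ddt_zap_copies(int level, int ddt_copies, int most_ditto_level)
{
	int copies = ddt_copies;	/* the tuneable's base value */
	if (level >= most_ditto_level)
		copies++;		/* one extra for high indirects */
	return (copies);
}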

@ -2824,3 +2852,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW,
	"Limit one prefetch call to this size");

/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_ddt_copies, UINT, ZMOD_RW,
	"Override copies= for dedup objects");

@ -1391,7 +1391,7 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
		abd_t *dabd = abd_alloc_linear(
		    drrw->drr_logical_size, B_FALSE);
		err = zio_decompress_data(drrw->drr_compressiontype,
		    abd, abd_to_buf(dabd), abd_get_size(abd),
		    abd, dabd, abd_get_size(abd),
		    abd_get_size(dabd), NULL);

		if (err != 0) {

@ -1407,9 +1407,8 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
		/* Recompress the data */
		abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp),
		    B_FALSE);
		void *buf = abd_to_buf(cabd);
		uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp),
		    abd, &buf, abd_get_size(abd),
		    abd, &cabd, abd_get_size(abd),
		    rwa->os->os_complevel);
		abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize);
		/* Swap in newly compressed data into the abd */

@ -2221,7 +2220,7 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)

			err = zio_decompress_data(
			    drrw->drr_compressiontype,
			    abd, abd_to_buf(decomp_abd),
			    abd, decomp_abd,
			    abd_get_size(abd),
			    abd_get_size(decomp_abd), NULL);

@ -2425,8 +2425,14 @@ get_receive_resume_token_impl(dsl_dataset_t *ds)
	fnvlist_free(token_nv);
	compressed = kmem_alloc(packed_size, KM_SLEEP);

	compressed_size = gzip_compress(packed, compressed,
	/* Call compress function directly to avoid hole detection. */
	abd_t pabd, cabd;
	abd_get_from_buf_struct(&pabd, packed, packed_size);
	abd_get_from_buf_struct(&cabd, compressed, packed_size);
	compressed_size = zfs_gzip_compress(&pabd, &cabd,
	    packed_size, packed_size, 6);
	abd_free(&cabd);
	abd_free(&pabd);

	zio_cksum_t cksum;
	fletcher_4_native_varsize(compressed, compressed_size, &cksum);

@ -630,6 +630,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
		zap_cursor_fini(&zc);
	}

	ddt_walk_init(spa, scn->scn_phys.scn_max_txg);

	spa_scan_stat_init(spa);
	vdev_scan_stat_init(spa->spa_root_vdev);

@ -951,6 +953,8 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)

	memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));

	ddt_walk_init(spa, scn->scn_phys.scn_max_txg);

	dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);

	spa_history_log_internal(spa, "scan setup", tx,

@ -1636,7 +1640,8 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
	    txg_sync_waiting(scn->scn_dp) ||
	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
	    spa_shutting_down(scn->scn_dp->dp_spa) ||
	    (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
	    (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) ||
	    !ddt_walk_ready(scn->scn_dp->dp_spa)) {
		if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
			dprintf("suspending at first available bookmark "
			    "%llx/%llx/%llx/%llx\n",

@ -2929,11 +2934,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)

void
dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
    ddt_entry_t *dde, dmu_tx_t *tx)
    ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
	(void) tx;
	const ddt_key_t *ddk = &dde->dde_key;
	ddt_phys_t *ddp = dde->dde_phys;
	const ddt_key_t *ddk = &ddlwe->ddlwe_key;
	blkptr_t bp;
	zbookmark_phys_t zb = { 0 };

@ -2954,11 +2958,13 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
	if (scn->scn_done_txg != 0)
		return;

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		if (ddp->ddp_phys_birth == 0 ||
		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
		uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v);

		if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg)
			continue;
		ddt_bp_create(checksum, ddk, ddp, &bp);
		ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp);

		scn->scn_visited_this_txg++;
		scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);

@ -3002,11 +3008,11 @@ static void
dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
{
	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
	ddt_entry_t dde = {{{{0}}}};
	ddt_lightweight_entry_t ddlwe = {0};
	int error;
	uint64_t n = 0;

	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &ddlwe)) == 0) {
		ddt_t *ddt;

		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)

@ -3021,16 +3027,28 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
		ASSERT(avl_first(&ddt->ddt_tree) == NULL);

		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx);
		n++;

		if (dsl_scan_check_suspend(scn, NULL))
			break;
	}

	zfs_dbgmsg("scanned %llu ddt entries on %s with class_max = %u; "
	    "suspending=%u", (longlong_t)n, scn->scn_dp->dp_spa->spa_name,
	    (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
	if (error == EAGAIN) {
		dsl_scan_check_suspend(scn, NULL);
		error = 0;

		zfs_dbgmsg("waiting for ddt to become ready for scan "
		    "on %s with class_max = %u; suspending=%u",
		    scn->scn_dp->dp_spa->spa_name,
		    (int)scn->scn_phys.scn_ddt_class_max,
		    (int)scn->scn_suspending);
	} else
		zfs_dbgmsg("scanned %llu ddt entries on %s with "
		    "class_max = %u; suspending=%u", (longlong_t)n,
		    scn->scn_dp->dp_spa->spa_name,
		    (int)scn->scn_phys.scn_ddt_class_max,
		    (int)scn->scn_suspending);

	ASSERT(error == 0 || error == ENOENT);
	ASSERT(error != ENOENT ||

@ -47,8 +47,9 @@ typedef uLongf zlen_t;

#endif

size_t
gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static size_t
zfs_gzip_compress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n)
{
	int ret;
	zlen_t dstlen = d_len;

@ -82,8 +83,9 @@ gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
	return ((size_t)dstlen);
}

int
gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static int
zfs_gzip_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n)
{
	(void) n;
	zlen_t dstlen = d_len;

@ -103,3 +105,6 @@ gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)

	return (0);
}

ZFS_COMPRESS_WRAP_DECL(zfs_gzip_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_gzip_decompress)
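These *_WRAP_DECL macros (defined in the headers, not shown in this diff) turn each buffer-based *_buf routine into the abd-based entry point the new compression table expects. A plausible sketch of what such a wrapper could expand to, assuming it borrows linear views of the abds; this is an illustration of the shape, not the real macro:

/* Hypothetical expansion sketch for a compress wrapper. */
#define	COMPRESS_WRAP_SKETCH(name)					\
size_t									\
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int level)	\
{									\
	void *s = abd_borrow_buf_copy(src, s_len);	/* read view */	\
	void *d = abd_borrow_buf(dst, d_len);		/* write view */\
	size_t c_len = name##_buf(s, d, s_len, d_len, level);		\
	abd_return_buf_copy(dst, d, d_len);	/* copy result back */	\
	abd_return_buf(src, s, s_len);					\
	return (c_len);							\
}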

@ -52,8 +52,8 @@ int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,

static kmem_cache_t *lz4_cache;

size_t
lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
static size_t
zfs_lz4_compress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n)
{
	(void) n;

@ -80,8 +80,8 @@ lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
	return (bufsiz + sizeof (bufsiz));
}

int
lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
static int
zfs_lz4_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n)
{
	(void) n;

@ -100,6 +100,9 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
	    d_start, bufsiz, d_len) < 0);
}

ZFS_COMPRESS_WRAP_DECL(zfs_lz4_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_lz4_decompress)

/*
 * LZ4 API Description:
 *

@ -45,8 +45,9 @@
#define	OFFSET_MASK	((1 << (16 - MATCH_BITS)) - 1)
#define	LEMPEL_SIZE	1024

size_t
lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static size_t
zfs_lzjb_compress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n)
{
	(void) n;
	uchar_t *src = s_start;

@ -100,8 +101,9 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
	return (dst - (uchar_t *)d_start);
}

int
lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static int
zfs_lzjb_decompress_buf(void *s_start, void *d_start,
    size_t s_len, size_t d_len, int n)
{
	(void) s_len, (void) n;
	uchar_t *src = s_start;

@ -130,3 +132,6 @@ lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
	}
	return (0);
}

ZFS_COMPRESS_WRAP_DECL(zfs_lzjb_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_lzjb_decompress)

@ -1040,16 +1040,34 @@ spa_change_guid_sync(void *arg, dmu_tx_t *tx)
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool. We are also going to issue a
 * sysevent to update any watchers.
 *
 * The GUID of the pool will be changed to the value pointed to by guidp.
 * The GUID may not be set to the reserved value of 0.
 * The new GUID will be generated if guidp is NULL.
 */
int
spa_change_guid(spa_t *spa)
spa_change_guid(spa_t *spa, const uint64_t *guidp)
{
	int error;
	uint64_t guid;
	int error;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	if (guidp != NULL) {
		guid = *guidp;
		if (guid == 0) {
			error = SET_ERROR(EINVAL);
			goto out;
		}

		if (spa_guid_exists(guid, 0)) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
	} else {
		guid = spa_generate_guid(NULL);
	}

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

@ -1068,6 +1086,7 @@ spa_change_guid(spa_t *spa)
		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
	}

out:
	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

@ -7602,8 +7621,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) {
		return (spa_vdev_exit(spa, newrootvd, txg,
		    ZFS_ERR_ASHIFT_MISMATCH));
	}

	/*
	 * RAIDZ-expansion-specific checks.

@ -645,7 +645,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
		    DATA_TYPE_INT32, zio->io_error, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
		    DATA_TYPE_INT32, zio->io_flags, NULL);
		    DATA_TYPE_UINT64, zio->io_flags, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
		    DATA_TYPE_UINT32, zio->io_stage, NULL);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,

@ -1794,17 +1794,45 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc)
	return (error);
}

/*
 * inputs:
 * zc_nvlist_src	nvlist optionally containing ZPOOL_REGUID_GUID
 * zc_nvlist_src_size	size of the nvlist
 */
static int
zfs_ioc_pool_reguid(zfs_cmd_t *zc)
{
	uint64_t *guidp = NULL;
	nvlist_t *props = NULL;
	spa_t *spa;
	uint64_t guid;
	int error;

	if (zc->zc_nvlist_src_size != 0) {
		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
		    zc->zc_iflags, &props);
		if (error != 0)
			return (error);

		error = nvlist_lookup_uint64(props, ZPOOL_REGUID_GUID, &guid);
		if (error == 0)
			guidp = &guid;
		else if (error == ENOENT)
			guidp = NULL;
		else
			goto out;
	}

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error == 0) {
		error = spa_change_guid(spa);
		error = spa_change_guid(spa, guidp);
		spa_close(spa, FTAG);
	}

out:
	if (props != NULL)
		nvlist_free(props);

	return (error);
}
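A sketch of how a userspace caller might feed the extended ioctl, assuming the usual libzfs plumbing: only the ZPOOL_REGUID_GUID key and ZFS_IOC_POOL_REGUID ioctl are taken from this diff; the helper name and the exact libzfs calls (zcmd_write_src_nvlist, zfs_ioctl) are assumptions based on libzfs internals and may differ:

/* Hypothetical caller; illustrative only. */
static int
reguid_with_guid(libzfs_handle_t *hdl, const char *pool, uint64_t guid)
{
	zfs_cmd_t zc = {"\0"};
	nvlist_t *nvl = fnvlist_alloc();
	int err;

	fnvlist_add_uint64(nvl, ZPOOL_REGUID_GUID, guid);
	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
	err = zcmd_write_src_nvlist(hdl, &zc, nvl);	/* packs zc_nvlist_src */
	if (err == 0)
		err = zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc);
	fnvlist_free(nvl);
	return (err);
}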

465	module/zfs/zio.c

@ -299,10 +299,13 @@ zio_fini(void)
 * ==========================================================================
 */

#ifdef ZFS_DEBUG
static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
#if defined(ZFS_DEBUG) && defined(_KERNEL)
#define	ZFS_ZIO_BUF_CANARY 1
#endif

#ifdef ZFS_ZIO_BUF_CANARY
static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;

/*
 * Use empty space after the buffer to detect overflows.
 *

@ -314,7 +317,6 @@ static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
static void
zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
{
#ifdef ZFS_DEBUG
	size_t off = P2ROUNDUP(size, sizeof (ulong_t));
	ulong_t *canary = p + off / sizeof (ulong_t);
	size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;

@ -323,13 +325,11 @@ zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
		asize = (c + 2) << SPA_MINBLOCKSHIFT;
	for (; off < asize; canary++, off += sizeof (ulong_t))
		*canary = zio_buf_canary;
#endif
}

static void
zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
{
#ifdef ZFS_DEBUG
	size_t off = P2ROUNDUP(size, sizeof (ulong_t));
	ulong_t *canary = p + off / sizeof (ulong_t);
	size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;

@ -343,8 +343,8 @@ zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
			    *canary, zio_buf_canary);
		}
	}
#endif
}
#endif
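The canary machinery above guards the slack between the size a caller asked for and the rounded-up size the kmem cache actually hands back: fill it with a known pattern at allocation, verify it at free. A freestanding toy version of the idea (plain malloc, hypothetical canary_alloc/canary_check names):

#include <assert.h>
#include <stdlib.h>

#define	CANARY	0xdeadc0dedead210bULL

/* Fill the slack between the requested and allocated size on alloc... */
static void *
canary_alloc(size_t want, size_t asize)
{
	unsigned long long *p = malloc(asize);
	for (size_t off = (want + 7) & ~(size_t)7; off + 8 <= asize; off += 8)
		p[off / 8] = CANARY;
	return (p);
}

/* ...and verify it on free: a mismatch means something wrote past the end. */
static void
canary_check(void *buf, size_t want, size_t asize)
{
	unsigned long long *p = buf;
	for (size_t off = (want + 7) & ~(size_t)7; off + 8 <= asize; off += 8)
		assert(p[off / 8] == CANARY);
}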
|
||||
|
||||
/*
|
||||
* Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
|
||||
|
@ -363,7 +363,9 @@ zio_buf_alloc(size_t size)
|
|||
#endif
|
||||
|
||||
void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE);
|
||||
#ifdef ZFS_ZIO_BUF_CANARY
|
||||
zio_buf_put_canary(p, size, zio_buf_cache, c);
|
||||
#endif
|
||||
return (p);
|
||||
}
|
||||
|
||||
|
@ -381,7 +383,9 @@ zio_data_buf_alloc(size_t size)
|
|||
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
||||
|
||||
void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE);
|
||||
#ifdef ZFS_ZIO_BUF_CANARY
|
||||
zio_buf_put_canary(p, size, zio_data_buf_cache, c);
|
||||
#endif
|
||||
return (p);
|
||||
}
|
||||
|
||||
|
@ -395,7 +399,9 @@ zio_buf_free(void *buf, size_t size)
|
|||
atomic_add_64(&zio_buf_cache_frees[c], 1);
|
||||
#endif
|
||||
|
||||
#ifdef ZFS_ZIO_BUF_CANARY
|
||||
zio_buf_check_canary(buf, size, zio_buf_cache, c);
|
||||
#endif
|
||||
kmem_cache_free(zio_buf_cache[c], buf);
|
||||
}
|
||||
|
||||
|
@ -406,7 +412,9 @@ zio_data_buf_free(void *buf, size_t size)
|
|||
|
||||
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
||||
|
||||
#ifdef ZFS_ZIO_BUF_CANARY
|
||||
zio_buf_check_canary(buf, size, zio_data_buf_cache, c);
|
||||
#endif
|
||||
kmem_cache_free(zio_data_buf_cache[c], buf);
|
||||
}
|
||||
|
||||
|
@ -479,11 +487,9 @@ static void
|
|||
zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
|
||||
{
|
||||
if (zio->io_error == 0) {
|
||||
void *tmp = abd_borrow_buf(data, size);
|
||||
int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
|
||||
zio->io_abd, tmp, zio->io_size, size,
|
||||
zio->io_abd, data, zio->io_size, size,
|
||||
&zio->io_prop.zp_complevel);
|
||||
abd_return_buf_copy(data, tmp, size);
|
||||
|
||||
if (zio_injection_enabled && ret == 0)
|
||||
ret = zio_handle_fault_injection(zio, EINVAL);
|
||||
|
@ -530,17 +536,18 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
|
|||
* from the indirect block. We decompress it now and
|
||||
* throw away the result after we are finished.
|
||||
*/
|
||||
tmp = zio_buf_alloc(lsize);
|
||||
abd_t *abd = abd_alloc_linear(lsize, B_TRUE);
|
||||
ret = zio_decompress_data(BP_GET_COMPRESS(bp),
|
||||
zio->io_abd, tmp, zio->io_size, lsize,
|
||||
zio->io_abd, abd, zio->io_size, lsize,
|
||||
&zio->io_prop.zp_complevel);
|
||||
if (ret != 0) {
|
||||
abd_free(abd);
|
||||
ret = SET_ERROR(EIO);
|
||||
goto error;
|
||||
}
|
||||
ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
|
||||
tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
|
||||
zio_buf_free(tmp, lsize);
|
||||
ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
|
||||
abd, lsize, BP_SHOULD_BYTESWAP(bp), mac);
|
||||
abd_free(abd);
|
||||
} else {
|
||||
ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
|
||||
zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
|
||||
|
@ -1858,30 +1865,32 @@ zio_write_compress(zio_t *zio)
|
|||
/* If it's a compressed write that is not raw, compress the buffer. */
|
||||
if (compress != ZIO_COMPRESS_OFF &&
|
||||
!(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
|
||||
void *cbuf = NULL;
|
||||
abd_t *cabd = NULL;
|
||||
if (abd_cmp_zero(zio->io_abd, lsize) == 0)
|
||||
psize = 0;
|
||||
else if (compress == ZIO_COMPRESS_EMPTY)
|
||||
psize = lsize;
|
||||
else
|
||||
psize = zio_compress_data(compress, zio->io_abd, &cbuf,
|
||||
psize = zio_compress_data(compress, zio->io_abd, &cabd,
|
||||
lsize, zp->zp_complevel);
|
||||
if (psize == 0) {
|
||||
compress = ZIO_COMPRESS_OFF;
|
||||
} else if (psize >= lsize) {
|
||||
compress = ZIO_COMPRESS_OFF;
|
||||
if (cbuf != NULL)
|
||||
zio_buf_free(cbuf, lsize);
|
||||
if (cabd != NULL)
|
||||
abd_free(cabd);
|
||||
} else if (!zp->zp_dedup && !zp->zp_encrypt &&
|
||||
psize <= BPE_PAYLOAD_SIZE &&
|
||||
zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
|
||||
spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
|
||||
void *cbuf = abd_borrow_buf_copy(cabd, lsize);
|
||||
encode_embedded_bp_compressed(bp,
|
||||
cbuf, compress, lsize, psize);
|
||||
BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
|
||||
BP_SET_TYPE(bp, zio->io_prop.zp_type);
|
||||
BP_SET_LEVEL(bp, zio->io_prop.zp_level);
|
||||
zio_buf_free(cbuf, lsize);
|
||||
abd_return_buf(cabd, cbuf, lsize);
|
||||
abd_free(cabd);
|
||||
BP_SET_LOGICAL_BIRTH(bp, zio->io_txg);
|
||||
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
||||
ASSERT(spa_feature_is_active(spa,
|
||||
|
@ -1900,14 +1909,12 @@ zio_write_compress(zio_t *zio)
|
|||
psize);
|
||||
if (rounded >= lsize) {
|
||||
compress = ZIO_COMPRESS_OFF;
|
||||
zio_buf_free(cbuf, lsize);
|
||||
abd_free(cabd);
|
||||
psize = lsize;
|
||||
} else {
|
||||
abd_t *cdata = abd_get_from_buf(cbuf, lsize);
|
||||
abd_take_ownership_of_buf(cdata, B_TRUE);
|
||||
abd_zero_off(cdata, psize, rounded - psize);
|
||||
abd_zero_off(cabd, psize, rounded - psize);
|
||||
psize = rounded;
|
||||
zio_push_transform(zio, cdata,
|
||||
zio_push_transform(zio, cabd,
|
||||
psize, lsize, NULL);
|
||||
}
|
||||
}
|
||||
|
@ -3254,17 +3261,21 @@ static void
|
|||
zio_ddt_child_read_done(zio_t *zio)
|
||||
{
|
||||
blkptr_t *bp = zio->io_bp;
|
||||
ddt_t *ddt;
|
||||
ddt_entry_t *dde = zio->io_private;
|
||||
ddt_phys_t *ddp;
|
||||
zio_t *pio = zio_unique_parent(zio);
|
||||
|
||||
mutex_enter(&pio->io_lock);
|
||||
ddp = ddt_phys_select(dde, bp);
|
||||
if (zio->io_error == 0)
|
||||
ddt_phys_clear(ddp); /* this ddp doesn't need repair */
|
||||
ddt = ddt_select(zio->io_spa, bp);
|
||||
|
||||
if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
|
||||
dde->dde_repair_abd = zio->io_abd;
|
||||
if (zio->io_error == 0) {
|
||||
ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
|
||||
/* this phys variant doesn't need repair */
|
||||
ddt_phys_clear(dde->dde_phys, v);
|
||||
}
|
||||
|
||||
if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL)
|
||||
dde->dde_io->dde_repair_abd = zio->io_abd;
|
||||
else
|
||||
abd_free(zio->io_abd);
|
||||
mutex_exit(&pio->io_lock);
|
||||
|
@ -3282,21 +3293,25 @@ zio_ddt_read_start(zio_t *zio)
|
|||
if (zio->io_child_error[ZIO_CHILD_DDT]) {
|
||||
ddt_t *ddt = ddt_select(zio->io_spa, bp);
|
||||
ddt_entry_t *dde = ddt_repair_start(ddt, bp);
|
||||
ddt_phys_t *ddp = dde->dde_phys;
|
||||
ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
|
||||
ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp);
|
||||
ddt_univ_phys_t *ddp = dde->dde_phys;
|
||||
blkptr_t blk;
|
||||
|
||||
ASSERT(zio->io_vsd == NULL);
|
||||
zio->io_vsd = dde;
|
||||
|
||||
if (ddp_self == NULL)
|
||||
if (v_self == DDT_PHYS_NONE)
|
||||
return (zio);
|
||||
|
||||
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
|
||||
if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
|
||||
/* issue I/O for the other copies */
|
||||
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
|
||||
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
|
||||
|
||||
if (ddt_phys_birth(ddp, v) == 0 || v == v_self)
|
||||
continue;
|
||||
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
|
||||
&blk);
|
||||
|
||||
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key,
|
||||
ddp, v, &blk);
|
||||
zio_nowait(zio_read(zio, zio->io_spa, &blk,
|
||||
abd_alloc_for_io(zio->io_size, B_TRUE),
|
||||
zio->io_size, zio_ddt_child_read_done, dde,
|
||||
|
@ -3338,8 +3353,8 @@ zio_ddt_read_done(zio_t *zio)
|
|||
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
|
||||
return (NULL);
|
||||
}
|
||||
if (dde->dde_repair_abd != NULL) {
|
||||
abd_copy(zio->io_abd, dde->dde_repair_abd,
|
||||
if (dde->dde_io->dde_repair_abd != NULL) {
|
||||
abd_copy(zio->io_abd, dde->dde_io->dde_repair_abd,
|
||||
zio->io_size);
|
||||
zio->io_child_error[ZIO_CHILD_DDT] = 0;
|
||||
}
|
||||
|
@ -3372,28 +3387,36 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
|
|||
* loaded).
|
||||
*/
|
||||
|
||||
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
|
||||
zio_t *lio = dde->dde_lead_zio[p];
|
||||
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
|
||||
if (DDT_PHYS_IS_DITTO(ddt, p))
|
||||
continue;
|
||||
|
||||
if (lio != NULL && do_raw) {
|
||||
if (dde->dde_io == NULL)
|
||||
continue;
|
||||
|
||||
zio_t *lio = dde->dde_io->dde_lead_zio[p];
|
||||
if (lio == NULL)
|
||||
continue;
|
||||
|
||||
if (do_raw)
|
||||
return (lio->io_size != zio->io_size ||
|
||||
abd_cmp(zio->io_abd, lio->io_abd) != 0);
|
||||
} else if (lio != NULL) {
|
||||
return (lio->io_orig_size != zio->io_orig_size ||
|
||||
abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
|
||||
}
|
||||
|
||||
return (lio->io_orig_size != zio->io_orig_size ||
|
||||
abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
|
||||
}
|
||||
|
||||
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
|
||||
ddt_phys_t *ddp = &dde->dde_phys[p];
|
||||
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
|
||||
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
|
||||
uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v);
|
||||
|
||||
if (ddp->ddp_phys_birth != 0 && do_raw) {
|
||||
if (phys_birth != 0 && do_raw) {
|
||||
blkptr_t blk = *zio->io_bp;
|
||||
uint64_t psize;
|
||||
abd_t *tmpabd;
|
||||
int error;
|
||||
|
||||
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
|
||||
ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
|
||||
psize = BP_GET_PSIZE(&blk);
|
||||
|
||||
if (psize != zio->io_size)
|
||||
|
@ -3416,13 +3439,13 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
|
|||
abd_free(tmpabd);
|
||||
ddt_enter(ddt);
|
||||
return (error != 0);
|
||||
} else if (ddp->ddp_phys_birth != 0) {
|
||||
} else if (phys_birth != 0) {
|
||||
arc_buf_t *abuf = NULL;
|
||||
arc_flags_t aflags = ARC_FLAG_WAIT;
|
||||
blkptr_t blk = *zio->io_bp;
|
||||
int error;
|
||||
|
||||
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
|
||||
ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
|
||||
|
||||
if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
|
||||
return (B_TRUE);
|
||||
|
@ -3450,50 +3473,87 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
|
|||
}
|
||||
|
||||
static void
|
||||
zio_ddt_child_write_ready(zio_t *zio)
|
||||
zio_ddt_child_write_done(zio_t *zio)
|
||||
{
|
||||
int p = zio->io_prop.zp_copies;
|
||||
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
|
||||
ddt_entry_t *dde = zio->io_private;
|
||||
ddt_phys_t *ddp = &dde->dde_phys[p];
|
||||
zio_t *pio;
|
||||
|
||||
if (zio->io_error)
|
||||
return;
|
||||
zio_link_t *zl = NULL;
|
||||
ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
|
||||
|
||||
int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
|
||||
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
|
||||
ddt_univ_phys_t *ddp = dde->dde_phys;
|
||||
|
||||
ddt_enter(ddt);
|
||||
|
||||
ASSERT(dde->dde_lead_zio[p] == zio);
|
||||
/* we're the lead, so once we're done there's no one else outstanding */
|
||||
if (dde->dde_io->dde_lead_zio[p] == zio)
|
||||
dde->dde_io->dde_lead_zio[p] = NULL;
|
||||
|
||||
ddt_phys_fill(ddp, zio->io_bp);
|
||||
ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys;
|
||||
|
||||
zio_link_t *zl = NULL;
|
||||
while ((pio = zio_walk_parents(zio, &zl)) != NULL)
|
||||
ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
|
||||
if (zio->io_error != 0) {
|
||||
/*
|
||||
* The write failed, so we're about to abort the entire IO
|
||||
* chain. We need to revert the entry back to what it was at
|
||||
* the last time it was successfully extended.
|
||||
*/
|
||||
ddt_phys_copy(ddp, orig, v);
|
||||
ddt_phys_clear(orig, v);
|
||||
|
||||
ddt_exit(ddt);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* We've successfully added new DVAs to the entry. Clear the saved
|
||||
* state or, if there's still outstanding IO, remember it so we can
|
||||
* revert to a known good state if that IO fails.
|
||||
*/
|
||||
if (dde->dde_io->dde_lead_zio[p] == NULL)
|
||||
ddt_phys_clear(orig, v);
|
||||
else
|
||||
ddt_phys_copy(orig, ddp, v);
|
||||
|
||||
/*
|
||||
* Add references for all dedup writes that were waiting on the
|
||||
* physical one, skipping any other physical writes that are waiting.
|
||||
*/
|
||||
zio_t *pio;
|
||||
zl = NULL;
|
||||
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
|
||||
if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
|
||||
ddt_phys_addref(ddp, v);
|
||||
}
|
||||
|
||||
ddt_exit(ddt);
|
||||
}
|
||||
|
||||
static void
|
||||
zio_ddt_child_write_done(zio_t *zio)
|
||||
zio_ddt_child_write_ready(zio_t *zio)
|
||||
{
|
||||
int p = zio->io_prop.zp_copies;
|
||||
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
|
||||
ddt_entry_t *dde = zio->io_private;
|
||||
ddt_phys_t *ddp = &dde->dde_phys[p];
|
||||
|
||||
zio_link_t *zl = NULL;
|
||||
ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
|
||||
|
||||
int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
|
||||
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
|
||||
|
||||
if (zio->io_error != 0)
|
||||
return;
|
||||
|
||||
ddt_enter(ddt);
|
||||
|
||||
ASSERT(ddp->ddp_refcnt == 0);
|
||||
ASSERT(dde->dde_lead_zio[p] == zio);
|
||||
dde->dde_lead_zio[p] = NULL;
|
||||
ddt_phys_extend(dde->dde_phys, v, zio->io_bp);
|
||||
|
||||
if (zio->io_error == 0) {
|
||||
zio_link_t *zl = NULL;
|
||||
while (zio_walk_parents(zio, &zl) != NULL)
|
||||
ddt_phys_addref(ddp);
|
||||
} else {
|
||||
ddt_phys_clear(ddp);
|
||||
zio_t *pio;
|
||||
zl = NULL;
|
||||
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
|
||||
if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
|
||||
ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg);
|
||||
}
|
||||
|
||||
ddt_exit(ddt);
|
||||
|
@ -3506,11 +3566,8 @@ zio_ddt_write(zio_t *zio)
|
|||
blkptr_t *bp = zio->io_bp;
|
||||
uint64_t txg = zio->io_txg;
|
||||
zio_prop_t *zp = &zio->io_prop;
|
||||
int p = zp->zp_copies;
|
||||
zio_t *cio = NULL;
|
||||
ddt_t *ddt = ddt_select(spa, bp);
|
||||
ddt_entry_t *dde;
|
||||
ddt_phys_t *ddp;
|
||||
|
||||
ASSERT(BP_GET_DEDUP(bp));
|
||||
ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
|
||||
|
@ -3518,7 +3575,7 @@ zio_ddt_write(zio_t *zio)
|
|||
ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
|
||||
|
||||
ddt_enter(ddt);
|
||||
dde = ddt_lookup(ddt, bp, B_TRUE);
|
||||
dde = ddt_lookup(ddt, bp);
|
||||
if (dde == NULL) {
|
||||
/* DDT size is over its quota so no new entries */
|
||||
zp->zp_dedup = B_FALSE;
|
||||
|
@ -3528,7 +3585,6 @@ zio_ddt_write(zio_t *zio)
|
|||
ddt_exit(ddt);
|
||||
return (zio);
|
||||
}
|
||||
ddp = &dde->dde_phys[p];
|
||||
|
||||
if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
|
||||
/*
|
||||
|
@ -3553,29 +3609,227 @@ zio_ddt_write(zio_t *zio)
|
|||
return (zio);
|
||||
}
|
||||
|
||||
if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
|
||||
if (ddp->ddp_phys_birth != 0)
|
||||
ddt_bp_fill(ddp, bp, txg);
|
||||
if (dde->dde_lead_zio[p] != NULL)
|
||||
zio_add_child(zio, dde->dde_lead_zio[p]);
|
||||
else
|
||||
ddt_phys_addref(ddp);
|
||||
} else if (zio->io_bp_override) {
|
||||
ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
|
||||
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
|
||||
ddt_phys_fill(ddp, bp);
|
||||
ddt_phys_addref(ddp);
|
||||
} else {
|
||||
cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
|
||||
zio->io_orig_size, zio->io_orig_size, zp,
|
||||
zio_ddt_child_write_ready, NULL,
|
||||
zio_ddt_child_write_done, dde, zio->io_priority,
|
||||
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
|
||||
int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies);
|
||||
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
|
||||
ddt_univ_phys_t *ddp = dde->dde_phys;
|
||||
|
||||
zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
|
||||
dde->dde_lead_zio[p] = cio;
|
||||
/*
|
||||
* In the common cases, at this point we have a regular BP with no
|
||||
* allocated DVAs, and the corresponding DDT entry for its checksum.
|
||||
* Our goal is to fill the BP with enough DVAs to satisfy its copies=
|
||||
* requirement.
|
||||
*
|
||||
* One of three things needs to happen to fulfill this:
|
||||
*
|
||||
* - if the DDT entry has enough DVAs to satisfy the BP, we just copy
|
||||
* them out of the entry and return;
|
||||
*
|
||||
* - if the DDT entry has no DVAs (ie its brand new), then we have to
|
||||
* issue the write as normal so that DVAs can be allocated and the
|
||||
* data land on disk. We then copy the DVAs into the DDT entry on
|
||||
* return.
|
||||
*
|
||||
* - if the DDT entry has some DVAs, but too few, we have to issue the
|
||||
* write, adjusted to have allocate fewer copies. When it returns, we
|
||||
* add the new DVAs to the DDT entry, and update the BP to have the
|
||||
* full amount it originally requested.
|
||||
*
|
||||
* In all cases, if there's already a writing IO in flight, we need to
|
||||
* defer the action until after the write is done. If our action is to
|
||||
* write, we need to adjust our request for additional DVAs to match
|
||||
* what will be in the DDT entry after it completes. In this way every
|
||||
* IO can be guaranteed to recieve enough DVAs simply by joining the
|
||||
* end of the chain and letting the sequence play out.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Number of DVAs in the DDT entry. If the BP is encrypted we ignore
|
||||
* the third one as normal.
|
||||
*/
|
||||
int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp));
|
||||
IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0);
|
||||
|
||||
/* Number of DVAs requested bya the IO. */
|
||||
uint8_t need_dvas = zp->zp_copies;
|
||||
|
||||
/*
|
||||
* What we do next depends on whether or not there's IO outstanding that
|
||||
* will update this entry.
|
||||
*/
|
||||
if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) {
|
||||
/*
|
||||
* No IO outstanding, so we only need to worry about ourselves.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Override BPs bring their own DVAs and their own problems.
|
||||
*/
|
||||
if (zio->io_bp_override) {
|
||||
/*
|
||||
* For a brand-new entry, all the work has been done
|
||||
* for us, and we can just fill it out from the provided
|
||||
* block and leave.
|
||||
*/
|
||||
if (have_dvas == 0) {
|
||||
ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
|
||||
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
|
||||
ddt_phys_extend(ddp, v, bp);
|
||||
ddt_phys_addref(ddp, v);
|
||||
ddt_exit(ddt);
|
||||
return (zio);
|
||||
}
|
||||
|
||||
/*
|
||||
* If we already have this entry, then we want to treat
|
||||
* it like a regular write. To do this we just wipe
|
||||
* them out and proceed like a regular write.
|
||||
*
|
||||
* Even if there are some DVAs in the entry, we still
|
||||
* have to clear them out. We can't use them to fill
|
||||
* out the dedup entry, as they are all referenced
|
||||
* together by a bp already on disk, and will be freed
|
||||
* as a group.
|
||||
*/
|
||||
BP_ZERO_DVAS(bp);
|
||||
BP_SET_BIRTH(bp, 0, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* If there are enough DVAs in the entry to service our request,
|
||||
* then we can just use them as-is.
|
||||
*/
|
||||
if (have_dvas >= need_dvas) {
|
||||
ddt_bp_fill(ddp, v, bp, txg);
|
||||
ddt_phys_addref(ddp, v);
|
||||
ddt_exit(ddt);
|
||||
return (zio);
|
||||
}
|
||||
|
||||
/*
|
||||
* Otherwise, we have to issue IO to fill the entry up to the
|
||||
* amount we need.
|
||||
*/
|
||||
need_dvas -= have_dvas;
|
||||
} else {
|
||||
/*
|
||||
* There's a write in-flight. If there's already enough DVAs on
|
||||
* the entry, then either there were already enough to start
|
||||
* with, or the in-flight IO is between READY and DONE, and so
|
||||
* has extended the entry with new DVAs. Either way, we don't
|
||||
* need to do anything, we can just slot in behind it.
|
||||
*/
|
||||
|
||||
if (zio->io_bp_override) {
|
||||
/*
|
||||
* If there's a write out, then we're soon going to
|
||||
* have our own copies of this block, so clear out the
|
||||
* override block and treat it as a regular dedup
|
||||
* write. See comment above.
|
||||
*/
|
||||
BP_ZERO_DVAS(bp);
|
||||
BP_SET_BIRTH(bp, 0, 0);
|
||||
}
|
||||
|
||||
if (have_dvas >= need_dvas) {
|
||||
/*
|
||||
* A minor point: there might already be enough
|
||||
* committed DVAs in the entry to service our request,
|
||||
* but we don't know which are completed and which are
|
||||
* allocated but not yet written. In this case, should
|
||||
* the IO for the new DVAs fail, we will be on the end
|
||||
* of the IO chain and will also recieve an error, even
|
||||
* though our request could have been serviced.
|
||||
*
|
||||
* This is an extremely rare case, as it requires the
|
||||
* original block to be copied with a request for a
|
||||
* larger number of DVAs, then copied again requesting
|
||||
* the same (or already fulfilled) number of DVAs while
|
||||
* the first request is active, and then that first
|
||||
* request errors. In return, the logic required to
|
||||
* catch and handle it is complex. For now, I'm just
|
||||
* not going to bother with it.
|
||||
*/
|
||||
|
||||
/*
|
||||
* We always fill the bp here as we may have arrived
|
||||
* after the in-flight write has passed READY, and so
|
||||
* missed out.
|
||||
*/
|
||||
ddt_bp_fill(ddp, v, bp, txg);
|
||||
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
|
||||
ddt_exit(ddt);
|
||||
return (zio);
|
||||
}
|
||||
|
||||
/*
|
||||
* There's not enough in the entry yet, so we need to look at
|
||||
* the write in-flight and see how many DVAs it will have once
|
||||
* it completes.
|
||||
*
|
||||
* The in-flight write has potentially had its copies request
|
||||
* reduced (if we're filling out an existing entry), so we need
|
||||
* to reach in and get the original write to find out what it is
|
||||
* expecting.
|
||||
*
|
||||
* Note that the parent of the lead zio will always have the
|
||||
* highest zp_copies of any zio in the chain, because ones that
|
||||
* can be serviced without additional IO are always added to
|
||||
* the back of the chain.
|
||||
*/
|
||||
zio_link_t *zl = NULL;
|
||||
zio_t *pio =
|
||||
zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl);
|
||||
ASSERT(pio);
|
||||
uint8_t parent_dvas = pio->io_prop.zp_copies;
|
||||
|
||||
if (parent_dvas >= need_dvas) {
|
||||
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
|
||||
ddt_exit(ddt);
|
||||
return (zio);
|
||||
}
|
||||
|
||||
/*
|
||||
* Still not enough, so we will need to issue to get the
|
||||
* shortfall.
|
||||
*/
|
||||
need_dvas -= parent_dvas;
|
||||
}
|
||||
|
||||
/*
|
||||
* We need to write. We will create a new write with the copies
|
||||
* property adjusted to match the number of DVAs we need to need to
|
||||
* grow the DDT entry by to satisfy the request.
|
||||
*/
|
||||
zio_prop_t czp = *zp;
|
||||
czp.zp_copies = need_dvas;
|
||||
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
|
||||
zio->io_orig_size, zio->io_orig_size, &czp,
|
||||
zio_ddt_child_write_ready, NULL,
|
||||
zio_ddt_child_write_done, dde, zio->io_priority,
|
||||
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
|
||||
|
||||
zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
|
||||
|
||||
/*
|
||||
* We are the new lead zio, because our parent has the highest
|
||||
* zp_copies that has been requested for this entry so far.
|
||||
*/
|
||||
ddt_alloc_entry_io(dde);
|
||||
if (dde->dde_io->dde_lead_zio[p] == NULL) {
|
||||
/*
|
||||
* First time out, take a copy of the stable entry to revert
|
||||
* to if there's an error (see zio_ddt_child_write_done())
|
||||
*/
|
||||
ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v);
|
||||
} else {
|
||||
/*
|
||||
* Make the existing chain our child, because it cannot
|
||||
* complete until we have.
|
||||
*/
|
||||
zio_add_child(cio, dde->dde_io->dde_lead_zio[p]);
|
||||
}
|
||||
dde->dde_io->dde_lead_zio[p] = cio;
|
||||
|
||||
ddt_exit(ddt);
|
||||
|
||||
zio_nowait(cio);
|
||||
|
@ -3591,18 +3845,17 @@ zio_ddt_free(zio_t *zio)
|
|||
spa_t *spa = zio->io_spa;
|
||||
blkptr_t *bp = zio->io_bp;
|
||||
ddt_t *ddt = ddt_select(spa, bp);
|
||||
ddt_entry_t *dde;
|
||||
ddt_phys_t *ddp;
|
||||
ddt_entry_t *dde = NULL;
|
||||
|
||||
ASSERT(BP_GET_DEDUP(bp));
|
||||
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
||||
|
||||
ddt_enter(ddt);
|
||||
freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
|
||||
freedde = dde = ddt_lookup(ddt, bp);
|
||||
if (dde) {
|
||||
ddp = ddt_phys_select(dde, bp);
|
||||
if (ddp)
|
||||
ddt_phys_decref(ddp);
|
||||
ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
|
||||
if (v != DDT_PHYS_NONE)
|
||||
ddt_phys_decref(dde->dde_phys, v);
|
||||
}
|
||||
ddt_exit(ddt);
|
||||
|
||||
|
|
|
@ -29,7 +29,7 @@
|
|||
|
||||
/*
|
||||
* Copyright (c) 2013, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2019, Klara Inc.
|
||||
* Copyright (c) 2019, 2024, Klara, Inc.
|
||||
* Copyright (c) 2019, Allan Jude
|
||||
*/
|
||||
|
||||
|
@ -48,26 +48,42 @@ static unsigned long zio_decompress_fail_fraction = 0;
|
|||
|
||||
/*
|
||||
* Compression vectors.
|
||||
*
|
||||
* NOTE: DO NOT CHANGE THE NAMES OF THESE COMPRESSION FUNCTIONS.
|
||||
* THEY ARE USED AS ZAP KEY NAMES BY FAST DEDUP AND THEREFORE
|
||||
* PART OF THE ON-DISK FORMAT.
|
||||
*/
|
||||
zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
|
||||
{"inherit", 0, NULL, NULL, NULL},
|
||||
{"on", 0, NULL, NULL, NULL},
|
||||
{"uncompressed", 0, NULL, NULL, NULL},
|
||||
{"lzjb", 0, lzjb_compress, lzjb_decompress, NULL},
|
||||
{"empty", 0, NULL, NULL, NULL},
|
||||
{"gzip-1", 1, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-2", 2, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-3", 3, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-4", 4, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-5", 5, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-6", 6, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-7", 7, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-8", 8, gzip_compress, gzip_decompress, NULL},
|
||||
{"gzip-9", 9, gzip_compress, gzip_decompress, NULL},
|
||||
{"zle", 64, zle_compress, zle_decompress, NULL},
|
||||
{"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL},
|
||||
{"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress_wrap,
|
||||
zfs_zstd_decompress, zfs_zstd_decompress_level},
|
||||
{"inherit", 0, NULL, NULL, NULL},
|
||||
{"on", 0, NULL, NULL, NULL},
|
||||
{"uncompressed", 0, NULL, NULL, NULL},
|
||||
{"lzjb", 0,
|
||||
zfs_lzjb_compress, zfs_lzjb_decompress, NULL},
|
||||
{"empty", 0, NULL, NULL, NULL},
|
||||
{"gzip-1", 1,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-2", 2,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-3", 3,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-4", 4,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-5", 5,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-6", 6,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-7", 7,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-8", 8,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"gzip-9", 9,
|
||||
zfs_gzip_compress, zfs_gzip_decompress, NULL},
|
||||
{"zle", 64,
|
||||
zfs_zle_compress, zfs_zle_decompress, NULL},
|
||||
{"lz4", 0,
|
||||
zfs_lz4_compress, zfs_lz4_decompress, NULL},
|
||||
{"zstd", ZIO_ZSTD_LEVEL_DEFAULT,
|
||||
zfs_zstd_compress, zfs_zstd_decompress, zfs_zstd_decompress_level},
|
||||
};
|
||||
|
||||
uint8_t
|
||||
|
@ -112,20 +128,16 @@ zio_compress_select(spa_t *spa, enum zio_compress child,
}

size_t
zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len,
zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst, size_t s_len,
uint8_t level)
{
size_t c_len, d_len;
uint8_t complevel;
zio_compress_info_t *ci = &zio_compress_table[c];

ASSERT3U(c, <, ZIO_COMPRESS_FUNCTIONS);
ASSERT3U(ci->ci_compress, !=, NULL);
ASSERT3U(s_len, >, 0);

/* Compress at least 12.5% */
d_len = s_len - (s_len >> 3);

complevel = ci->ci_level;

if (c == ZIO_COMPRESS_ZSTD) {

@ -142,12 +154,12 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len,
}

if (*dst == NULL)
*dst = zio_buf_alloc(s_len);
*dst = abd_alloc_sametype(src, s_len);

/* No compression algorithms can read from ABDs directly */
void *tmp = abd_borrow_buf_copy(src, s_len);
c_len = ci->ci_compress(tmp, *dst, s_len, d_len, complevel);
abd_return_buf(src, tmp, s_len);
/* Compress at least 12.5%, but limit to the size of the dest abd. */
d_len = MIN(s_len - (s_len >> 3), abd_get_size(*dst));

c_len = ci->ci_compress(src, *dst, s_len, d_len, complevel);

if (c_len > d_len)
return (s_len);

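These two hunks convert zio_compress_data() to operate on ABDs end to end: when the caller passes *dst == NULL, a destination ABD of the source's type is allocated; the "save at least 12.5%" target is clamped to the destination's actual size; and a result larger than that target is reported as s_len, telling the caller to store the block uncompressed. A standalone model of that return convention (plain sizes instead of ABDs; all names here are local to the sketch):

#include <stddef.h>
#include <stdio.h>

#define MIN(a, b)   ((a) < (b) ? (a) : (b))

/*
 * Standalone model of the new contract: the savings target is
 * clamped to the destination size, and a result larger than the
 * target is reported as s_len, meaning "store the block raw".
 */
static size_t
compress_result(size_t s_len, size_t dst_size, size_t c_len)
{
    size_t d_len = MIN(s_len - (s_len >> 3), dst_size);
    return (c_len > d_len ? s_len : c_len);
}

int
main(void)
{
    /* 128K source, 128K dest: 100000 bytes beats the 112K target. */
    printf("%zu\n", compress_result(131072, 131072, 100000));
    /* Same source, 64K dest: the same 100000 bytes no longer fits. */
    printf("%zu\n", compress_result(131072, 65536, 100000));
    return (0);
}
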
@ -157,26 +169,18 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len,
}

int
zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
zio_decompress_data(enum zio_compress c, abd_t *src, abd_t *dst,
size_t s_len, size_t d_len, uint8_t *level)
{
zio_compress_info_t *ci = &zio_compress_table[c];
if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
return (SET_ERROR(EINVAL));

int err;
if (ci->ci_decompress_level != NULL && level != NULL)
return (ci->ci_decompress_level(src, dst, s_len, d_len, level));

return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
}

int
zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
size_t s_len, size_t d_len, uint8_t *level)
{
void *tmp = abd_borrow_buf_copy(src, s_len);
int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len, level);
abd_return_buf(src, tmp, s_len);
err = ci->ci_decompress_level(src, dst, s_len, d_len, level);
else
err = ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);

/*
* Decompression shouldn't fail, because we've already verified

@ -185,9 +189,9 @@ zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
*/
if (zio_decompress_fail_fraction != 0 &&
random_in_range(zio_decompress_fail_fraction) == 0)
ret = SET_ERROR(EINVAL);
err = SET_ERROR(EINVAL);

return (ret);
return (err);
}

int

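The old zio_decompress_data_buf()/zio_decompress_data() pair collapses into a single ABD-based entry point that validates the enum, prefers the level-recovering callback when the caller wants the stored level, and otherwise uses the plain callback with the table's default level. A standalone sketch of that dispatch (illustrative types; plain errno stands in for SET_ERROR):

#include <errno.h>
#include <stddef.h>

/* Illustrative callback types; not the ZFS signatures. */
typedef int (*dec_fn)(const void *src, void *dst, size_t s_len,
    size_t d_len, int level);
typedef int (*dec_lvl_fn)(const void *src, void *dst, size_t s_len,
    size_t d_len, unsigned char *level);

typedef struct {
    dec_fn dec;             /* NULL for pass-through entries */
    dec_lvl_fn dec_level;   /* set only for zstd-style codecs */
    int default_level;
} dec_info_t;

int
decompress_dispatch(const dec_info_t *ci, const void *src, void *dst,
    size_t s_len, size_t d_len, unsigned char *level)
{
    if (ci->dec == NULL)
        return (EINVAL);
    /* Prefer the variant that can report the stored level. */
    if (ci->dec_level != NULL && level != NULL)
        return (ci->dec_level(src, dst, s_len, d_len, level));
    return (ci->dec(src, dst, s_len, d_len, ci->default_level));
}
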
@ -34,8 +34,9 @@
#include <sys/sysmacros.h>
#include <sys/zio_compress.h>

size_t
zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static size_t
zfs_zle_compress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{
uchar_t *src = s_start;
uchar_t *dst = d_start;

@ -64,8 +65,9 @@ zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
return (src == s_end ? dst - (uchar_t *)d_start : s_len);
}

int
zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static int
zfs_zle_decompress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{
uchar_t *src = s_start;
uchar_t *dst = d_start;

@ -89,3 +91,6 @@ zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
}
return (dst == d_end ? 0 : -1);
}

ZFS_COMPRESS_WRAP_DECL(zfs_zle_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zle_decompress)

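The pattern here repeats across the compression files: the buffer-based worker gains a _buf suffix and becomes static, while a ZFS_*_WRAP_DECL macro declares the public ABD-based entry point. The macro bodies are not part of this diff, so the following is only a guess at their general shape, with abd_t reduced to a flat buffer and the borrow/return of linear views elided:

#include <stddef.h>

/* Stand-in for abd_t: a flat, linear buffer. */
typedef struct {
    void *buf;
    size_t size;
} abd_t;

/*
 * Hypothetical expansion: declare an ABD-based wrapper named `name`
 * that forwards to the static buffer-based worker `name_buf`.
 */
#define COMPRESS_WRAP_DECL(name)                                    \
size_t                                                              \
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n)     \
{                                                                   \
    return (name##_buf(src->buf, dst->buf, s_len, d_len, n));       \
}

/* A trivial worker, standing in for zfs_zle_compress_buf(). */
static size_t
demo_compress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n)
{
    (void) s_start; (void) d_start; (void) n;
    return (s_len < d_len ? s_len : d_len);
}

COMPRESS_WRAP_DECL(demo_compress)
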
@ -429,68 +429,9 @@ zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
return (1);
}

size_t
zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
int level)
{
int16_t zstd_level;
if (zstd_enum_to_level(level, &zstd_level)) {
ZSTDSTAT_BUMP(zstd_stat_com_inval);
return (s_len);
}
/*
* A zstd early abort heuristic.
*
* - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
* 128k), don't try any of this, just go.
* (because experimentally that was a reasonable cutoff for a perf win
* with tiny ratio change)
* - First, we try LZ4 compression, and if it doesn't early abort, we
* jump directly to whatever compression level we intended to try.
* - Second, we try zstd-1 - if that errors out (usually, but not
* exclusively, if it would overflow), we give up early.
*
* If it works, we instead go on and compress anyway.
*
* Why two passes? LZ4 alone gets you a lot of the way, but on highly
* compressible data, it was losing up to 8.5% of the compressed
* savings versus no early abort, and all the zstd-fast levels are
* worse indications on their own than LZ4, and don't improve the LZ4
* pass noticeably if stacked like this.
*/
size_t actual_abort_size = zstd_abort_size;
if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
s_len >= actual_abort_size) {
int pass_len = 1;
pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0);
if (pass_len < d_len) {
ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
goto keep_trying;
}
ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
ZIO_ZSTD_LEVEL_1);
if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
return (s_len);
}
ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
} else {
ZSTDSTAT_BUMP(zstd_stat_passignored);
if (s_len < actual_abort_size) {
ZSTDSTAT_BUMP(zstd_stat_passignored_size);
}
}
keep_trying:
return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));

}

/* Compress block using zstd */
size_t
zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
static size_t
zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,
int level)
{
size_t c_len;

@ -594,9 +535,73 @@ zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
return (c_len + sizeof (*hdr));
}

static size_t
zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len,
int level)
{
int16_t zstd_level;
if (zstd_enum_to_level(level, &zstd_level)) {
ZSTDSTAT_BUMP(zstd_stat_com_inval);
return (s_len);
}
/*
* A zstd early abort heuristic.
*
* - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
* 128k), don't try any of this, just go.
* (because experimentally that was a reasonable cutoff for a perf win
* with tiny ratio change)
* - First, we try LZ4 compression, and if it doesn't early abort, we
* jump directly to whatever compression level we intended to try.
* - Second, we try zstd-1 - if that errors out (usually, but not
* exclusively, if it would overflow), we give up early.
*
* If it works, we instead go on and compress anyway.
*
* Why two passes? LZ4 alone gets you a lot of the way, but on highly
* compressible data, it was losing up to 8.5% of the compressed
* savings versus no early abort, and all the zstd-fast levels are
* worse indications on their own than LZ4, and don't improve the LZ4
* pass noticeably if stacked like this.
*/
size_t actual_abort_size = zstd_abort_size;
if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
s_len >= actual_abort_size) {
int pass_len = 1;
abd_t sabd, dabd;
abd_get_from_buf_struct(&sabd, s_start, s_len);
abd_get_from_buf_struct(&dabd, d_start, d_len);
pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
abd_free(&dabd);
abd_free(&sabd);
if (pass_len < d_len) {
ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
goto keep_trying;
}
ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len,
d_len, ZIO_ZSTD_LEVEL_1);
if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
return (s_len);
}
ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
} else {
ZSTDSTAT_BUMP(zstd_stat_passignored);
if (s_len < actual_abort_size) {
ZSTDSTAT_BUMP(zstd_stat_passignored_size);
}
}
keep_trying:
return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level));

}

/* Decompress block using zstd and return its stored level */
int
zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
static int
zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, uint8_t *level)
{
ZSTD_DCtx *dctx;

@ -671,15 +676,20 @@ zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
}

/* Decompress datablock using zstd */
int
zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
int level __maybe_unused)
static int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int level __maybe_unused)
{

return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
NULL));
}

ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress)
ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level)

/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)

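zfs_zstd_compress_buf() keeps the early-abort heuristic but now routes the LZ4 probe through the ABD entry point, wrapping the raw buffers in caller-owned abd structs (abd_get_from_buf_struct()/abd_free()) just for that call. The decision ladder itself is unchanged; a standalone model of it follows (stub probes and parameter names are local to the sketch; the thresholds correspond to the zstd_earlyabort_pass, zstd_cutoff_level, and zstd_abort_size tunables named in the comment):

#include <stddef.h>

/* Stub probes standing in for the LZ4 and zstd-1 passes. */
static int
lz4_probe_shrank(size_t s_len, size_t d_len)
{
    (void) s_len; (void) d_len;
    return (0);
}

static int
zstd1_probe_shrank(size_t s_len, size_t d_len)
{
    (void) s_len; (void) d_len;
    return (0);
}

/*
 * Returns 1 when the expensive zstd level is worth running,
 * 0 when the block should be stored uncompressed.
 */
int
early_abort_keep_trying(int zstd_level, size_t s_len, size_t d_len,
    int earlyabort_pass, int cutoff_level, size_t abort_size)
{
    /* Small blocks and low levels skip the heuristic entirely. */
    if (earlyabort_pass <= 0 || zstd_level < cutoff_level ||
        s_len < abort_size)
        return (1);
    /* Pass 1: cheap LZ4 probe. */
    if (lz4_probe_shrank(s_len, d_len))
        return (1);
    /* Pass 2: zstd-1 probe, catching what LZ4 misses. */
    if (zstd1_probe_shrank(s_len, d_len))
        return (1);
    return (0);
}
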
@ -145,6 +145,24 @@ for kernel_version in %{?kernel_versions}; do
%{?kernel_cc} \
%{?kernel_ld} \
%{?kernel_llvm}

# Pre-6.10 kernel builds didn't need to copy the source files over to the
# build directory. However, post-6.10 we do need to, due to these commits:
#
# b1992c3772e6 kbuild: use $(src) instead of $(srctree)/$(src) for source
# directory
#
# 9a0ebe5011f4 kbuild: use $(obj)/ instead of $(src)/ for common pattern
# rules
#
# Note that kmodtool actually copies the source over into the build
# directory, so what we're doing here is normal. For efficiency reasons,
# though, we just use hardlinks instead of copying.
#
# See https://github.com/openzfs/zfs/issues/16439 for more info.
cp -lR ../%{module}-%{version}/module/* module/

make %{?_smp_mflags}
cd ..
done

@ -514,6 +514,10 @@ tags = ['functional', 'cli_root', 'zpool_offline']
tests = ['zpool_online_001_pos', 'zpool_online_002_neg']
tags = ['functional', 'cli_root', 'zpool_online']

[tests/functional/cli_root/zpool_reguid]
tests = ['zpool_reguid_001_pos', 'zpool_reguid_002_neg']
tags = ['functional', 'cli_root', 'zpool_reguid']

[tests/functional/cli_root/zpool_remove]
tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos',
'zpool_remove_003_pos']

@ -672,7 +676,9 @@ post =
tags = ['functional', 'deadman']

[tests/functional/dedup]
tests = ['dedup_quota']
tests = ['dedup_legacy_create', 'dedup_fdt_create', 'dedup_fdt_import',
'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade',
'dedup_legacy_fdt_mixed', 'dedup_quota']
pre =
post =
tags = ['functional', 'dedup']

@ -24,7 +24,7 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/badsend

scripts_zfs_tests_bin_PROGRAMS += %D%/btree_test
%C%_btree_test_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
%C%_btree_test_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
%C%_btree_test_LDADD = \
libzpool.la \
libzfs_core.la

@ -31,6 +31,7 @@ DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift
DDT_ZAP_DEFAULT_BS dedup.ddt_zap_default_bs ddt_zap_default_bs
DDT_ZAP_DEFAULT_IBS dedup.ddt_zap_default_ibs ddt_zap_default_ibs
DDT_DATA_IS_SPECIAL ddt_data_is_special zfs_ddt_data_is_special
DEDUP_LOG_TXG_MAX dedup.log_txg_max zfs_dedup_log_txg_max
DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms
DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second
DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode

@ -1424,6 +1424,12 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/deadman/deadman_zio.ksh \
functional/dedup/cleanup.ksh \
functional/dedup/setup.ksh \
functional/dedup/dedup_fdt_create.ksh \
functional/dedup/dedup_fdt_import.ksh \
functional/dedup/dedup_legacy_create.ksh \
functional/dedup/dedup_legacy_import.ksh \
functional/dedup/dedup_legacy_fdt_upgrade.ksh \
functional/dedup/dedup_legacy_fdt_mixed.ksh \
functional/dedup/dedup_quota.ksh \
functional/delegate/cleanup.ksh \
functional/delegate/setup.ksh \

@ -55,7 +55,7 @@ function display_status
((ret |= $?))

typeset mntpnt=$(get_prop mountpoint $pool)
dd if=/dev/random of=$mntpnt/testfile.$$ &
dd if=/dev/urandom of=$mntpnt/testfile.$$ &
typeset pid=$!

zpool iostat -v 1 3 > /dev/null

@ -54,7 +54,7 @@ log_must truncate -s 1G $VDEV

log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV

log_must dd if=/dev/random of=/$TESTPOOL/file1 bs=1 count=1000
log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1 count=1000

ulimit -f 2
log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 all

@ -109,5 +109,6 @@ if is_linux || is_freebsd; then
"feature@block_cloning"
"feature@vdev_zaps_v2"
"feature@raidz_expansion"
"feature@fast_dedup"
)
fi

@ -95,6 +95,10 @@ while (( i < 16384 )); do
done
((i += 1))
done

# Force the DDT logs to disk with a scrub so they can be prefetched
log_must zpool scrub -w $TESTPOOL

log_note "Dataset generation completed."

typeset -A generated

@ -0,0 +1,6 @@
pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_reguid
dist_pkgdata_SCRIPTS = \
setup.ksh \
cleanup.ksh \
zpool_reguid_001_pos.ksh \
zpool_reguid_002_neg.ksh

@ -0,0 +1,32 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#

. $STF_SUITE/include/libtest.shlib

verify_runnable "global"

default_cleanup

@ -0,0 +1,34 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#

. $STF_SUITE/include/libtest.shlib

verify_runnable "global"

DISK=${DISKS%% *}

default_setup $DISK