ZFS Version 2.2.2

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEETzupq20fjWg9wt+1athg7tRZgCcFAmVpF5cACgkQathg7tRZ
 gCccrg/9HvUbVz2D4uEulwHhXfjYlQAq6u7aUnLNBP9bKypb6aQ0xoCCFUPycAgp
 7NojDpZneizQx3CpBgKoof3yS1cxPTGpaJmXZUEd9XGGv8u9XYV2bjSQCyoIcRun
 FLhFyI95qcATP1gm4QC9ZWlGVoSQrzSuGcO9QSSpgK1npV27SZuIn/f8UmvkihZt
 R1GMSK//V0Um48QmOmzAJikGAoRG5KHVEaBeEnD2EEHB2q0bcagC/XUBQQDayG6R
 eHa/WTrwyh89ij5lf4AWi29MXBSEu/5VPRmDytXZeMyCc5nI3B2YC/dS9GHGF8mf
 H0UaIoxUBTdRW4qNLbZ7zPfXAEQLOkOYvoeUzfmSfPIz+rITQ7AUi2vwcf5wO1lI
 o9BpaYnze3cDfZ9Nvv7tamOORbKWyFsIgro4DFh4PwbYqhHB93u1bhpb8HHObrVX
 mVvQkzW0fmCpXgfZs5NwwrGkWzKYV2Ck6iyPXLQnnG9vQ6nSDA8UdCVgNWmMgUar
 S+sCtac/7A5sEbUYNEktLJybr1AN4wKbVUWRC7Sd6My0daEZsSy8mrWeCbMOLVa6
 ADbrX+OEk9LZkiUjPiLrfmnJ7bs5QHEbB/hN27qifIrVUvgr/Eo9XeJzx3SgMRTA
 9PYkpRJnfZYisIL0Td2pkMAoja3LtXBMRXv1iKO4nfo7460amU8=
 =kjGu
 -----END PGP SIGNATURE-----

Merge tag 'zfs-2.2.2' into truenas/zfs-2.2.2-test

ZFS Version 2.2.2

Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
This commit is contained in:
Ameer Hamza 2023-12-01 20:14:01 +05:00
commit d762e7eabe
24 changed files with 446 additions and 255 deletions

2
META
View File

@ -1,7 +1,7 @@
Meta: 1
Name: zfs
Branch: 1.0
Version: 2.2.1
Version: 2.2.2
Release: 1
Release-Tags: relext
License: CDDL

View File

@ -32,4 +32,4 @@ For more details see the NOTICE, LICENSE and COPYRIGHT files; `UCRL-CODE-235197`
# Supported Kernels
* The `META` file contains the officially recognized supported Linux kernel versions.
* Supported FreeBSD versions are any supported branches and releases starting from 12.2-RELEASE.
* Supported FreeBSD versions are any supported branches and releases starting from 12.4-RELEASE.

View File

@ -34,6 +34,7 @@
* Copyright (c) 2021 Allan Jude
* Copyright (c) 2021 Toomas Soome <tsoome@me.com>
* Copyright (c) 2023, Klara Inc.
* Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
*/
#include <stdio.h>
@ -80,6 +81,7 @@
#include <sys/dsl_scan.h>
#include <sys/btree.h>
#include <sys/brt.h>
#include <sys/brt_impl.h>
#include <zfs_comutil.h>
#include <sys/zstd/zstd.h>
@ -899,6 +901,8 @@ usage(void)
"don't print label contents\n");
(void) fprintf(stderr, " -t --txg=INTEGER "
"highest txg to use when searching for uberblocks\n");
(void) fprintf(stderr, " -T --brt-stats "
"BRT statistics\n");
(void) fprintf(stderr, " -u --uberblock "
"uberblock\n");
(void) fprintf(stderr, " -U --cachefile=PATH "
@ -999,6 +1003,15 @@ zdb_nicenum(uint64_t num, char *buf, size_t buflen)
nicenum(num, buf, buflen);
}
static void
zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen)
{
if (dump_opt['P'])
(void) snprintf(buf, buflen, "%llu", (longlong_t)bytes);
else
zfs_nicebytes(bytes, buf, buflen);
}
static const char histo_stars[] = "****************************************";
static const uint64_t histo_width = sizeof (histo_stars) - 1;
@ -2081,6 +2094,76 @@ dump_all_ddts(spa_t *spa)
dump_dedup_ratio(&dds_total);
}
static void
dump_brt(spa_t *spa)
{
if (!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) {
printf("BRT: unsupported on this pool\n");
return;
}
if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
printf("BRT: empty\n");
return;
}
brt_t *brt = spa->spa_brt;
VERIFY(brt);
char count[32], used[32], saved[32];
zdb_nicebytes(brt_get_used(spa), used, sizeof (used));
zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved));
uint64_t ratio = brt_get_ratio(spa);
printf("BRT: used %s; saved %s; ratio %llu.%02llux\n", used, saved,
(u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 100));
if (dump_opt['T'] < 2)
return;
for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
if (brtvd == NULL)
continue;
if (!brtvd->bv_initiated) {
printf("BRT: vdev %" PRIu64 ": empty\n", vdevid);
continue;
}
zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count));
zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used));
zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved));
printf("BRT: vdev %" PRIu64 ": refcnt %s; used %s; saved %s\n",
vdevid, count, used, saved);
}
if (dump_opt['T'] < 3)
return;
char dva[64];
printf("\n%-16s %-10s\n", "DVA", "REFCNT");
for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
if (brtvd == NULL || !brtvd->bv_initiated)
continue;
zap_cursor_t zc;
zap_attribute_t za;
for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
uint64_t offset = *(uint64_t *)za.za_name;
uint64_t refcnt = za.za_first_integer;
snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", vdevid,
(u_longlong_t)offset);
printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt);
}
zap_cursor_fini(&zc);
}
}
static void
dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
{
@ -8093,6 +8176,9 @@ dump_zpool(spa_t *spa)
if (dump_opt['D'])
dump_all_ddts(spa);
if (dump_opt['T'])
dump_brt(spa);
if (dump_opt['d'] > 2 || dump_opt['m'])
dump_metaslabs(spa);
if (dump_opt['M'])
@ -8879,6 +8965,7 @@ main(int argc, char **argv)
{"io-stats", no_argument, NULL, 's'},
{"simulate-dedup", no_argument, NULL, 'S'},
{"txg", required_argument, NULL, 't'},
{"brt-stats", no_argument, NULL, 'T'},
{"uberblock", no_argument, NULL, 'u'},
{"cachefile", required_argument, NULL, 'U'},
{"verbose", no_argument, NULL, 'v'},
@ -8892,7 +8979,7 @@ main(int argc, char **argv)
};
while ((c = getopt_long(argc, argv,
"AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ",
"AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ",
long_options, NULL)) != -1) {
switch (c) {
case 'b':
@ -8914,6 +9001,7 @@ main(int argc, char **argv)
case 'R':
case 's':
case 'S':
case 'T':
case 'u':
case 'y':
case 'Z':
@ -9076,22 +9164,6 @@ main(int argc, char **argv)
if (dump_opt['l'])
return (dump_label(argv[0]));
if (dump_opt['O']) {
if (argc != 2)
usage();
dump_opt['v'] = verbose + 3;
return (dump_path(argv[0], argv[1], NULL));
}
if (dump_opt['r']) {
target_is_spa = B_FALSE;
if (argc != 3)
usage();
dump_opt['v'] = verbose;
error = dump_path(argv[0], argv[1], &object);
if (error != 0)
fatal("internal error: %s", strerror(error));
}
if (dump_opt['X'] || dump_opt['F'])
rewind = ZPOOL_DO_REWIND |
(dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
@ -9192,6 +9264,29 @@ main(int argc, char **argv)
searchdirs = NULL;
}
/*
* We need to make sure to process -O option or call
* dump_path after the -e option has been processed,
* which imports the pool to the namespace if it's
* not in the cachefile.
*/
if (dump_opt['O']) {
if (argc != 2)
usage();
dump_opt['v'] = verbose + 3;
return (dump_path(argv[0], argv[1], NULL));
}
if (dump_opt['r']) {
target_is_spa = B_FALSE;
if (argc != 3)
usage();
dump_opt['v'] = verbose;
error = dump_path(argv[0], argv[1], &object);
if (error != 0)
fatal("internal error: %s", strerror(error));
}
/*
* import_checkpointed_state makes the assumption that the
* target pool that we pass it is already part of the spa

View File

@ -47,7 +47,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [
#include <linux/fs.h>
],[
struct inode ip;
struct timespec64 ts;
struct timespec64 ts = {0};
memset(&ip, 0, sizeof(ip));
inode_set_ctime_to_ts(&ip, ts);

View File

@ -33,6 +33,7 @@ COMMON_H = \
sys/bqueue.h \
sys/btree.h \
sys/brt.h \
sys/brt_impl.h \
sys/dataset_kstats.h \
sys/dbuf.h \
sys/ddt.h \

View File

@ -101,7 +101,7 @@ void vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg,
void vfs_clearmntopt(vfs_t *vfsp, const char *name);
int vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp);
int mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype,
char *fspath, char *fspec, int fsflags);
char *fspath, char *fspec, int fsflags, vfs_t *parent_vfsp);
typedef uint64_t vfs_feature_t;

View File

@ -56,6 +56,7 @@ enum symfollow { NO_FOLLOW = NOFOLLOW };
#ifndef IN_BASE
#include_next <sys/vnode.h>
#endif
#include <sys/ccompat.h>
#include <sys/mount.h>
#include <sys/cred.h>
#include <sys/fcntl.h>
@ -104,7 +105,7 @@ vn_flush_cached_data(vnode_t *vp, boolean_t sync)
zfs_vmobject_wlock(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, flags);
zfs_vmobject_wunlock(vp->v_object);
VOP_UNLOCK(vp);
VOP_UNLOCK1(vp);
}
}
#endif

199
include/sys/brt_impl.h Normal file
View File

@ -0,0 +1,199 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
*/
#ifndef _SYS_BRT_IMPL_H
#define _SYS_BRT_IMPL_H
#ifdef __cplusplus
extern "C" {
#endif
/*
* BRT - Block Reference Table.
*/
#define BRT_OBJECT_VDEV_PREFIX "com.fudosecurity:brt:vdev:"
/*
* We divide each VDEV into 16MB chunks. Each chunk is represented in memory
* by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
* Each element in this array represents how many BRT entries do we have in this
* chunk of storage. We always load this entire array into memory and update as
* needed. By having it in memory we can quickly tell (during zio_free()) if
* there are any BRT entries that we might need to update.
*
* This value cannot be larger than 16MB, at least as long as we support
* 512 byte block sizes. With 512 byte block size we can have exactly
* 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
* many for a 16bit counter.
*/
#define BRT_RANGESIZE (16 * 1024 * 1024)
_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
"BRT_RANGESIZE is too large.");
/*
* We don't want to update the whole structure every time. Maintain bitmap
* of dirty blocks within the regions, so that a single bit represents a
* block size of entcounts. For example if we have a 1PB vdev then all
* entcounts take 128MB of memory ((64TB / 16MB) * 2B). We can divide this
* 128MB array of entcounts into 32kB disk blocks, as we don't want to update
* the whole 128MB on disk when we have updated only a single entcount.
* We maintain a bitmap where each 32kB disk block within 128MB entcounts array
* is represented by a single bit. This gives us 4096 bits. A set bit in the
* bitmap means that we had a change in at least one of the 16384 entcounts
* that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
*/
#define BRT_BLOCKSIZE (32 * 1024)
#define BRT_RANGESIZE_TO_NBLOCKS(size) \
(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
#define BRT_LITTLE_ENDIAN 0
#define BRT_BIG_ENDIAN 1
#ifdef _ZFS_LITTLE_ENDIAN
#define BRT_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN
#define BRT_NON_NATIVE_BYTEORDER BRT_BIG_ENDIAN
#else
#define BRT_NATIVE_BYTEORDER BRT_BIG_ENDIAN
#define BRT_NON_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN
#endif
typedef struct brt_vdev_phys {
uint64_t bvp_mos_entries;
uint64_t bvp_size;
uint64_t bvp_byteorder;
uint64_t bvp_totalcount;
uint64_t bvp_rangesize;
uint64_t bvp_usedspace;
uint64_t bvp_savedspace;
} brt_vdev_phys_t;
typedef struct brt_vdev {
/*
* VDEV id.
*/
uint64_t bv_vdevid;
/*
* Is the structure initiated?
* (bv_entcount and bv_bitmap are allocated?)
*/
boolean_t bv_initiated;
/*
* Object number in the MOS for the entcount array and brt_vdev_phys.
*/
uint64_t bv_mos_brtvdev;
/*
* Object number in the MOS for the entries table.
*/
uint64_t bv_mos_entries;
/*
* Entries to sync.
*/
avl_tree_t bv_tree;
/*
* Does the bv_entcount[] array needs byte swapping?
*/
boolean_t bv_need_byteswap;
/*
* Number of entries in the bv_entcount[] array.
*/
uint64_t bv_size;
/*
* This is the array with BRT entry count per BRT_RANGESIZE.
*/
uint16_t *bv_entcount;
/*
* Sum of all bv_entcount[]s.
*/
uint64_t bv_totalcount;
/*
* Space on disk occupied by cloned blocks (without compression).
*/
uint64_t bv_usedspace;
/*
* How much additional space would be occupied without block cloning.
*/
uint64_t bv_savedspace;
/*
* brt_vdev_phys needs updating on disk.
*/
boolean_t bv_meta_dirty;
/*
* bv_entcount[] needs updating on disk.
*/
boolean_t bv_entcount_dirty;
/*
* bv_entcount[] potentially can be a bit too big to sychronize it all
* when we just changed few entcounts. The fields below allow us to
* track updates to bv_entcount[] array since the last sync.
* A single bit in the bv_bitmap represents as many entcounts as can
* fit into a single BRT_BLOCKSIZE.
* For example we have 65536 entcounts in the bv_entcount array
* (so the whole array is 128kB). We updated bv_entcount[2] and
* bv_entcount[5]. In that case only first bit in the bv_bitmap will
* be set and we will write only first BRT_BLOCKSIZE out of 128kB.
*/
ulong_t *bv_bitmap;
uint64_t bv_nblocks;
} brt_vdev_t;
/*
* In-core brt
*/
typedef struct brt {
krwlock_t brt_lock;
spa_t *brt_spa;
#define brt_mos brt_spa->spa_meta_objset
uint64_t brt_rangesize;
uint64_t brt_usedspace;
uint64_t brt_savedspace;
avl_tree_t brt_pending_tree[TXG_SIZE];
kmutex_t brt_pending_lock[TXG_SIZE];
/* Sum of all entries across all bv_trees. */
uint64_t brt_nentries;
brt_vdev_t *brt_vdevs;
uint64_t brt_nvdevs;
} brt_t;
/* Size of bre_offset / sizeof (uint64_t). */
#define BRT_KEY_WORDS (1)
/*
* In-core brt entry.
* On-disk we use bre_offset as the key and bre_refcount as the value.
*/
typedef struct brt_entry {
uint64_t bre_offset;
uint64_t bre_refcount;
avl_node_t bre_node;
} brt_entry_t;
typedef struct brt_pending_entry {
blkptr_t bpe_bp;
int bpe_count;
avl_node_t bpe_node;
} brt_pending_entry_t;
#ifdef __cplusplus
}
#endif
#endif /* _SYS_BRT_IMPL_H */

View File

@ -1072,8 +1072,7 @@ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, struct blkptr *bps, size_t *nbpsp);
int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps,
boolean_t replay);
uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps);
/*
* Initial setup and final teardown.

View File

@ -14,7 +14,7 @@
.\" Copyright (c) 2017 Lawrence Livermore National Security, LLC.
.\" Copyright (c) 2017 Intel Corporation.
.\"
.Dd June 27, 2023
.Dd November 18, 2023
.Dt ZDB 8
.Os
.
@ -23,7 +23,7 @@
.Nd display ZFS storage pool debugging and consistency information
.Sh SYNOPSIS
.Nm
.Op Fl AbcdDFGhikLMNPsvXYy
.Op Fl AbcdDFGhikLMNPsTvXYy
.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns
.Op Fl I Ar inflight-I/O-ops
.Oo Fl o Ar var Ns = Ns Ar value Oc Ns
@ -403,6 +403,13 @@ Display operation counts, bandwidth, and error counts of I/O to the pool from
Simulate the effects of deduplication, constructing a DDT and then display
that DDT as with
.Fl DD .
.It Fl T , -brt-stats
Display block reference table (BRT) statistics, including the size of uniques
blocks cloned, the space saving as a result of cloning, and the saving ratio.
.It Fl TT
Display the per-vdev BRT statistics, including total references.
.It Fl TTT
Dump the contents of the block reference tables.
.It Fl u , -uberblock
Display the current uberblock.
.El

View File

@ -120,7 +120,7 @@ vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp)
int
mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
char *fspec, int fsflags)
char *fspec, int fsflags, vfs_t *parent_vfsp)
{
struct vfsconf *vfsp;
struct mount *mp;
@ -220,6 +220,13 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
mp->mnt_opt = mp->mnt_optnew;
(void) VFS_STATFS(mp, &mp->mnt_stat);
#ifdef VFS_SUPPORTS_EXJAIL_CLONE
/*
* Clone the mnt_exjail credentials of the parent, as required.
*/
vfs_exjail_clone(parent_vfsp, mp);
#endif
/*
* Prevent external consumers of mount options from reading
* mnt_optnew.

View File

@ -32,11 +32,7 @@ __FBSDID("$FreeBSD$");
#include <sys/kmem.h>
#include <sys/kmem_cache.h>
#include <sys/zmod.h>
#if __FreeBSD_version >= 1300041
#include <contrib/zlib/zlib.h>
#else
#include <sys/zlib.h>
#endif
#include <sys/kobj.h>
@ -90,11 +86,7 @@ zlib_inflateInit(z_stream *stream)
static int
zlib_inflate(z_stream *stream, int finish)
{
#if __FreeBSD_version >= 1300024
return (inflate(stream, finish));
#else
return (_zlib104_inflate(stream, finish));
#endif
}

View File

@ -46,6 +46,7 @@ knlist_sx_xunlock(void *arg)
sx_xunlock((struct sx *)arg);
}
#if __FreeBSD_version >= 1300128
static void
knlist_sx_assert_lock(void *arg, int what)
{
@ -55,11 +56,28 @@ knlist_sx_assert_lock(void *arg, int what)
else
sx_assert((struct sx *)arg, SX_UNLOCKED);
}
#else
static void
knlist_sx_assert_locked(void *arg)
{
sx_assert((struct sx *)arg, SX_LOCKED);
}
static void
knlist_sx_assert_unlocked(void *arg)
{
sx_assert((struct sx *)arg, SX_UNLOCKED);
}
#endif
void
knlist_init_sx(struct knlist *knl, struct sx *lock)
{
#if __FreeBSD_version >= 1300128
knlist_init(knl, lock, knlist_sx_xlock, knlist_sx_xunlock,
knlist_sx_assert_lock);
#else
knlist_init(knl, lock, knlist_sx_xlock, knlist_sx_xunlock,
knlist_sx_assert_locked, knlist_sx_assert_unlocked);
#endif
}

View File

@ -1026,7 +1026,8 @@ zfsctl_snapdir_lookup(struct vop_lookup_args *ap)
"%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
dvp->v_vfsp->mnt_stat.f_mntonname, name);
err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0,
dvp->v_vfsp);
kmem_free(mountpoint, mountpoint_len);
if (err == 0) {
/*

View File

@ -6213,6 +6213,7 @@ zfs_deallocate(struct vop_deallocate_args *ap)
}
#endif
#if __FreeBSD_version >= 1300039
#ifndef _SYS_SYSPROTO_H_
struct vop_copy_file_range_args {
struct vnode *a_invp;
@ -6319,6 +6320,7 @@ bad_write_fallback:
ap->a_incred, ap->a_outcred, ap->a_fsizetd);
return (error);
}
#endif
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;
@ -6383,7 +6385,9 @@ struct vop_vector zfs_vnodeops = {
#if __FreeBSD_version >= 1400043
.vop_add_writecount = vop_stdadd_writecount_nomsync,
#endif
#if __FreeBSD_version >= 1300039
.vop_copy_file_range = zfs_freebsd_copy_file_range,
#endif
};
VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);

View File

@ -28,6 +28,7 @@
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/brt.h>
#include <sys/brt_impl.h>
#include <sys/ddt.h>
#include <sys/bitmap.h>
#include <sys/zap.h>
@ -234,178 +235,15 @@
* destination dataset is mounted and its ZIL replayed.
* To address this situation we leverage zil_claim() mechanism where ZFS will
* parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
* entries, we will bump reference counters for their BPs in the BRT and then
* on mount and ZIL replay we will just attach BPs to the file without
* bumping reference counters.
* Note it is still possible that after zil_claim() we never mount the
* destination, so we never replay its ZIL and we destroy it. This way we would
* end up with leaked references in BRT. We address that too as ZFS gives us
* a chance to clean this up on dataset destroy (see zil_free_clone_range()).
* entries, we will bump reference counters for their BPs in the BRT. Then
* on mount and ZIL replay we bump the reference counters once more, while the
* first references are dropped during ZIL destroy by zil_free_clone_range().
* It is possible that after zil_claim() we never mount the destination, so
* we never replay its ZIL and just destroy it. In this case the only taken
* references will be dropped by zil_free_clone_range(), since the cloning is
* not going to ever take place.
*/
/*
* BRT - Block Reference Table.
*/
#define BRT_OBJECT_VDEV_PREFIX "com.fudosecurity:brt:vdev:"
/*
* We divide each VDEV into 16MB chunks. Each chunk is represented in memory
* by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
* Each element in this array represents how many BRT entries do we have in this
* chunk of storage. We always load this entire array into memory and update as
* needed. By having it in memory we can quickly tell (during zio_free()) if
* there are any BRT entries that we might need to update.
*
* This value cannot be larger than 16MB, at least as long as we support
* 512 byte block sizes. With 512 byte block size we can have exactly
* 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
* many for a 16bit counter.
*/
#define BRT_RANGESIZE (16 * 1024 * 1024)
_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
"BRT_RANGESIZE is too large.");
/*
* We don't want to update the whole structure every time. Maintain bitmap
* of dirty blocks within the regions, so that a single bit represents a
* block size of entcounts. For example if we have a 1PB vdev then all
* entcounts take 128MB of memory ((64TB / 16MB) * 2B). We can divide this
* 128MB array of entcounts into 32kB disk blocks, as we don't want to update
* the whole 128MB on disk when we have updated only a single entcount.
* We maintain a bitmap where each 32kB disk block within 128MB entcounts array
* is represented by a single bit. This gives us 4096 bits. A set bit in the
* bitmap means that we had a change in at least one of the 16384 entcounts
* that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
*/
#define BRT_BLOCKSIZE (32 * 1024)
#define BRT_RANGESIZE_TO_NBLOCKS(size) \
(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
#define BRT_LITTLE_ENDIAN 0
#define BRT_BIG_ENDIAN 1
#ifdef _ZFS_LITTLE_ENDIAN
#define BRT_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN
#define BRT_NON_NATIVE_BYTEORDER BRT_BIG_ENDIAN
#else
#define BRT_NATIVE_BYTEORDER BRT_BIG_ENDIAN
#define BRT_NON_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN
#endif
typedef struct brt_vdev_phys {
uint64_t bvp_mos_entries;
uint64_t bvp_size;
uint64_t bvp_byteorder;
uint64_t bvp_totalcount;
uint64_t bvp_rangesize;
uint64_t bvp_usedspace;
uint64_t bvp_savedspace;
} brt_vdev_phys_t;
typedef struct brt_vdev {
/*
* VDEV id.
*/
uint64_t bv_vdevid;
/*
* Is the structure initiated?
* (bv_entcount and bv_bitmap are allocated?)
*/
boolean_t bv_initiated;
/*
* Object number in the MOS for the entcount array and brt_vdev_phys.
*/
uint64_t bv_mos_brtvdev;
/*
* Object number in the MOS for the entries table.
*/
uint64_t bv_mos_entries;
/*
* Entries to sync.
*/
avl_tree_t bv_tree;
/*
* Does the bv_entcount[] array needs byte swapping?
*/
boolean_t bv_need_byteswap;
/*
* Number of entries in the bv_entcount[] array.
*/
uint64_t bv_size;
/*
* This is the array with BRT entry count per BRT_RANGESIZE.
*/
uint16_t *bv_entcount;
/*
* Sum of all bv_entcount[]s.
*/
uint64_t bv_totalcount;
/*
* Space on disk occupied by cloned blocks (without compression).
*/
uint64_t bv_usedspace;
/*
* How much additional space would be occupied without block cloning.
*/
uint64_t bv_savedspace;
/*
* brt_vdev_phys needs updating on disk.
*/
boolean_t bv_meta_dirty;
/*
* bv_entcount[] needs updating on disk.
*/
boolean_t bv_entcount_dirty;
/*
* bv_entcount[] potentially can be a bit too big to sychronize it all
* when we just changed few entcounts. The fields below allow us to
* track updates to bv_entcount[] array since the last sync.
* A single bit in the bv_bitmap represents as many entcounts as can
* fit into a single BRT_BLOCKSIZE.
* For example we have 65536 entcounts in the bv_entcount array
* (so the whole array is 128kB). We updated bv_entcount[2] and
* bv_entcount[5]. In that case only first bit in the bv_bitmap will
* be set and we will write only first BRT_BLOCKSIZE out of 128kB.
*/
ulong_t *bv_bitmap;
uint64_t bv_nblocks;
} brt_vdev_t;
/*
* In-core brt
*/
typedef struct brt {
krwlock_t brt_lock;
spa_t *brt_spa;
#define brt_mos brt_spa->spa_meta_objset
uint64_t brt_rangesize;
uint64_t brt_usedspace;
uint64_t brt_savedspace;
avl_tree_t brt_pending_tree[TXG_SIZE];
kmutex_t brt_pending_lock[TXG_SIZE];
/* Sum of all entries across all bv_trees. */
uint64_t brt_nentries;
brt_vdev_t *brt_vdevs;
uint64_t brt_nvdevs;
} brt_t;
/* Size of bre_offset / sizeof (uint64_t). */
#define BRT_KEY_WORDS (1)
/*
* In-core brt entry.
* On-disk we use bre_offset as the key and bre_refcount as the value.
*/
typedef struct brt_entry {
uint64_t bre_offset;
uint64_t bre_refcount;
avl_node_t bre_node;
} brt_entry_t;
typedef struct brt_pending_entry {
blkptr_t bpe_bp;
int bpe_count;
avl_node_t bpe_node;
} brt_pending_entry_t;
static kmem_cache_t *brt_entry_cache;
static kmem_cache_t *brt_pending_entry_cache;

View File

@ -2700,15 +2700,23 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
* writes and clones into this block.
*/
mutex_enter(&db->db_mtx);
DBUF_VERIFY(db);
VERIFY(!dbuf_undirty(db, tx));
ASSERT3P(dbuf_find_dirty_eq(db, tx->tx_txg), ==, NULL);
if (db->db_buf != NULL) {
arc_buf_destroy(db->db_buf, db);
db->db_buf = NULL;
dbuf_clear_data(db);
}
db->db_state = DB_NOFILL;
DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone");
DBUF_VERIFY(db);
mutex_exit(&db->db_mtx);
dmu_buf_will_not_fill(db_fake, tx);
dbuf_noread(db);
(void) dbuf_dirty(db, tx);
}
void

View File

@ -2267,7 +2267,7 @@ out:
int
dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay)
dmu_tx_t *tx, const blkptr_t *bps, size_t nbps)
{
spa_t *spa;
dmu_buf_t **dbp, *dbuf;
@ -2341,10 +2341,8 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
* When data in embedded into BP there is no need to create
* BRT entry as there is no data block. Just copy the BP as
* it contains the data.
* Also, when replaying ZIL we don't want to bump references
* in the BRT as it was already done during ZIL claim.
*/
if (!replay && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
brt_pending_add(spa, bp, tx);
}
}

View File

@ -1764,7 +1764,14 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots)
}
/*
* Checks if the dnode contains any uncommitted dirty records.
* Checks if the dnode itself is dirty, or is carrying any uncommitted records.
* It is important to check both conditions, as some operations (eg appending
* to a file) can dirty both as a single logical unit, but they are not synced
* out atomically, so checking one and not the other can result in an object
* appearing to be clean mid-way through a commit.
*
* Do not change this lightly! If you get it wrong, dmu_offset_next() can
* detect a hole where there is really data, leading to silent corruption.
*/
boolean_t
dnode_is_dirty(dnode_t *dn)
@ -1772,7 +1779,8 @@ dnode_is_dirty(dnode_t *dn)
mutex_enter(&dn->dn_mtx);
for (int i = 0; i < TXG_SIZE; i++) {
if (multilist_link_active(&dn->dn_dirty_link[i])) {
if (multilist_link_active(&dn->dn_dirty_link[i]) ||
!list_is_empty(&dn->dn_dirty_records[i])) {
mutex_exit(&dn->dn_mtx);
return (B_TRUE);
}

View File

@ -1333,7 +1333,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
}
error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx,
bps, nbps, B_FALSE);
bps, nbps);
if (error != 0) {
dmu_tx_commit(tx);
break;
@ -1467,7 +1467,7 @@ zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
if (zp->z_blksz < blksz)
zfs_grow_blocksize(zp, blksz, tx);
dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps, B_TRUE);
dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps);
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);

View File

@ -158,22 +158,23 @@ zio_init(void)
zio_link_cache = kmem_cache_create("zio_link_cache",
sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
/*
* For small buffers, we want a cache for each multiple of
* SPA_MINBLOCKSIZE. For larger buffers, we want a cache
* for each quarter-power of 2.
*/
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
size_t align, cflags, data_cflags;
char name[32];
/*
* Create cache for each half-power of 2 size, starting from
* SPA_MINBLOCKSIZE. It should give us memory space efficiency
* of ~7/8, sufficient for transient allocations mostly using
* these caches.
*/
size_t p2 = size;
size_t align = 0;
size_t data_cflags, cflags;
data_cflags = KMC_NODEBUG;
cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
KMC_NODEBUG : 0;
while (!ISP2(p2))
p2 &= p2 - 1;
if (!IS_P2ALIGNED(size, p2 / 2))
continue;
#ifndef _KERNEL
/*
@ -184,37 +185,47 @@ zio_init(void)
*/
if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
continue;
/*
* Here's the problem - on 4K native devices in userland on
* Linux using O_DIRECT, buffers must be 4K aligned or I/O
* will fail with EINVAL, causing zdb (and others) to coredump.
* Since userland probably doesn't need optimized buffer caches,
* we just force 4K alignment on everything.
*/
align = 8 * SPA_MINBLOCKSIZE;
#else
if (size < PAGESIZE) {
align = SPA_MINBLOCKSIZE;
} else if (IS_P2ALIGNED(size, p2 >> 2)) {
align = PAGESIZE;
}
#endif
if (IS_P2ALIGNED(size, PAGESIZE))
align = PAGESIZE;
else
align = 1 << (highbit64(size ^ (size - 1)) - 1);
if (align != 0) {
char name[36];
if (cflags == data_cflags) {
/*
* Resulting kmem caches would be identical.
* Save memory by creating only one.
*/
(void) snprintf(name, sizeof (name),
"zio_buf_comb_%lu", (ulong_t)size);
zio_buf_cache[c] = kmem_cache_create(name,
size, align, NULL, NULL, NULL, NULL, NULL,
cflags);
zio_data_buf_cache[c] = zio_buf_cache[c];
continue;
}
(void) snprintf(name, sizeof (name), "zio_buf_%lu",
(ulong_t)size);
zio_buf_cache[c] = kmem_cache_create(name, size,
align, NULL, NULL, NULL, NULL, NULL, cflags);
cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
KMC_NODEBUG : 0;
data_cflags = KMC_NODEBUG;
if (cflags == data_cflags) {
/*
* Resulting kmem caches would be identical.
* Save memory by creating only one.
*/
(void) snprintf(name, sizeof (name),
"zio_buf_comb_%lu", (ulong_t)size);
zio_buf_cache[c] = kmem_cache_create(name, size, align,
NULL, NULL, NULL, NULL, NULL, cflags);
zio_data_buf_cache[c] = zio_buf_cache[c];
continue;
(void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
(ulong_t)size);
zio_data_buf_cache[c] = kmem_cache_create(name, size,
align, NULL, NULL, NULL, NULL, NULL, data_cflags);
}
(void) snprintf(name, sizeof (name), "zio_buf_%lu",
(ulong_t)size);
zio_buf_cache[c] = kmem_cache_create(name, size, align,
NULL, NULL, NULL, NULL, NULL, cflags);
(void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
(ulong_t)size);
zio_data_buf_cache[c] = kmem_cache_create(name, size, align,
NULL, NULL, NULL, NULL, NULL, data_cflags);
}
while (--c != 0) {

View File

@ -103,6 +103,7 @@ if [ -d ${dkms_root}/%{module} ]; then
fi
fi
done
cd ${dkms_root}
fi
# Uninstall this version of zfs dkms modules before installation of the package.

View File

@ -58,7 +58,7 @@ set -A args "create" "add" "destroy" "import fakepool" \
"setvprop" "blah blah" "-%" "--?" "-*" "-=" \
"-a" "-f" "-g" "-j" "-n" "-o" "-p" "-p /tmp" \
"-t" "-w" "-z" "-E" "-H" "-I" "-J" \
"-Q" "-R" "-T" "-W"
"-Q" "-R" "-W"
log_assert "Execute zdb using invalid parameters."

View File

@ -123,7 +123,10 @@ if not httpd:
with open('$HTTPS_PORT_FILE', 'w') as portf:
print(port, file=portf)
httpd.socket = ssl.wrap_socket(httpd.socket, server_side=True, keyfile='/$TESTPOOL/snakeoil.key', certfile='$SSL_CA_CERT_FILE', ssl_version=ssl.PROTOCOL_TLS)
sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
sslctx.check_hostname = False
sslctx.load_cert_chain(certfile='$SSL_CA_CERT_FILE', keyfile='/$TESTPOOL/snakeoil.key')
httpd.socket = httpd.socket = sslctx.wrap_socket(httpd.socket, server_side=True)
os.chdir('$STF_SUITE/tests/functional/cli_root/zfs_load-key')