Merge pull request #207 from truenas/truenas/zfs-2.2.3-staging-2

Sync with upstream zfs-2.2.3-staging for Dragonfish BETA.1
Alexander Motin 2024-01-17 11:07:23 -05:00 committed by GitHub
commit fc0a9f0cda
73 changed files with 1913 additions and 649 deletions


@ -168,7 +168,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
(u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
(u_longlong_t)lr->lr_length);
if (txtype == TX_WRITE2 || verbose < 5)
if (txtype == TX_WRITE2 || verbose < 4)
return;
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
@ -178,6 +178,8 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
"will claim" : "won't claim");
print_log_bp(bp, tab_prefix);
if (verbose < 5)
return;
if (BP_IS_HOLE(bp)) {
(void) printf("\t\t\tLSIZE 0x%llx\n",
(u_longlong_t)BP_GET_LSIZE(bp));
@ -202,6 +204,9 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
if (error)
goto out;
} else {
if (verbose < 5)
return;
/* data is stored after the end of the lr_write record */
data = abd_alloc(lr->lr_length, B_FALSE);
abd_copy_from_buf(data, lr + 1, lr->lr_length);
@ -217,6 +222,28 @@ out:
abd_free(data);
}
static void
zil_prt_rec_write_enc(zilog_t *zilog, int txtype, const void *arg)
{
(void) txtype;
const lr_write_t *lr = arg;
const blkptr_t *bp = &lr->lr_blkptr;
int verbose = MAX(dump_opt['d'], dump_opt['i']);
(void) printf("%s(encrypted)\n", tab_prefix);
if (verbose < 4)
return;
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
(void) printf("%shas blkptr, %s\n", tab_prefix,
!BP_IS_HOLE(bp) &&
bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
"will claim" : "won't claim");
print_log_bp(bp, tab_prefix);
}
}
static void
zil_prt_rec_truncate(zilog_t *zilog, int txtype, const void *arg)
{
@ -312,11 +339,34 @@ zil_prt_rec_clone_range(zilog_t *zilog, int txtype, const void *arg)
{
(void) zilog, (void) txtype;
const lr_clone_range_t *lr = arg;
int verbose = MAX(dump_opt['d'], dump_opt['i']);
(void) printf("%sfoid %llu, offset %llx, length %llx, blksize %llx\n",
tab_prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
(u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blksz);
if (verbose < 4)
return;
for (unsigned int i = 0; i < lr->lr_nbps; i++) {
(void) printf("%s[%u/%llu] ", tab_prefix, i + 1,
(u_longlong_t)lr->lr_nbps);
print_log_bp(&lr->lr_bps[i], "");
}
}
static void
zil_prt_rec_clone_range_enc(zilog_t *zilog, int txtype, const void *arg)
{
(void) zilog, (void) txtype;
const lr_clone_range_t *lr = arg;
int verbose = MAX(dump_opt['d'], dump_opt['i']);
(void) printf("%s(encrypted)\n", tab_prefix);
if (verbose < 4)
return;
for (unsigned int i = 0; i < lr->lr_nbps; i++) {
(void) printf("%s[%u/%llu] ", tab_prefix, i + 1,
(u_longlong_t)lr->lr_nbps);
@ -327,6 +377,7 @@ zil_prt_rec_clone_range(zilog_t *zilog, int txtype, const void *arg)
typedef void (*zil_prt_rec_func_t)(zilog_t *, int, const void *);
typedef struct zil_rec_info {
zil_prt_rec_func_t zri_print;
zil_prt_rec_func_t zri_print_enc;
const char *zri_name;
uint64_t zri_count;
} zil_rec_info_t;
@ -341,7 +392,9 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
{.zri_print = zil_prt_rec_remove, .zri_name = "TX_RMDIR "},
{.zri_print = zil_prt_rec_link, .zri_name = "TX_LINK "},
{.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME "},
{.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE "},
{.zri_print = zil_prt_rec_write,
.zri_print_enc = zil_prt_rec_write_enc,
.zri_name = "TX_WRITE "},
{.zri_print = zil_prt_rec_truncate, .zri_name = "TX_TRUNCATE "},
{.zri_print = zil_prt_rec_setattr, .zri_name = "TX_SETATTR "},
{.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_V0 "},
@ -358,6 +411,7 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
{.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_EXCHANGE "},
{.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_WHITEOUT "},
{.zri_print = zil_prt_rec_clone_range,
.zri_print_enc = zil_prt_rec_clone_range_enc,
.zri_name = "TX_CLONE_RANGE "},
};
@ -384,6 +438,8 @@ print_log_record(zilog_t *zilog, const lr_t *lr, void *arg, uint64_t claim_txg)
if (txtype && verbose >= 3) {
if (!zilog->zl_os->os_encrypted) {
zil_rec_info[txtype].zri_print(zilog, txtype, lr);
} else if (zil_rec_info[txtype].zri_print_enc) {
zil_rec_info[txtype].zri_print_enc(zilog, txtype, lr);
} else {
(void) printf("%s(encrypted)\n", tab_prefix);
}


@ -2,12 +2,15 @@ dnl #
dnl # 4.9, current_time() added
dnl # 4.18, return type changed from timespec to timespec64
dnl #
dnl # Note that we don't care about the return type in this check. If we have
dnl # to implement a fallback, we'll know we're <4.9, which was timespec.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_CURRENT_TIME], [
ZFS_LINUX_TEST_SRC([current_time], [
#include <linux/fs.h>
], [
struct inode ip __attribute__ ((unused));
ip.i_atime = current_time(&ip);
(void) current_time(&ip);
])
])
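If this probe fails we know we are on a pre-4.9 kernel, where inode times were struct timespec. A minimal sketch of the kind of fallback that implies, modeled on the existing shim in include/os/linux/kernel/linux/vfs_compat.h (hedged; the real shim may differ):

#if !defined(HAVE_CURRENT_TIME)
/* Pre-4.9 fallback: truncate the current kernel time to the time
 * granularity of the inode's superblock, as current_time() does. */
static inline struct timespec
current_time(struct inode *ip)
{
	return (timespec_trunc(current_kernel_time(),
	    ip->i_sb->s_time_gran));
}
#endif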


@ -1,7 +1,8 @@
dnl #
dnl # Starting from Linux 5.13, flush_dcache_page() becomes an inline
dnl # function and may indirectly reference GPL-only cpu_feature_keys on
dnl # powerpc
dnl # function and may indirectly reference GPL-only symbols:
dnl # on powerpc: cpu_feature_keys
dnl # on riscv: PageHuge (added in 6.2)
dnl #


@ -79,6 +79,12 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_FPU], [
__kernel_fpu_end();
], [], [])
ZFS_LINUX_TEST_SRC([kernel_neon], [
#include <asm/neon.h>
], [
kernel_neon_begin();
kernel_neon_end();
], [], [ZFS_META_LICENSE])
])
AC_DEFUN([ZFS_AC_KERNEL_FPU], [
@ -105,9 +111,20 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [
AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1,
[kernel exports FPU functions])
],[
AC_MSG_RESULT(internal)
AC_DEFINE(HAVE_KERNEL_FPU_INTERNAL, 1,
[kernel fpu internal])
dnl #
dnl # ARM neon symbols (only on arm and arm64)
dnl # could be GPL-only on arm64 after Linux 6.2
dnl #
ZFS_LINUX_TEST_RESULT([kernel_neon_license],[
AC_MSG_RESULT(kernel_neon_*)
AC_DEFINE(HAVE_KERNEL_NEON, 1,
[kernel has kernel_neon_* functions])
],[
# catch-all
AC_MSG_RESULT(internal)
AC_DEFINE(HAVE_KERNEL_FPU_INTERNAL, 1,
[kernel fpu internal])
])
])
])
])


@ -52,6 +52,48 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [
memset(&ip, 0, sizeof(ip));
inode_set_ctime_to_ts(&ip, ts);
])
dnl #
dnl # 6.7 API change
dnl # i_atime/i_mtime no longer directly accessible, must use
dnl # inode_get_mtime(ip), inode_set_mtime*(ip) to
dnl # read/write.
dnl #
ZFS_LINUX_TEST_SRC([inode_get_atime], [
#include <linux/fs.h>
],[
struct inode ip;
memset(&ip, 0, sizeof(ip));
inode_get_atime(&ip);
])
ZFS_LINUX_TEST_SRC([inode_get_mtime], [
#include <linux/fs.h>
],[
struct inode ip;
memset(&ip, 0, sizeof(ip));
inode_get_mtime(&ip);
])
ZFS_LINUX_TEST_SRC([inode_set_atime_to_ts], [
#include <linux/fs.h>
],[
struct inode ip;
struct timespec64 ts = {0};
memset(&ip, 0, sizeof(ip));
inode_set_atime_to_ts(&ip, ts);
])
ZFS_LINUX_TEST_SRC([inode_set_mtime_to_ts], [
#include <linux/fs.h>
],[
struct inode ip;
struct timespec64 ts = {0};
memset(&ip, 0, sizeof(ip));
inode_set_mtime_to_ts(&ip, ts);
])
])
AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [
@ -90,4 +132,40 @@ AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [
],[
AC_MSG_RESULT(no)
])
AC_MSG_CHECKING([whether inode_get_atime() exists])
ZFS_LINUX_TEST_RESULT([inode_get_atime], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_INODE_GET_ATIME, 1,
[inode_get_atime() exists in linux/fs.h])
],[
AC_MSG_RESULT(no)
])
AC_MSG_CHECKING([whether inode_set_atime_to_ts() exists])
ZFS_LINUX_TEST_RESULT([inode_set_atime_to_ts], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_INODE_SET_ATIME_TO_TS, 1,
[inode_set_atime_to_ts() exists in linux/fs.h])
],[
AC_MSG_RESULT(no)
])
AC_MSG_CHECKING([whether inode_get_mtime() exists])
ZFS_LINUX_TEST_RESULT([inode_get_mtime], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_INODE_GET_MTIME, 1,
[inode_get_mtime() exists in linux/fs.h])
],[
AC_MSG_RESULT(no)
])
AC_MSG_CHECKING([whether inode_set_mtime_to_ts() exists])
ZFS_LINUX_TEST_RESULT([inode_set_mtime_to_ts], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_INODE_SET_MTIME_TO_TS, 1,
[inode_set_mtime_to_ts() exists in linux/fs.h])
],[
AC_MSG_RESULT(no)
])
])


@ -19,12 +19,44 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK], [
],[])
])
dnl #
dnl # 6.7 API change
dnl # s_shrink is now a pointer.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK_PTR], [
ZFS_LINUX_TEST_SRC([super_block_s_shrink_ptr], [
#include <linux/fs.h>
unsigned long shrinker_cb(struct shrinker *shrink,
struct shrink_control *sc) { return 0; }
static struct shrinker shrinker = {
.count_objects = shrinker_cb,
.scan_objects = shrinker_cb,
.seeks = DEFAULT_SEEKS,
};
static const struct super_block
sb __attribute__ ((unused)) = {
.s_shrink = &shrinker,
};
],[])
])
AC_DEFUN([ZFS_AC_KERNEL_SUPER_BLOCK_S_SHRINK], [
AC_MSG_CHECKING([whether super_block has s_shrink])
ZFS_LINUX_TEST_RESULT([super_block_s_shrink], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_SUPER_BLOCK_S_SHRINK, 1,
[have super_block s_shrink])
],[
ZFS_LINUX_TEST_ERROR([sb->s_shrink()])
AC_MSG_RESULT(no)
AC_MSG_CHECKING([whether super_block has s_shrink pointer])
ZFS_LINUX_TEST_RESULT([super_block_s_shrink_ptr], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_SUPER_BLOCK_S_SHRINK_PTR, 1,
[have super_block s_shrink pointer])
],[
AC_MSG_RESULT(no)
ZFS_LINUX_TEST_ERROR([sb->s_shrink()])
])
])
])
@ -96,6 +128,25 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK], [
])
])
dnl #
dnl # 6.7 API change
dnl # register_shrinker has been replaced by shrinker_register.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER_REGISTER], [
ZFS_LINUX_TEST_SRC([shrinker_register], [
#include <linux/shrinker.h>
unsigned long shrinker_cb(struct shrinker *shrink,
struct shrink_control *sc) { return 0; }
],[
struct shrinker cache_shrinker = {
.count_objects = shrinker_cb,
.scan_objects = shrinker_cb,
.seeks = DEFAULT_SEEKS,
};
shrinker_register(&cache_shrinker);
])
])
AC_DEFUN([ZFS_AC_KERNEL_SHRINKER_CALLBACK],[
dnl #
dnl # 6.0 API change
@ -133,14 +184,36 @@ AC_DEFUN([ZFS_AC_KERNEL_SHRINKER_CALLBACK],[
dnl # cs->shrink() is logically split in to
dnl # cs->count_objects() and cs->scan_objects()
dnl #
AC_MSG_CHECKING([if cs->count_objects callback exists])
AC_MSG_CHECKING(
[whether cs->count_objects callback exists])
ZFS_LINUX_TEST_RESULT(
[shrinker_cb_shrink_control_split],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, 1,
[cs->count_objects exists])
[shrinker_cb_shrink_control_split],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, 1,
[cs->count_objects exists])
],[
AC_MSG_RESULT(no)
AC_MSG_CHECKING(
[whether shrinker_register exists])
ZFS_LINUX_TEST_RESULT([shrinker_register], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_SHRINKER_REGISTER, 1,
[shrinker_register exists])
dnl # We assume that the split shrinker
dnl # callback exists if
dnl # shrinker_register() exists,
dnl # because the latter is a much more
dnl # recent addition, and the macro
dnl # test for shrinker_register() only
dnl # works if the callback is split
AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK,
1, [cs->count_objects exists])
],[
AC_MSG_RESULT(no)
ZFS_LINUX_TEST_ERROR([shrinker])
])
])
])
])
@ -174,10 +247,12 @@ AC_DEFUN([ZFS_AC_KERNEL_SHRINK_CONTROL_STRUCT], [
AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER], [
ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK
ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK_PTR
ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_HAS_NID
ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK
ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_STRUCT
ZFS_AC_KERNEL_SRC_REGISTER_SHRINKER_VARARG
ZFS_AC_KERNEL_SRC_SHRINKER_REGISTER
])
AC_DEFUN([ZFS_AC_KERNEL_SHRINKER], [


@ -168,6 +168,9 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE
;;
riscv*)
ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE
;;
esac
AC_MSG_CHECKING([for available kernel interfaces])
@ -310,6 +313,9 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_CPU_HAS_FEATURE
ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE
;;
riscv*)
ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE
;;
esac
])


@ -91,6 +91,12 @@
#define param_set_max_auto_ashift_args(var) \
CTLTYPE_UINT, NULL, 0, param_set_max_auto_ashift, "IU"
#define spa_taskq_read_param_set_args(var) \
CTLTYPE_STRING, NULL, 0, spa_taskq_read_param, "A"
#define spa_taskq_write_param_set_args(var) \
CTLTYPE_STRING, NULL, 0, spa_taskq_write_param, "A"
#define fletcher_4_param_set_args(var) \
CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A"


@ -62,7 +62,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off)
}
static inline void
zfs_uio_advance(zfs_uio_t *uio, size_t size)
zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
{
zfs_uio_resid(uio) -= size;
zfs_uio_offset(uio) += size;


@ -42,8 +42,8 @@
/*
* Starting from Linux 5.13, flush_dcache_page() becomes an inline function
* and under some configurations, may indirectly reference GPL-only
* cpu_feature_keys on powerpc. Override this function when it is detected
* to be GPL-only.
* symbols, e.g., cpu_feature_keys on powerpc and PageHuge on riscv.
* Override this function when it is detected to be GPL-only.
*/
#if defined __powerpc__ && defined HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY
#include <linux/simd_powerpc.h>
@ -53,6 +53,17 @@
clear_bit(PG_dcache_clean, &(page)->flags); \
} while (0)
#endif
/*
* For the riscv implementation, the use of PageHuge can be safely removed,
* because it only matters for pages allocated by HugeTLB, while
* flush_dcache_page in the zfs module is only called on kernel pages.
*/
#if defined __riscv && defined HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY
#define flush_dcache_page(page) do { \
if (test_bit(PG_dcache_clean, &(page)->flags)) \
clear_bit(PG_dcache_clean, &(page)->flags); \
} while (0)
#endif
/*
* 2.6.30 API change,


@ -71,9 +71,15 @@
#define ID_AA64PFR0_EL1 sys_reg(3, 0, 0, 1, 0)
#define ID_AA64ISAR0_EL1 sys_reg(3, 0, 0, 6, 0)
#if (defined(HAVE_KERNEL_NEON) && defined(CONFIG_KERNEL_MODE_NEON))
#define kfpu_allowed() 1
#define kfpu_begin() kernel_neon_begin()
#define kfpu_end() kernel_neon_end()
#else
#define kfpu_allowed() 0
#define kfpu_begin() do {} while (0)
#define kfpu_end() do {} while (0)
#endif
#define kfpu_init() (0)
#define kfpu_fini() do {} while (0)


@ -53,9 +53,15 @@
#include <asm/elf.h>
#include <asm/hwcap.h>
#if (defined(HAVE_KERNEL_NEON) && defined(CONFIG_KERNEL_MODE_NEON))
#define kfpu_allowed() 1
#define kfpu_begin() kernel_neon_begin()
#define kfpu_end() kernel_neon_end()
#else
#define kfpu_allowed() 0
#define kfpu_begin() do {} while (0)
#define kfpu_end() do {} while (0)
#endif
#define kfpu_init() (0)
#define kfpu_fini() do {} while (0)
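These macros let callers gate SIMD use at run time. A minimal sketch of the consuming pattern, with hypothetical stand-ins for the real implementation-selection code:

/* Enter kernel-mode NEON only when permitted; otherwise fall back
 * to a scalar implementation. */
if (kfpu_allowed()) {
	kfpu_begin();
	fletcher_4_neon(buf, len);	/* hypothetical SIMD path */
	kfpu_end();
} else {
	fletcher_4_scalar(buf, len);	/* hypothetical scalar fallback */
}

With HAVE_KERNEL_NEON undefined (e.g. GPL-only kernel_neon_* on arm64 after Linux 6.2), kfpu_allowed() is 0 and the scalar path is taken, which is exactly what the new configure check above arranges.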


@ -70,8 +70,6 @@ typedef enum kmem_cbrc {
#define KMC_REAP_CHUNK INT_MAX
#define KMC_DEFAULT_SEEKS 1
#define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */
extern struct list_head spl_kmem_cache_list;
extern struct rw_semaphore spl_kmem_cache_sem;


@ -29,12 +29,13 @@
/*
* Due to frequent changes in the shrinker API the following
* compatibility wrappers should be used. They are as follows:
* compatibility wrapper should be used.
*
* SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost);
* shrinker = spl_register_shrinker(name, countfunc, scanfunc, seek_cost);
* spl_unregister_shrinker(shrinker);
*
* SPL_SHRINKER_DECLARE is used to declare a shrinker with the name varname,
* which is passed to spl_register_shrinker()/spl_unregister_shrinker().
* spl_register_shrinker is used to create and register a shrinker with the
* given name.
* The countfunc returns the number of free-able objects.
* The scanfunc returns the number of objects that were freed.
* The callbacks can return SHRINK_STOP if further calls can't make any more
@ -57,57 +58,28 @@
* ...scan objects in the cache and reclaim them...
* }
*
* SPL_SHRINKER_DECLARE(my_shrinker, my_count, my_scan, DEFAULT_SEEKS);
* static struct shrinker *my_shrinker;
*
* void my_init_func(void) {
* spl_register_shrinker(&my_shrinker);
* my_shrinker = spl_register_shrinker("my-shrinker",
* my_count, my_scan, DEFAULT_SEEKS);
* }
*
* void my_fini_func(void) {
* spl_unregister_shrinker(my_shrinker);
* }
*/
#ifdef HAVE_REGISTER_SHRINKER_VARARG
#define spl_register_shrinker(x) register_shrinker(x, "zfs-arc-shrinker")
#else
#define spl_register_shrinker(x) register_shrinker(x)
#endif
#define spl_unregister_shrinker(x) unregister_shrinker(x)
typedef unsigned long (*spl_shrinker_cb)
(struct shrinker *, struct shrink_control *);
/*
* Linux 3.0 to 3.11 Shrinker API Compatibility.
*/
#if defined(HAVE_SINGLE_SHRINKER_CALLBACK)
#define SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost) \
static int \
__ ## varname ## _wrapper(struct shrinker *shrink, struct shrink_control *sc)\
{ \
if (sc->nr_to_scan != 0) { \
(void) scanfunc(shrink, sc); \
} \
return (countfunc(shrink, sc)); \
} \
\
static struct shrinker varname = { \
.shrink = __ ## varname ## _wrapper, \
.seeks = seek_cost, \
}
struct shrinker *spl_register_shrinker(const char *name,
spl_shrinker_cb countfunc, spl_shrinker_cb scanfunc, int seek_cost);
void spl_unregister_shrinker(struct shrinker *);
#ifndef SHRINK_STOP
/* 3.0-3.11 compatibility */
#define SHRINK_STOP (-1)
/*
* Linux 3.12 and later Shrinker API Compatibility.
*/
#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
#define SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost) \
static struct shrinker varname = { \
.count_objects = countfunc, \
.scan_objects = scanfunc, \
.seeks = seek_cost, \
}
#else
/*
* Linux 2.x to 2.6.22, or a newer shrinker API has been introduced.
*/
#error "Unknown shrinker callback"
#endif
#endif /* SPL_SHRINKER_H */


@ -95,7 +95,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off)
}
static inline void
zfs_uio_advance(zfs_uio_t *uio, size_t size)
zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
{
uio->uio_resid -= size;
uio->uio_loffset += size;


@ -282,5 +282,25 @@ extern long zpl_ioctl_fideduperange(struct file *filp, void *arg);
#else
#define zpl_inode_set_ctime_to_ts(ip, ts) (ip->i_ctime = ts)
#endif
#ifdef HAVE_INODE_GET_ATIME
#define zpl_inode_get_atime(ip) inode_get_atime(ip)
#else
#define zpl_inode_get_atime(ip) (ip->i_atime)
#endif
#ifdef HAVE_INODE_SET_ATIME_TO_TS
#define zpl_inode_set_atime_to_ts(ip, ts) inode_set_atime_to_ts(ip, ts)
#else
#define zpl_inode_set_atime_to_ts(ip, ts) (ip->i_atime = ts)
#endif
#ifdef HAVE_INODE_GET_MTIME
#define zpl_inode_get_mtime(ip) inode_get_mtime(ip)
#else
#define zpl_inode_get_mtime(ip) (ip->i_mtime)
#endif
#ifdef HAVE_INODE_SET_MTIME_TO_TS
#define zpl_inode_set_mtime_to_ts(ip, ts) inode_set_mtime_to_ts(ip, ts)
#else
#define zpl_inode_set_mtime_to_ts(ip, ts) (ip->i_mtime = ts)
#endif
#endif /* _SYS_ZPL_H */
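Together these wrappers let common code read and write inode times without caring whether the kernel predates the 6.7 accessor API. A minimal usage sketch (the helper is hypothetical; inode_timespec_t and ZFS_TIME_ENCODE are existing ZFS definitions):

/* Hypothetical helper: encode an inode's mtime into the on-disk
 * uint64_t pair, portable across pre- and post-6.7 kernels. */
static inline void
example_encode_mtime(struct inode *ip, uint64_t mtime[2])
{
	inode_timespec_t ts = zpl_inode_get_mtime(ip);
	ZFS_TIME_ENCODE(&ts, mtime);
}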


@ -71,6 +71,7 @@ typedef struct dataset_kstats {
int dataset_kstats_create(dataset_kstats_t *, objset_t *);
void dataset_kstats_destroy(dataset_kstats_t *);
void dataset_kstats_rename(dataset_kstats_t *dk, const char *);
void dataset_kstats_update_write_kstats(dataset_kstats_t *, int64_t);
void dataset_kstats_update_read_kstats(dataset_kstats_t *, int64_t);


@ -379,8 +379,8 @@ dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail);
boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,


@ -206,6 +206,7 @@ void dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin,
dmu_tx_t *tx);
int dmu_objset_create_crypt_check(dsl_dir_t *parentdd,
dsl_crypto_params_t *dcp, boolean_t *will_encrypt);
boolean_t dmu_objset_crypto_key_equal(objset_t *osa, objset_t *osb);
void dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd,
struct dsl_dataset *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx);
uint64_t dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey,


@ -181,7 +181,7 @@ typedef struct zil_vdev_node {
avl_node_t zv_node; /* AVL tree linkage */
} zil_vdev_node_t;
#define ZIL_PREV_BLKS 16
#define ZIL_BURSTS 8
/*
* Stable storage intent log management structure. One per dataset.
@ -216,14 +216,18 @@ struct zilog {
uint64_t zl_parse_lr_count; /* number of log records parsed */
itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
list_t zl_itx_commit_list; /* itx list to be committed */
uint64_t zl_cur_used; /* current commit log size used */
uint64_t zl_cur_size; /* current burst full size */
uint64_t zl_cur_left; /* current burst remaining size */
uint64_t zl_cur_max; /* biggest record in current burst */
list_t zl_lwb_list; /* in-flight log write list */
avl_tree_t zl_bp_tree; /* track bps during log parse */
clock_t zl_replay_time; /* lbolt of when replay started */
uint64_t zl_replay_blks; /* number of log blocks replayed */
zil_header_t zl_old_header; /* debugging aid */
uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
uint_t zl_parallel; /* workload is multi-threaded */
uint_t zl_prev_rotor; /* rotor for zl_prev[] */
uint_t zl_prev_opt[ZIL_BURSTS]; /* optimal block size */
uint_t zl_prev_min[ZIL_BURSTS]; /* minimal first block size */
txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */
uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */


@ -64,6 +64,9 @@ libspl_assert(const char *buf, const char *file, const char *func, int line)
#undef verify
#endif
#define PANIC(fmt, a...) \
libspl_assertf(__FILE__, __FUNCTION__, __LINE__, fmt, ## a)
#define VERIFY(cond) \
(void) ((!(cond)) && \
libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__))


@ -90,7 +90,7 @@ zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len)
}
static inline void
zfs_uio_advance(zfs_uio_t *uio, size_t size)
zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
{
uio->uio_resid -= size;
uio->uio_loffset += size;


@ -31,14 +31,6 @@ for use by the kmem caches.
For the majority of systems and workloads only a small number of threads are
required.
.
.It Sy spl_kmem_cache_reclaim Ns = Ns Sy 0 Pq uint
When this is set it prevents Linux from being able to rapidly reclaim all the
memory held by the kmem caches.
This may be useful in circumstances where it's preferable that Linux
reclaim memory from some other subsystem first.
Setting this will increase the likelihood of out-of-memory events on a
memory-constrained system.
.
.It Sy spl_kmem_cache_obj_per_slab Ns = Ns Sy 8 Pq uint
The preferred number of objects per slab in the cache.
In general, a larger value will increase the caches memory footprint


@ -798,7 +798,7 @@ Note that this should not be set below the ZED thresholds
(currently 10 checksums over 10 seconds)
or else the daemon may not trigger any action.
.
.It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq uint
.It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint
This controls the amount of time that a ZIL block (lwb) will remain "open"
when it isn't "full", and it has a thread waiting for it to be committed to
stable storage.
@ -2160,13 +2160,6 @@ This sets the maximum number of write bytes logged via WR_COPIED.
It tunes a tradeoff between additional memory copy and possibly worse log
space efficiency vs additional range lock/unlock.
.
.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64
This sets the minimum delay in nanoseconds that the ZIL will wait before
committing a block, in the hope of accumulating more records.
If ZIL writes are too fast, the kernel may not be able to sleep for such a
short interval, increasing log latency above what
.Sy zfs_commit_timeout_pct
allows.
.
.It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int
Disable the cache flush commands that are normally sent to disk by
the ZIL after an LWB write has completed.
@ -2280,6 +2273,16 @@ If
.Sy 0 ,
generate a system-dependent value close to 6 threads per taskq.
.
.It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp
Set the queue and thread configuration for the IO read queues.
This is an advanced debugging parameter.
Don't change this unless you understand what it does.
.
.It Sy zio_taskq_write Ns = Ns Sy batch fixed,1,5 scale fixed,1,5 Pq charp
Set the queue and thread configuration for the IO write queues.
This is an advanced debugging parameter.
Don't change this unless you understand what it does.
.
.It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
Do not create zvol device nodes.
This may slightly improve startup time on


@ -364,9 +364,12 @@ When this feature is enabled ZFS will use block cloning for operations like
Block cloning allows creating multiple references to a single block.
It is much faster than copying the data (as the actual data is neither read nor
written) and takes no additional space.
Blocks can be cloned across datasets under some conditions (like disabled
encryption and equal
.Nm recordsize ) .
Blocks can be cloned across datasets under some conditions (like equal
.Nm recordsize ,
the same master encryption key, etc.).
ZFS tries its best to clone across datasets, including encrypted ones,
but cloning may still be limited for various (nontrivial) reasons
depending on the OS and/or ZFS internals.
.Pp
This feature becomes
.Sy active


@ -80,6 +80,7 @@ SPL_OBJS := \
spl-kstat.o \
spl-proc.o \
spl-procfs-list.o \
spl-shrinker.o \
spl-taskq.o \
spl-thread.o \
spl-trace.o \


@ -187,19 +187,18 @@ kstat_sysctl_dataset_string(SYSCTL_HANDLER_ARGS)
static int
kstat_sysctl_io(SYSCTL_HANDLER_ARGS)
{
struct sbuf *sb;
struct sbuf sb;
kstat_t *ksp = arg1;
kstat_io_t *kip = ksp->ks_data;
int rc;
sb = sbuf_new_auto();
if (sb == NULL)
return (ENOMEM);
sbuf_new_for_sysctl(&sb, NULL, 0, req);
/* Update the aggsums before reading */
(void) ksp->ks_update(ksp, KSTAT_READ);
/* though wlentime & friends are signed, they will never be negative */
sbuf_printf(sb,
sbuf_printf(&sb,
"%-8llu %-8llu %-8u %-8u %-8llu %-8llu "
"%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n",
kip->nread, kip->nwritten,
@ -207,25 +206,21 @@ kstat_sysctl_io(SYSCTL_HANDLER_ARGS)
kip->wtime, kip->wlentime, kip->wlastupdate,
kip->rtime, kip->rlentime, kip->rlastupdate,
kip->wcnt, kip->rcnt);
rc = sbuf_finish(sb);
if (rc == 0)
rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
sbuf_delete(sb);
rc = sbuf_finish(&sb);
sbuf_delete(&sb);
return (rc);
}
static int
kstat_sysctl_raw(SYSCTL_HANDLER_ARGS)
{
struct sbuf *sb;
struct sbuf sb;
void *data;
kstat_t *ksp = arg1;
void *(*addr_op)(kstat_t *ksp, loff_t index);
int n, has_header, rc = 0;
sb = sbuf_new_auto();
if (sb == NULL)
return (ENOMEM);
sbuf_new_for_sysctl(&sb, NULL, PAGE_SIZE, req);
if (ksp->ks_raw_ops.addr)
addr_op = ksp->ks_raw_ops.addr;
@ -258,8 +253,10 @@ restart_headers:
if (has_header) {
if (rc == ENOMEM && !kstat_resize_raw(ksp))
goto restart_headers;
if (rc == 0)
sbuf_printf(sb, "\n%s", ksp->ks_raw_buf);
if (rc == 0) {
sbuf_cat(&sb, "\n");
sbuf_cat(&sb, ksp->ks_raw_buf);
}
}
while ((data = addr_op(ksp, n)) != NULL) {
@ -270,22 +267,19 @@ restart:
if (rc == ENOMEM && !kstat_resize_raw(ksp))
goto restart;
if (rc == 0)
sbuf_printf(sb, "%s", ksp->ks_raw_buf);
sbuf_cat(&sb, ksp->ks_raw_buf);
} else {
ASSERT3U(ksp->ks_ndata, ==, 1);
sbuf_hexdump(sb, ksp->ks_data,
sbuf_hexdump(&sb, ksp->ks_data,
ksp->ks_data_size, NULL, 0);
}
n++;
}
free(ksp->ks_raw_buf, M_TEMP);
mutex_exit(ksp->ks_lock);
sbuf_trim(sb);
rc = sbuf_finish(sb);
if (rc == 0)
rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
sbuf_delete(sb);
rc = sbuf_finish(&sb);
sbuf_delete(&sb);
return (rc);
}


@ -110,7 +110,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size)
dmu_buf_will_fill(db, tx);
dmu_buf_will_fill(db, tx, B_FALSE);
else
dmu_buf_will_dirty(db, tx);
@ -126,7 +126,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
}
if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
dmu_buf_fill_done(db, tx, B_FALSE);
offset += tocpy;
size -= tocpy;


@ -1251,7 +1251,7 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
iovec_t *dst_iovecs;
zil_chain_t *zilc;
lr_t *lr;
uint64_t txtype, lr_len;
uint64_t txtype, lr_len, nused;
uint_t crypt_len, nr_iovecs, vec;
uint_t aad_len = 0, total_len = 0;
@ -1268,7 +1268,10 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
zilc = (zil_chain_t *)src;
slrp = src + sizeof (zil_chain_t);
aadp = aadbuf;
blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
ASSERT3U(nused, >=, sizeof (zil_chain_t));
ASSERT3U(nused, <=, datalen);
blkend = src + nused;
/*
* Calculate the number of encrypted iovecs we will need.
@ -1287,6 +1290,8 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
txtype = lr->lrc_txtype;
lr_len = lr->lrc_reclen;
}
ASSERT3U(lr_len, >=, sizeof (lr_t));
ASSERT3U(lr_len, <=, blkend - slrp);
nr_iovecs++;
if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))


@ -1333,6 +1333,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
}
}
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
dataset_kstats_rename(&zv->zv_kstat, newname);
}
/*


@ -76,17 +76,6 @@ module_param(spl_kmem_cache_magazine_size, uint, 0444);
MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
"Default magazine size (2-256), set automatically (0)");
/*
* The default behavior is to report the number of objects remaining in the
* cache. This allows the Linux VM to repeatedly reclaim objects from the
* cache when memory is low to satisfy other memory allocations. Alternately,
* setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
* is reclaimed. This may increase the likelihood of out of memory events.
*/
static unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
module_param(spl_kmem_cache_reclaim, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
static unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");


@ -0,0 +1,115 @@
/*
* Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
* Copyright (C) 2007 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Brian Behlendorf <behlendorf1@llnl.gov>.
* UCRL-CODE-235197
*
* This file is part of the SPL, Solaris Porting Layer.
*
* The SPL is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2 of the License, or (at your
* option) any later version.
*
* The SPL is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with the SPL. If not, see <http://www.gnu.org/licenses/>.
*
* Solaris Porting Layer (SPL) Shrinker Implementation.
*/
#include <sys/kmem.h>
#include <sys/shrinker.h>
#ifdef HAVE_SINGLE_SHRINKER_CALLBACK
/* 3.0-3.11: single shrink() callback, which we wrap to carry both functions */
struct spl_shrinker_wrap {
struct shrinker shrinker;
spl_shrinker_cb countfunc;
spl_shrinker_cb scanfunc;
};
static int
spl_shrinker_single_cb(struct shrinker *shrinker, struct shrink_control *sc)
{
struct spl_shrinker_wrap *sw = (struct spl_shrinker_wrap *)shrinker;
if (sc->nr_to_scan != 0)
(void) sw->scanfunc(&sw->shrinker, sc);
return (sw->countfunc(&sw->shrinker, sc));
}
#endif
struct shrinker *
spl_register_shrinker(const char *name, spl_shrinker_cb countfunc,
spl_shrinker_cb scanfunc, int seek_cost)
{
struct shrinker *shrinker;
/* allocate shrinker */
#if defined(HAVE_SHRINKER_REGISTER)
/* 6.7: kernel will allocate the shrinker for us */
shrinker = shrinker_alloc(0, name);
#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
/* 3.12-6.6: we allocate the shrinker */
shrinker = kmem_zalloc(sizeof (struct shrinker), KM_SLEEP);
#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK)
/* 3.0-3.11: allocate a wrapper */
struct spl_shrinker_wrap *sw =
kmem_zalloc(sizeof (struct spl_shrinker_wrap), KM_SLEEP);
shrinker = &sw->shrinker;
#else
/* 2.x-2.6.22, or a newer shrinker API has been introduced. */
#error "Unknown shrinker API"
#endif
if (shrinker == NULL)
return (NULL);
/* set callbacks */
#ifdef HAVE_SINGLE_SHRINKER_CALLBACK
sw->countfunc = countfunc;
sw->scanfunc = scanfunc;
shrinker->shrink = spl_shrinker_single_cb;
#else
shrinker->count_objects = countfunc;
shrinker->scan_objects = scanfunc;
#endif
/* set params */
shrinker->seeks = seek_cost;
/* register with kernel */
#if defined(HAVE_SHRINKER_REGISTER)
shrinker_register(shrinker);
#elif defined(HAVE_REGISTER_SHRINKER_VARARG)
register_shrinker(shrinker, name);
#else
register_shrinker(shrinker);
#endif
return (shrinker);
}
EXPORT_SYMBOL(spl_register_shrinker);
void
spl_unregister_shrinker(struct shrinker *shrinker)
{
#if defined(HAVE_SHRINKER_REGISTER)
shrinker_free(shrinker);
#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
unregister_shrinker(shrinker);
kmem_free(shrinker, sizeof (struct shrinker));
#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK)
unregister_shrinker(shrinker);
kmem_free(shrinker, sizeof (struct spl_shrinker_wrap));
#else
#error "Unknown shrinker API"
#endif
}
EXPORT_SYMBOL(spl_unregister_shrinker);


@ -247,8 +247,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
return (sc->nr_to_scan);
}
SPL_SHRINKER_DECLARE(arc_shrinker,
arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS);
static struct shrinker *arc_shrinker = NULL;
int
arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
@ -351,14 +350,18 @@ arc_lowmem_init(void)
* reclaim from the arc. This is done to prevent kswapd from
* swapping out pages when it is preferable to shrink the arc.
*/
spl_register_shrinker(&arc_shrinker);
arc_shrinker = spl_register_shrinker("zfs-arc-shrinker",
arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS);
VERIFY(arc_shrinker);
arc_set_sys_free(allmem);
}
void
arc_lowmem_fini(void)
{
spl_unregister_shrinker(&arc_shrinker);
spl_unregister_shrinker(arc_shrinker);
arc_shrinker = NULL;
}
int


@ -85,7 +85,7 @@ static blk_mode_t
#else
static fmode_t
#endif
vdev_bdev_mode(spa_mode_t spa_mode)
vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive)
{
#ifdef HAVE_BLK_MODE_T
blk_mode_t mode = 0;
@ -95,6 +95,9 @@ vdev_bdev_mode(spa_mode_t spa_mode)
if (spa_mode & SPA_MODE_WRITE)
mode |= BLK_OPEN_WRITE;
if (exclusive)
mode |= BLK_OPEN_EXCL;
#else
fmode_t mode = 0;
@ -103,6 +106,9 @@ vdev_bdev_mode(spa_mode_t spa_mode)
if (spa_mode & SPA_MODE_WRITE)
mode |= FMODE_WRITE;
if (exclusive)
mode |= FMODE_EXCL;
#endif
return (mode);
@ -225,10 +231,10 @@ vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder,
{
#ifdef HAVE_BLKDEV_GET_BY_PATH_4ARG
return (blkdev_get_by_path(path,
vdev_bdev_mode(mode) | BLK_OPEN_EXCL, holder, hops));
vdev_bdev_mode(mode, B_TRUE), holder, hops));
#else
return (blkdev_get_by_path(path,
vdev_bdev_mode(mode) | FMODE_EXCL, holder));
vdev_bdev_mode(mode, B_TRUE), holder));
#endif
}
@ -238,7 +244,7 @@ vdev_blkdev_put(struct block_device *bdev, spa_mode_t mode, void *holder)
#ifdef HAVE_BLKDEV_PUT_HOLDER
return (blkdev_put(bdev, holder));
#else
return (blkdev_put(bdev, vdev_bdev_mode(mode) | FMODE_EXCL));
return (blkdev_put(bdev, vdev_bdev_mode(mode, B_TRUE)));
#endif
}
@ -248,9 +254,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
{
struct block_device *bdev;
#ifdef HAVE_BLK_MODE_T
blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE);
#else
fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE);
#endif
hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
vdev_disk_t *vd;


@ -520,8 +520,8 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
ip->i_uid = SUID_TO_KUID(0);
ip->i_gid = SGID_TO_KGID(0);
ip->i_blkbits = SPA_MINBLOCKSHIFT;
ip->i_atime = now;
ip->i_mtime = now;
zpl_inode_set_atime_to_ts(ip, now);
zpl_inode_set_mtime_to_ts(ip, now);
zpl_inode_set_ctime_to_ts(ip, now);
ip->i_fop = fops;
ip->i_op = ops;


@ -1258,12 +1258,18 @@ zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
* and inode caches. This can occur when the ARC needs to free meta data
* blocks but can't because they are all pinned by entries in these caches.
*/
#if defined(HAVE_SUPER_BLOCK_S_SHRINK)
#define S_SHRINK(sb) (&(sb)->s_shrink)
#elif defined(HAVE_SUPER_BLOCK_S_SHRINK_PTR)
#define S_SHRINK(sb) ((sb)->s_shrink)
#endif
int
zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
{
zfsvfs_t *zfsvfs = sb->s_fs_info;
int error = 0;
struct shrinker *shrinker = &sb->s_shrink;
struct shrinker *shrinker = S_SHRINK(sb);
struct shrink_control sc = {
.nr_to_scan = nr_to_scan,
.gfp_mask = GFP_KERNEL,
@ -1275,7 +1281,7 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
#if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \
defined(SHRINK_CONTROL_HAS_NID) && \
defined(SHRINKER_NUMA_AWARE)
if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) {
if (shrinker->flags & SHRINKER_NUMA_AWARE) {
*objects = 0;
for_each_online_node(sc.nid) {
*objects += (*shrinker->scan_objects)(shrinker, &sc);


@ -2464,15 +2464,16 @@ top:
if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
zp->z_atime_dirty = B_FALSE;
ZFS_TIME_ENCODE(&ip->i_atime, atime);
inode_timespec_t tmp_atime = zpl_inode_get_atime(ip);
ZFS_TIME_ENCODE(&tmp_atime, atime);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
&atime, sizeof (atime));
}
if (mask & (ATTR_MTIME | ATTR_SIZE)) {
ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
vap->va_mtime, ZTOI(zp));
zpl_inode_set_mtime_to_ts(ZTOI(zp),
zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp)));
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
mtime, sizeof (mtime));
@ -3686,7 +3687,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
caddr_t va;
int err = 0;
uint64_t mtime[2], ctime[2];
inode_timespec_t tmp_ctime;
inode_timespec_t tmp_ts;
sa_bulk_attr_t bulk[3];
int cnt = 0;
struct address_space *mapping;
@ -3850,9 +3851,10 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
&zp->z_pflags, 8);
/* Preserve the mtime and ctime provided by the inode */
ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
tmp_ctime = zpl_inode_get_ctime(ip);
ZFS_TIME_ENCODE(&tmp_ctime, ctime);
tmp_ts = zpl_inode_get_mtime(ip);
ZFS_TIME_ENCODE(&tmp_ts, mtime);
tmp_ts = zpl_inode_get_ctime(ip);
ZFS_TIME_ENCODE(&tmp_ts, ctime);
zp->z_atime_dirty = B_FALSE;
zp->z_seq++;
@ -3902,7 +3904,7 @@ zfs_dirty_inode(struct inode *ip, int flags)
zfsvfs_t *zfsvfs = ITOZSB(ip);
dmu_tx_t *tx;
uint64_t mode, atime[2], mtime[2], ctime[2];
inode_timespec_t tmp_ctime;
inode_timespec_t tmp_ts;
sa_bulk_attr_t bulk[4];
int error = 0;
int cnt = 0;
@ -3947,10 +3949,12 @@ zfs_dirty_inode(struct inode *ip, int flags)
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
/* Preserve the mode, mtime and ctime provided by the inode */
ZFS_TIME_ENCODE(&ip->i_atime, atime);
ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
tmp_ctime = zpl_inode_get_ctime(ip);
ZFS_TIME_ENCODE(&tmp_ctime, ctime);
tmp_ts = zpl_inode_get_atime(ip);
ZFS_TIME_ENCODE(&tmp_ts, atime);
tmp_ts = zpl_inode_get_mtime(ip);
ZFS_TIME_ENCODE(&tmp_ts, mtime);
tmp_ts = zpl_inode_get_ctime(ip);
ZFS_TIME_ENCODE(&tmp_ts, ctime);
mode = ip->i_mode;
zp->z_mode = mode;
@ -3993,7 +3997,9 @@ zfs_inactive(struct inode *ip)
if (error) {
dmu_tx_abort(tx);
} else {
ZFS_TIME_ENCODE(&ip->i_atime, atime);
inode_timespec_t tmp_atime;
tmp_atime = zpl_inode_get_atime(ip);
ZFS_TIME_ENCODE(&tmp_atime, atime);
mutex_enter(&zp->z_lock);
(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
(void *)&atime, sizeof (atime), tx);


@ -542,7 +542,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
uint64_t links;
uint64_t z_uid, z_gid;
uint64_t atime[2], mtime[2], ctime[2], btime[2];
inode_timespec_t tmp_ctime;
inode_timespec_t tmp_ts;
uint64_t projid = ZFS_DEFAULT_PROJID;
sa_bulk_attr_t bulk[12];
int count = 0;
@ -614,10 +614,12 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
if (zp->z_pflags & ZFS_XATTR)
zp->z_xattr_parent = parent;
ZFS_TIME_DECODE(&ip->i_atime, atime);
ZFS_TIME_DECODE(&ip->i_mtime, mtime);
ZFS_TIME_DECODE(&tmp_ctime, ctime);
zpl_inode_set_ctime_to_ts(ip, tmp_ctime);
ZFS_TIME_DECODE(&tmp_ts, atime);
zpl_inode_set_atime_to_ts(ip, tmp_ts);
ZFS_TIME_DECODE(&tmp_ts, mtime);
zpl_inode_set_mtime_to_ts(ip, tmp_ts);
ZFS_TIME_DECODE(&tmp_ts, ctime);
zpl_inode_set_ctime_to_ts(ip, tmp_ts);
ZFS_TIME_DECODE(&zp->z_btime, btime);
ip->i_ino = zp->z_id;
@ -1197,7 +1199,7 @@ zfs_rezget(znode_t *zp)
uint64_t gen;
uint64_t z_uid, z_gid;
uint64_t atime[2], mtime[2], ctime[2], btime[2];
inode_timespec_t tmp_ctime;
inode_timespec_t tmp_ts;
uint64_t projid = ZFS_DEFAULT_PROJID;
znode_hold_t *zh;
@ -1290,10 +1292,12 @@ zfs_rezget(znode_t *zp)
zfs_uid_write(ZTOI(zp), z_uid);
zfs_gid_write(ZTOI(zp), z_gid);
ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
ZFS_TIME_DECODE(&tmp_ctime, ctime);
zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime);
ZFS_TIME_DECODE(&tmp_ts, atime);
zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts);
ZFS_TIME_DECODE(&tmp_ts, mtime);
zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
ZFS_TIME_DECODE(&tmp_ts, ctime);
zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
ZFS_TIME_DECODE(&zp->z_btime, btime);
if ((uint32_t)gen != ZTOI(zp)->i_generation) {
@ -1401,22 +1405,24 @@ zfs_zinactive(znode_t *zp)
boolean_t
zfs_relatime_need_update(const struct inode *ip)
{
inode_timespec_t now, tmp_ctime;
inode_timespec_t now, tmp_atime, tmp_ts;
gethrestime(&now);
tmp_atime = zpl_inode_get_atime(ip);
/*
* In relatime mode, only update the atime if the previous atime
* is earlier than either the ctime or mtime or if at least a day
* has passed since the last update of atime.
*/
if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0)
tmp_ts = zpl_inode_get_mtime(ip);
if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0)
return (B_TRUE);
tmp_ctime = zpl_inode_get_ctime(ip);
if (zfs_compare_timespec(&tmp_ctime, &ip->i_atime) >= 0)
tmp_ts = zpl_inode_get_ctime(ip);
if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0)
return (B_TRUE);
if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60)
if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60)
return (B_TRUE);
return (B_FALSE);
@ -1439,7 +1445,7 @@ void
zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
uint64_t ctime[2])
{
inode_timespec_t now, tmp_ctime;
inode_timespec_t now, tmp_ts;
gethrestime(&now);
@ -1447,7 +1453,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
if (flag & ATTR_MTIME) {
ZFS_TIME_ENCODE(&now, mtime);
ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
ZFS_TIME_DECODE(&tmp_ts, mtime);
zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
if (ZTOZSB(zp)->z_use_fuids) {
zp->z_pflags |= (ZFS_ARCHIVE |
ZFS_AV_MODIFIED);
@ -1456,8 +1463,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
if (flag & ATTR_CTIME) {
ZFS_TIME_ENCODE(&now, ctime);
ZFS_TIME_DECODE(&tmp_ctime, ctime);
zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime);
ZFS_TIME_DECODE(&tmp_ts, ctime);
zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
if (ZTOZSB(zp)->z_use_fuids)
zp->z_pflags |= ZFS_ARCHIVE;
}


@ -1405,7 +1405,7 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
boolean_t *no_crypt)
{
int ret;
uint64_t txtype, lr_len;
uint64_t txtype, lr_len, nused;
uint_t nr_src, nr_dst, crypt_len;
uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
@ -1432,7 +1432,10 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
zilc = (zil_chain_t *)src;
slrp = src + sizeof (zil_chain_t);
aadp = aadbuf;
blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
ASSERT3U(nused, >=, sizeof (zil_chain_t));
ASSERT3U(nused, <=, datalen);
blkend = src + nused;
/* calculate the number of encrypted iovecs we will need */
for (; slrp < blkend; slrp += lr_len) {
@ -1445,6 +1448,8 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
txtype = BSWAP_64(lr->lrc_txtype);
lr_len = BSWAP_64(lr->lrc_reclen);
}
ASSERT3U(lr_len, >=, sizeof (lr_t));
ASSERT3U(lr_len, <=, blkend - slrp);
nr_iovecs++;
if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))


@ -526,7 +526,8 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia)
vap->va_ctime = ia->ia_ctime;
if (vap->va_mask & ATTR_ATIME)
ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip);
zpl_inode_set_atime_to_ts(ip,
zpl_inode_timestamp_truncate(ia->ia_atime, ip));
cookie = spl_fstrans_mark();
#ifdef HAVE_USERNS_IOPS_SETATTR


@ -1528,6 +1528,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
*/
set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
set_disk_ro(zv->zv_zso->zvo_disk, readonly);
dataset_kstats_rename(&zv->zv_kstat, newname);
}
void


@ -802,13 +802,10 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
abd_verify(abd);
ASSERT3U(off + size, <=, abd->abd_size);
boolean_t gang = abd_is_gang(abd);
abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
while (size > 0) {
/* If we are at the end of the gang ABD we are done */
if (gang && !c_abd)
break;
IMPLY(abd_is_gang(abd), c_abd != NULL);
abd_iter_map(&aiter);
@ -930,7 +927,6 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
{
int ret = 0;
struct abd_iter daiter, saiter;
boolean_t dabd_is_gang_abd, sabd_is_gang_abd;
abd_t *c_dabd, *c_sabd;
if (size == 0)
@ -942,16 +938,12 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
ASSERT3U(doff + size, <=, dabd->abd_size);
ASSERT3U(soff + size, <=, sabd->abd_size);
dabd_is_gang_abd = abd_is_gang(dabd);
sabd_is_gang_abd = abd_is_gang(sabd);
c_dabd = abd_init_abd_iter(dabd, &daiter, doff);
c_sabd = abd_init_abd_iter(sabd, &saiter, soff);
while (size > 0) {
/* if we are at the end of the gang ABD we are done */
if ((dabd_is_gang_abd && !c_dabd) ||
(sabd_is_gang_abd && !c_sabd))
break;
IMPLY(abd_is_gang(dabd), c_dabd != NULL);
IMPLY(abd_is_gang(sabd), c_sabd != NULL);
abd_iter_map(&daiter);
abd_iter_map(&saiter);
@ -1032,66 +1024,40 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
int i;
ssize_t len, dlen;
struct abd_iter caiters[3];
struct abd_iter daiter = {0};
struct abd_iter daiter;
void *caddrs[3];
unsigned long flags __maybe_unused = 0;
abd_t *c_cabds[3];
abd_t *c_dabd = NULL;
boolean_t cabds_is_gang_abd[3];
boolean_t dabd_is_gang_abd = B_FALSE;
ASSERT3U(parity, <=, 3);
for (i = 0; i < parity; i++) {
cabds_is_gang_abd[i] = abd_is_gang(cabds[i]);
abd_verify(cabds[i]);
ASSERT3U(csize, <=, cabds[i]->abd_size);
c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0);
}
if (dabd) {
dabd_is_gang_abd = abd_is_gang(dabd);
ASSERT3S(dsize, >=, 0);
if (dsize > 0) {
ASSERT(dabd);
abd_verify(dabd);
ASSERT3U(dsize, <=, dabd->abd_size);
c_dabd = abd_init_abd_iter(dabd, &daiter, 0);
}
ASSERT3S(dsize, >=, 0);
abd_enter_critical(flags);
while (csize > 0) {
/* if we are at the end of the gang ABD we are done */
if (dabd_is_gang_abd && !c_dabd)
break;
len = csize;
for (i = 0; i < parity; i++) {
/*
* If we are at the end of the gang ABD we are
* done.
*/
if (cabds_is_gang_abd[i] && !c_cabds[i])
break;
IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
abd_iter_map(&caiters[i]);
caddrs[i] = caiters[i].iter_mapaddr;
len = MIN(caiters[i].iter_mapsize, len);
}
len = csize;
if (dabd && dsize > 0)
if (dsize > 0) {
IMPLY(abd_is_gang(dabd), c_dabd != NULL);
abd_iter_map(&daiter);
switch (parity) {
case 3:
len = MIN(caiters[2].iter_mapsize, len);
zfs_fallthrough;
case 2:
len = MIN(caiters[1].iter_mapsize, len);
zfs_fallthrough;
case 1:
len = MIN(caiters[0].iter_mapsize, len);
}
/* must be progressive */
ASSERT3S(len, >, 0);
if (dabd && dsize > 0) {
/* this needs precise iter.length */
len = MIN(daiter.iter_mapsize, len);
dlen = len;
} else
@ -1114,7 +1080,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
&caiters[i], len);
}
if (dabd && dsize > 0) {
if (dsize > 0) {
abd_iter_unmap(&daiter);
c_dabd =
abd_advance_abd_iter(dabd, c_dabd, &daiter,
@ -1153,16 +1119,16 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
struct abd_iter xiters[3];
void *caddrs[3], *xaddrs[3];
unsigned long flags __maybe_unused = 0;
boolean_t cabds_is_gang_abd[3];
boolean_t tabds_is_gang_abd[3];
abd_t *c_cabds[3];
abd_t *c_tabds[3];
ASSERT3U(parity, <=, 3);
for (i = 0; i < parity; i++) {
cabds_is_gang_abd[i] = abd_is_gang(cabds[i]);
tabds_is_gang_abd[i] = abd_is_gang(tabds[i]);
abd_verify(cabds[i]);
abd_verify(tabds[i]);
ASSERT3U(tsize, <=, cabds[i]->abd_size);
ASSERT3U(tsize, <=, tabds[i]->abd_size);
c_cabds[i] =
abd_init_abd_iter(cabds[i], &citers[i], 0);
c_tabds[i] =
@ -1171,36 +1137,18 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
abd_enter_critical(flags);
while (tsize > 0) {
len = tsize;
for (i = 0; i < parity; i++) {
/*
* If we are at the end of the gang ABD we
* are done.
*/
if (cabds_is_gang_abd[i] && !c_cabds[i])
break;
if (tabds_is_gang_abd[i] && !c_tabds[i])
break;
IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
IMPLY(abd_is_gang(tabds[i]), c_tabds[i] != NULL);
abd_iter_map(&citers[i]);
abd_iter_map(&xiters[i]);
caddrs[i] = citers[i].iter_mapaddr;
xaddrs[i] = xiters[i].iter_mapaddr;
len = MIN(citers[i].iter_mapsize, len);
len = MIN(xiters[i].iter_mapsize, len);
}
len = tsize;
switch (parity) {
case 3:
len = MIN(xiters[2].iter_mapsize, len);
len = MIN(citers[2].iter_mapsize, len);
zfs_fallthrough;
case 2:
len = MIN(xiters[1].iter_mapsize, len);
len = MIN(citers[1].iter_mapsize, len);
zfs_fallthrough;
case 1:
len = MIN(xiters[0].iter_mapsize, len);
len = MIN(citers[0].iter_mapsize, len);
}
/* must be progressive */
ASSERT3S(len, >, 0);
/*


@ -8042,9 +8042,8 @@ l2arc_write_size(l2arc_dev_t *dev)
*/
size = l2arc_write_max;
if (size == 0) {
cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
"be greater than zero, resetting it to the default (%d)",
L2ARC_WRITE_SIZE);
cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, "
"resetting it to the default (%d)", L2ARC_WRITE_SIZE);
size = l2arc_write_max = L2ARC_WRITE_SIZE;
}
@ -8067,30 +8066,9 @@ l2arc_write_size(l2arc_dev_t *dev)
* device. This is important in l2arc_evict(), otherwise infinite
* iteration can occur.
*/
if (size > dev->l2ad_end - dev->l2ad_start) {
cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
"plus the overhead of log blocks (persistent L2ARC, "
"%llu bytes) exceeds the size of the cache device "
"(guid %llu), resetting them to the default (%d)",
(u_longlong_t)l2arc_log_blk_overhead(size, dev),
(u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
size = MIN(size, (dev->l2ad_end - dev->l2ad_start) / 4);
size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
if (l2arc_trim_ahead > 1) {
cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1");
l2arc_trim_ahead = 1;
}
if (arc_warm == B_FALSE)
size += l2arc_write_boost;
size += l2arc_log_blk_overhead(size, dev);
if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
size += MAX(64 * 1024 * 1024,
(size * l2arc_trim_ahead) / 100);
}
}
size = P2ROUNDUP(size, 1ULL << dev->l2ad_vdev->vdev_ashift);
return (size);


@ -157,10 +157,8 @@
* (copying the file content to the new dataset and removing the source file).
* In that case Block Cloning will only be used briefly, because the BRT entries
* will be removed when the source is removed.
* Note: currently it is not possible to clone blocks between encrypted
* datasets, even if those datasets use the same encryption key (this includes
* snapshots of encrypted datasets). Cloning blocks between datasets that use
* the same keys should be possible and should be implemented in the future.
* Block Cloning across encrypted datasets is supported as long as both
* datasets share the same master key (e.g. snapshots and clones).
*
* Block Cloning flow through ZFS layers.
*
@ -344,7 +342,7 @@ brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
ASSERT3U(idx, <, brtvd->bv_size);
if (brtvd->bv_need_byteswap) {
if (unlikely(brtvd->bv_need_byteswap)) {
return (BSWAP_16(brtvd->bv_entcount[idx]));
} else {
return (brtvd->bv_entcount[idx]);
@ -357,7 +355,7 @@ brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
ASSERT3U(idx, <, brtvd->bv_size);
if (brtvd->bv_need_byteswap) {
if (unlikely(brtvd->bv_need_byteswap)) {
brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
} else {
brtvd->bv_entcount[idx] = entcnt;
@ -392,55 +390,39 @@ brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
#ifdef ZFS_DEBUG
static void
brt_vdev_dump(brt_t *brt)
brt_vdev_dump(brt_vdev_t *brtvd)
{
brt_vdev_t *brtvd;
uint64_t vdevid;
uint64_t idx;
if ((zfs_flags & ZFS_DEBUG_BRT) == 0) {
return;
}
if (brt->brt_nvdevs == 0) {
zfs_dbgmsg("BRT empty");
return;
}
zfs_dbgmsg("BRT vdev dump:");
for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
uint64_t idx;
brtvd = &brt->brt_vdevs[vdevid];
zfs_dbgmsg(" vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
"size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
(u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
(u_longlong_t)brtvd->bv_size,
(u_longlong_t)brtvd->bv_totalcount,
(u_longlong_t)brtvd->bv_nblocks,
(size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
if (brtvd->bv_totalcount > 0) {
zfs_dbgmsg(" entcounts:");
for (idx = 0; idx < brtvd->bv_size; idx++) {
if (brt_vdev_entcount_get(brtvd, idx) > 0) {
zfs_dbgmsg(" [%04llu] %hu",
(u_longlong_t)idx,
brt_vdev_entcount_get(brtvd, idx));
}
zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
"size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
(u_longlong_t)brtvd->bv_vdevid,
brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
(u_longlong_t)brtvd->bv_size,
(u_longlong_t)brtvd->bv_totalcount,
(u_longlong_t)brtvd->bv_nblocks,
(size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
if (brtvd->bv_totalcount > 0) {
zfs_dbgmsg(" entcounts:");
for (idx = 0; idx < brtvd->bv_size; idx++) {
uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx);
if (entcnt > 0) {
zfs_dbgmsg(" [%04llu] %hu",
(u_longlong_t)idx, entcnt);
}
}
if (brtvd->bv_entcount_dirty) {
char *bitmap;
}
if (brtvd->bv_entcount_dirty) {
char *bitmap;
bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
bitmap[idx] =
BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
}
bitmap[idx] = '\0';
zfs_dbgmsg(" bitmap: %s", bitmap);
kmem_free(bitmap, brtvd->bv_nblocks + 1);
bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
bitmap[idx] =
BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
}
bitmap[idx] = '\0';
zfs_dbgmsg(" dirty: %s", bitmap);
kmem_free(bitmap, brtvd->bv_nblocks + 1);
}
}
#endif
@ -769,7 +751,8 @@ brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
BT_SET(brtvd->bv_bitmap, idx);
#ifdef ZFS_DEBUG
brt_vdev_dump(brt);
if (zfs_flags & ZFS_DEBUG_BRT)
brt_vdev_dump(brtvd);
#endif
}
@ -805,7 +788,8 @@ brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
BT_SET(brtvd->bv_bitmap, idx);
#ifdef ZFS_DEBUG
brt_vdev_dump(brt);
if (zfs_flags & ZFS_DEBUG_BRT)
brt_vdev_dump(brtvd);
#endif
}


@ -198,6 +198,18 @@ dataset_kstats_destroy(dataset_kstats_t *dk)
zil_sums_fini(&dk->dk_zil_sums);
}
void
dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
{
dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
char *ds_name;
ds_name = KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name);
ASSERT3S(ds_name, !=, NULL);
(void) strlcpy(ds_name, name,
KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name));
}
void
dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
int64_t nwritten)


@ -1619,8 +1619,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
*/
if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
spa_log_error(db->db_objset->os_spa, &zb, &bpp->blk_birth);
zfs_panic_recover("unencrypted block in encrypted "
"object set %llu", dmu_objset_id(db->db_objset));
err = SET_ERROR(EIO);
goto early_unlock;
}
@ -1925,7 +1923,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
zio_free(db->db_objset->os_spa, txg, bp);
if (dr->dt.dl.dr_brtwrite) {
ASSERT0P(dr->dt.dl.dr_data);
ASSERT0(dr->dt.dl.dr_data);
dr->dt.dl.dr_data = db->db_buf;
}
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@ -2736,7 +2734,7 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
}
void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
@ -2754,8 +2752,14 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
* Block cloning: We will be completely overwriting a block
* cloned in this transaction group, so let's undirty the
* pending clone and mark the block as uncached. This will be
* as if the clone was never done.
* as if the clone was never done. But if the fill can fail,
* we need a way to return to the cloned data.
*/
if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
mutex_exit(&db->db_mtx);
dmu_buf_will_dirty(db_fake, tx);
return;
}
VERIFY(!dbuf_undirty(db, tx));
db->db_state = DB_UNCACHED;
}
@ -2816,32 +2820,41 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
dl->dr_overridden_by.blk_birth = dr->dr_txg;
}
void
dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx)
boolean_t
dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
{
(void) tx;
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
dbuf_states_t old_state;
mutex_enter(&db->db_mtx);
DBUF_VERIFY(db);
old_state = db->db_state;
db->db_state = DB_CACHED;
if (old_state == DB_FILL) {
if (db->db_state == DB_FILL) {
if (db->db_level == 0 && db->db_freed_in_flight) {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
memset(db->db.db_data, 0, db->db.db_size);
db->db_freed_in_flight = FALSE;
db->db_state = DB_CACHED;
DTRACE_SET_STATE(db,
"fill done handling freed in flight");
failed = B_FALSE;
} else if (failed) {
VERIFY(!dbuf_undirty(db, tx));
db->db_buf = NULL;
dbuf_clear_data(db);
DTRACE_SET_STATE(db, "fill failed");
} else {
db->db_state = DB_CACHED;
DTRACE_SET_STATE(db, "fill done");
}
cv_broadcast(&db->db_changed);
} else {
db->db_state = DB_CACHED;
failed = B_FALSE;
}
mutex_exit(&db->db_mtx);
return (failed);
}
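To make the new canfail contract concrete, here is a minimal caller-side sketch (illustrative only; copy_in() is a hypothetical fallible copy routine standing in for something like zfs_uio_fault_move()):
dmu_buf_will_fill(db, tx, B_TRUE);	/* this fill is allowed to fail */
int err = copy_in((char *)db->db_data, src, len);	/* hypothetical copy */
if (dmu_buf_fill_done(db, tx, err != 0)) {
	/* The fill was reverted to the pre-fill (e.g. cloned) data,
	 * so undo any progress accounting done for this buffer. */
}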
void
@ -2986,7 +2999,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
DTRACE_SET_STATE(db, "filling assigned arcbuf");
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
dmu_buf_fill_done(&db->db, tx);
dmu_buf_fill_done(&db->db, tx, B_FALSE);
}
void

View File

@ -1115,14 +1115,14 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size)
dmu_buf_will_fill(db, tx);
dmu_buf_will_fill(db, tx, B_FALSE);
else
dmu_buf_will_dirty(db, tx);
(void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
dmu_buf_fill_done(db, tx, B_FALSE);
offset += tocpy;
size -= tocpy;
@ -1330,27 +1330,24 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
ASSERT(size > 0);
bufoff = zfs_uio_offset(uio) - db->db_offset;
offset_t off = zfs_uio_offset(uio);
bufoff = off - db->db_offset;
tocpy = MIN(db->db_size - bufoff, size);
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size)
dmu_buf_will_fill(db, tx);
dmu_buf_will_fill(db, tx, B_TRUE);
else
dmu_buf_will_dirty(db, tx);
/*
* XXX zfs_uiomove could block forever (eg.nfs-backed
* pages). There needs to be a uiolockdown() function
* to lock the pages in memory, so that zfs_uiomove won't
* block.
*/
err = zfs_uio_fault_move((char *)db->db_data + bufoff,
tocpy, UIO_WRITE, uio);
if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) {
/* The fill was reverted. Undo any uio progress. */
zfs_uio_advance(uio, off - zfs_uio_offset(uio));
}
if (err)
break;
@ -1482,9 +1479,9 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, 0, offset);
db = dbuf_hold(dn, blkid, FTAG);
rw_exit(&dn->dn_struct_rwlock);
if (db == NULL)
return (SET_ERROR(EIO));
rw_exit(&dn->dn_struct_rwlock);
/*
* We can only assign if the offset is aligned and the arc buf is the

View File

@ -2532,7 +2532,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
* size of the provided arc_buf_t.
*/
if (db_spill->db_size != drrs->drr_length) {
dmu_buf_will_fill(db_spill, tx);
dmu_buf_will_fill(db_spill, tx, B_FALSE);
VERIFY0(dbuf_spill_set_blksz(db_spill,
drrs->drr_length, tx));
}

View File

@ -1124,8 +1124,6 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (sta->os->os_encrypted &&
!BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
spa_log_error(spa, zb, &bp->blk_birth);
zfs_panic_recover("unencrypted block in encrypted "
"object set %llu", dmu_objset_id(sta->os));
return (SET_ERROR(EIO));
}

View File

@ -266,6 +266,40 @@ spa_crypto_key_compare(const void *a, const void *b)
return (0);
}
/*
* This compares crypto keys based on zk_guid. See the comment on
* spa_crypto_key_compare for more information.
*/
boolean_t
dmu_objset_crypto_key_equal(objset_t *osa, objset_t *osb)
{
dsl_crypto_key_t *dcka = NULL;
dsl_crypto_key_t *dckb = NULL;
uint64_t obja, objb;
boolean_t equal;
spa_t *spa;
spa = dmu_objset_spa(osa);
if (spa != dmu_objset_spa(osb))
return (B_FALSE);
obja = dmu_objset_ds(osa)->ds_object;
objb = dmu_objset_ds(osb)->ds_object;
if (spa_keystore_lookup_key(spa, obja, FTAG, &dcka) != 0)
return (B_FALSE);
if (spa_keystore_lookup_key(spa, objb, FTAG, &dckb) != 0) {
spa_keystore_dsl_key_rele(spa, dcka, FTAG);
return (B_FALSE);
}
equal = (dcka->dck_key.zk_guid == dckb->dck_key.zk_guid);
spa_keystore_dsl_key_rele(spa, dcka, FTAG);
spa_keystore_dsl_key_rele(spa, dckb, FTAG);
return (equal);
}
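As used later in this diff by zfs_clone_range(), the check composes like this (condensed from that hunk):
/* Cloning across encrypted datasets requires a shared master key. */
if (inos != outos && inos->os_encrypted &&
    !dmu_objset_crypto_key_equal(inos, outos))
	return (SET_ERROR(EXDEV));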
static int
spa_key_mapping_compare(const void *a, const void *b)
{

View File

@ -1000,8 +1000,6 @@ livelist_compare(const void *larg, const void *rarg)
/* if vdevs are equal, sort by offsets. */
uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
if (l_dva0_offset == r_dva0_offset)
ASSERT3U(l->blk_birth, ==, r->blk_birth);
return (TREE_CMP(l_dva0_offset, r_dva0_offset));
}
@ -1016,9 +1014,9 @@ struct livelist_iter_arg {
* and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
* corresponding FREE are stored in the supplied bplist.
*
* Note that multiple FREE and ALLOC entries for the same blkptr may
* be encountered when dedup is involved. For this reason we keep a
* refcount for all the FREE entries of each blkptr and ensure that
* Note that multiple FREE and ALLOC entries for the same blkptr may be
* encountered when dedup or block cloning is involved. For this reason we
* keep a refcount for all the FREE entries of each blkptr and ensure that
* each of those FREE entries has a corresponding ALLOC preceding it.
*/
static int
@ -1037,6 +1035,13 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
livelist_entry_t node;
node.le_bp = *bp;
livelist_entry_t *found = avl_find(avl, &node, NULL);
if (found) {
ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp));
ASSERT3U(BP_GET_CHECKSUM(bp), ==,
BP_GET_CHECKSUM(&found->le_bp));
ASSERT3U(BP_PHYSICAL_BIRTH(bp), ==,
BP_PHYSICAL_BIRTH(&found->le_bp));
}
if (bp_freed) {
if (found == NULL) {
/* first free entry for this blkptr */
@ -1046,10 +1051,10 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
e->le_refcnt = 1;
avl_add(avl, e);
} else {
/* dedup block free */
ASSERT(BP_GET_DEDUP(bp));
ASSERT3U(BP_GET_CHECKSUM(bp), ==,
BP_GET_CHECKSUM(&found->le_bp));
/*
* Deduped or cloned block free. We could assert the D bit
* for dedup, but there is no such bit for cloning.
*/
ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt);
found->le_refcnt++;
}
@ -1065,14 +1070,6 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
/* all tracked free pairs have been matched */
avl_remove(avl, found);
kmem_free(found, sizeof (livelist_entry_t));
} else {
/*
* This is definitely a deduped blkptr so
* let's validate it.
*/
ASSERT(BP_GET_DEDUP(bp));
ASSERT3U(BP_GET_CHECKSUM(bp), ==,
BP_GET_CHECKSUM(&found->le_bp));
}
}
}

View File

@ -151,7 +151,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
* need to be handled with minimum delay.
*/
static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
{ ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */
@ -1164,6 +1164,275 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
tqs->stqs_taskq = NULL;
}
#ifdef _KERNEL
/*
* The READ and WRITE rows of zio_taskqs are configurable at module load time
* by setting zio_taskq_read or zio_taskq_write.
*
* Example (the defaults for READ and WRITE)
* zio_taskq_read='fixed,1,8 null scale null'
* zio_taskq_write='batch fixed,1,5 scale fixed,1,5'
*
* Each sets the entire row at a time.
*
* 'fixed' is parameterised: fixed,Q,T, where Q is the number of taskqs and T
* is the number of threads per taskq.
*
* 'null' can only be set on the high-priority queues (queue selection for
* high-priority queues will fall back to the regular queue if the high-pri
* queue is NULL).
*/
static const char *const modes[ZTI_NMODES] = {
"fixed", "batch", "scale", "null"
};
/* Parse the incoming config string. Modifies cfg */
static int
spa_taskq_param_set(zio_type_t t, char *cfg)
{
int err = 0;
zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}};
char *next = cfg, *tok, *c;
/*
* Parse out each element from the string and fill `row`. The entire
* row has to be set at once, so any errors are flagged by just
* breaking out of this loop early.
*/
uint_t q;
for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
/* `next` is the start of the config */
if (next == NULL)
break;
/* Eat up leading space */
while (isspace(*next))
next++;
if (*next == '\0')
break;
/* Mode ends at space or end of string */
tok = next;
next = strchr(tok, ' ');
if (next != NULL) *next++ = '\0';
/* Parameters start after a comma */
c = strchr(tok, ',');
if (c != NULL) *c++ = '\0';
/* Match mode string */
uint_t mode;
for (mode = 0; mode < ZTI_NMODES; mode++)
if (strcmp(tok, modes[mode]) == 0)
break;
if (mode == ZTI_NMODES)
break;
/* Invalid canary */
row[q].zti_mode = ZTI_NMODES;
/* Per-mode setup */
switch (mode) {
/*
* FIXED is parameterised: number of queues, and number of
* threads per queue.
*/
case ZTI_MODE_FIXED: {
/* No parameters? */
if (c == NULL || *c == '\0')
break;
/* Find next parameter */
tok = c;
c = strchr(tok, ',');
if (c == NULL)
break;
/* Take digits and convert */
unsigned long long nq;
if (!(isdigit(*tok)))
break;
err = ddi_strtoull(tok, &tok, 10, &nq);
/* Must succeed and also end at the next param sep */
if (err != 0 || tok != c)
break;
/* Move past the comma */
tok++;
/* Need another number */
if (!(isdigit(*tok)))
break;
/* Remember start to make sure we moved */
c = tok;
/* Take digits */
unsigned long long ntpq;
err = ddi_strtoull(tok, &tok, 10, &ntpq);
/* Must succeed, and moved forward */
if (err != 0 || tok == c || *tok != '\0')
break;
/*
* sanity; zero queues/threads make no sense, and
* 16K is almost certainly more than anyone will ever
* need and avoids silly numbers like UINT32_MAX
*/
if (nq == 0 || nq >= 16384 ||
ntpq == 0 || ntpq >= 16384)
break;
const zio_taskq_info_t zti = ZTI_P(ntpq, nq);
row[q] = zti;
break;
}
case ZTI_MODE_BATCH: {
const zio_taskq_info_t zti = ZTI_BATCH;
row[q] = zti;
break;
}
case ZTI_MODE_SCALE: {
const zio_taskq_info_t zti = ZTI_SCALE;
row[q] = zti;
break;
}
case ZTI_MODE_NULL: {
/*
* Can only null the high-priority queues; the general-
* purpose ones have to exist.
*/
if (q != ZIO_TASKQ_ISSUE_HIGH &&
q != ZIO_TASKQ_INTERRUPT_HIGH)
break;
const zio_taskq_info_t zti = ZTI_NULL;
row[q] = zti;
break;
}
default:
break;
}
/* Ensure we set a mode */
if (row[q].zti_mode == ZTI_NMODES)
break;
}
/* Didn't get a full row, fail */
if (q < ZIO_TASKQ_TYPES)
return (SET_ERROR(EINVAL));
/* Eat trailing space */
if (next != NULL)
while (isspace(*next))
next++;
/* If there's anything left over then fail */
if (next != NULL && *next != '\0')
return (SET_ERROR(EINVAL));
/* Success! Copy it into the real config */
for (q = 0; q < ZIO_TASKQ_TYPES; q++)
zio_taskqs[t][q] = row[q];
return (0);
}
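A minimal usage sketch, assuming only what the comments above state (the parser needs a writable string because it inserts NUL terminators while tokenizing):
char cfg[] = "fixed,1,5 scale scale null";	/* must be writable */
int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);	/* 0 on success */
/* A partial row, an unknown mode, 'null' on a general-purpose queue, or
 * trailing junk all return EINVAL and leave the live table untouched. */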
static int
spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline)
{
int pos = 0;
/* Build parameter string from live config */
const char *sep = "";
for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
const zio_taskq_info_t *zti = &zio_taskqs[t][q];
if (zti->zti_mode == ZTI_MODE_FIXED)
pos += sprintf(&buf[pos], "%s%s,%u,%u", sep,
modes[zti->zti_mode], zti->zti_count,
zti->zti_value);
else
pos += sprintf(&buf[pos], "%s%s", sep,
modes[zti->zti_mode]);
sep = " ";
}
if (add_newline)
buf[pos++] = '\n';
buf[pos] = '\0';
return (pos);
}
#ifdef __linux__
static int
spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
{
char *cfg = kmem_strdup(val);
int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg);
kmem_free(cfg, strlen(val)+1);
return (-err);
}
static int
spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
{
return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE));
}
static int
spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
{
char *cfg = kmem_strdup(val);
int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);
kmem_free(cfg, strlen(val)+1);
return (-err);
}
static int
spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
{
return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE));
}
#else
/*
* On FreeBSD load-time parameters can be set up before malloc() is available,
* so we have to do all the parsing work on the stack.
*/
#define SPA_TASKQ_PARAM_MAX (128)
static int
spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)
{
char buf[SPA_TASKQ_PARAM_MAX];
int err;
(void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE);
err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
if (err || req->newptr == NULL)
return (err);
return (spa_taskq_param_set(ZIO_TYPE_READ, buf));
}
static int
spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
{
char buf[SPA_TASKQ_PARAM_MAX];
int err;
(void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE);
err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
if (err || req->newptr == NULL)
return (err);
return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf));
}
#endif
#endif /* _KERNEL */
/*
* Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
* Note that a type may have multiple discrete taskqs to avoid lock contention
@ -10210,4 +10479,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
ZMOD_RW,
"Whether extra ALLOC blkptrs were added to a livelist entry while it "
"was being condensed");
#ifdef _KERNEL
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
"Configure IO queues for read IO");
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
"Configure IO queues for write IO");
#endif
/* END CSTYLED */

View File

@ -309,6 +309,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
uint64_t dnodesize;
int error;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lracl));
txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
if (byteswap) {
byteswap_uint64_array(lracl, sizeof (*lracl));
@ -470,6 +472,8 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
uint64_t dnodesize;
int error;
ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
if (byteswap) {
byteswap_uint64_array(lr, sizeof (*lr));
@ -613,6 +617,8 @@ zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
int error;
int vflg = 0;
ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -648,6 +654,8 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
int error;
int vflg = 0;
ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -715,12 +723,14 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
{
zfsvfs_t *zfsvfs = arg1;
lr_rename_t *lr = arg2;
char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
char *tname = sname + strlen(sname) + 1;
ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
char *tname = sname + strlen(sname) + 1;
return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, 0, NULL));
}
@ -730,12 +740,14 @@ zfs_replay_rename_exchange(void *arg1, void *arg2, boolean_t byteswap)
#ifdef __linux__
zfsvfs_t *zfsvfs = arg1;
lr_rename_t *lr = arg2;
char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
char *tname = sname + strlen(sname) + 1;
ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
char *tname = sname + strlen(sname) + 1;
return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, RENAME_EXCHANGE,
NULL));
#else
@ -750,14 +762,13 @@ zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap)
zfsvfs_t *zfsvfs = arg1;
lr_rename_whiteout_t *lr = arg2;
int error;
/* sname and tname follow lr_rename_whiteout_t */
char *sname = (char *)(lr + 1);
char *tname = sname + strlen(sname) + 1;
/* For the whiteout file. */
xvattr_t xva;
uint64_t objid;
uint64_t dnodesize;
ASSERT3U(lr->lr_rename.lr_common.lrc_reclen, >, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -783,6 +794,9 @@ zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap)
if (error)
return (error);
/* sname and tname follow lr_rename_whiteout_t */
char *sname = (char *)(lr + 1);
char *tname = sname + strlen(sname) + 1;
return (do_zfs_replay_rename(zfsvfs, &lr->lr_rename, sname, tname,
RENAME_WHITEOUT, &xva.xva_vattr));
#else
@ -800,6 +814,8 @@ zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
int error;
uint64_t eod, offset, length;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -863,6 +879,8 @@ zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap)
int error;
uint64_t end;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -910,6 +928,8 @@ zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
flock64_t fl = {0};
int error;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -940,6 +960,8 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
int error;
void *start;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
xva_init(&xva);
if (byteswap) {
byteswap_uint64_array(lr, sizeof (*lr));
@ -1002,6 +1024,9 @@ zfs_replay_setsaxattr(void *arg1, void *arg2, boolean_t byteswap)
size_t size;
int error = 0;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr) + lr->lr_size);
ASSERT(spa_feature_is_active(zfsvfs->z_os->os_spa,
SPA_FEATURE_ZILSAXATTR));
if (byteswap)
@ -1079,6 +1104,10 @@ zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap)
znode_t *zp;
int error;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) +
sizeof (ace_t) * lr->lr_aclcnt);
if (byteswap) {
byteswap_uint64_array(lr, sizeof (*lr));
zfs_oldace_byteswap(ace, lr->lr_aclcnt);
@ -1124,6 +1153,9 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap)
znode_t *zp;
int error;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + lr->lr_acl_bytes);
if (byteswap) {
byteswap_uint64_array(lr, sizeof (*lr));
zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE);
@ -1171,6 +1203,10 @@ zfs_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
znode_t *zp;
int error;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t,
lr_bps[lr->lr_nbps]));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));

View File

@ -47,6 +47,7 @@
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_crypt.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
@ -1103,6 +1104,16 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
return (SET_ERROR(EXDEV));
}
/*
* Cloning across encrypted datasets is possible only if they
* share the same master key.
*/
if (inos != outos && inos->os_encrypted &&
!dmu_objset_crypto_key_equal(inos, outos)) {
zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
return (SET_ERROR(EXDEV));
}
error = zfs_verify_zp(inzp);
if (error == 0)
error = zfs_verify_zp(outzp);
@ -1181,11 +1192,18 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
inblksz = inzp->z_blksz;
/*
* We cannot clone into files with different block size if we can't
* grow it (block size is already bigger or more than one block).
* We cannot clone into a file with a different block size if we can't
* grow it (block size is already bigger, it has more than one block,
* or it is not locked for growth). There are other possible reasons
* for the grow to fail, but we cover what we can before opening the
* transaction and detect the rest after we try to do it.
*/
if (inblksz < outzp->z_blksz) {
error = SET_ERROR(EINVAL);
goto unlock;
}
if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz ||
outzp->z_size > inblksz)) {
outlr->lr_length != UINT64_MAX)) {
error = SET_ERROR(EINVAL);
goto unlock;
}
@ -1286,20 +1304,6 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
*/
break;
}
/*
* Encrypted data is fine as long as it comes from the same
* dataset.
* TODO: We want to extend it in the future to allow cloning to
* datasets with the same keys, like clones or to be able to
* clone a file from a snapshot of an encrypted dataset into the
* dataset itself.
*/
if (BP_IS_PROTECTED(&bps[0])) {
if (inzfsvfs != outzfsvfs) {
error = SET_ERROR(EXDEV);
break;
}
}
/*
* Start a transaction.
@ -1318,12 +1322,24 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
}
/*
* Copy source znode's block size. This only happens on the
* first iteration since zfs_rangelock_reduce() will shrink down
* lr_len to the appropriate size.
* Copy source znode's block size. This is done only if the
* whole znode is locked (see zfs_rangelock_cb()) and only
* on the first iteration since zfs_rangelock_reduce() will
* shrink down lr_length to the appropriate size.
*/
if (outlr->lr_length == UINT64_MAX) {
zfs_grow_blocksize(outzp, inblksz, tx);
/*
* Block growth may fail for many reasons we cannot
* predict here. If it happens, the cloning is doomed.
*/
if (inblksz != outzp->z_blksz) {
error = SET_ERROR(EINVAL);
dmu_tx_abort(tx);
break;
}
/*
* Round range lock up to the block boundary, so we
* prevent appends until we are done.
@ -1339,6 +1355,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
break;
}
if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) {
update_pages(outzp, outoff, size, outos);
}
zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr,
&clear_setid_bits_txg, tx);

View File

@ -91,15 +91,7 @@
* committed to stable storage. Please refer to the zil_commit_waiter()
* function (and the comments within it) for more details.
*/
static uint_t zfs_commit_timeout_pct = 5;
/*
* Minimal time we care to delay commit waiting for more ZIL records.
* At least FreeBSD kernel can't sleep for less than 2us at its best.
* So requests to sleep for less then 5us is a waste of CPU time with
* a risk of significant log latency increase due to oversleep.
*/
static uint64_t zil_min_commit_timeout = 5000;
static uint_t zfs_commit_timeout_pct = 10;
/*
* See zil.h for more information about these fields.
@ -152,6 +144,7 @@ static kmem_cache_t *zil_zcw_cache;
static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
static itx_t *zil_itx_clone(itx_t *oitx);
static uint64_t zil_max_waste_space(zilog_t *zilog);
static int
zil_bp_compare(const void *x1, const void *x2)
@ -522,6 +515,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
lr_t *lr = (lr_t *)lrp;
reclen = lr->lrc_reclen;
ASSERT3U(reclen, >=, sizeof (lr_t));
ASSERT3U(reclen, <=, end - lrp);
if (lr->lrc_seq > claim_lr_seq) {
arc_buf_destroy(abuf, &abuf);
goto done;
@ -604,7 +598,7 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
lr_write_t *lr = (lr_write_t *)lrc;
int error;
ASSERT(lrc->lrc_txtype == TX_WRITE);
ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
/*
* If the block is not readable, don't claim it. This can happen
@ -632,7 +626,9 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
spa_t *spa = zilog->zl_spa;
uint_t ii;
ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);
ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
lr_bps[lr->lr_nbps]));
if (tx == NULL) {
return (0);
@ -646,9 +642,9 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
bp = &lr->lr_bps[ii];
/*
* When data are embedded into BP there is no need to create
* BRT entry as there is no data block. Just copy the BP as
* it contains the data.
* When data is embedded into the BP there is no need to create
* BRT entry as there is no data block. Just copy the BP as it
* contains the data.
*/
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
continue;
@ -709,7 +705,7 @@ zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
lr_write_t *lr = (lr_write_t *)lrc;
blkptr_t *bp = &lr->lr_blkptr;
ASSERT(lrc->lrc_txtype == TX_WRITE);
ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
/*
* If we previously claimed it, we need to free it.
@ -730,7 +726,9 @@ zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
spa_t *spa;
uint_t ii;
ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);
ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
lr_bps[lr->lr_nbps]));
if (tx == NULL) {
return (0);
@ -1625,7 +1623,7 @@ zil_lwb_write_done(zio_t *zio)
while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
if (vd != NULL && !vd->vdev_nowritecache) {
if (vd != NULL) {
/*
* The "ZIO_FLAG_DONT_PROPAGATE" is currently
* always used within "zio_flush". This means,
@ -1713,24 +1711,6 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
mutex_exit(&zilog->zl_lock);
}
/*
* Define a limited set of intent log block sizes.
*
* These must be a multiple of 4KB. Note only the amount used (again
* aligned to 4KB) actually gets written. However, we can't always just
* allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
*/
static const struct {
uint64_t limit;
uint64_t blksz;
} zil_block_buckets[] = {
{ 4096, 4096 }, /* non TX_WRITE */
{ 8192 + 4096, 8192 + 4096 }, /* database */
{ 32768 + 4096, 32768 + 4096 }, /* NFS writes */
{ 65536 + 4096, 65536 + 4096 }, /* 64KB writes */
{ UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */
};
/*
* Maximum block size used by the ZIL. This is picked up when the ZIL is
* initialized. Otherwise this should not be used directly; see
@ -1738,6 +1718,91 @@ static const struct {
*/
static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
/*
* Plan splitting of the provided burst size between several blocks.
*/
static uint_t
zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
{
uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t);
if (size <= md) {
/*
* Small bursts are written as-is in one block.
*/
*minsize = size;
return (size);
} else if (size > 8 * md) {
/*
* Big bursts use maximum blocks. The first block size
* is hard to predict, but it does not really matter.
*/
*minsize = 0;
return (md);
}
/*
* Medium bursts try to divide evenly to better utilize several SLOG
* VDEVs. The first block size is predicted assuming the worst case of
* the others maxing out. Fall back to using maximum blocks if, due to
* large records or wasted space, we cannot predict anything better.
*/
uint_t s = size;
uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t));
uint_t chunk = DIV_ROUND_UP(s, n);
uint_t waste = zil_max_waste_space(zilog);
waste = MAX(waste, zilog->zl_cur_max);
if (chunk <= md - waste) {
*minsize = MAX(s - (md - waste) * (n - 1), waste);
return (chunk);
} else {
*minsize = 0;
return (md);
}
}
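A worked example of the medium-burst branch may help; the sizes below are illustrative assumptions, not values taken from the change:
/*
 * Assume zl_max_block_size = 128 KiB, so md is slightly below 128 KiB,
 * and a 300 KiB burst arrives (md < size <= 8 * md):
 *   n     = DIV_ROUND_UP(300 KiB, md - sizeof (lr_write_t)) = 3
 *   chunk = DIV_ROUND_UP(300 KiB, 3) = 100 KiB
 * If 100 KiB <= md - waste, the burst is planned as three ~100 KiB
 * blocks, which spread across SLOG vdevs better than two full-size
 * blocks plus a small tail.
 */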
/*
* Try to predict the next block size from previous history. Make the
* prediction sufficient for 7 of 8 previous bursts. Don't try to save if the
* saving is less than 50%; extra writes may cost more, but we don't want a
* single spike to badly affect our predictions.
*/
*/
static uint_t
zil_lwb_predict(zilog_t *zilog)
{
uint_t m, o;
/* If we are in the middle of a burst, take it into account also. */
if (zilog->zl_cur_size > 0) {
o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m);
} else {
o = UINT_MAX;
m = 0;
}
/* Find minimum optimal size. We don't need to go below that. */
for (int i = 0; i < ZIL_BURSTS; i++)
o = MIN(o, zilog->zl_prev_opt[i]);
/* Find two biggest minimal first block sizes above the optimal. */
uint_t m1 = MAX(m, o), m2 = o;
for (int i = 0; i < ZIL_BURSTS; i++) {
m = zilog->zl_prev_min[i];
if (m >= m1) {
m2 = m1;
m1 = m;
} else if (m > m2) {
m2 = m;
}
}
/*
* If the second minimum size gives a 50% saving -- use it. It may cost
* us one additional write later, but the space saving is just too big.
*/
return ((m1 < m2 * 2) ? m1 : m2);
}
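An illustrative run of the selection logic (the numbers are assumptions):
/*
 * Say the smallest recorded optimal size is o = 24K and the recorded
 * first-block minimums include { 32K, 40K, 96K }: then m1 = 96K and
 * m2 = 40K. Since m1 >= 2 * m2, predicting 40K saves at least half
 * the space, so 40K is returned; the one burst that wanted 96K may
 * simply need an extra block later.
 */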
/*
* Close the log block for being issued and allocate the next one.
* Has to be called under zl_issuer_lock to chain more lwbs.
*/
@ -1745,7 +1810,7 @@ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
static lwb_t *
zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
{
int i;
uint64_t blksz, plan, plan2;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
@ -1760,34 +1825,40 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
return (NULL);
/*
* Log blocks are pre-allocated. Here we select the size of the next
* block, based on size used in the last block.
* - first find the smallest bucket that will fit the block from a
* limited set of block sizes. This is because it's faster to write
* blocks allocated from the same metaslab as they are adjacent or
* close.
* - next find the maximum from the new suggested size and an array of
* previous sizes. This lessens a picket fence effect of wrongly
* guessing the size if we have a stream of say 2k, 64k, 2k, 64k
* requests.
*
* Note we only write what is used, but we can't just allocate
* the maximum block size because we can exhaust the available
* pool log space.
* Log blocks are pre-allocated. Here we select the size of the next
* block, based on what's left of this burst and the previous history.
* While we try to write only the used part of the block, we can't
* just always allocate the maximum block size because we can exhaust
* all available pool log space, so we try to be reasonable.
*/
uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++)
continue;
zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size);
zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
for (i = 0; i < ZIL_PREV_BLKS; i++)
zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
DTRACE_PROBE3(zil__block__size, zilog_t *, zilog,
uint64_t, zil_blksz,
uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]);
zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
if (zilog->zl_cur_left > 0) {
/*
* We are in the middle of a burst and know how much is left.
* But if the workload is multi-threaded there may be more soon.
* Try to predict what it can be and plan for the worst case.
*/
uint_t m;
plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
if (zilog->zl_parallel) {
plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left +
zil_lwb_predict(zilog), &m);
if (plan < plan2)
plan = plan2;
}
} else {
/*
* The previous burst is done and we can only predict what
* will come next.
*/
plan = zil_lwb_predict(zilog);
}
blksz = plan + sizeof (zil_chain_t);
blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t);
blksz = MIN(blksz, zilog->zl_max_block_size);
DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz,
uint64_t, plan);
return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state));
return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state));
}
/*
@ -1810,6 +1881,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
itx = list_next(&lwb->lwb_itxs, itx))
zil_lwb_commit(zilog, lwb, itx);
lwb->lwb_nused = lwb->lwb_nfilled;
ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb,
ZIO_FLAG_CANFAIL);
@ -1837,7 +1909,7 @@ next_lwb:
int wsz = lwb->lwb_sz;
if (lwb->lwb_error == 0) {
abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz);
if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk)
prio = ZIO_PRIORITY_SYNC_WRITE;
else
prio = ZIO_PRIORITY_ASYNC_WRITE;
@ -1998,6 +2070,42 @@ zil_max_copied_data(zilog_t *zilog)
return (MIN(max_data, zil_maxcopied));
}
static uint64_t
zil_itx_record_size(itx_t *itx)
{
lr_t *lr = &itx->itx_lr;
if (lr->lrc_txtype == TX_COMMIT)
return (0);
ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t));
return (lr->lrc_reclen);
}
static uint64_t
zil_itx_data_size(itx_t *itx)
{
lr_t *lr = &itx->itx_lr;
lr_write_t *lrw = (lr_write_t *)lr;
if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
ASSERT3U(lr->lrc_reclen, ==, sizeof (lr_write_t));
return (P2ROUNDUP_TYPED(lrw->lr_length, sizeof (uint64_t),
uint64_t));
}
return (0);
}
static uint64_t
zil_itx_full_size(itx_t *itx)
{
lr_t *lr = &itx->itx_lr;
if (lr->lrc_txtype == TX_COMMIT)
return (0);
ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t));
return (lr->lrc_reclen + zil_itx_data_size(itx));
}
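For orientation, how these helpers decompose for the common cases (restating the code above, not adding behavior):
/*
 * TX_COMMIT:                record = data = full = 0.
 * TX_WRITE, WR_NEED_COPY:   record = sizeof (lr_write_t) (header only),
 *                           data = P2ROUNDUP(lr_length, 8),
 *                           full = record + data.
 * Anything else (e.g. WR_COPIED, where the payload already lives in
 * lrc_reclen): record = full = lrc_reclen, data = 0.
 */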
/*
* Estimate space needed in the lwb for the itx. Allocate more lwbs or
* split the itx as needed, but don't touch the actual transaction data.
@ -2039,14 +2147,10 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
return (lwb);
}
if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
dlen = P2ROUNDUP_TYPED(
lrw->lr_length, sizeof (uint64_t), uint64_t);
} else {
dlen = 0;
}
reclen = lr->lrc_reclen;
zilog->zl_cur_used += (reclen + dlen);
ASSERT3U(reclen, >=, sizeof (lr_t));
ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0));
dlen = zil_itx_data_size(itx);
cont:
/*
@ -2064,19 +2168,19 @@ cont:
if (lwb == NULL)
return (NULL);
lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
/*
* There must be enough space in the new, empty log block to
* hold reclen. For WR_COPIED, we need to fit the whole
* record in one block, and reclen is the header size + the
* data size. For WR_NEED_COPY, we can create multiple
* records, splitting the data into multiple blocks, so we
* only need to fit one word of data per block; in this case
* reclen is just the header size (no data).
*/
ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
}
/*
* There must be enough space in the log block to hold reclen.
* For WR_COPIED, we need to fit the whole record in one block,
* and reclen is the write record header size + the data size.
* For WR_NEED_COPY, we can create multiple records, splitting
* the data into multiple blocks, so we only need to fit one
* word of data per block; in this case reclen is just the header
* size (no data).
*/
ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
dnow = MIN(dlen, lwb_sp - reclen);
if (dlen > dnow) {
ASSERT3U(lr->lrc_txtype, ==, TX_WRITE);
@ -2087,6 +2191,7 @@ cont:
clrw->lr_length = dnow;
lrw->lr_offset += dnow;
lrw->lr_length -= dnow;
zilog->zl_cur_left -= dnow;
} else {
citx = itx;
clr = lr;
@ -2108,10 +2213,8 @@ cont:
list_insert_tail(&lwb->lwb_itxs, citx);
dlen -= dnow;
if (dlen > 0) {
zilog->zl_cur_used += reclen;
if (dlen > 0)
goto cont;
}
if (lr->lrc_txtype == TX_WRITE &&
lr->lrc_txg > spa_freeze_txg(zilog->zl_spa))
@ -2138,13 +2241,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
if (lr->lrc_txtype == TX_COMMIT)
return;
if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
dlen = P2ROUNDUP_TYPED(
lrw->lr_length, sizeof (uint64_t), uint64_t);
} else {
dlen = 0;
}
reclen = lr->lrc_reclen;
dlen = zil_itx_data_size(itx);
ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled);
lr_buf = lwb->lwb_buf + lwb->lwb_nfilled;
@ -2252,7 +2350,9 @@ zil_itx_create(uint64_t txtype, size_t olrsize)
size_t itxsize, lrsize;
itx_t *itx;
ASSERT3U(olrsize, >=, sizeof (lr_t));
lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t);
ASSERT3U(lrsize, >=, olrsize);
itxsize = offsetof(itx_t, itx_lr) + lrsize;
itx = zio_data_buf_alloc(itxsize);
@ -2271,6 +2371,10 @@ zil_itx_create(uint64_t txtype, size_t olrsize)
static itx_t *
zil_itx_clone(itx_t *oitx)
{
ASSERT3U(oitx->itx_size, >=, sizeof (itx_t));
ASSERT3U(oitx->itx_size, ==,
offsetof(itx_t, itx_lr) + oitx->itx_lr.lrc_reclen);
itx_t *itx = zio_data_buf_alloc(oitx->itx_size);
memcpy(itx, oitx, oitx->itx_size);
itx->itx_callback = NULL;
@ -2281,6 +2385,9 @@ zil_itx_clone(itx_t *oitx)
void
zil_itx_destroy(itx_t *itx)
{
ASSERT3U(itx->itx_size, >=, sizeof (itx_t));
ASSERT3U(itx->itx_lr.lrc_reclen, ==,
itx->itx_size - offsetof(itx_t, itx_lr));
IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL);
IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
@ -2364,7 +2471,7 @@ void
zil_remove_async(zilog_t *zilog, uint64_t oid)
{
uint64_t otxg, txg;
itx_async_node_t *ian;
itx_async_node_t *ian, ian_search;
avl_tree_t *t;
avl_index_t where;
list_t clean_list;
@ -2391,7 +2498,8 @@ zil_remove_async(zilog_t *zilog, uint64_t oid)
* Locate the object node and append its list.
*/
t = &itxg->itxg_itxs->i_async_tree;
ian = avl_find(t, &oid, &where);
ian_search.ia_foid = oid;
ian = avl_find(t, &ian_search, &where);
if (ian != NULL)
list_move_tail(&clean_list, &ian->ia_list);
mutex_exit(&itxg->itxg_lock);
@ -2565,6 +2673,7 @@ zil_get_commit_list(zilog_t *zilog)
ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
list_t *sync_list = &itxg->itxg_itxs->i_sync_list;
itx_t *itx = NULL;
if (unlikely(zilog->zl_suspend > 0)) {
/*
* ZIL was just suspended, but we lost the race.
@ -2574,10 +2683,20 @@ zil_get_commit_list(zilog_t *zilog)
if (!list_is_empty(sync_list))
wtxg = MAX(wtxg, txg);
} else {
itx = list_head(sync_list);
list_move_tail(commit_list, sync_list);
}
mutex_exit(&itxg->itxg_lock);
while (itx != NULL) {
uint64_t s = zil_itx_full_size(itx);
zilog->zl_cur_size += s;
zilog->zl_cur_left += s;
s = zil_itx_record_size(itx);
zilog->zl_cur_max = MAX(zilog->zl_cur_max, s);
itx = list_next(commit_list, itx);
}
}
return (wtxg);
}
@ -2589,7 +2708,7 @@ void
zil_async_to_sync(zilog_t *zilog, uint64_t foid)
{
uint64_t otxg, txg;
itx_async_node_t *ian;
itx_async_node_t *ian, ian_search;
avl_tree_t *t;
avl_index_t where;
@ -2619,7 +2738,8 @@ zil_async_to_sync(zilog_t *zilog, uint64_t foid)
*/
t = &itxg->itxg_itxs->i_async_tree;
if (foid != 0) {
ian = avl_find(t, &foid, &where);
ian_search.ia_foid = foid;
ian = avl_find(t, &ian_search, &where);
if (ian != NULL) {
list_move_tail(&itxg->itxg_itxs->i_sync_list,
&ian->ia_list);
@ -2712,6 +2832,26 @@ zil_commit_writer_stall(zilog_t *zilog)
ASSERT(list_is_empty(&zilog->zl_lwb_list));
}
static void
zil_burst_done(zilog_t *zilog)
{
if (!list_is_empty(&zilog->zl_itx_commit_list) ||
zilog->zl_cur_size == 0)
return;
if (zilog->zl_parallel)
zilog->zl_parallel--;
uint_t r = (zilog->zl_prev_rotor + 1) & (ZIL_BURSTS - 1);
zilog->zl_prev_rotor = r;
zilog->zl_prev_opt[r] = zil_lwb_plan(zilog, zilog->zl_cur_size,
&zilog->zl_prev_min[r]);
zilog->zl_cur_size = 0;
zilog->zl_cur_max = 0;
zilog->zl_cur_left = 0;
}
/*
* This function will traverse the commit list, creating new lwbs as
* needed, and committing the itxs from the commit list to these newly
@ -2726,7 +2866,6 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
list_t nolwb_waiters;
lwb_t *lwb, *plwb;
itx_t *itx;
boolean_t first = B_TRUE;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
@ -2752,9 +2891,22 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
zil_commit_activate_saxattr_feature(zilog);
ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
lwb->lwb_state == LWB_STATE_OPENED);
first = (lwb->lwb_state == LWB_STATE_NEW) &&
((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL ||
plwb->lwb_state == LWB_STATE_FLUSH_DONE);
/*
* If the lwb is still opened, it means the workload is really
* multi-threaded and we won the chance of write aggregation.
* If it is not opened yet, but the previous lwb is still not
* flushed, it still means the workload is multi-threaded, but
* there was too much time between the commits to aggregate, so
* we keep trying aggregation next time, but with less hope.
*/
if (lwb->lwb_state == LWB_STATE_OPENED) {
zilog->zl_parallel = ZIL_BURSTS;
} else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb))
!= NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) {
zilog->zl_parallel = MAX(zilog->zl_parallel,
ZIL_BURSTS / 2);
}
}
while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
@ -2829,7 +2981,9 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* Our lwb is done, leave the rest of the
* itx list to somebody else who cares.
*/
first = B_FALSE;
zilog->zl_parallel = ZIL_BURSTS;
zilog->zl_cur_left -=
zil_itx_full_size(itx);
break;
}
} else {
@ -2839,8 +2993,10 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
}
list_insert_tail(&nolwb_itxs, itx);
}
zilog->zl_cur_left -= zil_itx_full_size(itx);
} else {
ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT);
zilog->zl_cur_left -= zil_itx_full_size(itx);
zil_itx_destroy(itx);
}
}
@ -2921,28 +3077,15 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* try and pack as many itxs into as few lwbs as
* possible, without significantly impacting the latency
* of each individual itx.
*
* If we had no already running or open LWBs, it can be
* the workload is single-threaded. And if the ZIL write
* latency is very small or if the LWB is almost full, it
* may be cheaper to bypass the delay.
*/
if (lwb->lwb_state == LWB_STATE_OPENED && first) {
hrtime_t sleep = zilog->zl_last_lwb_latency *
zfs_commit_timeout_pct / 100;
if (sleep < zil_min_commit_timeout ||
lwb->lwb_nmax - lwb->lwb_nused <
lwb->lwb_nmax / 8) {
list_insert_tail(ilwbs, lwb);
lwb = zil_lwb_write_close(zilog, lwb,
LWB_STATE_NEW);
zilog->zl_cur_used = 0;
if (lwb == NULL) {
while ((lwb = list_remove_head(ilwbs))
!= NULL)
zil_lwb_write_issue(zilog, lwb);
zil_commit_writer_stall(zilog);
}
if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
zil_burst_done(zilog);
list_insert_tail(ilwbs, lwb);
lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
if (lwb == NULL) {
while ((lwb = list_remove_head(ilwbs)) != NULL)
zil_lwb_write_issue(zilog, lwb);
zil_commit_writer_stall(zilog);
}
}
}
@ -3096,24 +3239,11 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* since we've reached the commit waiter's timeout and it still
* hasn't been issued.
*/
zil_burst_done(zilog);
lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
/*
* Since the lwb's zio hadn't been issued by the time this thread
* reached its timeout, we reset the zilog's "zl_cur_used" field
* to influence the zil block size selection algorithm.
*
* By having to issue the lwb's zio here, it means the size of the
* lwb was too large, given the incoming throughput of itxs. By
* setting "zl_cur_used" to zero, we communicate this fact to the
* block size selection algorithm, so it can take this information
* into account, and potentially select a smaller size for the
* next lwb block that is allocated.
*/
zilog->zl_cur_used = 0;
if (nlwb == NULL) {
/*
* When zil_lwb_write_close() returns NULL, this
@ -3708,7 +3838,9 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
zilog->zl_dirty_max_txg = 0;
zilog->zl_last_lwb_opened = NULL;
zilog->zl_last_lwb_latency = 0;
zilog->zl_max_block_size = zil_maxblocksize;
zilog->zl_max_block_size = MIN(MAX(P2ALIGN_TYPED(zil_maxblocksize,
ZIL_MIN_BLKSZ, uint64_t), ZIL_MIN_BLKSZ),
spa_maxblocksize(dmu_objset_spa(os)));
mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
@ -3728,6 +3860,11 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL);
for (int i = 0; i < ZIL_BURSTS; i++) {
zilog->zl_prev_opt[i] = zilog->zl_max_block_size -
sizeof (zil_chain_t);
}
return (zilog);
}
@ -4230,9 +4367,6 @@ EXPORT_SYMBOL(zil_kstat_values_update);
ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW,
"ZIL block open timeout percentage");
ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW,
"Minimum delay we care for ZIL block commit");
ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
"Disable intent logging replay");

View File

@ -306,6 +306,53 @@ zio_fini(void)
* ==========================================================================
*/
#ifdef ZFS_DEBUG
static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
#endif
/*
* Use empty space after the buffer to detect overflows.
*
* Since zio_init() creates kmem caches only for a certain set of buffer sizes,
* allocations of different sizes may have some unused space after the data.
* Filling part of that space with a known pattern on allocation and checking
* it on free should allow us to detect some buffer overflows.
*/
static void
zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
{
#ifdef ZFS_DEBUG
size_t off = P2ROUNDUP(size, sizeof (ulong_t));
ulong_t *canary = p + off / sizeof (ulong_t);
size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;
if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT &&
cache[c] == cache[c + 1])
asize = (c + 2) << SPA_MINBLOCKSHIFT;
for (; off < asize; canary++, off += sizeof (ulong_t))
*canary = zio_buf_canary;
#endif
}
static void
zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
{
#ifdef ZFS_DEBUG
size_t off = P2ROUNDUP(size, sizeof (ulong_t));
ulong_t *canary = p + off / sizeof (ulong_t);
size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;
if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT &&
cache[c] == cache[c + 1])
asize = (c + 2) << SPA_MINBLOCKSHIFT;
for (; off < asize; canary++, off += sizeof (ulong_t)) {
if (unlikely(*canary != zio_buf_canary)) {
PANIC("ZIO buffer overflow %p (%zu) + %zu %#lx != %#lx",
p, size, (canary - p) * sizeof (ulong_t),
*canary, zio_buf_canary);
}
}
#endif
}
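A worked example under assumed sizes (SPA_MINBLOCKSHIFT = 9, i.e. 512-byte granularity):
/*
 * zio_buf_alloc(5000) selects cache c = (5000 - 1) >> 9 = 9, i.e. a
 * 5120-byte buffer. The canary words cover P2ROUNDUP(5000, 8) = 5000
 * up to 5120 (or 5632 if caches 9 and 10 alias the same kmem cache),
 * and zio_buf_free(buf, 5000) panics if any of them changed.
 */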
/*
* Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
* crashdump if the kernel panics, so use it judiciously. Obviously, it's
@ -322,7 +369,9 @@ zio_buf_alloc(size_t size)
atomic_add_64(&zio_buf_cache_allocs[c], 1);
#endif
return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE);
zio_buf_put_canary(p, size, zio_buf_cache, c);
return (p);
}
/*
@ -338,7 +387,9 @@ zio_data_buf_alloc(size_t size)
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE);
zio_buf_put_canary(p, size, zio_data_buf_cache, c);
return (p);
}
void
@ -351,6 +402,7 @@ zio_buf_free(void *buf, size_t size)
atomic_add_64(&zio_buf_cache_frees[c], 1);
#endif
zio_buf_check_canary(buf, size, zio_buf_cache, c);
kmem_cache_free(zio_buf_cache[c], buf);
}
@ -361,6 +413,7 @@ zio_data_buf_free(void *buf, size_t size)
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
zio_buf_check_canary(buf, size, zio_data_buf_cache, c);
kmem_cache_free(zio_data_buf_cache[c], buf);
}
@ -1382,23 +1435,10 @@ zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_done_func_t *done, void *private, zio_flag_t flags)
{
zio_t *zio;
int c;
if (vd->vdev_children == 0) {
zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
zio->io_cmd = cmd;
} else {
zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
for (c = 0; c < vd->vdev_children; c++)
zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
done, private, flags));
}
zio_t *zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
zio->io_cmd = cmd;
return (zio);
}
@ -1569,11 +1609,18 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
}
void
zio_flush(zio_t *zio, vdev_t *vd)
zio_flush(zio_t *pio, vdev_t *vd)
{
zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
NULL, NULL,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
if (vd->vdev_nowritecache)
return;
if (vd->vdev_children == 0) {
zio_nowait(zio_ioctl(pio, vd->vdev_spa, vd,
DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
} else {
for (uint64_t c = 0; c < vd->vdev_children; c++)
zio_flush(pio, vd->vdev_child[c]);
}
}
void

View File

@ -363,11 +363,14 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
zil_chain_t zilc;
abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t));
size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ,
uint64_t);
uint64_t nused = P2ROUNDUP_TYPED(zilc.zc_nused,
ZIL_MIN_BLKSZ, uint64_t);
ASSERT3U(size, >=, nused);
size = nused;
eck = zilc.zc_eck;
eck_offset = offsetof(zil_chain_t, zc_eck);
} else {
ASSERT3U(size, >=, sizeof (zio_eck_t));
eck_offset = size - sizeof (zio_eck_t);
abd_copy_to_buf_off(&eck, abd, eck_offset,
sizeof (zio_eck_t));
@ -448,12 +451,13 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
return (SET_ERROR(ECKSUM));
}
if (nused > size) {
nused = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
if (size < nused)
return (SET_ERROR(ECKSUM));
}
size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
size = nused;
} else {
if (size < sizeof (zio_eck_t))
return (SET_ERROR(ECKSUM));
eck_offset = size - sizeof (zio_eck_t);
abd_copy_to_buf_off(&eck, abd, eck_offset,
sizeof (zio_eck_t));

View File

@ -451,6 +451,8 @@ zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
lr_truncate_t *lr = arg2;
uint64_t offset, length;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -487,6 +489,8 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
dmu_tx_t *tx;
int error;
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@ -516,60 +520,6 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
return (error);
}
/*
* Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed
* after a system failure.
*
* TODO: For now we drop block cloning transations for ZVOLs as they are
* unsupported, but we still need to inform BRT about that as we
* claimed them during pool import.
* This situation can occur when we try to import a pool from a ZFS
* version supporting block cloning for ZVOLs into a system that
* has this ZFS version, that doesn't support block cloning for ZVOLs.
*/
static int
zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
{
char name[ZFS_MAX_DATASET_NAME_LEN];
zvol_state_t *zv = arg1;
objset_t *os = zv->zv_objset;
lr_clone_range_t *lr = arg2;
blkptr_t *bp;
dmu_tx_t *tx;
spa_t *spa;
uint_t ii;
int error;
dmu_objset_name(os, name);
cmn_err(CE_WARN, "ZFS dropping block cloning transaction for %s.",
name);
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
tx = dmu_tx_create(os);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
return (error);
}
spa = os->os_spa;
for (ii = 0; ii < lr->lr_nbps; ii++) {
bp = &lr->lr_bps[ii];
if (!BP_IS_HOLE(bp)) {
zio_free(spa, dmu_tx_get_txg(tx), bp);
}
}
(void) zil_replaying(zv->zv_zilog, tx);
dmu_tx_commit(tx);
return (0);
}
static int
zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
{
@ -604,7 +554,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
zvol_replay_err, /* TX_SETSAXATTR */
zvol_replay_err, /* TX_RENAME_EXCHANGE */
zvol_replay_err, /* TX_RENAME_WHITEOUT */
zvol_replay_clone_range /* TX_CLONE_RANGE */
zvol_replay_err, /* TX_CLONE_RANGE */
};
/*

View File

@ -53,6 +53,12 @@ tags = ['functional', 'arc']
tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on']
tags = ['functional', 'atime']
[tests/functional/block_cloning]
tests = ['block_cloning_clone_mmap_cached',
'block_cloning_copyfilerange',
'block_cloning_copyfilerange_partial']
tags = ['functional', 'block_cloning']
[tests/functional/bootfs]
tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos',
'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos',

View File

@ -42,6 +42,7 @@ tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial',
'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone',
'block_cloning_disabled_ficlonerange',
'block_cloning_copyfilerange_cross_dataset',
'block_cloning_cross_enc_dataset',
'block_cloning_copyfilerange_fallback_same_txg']
tags = ['functional', 'block_cloning']

View File

@ -270,6 +270,7 @@ if sys.platform.startswith('freebsd'):
})
elif sys.platform.startswith('linux'):
maybe.update({
'block_cloning/block_cloning_clone_mmap_cached': ['SKIP', cfr_reason],
'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason],
'fault/auto_online_002_pos': ['FAIL', 11889],
@ -305,6 +306,8 @@ elif sys.platform.startswith('linux'):
['SKIP', cfr_cross_reason],
'block_cloning/block_cloning_copyfilerange_fallback_same_txg':
['SKIP', cfr_cross_reason],
'block_cloning/block_cloning_cross_enc_dataset':
['SKIP', cfr_cross_reason],
})

View File

@ -2,6 +2,7 @@
/btree_test
/chg_usr_exec
/clonefile
/clone_mmap_cached
/devname2devid
/dir_rd_update
/draid

View File

@ -2,6 +2,7 @@ scripts_zfs_tests_bindir = $(datadir)/$(PACKAGE)/zfs-tests/bin
scripts_zfs_tests_bin_PROGRAMS = %D%/chg_usr_exec
scripts_zfs_tests_bin_PROGRAMS += %D%/clone_mmap_cached
scripts_zfs_tests_bin_PROGRAMS += %D%/cp_files
scripts_zfs_tests_bin_PROGRAMS += %D%/ctime
scripts_zfs_tests_bin_PROGRAMS += %D%/dir_rd_update

View File

@ -0,0 +1,146 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2024 by Pawel Jakub Dawidek
*/
#include <sys/mman.h>
#include <sys/stat.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#ifdef __FreeBSD__
#define loff_t off_t
#endif
ssize_t
copy_file_range(int, loff_t *, int, loff_t *, size_t, unsigned int)
__attribute__((weak));
static void *
mmap_file(int fd, size_t size)
{
void *p;
p = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
if (p == MAP_FAILED) {
(void) fprintf(stderr, "mmap failed: %s\n", strerror(errno));
exit(2);
}
return (p);
}
static void
usage(const char *progname)
{
/*
* -i cache input before copy_file_range(2).
* -o cache output before copy_file_range(2).
*/
(void) fprintf(stderr, "usage: %s [-io] <input> <output>\n", progname);
exit(3);
}
int
main(int argc, char *argv[])
{
int dfd, sfd;
size_t dsize, ssize;
void *dmem, *smem, *ptr;
off_t doff, soff;
struct stat sb;
bool cache_input, cache_output;
const char *progname;
int c;
progname = argv[0];
cache_input = cache_output = false;
while ((c = getopt(argc, argv, "io")) != -1) {
switch (c) {
case 'i':
cache_input = true;
break;
case 'o':
cache_output = true;
break;
default:
usage(progname);
}
}
argc -= optind;
argv += optind;
if (argc != 2) {
usage(progname);
}
sfd = open(argv[0], O_RDONLY);
if (sfd == -1) {
(void) fprintf(stderr, "open failed: %s\n", strerror(errno));
exit(2);
}
if (fstat(sfd, &sb) == -1) {
(void) fprintf(stderr, "fstat failed: %s\n", strerror(errno));
exit(2);
}
ssize = sb.st_size;
smem = mmap_file(sfd, ssize);
dfd = open(argv[1], O_RDWR);
if (dfd == -1) {
(void) fprintf(stderr, "open failed: %s\n", strerror(errno));
exit(2);
}
if (fstat(dfd, &sb) == -1) {
(void) fprintf(stderr, "fstat failed: %s\n", strerror(errno));
exit(2);
}
dsize = sb.st_size;
dmem = mmap_file(dfd, dsize);
/*
* Touch every byte so the file data lands in the page cache;
* hopefully the memcpy() won't be compiled out.
*/
if (cache_input) {
ptr = malloc(ssize);
assert(ptr != NULL);
memcpy(ptr, smem, ssize);
free(ptr);
}
if (cache_output) {
ptr = malloc(dsize);
assert(ptr != NULL);
memcpy(ptr, dmem, dsize);
free(ptr);
}
soff = doff = 0;
if (copy_file_range(sfd, &soff, dfd, &doff, ssize, 0) < 0) {
(void) fprintf(stderr, "copy_file_range failed: %s\n",
strerror(errno));
exit(2);
}
exit(memcmp(smem, dmem, ssize) == 0 ? 0 : 1);
}
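(For context, a minimal sketch of driving this helper by hand; the /tank
paths are hypothetical, and the exit codes follow the code above.)

# -i primes the source file in the page cache and -o primes the
# destination, both before copy_file_range(2) is attempted; the
# destination must already exist since it is opened O_RDWR.
dd if=/dev/urandom of=/tank/src bs=1M count=1
dd if=/dev/urandom of=/tank/dst bs=1M count=1
clone_mmap_cached -io /tank/src /tank/dst
echo $?   # 0 = contents match, 1 = mismatch, 2 = syscall error, 3 = usage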

View File

@ -362,12 +362,20 @@ main(void)
return (1);
}
if (t1 == t2) {
(void) fprintf(stderr, "%s: t1(%ld) == t2(%ld)\n",
/*
* Ideally, time change would be exactly two seconds, but allow
* a little slack in case of scheduling delays or similar.
*/
long delta = (long)t2 - (long)t1;
if (delta < 2 || delta > 4) {
(void) fprintf(stderr,
"%s: BAD time change: t1(%ld), t2(%ld)\n",
timetest_table[i].name, (long)t1, (long)t2);
return (1);
} else {
(void) fprintf(stderr, "%s: t1(%ld) != t2(%ld)\n",
(void) fprintf(stderr,
"%s: good time change: t1(%ld), t2(%ld)\n",
timetest_table[i].name, (long)t1, (long)t2);
}
}

View File

@ -184,6 +184,7 @@ export ZFSTEST_FILES='badsend
btree_test
chg_usr_exec
clonefile
clone_mmap_cached
devname2devid
dir_rd_update
draid

View File

@ -440,6 +440,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/atime/setup.ksh \
functional/block_cloning/cleanup.ksh \
functional/block_cloning/setup.ksh \
functional/block_cloning/block_cloning_clone_mmap_cached.ksh \
functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \
functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \
functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \
@ -451,6 +452,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/block_cloning/block_cloning_ficlone.ksh \
functional/block_cloning/block_cloning_ficlonerange.ksh \
functional/block_cloning/block_cloning_ficlonerange_partial.ksh \
functional/block_cloning/block_cloning_cross_enc_dataset.ksh \
functional/bootfs/bootfs_001_pos.ksh \
functional/bootfs/bootfs_002_neg.ksh \
functional/bootfs/bootfs_003_pos.ksh \

View File

@ -28,8 +28,8 @@
function have_same_content
{
typeset hash1=$(cat $1 | md5sum)
typeset hash2=$(cat $2 | md5sum)
typeset hash1=$(md5digest $1)
typeset hash2=$(md5digest $2)
log_must [ "$hash1" = "$hash2" ]
}
@ -44,10 +44,14 @@ function have_same_content
#
function get_same_blocks
{
KEY=$5
if [ ${#KEY} -gt 0 ]; then
KEY="--key=$KEY"
fi
typeset zdbout=${TMPDIR:-$TEST_BASE_DIR}/zdbout.$$
zdb -vvvvv $1 -O $2 | \
zdb $KEY -vvvvv $1 -O $2 | \
awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.a
zdb -vvvvv $3 -O $4 | \
zdb $KEY -vvvvv $3 -O $4 | \
awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.b
echo $(sort $zdbout.a $zdbout.b | uniq -d | cut -f1 -d' ')
}
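(A sketch of what the updated helper now runs for an encrypted dataset;
the pool, dataset, and passphrase names are placeholders, and the awk
fields are simply the columns the function above picks out of zdb's L0
block lines.)

# With a 5th argument, zdb is handed the wrapping key so it can
# traverse the encrypted objset. Lines common to both files' output
# mark blocks shared by cloning.
zdb --key=top_secret -vvvvv tank/encrypted1 -O file | \
awk '/ L0 / { print l++ " " $3 " " $7 }'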

View File

@ -0,0 +1,86 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
#
# DESCRIPTION:
# When the destination file is mmapped and already cached, we need to
# update the mmapped pages after a successful clone.
#
# STRATEGY:
# 1. Create a pool.
# 2. Create two test files with random content.
# 3. mmap the files, read them, and clone from one to the other using
# clone_mmap_cached.
# 4. clone_mmap_cached also verifies that the content of the destination
# file was updated while reading it from mmapped memory.
#
verify_runnable "global"
if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then
log_unsupported "copy_file_range not available before Linux 4.5"
fi
VDIR=$TEST_BASE_DIR/disk-bclone
VDEV="$VDIR/a"
function cleanup
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
rm -rf $VDIR
}
log_onexit cleanup
log_assert "Test for clone into mmaped and cached file"
log_must rm -rf $VDIR
log_must mkdir -p $VDIR
log_must truncate -s 1G $VDEV
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV
log_must zfs create $TESTPOOL/$TESTFS
for opts in "--" "-i" "-o" "-io"
do
log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/src bs=1M count=1
log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/dst bs=1M count=1
# Clear cache.
log_must zpool export $TESTPOOL
log_must zpool import -d $VDIR $TESTPOOL
log_must clone_mmap_cached $opts /$TESTPOOL/$TESTFS/src /$TESTPOOL/$TESTFS/dst
sync_pool $TESTPOOL
log_must sync
log_must have_same_content /$TESTPOOL/$TESTFS/src /$TESTPOOL/$TESTFS/dst
blocks=$(get_same_blocks $TESTPOOL/$TESTFS src $TESTPOOL/$TESTFS dst)
# FreeBSD's seq(1) leaves a trailing space, remove it with sed(1).
log_must [ "$blocks" = "$(seq -s " " 0 7 | sed 's/ $//')" ]
done
log_pass "Clone properly updates mmapped and cached pages"

View File

@ -0,0 +1,170 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Kay Pedersen <mail@mkwg.de>
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
verify_runnable "global"
if [[ $(linux_version) -lt $(linux_version "5.3") ]]; then
log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3"
fi
claim="Block cloning across encrypted datasets."
log_assert $claim
DS1="$TESTPOOL/encrypted1"
DS2="$TESTPOOL/encrypted2"
DS1_NC="$TESTPOOL/notcrypted1"
PASSPHRASE="top_secret"
function prepare_enc
{
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \
"-o keyformat=passphrase -o keylocation=prompt $DS1"
log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \
"-o keyformat=passphrase -o keylocation=prompt $DS2"
log_must zfs create $DS1/child1
log_must zfs create $DS1/child2
log_must zfs create $DS1_NC
log_note "Create test file"
# we must wait until the src file txg is written to the disk otherwise we
# will fallback to normal copy. See "dmu_read_l0_bps" in
# "zfs/module/zfs/dmu.c" and "zfs_clone_range" in
# "zfs/module/zfs/zfs_vnops.c"
log_must dd if=/dev/urandom of=/$DS1/file bs=128K count=4
log_must dd if=/dev/urandom of=/$DS1/child1/file bs=128K count=4
log_must dd if=/dev/urandom of=/$DS1_NC/file bs=128K count=4
log_must sync_pool $TESTPOOL
}
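(A hedged illustration of the sync requirement described in the comment
above; the "fresh" file names are hypothetical. Without sync_pool, the
freshly written blocks have no block pointers on disk yet, so the kernel
falls back to an ordinary copy and no blocks end up shared.)

log_must dd if=/dev/urandom of=/$DS1/fresh bs=128K count=4
log_must clonefile -f /$DS1/fresh /$DS1/fresh_clone 0 0 524288
typeset blocks=$(get_same_blocks $DS1 fresh $DS1 fresh_clone $PASSPHRASE)
log_must [ "$blocks" = "" ]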
function cleanup_enc
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
}
function clone_and_check
{
I_FILE="$1"
O_FILE=$2
I_DS=$3
O_DS=$4
SAME_BLOCKS=$5
# The CLONE option selects between copy_file_range(2), which should
# clone, and dd(1), which always copies.
CLONE=$6
SNAPSHOT=$7
if [ ${#SNAPSHOT} -gt 0 ]; then
I_FILE=".zfs/snapshot/$SNAPSHOT/$1"
fi
if [ "$CLONE" = "true" ]; then
log_must clonefile -f "/$I_DS/$I_FILE" "/$O_DS/$O_FILE" 0 0 524288
else
log_must dd if="/$I_DS/$I_FILE" of="/$O_DS/$O_FILE" bs=128K
fi
log_must sync_pool $TESTPOOL
log_must have_same_content "/$I_DS/$I_FILE" "/$O_DS/$O_FILE"
if [ ${#SNAPSHOT} -gt 0 ]; then
I_DS="$I_DS@$SNAPSHOT"
I_FILE="$1"
fi
typeset blocks=$(get_same_blocks \
$I_DS $I_FILE $O_DS $O_FILE $PASSPHRASE)
log_must [ "$blocks" = "$SAME_BLOCKS" ]
}
log_onexit cleanup_enc
prepare_enc
log_note "Cloning entire file with copy_file_range across different enc" \
"roots, should fallback"
# we are expecting no same block map.
clone_and_check "file" "clone" $DS1 $DS2 "" true
log_note "check if the file is still readable and the same after" \
"unmount and key unload, shouldn't fail"
typeset hash1=$(md5digest "/$DS1/file")
log_must zfs umount $DS1
log_must zfs unload-key $DS1
typeset hash2=$(md5digest "/$DS2/clone")
log_must [ "$hash1" = "$hash2" ]
cleanup_enc
prepare_enc
log_note "Cloning entire file with copy_file_range across different child datasets"
# Cloning shouldn't work because the child derives its own master key;
# we expect no shared block map.
clone_and_check "file" "clone" $DS1 "$DS1/child1" "" true
clone_and_check "file" "clone" "$DS1/child1" "$DS1/child2" "" true
cleanup_enc
prepare_enc
log_note "Copying entire file with copy_file_range across same snapshot"
log_must zfs snapshot -r $DS1@s1
log_must sync_pool $TESTPOOL
log_must rm -f "/$DS1/file"
log_must sync_pool $TESTPOOL
clone_and_check "file" "clone" "$DS1" "$DS1" "0 1 2 3" true "s1"
cleanup_enc
prepare_enc
log_note "Copying entire file with copy_file_range across different snapshot"
clone_and_check "file" "file" $DS1 $DS2 "" true
log_must zfs snapshot -r $DS2@s1
log_must sync_pool $TESTPOOL
log_must rm -f "/$DS1/file" "/$DS2/file"
log_must sync_pool $TESTPOOL
clone_and_check "file" "clone" "$DS2" "$DS1" "" true "s1"
typeset hash1=$(md5digest "/$DS1/.zfs/snapshot/s1/file")
log_note "destroy the snapshot and check if the file is still readable and" \
"has the same content"
log_must zfs destroy -r $DS2@s1
log_must sync_pool $TESTPOOL
typeset hash2=$(md5digest "/$DS1/file")
log_must [ "$hash1" = "$hash2" ]
cleanup_enc
prepare_enc
log_note "Copying with copy_file_range from non encrypted to encrypted"
clone_and_check "file" "copy" $DS1_NC $DS1 "" true
cleanup_enc
prepare_enc
log_note "Copying with copy_file_range from encrypted to non encrypted"
clone_and_check "file" "copy" $DS1 $DS1_NC "" true
log_must sync_pool $TESTPOOL
log_pass $claim

View File

@ -30,6 +30,9 @@
if ! command -v clonefile > /dev/null ; then
log_unsupported "clonefile program required to test block cloning"
fi
if ! command -v clone_mmap_cached > /dev/null ; then
log_unsupported "clone_mmap_cached program required to test block cloning"
fi
verify_runnable "global"

View File

@ -31,15 +31,13 @@
# 2. Set l2arc_write_max to a value larger than the cache device.
# 3. Create a file larger than the cache device and random read
# for 10 sec.
# 4. Verify that l2arc_write_max is set back to the default.
# 5. Set l2arc_write_max to a value less than the cache device size but
# 4. Set l2arc_write_max to a value less than the cache device size but
# larger than the default (256MB).
# 6. Record the l2_size.
# 7. Random read for 1 sec.
# 8. Record the l2_size again.
# 9. If (6) <= (8) then we have not looped around yet.
# 10. If (6) > (8) then we looped around. Break out of the loop and test.
# 11. Destroy pool.
# 5. Record the l2_size.
# 6. Random read for 1 sec.
# 7. Record the l2_size again.
# 8. If (5) <= (7) then we have not looped around yet.
# 9. Destroy pool.
#
verify_runnable "global"
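(The loop-around detection those renumbered steps describe, sketched
with the suite's helpers; the get_arcstat/l2_size naming is assumed from
the test library rather than shown in this hunk.)

typeset do_once=true
while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do
typeset l2_size1=$(get_arcstat l2_size)
log_must fio $FIO_SCRIPTS/random_reads.fio
typeset l2_size2=$(get_arcstat l2_size)
do_once=false
done
# Once the later sample is smaller, the device has wrapped at least once.
log_must test $l2_size1 -gt $l2_size2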
@ -93,10 +91,6 @@ log_must zfs set relatime=off $TESTPOOL
log_must fio $FIO_SCRIPTS/mkfiles.fio
log_must fio $FIO_SCRIPTS/random_reads.fio
typeset write_max2=$(get_tunable L2ARC_WRITE_MAX)
log_must test $write_max2 -eq $write_max
log_must set_tunable32 L2ARC_WRITE_MAX $(( 256 * 1024 * 1024 ))
export RUNTIME=1
@ -108,8 +102,6 @@ while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do
do_once=false
done
log_must test $l2_size1 -gt $l2_size2
log_must zpool destroy $TESTPOOL
log_pass "Looping around a cache device succeeds."

View File

@ -44,6 +44,13 @@ if ! $(grep -q "CONFIG_IO_URING=y" /boot/config-$(uname -r)); then
log_unsupported "Requires io_uring support"
fi
if [ -e /etc/os-release ] ; then
source /etc/os-release
if [ -n "$REDHAT_SUPPORT_PRODUCT_VERSION" ] && ((floor($REDHAT_SUPPORT_PRODUCT_VERSION) == 9)) ; then
log_unsupported "Disabled on CentOS 9, fails with 'Operation not permitted'"
fi
fi
fio --ioengine=io_uring --parse-only || log_unsupported "fio io_uring support required"
function cleanup