diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index 970c45c9b3..63d95ddedc 100644 --- a/cmd/zdb/zdb_il.c +++ b/cmd/zdb/zdb_il.c @@ -168,7 +168,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg) (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset, (u_longlong_t)lr->lr_length); - if (txtype == TX_WRITE2 || verbose < 5) + if (txtype == TX_WRITE2 || verbose < 4) return; if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { @@ -178,6 +178,8 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg) "will claim" : "won't claim"); print_log_bp(bp, tab_prefix); + if (verbose < 5) + return; if (BP_IS_HOLE(bp)) { (void) printf("\t\t\tLSIZE 0x%llx\n", (u_longlong_t)BP_GET_LSIZE(bp)); @@ -202,6 +204,9 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg) if (error) goto out; } else { + if (verbose < 5) + return; + /* data is stored after the end of the lr_write record */ data = abd_alloc(lr->lr_length, B_FALSE); abd_copy_from_buf(data, lr + 1, lr->lr_length); @@ -217,6 +222,28 @@ out: abd_free(data); } +static void +zil_prt_rec_write_enc(zilog_t *zilog, int txtype, const void *arg) +{ + (void) txtype; + const lr_write_t *lr = arg; + const blkptr_t *bp = &lr->lr_blkptr; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + + (void) printf("%s(encrypted)\n", tab_prefix); + + if (verbose < 4) + return; + + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + (void) printf("%shas blkptr, %s\n", tab_prefix, + !BP_IS_HOLE(bp) && + bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ? + "will claim" : "won't claim"); + print_log_bp(bp, tab_prefix); + } +} + static void zil_prt_rec_truncate(zilog_t *zilog, int txtype, const void *arg) { @@ -312,11 +339,34 @@ zil_prt_rec_clone_range(zilog_t *zilog, int txtype, const void *arg) { (void) zilog, (void) txtype; const lr_clone_range_t *lr = arg; + int verbose = MAX(dump_opt['d'], dump_opt['i']); (void) printf("%sfoid %llu, offset %llx, length %llx, blksize %llx\n", tab_prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset, (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blksz); + if (verbose < 4) + return; + + for (unsigned int i = 0; i < lr->lr_nbps; i++) { + (void) printf("%s[%u/%llu] ", tab_prefix, i + 1, + (u_longlong_t)lr->lr_nbps); + print_log_bp(&lr->lr_bps[i], ""); + } +} + +static void +zil_prt_rec_clone_range_enc(zilog_t *zilog, int txtype, const void *arg) +{ + (void) zilog, (void) txtype; + const lr_clone_range_t *lr = arg; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + + (void) printf("%s(encrypted)\n", tab_prefix); + + if (verbose < 4) + return; + for (unsigned int i = 0; i < lr->lr_nbps; i++) { (void) printf("%s[%u/%llu] ", tab_prefix, i + 1, (u_longlong_t)lr->lr_nbps); @@ -327,6 +377,7 @@ zil_prt_rec_clone_range(zilog_t *zilog, int txtype, const void *arg) typedef void (*zil_prt_rec_func_t)(zilog_t *, int, const void *); typedef struct zil_rec_info { zil_prt_rec_func_t zri_print; + zil_prt_rec_func_t zri_print_enc; const char *zri_name; uint64_t zri_count; } zil_rec_info_t; @@ -341,7 +392,9 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = { {.zri_print = zil_prt_rec_remove, .zri_name = "TX_RMDIR "}, {.zri_print = zil_prt_rec_link, .zri_name = "TX_LINK "}, {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME "}, - {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE "}, + {.zri_print = zil_prt_rec_write, + .zri_print_enc = zil_prt_rec_write_enc, + .zri_name = "TX_WRITE "}, {.zri_print = zil_prt_rec_truncate, .zri_name = "TX_TRUNCATE "}, {.zri_print = zil_prt_rec_setattr, .zri_name 
= "TX_SETATTR "},
	{.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_V0 "},
@@ -358,6 +411,7 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
	{.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_EXCHANGE "},
	{.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_WHITEOUT "},
	{.zri_print = zil_prt_rec_clone_range,
+	    .zri_print_enc = zil_prt_rec_clone_range_enc,
	    .zri_name = "TX_CLONE_RANGE "},
 };
 
@@ -384,6 +438,8 @@ print_log_record(zilog_t *zilog, const lr_t *lr, void *arg, uint64_t claim_txg)
	if (txtype && verbose >= 3) {
		if (!zilog->zl_os->os_encrypted) {
			zil_rec_info[txtype].zri_print(zilog, txtype, lr);
+		} else if (zil_rec_info[txtype].zri_print_enc) {
+			zil_rec_info[txtype].zri_print_enc(zilog, txtype, lr);
		} else {
			(void) printf("%s(encrypted)\n", tab_prefix);
		}
diff --git a/config/kernel-current-time.m4 b/config/kernel-current-time.m4
index 3ceb5f63ef..ab7d9c5ced 100644
--- a/config/kernel-current-time.m4
+++ b/config/kernel-current-time.m4
@@ -2,12 +2,15 @@
 dnl #
 dnl # 4.9, current_time() added
 dnl # 4.18, return type changed from timespec to timespec64
 dnl #
+dnl # Note that we don't care about the return type in this check. If we have
+dnl # to implement a fallback, we'll know we're <4.9, which was timespec.
+dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_CURRENT_TIME], [
	ZFS_LINUX_TEST_SRC([current_time], [
		#include <linux/fs.h>
	], [
		struct inode ip __attribute__ ((unused));
-		ip.i_atime = current_time(&ip);
+		(void) current_time(&ip);
	])
 ])
diff --git a/config/kernel-flush_dcache_page.m4 b/config/kernel-flush_dcache_page.m4
index 2340c386ef..aa916c87d5 100644
--- a/config/kernel-flush_dcache_page.m4
+++ b/config/kernel-flush_dcache_page.m4
@@ -1,7 +1,8 @@
 dnl #
 dnl # Starting from Linux 5.13, flush_dcache_page() becomes an inline
-dnl # function and may indirectly referencing GPL-only cpu_feature_keys on
-dnl # powerpc
+dnl # function and may indirectly reference GPL-only symbols:
+dnl # on powerpc: cpu_feature_keys
+dnl # on riscv: PageHuge (added in 6.2)
 dnl #
 dnl #
diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4
index 0acb3c1ef0..f419f29825 100644
--- a/config/kernel-fpu.m4
+++ b/config/kernel-fpu.m4
@@ -79,6 +79,12 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_FPU], [
		__kernel_fpu_end();
	], [], [])
+	ZFS_LINUX_TEST_SRC([kernel_neon], [
+		#include <asm/neon.h>
+	], [
+		kernel_neon_begin();
+		kernel_neon_end();
+	], [], [ZFS_META_LICENSE])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_FPU], [
@@ -105,9 +111,20 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [
			AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1,
			    [kernel exports FPU functions])
		],[
-			AC_MSG_RESULT(internal)
-			AC_DEFINE(HAVE_KERNEL_FPU_INTERNAL, 1,
-			    [kernel fpu internal])
+			dnl #
+			dnl # ARM NEON symbols (only on arm and arm64)
+			dnl # may be GPL-only on arm64 after Linux 6.2
+			dnl #
+			ZFS_LINUX_TEST_RESULT([kernel_neon_license],[
+				AC_MSG_RESULT(kernel_neon_*)
+				AC_DEFINE(HAVE_KERNEL_NEON, 1,
+				    [kernel has kernel_neon_* functions])
+			],[
+				# catch-all
+				AC_MSG_RESULT(internal)
+				AC_DEFINE(HAVE_KERNEL_FPU_INTERNAL, 1,
+				    [kernel fpu internal])
+			])
		])
	])
 ])
diff --git a/config/kernel-inode-times.m4 b/config/kernel-inode-times.m4
index aae95abf17..4d861596ed 100644
--- a/config/kernel-inode-times.m4
+++ b/config/kernel-inode-times.m4
@@ -52,6 +52,48 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [
		memset(&ip, 0, sizeof(ip));
		inode_set_ctime_to_ts(&ip, ts);
	])
+
+	dnl #
+	dnl # 6.7 API change
+	dnl # i_atime/i_mtime are no longer directly accessible; they must be
+	dnl # read and written through inode_get_atime()/inode_get_mtime() and
+	dnl # inode_set_atime_to_ts()/inode_set_mtime_to_ts().
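+	dnl #
+	dnl # A minimal usage sketch (assuming a 6.7+ kernel):
+	dnl #
+	dnl #	struct timespec64 ts = inode_get_mtime(ip);
+	dnl #	inode_set_mtime_to_ts(ip, ts);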
+	dnl #
+	ZFS_LINUX_TEST_SRC([inode_get_atime], [
+		#include <linux/fs.h>
+	],[
+		struct inode ip;
+
+		memset(&ip, 0, sizeof(ip));
+		inode_get_atime(&ip);
+	])
+	ZFS_LINUX_TEST_SRC([inode_get_mtime], [
+		#include <linux/fs.h>
+	],[
+		struct inode ip;
+
+		memset(&ip, 0, sizeof(ip));
+		inode_get_mtime(&ip);
+	])
+
+	ZFS_LINUX_TEST_SRC([inode_set_atime_to_ts], [
+		#include <linux/fs.h>
+	],[
+		struct inode ip;
+		struct timespec64 ts = {0};
+
+		memset(&ip, 0, sizeof(ip));
+		inode_set_atime_to_ts(&ip, ts);
+	])
+	ZFS_LINUX_TEST_SRC([inode_set_mtime_to_ts], [
+		#include <linux/fs.h>
+	],[
+		struct inode ip;
+		struct timespec64 ts = {0};
+
+		memset(&ip, 0, sizeof(ip));
+		inode_set_mtime_to_ts(&ip, ts);
+	])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [
@@ -90,4 +132,40 @@ AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [
	],[
		AC_MSG_RESULT(no)
	])
+
+	AC_MSG_CHECKING([whether inode_get_atime() exists])
+	ZFS_LINUX_TEST_RESULT([inode_get_atime], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_INODE_GET_ATIME, 1,
+		    [inode_get_atime() exists in linux/fs.h])
+	],[
+		AC_MSG_RESULT(no)
+	])
+
+	AC_MSG_CHECKING([whether inode_set_atime_to_ts() exists])
+	ZFS_LINUX_TEST_RESULT([inode_set_atime_to_ts], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_INODE_SET_ATIME_TO_TS, 1,
+		    [inode_set_atime_to_ts() exists in linux/fs.h])
+	],[
+		AC_MSG_RESULT(no)
+	])
+
+	AC_MSG_CHECKING([whether inode_get_mtime() exists])
+	ZFS_LINUX_TEST_RESULT([inode_get_mtime], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_INODE_GET_MTIME, 1,
+		    [inode_get_mtime() exists in linux/fs.h])
+	],[
+		AC_MSG_RESULT(no)
+	])
+
+	AC_MSG_CHECKING([whether inode_set_mtime_to_ts() exists])
+	ZFS_LINUX_TEST_RESULT([inode_set_mtime_to_ts], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_INODE_SET_MTIME_TO_TS, 1,
+		    [inode_set_mtime_to_ts() exists in linux/fs.h])
+	],[
+		AC_MSG_RESULT(no)
+	])
 ])
diff --git a/config/kernel-shrink.m4 b/config/kernel-shrink.m4
index 0c702153e8..4a529c43b5 100644
--- a/config/kernel-shrink.m4
+++ b/config/kernel-shrink.m4
@@ -19,12 +19,44 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK], [
	],[])
 ])
 
+dnl #
+dnl # 6.7 API change
+dnl # s_shrink is now a pointer.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK_PTR], [
+	ZFS_LINUX_TEST_SRC([super_block_s_shrink_ptr], [
+		#include <linux/fs.h>
+		unsigned long shrinker_cb(struct shrinker *shrink,
+		    struct shrink_control *sc) { return 0; }
+		static struct shrinker shrinker = {
+			.count_objects = shrinker_cb,
+			.scan_objects = shrinker_cb,
+			.seeks = DEFAULT_SEEKS,
+		};
+		static const struct super_block
+		    sb __attribute__ ((unused)) = {
+			.s_shrink = &shrinker,
+		};
+	],[])
+])
+
 AC_DEFUN([ZFS_AC_KERNEL_SUPER_BLOCK_S_SHRINK], [
	AC_MSG_CHECKING([whether super_block has s_shrink])
	ZFS_LINUX_TEST_RESULT([super_block_s_shrink], [
		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_SUPER_BLOCK_S_SHRINK, 1,
+		    [have super_block s_shrink])
	],[
-		ZFS_LINUX_TEST_ERROR([sb->s_shrink()])
+		AC_MSG_RESULT(no)
+		AC_MSG_CHECKING([whether super_block has s_shrink pointer])
+		ZFS_LINUX_TEST_RESULT([super_block_s_shrink_ptr], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE(HAVE_SUPER_BLOCK_S_SHRINK_PTR, 1,
+			    [have super_block s_shrink pointer])
+		],[
+			AC_MSG_RESULT(no)
+			ZFS_LINUX_TEST_ERROR([sb->s_shrink()])
+		])
	])
 ])
 
@@ -96,6 +128,25 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK], [
	])
 ])
 
+dnl #
+dnl # 6.7 API change
+dnl # register_shrinker has been replaced by shrinker_register.
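+dnl # A sketch of the new pattern (the names come from the 6.7 kernel API,
+dnl # as used by the spl-shrinker.c implementation further below;
+dnl # my_count/my_scan are placeholder callbacks):
+dnl #
+dnl #	struct shrinker *s = shrinker_alloc(0, "my-shrinker");
+dnl #	s->count_objects = my_count;
+dnl #	s->scan_objects = my_scan;
+dnl #	shrinker_register(s);
+dnl #	...
+dnl #	shrinker_free(s);	/* replaces unregister_shrinker() */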
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER_REGISTER], [ + ZFS_LINUX_TEST_SRC([shrinker_register], [ + #include + unsigned long shrinker_cb(struct shrinker *shrink, + struct shrink_control *sc) { return 0; } + ],[ + struct shrinker cache_shrinker = { + .count_objects = shrinker_cb, + .scan_objects = shrinker_cb, + .seeks = DEFAULT_SEEKS, + }; + shrinker_register(&cache_shrinker); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SHRINKER_CALLBACK],[ dnl # dnl # 6.0 API change @@ -133,14 +184,36 @@ AC_DEFUN([ZFS_AC_KERNEL_SHRINKER_CALLBACK],[ dnl # cs->shrink() is logically split in to dnl # cs->count_objects() and cs->scan_objects() dnl # - AC_MSG_CHECKING([if cs->count_objects callback exists]) + AC_MSG_CHECKING( + [whether cs->count_objects callback exists]) ZFS_LINUX_TEST_RESULT( - [shrinker_cb_shrink_control_split],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, 1, - [cs->count_objects exists]) + [shrinker_cb_shrink_control_split],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, 1, + [cs->count_objects exists]) ],[ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING( + [whether shrinker_register exists]) + ZFS_LINUX_TEST_RESULT([shrinker_register], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SHRINKER_REGISTER, 1, + [shrinker_register exists]) + + dnl # We assume that the split shrinker + dnl # callback exists if + dnl # shrinker_register() exists, + dnl # because the latter is a much more + dnl # recent addition, and the macro + dnl # test for shrinker_register() only + dnl # works if the callback is split + AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, + 1, [cs->count_objects exists]) + ],[ + AC_MSG_RESULT(no) ZFS_LINUX_TEST_ERROR([shrinker]) + ]) ]) ]) ]) @@ -174,10 +247,12 @@ AC_DEFUN([ZFS_AC_KERNEL_SHRINK_CONTROL_STRUCT], [ AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER], [ ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK + ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK_PTR ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_HAS_NID ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_STRUCT ZFS_AC_KERNEL_SRC_REGISTER_SHRINKER_VARARG + ZFS_AC_KERNEL_SRC_SHRINKER_REGISTER ]) AC_DEFUN([ZFS_AC_KERNEL_SHRINKER], [ diff --git a/config/kernel.m4 b/config/kernel.m4 index 056517a841..d25b65994f 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -168,6 +168,9 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE ;; + riscv*) + ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE + ;; esac AC_MSG_CHECKING([for available kernel interfaces]) @@ -310,6 +313,9 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_CPU_HAS_FEATURE ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE ;; + riscv*) + ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE + ;; esac ]) diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index 77ce75ca3f..150e50380d 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -91,6 +91,12 @@ #define param_set_max_auto_ashift_args(var) \ CTLTYPE_UINT, NULL, 0, param_set_max_auto_ashift, "IU" +#define spa_taskq_read_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, spa_taskq_read_param, "A" + +#define spa_taskq_write_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, spa_taskq_write_param, "A" + #define fletcher_4_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A" diff --git a/include/os/freebsd/spl/sys/uio.h b/include/os/freebsd/spl/sys/uio.h index b71f2f2e56..b9d41903ea 100644 --- a/include/os/freebsd/spl/sys/uio.h +++ b/include/os/freebsd/spl/sys/uio.h @@ -62,7 +62,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) 
}
 
 static inline void
-zfs_uio_advance(zfs_uio_t *uio, size_t size)
+zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
 {
	zfs_uio_resid(uio) -= size;
	zfs_uio_offset(uio) += size;
 }
diff --git a/include/os/linux/kernel/linux/dcache_compat.h b/include/os/linux/kernel/linux/dcache_compat.h
index 1e35204932..ab1711b99f 100644
--- a/include/os/linux/kernel/linux/dcache_compat.h
+++ b/include/os/linux/kernel/linux/dcache_compat.h
@@ -42,8 +42,8 @@
 /*
  * Starting from Linux 5.13, flush_dcache_page() becomes an inline function
  * and under some configurations, may indirectly referencing GPL-only
- * cpu_feature_keys on powerpc. Override this function when it is detected
- * being GPL-only.
+ * symbols, e.g., cpu_feature_keys on powerpc and PageHuge on riscv.
+ * Override this function when it is detected to be GPL-only.
  */
 #if defined __powerpc__ && defined HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY
 #include
@@ -53,6 +53,17 @@
			clear_bit(PG_dcache_clean, &(page)->flags);	\
	} while (0)
 #endif
+/*
+ * For the riscv implementation, the use of PageHuge can be safely removed:
+ * it only handles pages allocated by HugeTLB, while flush_dcache_page in
+ * the zfs module is only called on kernel pages.
+ */
+#if defined __riscv && defined HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY
+#define flush_dcache_page(page) do { \
+		if (test_bit(PG_dcache_clean, &(page)->flags))	\
+			clear_bit(PG_dcache_clean, &(page)->flags);	\
+	} while (0)
+#endif
 
 /*
  * 2.6.30 API change,
diff --git a/include/os/linux/kernel/linux/simd_aarch64.h b/include/os/linux/kernel/linux/simd_aarch64.h
index 16276b08c7..123a0c72bc 100644
--- a/include/os/linux/kernel/linux/simd_aarch64.h
+++ b/include/os/linux/kernel/linux/simd_aarch64.h
@@ -71,9 +71,15 @@
 #define	ID_AA64PFR0_EL1		sys_reg(3, 0, 0, 1, 0)
 #define	ID_AA64ISAR0_EL1	sys_reg(3, 0, 0, 6, 0)
 
+#if (defined(HAVE_KERNEL_NEON) && defined(CONFIG_KERNEL_MODE_NEON))
 #define	kfpu_allowed()		1
 #define	kfpu_begin()		kernel_neon_begin()
 #define	kfpu_end()		kernel_neon_end()
+#else
+#define	kfpu_allowed()		0
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
+#endif
 #define	kfpu_init()		(0)
 #define	kfpu_fini()		do {} while (0)
diff --git a/include/os/linux/kernel/linux/simd_arm.h b/include/os/linux/kernel/linux/simd_arm.h
index c432a6d4ab..bc70eaef30 100644
--- a/include/os/linux/kernel/linux/simd_arm.h
+++ b/include/os/linux/kernel/linux/simd_arm.h
@@ -53,9 +53,15 @@
 #include
 #include
 
+#if (defined(HAVE_KERNEL_NEON) && defined(CONFIG_KERNEL_MODE_NEON))
 #define	kfpu_allowed()		1
 #define	kfpu_begin()		kernel_neon_begin()
 #define	kfpu_end()		kernel_neon_end()
+#else
+#define	kfpu_allowed()		0
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
+#endif
 #define	kfpu_init()		(0)
 #define	kfpu_fini()		do {} while (0)
diff --git a/include/os/linux/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h
index 82d50b6034..b159bb52d1 100644
--- a/include/os/linux/spl/sys/kmem_cache.h
+++ b/include/os/linux/spl/sys/kmem_cache.h
@@ -70,8 +70,6 @@ typedef enum kmem_cbrc {
 #define	KMC_REAP_CHUNK		INT_MAX
 #define	KMC_DEFAULT_SEEKS	1
 
-#define	KMC_RECLAIM_ONCE	0x1	/* Force a single shrinker pass */
-
 extern struct list_head spl_kmem_cache_list;
 extern struct rw_semaphore spl_kmem_cache_sem;
diff --git a/include/os/linux/spl/sys/shrinker.h b/include/os/linux/spl/sys/shrinker.h
index d472754be4..bca4c85069 100644
--- a/include/os/linux/spl/sys/shrinker.h
+++ b/include/os/linux/spl/sys/shrinker.h
@@ -29,12 +29,13 @@
 /*
  * Due to frequent changes in the shrinker API the following
- * compatibility wrappers should be
used. They are as follows: + * compatibility wrapper should be used. * - * SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost); + * shrinker = spl_register_shrinker(name, countfunc, scanfunc, seek_cost); + * spl_unregister_shrinker(shrinker); * - * SPL_SHRINKER_DECLARE is used to declare a shrinker with the name varname, - * which is passed to spl_register_shrinker()/spl_unregister_shrinker(). + * spl_register_shrinker is used to create and register a shrinker with the + * given name. * The countfunc returns the number of free-able objects. * The scanfunc returns the number of objects that were freed. * The callbacks can return SHRINK_STOP if further calls can't make any more @@ -57,57 +58,28 @@ * ...scan objects in the cache and reclaim them... * } * - * SPL_SHRINKER_DECLARE(my_shrinker, my_count, my_scan, DEFAULT_SEEKS); + * static struct shrinker *my_shrinker; * * void my_init_func(void) { - * spl_register_shrinker(&my_shrinker); + * my_shrinker = spl_register_shrinker("my-shrinker", + * my_count, my_scan, DEFAULT_SEEKS); + * } + * + * void my_fini_func(void) { + * spl_unregister_shrinker(my_shrinker); * } */ -#ifdef HAVE_REGISTER_SHRINKER_VARARG -#define spl_register_shrinker(x) register_shrinker(x, "zfs-arc-shrinker") -#else -#define spl_register_shrinker(x) register_shrinker(x) -#endif -#define spl_unregister_shrinker(x) unregister_shrinker(x) +typedef unsigned long (*spl_shrinker_cb) + (struct shrinker *, struct shrink_control *); -/* - * Linux 3.0 to 3.11 Shrinker API Compatibility. - */ -#if defined(HAVE_SINGLE_SHRINKER_CALLBACK) -#define SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost) \ -static int \ -__ ## varname ## _wrapper(struct shrinker *shrink, struct shrink_control *sc)\ -{ \ - if (sc->nr_to_scan != 0) { \ - (void) scanfunc(shrink, sc); \ - } \ - return (countfunc(shrink, sc)); \ -} \ - \ -static struct shrinker varname = { \ - .shrink = __ ## varname ## _wrapper, \ - .seeks = seek_cost, \ -} +struct shrinker *spl_register_shrinker(const char *name, + spl_shrinker_cb countfunc, spl_shrinker_cb scanfunc, int seek_cost); +void spl_unregister_shrinker(struct shrinker *); +#ifndef SHRINK_STOP +/* 3.0-3.11 compatibility */ #define SHRINK_STOP (-1) - -/* - * Linux 3.12 and later Shrinker API Compatibility. - */ -#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) -#define SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost) \ -static struct shrinker varname = { \ - .count_objects = countfunc, \ - .scan_objects = scanfunc, \ - .seeks = seek_cost, \ -} - -#else -/* - * Linux 2.x to 2.6.22, or a newer shrinker API has been introduced. 
- */ -#error "Unknown shrinker callback" #endif #endif /* SPL_SHRINKER_H */ diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index a4b600004c..5e6ea8d3c2 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -95,7 +95,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) } static inline void -zfs_uio_advance(zfs_uio_t *uio, size_t size) +zfs_uio_advance(zfs_uio_t *uio, ssize_t size) { uio->uio_resid -= size; uio->uio_loffset += size; diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index 707c6ddb11..cfb2a6ed3e 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -282,5 +282,25 @@ extern long zpl_ioctl_fideduperange(struct file *filp, void *arg); #else #define zpl_inode_set_ctime_to_ts(ip, ts) (ip->i_ctime = ts) #endif +#ifdef HAVE_INODE_GET_ATIME +#define zpl_inode_get_atime(ip) inode_get_atime(ip) +#else +#define zpl_inode_get_atime(ip) (ip->i_atime) +#endif +#ifdef HAVE_INODE_SET_ATIME_TO_TS +#define zpl_inode_set_atime_to_ts(ip, ts) inode_set_atime_to_ts(ip, ts) +#else +#define zpl_inode_set_atime_to_ts(ip, ts) (ip->i_atime = ts) +#endif +#ifdef HAVE_INODE_GET_MTIME +#define zpl_inode_get_mtime(ip) inode_get_mtime(ip) +#else +#define zpl_inode_get_mtime(ip) (ip->i_mtime) +#endif +#ifdef HAVE_INODE_SET_MTIME_TO_TS +#define zpl_inode_set_mtime_to_ts(ip, ts) inode_set_mtime_to_ts(ip, ts) +#else +#define zpl_inode_set_mtime_to_ts(ip, ts) (ip->i_mtime = ts) +#endif #endif /* _SYS_ZPL_H */ diff --git a/include/sys/dataset_kstats.h b/include/sys/dataset_kstats.h index 40cf5258a2..c81a07f0c1 100644 --- a/include/sys/dataset_kstats.h +++ b/include/sys/dataset_kstats.h @@ -71,6 +71,7 @@ typedef struct dataset_kstats { int dataset_kstats_create(dataset_kstats_t *, objset_t *); void dataset_kstats_destroy(dataset_kstats_t *); +void dataset_kstats_rename(dataset_kstats_t *dk, const char *); void dataset_kstats_update_write_kstats(dataset_kstats_t *, int64_t); void dataset_kstats_update_read_kstats(dataset_kstats_t *, int64_t); diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 1800a7e31d..f2a1535c91 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -379,8 +379,8 @@ dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); -void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); -void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail); +boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed); void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, diff --git a/include/sys/dsl_crypt.h b/include/sys/dsl_crypt.h index 72716e296c..fbcae37153 100644 --- a/include/sys/dsl_crypt.h +++ b/include/sys/dsl_crypt.h @@ -206,6 +206,7 @@ void dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin, dmu_tx_t *tx); int dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp, boolean_t *will_encrypt); +boolean_t dmu_objset_crypto_key_equal(objset_t *osa, objset_t *osb); void dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd, struct dsl_dataset *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx); uint64_t 
dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey, diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index f780ad3d61..9a34bafc1c 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -181,7 +181,7 @@ typedef struct zil_vdev_node { avl_node_t zv_node; /* AVL tree linkage */ } zil_vdev_node_t; -#define ZIL_PREV_BLKS 16 +#define ZIL_BURSTS 8 /* * Stable storage intent log management structure. One per dataset. @@ -216,14 +216,18 @@ struct zilog { uint64_t zl_parse_lr_count; /* number of log records parsed */ itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */ list_t zl_itx_commit_list; /* itx list to be committed */ - uint64_t zl_cur_used; /* current commit log size used */ + uint64_t zl_cur_size; /* current burst full size */ + uint64_t zl_cur_left; /* current burst remaining size */ + uint64_t zl_cur_max; /* biggest record in current burst */ list_t zl_lwb_list; /* in-flight log write list */ avl_tree_t zl_bp_tree; /* track bps during log parse */ clock_t zl_replay_time; /* lbolt of when replay started */ uint64_t zl_replay_blks; /* number of log blocks replayed */ zil_header_t zl_old_header; /* debugging aid */ - uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ + uint_t zl_parallel; /* workload is multi-threaded */ uint_t zl_prev_rotor; /* rotor for zl_prev[] */ + uint_t zl_prev_opt[ZIL_BURSTS]; /* optimal block size */ + uint_t zl_prev_min[ZIL_BURSTS]; /* minimal first block size */ txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */ diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h index 79e1016eab..8e3f22bf42 100644 --- a/lib/libspl/include/assert.h +++ b/lib/libspl/include/assert.h @@ -64,6 +64,9 @@ libspl_assert(const char *buf, const char *file, const char *func, int line) #undef verify #endif +#define PANIC(fmt, a...) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) + #define VERIFY(cond) \ (void) ((!(cond)) && \ libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__)) diff --git a/lib/libspl/include/sys/uio.h b/lib/libspl/include/sys/uio.h index e9e21819d4..665bfc4230 100644 --- a/lib/libspl/include/sys/uio.h +++ b/lib/libspl/include/sys/uio.h @@ -90,7 +90,7 @@ zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len) } static inline void -zfs_uio_advance(zfs_uio_t *uio, size_t size) +zfs_uio_advance(zfs_uio_t *uio, ssize_t size) { uio->uio_resid -= size; uio->uio_loffset += size; diff --git a/man/man4/spl.4 b/man/man4/spl.4 index 82455fb532..414a923948 100644 --- a/man/man4/spl.4 +++ b/man/man4/spl.4 @@ -31,14 +31,6 @@ for use by the kmem caches. For the majority of systems and workloads only a small number of threads are required. . -.It Sy spl_kmem_cache_reclaim Ns = Ns Sy 0 Pq uint -When this is set it prevents Linux from being able to rapidly reclaim all the -memory held by the kmem caches. -This may be useful in circumstances where it's preferable that Linux -reclaim memory from some other subsystem first. -Setting this will increase the likelihood out of memory events on a memory -constrained system. -. .It Sy spl_kmem_cache_obj_per_slab Ns = Ns Sy 8 Pq uint The preferred number of objects per slab in the cache. 
In general, a larger value will increase the caches memory footprint
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 4ec52a2fb6..9d51348912 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -798,7 +798,7 @@ Note that this should not be set below the ZED thresholds
 (currently 10 checksums over 10 seconds)
 or else the daemon may not trigger any action.
 .
-.It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq uint
+.It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint
 This controls the amount of time that a ZIL block (lwb) will remain "open"
 when it isn't "full",
 and it has a thread waiting for it to be committed to stable storage.
@@ -2160,13 +2160,6 @@ This sets the maximum number of write bytes logged via WR_COPIED.
 It tunes a tradeoff between additional memory copy and possibly worse log
 space efficiency vs additional range lock/unlock.
 .
-.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64
-This sets the minimum delay in nanoseconds ZIL care to delay block commit,
-waiting for more records.
-If ZIL writes are too fast, kernel may not be able sleep for so short interval,
-increasing log latency above allowed by
-.Sy zfs_commit_timeout_pct .
-.
 .It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Disable the cache flush commands that are normally sent to disk by the ZIL
 after an LWB write has completed.
@@ -2280,6 +2273,16 @@ If
 .Sy 0 ,
 generate a system-dependent value close to 6 threads per taskq.
 .
+.It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp
+Set the queue and thread configuration for the IO read queues.
+This is an advanced debugging parameter.
+Don't change this unless you understand what it does.
+.
+.It Sy zio_taskq_write Ns = Ns Sy batch fixed,1,5 scale fixed,1,5 Pq charp
+Set the queue and thread configuration for the IO write queues.
+This is an advanced debugging parameter.
+Don't change this unless you understand what it does.
+.
 .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 Do not create zvol device nodes.
 This may slightly improve startup time on
diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7
index 8ca4bd927b..8456a9aa76 100644
--- a/man/man7/zpool-features.7
+++ b/man/man7/zpool-features.7
@@ -364,9 +364,12 @@ When this feature is enabled ZFS will use block cloning for operations like
 Block cloning allows to create multiple references to a single block.
 It is much faster than copying the data (as the actual data is neither read
 nor written) and takes no additional space.
-Blocks can be cloned across datasets under some conditions (like disabled
-encryption and equal
-.Nm recordsize ) .
+Blocks can be cloned across datasets under some conditions (like equal
+.Nm recordsize ,
+the same master encryption key, etc.).
+ZFS makes a best effort to clone blocks across datasets, including encrypted
+ones; for example, snapshots and clones of an encrypted dataset share its
+master key, so blocks can be cloned among them.
+Cloning may still be refused for various (nontrivial) reasons depending on
+the OS and/or ZFS internals.
.Pp This feature becomes .Sy active diff --git a/module/Kbuild.in b/module/Kbuild.in index a9003fa8a2..004e32b3e9 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -80,6 +80,7 @@ SPL_OBJS := \ spl-kstat.o \ spl-proc.o \ spl-procfs-list.o \ + spl-shrinker.o \ spl-taskq.o \ spl-thread.o \ spl-trace.o \ diff --git a/module/os/freebsd/spl/spl_kstat.c b/module/os/freebsd/spl/spl_kstat.c index 9f5f92e194..43cd4da02e 100644 --- a/module/os/freebsd/spl/spl_kstat.c +++ b/module/os/freebsd/spl/spl_kstat.c @@ -187,19 +187,18 @@ kstat_sysctl_dataset_string(SYSCTL_HANDLER_ARGS) static int kstat_sysctl_io(SYSCTL_HANDLER_ARGS) { - struct sbuf *sb; + struct sbuf sb; kstat_t *ksp = arg1; kstat_io_t *kip = ksp->ks_data; int rc; - sb = sbuf_new_auto(); - if (sb == NULL) - return (ENOMEM); + sbuf_new_for_sysctl(&sb, NULL, 0, req); + /* Update the aggsums before reading */ (void) ksp->ks_update(ksp, KSTAT_READ); /* though wlentime & friends are signed, they will never be negative */ - sbuf_printf(sb, + sbuf_printf(&sb, "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", kip->nread, kip->nwritten, @@ -207,25 +206,21 @@ kstat_sysctl_io(SYSCTL_HANDLER_ARGS) kip->wtime, kip->wlentime, kip->wlastupdate, kip->rtime, kip->rlentime, kip->rlastupdate, kip->wcnt, kip->rcnt); - rc = sbuf_finish(sb); - if (rc == 0) - rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); - sbuf_delete(sb); + rc = sbuf_finish(&sb); + sbuf_delete(&sb); return (rc); } static int kstat_sysctl_raw(SYSCTL_HANDLER_ARGS) { - struct sbuf *sb; + struct sbuf sb; void *data; kstat_t *ksp = arg1; void *(*addr_op)(kstat_t *ksp, loff_t index); int n, has_header, rc = 0; - sb = sbuf_new_auto(); - if (sb == NULL) - return (ENOMEM); + sbuf_new_for_sysctl(&sb, NULL, PAGE_SIZE, req); if (ksp->ks_raw_ops.addr) addr_op = ksp->ks_raw_ops.addr; @@ -258,8 +253,10 @@ restart_headers: if (has_header) { if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart_headers; - if (rc == 0) - sbuf_printf(sb, "\n%s", ksp->ks_raw_buf); + if (rc == 0) { + sbuf_cat(&sb, "\n"); + sbuf_cat(&sb, ksp->ks_raw_buf); + } } while ((data = addr_op(ksp, n)) != NULL) { @@ -270,22 +267,19 @@ restart: if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart; if (rc == 0) - sbuf_printf(sb, "%s", ksp->ks_raw_buf); + sbuf_cat(&sb, ksp->ks_raw_buf); } else { ASSERT3U(ksp->ks_ndata, ==, 1); - sbuf_hexdump(sb, ksp->ks_data, + sbuf_hexdump(&sb, ksp->ks_data, ksp->ks_data_size, NULL, 0); } n++; } free(ksp->ks_raw_buf, M_TEMP); mutex_exit(ksp->ks_lock); - sbuf_trim(sb); - rc = sbuf_finish(sb); - if (rc == 0) - rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); - sbuf_delete(sb); + rc = sbuf_finish(&sb); + sbuf_delete(&sb); return (rc); } diff --git a/module/os/freebsd/zfs/dmu_os.c b/module/os/freebsd/zfs/dmu_os.c index a5f486b95d..c33ce01ab3 100644 --- a/module/os/freebsd/zfs/dmu_os.c +++ b/module/os/freebsd/zfs/dmu_os.c @@ -110,7 +110,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); + dmu_buf_will_fill(db, tx, B_FALSE); else dmu_buf_will_dirty(db, tx); @@ -126,7 +126,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, } if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + dmu_buf_fill_done(db, tx, B_FALSE); offset += tocpy; size -= tocpy; diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c index 74755eb6d9..2b62abcccb 100644 --- 
a/module/os/freebsd/zfs/zio_crypt.c +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -1251,7 +1251,7 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, iovec_t *dst_iovecs; zil_chain_t *zilc; lr_t *lr; - uint64_t txtype, lr_len; + uint64_t txtype, lr_len, nused; uint_t crypt_len, nr_iovecs, vec; uint_t aad_len = 0, total_len = 0; @@ -1268,7 +1268,10 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; - blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + ASSERT3U(nused, >=, sizeof (zil_chain_t)); + ASSERT3U(nused, <=, datalen); + blkend = src + nused; /* * Calculate the number of encrypted iovecs we will need. @@ -1287,6 +1290,8 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } + ASSERT3U(lr_len, >=, sizeof (lr_t)); + ASSERT3U(lr_len, <=, blkend - slrp); nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 2520507b98..b6edac434d 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -1333,6 +1333,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) } } strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); + dataset_kstats_rename(&zv->zv_kstat, newname); } /* diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index 3c30dfc577..a2920c7466 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -76,17 +76,6 @@ module_param(spl_kmem_cache_magazine_size, uint, 0444); MODULE_PARM_DESC(spl_kmem_cache_magazine_size, "Default magazine size (2-256), set automatically (0)"); -/* - * The default behavior is to report the number of objects remaining in the - * cache. This allows the Linux VM to repeatedly reclaim objects from the - * cache when memory is low satisfy other memory allocations. Alternately, - * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache - * is reclaimed. This may increase the likelihood of out of memory events. - */ -static unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */; -module_param(spl_kmem_cache_reclaim, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)"); - static unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; module_param(spl_kmem_cache_obj_per_slab, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); diff --git a/module/os/linux/spl/spl-shrinker.c b/module/os/linux/spl/spl-shrinker.c new file mode 100644 index 0000000000..d5c8da471c --- /dev/null +++ b/module/os/linux/spl/spl-shrinker.c @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Shrinker Implementation.
+ */
+
+#include <sys/kmem.h>
+#include <sys/shrinker.h>
+
+#ifdef HAVE_SINGLE_SHRINKER_CALLBACK
+/* 3.0-3.11: single shrink() callback, which we wrap to carry both functions */
+struct spl_shrinker_wrap {
+	struct shrinker shrinker;
+	spl_shrinker_cb countfunc;
+	spl_shrinker_cb scanfunc;
+};
+
+static int
+spl_shrinker_single_cb(struct shrinker *shrinker, struct shrink_control *sc)
+{
+	struct spl_shrinker_wrap *sw = (struct spl_shrinker_wrap *)shrinker;
+
+	if (sc->nr_to_scan != 0)
+		(void) sw->scanfunc(&sw->shrinker, sc);
+	return (sw->countfunc(&sw->shrinker, sc));
+}
+#endif
+
+struct shrinker *
+spl_register_shrinker(const char *name, spl_shrinker_cb countfunc,
+    spl_shrinker_cb scanfunc, int seek_cost)
+{
+	struct shrinker *shrinker;
+
+	/* allocate shrinker */
+#if defined(HAVE_SHRINKER_REGISTER)
+	/* 6.7: kernel will allocate the shrinker for us */
+	shrinker = shrinker_alloc(0, name);
+#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
+	/* 3.12-6.6: we allocate the shrinker */
+	shrinker = kmem_zalloc(sizeof (struct shrinker), KM_SLEEP);
+#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK)
+	/* 3.0-3.11: allocate a wrapper */
+	struct spl_shrinker_wrap *sw =
+	    kmem_zalloc(sizeof (struct spl_shrinker_wrap), KM_SLEEP);
+	shrinker = &sw->shrinker;
+#else
+	/* 2.x-2.6.22, or a newer shrinker API has been introduced. */
+#error "Unknown shrinker API"
+#endif
+
+	if (shrinker == NULL)
+		return (NULL);
+
+	/* set callbacks */
+#ifdef HAVE_SINGLE_SHRINKER_CALLBACK
+	sw->countfunc = countfunc;
+	sw->scanfunc = scanfunc;
+	shrinker->shrink = spl_shrinker_single_cb;
+#else
+	shrinker->count_objects = countfunc;
+	shrinker->scan_objects = scanfunc;
+#endif
+
+	/* set params */
+	shrinker->seeks = seek_cost;
+
+	/* register with kernel */
+#if defined(HAVE_SHRINKER_REGISTER)
+	shrinker_register(shrinker);
+#elif defined(HAVE_REGISTER_SHRINKER_VARARG)
+	register_shrinker(shrinker, name);
+#else
+	register_shrinker(shrinker);
+#endif
+
+	return (shrinker);
+}
+EXPORT_SYMBOL(spl_register_shrinker);
+
+void
+spl_unregister_shrinker(struct shrinker *shrinker)
+{
+#if defined(HAVE_SHRINKER_REGISTER)
+	shrinker_free(shrinker);
+#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
+	unregister_shrinker(shrinker);
+	kmem_free(shrinker, sizeof (struct shrinker));
+#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK)
+	unregister_shrinker(shrinker);
+	kmem_free(shrinker, sizeof (struct spl_shrinker_wrap));
+#else
+#error "Unknown shrinker API"
+#endif
+}
+EXPORT_SYMBOL(spl_unregister_shrinker);
diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c
index 43ed087e2d..1fa9f3eb3f 100644
--- a/module/os/linux/zfs/arc_os.c
+++ b/module/os/linux/zfs/arc_os.c
@@ -247,8 +247,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
	return (sc->nr_to_scan);
 }
 
-SPL_SHRINKER_DECLARE(arc_shrinker,
-    arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS);
+static struct shrinker *arc_shrinker = NULL;
 
 int
 arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
@@ -351,14 +350,18 @@ arc_lowmem_init(void)
	 * reclaim from the arc.
This is done to prevent kswapd from * swapping out pages when it is preferable to shrink the arc. */ - spl_register_shrinker(&arc_shrinker); + arc_shrinker = spl_register_shrinker("zfs-arc-shrinker", + arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS); + VERIFY(arc_shrinker); + arc_set_sys_free(allmem); } void arc_lowmem_fini(void) { - spl_unregister_shrinker(&arc_shrinker); + spl_unregister_shrinker(arc_shrinker); + arc_shrinker = NULL; } int diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 48ac55f070..8b5aa94fe4 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -85,7 +85,7 @@ static blk_mode_t #else static fmode_t #endif -vdev_bdev_mode(spa_mode_t spa_mode) +vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive) { #ifdef HAVE_BLK_MODE_T blk_mode_t mode = 0; @@ -95,6 +95,9 @@ vdev_bdev_mode(spa_mode_t spa_mode) if (spa_mode & SPA_MODE_WRITE) mode |= BLK_OPEN_WRITE; + + if (exclusive) + mode |= BLK_OPEN_EXCL; #else fmode_t mode = 0; @@ -103,6 +106,9 @@ vdev_bdev_mode(spa_mode_t spa_mode) if (spa_mode & SPA_MODE_WRITE) mode |= FMODE_WRITE; + + if (exclusive) + mode |= FMODE_EXCL; #endif return (mode); @@ -225,10 +231,10 @@ vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder, { #ifdef HAVE_BLKDEV_GET_BY_PATH_4ARG return (blkdev_get_by_path(path, - vdev_bdev_mode(mode) | BLK_OPEN_EXCL, holder, hops)); + vdev_bdev_mode(mode, B_TRUE), holder, hops)); #else return (blkdev_get_by_path(path, - vdev_bdev_mode(mode) | FMODE_EXCL, holder)); + vdev_bdev_mode(mode, B_TRUE), holder)); #endif } @@ -238,7 +244,7 @@ vdev_blkdev_put(struct block_device *bdev, spa_mode_t mode, void *holder) #ifdef HAVE_BLKDEV_PUT_HOLDER return (blkdev_put(bdev, holder)); #else - return (blkdev_put(bdev, vdev_bdev_mode(mode) | FMODE_EXCL)); + return (blkdev_put(bdev, vdev_bdev_mode(mode, B_TRUE))); #endif } @@ -248,9 +254,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, { struct block_device *bdev; #ifdef HAVE_BLK_MODE_T - blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); + blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); #else - fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); + fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); #endif hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 94e25fa0ae..54ed70d039 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -520,8 +520,8 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, ip->i_uid = SUID_TO_KUID(0); ip->i_gid = SGID_TO_KGID(0); ip->i_blkbits = SPA_MINBLOCKSHIFT; - ip->i_atime = now; - ip->i_mtime = now; + zpl_inode_set_atime_to_ts(ip, now); + zpl_inode_set_mtime_to_ts(ip, now); zpl_inode_set_ctime_to_ts(ip, now); ip->i_fop = fops; ip->i_op = ops; diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 7bb70439a1..dcc586362c 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1258,12 +1258,18 @@ zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) * and inode caches. This can occur when the ARC needs to free meta data * blocks but can't because they are all pinned by entries in these caches. 
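+ * (As of Linux 6.7, sb->s_shrink is a pointer rather than an embedded
+ * struct; the S_SHRINK() compat macro below hides that difference.)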
*/ +#if defined(HAVE_SUPER_BLOCK_S_SHRINK) +#define S_SHRINK(sb) (&(sb)->s_shrink) +#elif defined(HAVE_SUPER_BLOCK_S_SHRINK_PTR) +#define S_SHRINK(sb) ((sb)->s_shrink) +#endif + int zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) { zfsvfs_t *zfsvfs = sb->s_fs_info; int error = 0; - struct shrinker *shrinker = &sb->s_shrink; + struct shrinker *shrinker = S_SHRINK(sb); struct shrink_control sc = { .nr_to_scan = nr_to_scan, .gfp_mask = GFP_KERNEL, @@ -1275,7 +1281,7 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) #if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \ defined(SHRINK_CONTROL_HAS_NID) && \ defined(SHRINKER_NUMA_AWARE) - if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) { + if (shrinker->flags & SHRINKER_NUMA_AWARE) { *objects = 0; for_each_online_node(sc.nid) { *objects += (*shrinker->scan_objects)(shrinker, &sc); diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 9cdce3afdc..881c3d9ba3 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -2464,15 +2464,16 @@ top: if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { zp->z_atime_dirty = B_FALSE; - ZFS_TIME_ENCODE(&ip->i_atime, atime); + inode_timespec_t tmp_atime = zpl_inode_get_atime(ip); + ZFS_TIME_ENCODE(&tmp_atime, atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, sizeof (atime)); } if (mask & (ATTR_MTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); - ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate( - vap->va_mtime, ZTOI(zp)); + zpl_inode_set_mtime_to_ts(ZTOI(zp), + zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp))); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); @@ -3686,7 +3687,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, caddr_t va; int err = 0; uint64_t mtime[2], ctime[2]; - inode_timespec_t tmp_ctime; + inode_timespec_t tmp_ts; sa_bulk_attr_t bulk[3]; int cnt = 0; struct address_space *mapping; @@ -3850,9 +3851,10 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, &zp->z_pflags, 8); /* Preserve the mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - tmp_ctime = zpl_inode_get_ctime(ip); - ZFS_TIME_ENCODE(&tmp_ctime, ctime); + tmp_ts = zpl_inode_get_mtime(ip); + ZFS_TIME_ENCODE(&tmp_ts, mtime); + tmp_ts = zpl_inode_get_ctime(ip); + ZFS_TIME_ENCODE(&tmp_ts, ctime); zp->z_atime_dirty = B_FALSE; zp->z_seq++; @@ -3902,7 +3904,7 @@ zfs_dirty_inode(struct inode *ip, int flags) zfsvfs_t *zfsvfs = ITOZSB(ip); dmu_tx_t *tx; uint64_t mode, atime[2], mtime[2], ctime[2]; - inode_timespec_t tmp_ctime; + inode_timespec_t tmp_ts; sa_bulk_attr_t bulk[4]; int error = 0; int cnt = 0; @@ -3947,10 +3949,12 @@ zfs_dirty_inode(struct inode *ip, int flags) SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); /* Preserve the mode, mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_atime, atime); - ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - tmp_ctime = zpl_inode_get_ctime(ip); - ZFS_TIME_ENCODE(&tmp_ctime, ctime); + tmp_ts = zpl_inode_get_atime(ip); + ZFS_TIME_ENCODE(&tmp_ts, atime); + tmp_ts = zpl_inode_get_mtime(ip); + ZFS_TIME_ENCODE(&tmp_ts, mtime); + tmp_ts = zpl_inode_get_ctime(ip); + ZFS_TIME_ENCODE(&tmp_ts, ctime); mode = ip->i_mode; zp->z_mode = mode; @@ -3993,7 +3997,9 @@ zfs_inactive(struct inode *ip) if (error) { dmu_tx_abort(tx); } else { - ZFS_TIME_ENCODE(&ip->i_atime, atime); + inode_timespec_t tmp_atime; + tmp_atime = 
zpl_inode_get_atime(ip); + ZFS_TIME_ENCODE(&tmp_atime, atime); mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&atime, sizeof (atime), tx); diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index f71026da83..b99df188c6 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -542,7 +542,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, uint64_t links; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; - inode_timespec_t tmp_ctime; + inode_timespec_t tmp_ts; uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[12]; int count = 0; @@ -614,10 +614,12 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, if (zp->z_pflags & ZFS_XATTR) zp->z_xattr_parent = parent; - ZFS_TIME_DECODE(&ip->i_atime, atime); - ZFS_TIME_DECODE(&ip->i_mtime, mtime); - ZFS_TIME_DECODE(&tmp_ctime, ctime); - zpl_inode_set_ctime_to_ts(ip, tmp_ctime); + ZFS_TIME_DECODE(&tmp_ts, atime); + zpl_inode_set_atime_to_ts(ip, tmp_ts); + ZFS_TIME_DECODE(&tmp_ts, mtime); + zpl_inode_set_mtime_to_ts(ip, tmp_ts); + ZFS_TIME_DECODE(&tmp_ts, ctime); + zpl_inode_set_ctime_to_ts(ip, tmp_ts); ZFS_TIME_DECODE(&zp->z_btime, btime); ip->i_ino = zp->z_id; @@ -1197,7 +1199,7 @@ zfs_rezget(znode_t *zp) uint64_t gen; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; - inode_timespec_t tmp_ctime; + inode_timespec_t tmp_ts; uint64_t projid = ZFS_DEFAULT_PROJID; znode_hold_t *zh; @@ -1290,10 +1292,12 @@ zfs_rezget(znode_t *zp) zfs_uid_write(ZTOI(zp), z_uid); zfs_gid_write(ZTOI(zp), z_gid); - ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); - ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); - ZFS_TIME_DECODE(&tmp_ctime, ctime); - zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime); + ZFS_TIME_DECODE(&tmp_ts, atime); + zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts); + ZFS_TIME_DECODE(&tmp_ts, mtime); + zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts); + ZFS_TIME_DECODE(&tmp_ts, ctime); + zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts); ZFS_TIME_DECODE(&zp->z_btime, btime); if ((uint32_t)gen != ZTOI(zp)->i_generation) { @@ -1401,22 +1405,24 @@ zfs_zinactive(znode_t *zp) boolean_t zfs_relatime_need_update(const struct inode *ip) { - inode_timespec_t now, tmp_ctime; + inode_timespec_t now, tmp_atime, tmp_ts; gethrestime(&now); + tmp_atime = zpl_inode_get_atime(ip); /* * In relatime mode, only update the atime if the previous atime * is earlier than either the ctime or mtime or if at least a day * has passed since the last update of atime. 
*/ - if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0) + tmp_ts = zpl_inode_get_mtime(ip); + if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0) return (B_TRUE); - tmp_ctime = zpl_inode_get_ctime(ip); - if (zfs_compare_timespec(&tmp_ctime, &ip->i_atime) >= 0) + tmp_ts = zpl_inode_get_ctime(ip); + if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0) return (B_TRUE); - if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60) + if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60) return (B_TRUE); return (B_FALSE); @@ -1439,7 +1445,7 @@ void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { - inode_timespec_t now, tmp_ctime; + inode_timespec_t now, tmp_ts; gethrestime(&now); @@ -1447,7 +1453,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], if (flag & ATTR_MTIME) { ZFS_TIME_ENCODE(&now, mtime); - ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime); + ZFS_TIME_DECODE(&tmp_ts, mtime); + zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts); if (ZTOZSB(zp)->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); @@ -1456,8 +1463,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], if (flag & ATTR_CTIME) { ZFS_TIME_ENCODE(&now, ctime); - ZFS_TIME_DECODE(&tmp_ctime, ctime); - zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime); + ZFS_TIME_DECODE(&tmp_ts, ctime); + zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts); if (ZTOZSB(zp)->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c index 55f807ccfc..21f3740f6f 100644 --- a/module/os/linux/zfs/zio_crypt.c +++ b/module/os/linux/zfs/zio_crypt.c @@ -1405,7 +1405,7 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, boolean_t *no_crypt) { int ret; - uint64_t txtype, lr_len; + uint64_t txtype, lr_len, nused; uint_t nr_src, nr_dst, crypt_len; uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; @@ -1432,7 +1432,10 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; - blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + nused = ((byteswap) ? 
BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + ASSERT3U(nused, >=, sizeof (zil_chain_t)); + ASSERT3U(nused, <=, datalen); + blkend = src + nused; /* calculate the number of encrypted iovecs we will need */ for (; slrp < blkend; slrp += lr_len) { @@ -1445,6 +1448,8 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } + ASSERT3U(lr_len, >=, sizeof (lr_t)); + ASSERT3U(lr_len, <=, blkend - slrp); nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c index adf987b3c9..b25f3e9224 100644 --- a/module/os/linux/zfs/zpl_inode.c +++ b/module/os/linux/zfs/zpl_inode.c @@ -526,7 +526,8 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) vap->va_ctime = ia->ia_ctime; if (vap->va_mask & ATTR_ATIME) - ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip); + zpl_inode_set_atime_to_ts(ip, + zpl_inode_timestamp_truncate(ia->ia_atime, ip)); cookie = spl_fstrans_mark(); #ifdef HAVE_USERNS_IOPS_SETATTR diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index a7bcda252f..bde3be2a79 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1528,6 +1528,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) */ set_disk_ro(zv->zv_zso->zvo_disk, !readonly); set_disk_ro(zv->zv_zso->zvo_disk, readonly); + + dataset_kstats_rename(&zv->zv_kstat, newname); } void diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 745ee8f02e..d982f201c9 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -802,13 +802,10 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); - boolean_t gang = abd_is_gang(abd); abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); while (size > 0) { - /* If we are at the end of the gang ABD we are done */ - if (gang && !c_abd) - break; + IMPLY(abd_is_gang(abd), c_abd != NULL); abd_iter_map(&aiter); @@ -930,7 +927,6 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, { int ret = 0; struct abd_iter daiter, saiter; - boolean_t dabd_is_gang_abd, sabd_is_gang_abd; abd_t *c_dabd, *c_sabd; if (size == 0) @@ -942,16 +938,12 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, ASSERT3U(doff + size, <=, dabd->abd_size); ASSERT3U(soff + size, <=, sabd->abd_size); - dabd_is_gang_abd = abd_is_gang(dabd); - sabd_is_gang_abd = abd_is_gang(sabd); c_dabd = abd_init_abd_iter(dabd, &daiter, doff); c_sabd = abd_init_abd_iter(sabd, &saiter, soff); while (size > 0) { - /* if we are at the end of the gang ABD we are done */ - if ((dabd_is_gang_abd && !c_dabd) || - (sabd_is_gang_abd && !c_sabd)) - break; + IMPLY(abd_is_gang(dabd), c_dabd != NULL); + IMPLY(abd_is_gang(sabd), c_sabd != NULL); abd_iter_map(&daiter); abd_iter_map(&saiter); @@ -1032,66 +1024,40 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, int i; ssize_t len, dlen; struct abd_iter caiters[3]; - struct abd_iter daiter = {0}; + struct abd_iter daiter; void *caddrs[3]; unsigned long flags __maybe_unused = 0; abd_t *c_cabds[3]; abd_t *c_dabd = NULL; - boolean_t cabds_is_gang_abd[3]; - boolean_t dabd_is_gang_abd = B_FALSE; ASSERT3U(parity, <=, 3); - for (i = 0; i < parity; i++) { - cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); + abd_verify(cabds[i]); + ASSERT3U(csize, <=, cabds[i]->abd_size); c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0); } - if (dabd) { - dabd_is_gang_abd = abd_is_gang(dabd); + 
ASSERT3S(dsize, >=, 0); + if (dsize > 0) { + ASSERT(dabd); + abd_verify(dabd); + ASSERT3U(dsize, <=, dabd->abd_size); c_dabd = abd_init_abd_iter(dabd, &daiter, 0); } - ASSERT3S(dsize, >=, 0); - abd_enter_critical(flags); while (csize > 0) { - /* if we are at the end of the gang ABD we are done */ - if (dabd_is_gang_abd && !c_dabd) - break; - + len = csize; for (i = 0; i < parity; i++) { - /* - * If we are at the end of the gang ABD we are - * done. - */ - if (cabds_is_gang_abd[i] && !c_cabds[i]) - break; + IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL); abd_iter_map(&caiters[i]); caddrs[i] = caiters[i].iter_mapaddr; + len = MIN(caiters[i].iter_mapsize, len); } - len = csize; - - if (dabd && dsize > 0) + if (dsize > 0) { + IMPLY(abd_is_gang(dabd), c_dabd != NULL); abd_iter_map(&daiter); - - switch (parity) { - case 3: - len = MIN(caiters[2].iter_mapsize, len); - zfs_fallthrough; - case 2: - len = MIN(caiters[1].iter_mapsize, len); - zfs_fallthrough; - case 1: - len = MIN(caiters[0].iter_mapsize, len); - } - - /* must be progressive */ - ASSERT3S(len, >, 0); - - if (dabd && dsize > 0) { - /* this needs precise iter.length */ len = MIN(daiter.iter_mapsize, len); dlen = len; } else @@ -1114,7 +1080,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, &caiters[i], len); } - if (dabd && dsize > 0) { + if (dsize > 0) { abd_iter_unmap(&daiter); c_dabd = abd_advance_abd_iter(dabd, c_dabd, &daiter, @@ -1153,16 +1119,16 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, struct abd_iter xiters[3]; void *caddrs[3], *xaddrs[3]; unsigned long flags __maybe_unused = 0; - boolean_t cabds_is_gang_abd[3]; - boolean_t tabds_is_gang_abd[3]; abd_t *c_cabds[3]; abd_t *c_tabds[3]; ASSERT3U(parity, <=, 3); for (i = 0; i < parity; i++) { - cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); - tabds_is_gang_abd[i] = abd_is_gang(tabds[i]); + abd_verify(cabds[i]); + abd_verify(tabds[i]); + ASSERT3U(tsize, <=, cabds[i]->abd_size); + ASSERT3U(tsize, <=, tabds[i]->abd_size); c_cabds[i] = abd_init_abd_iter(cabds[i], &citers[i], 0); c_tabds[i] = @@ -1171,36 +1137,18 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, abd_enter_critical(flags); while (tsize > 0) { - + len = tsize; for (i = 0; i < parity; i++) { - /* - * If we are at the end of the gang ABD we - * are done. 
- */ - if (cabds_is_gang_abd[i] && !c_cabds[i]) - break; - if (tabds_is_gang_abd[i] && !c_tabds[i]) - break; + IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL); + IMPLY(abd_is_gang(tabds[i]), c_tabds[i] != NULL); abd_iter_map(&citers[i]); abd_iter_map(&xiters[i]); caddrs[i] = citers[i].iter_mapaddr; xaddrs[i] = xiters[i].iter_mapaddr; + len = MIN(citers[i].iter_mapsize, len); + len = MIN(xiters[i].iter_mapsize, len); } - len = tsize; - switch (parity) { - case 3: - len = MIN(xiters[2].iter_mapsize, len); - len = MIN(citers[2].iter_mapsize, len); - zfs_fallthrough; - case 2: - len = MIN(xiters[1].iter_mapsize, len); - len = MIN(citers[1].iter_mapsize, len); - zfs_fallthrough; - case 1: - len = MIN(xiters[0].iter_mapsize, len); - len = MIN(citers[0].iter_mapsize, len); - } /* must be progressive */ ASSERT3S(len, >, 0); /* diff --git a/module/zfs/arc.c b/module/zfs/arc.c index df84252bd2..f5fd0aa551 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8042,9 +8042,8 @@ l2arc_write_size(l2arc_dev_t *dev) */ size = l2arc_write_max; if (size == 0) { - cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " - "be greater than zero, resetting it to the default (%d)", - L2ARC_WRITE_SIZE); + cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, " + "resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } @@ -8067,30 +8066,9 @@ l2arc_write_size(l2arc_dev_t *dev) * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. */ - if (size > dev->l2ad_end - dev->l2ad_start) { - cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " - "plus the overhead of log blocks (persistent L2ARC, " - "%llu bytes) exceeds the size of the cache device " - "(guid %llu), resetting them to the default (%d)", - (u_longlong_t)l2arc_log_blk_overhead(size, dev), - (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); + size = MIN(size, (dev->l2ad_end - dev->l2ad_start) / 4); - size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; - - if (l2arc_trim_ahead > 1) { - cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1"); - l2arc_trim_ahead = 1; - } - - if (arc_warm == B_FALSE) - size += l2arc_write_boost; - - size += l2arc_log_blk_overhead(size, dev); - if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { - size += MAX(64 * 1024 * 1024, - (size * l2arc_trim_ahead) / 100); - } - } + size = P2ROUNDUP(size, 1ULL << dev->l2ad_vdev->vdev_ashift); return (size); diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 759bc8d2e2..225ddaca1e 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -157,10 +157,8 @@ * (copying the file content to the new dataset and removing the source file). * In that case Block Cloning will only be used briefly, because the BRT entries * will be removed when the source is removed. - * Note: currently it is not possible to clone blocks between encrypted - * datasets, even if those datasets use the same encryption key (this includes - * snapshots of encrypted datasets). Cloning blocks between datasets that use - * the same keys should be possible and should be implemented in the future. + * Block Cloning across encrypted datasets is supported as long as both + * datasets share the same master key (e.g. snapshots and clones) * * Block Cloning flow through ZFS layers. 
* @@ -344,7 +342,7 @@ brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) ASSERT3U(idx, <, brtvd->bv_size); - if (brtvd->bv_need_byteswap) { + if (unlikely(brtvd->bv_need_byteswap)) { return (BSWAP_16(brtvd->bv_entcount[idx])); } else { return (brtvd->bv_entcount[idx]); @@ -357,7 +355,7 @@ brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt) ASSERT3U(idx, <, brtvd->bv_size); - if (brtvd->bv_need_byteswap) { + if (unlikely(brtvd->bv_need_byteswap)) { brtvd->bv_entcount[idx] = BSWAP_16(entcnt); } else { brtvd->bv_entcount[idx] = entcnt; @@ -392,55 +390,39 @@ brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx) #ifdef ZFS_DEBUG static void -brt_vdev_dump(brt_t *brt) +brt_vdev_dump(brt_vdev_t *brtvd) { - brt_vdev_t *brtvd; - uint64_t vdevid; + uint64_t idx; - if ((zfs_flags & ZFS_DEBUG_BRT) == 0) { - return; - } - - if (brt->brt_nvdevs == 0) { - zfs_dbgmsg("BRT empty"); - return; - } - - zfs_dbgmsg("BRT vdev dump:"); - for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - uint64_t idx; - - brtvd = &brt->brt_vdevs[vdevid]; - zfs_dbgmsg(" vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d " - "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n", - (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid, - brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, - (u_longlong_t)brtvd->bv_size, - (u_longlong_t)brtvd->bv_totalcount, - (u_longlong_t)brtvd->bv_nblocks, - (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); - if (brtvd->bv_totalcount > 0) { - zfs_dbgmsg(" entcounts:"); - for (idx = 0; idx < brtvd->bv_size; idx++) { - if (brt_vdev_entcount_get(brtvd, idx) > 0) { - zfs_dbgmsg(" [%04llu] %hu", - (u_longlong_t)idx, - brt_vdev_entcount_get(brtvd, idx)); - } + zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d " + "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n", + (u_longlong_t)brtvd->bv_vdevid, + brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, + (u_longlong_t)brtvd->bv_size, + (u_longlong_t)brtvd->bv_totalcount, + (u_longlong_t)brtvd->bv_nblocks, + (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); + if (brtvd->bv_totalcount > 0) { + zfs_dbgmsg(" entcounts:"); + for (idx = 0; idx < brtvd->bv_size; idx++) { + uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx); + if (entcnt > 0) { + zfs_dbgmsg(" [%04llu] %hu", + (u_longlong_t)idx, entcnt); } } - if (brtvd->bv_entcount_dirty) { - char *bitmap; + } + if (brtvd->bv_entcount_dirty) { + char *bitmap; - bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); - for (idx = 0; idx < brtvd->bv_nblocks; idx++) { - bitmap[idx] = - BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; - } - bitmap[idx] = '\0'; - zfs_dbgmsg(" bitmap: %s", bitmap); - kmem_free(bitmap, brtvd->bv_nblocks + 1); + bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); + for (idx = 0; idx < brtvd->bv_nblocks; idx++) { + bitmap[idx] = + BT_TEST(brtvd->bv_bitmap, idx) ? 
'x' : '.'; } + bitmap[idx] = '\0'; + zfs_dbgmsg(" dirty: %s", bitmap); + kmem_free(bitmap, brtvd->bv_nblocks + 1); } } #endif @@ -769,7 +751,8 @@ brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, BT_SET(brtvd->bv_bitmap, idx); #ifdef ZFS_DEBUG - brt_vdev_dump(brt); + if (zfs_flags & ZFS_DEBUG_BRT) + brt_vdev_dump(brtvd); #endif } @@ -805,7 +788,8 @@ brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, BT_SET(brtvd->bv_bitmap, idx); #ifdef ZFS_DEBUG - brt_vdev_dump(brt); + if (zfs_flags & ZFS_DEBUG_BRT) + brt_vdev_dump(brtvd); #endif } diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c index 767a461e00..2ac058fd2c 100644 --- a/module/zfs/dataset_kstats.c +++ b/module/zfs/dataset_kstats.c @@ -198,6 +198,18 @@ dataset_kstats_destroy(dataset_kstats_t *dk) zil_sums_fini(&dk->dk_zil_sums); } +void +dataset_kstats_rename(dataset_kstats_t *dk, const char *name) +{ + dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; + char *ds_name; + + ds_name = KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name); + ASSERT3S(ds_name, !=, NULL); + (void) strlcpy(ds_name, name, + KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name)); +} + void dataset_kstats_update_write_kstats(dataset_kstats_t *dk, int64_t nwritten) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index e77461fb9c..280001bc34 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1619,8 +1619,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, */ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) { spa_log_error(db->db_objset->os_spa, &zb, &bpp->blk_birth); - zfs_panic_recover("unencrypted block in encrypted " - "object set %llu", dmu_objset_id(db->db_objset)); err = SET_ERROR(EIO); goto early_unlock; } @@ -1925,7 +1923,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) zio_free(db->db_objset->os_spa, txg, bp); if (dr->dt.dl.dr_brtwrite) { - ASSERT0P(dr->dt.dl.dr_data); + ASSERT0(dr->dt.dl.dr_data); dr->dt.dl.dr_data = db->db_buf; } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; @@ -2736,7 +2734,7 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) } void -dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) +dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; @@ -2754,8 +2752,14 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) * Block cloning: We will be completely overwriting a block * cloned in this transaction group, so let's undirty the * pending clone and mark the block as uncached. This will be - * as if the clone was never done. + * as if the clone was never done. But if the fill can fail + * we should have a way to return to the cloned data.
*/ + if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) { + mutex_exit(&db->db_mtx); + dmu_buf_will_dirty(db_fake, tx); + return; + } VERIFY(!dbuf_undirty(db, tx)); db->db_state = DB_UNCACHED; } @@ -2816,32 +2820,41 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) dl->dr_overridden_by.blk_birth = dr->dr_txg; } -void -dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx) +boolean_t +dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed) { (void) tx; dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; - dbuf_states_t old_state; mutex_enter(&db->db_mtx); DBUF_VERIFY(db); - old_state = db->db_state; - db->db_state = DB_CACHED; - if (old_state == DB_FILL) { + if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ memset(db->db.db_data, 0, db->db.db_size); db->db_freed_in_flight = FALSE; + db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "fill done handling freed in flight"); + failed = B_FALSE; + } else if (failed) { + VERIFY(!dbuf_undirty(db, tx)); + db->db_buf = NULL; + dbuf_clear_data(db); + DTRACE_SET_STATE(db, "fill failed"); } else { + db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "fill done"); } cv_broadcast(&db->db_changed); + } else { + db->db_state = DB_CACHED; + failed = B_FALSE; } mutex_exit(&db->db_mtx); + return (failed); } void @@ -2986,7 +2999,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) DTRACE_SET_STATE(db, "filling assigned arcbuf"); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); - dmu_buf_fill_done(&db->db, tx); + dmu_buf_fill_done(&db->db, tx, B_FALSE); } void diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 63464d7474..3215ab1c2a 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1115,14 +1115,14 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); + dmu_buf_will_fill(db, tx, B_FALSE); else dmu_buf_will_dirty(db, tx); (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + dmu_buf_fill_done(db, tx, B_FALSE); offset += tocpy; size -= tocpy; @@ -1330,27 +1330,24 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) ASSERT(size > 0); - bufoff = zfs_uio_offset(uio) - db->db_offset; + offset_t off = zfs_uio_offset(uio); + bufoff = off - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); + dmu_buf_will_fill(db, tx, B_TRUE); else dmu_buf_will_dirty(db, tx); - /* - * XXX zfs_uiomove could block forever (eg.nfs-backed - * pages). There needs to be a uiolockdown() function - * to lock the pages in memory, so that zfs_uiomove won't - * block. - */ err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) { + /* The fill was reverted. Undo any uio progress. 
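The hunk above makes dmu_write_uio_dnode() declare a full-block overwrite as a fill that may fail, and dmu_buf_fill_done() now reports whether a failed fill was rolled back so the caller can rewind the uio. A rough userspace analogue of this snapshot-and-revert pattern (illustrative only; fill_block and all sizes are made up, none of this is ZFS API):

#include <stdio.h>
#include <string.h>

/* Overwrite blk from src; on a simulated fault, restore the old contents. */
static int
fill_block(char *blk, size_t blksz, const char *src, int fault)
{
	char saved[16];

	memcpy(saved, blk, blksz);		/* snapshot the cloned data */
	size_t copied = fault ? blksz / 2 : blksz;
	memcpy(blk, src, copied);		/* partial copy models a uio fault */
	if (copied != blksz) {
		memcpy(blk, saved, blksz);	/* revert, as dmu_buf_fill_done() reports */
		return (-1);
	}
	return (0);
}

int
main(void)
{
	char blk[9] = "cloned!!";

	if (fill_block(blk, 8, "newdata!", 1) != 0)
		printf("fill failed, reverted to: %s\n", blk);
	return (0);
}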
*/ + zfs_uio_advance(uio, off - zfs_uio_offset(uio)); + } if (err) break; @@ -1482,9 +1479,9 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, FTAG); + rw_exit(&dn->dn_struct_rwlock); if (db == NULL) return (SET_ERROR(EIO)); - rw_exit(&dn->dn_struct_rwlock); /* * We can only assign if the offset is aligned and the arc buf is the diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 05ca91717c..54aa60259e 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -2532,7 +2532,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, * size of the provided arc_buf_t. */ if (db_spill->db_size != drrs->drr_length) { - dmu_buf_will_fill(db_spill, tx); + dmu_buf_will_fill(db_spill, tx, B_FALSE); VERIFY0(dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); } diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 2d37ed2cdf..37c68528bf 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1124,8 +1124,6 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (sta->os->os_encrypted && !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { spa_log_error(spa, zb, &bp->blk_birth); - zfs_panic_recover("unencrypted block in encrypted " - "object set %llu", dmu_objset_id(sta->os)); return (SET_ERROR(EIO)); } diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c index 5e6e4e3d6c..8e1055d9bc 100644 --- a/module/zfs/dsl_crypt.c +++ b/module/zfs/dsl_crypt.c @@ -266,6 +266,40 @@ spa_crypto_key_compare(const void *a, const void *b) return (0); } +/* + * This compares the crypto keys of two objsets based on zk_guid. See the + * comment on spa_crypto_key_compare for more information. + */ +boolean_t +dmu_objset_crypto_key_equal(objset_t *osa, objset_t *osb) +{ + dsl_crypto_key_t *dcka = NULL; + dsl_crypto_key_t *dckb = NULL; + uint64_t obja, objb; + boolean_t equal; + spa_t *spa; + + spa = dmu_objset_spa(osa); + if (spa != dmu_objset_spa(osb)) + return (B_FALSE); + obja = dmu_objset_ds(osa)->ds_object; + objb = dmu_objset_ds(osb)->ds_object; + + if (spa_keystore_lookup_key(spa, obja, FTAG, &dcka) != 0) + return (B_FALSE); + if (spa_keystore_lookup_key(spa, objb, FTAG, &dckb) != 0) { + spa_keystore_dsl_key_rele(spa, dcka, FTAG); + return (B_FALSE); + } + + equal = (dcka->dck_key.zk_guid == dckb->dck_key.zk_guid); + + spa_keystore_dsl_key_rele(spa, dcka, FTAG); + spa_keystore_dsl_key_rele(spa, dckb, FTAG); + + return (equal); +} + static int spa_key_mapping_compare(const void *a, const void *b) { diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 47c234f76c..ac30a37081 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -1000,8 +1000,6 @@ livelist_compare(const void *larg, const void *rarg) /* if vdevs are equal, sort by offsets. */ uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); - if (l_dva0_offset == r_dva0_offset) - ASSERT3U(l->blk_birth, ==, r->blk_birth); return (TREE_CMP(l_dva0_offset, r_dva0_offset)); } @@ -1016,9 +1014,9 @@ struct livelist_iter_arg { * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a * corresponding FREE are stored in the supplied bplist. * - * Note that multiple FREE and ALLOC entries for the same blkptr may - * be encountered when dedup is involved.
For this reason we keep a - refcount for all the FREE entries of each blkptr and ensure that + * Note that multiple FREE and ALLOC entries for the same blkptr may be + * encountered when dedup or block cloning is involved. For this reason we + * keep a refcount for all the FREE entries of each blkptr and ensure that * each of those FREE entries has a corresponding ALLOC preceding it. */ static int @@ -1037,6 +1035,13 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, livelist_entry_t node; node.le_bp = *bp; livelist_entry_t *found = avl_find(avl, &node, NULL); + if (found) { + ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp)); + ASSERT3U(BP_GET_CHECKSUM(bp), ==, + BP_GET_CHECKSUM(&found->le_bp)); + ASSERT3U(BP_PHYSICAL_BIRTH(bp), ==, + BP_PHYSICAL_BIRTH(&found->le_bp)); + } if (bp_freed) { if (found == NULL) { /* first free entry for this blkptr */ @@ -1046,10 +1051,10 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, e->le_refcnt = 1; avl_add(avl, e); } else { - /* dedup block free */ - ASSERT(BP_GET_DEDUP(bp)); - ASSERT3U(BP_GET_CHECKSUM(bp), ==, - BP_GET_CHECKSUM(&found->le_bp)); + /* + * Deduped or cloned block free. We could assert the D bit + * for dedup, but there is no such bit for cloning. + */ ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt); found->le_refcnt++; } @@ -1065,14 +1070,6 @@ /* all tracked free pairs have been matched */ avl_remove(avl, found); kmem_free(found, sizeof (livelist_entry_t)); - } else { - /* - * This is definitely a deduped blkptr so - * let's validate it. - */ - ASSERT(BP_GET_DEDUP(bp)); - ASSERT3U(BP_GET_CHECKSUM(bp), ==, - BP_GET_CHECKSUM(&found->le_bp)); - } } } diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 1410651c63..d7fe96cde6 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -151,7 +151,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * need to be handled with minimum delay. */ -static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { +static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ @@ -1164,6 +1164,275 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) tqs->stqs_taskq = NULL; } +#ifdef _KERNEL +/* + * The READ and WRITE rows of zio_taskqs are configurable at module load time + * by setting zio_taskq_read or zio_taskq_write. + * + * Example (the defaults for READ and WRITE): + * zio_taskq_read='fixed,1,8 null scale null' + * zio_taskq_write='batch fixed,1,5 scale fixed,1,5' + * + * Each sets the entire row at a time. + * + * 'fixed' is parameterised: fixed,Q,T where Q is the number of taskqs and T + * is the number of threads per taskq. + * + * 'null' can only be set on the high-priority queues (queue selection for + * high-priority queues will fall back to the regular queue if the high-pri + * one is NULL). + */ +static const char *const modes[ZTI_NMODES] = { + "fixed", "batch", "scale", "null" +}; + +/* Parse the incoming config string. Modifies cfg */ +static int +spa_taskq_param_set(zio_type_t t, char *cfg) +{ + int err = 0; + + zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; + + char *next = cfg, *tok, *c; + + /* + * Parse out each element from the string and fill `row`.
The entire + * row has to be set at once, so any errors are flagged by just + * breaking out of this loop early. + */ + uint_t q; + for (q = 0; q < ZIO_TASKQ_TYPES; q++) { + /* `next` is the start of the config */ + if (next == NULL) + break; + + /* Eat up leading space */ + while (isspace(*next)) + next++; + if (*next == '\0') + break; + + /* Mode ends at space or end of string */ + tok = next; + next = strchr(tok, ' '); + if (next != NULL) *next++ = '\0'; + + /* Parameters start after a comma */ + c = strchr(tok, ','); + if (c != NULL) *c++ = '\0'; + + /* Match mode string */ + uint_t mode; + for (mode = 0; mode < ZTI_NMODES; mode++) + if (strcmp(tok, modes[mode]) == 0) + break; + if (mode == ZTI_NMODES) + break; + + /* Invalid canary */ + row[q].zti_mode = ZTI_NMODES; + + /* Per-mode setup */ + switch (mode) { + + /* + * FIXED is parameterised: number of queues, and number of + * threads per queue. + */ + case ZTI_MODE_FIXED: { + /* No parameters? */ + if (c == NULL || *c == '\0') + break; + + /* Find next parameter */ + tok = c; + c = strchr(tok, ','); + if (c == NULL) + break; + + /* Take digits and convert */ + unsigned long long nq; + if (!(isdigit(*tok))) + break; + err = ddi_strtoull(tok, &tok, 10, &nq); + /* Must succeed and also end at the next param sep */ + if (err != 0 || tok != c) + break; + + /* Move past the comma */ + tok++; + /* Need another number */ + if (!(isdigit(*tok))) + break; + /* Remember start to make sure we moved */ + c = tok; + + /* Take digits */ + unsigned long long ntpq; + err = ddi_strtoull(tok, &tok, 10, &ntpq); + /* Must succeed, and moved forward */ + if (err != 0 || tok == c || *tok != '\0') + break; + + /* + * sanity; zero queues/threads make no sense, and + * 16K is almost certainly more than anyone will ever + * need and avoids silly numbers like UINT32_MAX + */ + if (nq == 0 || nq >= 16384 || + ntpq == 0 || ntpq >= 16384) + break; + + const zio_taskq_info_t zti = ZTI_P(ntpq, nq); + row[q] = zti; + break; + } + + case ZTI_MODE_BATCH: { + const zio_taskq_info_t zti = ZTI_BATCH; + row[q] = zti; + break; + } + + case ZTI_MODE_SCALE: { + const zio_taskq_info_t zti = ZTI_SCALE; + row[q] = zti; + break; + } + + case ZTI_MODE_NULL: { + /* + * Can only null the high-priority queues; the general- + * purpose ones have to exist. + */ + if (q != ZIO_TASKQ_ISSUE_HIGH && + q != ZIO_TASKQ_INTERRUPT_HIGH) + break; + + const zio_taskq_info_t zti = ZTI_NULL; + row[q] = zti; + break; + } + + default: + break; + } + + /* Ensure we set a mode */ + if (row[q].zti_mode == ZTI_NMODES) + break; + } + + /* Didn't get a full row, fail */ + if (q < ZIO_TASKQ_TYPES) + return (SET_ERROR(EINVAL)); + + /* Eat trailing space */ + if (next != NULL) + while (isspace(*next)) + next++; + + /* If there's anything left over then fail */ + if (next != NULL && *next != '\0') + return (SET_ERROR(EINVAL)); + + /* Success! 
Copy it into the real config */ + for (q = 0; q < ZIO_TASKQ_TYPES; q++) + zio_taskqs[t][q] = row[q]; + + return (0); +} + +static int +spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) +{ + int pos = 0; + + /* Build parameter string from live config */ + const char *sep = ""; + for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { + const zio_taskq_info_t *zti = &zio_taskqs[t][q]; + if (zti->zti_mode == ZTI_MODE_FIXED) + pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, + modes[zti->zti_mode], zti->zti_count, + zti->zti_value); + else + pos += sprintf(&buf[pos], "%s%s", sep, + modes[zti->zti_mode]); + sep = " "; + } + + if (add_newline) + buf[pos++] = '\n'; + buf[pos] = '\0'; + + return (pos); +} + +#ifdef __linux__ +static int +spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE)); +} + +static int +spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); +} +#else +/* + * On FreeBSD load-time parameters can be set up before malloc() is available, + * so we have to do all the parsing work on the stack. + */ +#define SPA_TASKQ_PARAM_MAX (128) + +static int +spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err; + + (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE); + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err || req->newptr == NULL) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); +} + +static int +spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err; + + (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE); + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err || req->newptr == NULL) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); +} +#endif +#endif /* _KERNEL */ + /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
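For reference, each row token is either a bare mode name or 'fixed' with two comma-separated counts. A standalone userspace sketch of parsing one 'fixed,Q,T' token with the same sanity bounds as spa_taskq_param_set() (parse_fixed is a hypothetical helper, not kernel code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse "fixed,Q,T"; returns 0 and fills nq/ntpq on success. */
static int
parse_fixed(const char *tok, unsigned long *nq, unsigned long *ntpq)
{
	const char *c = strchr(tok, ',');
	char *end;

	if (c == NULL || c - tok != 5 || strncmp(tok, "fixed", 5) != 0)
		return (-1);
	*nq = strtoul(c + 1, &end, 10);		/* number of taskqs */
	if (*end != ',')
		return (-1);
	*ntpq = strtoul(end + 1, &end, 10);	/* threads per taskq */
	if (*end != '\0')
		return (-1);
	/* same sanity bounds as the kernel parser above */
	if (*nq == 0 || *nq >= 16384 || *ntpq == 0 || *ntpq >= 16384)
		return (-1);
	return (0);
}

int
main(void)
{
	unsigned long nq, ntpq;

	if (parse_fixed("fixed,1,8", &nq, &ntpq) == 0)
		printf("%lu taskq(s), %lu thread(s) each\n", nq, ntpq);
	return (0);
}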
* Note that a type may have multiple discrete taskqs to avoid lock contention @@ -10210,4 +10479,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); + +#ifdef _KERNEL +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, + spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD, + "Configure IO queues for read IO"); +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, + spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD, + "Configure IO queues for write IO"); +#endif /* END CSTYLED */ diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index 09c7be853b..2e0af60f6d 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -309,6 +309,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) uint64_t dnodesize; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lracl)); + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lracl, sizeof (*lracl)); @@ -470,6 +472,8 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) uint64_t dnodesize; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); @@ -613,6 +617,8 @@ zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap) int error; int vflg = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -648,6 +654,8 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) int error; int vflg = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -715,12 +723,14 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_rename_t *lr = arg2; - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; + + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, 0, NULL)); } @@ -730,12 +740,14 @@ zfs_replay_rename_exchange(void *arg1, void *arg2, boolean_t byteswap) #ifdef __linux__ zfsvfs_t *zfsvfs = arg1; lr_rename_t *lr = arg2; - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; + + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, RENAME_EXCHANGE, NULL)); #else @@ -750,14 +762,13 @@ zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap) zfsvfs_t *zfsvfs = arg1; lr_rename_whiteout_t *lr = arg2; int error; - /* sname and tname follow lr_rename_whiteout_t */ - char *sname = (char *)(lr + 1); - char *tname = sname + strlen(sname) + 1; /* For the whiteout file. 
*/ xvattr_t xva; uint64_t objid; uint64_t dnodesize; + ASSERT3U(lr->lr_rename.lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -783,6 +794,9 @@ zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap) if (error) return (error); + /* sname and tname follow lr_rename_whiteout_t */ + char *sname = (char *)(lr + 1); + char *tname = sname + strlen(sname) + 1; return (do_zfs_replay_rename(zfsvfs, &lr->lr_rename, sname, tname, RENAME_WHITEOUT, &xva.xva_vattr)); #else @@ -800,6 +814,8 @@ zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) int error; uint64_t eod, offset, length; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -863,6 +879,8 @@ zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap) int error; uint64_t end; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -910,6 +928,8 @@ zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) flock64_t fl = {0}; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -940,6 +960,8 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) int error; void *start; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + xva_init(&xva); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); @@ -1002,6 +1024,9 @@ zfs_replay_setsaxattr(void *arg1, void *arg2, boolean_t byteswap) size_t size; int error = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr) + lr->lr_size); + ASSERT(spa_feature_is_active(zfsvfs->z_os->os_spa, SPA_FEATURE_ZILSAXATTR)); if (byteswap) @@ -1079,6 +1104,10 @@ zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + + sizeof (ace_t) * lr->lr_aclcnt); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_oldace_byteswap(ace, lr->lr_aclcnt); @@ -1124,6 +1153,9 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + lr->lr_acl_bytes); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); @@ -1171,6 +1203,10 @@ zfs_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 3a5fa75df2..aa61575a6a 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1103,6 +1104,16 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, return (SET_ERROR(EXDEV)); } + /* + * Cloning across encrypted datasets is possible only if they + * share the same master key. 
+ */ + if (inos != outos && inos->os_encrypted && + !dmu_objset_crypto_key_equal(inos, outos)) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EXDEV)); + } + error = zfs_verify_zp(inzp); if (error == 0) error = zfs_verify_zp(outzp); @@ -1181,11 +1192,18 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, inblksz = inzp->z_blksz; /* - * We cannot clone into files with different block size if we can't - * grow it (block size is already bigger or more than one block). + * We cannot clone into a file with a different block size if we can't + * grow it (its block size is already bigger, it has more than one + * block, or it is not locked for growth). There are other possible + * reasons the grow may fail, but we cover what we can before opening + * the transaction and detect the rest after we try to do it. */ + if (inblksz < outzp->z_blksz) { + error = SET_ERROR(EINVAL); + goto unlock; + } if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz || - outzp->z_size > inblksz)) { + outlr->lr_length != UINT64_MAX)) { error = SET_ERROR(EINVAL); goto unlock; } @@ -1286,20 +1304,6 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, */ break; } - /* - * Encrypted data is fine as long as it comes from the same - * dataset. - * TODO: We want to extend it in the future to allow cloning to - * datasets with the same keys, like clones or to be able to - * clone a file from a snapshot of an encrypted dataset into the - * dataset itself. - */ - if (BP_IS_PROTECTED(&bps[0])) { - if (inzfsvfs != outzfsvfs) { - error = SET_ERROR(EXDEV); - break; - } - } /* * Start a transaction. @@ -1318,12 +1322,24 @@ } /* - * Copy source znode's block size. This only happens on the - * first iteration since zfs_rangelock_reduce() will shrink down - * lr_len to the appropriate size. + * Copy source znode's block size. This is done only if the + * whole znode is locked (see zfs_rangelock_cb()) and only + * on the first iteration since zfs_rangelock_reduce() will + * shrink down lr_length to the appropriate size. */ if (outlr->lr_length == UINT64_MAX) { zfs_grow_blocksize(outzp, inblksz, tx); + + /* + * Block growth may fail for many reasons we cannot + * predict here. If it happens, the cloning is doomed. + */ + if (inblksz != outzp->z_blksz) { + error = SET_ERROR(EINVAL); + dmu_tx_abort(tx); + break; + } + /* * Round range lock up to the block boundary, so we * prevent appends until we are done. @@ -1339,6 +1355,10 @@ break; } + if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) { + update_pages(outzp, outoff, size, outos); + } + zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr, &clear_setid_bits_txg, tx); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 4a4c204bcc..9b5d866a8c 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -91,15 +91,7 @@ * committed to stable storage. Please refer to the zil_commit_waiter() * function (and the comments within it) for more details. */ -static uint_t zfs_commit_timeout_pct = 5; - -/* - * Minimal time we care to delay commit waiting for more ZIL records. - * At least FreeBSD kernel can't sleep for less than 2us at its best. - * So requests to sleep for less then 5us is a waste of CPU time with - * a risk of significant log latency increase due to oversleep.
- */ -static uint64_t zil_min_commit_timeout = 5000; +static uint_t zfs_commit_timeout_pct = 10; /* * See zil.h for more information about these fields. @@ -152,6 +144,7 @@ static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); static itx_t *zil_itx_clone(itx_t *oitx); +static uint64_t zil_max_waste_space(zilog_t *zilog); static int zil_bp_compare(const void *x1, const void *x2) @@ -522,6 +515,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); + ASSERT3U(reclen, <=, end - lrp); if (lr->lrc_seq > claim_lr_seq) { arc_buf_destroy(abuf, &abuf); goto done; @@ -604,7 +598,7 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) lr_write_t *lr = (lr_write_t *)lrc; int error; - ASSERT(lrc->lrc_txtype == TX_WRITE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); /* * If the block is not readable, don't claim it. This can happen @@ -632,7 +626,9 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx, spa_t *spa = zilog->zl_spa; uint_t ii; - ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); if (tx == NULL) { return (0); @@ -646,9 +642,9 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx, bp = &lr->lr_bps[ii]; /* - * When data are embedded into BP there is no need to create - * BRT entry as there is no data block. Just copy the BP as - * it contains the data. + * When data is embedded into the BP there is no need to create + * BRT entry as there is no data block. Just copy the BP as it + * contains the data. */ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) continue; @@ -709,7 +705,7 @@ zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; - ASSERT(lrc->lrc_txtype == TX_WRITE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); /* * If we previously claimed it, we need to free it. @@ -730,7 +726,9 @@ zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) spa_t *spa; uint_t ii; - ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); if (tx == NULL) { return (0); @@ -1625,7 +1623,7 @@ zil_lwb_write_done(zio_t *zio) while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); - if (vd != NULL && !vd->vdev_nowritecache) { + if (vd != NULL) { /* * The "ZIO_FLAG_DONT_PROPAGATE" is currently * always used within "zio_flush". This means, @@ -1713,24 +1711,6 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) mutex_exit(&zilog->zl_lock); } -/* - * Define a limited set of intent log block sizes. - * - * These must be a multiple of 4KB. Note only the amount used (again - * aligned to 4KB) actually gets written. However, we can't always just - * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. - */ -static const struct { - uint64_t limit; - uint64_t blksz; -} zil_block_buckets[] = { - { 4096, 4096 }, /* non TX_WRITE */ - { 8192 + 4096, 8192 + 4096 }, /* database */ - { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ - { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ - { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ -}; - /* * Maximum block size used by the ZIL. This is picked up when the ZIL is * initialized. 
Otherwise this should not be used directly; see @@ -1738,6 +1718,91 @@ static const struct { */ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; +/* + * Plan splitting of the provided burst size between several blocks. + */ +static uint_t +zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) +{ + uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t); + + if (size <= md) { + /* + * Small bursts are written as-is in one block. + */ + *minsize = size; + return (size); + } else if (size > 8 * md) { + /* + * Big bursts use maximum blocks. The first block size + * is hard to predict, but it does not really matter. + */ + *minsize = 0; + return (md); + } + + /* + * Medium bursts try to divide evenly to better utilize several SLOG + * VDEVs. We predict the first block size assuming the worst case of + * the others maxing out. Fall back to using maximum blocks if, due + * to large records or wasted space, we cannot predict anything better. + */ + uint_t s = size; + uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t)); + uint_t chunk = DIV_ROUND_UP(s, n); + uint_t waste = zil_max_waste_space(zilog); + waste = MAX(waste, zilog->zl_cur_max); + if (chunk <= md - waste) { + *minsize = MAX(s - (md - waste) * (n - 1), waste); + return (chunk); + } else { + *minsize = 0; + return (md); + } +} + +/* + * Try to predict the next block size based on previous history. Make the + * prediction sufficient for 7 of 8 previous bursts. Don't try to save if the + * saving is less than 50%; extra writes may cost more, but we don't want a + * single spike to badly affect our predictions. + */ +static uint_t +zil_lwb_predict(zilog_t *zilog) +{ + uint_t m, o; + + /* If we are in the middle of a burst, take it into account as well. */ + if (zilog->zl_cur_size > 0) { + o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m); + } else { + o = UINT_MAX; + m = 0; + } + + /* Find the minimum optimal size. We don't need to go below that. */ + for (int i = 0; i < ZIL_BURSTS; i++) + o = MIN(o, zilog->zl_prev_opt[i]); + + /* Find the two biggest minimal first block sizes above the optimal. */ + uint_t m1 = MAX(m, o), m2 = o; + for (int i = 0; i < ZIL_BURSTS; i++) { + m = zilog->zl_prev_min[i]; + if (m >= m1) { + m2 = m1; + m1 = m; + } else if (m > m2) { + m2 = m; + } + } + + /* + * If the second minimum size gives a 50% saving, use it. It may cost + * us one additional write later, but the space saving is just too big. + */ + return ((m1 < m2 * 2) ? m1 : m2); +} + /* * Close the log block for being issued and allocate the next one. * Has to be called under zl_issuer_lock to chain more lwbs. @@ -1745,7 +1810,7 @@ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; static lwb_t * zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) { - int i; + uint64_t blksz, plan, plan2; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); @@ -1760,34 +1825,40 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) return (NULL); /* - * Log blocks are pre-allocated. Here we select the size of the next - * block, based on size used in the last block. - * - first find the smallest bucket that will fit the block from a - * limited set of block sizes. This is because it's faster to write - * blocks allocated from the same metaslab as they are adjacent or - * close. - * - next find the maximum from the new suggested size and an array of - * previous sizes. This lessens a picket fence effect of wrongly - * guessing the size if we have a stream of say 2k, 64k, 2k, 64k - * requests.
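To make the medium-burst arithmetic in zil_lwb_plan() above concrete, the following standalone sketch runs the same formulas on sample inputs; the header and waste sizes below are illustrative assumptions, not values taken from this patch:

#include <stdio.h>

#define	DIV_ROUND_UP(a, b)	(((a) + (b) - 1) / (b))
#define	MAX(a, b)		((a) > (b) ? (a) : (b))

int
main(void)
{
	unsigned md = 131072 - 200;	/* assumed max block minus chain header */
	unsigned hdr = 192;		/* assumed lr_write_t header size */
	unsigned waste = 4096;		/* assumed max wasted space per block */
	unsigned size = 524288;		/* a "medium" burst: md < size <= 8*md */

	unsigned n = DIV_ROUND_UP(size, md - hdr);	/* blocks needed */
	unsigned chunk = DIV_ROUND_UP(size, n);		/* even split */

	if (chunk <= md - waste) {
		/* worst case: later blocks max out, the first gets the rest */
		unsigned minsize = MAX(size - (md - waste) * (n - 1), waste);
		printf("%u blocks of ~%u bytes, first at least %u\n",
		    n, chunk, minsize);
	} else {
		printf("fall back to maximum %u-byte blocks\n", md);
	}
	return (0);
}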
- * - * Note we only write what is used, but we can't just allocate - * the maximum block size because we can exhaust the available - * pool log space. + * Log blocks are pre-allocated. Here we select the size of the next + * block, based on what's left of this burst and the previous history. + * While we try to write only the used part of the block, we can't just + * always allocate the maximum block size because we could exhaust all + * available pool log space, so we try to be reasonable. */ - uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); - for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) - continue; - zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); - zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; - for (i = 0; i < ZIL_PREV_BLKS; i++) - zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); - DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, - uint64_t, zil_blksz, - uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); - zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); + if (zilog->zl_cur_left > 0) { + /* + * We are in the middle of a burst and know how much is left. + * But if the workload is multi-threaded there may be more soon. + * Try to predict what it can be and plan for the worst case. + */ + uint_t m; + plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m); + if (zilog->zl_parallel) { + plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left + + zil_lwb_predict(zilog), &m); + if (plan < plan2) + plan = plan2; + } + } else { + /* + * The previous burst is done and we can only predict what + * will come next. + */ + plan = zil_lwb_predict(zilog); + } + blksz = plan + sizeof (zil_chain_t); + blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t); + blksz = MIN(blksz, zilog->zl_max_block_size); + DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz, + uint64_t, plan); - return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state)); + return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state)); } /* @@ -1810,6 +1881,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) itx = list_next(&lwb->lwb_itxs, itx)) zil_lwb_commit(zilog, lwb, itx); lwb->lwb_nused = lwb->lwb_nfilled; + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); @@ -1837,7 +1909,7 @@ next_lwb: int wsz = lwb->lwb_sz; if (lwb->lwb_error == 0) { abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk) prio = ZIO_PRIORITY_SYNC_WRITE; else prio = ZIO_PRIORITY_ASYNC_WRITE; @@ -1998,6 +2070,42 @@ zil_max_copied_data(zilog_t *zilog) return (MIN(max_data, zil_maxcopied)); } +static uint64_t +zil_itx_record_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + + if (lr->lrc_txtype == TX_COMMIT) + return (0); + ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t)); + return (lr->lrc_reclen); +} + +static uint64_t +zil_itx_data_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + lr_write_t *lrw = (lr_write_t *)lr; + + if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + ASSERT3U(lr->lrc_reclen, ==, sizeof (lr_write_t)); + return (P2ROUNDUP_TYPED(lrw->lr_length, sizeof (uint64_t), + uint64_t)); + } + return (0); +} + +static uint64_t +zil_itx_full_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + + if (lr->lrc_txtype == TX_COMMIT) + return (0); + ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t)); + return (lr->lrc_reclen + zil_itx_data_size(itx)); +} + /* * Estimate space
needed in the lwb for the itx. Allocate more lwbs or * split the itx as needed, but don't touch the actual transaction data. @@ -2039,14 +2147,10 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) return (lwb); } - if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { - dlen = P2ROUNDUP_TYPED( - lrw->lr_length, sizeof (uint64_t), uint64_t); - } else { - dlen = 0; - } reclen = lr->lrc_reclen; - zilog->zl_cur_used += (reclen + dlen); + ASSERT3U(reclen, >=, sizeof (lr_t)); + ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0)); + dlen = zil_itx_data_size(itx); cont: /* @@ -2064,19 +2168,19 @@ cont: if (lwb == NULL) return (NULL); lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; - - /* - * There must be enough space in the new, empty log block to - * hold reclen. For WR_COPIED, we need to fit the whole - * record in one block, and reclen is the header size + the - * data size. For WR_NEED_COPY, we can create multiple - * records, splitting the data into multiple blocks, so we - * only need to fit one word of data per block; in this case - * reclen is just the header size (no data). - */ - ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); } + /* + * There must be enough space in the log block to hold reclen. + * For WR_COPIED, we need to fit the whole record in one block, + * and reclen is the write record header size + the data size. + * For WR_NEED_COPY, we can create multiple records, splitting + * the data into multiple blocks, so we only need to fit one + * word of data per block; in this case reclen is just the header + * size (no data). + */ + ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); + dnow = MIN(dlen, lwb_sp - reclen); if (dlen > dnow) { ASSERT3U(lr->lrc_txtype, ==, TX_WRITE); @@ -2087,6 +2191,7 @@ cont: clrw->lr_length = dnow; lrw->lr_offset += dnow; lrw->lr_length -= dnow; + zilog->zl_cur_left -= dnow; } else { citx = itx; clr = lr; @@ -2108,10 +2213,8 @@ cont: list_insert_tail(&lwb->lwb_itxs, citx); dlen -= dnow; - if (dlen > 0) { - zilog->zl_cur_used += reclen; + if (dlen > 0) goto cont; - } if (lr->lrc_txtype == TX_WRITE && lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) @@ -2138,13 +2241,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) if (lr->lrc_txtype == TX_COMMIT) return; - if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { - dlen = P2ROUNDUP_TYPED( - lrw->lr_length, sizeof (uint64_t), uint64_t); - } else { - dlen = 0; - } reclen = lr->lrc_reclen; + dlen = zil_itx_data_size(itx); ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; @@ -2252,7 +2350,9 @@ zil_itx_create(uint64_t txtype, size_t olrsize) size_t itxsize, lrsize; itx_t *itx; + ASSERT3U(olrsize, >=, sizeof (lr_t)); lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t); + ASSERT3U(lrsize, >=, olrsize); itxsize = offsetof(itx_t, itx_lr) + lrsize; itx = zio_data_buf_alloc(itxsize); @@ -2271,6 +2371,10 @@ zil_itx_create(uint64_t txtype, size_t olrsize) static itx_t * zil_itx_clone(itx_t *oitx) { + ASSERT3U(oitx->itx_size, >=, sizeof (itx_t)); + ASSERT3U(oitx->itx_size, ==, + offsetof(itx_t, itx_lr) + oitx->itx_lr.lrc_reclen); + itx_t *itx = zio_data_buf_alloc(oitx->itx_size); memcpy(itx, oitx, oitx->itx_size); itx->itx_callback = NULL; @@ -2281,6 +2385,9 @@ zil_itx_clone(itx_t *oitx) void zil_itx_destroy(itx_t *itx) { + ASSERT3U(itx->itx_size, >=, sizeof (itx_t)); + ASSERT3U(itx->itx_lr.lrc_reclen, ==, + itx->itx_size - offsetof(itx_t, itx_lr)); IMPLY(itx->itx_lr.lrc_txtype == 
TX_COMMIT, itx->itx_callback == NULL); IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); @@ -2364,7 +2471,7 @@ void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; - itx_async_node_t *ian; + itx_async_node_t *ian, ian_search; avl_tree_t *t; avl_index_t where; list_t clean_list; @@ -2391,7 +2498,8 @@ zil_remove_async(zilog_t *zilog, uint64_t oid) * Locate the object node and append its list. */ t = &itxg->itxg_itxs->i_async_tree; - ian = avl_find(t, &oid, &where); + ian_search.ia_foid = oid; + ian = avl_find(t, &ian_search, &where); if (ian != NULL) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); @@ -2565,6 +2673,7 @@ zil_get_commit_list(zilog_t *zilog) ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); list_t *sync_list = &itxg->itxg_itxs->i_sync_list; + itx_t *itx = NULL; if (unlikely(zilog->zl_suspend > 0)) { /* * ZIL was just suspended, but we lost the race. @@ -2574,10 +2683,20 @@ zil_get_commit_list(zilog_t *zilog) if (!list_is_empty(sync_list)) wtxg = MAX(wtxg, txg); } else { + itx = list_head(sync_list); list_move_tail(commit_list, sync_list); } mutex_exit(&itxg->itxg_lock); + + while (itx != NULL) { + uint64_t s = zil_itx_full_size(itx); + zilog->zl_cur_size += s; + zilog->zl_cur_left += s; + s = zil_itx_record_size(itx); + zilog->zl_cur_max = MAX(zilog->zl_cur_max, s); + itx = list_next(commit_list, itx); + } } return (wtxg); } @@ -2589,7 +2708,7 @@ void zil_async_to_sync(zilog_t *zilog, uint64_t foid) { uint64_t otxg, txg; - itx_async_node_t *ian; + itx_async_node_t *ian, ian_search; avl_tree_t *t; avl_index_t where; @@ -2619,7 +2738,8 @@ zil_async_to_sync(zilog_t *zilog, uint64_t foid) */ t = &itxg->itxg_itxs->i_async_tree; if (foid != 0) { - ian = avl_find(t, &foid, &where); + ian_search.ia_foid = foid; + ian = avl_find(t, &ian_search, &where); if (ian != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); @@ -2712,6 +2832,26 @@ zil_commit_writer_stall(zilog_t *zilog) ASSERT(list_is_empty(&zilog->zl_lwb_list)); } +static void +zil_burst_done(zilog_t *zilog) +{ + if (!list_is_empty(&zilog->zl_itx_commit_list) || + zilog->zl_cur_size == 0) + return; + + if (zilog->zl_parallel) + zilog->zl_parallel--; + + uint_t r = (zilog->zl_prev_rotor + 1) & (ZIL_BURSTS - 1); + zilog->zl_prev_rotor = r; + zilog->zl_prev_opt[r] = zil_lwb_plan(zilog, zilog->zl_cur_size, + &zilog->zl_prev_min[r]); + + zilog->zl_cur_size = 0; + zilog->zl_cur_max = 0; + zilog->zl_cur_left = 0; +} + /* * This function will traverse the commit list, creating new lwbs as * needed, and committing the itxs from the commit list to these newly @@ -2726,7 +2866,6 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) list_t nolwb_waiters; lwb_t *lwb, *plwb; itx_t *itx; - boolean_t first = B_TRUE; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); @@ -2752,9 +2891,22 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) zil_commit_activate_saxattr_feature(zilog); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); - first = (lwb->lwb_state == LWB_STATE_NEW) && - ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL || - plwb->lwb_state == LWB_STATE_FLUSH_DONE); + + /* + * If the lwb is still opened, it means the workload is really + * multi-threaded and we won the chance of write aggregation. 
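The zil_get_commit_list() loop above seeds the per-burst counters: zl_cur_size and zl_cur_left accumulate zil_itx_full_size() of each itx, while zl_cur_max tracks only the largest zil_itx_record_size(). A toy userspace calculation with made-up record sizes:

#include <stdio.h>

int
main(void)
{
	/* hypothetical itxs: {record header len, rounded WR_NEED_COPY data} */
	unsigned itxs[3][2] = { {4096, 0}, {192, 131072}, {65536, 0} };
	unsigned cur_size = 0, cur_max = 0;

	for (int i = 0; i < 3; i++) {
		cur_size += itxs[i][0] + itxs[i][1];	/* zil_itx_full_size() */
		if (itxs[i][0] > cur_max)		/* zil_itx_record_size() */
			cur_max = itxs[i][0];
	}
	/* zl_cur_left starts equal to zl_cur_size and drains as itxs land */
	printf("size %u, left %u, max record %u\n",
	    cur_size, cur_size, cur_max);
	return (0);
}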
+ * If it is not opened yet, but the previous lwb is still not + * flushed, it still means the workload is multi-threaded, but + * there was too much time between the commits to aggregate, so + * we try aggregation next time, but without too much hope. + */ + if (lwb->lwb_state == LWB_STATE_OPENED) { + zilog->zl_parallel = ZIL_BURSTS; + } else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) + != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) { + zilog->zl_parallel = MAX(zilog->zl_parallel, + ZIL_BURSTS / 2); + } } while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { @@ -2829,7 +2981,9 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * Our lwb is done, leave the rest of * itx list to somebody else who care. */ - first = B_FALSE; + zilog->zl_parallel = ZIL_BURSTS; + zilog->zl_cur_left -= + zil_itx_full_size(itx); break; } } else { @@ -2839,8 +2993,10 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) } list_insert_tail(&nolwb_itxs, itx); } + zilog->zl_cur_left -= zil_itx_full_size(itx); } else { ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT); + zilog->zl_cur_left -= zil_itx_full_size(itx); zil_itx_destroy(itx); } } @@ -2921,28 +3077,15 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * try and pack as many itxs into as few lwbs as * possible, without significantly impacting the latency * of each individual itx. - * - * If we had no already running or open LWBs, it can be - * the workload is single-threaded. And if the ZIL write - * latency is very small or if the LWB is almost full, it - * may be cheaper to bypass the delay. */ - if (lwb->lwb_state == LWB_STATE_OPENED && first) { - hrtime_t sleep = zilog->zl_last_lwb_latency * - zfs_commit_timeout_pct / 100; - if (sleep < zil_min_commit_timeout || - lwb->lwb_nmax - lwb->lwb_nused < - lwb->lwb_nmax / 8) { - list_insert_tail(ilwbs, lwb); - lwb = zil_lwb_write_close(zilog, lwb, - LWB_STATE_NEW); - zilog->zl_cur_used = 0; - if (lwb == NULL) { - while ((lwb = list_remove_head(ilwbs)) - != NULL) - zil_lwb_write_issue(zilog, lwb); - zil_commit_writer_stall(zilog); - } + if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { + zil_burst_done(zilog); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); + if (lwb == NULL) { + while ((lwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); + zil_commit_writer_stall(zilog); } } } @@ -3096,24 +3239,11 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ + zil_burst_done(zilog); lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); - /* - * Since the lwb's zio hadn't been issued by the time this thread - * reached its timeout, we reset the zilog's "zl_cur_used" field - * to influence the zil block size selection algorithm. - * - * By having to issue the lwb's zio here, it means the size of the - * lwb was too large, given the incoming throughput of itxs. By - * setting "zl_cur_used" to zero, we communicate this fact to the - * block size selection algorithm, so it can take this information - * into account, and potentially select a smaller size for the - * next lwb block that is allocated.
- */ - zilog->zl_cur_used = 0; - if (nlwb == NULL) { /* * When zil_lwb_write_close() returns NULL, this @@ -3708,7 +3838,9 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; - zilog->zl_max_block_size = zil_maxblocksize; + zilog->zl_max_block_size = MIN(MAX(P2ALIGN_TYPED(zil_maxblocksize, + ZIL_MIN_BLKSZ, uint64_t), ZIL_MIN_BLKSZ), + spa_maxblocksize(dmu_objset_spa(os))); mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); @@ -3728,6 +3860,11 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL); + for (int i = 0; i < ZIL_BURSTS; i++) { + zilog->zl_prev_opt[i] = zilog->zl_max_block_size - + sizeof (zil_chain_t); + } + return (zilog); } @@ -4230,9 +4367,6 @@ EXPORT_SYMBOL(zil_kstat_values_update); ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW, "ZIL block open timeout percentage"); -ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW, - "Minimum delay we care for ZIL block commit"); - ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, "Disable intent logging replay"); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 3b3b40fa73..d0b4016237 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -306,6 +306,53 @@ zio_fini(void) * ========================================================================== */ +#ifdef ZFS_DEBUG +static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b; +#endif + +/* + * Use empty space after the buffer to detect overflows. + * + * Since zio_init() creates kmem caches only for certain set of buffer sizes, + * allocations of different sizes may have some unused space after the data. + * Filling part of that space with a known pattern on allocation and checking + * it on free should allow us to detect some buffer overflows. + */ +static void +zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) +{ +#ifdef ZFS_DEBUG + size_t off = P2ROUNDUP(size, sizeof (ulong_t)); + ulong_t *canary = p + off / sizeof (ulong_t); + size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; + if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && + cache[c] == cache[c + 1]) + asize = (c + 2) << SPA_MINBLOCKSHIFT; + for (; off < asize; canary++, off += sizeof (ulong_t)) + *canary = zio_buf_canary; +#endif +} + +static void +zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) +{ +#ifdef ZFS_DEBUG + size_t off = P2ROUNDUP(size, sizeof (ulong_t)); + ulong_t *canary = p + off / sizeof (ulong_t); + size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; + if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && + cache[c] == cache[c + 1]) + asize = (c + 2) << SPA_MINBLOCKSHIFT; + for (; off < asize; canary++, off += sizeof (ulong_t)) { + if (unlikely(*canary != zio_buf_canary)) { + PANIC("ZIO buffer overflow %p (%zu) + %zu %#lx != %#lx", + p, size, (canary - p) * sizeof (ulong_t), + *canary, zio_buf_canary); + } + } +#endif +} + /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. 
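The canary scheme above exploits the slack between the requested size and the kmem cache's rounded-up buffer size. A standalone userspace illustration of the same fill-then-verify idea (plain malloc and arbitrary sizes; this is not the ZFS implementation):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define	CANARY	((unsigned long)0xdeadc0dedead210bULL)

/* Fill the unused tail of an allocation with the canary pattern. */
static void
canary_fill(unsigned long *p, size_t used, size_t asize)
{
	for (size_t w = (used + sizeof (*p) - 1) / sizeof (*p);
	    w < asize / sizeof (*p); w++)
		p[w] = CANARY;
}

/* Verify the tail is intact; a mismatch means the payload overflowed. */
static int
canary_check(const unsigned long *p, size_t used, size_t asize)
{
	for (size_t w = (used + sizeof (*p) - 1) / sizeof (*p);
	    w < asize / sizeof (*p); w++)
		if (p[w] != CANARY)
			return (-1);
	return (0);
}

int
main(void)
{
	size_t used = 100, asize = 512;	/* 100 bytes requested of a 512 buffer */
	unsigned long *buf = malloc(asize);

	canary_fill(buf, used, asize);
	memset(buf, 0xab, used);	/* in-bounds write keeps the tail intact */
	printf("canary %s\n", canary_check(buf, used, asize) ? "hit" : "ok");
	free(buf);
	return (0);
}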
Obviously, it's @@ -322,7 +369,9 @@ zio_buf_alloc(size_t size) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif - return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); + void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE); + zio_buf_put_canary(p, size, zio_buf_cache, c); + return (p); } /* @@ -338,7 +387,9 @@ zio_data_buf_alloc(size_t size) VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); + void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE); + zio_buf_put_canary(p, size, zio_data_buf_cache, c); + return (p); } void @@ -351,6 +402,7 @@ zio_buf_free(void *buf, size_t size) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif + zio_buf_check_canary(buf, size, zio_buf_cache, c); kmem_cache_free(zio_buf_cache[c], buf); } @@ -361,6 +413,7 @@ zio_data_buf_free(void *buf, size_t size) VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + zio_buf_check_canary(buf, size, zio_data_buf_cache, c); kmem_cache_free(zio_data_buf_cache[c], buf); } @@ -1382,23 +1435,10 @@ zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, zio_flag_t flags) { - zio_t *zio; - int c; - - if (vd->vdev_children == 0) { - zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, - ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, - ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); - - zio->io_cmd = cmd; - } else { - zio = zio_null(pio, spa, NULL, NULL, NULL, flags); - - for (c = 0; c < vd->vdev_children; c++) - zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, - done, private, flags)); - } - + zio_t *zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, + ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, + ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); + zio->io_cmd = cmd; return (zio); } @@ -1569,11 +1609,18 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, } void -zio_flush(zio_t *zio, vdev_t *vd) +zio_flush(zio_t *pio, vdev_t *vd) { - zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); + if (vd->vdev_nowritecache) + return; + if (vd->vdev_children == 0) { + zio_nowait(zio_ioctl(pio, vd->vdev_spa, vd, + DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); + } else { + for (uint64_t c = 0; c < vd->vdev_children; c++) + zio_flush(pio, vd->vdev_child[c]); + } } void diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index 9de515e876..e511b31fee 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -363,11 +363,14 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, zil_chain_t zilc; abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t)); - size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ, - uint64_t); + uint64_t nused = P2ROUNDUP_TYPED(zilc.zc_nused, + ZIL_MIN_BLKSZ, uint64_t); + ASSERT3U(size, >=, nused); + size = nused; eck = zilc.zc_eck; eck_offset = offsetof(zil_chain_t, zc_eck); } else { + ASSERT3U(size, >=, sizeof (zio_eck_t)); eck_offset = size - sizeof (zio_eck_t); abd_copy_to_buf_off(&eck, abd, eck_offset, sizeof (zio_eck_t)); @@ -448,12 +451,13 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, return (SET_ERROR(ECKSUM)); } - if (nused > size) { + nused = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + if (size < nused) return (SET_ERROR(ECKSUM)); - } - - size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + size = nused; } else { + if 
(size < sizeof (zio_eck_t)) + return (SET_ERROR(ECKSUM)); eck_offset = size - sizeof (zio_eck_t); abd_copy_to_buf_off(&eck, abd, eck_offset, sizeof (zio_eck_t)); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index e661c55d18..8f68b95d7a 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -451,6 +451,8 @@ zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) lr_truncate_t *lr = arg2; uint64_t offset, length; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -487,6 +489,8 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) dmu_tx_t *tx; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -516,60 +520,6 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) return (error); } -/* - * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed - * after a system failure. - * - * TODO: For now we drop block cloning transations for ZVOLs as they are - * unsupported, but we still need to inform BRT about that as we - * claimed them during pool import. - * This situation can occur when we try to import a pool from a ZFS - * version supporting block cloning for ZVOLs into a system that - * has this ZFS version, that doesn't support block cloning for ZVOLs. - */ -static int -zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) -{ - char name[ZFS_MAX_DATASET_NAME_LEN]; - zvol_state_t *zv = arg1; - objset_t *os = zv->zv_objset; - lr_clone_range_t *lr = arg2; - blkptr_t *bp; - dmu_tx_t *tx; - spa_t *spa; - uint_t ii; - int error; - - dmu_objset_name(os, name); - cmn_err(CE_WARN, "ZFS dropping block cloning transaction for %s.", - name); - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - tx = dmu_tx_create(os); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - - spa = os->os_spa; - - for (ii = 0; ii < lr->lr_nbps; ii++) { - bp = &lr->lr_bps[ii]; - - if (!BP_IS_HOLE(bp)) { - zio_free(spa, dmu_tx_get_txg(tx), bp); - } - } - - (void) zil_replaying(zv->zv_zilog, tx); - dmu_tx_commit(tx); - - return (0); -} - static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { @@ -604,7 +554,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_SETSAXATTR */ zvol_replay_err, /* TX_RENAME_EXCHANGE */ zvol_replay_err, /* TX_RENAME_WHITEOUT */ - zvol_replay_clone_range /* TX_CLONE_RANGE */ + zvol_replay_err, /* TX_CLONE_RANGE */ }; /* diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index ef787c65c0..75b9d752a6 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -53,6 +53,12 @@ tags = ['functional', 'arc'] tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on'] tags = ['functional', 'atime'] +[tests/functional/block_cloning] +tests = ['block_cloning_clone_mmap_cached', + 'block_cloning_copyfilerange', + 'block_cloning_copyfilerange_partial'] +tags = ['functional', 'block_cloning'] + [tests/functional/bootfs] tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', 'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos', diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 8bc55a1b4b..fb78d96fb5 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -42,6 +42,7 @@ tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', 'block_cloning_disabled_copyfilerange', 
'block_cloning_disabled_ficlone', 'block_cloning_disabled_ficlonerange', 'block_cloning_copyfilerange_cross_dataset', + 'block_cloning_cross_enc_dataset', 'block_cloning_copyfilerange_fallback_same_txg'] tags = ['functional', 'block_cloning'] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 4608e87522..01909c4168 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -270,6 +270,7 @@ if sys.platform.startswith('freebsd'): }) elif sys.platform.startswith('linux'): maybe.update({ + 'block_cloning/block_cloning_clone_mmap_cached': ['SKIP', cfr_reason], 'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason], 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason], 'fault/auto_online_002_pos': ['FAIL', 11889], @@ -305,6 +306,8 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_cross_reason], 'block_cloning/block_cloning_copyfilerange_fallback_same_txg': ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_cross_enc_dataset': + ['SKIP', cfr_cross_reason], }) diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore index 5f53b68719..9b77197dce 100644 --- a/tests/zfs-tests/cmd/.gitignore +++ b/tests/zfs-tests/cmd/.gitignore @@ -2,6 +2,7 @@ /btree_test /chg_usr_exec /clonefile +/clone_mmap_cached /devname2devid /dir_rd_update /draid diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index 9bdb3c2097..101e26a12f 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -2,6 +2,7 @@ scripts_zfs_tests_bindir = $(datadir)/$(PACKAGE)/zfs-tests/bin scripts_zfs_tests_bin_PROGRAMS = %D%/chg_usr_exec +scripts_zfs_tests_bin_PROGRAMS += %D%/clone_mmap_cached scripts_zfs_tests_bin_PROGRAMS += %D%/cp_files scripts_zfs_tests_bin_PROGRAMS += %D%/ctime scripts_zfs_tests_bin_PROGRAMS += %D%/dir_rd_update diff --git a/tests/zfs-tests/cmd/clone_mmap_cached.c b/tests/zfs-tests/cmd/clone_mmap_cached.c new file mode 100644 index 0000000000..c1cdf796cf --- /dev/null +++ b/tests/zfs-tests/cmd/clone_mmap_cached.c @@ -0,0 +1,146 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2024 by Pawel Jakub Dawidek + */ + +#include <sys/mman.h> +#include <sys/stat.h> + +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#ifdef __FreeBSD__ +#define loff_t off_t +#endif + +ssize_t +copy_file_range(int, loff_t *, int, loff_t *, size_t, unsigned int) + __attribute__((weak)); + +static void * +mmap_file(int fd, size_t size) +{ + void *p; + + p = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + (void) fprintf(stderr, "mmap failed: %s\n", strerror(errno)); + exit(2); + } + + return (p); +} + +static void +usage(const char *progname) +{ + + /* + * -i cache the input file before copy_file_range(2). + * -o cache the output file before copy_file_range(2). + */ + (void) fprintf(stderr, "usage: %s [-io] <input> <output>\n", progname); + exit(3); +} + +int +main(int argc, char *argv[]) +{ + int dfd, sfd; + size_t dsize, ssize; + void *dmem, *smem, *ptr; + off_t doff, soff; + struct stat sb; + bool cache_input, cache_output; + const char *progname; + int c; + + progname = argv[0]; + cache_input = cache_output = false; + + while ((c = getopt(argc, argv, "io")) != -1) { + switch (c) { + case 'i': + cache_input = true; + break; + case 'o': + cache_output = true; + break; + default: + usage(progname); + } + } + argc -= optind; + argv += optind; + + if (argc != 2) { + usage(progname); + } + + sfd = open(argv[0], O_RDONLY); + if (fstat(sfd, &sb) == -1) { + (void) fprintf(stderr, "fstat failed: %s\n", strerror(errno)); + exit(2); + } + ssize = sb.st_size; + smem = mmap_file(sfd, ssize); + + dfd = open(argv[1], O_RDWR); + if (fstat(dfd, &sb) == -1) { + (void) fprintf(stderr, "fstat failed: %s\n", strerror(errno)); + exit(2); + } + dsize = sb.st_size; + dmem = mmap_file(dfd, dsize); + + /* + * Read the mapped files to populate the page cache. + * Hopefully the memcpy() won't be compiled out. + */ + if (cache_input) { + ptr = malloc(ssize); + assert(ptr != NULL); + memcpy(ptr, smem, ssize); + free(ptr); + } + if (cache_output) { + ptr = malloc(dsize); + assert(ptr != NULL); + memcpy(ptr, dmem, dsize); + free(ptr); + } + + soff = doff = 0; + if (copy_file_range(sfd, &soff, dfd, &doff, ssize, 0) < 0) { + (void) fprintf(stderr, "copy_file_range failed: %s\n", + strerror(errno)); + exit(2); + } + + exit(memcmp(smem, dmem, ssize) == 0 ? 0 : 1); +} diff --git a/tests/zfs-tests/cmd/ctime.c b/tests/zfs-tests/cmd/ctime.c index 0f5d81aea6..5ff1cea8a8 100644 --- a/tests/zfs-tests/cmd/ctime.c +++ b/tests/zfs-tests/cmd/ctime.c @@ -362,12 +362,20 @@ main(void) return (1); } - if (t1 == t2) { - (void) fprintf(stderr, "%s: t1(%ld) == t2(%ld)\n", + + /* + * Ideally, the time change would be exactly two seconds, but allow + * a little slack in case of scheduling delays or similar. 
+ */ + long delta = (long)t2 - (long)t1; + if (delta < 2 || delta > 4) { + (void) fprintf(stderr, + "%s: BAD time change: t1(%ld), t2(%ld)\n", timetest_table[i].name, (long)t1, (long)t2); return (1); } else { - (void) fprintf(stderr, "%s: t1(%ld) != t2(%ld)\n", + (void) fprintf(stderr, + "%s: good time change: t1(%ld), t2(%ld)\n", timetest_table[i].name, (long)t1, (long)t2); } } diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 648f2203df..afa3f7b18a 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -184,6 +184,7 @@ export ZFSTEST_FILES='badsend btree_test chg_usr_exec clonefile + clone_mmap_cached devname2devid dir_rd_update draid diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 87b50f59ca..a28bdd187f 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -440,6 +440,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/atime/setup.ksh \ functional/block_cloning/cleanup.ksh \ functional/block_cloning/setup.ksh \ + functional/block_cloning/block_cloning_clone_mmap_cached.ksh \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \ @@ -451,6 +452,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/block_cloning/block_cloning_ficlone.ksh \ functional/block_cloning/block_cloning_ficlonerange.ksh \ functional/block_cloning/block_cloning_ficlonerange_partial.ksh \ + functional/block_cloning/block_cloning_cross_enc_dataset.ksh \ functional/bootfs/bootfs_001_pos.ksh \ functional/bootfs/bootfs_002_neg.ksh \ functional/bootfs/bootfs_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib index 8e16366b4c..526bd54a2b 100644 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib @@ -28,8 +28,8 @@ function have_same_content { - typeset hash1=$(cat $1 | md5sum) - typeset hash2=$(cat $2 | md5sum) + typeset hash1=$(md5digest $1) + typeset hash2=$(md5digest $2) log_must [ "$hash1" = "$hash2" ] } @@ -44,10 +44,14 @@ function have_same_content # function get_same_blocks { + KEY=$5 + if [ ${#KEY} -gt 0 ]; then + KEY="--key=$KEY" + fi typeset zdbout=${TMPDIR:-$TEST_BASE_DIR}/zdbout.$$ - zdb -vvvvv $1 -O $2 | \ + zdb $KEY -vvvvv $1 -O $2 | \ awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.a - zdb -vvvvv $3 -O $4 | \ + zdb $KEY -vvvvv $3 -O $4 | \ awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.b echo $(sort $zdbout.a $zdbout.b | uniq -d | cut -f1 -d' ') } diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_cached.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_cached.ksh new file mode 100755 index 0000000000..b0ef8ec995 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_cached.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +# +# DESCRIPTION: +# When the destination file is mmapped and already cached, we need to +# update the mmapped pages after a successful clone. +# +# STRATEGY: +# 1. Create a pool. +# 2. Create two test files with random content. +# 3. mmap the files, read them, and clone from one to the other using +# clone_mmap_cached. +# 4. clone_mmap_cached also verifies that the content of the destination +# file was updated by reading it from the mmapped memory. +# + +verify_runnable "global" + +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +VDIR=$TEST_BASE_DIR/disk-bclone +VDEV="$VDIR/a" + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + rm -rf $VDIR +} + +log_onexit cleanup + +log_assert "Test for clone into mmapped and cached file" + +log_must rm -rf $VDIR +log_must mkdir -p $VDIR +log_must truncate -s 1G $VDEV + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV +log_must zfs create $TESTPOOL/$TESTFS + +for opts in "--" "-i" "-o" "-io" +do + log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/src bs=1M count=1 + log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/dst bs=1M count=1 + + # Clear cache. + log_must zpool export $TESTPOOL + log_must zpool import -d $VDIR $TESTPOOL + + log_must clone_mmap_cached $opts /$TESTPOOL/$TESTFS/src /$TESTPOOL/$TESTFS/dst + + sync_pool $TESTPOOL + log_must sync + + log_must have_same_content /$TESTPOOL/$TESTFS/src /$TESTPOOL/$TESTFS/dst + blocks=$(get_same_blocks $TESTPOOL/$TESTFS src $TESTPOOL/$TESTFS dst) + # FreeBSD's seq(1) leaves a trailing space, remove it with sed(1). + log_must [ "$blocks" = "$(seq -s " " 0 7 | sed 's/ $//')" ] +done + +log_pass "Clone properly updates mmapped and cached pages" diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh new file mode 100755 index 0000000000..fe8f0867b9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh @@ -0,0 +1,170 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Kay Pedersen +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "5.3") ]]; then + log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" +fi + +claim="Block cloning across encrypted datasets." + +log_assert $claim + +DS1="$TESTPOOL/encrypted1" +DS2="$TESTPOOL/encrypted2" +DS1_NC="$TESTPOOL/notcrypted1" +PASSPHRASE="top_secret" + +function prepare_enc +{ + log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ "-o keyformat=passphrase -o keylocation=prompt $DS1" + log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ "-o keyformat=passphrase -o keylocation=prompt $DS2" + log_must zfs create $DS1/child1 + log_must zfs create $DS1/child2 + log_must zfs create $DS1_NC + + log_note "Create test files" + # We must wait until the src file's txg has been written to disk, + # otherwise we will fall back to a normal copy. See "dmu_read_l0_bps" in + # "zfs/module/zfs/dmu.c" and "zfs_clone_range" in + # "zfs/module/zfs/zfs_vnops.c" + log_must dd if=/dev/urandom of=/$DS1/file bs=128K count=4 + log_must dd if=/dev/urandom of=/$DS1/child1/file bs=128K count=4 + log_must dd if=/dev/urandom of=/$DS1_NC/file bs=128K count=4 + log_must sync_pool $TESTPOOL +} + +function cleanup_enc +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +function clone_and_check +{ + I_FILE="$1" + O_FILE=$2 + I_DS=$3 + O_DS=$4 + SAME_BLOCKS=$5 + # The CLONE option chooses between copy_file_range(2), which should + # clone, and dd(1), which always copies. + CLONE=$6 + SNAPSHOT=$7 + if [ ${#SNAPSHOT} -gt 0 ]; then + I_FILE=".zfs/snapshot/$SNAPSHOT/$1" + fi + if [ $CLONE ]; then + log_must clonefile -f "/$I_DS/$I_FILE" "/$O_DS/$O_FILE" 0 0 524288 + else + log_must dd if="/$I_DS/$I_FILE" of="/$O_DS/$O_FILE" bs=128K + fi + log_must sync_pool $TESTPOOL + + log_must have_same_content "/$I_DS/$I_FILE" "/$O_DS/$O_FILE" + + if [ ${#SNAPSHOT} -gt 0 ]; then + I_DS="$I_DS@$SNAPSHOT" + I_FILE="$1" + fi + typeset blocks=$(get_same_blocks \ $I_DS $I_FILE $O_DS $O_FILE $PASSPHRASE) + log_must [ "$blocks" = "$SAME_BLOCKS" ] +} + +log_onexit cleanup_enc + +prepare_enc + +log_note "Cloning entire file with copy_file_range across different enc" \ "roots, should fall back" +# We expect no shared blocks. +clone_and_check "file" "clone" $DS1 $DS2 "" true +log_note "check that the file is still readable and the same after" \ "unmount and key unload, shouldn't fail" +typeset hash1=$(md5digest "/$DS1/file") +log_must zfs umount $DS1 +log_must zfs unload-key $DS1 +typeset hash2=$(md5digest "/$DS2/clone") +log_must [ "$hash1" = "$hash2" ] + +cleanup_enc +prepare_enc + +log_note "Cloning entire file with copy_file_range across different child datasets" +# Cloning shouldn't work because the child derives a new master key; +# we expect no shared blocks. 
+clone_and_check "file" "clone" $DS1 "$DS1/child1" "" true +clone_and_check "file" "clone" "$DS1/child1" "$DS1/child2" "" true + +cleanup_enc +prepare_enc + +log_note "Copying entire file with copy_file_range across same snapshot" +log_must zfs snapshot -r $DS1@s1 +log_must sync_pool $TESTPOOL +log_must rm -f "/$DS1/file" +log_must sync_pool $TESTPOOL +clone_and_check "file" "clone" "$DS1" "$DS1" "0 1 2 3" true "s1" + +cleanup_enc +prepare_enc + +log_note "Copying entire file with copy_file_range across different snapshot" +clone_and_check "file" "file" $DS1 $DS2 "" true +log_must zfs snapshot -r $DS2@s1 +log_must sync_pool $TESTPOOL +log_must rm -f "/$DS1/file" "/$DS2/file" +log_must sync_pool $TESTPOOL +clone_and_check "file" "clone" "$DS2" "$DS1" "" true "s1" +typeset hash1=$(md5digest "/$DS1/.zfs/snapshot/s1/file") +log_note "destroy the snapshot and check if the file is still readable and" \ + "has the same content" +log_must zfs destroy -r $DS2@s1 +log_must sync_pool $TESTPOOL +typeset hash2=$(md5digest "/$DS1/file") +log_must [ "$hash1" = "$hash2" ] + +cleanup_enc +prepare_enc + +log_note "Copying with copy_file_range from non encrypted to encrypted" +clone_and_check "file" "copy" $DS1_NC $DS1 "" true + +cleanup_enc +prepare_enc + +log_note "Copying with copy_file_range from encrypted to non encrypted" +clone_and_check "file" "copy" $DS1 $DS1_NC "" true + +log_must sync_pool $TESTPOOL + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/setup.ksh b/tests/zfs-tests/tests/functional/block_cloning/setup.ksh index 58441bf8f3..a9b13f062a 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/setup.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/setup.ksh @@ -30,6 +30,9 @@ if ! command -v clonefile > /dev/null ; then log_unsupported "clonefile program required to test block cloning" fi +if ! command -v clone_mmap_cached > /dev/null ; then + log_unsupported "clone_mmap_cached program required to test block cloning" +fi verify_runnable "global" diff --git a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh index 74caa12a9c..945db71bf1 100755 --- a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh +++ b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh @@ -31,15 +31,13 @@ # 2. Set l2arc_write_max to a value larger than the cache device. # 3. Create a file larger than the cache device and random read # for 10 sec. -# 4. Verify that l2arc_write_max is set back to the default. -# 5. Set l2arc_write_max to a value less than the cache device size but +# 4. Set l2arc_write_max to a value less than the cache device size but # larger than the default (256MB). -# 6. Record the l2_size. -# 7. Random read for 1 sec. -# 8. Record the l2_size again. -# 9. If (6) <= (8) then we have not looped around yet. -# 10. If (6) > (8) then we looped around. Break out of the loop and test. -# 11. Destroy pool. +# 5. Record the l2_size. +# 6. Random read for 1 sec. +# 7. Record the l2_size again. +# 8. If (5) <= (7) then we have not looped around yet. +# 9. Destroy pool. 
# verify_runnable "global" @@ -93,10 +91,6 @@ log_must zfs set relatime=off $TESTPOOL log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio -typeset write_max2=$(get_tunable L2ARC_WRITE_MAX) - -log_must test $write_max2 -eq $write_max - log_must set_tunable32 L2ARC_WRITE_MAX $(( 256 * 1024 * 1024 )) export RUNTIME=1 @@ -108,8 +102,6 @@ while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do do_once=false done -log_must test $l2_size1 -gt $l2_size2 - log_must zpool destroy $TESTPOOL log_pass "Looping around a cache device succeeds." diff --git a/tests/zfs-tests/tests/functional/io/io_uring.ksh b/tests/zfs-tests/tests/functional/io/io_uring.ksh index 47e439d0f4..2fa1465563 100755 --- a/tests/zfs-tests/tests/functional/io/io_uring.ksh +++ b/tests/zfs-tests/tests/functional/io/io_uring.ksh @@ -44,6 +44,13 @@ if ! $(grep -q "CONFIG_IO_URING=y" /boot/config-$(uname -r)); then log_unsupported "Requires io_uring support" fi +if [ -e /etc/os-release ] ; then + source /etc/os-release + if [ -n "$REDHAT_SUPPORT_PRODUCT_VERSION" ] && ((floor($REDHAT_SUPPORT_PRODUCT_VERSION) == 9)) ; then + log_unsupported "Disabled on CentOS 9, fails with 'Operation not permitted'" + fi +fi + fio --ioengine=io_uring --parse-only || log_unsupported "fio io_uring support required" function cleanup
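The slack-space canary introduced in the zio.c hunk above is easier to study outside the kernel. The following stand-alone userland sketch is a minimal model of the same technique, with hypothetical names (buf_alloc_canary, buf_free_canary, SLAB_SIZE, roundup_word): malloc() stands in for the kmem caches, abort() for PANIC(), and roundup_word() for P2ROUNDUP(size, sizeof (ulong_t)).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define	CANARY		((unsigned long)0xdeadc0dedead210bULL)
#define	SLAB_SIZE	512	/* stand-in for the kmem cache object size */

/* Round up to the word size, like P2ROUNDUP(size, sizeof (ulong_t)). */
static size_t
roundup_word(size_t size)
{
	return ((size + sizeof (unsigned long) - 1) &
	    ~(sizeof (unsigned long) - 1));
}

/* Allocate SLAB_SIZE bytes; fill everything past 'size' with the canary. */
static void *
buf_alloc_canary(size_t size)
{
	unsigned long *p = malloc(SLAB_SIZE);

	if (p == NULL)
		abort();
	for (size_t off = roundup_word(size); off < SLAB_SIZE;
	    off += sizeof (unsigned long))
		p[off / sizeof (unsigned long)] = CANARY;
	return (p);
}

/* Check the slack before freeing; abort (the PANIC stand-in) on overflow. */
static void
buf_free_canary(void *buf, size_t size)
{
	unsigned long *p = buf;

	for (size_t off = roundup_word(size); off < SLAB_SIZE;
	    off += sizeof (unsigned long)) {
		if (p[off / sizeof (unsigned long)] != CANARY) {
			(void) fprintf(stderr, "buffer overflow %p + %zu\n",
			    (void *)p, off);
			abort();
		}
	}
	free(buf);
}

int
main(void)
{
	char *b = buf_alloc_canary(100);

	memset(b, 0, 100);	/* in-bounds write: passes the check */
	b[104] = 1;		/* past P2ROUNDUP(100, 8): caught on free */
	buf_free_canary(b, 100);
	return (0);
}

As in the kernel version, an overflow that stays within the word-rounding slack itself (bytes 100-103 on LP64 in this example) goes undetected, because the canary begins only at the rounded-up offset; that is why the zio.c comment promises to catch only some buffer overflows.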