From c418edf1d399ee66c63017fa6407e63e78f41de1 Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski <0mp@FreeBSD.org> Date: Mon, 7 Aug 2023 22:53:59 +0200 Subject: [PATCH 01/20] Fix some typos Reviewed-by: Brian Behlendorf Signed-off-by: Mateusz Piotrowski <0mp@FreeBSD.org> Closes #15141 --- include/sys/metaslab_impl.h | 6 +++--- module/zfs/metaslab.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 5beb1b7377..d328068890 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -313,7 +313,7 @@ struct metaslab_group { * Each metaslab maintains a set of in-core trees to track metaslab * operations. The in-core free tree (ms_allocatable) contains the list of * free segments which are eligible for allocation. As blocks are - * allocated, the allocated segment are removed from the ms_allocatable and + * allocated, the allocated segments are removed from the ms_allocatable and * added to a per txg allocation tree (ms_allocating). As blocks are * freed, they are added to the free tree (ms_freeing). These trees * allow us to process all allocations and frees in syncing context @@ -366,9 +366,9 @@ struct metaslab_group { struct metaslab { /* * This is the main lock of the metaslab and its purpose is to - * coordinate our allocations and frees [e.g metaslab_block_alloc(), + * coordinate our allocations and frees [e.g., metaslab_block_alloc(), * metaslab_free_concrete(), ..etc] with our various syncing - * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc]. + * procedures [e.g., metaslab_sync(), metaslab_sync_done(), ..etc]. * * The lock is also used during some miscellaneous operations like * using the metaslab's histogram for the metaslab group's histogram diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 8393e8dd91..20dc934593 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1292,7 +1292,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, /* * If this metaslab group is below its qmax or it's - * the only allocatable metasable group, then attempt + * the only allocatable metaslab group, then attempt * to allocate from it. */ if (qdepth < qmax || mc->mc_alloc_groups == 1) From 1e488eec606ca0ab75543e0ed2712f0fdff1ea00 Mon Sep 17 00:00:00 2001 From: Ryan Lahfa Date: Mon, 7 Aug 2023 22:55:59 +0200 Subject: [PATCH 02/20] linux/spl/kmem_cache: undefine `kmem_cache_alloc` before defining it When compiling a kernel with bcachefs and zfs, the two macros will collide, making it impossible to have both filesystems. It is sufficient to just undefine the macro before calling it. On why this should be in ZFS rather than bcachefs, currently, bcachefs is not a in-tree filesystem, but, it has a reasonably high chance of getting included soon. This avoids the breakage in ZFS early, this patch may be distributed downstream in NixOS and is already used there. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Ryan Lahfa Closes #15144 --- include/os/linux/spl/sys/kmem_cache.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/os/linux/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h index cc9cafa84f..20eeadc46e 100644 --- a/include/os/linux/spl/sys/kmem_cache.h +++ b/include/os/linux/spl/sys/kmem_cache.h @@ -198,6 +198,14 @@ extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) #define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) #define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) +/* + * This is necessary to be compatible with other kernel modules + * or in-tree filesystem that may define kmem_cache_alloc, + * like bcachefs does it now. + */ +#ifdef kmem_cache_alloc +#undef kmem_cache_alloc +#endif #define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) #define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) #define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc) From 6bdc7259d1a3d8b1a48dfbd26e5d98f172e09bd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Tue, 8 Aug 2023 18:35:35 +0200 Subject: [PATCH 03/20] libzfs: sendrecv: send_progress_thread: handle SIGINFO/SIGUSR1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit POSIX timers target the process, not the thread (as does SIGINFO), so we need to block it in the main thread which will die if interrupted. Ref: https://101010.pl/@ed1conf@bsd.network/110731819189629373 Reviewed-by: Brian Behlendorf Reviewed-by: Jorgen Lundman Signed-off-by: Ahelenia Ziemiańska Closes #15113 --- lib/libzfs/Makefile.am | 2 +- lib/libzfs/libzfs_sendrecv.c | 95 +++++++++++++++++++++++++++++------- man/man8/zfs-send.8 | 18 ++++++- 3 files changed, 96 insertions(+), 19 deletions(-) diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index cffe341220..5e74d908de 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -57,7 +57,7 @@ libzfs_la_LIBADD = \ libzutil.la \ libuutil.la -libzfs_la_LIBADD += -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL) +libzfs_la_LIBADD += -lrt -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL) libzfs_la_LDFLAGS = -pthread diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 87a30f54fe..e9bc78aa8d 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -928,6 +928,39 @@ zfs_send_progress(zfs_handle_t *zhp, int fd, uint64_t *bytes_written, return (0); } +static volatile boolean_t send_progress_thread_signal_duetotimer; +static void +send_progress_thread_act(int sig, siginfo_t *info, void *ucontext) +{ + (void) sig, (void) ucontext; + send_progress_thread_signal_duetotimer = info->si_code == SI_TIMER; +} + +struct timer_desirability { + timer_t timer; + boolean_t desired; +}; +static void +timer_delete_cleanup(void *timer) +{ + struct timer_desirability *td = timer; + if (td->desired) + timer_delete(td->timer); +} + +#ifdef SIGINFO +#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO sigaddset(&new, SIGINFO) +#else +#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO +#endif +#define SEND_PROGRESS_THREAD_PARENT_BLOCK(old) { \ + sigset_t new; \ + sigemptyset(&new); \ + sigaddset(&new, SIGUSR1); \ + SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO; \ + pthread_sigmask(SIG_BLOCK, &new, old); \ +} + static void * send_progress_thread(void *arg) { @@ -941,6 +974,26 @@ send_progress_thread(void *arg) struct tm tm; int err; + const struct sigaction signal_action = + {.sa_sigaction = send_progress_thread_act, .sa_flags = SA_SIGINFO}; + struct sigevent timer_cfg = + {.sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGUSR1}; + const struct itimerspec timer_time = + {.it_value = {.tv_sec = 1}, .it_interval = {.tv_sec = 1}}; + struct timer_desirability timer = {}; + + sigaction(SIGUSR1, &signal_action, NULL); +#ifdef SIGINFO + sigaction(SIGINFO, &signal_action, NULL); +#endif + + if ((timer.desired = pa->pa_progress || pa->pa_astitle)) { + if (timer_create(CLOCK_MONOTONIC, &timer_cfg, &timer.timer)) + return ((void *)(uintptr_t)errno); + (void) timer_settime(timer.timer, 0, &timer_time, NULL); + } + pthread_cleanup_push(timer_delete_cleanup, &timer); + if (!pa->pa_parsable && pa->pa_progress) { (void) fprintf(stderr, "TIME %s %sSNAPSHOT %s\n", @@ -953,12 +1006,12 @@ send_progress_thread(void *arg) * Print the progress from ZFS_IOC_SEND_PROGRESS every second. */ for (;;) { - (void) sleep(1); + pause(); if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes, &blocks)) != 0) { if (err == EINTR || err == ENOENT) - return ((void *)0); - return ((void *)(uintptr_t)err); + err = 0; + pthread_exit(((void *)(uintptr_t)err)); } (void) time(&t); @@ -991,21 +1044,25 @@ send_progress_thread(void *arg) (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, (u_longlong_t)bytes, zhp->zfs_name); - } else if (pa->pa_progress) { + } else if (pa->pa_progress || + !send_progress_thread_signal_duetotimer) { zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, buf, zhp->zfs_name); } } + pthread_cleanup_pop(B_TRUE); } static boolean_t -send_progress_thread_exit(libzfs_handle_t *hdl, pthread_t ptid) +send_progress_thread_exit( + libzfs_handle_t *hdl, pthread_t ptid, sigset_t *oldmask) { void *status = NULL; (void) pthread_cancel(ptid); (void) pthread_join(ptid, &status); + pthread_sigmask(SIG_SETMASK, oldmask, NULL); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) return (zfs_standard_error(hdl, error, @@ -1199,7 +1256,8 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ - if (sdd->progress || sdd->progressastitle) { + sigset_t oldmask; + { pa.pa_zhp = zhp; pa.pa_fd = sdd->outfd; pa.pa_parsable = sdd->parsable; @@ -1214,13 +1272,13 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) zfs_close(zhp); return (err); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, fromorigin, sdd->outfd, flags, sdd->debugnv); - if ((sdd->progress || sdd->progressastitle) && - send_progress_thread_exit(zhp->zfs_hdl, tid)) + if (send_progress_thread_exit(zhp->zfs_hdl, tid, &oldmask)) return (-1); } @@ -1562,8 +1620,9 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, progress_arg_t pa = { 0 }; int err = 0; pthread_t ptid; + sigset_t oldmask; - if (flags->progress || flags->progressastitle) { + { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; @@ -1577,6 +1636,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = lzc_send_space_resume_redacted(zhp->zfs_name, from, @@ -1584,8 +1644,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, redactbook, fd, &size); *sizep = size; - if ((flags->progress || flags->progressastitle) && - send_progress_thread_exit(zhp->zfs_hdl, ptid)) + if (send_progress_thread_exit(zhp->zfs_hdl, ptid, &oldmask)) return (-1); if (!flags->progress && !flags->parsable) @@ -1876,11 +1935,12 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, if (!flags->dryrun) { progress_arg_t pa = { 0 }; pthread_t tid; + sigset_t oldmask; /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ - if (flags->progress || flags->progressastitle) { + { pa.pa_zhp = zhp; pa.pa_fd = outfd; pa.pa_parsable = flags->parsable; @@ -1898,6 +1958,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, zfs_close(zhp); return (error); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd, @@ -1905,8 +1966,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, if (redact_book != NULL) free(redact_book); - if ((flags->progressastitle || flags->progress) && - send_progress_thread_exit(hdl, tid)) { + if (send_progress_thread_exit(hdl, tid, &oldmask)) { zfs_close(zhp); return (-1); } @@ -2691,7 +2751,8 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, * If progress reporting is requested, spawn a new thread to poll * ZFS_IOC_SEND_PROGRESS at a regular interval. */ - if (flags->progress || flags->progressastitle) { + sigset_t oldmask; + { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; @@ -2708,13 +2769,13 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = lzc_send_redacted(name, from, fd, lzc_flags_from_sendflags(flags), redactbook); - if ((flags->progress || flags->progressastitle) && - send_progress_thread_exit(hdl, ptid)) + if (send_progress_thread_exit(hdl, ptid, &oldmask)) return (-1); if (err == 0 && (flags->props || flags->holds || flags->backup)) { diff --git a/man/man8/zfs-send.8 b/man/man8/zfs-send.8 index 8cc6ae6ad5..ba604bf778 100644 --- a/man/man8/zfs-send.8 +++ b/man/man8/zfs-send.8 @@ -29,7 +29,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd January 12, 2023 +.Dd July 27, 2023 .Dt ZFS-SEND 8 .Os . @@ -297,6 +297,12 @@ This flag can only be used in conjunction with .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. +The same report can be requested by sending +.Dv SIGINFO +or +.Dv SIGUSR1 , +regardless of +.Fl v . .Pp The format of the stream is committed. You will be able to receive your streams on future versions of ZFS. @@ -433,6 +439,12 @@ and the verbose output goes to standard error .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. +The same report can be requested by sending +.Dv SIGINFO +or +.Dv SIGUSR1 , +regardless of +.Fl v . .El .It Xo .Nm zfs @@ -669,6 +681,10 @@ ones on the source, and are ready to be used, while the parent snapshot on the target contains none of the username and password data present on the source, because it was removed by the redacted send operation. . +.Sh SIGNALS +See +.Fl v . +. .Sh EXAMPLES .\" These are, respectively, examples 12, 13 from zfs.8 .\" Make sure to update them bidirectionally From 895cb689d310e49cc21040f565d63dd34cc60ab3 Mon Sep 17 00:00:00 2001 From: oromenahar Date: Tue, 8 Aug 2023 18:37:06 +0200 Subject: [PATCH 04/20] zfs_clone_range should return a descriptive error codes Return the more descriptive error codes instead of `EXDEV` when the parameters don't match the requirements of the clone function. Updated the comments in `brt.c` accordingly. The first three errors are just invalid parameters, which zfs can not handle. The fourth error indicates that the block which should be cloned is created and cloned or modified in the same transaction group (`txg`). Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Kay Pedersen Closes #15148 --- module/os/freebsd/zfs/zfs_vnops_os.c | 2 +- module/os/linux/zfs/zpl_file_range.c | 4 ++-- module/zfs/brt.c | 6 +++--- module/zfs/zfs_vnops.c | 13 +++++++------ 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 45cf6fdfc4..513accf18a 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6290,7 +6290,7 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp), ap->a_outoffp, &len, ap->a_outcred); - if (error == EXDEV || error == EOPNOTSUPP) + if (error == EXDEV || error == EINVAL || error == EOPNOTSUPP) goto bad_locked_fallback; *ap->a_lenp = (size_t)len; out_locked: diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 72384b638b..43ba9a4982 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -103,7 +103,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * Since Linux 5.3 the filesystem driver is responsible for executing * an appropriate fallback, and a generic fallback function is provided. */ - if (ret == -EOPNOTSUPP || ret == -EXDEV) + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV) ret = generic_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); #else @@ -111,7 +111,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal * to the kernel that it should fallback to a content copy. */ - if (ret == -EXDEV) + if (ret == -EINVAL || ret == -EXDEV) ret = -EOPNOTSUPP; #endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ diff --git a/module/zfs/brt.c b/module/zfs/brt.c index e8218fb268..ddd8eefe60 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -174,7 +174,7 @@ * size_t len, unsigned int flags); * * Even though offsets and length represent bytes, they have to be - * block-aligned or we will return the EXDEV error so the upper layer can + * block-aligned or we will return an error so the upper layer can * fallback to the generic mechanism that will just copy the data. * Using copy_file_range(2) will call OS-independent zfs_clone_range() function. * This function was implemented based on zfs_write(), but instead of writing @@ -192,9 +192,9 @@ * Some special cases to consider and how we address them: * - The block we want to clone may have been created within the same * transaction group that we are trying to clone. Such block has no BP - * allocated yet, so cannot be immediately cloned. We return EXDEV. + * allocated yet, so cannot be immediately cloned. We return EAGAIN. * - The block we want to clone may have been modified within the same - * transaction group. We return EXDEV. + * transaction group. We return EAGAIN. * - A block may be cloned multiple times during one transaction group (that's * why pending list is actually a tree and not an append-only list - this * way we can figure out faster if this block is cloned for the first time diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 8ffcf37134..a3d54c6e67 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1028,6 +1028,10 @@ zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) * * On success, the function return the number of bytes copied in *lenp. * Note, it doesn't return how much bytes are left to be copied. + * On errors which are caused by any file system limitations or + * brt limitations `EINVAL` is returned. In the most cases a user + * requested bad parameters, it could be possible to clone the file but + * some parameters don't match the requirements. */ int zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, @@ -1171,7 +1175,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, * We cannot clone into files with different block size. */ if (inblksz != outzp->z_blksz && outzp->z_size > inblksz) { - error = SET_ERROR(EXDEV); + error = SET_ERROR(EINVAL); goto unlock; } @@ -1179,7 +1183,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, * Offsets and len must be at block boundries. */ if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) { - error = SET_ERROR(EXDEV); + error = SET_ERROR(EINVAL); goto unlock; } /* @@ -1187,7 +1191,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, */ if ((len % inblksz) != 0 && (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) { - error = SET_ERROR(EXDEV); + error = SET_ERROR(EINVAL); goto unlock; } @@ -1246,9 +1250,6 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, * in the current transaction group. Return an error, * so the caller can fallback to just copying the data. */ - if (error == EAGAIN) { - error = SET_ERROR(EXDEV); - } break; } /* From 95649854ba7c2273d2dd71ac3a9c50ffb94908f7 Mon Sep 17 00:00:00 2001 From: Rafael Kitover Date: Tue, 8 Aug 2023 16:38:34 +0000 Subject: [PATCH 05/20] dracut: support mountpoint=legacy for root dataset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support mountpoint=legacy for the root dataset in the dracut zfs support scripts. mountpoint=/ or mountpoint=/sysroot also works. Change zfs-env-bootfs.service to add zfsutil to BOOTFSFLAGS only for root datasets with mountpoint != legacy. Reviewed-by: Brian Behlendorf Reviewed-by: Ahelenia Ziemiańska Signed-off-by: Rafael Kitover Closes #15149 --- contrib/dracut/90zfs/zfs-env-bootfs.service.in | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/contrib/dracut/90zfs/zfs-env-bootfs.service.in b/contrib/dracut/90zfs/zfs-env-bootfs.service.in index 7ebab4c1a5..fe362b930b 100644 --- a/contrib/dracut/90zfs/zfs-env-bootfs.service.in +++ b/contrib/dracut/90zfs/zfs-env-bootfs.service.in @@ -12,11 +12,12 @@ ExecStart=/bin/sh -c ' decode_root_args || exit 0; \ [ "$root" = "zfs:AUTO" ] && root="$(@sbindir@/zpool list -H -o bootfs | grep -m1 -vFx -)"; \ rootflags="$(getarg rootflags=)"; \ - case ",$rootflags," in \ - *,zfsutil,*) ;; \ - ,,) rootflags=zfsutil ;; \ - *) rootflags="zfsutil,$rootflags" ;; \ - esac; \ + [ "$(@sbindir@/zfs get -H -o value mountpoint "$root")" = legacy ] || \ + case ",$rootflags," in \ + *,zfsutil,*) ;; \ + ,,) rootflags=zfsutil ;; \ + *) rootflags="zfsutil,$rootflags" ;; \ + esac; \ exec systemctl set-environment BOOTFS="$root" BOOTFSFLAGS="$rootflags"' [Install] From 645a7e4d958df96f98f88a93cd38183369bf36a3 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Tue, 8 Aug 2023 21:40:36 +0500 Subject: [PATCH 06/20] Move zinject from openzfs-zfs-test to openzfs-zfsutils For Native Debian packaging, zinject binary and man page is packaged in ZFS test package. zinject is not not directly related to ZTS and should be packaged with other utilities, like it is present in zfs_.rpm/deb packages. This commit moves zinject binary and man page from openzfs-zfs-test to openzfs-zfsutils package. Reviewed-by: Brian Behlendorf Reviewed-by: Ameer Hamza Signed-off-by: Umer Saleem Closes #15160 --- contrib/debian/openzfs-zfs-test.install | 2 -- contrib/debian/openzfs-zfsutils.install | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/debian/openzfs-zfs-test.install b/contrib/debian/openzfs-zfs-test.install index cafcfdc0e1..b3afef50db 100644 --- a/contrib/debian/openzfs-zfs-test.install +++ b/contrib/debian/openzfs-zfs-test.install @@ -1,10 +1,8 @@ -sbin/zinject sbin/ztest usr/bin/raidz_test usr/share/man/man1/raidz_test.1 usr/share/man/man1/test-runner.1 usr/share/man/man1/ztest.1 -usr/share/man/man8/zinject.8 usr/share/zfs/common.sh usr/share/zfs/runfiles/ usr/share/zfs/test-runner diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index 49f0ec0d5d..0f58508f00 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -27,6 +27,7 @@ sbin/zfs sbin/zfs_ids_to_path sbin/zgenhostid sbin/zhack +sbin/zinject sbin/zpool sbin/zstream sbin/zstreamdump @@ -92,6 +93,7 @@ usr/share/man/man8/zfs_ids_to_path.8 usr/share/man/man7/zfsconcepts.7 usr/share/man/man7/zfsprops.7 usr/share/man/man8/zgenhostid.8 +usr/share/man/man8/zinject.8 usr/share/man/man8/zpool-add.8 usr/share/man/man8/zpool-attach.8 usr/share/man/man8/zpool-checkpoint.8 From 92f095a903f8599e76473cdc019e6da64d0b67c4 Mon Sep 17 00:00:00 2001 From: Rob N Date: Tue, 15 Aug 2023 10:34:14 +1000 Subject: [PATCH 07/20] copy_file_range: fix fallback when source create on same txg In 019dea0a5 we removed the conversion from EAGAIN->EXDEV inside zfs_clone_range(), but forgot to add a test for EAGAIN to the copy_file_range() entry points to trigger fallback to a content copy. This commit fixes that. Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Closes #15170 Closes #15172 --- module/os/freebsd/zfs/zfs_vnops_os.c | 3 +- module/os/linux/zfs/zpl_file_range.c | 5 +- module/zfs/zfs_vnops.c | 7 +- tests/runfiles/linux.run | 3 +- tests/test-runner/bin/zts-report.py.in | 2 + tests/zfs-tests/tests/Makefile.am | 3 +- ...loning_copyfilerange_fallback_same_txg.ksh | 66 +++++++++++++++++++ 7 files changed, 81 insertions(+), 8 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 513accf18a..c498a13282 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6290,7 +6290,8 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp), ap->a_outoffp, &len, ap->a_outcred); - if (error == EXDEV || error == EINVAL || error == EOPNOTSUPP) + if (error == EXDEV || error == EAGAIN || error == EINVAL || + error == EOPNOTSUPP) goto bad_locked_fallback; *ap->a_lenp = (size_t)len; out_locked: diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 43ba9a4982..2abbf44df5 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -103,7 +103,8 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * Since Linux 5.3 the filesystem driver is responsible for executing * an appropriate fallback, and a generic fallback function is provided. */ - if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV) + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV || + ret == -EAGAIN) ret = generic_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); #else @@ -111,7 +112,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal * to the kernel that it should fallback to a content copy. */ - if (ret == -EINVAL || ret == -EXDEV) + if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN) ret = -EOPNOTSUPP; #endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index a3d54c6e67..f8d13075d5 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1246,9 +1246,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, &nbps); if (error != 0) { /* - * If we are tyring to clone a block that was created - * in the current transaction group. Return an error, - * so the caller can fallback to just copying the data. + * If we are trying to clone a block that was created + * in the current transaction group, error will be + * EAGAIN here, which we can just return to the caller + * so it can fallback if it likes. */ break; } diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 4747b98373..2c8d5cb0ec 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -41,7 +41,8 @@ tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', 'block_cloning_ficlonerange_partial', 'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone', 'block_cloning_disabled_ficlonerange', - 'block_cloning_copyfilerange_cross_dataset'] + 'block_cloning_copyfilerange_cross_dataset', + 'block_cloning_copyfilerange_fallback_same_txg'] tags = ['functional', 'block_cloning'] [tests/functional/chattr:Linux] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 5c4b3a7bcd..e1bbe063ab 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -304,6 +304,8 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_cross_dataset': ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_copyfilerange_fallback_same_txg': + ['SKIP', cfr_cross_reason], }) diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 3b6b2ef734..66aff5026f 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -441,9 +441,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/block_cloning/cleanup.ksh \ functional/block_cloning/setup.ksh \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ + functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ + functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \ functional/block_cloning/block_cloning_copyfilerange.ksh \ functional/block_cloning/block_cloning_copyfilerange_partial.ksh \ - functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \ functional/block_cloning/block_cloning_disabled_ficlone.ksh \ functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh new file mode 100755 index 0000000000..3451f887af --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -0,0 +1,66 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# Copyright (c) 2023, Rob Norris +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="copy_file_range will fall back to copy when cloning on same txg" + +log_assert $claim + +typeset timeout=$(get_tunable TXG_TIMEOUT) + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 TXG_TIMEOUT $timeout +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must set_tunable64 TXG_TIMEOUT 5000 + +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 + +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "" ] + +log_pass $claim + From d19304ffeec50ebc02cf4496c14e8945c74fb76a Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 24 Aug 2023 11:59:03 -0700 Subject: [PATCH 08/20] zed: Add zedlet to power off slot when drive is faulted If ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is enabled in zed.rc, then power off the drive's slot in the enclosure if it becomes FAULTED. This can help silence misbehaving drives. This assumes your drive enclosure fully supports slot power control via sysfs. Reviewed-by: @AllKind Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #15200 --- cmd/zed/zed.d/statechange-slot_off.sh | 61 +++++++++++++++++++++++++++ cmd/zed/zed.d/zed.rc | 5 +++ 2 files changed, 66 insertions(+) create mode 100755 cmd/zed/zed.d/statechange-slot_off.sh diff --git a/cmd/zed/zed.d/statechange-slot_off.sh b/cmd/zed/zed.d/statechange-slot_off.sh new file mode 100755 index 0000000000..d6f3c94a41 --- /dev/null +++ b/cmd/zed/zed.d/statechange-slot_off.sh @@ -0,0 +1,61 @@ +#!/bin/sh +# +# Turn off disk's enclosure slot if it becomes FAULTED. +# +# Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos +# as they flip between FAULTED and ONLINE. If +# ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets +# FAULTED, then power down the slot via sysfs: +# +# /sys/class/enclosure///power_status +# +# We assume the user will be responsible for turning the slot back on again. +# +# Note that this script requires that your enclosure be supported by the +# Linux SCSI Enclosure services (SES) driver. The script will do nothing +# if you have no enclosure, or if your enclosure isn't supported. +# +# Exit codes: +# 0: slot successfully powered off +# 1: enclosure not available +# 2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled +# 3: vdev was not FAULTED +# 4: The enclosure sysfs path passed from ZFS does not exist +# 5: Enclosure slot didn't actually turn off after we told it to + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +if [ ! -d /sys/class/enclosure ] ; then + # No JBOD enclosure or NVMe slots + exit 1 +fi + +if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then + exit 2 +fi + +if [ "$ZEVENT_VDEV_STATE_STR" != "FAULTED" ] ; then + exit 3 +fi + +if [ ! -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] ; then + exit 4 +fi + +echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" + +# Wait for sysfs for report that the slot is off. It can take ~400ms on some +# enclosures. +for i in $(seq 1 20) ; do + if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then + break + fi + sleep 0.1 +done + +if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" != "off" ] ; then + exit 5 +fi + +zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH" diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc index c55a70c79f..78dc1afc7b 100644 --- a/cmd/zed/zed.d/zed.rc +++ b/cmd/zed/zed.d/zed.rc @@ -142,3 +142,8 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event" # Disabled by default, 1 to enable and 0 to disable. #ZED_SYSLOG_DISPLAY_GUIDS=1 +## +# Power off the drive's slot in the enclosure if it becomes FAULTED. This can +# help silence misbehaving drives. This assumes your drive enclosure fully +# supports slot power control via sysfs. +#ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1 From ab999406fedf81439a6e307cbb538d77fd3a5dd1 Mon Sep 17 00:00:00 2001 From: Serapheim Dimitropoulos Date: Fri, 25 Aug 2023 10:28:36 -0700 Subject: [PATCH 09/20] Update outdated assertion from zio_write_compress As part of some internal gang block testing within Delphix we hit the assertion removed by this patch. The assertion was triggered by a ZIO that had two copies and was a gang block making the following expression equal to 3: ``` MIN(zp->zp_copies + BP_IS_GANG(bp), spa_max_replication(spa)) ``` and failing when we expected the above to be equal to `BP_GET_NDVAS(bp)`. The assertion is no longer valid since the following commit: ``` commit 14872aaa4f909d72c6b5e4105dadcfa13c7d9d66 Author: Matthew Ahrens Date: Mon Feb 6 09:37:06 2023 -0800 EIO caused by encryption + recursive gang ``` The above commit changed gang block headers so they can't have more than 2 copies but the assertion in question from this PR was never updated. Reviewed-by: George Wilson Reviewed-by: Matthew Ahrens Signed-off-by: Serapheim Dimitropoulos Closes #15180 --- module/zfs/zio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 7458b416c2..3b3b40fa73 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1775,8 +1775,9 @@ zio_write_compress(zio_t *zio) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ - ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), - spa_max_replication(spa)) == BP_GET_NDVAS(bp)); + ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) || + MIN(zp->zp_copies, spa_max_replication(spa)) + == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ From 084ff4abd27c98a63db0c73d8018a08d7cdd2984 Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 26 Aug 2023 03:31:29 +1000 Subject: [PATCH 10/20] tests/block_cloning: rename and document get_same_blocks helper `get_same_blocks` is a helper to compare two files and return a list of the blocks that are clones of each other. Its very necessary for block cloning tests. Previously it was incorrectly called `unique_blocks`, which is the _inverse_ of what it does (an early version did list unique blocks; it was changed but the name was not). So if nothing else, it should be called `duplicate_blocks`. But, keeping the details of a clone operation in your head is actually quite difficult, without the additional overhead of wondering how the tools work. So I've renamed it to better describe what it does, added a usage note, and changed it to return block indexes from 0 instead of 1, to match how L0 blocks are normally counted. Reviewed-by: Umer Saleem Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Closes #15181 --- .../functional/block_cloning/block_cloning.kshlib | 14 +++++++++++--- .../block_cloning/block_cloning_copyfilerange.ksh | 4 ++-- .../block_cloning_copyfilerange_cross_dataset.ksh | 4 ++-- .../block_cloning_copyfilerange_fallback.ksh | 12 ++++++------ ...ock_cloning_copyfilerange_fallback_same_txg.ksh | 2 +- .../block_cloning_copyfilerange_partial.ksh | 6 +++--- .../block_cloning_disabled_copyfilerange.ksh | 2 +- .../block_cloning/block_cloning_ficlone.ksh | 4 ++-- .../block_cloning/block_cloning_ficlonerange.ksh | 4 ++-- .../block_cloning_ficlonerange_partial.ksh | 6 +++--- 10 files changed, 33 insertions(+), 25 deletions(-) diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib index 9998e5a87b..8e16366b4c 100644 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib @@ -34,13 +34,21 @@ function have_same_content log_must [ "$hash1" = "$hash2" ] } -function unique_blocks +# +# get_same_blocks dataset1 path/to/file1 dataset2 path/to/file2 +# +# Returns a space-separated list of the indexes (starting at 0) of the L0 +# blocks that are shared between both files (by first DVA and checksum). +# Assumes that the two files have the same content, use have_same_content to +# confirm that. +# +function get_same_blocks { typeset zdbout=${TMPDIR:-$TEST_BASE_DIR}/zdbout.$$ zdb -vvvvv $1 -O $2 | \ - awk '/ L0 / { print ++l " " $3 " " $7 }' > $zdbout.a + awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.a zdb -vvvvv $3 -O $4 | \ - awk '/ L0 / { print ++l " " $3 " " $7 }' > $zdbout.b + awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.b echo $(sort $zdbout.a $zdbout.b | uniq -d | cut -f1 -d' ') } diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh index 9adcbfcd88..43ea47b0ef 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh @@ -54,7 +54,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) -log_must [ "$blocks" = "1 2 3 4" ] +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "0 1 2 3" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh index 07e089e89c..74e6b04903 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh @@ -58,8 +58,8 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/$TESTFS1/file1 /$TESTPOOL/$TESTFS2/file2 -typeset blocks=$(unique_blocks \ +typeset blocks=$(get_same_blocks \ $TESTPOOL/$TESTFS1 file1 $TESTPOOL/$TESTFS2 file2) -log_must [ "$blocks" = "1 2 3 4" ] +log_must [ "$blocks" = "0 1 2 3" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh index 87f99eb5c0..9a96eacd60 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh @@ -58,8 +58,8 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone -typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) -log_must [ "$blocks" = "1 2 3 4" ] +typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "0 1 2 3" ] log_note "Copying within a block with copy_file_range" @@ -69,8 +69,8 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone -typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) -log_must [ "$blocks" = "2 3 4" ] +typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "1 2 3" ] log_note "Copying across a block with copy_file_range" @@ -80,7 +80,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone -typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) -log_must [ "$blocks" = "2" ] +typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "1" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh index 3451f887af..a10545bc07 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -59,7 +59,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone -typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) log_must [ "$blocks" = "" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh index ecac62b203..a5da0a0bd3 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh @@ -54,7 +54,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) log_must [ "$blocks" = "" ] log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 131072 131072 262144 @@ -62,7 +62,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) -log_must [ "$blocks" = "2 3" ] +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "1 2" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh index 30b155a140..d21b625113 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh @@ -54,7 +54,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) log_must [ "$blocks" = "" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlone.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlone.ksh index d13a392298..3f227fb68e 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlone.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlone.ksh @@ -50,7 +50,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) -log_must [ "$blocks" = "1 2 3 4" ] +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "0 1 2 3" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange.ksh index 6556050c43..cefc4336ae 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange.ksh @@ -50,7 +50,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) -log_must [ "$blocks" = "1 2 3 4" ] +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "0 1 2 3" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange_partial.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange_partial.ksh index 37a3511a26..067f55aaa6 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange_partial.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_ficlonerange_partial.ksh @@ -50,7 +50,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) log_must [ "$blocks" = "" ] log_must clonefile -r /$TESTPOOL/file1 /$TESTPOOL/file2 131072 131072 262144 @@ -58,7 +58,7 @@ log_must sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 -typeset blocks=$(unique_blocks $TESTPOOL file1 $TESTPOOL file2) -log_must [ "$blocks" = "2 3" ] +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) +log_must [ "$blocks" = "1 2" ] log_pass $claim From 1b696429c118eacd78c255c12dcffddfad439a89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Sat, 26 Aug 2023 01:13:43 +0200 Subject: [PATCH 11/20] Make zoned/jailed zfsprops(7) make more sense. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Distribute zfs-[un]jail.8 on FreeBSD and zfs-[un]zone.8 on Linux - zfsprops.7: mirror zoned/jailed, only available on respective platforms Reviewed-by: Brian Behlendorf Signed-off-by: Ahelenia Ziemiańska Closes #15161 --- contrib/debian/openzfs-zfsutils.install | 2 -- man/Makefile.am | 16 ++++++++++++---- man/man7/zfsprops.7 | 16 +++++++++------- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index 0f58508f00..301d8f67b3 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -60,7 +60,6 @@ usr/share/man/man8/zfs-get.8 usr/share/man/man8/zfs-groupspace.8 usr/share/man/man8/zfs-hold.8 usr/share/man/man8/zfs-inherit.8 -usr/share/man/man8/zfs-jail.8 usr/share/man/man8/zfs-list.8 usr/share/man/man8/zfs-load-key.8 usr/share/man/man8/zfs-mount-generator.8 @@ -80,7 +79,6 @@ usr/share/man/man8/zfs-set.8 usr/share/man/man8/zfs-share.8 usr/share/man/man8/zfs-snapshot.8 usr/share/man/man8/zfs-unallow.8 -usr/share/man/man8/zfs-unjail.8 usr/share/man/man8/zfs-unload-key.8 usr/share/man/man8/zfs-unmount.8 usr/share/man/man8/zfs-unzone.8 diff --git a/man/Makefile.am b/man/Makefile.am index 2973520324..36c1aede10 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -38,7 +38,6 @@ dist_man_MANS = \ %D%/man8/zfs-groupspace.8 \ %D%/man8/zfs-hold.8 \ %D%/man8/zfs-inherit.8 \ - %D%/man8/zfs-jail.8 \ %D%/man8/zfs-list.8 \ %D%/man8/zfs-load-key.8 \ %D%/man8/zfs-mount.8 \ @@ -57,14 +56,11 @@ dist_man_MANS = \ %D%/man8/zfs-share.8 \ %D%/man8/zfs-snapshot.8 \ %D%/man8/zfs-unallow.8 \ - %D%/man8/zfs-unjail.8 \ %D%/man8/zfs-unload-key.8 \ %D%/man8/zfs-unmount.8 \ - %D%/man8/zfs-unzone.8 \ %D%/man8/zfs-upgrade.8 \ %D%/man8/zfs-userspace.8 \ %D%/man8/zfs-wait.8 \ - %D%/man8/zfs-zone.8 \ %D%/man8/zfs_ids_to_path.8 \ %D%/man8/zgenhostid.8 \ %D%/man8/zinject.8 \ @@ -104,6 +100,18 @@ dist_man_MANS = \ %D%/man8/zstreamdump.8 \ %D%/man8/zpool_influxdb.8 +if BUILD_FREEBSD +dist_man_MANS += \ + %D%/man8/zfs-jail.8 \ + %D%/man8/zfs-unjail.8 +endif + +if BUILD_LINUX +dist_man_MANS += \ + %D%/man8/zfs-unzone.8 \ + %D%/man8/zfs-zone.8 +endif + nodist_man_MANS = \ %D%/man8/zed.8 \ %D%/man8/zfs-mount-generator.8 diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7 index 8f6b919cfc..51ddd85eb7 100644 --- a/man/man7/zfsprops.7 +++ b/man/man7/zfsprops.7 @@ -38,7 +38,7 @@ .\" Copyright (c) 2019, Kjeld Schouten-Lebbing .\" Copyright (c) 2022 Hewlett Packard Enterprise Development LP. .\" -.Dd April 18, 2023 +.Dd August 8, 2023 .Dt ZFSPROPS 7 .Os . @@ -1916,13 +1916,15 @@ See for more information. Jails are a .Fx -feature and are not relevant on other platforms. -The default value is -.Sy off . -.It Sy zoned Ns = Ns Sy on Ns | Ns Sy off +feature and this property is not available on other platforms. +.It Sy zoned Ns = Ns Sy off Ns | Ns Sy on Controls whether the dataset is managed from a non-global zone or namespace. -The default value is -.Sy off . +See +.Xr zfs-zone 8 +for more information. +Zoning is a +Linux +feature and this property is not available on other platforms. .El .Pp The following three properties cannot be changed after the file system is From e99e684b337b44d5507ab51b02cf50a11b064162 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Sat, 26 Aug 2023 11:22:28 -0700 Subject: [PATCH 12/20] zed: update zed.d/statechange-slot_off.sh The statechange-slot_off.sh zedlet which was added in #15200 needed to be installed so it's included by the packages. Additional testing has also shown that multiple retries are often needed for the script to operate reliably. Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #15210 --- cmd/zed/zed.d/Makefile.am | 2 ++ cmd/zed/zed.d/statechange-slot_off.sh | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cmd/zed/zed.d/Makefile.am b/cmd/zed/zed.d/Makefile.am index c65b43fb02..812558cf6d 100644 --- a/cmd/zed/zed.d/Makefile.am +++ b/cmd/zed/zed.d/Makefile.am @@ -16,6 +16,7 @@ dist_zedexec_SCRIPTS = \ %D%/scrub_finish-notify.sh \ %D%/statechange-led.sh \ %D%/statechange-notify.sh \ + %D%/statechange-slot_off.sh \ %D%/trim_finish-notify.sh \ %D%/vdev_attach-led.sh \ %D%/vdev_clear-led.sh @@ -35,6 +36,7 @@ zedconfdefaults = \ scrub_finish-notify.sh \ statechange-led.sh \ statechange-notify.sh \ + statechange-slot_off.sh \ vdev_attach-led.sh \ vdev_clear-led.sh diff --git a/cmd/zed/zed.d/statechange-slot_off.sh b/cmd/zed/zed.d/statechange-slot_off.sh index d6f3c94a41..9d218ddaa6 100755 --- a/cmd/zed/zed.d/statechange-slot_off.sh +++ b/cmd/zed/zed.d/statechange-slot_off.sh @@ -43,15 +43,17 @@ if [ ! -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] ; then exit 4 fi -echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" - -# Wait for sysfs for report that the slot is off. It can take ~400ms on some -# enclosures. +# Turn off the slot and wait for sysfs to report that the slot is off. +# It can take ~400ms on some enclosures and multiple retries may be needed. for i in $(seq 1 20) ; do - if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then - break - fi - sleep 0.1 + echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" + + for j in $(seq 1 5) ; do + if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then + break 2 + fi + sleep 0.1 + done done if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" != "off" ] ; then From c65aaa83876f9764f9017db278cefa453e1a3f5b Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Sat, 26 Aug 2023 14:25:46 -0400 Subject: [PATCH 13/20] Avoid save/restoring AMX registers to avoid a SPR erratum Intel SPR erratum SPR4 says that if you trip into a vmexit while doing FPU save/restore, your AMX register state might misbehave... and by misbehave, I mean save all zeroes incorrectly, leading to explosions if you restore it. Since we're not using AMX for anything, the simple way to avoid this is to just not save/restore those when we do anything, since we're killing preemption of any sort across our save/restores. If we ever decide to use AMX, it's not clear that we have any way to mitigate this, on Linux...but I am not an expert. Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #14989 Closes #15168 --- include/os/linux/kernel/linux/simd_x86.h | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/include/os/linux/kernel/linux/simd_x86.h b/include/os/linux/kernel/linux/simd_x86.h index 1d77f0487a..699b8a5718 100644 --- a/include/os/linux/kernel/linux/simd_x86.h +++ b/include/os/linux/kernel/linux/simd_x86.h @@ -147,6 +147,15 @@ #error "Toolchain needs to support the XSAVE assembler instruction" #endif +#ifndef XFEATURE_MASK_XTILE +/* + * For kernels where this doesn't exist yet, we still don't want to break + * by save/restoring this broken nonsense. + * See issue #14989 or Intel errata SPR4 for why + */ +#define XFEATURE_MASK_XTILE 0x60000 +#endif + #include #include @@ -315,18 +324,18 @@ kfpu_begin(void) uint8_t *state = zfs_kfpu_fpregs[smp_processor_id()]; #if defined(HAVE_XSAVES) if (static_cpu_has(X86_FEATURE_XSAVES)) { - kfpu_do_xsave("xsaves", state, ~0); + kfpu_do_xsave("xsaves", state, ~XFEATURE_MASK_XTILE); return; } #endif #if defined(HAVE_XSAVEOPT) if (static_cpu_has(X86_FEATURE_XSAVEOPT)) { - kfpu_do_xsave("xsaveopt", state, ~0); + kfpu_do_xsave("xsaveopt", state, ~XFEATURE_MASK_XTILE); return; } #endif if (static_cpu_has(X86_FEATURE_XSAVE)) { - kfpu_do_xsave("xsave", state, ~0); + kfpu_do_xsave("xsave", state, ~XFEATURE_MASK_XTILE); } else if (static_cpu_has(X86_FEATURE_FXSR)) { kfpu_save_fxsr(state); } else { @@ -376,12 +385,12 @@ kfpu_end(void) uint8_t *state = zfs_kfpu_fpregs[smp_processor_id()]; #if defined(HAVE_XSAVES) if (static_cpu_has(X86_FEATURE_XSAVES)) { - kfpu_do_xrstor("xrstors", state, ~0); + kfpu_do_xrstor("xrstors", state, ~XFEATURE_MASK_XTILE); goto out; } #endif if (static_cpu_has(X86_FEATURE_XSAVE)) { - kfpu_do_xrstor("xrstor", state, ~0); + kfpu_do_xrstor("xrstor", state, ~XFEATURE_MASK_XTILE); } else if (static_cpu_has(X86_FEATURE_FXSR)) { kfpu_restore_fxsr(state); } else { From 7eabb0af374ed8650e707278e7e1708b2d911da2 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Sat, 26 Aug 2023 11:30:19 -0700 Subject: [PATCH 14/20] Try to clarify wording to reduce zpool add incidents Try to clarify wording to reduce zpool add incidents. Add an attach example. Reviewed-by: Rich Ercolani Reviewed-by: George Melikov Reviewed-by: Alexander Motin Signed-off-by: Paul Dagnelie Closes #15179 --- man/man8/zpool.8 | 58 +++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index e8eadffa6f..4e45890f1e 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -110,9 +110,10 @@ Removes ZFS label information from the specified .It Xo .Xr zpool-attach 8 Ns / Ns Xr zpool-detach 8 .Xc -Increases or decreases redundancy by -.Cm attach Ns ing or -.Cm detach Ns ing a device on an existing vdev (virtual device). +Converts a non-redundant disk into a mirror, or increases the redundancy level of an existing mirror +.Ns ( +.Cm attach Ns ), or performs the inverse operation ( +.Cm detach Ns ). .It Xo .Xr zpool-add 8 Ns / Ns Xr zpool-remove 8 .Xc @@ -233,16 +234,16 @@ Invalid command line options were specified. .El . .Sh EXAMPLES -.\" Examples 1, 2, 3, 4, 11, 12 are shared with zpool-create.8. -.\" Examples 5, 13 are shared with zpool-add.8. -.\" Examples 6, 15 are shared with zpool-list.8. -.\" Examples 7 are shared with zpool-destroy.8. -.\" Examples 8 are shared with zpool-export.8. -.\" Examples 9 are shared with zpool-import.8. -.\" Examples 10 are shared with zpool-upgrade.8. -.\" Examples 14 are shared with zpool-remove.8. -.\" Examples 16 are shared with zpool-status.8. -.\" Examples 13, 16 are also shared with zpool-iostat.8. +.\" Examples 1, 2, 3, 4, 12, 13 are shared with zpool-create.8. +.\" Examples 6, 14 are shared with zpool-add.8. +.\" Examples 7, 16 are shared with zpool-list.8. +.\" Examples 8 are shared with zpool-destroy.8. +.\" Examples 9 are shared with zpool-export.8. +.\" Examples 10 are shared with zpool-import.8. +.\" Examples 11 are shared with zpool-upgrade.8. +.\" Examples 15 are shared with zpool-remove.8. +.\" Examples 17 are shared with zpool-status.8. +.\" Examples 14, 17 are also shared with zpool-iostat.8. .\" Make sure to update them omnidirectionally .Ss Example 1 : No Creating a RAID-Z Storage Pool The following command creates a pool with a single raidz root vdev that @@ -264,14 +265,21 @@ While not recommended, a pool based on files can be useful for experimental purposes. .Dl # Nm zpool Cm create Ar tank Pa /path/to/file/a /path/to/file/b . -.Ss Example 5 : No Adding a Mirror to a ZFS Storage Pool +.Ss Example 5 : No Making a non-mirrored ZFS Storage Pool mirrored. +The following command converts an existing single device +.Ar sda +into a mirror by attaching a second device to it, +.Ar sdb . +.Dl # Nm zpool Cm attach Ar tank Pa sda sdb +. +.Ss Example 6 : No Adding a Mirror to a ZFS Storage Pool The following command adds two mirrored disks to the pool .Ar tank , assuming the pool is already made up of two-way mirrors. The additional space is immediately available to any datasets within the pool. .Dl # Nm zpool Cm add Ar tank Sy mirror Pa sda sdb . -.Ss Example 6 : No Listing Available ZFS Storage Pools +.Ss Example 7 : No Listing Available ZFS Storage Pools The following command lists all available pools on the system. In this case, the pool .Ar zion @@ -285,19 +293,19 @@ tank 61.5G 20.0G 41.5G - 48% 32% 1.00x ONLINE - zion - - - - - - - FAULTED - .Ed . -.Ss Example 7 : No Destroying a ZFS Storage Pool +.Ss Example 8 : No Destroying a ZFS Storage Pool The following command destroys the pool .Ar tank and any datasets contained within: .Dl # Nm zpool Cm destroy Fl f Ar tank . -.Ss Example 8 : No Exporting a ZFS Storage Pool +.Ss Example 9 : No Exporting a ZFS Storage Pool The following command exports the devices in pool .Ar tank so that they can be relocated or later imported: .Dl # Nm zpool Cm export Ar tank . -.Ss Example 9 : No Importing a ZFS Storage Pool +.Ss Example 10 : No Importing a ZFS Storage Pool The following command displays available pools, and then imports the pool .Ar tank for use on the system. @@ -318,7 +326,7 @@ config: .No # Nm zpool Cm import Ar tank .Ed . -.Ss Example 10 : No Upgrading All ZFS Storage Pools to the Current Version +.Ss Example 11 : No Upgrading All ZFS Storage Pools to the Current Version The following command upgrades all ZFS Storage pools to the current version of the software: .Bd -literal -compact -offset Ds @@ -326,7 +334,7 @@ the software: This system is currently running ZFS version 2. .Ed . -.Ss Example 11 : No Managing Hot Spares +.Ss Example 12 : No Managing Hot Spares The following command creates a new pool with an available hot spare: .Dl # Nm zpool Cm create Ar tank Sy mirror Pa sda sdb Sy spare Pa sdc .Pp @@ -341,12 +349,12 @@ The hot spare can be permanently removed from the pool using the following command: .Dl # Nm zpool Cm remove Ar tank Pa sdc . -.Ss Example 12 : No Creating a ZFS Pool with Mirrored Separate Intent Logs +.Ss Example 13 : No Creating a ZFS Pool with Mirrored Separate Intent Logs The following command creates a ZFS storage pool consisting of two, two-way mirrors and mirrored log devices: .Dl # Nm zpool Cm create Ar pool Sy mirror Pa sda sdb Sy mirror Pa sdc sdd Sy log mirror Pa sde sdf . -.Ss Example 13 : No Adding Cache Devices to a ZFS Pool +.Ss Example 14 : No Adding Cache Devices to a ZFS Pool The following command adds two disks for use as cache devices to a ZFS storage pool: .Dl # Nm zpool Cm add Ar pool Sy cache Pa sdc sdd @@ -359,7 +367,7 @@ Capacity and reads can be monitored using the subcommand as follows: .Dl # Nm zpool Cm iostat Fl v Ar pool 5 . -.Ss Example 14 : No Removing a Mirrored top-level (Log or Data) Device +.Ss Example 15 : No Removing a Mirrored top-level (Log or Data) Device The following commands remove the mirrored log device .Sy mirror-2 and mirrored top-level data device @@ -394,7 +402,7 @@ The command to remove the mirrored data .Ar mirror-1 No is : .Dl # Nm zpool Cm remove Ar tank mirror-1 . -.Ss Example 15 : No Displaying expanded space on a device +.Ss Example 16 : No Displaying expanded space on a device The following command displays the detailed information for the pool .Ar data . This pool is comprised of a single raidz vdev where one of its devices @@ -411,7 +419,7 @@ data 23.9G 14.6G 9.30G - 48% 61% 1.00x ONLINE - sdc - - - - - .Ed . -.Ss Example 16 : No Adding output columns +.Ss Example 17 : No Adding output columns Additional columns can be added to the .Nm zpool Cm status No and Nm zpool Cm iostat No output with Fl c . .Bd -literal -compact -offset Ds From 63159e5bda1c6baac14c7f61da49ebc52907cc4e Mon Sep 17 00:00:00 2001 From: Serapheim Dimitropoulos Date: Tue, 29 Aug 2023 09:12:40 -0700 Subject: [PATCH 15/20] checkstyle: fix action failures Reviewed-by: Don Brady Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Serapheim Dimitropoulos Closes #15220 --- cmd/zed/zed.d/statechange-slot_off.sh | 1 + man/man8/zpool.8 | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cmd/zed/zed.d/statechange-slot_off.sh b/cmd/zed/zed.d/statechange-slot_off.sh index 9d218ddaa6..150012abe7 100755 --- a/cmd/zed/zed.d/statechange-slot_off.sh +++ b/cmd/zed/zed.d/statechange-slot_off.sh @@ -1,4 +1,5 @@ #!/bin/sh +# shellcheck disable=SC3014,SC2154,SC2086,SC2034 # # Turn off disk's enclosure slot if it becomes FAULTED. # diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 4e45890f1e..4c4020bdd8 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -110,9 +110,9 @@ Removes ZFS label information from the specified .It Xo .Xr zpool-attach 8 Ns / Ns Xr zpool-detach 8 .Xc -Converts a non-redundant disk into a mirror, or increases the redundancy level of an existing mirror -.Ns ( -.Cm attach Ns ), or performs the inverse operation ( +Converts a non-redundant disk into a mirror, or increases +the redundancy level of an existing mirror +.Cm ( attach Ns ), or performs the inverse operation ( .Cm detach Ns ). .It Xo .Xr zpool-add 8 Ns / Ns Xr zpool-remove 8 @@ -265,7 +265,7 @@ While not recommended, a pool based on files can be useful for experimental purposes. .Dl # Nm zpool Cm create Ar tank Pa /path/to/file/a /path/to/file/b . -.Ss Example 5 : No Making a non-mirrored ZFS Storage Pool mirrored. +.Ss Example 5 : No Making a non-mirrored ZFS Storage Pool mirrored The following command converts an existing single device .Ar sda into a mirror by attaching a second device to it, From 400f56e3f86311f622bea4872cf3ccf19ce421b8 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Fri, 1 Sep 2023 03:17:12 +0200 Subject: [PATCH 16/20] dmu_buf_will_clone: change assertion to fix 32-bit compiler warning Building module/zfs/dbuf.c for 32-bit targets can result in a warning: In file included from /usr/src/sys/contrib/openzfs/include/sys/zfs_context.h:97, from /usr/src/sys/contrib/openzfs/module/zfs/dbuf.c:32: /usr/src/sys/contrib/openzfs/module/zfs/dbuf.c: In function 'dmu_buf_will_clone': /usr/src/sys/contrib/openzfs/lib/libspl/include/assert.h:116:33: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] 116 | const uint64_t __left = (uint64_t)(LEFT); \ | ^ /usr/src/sys/contrib/openzfs/lib/libspl/include/assert.h:148:25: note: in expansion of macro 'VERIFY0' 148 | #define ASSERT0 VERIFY0 | ^~~~~~~ /usr/src/sys/contrib/openzfs/module/zfs/dbuf.c:2704:9: note: in expansion of macro 'ASSERT0' 2704 | ASSERT0(dbuf_find_dirty_eq(db, tx->tx_txg)); | ^~~~~~~ This is because dbuf_find_dirty_eq() returns a pointer, which if pointers are 32-bit results in a warning about the cast to uint64_t. Instead, use the ASSERT3P() macro, with == and NULL as second and third arguments, which should work regardless of the target's bitness. Reviewed-by: Kay Pedersen Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Dimitry Andric Closes #15224 --- module/zfs/dbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index b7453578a7..f2831a0e8a 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2701,7 +2701,7 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) */ mutex_enter(&db->db_mtx); VERIFY(!dbuf_undirty(db, tx)); - ASSERT0(dbuf_find_dirty_eq(db, tx->tx_txg)); + ASSERT3P(dbuf_find_dirty_eq(db, tx->tx_txg), ==, NULL); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; From 5a7cb0b06578688bb80ad739f3b3ac4f1492b1fa Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 1 Sep 2023 20:13:22 -0400 Subject: [PATCH 17/20] ZIL: Tune some assertions. In zil_free_lwb() we should first assert lwb_state or the rest of assertions can be misleading if it is false. Add lwb_state assertions in zil_lwb_add_block() to make sure we are not trying to add elements to lwb_vdev_tree after it was processed. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15227 --- module/zfs/zil.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index f2d279e36a..be3311b031 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -814,17 +814,17 @@ static void zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); - ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - VERIFY(list_is_empty(&lwb->lwb_waiters)); - VERIFY(list_is_empty(&lwb->lwb_itxs)); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_FLUSH_DONE); ASSERT3P(lwb->lwb_child_zio, ==, NULL); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa)); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); - ASSERT(lwb->lwb_state == LWB_STATE_NEW || - lwb->lwb_state == LWB_STATE_FLUSH_DONE); + VERIFY(list_is_empty(&lwb->lwb_itxs)); + VERIFY(list_is_empty(&lwb->lwb_waiters)); + ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); /* * Clear the zilog's field to indicate this lwb is no longer @@ -1329,6 +1329,9 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) int ndvas = BP_GET_NDVAS(bp); int i; + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + if (zil_nocacheflush) return; From 7dc2baaa1f18953c7ecb4daef98d14a3201f95d6 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 1 Sep 2023 20:13:52 -0400 Subject: [PATCH 18/20] ZIL: Revert zl_lock scope reduction. While I have no reports of it, I suspect possible use-after-free scenario when zil_commit_waiter() tries to dereference zcw_lwb for lwb already freed by zil_sync(), while zcw_done is not set. Extension of zl_lock scope as it was originally should block zil_sync() from freeing the lwb, closing this race. This reverts #14959 and couple chunks of #14841. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15228 --- module/zfs/zil.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index be3311b031..297c6b65d4 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1411,15 +1411,9 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zilog_t *zilog = lwb->lwb_zilog; zil_commit_waiter_t *zcw; itx_t *itx; - uint64_t txg; - list_t itxs, waiters; spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); - list_create(&itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); - list_create(&waiters, sizeof (zil_commit_waiter_t), - offsetof(zil_commit_waiter_t, zcw_node)); - hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp; mutex_enter(&zilog->zl_lock); @@ -1428,6 +1422,9 @@ zil_lwb_flush_vdevs_done(zio_t *zio) lwb->lwb_root_zio = NULL; + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); + lwb->lwb_state = LWB_STATE_FLUSH_DONE; + if (zilog->zl_last_lwb_opened == lwb) { /* * Remember the highest committed log sequence number @@ -1438,22 +1435,13 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } - list_move_tail(&itxs, &lwb->lwb_itxs); - list_move_tail(&waiters, &lwb->lwb_waiters); - txg = lwb->lwb_issued_txg; - - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); - lwb->lwb_state = LWB_STATE_FLUSH_DONE; - - mutex_exit(&zilog->zl_lock); - - while ((itx = list_remove_head(&itxs)) != NULL) + while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) zil_itx_destroy(itx); - list_destroy(&itxs); - while ((zcw = list_remove_head(&waiters)) != NULL) { + while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); + ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; /* * We expect any ZIO errors from child ZIOs to have been @@ -1478,7 +1466,11 @@ zil_lwb_flush_vdevs_done(zio_t *zio) mutex_exit(&zcw->zcw_lock); } - list_destroy(&waiters); + + uint64_t txg = lwb->lwb_issued_txg; + + /* Once we drop the lock, lwb may be freed by zil_sync(). */ + mutex_exit(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); From 79ac1b29d5b586fcc63032bfaa385eef4e489a52 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 1 Sep 2023 20:14:50 -0400 Subject: [PATCH 19/20] ZIL: Change ZIOs issue order. In zil_lwb_write_issue(), after issuing lwb_root_zio/lwb_write_zio, we have no right to access lwb->lwb_child_zio. If it was not there, the first two ZIOs may have already completed and freed the lwb. ZIOs issue in opposite order from children to parent should keep the lwb valid till the end, since the lwb can be freed only after lwb_root_zio completion callback. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15233 --- module/zfs/zil.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 297c6b65d4..b30676b42d 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1924,10 +1924,10 @@ next_lwb: BP_GET_LSIZE(&lwb->lwb_blk)); } lwb->lwb_issued_timestamp = gethrtime(); - zio_nowait(lwb->lwb_root_zio); - zio_nowait(lwb->lwb_write_zio); if (lwb->lwb_child_zio) zio_nowait(lwb->lwb_child_zio); + zio_nowait(lwb->lwb_write_zio); + zio_nowait(lwb->lwb_root_zio); /* * If nlwb was ready when we gave it the block pointer, From 32949f2560bf35ec86dfa5d984514908e0eb3ecc Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Sat, 2 Sep 2023 05:25:11 +0500 Subject: [PATCH 20/20] Relax error reporting in zpool import and zpool split For zpool import and zpool split, zpool_enable_datasets is called to mount and share all datasets in a pool. If there is an error while mounting or sharing any dataset in the pool, the status of import or split is reported as failure. However, the changes do show up in zpool list. This commit updates the error reporting in zpool import and zpool split path. More descriptive messages are shown to user in case there is an error during mount or share. Errors in mount or share do not effect the overall status of zpool import and zpool split. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Umer Saleem Closes #15216 --- cmd/zpool/zpool_main.c | 34 +++++++++++++++++++++++----------- include/libzfs.h | 1 + lib/libzfs/libzfs_mount.c | 4 ++-- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 10a3b5b14f..6d0dae8d8b 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -3143,6 +3143,7 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, nvlist_t *props, int flags) { int ret = 0; + int ms_status = 0; zpool_handle_t *zhp; const char *name; uint64_t version; @@ -3232,10 +3233,15 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, ret = 1; if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && - !(flags & ZFS_IMPORT_ONLY) && - zpool_enable_datasets(zhp, mntopts, 0) != 0) { - zpool_close(zhp); - return (1); + !(flags & ZFS_IMPORT_ONLY)) { + ms_status = zpool_enable_datasets(zhp, mntopts, 0); + if (ms_status == EZFS_SHAREFAILED) { + (void) fprintf(stderr, gettext("Import was " + "successful, but unable to share some datasets")); + } else if (ms_status == EZFS_MOUNTFAILED) { + (void) fprintf(stderr, gettext("Import was " + "successful, but unable to mount some datasets")); + } } zpool_close(zhp); @@ -6755,6 +6761,7 @@ zpool_do_split(int argc, char **argv) char *mntopts = NULL; splitflags_t flags; int c, ret = 0; + int ms_status = 0; boolean_t loadkeys = B_FALSE; zpool_handle_t *zhp; nvlist_t *config, *props = NULL; @@ -6891,13 +6898,18 @@ zpool_do_split(int argc, char **argv) ret = 1; } - if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && - zpool_enable_datasets(zhp, mntopts, 0) != 0) { - ret = 1; - (void) fprintf(stderr, gettext("Split was successful, but " - "the datasets could not all be mounted\n")); - (void) fprintf(stderr, gettext("Try doing '%s' with a " - "different altroot\n"), "zpool import"); + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { + ms_status = zpool_enable_datasets(zhp, mntopts, 0); + if (ms_status == EZFS_SHAREFAILED) { + (void) fprintf(stderr, gettext("Split was successful, " + "datasets are mounted but sharing of some datasets " + "has failed\n")); + } else if (ms_status == EZFS_MOUNTFAILED) { + (void) fprintf(stderr, gettext("Split was successful" + ", but some datasets could not be mounted\n")); + (void) fprintf(stderr, gettext("Try doing '%s' with a " + "different altroot\n"), "zpool import"); + } } zpool_close(zhp); nvlist_free(config); diff --git a/include/libzfs.h b/include/libzfs.h index a7037e3e62..fa05b7921b 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -156,6 +156,7 @@ typedef enum zfs_error { EZFS_NOT_USER_NAMESPACE, /* a file is not a user namespace */ EZFS_CKSUM, /* insufficient replicas */ EZFS_RESUME_EXISTS, /* Resume on existing dataset without force */ + EZFS_SHAREFAILED, /* filesystem share failed */ EZFS_UNKNOWN } zfs_error_t; diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c index 5d1fe651c9..b38ad88096 100644 --- a/lib/libzfs/libzfs_mount.c +++ b/lib/libzfs/libzfs_mount.c @@ -1300,7 +1300,7 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, zfs_mount_one, &ms, B_TRUE); if (ms.ms_mntstatus != 0) - ret = ms.ms_mntstatus; + ret = EZFS_MOUNTFAILED; /* * Share all filesystems that need to be shared. This needs to be @@ -1311,7 +1311,7 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, zfs_share_one, &ms, B_FALSE); if (ms.ms_mntstatus != 0) - ret = ms.ms_mntstatus; + ret = EZFS_SHAREFAILED; else zfs_commit_shares(NULL);