From 5fe9a4eee850ecbc624c8fbefdc663da6ac3b1b6 Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski <0mp@FreeBSD.org> Date: Mon, 7 Aug 2023 22:53:59 +0200 Subject: [PATCH 1/8] Fix some typos Reviewed-by: Brian Behlendorf Signed-off-by: Mateusz Piotrowski <0mp@FreeBSD.org> Closes #15141 --- include/sys/metaslab_impl.h | 6 +++--- module/zfs/metaslab.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 5beb1b7377..d328068890 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -313,7 +313,7 @@ struct metaslab_group { * Each metaslab maintains a set of in-core trees to track metaslab * operations. The in-core free tree (ms_allocatable) contains the list of * free segments which are eligible for allocation. As blocks are - * allocated, the allocated segment are removed from the ms_allocatable and + * allocated, the allocated segments are removed from the ms_allocatable and * added to a per txg allocation tree (ms_allocating). As blocks are * freed, they are added to the free tree (ms_freeing). These trees * allow us to process all allocations and frees in syncing context @@ -366,9 +366,9 @@ struct metaslab_group { struct metaslab { /* * This is the main lock of the metaslab and its purpose is to - * coordinate our allocations and frees [e.g metaslab_block_alloc(), + * coordinate our allocations and frees [e.g., metaslab_block_alloc(), * metaslab_free_concrete(), ..etc] with our various syncing - * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc]. + * procedures [e.g., metaslab_sync(), metaslab_sync_done(), ..etc]. * * The lock is also used during some miscellaneous operations like * using the metaslab's histogram for the metaslab group's histogram diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 9991e1a22c..966787b25b 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1292,7 +1292,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, /* * If this metaslab group is below its qmax or it's - * the only allocatable metasable group, then attempt + * the only allocatable metaslab group, then attempt * to allocate from it. */ if (qdepth < qmax || mc->mc_alloc_groups == 1) From 39bf4437e14fa965f7c05472999d7c1ef0d2c17c Mon Sep 17 00:00:00 2001 From: Ryan Lahfa Date: Mon, 7 Aug 2023 22:55:59 +0200 Subject: [PATCH 2/8] linux/spl/kmem_cache: undefine `kmem_cache_alloc` before defining it When compiling a kernel with bcachefs and zfs, the two macros will collide, making it impossible to have both filesystems. It is sufficient to just undefine the macro before calling it. On why this should be in ZFS rather than bcachefs, currently, bcachefs is not a in-tree filesystem, but, it has a reasonably high chance of getting included soon. This avoids the breakage in ZFS early, this patch may be distributed downstream in NixOS and is already used there. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Ryan Lahfa Closes #15144 --- include/os/linux/spl/sys/kmem_cache.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/os/linux/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h index cc9cafa84f..20eeadc46e 100644 --- a/include/os/linux/spl/sys/kmem_cache.h +++ b/include/os/linux/spl/sys/kmem_cache.h @@ -198,6 +198,14 @@ extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) #define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) #define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) +/* + * This is necessary to be compatible with other kernel modules + * or in-tree filesystem that may define kmem_cache_alloc, + * like bcachefs does it now. + */ +#ifdef kmem_cache_alloc +#undef kmem_cache_alloc +#endif #define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) #define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) #define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc) From c2995a8d90a06f2ed1b8258214c43f9dda7236d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Tue, 8 Aug 2023 18:35:35 +0200 Subject: [PATCH 3/8] libzfs: sendrecv: send_progress_thread: handle SIGINFO/SIGUSR1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit POSIX timers target the process, not the thread (as does SIGINFO), so we need to block it in the main thread which will die if interrupted. Ref: https://101010.pl/@ed1conf@bsd.network/110731819189629373 Reviewed-by: Brian Behlendorf Reviewed-by: Jorgen Lundman Signed-off-by: Ahelenia Ziemiańska Closes #15113 --- lib/libzfs/Makefile.am | 2 +- lib/libzfs/libzfs_sendrecv.c | 95 +++++++++++++++++++++++++++++------- man/man8/zfs-send.8 | 18 ++++++- 3 files changed, 96 insertions(+), 19 deletions(-) diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index cffe341220..5e74d908de 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -57,7 +57,7 @@ libzfs_la_LIBADD = \ libzutil.la \ libuutil.la -libzfs_la_LIBADD += -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL) +libzfs_la_LIBADD += -lrt -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL) libzfs_la_LDFLAGS = -pthread diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 87a30f54fe..e9bc78aa8d 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -928,6 +928,39 @@ zfs_send_progress(zfs_handle_t *zhp, int fd, uint64_t *bytes_written, return (0); } +static volatile boolean_t send_progress_thread_signal_duetotimer; +static void +send_progress_thread_act(int sig, siginfo_t *info, void *ucontext) +{ + (void) sig, (void) ucontext; + send_progress_thread_signal_duetotimer = info->si_code == SI_TIMER; +} + +struct timer_desirability { + timer_t timer; + boolean_t desired; +}; +static void +timer_delete_cleanup(void *timer) +{ + struct timer_desirability *td = timer; + if (td->desired) + timer_delete(td->timer); +} + +#ifdef SIGINFO +#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO sigaddset(&new, SIGINFO) +#else +#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO +#endif +#define SEND_PROGRESS_THREAD_PARENT_BLOCK(old) { \ + sigset_t new; \ + sigemptyset(&new); \ + sigaddset(&new, SIGUSR1); \ + SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO; \ + pthread_sigmask(SIG_BLOCK, &new, old); \ +} + static void * send_progress_thread(void *arg) { @@ -941,6 +974,26 @@ send_progress_thread(void *arg) struct tm tm; int err; + const struct sigaction signal_action = + {.sa_sigaction = send_progress_thread_act, .sa_flags = SA_SIGINFO}; + struct sigevent timer_cfg = + {.sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGUSR1}; + const struct itimerspec timer_time = + {.it_value = {.tv_sec = 1}, .it_interval = {.tv_sec = 1}}; + struct timer_desirability timer = {}; + + sigaction(SIGUSR1, &signal_action, NULL); +#ifdef SIGINFO + sigaction(SIGINFO, &signal_action, NULL); +#endif + + if ((timer.desired = pa->pa_progress || pa->pa_astitle)) { + if (timer_create(CLOCK_MONOTONIC, &timer_cfg, &timer.timer)) + return ((void *)(uintptr_t)errno); + (void) timer_settime(timer.timer, 0, &timer_time, NULL); + } + pthread_cleanup_push(timer_delete_cleanup, &timer); + if (!pa->pa_parsable && pa->pa_progress) { (void) fprintf(stderr, "TIME %s %sSNAPSHOT %s\n", @@ -953,12 +1006,12 @@ send_progress_thread(void *arg) * Print the progress from ZFS_IOC_SEND_PROGRESS every second. */ for (;;) { - (void) sleep(1); + pause(); if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes, &blocks)) != 0) { if (err == EINTR || err == ENOENT) - return ((void *)0); - return ((void *)(uintptr_t)err); + err = 0; + pthread_exit(((void *)(uintptr_t)err)); } (void) time(&t); @@ -991,21 +1044,25 @@ send_progress_thread(void *arg) (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, (u_longlong_t)bytes, zhp->zfs_name); - } else if (pa->pa_progress) { + } else if (pa->pa_progress || + !send_progress_thread_signal_duetotimer) { zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, buf, zhp->zfs_name); } } + pthread_cleanup_pop(B_TRUE); } static boolean_t -send_progress_thread_exit(libzfs_handle_t *hdl, pthread_t ptid) +send_progress_thread_exit( + libzfs_handle_t *hdl, pthread_t ptid, sigset_t *oldmask) { void *status = NULL; (void) pthread_cancel(ptid); (void) pthread_join(ptid, &status); + pthread_sigmask(SIG_SETMASK, oldmask, NULL); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) return (zfs_standard_error(hdl, error, @@ -1199,7 +1256,8 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ - if (sdd->progress || sdd->progressastitle) { + sigset_t oldmask; + { pa.pa_zhp = zhp; pa.pa_fd = sdd->outfd; pa.pa_parsable = sdd->parsable; @@ -1214,13 +1272,13 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) zfs_close(zhp); return (err); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, fromorigin, sdd->outfd, flags, sdd->debugnv); - if ((sdd->progress || sdd->progressastitle) && - send_progress_thread_exit(zhp->zfs_hdl, tid)) + if (send_progress_thread_exit(zhp->zfs_hdl, tid, &oldmask)) return (-1); } @@ -1562,8 +1620,9 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, progress_arg_t pa = { 0 }; int err = 0; pthread_t ptid; + sigset_t oldmask; - if (flags->progress || flags->progressastitle) { + { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; @@ -1577,6 +1636,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = lzc_send_space_resume_redacted(zhp->zfs_name, from, @@ -1584,8 +1644,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, redactbook, fd, &size); *sizep = size; - if ((flags->progress || flags->progressastitle) && - send_progress_thread_exit(zhp->zfs_hdl, ptid)) + if (send_progress_thread_exit(zhp->zfs_hdl, ptid, &oldmask)) return (-1); if (!flags->progress && !flags->parsable) @@ -1876,11 +1935,12 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, if (!flags->dryrun) { progress_arg_t pa = { 0 }; pthread_t tid; + sigset_t oldmask; /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ - if (flags->progress || flags->progressastitle) { + { pa.pa_zhp = zhp; pa.pa_fd = outfd; pa.pa_parsable = flags->parsable; @@ -1898,6 +1958,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, zfs_close(zhp); return (error); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd, @@ -1905,8 +1966,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, if (redact_book != NULL) free(redact_book); - if ((flags->progressastitle || flags->progress) && - send_progress_thread_exit(hdl, tid)) { + if (send_progress_thread_exit(hdl, tid, &oldmask)) { zfs_close(zhp); return (-1); } @@ -2691,7 +2751,8 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, * If progress reporting is requested, spawn a new thread to poll * ZFS_IOC_SEND_PROGRESS at a regular interval. */ - if (flags->progress || flags->progressastitle) { + sigset_t oldmask; + { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; @@ -2708,13 +2769,13 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = lzc_send_redacted(name, from, fd, lzc_flags_from_sendflags(flags), redactbook); - if ((flags->progress || flags->progressastitle) && - send_progress_thread_exit(hdl, ptid)) + if (send_progress_thread_exit(hdl, ptid, &oldmask)) return (-1); if (err == 0 && (flags->props || flags->holds || flags->backup)) { diff --git a/man/man8/zfs-send.8 b/man/man8/zfs-send.8 index 8cc6ae6ad5..ba604bf778 100644 --- a/man/man8/zfs-send.8 +++ b/man/man8/zfs-send.8 @@ -29,7 +29,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd January 12, 2023 +.Dd July 27, 2023 .Dt ZFS-SEND 8 .Os . @@ -297,6 +297,12 @@ This flag can only be used in conjunction with .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. +The same report can be requested by sending +.Dv SIGINFO +or +.Dv SIGUSR1 , +regardless of +.Fl v . .Pp The format of the stream is committed. You will be able to receive your streams on future versions of ZFS. @@ -433,6 +439,12 @@ and the verbose output goes to standard error .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. +The same report can be requested by sending +.Dv SIGINFO +or +.Dv SIGUSR1 , +regardless of +.Fl v . .El .It Xo .Nm zfs @@ -669,6 +681,10 @@ ones on the source, and are ready to be used, while the parent snapshot on the target contains none of the username and password data present on the source, because it was removed by the redacted send operation. . +.Sh SIGNALS +See +.Fl v . +. .Sh EXAMPLES .\" These are, respectively, examples 12, 13 from zfs.8 .\" Make sure to update them bidirectionally From dd577a7e64e90f60c66cb2aa19401b301bdb538f Mon Sep 17 00:00:00 2001 From: oromenahar Date: Tue, 8 Aug 2023 18:37:06 +0200 Subject: [PATCH 4/8] zfs_clone_range should return a descriptive error codes Return the more descriptive error codes instead of `EXDEV` when the parameters don't match the requirements of the clone function. Updated the comments in `brt.c` accordingly. The first three errors are just invalid parameters, which zfs can not handle. The fourth error indicates that the block which should be cloned is created and cloned or modified in the same transaction group (`txg`). Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Kay Pedersen Closes #15148 --- module/os/freebsd/zfs/zfs_vnops_os.c | 2 +- module/os/linux/zfs/zpl_file_range.c | 4 ++-- module/zfs/brt.c | 6 +++--- module/zfs/zfs_vnops.c | 13 +++++++------ 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 45cf6fdfc4..513accf18a 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6290,7 +6290,7 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp), ap->a_outoffp, &len, ap->a_outcred); - if (error == EXDEV || error == EOPNOTSUPP) + if (error == EXDEV || error == EINVAL || error == EOPNOTSUPP) goto bad_locked_fallback; *ap->a_lenp = (size_t)len; out_locked: diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 72384b638b..43ba9a4982 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -103,7 +103,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * Since Linux 5.3 the filesystem driver is responsible for executing * an appropriate fallback, and a generic fallback function is provided. */ - if (ret == -EOPNOTSUPP || ret == -EXDEV) + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV) ret = generic_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); #else @@ -111,7 +111,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal * to the kernel that it should fallback to a content copy. */ - if (ret == -EXDEV) + if (ret == -EINVAL || ret == -EXDEV) ret = -EOPNOTSUPP; #endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ diff --git a/module/zfs/brt.c b/module/zfs/brt.c index e8218fb268..ddd8eefe60 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -174,7 +174,7 @@ * size_t len, unsigned int flags); * * Even though offsets and length represent bytes, they have to be - * block-aligned or we will return the EXDEV error so the upper layer can + * block-aligned or we will return an error so the upper layer can * fallback to the generic mechanism that will just copy the data. * Using copy_file_range(2) will call OS-independent zfs_clone_range() function. * This function was implemented based on zfs_write(), but instead of writing @@ -192,9 +192,9 @@ * Some special cases to consider and how we address them: * - The block we want to clone may have been created within the same * transaction group that we are trying to clone. Such block has no BP - * allocated yet, so cannot be immediately cloned. We return EXDEV. + * allocated yet, so cannot be immediately cloned. We return EAGAIN. * - The block we want to clone may have been modified within the same - * transaction group. We return EXDEV. + * transaction group. We return EAGAIN. * - A block may be cloned multiple times during one transaction group (that's * why pending list is actually a tree and not an append-only list - this * way we can figure out faster if this block is cloned for the first time diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 54ea43363b..c6831ff6cd 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1028,6 +1028,10 @@ zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) * * On success, the function return the number of bytes copied in *lenp. * Note, it doesn't return how much bytes are left to be copied. + * On errors which are caused by any file system limitations or + * brt limitations `EINVAL` is returned. In the most cases a user + * requested bad parameters, it could be possible to clone the file but + * some parameters don't match the requirements. */ int zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, @@ -1171,7 +1175,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, * We cannot clone into files with different block size. */ if (inblksz != outzp->z_blksz && outzp->z_size > inblksz) { - error = SET_ERROR(EXDEV); + error = SET_ERROR(EINVAL); goto unlock; } @@ -1179,7 +1183,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, * Offsets and len must be at block boundries. */ if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) { - error = SET_ERROR(EXDEV); + error = SET_ERROR(EINVAL); goto unlock; } /* @@ -1187,7 +1191,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, */ if ((len % inblksz) != 0 && (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) { - error = SET_ERROR(EXDEV); + error = SET_ERROR(EINVAL); goto unlock; } @@ -1246,9 +1250,6 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, * in the current transaction group. Return an error, * so the caller can fallback to just copying the data. */ - if (error == EAGAIN) { - error = SET_ERROR(EXDEV); - } break; } /* From 72305b2cdbb80c3794394b5d5f8057221c6d1c6f Mon Sep 17 00:00:00 2001 From: Rafael Kitover Date: Tue, 8 Aug 2023 16:38:34 +0000 Subject: [PATCH 5/8] dracut: support mountpoint=legacy for root dataset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support mountpoint=legacy for the root dataset in the dracut zfs support scripts. mountpoint=/ or mountpoint=/sysroot also works. Change zfs-env-bootfs.service to add zfsutil to BOOTFSFLAGS only for root datasets with mountpoint != legacy. Reviewed-by: Brian Behlendorf Reviewed-by: Ahelenia Ziemiańska Signed-off-by: Rafael Kitover Closes #15149 --- contrib/dracut/90zfs/zfs-env-bootfs.service.in | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/contrib/dracut/90zfs/zfs-env-bootfs.service.in b/contrib/dracut/90zfs/zfs-env-bootfs.service.in index 7ebab4c1a5..fe362b930b 100644 --- a/contrib/dracut/90zfs/zfs-env-bootfs.service.in +++ b/contrib/dracut/90zfs/zfs-env-bootfs.service.in @@ -12,11 +12,12 @@ ExecStart=/bin/sh -c ' decode_root_args || exit 0; \ [ "$root" = "zfs:AUTO" ] && root="$(@sbindir@/zpool list -H -o bootfs | grep -m1 -vFx -)"; \ rootflags="$(getarg rootflags=)"; \ - case ",$rootflags," in \ - *,zfsutil,*) ;; \ - ,,) rootflags=zfsutil ;; \ - *) rootflags="zfsutil,$rootflags" ;; \ - esac; \ + [ "$(@sbindir@/zfs get -H -o value mountpoint "$root")" = legacy ] || \ + case ",$rootflags," in \ + *,zfsutil,*) ;; \ + ,,) rootflags=zfsutil ;; \ + *) rootflags="zfsutil,$rootflags" ;; \ + esac; \ exec systemctl set-environment BOOTFS="$root" BOOTFSFLAGS="$rootflags"' [Install] From b8a85c8161bb2aaf40b676261dbea6b4ee1e9a3e Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Tue, 8 Aug 2023 21:40:36 +0500 Subject: [PATCH 6/8] Move zinject from openzfs-zfs-test to openzfs-zfsutils For Native Debian packaging, zinject binary and man page is packaged in ZFS test package. zinject is not not directly related to ZTS and should be packaged with other utilities, like it is present in zfs_.rpm/deb packages. This commit moves zinject binary and man page from openzfs-zfs-test to openzfs-zfsutils package. Reviewed-by: Brian Behlendorf Reviewed-by: Ameer Hamza Signed-off-by: Umer Saleem Closes #15160 --- contrib/debian/openzfs-zfs-test.install | 2 -- contrib/debian/openzfs-zfsutils.install | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/debian/openzfs-zfs-test.install b/contrib/debian/openzfs-zfs-test.install index cafcfdc0e1..b3afef50db 100644 --- a/contrib/debian/openzfs-zfs-test.install +++ b/contrib/debian/openzfs-zfs-test.install @@ -1,10 +1,8 @@ -sbin/zinject sbin/ztest usr/bin/raidz_test usr/share/man/man1/raidz_test.1 usr/share/man/man1/test-runner.1 usr/share/man/man1/ztest.1 -usr/share/man/man8/zinject.8 usr/share/zfs/common.sh usr/share/zfs/runfiles/ usr/share/zfs/test-runner diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index 49f0ec0d5d..0f58508f00 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -27,6 +27,7 @@ sbin/zfs sbin/zfs_ids_to_path sbin/zgenhostid sbin/zhack +sbin/zinject sbin/zpool sbin/zstream sbin/zstreamdump @@ -92,6 +93,7 @@ usr/share/man/man8/zfs_ids_to_path.8 usr/share/man/man7/zfsconcepts.7 usr/share/man/man7/zfsprops.7 usr/share/man/man8/zgenhostid.8 +usr/share/man/man8/zinject.8 usr/share/man/man8/zpool-add.8 usr/share/man/man8/zpool-attach.8 usr/share/man/man8/zpool-checkpoint.8 From 46e77ff51707226476918573105a330b3005ca4b Mon Sep 17 00:00:00 2001 From: Rob N Date: Tue, 15 Aug 2023 10:34:14 +1000 Subject: [PATCH 7/8] copy_file_range: fix fallback when source create on same txg In 019dea0a5 we removed the conversion from EAGAIN->EXDEV inside zfs_clone_range(), but forgot to add a test for EAGAIN to the copy_file_range() entry points to trigger fallback to a content copy. This commit fixes that. Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Closes #15170 Closes #15172 --- module/os/freebsd/zfs/zfs_vnops_os.c | 3 +- module/os/linux/zfs/zpl_file_range.c | 5 +- module/zfs/zfs_vnops.c | 7 +- tests/runfiles/linux.run | 3 +- tests/test-runner/bin/zts-report.py.in | 2 + tests/zfs-tests/tests/Makefile.am | 3 +- ...loning_copyfilerange_fallback_same_txg.ksh | 66 +++++++++++++++++++ 7 files changed, 81 insertions(+), 8 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 513accf18a..c498a13282 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6290,7 +6290,8 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp), ap->a_outoffp, &len, ap->a_outcred); - if (error == EXDEV || error == EINVAL || error == EOPNOTSUPP) + if (error == EXDEV || error == EAGAIN || error == EINVAL || + error == EOPNOTSUPP) goto bad_locked_fallback; *ap->a_lenp = (size_t)len; out_locked: diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 43ba9a4982..2abbf44df5 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -103,7 +103,8 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * Since Linux 5.3 the filesystem driver is responsible for executing * an appropriate fallback, and a generic fallback function is provided. */ - if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV) + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV || + ret == -EAGAIN) ret = generic_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); #else @@ -111,7 +112,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal * to the kernel that it should fallback to a content copy. */ - if (ret == -EINVAL || ret == -EXDEV) + if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN) ret = -EOPNOTSUPP; #endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index c6831ff6cd..c1657ba802 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1246,9 +1246,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, &nbps); if (error != 0) { /* - * If we are tyring to clone a block that was created - * in the current transaction group. Return an error, - * so the caller can fallback to just copying the data. + * If we are trying to clone a block that was created + * in the current transaction group, error will be + * EAGAIN here, which we can just return to the caller + * so it can fallback if it likes. */ break; } diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 4747b98373..2c8d5cb0ec 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -41,7 +41,8 @@ tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', 'block_cloning_ficlonerange_partial', 'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone', 'block_cloning_disabled_ficlonerange', - 'block_cloning_copyfilerange_cross_dataset'] + 'block_cloning_copyfilerange_cross_dataset', + 'block_cloning_copyfilerange_fallback_same_txg'] tags = ['functional', 'block_cloning'] [tests/functional/chattr:Linux] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 5c4b3a7bcd..e1bbe063ab 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -304,6 +304,8 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_cross_dataset': ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_copyfilerange_fallback_same_txg': + ['SKIP', cfr_cross_reason], }) diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 3b6b2ef734..66aff5026f 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -441,9 +441,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/block_cloning/cleanup.ksh \ functional/block_cloning/setup.ksh \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ + functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ + functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \ functional/block_cloning/block_cloning_copyfilerange.ksh \ functional/block_cloning/block_cloning_copyfilerange_partial.ksh \ - functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \ functional/block_cloning/block_cloning_disabled_ficlone.ksh \ functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh new file mode 100755 index 0000000000..3451f887af --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -0,0 +1,66 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# Copyright (c) 2023, Rob Norris +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="copy_file_range will fall back to copy when cloning on same txg" + +log_assert $claim + +typeset timeout=$(get_tunable TXG_TIMEOUT) + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 TXG_TIMEOUT $timeout +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must set_tunable64 TXG_TIMEOUT 5000 + +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 + +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "" ] + +log_pass $claim + From 06924e6a7b34b444c64845f4569f344b50849953 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 24 Aug 2023 11:59:03 -0700 Subject: [PATCH 8/8] zed: Add zedlet to power off slot when drive is faulted If ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is enabled in zed.rc, then power off the drive's slot in the enclosure if it becomes FAULTED. This can help silence misbehaving drives. This assumes your drive enclosure fully supports slot power control via sysfs. Reviewed-by: @AllKind Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #15200 --- cmd/zed/zed.d/statechange-slot_off.sh | 61 +++++++++++++++++++++++++++ cmd/zed/zed.d/zed.rc | 5 +++ 2 files changed, 66 insertions(+) create mode 100755 cmd/zed/zed.d/statechange-slot_off.sh diff --git a/cmd/zed/zed.d/statechange-slot_off.sh b/cmd/zed/zed.d/statechange-slot_off.sh new file mode 100755 index 0000000000..d6f3c94a41 --- /dev/null +++ b/cmd/zed/zed.d/statechange-slot_off.sh @@ -0,0 +1,61 @@ +#!/bin/sh +# +# Turn off disk's enclosure slot if it becomes FAULTED. +# +# Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos +# as they flip between FAULTED and ONLINE. If +# ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets +# FAULTED, then power down the slot via sysfs: +# +# /sys/class/enclosure///power_status +# +# We assume the user will be responsible for turning the slot back on again. +# +# Note that this script requires that your enclosure be supported by the +# Linux SCSI Enclosure services (SES) driver. The script will do nothing +# if you have no enclosure, or if your enclosure isn't supported. +# +# Exit codes: +# 0: slot successfully powered off +# 1: enclosure not available +# 2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled +# 3: vdev was not FAULTED +# 4: The enclosure sysfs path passed from ZFS does not exist +# 5: Enclosure slot didn't actually turn off after we told it to + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +if [ ! -d /sys/class/enclosure ] ; then + # No JBOD enclosure or NVMe slots + exit 1 +fi + +if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then + exit 2 +fi + +if [ "$ZEVENT_VDEV_STATE_STR" != "FAULTED" ] ; then + exit 3 +fi + +if [ ! -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] ; then + exit 4 +fi + +echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" + +# Wait for sysfs for report that the slot is off. It can take ~400ms on some +# enclosures. +for i in $(seq 1 20) ; do + if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then + break + fi + sleep 0.1 +done + +if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" != "off" ] ; then + exit 5 +fi + +zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH" diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc index c55a70c79f..78dc1afc7b 100644 --- a/cmd/zed/zed.d/zed.rc +++ b/cmd/zed/zed.d/zed.rc @@ -142,3 +142,8 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event" # Disabled by default, 1 to enable and 0 to disable. #ZED_SYSLOG_DISPLAY_GUIDS=1 +## +# Power off the drive's slot in the enclosure if it becomes FAULTED. This can +# help silence misbehaving drives. This assumes your drive enclosure fully +# supports slot power control via sysfs. +#ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1