Merge pull request #158 from truenas/zfs-2.2-release-cobia-rc1

Sync with upstream zfs-2.2-release branch
Alexander Motin 2023-08-25 16:27:57 -04:00 committed by GitHub
commit f8c61e8326
36 changed files with 893 additions and 566 deletions

View File

@ -79,6 +79,7 @@
#include <sys/dsl_crypt.h>
#include <sys/dsl_scan.h>
#include <sys/btree.h>
#include <sys/brt.h>
#include <zfs_comutil.h>
#include <sys/zstd/zstd.h>
@ -5342,12 +5343,20 @@ static const char *zdb_ot_extname[] = {
#define ZB_TOTAL DN_MAX_LEVELS
#define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1)
typedef struct zdb_brt_entry {
dva_t zbre_dva;
uint64_t zbre_refcount;
avl_node_t zbre_node;
} zdb_brt_entry_t;
typedef struct zdb_cb {
zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
uint64_t zcb_removing_size;
uint64_t zcb_checkpoint_size;
uint64_t zcb_dedup_asize;
uint64_t zcb_dedup_blocks;
uint64_t zcb_clone_asize;
uint64_t zcb_clone_blocks;
uint64_t zcb_psize_count[SPA_MAX_FOR_16M];
uint64_t zcb_lsize_count[SPA_MAX_FOR_16M];
uint64_t zcb_asize_count[SPA_MAX_FOR_16M];
@ -5368,6 +5377,8 @@ typedef struct zdb_cb {
int zcb_haderrors;
spa_t *zcb_spa;
uint32_t **zcb_vd_obsolete_counts;
avl_tree_t zcb_brt;
boolean_t zcb_brt_is_active;
} zdb_cb_t;
/* test if two DVA offsets from same vdev are within the same metaslab */
@ -5662,6 +5673,45 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
zcb->zcb_asize_total += BP_GET_ASIZE(bp);
if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) {
/*
* Cloned blocks are special. We need to count them, so we can
* later uncount them when reporting leaked space, and we must
* only claim them once.
*
* To do this, we keep our own in-memory BRT. For each block
* we haven't seen before, we look it up in the real BRT and
* if it's there, we note it and its refcount, then proceed as
* normal. If we see the block again, we count it as a clone
* and then give it no further consideration.
*/
zdb_brt_entry_t zbre_search, *zbre;
avl_index_t where;
zbre_search.zbre_dva = bp->blk_dva[0];
zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
if (zbre != NULL) {
zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
zcb->zcb_clone_blocks++;
zbre->zbre_refcount--;
if (zbre->zbre_refcount == 0) {
avl_remove(&zcb->zcb_brt, zbre);
umem_free(zbre, sizeof (zdb_brt_entry_t));
}
return;
}
uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp);
if (crefcnt > 0) {
zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
UMEM_NOFAIL);
zbre->zbre_dva = bp->blk_dva[0];
zbre->zbre_refcount = crefcnt;
avl_insert(&zcb->zcb_brt, zbre, where);
}
}
if (dump_opt['L'])
return;
@ -6664,6 +6714,20 @@ deleted_livelists_dump_mos(spa_t *spa)
iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
}
static int
zdb_brt_entry_compare(const void *zcn1, const void *zcn2)
{
const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva;
const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva;
int cmp;
cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
if (cmp == 0)
cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2));
return (cmp);
}
static int
dump_block_stats(spa_t *spa)
{
@ -6678,6 +6742,13 @@ dump_block_stats(spa_t *spa)
zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);
if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
avl_create(&zcb->zcb_brt, zdb_brt_entry_compare,
sizeof (zdb_brt_entry_t),
offsetof(zdb_brt_entry_t, zbre_node));
zcb->zcb_brt_is_active = B_TRUE;
}
(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
(dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
(dump_opt['c'] == 1) ? "metadata " : "",
@ -6779,7 +6850,8 @@ dump_block_stats(spa_t *spa)
metaslab_class_get_alloc(spa_special_class(spa)) +
metaslab_class_get_alloc(spa_dedup_class(spa)) +
get_unflushed_alloc_space(spa);
total_found = tzb->zb_asize - zcb->zcb_dedup_asize +
total_found =
tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize +
zcb->zcb_removing_size + zcb->zcb_checkpoint_size;
if (total_found == total_alloc && !dump_opt['L']) {
@ -6820,6 +6892,9 @@ dump_block_stats(spa_t *spa)
"bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize,
(u_longlong_t)zcb->zcb_dedup_blocks,
(double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0);
(void) printf("\t%-16s %14llu count: %6llu\n",
"bp cloned:", (u_longlong_t)zcb->zcb_clone_asize,
(u_longlong_t)zcb->zcb_clone_blocks);
(void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:",
(u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
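Why zcb_clone_asize is subtracted in total_found above: traversal counts a cloned block's asize once per referencing BP, but the block is only allocated once on disk. As a worked example (hypothetical numbers), a 128K block cloned three times has a BRT refcount of 3 and is reached through four BPs: the first visit is counted normally and stores the refcount, while each of the other three visits adds 128K to zcb_clone_asize, so the 384K of double-counting is removed before total_found is compared with the space the metaslabs report as allocated.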

View File

@ -0,0 +1,61 @@
#!/bin/sh
#
# Turn off disk's enclosure slot if it becomes FAULTED.
#
# Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos
# as they flip between FAULTED and ONLINE. If
# ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets
# FAULTED, then power down the slot via sysfs:
#
# /sys/class/enclosure/<enclosure>/<slot>/power_status
#
# We assume the user will be responsible for turning the slot back on again.
#
# Note that this script requires that your enclosure be supported by the
# Linux SCSI Enclosure services (SES) driver. The script will do nothing
# if you have no enclosure, or if your enclosure isn't supported.
#
# Exit codes:
# 0: slot successfully powered off
# 1: enclosure not available
# 2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled
# 3: vdev was not FAULTED
# 4: The enclosure sysfs path passed from ZFS does not exist
# 5: Enclosure slot didn't actually turn off after we told it to
[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
. "${ZED_ZEDLET_DIR}/zed-functions.sh"
if [ ! -d /sys/class/enclosure ] ; then
# No JBOD enclosure or NVMe slots
exit 1
fi
if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then
exit 2
fi
if [ "$ZEVENT_VDEV_STATE_STR" != "FAULTED" ] ; then
exit 3
fi
if [ ! -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] ; then
exit 4
fi
echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status"
# Wait for sysfs to report that the slot is off. It can take ~400ms on some
# enclosures.
for i in $(seq 1 20) ; do
if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then
break
fi
sleep 0.1
done
if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" != "off" ] ; then
exit 5
fi
zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH"
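As the header comment says, the script deliberately leaves the slot powered off. A hypothetical recovery sketch, assuming the same SES sysfs layout used above (the <enclosure> and <slot> path components are placeholders):
# List any slots currently powered off.
grep -l off /sys/class/enclosure/*/*/power_status
# After replacing or reseating the disk, turn the slot back on (as root).
echo "on" > /sys/class/enclosure/<enclosure>/<slot>/power_status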

View File

@ -142,3 +142,8 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event"
# Disabled by default, 1 to enable and 0 to disable.
#ZED_SYSLOG_DISPLAY_GUIDS=1
##
# Power off the drive's slot in the enclosure if it becomes FAULTED. This can
# help silence misbehaving drives. This assumes your drive enclosure fully
# supports slot power control via sysfs.
#ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1

View File

@ -2412,7 +2412,6 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
int error;
ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);
ztest_object_lock(zd, object, RL_READER);
@ -2446,6 +2445,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
DMU_READ_NO_PREFETCH);
ASSERT0(error);
} else {
ASSERT3P(zio, !=, NULL);
size = doi.doi_data_block_size;
if (ISP2(size)) {
offset = P2ALIGN(offset, size);

View File

@ -12,11 +12,12 @@ ExecStart=/bin/sh -c '
decode_root_args || exit 0; \
[ "$root" = "zfs:AUTO" ] && root="$(@sbindir@/zpool list -H -o bootfs | grep -m1 -vFx -)"; \
rootflags="$(getarg rootflags=)"; \
case ",$rootflags," in \
*,zfsutil,*) ;; \
,,) rootflags=zfsutil ;; \
*) rootflags="zfsutil,$rootflags" ;; \
esac; \
[ "$(@sbindir@/zfs get -H -o value mountpoint "$root")" = legacy ] || \
case ",$rootflags," in \
*,zfsutil,*) ;; \
,,) rootflags=zfsutil ;; \
*) rootflags="zfsutil,$rootflags" ;; \
esac; \
exec systemctl set-environment BOOTFS="$root" BOOTFSFLAGS="$rootflags"'
[Install]

View File

@ -198,6 +198,14 @@ extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache);
spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl)
#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move)
#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc)
/*
* This is necessary to be compatible with other kernel modules
* or in-tree filesystems that may define kmem_cache_alloc,
* as bcachefs does now.
*/
#ifdef kmem_cache_alloc
#undef kmem_cache_alloc
#endif
#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags)
#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj)
#define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc)

View File

@ -36,6 +36,7 @@ extern "C" {
#endif
extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp);
extern uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp);
extern uint64_t brt_get_dspace(spa_t *spa);
extern uint64_t brt_get_used(spa_t *spa);

View File

@ -572,11 +572,15 @@ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, int read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp);
int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **dbp);
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
const void *tag, dmu_buf_t **dbp, int flags);
int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
uint64_t length, boolean_t read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp, uint32_t flags);
int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag,
dmu_buf_t **dbp);
/*
* Add a reference to a dmu buffer that has already been held via
* dmu_buf_hold() in the current context.

View File

@ -247,8 +247,6 @@ typedef struct dmu_sendstatus {
void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t,
const void *, dmu_buf_t **);
#ifdef __cplusplus
}

View File

@ -80,7 +80,6 @@ uint64_t metaslab_largest_allocatable(metaslab_t *);
#define METASLAB_ASYNC_ALLOC 0x8
#define METASLAB_DONT_THROTTLE 0x10
#define METASLAB_MUST_RESERVE 0x20
#define METASLAB_FASTWRITE 0x40
#define METASLAB_ZIL 0x80
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
@ -96,8 +95,6 @@ void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
void metaslab_check_free(spa_t *, const blkptr_t *);
void metaslab_fastwrite_mark(spa_t *, const blkptr_t *);
void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *);
void metaslab_stat_init(void);
void metaslab_stat_fini(void);

View File

@ -313,7 +313,7 @@ struct metaslab_group {
* Each metaslab maintains a set of in-core trees to track metaslab
* operations. The in-core free tree (ms_allocatable) contains the list of
* free segments which are eligible for allocation. As blocks are
* allocated, the allocated segment are removed from the ms_allocatable and
* allocated, the allocated segments are removed from the ms_allocatable and
* added to a per txg allocation tree (ms_allocating). As blocks are
* freed, they are added to the free tree (ms_freeing). These trees
* allow us to process all allocations and frees in syncing context
@ -366,9 +366,9 @@ struct metaslab_group {
struct metaslab {
/*
* This is the main lock of the metaslab and its purpose is to
* coordinate our allocations and frees [e.g metaslab_block_alloc(),
* coordinate our allocations and frees [e.g., metaslab_block_alloc(),
* metaslab_free_concrete(), ..etc] with our various syncing
* procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc].
* procedures [e.g., metaslab_sync(), metaslab_sync_done(), ..etc].
*
* The lock is also used during some miscellaneous operations like
* using the metaslab's histogram for the metaslab group's histogram

View File

@ -266,7 +266,6 @@ struct vdev {
metaslab_group_t *vdev_mg; /* metaslab group */
metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */
metaslab_t **vdev_ms; /* metaslab array */
uint64_t vdev_pending_fastwrite; /* allocated fastwrites */
txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */

View File

@ -38,14 +38,22 @@ extern "C" {
/*
* Possible states for a given lwb structure.
*
* An lwb will start out in the "closed" state, and then transition to
* the "opened" state via a call to zil_lwb_write_open(). When
* transitioning from "closed" to "opened" the zilog's "zl_issuer_lock"
* must be held.
* An lwb will start out in the "new" state, and transition to the "opened"
* state via a call to zil_lwb_write_open() on first itx assignment. When
* transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be
* held.
*
* After the lwb is "opened", it can transition into the "issued" state
* via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must
* be held when making this transition.
* After the lwb is "opened", it can be assigned number of itxs and transition
* into the "closed" state via zil_lwb_write_close() when full or on timeout.
* When transitioning from "opened" to "closed" the zilog's "zl_issuer_lock"
* must be held. New lwb allocation also takes "zl_lock" to protect the list.
*
* After the lwb is "closed", it can transition into the "ready" state via
* zil_lwb_write_issue(). "zl_lock" must be held when making this transition.
* Since it is done by the same thread, "zl_issuer_lock" is not needed.
*
* When lwb in "ready" state receives its block pointer, it can transition to
* "issued". "zl_lock" must be held when making this transition.
*
* After the lwb's write zio completes, it transitions into the "write
* done" state via zil_lwb_write_done(); and then into the "flush done"
@ -62,17 +70,20 @@ extern "C" {
*
* Additionally, correctness when reading an lwb's state is often
* achieved by exploiting the fact that these state transitions occur in
* this specific order; i.e. "closed" to "opened" to "issued" to "done".
* this specific order; i.e. "new" to "opened" to "closed" to "ready" to
* "issued" to "write_done" and finally "flush_done".
*
* Thus, if an lwb is in the "closed" or "opened" state, holding the
* Thus, if an lwb is in the "new" or "opened" state, holding the
* "zl_issuer_lock" will prevent a concurrent thread from transitioning
* that lwb to the "issued" state. Likewise, if an lwb is already in the
* "issued" state, holding the "zl_lock" will prevent a concurrent
* thread from transitioning that lwb to the "write done" state.
* that lwb to the "closed" state. Likewise, if an lwb is already in the
* "ready" state, holding the "zl_lock" will prevent a concurrent thread
* from transitioning that lwb to the "issued" state.
*/
typedef enum {
LWB_STATE_CLOSED,
LWB_STATE_NEW,
LWB_STATE_OPENED,
LWB_STATE_CLOSED,
LWB_STATE_READY,
LWB_STATE_ISSUED,
LWB_STATE_WRITE_DONE,
LWB_STATE_FLUSH_DONE,
@ -91,18 +102,21 @@ typedef enum {
typedef struct lwb {
zilog_t *lwb_zilog; /* back pointer to log struct */
blkptr_t lwb_blk; /* on disk address of this log blk */
boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */
boolean_t lwb_slim; /* log block has slim format */
boolean_t lwb_slog; /* lwb_blk is on SLOG device */
boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */
int lwb_error; /* log block allocation error */
int lwb_nmax; /* max bytes in the buffer */
int lwb_nused; /* # used bytes in buffer */
int lwb_nfilled; /* # filled bytes in buffer */
int lwb_sz; /* size of block and buffer */
lwb_state_t lwb_state; /* the state of this lwb */
char *lwb_buf; /* log write buffer */
zio_t *lwb_child_zio; /* parent zio for children */
zio_t *lwb_write_zio; /* zio for the lwb buffer */
zio_t *lwb_root_zio; /* root zio for lwb write and flushes */
hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */
uint64_t lwb_issued_txg; /* the txg when the write is issued */
uint64_t lwb_alloc_txg; /* the txg when lwb_blk is allocated */
uint64_t lwb_max_txg; /* highest txg in this lwb */
list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */
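The state-ordering argument in the comment above can be relied on directly, since the LWB_STATE_* constants are declared in transition order. A minimal hypothetical sketch (lwb_state_before_close() is an invented name, not part of this change):

static boolean_t
lwb_state_before_close(const lwb_t *lwb)
{
	/*
	 * NEW < OPENED < CLOSED < READY < ISSUED, so a reader holding
	 * zl_issuer_lock that observes NEW or OPENED knows the lwb
	 * cannot concurrently advance to CLOSED.
	 */
	return (lwb->lwb_state < LWB_STATE_CLOSED);
}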

View File

@ -222,7 +222,6 @@ typedef uint64_t zio_flag_t;
#define ZIO_FLAG_NOPWRITE (1ULL << 28)
#define ZIO_FLAG_REEXECUTED (1ULL << 29)
#define ZIO_FLAG_DELEGATED (1ULL << 30)
#define ZIO_FLAG_FASTWRITE (1ULL << 31)
#define ZIO_FLAG_MUSTSUCCEED 0
#define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)

View File

@ -57,7 +57,7 @@ libzfs_la_LIBADD = \
libzutil.la \
libuutil.la
libzfs_la_LIBADD += -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL)
libzfs_la_LIBADD += -lrt -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL)
libzfs_la_LDFLAGS = -pthread

View File

@ -3928,6 +3928,12 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
switch (errno) {
case EALREADY:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"removal for this vdev is already in progress."));
(void) zfs_error(hdl, EZFS_BUSY, errbuf);
break;
case EINVAL:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid config; all top-level vdevs must "

View File

@ -928,6 +928,39 @@ zfs_send_progress(zfs_handle_t *zhp, int fd, uint64_t *bytes_written,
return (0);
}
static volatile boolean_t send_progress_thread_signal_duetotimer;
static void
send_progress_thread_act(int sig, siginfo_t *info, void *ucontext)
{
(void) sig, (void) ucontext;
send_progress_thread_signal_duetotimer = info->si_code == SI_TIMER;
}
struct timer_desirability {
timer_t timer;
boolean_t desired;
};
static void
timer_delete_cleanup(void *timer)
{
struct timer_desirability *td = timer;
if (td->desired)
timer_delete(td->timer);
}
#ifdef SIGINFO
#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO sigaddset(&new, SIGINFO)
#else
#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO
#endif
#define SEND_PROGRESS_THREAD_PARENT_BLOCK(old) { \
sigset_t new; \
sigemptyset(&new); \
sigaddset(&new, SIGUSR1); \
SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO; \
pthread_sigmask(SIG_BLOCK, &new, old); \
}
static void *
send_progress_thread(void *arg)
{
@ -941,6 +974,26 @@ send_progress_thread(void *arg)
struct tm tm;
int err;
const struct sigaction signal_action =
{.sa_sigaction = send_progress_thread_act, .sa_flags = SA_SIGINFO};
struct sigevent timer_cfg =
{.sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGUSR1};
const struct itimerspec timer_time =
{.it_value = {.tv_sec = 1}, .it_interval = {.tv_sec = 1}};
struct timer_desirability timer = {};
sigaction(SIGUSR1, &signal_action, NULL);
#ifdef SIGINFO
sigaction(SIGINFO, &signal_action, NULL);
#endif
if ((timer.desired = pa->pa_progress || pa->pa_astitle)) {
if (timer_create(CLOCK_MONOTONIC, &timer_cfg, &timer.timer))
return ((void *)(uintptr_t)errno);
(void) timer_settime(timer.timer, 0, &timer_time, NULL);
}
pthread_cleanup_push(timer_delete_cleanup, &timer);
if (!pa->pa_parsable && pa->pa_progress) {
(void) fprintf(stderr,
"TIME %s %sSNAPSHOT %s\n",
@ -953,12 +1006,12 @@ send_progress_thread(void *arg)
* Print the progress from ZFS_IOC_SEND_PROGRESS every second.
*/
for (;;) {
(void) sleep(1);
pause();
if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes,
&blocks)) != 0) {
if (err == EINTR || err == ENOENT)
return ((void *)0);
return ((void *)(uintptr_t)err);
err = 0;
pthread_exit(((void *)(uintptr_t)err));
}
(void) time(&t);
@ -991,21 +1044,25 @@ send_progress_thread(void *arg)
(void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n",
tm.tm_hour, tm.tm_min, tm.tm_sec,
(u_longlong_t)bytes, zhp->zfs_name);
} else if (pa->pa_progress) {
} else if (pa->pa_progress ||
!send_progress_thread_signal_duetotimer) {
zfs_nicebytes(bytes, buf, sizeof (buf));
(void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n",
tm.tm_hour, tm.tm_min, tm.tm_sec,
buf, zhp->zfs_name);
}
}
pthread_cleanup_pop(B_TRUE);
}
static boolean_t
send_progress_thread_exit(libzfs_handle_t *hdl, pthread_t ptid)
send_progress_thread_exit(
libzfs_handle_t *hdl, pthread_t ptid, sigset_t *oldmask)
{
void *status = NULL;
(void) pthread_cancel(ptid);
(void) pthread_join(ptid, &status);
pthread_sigmask(SIG_SETMASK, oldmask, NULL);
int error = (int)(uintptr_t)status;
if (error != 0 && status != PTHREAD_CANCELED)
return (zfs_standard_error(hdl, error,
@ -1199,7 +1256,8 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
* If progress reporting is requested, spawn a new thread to
* poll ZFS_IOC_SEND_PROGRESS at a regular interval.
*/
if (sdd->progress || sdd->progressastitle) {
sigset_t oldmask;
{
pa.pa_zhp = zhp;
pa.pa_fd = sdd->outfd;
pa.pa_parsable = sdd->parsable;
@ -1214,13 +1272,13 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
zfs_close(zhp);
return (err);
}
SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask);
}
err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
fromorigin, sdd->outfd, flags, sdd->debugnv);
if ((sdd->progress || sdd->progressastitle) &&
send_progress_thread_exit(zhp->zfs_hdl, tid))
if (send_progress_thread_exit(zhp->zfs_hdl, tid, &oldmask))
return (-1);
}
@ -1562,8 +1620,9 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags,
progress_arg_t pa = { 0 };
int err = 0;
pthread_t ptid;
sigset_t oldmask;
if (flags->progress || flags->progressastitle) {
{
pa.pa_zhp = zhp;
pa.pa_fd = fd;
pa.pa_parsable = flags->parsable;
@ -1577,6 +1636,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags,
return (zfs_error(zhp->zfs_hdl,
EZFS_THREADCREATEFAILED, errbuf));
}
SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask);
}
err = lzc_send_space_resume_redacted(zhp->zfs_name, from,
@ -1584,8 +1644,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags,
redactbook, fd, &size);
*sizep = size;
if ((flags->progress || flags->progressastitle) &&
send_progress_thread_exit(zhp->zfs_hdl, ptid))
if (send_progress_thread_exit(zhp->zfs_hdl, ptid, &oldmask))
return (-1);
if (!flags->progress && !flags->parsable)
@ -1876,11 +1935,12 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags,
if (!flags->dryrun) {
progress_arg_t pa = { 0 };
pthread_t tid;
sigset_t oldmask;
/*
* If progress reporting is requested, spawn a new thread to
* poll ZFS_IOC_SEND_PROGRESS at a regular interval.
*/
if (flags->progress || flags->progressastitle) {
{
pa.pa_zhp = zhp;
pa.pa_fd = outfd;
pa.pa_parsable = flags->parsable;
@ -1898,6 +1958,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags,
zfs_close(zhp);
return (error);
}
SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask);
}
error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd,
@ -1905,8 +1966,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags,
if (redact_book != NULL)
free(redact_book);
if ((flags->progressastitle || flags->progress) &&
send_progress_thread_exit(hdl, tid)) {
if (send_progress_thread_exit(hdl, tid, &oldmask)) {
zfs_close(zhp);
return (-1);
}
@ -2691,7 +2751,8 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd,
* If progress reporting is requested, spawn a new thread to poll
* ZFS_IOC_SEND_PROGRESS at a regular interval.
*/
if (flags->progress || flags->progressastitle) {
sigset_t oldmask;
{
pa.pa_zhp = zhp;
pa.pa_fd = fd;
pa.pa_parsable = flags->parsable;
@ -2708,13 +2769,13 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd,
return (zfs_error(zhp->zfs_hdl,
EZFS_THREADCREATEFAILED, errbuf));
}
SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask);
}
err = lzc_send_redacted(name, from, fd,
lzc_flags_from_sendflags(flags), redactbook);
if ((flags->progress || flags->progressastitle) &&
send_progress_thread_exit(hdl, ptid))
if (send_progress_thread_exit(hdl, ptid, &oldmask))
return (-1);
if (err == 0 && (flags->props || flags->holds || flags->backup)) {
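The mechanism above replaces the old sleep(1) loop with a POSIX interval timer that delivers SIGUSR1, so pause() wakes once per second or whenever a user sends the signal by hand, and si_code distinguishes the two. A minimal standalone sketch of that pattern (hypothetical, not part of the diff; the -lrt now added to libzfs_la_LIBADD is what timer_create() needs):

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static volatile sig_atomic_t due_to_timer;

static void
act(int sig, siginfo_t *info, void *uctx)
{
	(void) sig, (void) uctx;
	/* SI_TIMER: the interval timer fired; otherwise a user sent it. */
	due_to_timer = (info->si_code == SI_TIMER);
}

int
main(void)
{
	const struct sigaction sa =
	    {.sa_sigaction = act, .sa_flags = SA_SIGINFO};
	struct sigevent ev =
	    {.sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGUSR1};
	const struct itimerspec its =
	    {.it_value = {.tv_sec = 1}, .it_interval = {.tv_sec = 1}};
	timer_t t;

	sigaction(SIGUSR1, &sa, NULL);
	if (timer_create(CLOCK_MONOTONIC, &ev, &t) != 0)
		return (1);
	(void) timer_settime(t, 0, &its, NULL);
	for (int i = 0; i < 5; i++) {
		pause();	/* once per second, or on kill -USR1 */
		printf("report (%s)\n",
		    due_to_timer ? "timer" : "user signal");
	}
	(void) timer_delete(t);
	return (0);
}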

View File

@ -29,7 +29,7 @@
.\" Copyright 2018 Nexenta Systems, Inc.
.\" Copyright 2019 Joyent, Inc.
.\"
.Dd January 12, 2023
.Dd July 27, 2023
.Dt ZFS-SEND 8
.Os
.
@ -297,6 +297,12 @@ This flag can only be used in conjunction with
.It Fl v , -verbose
Print verbose information about the stream package generated.
This information includes a per-second report of how much data has been sent.
The same report can be requested by sending
.Dv SIGINFO
or
.Dv SIGUSR1 ,
regardless of
.Fl v .
.Pp
The format of the stream is committed.
You will be able to receive your streams on future versions of ZFS.
@ -433,6 +439,12 @@ and the verbose output goes to standard error
.It Fl v , -verbose
Print verbose information about the stream package generated.
This information includes a per-second report of how much data has been sent.
The same report can be requested by sending
.Dv SIGINFO
or
.Dv SIGUSR1 ,
regardless of
.Fl v .
.El
.It Xo
.Nm zfs
@ -669,6 +681,10 @@ ones on the source, and are ready to be used, while the parent snapshot on the
target contains none of the username and password data present on the source,
because it was removed by the redacted send operation.
.
.Sh SIGNALS
See
.Fl v .
.
.Sh EXAMPLES
.\" These are, respectively, examples 12, 13 from zfs.8
.\" Make sure to update them bidirectionally

View File

@ -6290,7 +6290,8 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
ap->a_outoffp, &len, ap->a_outcred);
if (error == EXDEV || error == EOPNOTSUPP)
if (error == EXDEV || error == EAGAIN || error == EINVAL ||
error == EOPNOTSUPP)
goto bad_locked_fallback;
*ap->a_lenp = (size_t)len;
out_locked:

View File

@ -478,17 +478,19 @@ zfsctl_is_snapdir(struct inode *ip)
*/
static struct inode *
zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
const struct file_operations *fops, const struct inode_operations *ops)
const struct file_operations *fops, const struct inode_operations *ops,
uint64_t creation)
{
inode_timespec_t now;
struct inode *ip;
znode_t *zp;
inode_timespec_t now = {.tv_sec = creation};
ip = new_inode(zfsvfs->z_sb);
if (ip == NULL)
return (NULL);
now = current_time(ip);
if (!creation)
now = current_time(ip);
zp = ITOZ(ip);
ASSERT3P(zp->z_dirlocks, ==, NULL);
ASSERT3P(zp->z_acl_cached, ==, NULL);
@ -552,14 +554,28 @@ zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id,
const struct file_operations *fops, const struct inode_operations *ops)
{
struct inode *ip = NULL;
uint64_t creation = 0;
dsl_dataset_t *snap_ds;
dsl_pool_t *pool;
while (ip == NULL) {
ip = ilookup(zfsvfs->z_sb, (unsigned long)id);
if (ip)
break;
if (id <= ZFSCTL_INO_SNAPDIRS && !creation) {
pool = dmu_objset_pool(zfsvfs->z_os);
dsl_pool_config_enter(pool, FTAG);
if (!dsl_dataset_hold_obj(pool,
ZFSCTL_INO_SNAPDIRS - id, FTAG, &snap_ds)) {
creation = dsl_get_creation(snap_ds);
dsl_dataset_rele(snap_ds, FTAG);
}
dsl_pool_config_exit(pool, FTAG);
}
/* May fail due to concurrent zfsctl_inode_alloc() */
ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops);
ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops, creation);
}
return (ip);
@ -581,7 +597,7 @@ zfsctl_create(zfsvfs_t *zfsvfs)
ASSERT(zfsvfs->z_ctldir == NULL);
zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT,
&zpl_fops_root, &zpl_ops_root);
&zpl_fops_root, &zpl_ops_root, 0);
if (zfsvfs->z_ctldir == NULL)
return (SET_ERROR(ENOENT));

View File

@ -103,9 +103,17 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
* Since Linux 5.3 the filesystem driver is responsible for executing
* an appropriate fallback, and a generic fallback function is provided.
*/
if (ret == -EOPNOTSUPP || ret == -EXDEV)
if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
ret == -EAGAIN)
ret = generic_copy_file_range(src_file, src_off, dst_file,
dst_off, len, flags);
#else
/*
* Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal
* to the kernel that it should fallback to a content copy.
*/
if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN)
ret = -EOPNOTSUPP;
#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */
return (ret);
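From userspace, the upshot of the pre-5.3 branch above is that a failed clone surfaces as EOPNOTSUPP, so portable callers should still keep a manual copy path. A minimal hypothetical sketch (copy_with_fallback() is an invented helper), assuming Linux with glibc's copy_file_range(2) wrapper:

#define _GNU_SOURCE
#include <errno.h>
#include <unistd.h>

/* Clone if the kernel can, else copy by hand; both fds' current
 * offsets are used, matching copy_file_range() with NULL offsets. */
static ssize_t
copy_with_fallback(int sfd, int dfd, size_t len)
{
	ssize_t n = copy_file_range(sfd, NULL, dfd, NULL, len, 0);
	if (n >= 0 || (errno != EOPNOTSUPP && errno != EXDEV))
		return (n);
	char buf[65536];
	size_t total = 0;
	while (total < len) {
		size_t want = len - total;
		if (want > sizeof (buf))
			want = sizeof (buf);
		ssize_t r = read(sfd, buf, want);
		if (r <= 0)
			return (r < 0 ? r : (ssize_t)total);
		for (ssize_t off = 0; off < r; ) {
			ssize_t w = write(dfd, buf + off, (size_t)(r - off));
			if (w < 0)
				return (-1);
			off += w;
		}
		total += (size_t)r;
	}
	return ((ssize_t)total);
}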

View File

@ -174,7 +174,7 @@
* size_t len, unsigned int flags);
*
* Even though offsets and length represent bytes, they have to be
* block-aligned or we will return the EXDEV error so the upper layer can
* block-aligned or we will return an error so the upper layer can
* fallback to the generic mechanism that will just copy the data.
* Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
* This function was implemented based on zfs_write(), but instead of writing
@ -192,9 +192,9 @@
* Some special cases to consider and how we address them:
* - The block we want to clone may have been created within the same
* transaction group that we are trying to clone. Such block has no BP
* allocated yet, so cannot be immediately cloned. We return EXDEV.
* allocated yet, so cannot be immediately cloned. We return EAGAIN.
* - The block we want to clone may have been modified within the same
* transaction group. We return EXDEV.
* transaction group. We return EAGAIN.
* - A block may be cloned multiple times during one transaction group (that's
* why pending list is actually a tree and not an append-only list - this
* way we can figure out faster if this block is cloned for the first time
@ -1544,6 +1544,37 @@ out:
return (B_FALSE);
}
uint64_t
brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
{
brt_t *brt = spa->spa_brt;
brt_vdev_t *brtvd;
brt_entry_t bre_search, *bre;
uint64_t vdevid, refcnt;
int error;
brt_entry_fill(bp, &bre_search, &vdevid);
brt_rlock(brt);
brtvd = brt_vdev(brt, vdevid);
ASSERT(brtvd != NULL);
bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
if (bre == NULL) {
error = brt_entry_lookup(brt, brtvd, &bre_search);
ASSERT(error == 0 || error == ENOENT);
if (error == ENOENT)
refcnt = 0;
else
refcnt = bre_search.bre_refcount;
} else
refcnt = bre->bre_refcount;
brt_unlock(brt);
return (refcnt);
}
static void
brt_prefetch(brt_t *brt, const blkptr_t *bp)
{

View File

@ -165,7 +165,7 @@ dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
{ zfs_acl_byteswap, "acl" }
};
static int
int
dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
const void *tag, dmu_buf_t **dbp)
{
@ -185,6 +185,7 @@ dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
*dbp = &db->db;
return (0);
}
int
dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **dbp)
@ -1653,10 +1654,22 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
{
dmu_sync_arg_t *dsa;
dmu_tx_t *tx;
int error;
error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL,
DB_RF_CANFAIL | DB_RF_NOPREFETCH);
if (error != 0)
return (error);
tx = dmu_tx_create(os);
dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
/*
* This transaction does not produce any dirty data or log blocks, so
* it should not be throttled. All other cases wait for TXG sync, by
* which time the log block we are writing will be obsolete, so we can
* skip waiting and just return error here instead.
*/
if (dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE) != 0) {
dmu_tx_abort(tx);
/* Make zl_get_data do txg_wait_synced() */
return (SET_ERROR(EIO));

View File

@ -1292,7 +1292,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
/*
* If this metaslab group is below its qmax or it's
* the only allocatable metasable group, then attempt
* the only allocatable metaslab group, then attempt
* to allocate from it.
*/
if (qdepth < qmax || mc->mc_alloc_groups == 1)
@ -5101,7 +5101,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
zio_alloc_list_t *zal, int allocator)
{
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
metaslab_group_t *mg, *fast_mg, *rotor;
metaslab_group_t *mg, *rotor;
vdev_t *vd;
boolean_t try_hard = B_FALSE;
@ -5164,15 +5164,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
} else if (d != 0) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
mg = vd->vdev_mg->mg_next;
} else if (flags & METASLAB_FASTWRITE) {
mg = fast_mg = mca->mca_rotor;
do {
if (fast_mg->mg_vd->vdev_pending_fastwrite <
mg->mg_vd->vdev_pending_fastwrite)
mg = fast_mg;
} while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor);
} else {
ASSERT(mca->mca_rotor != NULL);
mg = mca->mca_rotor;
@ -5297,7 +5288,7 @@ top:
mg->mg_bias = 0;
}
if ((flags & METASLAB_FASTWRITE) ||
if ((flags & METASLAB_ZIL) ||
atomic_add_64_nv(&mca->mca_aliquot, asize) >=
mg->mg_aliquot + mg->mg_bias) {
mca->mca_rotor = mg->mg_next;
@ -5310,11 +5301,6 @@ top:
((flags & METASLAB_GANG_HEADER) ? 1 : 0));
DVA_SET_ASIZE(&dva[d], asize);
if (flags & METASLAB_FASTWRITE) {
atomic_add_64(&vd->vdev_pending_fastwrite,
psize);
}
return (0);
}
next:
@ -5950,55 +5936,6 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
return (error);
}
void
metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
{
const dva_t *dva = bp->blk_dva;
int ndvas = BP_GET_NDVAS(bp);
uint64_t psize = BP_GET_PSIZE(bp);
int d;
vdev_t *vd;
ASSERT(!BP_IS_HOLE(bp));
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(psize > 0);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
for (d = 0; d < ndvas; d++) {
if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
continue;
atomic_add_64(&vd->vdev_pending_fastwrite, psize);
}
spa_config_exit(spa, SCL_VDEV, FTAG);
}
void
metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
{
const dva_t *dva = bp->blk_dva;
int ndvas = BP_GET_NDVAS(bp);
uint64_t psize = BP_GET_PSIZE(bp);
int d;
vdev_t *vd;
ASSERT(!BP_IS_HOLE(bp));
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(psize > 0);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
for (d = 0; d < ndvas; d++) {
if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
continue;
ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
}
spa_config_exit(spa, SCL_VDEV, FTAG);
}
static void
metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
uint64_t size, void *arg)

View File

@ -1192,7 +1192,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
ASSERT(tvd == tvd->vdev_top);
tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite;
tvd->vdev_ms_array = svd->vdev_ms_array;
tvd->vdev_ms_shift = svd->vdev_ms_shift;
tvd->vdev_ms_count = svd->vdev_ms_count;
@ -1655,7 +1654,6 @@ vdev_metaslab_fini(vdev_t *vd)
}
}
ASSERT0(vd->vdev_ms_count);
ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
}
typedef struct vdev_probe_stats {

View File

@ -839,7 +839,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
uint64_t zp_gen;
ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);
/*
@ -889,6 +888,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
}
ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */
ASSERT3P(zio, !=, NULL);
/*
* Have to lock the whole block to ensure when it's
* written out and its checksum is being calculated
@ -917,8 +917,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
}
#endif
if (error == 0)
error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
error = dmu_buf_hold_noread(os, object, offset, zgd,
&db);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;
@ -1028,6 +1028,10 @@ zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
*
* On success, the function returns the number of bytes copied in *lenp.
* Note that it does not return how many bytes are left to be copied.
* On errors caused by file system or BRT limitations, `EINVAL` is
* returned. In most cases this means the caller passed bad parameters:
* cloning the file may still be possible, but the given parameters do
* not match the requirements.
*/
int
zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
@ -1171,7 +1175,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
* We cannot clone into files with different block size.
*/
if (inblksz != outzp->z_blksz && outzp->z_size > inblksz) {
error = SET_ERROR(EXDEV);
error = SET_ERROR(EINVAL);
goto unlock;
}
@ -1179,7 +1183,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
* Offsets and len must be at block boundaries.
*/
if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
error = SET_ERROR(EXDEV);
error = SET_ERROR(EINVAL);
goto unlock;
}
/*
@ -1187,7 +1191,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
*/
if ((len % inblksz) != 0 &&
(len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
error = SET_ERROR(EXDEV);
error = SET_ERROR(EINVAL);
goto unlock;
}
@ -1242,13 +1246,11 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
&nbps);
if (error != 0) {
/*
* If we are tyring to clone a block that was created
* in the current transaction group. Return an error,
* so the caller can fallback to just copying the data.
* If we are trying to clone a block that was created
* in the current transaction group, error will be
* EAGAIN here, which we can just return to the caller
* so it can fallback if it likes.
*/
if (error == EAGAIN) {
error = SET_ERROR(EXDEV);
}
break;
}
/*

File diff suppressed because it is too large

View File

@ -3024,11 +3024,6 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
*/
pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
/*
* We didn't allocate this bp, so make sure it doesn't get unmarked.
*/
pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
zio_nowait(zio);
return (pio);
@ -3616,7 +3611,6 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
if (zio->io_flags & ZIO_FLAG_NODATA)
flags |= METASLAB_DONT_THROTTLE;
if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
@ -3776,7 +3770,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
* of, so we just hash the objset ID to pick the allocator to get
* some parallelism.
*/
int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
int flags = METASLAB_ZIL;
int allocator = (uint_t)cityhash4(0, 0, 0,
os->os_dsl_dataset->ds_object) % spa->spa_alloc_count;
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
@ -4472,8 +4466,8 @@ zio_ready(zio_t *zio)
zio_t *pio, *pio_next;
zio_link_t *zl = NULL;
if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
ZIO_WAIT_READY)) {
if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) {
return (NULL);
}
@ -4931,12 +4925,6 @@ zio_done(zio_t *zio)
zfs_ereport_free_checksum(zcr);
}
if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
!BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
!(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
}
/*
* It is the responsibility of the done callback to ensure that this
* particular zio is no longer discoverable for adoption, and as

View File

@ -698,7 +698,6 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
int error;
ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);
zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
@ -717,6 +716,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
DMU_READ_NO_PREFETCH);
} else { /* indirect write */
ASSERT3P(zio, !=, NULL);
/*
* Have to lock the whole block to ensure when it's written out
* and its checksum is being calculated that no one can change
@ -727,8 +727,8 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
offset = P2ALIGN_TYPED(offset, size, uint64_t);
zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
size, RL_READER);
error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd,
&db);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;

View File

@ -36,11 +36,13 @@ tags = ['functional', 'atime']
[tests/functional/block_cloning:Linux]
tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial',
'block_cloning_copyfilerange_fallback',
'block_cloning_ficlone', 'block_cloning_ficlonerange',
'block_cloning_ficlonerange_partial',
'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone',
'block_cloning_disabled_ficlonerange',
'block_cloning_copyfilerange_cross_dataset']
'block_cloning_copyfilerange_cross_dataset',
'block_cloning_copyfilerange_fallback_same_txg']
tags = ['functional', 'block_cloning']
[tests/functional/chattr:Linux]

View File

@ -300,8 +300,12 @@ elif sys.platform.startswith('linux'):
['SKIP', cfr_reason],
'block_cloning/block_cloning_copyfilerange_partial':
['SKIP', cfr_reason],
'block_cloning/block_cloning_copyfilerange_fallback':
['SKIP', cfr_reason],
'block_cloning/block_cloning_copyfilerange_cross_dataset':
['SKIP', cfr_cross_reason],
'block_cloning/block_cloning_copyfilerange_fallback_same_txg':
['SKIP', cfr_cross_reason],
})

View File

@ -212,7 +212,7 @@ main(int argc, char **argv)
int dfd = open(argv[optind+1], O_WRONLY|O_CREAT,
S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
if (sfd < 0) {
if (dfd < 0) {
fprintf(stderr, "open: %s: %s\n",
argv[optind+1], strerror(errno));
close(sfd);

View File

@ -44,6 +44,7 @@
#include <fcntl.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <time.h>
int

View File

@ -441,6 +441,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/block_cloning/cleanup.ksh \
functional/block_cloning/setup.ksh \
functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \
functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \
functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \
functional/block_cloning/block_cloning_copyfilerange.ksh \
functional/block_cloning/block_cloning_copyfilerange_partial.ksh \
functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \

View File

@ -0,0 +1,86 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Klara Inc.
# Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
verify_runnable "global"
if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then
log_unsupported "copy_file_range not available before Linux 4.5"
fi
claim="copy_file_range will fall back to copy when cloning not possible."
log_assert $claim
function cleanup
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
}
log_onexit cleanup
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4
log_must sync_pool $TESTPOOL
log_note "Copying entire file with copy_file_range"
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone)
log_must [ "$blocks" = "1 2 3 4" ]
log_note "Copying within a block with copy_file_range"
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 32768 32768 65536
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone)
log_must [ "$blocks" = "2 3 4" ]
log_note "Copying across a block with copy_file_range"
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 327680 327680 131072
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone)
log_must [ "$blocks" = "2" ]
log_pass $claim

View File

@ -0,0 +1,66 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Klara Inc.
# Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
verify_runnable "global"
if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then
log_unsupported "copy_file_range not available before Linux 4.5"
fi
claim="copy_file_range will fall back to copy when cloning on same txg"
log_assert $claim
typeset timeout=$(get_tunable TXG_TIMEOUT)
function cleanup
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
set_tunable64 TXG_TIMEOUT $timeout
}
log_onexit cleanup
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
log_must set_tunable64 TXG_TIMEOUT 5000
log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone)
log_must [ "$blocks" = "" ]
log_pass $claim