diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 9568d2bbfe..4b9921d47b 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -5342,12 +5343,20 @@ static const char *zdb_ot_extname[] = { #define ZB_TOTAL DN_MAX_LEVELS #define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) +typedef struct zdb_brt_entry { + dva_t zbre_dva; + uint64_t zbre_refcount; + avl_node_t zbre_node; +} zdb_brt_entry_t; + typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_removing_size; uint64_t zcb_checkpoint_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; + uint64_t zcb_clone_asize; + uint64_t zcb_clone_blocks; uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; @@ -5368,6 +5377,8 @@ typedef struct zdb_cb { int zcb_haderrors; spa_t *zcb_spa; uint32_t **zcb_vd_obsolete_counts; + avl_tree_t zcb_brt; + boolean_t zcb_brt_is_active; } zdb_cb_t; /* test if two DVA offsets from same vdev are within the same metaslab */ @@ -5662,6 +5673,45 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); zcb->zcb_asize_total += BP_GET_ASIZE(bp); + if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) { + /* + * Cloned blocks are special. We need to count them, so we can + * later uncount them when reporting leaked space, and we must + * only claim them once. + * + * To do this, we keep our own in-memory BRT. For each block + * we haven't seen before, we look it up in the real BRT and + * if it's there, we note it and its refcount, then proceed as + * normal. If we see the block again, we count it as a clone + * and then give it no further consideration. + */ + zdb_brt_entry_t zbre_search, *zbre; + avl_index_t where; + + zbre_search.zbre_dva = bp->blk_dva[0]; + zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); + if (zbre != NULL) { + zcb->zcb_clone_asize += BP_GET_ASIZE(bp); + zcb->zcb_clone_blocks++; + + zbre->zbre_refcount--; + if (zbre->zbre_refcount == 0) { + avl_remove(&zcb->zcb_brt, zbre); + umem_free(zbre, sizeof (zdb_brt_entry_t)); + } + return; + } + + uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp); + if (crefcnt > 0) { + zbre = umem_zalloc(sizeof (zdb_brt_entry_t), + UMEM_NOFAIL); + zbre->zbre_dva = bp->blk_dva[0]; + zbre->zbre_refcount = crefcnt; + avl_insert(&zcb->zcb_brt, zbre, where); + } + } + if (dump_opt['L']) return; @@ -6664,6 +6714,20 @@ deleted_livelists_dump_mos(spa_t *spa) iterate_deleted_livelists(spa, dump_livelist_cb, NULL); } +static int +zdb_brt_entry_compare(const void *zcn1, const void *zcn2) +{ + const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva; + const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva; + int cmp; + + cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); + if (cmp == 0) + cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)); + + return (cmp); +} + static int dump_block_stats(spa_t *spa) { @@ -6678,6 +6742,13 @@ dump_block_stats(spa_t *spa) zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); + if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { + avl_create(&zcb->zcb_brt, zdb_brt_entry_compare, + sizeof (zdb_brt_entry_t), + offsetof(zdb_brt_entry_t, zbre_node)); + zcb->zcb_brt_is_active = B_TRUE; + } + (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ?
"metadata " : "", @@ -6779,7 +6850,8 @@ dump_block_stats(spa_t *spa) metaslab_class_get_alloc(spa_special_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)) + get_unflushed_alloc_space(spa); - total_found = tzb->zb_asize - zcb->zcb_dedup_asize + + total_found = + tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize + zcb->zcb_removing_size + zcb->zcb_checkpoint_size; if (total_found == total_alloc && !dump_opt['L']) { @@ -6820,6 +6892,9 @@ dump_block_stats(spa_t *spa) "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize, (u_longlong_t)zcb->zcb_dedup_blocks, (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0); + (void) printf("\t%-16s %14llu count: %6llu\n", + "bp cloned:", (u_longlong_t)zcb->zcb_clone_asize, + (u_longlong_t)zcb->zcb_clone_blocks); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); diff --git a/cmd/zed/zed.d/statechange-slot_off.sh b/cmd/zed/zed.d/statechange-slot_off.sh new file mode 100755 index 0000000000..d6f3c94a41 --- /dev/null +++ b/cmd/zed/zed.d/statechange-slot_off.sh @@ -0,0 +1,61 @@ +#!/bin/sh +# +# Turn off the disk's enclosure slot if it becomes FAULTED. +# +# Bad SCSI disks can often "disappear and reappear," causing all sorts of chaos +# as they flip between FAULTED and ONLINE. If +# ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT is set in zed.rc, and the disk gets +# FAULTED, then power down the slot via sysfs: +# +# /sys/class/enclosure///power_status +# +# We assume the user will be responsible for turning the slot back on again. +# +# Note that this script requires that your enclosure be supported by the +# Linux SCSI Enclosure services (SES) driver. The script will do nothing +# if you have no enclosure, or if your enclosure isn't supported. +# +# Exit codes: +# 0: slot successfully powered off +# 1: enclosure not available +# 2: ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT disabled +# 3: vdev was not FAULTED +# 4: The enclosure sysfs path passed from ZFS does not exist +# 5: Enclosure slot didn't actually turn off after we told it to + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +if [ ! -d /sys/class/enclosure ] ; then + # No JBOD enclosure or NVMe slots + exit 1 +fi + +if [ "${ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT}" != "1" ] ; then + exit 2 +fi + +if [ "$ZEVENT_VDEV_STATE_STR" != "FAULTED" ] ; then + exit 3 +fi + +if [ ! -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] ; then + exit 4 +fi + +echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" + +# Wait for sysfs to report that the slot is off. It can take ~400ms on some +# enclosures. +for i in $(seq 1 20) ; do + if [ "$(cat "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status")" = "off" ] ; then + break + fi + sleep 0.1 +done + +if [ "$(cat "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status")" != "off" ] ; then + exit 5 +fi + +zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH" diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc index c55a70c79f..78dc1afc7b 100644 --- a/cmd/zed/zed.d/zed.rc +++ b/cmd/zed/zed.d/zed.rc @@ -142,3 +142,8 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event" # Disabled by default, 1 to enable and 0 to disable. #ZED_SYSLOG_DISPLAY_GUIDS=1 +## +# Power off the drive's slot in the enclosure if it becomes FAULTED. This can +# help silence misbehaving drives. This assumes your drive enclosure fully +# supports slot power control via sysfs.
+#ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT=1 diff --git a/cmd/ztest.c b/cmd/ztest.c index b6b99bfff6..398c519cfc 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -2412,7 +2412,6 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, int error; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); ztest_object_lock(zd, object, RL_READER); @@ -2446,6 +2445,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, DMU_READ_NO_PREFETCH); ASSERT0(error); } else { + ASSERT3P(zio, !=, NULL); size = doi.doi_data_block_size; if (ISP2(size)) { offset = P2ALIGN(offset, size); diff --git a/contrib/dracut/90zfs/zfs-env-bootfs.service.in b/contrib/dracut/90zfs/zfs-env-bootfs.service.in index 7ebab4c1a5..fe362b930b 100644 --- a/contrib/dracut/90zfs/zfs-env-bootfs.service.in +++ b/contrib/dracut/90zfs/zfs-env-bootfs.service.in @@ -12,11 +12,12 @@ ExecStart=/bin/sh -c ' decode_root_args || exit 0; \ [ "$root" = "zfs:AUTO" ] && root="$(@sbindir@/zpool list -H -o bootfs | grep -m1 -vFx -)"; \ rootflags="$(getarg rootflags=)"; \ - case ",$rootflags," in \ - *,zfsutil,*) ;; \ - ,,) rootflags=zfsutil ;; \ - *) rootflags="zfsutil,$rootflags" ;; \ - esac; \ + [ "$(@sbindir@/zfs get -H -o value mountpoint "$root")" = legacy ] || \ + case ",$rootflags," in \ + *,zfsutil,*) ;; \ + ,,) rootflags=zfsutil ;; \ + *) rootflags="zfsutil,$rootflags" ;; \ + esac; \ exec systemctl set-environment BOOTFS="$root" BOOTFSFLAGS="$rootflags"' [Install] diff --git a/include/os/linux/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h index cc9cafa84f..20eeadc46e 100644 --- a/include/os/linux/spl/sys/kmem_cache.h +++ b/include/os/linux/spl/sys/kmem_cache.h @@ -198,6 +198,14 @@ extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) #define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) #define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) +/* + * This is necessary to be compatible with other kernel modules + * or in-tree filesystems that may define kmem_cache_alloc, + * as bcachefs does now.
+ */ +#ifdef kmem_cache_alloc +#undef kmem_cache_alloc +#endif #define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) #define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) #define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc) diff --git a/include/sys/brt.h b/include/sys/brt.h index 0761159e3f..f73df95058 100644 --- a/include/sys/brt.h +++ b/include/sys/brt.h @@ -36,6 +36,7 @@ extern "C" { #endif extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp); +extern uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp); extern uint64_t brt_get_dspace(spa_t *spa); extern uint64_t brt_get_used(spa_t *spa); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 7e57d133c2..615ba8fe74 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -572,11 +572,15 @@ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp); +int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, + const void *tag, dmu_buf_t **dbp); int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags); int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags); +int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, + dmu_buf_t **dbp); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index ce6ae3c665..83ae2b76ba 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -247,8 +247,6 @@ typedef struct dmu_sendstatus { void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); -int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t, - const void *, dmu_buf_t **); #ifdef __cplusplus } diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index fec080139a..0df6e5f81f 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -80,7 +80,6 @@ uint64_t metaslab_largest_allocatable(metaslab_t *); #define METASLAB_ASYNC_ALLOC 0x8 #define METASLAB_DONT_THROTTLE 0x10 #define METASLAB_MUST_RESERVE 0x20 -#define METASLAB_FASTWRITE 0x40 #define METASLAB_ZIL 0x80 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, @@ -96,8 +95,6 @@ void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); -void metaslab_fastwrite_mark(spa_t *, const blkptr_t *); -void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *); void metaslab_stat_init(void); void metaslab_stat_fini(void); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 5beb1b7377..d328068890 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -313,7 +313,7 @@ struct metaslab_group { * Each metaslab maintains a set of in-core trees to track metaslab * operations. The in-core free tree (ms_allocatable) contains the list of * free segments which are eligible for allocation. 
As blocks are - * allocated, the allocated segment are removed from the ms_allocatable and + * allocated, the allocated segments are removed from the ms_allocatable and * added to a per txg allocation tree (ms_allocating). As blocks are * freed, they are added to the free tree (ms_freeing). These trees * allow us to process all allocations and frees in syncing context @@ -366,9 +366,9 @@ struct metaslab_group { struct metaslab { /* * This is the main lock of the metaslab and its purpose is to - * coordinate our allocations and frees [e.g metaslab_block_alloc(), + * coordinate our allocations and frees [e.g., metaslab_block_alloc(), * metaslab_free_concrete(), ..etc] with our various syncing - * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc]. + * procedures [e.g., metaslab_sync(), metaslab_sync_done(), ..etc]. * * The lock is also used during some miscellaneous operations like * using the metaslab's histogram for the metaslab group's histogram diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 5f4e82ad86..ad9dc3aefd 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -266,7 +266,6 @@ struct vdev { metaslab_group_t *vdev_mg; /* metaslab group */ metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ - uint64_t vdev_pending_fastwrite; /* allocated fastwrites */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index 03a409c525..f780ad3d61 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -38,14 +38,22 @@ extern "C" { /* * Possible states for a given lwb structure. * - * An lwb will start out in the "closed" state, and then transition to - * the "opened" state via a call to zil_lwb_write_open(). When - * transitioning from "closed" to "opened" the zilog's "zl_issuer_lock" - * must be held. + * An lwb will start out in the "new" state, and transition to the "opened" + * state via a call to zil_lwb_write_open() on first itx assignment. When + * transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be + * held. * - * After the lwb is "opened", it can transition into the "issued" state - * via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must - * be held when making this transition. + * After the lwb is "opened", it can be assigned a number of itxs and + * transition into the "closed" state via zil_lwb_write_close() when full or + * on timeout. When transitioning from "opened" to "closed" the zilog's + * "zl_issuer_lock" must be held. New lwb allocation also takes "zl_lock" to + * protect the list. + * + * After the lwb is "closed", it can transition into the "ready" state via + * zil_lwb_write_issue(). "zl_lock" must be held when making this transition. + * Since it is done by the same thread, "zl_issuer_lock" is not needed. + * + * When an lwb in the "ready" state receives its block pointer, it can + * transition to "issued". "zl_lock" must be held when making this transition. * * After the lwb's write zio completes, it transitions into the "write * done" state via zil_lwb_write_done(); and then into the "flush done" @@ -62,17 +70,20 @@ extern "C" { * * Additionally, correctness when reading an lwb's state is often * achieved by exploiting the fact that these state transitions occur in - * this specific order; i.e. "closed" to "opened" to "issued" to "done".
+ * this specific order; i.e. "new" to "opened" to "closed" to "ready" to + * "issued" to "write_done" and finally "flush_done". * - * Thus, if an lwb is in the "closed" or "opened" state, holding the + * Thus, if an lwb is in the "new" or "opened" state, holding the * "zl_issuer_lock" will prevent a concurrent thread from transitioning - * that lwb to the "issued" state. Likewise, if an lwb is already in the - * "issued" state, holding the "zl_lock" will prevent a concurrent - * thread from transitioning that lwb to the "write done" state. + * that lwb to the "closed" state. Likewise, if an lwb is already in the + * "ready" state, holding the "zl_lock" will prevent a concurrent thread + * from transitioning that lwb to the "issued" state. */ typedef enum { - LWB_STATE_CLOSED, + LWB_STATE_NEW, LWB_STATE_OPENED, + LWB_STATE_CLOSED, + LWB_STATE_READY, LWB_STATE_ISSUED, LWB_STATE_WRITE_DONE, LWB_STATE_FLUSH_DONE, @@ -91,18 +102,21 @@ typedef enum { typedef struct lwb { zilog_t *lwb_zilog; /* back pointer to log struct */ blkptr_t lwb_blk; /* on disk address of this log blk */ - boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */ + boolean_t lwb_slim; /* log block has slim format */ boolean_t lwb_slog; /* lwb_blk is on SLOG device */ - boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */ + int lwb_error; /* log block allocation error */ + int lwb_nmax; /* max bytes in the buffer */ int lwb_nused; /* # used bytes in buffer */ int lwb_nfilled; /* # filled bytes in buffer */ int lwb_sz; /* size of block and buffer */ lwb_state_t lwb_state; /* the state of this lwb */ char *lwb_buf; /* log write buffer */ + zio_t *lwb_child_zio; /* parent zio for children */ zio_t *lwb_write_zio; /* zio for the lwb buffer */ zio_t *lwb_root_zio; /* root zio for lwb write and flushes */ hrtime_t lwb_issued_timestamp; /* when was the lwb issued? 
*/ uint64_t lwb_issued_txg; /* the txg when the write is issued */ + uint64_t lwb_alloc_txg; /* the txg when lwb_blk is allocated */ uint64_t lwb_max_txg; /* highest txg in this lwb */ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */ diff --git a/include/sys/zio.h b/include/sys/zio.h index f4da80783e..e1f4d5c044 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -222,7 +222,6 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_NOPWRITE (1ULL << 28) #define ZIO_FLAG_REEXECUTED (1ULL << 29) #define ZIO_FLAG_DELEGATED (1ULL << 30) -#define ZIO_FLAG_FASTWRITE (1ULL << 31) #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT) diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index cffe341220..5e74d908de 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -57,7 +57,7 @@ libzfs_la_LIBADD = \ libzutil.la \ libuutil.la -libzfs_la_LIBADD += -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL) +libzfs_la_LIBADD += -lrt -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL) libzfs_la_LDFLAGS = -pthread diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 0e21029c5a..a28ec7745c 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3928,6 +3928,12 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) switch (errno) { + case EALREADY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "removal for this vdev is already in progress.")); + (void) zfs_error(hdl, EZFS_BUSY, errbuf); + break; + case EINVAL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid config; all top-level vdevs must " diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 87a30f54fe..e9bc78aa8d 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -928,6 +928,39 @@ zfs_send_progress(zfs_handle_t *zhp, int fd, uint64_t *bytes_written, return (0); } +static volatile boolean_t send_progress_thread_signal_duetotimer; +static void +send_progress_thread_act(int sig, siginfo_t *info, void *ucontext) +{ + (void) sig, (void) ucontext; + send_progress_thread_signal_duetotimer = info->si_code == SI_TIMER; +} + +struct timer_desirability { + timer_t timer; + boolean_t desired; +}; +static void +timer_delete_cleanup(void *timer) +{ + struct timer_desirability *td = timer; + if (td->desired) + timer_delete(td->timer); +} + +#ifdef SIGINFO +#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO sigaddset(&new, SIGINFO) +#else +#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO +#endif +#define SEND_PROGRESS_THREAD_PARENT_BLOCK(old) { \ + sigset_t new; \ + sigemptyset(&new); \ + sigaddset(&new, SIGUSR1); \ + SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO; \ + pthread_sigmask(SIG_BLOCK, &new, old); \ +} + static void * send_progress_thread(void *arg) { @@ -941,6 +974,26 @@ send_progress_thread(void *arg) struct tm tm; int err; + const struct sigaction signal_action = + {.sa_sigaction = send_progress_thread_act, .sa_flags = SA_SIGINFO}; + struct sigevent timer_cfg = + {.sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGUSR1}; + const struct itimerspec timer_time = + {.it_value = {.tv_sec = 1}, .it_interval = {.tv_sec = 1}}; + struct timer_desirability timer = {}; + + sigaction(SIGUSR1, &signal_action, NULL); +#ifdef SIGINFO + sigaction(SIGINFO, &signal_action, NULL); +#endif + + if ((timer.desired = pa->pa_progress || pa->pa_astitle)) { + if (timer_create(CLOCK_MONOTONIC, &timer_cfg, &timer.timer)) + return ((void 
*)(uintptr_t)errno); + (void) timer_settime(timer.timer, 0, &timer_time, NULL); + } + pthread_cleanup_push(timer_delete_cleanup, &timer); + if (!pa->pa_parsable && pa->pa_progress) { (void) fprintf(stderr, "TIME %s %sSNAPSHOT %s\n", @@ -953,12 +1006,12 @@ send_progress_thread(void *arg) * Print the progress from ZFS_IOC_SEND_PROGRESS every second. */ for (;;) { - (void) sleep(1); + pause(); if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes, &blocks)) != 0) { if (err == EINTR || err == ENOENT) - return ((void *)0); - return ((void *)(uintptr_t)err); + err = 0; + pthread_exit(((void *)(uintptr_t)err)); } (void) time(&t); @@ -991,21 +1044,25 @@ send_progress_thread(void *arg) (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, (u_longlong_t)bytes, zhp->zfs_name); - } else if (pa->pa_progress) { + } else if (pa->pa_progress || + !send_progress_thread_signal_duetotimer) { zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, buf, zhp->zfs_name); } } + pthread_cleanup_pop(B_TRUE); } static boolean_t -send_progress_thread_exit(libzfs_handle_t *hdl, pthread_t ptid) +send_progress_thread_exit( + libzfs_handle_t *hdl, pthread_t ptid, sigset_t *oldmask) { void *status = NULL; (void) pthread_cancel(ptid); (void) pthread_join(ptid, &status); + pthread_sigmask(SIG_SETMASK, oldmask, NULL); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) return (zfs_standard_error(hdl, error, @@ -1199,7 +1256,8 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ - if (sdd->progress || sdd->progressastitle) { + sigset_t oldmask; + { pa.pa_zhp = zhp; pa.pa_fd = sdd->outfd; pa.pa_parsable = sdd->parsable; @@ -1214,13 +1272,13 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) zfs_close(zhp); return (err); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, fromorigin, sdd->outfd, flags, sdd->debugnv); - if ((sdd->progress || sdd->progressastitle) && - send_progress_thread_exit(zhp->zfs_hdl, tid)) + if (send_progress_thread_exit(zhp->zfs_hdl, tid, &oldmask)) return (-1); } @@ -1562,8 +1620,9 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, progress_arg_t pa = { 0 }; int err = 0; pthread_t ptid; + sigset_t oldmask; - if (flags->progress || flags->progressastitle) { + { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; @@ -1577,6 +1636,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = lzc_send_space_resume_redacted(zhp->zfs_name, from, @@ -1584,8 +1644,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, redactbook, fd, &size); *sizep = size; - if ((flags->progress || flags->progressastitle) && - send_progress_thread_exit(zhp->zfs_hdl, ptid)) + if (send_progress_thread_exit(zhp->zfs_hdl, ptid, &oldmask)) return (-1); if (!flags->progress && !flags->parsable) @@ -1876,11 +1935,12 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, if (!flags->dryrun) { progress_arg_t pa = { 0 }; pthread_t tid; + sigset_t oldmask; /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. 
*/ - if (flags->progress || flags->progressastitle) { + { pa.pa_zhp = zhp; pa.pa_fd = outfd; pa.pa_parsable = flags->parsable; @@ -1898,6 +1958,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, zfs_close(zhp); return (error); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd, @@ -1905,8 +1966,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, if (redact_book != NULL) free(redact_book); - if ((flags->progressastitle || flags->progress) && - send_progress_thread_exit(hdl, tid)) { + if (send_progress_thread_exit(hdl, tid, &oldmask)) { zfs_close(zhp); return (-1); } @@ -2691,7 +2751,8 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, * If progress reporting is requested, spawn a new thread to poll * ZFS_IOC_SEND_PROGRESS at a regular interval. */ - if (flags->progress || flags->progressastitle) { + sigset_t oldmask; + { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; @@ -2708,13 +2769,13 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } + SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = lzc_send_redacted(name, from, fd, lzc_flags_from_sendflags(flags), redactbook); - if ((flags->progress || flags->progressastitle) && - send_progress_thread_exit(hdl, ptid)) + if (send_progress_thread_exit(hdl, ptid, &oldmask)) return (-1); if (err == 0 && (flags->props || flags->holds || flags->backup)) { diff --git a/man/man8/zfs-send.8 b/man/man8/zfs-send.8 index 8cc6ae6ad5..ba604bf778 100644 --- a/man/man8/zfs-send.8 +++ b/man/man8/zfs-send.8 @@ -29,7 +29,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd January 12, 2023 +.Dd July 27, 2023 .Dt ZFS-SEND 8 .Os . @@ -297,6 +297,12 @@ This flag can only be used in conjunction with .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. +The same report can be requested by sending +.Dv SIGINFO +or +.Dv SIGUSR1 , +regardless of +.Fl v . .Pp The format of the stream is committed. You will be able to receive your streams on future versions of ZFS. @@ -433,6 +439,12 @@ and the verbose output goes to standard error .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. +The same report can be requested by sending +.Dv SIGINFO +or +.Dv SIGUSR1 , +regardless of +.Fl v . .El .It Xo .Nm zfs @@ -669,6 +681,10 @@ ones on the source, and are ready to be used, while the parent snapshot on the target contains none of the username and password data present on the source, because it was removed by the redacted send operation. . +.Sh SIGNALS +See +.Fl v . +. 
.Sh EXAMPLES .\" These are, respectively, examples 12, 13 from zfs.8 .\" Make sure to update them bidirectionally diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 45cf6fdfc4..c498a13282 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6290,7 +6290,8 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp), ap->a_outoffp, &len, ap->a_outcred); - if (error == EXDEV || error == EOPNOTSUPP) + if (error == EXDEV || error == EAGAIN || error == EINVAL || + error == EOPNOTSUPP) goto bad_locked_fallback; *ap->a_lenp = (size_t)len; out_locked: diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index dca48e1e40..c45a3eb5a4 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -478,17 +478,19 @@ zfsctl_is_snapdir(struct inode *ip) */ static struct inode * zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, - const struct file_operations *fops, const struct inode_operations *ops) + const struct file_operations *fops, const struct inode_operations *ops, + uint64_t creation) { - inode_timespec_t now; struct inode *ip; znode_t *zp; + inode_timespec_t now = {.tv_sec = creation}; ip = new_inode(zfsvfs->z_sb); if (ip == NULL) return (NULL); - now = current_time(ip); + if (!creation) + now = current_time(ip); zp = ITOZ(ip); ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); @@ -552,14 +554,28 @@ zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id, const struct file_operations *fops, const struct inode_operations *ops) { struct inode *ip = NULL; + uint64_t creation = 0; + dsl_dataset_t *snap_ds; + dsl_pool_t *pool; while (ip == NULL) { ip = ilookup(zfsvfs->z_sb, (unsigned long)id); if (ip) break; + if (id <= ZFSCTL_INO_SNAPDIRS && !creation) { + pool = dmu_objset_pool(zfsvfs->z_os); + dsl_pool_config_enter(pool, FTAG); + if (!dsl_dataset_hold_obj(pool, + ZFSCTL_INO_SNAPDIRS - id, FTAG, &snap_ds)) { + creation = dsl_get_creation(snap_ds); + dsl_dataset_rele(snap_ds, FTAG); + } + dsl_pool_config_exit(pool, FTAG); + } + /* May fail due to concurrent zfsctl_inode_alloc() */ - ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops); + ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops, creation); } return (ip); @@ -581,7 +597,7 @@ zfsctl_create(zfsvfs_t *zfsvfs) ASSERT(zfsvfs->z_ctldir == NULL); zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT, - &zpl_fops_root, &zpl_ops_root); + &zpl_fops_root, &zpl_ops_root, 0); if (zfsvfs->z_ctldir == NULL) return (SET_ERROR(ENOENT)); diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 18efebfc1d..2abbf44df5 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -103,9 +103,17 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * Since Linux 5.3 the filesystem driver is responsible for executing * an appropriate fallback, and a generic fallback function is provided. */ - if (ret == -EOPNOTSUPP || ret == -EXDEV) + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV || + ret == -EAGAIN) ret = generic_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); +#else + /* + * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal + * to the kernel that it should fallback to a content copy. 
+ */ + if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN) + ret = -EOPNOTSUPP; #endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ return (ret); diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 877b503a1b..ddd8eefe60 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -174,7 +174,7 @@ * size_t len, unsigned int flags); * * Even though offsets and length represent bytes, they have to be - * block-aligned or we will return the EXDEV error so the upper layer can + * block-aligned or we will return an error so the upper layer can * fallback to the generic mechanism that will just copy the data. * Using copy_file_range(2) will call OS-independent zfs_clone_range() function. * This function was implemented based on zfs_write(), but instead of writing @@ -192,9 +192,9 @@ * Some special cases to consider and how we address them: * - The block we want to clone may have been created within the same * transaction group that we are trying to clone. Such block has no BP - * allocated yet, so cannot be immediately cloned. We return EXDEV. + * allocated yet, so cannot be immediately cloned. We return EAGAIN. * - The block we want to clone may have been modified within the same - * transaction group. We return EXDEV. + * transaction group. We return EAGAIN. * - A block may be cloned multiple times during one transaction group (that's * why pending list is actually a tree and not an append-only list - this * way we can figure out faster if this block is cloned for the first time @@ -1544,6 +1544,37 @@ out: return (B_FALSE); } +uint64_t +brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) +{ + brt_t *brt = spa->spa_brt; + brt_vdev_t *brtvd; + brt_entry_t bre_search, *bre; + uint64_t vdevid, refcnt; + int error; + + brt_entry_fill(bp, &bre_search, &vdevid); + + brt_rlock(brt); + + brtvd = brt_vdev(brt, vdevid); + ASSERT(brtvd != NULL); + + bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); + if (bre == NULL) { + error = brt_entry_lookup(brt, brtvd, &bre_search); + ASSERT(error == 0 || error == ENOENT); + if (error == ENOENT) + refcnt = 0; + else + refcnt = bre_search.bre_refcount; + } else + refcnt = bre->bre_refcount; + + brt_unlock(brt); + return (refcnt); +} + static void brt_prefetch(brt_t *brt, const blkptr_t *bp) { diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 3a4560cec2..ddb29020b0 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -165,7 +165,7 @@ dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { zfs_acl_byteswap, "acl" } }; -static int +int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp) { @@ -185,6 +185,7 @@ dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, *dbp = &db->db; return (0); } + int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp) @@ -1653,10 +1654,22 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, { dmu_sync_arg_t *dsa; dmu_tx_t *tx; + int error; + + error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL, + DB_RF_CANFAIL | DB_RF_NOPREFETCH); + if (error != 0) + return (error); tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); - if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + /* + * This transaction does not produce any dirty data or log blocks, so + * it should not be throttled. All other cases wait for TXG sync, by + * which time the log block we are writing will be obsolete, so we can + * skip waiting and just return error here instead. 
+ */ + if (dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 9991e1a22c..20dc934593 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1292,7 +1292,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, /* * If this metaslab group is below its qmax or it's - * the only allocatable metasable group, then attempt + * the only allocatable metaslab group, then attempt * to allocate from it. */ if (qdepth < qmax || mc->mc_alloc_groups == 1) @@ -5101,7 +5101,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, zio_alloc_list_t *zal, int allocator) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; - metaslab_group_t *mg, *fast_mg, *rotor; + metaslab_group_t *mg, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; @@ -5164,15 +5164,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; - } else if (flags & METASLAB_FASTWRITE) { - mg = fast_mg = mca->mca_rotor; - - do { - if (fast_mg->mg_vd->vdev_pending_fastwrite < - mg->mg_vd->vdev_pending_fastwrite) - mg = fast_mg; - } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor); - } else { ASSERT(mca->mca_rotor != NULL); mg = mca->mca_rotor; @@ -5297,7 +5288,7 @@ top: mg->mg_bias = 0; } - if ((flags & METASLAB_FASTWRITE) || + if ((flags & METASLAB_ZIL) || atomic_add_64_nv(&mca->mca_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mca->mca_rotor = mg->mg_next; @@ -5310,11 +5301,6 @@ top: ((flags & METASLAB_GANG_HEADER) ? 1 : 0)); DVA_SET_ASIZE(&dva[d], asize); - if (flags & METASLAB_FASTWRITE) { - atomic_add_64(&vd->vdev_pending_fastwrite, - psize); - } - return (0); } next: @@ -5950,55 +5936,6 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) return (error); } -void -metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - uint64_t psize = BP_GET_PSIZE(bp); - int d; - vdev_t *vd; - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(psize > 0); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - for (d = 0; d < ndvas; d++) { - if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) - continue; - atomic_add_64(&vd->vdev_pending_fastwrite, psize); - } - - spa_config_exit(spa, SCL_VDEV, FTAG); -} - -void -metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - uint64_t psize = BP_GET_PSIZE(bp); - int d; - vdev_t *vd; - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(psize > 0); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - for (d = 0; d < ndvas; d++) { - if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) - continue; - ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); - atomic_sub_64(&vd->vdev_pending_fastwrite, psize); - } - - spa_config_exit(spa, SCL_VDEV, FTAG); -} - static void metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index f3812b843e..87c1455932 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1192,7 +1192,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) ASSERT(tvd == tvd->vdev_top); - tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; 
tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; @@ -1655,7 +1654,6 @@ vdev_metaslab_fini(vdev_t *vd) } } ASSERT0(vd->vdev_ms_count); - ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } typedef struct vdev_probe_stats { diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 54ea43363b..f8d13075d5 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -839,7 +839,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, uint64_t zp_gen; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* @@ -889,6 +888,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ + ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated @@ -917,8 +917,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, } #endif if (error == 0) - error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); + error = dmu_buf_hold_noread(os, object, offset, zgd, + &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; @@ -1028,6 +1028,10 @@ zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) * * On success, the function return the number of bytes copied in *lenp. * Note, it doesn't return how much bytes are left to be copied. + * On errors which are caused by any file system or BRT limitation, + * `EINVAL` is returned. In most cases a user requested bad parameters: + * it could be possible to clone the file, but some parameters don't + * match the requirements. */ int zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, @@ -1171,7 +1175,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, * We cannot clone into files with different block size. */ if (inblksz != outzp->z_blksz && outzp->z_size > inblksz) { - error = SET_ERROR(EXDEV); + error = SET_ERROR(EINVAL); goto unlock; } @@ -1179,7 +1183,7 @@ * Offsets and len must be at block boundries. */ if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) { - error = SET_ERROR(EXDEV); + error = SET_ERROR(EINVAL); goto unlock; } /* @@ -1187,7 +1191,7 @@ */ if ((len % inblksz) != 0 && (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) { - error = SET_ERROR(EXDEV); + error = SET_ERROR(EINVAL); goto unlock; } @@ -1242,13 +1246,11 @@ &nbps); if (error != 0) { /* - * If we are tyring to clone a block that was created - * in the current transaction group. Return an error, - * so the caller can fallback to just copying the data. + * If we are trying to clone a block that was created + * in the current transaction group, the error will be + * EAGAIN here, which we can just return to the caller + * so it can fall back if it likes.
*/ - if (error == EAGAIN) { - error = SET_ERROR(EXDEV); - } break; } /* diff --git a/module/zfs/zil.c b/module/zfs/zil.c index be5b9edf6e..f2d279e36a 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -151,7 +151,6 @@ static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); -static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb); static itx_t *zil_itx_clone(itx_t *oitx); static int @@ -290,7 +289,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, char *lr = (char *)(zilc + 1); if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + sizeof (cksum)) || zilc->zc_nused < sizeof (*zilc) || zilc->zc_nused > size) { error = SET_ERROR(ECKSUM); @@ -304,7 +303,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + sizeof (cksum)) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { @@ -760,35 +759,52 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) return (TREE_CMP(v1, v2)); } +/* + * Allocate a new lwb. We may already have a block pointer for it, in which + * case we get size and version from there. Or we may not yet, in which case + * we choose them here and later make the block allocation match. + */ static lwb_t * -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg, - boolean_t fastwrite) +zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, + uint64_t txg, lwb_state_t state) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; - lwb->lwb_blk = *bp; - lwb->lwb_fastwrite = fastwrite; - lwb->lwb_slog = slog; - lwb->lwb_indirect = B_FALSE; - if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); - lwb->lwb_sz = BP_GET_LSIZE(bp); + if (bp) { + lwb->lwb_blk = *bp; + lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2); + sz = BP_GET_LSIZE(bp); } else { - lwb->lwb_nused = lwb->lwb_nfilled = 0; - lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); + BP_ZERO(&lwb->lwb_blk); + lwb->lwb_slim = (spa_version(zilog->zl_spa) >= + SPA_VERSION_SLIM_ZIL); } - lwb->lwb_state = LWB_STATE_CLOSED; - lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); + lwb->lwb_slog = slog; + lwb->lwb_error = 0; + if (lwb->lwb_slim) { + lwb->lwb_nmax = sz; + lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); + } else { + lwb->lwb_nmax = sz - sizeof (zil_chain_t); + lwb->lwb_nused = lwb->lwb_nfilled = 0; + } + lwb->lwb_sz = sz; + lwb->lwb_state = state; + lwb->lwb_buf = zio_buf_alloc(sz); + lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; lwb->lwb_issued_timestamp = 0; lwb->lwb_issued_txg = 0; - lwb->lwb_max_txg = txg; + lwb->lwb_alloc_txg = txg; + lwb->lwb_max_txg = 0; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); + if (state != LWB_STATE_NEW) + zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); return (lwb); @@ -802,10 +818,12 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb) VERIFY(list_is_empty(&lwb->lwb_waiters)); VERIFY(list_is_empty(&lwb->lwb_itxs)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT3P(lwb->lwb_child_zio, ==, NULL); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); + ASSERT3U(lwb->lwb_alloc_txg, 
<=, spa_syncing_txg(zilog->zl_spa)); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); - ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || + ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_FLUSH_DONE); /* @@ -916,7 +934,6 @@ zil_create(zilog_t *zilog) dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; - boolean_t fastwrite = FALSE; boolean_t slog = FALSE; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); @@ -949,8 +966,6 @@ zil_create(zilog_t *zilog) error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, ZIL_MIN_BLKSZ, &slog); - fastwrite = TRUE; - if (error == 0) zil_init_log_chain(zilog, &blk); } @@ -959,7 +974,7 @@ zil_create(zilog_t *zilog) * Allocate a log write block (lwb) for the first log block. */ if (error == 0) - lwb = zil_alloc_lwb(zilog, &blk, slog, txg, fastwrite); + lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW); /* * If we just allocated the first log block, commit our transaction @@ -1044,12 +1059,10 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { - if (lwb->lwb_fastwrite) - metaslab_fastwrite_unmark(zilog->zl_spa, - &lwb->lwb_blk); if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); + if (!BP_IS_HOLE(&lwb->lwb_blk)) + zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); } } else if (!keep_first) { @@ -1277,21 +1290,21 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { /* * The lwb_waiters field of the lwb is protected by the zilog's - * zl_lock, thus it must be held when calling this function. + * zl_issuer_lock while the lwb is open and zl_lock otherwise. + * zl_issuer_lock also protects leaving the open state. + * zcw_lwb setting is protected by zl_issuer_lock and state != + * flush_done, which transition is protected by zl_lock. 
*/ - ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock)); + IMPLY(lwb->lwb_state != LWB_STATE_OPENED, + MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); - ASSERT3P(lwb, !=, NULL); - ASSERT(lwb->lwb_state == LWB_STATE_OPENED || - lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE); - list_insert_tail(&lwb->lwb_waiters, zcw); + ASSERT3P(zcw->zcw_lwb, ==, NULL); zcw->zcw_lwb = lwb; - mutex_exit(&zcw->zcw_lock); } /* @@ -1302,11 +1315,9 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) static void zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) { - mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); list_insert_tail(nolwb, zcw); - mutex_exit(&zcw->zcw_lock); + ASSERT3P(zcw->zcw_lwb, ==, NULL); } void @@ -1492,7 +1503,7 @@ zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg) mutex_enter(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); lwb_t *lwb = list_head(&zilog->zl_lwb_list); - while (lwb != NULL && lwb->lwb_max_txg <= txg) { + while (lwb != NULL) { if (lwb->lwb_issued_txg <= txg) { ASSERT(lwb->lwb_state != LWB_STATE_ISSUED); ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE); @@ -1535,14 +1546,6 @@ zil_lwb_write_done(zio_t *zio) ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); - ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); - ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); - ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); - ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); - ASSERT(!BP_IS_GANG(zio->io_bp)); - ASSERT(!BP_IS_HOLE(zio->io_bp)); - ASSERT(BP_GET_FILL(zio->io_bp) == 0); - abd_free(zio->io_abd); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); lwb->lwb_buf = NULL; @@ -1550,8 +1553,8 @@ zil_lwb_write_done(zio_t *zio) mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); lwb->lwb_state = LWB_STATE_WRITE_DONE; + lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; - lwb->lwb_fastwrite = FALSE; nlwb = list_next(&zilog->zl_lwb_list, lwb); mutex_exit(&zilog->zl_lock); @@ -1615,124 +1618,75 @@ zil_lwb_write_done(zio_t *zio) } } +/* + * Build the zio dependency chain, which is used to preserve the ordering of + * lwb completions that is required by the semantics of the ZIL. Each new lwb + * zio becomes a parent of the previous lwb zio, such that the new lwb's zio + * cannot complete until the previous lwb's zio completes. + * + * This is required by the semantics of zil_commit(): the commit waiters + * attached to the lwbs will be woken in the lwb zio's completion callback, + * so this zio dependency graph ensures the waiters are woken in the correct + * order (the same order the lwbs were created). + */ static void zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) { - lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zilog->zl_lock)); + lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb); + if (prev_lwb == NULL || + prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE) + return; + /* - * The zilog's "zl_last_lwb_opened" field is used to build the - * lwb/zio dependency chain, which is used to preserve the - * ordering of lwb completions that is required by the semantics - * of the ZIL. 
Each new lwb zio becomes a parent of the - * "previous" lwb zio, such that the new lwb's zio cannot - * complete until the "previous" lwb's zio completes. + * If the previous lwb's write hasn't already completed, we also want + * to order the completion of the lwb write zios (above, we only order + * the completion of the lwb root zios). This is required because of + * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb. * - * This is required by the semantics of zil_commit(); the commit - * waiters attached to the lwbs will be woken in the lwb zio's - * completion callback, so this zio dependency graph ensures the - * waiters are woken in the correct order (the same order the - * lwbs were created). + * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous + * lwb will rely on this lwb to flush the vdevs written to by that + * previous lwb. Thus, we need to ensure this lwb doesn't issue the + * flush until after the previous lwb's write completes. We ensure + * this ordering by setting the zio parent/child relationship here. + * + * Without this relationship on the lwb's write zio, it's possible + * for this lwb's write to complete prior to the previous lwb's write + * completing; and thus, the vdevs for the previous lwb would be + * flushed prior to that lwb's data being written to those vdevs (the + * vdevs are flushed in the lwb write zio's completion handler, + * zil_lwb_write_done()). */ - if (last_lwb_opened != NULL && - last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED || - last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE); - - ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL); - zio_add_child(lwb->lwb_root_zio, - last_lwb_opened->lwb_root_zio); - - /* - * If the previous lwb's write hasn't already completed, - * we also want to order the completion of the lwb write - * zios (above, we only order the completion of the lwb - * root zios). This is required because of how we can - * defer the DKIOCFLUSHWRITECACHE commands for each lwb. - * - * When the DKIOCFLUSHWRITECACHE commands are deferred, - * the previous lwb will rely on this lwb to flush the - * vdevs written to by that previous lwb. Thus, we need - * to ensure this lwb doesn't issue the flush until - * after the previous lwb's write completes. We ensure - * this ordering by setting the zio parent/child - * relationship here. - * - * Without this relationship on the lwb's write zio, - * it's possible for this lwb's write to complete prior - * to the previous lwb's write completing; and thus, the - * vdevs for the previous lwb would be flushed prior to - * that lwb's data being written to those vdevs (the - * vdevs are flushed in the lwb write zio's completion - * handler, zil_lwb_write_done()). 
- */ - if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED); - - ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL); - zio_add_child(lwb->lwb_write_zio, - last_lwb_opened->lwb_write_zio); - } + if (prev_lwb->lwb_state == LWB_STATE_ISSUED) { + ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL); + zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio); + } else { + ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); } + + ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL); + zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio); } /* * This function's purpose is to "open" an lwb such that it is ready to - * accept new itxs being committed to it. To do this, the lwb's zio - * structures are created, and linked to the lwb. This function is - * idempotent; if the passed in lwb has already been opened, this - * function is essentially a no-op. + * accept new itxs being committed to it. This function is idempotent; if + * the passed in lwb has already been opened, it is essentially a no-op. */ static void zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) { - zbookmark_phys_t zb; - zio_priority_t prio; - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb, !=, NULL); - EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); - EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); - if (lwb->lwb_root_zio != NULL) + if (lwb->lwb_state != LWB_STATE_NEW) { + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); return; - - lwb->lwb_root_zio = zio_root(zilog->zl_spa, - zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); - - abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, - BP_GET_LSIZE(&lwb->lwb_blk)); - - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) - prio = ZIO_PRIORITY_SYNC_WRITE; - else - prio = ZIO_PRIORITY_ASYNC_WRITE; - - SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], - ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, - lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); - - /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ - mutex_enter(&zilog->zl_lock); - if (!lwb->lwb_fastwrite) { - metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 1; } - lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0, - &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), - zil_lwb_write_done, lwb, prio, - ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); - + mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_OPENED; - - zil_lwb_set_zio_dependency(zilog, lwb); zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); } @@ -1769,57 +1723,21 @@ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * -zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs) +zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) { - lwb_t *nlwb = NULL; - zil_chain_t *zilc; - spa_t *spa = zilog->zl_spa; - blkptr_t *bp; - dmu_tx_t *tx; - uint64_t txg; - uint64_t zil_blksz; - int i, error; - boolean_t slog; + int i; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + lwb->lwb_state = LWB_STATE_CLOSED; /* - * If this lwb includes indirect writes, we have to commit before - * creating the transaction, otherwise we may end up in dead lock. 
+ * If there was an allocation failure then returned NULL will trigger + * zil_commit_writer_stall() at the caller. This is inherently racy, + * since allocation may not have happened yet. */ - if (lwb->lwb_indirect) { - for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; - itx = list_next(&lwb->lwb_itxs, itx)) - zil_lwb_commit(zilog, lwb, itx); - lwb->lwb_nused = lwb->lwb_nfilled; - } - - /* - * Allocate the next block and save its address in this block - * before writing it in order to establish the log chain. - */ - - tx = dmu_tx_create(zilog->zl_os); - - /* - * Since we are not going to create any new dirty data, and we - * can even help with clearing the existing dirty data, we - * should not be subject to the dirty data based delays. We - * use TXG_NOTHROTTLE to bypass the delay mechanism. - */ - VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); - - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - - mutex_enter(&zilog->zl_lwb_io_lock); - lwb->lwb_issued_txg = txg; - zilog->zl_lwb_inflight[txg & TXG_MASK]++; - zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg); - mutex_exit(&zilog->zl_lwb_io_lock); + if (lwb->lwb_error != 0) + return (NULL); /* * Log blocks are pre-allocated. Here we select the size of the next @@ -1837,7 +1755,7 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs) * the maximum block size because we can exhaust the available * pool log space. */ - zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); + uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) continue; zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); @@ -1849,94 +1767,149 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs) uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) - zilc = (zil_chain_t *)lwb->lwb_buf; - else - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - bp = &zilc->zc_next_blk; - BP_ZERO(bp); - error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog); - if (error == 0) { - ASSERT3U(bp->blk_birth, ==, txg); - bp->blk_cksum = lwb->lwb_blk.blk_cksum; - bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; - - /* - * Allocate a new log write block (lwb). - */ - nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE); - } - - lwb->lwb_state = LWB_STATE_ISSUED; - - dmu_tx_commit(tx); - - /* - * We need to acquire the config lock for the lwb to issue it later. - * However, if we already have a queue of closed parent lwbs already - * holding the config lock (but not yet issued), we can't block here - * waiting on the lock or we will deadlock. In that case we must - * first issue to parent IOs before waiting on the lock. - */ - if (ilwbs && !list_is_empty(ilwbs)) { - if (!spa_config_tryenter(spa, SCL_STATE, lwb, RW_READER)) { - lwb_t *tlwb; - while ((tlwb = list_remove_head(ilwbs)) != NULL) - zil_lwb_write_issue(zilog, tlwb); - spa_config_enter(spa, SCL_STATE, lwb, RW_READER); - } - } else { - spa_config_enter(spa, SCL_STATE, lwb, RW_READER); - } - - if (ilwbs) - list_insert_tail(ilwbs, lwb); - - /* - * If there was an allocation failure then nlwb will be null which - * forces a txg_wait_synced(). - */ - return (nlwb); + return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state)); } /* * Finalize previously closed block and issue the write zio. - * Does not require locking. 
*/ static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) { + spa_t *spa = zilog->zl_spa; zil_chain_t *zilc; - int wsz; + boolean_t slog; + zbookmark_phys_t zb; + zio_priority_t prio; + int error; - /* Actually fill the lwb with the data if not yet. */ - if (!lwb->lwb_indirect) { - for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; - itx = list_next(&lwb->lwb_itxs, itx)) - zil_lwb_commit(zilog, lwb, itx); - lwb->lwb_nused = lwb->lwb_nfilled; - } + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - /* For Slim ZIL only write what is used. */ - wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); - ASSERT3S(wsz, <=, lwb->lwb_sz); - zio_shrink(lwb->lwb_write_zio, wsz); - wsz = lwb->lwb_write_zio->io_size; + /* Actually fill the lwb with the data. */ + for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; + itx = list_next(&lwb->lwb_itxs, itx)) + zil_lwb_commit(zilog, lwb, itx); + lwb->lwb_nused = lwb->lwb_nfilled; - zilc = (zil_chain_t *)lwb->lwb_buf; - } else { - wsz = lwb->lwb_sz; - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - } - zilc->zc_pad = 0; - zilc->zc_nused = lwb->lwb_nused; - zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; + lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb, + ZIO_FLAG_CANFAIL); /* - * clear unused data for security + * The lwb is now ready to be issued, but it can be issued only if it + * already got its block pointer allocated or the allocation has failed. + * Otherwise leave it as-is, relying on some other thread to issue it + * after allocating its block pointer by calling zil_lwb_write_issue() + * for the previous lwb(s) in the chain. */ - memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); + mutex_enter(&zilog->zl_lock); + lwb->lwb_state = LWB_STATE_READY; + if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) { + mutex_exit(&zilog->zl_lock); + return; + } + mutex_exit(&zilog->zl_lock); + +next_lwb: + if (lwb->lwb_slim) + zilc = (zil_chain_t *)lwb->lwb_buf; + else + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax); + int wsz = lwb->lwb_sz; + if (lwb->lwb_error == 0) { + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); + if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + prio = ZIO_PRIORITY_SYNC_WRITE; + else + prio = ZIO_PRIORITY_ASYNC_WRITE; + SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, + lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); + lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, + &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, + lwb, prio, ZIO_FLAG_CANFAIL, &zb); + zil_lwb_add_block(lwb, &lwb->lwb_blk); + + if (lwb->lwb_slim) { + /* For Slim ZIL only write what is used. */ + wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, + int); + ASSERT3S(wsz, <=, lwb->lwb_sz); + zio_shrink(lwb->lwb_write_zio, wsz); + wsz = lwb->lwb_write_zio->io_size; + } + memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); + zilc->zc_pad = 0; + zilc->zc_nused = lwb->lwb_nused; + zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; + } else { + /* + * We can't write the lwb if there was an allocation failure, + * so create a null zio instead just to maintain dependencies.
+ */ + lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL, + zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL); + lwb->lwb_write_zio->io_error = lwb->lwb_error; + } + if (lwb->lwb_child_zio) + zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio); + + /* + * Open a transaction to allocate the next block pointer. + */ + dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + uint64_t txg = dmu_tx_get_txg(tx); + + /* + * Allocate the next block pointer unless we are already in error. + */ + lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb); + blkptr_t *bp = &zilc->zc_next_blk; + BP_ZERO(bp); + error = lwb->lwb_error; + if (error == 0) { + error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz, + &slog); + } + if (error == 0) { + ASSERT3U(bp->blk_birth, ==, txg); + BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : + ZIO_CHECKSUM_ZILOG); + bp->blk_cksum = lwb->lwb_blk.blk_cksum; + bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; + } + + /* + * Reduce TXG open time by incrementing the inflight counter and + * committing the transaction. zil_sync() will wait for it to return to zero. + */ + mutex_enter(&zilog->zl_lwb_io_lock); + lwb->lwb_issued_txg = txg; + zilog->zl_lwb_inflight[txg & TXG_MASK]++; + zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg); + mutex_exit(&zilog->zl_lwb_io_lock); + dmu_tx_commit(tx); + + spa_config_enter(spa, SCL_STATE, lwb, RW_READER); + + /* + * We've completed all potentially blocking operations. Update the + * nlwb and allow it to proceed without possible lock-order reversals. + */ + mutex_enter(&zilog->zl_lock); + zil_lwb_set_zio_dependency(zilog, lwb); + lwb->lwb_state = LWB_STATE_ISSUED; + + if (nlwb) { + nlwb->lwb_blk = *bp; + nlwb->lwb_error = error; + nlwb->lwb_slog = slog; + nlwb->lwb_alloc_txg = txg; + if (nlwb->lwb_state != LWB_STATE_READY) + nlwb = NULL; + } + mutex_exit(&zilog->zl_lock); if (lwb->lwb_slog) { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); @@ -1955,11 +1928,19 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } - ASSERT(spa_config_held(zilog->zl_spa, SCL_STATE, RW_READER)); - zil_lwb_add_block(lwb, &lwb->lwb_blk); lwb->lwb_issued_timestamp = gethrtime(); zio_nowait(lwb->lwb_root_zio); zio_nowait(lwb->lwb_write_zio); + if (lwb->lwb_child_zio) + zio_nowait(lwb->lwb_child_zio); + + /* + * If nlwb was ready when we gave it the block pointer, + * it is on us to issue it, and possibly the following ones. + */ + lwb = nlwb; + if (lwb) + goto next_lwb; } /* @@ -2031,10 +2012,7 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) * For more details, see the comment above zil_commit(). */ if (lr->lrc_txtype == TX_COMMIT) { - mutex_enter(&zilog->zl_lock); zil_commit_waiter_link_lwb(itx->itx_private, lwb); - itx->itx_private = NULL; - mutex_exit(&zilog->zl_lock); list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } @@ -2053,17 +2031,17 @@ cont: * If this record won't fit in the current log block, start a new one. * For WR_NEED_COPY optimize layout for minimal number of chunks.
*/ - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t)); if (reclen > lwb_sp || (reclen + dlen > lwb_sp && lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { - lwb = zil_lwb_write_close(zilog, lwb, ilwbs); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED); if (lwb == NULL) return (NULL); - zil_lwb_write_open(zilog, lwb); - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; /* * There must be enough space in the new, empty log block to @@ -2101,7 +2079,7 @@ cont: clr->lrc_seq = ++zilog->zl_lr_seq; lwb->lwb_nused += reclen + dnow; - ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); zil_lwb_add_txg(lwb, lr->lrc_txg); @@ -2113,22 +2091,9 @@ cont: goto cont; } - /* - * We have to really issue all queued LWBs before we may have to - * wait for a txg sync. Otherwise we may end up in a dead lock. - */ - if (lr->lrc_txtype == TX_WRITE) { - boolean_t frozen = lr->lrc_txg > spa_freeze_txg(zilog->zl_spa); - if (frozen || itx->itx_wr_state == WR_INDIRECT) { - lwb_t *tlwb; - while ((tlwb = list_remove_head(ilwbs)) != NULL) - zil_lwb_write_issue(zilog, tlwb); - } - if (itx->itx_wr_state == WR_INDIRECT) - lwb->lwb_indirect = B_TRUE; - if (frozen) - txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); - } + if (lr->lrc_txtype == TX_WRITE && + lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) + txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); return (lwb); } @@ -2191,26 +2156,24 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) ZIL_STAT_BUMP(zilog, zil_itx_indirect_count); ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes, lrw->lr_length); + if (lwb->lwb_child_zio == NULL) { + lwb->lwb_child_zio = zio_root( + zilog->zl_spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + } } /* - * We pass in the "lwb_write_zio" rather than - * "lwb_root_zio" so that the "lwb_write_zio" - * becomes the parent of any zio's created by - * the "zl_get_data" callback. The vdevs are - * flushed after the "lwb_write_zio" completes, - * so we want to make sure that completion - * callback waits for these additional zio's, - * such that the vdevs used by those zio's will - * be included in the lwb's vdev tree, and those - * vdevs will be properly flushed. If we passed - * in "lwb_root_zio" here, then these additional - * vdevs may not be flushed; e.g. if these zio's - * completed after "lwb_write_zio" completed. + * The "lwb_child_zio" we pass in will become a child of + * "lwb_write_zio" when one is created, and so will be the + * parent of any zio's created by the "zl_get_data" callback. + * This way "lwb_write_zio" will first wait for the children's + * block pointers before its own write, and then for their + * write completion before the vdev cache flush. */ error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, - lwb->lwb_write_zio); + lwb->lwb_child_zio); if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block.
*/ memset((char *)dbuf + lrwb->lr_length, 0, @@ -2243,12 +2206,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) error); zfs_fallthrough; case EIO: - if (lwb->lwb_indirect) { - txg_wait_synced(zilog->zl_dmu_pool, - lr->lrc_txg); - } else { - lwb->lwb_write_zio->io_error = error; - } + txg_wait_synced(zilog->zl_dmu_pool, + lr->lrc_txg); zfs_fallthrough; case ENOENT: zfs_fallthrough; @@ -2692,7 +2651,6 @@ zil_prune_commit_list(zilog_t *zilog) zil_commit_waiter_skip(itx->itx_private); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); - itx->itx_private = NULL; } mutex_exit(&zilog->zl_lock); @@ -2770,10 +2728,9 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * have already been created (zl_lwb_list not empty). */ zil_commit_activate_saxattr_feature(zilog); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - first = (lwb->lwb_state != LWB_STATE_OPENED) && + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_OPENED); + first = (lwb->lwb_state == LWB_STATE_NEW) && ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL || plwb->lwb_state == LWB_STATE_FLUSH_DONE); } @@ -2897,37 +2854,32 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_OPENED); /* * At this point, the ZIL block pointed at by the "lwb" - * variable is in one of the following states: "closed" - * or "open". + * variable is in the "new" or "opened" state. * - * If it's "closed", then no itxs have been committed to - * it, so there's no point in issuing its zio (i.e. it's - * "empty"). + * If it's "new", then no itxs have been committed to it, so + * there's no point in issuing its zio (i.e. it's "empty"). * - * If it's "open", then it contains one or more itxs that + * If it's "opened", then it contains one or more itxs that * eventually need to be committed to stable storage. In * this case we intentionally do not issue the lwb's zio * to disk yet, and instead rely on one of the following * two mechanisms for issuing the zio: * - * 1. Ideally, there will be more ZIL activity occurring - * on the system, such that this function will be - * immediately called again (not necessarily by the same - * thread) and this lwb's zio will be issued via - * zil_lwb_assign(). This way, the lwb is guaranteed to - * be "full" when it is issued to disk, and we'll make - * use of the lwb's size the best we can. + * 1. Ideally, there will be more ZIL activity occurring on + * the system, such that this function will be immediately + * called again by a different thread, and this lwb will be + * closed by zil_lwb_assign(). This way, the lwb will be + * "full" when it is issued to disk, and we'll make use of + * the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occurring on - * the system, such that this lwb's zio isn't issued via - * zil_lwb_assign(), zil_commit_waiter() will issue the - * lwb's zio. If this occurs, the lwb is not guaranteed + * the system, zil_commit_waiter() will close it and issue + * the zio.
If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, which means * the size of the lwb was "too large" given the amount * of ZIL activity occurring on the system at that time. @@ -2957,8 +2909,11 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) hrtime_t sleep = zilog->zl_last_lwb_latency * zfs_commit_timeout_pct / 100; if (sleep < zil_min_commit_timeout || - lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) { - lwb = zil_lwb_write_close(zilog, lwb, ilwbs); + lwb->lwb_nmax - lwb->lwb_nused < + lwb->lwb_nmax / 8) { + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, + LWB_STATE_NEW); zilog->zl_cur_used = 0; if (lwb == NULL) { while ((lwb = list_remove_head(ilwbs)) @@ -3041,7 +2996,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) lwb_t *lwb = zcw->zcw_lwb; ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); /* * If the lwb has already been issued by another thread, we can @@ -3050,9 +3005,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * do this prior to acquiring the zl_issuer_lock, to avoid * acquiring it when it's not necessary to do so. */ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) + if (lwb->lwb_state != LWB_STATE_OPENED) return; /* @@ -3075,8 +3028,8 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * wind up with a use-after-free error below. */ if (zcw->zcw_done) { - lwb = NULL; - goto out; + mutex_exit(&zilog->zl_issuer_lock); + return; } ASSERT3P(lwb, ==, zcw->zcw_lwb); @@ -3087,28 +3040,33 @@ ... * second time while holding the lock. * * We don't need to hold the zl_lock since the lwb cannot transition - * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb - * _can_ transition from ISSUED to DONE, but it's OK to race with + * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb + * _can_ transition from CLOSED to DONE, but it's OK to race with * that transition since we treat the lwb the same, whether it's in - * the ISSUED or DONE states. + * the CLOSED, ISSUED or DONE states. * * The important thing is that we treat the lwb differently depending on - * if it's ISSUED or OPENED, and block any other threads that might - * attempt to issue this lwb. For that reason we hold the + * if it's OPENED or CLOSED, and block any other threads that might + * attempt to close/issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call - * zil_lwb_write_close() if the lwb had already been issued. + * zil_lwb_write_close() if the lwb had already been closed/issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. */ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) { - lwb = NULL; - goto out; + if (lwb->lwb_state != LWB_STATE_OPENED) { + mutex_exit(&zilog->zl_issuer_lock); + return; } - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + /* + * We do not need zcw_lock once we hold zl_issuer_lock and know the lwb + * is still open.
But we have to drop it to avoid a deadlock in case the + * callback of a zio issued by zil_lwb_write_issue() tries to get it, + * while zil_lwb_write_issue() is blocked trying to issue the next + * lwb it found in the LWB_STATE_READY state. + */ + mutex_exit(&zcw->zcw_lock); /* * As described in the comments above zil_commit_waiter() and @@ -3116,9 +3074,9 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ - lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, NULL); + lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); /* * Since the lwb's zio hadn't been issued by the time this thread @@ -3141,34 +3099,15 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * "next" lwb on-disk. When this occurs, the ZIL write * pipeline must be stalled; see the comment within the * zil_commit_writer_stall() function for more details. - * - * We must drop the commit waiter's lock prior to - * calling zil_commit_writer_stall() or else we can wind - * up with the following deadlock: - * - * - This thread is waiting for the txg to sync while - * holding the waiter's lock; txg_wait_synced() is - * used within txg_commit_writer_stall(). - * - * - The txg can't sync because it is waiting for this - * lwb's zio callback to call dmu_tx_commit(). - * - * - The lwb's zio callback can't call dmu_tx_commit() - * because it's blocked trying to acquire the waiter's - * lock, which occurs prior to calling dmu_tx_commit() */ - mutex_exit(&zcw->zcw_lock); zil_lwb_write_issue(zilog, lwb); - lwb = NULL; zil_commit_writer_stall(zilog); - mutex_enter(&zcw->zcw_lock); - } - -out: - mutex_exit(&zilog->zl_issuer_lock); - if (lwb) + mutex_exit(&zilog->zl_issuer_lock); + } else { + mutex_exit(&zilog->zl_issuer_lock); zil_lwb_write_issue(zilog, lwb); - ASSERT(MUTEX_HELD(&zcw->zcw_lock)); + } + mutex_enter(&zcw->zcw_lock); } /* @@ -3233,7 +3172,7 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) * where its "zcw_lwb" field is NULL, and it hasn't yet * been skipped, so its "zcw_done" field is still B_FALSE. */ - IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED); + IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW); if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { ASSERT3B(timedout, ==, B_FALSE); @@ -3281,6 +3220,8 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) */ IMPLY(lwb != NULL, + lwb->lwb_state == LWB_STATE_CLOSED || + lwb->lwb_state == LWB_STATE_READY || lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE); @@ -3635,10 +3576,11 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_state != LWB_STATE_FLUSH_DONE || - lwb->lwb_max_txg > txg) + lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); - zio_free(spa, txg, &lwb->lwb_blk); + if (!BP_IS_HOLE(&lwb->lwb_blk)) + zio_free(spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); /* @@ -3651,18 +3593,6 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) BP_ZERO(&zh->zh_log); } - /* - * Remove fastwrite on any blocks that have been pre-allocated for - * the next commit. This prevents fastwrite counter pollution by - * unused, long-lived LWBs.
- */ - for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) { - if (lwb->lwb_fastwrite && !lwb->lwb_write_zio) { - metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 0; - } - } - mutex_exit(&zilog->zl_lock); } @@ -3854,17 +3784,18 @@ zil_close(zilog_t *zilog) } mutex_enter(&zilog->zl_lock); + txg = zilog->zl_dirty_max_txg; lwb = list_tail(&zilog->zl_lwb_list); - if (lwb == NULL) - txg = zilog->zl_dirty_max_txg; - else - txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); + if (lwb != NULL) { + txg = MAX(txg, lwb->lwb_alloc_txg); + txg = MAX(txg, lwb->lwb_max_txg); + } mutex_exit(&zilog->zl_lock); /* * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends * on the time when the dmu_tx transaction is assigned in - * zil_lwb_write_close(). + * zil_lwb_write_issue(). */ mutex_enter(&zilog->zl_lwb_io_lock); txg = MAX(zilog->zl_lwb_max_issued_txg, txg); @@ -3893,11 +3824,7 @@ zil_close(zilog_t *zilog) lwb = list_remove_head(&zilog->zl_lwb_list); if (lwb != NULL) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - - if (lwb->lwb_fastwrite) - metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); - + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } @@ -4018,7 +3945,7 @@ zil_suspend(const char *osname, void **cookiep) /* * We need to use zil_commit_impl to ensure we wait for all - * LWB_STATE_OPENED and LWB_STATE_ISSUED lwbs to be committed + * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed * to disk before proceeding. If we used zil_commit instead, it * would just call txg_wait_synced(), because zl_suspend is set. * txg_wait_synced() doesn't wait for these lwb's to be diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 3f5e6a08d8..7458b416c2 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3024,11 +3024,6 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - /* - * We didn't allocate this bp, so make sure it doesn't get unmarked. - */ - pio->io_flags &= ~ZIO_FLAG_FASTWRITE; - zio_nowait(zio); return (pio); @@ -3616,7 +3611,6 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0; if (zio->io_flags & ZIO_FLAG_NODATA) flags |= METASLAB_DONT_THROTTLE; if (zio->io_flags & ZIO_FLAG_GANG_CHILD) @@ -3776,7 +3770,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * of, so we just hash the objset ID to pick the allocator to get * some parallelism. 
*/ - int flags = METASLAB_FASTWRITE | METASLAB_ZIL; + int flags = METASLAB_ZIL; int allocator = (uint_t)cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, @@ -4472,8 +4466,8 @@ zio_ready(zio_t *zio) zio_t *pio, *pio_next; zio_link_t *zl = NULL; - if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, - ZIO_WAIT_READY)) { + if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | + ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } @@ -4931,12 +4925,6 @@ zio_done(zio_t *zio) zfs_ereport_free_checksum(zcr); } - if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && - !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) && - !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { - metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); - } - /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index cd4e6f0c75..53dcb4dee4 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -698,7 +698,6 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, int error; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); @@ -717,6 +716,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ + ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's written out * and its checksum is being calculated that no one can change @@ -727,8 +727,8 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, offset = P2ALIGN_TYPED(offset, size, uint64_t); zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); - error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, - DMU_READ_NO_PREFETCH); + error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd, + &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index b68202d849..2c8d5cb0ec 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -36,11 +36,13 @@ tags = ['functional', 'atime'] [tests/functional/block_cloning:Linux] tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', + 'block_cloning_copyfilerange_fallback', 'block_cloning_ficlone', 'block_cloning_ficlonerange', 'block_cloning_ficlonerange_partial', 'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone', 'block_cloning_disabled_ficlonerange', - 'block_cloning_copyfilerange_cross_dataset'] + 'block_cloning_copyfilerange_cross_dataset', + 'block_cloning_copyfilerange_fallback_same_txg'] tags = ['functional', 'block_cloning'] [tests/functional/chattr:Linux] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index c9a2b4179a..e1bbe063ab 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -300,8 +300,12 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_partial': ['SKIP', cfr_reason], + 'block_cloning/block_cloning_copyfilerange_fallback': + ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_cross_dataset': ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_copyfilerange_fallback_same_txg': + ['SKIP', cfr_cross_reason], 
}) diff --git a/tests/zfs-tests/cmd/clonefile.c b/tests/zfs-tests/cmd/clonefile.c index a7e7277ae4..696dc471d8 100644 --- a/tests/zfs-tests/cmd/clonefile.c +++ b/tests/zfs-tests/cmd/clonefile.c @@ -212,7 +212,7 @@ main(int argc, char **argv) int dfd = open(argv[optind+1], O_WRONLY|O_CREAT, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH); - if (sfd < 0) { + if (dfd < 0) { fprintf(stderr, "open: %s: %s\n", argv[optind+1], strerror(errno)); close(sfd); diff --git a/tests/zfs-tests/cmd/readmmap.c b/tests/zfs-tests/cmd/readmmap.c index 704ffd55c8..a5c8079d0e 100644 --- a/tests/zfs-tests/cmd/readmmap.c +++ b/tests/zfs-tests/cmd/readmmap.c @@ -44,6 +44,7 @@ #include #include #include +#include #include int diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 0819cb6b57..66aff5026f 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -441,6 +441,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/block_cloning/cleanup.ksh \ functional/block_cloning/setup.ksh \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ + functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ + functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \ functional/block_cloning/block_cloning_copyfilerange.ksh \ functional/block_cloning/block_cloning_copyfilerange_partial.ksh \ functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh new file mode 100755 index 0000000000..87f99eb5c0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# Copyright (c) 2023, Rob Norris +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="copy_file_range will fall back to copy when cloning not possible." 
+ +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 +log_must sync_pool $TESTPOOL + + +log_note "Copying entire file with copy_file_range" + +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "1 2 3 4" ] + + +log_note "Copying within a block with copy_file_range" + +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 32768 32768 65536 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "2 3 4" ] + + +log_note "Copying across a block with copy_file_range" + +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 327680 327680 131072 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "2" ] + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh new file mode 100755 index 0000000000..3451f887af --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -0,0 +1,66 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# Copyright (c) 2023, Rob Norris +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="copy_file_range will fall back to copy when cloning on same txg" + +log_assert $claim + +typeset timeout=$(get_tunable TXG_TIMEOUT) + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 TXG_TIMEOUT $timeout +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must set_tunable64 TXG_TIMEOUT 5000 + +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 + +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "" ] + +log_pass $claim +
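
A note on the reworked lwb life cycle in the zil.c changes above: an lwb is born NEW, is OPENED to accept itxs, is CLOSED when it fills (at which point its successor is allocated), becomes READY once it has been filled and given its root zio, and is ISSUED only after the previous lwb in the chain hands it a block pointer. The sketch below is a minimal, self-contained C model of that hand-off. It is illustrative only: every name in it is invented for the sketch, and the real transitions in zil.c are made under zl_lock/zl_issuer_lock and drive actual zios rather than printf.

/*
 * Toy model of the reworked lwb pipeline; not ZFS code. States and
 * field names are invented for illustration only.
 */
#include <assert.h>
#include <stdio.h>

typedef enum {
        S_NEW,          /* allocated, no buffer or block pointer yet */
        S_OPENED,       /* accepting itxs */
        S_CLOSED,       /* full; successor allocated, bp still pending */
        S_READY,        /* filled and rooted, waiting for its bp */
        S_ISSUED        /* bp received, write "issued" */
} state_t;

typedef struct lwb {
        const char *name;
        state_t state;
        int has_bp;             /* bp handed over by the predecessor */
        struct lwb *next;       /* next lwb in the chain */
} lwb_t;

/* Model of zil_lwb_write_close(): stop accepting itxs. */
static void
lwb_close(lwb_t *lwb, lwb_t *nlwb)
{
        assert(lwb->state == S_OPENED);
        lwb->state = S_CLOSED;
        lwb->next = nlwb;
}

/*
 * Model of zil_lwb_write_issue(): mark the lwb READY; if its bp already
 * arrived, "issue" it, hand a bp to the successor, and keep walking while
 * successors had already reached READY.
 */
static void
lwb_issue(lwb_t *lwb)
{
        assert(lwb->state == S_CLOSED);
        lwb->state = S_READY;
        while (lwb != NULL && lwb->state == S_READY && lwb->has_bp) {
                lwb->state = S_ISSUED;
                printf("issued %s\n", lwb->name);
                if (lwb->next != NULL)
                        lwb->next->has_bp = 1;  /* next bp allocated here */
                lwb = lwb->next;
        }
}

int
main(void)
{
        lwb_t a = { "a", S_OPENED, 1, NULL };   /* head bp known up front */
        lwb_t b = { "b", S_OPENED, 0, NULL };
        lwb_t c = { "c", S_OPENED, 0, NULL };

        lwb_close(&a, &b);
        lwb_close(&b, &c);

        lwb_issue(&b);  /* no bp yet: parks in READY */
        lwb_issue(&a);  /* issues a, hands b its bp, then issues b too */
        return (0);
}

Running the model prints "issued a" then "issued b" from the single lwb_issue(&a) call: b was parked in READY until a handed over a block pointer, mirroring the "If nlwb was ready when we gave it the block pointer, it is on us to issue it" comment in zil_lwb_write_issue(), while c stays OPENED until some other thread closes and issues it.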