From 9e63631dea553fb81fe10710e626fae26ff5c14f Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 9 Apr 2024 19:14:04 -0400 Subject: [PATCH 001/113] Small fix to prefetch ranges aggregation When, after #16022, adding a new range aggregates more than two existing ranges (which should be very rare, happening only if several streams overlap), we may need to zero not the last range, but some earlier one. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16072 --- module/zfs/dmu_zfetch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index 915d99916d..ed50f1889b 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -418,8 +418,8 @@ dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks) zs->zs_ranges[f].start = zs->zs_ranges[l].start; zs->zs_ranges[f].end = zs->zs_ranges[l].end; } - zs->zs_ranges[ZFETCH_RANGES - 1].start = 0; - zs->zs_ranges[ZFETCH_RANGES - 1].end = 0; + zs->zs_ranges[f].start = 0; + zs->zs_ranges[f].end = 0; } } else if (i < ZFETCH_RANGES) { /* Got no intersecting ranges, insert new one. */ From 997f85b4d3123286a584bbd3aaac3077a8067abb Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 9 Apr 2024 19:23:19 -0400 Subject: [PATCH 002/113] L2ARC: Relax locking during write Previous code held the ARC state sublist lock throughout the whole L2ARC write process, which included a number of allocations and even ZIO issues. Being blocked in any of those places, the code could also block ARC eviction, which could cause OOM activation or even deadlock if the system is low on memory or memory is too fragmented. Fix it by dropping the lock as soon as we see a block eligible for L2ARC writing and picking it up later using the earlier inserted marker. While there, also reduce the scope of the hash lock, moving ZIO allocation and other operations not requiring header access out of it. 
All operations requiring header access move under the hash lock, since the L2_WRITING flag does not prevent header eviction, only transition to arc_l2c_only state with L1 header. To be able to manipulate the sublist lock and marker as needed, add a few more multilist functions and modify one. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16040 --- include/sys/multilist.h | 5 +- module/zfs/arc.c | 189 +++++++++++++++++++++------------------- module/zfs/dbuf.c | 2 +- module/zfs/dmu_objset.c | 10 +-- module/zfs/metaslab.c | 8 +- module/zfs/multilist.c | 26 +++++- 6 files changed, 136 insertions(+), 104 deletions(-) diff --git a/include/sys/multilist.h b/include/sys/multilist.h index 26f37c37ab..e7de86f237 100644 --- a/include/sys/multilist.h +++ b/include/sys/multilist.h @@ -82,12 +82,15 @@ int multilist_is_empty(multilist_t *); unsigned int multilist_get_num_sublists(multilist_t *); unsigned int multilist_get_random_index(multilist_t *); -multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); +void multilist_sublist_lock(multilist_sublist_t *); +multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int); multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *); void multilist_sublist_unlock(multilist_sublist_t *); void multilist_sublist_insert_head(multilist_sublist_t *, void *); void multilist_sublist_insert_tail(multilist_sublist_t *, void *); +void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *); +void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *); void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); void multilist_sublist_remove(multilist_sublist_t *, void *); int multilist_sublist_is_empty(multilist_sublist_t *); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index b1bcac6c44..16c95db10f 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -3872,7 +3872,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, 
arc_buf_hdr_t *marker, ASSERT3P(marker, !=, NULL); - mls = multilist_sublist_lock(ml, idx); + mls = multilist_sublist_lock_idx(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { @@ -3984,6 +3984,26 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, return (bytes_evicted); } +static arc_buf_hdr_t * +arc_state_alloc_marker(void) +{ + arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); + + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_evict_state_impl(). + */ + marker->b_spa = 0; + + return (marker); +} + +static void +arc_state_free_marker(arc_buf_hdr_t *marker) +{ + kmem_cache_free(hdr_full_cache, marker); +} + /* * Allocate an array of buffer headers used as placeholders during arc state * eviction. @@ -3994,16 +4014,8 @@ arc_state_alloc_markers(int count) arc_buf_hdr_t **markers; markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP); - for (int i = 0; i < count; i++) { - markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); - - /* - * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_evict_state_impl(). 
- */ - markers[i]->b_spa = 0; - - } + for (int i = 0; i < count; i++) + markers[i] = arc_state_alloc_marker(); return (markers); } @@ -4011,7 +4023,7 @@ static void arc_state_free_markers(arc_buf_hdr_t **markers, int count) { for (int i = 0; i < count; i++) - kmem_cache_free(hdr_full_cache, markers[i]); + arc_state_free_marker(markers[i]); kmem_free(markers, sizeof (*markers) * count); } @@ -4055,7 +4067,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } @@ -4120,7 +4132,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, } for (int i = 0; i < num_sublists; i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); } @@ -8633,7 +8645,7 @@ l2arc_sublist_lock(int list_num) * sublists being selected. 
*/ idx = multilist_get_random_index(ml); - return (multilist_sublist_lock(ml, idx)); + return (multilist_sublist_lock_idx(ml, idx)); } /* @@ -9046,9 +9058,9 @@ l2arc_blk_fetch_done(zio_t *zio) static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; + arc_buf_hdr_t *hdr, *head, *marker; + uint64_t write_asize, write_psize, headroom; + boolean_t full, from_head = !arc_warm; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); @@ -9057,10 +9069,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; - write_lsize = write_asize = write_psize = 0; + write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); + marker = arc_state_alloc_marker(); /* * Copy buffers for L2ARC writing. @@ -9075,40 +9088,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) continue; } - multilist_sublist_t *mls = l2arc_sublist_lock(pass); uint64_t passed_sz = 0; - - VERIFY3P(mls, !=, NULL); - - /* - * L2ARC fast warmup. - * - * Until the ARC is warm and starts to evict, read from the - * head of the ARC lists rather than the tail. - */ - if (arc_warm == B_FALSE) - hdr = multilist_sublist_head(mls); - else - hdr = multilist_sublist_tail(mls); - headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; - for (; hdr; hdr = hdr_prev) { + /* + * Until the ARC is warm and starts to evict, read from the + * head of the ARC lists rather than the tail. 
+ */ + multilist_sublist_t *mls = l2arc_sublist_lock(pass); + ASSERT3P(mls, !=, NULL); + if (from_head) + hdr = multilist_sublist_head(mls); + else + hdr = multilist_sublist_tail(mls); + + while (hdr != NULL) { kmutex_t *hash_lock; abd_t *to_write = NULL; - if (arc_warm == B_FALSE) - hdr_prev = multilist_sublist_next(mls, hdr); - else - hdr_prev = multilist_sublist_prev(mls, hdr); - hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { - /* - * Skip this buffer rather than waiting. - */ +skip: + /* Skip this buffer rather than waiting. */ + if (from_head) + hdr = multilist_sublist_next(mls, hdr); + else + hdr = multilist_sublist_prev(mls, hdr); continue; } @@ -9123,11 +9130,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); - continue; + goto skip; } ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || @@ -9149,12 +9155,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) } /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. + * We should not sleep with sublist lock held or it + * may block ARC eviction. Insert a marker to save + * the position and drop the lock. 
*/ - arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); + if (from_head) { + multilist_sublist_insert_after(mls, hdr, + marker); + } else { + multilist_sublist_insert_before(mls, hdr, + marker); + } + multilist_sublist_unlock(mls); /* * If this header has b_rabd, we can use this since it @@ -9185,32 +9197,45 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, - ARC_FLAG_L2_WRITING); + ARC_FLAG_L2CACHE); mutex_exit(hash_lock); - continue; + goto next; } l2arc_free_abd_on_write(to_write, asize, type); } + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_hits = 0; + hdr->b_l2hdr.b_arcs_state = + hdr->b_l1hdr.b_state->arcs_state; + mutex_enter(&dev->l2ad_mtx); if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ - mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); - mutex_exit(&dev->l2ad_mtx); + } + list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); + arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | + ARC_FLAG_L2_WRITING); + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); + l2arc_hdr_arcstats_increment(hdr); + + boolean_t commit = l2arc_log_blk_insert(dev, hdr); + mutex_exit(hash_lock); + + if (pio == NULL) { cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; - /* - * Create a list to save allocated abd buffers - * for l2arc_log_blk_commit(). 
- */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); @@ -9218,54 +9243,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ZIO_FLAG_CANFAIL); } - hdr->b_l2hdr.b_dev = dev; - hdr->b_l2hdr.b_hits = 0; - - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - hdr->b_l2hdr.b_arcs_state = - hdr->b_l1hdr.b_state->arcs_state; - arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); - - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, hdr); - mutex_exit(&dev->l2ad_mtx); - - (void) zfs_refcount_add_many(&dev->l2ad_alloc, - arc_hdr_size(hdr), hdr); - wzio = zio_write_phys(pio, dev->l2ad_vdev, - hdr->b_l2hdr.b_daddr, asize, to_write, + dev->l2ad_hand, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); - write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); + zio_nowait(wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; - l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); - mutex_exit(hash_lock); - - /* - * Append buf info to current log and commit if full. - * arcstat_l2_{size,asize} kstats are updated - * internally. - */ - if (l2arc_log_blk_insert(dev, hdr)) { - /* - * l2ad_hand will be adjusted in - * l2arc_log_blk_commit(). - */ + if (commit) { + /* l2ad_hand will be adjusted inside. */ write_asize += l2arc_log_blk_commit(dev, pio, cb); } - zio_nowait(wzio); +next: + multilist_sublist_lock(mls); + if (from_head) + hdr = multilist_sublist_next(mls, marker); + else + hdr = multilist_sublist_prev(mls, marker); + multilist_sublist_remove(mls, marker); } multilist_sublist_unlock(mls); @@ -9274,9 +9279,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } + arc_state_free_marker(marker); + /* No buffers selected for writing? 
*/ if (pio == NULL) { - ASSERT0(write_lsize); + ASSERT0(write_psize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); @@ -10604,7 +10611,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); - L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); + L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index d9fc6cf6af..5f3643f573 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -769,7 +769,7 @@ static void dbuf_evict_one(void) { int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache); - multilist_sublist_t *mls = multilist_sublist_lock( + multilist_sublist_t *mls = multilist_sublist_lock_idx( &dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index f098e1daa4..2ba26f68e3 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1665,7 +1665,7 @@ sync_dnodes_task(void *arg) objset_t *os = soa->soa_os; multilist_sublist_t *ms = - multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); + multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx); dmu_objset_sync_dnodes(ms, soa->soa_tx); @@ -2076,8 +2076,8 @@ userquota_updates_task(void *arg) dnode_t *dn; userquota_cache_t cache = { { 0 } }; - multilist_sublist_t *list = - multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_t *list = multilist_sublist_lock_idx( + &os->os_synced_dnodes, uua->uua_sublist_idx); ASSERT(multilist_sublist_head(list) == NULL || dmu_objset_userused_enabled(os)); @@ -2159,8 +2159,8 @@ dnode_rele_task(void *arg) userquota_updates_arg_t *uua = arg; objset_t *os = 
uua->uua_os; - multilist_sublist_t *list = - multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_t *list = multilist_sublist_lock_idx( + &os->os_synced_dnodes, uua->uua_sublist_idx); dnode_t *dn; while ((dn = multilist_sublist_head(list)) != NULL) { diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index c4aa98ced4..9e762357b7 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -639,7 +639,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; for (int i = 0; i < multilist_get_num_sublists(ml); i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL) { @@ -656,7 +656,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) i--; break; } - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); if (txg > @@ -2232,12 +2232,12 @@ metaslab_potentially_evict(metaslab_class_t *mc) unsigned int idx = multilist_get_random_index( &mc->mc_metaslab_txg_list); multilist_sublist_t *mls = - multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx); + multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < inuse * size) { - VERIFY3P(mls, ==, multilist_sublist_lock( + VERIFY3P(mls, ==, multilist_sublist_lock_idx( &mc->mc_metaslab_txg_list, idx)); ASSERT3U(idx, ==, metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c index b1cdf1c5c5..3d3ef86e68 100644 --- a/module/zfs/multilist.c +++ b/module/zfs/multilist.c @@ -277,9 +277,15 @@ multilist_get_random_index(multilist_t *ml) return 
(random_in_range(ml->ml_num_sublists)); } +void +multilist_sublist_lock(multilist_sublist_t *mls) +{ + mutex_enter(&mls->mls_lock); +} + /* Lock and return the sublist specified at the given index */ multilist_sublist_t * -multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) +multilist_sublist_lock_idx(multilist_t *ml, unsigned int sublist_idx) { multilist_sublist_t *mls; @@ -294,7 +300,7 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) multilist_sublist_t * multilist_sublist_lock_obj(multilist_t *ml, void *obj) { - return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj))); + return (multilist_sublist_lock_idx(ml, ml->ml_index_func(ml, obj))); } void @@ -327,6 +333,22 @@ multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) list_insert_tail(&mls->mls_list, obj); } +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_after(multilist_sublist_t *mls, void *prev, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_after(&mls->mls_list, prev, obj); +} + +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_before(multilist_sublist_t *mls, void *next, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_before(&mls->mls_list, next, obj); +} + /* * Move the object one element forward in the list. * From d98973dbdd5a85b6c8a8556d5bd5c9903e2d2ee6 Mon Sep 17 00:00:00 2001 From: Benda Xu Date: Wed, 10 Apr 2024 07:34:58 +0800 Subject: [PATCH 003/113] config/Substfiles.am: restrict to the dedicated list. We recover the scope of $(SUBSTFILES) to explicitly control what files are being generated from the corresponding .in. 
Reviewed-by: Brian Behlendorf Signed-off-by: Benda Xu Closes #15980 --- config/Substfiles.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/Substfiles.am b/config/Substfiles.am index 18422bf643..2459637abe 100644 --- a/config/Substfiles.am +++ b/config/Substfiles.am @@ -44,4 +44,4 @@ SUBSTFILES = CLEANFILES += $(SUBSTFILES) dist_noinst_DATA += $(SUBSTFILES:=.in) -$(call SUBST,%,) +$(SUBSTFILES): $(call SUBST,%,) From e5e2a5a3b872e618af585f1a8cec4782c6f2cfe1 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Wed, 10 Apr 2024 16:30:25 -0400 Subject: [PATCH 004/113] Add custom debug printing for your asserts Being able to print custom debug information on assert trip seems useful. Reviewed-by: Brian Behlendorf Reviewed-by: Paul Dagnelie Signed-off-by: Rich Ercolani Closes #15792 --- include/os/freebsd/spl/sys/debug.h | 149 +++++++++++++++++++++++++--- include/os/linux/spl/sys/debug.h | 152 +++++++++++++++++++++++++---- lib/libspl/include/assert.h | 97 ++++++++++++++++++ module/zfs/arc.c | 5 +- 4 files changed, 372 insertions(+), 31 deletions(-) diff --git a/include/os/freebsd/spl/sys/debug.h b/include/os/freebsd/spl/sys/debug.h index 785fcf62dd..f041dde34f 100644 --- a/include/os/freebsd/spl/sys/debug.h +++ b/include/os/freebsd/spl/sys/debug.h @@ -56,11 +56,33 @@ /* * Common DEBUG functionality. */ +#ifdef __FreeBSD__ +#include +#endif + +#ifndef __printflike +#define __printflike(a, b) __printf(a, b) +#endif + +#ifndef __maybe_unused +#define __maybe_unused __attribute__((unused)) +#endif + +/* + * Without this, we see warnings from objtool during normal Linux builds when + * the kernel is built with CONFIG_STACK_VALIDATION=y: + * + * warning: objtool: tsd_create() falls through to next function __list_add() + * warning: objtool: .text: unexpected end of section + * + * Until the toolchain stops doing this, we must only define this attribute on + * spl_panic() when doing static analysis. 
+ */ #if defined(__COVERITY__) || defined(__clang_analyzer__) __attribute__((__noreturn__)) #endif extern void spl_panic(const char *file, const char *func, int line, - const char *fmt, ...) __attribute__((__noreturn__)); + const char *fmt, ...); extern void spl_dumpstack(void); static inline int @@ -73,8 +95,10 @@ spl_assert(const char *buf, const char *file, const char *func, int line) #ifndef expect #define expect(expr, value) (__builtin_expect((expr), (value))) #endif +#ifndef __linux__ #define likely(expr) expect((expr) != 0, 1) #define unlikely(expr) expect((expr) != 0, 0) +#endif #define PANIC(fmt, a...) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) @@ -84,6 +108,12 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_assert("VERIFY(" #cond ") failed\n", \ __FILE__, __FUNCTION__, __LINE__)) +#define VERIFYF(cond, str, ...) do { \ + if (unlikely(!cond)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY(" #cond ") failed " str "\n", __VA_ARGS__);\ + } while (0) + #define VERIFY3B(LEFT, OP, RIGHT) do { \ const boolean_t _verify3_left = (boolean_t)(LEFT); \ const boolean_t _verify3_right = (boolean_t)(RIGHT); \ @@ -123,7 +153,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%p " #OP " %p)\n", \ + "failed (%px " #OP " %px)\n", \ (void *)_verify3_left, \ (void *)_verify3_right); \ } while (0) @@ -142,10 +172,98 @@ spl_assert(const char *buf, const char *file, const char *func, int line) if (unlikely(!(0 == _verify0_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY0P(" #RIGHT ") " \ - "failed (NULL == %p)\n", \ + "failed (NULL == %px)\n", \ (void *)_verify0_right); \ } while (0) +/* + * Note that you should not put any operations you want to always happen + * in the print section for ASSERTs unless you only want them to run 
on + * debug builds! + * e.g. ASSERT3UF(2, <, 3, "%s", foo(x)), foo(x) won't run on non-debug + * builds. + */ + +#define VERIFY3BF(LEFT, OP, RIGHT, STR, ...) do { \ + const boolean_t _verify3_left = (boolean_t)(LEFT); \ + const boolean_t _verify3_right = (boolean_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%d " #OP " %d) " STR "\n", \ + (boolean_t)(_verify3_left), \ + (boolean_t)(_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY3SF(LEFT, OP, RIGHT, STR, ...) do { \ + const int64_t _verify3_left = (int64_t)(LEFT); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%lld " #OP " %lld) " STR "\n", \ + (long long)(_verify3_left), \ + (long long)(_verify3_right), \ + __VA_ARGS); \ + } while (0) + +#define VERIFY3UF(LEFT, OP, RIGHT, STR, ...) do { \ + const uint64_t _verify3_left = (uint64_t)(LEFT); \ + const uint64_t _verify3_right = (uint64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%llu " #OP " %llu) " STR "\n", \ + (unsigned long long)(_verify3_left), \ + (unsigned long long)(_verify3_right), \ + __VA_ARGS); \ + } while (0) + +#define VERIFY3PF(LEFT, OP, RIGHT, STR, ...) do { \ + const uintptr_t _verify3_left = (uintptr_t)(LEFT); \ + const uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%px " #OP " %px) " STR "\n", \ + (void *) (_verify3_left), \ + (void *) (_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY0PF(RIGHT, STR, ...) 
do { \ + const uintptr_t _verify3_left = (uintptr_t)(0); \ + const uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ + if (unlikely(!(_verify3_left == _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY0(0 == " #RIGHT ") " \ + "failed (0 == %px) " STR "\n", \ + (long long) (_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY0F(RIGHT, STR, ...) do { \ + const int64_t _verify3_left = (int64_t)(0); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if (unlikely(!(_verify3_left == _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY0(0 == " #RIGHT ") " \ + "failed (0 == %lld) " STR "\n", \ + (long long) (_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY_IMPLY(A, B) \ + ((void)(likely((!(A)) || (B)) || \ + spl_assert("(" #A ") implies (" #B ")", \ + __FILE__, __FUNCTION__, __LINE__))) + +#define VERIFY_EQUIV(A, B) \ + ((void)(likely(!!(A) == !!(B)) || \ + spl_assert("(" #A ") is equivalent to (" #B ")", \ + __FILE__, __FUNCTION__, __LINE__))) + /* * Debugging disabled (--disable-debug) */ @@ -162,6 +280,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line) ((void) sizeof ((uintptr_t)(x)), (void) sizeof ((uintptr_t)(z))) #define ASSERT0(x) ((void) sizeof ((uintptr_t)(x))) #define ASSERT0P(x) ((void) sizeof ((uintptr_t)(x))) +#define ASSERT3BF(x, y, z, str, ...) ASSERT3B(x, y, z) +#define ASSERT3SF(x, y, z, str, ...) ASSERT3S(x, y, z) +#define ASSERT3UF(x, y, z, str, ...) ASSERT3U(x, y, z) +#define ASSERT3PF(x, y, z, str, ...) ASSERT3P(x, y, z) +#define ASSERT0PF(x, str, ...) ASSERT0P(x) +#define ASSERT0F(x, str, ...) ASSERT0(x) +#define ASSERTF(x, str, ...) 
ASSERT(x) #define IMPLY(A, B) \ ((void) sizeof ((uintptr_t)(A)), (void) sizeof ((uintptr_t)(B))) #define EQUIV(A, B) \ @@ -178,16 +303,16 @@ spl_assert(const char *buf, const char *file, const char *func, int line) #define ASSERT3P VERIFY3P #define ASSERT0 VERIFY0 #define ASSERT0P VERIFY0P +#define ASSERT3BF VERIFY3BF +#define ASSERT3SF VERIFY3SF +#define ASSERT3UF VERIFY3UF +#define ASSERT3PF VERIFY3PF +#define ASSERT0PF VERIFY0PF +#define ASSERT0F VERIFY0F +#define ASSERTF VERIFYF #define ASSERT VERIFY -#define IMPLY(A, B) \ - ((void)(likely((!(A)) || (B)) || \ - spl_assert("(" #A ") implies (" #B ")", \ - __FILE__, __FUNCTION__, __LINE__))) -#define EQUIV(A, B) \ - ((void)(likely(!!(A) == !!(B)) || \ - spl_assert("(" #A ") is equivalent to (" #B ")", \ - __FILE__, __FUNCTION__, __LINE__))) - +#define IMPLY VERIFY_IMPLY +#define EQUIV VERIFY_EQUIV #endif /* NDEBUG */ diff --git a/include/os/linux/spl/sys/debug.h b/include/os/linux/spl/sys/debug.h index 288193ad21..f041dde34f 100644 --- a/include/os/linux/spl/sys/debug.h +++ b/include/os/linux/spl/sys/debug.h @@ -1,24 +1,29 @@ /* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. * - * This file is part of the SPL, Solaris Porting Layer. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
* - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . + * $FreeBSD$ */ /* @@ -47,10 +52,17 @@ #ifndef _SPL_DEBUG_H #define _SPL_DEBUG_H + /* * Common DEBUG functionality. 
*/ +#ifdef __FreeBSD__ +#include +#endif + +#ifndef __printflike #define __printflike(a, b) __printf(a, b) +#endif #ifndef __maybe_unused #define __maybe_unused __attribute__((unused)) @@ -80,6 +92,14 @@ spl_assert(const char *buf, const char *file, const char *func, int line) return (0); } +#ifndef expect +#define expect(expr, value) (__builtin_expect((expr), (value))) +#endif +#ifndef __linux__ +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) +#endif + #define PANIC(fmt, a...) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) @@ -88,6 +108,12 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_assert("VERIFY(" #cond ") failed\n", \ __FILE__, __FUNCTION__, __LINE__)) +#define VERIFYF(cond, str, ...) do { \ + if (unlikely(!cond)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY(" #cond ") failed " str "\n", __VA_ARGS__);\ + } while (0) + #define VERIFY3B(LEFT, OP, RIGHT) do { \ const boolean_t _verify3_left = (boolean_t)(LEFT); \ const boolean_t _verify3_right = (boolean_t)(RIGHT); \ @@ -150,6 +176,84 @@ spl_assert(const char *buf, const char *file, const char *func, int line) (void *)_verify0_right); \ } while (0) +/* + * Note that you should not put any operations you want to always happen + * in the print section for ASSERTs unless you only want them to run on + * debug builds! + * e.g. ASSERT3UF(2, <, 3, "%s", foo(x)), foo(x) won't run on non-debug + * builds. + */ + +#define VERIFY3BF(LEFT, OP, RIGHT, STR, ...) do { \ + const boolean_t _verify3_left = (boolean_t)(LEFT); \ + const boolean_t _verify3_right = (boolean_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%d " #OP " %d) " STR "\n", \ + (boolean_t)(_verify3_left), \ + (boolean_t)(_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY3SF(LEFT, OP, RIGHT, STR, ...) 
do { \ + const int64_t _verify3_left = (int64_t)(LEFT); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%lld " #OP " %lld) " STR "\n", \ + (long long)(_verify3_left), \ + (long long)(_verify3_right), \ + __VA_ARGS); \ + } while (0) + +#define VERIFY3UF(LEFT, OP, RIGHT, STR, ...) do { \ + const uint64_t _verify3_left = (uint64_t)(LEFT); \ + const uint64_t _verify3_right = (uint64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%llu " #OP " %llu) " STR "\n", \ + (unsigned long long)(_verify3_left), \ + (unsigned long long)(_verify3_right), \ + __VA_ARGS); \ + } while (0) + +#define VERIFY3PF(LEFT, OP, RIGHT, STR, ...) do { \ + const uintptr_t _verify3_left = (uintptr_t)(LEFT); \ + const uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%px " #OP " %px) " STR "\n", \ + (void *) (_verify3_left), \ + (void *) (_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY0PF(RIGHT, STR, ...) do { \ + const uintptr_t _verify3_left = (uintptr_t)(0); \ + const uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ + if (unlikely(!(_verify3_left == _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY0(0 == " #RIGHT ") " \ + "failed (0 == %px) " STR "\n", \ + (long long) (_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY0F(RIGHT, STR, ...) 
do { \ + const int64_t _verify3_left = (int64_t)(0); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if (unlikely(!(_verify3_left == _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY0(0 == " #RIGHT ") " \ + "failed (0 == %lld) " STR "\n", \ + (long long) (_verify3_right), \ + __VA_ARGS__); \ + } while (0) + #define VERIFY_IMPLY(A, B) \ ((void)(likely((!(A)) || (B)) || \ spl_assert("(" #A ") implies (" #B ")", \ @@ -176,6 +280,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line) ((void) sizeof ((uintptr_t)(x)), (void) sizeof ((uintptr_t)(z))) #define ASSERT0(x) ((void) sizeof ((uintptr_t)(x))) #define ASSERT0P(x) ((void) sizeof ((uintptr_t)(x))) +#define ASSERT3BF(x, y, z, str, ...) ASSERT3B(x, y, z) +#define ASSERT3SF(x, y, z, str, ...) ASSERT3S(x, y, z) +#define ASSERT3UF(x, y, z, str, ...) ASSERT3U(x, y, z) +#define ASSERT3PF(x, y, z, str, ...) ASSERT3P(x, y, z) +#define ASSERT0PF(x, str, ...) ASSERT0P(x) +#define ASSERT0F(x, str, ...) ASSERT0(x) +#define ASSERTF(x, str, ...) 
ASSERT(x) #define IMPLY(A, B) \ ((void) sizeof ((uintptr_t)(A)), (void) sizeof ((uintptr_t)(B))) #define EQUIV(A, B) \ @@ -192,6 +303,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line) #define ASSERT3P VERIFY3P #define ASSERT0 VERIFY0 #define ASSERT0P VERIFY0P +#define ASSERT3BF VERIFY3BF +#define ASSERT3SF VERIFY3SF +#define ASSERT3UF VERIFY3UF +#define ASSERT3PF VERIFY3PF +#define ASSERT0PF VERIFY0PF +#define ASSERT0F VERIFY0F +#define ASSERTF VERIFYF #define ASSERT VERIFY #define IMPLY VERIFY_IMPLY #define EQUIV VERIFY_EQUIV diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h index 57f5719c1a..155bbab302 100644 --- a/lib/libspl/include/assert.h +++ b/lib/libspl/include/assert.h @@ -70,6 +70,15 @@ libspl_assert(const char *buf, const char *file, const char *func, int line) #define VERIFY(cond) \ (void) ((!(cond)) && \ libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__)) + +#define VERIFYF(cond, STR, ...) \ +do { \ + if (!(cond)) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ + "%s " STR, #cond, \ + __VA_ARGS__); \ +} while (0) + #define verify(cond) \ (void) ((!(cond)) && \ libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__)) @@ -132,6 +141,79 @@ do { \ (void *)__left); \ } while (0) +/* + * This is just here because cstyle gets upset about #LEFT + * on a newline. + */ + +/* BEGIN CSTYLED */ +#define VERIFY3BF(LEFT, OP, RIGHT, STR, ...) \ +do { \ + const boolean_t __left = (boolean_t)(LEFT); \ + const boolean_t __right = (boolean_t)(RIGHT); \ + if (!(__left OP __right)) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ + "%s %s %s (0x%llx %s 0x%llx) " STR, \ + #LEFT, #OP, #RIGHT, \ + (u_longlong_t)__left, #OP, (u_longlong_t)__right, \ + __VA_ARGS__); \ +} while (0) + +#define VERIFY3SF(LEFT, OP, RIGHT, STR, ...) 
\ +do { \ + const int64_t __left = (int64_t)(LEFT); \ + const int64_t __right = (int64_t)(RIGHT); \ + if (!(__left OP __right)) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ + "%s %s %s (0x%llx %s 0x%llx) " STR, \ + #LEFT, #OP, #RIGHT, \ + (u_longlong_t)__left, #OP, (u_longlong_t)__right, \ + __VA_ARGS__); \ +} while (0) + +#define VERIFY3UF(LEFT, OP, RIGHT, STR, ...) \ +do { \ + const uint64_t __left = (uint64_t)(LEFT); \ + const uint64_t __right = (uint64_t)(RIGHT); \ + if (!(__left OP __right)) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ + "%s %s %s (0x%llx %s 0x%llx) " STR, \ + #LEFT, #OP, #RIGHT, \ + (u_longlong_t)__left, #OP, (u_longlong_t)__right, \ + __VA_ARGS__); \ +} while (0) + +#define VERIFY3PF(LEFT, OP, RIGHT, STR, ...) \ +do { \ + const uintptr_t __left = (uintptr_t)(LEFT); \ + const uintptr_t __right = (uintptr_t)(RIGHT); \ + if (!(__left OP __right)) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ + "%s %s %s (0x%llx %s 0x%llx) " STR, \ + #LEFT, #OP, #RIGHT, \ + (u_longlong_t)__left, #OP, (u_longlong_t)__right, \ + __VA_ARGS__); \ +} while (0) +/* END CSTYLED */ + +#define VERIFY0F(LEFT, STR, ...) \ +do { \ + const uint64_t __left = (uint64_t)(LEFT); \ + if (!(__left == 0)) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ + "%s == 0 (0x%llx == 0) " STR, #LEFT, \ + (u_longlong_t)__left, __VA_ARGS__); \ +} while (0) + +#define VERIFY0PF(LEFT, STR, ...) \ +do { \ + const uintptr_t __left = (uintptr_t)(LEFT); \ + if (!(__left == 0)) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ + "%s == 0 (%p == 0) " STR, #LEFT, \ + (u_longlong_t)__left, __VA_ARGS__); \ +} while (0) + #ifdef assert #undef assert #endif @@ -147,7 +229,15 @@ do { \ ((void) sizeof ((uintptr_t)(x)), (void) sizeof ((uintptr_t)(z))) #define ASSERT0(x) ((void) sizeof ((uintptr_t)(x))) #define ASSERT0P(x) ((void) sizeof ((uintptr_t)(x))) +#define ASSERT3BF(x, y, z, str, ...) ASSERT3B(x, y, z) +#define ASSERT3SF(x, y, z, str, ...) 
ASSERT3S(x, y, z) +#define ASSERT3UF(x, y, z, str, ...) ASSERT3U(x, y, z) +#define ASSERT3PF(x, y, z, str, ...) ASSERT3P(x, y, z) +#define ASSERT0P(x) ((void) sizeof ((uintptr_t)(x))) +#define ASSERT0PF(x, str, ...) ASSERT0P(x) +#define ASSERT0F(x, str, ...) ASSERT0(x) #define ASSERT(x) ((void) sizeof ((uintptr_t)(x))) +#define ASSERTF(x, str, ...) ASSERT(x) #define assert(x) ((void) sizeof ((uintptr_t)(x))) #define IMPLY(A, B) \ ((void) sizeof ((uintptr_t)(A)), (void) sizeof ((uintptr_t)(B))) @@ -160,7 +250,14 @@ do { \ #define ASSERT3P VERIFY3P #define ASSERT0 VERIFY0 #define ASSERT0P VERIFY0P +#define ASSERT3BF VERIFY3BF +#define ASSERT3SF VERIFY3SF +#define ASSERT3UF VERIFY3UF +#define ASSERT3PF VERIFY3PF +#define ASSERT0PF VERIFY0PF +#define ASSERT0F VERIFY0F #define ASSERT VERIFY +#define ASSERTF VERIFYF #define assert VERIFY #define IMPLY(A, B) \ ((void)(((!(A)) || (B)) || \ diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 16c95db10f..6954051b1d 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1960,7 +1960,7 @@ arc_buf_untransform_in_place(arc_buf_t *buf) ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + ASSERT3PF(hdr->b_l1hdr.b_pabd, !=, NULL, "hdr %px buf %px", hdr, buf); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); @@ -2083,7 +2083,8 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, * allocate a new data buffer for the buf. 
 */ if (ARC_BUF_SHARED(buf)) { - ASSERT(ARC_BUF_COMPRESSED(buf)); + ASSERTF(ARC_BUF_COMPRESSED(buf), + "buf %p was uncompressed", buf); /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; From e5ddecd1a7e33bc341e7b5e8dd25d2fe478de8f2 Mon Sep 17 00:00:00 2001 From: Jason Lee Date: Wed, 10 Apr 2024 16:01:39 -0600 Subject: [PATCH 005/113] return NULL at end of send_progress_thread Reviewed-by: Rob Norris Reviewed-by: Brian Behlendorf Signed-off-by: Jason Lee Closes #16074 --- lib/libzfs/libzfs_sendrecv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index d7b90ccb1c..526f57ea40 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -1053,6 +1053,7 @@ send_progress_thread(void *arg) } } pthread_cleanup_pop(B_TRUE); + return (NULL); } static boolean_t From 44f337be30e1502b32c8d381344044f15dd34674 Mon Sep 17 00:00:00 2001 From: Andy Fiddaman Date: Thu, 11 Apr 2024 22:38:22 +0100 Subject: [PATCH 006/113] Illumos#16463 zfs_ioc_recv leaks nvlist In https://www.illumos.org/issues/16463 it was observed that an nvlist was being leaked in zfs_ioc_recv() due to a missing call to nvlist_free for "hidden_args". For OpenZFS the same issue exists in zfs_ioc_recv_new() and is addressed by this PR. This change also properly frees nvlists in the unlikely event that a call to get_nvlist() fails. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Igor Kozhukhov Signed-off-by: Andy Fiddaman Closes #16077 --- module/zfs/zfs_ioctl.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index dca15f4b82..2ac1e34dcc 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -40,6 +40,7 @@ * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. * Copyright (c) 2019, 2021, Klara Inc.
* Copyright (c) 2019, Allan Jude + * Copyright 2024 Oxide Computer Company */ /* @@ -5345,8 +5346,9 @@ zfs_ioc_recv(zfs_cmd_t *zc) if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || - strchr(zc->zc_value, '%')) + strchr(zc->zc_value, '%') != NULL) { return (SET_ERROR(EINVAL)); + } (void) strlcpy(tofs, zc->zc_value, sizeof (tofs)); tosnap = strchr(tofs, '@'); @@ -5354,13 +5356,15 @@ zfs_ioc_recv(zfs_cmd_t *zc) if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &recvdprops)) != 0) - return (error); + zc->zc_iflags, &recvdprops)) != 0) { + goto out; + } if (zc->zc_nvlist_conf != 0 && (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - zc->zc_iflags, &localprops)) != 0) - return (error); + zc->zc_iflags, &localprops)) != 0) { + goto out; + } if (zc->zc_string[0]) origin = zc->zc_string; @@ -5372,8 +5376,6 @@ zfs_ioc_recv(zfs_cmd_t *zc) error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, NULL, zc->zc_guid, B_FALSE, B_FALSE, zc->zc_cookie, &begin_record, &zc->zc_cookie, &zc->zc_obj, &errors); - nvlist_free(recvdprops); - nvlist_free(localprops); /* * Now that all props, initial and delayed, are set, report the prop @@ -5389,7 +5391,10 @@ zfs_ioc_recv(zfs_cmd_t *zc) error = SET_ERROR(EINVAL); } +out: nvlist_free(errors); + nvlist_free(recvdprops); + nvlist_free(localprops); return (error); } @@ -5456,8 +5461,9 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) if (dataset_namecheck(snapname, NULL, NULL) != 0 || strchr(snapname, '@') == NULL || - strchr(snapname, '%')) + strchr(snapname, '%') != NULL) { return (SET_ERROR(EINVAL)); + } (void) strlcpy(tofs, snapname, sizeof (tofs)); tosnap = strchr(tofs, '@'); @@ -5481,15 +5487,15 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) /* we still use "props" here for backwards compatibility */ error = nvlist_lookup_nvlist(innvl, "props", 
&recvprops); if (error && error != ENOENT) - return (error); + goto out; error = nvlist_lookup_nvlist(innvl, "localprops", &localprops); if (error && error != ENOENT) - return (error); + goto out; error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); if (error && error != ENOENT) - return (error); + goto out; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, hidden_args, force, heal, resumable, input_fd, begin_record, @@ -5499,9 +5505,11 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) fnvlist_add_uint64(outnvl, "error_flags", errflags); fnvlist_add_nvlist(outnvl, "errors", errors); +out: nvlist_free(errors); nvlist_free(recvprops); nvlist_free(localprops); + nvlist_free(hidden_args); return (error); } From bc27c494049e5282f90b103ee45d0fe12310aac4 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 10 Apr 2024 11:19:50 +1000 Subject: [PATCH 007/113] tests: add test for vdev_disk page alignment check This provides a test driver and a set of test vectors for the page alignment check callback function vdev_disk_check_pages_cb(). Because there's no good facility for exposing this function to a userspace test right now, for now I'm just duplicating the function and adding commentary to remind people to keep them in sync. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16076 --- module/os/linux/zfs/vdev_disk.c | 6 + tests/runfiles/common.run | 6 + tests/runfiles/sanity.run | 6 + tests/zfs-tests/Makefile.am | 3 + .../tests/functional/vdev_disk/.gitignore | 1 + .../functional/vdev_disk/page_alignment.c | 413 ++++++++++++++++++ 6 files changed, 435 insertions(+) create mode 100644 tests/zfs-tests/tests/functional/vdev_disk/.gitignore create mode 100644 tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index a560bca918..77773c4f2b 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -853,6 +853,11 @@ BIO_END_IO_PROTO(vbio_completion, bio, error) * pages) but we still have to ensure the data portion is correctly sized and * aligned to the logical block size, to ensure that if the kernel wants to * split the BIO, the two halves will still be properly aligned. + * + * NOTE: if you change this function, change the copy in + * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test + * data there to validate the change you're making. 
+ * */ typedef struct { uint_t bmask; @@ -863,6 +868,7 @@ typedef struct { static int vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) { + (void) page; vdev_disk_check_pages_t *s = priv; /* diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 912344b4ed..4295ca1b6f 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -971,6 +971,12 @@ tests = [ 'userspace_send_encrypted', 'userspace_encrypted_13709'] tags = ['functional', 'userquota'] +[tests/functional/vdev_disk:Linux] +pre = +post = +tests = ['page_alignment'] +tags = ['functional', 'vdev_disk'] + [tests/functional/vdev_zaps] tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos', diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index ab41c05b84..598123bcd2 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -599,6 +599,12 @@ tags = ['functional', 'truncate'] tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool'] tags = ['functional', 'upgrade'] +[tests/functional/vdev_disk:Linux] +pre = +post = +tests = ['page_alignment'] +tags = ['functional', 'vdev_disk'] + [tests/functional/vdev_zaps] tests = ['vdev_zaps_001_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos'] diff --git a/tests/zfs-tests/Makefile.am b/tests/zfs-tests/Makefile.am index 3dd1a64527..40a361d582 100644 --- a/tests/zfs-tests/Makefile.am +++ b/tests/zfs-tests/Makefile.am @@ -13,6 +13,9 @@ scripts_zfs_tests_functional_hkdf_PROGRAMS = %D%/tests/functional/hkdf/hkdf_test %C%_tests_functional_hkdf_hkdf_test_LDADD = \ libzpool.la +scripts_zfs_tests_functional_vdev_diskdir = $(datadir)/$(PACKAGE)/zfs-tests/tests/functional/vdev_disk +scripts_zfs_tests_functional_vdev_disk_PROGRAMS = %D%/tests/functional/vdev_disk/page_alignment + scripts_zfs_tests_functional_cp_filesdir = 
$(datadir)/$(PACKAGE)/zfs-tests/tests/functional/cp_files scripts_zfs_tests_functional_cp_files_PROGRAMS = %D%/tests/functional/cp_files/seekflood diff --git a/tests/zfs-tests/tests/functional/vdev_disk/.gitignore b/tests/zfs-tests/tests/functional/vdev_disk/.gitignore new file mode 100644 index 0000000000..27653e5924 --- /dev/null +++ b/tests/zfs-tests/tests/functional/vdev_disk/.gitignore @@ -0,0 +1 @@ +page_alignment diff --git a/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c new file mode 100644 index 0000000000..98d19a1280 --- /dev/null +++ b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c @@ -0,0 +1,413 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2023, 2024, Klara Inc. + */ + +#include +#include +#include +#include +#include + +/* + * This tests the vdev_disk page alignment check callback + * vdev_disk_check_pages_cb(). For now, this test includes a copy of that + * function from module/os/linux/zfs/vdev_disk.c. If you change it here, + * remember to change it there too, and add tests data here to validate the + * change you're making. 
+ */ + +struct page; + +typedef struct { + uint32_t bmask; + uint32_t npages; + uint32_t end; +} vdev_disk_check_pages_t; + +static int +vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) +{ + (void) page; + vdev_disk_check_pages_t *s = priv; + + /* + * If we didn't finish on a block size boundary last time, then there + * would be a gap if we tried to use this ABD as-is, so abort. + */ + if (s->end != 0) + return (1); + + /* + * Note if we're taking less than a full block, so we can check it + * above on the next call. + */ + s->end = len & s->bmask; + + /* All blocks after the first must start on a block size boundary. */ + if (s->npages != 0 && (off & s->bmask) != 0) + return (1); + + s->npages++; + return (0); +} + +typedef struct { + /* test name */ + const char *name; + + /* blocks size mask */ + uint32_t mask; + + /* amount of data to take */ + size_t size; + + /* [start offset in page, len to end of page or size] */ + size_t pages[16][2]; +} page_test_t; + +static const page_test_t valid_tests[] = { + /* 512B block tests */ + { + "512B blocks, 4K single page", + 0x1ff, 0x1000, { + { 0x0, 0x1000 }, + }, + }, { + "512B blocks, 1K at start of page", + 0x1ff, 0x400, { + { 0x0, 0x1000 }, + }, + }, { + "512B blocks, 1K at end of page", + 0x1ff, 0x400, { + { 0x0c00, 0x0400 }, + }, + }, { + "512B blocks, 1K within page, 512B start offset", + 0x1ff, 0x400, { + { 0x0200, 0x0e00 }, + }, + }, { + "512B blocks, 8K across 2x4K pages", + 0x1ff, 0x2000, { + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + }, + }, { + "512B blocks, 4K across two pages, 2K start offset", + 0x1ff, 0x1000, { + { 0x0800, 0x0800 }, + { 0x0, 0x0800 }, + }, + }, { + "512B blocks, 16K across 5x4K pages, 512B start offset", + 0x1ff, 0x4000, { + { 0x0200, 0x0e00 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0200 }, + }, + }, { + "512B blocks, 64K data, 8x8K compound pages", + 0x1ff, 0x10000, { + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 
0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + }, + }, { + "512B blocks, 64K data, 9x8K compound pages, 512B start offset", + 0x1ff, 0x10000, { + { 0x0200, 0x1e00 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x0200 }, + }, + }, { + "512B blocks, 64K data, 2x16K compound pages, 8x4K pages", + 0x1ff, 0x10000, { + { 0x0, 0x8000 }, + { 0x0, 0x8000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + }, + }, { + "512B blocks, 64K data, mixed 4K/8K/16K pages", + 0x1ff, 0x10000, { + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + { 0x0, 0x1000 }, + { 0x0, 0x8000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + }, + }, { + "512B blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset", + 0x1ff, 0x10000, { + { 0x0400, 0x0c00 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x1000 }, + { 0x0, 0x8000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0400 }, + }, + }, + + /* 4K block tests */ + { + "4K blocks, 4K single page", + 0xfff, 0x1000, { + { 0x0, 0x1000 }, + }, + }, { + "4K blocks, 1K at start of page", + 0xfff, 0x400, { + { 0x0, 0x1000 }, + }, + }, { + "4K blocks, 1K at end of page", + 0xfff, 0x400, { + { 0x0c00, 0x0400 }, + }, + }, { + "4K blocks, 1K within page, 512B start offset", + 0xfff, 0x400, { + { 0x0200, 0x0e00 }, + }, + }, { + "4K blocks, 8K across 2x4K pages", + 0xfff, 0x2000, { + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + }, + }, { + "4K blocks, 4K across two pages, 2K start offset", + 0xfff, 0x1000, { + { 0x0800, 0x0800 }, + { 0x0, 0x0800 }, + }, + }, { + "4K blocks, 16K across 5x4K pages, 512B start offset", + 0xfff, 0x4000, { + { 0x0200, 0x0e00 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 
0x0200 }, + }, + }, { + "4K blocks, 64K data, 8x8K compound pages", + 0xfff, 0x10000, { + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + }, + }, { + "4K blocks, 64K data, 9x8K compound pages, 512B start offset", + 0xfff, 0x10000, { + { 0x0200, 0x1e00 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x0200 }, + }, + }, { + "4K blocks, 64K data, 2x16K compound pages, 8x4K pages", + 0xfff, 0x10000, { + { 0x0, 0x8000 }, + { 0x0, 0x8000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + }, + }, { + "4K blocks, 64K data, mixed 4K/8K/16K pages", + 0xfff, 0x10000, { + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + { 0x0, 0x1000 }, + { 0x0, 0x8000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + }, + }, { + "4K blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset", + 0xfff, 0x10000, { + { 0x0400, 0x0c00 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x1000 }, + { 0x0, 0x8000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0400 }, + }, + }, + + { 0 }, +}; + +static const page_test_t invalid_tests[] = { + { + "512B blocks, 16K data, 512 leader (gang block simulation)", + 0x1ff, 0x8000, { + { 0x0, 0x0200 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0c00 }, + }, + }, { + "4K blocks, 32K data, 2 incompatible spans " + "(gang abd simulation)", + 0xfff, 0x8000, { + { 0x0800, 0x0800 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0800 }, + { 0x0800, 0x0800 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0800 }, + }, + }, + { 0 }, +}; + +static bool +run_test(const page_test_t *test, bool verbose) +{ + 
size_t rem = test->size; + + vdev_disk_check_pages_t s = { + .bmask = 0xfff, + .npages = 0, + .end = 0, + }; + + for (int i = 0; test->pages[i][1] > 0; i++) { + size_t off = test->pages[i][0]; + size_t len = test->pages[i][1]; + + size_t take = MIN(rem, len); + + if (verbose) + printf(" page %d [off %lx len %lx], " + "rem %lx, take %lx\n", + i, off, len, rem, take); + + if (vdev_disk_check_pages_cb(NULL, off, take, &s)) { + if (verbose) + printf(" ABORT: misalignment detected, " + "rem %lx\n", rem); + return (false); + } + + rem -= take; + if (rem == 0) + break; + } + + if (rem > 0) { + if (verbose) + printf(" ABORT: ran out of pages, rem %lx\n", rem); + return (false); + } + + return (true); +} + +static void +run_test_set(const page_test_t *tests, bool want, int *ntests, int *npassed) +{ + for (const page_test_t *test = &tests[0]; test->name; test++) { + bool pass = (run_test(test, false) == want); + if (pass) { + printf("%s: PASS\n", test->name); + (*npassed)++; + } else { + printf("%s: FAIL [expected %s, got %s]\n", test->name, + want ? "VALID" : "INVALID", + want ? "INVALID" : "VALID"); + run_test(test, true); + } + (*ntests)++; + } +} + +int main(void) { + int ntests = 0, npassed = 0; + + run_test_set(valid_tests, true, &ntests, &npassed); + run_test_set(invalid_tests, false, &ntests, &npassed); + + printf("\n%d/%d tests passed\n", npassed, ntests); + + return (ntests == npassed ? 0 : 1); +} From 1bf649cb0a1cc6e48dce848611ba327eb283000e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 10 Apr 2024 13:14:13 +1000 Subject: [PATCH 008/113] vdev_disk: fix alignment check when buffer has non-zero starting offset If a linear buffer spans multiple pages, and the first page has a non-zero starting offset, the checker would not include the offset, and so would think there was an alignment gap at the end of the first page, rather than at the start. 
 That is, for a 16K buffer spread across five pages with an initial 512B offset: [.XXXXXXX][XXXXXXXX][XXXXXXXX][XXXXXXXX][XXXXXXX.] It would be interpreted as: [XXXXXXX.][XXXXXXXX]... And be rejected as misaligned. Since it's already a linear ABD, the "linearising" copy would just reuse the buffer as-is, and the second check would fail, tripping the VERIFY in vdev_disk_io_rw(). This commit fixes all this by including the offset in the check for end-of-page alignment. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16076 --- module/os/linux/zfs/vdev_disk.c | 2 +- tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 77773c4f2b..f3f0c08752 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -882,7 +882,7 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) * Note if we're taking less than a full block, so we can check it * above on the next call. */ - s->end = len & s->bmask; + s->end = (off+len) & s->bmask; /* All blocks after the first must start on a block size boundary. */ if (s->npages != 0 && (off & s->bmask) != 0) diff --git a/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c index 98d19a1280..5c6d28eb2c 100644 --- a/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c +++ b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c @@ -61,7 +61,7 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) * Note if we're taking less than a full block, so we can check it * above on the next call. */ - s->end = len & s->bmask; + s->end = (off+len) & s->bmask; /* All blocks after the first must start on a block size boundary.
*/ if (s->npages != 0 && (off & s->bmask) != 0) From e2035cdbf70e2d4e6f819ce6d5f6a286a152d264 Mon Sep 17 00:00:00 2001 From: Rob N Date: Fri, 12 Apr 2024 07:49:57 +1000 Subject: [PATCH 009/113] AUTHORS: refresh with recent new contributors Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16079 --- .mailmap | 18 ++++++++++++++++++ AUTHORS | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/.mailmap b/.mailmap index 46ef016b93..32bdb52096 100644 --- a/.mailmap +++ b/.mailmap @@ -30,6 +30,7 @@ Andreas Dilger Andrew Walker Benedikt Neuffer Chengfei Zhu +ChenHao Lu <18302010006@fudan.edu.cn> Chris Lindee Colm Buckley Crag Wang @@ -43,6 +44,7 @@ Glenn Washburn Gordan Bobic Gregory Bartholomew hedong zhang +Ilkka Sovanto InsanePrawn Jason Cohen Jason Harmening @@ -57,6 +59,7 @@ KernelOfTruth Liu Hua Liu Qing loli10K +Mart Frauenlob Matthias Blankertz Michael Gmelin Olivier Mazouffre @@ -73,6 +76,9 @@ WHR Yanping Gao Youzhong Yang +# Signed-off-by: overriding Author: +Yuxin Wang + # Commits from strange places, long ago Brian Behlendorf Brian Behlendorf @@ -102,12 +108,15 @@ Brandon Thetford buzzingwires <131118055+buzzingwires@users.noreply.github.com> Cedric Maunoury <38213715+cedricmaunoury@users.noreply.github.com> Charles Suh +Chris Peredun <126915832+chrisperedun@users.noreply.github.com> Dacian Reece-Stremtan <35844628+dacianstremtan@users.noreply.github.com> Damian Szuberski <30863496+szubersk@users.noreply.github.com> Daniel Hiepler <32984777+heeplr@users.noreply.github.com> Daniel Kobras Daniel Reichelt David Quigley +Dennis R. 
Friedrichsen <31087738+dennisfriedrichsen@users.noreply.github.com> +Dex Wood DHE Dmitri John Ledkov <19779+xnox@users.noreply.github.com> Dries Michiels <32487486+driesmp@users.noreply.github.com> @@ -128,6 +137,7 @@ Harry Mallon <1816667+hjmallon@users.noreply.github.com> Hiếu Lê Jake Howard James Cowgill +Jaron Kent-Dobias Jason King Jeff Dike <52420226+jdike@users.noreply.github.com> Jitendra Patidar <53164267+jsai20@users.noreply.github.com> @@ -137,7 +147,9 @@ John L. Hammond <35266395+jhammond-intel@users.noreply. John-Mark Gurney John Ramsden Jonathon Fernyhough <559369+jonathonf@users.noreply.github.com> +Jose Luis Duran Justin Hibbits +Kevin Greene <104801862+kxgreene@users.noreply.github.com> Kevin Jin <33590050+jxdking@users.noreply.github.com> Kevin P. Fleming Krzysztof Piecuch <3964215+pikrzysztof@users.noreply.github.com> @@ -148,9 +160,11 @@ Lorenz Hüdepohl Luís Henriques <73643340+lumigch@users.noreply.github.com> Marcin Skarbek Matt Fiddaman <81489167+matt-fidd@users.noreply.github.com> +Maxim Filimonov Max Zettlmeißl <6818198+maxz@users.noreply.github.com> Michael Niewöhner Michael Zhivich <33133421+mzhivich@users.noreply.github.com> +MigeljanImeri <78048439+MigeljanImeri@users.noreply.github.com> Mo Zhou <5723047+cdluminate@users.noreply.github.com> Nick Mattis omni <79493359+omnivagant@users.noreply.github.com> @@ -164,6 +178,7 @@ Ping Huang <101400146+hpingfs@users.noreply.github.com> Piotr P. 
Stefaniak Richard Allen <33836503+belperite@users.noreply.github.com> Rich Ercolani <214141+rincebrain@users.noreply.github.com> +Rick Macklem <64620010+rmacklem@users.noreply.github.com> Rob Wing <98866084+rob-wing@users.noreply.github.com> Roman Strashkin Ryan Hirasaki <4690732+RyanHir@users.noreply.github.com> @@ -174,6 +189,8 @@ Scott Colby Sean Eric Fagan Spencer Kinny <30333052+Spencer-Kinny@users.noreply.github.com> Srikanth N S <75025422+nssrikanth@users.noreply.github.com> +Stefan Lendl <1321542+stfl@users.noreply.github.com> +Thomas Bertschinger <101425190+bertschinger@users.noreply.github.com> Thomas Geppert Tim Crawford Tom Matthews @@ -181,6 +198,7 @@ Tony Perkins <62951051+tony-zfs@users.noreply.github.com> Torsten Wörtwein Tulsi Jain Václav Skála <33496485+vaclavskala@users.noreply.github.com> +Vaibhav Bhanawat <88050553+vaibhav-delphix@users.noreply.github.com> Violet Purcell <66446404+vimproved@users.noreply.github.com> Vipin Kumar Verma <75025470+vermavipinkumar@users.noreply.github.com> Wolfgang Bumiller diff --git a/AUTHORS b/AUTHORS index be1efb87b3..d7d55f42d2 100644 --- a/AUTHORS +++ b/AUTHORS @@ -88,9 +88,11 @@ CONTRIBUTORS: Bassu Ben Allen Ben Cordero + Benda Xu Benedikt Neuffer Benjamin Albrecht Benjamin Gentil + Benjamin Sherman Ben McGough Ben Rubson Ben Wolsieffer @@ -111,6 +113,7 @@ CONTRIBUTORS: bzzz77 cable2999 Caleb James DeLisle + Cameron Harr Cao Xuewen Carlo Landmeter Carlos Alberto Lopez Perez @@ -120,12 +123,15 @@ CONTRIBUTORS: Chen Can Chengfei Zhu Chen Haiquan + ChenHao Lu <18302010006@fudan.edu.cn> Chip Parker Chris Burroughs + Chris Davidson Chris Dunlap Chris Dunlop Chris Lindee Chris McDonough + Chris Peredun Chris Siden Chris Siebenmann Christer Ekholm @@ -144,6 +150,7 @@ CONTRIBUTORS: Clint Armstrong Coleman Kane Colin Ian King + Colin Percival Colm Buckley Crag Wang Craig Loomis @@ -156,6 +163,7 @@ CONTRIBUTORS: Damiano Albani Damian Szuberski Damian Wojsław + Daniel Berlin Daniel Hiepler Daniel Hoffman Daniel Kobras 
@@ -176,8 +184,10 @@ CONTRIBUTORS: David Quigley Debabrata Banerjee D. Ebdrup + Dennis R. Friedrichsen Denys Rtveliashvili Derek Dai + Dex Wood DHE Didier Roche Dimitri John Ledkov @@ -235,9 +245,11 @@ CONTRIBUTORS: Gionatan Danti Giuseppe Di Natale Glenn Washburn + gofaster Gordan Bobic Gordon Bergling Gordon Ross + Gordon Tetlow Graham Christensen Graham Perrin Gregor Kopka @@ -265,6 +277,7 @@ CONTRIBUTORS: Igor Kozhukhov Igor Lvovsky ilbsmart + Ilkka Sovanto illiliti ilovezfs InsanePrawn @@ -280,9 +293,11 @@ CONTRIBUTORS: Jan Engelhardt Jan Kryl Jan Sanislo + Jaron Kent-Dobias Jason Cohen Jason Harmening Jason King + Jason Lee Jason Zaman Javen Wu Jean-Baptiste Lallement @@ -313,6 +328,7 @@ CONTRIBUTORS: Jonathon Fernyhough Jorgen Lundman Josef 'Jeff' Sipek + Jose Luis Duran Josh Soref Joshua M. Clulow José Luis Salvador Rufo @@ -336,8 +352,10 @@ CONTRIBUTORS: Kash Pande Kay Pedersen Keith M Wesolowski + Kent Ross KernelOfTruth Kevin Bowling + Kevin Greene Kevin Jin Kevin P. Fleming Kevin Tanguy @@ -389,6 +407,7 @@ CONTRIBUTORS: Mark Shellenbaum marku89 Mark Wright + Mart Frauenlob Martin Matuska Martin Rüegg Massimo Maggi @@ -405,6 +424,7 @@ CONTRIBUTORS: Matus Kral Mauricio Faria de Oliveira Max Grossman + Maxim Filimonov Maximilian Mehnert Max Zettlmeißl Md Islam @@ -417,6 +437,7 @@ CONTRIBUTORS: Michael Niewöhner Michael Zhivich Michal Vasilek + MigeljanImeri Mike Gerdts Mike Harsch Mike Leddy @@ -448,6 +469,7 @@ CONTRIBUTORS: Olaf Faaland Oleg Drokin Oleg Stepura + Olivier Certner Olivier Mazouffre omni Orivej Desh @@ -479,6 +501,7 @@ CONTRIBUTORS: Prasad Joshi privb0x23 P.SCH + Quartz Quentin Zdanis Rafael Kitover RageLtMan @@ -491,11 +514,15 @@ CONTRIBUTORS: Riccardo Schirone Richard Allen Richard Elling + Richard Kojedzinszky Richard Laager Richard Lowe Richard Sharpe Richard Yao Rich Ercolani + Rick Macklem + rilysh + Robert Evans Robert Novak Roberto Ricci Rob Norris @@ -509,7 +536,9 @@ CONTRIBUTORS: Ryan Lahfa Ryan Libby Ryan Moeller + Sam Atkinson 
Sam Hathaway + Sam James Sam Lunt Samuel VERSCHELDE Samuel Wycliffe @@ -530,6 +559,8 @@ CONTRIBUTORS: Shaan Nobee Shampavman Shaun Tancheff + Shawn Bayern + Shengqi Chen Shen Yan Simon Guest Simon Klinkert @@ -537,6 +568,7 @@ CONTRIBUTORS: Spencer Kinny Srikanth N S Stanislav Seletskiy + Stefan Lendl Steffen Müthing Stephen Blinick sterlingjensen @@ -557,6 +589,7 @@ CONTRIBUTORS: Teodor Spæren TerraTech Thijs Cramer + Thomas Bertschinger Thomas Geppert Thomas Lamprecht Till Maas @@ -586,6 +619,7 @@ CONTRIBUTORS: Turbo Fredriksson Tyler J. Stachecki Umer Saleem + Vaibhav Bhanawat Valmiky Arquissandas Val Packett Vince van Oosten @@ -614,6 +648,7 @@ CONTRIBUTORS: yuina822 YunQiang Su Yuri Pankov + Yuxin Wang Yuxuan Shui Zachary Bedell Zach Dykstra From a100a195fa490e4a816492be2efa216a6880909f Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Fri, 12 Apr 2024 03:10:24 +0500 Subject: [PATCH 010/113] Add support for zfs mount -R This commit adds support for mounting a dataset along with all of its children with '-R' flag for zfs mount. There can be scenarios where we want to mount all datasets under one hierarchy instead of mounting all datasets present on system with '-a' flag. '-R' flag should work on all root and non-root datasets. Usage information and man page have been updated for zfs mount. A test for verifying the behavior for '-R' flag is also added. 
Reviewed-by: Ameer Hamza Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Umer Saleem Closes #16015 --- cmd/zfs/zfs_main.c | 75 +++++++-- man/man8/zfs-mount.8 | 6 +- tests/runfiles/common.run | 2 +- tests/runfiles/sanity.run | 3 +- tests/zfs-tests/tests/Makefile.am | 1 + .../cli_root/zfs_mount/zfs_mount.cfg | 1 + .../zfs_mount/zfs_mount_recursive.ksh | 146 ++++++++++++++++++ 7 files changed, 216 insertions(+), 18 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index c2147c8f4a..ec52c563b4 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -309,7 +309,8 @@ get_usage(zfs_help_t idx) "[filesystem|volume|snapshot] ...\n")); case HELP_MOUNT: return (gettext("\tmount\n" - "\tmount [-flvO] [-o opts] <-a | filesystem>\n")); + "\tmount [-flvO] [-o opts] <-a|-R filesystem|" + "filesystem>\n")); case HELP_PROMOTE: return (gettext("\tpromote \n")); case HELP_RECEIVE: @@ -6754,6 +6755,8 @@ zfs_do_holds(int argc, char **argv) #define MOUNT_TIME 1 /* seconds */ typedef struct get_all_state { + char **ga_datasets; + int ga_count; boolean_t ga_verbose; get_all_cb_t *ga_cbp; } get_all_state_t; @@ -6800,19 +6803,35 @@ get_one_dataset(zfs_handle_t *zhp, void *data) return (0); } -static void -get_all_datasets(get_all_cb_t *cbp, boolean_t verbose) +static int +get_recursive_datasets(zfs_handle_t *zhp, void *data) { - get_all_state_t state = { - .ga_verbose = verbose, - .ga_cbp = cbp - }; + get_all_state_t *state = data; + int len = strlen(zfs_get_name(zhp)); + for (int i = 0; i < state->ga_count; ++i) { + if (strcmp(state->ga_datasets[i], zfs_get_name(zhp)) == 0) + return (get_one_dataset(zhp, data)); + else if ((strncmp(state->ga_datasets[i], zfs_get_name(zhp), + len) == 0) && state->ga_datasets[i][len] == '/') { + (void) zfs_iter_filesystems_v2(zhp, 0, + get_recursive_datasets, data); + } + } + zfs_close(zhp); + return (0); +} - if 
(verbose) +static void +get_all_datasets(get_all_state_t *state) +{ + if (state->ga_verbose) set_progress_header(gettext("Reading ZFS config")); - (void) zfs_iter_root(g_zfs, get_one_dataset, &state); + if (state->ga_datasets == NULL) + (void) zfs_iter_root(g_zfs, get_one_dataset, state); + else + (void) zfs_iter_root(g_zfs, get_recursive_datasets, state); - if (verbose) + if (state->ga_verbose) finish_progress(gettext("done.")); } @@ -7158,18 +7177,22 @@ static int share_mount(int op, int argc, char **argv) { int do_all = 0; + int recursive = 0; boolean_t verbose = B_FALSE; int c, ret = 0; char *options = NULL; int flags = 0; /* check options */ - while ((c = getopt(argc, argv, op == OP_MOUNT ? ":alvo:Of" : "al")) + while ((c = getopt(argc, argv, op == OP_MOUNT ? ":aRlvo:Of" : "al")) != -1) { switch (c) { case 'a': do_all = 1; break; + case 'R': + recursive = 1; + break; case 'v': verbose = B_TRUE; break; @@ -7211,7 +7234,7 @@ share_mount(int op, int argc, char **argv) argv += optind; /* check number of arguments */ - if (do_all) { + if (do_all || recursive) { enum sa_protocol protocol = SA_NO_PROTOCOL; if (op == OP_SHARE && argc > 0) { @@ -7220,14 +7243,38 @@ share_mount(int op, int argc, char **argv) argv++; } - if (argc != 0) { + if (argc != 0 && do_all) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } + if (argc == 0 && recursive) { + (void) fprintf(stderr, + gettext("no dataset provided\n")); + usage(B_FALSE); + } + start_progress_timer(); get_all_cb_t cb = { 0 }; - get_all_datasets(&cb, verbose); + get_all_state_t state = { 0 }; + if (argc == 0) { + state.ga_datasets = NULL; + state.ga_count = -1; + } else { + zfs_handle_t *zhp; + for (int i = 0; i < argc; i++) { + zhp = zfs_open(g_zfs, argv[i], + ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) + usage(B_FALSE); + zfs_close(zhp); + } + state.ga_datasets = argv; + state.ga_count = argc; + } + state.ga_verbose = verbose; + state.ga_cbp = &cb; + get_all_datasets(&state); if (cb.cb_used == 
0) { free(options); diff --git a/man/man8/zfs-mount.8 b/man/man8/zfs-mount.8 index 35aa187cf0..20dbe4d0e6 100644 --- a/man/man8/zfs-mount.8 +++ b/man/man8/zfs-mount.8 @@ -43,7 +43,7 @@ .Cm mount .Op Fl Oflv .Op Fl o Ar options -.Fl a Ns | Ns Ar filesystem +.Fl a Ns | Ns Fl R Ar filesystem Ns | Ns Ar filesystem .Nm zfs .Cm unmount .Op Fl fu @@ -61,7 +61,7 @@ Displays all ZFS file systems currently mounted. .Cm mount .Op Fl Oflv .Op Fl o Ar options -.Fl a Ns | Ns Ar filesystem +.Fl a Ns | Ns Fl R Ar filesystem Ns | Ns Ar filesystem .Xc Mount ZFS filesystem on a path described by its .Sy mountpoint @@ -83,6 +83,8 @@ for more information. .It Fl a Mount all available ZFS file systems. Invoked automatically as part of the boot process if configured. +.It Fl R +Mount the specified filesystems along with all their children. .It Ar filesystem Mount the specified filesystem. .It Fl o Ar options diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 4295ca1b6f..558cd425af 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -252,7 +252,7 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', - 'zfs_mount_test_race'] + 'zfs_mount_test_race', 'zfs_mount_recursive'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index 598123bcd2..d6a791e337 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -155,7 +155,8 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', - 'zfs_mount_all_fail', 
'zfs_mount_all_mountpoints', 'zfs_mount_test_race'] + 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', + 'zfs_mount_test_race', 'zfs_mount_recursive'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index db6b4c0146..f182a2825c 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -770,6 +770,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh \ functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \ + functional/cli_root/zfs_mount/zfs_mount_recursive.ksh \ functional/cli_root/zfs_mount/zfs_mount_remount.ksh \ functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \ functional/cli_root/zfs_mount/zfs_multi_mount.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg index 06d25faf03..739baf1608 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg @@ -31,6 +31,7 @@ export mountcmd=mount export mountforce="$mountcmd -f" export mountall="$mountcmd -a" +export mountrecursive="$mountcmd -R" export unmountcmd=unmount export unmountforce="$unmountcmd -f" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh new file mode 100755 index 0000000000..0e5cc5d695 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh @@ -0,0 +1,146 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2024, iXsystems Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib + +# +# DESCRIPTION: +# Verify zfs mount -R functionality. +# +# STRATEGY: +# 1. Create nested datasets +# 2. Unmount all datasets +# 3. Recursively mount root datasets, this should mount all datasets +# present in a pool +# 4. Unmount all datasets +# 5. Recursively mount child datasets with children. This should mount +# child datasets, but not the root dataset or parent datasets +# 6. Unmount all datasets +# 7. Recursively mount a nested child dataset and confirm that only it +# and its children are mounted. 
+# + +verify_runnable "both" + +function cleanup +{ + log_must datasetexists $TESTPOOL/$TESTFS1 && \ + destroy_dataset $TESTPOOL/$TESTFS1 -R + log_must datasetexists $TESTPOOL/$TESTFS2 && \ + destroy_dataset $TESTPOOL/$TESTFS2 -R + log_must datasetexists $TESTPOOL/$TESTFS3 && \ + destroy_dataset $TESTPOOL/$TESTFS3 -R +} + +function setup_all +{ + log_must datasetexists $TESTPOOL/$TESTFS || zfs create $TESTPOOL/$TESTFS + log_must zfs create $TESTPOOL/$TESTFS1 + log_must zfs create $TESTPOOL/$TESTFS2 + log_must zfs create $TESTPOOL/$TESTFS3 + log_must zfs create $TESTPOOL/$TESTFS2/child1 + log_must zfs create $TESTPOOL/$TESTFS2/child2 + log_must zfs create $TESTPOOL/$TESTFS2/child3 + log_must zfs create $TESTPOOL/$TESTFS2/child2/subchild + log_must zfs create $TESTPOOL/$TESTFS3/child +} + +log_assert "Verify that 'zfs $mountrecursive' successfully, " \ + "mounts the dataset along with all its children." + +log_onexit cleanup + +log_must setup_all + +log_must zfs $unmountall + +log_must zfs $mountrecursive $TESTPOOL + +log_must mounted $TESTPOOL +log_must mounted $TESTPOOL/$TESTFS +log_must mounted $TESTPOOL/$TESTFS1 +log_must mounted $TESTPOOL/$TESTFS2 +log_must mounted $TESTPOOL/$TESTFS3 +log_must mounted $TESTPOOL/$TESTFS2/child1 +log_must mounted $TESTPOOL/$TESTFS2/child2 +log_must mounted $TESTPOOL/$TESTFS2/child3 +log_must mounted $TESTPOOL/$TESTFS2/child2/subchild +log_must mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $unmountall + +log_mustnot mounted $TESTPOOL +log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_mustnot mounted $TESTPOOL/$TESTFS2 +log_mustnot mounted $TESTPOOL/$TESTFS3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child1 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2 +log_mustnot mounted $TESTPOOL/$TESTFS2/child3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2/subchild +log_mustnot mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $mountrecursive $TESTPOOL/$TESTFS2 $TESTPOOL/$TESTFS3 + +log_mustnot mounted $TESTPOOL 
+log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_must mounted $TESTPOOL/$TESTFS2 +log_must mounted $TESTPOOL/$TESTFS3 +log_must mounted $TESTPOOL/$TESTFS2/child1 +log_must mounted $TESTPOOL/$TESTFS2/child2 +log_must mounted $TESTPOOL/$TESTFS2/child3 +log_must mounted $TESTPOOL/$TESTFS2/child2/subchild +log_must mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $unmountall + +log_mustnot mounted $TESTPOOL +log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_mustnot mounted $TESTPOOL/$TESTFS2 +log_mustnot mounted $TESTPOOL/$TESTFS3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child1 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2 +log_mustnot mounted $TESTPOOL/$TESTFS2/child3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2/subchild +log_mustnot mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $mountrecursive $TESTPOOL/$TESTFS2/child2 + +log_must mounted $TESTPOOL/$TESTFS2/child2 +log_must mounted $TESTPOOL/$TESTFS2/child2/subchild +log_mustnot mounted $TESTPOOL +log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_mustnot mounted $TESTPOOL/$TESTFS2 +log_mustnot mounted $TESTPOOL/$TESTFS3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child1 +log_mustnot mounted $TESTPOOL/$TESTFS2/child3 +log_mustnot mounted $TESTPOOL/$TESTFS3/child + +log_pass "'zfs $mountrecursive' behaves as expected." From cac416f1062fdbd2ff84ff2b40835d4853cbf190 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 4 Apr 2024 22:34:42 +1100 Subject: [PATCH 011/113] zio: remove zio_ioctl() It only had one user, zio_flush(), and there are no other vdev ioctls anyway. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16064 --- include/sys/zio.h | 5 +---- module/zfs/zio.c | 31 ++++++++++++++++--------------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/include/sys/zio.h b/include/sys/zio.h index 25a4b221f0..5dcd7fe073 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -27,7 +27,7 @@ * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Toomas Soome * Copyright (c) 2019, Allan Jude - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2023, 2024, Klara Inc. * Copyright (c) 2019-2020, Michael Niewöhner */ @@ -579,9 +579,6 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *priv, zio_flag_t flags); -extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *priv, zio_flag_t flags); - extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, enum trim_flag trim_flags); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 08d56eef83..4aa08f3b30 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -23,7 +23,7 @@ * Copyright (c) 2011, 2022 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2023, 2024, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2021, Datto, Inc. 
*/ @@ -1449,17 +1449,6 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, return (zio); } -zio_t * -zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, zio_flag_t flags) -{ - zio_t *zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, - ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, - ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); - zio->io_cmd = cmd; - return (zio); -} - zio_t * zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, @@ -1626,15 +1615,27 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, return (zio); } + +/* + * Send a flush command to the given vdev. Unlike most zio creation functions, + * the flush zios are issued immediately. You can wait on pio to pause until + * the flushes complete. + */ void zio_flush(zio_t *pio, vdev_t *vd) { + const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY; + if (vd->vdev_nowritecache) return; + if (vd->vdev_children == 0) { - zio_nowait(zio_ioctl(pio, vd->vdev_spa, vd, - DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); + zio_t *zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0, + NULL, NULL, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, + NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); + zio->io_cmd = DKIOCFLUSHWRITECACHE; + zio_nowait(zio); } else { for (uint64_t c = 0; c < vd->vdev_children; c++) zio_flush(pio, vd->vdev_child[c]); From c9c838aa1fca9aef84d74db1d99872c5efa9a25d Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 4 Apr 2024 22:34:54 +1100 Subject: [PATCH 012/113] zio: remove io_cmd and DKIOCFLUSHWRITECACHE There are no other options, so we can just always assume it's a flush. Includes some light refactoring where a switch statement was doing control flow that no longer works. Sponsored-by: Klara, Inc. 
Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16064 --- include/os/linux/zfs/sys/trace_common.h | 6 +- include/sys/zio.h | 1 - module/os/freebsd/zfs/vdev_file.c | 9 +-- module/os/freebsd/zfs/vdev_geom.c | 43 +++++------- module/os/linux/zfs/vdev_disk.c | 39 ++++------- module/os/linux/zfs/vdev_file.c | 44 +++++------- module/zfs/vdev_draid.c | 14 ++-- module/zfs/zfs_fm.c | 5 +- module/zfs/zil.c | 93 ++++++++++++------------- module/zfs/zio.c | 9 +-- 10 files changed, 106 insertions(+), 157 deletions(-) diff --git a/include/os/linux/zfs/sys/trace_common.h b/include/os/linux/zfs/sys/trace_common.h index 3d4b1920d5..6ffa57c864 100644 --- a/include/os/linux/zfs/sys/trace_common.h +++ b/include/os/linux/zfs/sys/trace_common.h @@ -31,7 +31,6 @@ /* ZIO macros */ #define ZIO_TP_STRUCT_ENTRY \ __field(zio_type_t, zio_type) \ - __field(int, zio_cmd) \ __field(zio_priority_t, zio_priority) \ __field(uint64_t, zio_size) \ __field(uint64_t, zio_orig_size) \ @@ -61,7 +60,6 @@ #define ZIO_TP_FAST_ASSIGN \ __entry->zio_type = zio->io_type; \ - __entry->zio_cmd = zio->io_cmd; \ __entry->zio_priority = zio->io_priority; \ __entry->zio_size = zio->io_size; \ __entry->zio_orig_size = zio->io_orig_size; \ @@ -90,7 +88,7 @@ __entry->zp_dedup_verify = zio->io_prop.zp_dedup_verify; #define ZIO_TP_PRINTK_FMT \ - "zio { type %u cmd %i prio %u size %llu orig_size %llu " \ + "zio { type %u prio %u size %llu orig_size %llu " \ "offset %llu timestamp %llu delta %llu delay %llu " \ "flags 0x%llx stage 0x%x pipeline 0x%x orig_flags 0x%llx " \ "orig_stage 0x%x orig_pipeline 0x%x reexecute %u " \ @@ -98,7 +96,7 @@ "type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }" #define ZIO_TP_PRINTK_ARGS \ - __entry->zio_type, __entry->zio_cmd, __entry->zio_priority, \ + __entry->zio_type, __entry->zio_priority, \ __entry->zio_size, __entry->zio_orig_size, __entry->zio_offset, \ __entry->zio_timestamp, 
__entry->zio_delta, __entry->zio_delay, \ __entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline, \ diff --git a/include/sys/zio.h b/include/sys/zio.h index 5dcd7fe073..545b9cf0c3 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -451,7 +451,6 @@ struct zio { zio_type_t io_type; enum zio_child io_child_type; enum trim_flag io_trim_flags; - int io_cmd; zio_priority_t io_priority; uint8_t io_reexecute; uint8_t io_state[ZIO_WAIT_TYPES]; diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c index a65dfec86c..888c8e7f88 100644 --- a/module/os/freebsd/zfs/vdev_file.c +++ b/module/os/freebsd/zfs/vdev_file.c @@ -255,14 +255,7 @@ vdev_file_io_start(zio_t *zio) return; } - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - zio->io_error = zfs_file_fsync(vf->vf_file, - O_SYNC|O_DSYNC); - break; - default: - zio->io_error = SET_ERROR(ENOTSUP); - } + zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC|O_DSYNC); zio_execute(zio); return; diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index 196d67b4b5..264dfa5c92 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -1153,42 +1153,31 @@ vdev_geom_io_start(zio_t *zio) vd = zio->io_vd; - switch (zio->io_type) { - case ZIO_TYPE_IOCTL: + if (zio->io_type == ZIO_TYPE_IOCTL) { /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; - } else { - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - if (zfs_nocacheflush || - vdev_geom_bio_flush_disable) - break; - if (vd->vdev_nowritecache) { - zio->io_error = SET_ERROR(ENOTSUP); - break; - } - goto sendreq; - default: - zio->io_error = SET_ERROR(ENOTSUP); - } } - zio_execute(zio); - return; - case ZIO_TYPE_TRIM: - if (!vdev_geom_bio_delete_disable) { - goto sendreq; + if (zfs_nocacheflush || vdev_geom_bio_flush_disable) { + zio_execute(zio); + return; + } + + if (vd->vdev_nowritecache) { + zio->io_error = SET_ERROR(ENOTSUP); 
+ zio_execute(zio); + return; + } + } else if (zio->io_type == ZIO_TYPE_TRIM) { + if (vdev_geom_bio_delete_disable) { + zio_execute(zio); + return; } - zio_execute(zio); - return; - default: - ; - /* PASSTHROUGH --- placate compiler */ } -sendreq: + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM || diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index f3f0c08752..554ed22b9d 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1403,38 +1403,29 @@ vdev_disk_io_start(zio_t *zio) case ZIO_TYPE_IOCTL: if (!vdev_readable(v)) { - rw_exit(&vd->vd_lock); - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } - - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - - if (zfs_nocacheflush) - break; - - if (v->vdev_nowritecache) { - zio->io_error = SET_ERROR(ENOTSUP); - break; - } - + /* Drive not there, can't flush */ + error = SET_ERROR(ENXIO); + } else if (zfs_nocacheflush) { + /* Flushing disabled by operator, declare success */ + error = 0; + } else if (v->vdev_nowritecache) { + /* This vdev not capable of flushing */ + error = SET_ERROR(ENOTSUP); + } else { + /* + * Issue the flush. If successful, the response will + * be handled in the completion callback, so we're done. 
+ */ error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio); if (error == 0) { rw_exit(&vd->vd_lock); return; } - - zio->io_error = error; - - break; - - default: - zio->io_error = SET_ERROR(ENOTSUP); } + /* Couldn't issue the flush, so set the error and return it */ rw_exit(&vd->vd_lock); + zio->io_error = error; zio_execute(zio); return; diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c index 5abc0426d1..2b483c9a9f 100644 --- a/module/os/linux/zfs/vdev_file.c +++ b/module/os/linux/zfs/vdev_file.c @@ -250,33 +250,27 @@ vdev_file_io_start(zio_t *zio) return; } - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - - if (zfs_nocacheflush) - break; - - /* - * We cannot safely call vfs_fsync() when PF_FSTRANS - * is set in the current context. Filesystems like - * XFS include sanity checks to verify it is not - * already set, see xfs_vm_writepage(). Therefore - * the sync must be dispatched to a different context. - */ - if (__spl_pf_fstrans_check()) { - VERIFY3U(taskq_dispatch(vdev_file_taskq, - vdev_file_io_fsync, zio, TQ_SLEEP), !=, - TASKQID_INVALID); - return; - } - - zio->io_error = zfs_file_fsync(vf->vf_file, - O_SYNC | O_DSYNC); - break; - default: - zio->io_error = SET_ERROR(ENOTSUP); + if (zfs_nocacheflush) { + zio_execute(zio); + return; } + /* + * We cannot safely call vfs_fsync() when PF_FSTRANS + * is set in the current context. Filesystems like + * XFS include sanity checks to verify it is not + * already set, see xfs_vm_writepage(). Therefore + * the sync must be dispatched to a different context. 
+ */ + if (__spl_pf_fstrans_check()) { + VERIFY3U(taskq_dispatch(vdev_file_taskq, + vdev_file_io_fsync, zio, TQ_SLEEP), !=, + TASKQID_INVALID); + return; + } + + zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC); + zio_execute(zio); return; } else if (zio->io_type == ZIO_TYPE_TRIM) { diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index ec961255fd..7769ed6a37 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -2557,15 +2557,11 @@ vdev_draid_spare_ioctl(zio_t *zio) vdev_t *vd = zio->io_vd; int error = 0; - if (zio->io_cmd == DKIOCFLUSHWRITECACHE) { - for (int c = 0; c < vd->vdev_children; c++) { - zio_nowait(zio_vdev_child_io(zio, NULL, - vd->vdev_child[c], zio->io_offset, zio->io_abd, - zio->io_size, zio->io_type, zio->io_priority, 0, - vdev_draid_spare_child_done, zio)); - } - } else { - error = SET_ERROR(ENOTSUP); + for (int c = 0; c < vd->vdev_children; c++) { + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[c], zio->io_offset, zio->io_abd, + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); } return (error); diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 481af2ba82..2f43c4aa41 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -1096,10 +1096,7 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) return (B_FALSE); if (zio != NULL) { - /* - * If this is not a read or write zio, ignore the error. This - * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. 
- */ + /* If this is not a read or write zio, ignore the error */ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) return (B_FALSE); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 1af357c580..34be54b337 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -125,10 +125,9 @@ static kstat_t *zil_kstats_global; int zil_replay_disable = 0; /* - * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to - * the disk(s) by the ZIL after an LWB write has completed. Setting this - * will cause ZIL corruption on power loss if a volatile out-of-order - * write cache is enabled. + * Disable the flush commands that are normally sent to the disk(s) by the ZIL + * after an LWB write has completed. Setting this will cause ZIL corruption on + * power loss if a volatile out-of-order write cache is enabled. */ static int zil_nocacheflush = 0; @@ -1406,19 +1405,17 @@ zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) } /* - * This function is a called after all vdevs associated with a given lwb - * write have completed their DKIOCFLUSHWRITECACHE command; or as soon - * as the lwb write completes, if "zil_nocacheflush" is set. Further, - * all "previous" lwb's will have completed before this function is - * called; i.e. this function is called for all previous lwbs before - * it's called for "this" lwb (enforced via zio the dependencies - * configured in zil_lwb_set_zio_dependency()). + * This function is called after all vdevs associated with a given lwb write + * have completed their flush command; or as soon as the lwb write completes, + * if "zil_nocacheflush" is set. Further, all "previous" lwb's will have + * completed before this function is called; i.e. this function is called for + * all previous lwbs before it's called for "this" lwb (enforced via the zio + * dependencies configured in zil_lwb_set_zio_dependency()). 
* - * The intention is for this function to be called as soon as the - * contents of an lwb are considered "stable" on disk, and will survive - * any sudden loss of power. At this point, any threads waiting for the - * lwb to reach this state are signalled, and the "waiter" structures - * are marked "done". + * The intention is for this function to be called as soon as the contents of + * an lwb are considered "stable" on disk, and will survive any sudden loss of + * power. At this point, any threads waiting for the lwb to reach this state + * are signalled, and the "waiter" structures are marked "done". */ static void zil_lwb_flush_vdevs_done(zio_t *zio) @@ -1532,17 +1529,16 @@ zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg) } /* - * This is called when an lwb's write zio completes. The callback's - * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs - * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved - * in writing out this specific lwb's data, and in the case that cache - * flushes have been deferred, vdevs involved in writing the data for - * previous lwbs. The writes corresponding to all the vdevs in the - * lwb_vdev_tree will have completed by the time this is called, due to - * the zio dependencies configured in zil_lwb_set_zio_dependency(), - * which takes deferred flushes into account. The lwb will be "done" - * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio - * completion callback for the lwb's root zio. + * This is called when an lwb's write zio completes. The callback's purpose is + * to issue the flush commands for the vdevs in the lwb's lwb_vdev_tree. The + * tree will contain the vdevs involved in writing out this specific lwb's + * data, and in the case that cache flushes have been deferred, vdevs involved + * in writing the data for previous lwbs. 
The writes corresponding to all the + * vdevs in the lwb_vdev_tree will have completed by the time this is called, + * due to the zio dependencies configured in zil_lwb_set_zio_dependency(), + * which takes deferred flushes into account. The lwb will be "done" once + * zil_lwb_flush_vdevs_done() is called, which occurs in the zio completion + * callback for the lwb's root zio. */ static void zil_lwb_write_done(zio_t *zio) @@ -1601,19 +1597,18 @@ zil_lwb_write_done(zio_t *zio) } /* - * If this lwb does not have any threads waiting for it to - * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE - * command to the vdevs written to by "this" lwb, and instead - * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE - * command for those vdevs. Thus, we merge the vdev tree of - * "this" lwb with the vdev tree of the "next" lwb in the list, - * and assume the "next" lwb will handle flushing the vdevs (or - * deferring the flush(s) again). + * If this lwb does not have any threads waiting for it to complete, we + * want to defer issuing the flush command to the vdevs written to by + * "this" lwb, and instead rely on the "next" lwb to handle the flush + * command for those vdevs. Thus, we merge the vdev tree of "this" lwb + * with the vdev tree of the "next" lwb in the list, and assume the + * "next" lwb will handle flushing the vdevs (or deferring the flush(s) + * again). * - * This is a useful performance optimization, especially for - * workloads with lots of async write activity and few sync - * write and/or fsync activity, as it has the potential to - * coalesce multiple flush commands to a vdev into one. + * This is a useful performance optimization, especially for workloads + * with lots of async write activity and few sync write and/or fsync + * activity, as it has the potential to coalesce multiple flush + * commands to a vdev into one. 
*/ if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) { zil_lwb_flush_defer(lwb, nlwb); @@ -1663,16 +1658,16 @@ zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) * If the previous lwb's write hasn't already completed, we also want * to order the completion of the lwb write zios (above, we only order * the completion of the lwb root zios). This is required because of - * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb. + * how we can defer the flush commands for each lwb. * - * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous - * lwb will rely on this lwb to flush the vdevs written to by that - * previous lwb. Thus, we need to ensure this lwb doesn't issue the - * flush until after the previous lwb's write completes. We ensure - * this ordering by setting the zio parent/child relationship here. + * When the flush commands are deferred, the previous lwb will rely on + * this lwb to flush the vdevs written to by that previous lwb. Thus, + * we need to ensure this lwb doesn't issue the flush until after the + * previous lwb's write completes. We ensure this ordering by setting + * the zio parent/child relationship here. * - * Without this relationship on the lwb's write zio, it's possible - * for this lwb's write to complete prior to the previous lwb's write + * Without this relationship on the lwb's write zio, it's possible for + * this lwb's write to complete prior to the previous lwb's write * completing; and thus, the vdevs for the previous lwb would be * flushed prior to that lwb's data being written to those vdevs (the * vdevs are flushed in the lwb write zio's completion handler, @@ -3499,8 +3494,8 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) * callback of the lwb's zio[*]. * * * Actually, the waiters are signaled in the zio completion - * callback of the root zio for the DKIOCFLUSHWRITECACHE commands - * that are sent to the vdevs upon completion of the lwb zio. 
+ * callback of the root zio for the flush commands that are sent to + * the vdevs upon completion of the lwb zio. * * 2. When the itxs are inserted into the ZIL's queue of uncommitted * itxs, the order in which they are inserted is preserved[*]; as diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 4aa08f3b30..031fc3d513 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1631,11 +1631,9 @@ zio_flush(zio_t *pio, vdev_t *vd) return; if (vd->vdev_children == 0) { - zio_t *zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0, + zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0, NULL, NULL, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, - NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); - zio->io_cmd = DKIOCFLUSHWRITECACHE; - zio_nowait(zio); + NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE)); } else { for (uint64_t c = 0; c < vd->vdev_children; c++) zio_flush(pio, vd->vdev_child[c]); @@ -4241,8 +4239,7 @@ zio_vdev_io_assess(zio_t *zio) * boolean flag so that we don't bother with it in the future. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && - zio->io_type == ZIO_TYPE_IOCTL && - zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) + zio->io_type == ZIO_TYPE_IOCTL && vd != NULL) vd->vdev_nowritecache = B_TRUE; if (zio->io_error) From b613709c46bcc0d190c0d67c739ef3f8722d76b2 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 10 Apr 2024 16:07:24 +1000 Subject: [PATCH 013/113] dkio: remove kernel dkio.h compatibility header Without DKIOCFLUSHWRITECACHE, we no longer need the compat header. Note that we're keeping the userspace SPL compat header, which is used by libefi. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16064 --- include/os/freebsd/Makefile.am | 1 - include/os/freebsd/spl/sys/dkio.h | 34 --------------------------- include/os/linux/Makefile.am | 1 - include/os/linux/spl/sys/dkio.h | 39 ------------------------------- include/sys/vdev_impl.h | 1 - 5 files changed, 76 deletions(-) delete mode 100644 include/os/freebsd/spl/sys/dkio.h delete mode 100644 include/os/linux/spl/sys/dkio.h diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am index d4103c2f06..292f79b8ce 100644 --- a/include/os/freebsd/Makefile.am +++ b/include/os/freebsd/Makefile.am @@ -20,7 +20,6 @@ noinst_HEADERS = \ %D%/spl/sys/debug.h \ %D%/spl/sys/dirent.h \ %D%/spl/sys/disp.h \ - %D%/spl/sys/dkio.h \ %D%/spl/sys/fcntl.h \ %D%/spl/sys/file.h \ %D%/spl/sys/freebsd_rwlock.h \ diff --git a/include/os/freebsd/spl/sys/dkio.h b/include/os/freebsd/spl/sys/dkio.h deleted file mode 100644 index cd747089d4..0000000000 --- a/include/os/freebsd/spl/sys/dkio.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - * - * $FreeBSD$ - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 
- * Use is subject to license terms. - */ - -#ifndef _OPENSOLARIS_SYS_DKIO_H_ -#define _OPENSOLARIS_SYS_DKIO_H_ - -#define DKIOC (0x04 << 8) -#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ - -#endif /* _OPENSOLARIS_SYS_DKIO_H_ */ diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am index 332569efe3..f31ae50b96 100644 --- a/include/os/linux/Makefile.am +++ b/include/os/linux/Makefile.am @@ -63,7 +63,6 @@ kernel_spl_sys_HEADERS = \ %D%/spl/sys/ctype.h \ %D%/spl/sys/debug.h \ %D%/spl/sys/disp.h \ - %D%/spl/sys/dkio.h \ %D%/spl/sys/errno.h \ %D%/spl/sys/fcntl.h \ %D%/spl/sys/file.h \ diff --git a/include/os/linux/spl/sys/dkio.h b/include/os/linux/spl/sys/dkio.h deleted file mode 100644 index a90b67d367..0000000000 --- a/include/os/linux/spl/sys/dkio.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . 
- */ - -#ifndef _SPL_DKIO_H -#define _SPL_DKIO_H - -#define DFL_SZ(num_exts) \ - (sizeof (dkioc_free_list_t) + (num_exts - 1) * 16) - -#define DKIOC (0x04 << 8) -#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ - -/* - * ioctl to free space (e.g. SCSI UNMAP) off a disk. - * Pass a dkioc_free_list_t containing a list of extents to be freed. - */ -#define DKIOCFREE (DKIOC|50) - -#endif /* _SPL_DKIO_H */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 2a93f7c680..95164c4546 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include From d7605ae77b7ad176e8dbd5649fe4d14f5f4e8b9f Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 4 Apr 2024 22:35:00 +1100 Subject: [PATCH 014/113] zio: rename ZIO_TYPE_IOCTL to ZIO_TYPE_FLUSH The only possible ioctl is a flush, and any other kind of meta-operation introduced in the future is likely to have different semantics (much like trim did). So, let's just call it what it is. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16064 --- cmd/zinject/zinject.c | 10 +++++----- include/sys/fs/zfs.h | 8 +++++++- include/sys/zio_impl.h | 18 +++++++++--------- man/man8/zpool-events.8 | 12 ++++++------ module/os/freebsd/zfs/vdev_file.c | 2 +- module/os/freebsd/zfs/vdev_geom.c | 8 ++++---- module/os/linux/zfs/vdev_disk.c | 2 +- module/os/linux/zfs/vdev_file.c | 2 +- module/zfs/spa.c | 2 +- module/zfs/vdev.c | 12 ++++++------ module/zfs/vdev_draid.c | 10 +++++----- module/zfs/zio.c | 12 ++++++------ module/zfs/zio_inject.c | 2 +- 13 files changed, 53 insertions(+), 47 deletions(-) diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index 07d3d8af99..a1afa4a63f 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -265,7 +265,7 @@ usage(void) "\t\tspa_vdev_exit() will trigger a panic.\n" "\n" "\tzinject -d device [-e errno] [-L ] [-F]\n" - "\t\t[-T ] [-f frequency] pool\n\n" + "\t\t[-T ] [-f frequency] pool\n\n" "\t\tInject a fault into a particular device or the device's\n" "\t\tlabel. 
Label injection can either be 'nvlist', 'uber',\n " "\t\t'pad1', or 'pad2'.\n" @@ -425,7 +425,7 @@ print_device_handler(int id, const char *pool, zinject_record_t *record, void *data) { static const char *iotypestr[] = { - "null", "read", "write", "free", "claim", "ioctl", "trim", "all", + "null", "read", "write", "free", "claim", "flush", "trim", "all", }; int *count = data; @@ -978,14 +978,14 @@ main(int argc, char **argv) io_type = ZIO_TYPE_FREE; } else if (strcasecmp(optarg, "claim") == 0) { io_type = ZIO_TYPE_CLAIM; - } else if (strcasecmp(optarg, "ioctl") == 0) { - io_type = ZIO_TYPE_IOCTL; + } else if (strcasecmp(optarg, "flush") == 0) { + io_type = ZIO_TYPE_FLUSH; } else if (strcasecmp(optarg, "all") == 0) { io_type = ZIO_TYPES; } else { (void) fprintf(stderr, "invalid I/O type " "'%s': must be 'read', 'write', 'free', " - "'claim', 'ioctl' or 'all'\n", optarg); + "'claim', 'flush' or 'all'\n", optarg); usage(); libzfs_fini(g_zfs); return (1); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 21f99baccc..e191420f2d 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1094,11 +1094,17 @@ typedef enum zio_type { ZIO_TYPE_WRITE, ZIO_TYPE_FREE, ZIO_TYPE_CLAIM, - ZIO_TYPE_IOCTL, + ZIO_TYPE_FLUSH, ZIO_TYPE_TRIM, ZIO_TYPES } zio_type_t; +/* + * Compatibility: _IOCTL was renamed to _FLUSH; keep the old name available to + * user programs. + */ +#define ZIO_TYPE_IOCTL ZIO_TYPE_FLUSH + /* * Pool statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index 4b3726d7ee..2b026d4867 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -40,7 +40,7 @@ extern "C" { * * The ZFS I/O pipeline is comprised of various stages which are defined * in the zio_stage enum below. The individual stages are used to construct - * these basic I/O operations: Read, Write, Free, Claim, Ioctl and Trim. 
+ * these basic I/O operations: Read, Write, Free, Claim, Flush and Trim. * * I/O operations: (XXX - provide detail for each of the operations) * @@ -48,7 +48,7 @@ extern "C" { * Write: * Free: * Claim: - * Ioctl: + * Flush: * Trim: * * Although the most common pipeline are used by the basic I/O operations @@ -122,7 +122,7 @@ extern "C" { * zio pipeline stage definitions */ enum zio_stage { - ZIO_STAGE_OPEN = 1 << 0, /* RWFCIT */ + ZIO_STAGE_OPEN = 1 << 0, /* RWFCXT */ ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R----- */ ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W---- */ @@ -150,15 +150,15 @@ enum zio_stage { ZIO_STAGE_DVA_FREE = 1 << 18, /* --F--- */ ZIO_STAGE_DVA_CLAIM = 1 << 19, /* ---C-- */ - ZIO_STAGE_READY = 1 << 20, /* RWFCIT */ + ZIO_STAGE_READY = 1 << 20, /* RWFCXT */ - ZIO_STAGE_VDEV_IO_START = 1 << 21, /* RW--IT */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 22, /* RW--IT */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--IT */ + ZIO_STAGE_VDEV_IO_START = 1 << 21, /* RW--XT */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 22, /* RW--XT */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--XT */ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R----- */ - ZIO_STAGE_DONE = 1 << 25 /* RWFCIT */ + ZIO_STAGE_DONE = 1 << 25 /* RWFCXT */ }; #define ZIO_ROOT_PIPELINE \ @@ -259,7 +259,7 @@ enum zio_stage { (ZIO_INTERLOCK_STAGES | \ ZIO_STAGE_DVA_CLAIM) -#define ZIO_IOCTL_PIPELINE \ +#define ZIO_FLUSH_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ ZIO_VDEV_IO_STAGES) diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index 12331b7b2a..ef20ef4e00 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -364,7 +364,7 @@ that is, the bits set in the good data which are cleared in the bad data. .Sh I/O STAGES The ZFS I/O pipeline is comprised of various stages which are defined below. The individual stages are used to construct these basic I/O -operations: Read, Write, Free, Claim, Ioctl and Trim. +operations: Read, Write, Free, Claim, Flush and Trim. 
These stages may be set on an event to describe the life cycle of a given I/O request. .Pp @@ -373,7 +373,7 @@ tab(:); l l l . Stage:Bit Mask:Operations _:_:_ -ZIO_STAGE_OPEN:0x00000001:RWFCIT +ZIO_STAGE_OPEN:0x00000001:RWFCXT ZIO_STAGE_READ_BP_INIT:0x00000002:R----- ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W---- @@ -403,13 +403,13 @@ ZIO_STAGE_DVA_CLAIM:0x00080000:---C-- ZIO_STAGE_READY:0x00100000:RWFCIT -ZIO_STAGE_VDEV_IO_START:0x00200000:RW--IT -ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--IT -ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--IT +ZIO_STAGE_VDEV_IO_START:0x00200000:RW--XT +ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--XT +ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--XT ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R----- -ZIO_STAGE_DONE:0x02000000:RWFCIT +ZIO_STAGE_DONE:0x02000000:RWFCXT .TE . .Sh I/O FLAGS diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c index 888c8e7f88..869093afa3 100644 --- a/module/os/freebsd/zfs/vdev_file.c +++ b/module/os/freebsd/zfs/vdev_file.c @@ -247,7 +247,7 @@ vdev_file_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; - if (zio->io_type == ZIO_TYPE_IOCTL) { + if (zio->io_type == ZIO_TYPE_FLUSH) { /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index 264dfa5c92..9d88971919 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -1053,7 +1053,7 @@ vdev_geom_io_intr(struct bio *bp) /* * We have to split bio freeing into two parts, because the ABD code * cannot be called in this context and vdev_op_io_done is not called - * for ZIO_TYPE_IOCTL zio-s. + * for ZIO_TYPE_FLUSH zio-s. 
*/ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { g_destroy_bio(bp); @@ -1153,7 +1153,7 @@ vdev_geom_io_start(zio_t *zio) vd = zio->io_vd; - if (zio->io_type == ZIO_TYPE_IOCTL) { + if (zio->io_type == ZIO_TYPE_FLUSH) { /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); @@ -1181,7 +1181,7 @@ vdev_geom_io_start(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM || - zio->io_type == ZIO_TYPE_IOCTL); + zio->io_type == ZIO_TYPE_FLUSH); cp = vd->vdev_tsd; if (cp == NULL) { @@ -1233,7 +1233,7 @@ vdev_geom_io_start(zio_t *zio) bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; break; - case ZIO_TYPE_IOCTL: + case ZIO_TYPE_FLUSH: bp->bio_cmd = BIO_FLUSH; bp->bio_data = NULL; bp->bio_offset = cp->provider->mediasize; diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 554ed22b9d..2cea61a629 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1400,7 +1400,7 @@ vdev_disk_io_start(zio_t *zio) } switch (zio->io_type) { - case ZIO_TYPE_IOCTL: + case ZIO_TYPE_FLUSH: if (!vdev_readable(v)) { /* Drive not there, can't flush */ diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c index 2b483c9a9f..ac41a2615f 100644 --- a/module/os/linux/zfs/vdev_file.c +++ b/module/os/linux/zfs/vdev_file.c @@ -242,7 +242,7 @@ vdev_file_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; - if (zio->io_type == ZIO_TYPE_IOCTL) { + if (zio->io_type == ZIO_TYPE_FLUSH) { /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 3704ffd088..f67d980ae4 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -180,7 +180,7 @@ static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE 
*/ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FLUSH */ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ }; diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index ebba453e2b..d97d0a8100 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4924,11 +4924,11 @@ vdev_stat_update(zio_t *zio, uint64_t psize) /* * TRIM ops and bytes are reported to user space as - * ZIO_TYPE_IOCTL. This is done to preserve the + * ZIO_TYPE_FLUSH. This is done to preserve the * vdev_stat_t structure layout for user space. */ if (type == ZIO_TYPE_TRIM) - vs_type = ZIO_TYPE_IOCTL; + vs_type = ZIO_TYPE_FLUSH; /* * Solely for the purposes of 'zpool iostat -lqrw' @@ -6239,12 +6239,12 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) case VDEV_PROP_OPS_TRIM: /* * TRIM ops and bytes are reported to user - * space as ZIO_TYPE_IOCTL. This is done to + * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, - vd->vdev_stat.vs_ops[ZIO_TYPE_IOCTL], + vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_NULL: @@ -6275,12 +6275,12 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) case VDEV_PROP_BYTES_TRIM: /* * TRIM ops and bytes are reported to user - * space as ZIO_TYPE_IOCTL. This is done to + * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space. 
*/ vdev_prop_add_list(outnvl, propname, NULL, - vd->vdev_stat.vs_bytes[ZIO_TYPE_IOCTL], + vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; case VDEV_PROP_REMOVING: diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 7769ed6a37..13bb33cc68 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -2548,11 +2548,11 @@ vdev_draid_read_config_spare(vdev_t *vd) } /* - * Handle any ioctl requested of the distributed spare. Only flushes - * are supported in which case all children must be flushed. + * Handle any flush requested of the distributed spare. All children must be + * flushed. */ static int -vdev_draid_spare_ioctl(zio_t *zio) +vdev_draid_spare_flush(zio_t *zio) { vdev_t *vd = zio->io_vd; int error = 0; @@ -2592,8 +2592,8 @@ vdev_draid_spare_io_start(zio_t *zio) } switch (zio->io_type) { - case ZIO_TYPE_IOCTL: - zio->io_error = vdev_draid_spare_ioctl(zio); + case ZIO_TYPE_FLUSH: + zio->io_error = vdev_draid_spare_flush(zio); break; case ZIO_TYPE_WRITE: diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 031fc3d513..8d8523038e 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -63,7 +63,7 @@ const char *const zio_type_name[ZIO_TYPES] = { * Note: Linux kernel thread name length is limited * so these names will differ from upstream open zfs. 
*/ - "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim" + "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_flush", "z_trim" }; int zio_dva_throttle_enabled = B_TRUE; @@ -1632,8 +1632,8 @@ zio_flush(zio_t *pio, vdev_t *vd) if (vd->vdev_children == 0) { zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0, - NULL, NULL, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, - NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE)); + NULL, NULL, ZIO_TYPE_FLUSH, ZIO_PRIORITY_NOW, flags, vd, 0, + NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE)); } else { for (uint64_t c = 0; c < vd->vdev_children; c++) zio_flush(pio, vd->vdev_child[c]); @@ -4086,7 +4086,7 @@ zio_vdev_io_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || - zio->io_type == ZIO_TYPE_IOCTL || + zio->io_type == ZIO_TYPE_FLUSH || zio->io_type == ZIO_TYPE_TRIM); if (zio->io_delay) @@ -4094,7 +4094,7 @@ zio_vdev_io_done(zio_t *zio) if (vd != NULL && vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { - if (zio->io_type != ZIO_TYPE_IOCTL) + if (zio->io_type != ZIO_TYPE_FLUSH) vdev_queue_io_done(zio); if (zio_injection_enabled && zio->io_error == 0) @@ -4239,7 +4239,7 @@ zio_vdev_io_assess(zio_t *zio) * boolean flag so that we don't bother with it in the future. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && - zio->io_type == ZIO_TYPE_IOCTL && vd != NULL) + zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) vd->vdev_nowritecache = B_TRUE; if (zio->io_error) diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 0a4851ecb4..1af2c26f8a 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -367,7 +367,7 @@ zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2) * We skip over faults in the labels unless it's during device open * (i.e. 
zio == NULL) or a device flush (offset is meaningless) */ - if (zio != NULL && zio->io_type != ZIO_TYPE_IOCTL) { + if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH) { uint64_t offset = zio->io_offset; if (offset < VDEV_LABEL_START_SIZE || From b181b2e604de3f36feab1092c702cdec5e78c693 Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 13 Apr 2024 02:00:20 +1000 Subject: [PATCH 015/113] bdev_discard_supported: understand discard_granularity=0 Kernel documentation for the discard_granularity property says: A discard_granularity of 0 means that the device does not support discard functionality. Some older kernels had drivers (notably loop, but also some USB-SATA adapters) that would set the QUEUE_FLAG_DISCARD capability flag, but have discard_granularity=0. Since 5.10 (torvalds/linux@b35fd7422c2f) the discard entry point blkdev_issue_discard() has had a check for this, which would immediately reject the call with EOPNOTSUPP, and throw a scary diagnostic message into the log. See #16068. Since 6.8, the block layer sets a non-zero default for discard_granularity (torvalds/linux@3c407dc723bb), and a future kernel will remove the check entirely[1]. As such, there's no good reason for us to enable discard when discard_granularity=0. The kernel will never let the request go in anyway; better that we just disable it so we can report it properly to the user. 1. https://patchwork.kernel.org/project/linux-block/patch/20240312144826.1045212-2-hch@lst.de/ Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16068 Closes #16082 --- include/os/linux/kernel/linux/blkdev_compat.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index f111e648cc..b0f398354e 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -563,9 +563,11 @@ static inline boolean_t bdev_discard_supported(struct block_device *bdev) { #if defined(HAVE_BDEV_MAX_DISCARD_SECTORS) - return (!!bdev_max_discard_sectors(bdev)); + return (bdev_max_discard_sectors(bdev) > 0 && + bdev_discard_granularity(bdev) > 0); #elif defined(HAVE_BLK_QUEUE_DISCARD) - return (!!blk_queue_discard(bdev_get_queue(bdev))); + return (blk_queue_discard(bdev_get_queue(bdev)) > 0 && + bdev_get_queue(bdev)->limits.discard_granularity > 0); #else #error "Unsupported kernel" #endif From f22b110f60d83f62b75d20fabb0968ab74324778 Mon Sep 17 00:00:00 2001 From: Rob N Date: Tue, 16 Apr 2024 06:44:12 +1000 Subject: [PATCH 016/113] zts: allow running a single test by name only Specifying a single test is kind of a hassle, because the full relative path under the test suite dir has to be included, but it's not always clear what that path even is. This change allows `-t` to take the name of a single test instead of a full path. If the value has no `/` characters, we search for a file of that name under the test root, and if found, use that as the full test path instead. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Akash B Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #16088 --- scripts/zfs-tests.sh | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/scripts/zfs-tests.sh b/scripts/zfs-tests.sh index 179e24d7a0..b5b3e4ab35 100755 --- a/scripts/zfs-tests.sh +++ b/scripts/zfs-tests.sh @@ -326,7 +326,8 @@ OPTIONS: -d DIR Use world-writable DIR for files and loopback devices -s SIZE Use vdevs of SIZE (default: 4G) -r RUNFILES Run tests in RUNFILES (default: ${DEFAULT_RUNFILES}) - -t PATH Run single test at PATH relative to test suite + -t PATH|NAME Run single test at PATH relative to test suite, + or search for test by NAME -T TAGS Comma separated list of tags (default: 'functional') -u USER Run single test as USER (default: root) @@ -340,6 +341,9 @@ $0 -r linux-fast # Run a single test $0 -t tests/functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh +# Run a single test by name +$0 -t zfs_bookmark_cliargs + # Cleanup a previous run of the test suite prior to testing, run the # default ($(echo "${DEFAULT_RUNFILES}" | sed 's/\.run//')) suite of tests and perform no cleanup on exit. 
$0 -x @@ -450,8 +454,15 @@ post_user = root post = outputdir = /var/tmp/test_results EOF - SINGLETESTDIR="${SINGLETEST%/*}" + if [ "$SINGLETEST" = "${SINGLETEST%/*}" ] ; then + NEWSINGLETEST=$(find "$STF_SUITE" -name "$SINGLETEST*" -print -quit) + if [ -z "$NEWSINGLETEST" ] ; then + fail "couldn't find test matching '$SINGLETEST'" + fi + SINGLETEST=$NEWSINGLETEST + fi + SINGLETESTDIR="${SINGLETEST%/*}" SETUPDIR="$SINGLETESTDIR" [ "${SETUPDIR#/}" = "$SETUPDIR" ] && SETUPDIR="$STF_SUITE/$SINGLETESTDIR" [ -x "$SETUPDIR/setup.ksh" ] && SETUPSCRIPT="setup" || SETUPSCRIPT= From 4725e543be32f74d3a0a46ce3bb5c8e89280b471 Mon Sep 17 00:00:00 2001 From: Rob N Date: Tue, 16 Apr 2024 06:52:20 +1000 Subject: [PATCH 017/113] zinject: "no-op" error injection When injected, this causes the matching IO to appear to succeed, but the actual work is never submitted to the physical device. This can be used to simulate a write-back cache servicing a write, but the backing device has failed and the cache cannot complete the operation in the background. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16085 --- cmd/zinject/zinject.c | 7 ++++--- man/man8/zinject.8 | 6 ++++-- module/zfs/zio.c | 10 ++++++++++ .../tests/functional/cli_root/zinject/zinject_args.ksh | 2 +- 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index a1afa4a63f..e9141fb4ba 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -221,6 +221,7 @@ static const struct errstr errstrtable[] = { { ENXIO, "nxio" }, { ECHILD, "dtl" }, { EILSEQ, "corrupt" }, + { ENOSYS, "noop" }, { 0, NULL }, }; @@ -269,8 +270,8 @@ usage(void) "\t\tInject a fault into a particular device or the device's\n" "\t\tlabel. 
Label injection can either be 'nvlist', 'uber',\n " "\t\t'pad1', or 'pad2'.\n" - "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl', or\n" - "\t\t'corrupt' (bit flip).\n" + "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl',\n" + "\t\t'corrupt' (bit flip), or 'noop' (successfully do nothing).\n" "\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n" "\t\tdevice error injection to a percentage of the IOs.\n" "\n" @@ -889,7 +890,7 @@ main(int argc, char **argv) if (error < 0) { (void) fprintf(stderr, "invalid error type " "'%s': must be one of: io decompress " - "decrypt nxio dtl corrupt\n", + "decrypt nxio dtl corrupt noop\n", optarg); usage(); libzfs_fini(g_zfs); diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index 817dcb7fe3..f67b5e378d 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -211,9 +211,11 @@ to flip a bit in the data after a read, .It Sy dtl for an ECHILD error, .It Sy io -for an EIO error where reopening the device will succeed, or +for an EIO error where reopening the device will succeed, .It Sy nxio -for an ENXIO error where reopening the device will fail. +for an ENXIO error where reopening the device will fail, or +.It Sy noop +to drop the IO without executing it, and return success. .El .Pp For EIO and ENXIO, the "failed" reads or writes still occur. diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 8d8523038e..414e3d4e93 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4058,6 +4058,16 @@ zio_vdev_io_start(zio_t *zio) zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { + if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) { + /* + * "no-op" injections return success, but do no actual + * work. Just skip the remaining vdev stages. 
+ */ + zio_vdev_io_bypass(zio); + zio_interrupt(zio); + return (NULL); + } + if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); diff --git a/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh index f8a8ffbb7b..dd9ef9ddd2 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh @@ -47,7 +47,7 @@ function cleanup function test_device_fault { - typeset -a errno=("io" "decompress" "decrypt" "nxio" "dtl" "corrupt") + typeset -a errno=("io" "decompress" "decrypt" "nxio" "dtl" "corrupt" "noop") for e in ${errno[@]}; do log_must eval \ "zinject -d $DISK1 -e $e -T read -f 0.001 $TESTPOOL" From c6da985e28d7071b187bd928e7fd41ba9e9f6aa7 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 15 Apr 2024 21:53:39 +0100 Subject: [PATCH 018/113] Add the BTI elf note to the AArch64 SHA2 assembly On ELF platforms there is a note to specify when an application or library supports BTI. When linking one of these the linker needs all input object files to have the note. If not it will not include it in the output file. Normally the compiler would generate it, but for assembly files we need to do it ourselves. Add the note to the aarch64 sha256 and sha512 assembly files. Tested by building with BTI enabled and using the -zbti-report=error flag to lld that makes it an error if the note is missing. 
Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf Signed-off-by: Andrew Turner Closes #16086 --- module/icp/asm-aarch64/sha2/sha256-armv8.S | 10 ++++++++++ module/icp/asm-aarch64/sha2/sha512-armv8.S | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/module/icp/asm-aarch64/sha2/sha256-armv8.S b/module/icp/asm-aarch64/sha2/sha256-armv8.S index 7ae486e4e2..4dcdd3b65d 100644 --- a/module/icp/asm-aarch64/sha2/sha256-armv8.S +++ b/module/icp/asm-aarch64/sha2/sha256-armv8.S @@ -21,6 +21,16 @@ #if defined(__aarch64__) + .section .note.gnu.property,"a",@note + .p2align 3 + .word 4 + .word 16 + .word 5 + .asciz "GNU" + .word 3221225472 + .word 4 + .word 3 + .word 0 .text .align 6 diff --git a/module/icp/asm-aarch64/sha2/sha512-armv8.S b/module/icp/asm-aarch64/sha2/sha512-armv8.S index 9c61eeee4d..f6c8f77429 100644 --- a/module/icp/asm-aarch64/sha2/sha512-armv8.S +++ b/module/icp/asm-aarch64/sha2/sha512-armv8.S @@ -21,6 +21,16 @@ #if defined(__aarch64__) + .section .note.gnu.property,"a",@note + .p2align 3 + .word 4 + .word 16 + .word 5 + .asciz "GNU" + .word 3221225472 + .word 4 + .word 3 + .word 0 .text .align 6 From 90ba19eb7b81f0225e63bedfb902000d23383921 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Mon, 15 Apr 2024 22:56:10 +0200 Subject: [PATCH 019/113] Do no use .cfi_negate_ra_state within the assembly on Arm64 Compiling openzfs on aarch64 with gcc-8 and gcc-9 is failing currently. See issue #14965 for deeper context. On platforms without pointer authentication, .cfi_negate_ra_state can be defined to a no-op: https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=gdb/aarch64-tdep.c#l1413 I have tested this on Arm64 FreeBSD 13.2 and AlmaLinux-8. 
Reviewed-by: Andrew Turner Signed-off-by: Tino Reichardt Closes #14965 Closes #15784 --- module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S | 14 +++++++++++--- module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S | 12 ++++++++++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S index dc2719d142..e66bb4bc7f 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S @@ -32,6 +32,14 @@ */ #if defined(__aarch64__) + +/* make gcc <= 9 happy */ +#if LD_VERSION >= 233010000 +#define CFI_NEGATE_RA_STATE .cfi_negate_ra_state +#else +#define CFI_NEGATE_RA_STATE +#endif + .text .section .note.gnu.property,"a",@note .p2align 3 @@ -51,7 +59,7 @@ zfs_blake3_compress_in_place_sse2: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 @@ -555,7 +563,7 @@ compress_pre: zfs_blake3_compress_xof_sse2: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 @@ -608,7 +616,7 @@ zfs_blake3_compress_xof_sse2: zfs_blake3_hash_many_sse2: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE stp d15, d14, [sp, #-160]! 
stp d13, d12, [sp, #16] stp d11, d10, [sp, #32] diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S index c4c2dfc5bc..b9fb28dfcf 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S @@ -32,6 +32,14 @@ */ #if defined(__aarch64__) + +/* make gcc <= 9 happy */ +#if LD_VERSION >= 233010000 +#define CFI_NEGATE_RA_STATE .cfi_negate_ra_state +#else +#define CFI_NEGATE_RA_STATE +#endif + .text .section .note.gnu.property,"a",@note .p2align 3 @@ -51,7 +59,7 @@ zfs_blake3_compress_in_place_sse41: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 @@ -565,7 +573,7 @@ compress_pre: zfs_blake3_compress_xof_sse41: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 From cf60db6ebe516d8470d9935c380f7ecc27071a25 Mon Sep 17 00:00:00 2001 From: Rob N Date: Wed, 17 Apr 2024 02:13:01 +1000 Subject: [PATCH 020/113] zts: add a debug option to get full test output The test runner accumulates output from individual tests, then writes it to the log at the end. If a test hangs or crashes the system half way through, we get no insight into how it got to where it did. This adds a -D option for "debug". When set, all test output is written to stdout. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Akash B Signed-off-by: Rob Norris Closes #16096 --- scripts/zfs-tests.sh | 10 +++++++++- tests/test-runner/bin/test-runner.py.in | 23 ++++++++++++++++------- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/scripts/zfs-tests.sh b/scripts/zfs-tests.sh index b5b3e4ab35..c25903ea1b 100755 --- a/scripts/zfs-tests.sh +++ b/scripts/zfs-tests.sh @@ -32,6 +32,7 @@ SCRIPT_COMMON=${SCRIPT_COMMON:-${0%/*}/common.sh} PROG=zfs-tests.sh VERBOSE="no" QUIET="" +DEBUG="" CLEANUP="yes" CLEANUPALL="no" KMSG="" @@ -313,6 +314,7 @@ OPTIONS: -h Show this message -v Verbose zfs-tests.sh output -q Quiet test-runner output + -D Debug; show all test output immediately (noisy) -x Remove all testpools, dm, lo, and files (unsafe) -k Disable cleanup after test failure -K Log test names to /dev/kmsg @@ -351,7 +353,7 @@ $0 -x EOF } -while getopts 'hvqxkKfScRmn:d:s:r:?t:T:u:I:' OPTION; do +while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do case $OPTION in h) usage @@ -397,6 +399,9 @@ while getopts 'hvqxkKfScRmn:d:s:r:?t:T:u:I:' OPTION; do d) FILEDIR="$OPTARG" ;; + D) + DEBUG="yes" + ;; I) ITERATIONS="$OPTARG" if [ "$ITERATIONS" -le 0 ]; then @@ -691,6 +696,7 @@ REPORT_FILE=$(mktemp_file zts-report) # msg "${TEST_RUNNER}" \ "${QUIET:+-q}" \ + "${DEBUG:+-D}" \ "${KMEMLEAK:+-m}" \ "${KMSG:+-K}" \ "-c \"${RUNFILES}\"" \ @@ -700,6 +706,7 @@ msg "${TEST_RUNNER}" \ { PATH=$STF_PATH \ ${TEST_RUNNER} \ ${QUIET:+-q} \ + ${DEBUG:+-D} \ ${KMEMLEAK:+-m} \ ${KMSG:+-K} \ -c "${RUNFILES}" \ @@ -726,6 +733,7 @@ if [ "$RESULT" -eq "2" ] && [ -n "$RERUN" ]; then { PATH=$STF_PATH \ ${TEST_RUNNER} \ ${QUIET:+-q} \ + ${DEBUG:+-D} \ ${KMEMLEAK:+-m} \ -c "${RUNFILES}" \ -T "${TAGS}" \ diff --git a/tests/test-runner/bin/test-runner.py.in b/tests/test-runner/bin/test-runner.py.in index 422ebd7bc8..65247f4f06 100755 --- a/tests/test-runner/bin/test-runner.py.in +++ b/tests/test-runner/bin/test-runner.py.in @@ -113,8 +113,9 @@ class Output(object): This class is 
a slightly modified version of the 'Stream' class found here: http://goo.gl/aSGfv """ - def __init__(self, stream): + def __init__(self, stream, debug=False): self.stream = stream + self.debug = debug self._buf = b'' self.lines = [] @@ -140,6 +141,8 @@ class Output(object): buf = os.read(fd, 4096) if not buf: return None + if self.debug: + os.write(sys.stderr.fileno(), buf) if b'\n' not in buf: self._buf += buf return [] @@ -238,14 +241,14 @@ User: %s ret = '%s -E -u %s %s' % (SUDO, user, cmd) return ret.split(' ') - def collect_output(self, proc): + def collect_output(self, proc, debug=False): """ Read from stdout/stderr as data becomes available, until the process is no longer running. Return the lines from the stdout and stderr Output objects. """ - out = Output(proc.stdout) - err = Output(proc.stderr) + out = Output(proc.stdout, debug) + err = Output(proc.stderr, debug) res = [] while proc.returncode is None: proc.poll() @@ -308,7 +311,10 @@ User: %s try: t.start() - self.result.stdout, self.result.stderr = self.collect_output(proc) + + out, err = self.collect_output(proc, options.debug) + self.result.stdout = out + self.result.stderr = err if kmemleak: cmd = f'{SUDO} sh -c "echo scan > {KMEMLEAK_FILE}"' @@ -624,7 +630,7 @@ Tags: %s class TestRun(object): - props = ['quiet', 'outputdir'] + props = ['quiet', 'outputdir', 'debug'] def __init__(self, options): self.tests = {} @@ -644,7 +650,8 @@ class TestRun(object): ('post_user', ''), ('failsafe', ''), ('failsafe_user', ''), - ('tags', []) + ('tags', []), + ('debug', False) ] def __str__(self): @@ -1067,6 +1074,8 @@ def parse_args(): help='Specify tests to run via config files.') parser.add_option('-d', action='store_true', default=False, dest='dryrun', help='Dry run. 
Print tests, but take no other action.') + parser.add_option('-D', action='store_true', default=False, dest='debug', + help='Write all test output to stdout as it arrives.') parser.add_option('-l', action='callback', callback=options_cb, default=None, dest='logfile', metavar='logfile', type='string', From 454c0b0e46eca93a9d6af262c41b56987b15928e Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Wed, 17 Apr 2024 09:29:21 -0700 Subject: [PATCH 021/113] Linux 6.8 compat: META (#16099) Update the META file to reflect compatibility with the 6.8 kernel. Signed-off-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 8a257f0feb..19a796050f 100644 --- a/META +++ b/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.7 +Linux-Maximum: 6.8 Linux-Minimum: 3.10 From 35bf2584852d47a666a0ae3d1c6903c367e8f169 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Fri, 19 Apr 2024 19:15:38 +0200 Subject: [PATCH 022/113] Fix: FreeBSD Arm64 does not build currently The define LD_VERSION isn't defined on FreeBSD Arm64 when OpenZFS is built with the default compiler: clang. I used only gcc for testing - my fault.
Fast fix as suggested by @mmatuska Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Martin Matuska Signed-off-by: Tino Reichardt Closes #16103 --- module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S | 2 +- module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S index e66bb4bc7f..fefebf0811 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S @@ -34,7 +34,7 @@ #if defined(__aarch64__) /* make gcc <= 9 happy */ -#if LD_VERSION >= 233010000 +#if !defined(LD_VERSION) || LD_VERSION >= 233010000 #define CFI_NEGATE_RA_STATE .cfi_negate_ra_state #else #define CFI_NEGATE_RA_STATE diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S index b9fb28dfcf..1ad6cefc6d 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S @@ -34,7 +34,7 @@ #if defined(__aarch64__) /* make gcc <= 9 happy */ -#if LD_VERSION >= 233010000 +#if !defined(LD_VERSION) || LD_VERSION >= 233010000 #define CFI_NEGATE_RA_STATE .cfi_negate_ra_state #else #define CFI_NEGATE_RA_STATE From cd3e6b4f4c5e0b514f3e76e194b2a5753264d44f Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Fri, 19 Apr 2024 22:19:12 +0500 Subject: [PATCH 023/113] Add zfetch stats in arcstats arc_summary also reports zfetch stats but it's inconvenient to monitor contiguously incrementing numbers. Adding them in arcstats allows us to observe streams more conveniently. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #16094 --- cmd/arcstat.in | 47 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/cmd/arcstat.in b/cmd/arcstat.in index 8df1c62f7e..220f343b5b 100755 --- a/cmd/arcstat.in +++ b/cmd/arcstat.in @@ -157,6 +157,16 @@ cols = { "free": [5, 1024, "ARC free memory"], "avail": [5, 1024, "ARC available memory"], "waste": [5, 1024, "Wasted memory due to round up to pagesize"], + "ztotal": [6, 1000, "zfetch total prefetcher calls per second"], + "zhits": [5, 1000, "zfetch stream hits per second"], + "zahead": [6, 1000, "zfetch hits ahead of streams per second"], + "zpast": [5, 1000, "zfetch hits behind streams per second"], + "zmisses": [7, 1000, "zfetch stream misses per second"], + "zmax": [4, 1000, "zfetch limit reached per second"], + "zfuture": [7, 1000, "zfetch stream future per second"], + "zstride": [7, 1000, "zfetch stream strides per second"], + "zissued": [7, 1000, "zfetch prefetches issued per second"], + "zactive": [7, 1000, "zfetch prefetches active per second"], } v = {} @@ -164,6 +174,8 @@ hdr = ["time", "read", "ddread", "ddh%", "dmread", "dmh%", "pread", "ph%", "size", "c", "avail"] xhdr = ["time", "mfu", "mru", "mfug", "mrug", "unc", "eskip", "mtxmis", "dread", "pread", "read"] +zhdr = ["time", "ztotal", "zhits", "zahead", "zpast", "zmisses", "zmax", + "zfuture", "zstride", "zissued", "zactive"] sint = 1 # Default interval is 1 second count = 1 # Default count is 1 hdr_intr = 20 # Print header every 20 lines of output @@ -206,12 +218,17 @@ elif sys.platform.startswith('linux'): def kstat_update(): global kstat - k = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')] + k1 = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')] - if not k: + k2 = ["zfetch_" + line.strip() for line in + open('/proc/spl/kstat/zfs/zfetchstats')] + + if k1 is None or k2 is None: sys.exit(1) - del k[0:2] + del 
k1[0:2] + del k2[0:2] + k = k1 + k2 kstat = {} for s in k: @@ -239,6 +256,7 @@ def usage(): sys.stderr.write("\t -v : List all possible field headers and definitions" "\n") sys.stderr.write("\t -x : Print extended stats\n") + sys.stderr.write("\t -z : Print zfetch stats\n") sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n") sys.stderr.write("\t -o : Redirect output to the specified file\n") sys.stderr.write("\t -s : Override default field separator with custom " @@ -357,6 +375,7 @@ def init(): global count global hdr global xhdr + global zhdr global opfile global sep global out @@ -368,15 +387,17 @@ def init(): xflag = False hflag = False vflag = False + zflag = False i = 1 try: opts, args = getopt.getopt( sys.argv[1:], - "axo:hvs:f:p", + "axzo:hvs:f:p", [ "all", "extended", + "zfetch", "outfile", "help", "verbose", @@ -410,13 +431,15 @@ def init(): i += 1 if opt in ('-p', '--parsable'): pretty_print = False + if opt in ('-z', '--zfetch'): + zflag = True i += 1 argv = sys.argv[i:] sint = int(argv[0]) if argv else sint count = int(argv[1]) if len(argv) > 1 else (0 if len(argv) > 0 else 1) - if hflag or (xflag and desired_cols): + if hflag or (xflag and zflag) or ((zflag or xflag) and desired_cols): usage() if vflag: @@ -425,6 +448,9 @@ def init(): if xflag: hdr = xhdr + if zflag: + hdr = zhdr + update_hdr_intr() # check if L2ARC exists @@ -569,6 +595,17 @@ def calculate(): v["el2mru"] = d["evict_l2_eligible_mru"] // sint v["el2inel"] = d["evict_l2_ineligible"] // sint v["mtxmis"] = d["mutex_miss"] // sint + v["ztotal"] = (d["zfetch_hits"] + d["zfetch_future"] + d["zfetch_stride"] + + d["zfetch_past"] + d["zfetch_misses"]) // sint + v["zhits"] = d["zfetch_hits"] // sint + v["zahead"] = (d["zfetch_future"] + d["zfetch_stride"]) // sint + v["zpast"] = d["zfetch_past"] // sint + v["zmisses"] = d["zfetch_misses"] // sint + v["zmax"] = d["zfetch_max_streams"] // sint + v["zfuture"] = d["zfetch_future"] // sint + v["zstride"] = d["zfetch_stride"] // 
sint + v["zissued"] = d["zfetch_io_issued"] // sint + v["zactive"] = d["zfetch_io_active"] // sint if l2exist: v["l2hits"] = d["l2_hits"] // sint From f75574cbaaa1ade5bf24ab11751cbd5bc62ef7f1 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 16 Apr 2024 15:03:33 +1000 Subject: [PATCH 024/113] tests/quota_005_pos: use a long int for doubling the quota size When run in isolation, quota_005_pos would see an empty ~300G dataset. Doubling its space overflows an int32, which meant it was trying to then set the quota to a negative value, and would fail. When run as part of the quota tests, the filesystem appears to have stuff in it, and so a lower available space, which doesn't overflow, and so succeeds. The bare minimum fix seems to be to use an int64 for the available space, so it can be comfortably doubled. Here it is. (Also a typo fix and a tiny bit of cleanup). Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Rob Norris Closes #16097 --- tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh index 98ee4edae6..fb3d97f486 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh @@ -55,15 +55,14 @@ function cleanup log_onexit cleanup -log_assert "Verify that quota doesnot inherit its value from parent." -log_onexit cleanup +log_assert "Verify that quota does not inherit its value from parent."
fs=$TESTPOOL/$TESTFS fs_child=$TESTPOOL/$TESTFS/$TESTFS space_avail=$(get_prop available $fs) quota_val=$(get_prop quota $fs) -typeset -i quotasize=$space_avail +typeset -li quotasize=$space_avail ((quotasize = quotasize * 2 )) log_must zfs set quota=$quotasize $fs @@ -72,4 +71,4 @@ quota_space=$(get_prop quota $fs_child) [[ $quota_space == $quotasize ]] && \ log_fail "The quota of child dataset inherits its value from parent." -log_pass "quota doesnot inherit its value from parent as expected." +log_pass "quota does not inherit its value from parent as expected." From 26d49fec5f862818a0410fedbba1efded0543374 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 16 Apr 2024 14:56:35 +1000 Subject: [PATCH 025/113] tests/quota: consistently clear quota property between tests When run in isolation, quota_005_pos would fail in cleanup because it would attempt restore the previous quota, which was 0, and so get an error (because you can't set quota to '0', you have to use 'none'). It worked as part of the quota tag set because the previous tests did not clean up their quota, so there was always a non-zero quota to return to. This adds a simple quota reset function, and has all quota tests run it at cleanup. For the ones that weren't cleaning up, they now do, and for quota_005_pos, which was trying to do the right thing, it now just resets it. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Rob Norris Closes #16097 --- tests/zfs-tests/tests/functional/quota/quota.kshlib | 7 +++++++ tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh | 2 ++ tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh | 2 ++ tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh | 2 ++ tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh | 2 ++ tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh | 2 +- tests/zfs-tests/tests/functional/quota/quota_006_neg.ksh | 2 +- 7 files changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/zfs-tests/tests/functional/quota/quota.kshlib b/tests/zfs-tests/tests/functional/quota/quota.kshlib index 5083415c89..b4cfde020f 100644 --- a/tests/zfs-tests/tests/functional/quota/quota.kshlib +++ b/tests/zfs-tests/tests/functional/quota/quota.kshlib @@ -95,3 +95,10 @@ function exceed_quota log_fail "Returned error code: $zret. Expected: $EDQUOT." return 0 } + +function reset_quota +{ + typeset FILESYSTEM="$1" + + log_must zfs set quota=none $FILESYSTEM +} diff --git a/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh index d124cb26ae..f01008a46b 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh @@ -64,6 +64,8 @@ function cleanup # wait_freeing $TESTPOOL sync_pool $TESTPOOL + + reset_quota $TESTPOOL/$TESTFS } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh index 3af005e874..bea2a5a686 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh @@ -64,6 +64,8 @@ function cleanup wait_freeing $TESTPOOL sync_pool $TESTPOOL + + reset_quota $TESTPOOL/$TESTFS } log_onexit cleanup diff --git 
a/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh index de265813d5..33f6421131 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh @@ -67,6 +67,8 @@ function cleanup # wait_freeing $TESTPOOL sync_pool $TESTPOOL + + reset_quota $TESTPOOL/$TESTCTR/$TESTFS1 } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh index 8f20b533da..682d09f080 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh @@ -65,6 +65,8 @@ function cleanup wait_freeing $TESTPOOL sync_pool $TESTPOOL + + reset_quota $TESTPOOL/$TESTCTR/$TESTFS1 } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh index fb3d97f486..9c4db81ca2 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh @@ -50,7 +50,7 @@ function cleanup { datasetexists $fs_child && destroy_dataset $fs_child - log_must zfs set quota=$quota_val $fs + reset_quota $fs } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/quota/quota_006_neg.ksh b/tests/zfs-tests/tests/functional/quota/quota_006_neg.ksh index 12105162c5..111d771188 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_006_neg.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_006_neg.ksh @@ -50,7 +50,7 @@ log_assert "Verify cannot set quota lower than the space currently in use" function cleanup { - log_must zfs set quota=none $TESTPOOL/$TESTFS + reset_quota $TESTPOOL/$TESTFS } log_onexit cleanup From 9f83eec03904b18e052fbe2c66542bd47254cf57 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 19 Apr 2024 19:18:54 -0400 Subject: [PATCH 026/113] Handle FLUSH errors 
as "expected" Before #16061 zio_vdev_io_done() was not used for FLUSH requests. Addition of it triggers reprobe each TXG for vdevs not supporting them. Since those errors are often expected, they are normally handled by individual vdev drivers and should be ignored here. Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16110 --- module/zfs/zio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 414e3d4e93..1ba99f4d46 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4114,7 +4114,8 @@ zio_vdev_io_done(zio_t *zio) if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); - if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) { + if (zio->io_error && zio->io_type != ZIO_TYPE_FLUSH && + zio->io_type != ZIO_TYPE_TRIM) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { From f4f156157de3f61e55db0429b10c63d02226e115 Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 20 Apr 2024 09:41:31 +1000 Subject: [PATCH 027/113] abd_iter_page: rework to handle multipage scatterlists Previously, abd_iter_page() would assume that every scatterlist would contain a single page (compound or no), because that's all we ever create in abd_alloc_chunks(). However, scatterlists can contain multiple pages of arbitrary provenance, and if we get one of those, we'd get all the math wrong. This reworks things to handle multiple pages in a scatterlist, by properly finding the right page within it for the given offset, and understanding better where the end of the page is and not crossing it. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reported-by: Brian Atkinson Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Rob Norris Closes #16108 --- module/os/linux/zfs/abd_os.c | 120 +++++++++++++++++++++-------------- 1 file changed, 74 insertions(+), 46 deletions(-) diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index d3255dcbc0..cee7410c88 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -1015,10 +1015,50 @@ abd_cache_reap_now(void) } #if defined(_KERNEL) + /* - * Yield the next page struct and data offset and size within it, without + * This is abd_iter_page(), the function underneath abd_iterate_page_func(). + * It yields the next page struct and data offset and size within it, without * mapping it into the address space. */ + +/* + * "Compound pages" are a group of pages that can be referenced from a single + * struct page *. Its organised as a "head" page, followed by a series of + * "tail" pages. + * + * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we + * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a + * great many of the IO buffers we get are going to be of this type. + * + * The tail pages are just regular PAGESIZE pages, and can be safely used + * as-is. However, the head page has length covering itself and all the tail + * pages. If the ABD chunk spans multiple pages, then we can use the head page + * and a >PAGESIZE length, which is far more efficient. + * + * Before kernel 4.5 however, compound page heads were refcounted separately + * from tail pages, such that moving back to the head page would require us to + * take a reference to it and releasing it once we're completely finished with + * it. In practice, that means when our caller is done with the ABD, which we + * have no insight into from here. 
Rather than contort this API to track head + * page references on such ancient kernels, we disable this special compound + * page handling before 4.5, instead just treating each page within it as a + * regular PAGESIZE page (which it is). This is slightly less efficient, but + * makes everything far simpler. + * + * The below test sets/clears ABD_ITER_COMPOUND_PAGES to enable/disable the + * special handling, and also defines the ABD_ITER_PAGE_SIZE(page) macro to + * understand compound pages, or not, as required. + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) +#define ABD_ITER_COMPOUND_PAGES 1 +#define ABD_ITER_PAGE_SIZE(page) \ + (PageCompound(page) ? page_size(page) : PAGESIZE) +#else +#undef ABD_ITER_COMPOUND_PAGES +#define ABD_ITER_PAGE_SIZE(page) (PAGESIZE) +#endif + void abd_iter_page(struct abd_iter *aiter) { @@ -1032,6 +1072,12 @@ abd_iter_page(struct abd_iter *aiter) struct page *page; size_t doff, dsize; + /* + * Find the page, and the start of the data within it. This is computed + * differently for linear and scatter ABDs; linear is referenced by + * virtual memory location, while scatter is referenced by page + * pointer.
+ */ if (abd_is_linear(aiter->iter_abd)) { ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); @@ -1044,57 +1090,24 @@ abd_iter_page(struct abd_iter *aiter) /* offset of address within the page */ doff = offset_in_page(paddr); - - /* total data remaining in abd from this position */ - dsize = aiter->iter_abd->abd_size - aiter->iter_offset; } else { ASSERT(!abd_is_gang(aiter->iter_abd)); /* current scatter page */ - page = sg_page(aiter->iter_sg); + page = nth_page(sg_page(aiter->iter_sg), + aiter->iter_offset >> PAGE_SHIFT); /* position within page */ - doff = aiter->iter_offset; - - /* remaining data in scatterlist */ - dsize = MIN(aiter->iter_sg->length - aiter->iter_offset, - aiter->iter_abd->abd_size - aiter->iter_pos); + doff = aiter->iter_offset & (PAGESIZE - 1); } - ASSERT(page); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) +#ifdef ABD_ITER_COMPOUND_PAGES if (PageTail(page)) { /* - * This page is part of a "compound page", which is a group of - * pages that can be referenced from a single struct page *. - * Its organised as a "head" page, followed by a series of - * "tail" pages. - * - * In OpenZFS, compound pages are allocated using the - * __GFP_COMP flag, which we get from scatter ABDs and SPL - * vmalloc slabs (ie >16K allocations). So a great many of the - * IO buffers we get are going to be of this type. - * - * The tail pages are just regular PAGE_SIZE pages, and can be - * safely used as-is. However, the head page has length - * covering itself and all the tail pages. If this ABD chunk - * spans multiple pages, then we can use the head page and a - * >PAGE_SIZE length, which is far more efficient. - * - * To do this, we need to adjust the offset to be counted from - * the head page. struct page for compound pages are stored - * contiguously, so we can just adjust by a simple offset. 
- * - * Before kernel 4.5, compound page heads were refcounted - * separately, such that moving back to the head page would - * require us to take a reference to it and releasing it once - * we're completely finished with it. In practice, that means - * when our caller is done with the ABD, which we have no - * insight into from here. Rather than contort this API to - * track head page references on such ancient kernels, we just - * compile this block out and use the tail pages directly. This - * is slightly less efficient, but makes everything far - * simpler. + * If this is a compound tail page, move back to the head, and + * adjust the offset to match. This may let us yield a much + * larger amount of data from a single logical page, and so + * leave our caller with fewer pages to process. */ struct page *head = compound_head(page); doff += ((page - head) * PAGESIZE); @@ -1102,12 +1115,27 @@ abd_iter_page(struct abd_iter *aiter) } #endif - /* final page and position within it */ + ASSERT(page); + + /* + * Compute the maximum amount of data we can take from this page. 
This + * is the smaller of: + * - the remaining space in the page + * - the remaining space in this scatterlist entry (which may not cover + * the entire page) + * - the remaining space in the abd (which may not cover the entire + * scatterlist entry) + */ + dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff, + aiter->iter_abd->abd_size - aiter->iter_pos); + if (!abd_is_linear(aiter->iter_abd)) + dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset); + ASSERT3U(dsize, >, 0); + + /* final iterator outputs */ aiter->iter_page = page; aiter->iter_page_doff = doff; - - /* amount of data in the chunk, up to the end of the page */ - aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); + aiter->iter_page_dsize = dsize; } /* From c183d164aa11e61dfe1f34907c1a029d75162f1d Mon Sep 17 00:00:00 2001 From: George Wilson Date: Mon, 22 Apr 2024 12:42:38 -0400 Subject: [PATCH 028/113] Parallel pool import This commit allows spa_load() to drop the spa_namespace_lock so that imports can happen concurrently. Prior to dropping the spa_namespace_lock, the import logic will set the spa_load_thread value to track the thread which is doing the import. Consumers of spa_lookup() retain the same behavior by blocking when either a thread is holding the spa_namespace_lock or the spa_load_thread value is set. This will ensure that critical concurrent operations cannot take place while a pool is being imported. The zpool command is also enhanced to provide multi-threaded support when invoking zpool import -a. Lastly, zinject provides a mechanism to insert artificial delays when importing a pool and new zfs tests are added to verify parallel import functionality.
Contributions-by: Don Brady Reviewed-by: Brian Behlendorf Signed-off-by: George Wilson Closes #16093 --- cmd/zinject/zinject.c | 115 +++++++++++- cmd/zpool/zpool_main.c | 72 ++++++-- include/libzutil.h | 4 +- include/sys/spa.h | 2 + include/sys/spa_impl.h | 3 +- include/sys/zfs_ioctl.h | 4 +- include/sys/zio.h | 4 +- man/man8/zinject.8 | 8 + module/zfs/spa.c | 58 ++++-- module/zfs/spa_misc.c | 26 ++- module/zfs/vdev_initialize.c | 5 +- module/zfs/vdev_rebuild.c | 4 +- module/zfs/vdev_trim.c | 9 +- module/zfs/zio_inject.c | 138 ++++++++++++++- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 3 + .../zpool_import_parallel_admin.ksh | 165 ++++++++++++++++++ .../zpool_import_parallel_neg.ksh | 130 ++++++++++++++ .../zpool_import_parallel_pos.ksh | 137 +++++++++++++++ 19 files changed, 818 insertions(+), 72 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index e9141fb4ba..ed60cce3dd 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -22,7 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2024, Klara Inc. + * Copyright (c) 2023-2024, Klara Inc. 
*/ /* @@ -310,6 +310,11 @@ usage(void) "\t\tcreate 3 lanes on the device; one lane with a latency\n" "\t\tof 10 ms and two lanes with a 25 ms latency.\n" "\n" + "\tzinject -P import|export -s pool\n" + "\t\tAdd an artificial delay to a future pool import or export,\n" + "\t\tsuch that the operation takes a minimum of supplied seconds\n" + "\t\tto complete.\n" + "\n" "\tzinject -I [-s | -g ] pool\n" "\t\tCause the pool to stop writing blocks yet not\n" "\t\treport errors for a duration. Simulates buggy hardware\n" @@ -392,8 +397,10 @@ print_data_handler(int id, const char *pool, zinject_record_t *record, { int *count = data; - if (record->zi_guid != 0 || record->zi_func[0] != '\0') + if (record->zi_guid != 0 || record->zi_func[0] != '\0' || + record->zi_duration != 0) { return (0); + } if (*count == 0) { (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s " @@ -507,6 +514,33 @@ print_panic_handler(int id, const char *pool, zinject_record_t *record, return (0); } +static int +print_pool_delay_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + int *count = data; + + if (record->zi_cmd != ZINJECT_DELAY_IMPORT && + record->zi_cmd != ZINJECT_DELAY_EXPORT) { + return (0); + } + + if (*count == 0) { + (void) printf("%3s %-19s %-11s %s\n", + "ID", "POOL", "DELAY (sec)", "COMMAND"); + (void) printf("--- ------------------- -----------" + " -------\n"); + } + + *count += 1; + + (void) printf("%3d %-19s %-11llu %s\n", + id, pool, (u_longlong_t)record->zi_duration, + record->zi_cmd == ZINJECT_DELAY_IMPORT ? "import": "export"); + + return (0); +} + /* * Print all registered error handlers. Returns the number of handlers * registered. 
@@ -537,6 +571,13 @@ print_all_handlers(void) count = 0; } + (void) iter_handlers(print_pool_delay_handler, &count); + if (count > 0) { + total += count; + (void) printf("\n"); + count = 0; + } + (void) iter_handlers(print_panic_handler, &count); return (count + total); @@ -609,9 +650,27 @@ register_handler(const char *pool, int flags, zinject_record_t *record, zc.zc_guid = flags; if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) { - (void) fprintf(stderr, "failed to add handler: %s\n", - errno == EDOM ? "block level exceeds max level of object" : - strerror(errno)); + const char *errmsg = strerror(errno); + + switch (errno) { + case EDOM: + errmsg = "block level exceeds max level of object"; + break; + case EEXIST: + if (record->zi_cmd == ZINJECT_DELAY_IMPORT) + errmsg = "pool already imported"; + if (record->zi_cmd == ZINJECT_DELAY_EXPORT) + errmsg = "a handler already exists"; + break; + case ENOENT: + /* import delay injector running on older zfs module */ + if (record->zi_cmd == ZINJECT_DELAY_IMPORT) + errmsg = "import delay injector not supported"; + break; + default: + break; + } + (void) fprintf(stderr, "failed to add handler: %s\n", errmsg); return (1); } @@ -636,6 +695,9 @@ register_handler(const char *pool, int flags, zinject_record_t *record, } else if (record->zi_duration < 0) { (void) printf(" txgs: %lld \n", (u_longlong_t)-record->zi_duration); + } else if (record->zi_timer > 0) { + (void) printf(" timer: %lld ms\n", + (u_longlong_t)NSEC2MSEC(record->zi_timer)); } else { (void) printf("objset: %llu\n", (u_longlong_t)record->zi_objset); @@ -834,7 +896,7 @@ main(int argc, char **argv) } while ((c = getopt(argc, argv, - ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { + ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) { switch (c) { case 'a': flags |= ZINJECT_FLUSH_ARC; @@ -952,6 +1014,19 @@ main(int argc, char **argv) sizeof (record.zi_func)); record.zi_cmd = ZINJECT_PANIC; break; + case 'P': + if (strcasecmp(optarg, "import") == 0) { + 
record.zi_cmd = ZINJECT_DELAY_IMPORT; + } else if (strcasecmp(optarg, "export") == 0) { + record.zi_cmd = ZINJECT_DELAY_EXPORT; + } else { + (void) fprintf(stderr, "invalid command '%s': " + "must be 'import' or 'export'\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; case 'q': quiet = 1; break; @@ -1033,7 +1108,7 @@ main(int argc, char **argv) argc -= optind; argv += optind; - if (record.zi_duration != 0) + if (record.zi_duration != 0 && record.zi_cmd == 0) record.zi_cmd = ZINJECT_IGNORED_WRITES; if (cancel != NULL) { @@ -1179,8 +1254,8 @@ main(int argc, char **argv) if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || device != NULL || record.zi_freq > 0 || dvas != 0) { - (void) fprintf(stderr, "panic (-p) incompatible with " - "other options\n"); + (void) fprintf(stderr, "%s incompatible with other " + "options\n", "import|export delay (-P)"); usage(); libzfs_fini(g_zfs); return (2); @@ -1198,6 +1273,28 @@ main(int argc, char **argv) if (argv[1] != NULL) record.zi_type = atoi(argv[1]); dataset[0] = '\0'; + } else if (record.zi_cmd == ZINJECT_DELAY_IMPORT || + record.zi_cmd == ZINJECT_DELAY_EXPORT) { + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || device != NULL || record.zi_freq > 0 || + dvas != 0) { + (void) fprintf(stderr, "%s incompatible with other " + "options\n", "import|export delay (-P)"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (argc != 1 || record.zi_duration <= 0) { + (void) fprintf(stderr, "import|export delay (-P) " + "injection requires a duration (-s) and a single " + "pool name\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + (void) strlcpy(pool, argv[0], sizeof (pool)); } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) { if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || record.zi_freq > 0 || dvas != 0) { diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index d670cd1afe..e6664b918b 100644 --- 
a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -3455,15 +3456,40 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, return (ret); } +typedef struct import_parameters { + nvlist_t *ip_config; + const char *ip_mntopts; + nvlist_t *ip_props; + int ip_flags; + int *ip_err; +} import_parameters_t; + +static void +do_import_task(void *arg) +{ + import_parameters_t *ip = arg; + *ip->ip_err |= do_import(ip->ip_config, NULL, ip->ip_mntopts, + ip->ip_props, ip->ip_flags); + free(ip); +} + + static int import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, - char *orig_name, char *new_name, - boolean_t do_destroyed, boolean_t pool_specified, boolean_t do_all, - importargs_t *import) + char *orig_name, char *new_name, importargs_t *import) { nvlist_t *config = NULL; nvlist_t *found_config = NULL; uint64_t pool_state; + boolean_t pool_specified = (import->poolname != NULL || + import->guid != 0); + + + tpool_t *tp = NULL; + if (import->do_all) { + tp = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN), + 0, NULL); + } /* * At this point we have a list of import candidate configs. 
Even if @@ -3480,9 +3506,11 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); - if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) + if (!import->do_destroyed && + pool_state == POOL_STATE_DESTROYED) continue; - if (do_destroyed && pool_state != POOL_STATE_DESTROYED) + if (import->do_destroyed && + pool_state != POOL_STATE_DESTROYED) continue; verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, @@ -3491,12 +3519,21 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, if (!pool_specified) { if (first) first = B_FALSE; - else if (!do_all) + else if (!import->do_all) (void) fputc('\n', stdout); - if (do_all) { - err |= do_import(config, NULL, mntopts, - props, flags); + if (import->do_all) { + import_parameters_t *ip = safe_malloc( + sizeof (import_parameters_t)); + + ip->ip_config = config; + ip->ip_mntopts = mntopts; + ip->ip_props = props; + ip->ip_flags = flags; + ip->ip_err = &err; + + (void) tpool_dispatch(tp, do_import_task, + (void *)ip); } else { /* * If we're importing from cachefile, then @@ -3544,6 +3581,10 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, found_config = config; } } + if (import->do_all) { + tpool_wait(tp); + tpool_destroy(tp); + } /* * If we were searching for a specific pool, verify that we found a @@ -3773,7 +3814,6 @@ zpool_do_import(int argc, char **argv) boolean_t xtreme_rewind = B_FALSE; boolean_t do_scan = B_FALSE; boolean_t pool_exists = B_FALSE; - boolean_t pool_specified = B_FALSE; uint64_t txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; @@ -3972,7 +4012,6 @@ zpool_do_import(int argc, char **argv) searchname = argv[0]; searchguid = 0; } - pool_specified = B_TRUE; /* * User specified a name or guid. Ensure it's unique. 
@@ -4005,6 +4044,8 @@ zpool_do_import(int argc, char **argv) idata.cachefile = cachefile; idata.scan = do_scan; idata.policy = policy; + idata.do_destroyed = do_destroyed; + idata.do_all = do_all; libpc_handle_t lpch = { .lpc_lib_handle = g_zfs, @@ -4047,9 +4088,7 @@ zpool_do_import(int argc, char **argv) } err = import_pools(pools, props, mntopts, flags, - argc >= 1 ? argv[0] : NULL, - argc >= 2 ? argv[1] : NULL, - do_destroyed, pool_specified, do_all, &idata); + argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, &idata); /* * If we're using the cachefile and we failed to import, then @@ -4070,9 +4109,8 @@ zpool_do_import(int argc, char **argv) pools = zpool_search_import(&lpch, &idata); err = import_pools(pools, props, mntopts, flags, - argc >= 1 ? argv[0] : NULL, - argc >= 2 ? argv[1] : NULL, - do_destroyed, pool_specified, do_all, &idata); + argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, + &idata); } error: diff --git a/include/libzutil.h b/include/libzutil.h index d9a9a65753..e2108ceeaa 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018 by Delphix. All rights reserved. + * Copyright (c) 2018, 2024 by Delphix. All rights reserved. */ #ifndef _LIBZUTIL_H @@ -79,6 +79,8 @@ typedef struct importargs { boolean_t can_be_active; /* can the pool be active? */ boolean_t scan; /* prefer scanning to libblkid cache */ nvlist_t *policy; /* load policy (max txg, rewind, etc.) 
*/ + boolean_t do_destroyed; + boolean_t do_all; } importargs_t; typedef struct libpc_handle { diff --git a/include/sys/spa.h b/include/sys/spa.h index b969f05afe..ca15025ba3 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -833,6 +833,8 @@ void spa_select_allocator(zio_t *zio); /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; +extern avl_tree_t spa_namespace_avl; +extern kcondvar_t spa_namespace_cv; /* * SPA configuration functions in spa_config.c diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 0cd0c4720f..d7da085ab3 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -237,6 +237,7 @@ struct spa { dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ boolean_t spa_is_exporting; /* true while exporting pool */ + kthread_t *spa_load_thread; /* loading, no namespace lock */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */ diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index 26dfe97604..525d40759f 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright (c) 2012, 2024 by Delphix. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2017, Intel Corporation. 
*/ @@ -454,6 +454,8 @@ typedef enum zinject_type { ZINJECT_PANIC, ZINJECT_DELAY_IO, ZINJECT_DECRYPT_FAULT, + ZINJECT_DELAY_IMPORT, + ZINJECT_DELAY_EXPORT, } zinject_type_t; typedef struct zfs_share { diff --git a/include/sys/zio.h b/include/sys/zio.h index 545b9cf0c3..4037b42998 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright (c) 2012, 2024 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Toomas Soome @@ -686,6 +686,8 @@ extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); extern hrtime_t zio_handle_io_delay(zio_t *zio); +extern void zio_handle_import_delay(spa_t *spa, hrtime_t elapsed); +extern void zio_handle_export_delay(spa_t *spa, hrtime_t elapsed); /* * Checksum ereport functions diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index f67b5e378d..ad9e7a42bf 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -129,6 +129,14 @@ Force a vdev error. . .It Xo .Nm zinject +.Fl P Cm import Ns | Ns Cm export Fl s Ar seconds +.Ar pool +.Xc +Add an artificial delay to a future import or export of a pool. +This injector is automatically cleared after the import or export is finished. +. 
+.It Xo +.Nm zinject .Fl I .Op Fl s Ar seconds Ns | Ns Fl g Ar txgs .Ar pool diff --git a/module/zfs/spa.c b/module/zfs/spa.c index f67d980ae4..96daf51b69 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3273,8 +3273,6 @@ spa_spawn_aux_threads(spa_t *spa) { ASSERT(spa_writeable(spa)); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - spa_start_raidz_expansion_thread(spa); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); @@ -4981,7 +4979,8 @@ spa_ld_read_checkpoint_txg(spa_t *spa) int error = 0; ASSERT0(spa->spa_checkpoint_txg); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), @@ -5228,6 +5227,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) boolean_t checkpoint_rewind = (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); boolean_t update_config_cache = B_FALSE; + hrtime_t load_start = gethrtime(); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); @@ -5272,13 +5272,19 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) return (error); } + /* + * Drop the namespace lock for the rest of the function. + */ + spa->spa_load_thread = curthread; + mutex_exit(&spa_namespace_lock); + /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ spa_import_progress_set_notes(spa, "Loading checkpoint txg"); error = spa_ld_read_checkpoint_txg(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the mapping of indirect vdevs. 
Those vdevs were removed @@ -5291,7 +5297,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the full list of active features from the MOS and check if @@ -5300,7 +5306,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Checking feature flags"); error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) - return (error); + goto fail; /* * Load several special directories from the MOS needed by the dsl_pool @@ -5309,7 +5315,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Loading special MOS directories"); error = spa_ld_load_special_directories(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve pool properties from the MOS. @@ -5317,7 +5323,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Loading properties"); error = spa_ld_get_props(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the list of auxiliary devices - cache devices and spares - @@ -5326,7 +5332,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Loading AUX vdevs"); error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) - return (error); + goto fail; /* * Load the metadata for all vdevs. 
Also check if unopenable devices @@ -5335,17 +5341,17 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Loading vdev metadata"); error = spa_ld_load_vdev_metadata(spa); if (error != 0) - return (error); + goto fail; spa_import_progress_set_notes(spa, "Loading dedup tables"); error = spa_ld_load_dedup_tables(spa); if (error != 0) - return (error); + goto fail; spa_import_progress_set_notes(spa, "Loading BRT"); error = spa_ld_load_brt(spa); if (error != 0) - return (error); + goto fail; /* * Verify the logs now to make sure we don't have any unexpected errors @@ -5354,7 +5360,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Verifying Log Devices"); error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) - return (error); + goto fail; if (missing_feat_write) { ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); @@ -5364,8 +5370,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ - return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, - ENOTSUP)); + error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, + ENOTSUP); + goto fail; } /* @@ -5376,7 +5383,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Verifying pool data"); error = spa_ld_verify_pool_data(spa); if (error != 0) - return (error); + goto fail; /* * Calculate the deflated space for the pool. 
This must be done before @@ -5501,13 +5508,19 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_config_exit(spa, SCL_CONFIG, FTAG); spa_import_progress_set_notes(spa, "Finished importing"); } + zio_handle_import_delay(spa, gethrtime() - load_start); spa_import_progress_remove(spa_guid(spa)); spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_load_note(spa, "LOADED"); +fail: + mutex_enter(&spa_namespace_lock); + spa->spa_load_thread = NULL; + cv_broadcast(&spa_namespace_cv); + + return (error); - return (0); } static int @@ -6757,9 +6770,14 @@ spa_tryimport(nvlist_t *tryconfig) /* * Create and initialize the spa structure. */ + char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s", + TRYIMPORT_NAME, (u_longlong_t)curthread, poolname); + mutex_enter(&spa_namespace_lock); - spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); + spa = spa_add(name, tryconfig, NULL); spa_activate(spa, SPA_MODE_READ); + kmem_free(name, MAXPATHLEN); /* * Rewind pool if a max txg was provided. @@ -6874,6 +6892,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, { int error; spa_t *spa; + hrtime_t export_start = gethrtime(); if (oldconfig) *oldconfig = NULL; @@ -7018,6 +7037,9 @@ export_spa: spa->spa_is_exporting = B_FALSE; } + if (new_state == POOL_STATE_EXPORTED) + zio_handle_export_delay(spa, gethrtime() - export_start); + mutex_exit(&spa_namespace_lock); return (0); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 68b9076141..5fb7847b5d 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
* Copyright 2013 Saso Kiselkov. All rights reserved. @@ -82,7 +82,8 @@ * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices - * - Held for the duration of create/destroy/import/export + * - Held for the duration of create/destroy/export + * - Held at the start and end of import * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by @@ -235,9 +236,9 @@ * locking is, always, based on spa_namespace_lock and spa_config_lock[]. */ -static avl_tree_t spa_namespace_avl; +avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; -static kcondvar_t spa_namespace_cv; +kcondvar_t spa_namespace_cv; static const int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; @@ -619,6 +620,7 @@ spa_lookup(const char *name) ASSERT(MUTEX_HELD(&spa_namespace_lock)); +retry: (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* @@ -630,6 +632,14 @@ spa_lookup(const char *name) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); + if (spa == NULL) + return (NULL); + + if (spa->spa_load_thread != NULL && + spa->spa_load_thread != curthread) { + cv_wait(&spa_namespace_cv, &spa_namespace_lock); + goto retry; + } return (spa); } @@ -728,6 +738,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa_config_lock_init(spa); spa_stats_init(spa); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); avl_add(&spa_namespace_avl, spa); /* @@ -826,7 +837,6 @@ spa_remove(spa_t *spa) nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); - cv_broadcast(&spa_namespace_cv); if (spa->spa_root) spa_strfree(spa->spa_root); @@ -920,7 +930,8 @@ void spa_open_ref(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); + MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); (void) zfs_refcount_add(&spa->spa_refcount, tag); } @@ 
-932,7 +943,8 @@ void spa_close(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); + MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); (void) zfs_refcount_remove(&spa->spa_refcount, tag); } diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 5aaef1a699..c5e16af166 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2016, 2024 by Delphix. All rights reserved. */ #include @@ -775,7 +775,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) void vdev_initialize_restart(vdev_t *vd) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 6503390f79..00ebd4c9fc 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -23,6 +23,7 @@ * Copyright (c) 2018, Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. + * Copyright (c) 2024 by Delphix. All rights reserved. */ #include @@ -1071,7 +1072,8 @@ vdev_rebuild_restart_impl(vdev_t *vd) void vdev_rebuild_restart(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); vdev_rebuild_restart_impl(spa->spa_root_vdev); } diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 7e3c5f6847..9753d5a1ea 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2016, 2024 by Delphix. All rights reserved. 
* Copyright (c) 2019 by Lawrence Livermore National Security, LLC. * Copyright (c) 2021 Hewlett Packard Enterprise Development LP * Copyright 2023 RackTop Systems, Inc. @@ -1148,7 +1148,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) void vdev_trim_restart(vdev_t *vd) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { @@ -1568,8 +1569,8 @@ vdev_autotrim_stop_all(spa_t *spa) void vdev_autotrim_restart(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); if (spa->spa_autotrim) vdev_autotrim(spa); } diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 1af2c26f8a..3773e400d7 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2024, Klara Inc. 
*/ /* @@ -59,6 +60,7 @@ uint32_t zio_injection_enabled = 0; typedef struct inject_handler { int zi_id; spa_t *zi_spa; + char *zi_spa_name; /* ZINJECT_DELAY_IMPORT only */ zinject_record_t zi_record; uint64_t *zi_lanes; int zi_next_lane; @@ -703,6 +705,63 @@ zio_handle_io_delay(zio_t *zio) return (min_target); } +static void +zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command) +{ + inject_handler_t *handler; + hrtime_t delay = 0; + int id = 0; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); + handler != NULL && handler->zi_record.zi_cmd == command; + handler = list_next(&inject_handlers, handler)) { + ASSERT3P(handler->zi_spa_name, !=, NULL); + if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) { + uint64_t pause = + SEC2NSEC(handler->zi_record.zi_duration); + if (pause > elapsed) { + delay = pause - elapsed; + } + id = handler->zi_id; + break; + } + } + + rw_exit(&inject_lock); + + if (delay) { + if (command == ZINJECT_DELAY_IMPORT) { + spa_import_progress_set_notes(spa, "injecting %llu " + "sec delay", (u_longlong_t)NSEC2SEC(delay)); + } + zfs_sleep_until(gethrtime() + delay); + } + if (id) { + /* all done with this one-shot handler */ + zio_clear_fault(id); + } +} + +/* + * For testing, inject a delay during an import + */ +void +zio_handle_import_delay(spa_t *spa, hrtime_t elapsed) +{ + zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT); +} + +/* + * For testing, inject a delay during an export + */ +void +zio_handle_export_delay(spa_t *spa, hrtime_t elapsed) +{ + zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT); +} + static int zio_calculate_range(const char *pool, zinject_record_t *record) { @@ -760,6 +819,28 @@ zio_calculate_range(const char *pool, zinject_record_t *record) return (0); } +static boolean_t +zio_pool_handler_exists(const char *name, zinject_type_t command) +{ + boolean_t exists = B_FALSE; + + rw_enter(&inject_lock, RW_READER); + for (inject_handler_t *handler = 
list_head(&inject_handlers); + handler != NULL; handler = list_next(&inject_handlers, handler)) { + if (command != handler->zi_record.zi_cmd) + continue; + + const char *pool = (handler->zi_spa_name != NULL) ? + handler->zi_spa_name : spa_name(handler->zi_spa); + if (strcmp(name, pool) == 0) { + exists = B_TRUE; + break; + } + } + rw_exit(&inject_lock); + + return (exists); +} /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, @@ -810,16 +891,42 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) if (!(flags & ZINJECT_NULL)) { /* - * spa_inject_ref() will add an injection reference, which will - * prevent the pool from being removed from the namespace while - * still allowing it to be unloaded. + * Pool delays for import or export don't take an + * injection reference on the spa. Instead they + * rely on matching by name. */ - if ((spa = spa_inject_addref(name)) == NULL) - return (SET_ERROR(ENOENT)); + if (record->zi_cmd == ZINJECT_DELAY_IMPORT || + record->zi_cmd == ZINJECT_DELAY_EXPORT) { + if (record->zi_duration <= 0) + return (SET_ERROR(EINVAL)); + /* + * Only one import | export delay handler per pool. + */ + if (zio_pool_handler_exists(name, record->zi_cmd)) + return (SET_ERROR(EEXIST)); + + mutex_enter(&spa_namespace_lock); + boolean_t has_spa = spa_lookup(name) != NULL; + mutex_exit(&spa_namespace_lock); + + if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa) + return (SET_ERROR(EEXIST)); + if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa) + return (SET_ERROR(ENOENT)); + spa = NULL; + } else { + /* + * spa_inject_ref() will add an injection reference, + * which will prevent the pool from being removed + * from the namespace while still allowing it to be + * unloaded. 
+ */ + if ((spa = spa_inject_addref(name)) == NULL) + return (SET_ERROR(ENOENT)); + } handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); - - handler->zi_spa = spa; + handler->zi_spa = spa; /* note: can be NULL */ handler->zi_record = *record; if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { @@ -832,6 +939,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) handler->zi_next_lane = 0; } + if (handler->zi_spa == NULL) + handler->zi_spa_name = spa_strdup(name); + else + handler->zi_spa_name = NULL; + rw_enter(&inject_lock, RW_WRITER); /* @@ -891,7 +1003,11 @@ zio_inject_list_next(int *id, char *name, size_t buflen, if (handler) { *record = handler->zi_record; *id = handler->zi_id; - (void) strlcpy(name, spa_name(handler->zi_spa), buflen); + ASSERT(handler->zi_spa || handler->zi_spa_name); + if (handler->zi_spa != NULL) + (void) strlcpy(name, spa_name(handler->zi_spa), buflen); + else + (void) strlcpy(name, handler->zi_spa_name, buflen); ret = 0; } else { ret = SET_ERROR(ENOENT); @@ -941,7 +1057,11 @@ zio_clear_fault(int id) ASSERT3P(handler->zi_lanes, ==, NULL); } - spa_inject_delref(handler->zi_spa); + if (handler->zi_spa_name != NULL) + spa_strfree(handler->zi_spa_name); + + if (handler->zi_spa != NULL) + spa_inject_delref(handler->zi_spa); kmem_free(handler, sizeof (inject_handler_t)); atomic_dec_32(&zio_injection_enabled); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 558cd425af..0586d991b8 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -466,7 +466,8 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'import_paths_changed', 'import_rewind_config_changed', 'import_rewind_device_replaced', - 'zpool_import_status'] + 'zpool_import_status', 'zpool_import_parallel_pos', + 'zpool_import_parallel_neg', 'zpool_import_parallel_admin'] tags = ['functional', 'cli_root', 'zpool_import'] timeout = 1200 diff --git a/tests/zfs-tests/tests/Makefile.am 
b/tests/zfs-tests/tests/Makefile.am index f182a2825c..dc447e0422 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1144,6 +1144,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_status.ksh \ + functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh \ + functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh \ + functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh \ functional/cli_root/zpool_initialize/cleanup.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh new file mode 100755 index 0000000000..c681d1b7dd --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh @@ -0,0 +1,165 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2023 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# Verify that admin commands to a different pool are not blocked by import +# +# STRATEGY: +# 1. Create 2 pools +# 2. Export one of the pools +# 3. Import the pool with an injected delay +# 4. Execute some admin commands against both pools +# 5. Verify that the admin commands to the non-imported pool don't stall +# + +verify_runnable "global" + +function cleanup +{ + zinject -c all + destroy_pool $TESTPOOL1 + destroy_pool $TESTPOOL2 +} + +function pool_import +{ + typeset dir=$1 + typeset pool=$2 + + SECONDS=0 + errmsg=$(zpool import -d $dir -f $pool 2>&1 > /dev/null) + if [[ $? 
-eq 0 ]]; then + echo ${pool}: $stats in $SECONDS secs + echo $SECONDS > ${DEVICE_DIR}/${pool}-${stats} + else + echo ${pool}: $stats failed ${errmsg}, in $SECONDS secs + fi +} + +function pool_create +{ + typeset pool=$1 + typeset device=$2 + + SECONDS=0 + errmsg=$(zpool create $pool $device 2>&1 > /dev/null) + if [[ $? -eq 0 ]]; then + echo ${pool}: created in $SECONDS secs + echo $SECONDS > ${DEVICE_DIR}/${pool}-create + else + echo ${pool}: create failed ${errmsg}, in $SECONDS secs + fi +} + +log_assert "Simple admin commands to different pool not blocked by import" + +log_onexit cleanup + +# +# create two pools and export one +# +log_must zpool create $TESTPOOL1 $VDEV0 +log_must zpool export $TESTPOOL1 +log_must zpool create $TESTPOOL2 $VDEV1 + +# +# import pool asynchronously with an injected 10 second delay +# +log_must zinject -P import -s 10 $TESTPOOL1 +pool_import $DEVICE_DIR $TESTPOOL1 & + +sleep 2 + +# +# run some admin commands on the pools while the import is in progress +# + +pool_add_device $TESTPOOL1 $VDEV2 "log" & +pool_add_device $TESTPOOL2 $VDEV3 "cache" & +pool_stats "status" $TESTPOOL1 & +pool_stats "status" $TESTPOOL2 & +pool_stats "list" $TESTPOOL1 & +pool_stats "list" $TESTPOOL2 & +pool_create $TESTPOOL1 $VDEV4 & +wait + +log_must zpool sync $TESTPOOL1 $TESTPOOL2 + +zpool history $TESTPOOL1 +zpool history $TESTPOOL2 + +log_must test "5" -lt $(<${DEVICE_DIR}/${TESTPOOL1}-import) + +# +# verify that commands to second pool did not wait for import to finish +# +log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-status) +log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-list) +log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-add) +[[ -e ${DEVICE_DIR}/${TESTPOOL1}-create ]] && log_fail "unexpected pool create" + +log_pass "Simple admin commands to different pool not blocked by import" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh new file mode 100755 index 0000000000..339dc2575e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh @@ -0,0 +1,130 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2023 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# Verify that pool imports by same name only have one winner +# +# STRATEGY: +# 1. Create 4 single disk pools with the same name +# 2. Generate some ZIL records (for a longer import) +# 3. Export the pools +# 4. Import the pools in parallel +# 5. 
Repeat with using matching guids +# + +verify_runnable "global" + +POOLNAME="import_pool" +DEV_DIR_PREFIX="$DEVICE_DIR/$POOLNAME" +VDEVSIZE=$((512 * 1024 * 1024)) + +log_assert "parallel pool imports by same name only have one winner" + +# each pool has its own device directory +for i in {0..3}; do + log_must mkdir -p ${DEV_DIR_PREFIX}$i + log_must truncate -s $VDEVSIZE ${DEV_DIR_PREFIX}$i/${DEVICE_FILE}$i +done + +function cleanup +{ + zinject -c all + log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0 + log_must set_tunable64 METASLAB_DEBUG_LOAD 0 + + destroy_pool $POOLNAME + + log_must rm -rf $DEV_DIR_PREFIX* +} + +log_onexit cleanup + +log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1 +log_must set_tunable64 METASLAB_DEBUG_LOAD 1 + +function import_pool +{ + typeset dir=$1 + typeset pool=$2 + typeset newname=$3 + + SECONDS=0 + errmsg=$(zpool import -N -d $dir -f $pool $newname 2>&1 > /dev/null) + if [[ $? -eq 0 ]]; then + touch $dir/imported + echo "imported $pool in $SECONDS secs" + elif [[ $errmsg == *"cannot import"* ]]; then + echo "pool import failed: $errmsg, waited $SECONDS secs" + touch $dir/failed + fi +} + +# +# create four exported pools with the same name +# +for i in {0..3}; do + log_must zpool create $POOLNAME ${DEV_DIR_PREFIX}$i/${DEVICE_FILE}$i + log_must zpool export $POOLNAME +done +log_must zinject -P import -s 10 $POOLNAME + +# +# import the pools in parallel, expecting only one winner +# +for i in {0..3}; do + import_pool ${DEV_DIR_PREFIX}$i $POOLNAME & +done +wait + +# check the result of background imports +typeset num_imports=0 +typeset num_cannot=0 +for i in {0..3}; do + if [[ -f ${DEV_DIR_PREFIX}$i/imported ]]; then + ((num_imports += 1)) + fi + if [[ -f ${DEV_DIR_PREFIX}$i/failed ]]; then + ((num_cannot += 1)) + loser=$i + fi +done +[[ $num_imports -eq "1" ]] || log_fail "expecting an import" +[[ $num_cannot -eq "3" ]] || \ + log_fail "expecting 3 pool exists errors, found $num_cannot" + +log_note "$num_imports imported and 
$num_cannot failed (expected)" + +log_pass "parallel pool imports by same name only have one winner" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh new file mode 100755 index 0000000000..71b2437a37 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh @@ -0,0 +1,137 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2023 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# test uses 8 vdevs +export MAX_NUM=8 + +# +# DESCRIPTION: +# Verify that pool imports can occur in parallel +# +# STRATEGY: +# 1. Create 8 pools +# 2. Generate some ZIL records +# 3. Export the pools +# 4. Import half of the pools synchronously to baseline sequential cost +# 5. Import the other half asynchronously to demonstrate parallel savings +# 6. 
Export 4 pools +# 7. Test zpool import -a +# + +verify_runnable "global" + +# +# override the minimum sized vdevs +# +VDEVSIZE=$((512 * 1024 * 1024)) +increase_device_sizes $VDEVSIZE + +POOLNAME="import_pool" + +function cleanup +{ + zinject -c all + log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0 + log_must set_tunable64 METASLAB_DEBUG_LOAD 0 + + for i in {0..$(($MAX_NUM - 1))}; do + destroy_pool $POOLNAME-$i + done + # reset the devices + increase_device_sizes 0 + increase_device_sizes $FILE_SIZE +} + +log_assert "Pool imports can occur in parallel" + +log_onexit cleanup + +log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1 +log_must set_tunable64 METASLAB_DEBUG_LOAD 1 + + +# +# create some exported pools with import delay injectors +# +for i in {0..$(($MAX_NUM - 1))}; do + log_must zpool create $POOLNAME-$i $DEVICE_DIR/${DEVICE_FILE}$i + log_must zpool export $POOLNAME-$i + log_must zinject -P import -s 12 $POOLNAME-$i +done +wait + +# +# import half of the pools synchronously +# +SECONDS=0 +for i in {0..3}; do + log_must zpool import -d $DEVICE_DIR -f $POOLNAME-$i +done +sequential_time=$SECONDS +log_note "sequentially imported 4 pools in $sequential_time seconds" + +# +# import half of the pools in parallel +# +SECONDS=0 +for i in {4..7}; do + log_must zpool import -d $DEVICE_DIR -f $POOLNAME-$i & +done +wait +parallel_time=$SECONDS +log_note "asyncronously imported 4 pools in $parallel_time seconds" + +log_must test $parallel_time -lt $(($sequential_time / 3)) + +# +# export pools with import delay injectors +# +for i in {4..7}; do + log_must zpool export $POOLNAME-$i + log_must zinject -P import -s 12 $POOLNAME-$i +done +wait + +# +# now test zpool import -a +# +SECONDS=0 +log_must zpool import -a -d $DEVICE_DIR -f +parallel_time=$SECONDS +log_note "asyncronously imported 4 pools in $parallel_time seconds" + +log_must test $parallel_time -lt $(($sequential_time / 3)) + +log_pass "Pool imports occur in parallel" From 
9b43d7ba85059d37533d42f62cbb646203fd4a94 Mon Sep 17 00:00:00 2001 From: Seth Troisi Date: Mon, 22 Apr 2024 10:45:39 -0700 Subject: [PATCH 029/113] Add newline to two zpool messages Reviewed-by: Brian Behlendorf Signed-off-by: Seth Troisi Closes #16113 --- cmd/zpool/zpool_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index e6664b918b..636eb2a301 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -3445,10 +3445,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, ms_status = zpool_enable_datasets(zhp, mntopts, 0); if (ms_status == EZFS_SHAREFAILED) { (void) fprintf(stderr, gettext("Import was " - "successful, but unable to share some datasets")); + "successful, but unable to share some datasets\n")); } else if (ms_status == EZFS_MOUNTFAILED) { (void) fprintf(stderr, gettext("Import was " - "successful, but unable to mount some datasets")); + "successful, but unable to mount some datasets\n")); } } From cdae59e1530061cf4caa549a062994161c4383c6 Mon Sep 17 00:00:00 2001 From: Seth Troisi Date: Mon, 22 Apr 2024 10:47:44 -0700 Subject: [PATCH 030/113] ZTS: user_namespace_004.ksh avoid error in cleanup if unsupported Reviewed-by: Brian Behlendorf Signed-off-by: Seth Troisi Closes #16114 --- .../tests/functional/user_namespace/user_namespace_004.ksh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh index 37ef84b723..e6ad25f23f 100755 --- a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh +++ b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh @@ -44,8 +44,6 @@ user_ns_cleanup() { log_must zfs destroy -r "$TESTPOOL/userns" } -log_onexit user_ns_cleanup - log_assert "Check zfs zone command handling of non-namespace files" # Pass if user namespaces are not 
supported. @@ -54,6 +52,8 @@ if [ "$?" -ne "0" ]; then log_unsupported "Failed to create user namespace" fi +log_onexit user_ns_cleanup + # Create the baseline datasets. log_must zfs create -o zoned=on "$TESTPOOL/userns" From 7e52795aad561ec39e76a3ef6fea9e5c254b2e16 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Mon, 22 Apr 2024 10:48:58 -0700 Subject: [PATCH 031/113] ztest: use ASSERT3P to compare pointers With a sufficiently modern gcc (I saw this with gcc13), gcc complains when casting pointers to an integer of a different type (even a larger one). ASSERT3U does this on 32-bit systems by casting a 32-bit pointer to uint64_t, so use ASSERT3P, which uses uintptr_t. Fixes: 5caeef02fa53 RAID-Z expansion feature Reviewed-by: Brian Behlendorf Signed-off-by: Brooks Davis Closes #16115 --- cmd/ztest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/ztest.c b/cmd/ztest.c index 684ab586bb..b0fea8b3cf 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -8045,7 +8045,7 @@ ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa) ztest_expand_io_t *thread_args; ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE); - ASSERT3U(rzvd->vdev_ops, ==, &vdev_raidz_ops); + ASSERT3P(rzvd->vdev_ops, ==, &vdev_raidz_ops); ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED; /* Setup a 1 MiB buffer of random data */ From c346068e5efeafd5676ab1644086877173ca4226 Mon Sep 17 00:00:00 2001 From: Ryan Date: Tue, 23 Apr 2024 01:59:31 +0800 Subject: [PATCH 032/113] zfs get: add '-t fs' and '-t vol' options Make `zfs get` accept `fs` for `filesystem` and `vol` for `volume`. 
Reviewed-by: Rob Norris Reviewed-by: Brian Behlendorf Signed-off-by: Ryan Closes #16117 --- cmd/zfs/zfs_main.c | 22 ++++++++++++++++------ man/man8/zfs-set.8 | 11 ++++++++++- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index ec52c563b4..0bbdd5b18e 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -2146,15 +2146,25 @@ found2:; for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const type_opts[] = { - "filesystem", "volume", - "snapshot", "snap", + "filesystem", + "fs", + "volume", + "vol", + "snapshot", + "snap", "bookmark", - "all" }; + "all" + }; static const int type_types[] = { - ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME, - ZFS_TYPE_SNAPSHOT, ZFS_TYPE_SNAPSHOT, + ZFS_TYPE_FILESYSTEM, + ZFS_TYPE_FILESYSTEM, + ZFS_TYPE_VOLUME, + ZFS_TYPE_VOLUME, + ZFS_TYPE_SNAPSHOT, + ZFS_TYPE_SNAPSHOT, ZFS_TYPE_BOOKMARK, - ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK }; + ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK + }; for (i = 0; i < ARRAY_SIZE(type_opts); ++i) if (strcmp(tok, type_opts[i]) == 0) { diff --git a/man/man8/zfs-set.8 b/man/man8/zfs-set.8 index c01bcc643e..8cc19caf3f 100644 --- a/man/man8/zfs-set.8 +++ b/man/man8/zfs-set.8 @@ -29,7 +29,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd April 20, 2024 .Dt ZFS-SET 8 .Os . @@ -158,6 +158,15 @@ A comma-separated list of types to display, where .Ar type is one of .Sy filesystem , snapshot , volume , bookmark , No or Sy all . +.Sy fs , +.Sy snap , +or +.Sy vol +can be used as aliases for +.Sy filesystem , +.Sy snapshot , +or +.Sy volume . .El .It Xo .Nm zfs From 4036b8d027fb7fe1a629b08a0d23cac975ab2eb9 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 22 Apr 2024 14:41:03 -0400 Subject: [PATCH 033/113] Refactor dbuf_read() for safer decryption In dbuf_read_verify_dnode_crypt(): - We don't need original dbuf locked there. Instead take a lock on a dnode dbuf, that is actually manipulated. 
- Block decryption for a dnode dbuf if it is currently being written. ARC hash lock does not protect anonymous buffers, so arc_untransform() is unsafe when used on buffers being written, that may happen in case of encrypted dnode buffers, since they are not copied by dbuf_dirty()/dbuf_hold_copy(). In dbuf_read(): - If the buffer is in flight, recheck its compression/encryption status after it is cached, since it may need arc_untransform(). Tested-by: Rich Ercolani Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16104 --- module/zfs/dbuf.c | 222 ++++++++++++++++++++++------------------------ 1 file changed, 108 insertions(+), 114 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 5f3643f573..bb913f5563 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -161,13 +161,13 @@ struct { } dbuf_sums; #define DBUF_STAT_INCR(stat, val) \ - wmsum_add(&dbuf_sums.stat, val); + wmsum_add(&dbuf_sums.stat, val) #define DBUF_STAT_DECR(stat, val) \ - DBUF_STAT_INCR(stat, -(val)); + DBUF_STAT_INCR(stat, -(val)) #define DBUF_STAT_BUMP(stat) \ - DBUF_STAT_INCR(stat, 1); + DBUF_STAT_INCR(stat, 1) #define DBUF_STAT_BUMPDOWN(stat) \ - DBUF_STAT_INCR(stat, -1); + DBUF_STAT_INCR(stat, -1) #define DBUF_STAT_MAX(stat, v) { \ uint64_t _m; \ while ((v) > (_m = dbuf_stats.stat.value.ui64) && \ @@ -177,7 +177,6 @@ struct { static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); -static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags); /* * Global data structures and functions for the dbuf cache. @@ -1418,13 +1417,9 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, * a decrypted block. Otherwise success. 
*/ static int -dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) +dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn) { - int bonuslen, max_bonuslen, err; - - err = dbuf_read_verify_dnode_crypt(db, flags); - if (err) - return (err); + int bonuslen, max_bonuslen; bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); @@ -1509,32 +1504,46 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp) * decrypt / authenticate them when we need to read an encrypted bonus buffer. */ static int -dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) +dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) { - int err = 0; objset_t *os = db->db_objset; - arc_buf_t *dnode_abuf; - dnode_t *dn; + dmu_buf_impl_t *dndb; + arc_buf_t *dnbuf; zbookmark_phys_t zb; - - ASSERT(MUTEX_HELD(&db->db_mtx)); + int err; if ((flags & DB_RF_NO_DECRYPT) != 0 || - !os->os_encrypted || os->os_raw_receive) + !os->os_encrypted || os->os_raw_receive || + (dndb = dn->dn_dbuf) == NULL) return (0); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL; - - if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) { - DB_DNODE_EXIT(db); + dnbuf = dndb->db_buf; + if (!arc_is_encrypted(dnbuf)) return (0); - } + + mutex_enter(&dndb->db_mtx); + + /* + * Since dnode buffer is modified by sync process, there can be only + * one copy of it. It means we can not modify (decrypt) it while it + * is being written. I don't see how this may happen now, since + * encrypted dnode writes by receive should be completed before any + * plain-text reads due to txg wait, but better be safe than sorry. 
+ */ + while (1) { + if (!arc_is_encrypted(dnbuf)) { + mutex_exit(&dndb->db_mtx); + return (0); + } + dbuf_dirty_record_t *dr = dndb->db_data_pending; + if (dr == NULL || dr->dt.dl.dr_data != dnbuf) + break; + cv_wait(&dndb->db_changed, &dndb->db_mtx); + }; SET_BOOKMARK(&zb, dmu_objset_id(os), - DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid); - err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE); + DMU_META_DNODE_OBJECT, 0, dndb->db_blkid); + err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE); /* * An error code of EACCES tells us that the key is still not @@ -1547,7 +1556,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)))) err = 0; - DB_DNODE_EXIT(db); + mutex_exit(&dndb->db_mtx); return (err); } @@ -1573,7 +1582,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, RW_LOCK_HELD(&db->db_parent->db_rwlock)); if (db->db_blkid == DMU_BONUS_BLKID) { - err = dbuf_read_bonus(db, dn, flags); + err = dbuf_read_bonus(db, dn); goto early_unlock; } @@ -1635,10 +1644,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, goto early_unlock; } - err = dbuf_read_verify_dnode_crypt(db, flags); - if (err != 0) - goto early_unlock; - db->db_state = DB_READ; DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); @@ -1754,19 +1759,23 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) int dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) { - int err = 0; - boolean_t prefetch; dnode_t *dn; + boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch; + int err; - /* - * We don't have to hold the mutex to check db_state because it - * can't be freed while we have a hold on the buffer. - */ ASSERT(!zfs_refcount_is_zero(&db->db_holds)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); + /* + * Ensure that this block's dnode has been decrypted if the caller + * has requested decrypted data. 
+ */ + err = dbuf_read_verify_dnode_crypt(db, dn, flags); + if (err != 0) + goto done; + prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0; @@ -1775,13 +1784,38 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) db->db_partial_read = B_TRUE; else if (!(flags & DB_RF_PARTIAL_MORE)) db->db_partial_read = B_FALSE; - if (db->db_state == DB_CACHED) { - /* - * Ensure that this block's dnode has been decrypted if - * the caller has requested decrypted data. - */ - err = dbuf_read_verify_dnode_crypt(db, flags); + miss = (db->db_state != DB_CACHED); + if (db->db_state == DB_READ || db->db_state == DB_FILL) { + /* + * Another reader came in while the dbuf was in flight between + * UNCACHED and CACHED. Either a writer will finish filling + * the buffer, sending the dbuf to CACHED, or the first reader's + * request will reach the read_done callback and send the dbuf + * to CACHED. Otherwise, a failure occurred and the dbuf will + * be sent to UNCACHED. + */ + if (flags & DB_RF_NEVERWAIT) { + mutex_exit(&db->db_mtx); + DB_DNODE_EXIT(db); + goto done; + } + do { + ASSERT(db->db_state == DB_READ || + (flags & DB_RF_HAVESTRUCT) == 0); + DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db, + zio_t *, pio); + cv_wait(&db->db_changed, &db->db_mtx); + } while (db->db_state == DB_READ || db->db_state == DB_FILL); + if (db->db_state == DB_UNCACHED) { + err = SET_ERROR(EIO); + mutex_exit(&db->db_mtx); + DB_DNODE_EXIT(db); + goto done; + } + } + + if (db->db_state == DB_CACHED) { /* * If the arc buf is compressed or encrypted and the caller * requested uncompressed data, we need to untransform it @@ -1789,8 +1823,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) * unauthenticated blocks, which will verify their MAC if * the key is now available. 
*/ - if (err == 0 && db->db_buf != NULL && - (flags & DB_RF_NO_DECRYPT) == 0 && + if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL && (arc_is_encrypted(db->db_buf) || arc_is_unauthenticated(db->db_buf) || arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { @@ -1804,17 +1837,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); - if (err == 0 && prefetch) { - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - B_FALSE, flags & DB_RF_HAVESTRUCT); - } - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_hits); - } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) { - boolean_t need_wait = B_FALSE; - + } else { + ASSERT(db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); - if (pio == NULL && (db->db_state == DB_NOFILL || (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) { spa_t *spa = dn->dn_objset->os_spa; @@ -1822,65 +1848,33 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) need_wait = B_TRUE; } err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG); - /* - * dbuf_read_impl has dropped db_mtx and our parent's rwlock - * for us - */ - if (!err && prefetch) { - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - db->db_state != DB_CACHED, - flags & DB_RF_HAVESTRUCT); - } - - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); - - /* - * If we created a zio_root we must execute it to avoid - * leaking it, even if it isn't attached to any work due - * to an error in dbuf_read_impl(). - */ - if (need_wait) { - if (err == 0) - err = zio_wait(pio); - else - (void) zio_wait(pio); - pio = NULL; - } - } else { - /* - * Another reader came in while the dbuf was in flight - * between UNCACHED and CACHED. Either a writer will finish - * writing the buffer (sending the dbuf to CACHED) or the - * first reader's request will reach the read_done callback - * and send the dbuf to CACHED. 
Otherwise, a failure - * occurred and the dbuf went to UNCACHED. - */ - mutex_exit(&db->db_mtx); - if (prefetch) { - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - B_TRUE, flags & DB_RF_HAVESTRUCT); - } - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); - - /* Skip the wait per the caller's request. */ - if ((flags & DB_RF_NEVERWAIT) == 0) { - mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || - db->db_state == DB_FILL) { - ASSERT(db->db_state == DB_READ || - (flags & DB_RF_HAVESTRUCT) == 0); - DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, - db, zio_t *, pio); - cv_wait(&db->db_changed, &db->db_mtx); - } - if (db->db_state == DB_UNCACHED) - err = SET_ERROR(EIO); - mutex_exit(&db->db_mtx); - } + /* dbuf_read_impl drops db_mtx and parent's rwlock. */ + miss = (db->db_state != DB_CACHED); } + if (err == 0 && prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss, + flags & DB_RF_HAVESTRUCT); + } + DB_DNODE_EXIT(db); + + /* + * If we created a zio we must execute it to avoid leaking it, even if + * it isn't attached to any work due to an error in dbuf_read_impl(). + */ + if (need_wait) { + if (err == 0) + err = zio_wait(pio); + else + (void) zio_wait(pio); + pio = NULL; + } + +done: + if (miss) + DBUF_STAT_BUMP(hash_misses); + else + DBUF_STAT_BUMP(hash_hits); if (pio && err != 0) { zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, ZIO_FLAG_CANFAIL); From 87d81d1d13e0ef848d2d533a4f12f5de41026e73 Mon Sep 17 00:00:00 2001 From: Todd <18294602+seidelma@users.noreply.github.com> Date: Mon, 22 Apr 2024 17:55:41 -0700 Subject: [PATCH 034/113] zfs-kmod: fix empty rpm requires/conflicts Fix an error in zfs-kmod.spec that causes kmod-zfs packages not to include the correct RPM requires/conflicts relationships. With this change applied, RPM correctly no longer allows kmod-zfs & zfs-dkms packages to be installed together. 
Reviewed-by: Brian Behlendorf Signed-off-by: Todd Seidelmann <18294602+seidelma@users.noreply.github.com> Closes #16121 --- rpm/redhat/zfs-kmod.spec.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in index 9c836786ba..876c198c64 100644 --- a/rpm/redhat/zfs-kmod.spec.in +++ b/rpm/redhat/zfs-kmod.spec.in @@ -17,7 +17,7 @@ BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) # by generating a preamble text file which kmodtool can append to the spec file. %(/bin/echo -e "\ Requires: @PACKAGE@ = %{version}\n\ -Conflicts: @PACKAGE@-dkms) +Conflicts: @PACKAGE@-dkms" > %{_sourcedir}/kmod-preamble) # LDFLAGS are not sanitized by arch/*/Makefile for these architectures. %ifarch ppc ppc64 ppc64le aarch64 From 1f940de07224c2068e7c721222b1f3a519820ca9 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 23 Apr 2024 12:06:00 -0400 Subject: [PATCH 035/113] L2ARC: Cleanup buffer re-compression When compressed ARC is disabled, we may have to re-compress when writing into L2ARC. If doing so we can't fit it into the original physical size, we should just fail immediately, since even if it may still fit into allocation size, its checksum will never match. While there, refactor the code similar to other compression places without using abd_return_buf_copy(). Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #16038 --- module/zfs/arc.c | 59 ++++++++++++++++-------------------------------- 1 file changed, 20 insertions(+), 39 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 6954051b1d..51039af9bc 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8902,7 +8902,6 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, abd_t **abd_out) { int ret; - void *tmp = NULL; abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; enum zio_compress compress = HDR_GET_COMPRESS(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); @@ -8923,12 +8922,11 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, * and copy the data. This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. */ - if (HDR_HAS_RABD(hdr) && asize != psize) { - ASSERT3U(asize, >=, psize); + if (HDR_HAS_RABD(hdr)) { + ASSERT3U(asize, >, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); - if (psize != asize) - abd_zero_off(to_write, psize, asize - psize); + abd_zero_off(to_write, psize, asize - psize); goto out; } @@ -8937,48 +8935,31 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, ASSERT3U(size, ==, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); - if (size != asize) + if (asize > size) abd_zero_off(to_write, size, asize - size); goto out; } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { - /* - * In some cases, we can wind up with size > asize, so - * we need to opt for the larger allocation option here. - * - * (We also need abd_return_buf_copy in all cases because - * it's an ASSERT() to modify the buffer before returning it - * with arc_return_buf(), and all the compressors - * write things before deciding to fail compression in nearly - * every case.) 
- */ - uint64_t bufsize = MAX(size, asize); - cabd = abd_alloc_for_io(bufsize, ismd); - tmp = abd_borrow_buf(cabd, bufsize); - - psize = zio_compress_data(compress, to_write, &tmp, size, - hdr->b_complevel); - - if (psize >= asize) { - psize = HDR_GET_PSIZE(hdr); - abd_return_buf_copy(cabd, tmp, bufsize); - HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); - to_write = cabd; - abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); - if (psize != asize) - abd_zero_off(to_write, psize, asize - psize); - goto encrypt; + size_t bufsize = MAX(size, asize); + void *buf = zio_buf_alloc(bufsize); + uint64_t csize = zio_compress_data(compress, to_write, &buf, + size, hdr->b_complevel); + if (csize > psize) { + /* + * We can't re-compress the block into the original + * psize. Even if it fits into asize, it does not + * matter, since checksum will never match on read. + */ + zio_buf_free(buf, bufsize); + return (SET_ERROR(EIO)); } - ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); - if (psize < asize) - memset((char *)tmp + psize, 0, bufsize - psize); - psize = HDR_GET_PSIZE(hdr); - abd_return_buf_copy(cabd, tmp, bufsize); - to_write = cabd; + if (asize > csize) + memset((char *)buf + csize, 0, asize - csize); + to_write = cabd = abd_get_from_buf(buf, bufsize); + abd_take_ownership_of_buf(cabd, B_TRUE); } -encrypt: if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); From 67d13998b3e055232a07311c2dc609571eaf1df1 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 24 Apr 2024 17:38:48 -0400 Subject: [PATCH 036/113] Make more taskq parameters writable There is no reason for these module parameters to be read-only. Being modified they just apply on next pool import/creation, that is useful for testing different values. Reviewed-by: Rich Ercolani Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #16118 --- man/man4/zfs.4 | 9 +++++++-- module/zfs/spa.c | 8 ++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 6088ebc7ef..22e1106bbf 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2327,8 +2327,8 @@ Prioritize requeued I/O. . .It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint Percentage of online CPUs which will run a worker thread for I/O. -These workers are responsible for I/O work such as compression and -checksum calculations. +These workers are responsible for I/O work such as compression, encryption, +checksum and parity calculations. Fractional number of CPUs will be rounded down. .Pp The default value of @@ -2336,6 +2336,7 @@ The default value of was chosen to avoid using all CPUs which can result in latency issues and inconsistent application performance, especially when slower compression and/or checksumming is enabled. +Set value only applies to pools imported/created after that. . .It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint Number of worker threads per taskq. @@ -2345,6 +2346,7 @@ while higher reduces lock contention. If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. +Set value only applies to pools imported/created after that. . .It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 0 Pq uint Determines the number of CPUs to run write issue taskqs. @@ -2353,16 +2355,19 @@ When 0 (the default), the value to use is computed internally as the number of actual CPUs in the system divided by the .Sy spa_num_allocators value. +Set value only applies to pools imported/created after that. . .It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp Set the queue and thread configuration for the IO read queues. This is an advanced debugging parameter. Don't change this unless you understand what it does. +Set values only apply to pools imported/created after that. . 
.It Sy zio_taskq_write Ns = Ns Sy sync fixed,1,5 scale fixed,1,5 Pq charp Set the queue and thread configuration for the IO write queues. This is an advanced debugging parameter. Don't change this unless you understand what it does. +Set values only apply to pools imported/created after that. . .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint Do not create zvol device nodes. diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 96daf51b69..879147b097 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -10811,10 +10811,10 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, "Print vdev tree to zfs_dbgmsg during pool import"); -ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW, "Percentage of CPUs to run an IO worker thread"); -ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW, "Number of threads per IO worker taskqueue"); /* BEGIN CSTYLED */ @@ -10845,10 +10845,10 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, #ifdef _KERNEL ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, - spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD, + spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW, "Configure IO queues for read IO"); ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, - spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD, + spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW, "Configure IO queues for write IO"); #endif /* END CSTYLED */ From 5044c4e3ff0558b726b491a9267fc3db6f855a2d Mon Sep 17 00:00:00 2001 From: Allan Jude Date: Wed, 24 Apr 2024 17:51:21 -0400 Subject: [PATCH 037/113] Fast Dedup: ZAP Shrinking This allows ZAPs to shrink. When there are two empty sibling leafs, one of them is collapsed and its storage space is reused. 
This improved performance on directories that at one time contained a large number of files, but many or all of those files have since been deleted. This also applies to all other types of ZAPs as well. Sponsored-by: iXsystems, Inc. Sponsored-by: Klara, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Alexander Stetsenko Closes #15888 --- man/man4/zfs.4 | 7 +- module/zfs/zap.c | 336 +++++++++++++++++- tests/runfiles/common.run | 4 + tests/zfs-tests/tests/Makefile.am | 3 + .../tests/functional/zap_shrink/cleanup.ksh | 34 ++ .../tests/functional/zap_shrink/setup.ksh | 35 ++ .../zap_shrink/zap_shrink_001_pos.ksh | 81 +++++ 7 files changed, 488 insertions(+), 12 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/zap_shrink/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 22e1106bbf..ef0385d42b 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -16,7 +16,7 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd January 9, 2024 +.Dd February 14, 2024 .Dt ZFS 4 .Os . @@ -564,9 +564,8 @@ However, this is limited by Maximum micro ZAP size. A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size. . -.It Sy zfetch_hole_shift Ns = Ns Sy 2 Pq uint -Log2 fraction of holes in speculative prefetch stream allowed for it to -proceed. +.It Sy zap_shrink_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +If set, adjacent empty ZAP blocks will be collapsed, reducing disk space. . .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Min bytes to prefetch per stream. diff --git a/module/zfs/zap.c b/module/zfs/zap.c index da86defb44..1b6b16fc66 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -22,6 +22,8 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
* Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2023 Alexander Stetsenko + * Copyright (c) 2023, Klara Inc. */ /* @@ -41,6 +43,7 @@ #include #include +#include #include #include #include @@ -78,9 +81,16 @@ */ static int zap_iterate_prefetch = B_TRUE; +/* + * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be + * collapsed into a single block. + */ +int zap_shrink_enabled = B_TRUE; + int fzap_default_block_shift = 14; /* 16k blocksize */ static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); +static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx); void fzap_byteswap(void *vbuf, size_t size) @@ -586,6 +596,72 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) } } +static int +zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk, + dmu_tx_t *tx) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + int epb = bs >> 3; /* entries per block */ + int err = 0; + + ASSERT(tx != NULL); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + /* + * Check for i/o errors + */ + for (int i = 0; i < nptrs; i += epb) { + uint64_t blk; + err = zap_idx_to_blk(zap, idx + i, &blk); + if (err != 0) { + return (err); + } + } + + for (int i = 0; i < nptrs; i++) { + err = zap_set_idx_to_blk(zap, idx + i, blk, tx); + ASSERT0(err); /* we checked for i/o errors above */ + if (err != 0) + break; + } + + return (err); +} + +#define ZAP_PREFIX_HASH(pref, pref_len) ((pref) << (64 - (pref_len))) + +/* + * Each leaf has single range of entries (block pointers) in the ZAP ptrtbl. + * If two leaves are siblings, their ranges are adjacent and contain the same + * number of entries. In order to find out if a leaf has a sibling, we need to + * check the range corresponding to the sibling leaf. There is no need to check + * all entries in the range, we only need to check the first and the last one.
+ */ +static uint64_t +check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len) +{ + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len); + uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len; + uint64_t nptrs = (1 << pref_diff); + uint64_t first; + uint64_t last; + + ASSERT3U(idx+nptrs, <=, (1UL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); + + if (zap_idx_to_blk(zap, idx, &first) != 0) + return (0); + + if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0) + return (0); + + if (first != last) + return (0); + return (first); +} + static int zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { @@ -958,6 +1034,10 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx) if (err == 0) { zap_entry_remove(&zeh); zap_increment_num_entries(zn->zn_zap, -1, tx); + + if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 && + zap_shrink_enabled) + return (zap_shrink(zn, l, tx)); } zap_put_leaf(l); return (err); @@ -1222,13 +1302,19 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) ZIO_PRIORITY_ASYNC_READ); } - if (zc->zc_leaf && - (ZAP_HASH_IDX(zc->zc_hash, - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { + if (zc->zc_leaf) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; + + /* + * The leaf was either shrunk or split. 
+ */ + if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) || + (ZAP_HASH_IDX(zc->zc_hash, + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + } } again: @@ -1237,8 +1323,6 @@ again: &zc->zc_leaf); if (err != 0) return (err); - } else { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); } l = zc->zc_leaf; @@ -1367,6 +1451,242 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) } } +/* + * Find last allocated block and update freeblk. + */ +static void +zap_trunc(zap_t *zap) +{ + uint64_t nentries; + uint64_t lastblk; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) { + /* External ptrtbl */ + nentries = (1 << zap_f_phys(zap)->zap_ptrtbl.zt_shift); + lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk + + zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1; + } else { + /* Embedded ptrtbl */ + nentries = (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + lastblk = 0; + } + + for (uint64_t idx = 0; idx < nentries; idx++) { + uint64_t blk; + if (zap_idx_to_blk(zap, idx, &blk) != 0) + return; + if (blk > lastblk) + lastblk = blk; + } + + ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk); + + zap_f_phys(zap)->zap_freeblk = lastblk + 1; +} + +/* + * ZAP shrinking algorithm. + * + * We shrink ZAP recursively removing empty leaves. We can remove an empty leaf + * only if it has a sibling. Sibling leaves have the same prefix length and + * their prefixes differ only by the least significant (sibling) bit. We require + * both siblings to be empty. This eliminates a need to rehash the non-empty + * remaining leaf. When we have removed one of two empty sibling, we set ptrtbl + * entries of the removed leaf to point out to the remaining leaf. Prefix length + * of the remaining leaf is decremented. As a result, it has a new prefix and it + * might have a new sibling. So, we repeat the process. + * + * Steps: + * 1.
Check if a sibling leaf (sl) exists and it is empty. + * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1. + * 3. Release the sibling (sl) to dereference it again with WRITER lock. + * 4. Upgrade zapdir lock to WRITER (once). + * 5. Dereference released leaves again. + * 6. If it is needed, recheck whether both leaves are still siblings and empty. + * 7. Set ptrtbl pointers of the removed leaf (slbit 1) to point out to blkid of + * the remaining leaf (slbit 0). + * 8. Free disk block of the removed leaf (dmu_free_range). + * 9. Decrement prefix_len of the remaining leaf. + * 10. Repeat the steps. + */ +static int +zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) +{ + zap_t *zap = zn->zn_zap; + int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + uint64_t hash = zn->zn_hash; + uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; + uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + boolean_t trunc = B_FALSE; + int err = 0; + + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0); + ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix); + + boolean_t writer = B_FALSE; + + /* + * To avoid deadlock always deref leaves in the same order - + * sibling 0 first, then sibling 1. + */ + while (prefix_len) { + zap_leaf_t *sl; + int64_t prefix_diff = zt_shift - prefix_len; + uint64_t sl_prefix = prefix ^ 1; + uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len); + int slbit = prefix & 1; + + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0); + + /* + * Check if there is a sibling by reading ptrtbl ptrs. + */ + if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0) + break; + + /* + * sibling 1, unlock it - we haven't yet dereferenced sibling 0. + */ + if (slbit == 1) { + zap_put_leaf(l); + l = NULL; + } + + /* + * Dereference sibling leaf and check if it is empty.
+ */ + if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER, + &sl)) != 0) + break; + + ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix); + + /* + * Check if we have a sibling and it is empty. + */ + if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len || + zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) { + zap_put_leaf(sl); + break; + } + + zap_put_leaf(sl); + + /* + * If there are two empty siblings, we have work to do, so + * we need to lock ZAP ptrtbl as WRITER. + */ + if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) { + /* We failed to upgrade */ + if (l != NULL) { + zap_put_leaf(l); + l = NULL; + } + + /* + * Usually, the right way to upgrade from a READER lock + * to a WRITER lock is to call zap_unlockdir() and + * zap_lockdir(), but we do not have a tag. Instead, + * we do it in more sophisticated way. + */ + rw_exit(&zap->zap_rwlock); + rw_enter(&zap->zap_rwlock, RW_WRITER); + dmu_buf_will_dirty(zap->zap_dbuf, tx); + + zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + writer = B_TRUE; + } + + /* + * Here we have WRITER lock for ptrtbl. + * Now, we need a WRITER lock for both sibling leaves. + * Also, we have to recheck if the leaves are still siblings + * and still empty. + */ + if (l == NULL) { + /* sibling 0 */ + if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash), + tx, RW_WRITER, &l)) != 0) + break; + + /* + * The leaf isn't empty anymore or + * it was shrunk/split while our locks were down. + */ + if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 || + zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len) + break; + } + + /* sibling 1 */ + if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx, + RW_WRITER, &sl)) != 0) + break; + + /* + * The leaf isn't empty anymore or + * it was shrunk/split while our locks were down.
+ */ + if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 || + zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) { + zap_put_leaf(sl); + break; + } + + /* If we have gotten here, we have a leaf to collapse */ + uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff; + uint64_t nptrs = (1ULL << prefix_diff); + uint64_t sl_blkid = sl->l_blkid; + + /* + * Set ptrtbl entries to point out to the sibling 0 blkid + */ + if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid, + tx)) != 0) { + zap_put_leaf(sl); + break; + } + + /* + * Free sibling 1 disk block. + */ + int bs = FZAP_BLOCK_SHIFT(zap); + if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1) + trunc = B_TRUE; + + (void) dmu_free_range(zap->zap_objset, zap->zap_object, + sl_blkid << bs, 1 << bs, tx); + zap_put_leaf(sl); + + zap_f_phys(zap)->zap_num_leafs--; + + /* + * Update prefix and prefix_len. + */ + zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1; + zap_leaf_phys(l)->l_hdr.lh_prefix_len--; + + prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; + prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + } + + if (trunc) + zap_trunc(zap); + + if (l != NULL) + zap_put_leaf(l); + + return (err); +} + /* CSTYLED */ ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW, "When iterating ZAP object, prefetch it"); + +/* CSTYLED */ +ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW, + "Enable ZAP shrinking"); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 0586d991b8..5e7fdf359a 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -643,6 +643,10 @@ tags = ['functional', 'compression'] tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress'] tags = ['functional', 'cp_files'] +[tests/functional/zap_shrink] +tests = ['zap_shrink_001_pos'] +tags = ['functional', 'zap_shrink'] + [tests/functional/crtime] tests = ['crtime_001_pos' ] tags = ['functional', 'crtime'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index
dc447e0422..a6fe030d41 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -2074,6 +2074,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/xattr/xattr_012_pos.ksh \ functional/xattr/xattr_013_pos.ksh \ functional/xattr/xattr_compat.ksh \ + functional/zap_shrink/cleanup.ksh \ + functional/zap_shrink/zap_shrink_001_pos.ksh \ + functional/zap_shrink/setup.ksh \ functional/zpool_influxdb/cleanup.ksh \ functional/zpool_influxdb/setup.ksh \ functional/zpool_influxdb/zpool_influxdb.ksh \ diff --git a/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh b/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh new file mode 100755 index 0000000000..42fe70042d --- /dev/null +++ b/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh @@ -0,0 +1,34 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. 
$STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh b/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh new file mode 100755 index 0000000000..b756d4e76c --- /dev/null +++ b/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh @@ -0,0 +1,35 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +DISK=${DISKS%% *} +default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh b/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh new file mode 100755 index 0000000000..4dbf579b8a --- /dev/null +++ b/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh @@ -0,0 +1,81 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2024, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Create a large number of files in a directory. Then remove all files and +# check that the directory zap was shrunk. Use zdb to check that the zap object +# contains only one leaf block using zdb. +# + +verify_runnable "global" + +DIR=largedir + +NR_FILES=100000 +BATCH=1000 +CWD=$PWD + +log_assert "Create a large number of files ($NR_FILES) in a directory. " \ + "Make sure that the directory ZAP object was shrunk." + +log_must mkdir $TESTDIR/$DIR + +cd $TESTDIR/$DIR +# In order to prevent arguments overflowing, create NR_FILES in BATCH at once. 
+for i in $(seq $(($NR_FILES/$BATCH))); do + touch $(seq $((($i-1)*$BATCH+1)) $(($i*$BATCH))); +done +cd $CWD + +log_must test $NR_FILES -eq $(ls -U $TESTDIR/$DIR | wc -l) + +# remove all files in $DIR directory +cd $TESTDIR/$DIR +for i in $(seq $(($NR_FILES/$BATCH))); do + rm $(seq $((($i-1)*$BATCH+1)) $(($i*$BATCH))) +done +cd $CWD +sync_pool $TESTPOOL + +log_must test 0 -eq $(ls -U $TESTDIR/$DIR | wc -l) + +# check whether zap_shrink works +zapobj=$(zdb -v -O $TESTPOOL/$TESTFS $DIR) +nleafs=$(echo "$zapobj" | grep "Leaf blocks:" | awk -F\: '{print($2);}') +log_must test 1 -eq $nleafs + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL + +# check whether zap_shrink works +zapobj=$(zdb -v -O $TESTPOOL/$TESTFS $DIR) +nleafs=$(echo "$zapobj" | grep "Leaf blocks:" | awk -F\: '{print($2);}') +log_must test 1 -eq $nleafs + +log_pass From 317b31eedb2b729985a48d5b98a3a5d34895eeb2 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 25 Apr 2024 13:40:09 -0700 Subject: [PATCH 038/113] Python 3.12 deprecated python3-distutils As for python-3.12 the distutils package has been deprecated. The latest ax_python_devel.m4 macro from the autoconf archive has been updated accordingly so let's pull in the new version. We can also drop the changes made to our customized version to continue if the development version is not installed since this functionality has been included upstream. 
Reviewed-by: Rich Ercolani Signed-off-by: Brian Behlendorf Closes #16126 Closes #16129 --- config/always-pyzfs.m4 | 9 +- config/ax_python_devel.m4 | 341 +++++++++++++++++++++++++------------- contrib/debian/control | 2 +- 3 files changed, 235 insertions(+), 117 deletions(-) diff --git a/config/always-pyzfs.m4 b/config/always-pyzfs.m4 index 9b123b1b2d..98c1cc2302 100644 --- a/config/always-pyzfs.m4 +++ b/config/always-pyzfs.m4 @@ -80,10 +80,11 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ [AC_MSG_ERROR("Python $PYTHON_VERSION unknown")] ) - AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [ - AS_IF([test "x$enable_pyzfs" = xyes], [ - AC_MSG_ERROR("Python $PYTHON_REQUIRED_VERSION development library is not installed") - ], [test "x$enable_pyzfs" != xno], [ + AS_IF([test "x$enable_pyzfs" = xyes], [ + AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION]) + ], [ + AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [true]) + AS_IF([test "x$ax_python_devel_found" = xno], [ enable_pyzfs=no ]) ]) diff --git a/config/ax_python_devel.m4 b/config/ax_python_devel.m4 index f6d4b01444..1f480db6d2 100644 --- a/config/ax_python_devel.m4 +++ b/config/ax_python_devel.m4 @@ -4,18 +4,13 @@ # # SYNOPSIS # -# AX_PYTHON_DEVEL([version], [action-if-not-found]) +# AX_PYTHON_DEVEL([version[,optional]]) # # DESCRIPTION # # Note: Defines as a precious variable "PYTHON_VERSION". Don't override it # in your configure.ac. # -# Note: this is a slightly modified version of the original AX_PYTHON_DEVEL -# macro which accepts an additional [action-if-not-found] argument. This -# allow to detect if Python development is available without aborting the -# configure phase with an hard error in case it is not. -# # This macro checks for Python and tries to get the include path to # 'Python.h'. It provides the $(PYTHON_CPPFLAGS) and $(PYTHON_LIBS) output # variables. It also exports $(PYTHON_EXTRA_LIBS) and @@ -28,6 +23,11 @@ # version number. 
Don't use "PYTHON_VERSION" for this: that environment # variable is declared as precious and thus reserved for the end-user. # +# By default this will fail if it does not detect a development version of +# python. If you want it to continue, set optional to true, like +# AX_PYTHON_DEVEL([], [true]). The ax_python_devel_found variable will be +# "no" if it fails. +# # This macro should work for all versions of Python >= 2.1.0. As an end # user, you can disable the check for the python version by setting the # PYTHON_NOVERSIONCHECK environment variable to something else than the @@ -45,7 +45,6 @@ # Copyright (c) 2009 Matteo Settenvini # Copyright (c) 2009 Horst Knorr # Copyright (c) 2013 Daniel Mullner -# Copyright (c) 2018 loli10K # # This program is free software: you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the @@ -73,10 +72,18 @@ # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. -#serial 21 +#serial 36 AU_ALIAS([AC_PYTHON_DEVEL], [AX_PYTHON_DEVEL]) AC_DEFUN([AX_PYTHON_DEVEL],[ + # Get whether it's optional + if test -z "$2"; then + ax_python_devel_optional=false + else + ax_python_devel_optional=$2 + fi + ax_python_devel_found=yes + # # Allow the use of a (user set) custom python version # @@ -87,23 +94,26 @@ AC_DEFUN([AX_PYTHON_DEVEL],[ AC_PATH_PROG([PYTHON],[python[$PYTHON_VERSION]]) if test -z "$PYTHON"; then - m4_ifvaln([$2],[$2],[ - AC_MSG_ERROR([Cannot find python$PYTHON_VERSION in your system path]) - PYTHON_VERSION="" - ]) + AC_MSG_WARN([Cannot find python$PYTHON_VERSION in your system path]) + if ! 
$ax_python_devel_optional; then + AC_MSG_ERROR([Giving up, python development not available]) + fi + ax_python_devel_found=no + PYTHON_VERSION="" fi - # - # Check for a version of Python >= 2.1.0 - # - AC_MSG_CHECKING([for a version of Python >= '2.1.0']) - ac_supports_python_ver=`$PYTHON -c "import sys; \ + if test $ax_python_devel_found = yes; then + # + # Check for a version of Python >= 2.1.0 + # + AC_MSG_CHECKING([for a version of Python >= '2.1.0']) + ac_supports_python_ver=`$PYTHON -c "import sys; \ ver = sys.version.split ()[[0]]; \ print (ver >= '2.1.0')"` - if test "$ac_supports_python_ver" != "True"; then + if test "$ac_supports_python_ver" != "True"; then if test -z "$PYTHON_NOVERSIONCHECK"; then AC_MSG_RESULT([no]) - AC_MSG_FAILURE([ + AC_MSG_WARN([ This version of the AC@&t@_PYTHON_DEVEL macro doesn't work properly with versions of Python before 2.1.0. You may need to re-run configure, setting the @@ -112,20 +122,27 @@ PYTHON_EXTRA_LIBS and PYTHON_EXTRA_LDFLAGS by hand. Moreover, to disable this check, set PYTHON_NOVERSIONCHECK to something else than an empty string. ]) + if ! $ax_python_devel_optional; then + AC_MSG_FAILURE([Giving up]) + fi + ax_python_devel_found=no + PYTHON_VERSION="" else AC_MSG_RESULT([skip at user request]) fi - else + else AC_MSG_RESULT([yes]) + fi fi - # - # If the macro parameter ``version'' is set, honour it. - # A Python shim class, VPy, is used to implement correct version comparisons via - # string expressions, since e.g. a naive textual ">= 2.7.3" won't work for - # Python 2.7.10 (the ".1" being evaluated as less than ".3"). - # - if test -n "$1"; then + if test $ax_python_devel_found = yes; then + # + # If the macro parameter ``version'' is set, honour it. + # A Python shim class, VPy, is used to implement correct version comparisons via + # string expressions, since e.g. a naive textual ">= 2.7.3" won't work for + # Python 2.7.10 (the ".1" being evaluated as less than ".3"). 
+ # + if test -n "$1"; then AC_MSG_CHECKING([for a version of Python $1]) cat << EOF > ax_python_devel_vpy.py class VPy: @@ -133,7 +150,7 @@ class VPy: return tuple(map(int, s.strip().replace("rc", ".").split("."))) def __init__(self): import sys - self.vpy = tuple(sys.version_info) + self.vpy = tuple(sys.version_info)[[:3]] def __eq__(self, s): return self.vpy == self.vtup(s) def __ne__(self, s): @@ -155,25 +172,69 @@ EOF AC_MSG_RESULT([yes]) else AC_MSG_RESULT([no]) - AC_MSG_ERROR([this package requires Python $1. + AC_MSG_WARN([this package requires Python $1. If you have it installed, but it isn't the default Python interpreter in your system path, please pass the PYTHON_VERSION variable to configure. See ``configure --help'' for reference. ]) + if ! $ax_python_devel_optional; then + AC_MSG_ERROR([Giving up]) + fi + ax_python_devel_found=no PYTHON_VERSION="" fi + fi fi - # - # Check for Python include path - # - # - AC_MSG_CHECKING([for Python include path]) - if test -z "$PYTHON_CPPFLAGS"; then - python_path=`$PYTHON -c "import sysconfig; \ - print (sysconfig.get_path('include'));"` - plat_python_path=`$PYTHON -c "import sysconfig; \ - print (sysconfig.get_path('platinclude'));"` + if test $ax_python_devel_found = yes; then + # + # Check if you have distutils, else fail + # + AC_MSG_CHECKING([for the sysconfig Python package]) + ac_sysconfig_result=`$PYTHON -c "import sysconfig" 2>&1` + if test $? -eq 0; then + AC_MSG_RESULT([yes]) + IMPORT_SYSCONFIG="import sysconfig" + else + AC_MSG_RESULT([no]) + + AC_MSG_CHECKING([for the distutils Python package]) + ac_sysconfig_result=`$PYTHON -c "from distutils import sysconfig" 2>&1` + if test $? -eq 0; then + AC_MSG_RESULT([yes]) + IMPORT_SYSCONFIG="from distutils import sysconfig" + else + AC_MSG_WARN([cannot import Python module "distutils". +Please check your Python installation. The error was: +$ac_sysconfig_result]) + if ! 
$ax_python_devel_optional; then + AC_MSG_ERROR([Giving up]) + fi + ax_python_devel_found=no + PYTHON_VERSION="" + fi + fi + fi + + if test $ax_python_devel_found = yes; then + # + # Check for Python include path + # + AC_MSG_CHECKING([for Python include path]) + if test -z "$PYTHON_CPPFLAGS"; then + if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then + # sysconfig module has different functions + python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_path ('include'));"` + plat_python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_path ('platinclude'));"` + else + # old distutils way + python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_inc ());"` + plat_python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_inc (plat_specific=1));"` + fi if test -n "${python_path}"; then if test "${plat_python_path}" != "${python_path}"; then python_path="-I$python_path -I$plat_python_path" @@ -182,15 +243,15 @@ variable to configure. See ``configure --help'' for reference. 
fi fi PYTHON_CPPFLAGS=$python_path - fi - AC_MSG_RESULT([$PYTHON_CPPFLAGS]) - AC_SUBST([PYTHON_CPPFLAGS]) + fi + AC_MSG_RESULT([$PYTHON_CPPFLAGS]) + AC_SUBST([PYTHON_CPPFLAGS]) - # - # Check for Python library path - # - AC_MSG_CHECKING([for Python library path]) - if test -z "$PYTHON_LIBS"; then + # + # Check for Python library path + # + AC_MSG_CHECKING([for Python library path]) + if test -z "$PYTHON_LIBS"; then # (makes two attempts to ensure we've got a version number # from the interpreter) ac_python_version=`cat</dev/null || \ - $PYTHON -c "import sysconfig; \ - print (sysconfig.get_path('purelib'));"` - fi - AC_MSG_RESULT([$PYTHON_SITE_PKG]) - AC_SUBST([PYTHON_SITE_PKG]) + if test $ax_python_devel_found = yes; then + AC_MSG_RESULT([$PYTHON_LIBS]) + AC_SUBST([PYTHON_LIBS]) - # - # libraries which must be linked in when embedding - # - AC_MSG_CHECKING(python extra libraries) - if test -z "$PYTHON_EXTRA_LIBS"; then - PYTHON_EXTRA_LIBS=`$PYTHON -c "import sysconfig; \ + # + # Check for site packages + # + AC_MSG_CHECKING([for Python site-packages path]) + if test -z "$PYTHON_SITE_PKG"; then + if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then + PYTHON_SITE_PKG=`$PYTHON -c " +$IMPORT_SYSCONFIG; +if hasattr(sysconfig, 'get_default_scheme'): + scheme = sysconfig.get_default_scheme() +else: + scheme = sysconfig._get_default_scheme() +if scheme == 'posix_local': + # Debian's default scheme installs to /usr/local/ but we want to find headers in /usr/ + scheme = 'posix_prefix' +prefix = '$prefix' +if prefix == 'NONE': + prefix = '$ac_default_prefix' +sitedir = sysconfig.get_path('purelib', scheme, vars={'base': prefix}) +print(sitedir)"` + else + # distutils.sysconfig way + PYTHON_SITE_PKG=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_lib(0,0));"` + fi + fi + AC_MSG_RESULT([$PYTHON_SITE_PKG]) + AC_SUBST([PYTHON_SITE_PKG]) + + # + # Check for platform-specific site packages + # + AC_MSG_CHECKING([for Python platform specific site-packages path]) + 
if test -z "$PYTHON_PLATFORM_SITE_PKG"; then + if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then + PYTHON_PLATFORM_SITE_PKG=`$PYTHON -c " +$IMPORT_SYSCONFIG; +if hasattr(sysconfig, 'get_default_scheme'): + scheme = sysconfig.get_default_scheme() +else: + scheme = sysconfig._get_default_scheme() +if scheme == 'posix_local': + # Debian's default scheme installs to /usr/local/ but we want to find headers in /usr/ + scheme = 'posix_prefix' +prefix = '$prefix' +if prefix == 'NONE': + prefix = '$ac_default_prefix' +sitedir = sysconfig.get_path('platlib', scheme, vars={'platbase': prefix}) +print(sitedir)"` + else + # distutils.sysconfig way + PYTHON_PLATFORM_SITE_PKG=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_lib(1,0));"` + fi + fi + AC_MSG_RESULT([$PYTHON_PLATFORM_SITE_PKG]) + AC_SUBST([PYTHON_PLATFORM_SITE_PKG]) + + # + # libraries which must be linked in when embedding + # + AC_MSG_CHECKING(python extra libraries) + if test -z "$PYTHON_EXTRA_LIBS"; then + PYTHON_EXTRA_LIBS=`$PYTHON -c "$IMPORT_SYSCONFIG; \ conf = sysconfig.get_config_var; \ print (conf('LIBS') + ' ' + conf('SYSLIBS'))"` - fi - AC_MSG_RESULT([$PYTHON_EXTRA_LIBS]) - AC_SUBST(PYTHON_EXTRA_LIBS) + fi + AC_MSG_RESULT([$PYTHON_EXTRA_LIBS]) + AC_SUBST(PYTHON_EXTRA_LIBS) - # - # linking flags needed when embedding - # - AC_MSG_CHECKING(python extra linking flags) - if test -z "$PYTHON_EXTRA_LDFLAGS"; then - PYTHON_EXTRA_LDFLAGS=`$PYTHON -c "import sysconfig; \ + # + # linking flags needed when embedding + # + AC_MSG_CHECKING(python extra linking flags) + if test -z "$PYTHON_EXTRA_LDFLAGS"; then + PYTHON_EXTRA_LDFLAGS=`$PYTHON -c "$IMPORT_SYSCONFIG; \ conf = sysconfig.get_config_var; \ print (conf('LINKFORSHARED'))"` - fi - AC_MSG_RESULT([$PYTHON_EXTRA_LDFLAGS]) - AC_SUBST(PYTHON_EXTRA_LDFLAGS) + # Hack for macos, it sticks this in here. 
+ PYTHON_EXTRA_LDFLAGS=`echo $PYTHON_EXTRA_LDFLAGS | sed 's/CoreFoundation.*$/CoreFoundation/'` + fi + AC_MSG_RESULT([$PYTHON_EXTRA_LDFLAGS]) + AC_SUBST(PYTHON_EXTRA_LDFLAGS) - # - # final check to see if everything compiles alright - # - AC_MSG_CHECKING([consistency of all components of python development environment]) - # save current global flags - ac_save_LIBS="$LIBS" - ac_save_LDFLAGS="$LDFLAGS" - ac_save_CPPFLAGS="$CPPFLAGS" - LIBS="$ac_save_LIBS $PYTHON_LIBS $PYTHON_EXTRA_LIBS $PYTHON_EXTRA_LIBS" - LDFLAGS="$ac_save_LDFLAGS $PYTHON_EXTRA_LDFLAGS" - CPPFLAGS="$ac_save_CPPFLAGS $PYTHON_CPPFLAGS" - AC_LANG_PUSH([C]) - AC_LINK_IFELSE([ + # + # final check to see if everything compiles alright + # + AC_MSG_CHECKING([consistency of all components of python development environment]) + # save current global flags + ac_save_LIBS="$LIBS" + ac_save_LDFLAGS="$LDFLAGS" + ac_save_CPPFLAGS="$CPPFLAGS" + LIBS="$ac_save_LIBS $PYTHON_LIBS $PYTHON_EXTRA_LIBS" + LDFLAGS="$ac_save_LDFLAGS $PYTHON_EXTRA_LDFLAGS" + CPPFLAGS="$ac_save_CPPFLAGS $PYTHON_CPPFLAGS" + AC_LANG_PUSH([C]) + AC_LINK_IFELSE([ AC_LANG_PROGRAM([[#include ]], [[Py_Initialize();]]) ],[pythonexists=yes],[pythonexists=no]) - AC_LANG_POP([C]) - # turn back to default flags - CPPFLAGS="$ac_save_CPPFLAGS" - LIBS="$ac_save_LIBS" - LDFLAGS="$ac_save_LDFLAGS" + AC_LANG_POP([C]) + # turn back to default flags + CPPFLAGS="$ac_save_CPPFLAGS" + LIBS="$ac_save_LIBS" + LDFLAGS="$ac_save_LDFLAGS" - AC_MSG_RESULT([$pythonexists]) + AC_MSG_RESULT([$pythonexists]) - if test ! "x$pythonexists" = "xyes"; then - m4_ifvaln([$2],[$2],[ - AC_MSG_FAILURE([ + if test ! "x$pythonexists" = "xyes"; then + AC_MSG_WARN([ Could not link test program to Python. Maybe the main Python library has been installed in some non-standard library path. If so, pass it to configure, via the LIBS environment variable. @@ -340,9 +453,13 @@ EOD` You probably have to install the development version of the Python package for your distribution. 
The exact name of this package varies among them. ============================================================================ - ]) - PYTHON_VERSION="" - ]) + ]) + if ! $ax_python_devel_optional; then + AC_MSG_ERROR([Giving up]) + fi + ax_python_devel_found=no + PYTHON_VERSION="" + fi fi # diff --git a/contrib/debian/control b/contrib/debian/control index 98beb900d0..e56fbf0f1c 100644 --- a/contrib/debian/control +++ b/contrib/debian/control @@ -189,7 +189,7 @@ Depends: dkms (>> 2.1.1.2-5), file, libc6-dev | libc-dev, lsb-release, - python3-distutils | libpython3-stdlib (<< 3.6.4), + python3 (>> 3.12) | python3-distutils | libpython3-stdlib (<< 3.6.4), ${misc:Depends}, ${perl:Depends} Recommends: openzfs-zfs-zed, openzfs-zfsutils (>= ${source:Version}), ${linux:Recommends} From 21bc066ece7fcf0f8250ba5dfe05fd7f507dca28 Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Thu, 25 Apr 2024 16:24:52 -0500 Subject: [PATCH 039/113] Fix updating the zvol_htable when renaming a zvol When renaming a zvol, insert it into zvol_htable using the new name, not the old name. Otherwise some operations won't work. For example, "zfs set volsize" while the zvol is open. Sponsored by: Axcient Reviewed-by: Brian Behlendorf Reviewed-by: Alek Pinchuk Signed-off-by: Alan Somers Closes #16127 Closes #16128 --- module/os/freebsd/zfs/zvol_os.c | 2 +- module/os/linux/zfs/zvol_os.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 6a7c2d2811..712ff1b837 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -1259,7 +1259,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* Move to a new hashtable entry. 
*/ - zv->zv_hash = zvol_name_hash(zv->zv_name); + zv->zv_hash = zvol_name_hash(newname); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 4b960daf89..2a036dc513 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1571,7 +1571,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); /* move to new hashtable entry */ - zv->zv_hash = zvol_name_hash(zv->zv_name); + zv->zv_hash = zvol_name_hash(newname); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); From 4840f023afae7c4932c903cf3a436c02c6704e20 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 29 Apr 2024 11:31:50 -0700 Subject: [PATCH 040/113] GCC: Fixes for gcc 14 on Fedora 40 - Workaround dangling pointer in uu_list.c (#16124) - Fix calloc() transposed arguments in zpool_vdev_os.c - Make some temp variables unsigned to prevent triggering a '-Werror=alloc-size-larger-than' error. 
Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #16124 Closes #16125 --- cmd/zpool/os/linux/zpool_vdev_os.c | 2 +- lib/libuutil/uu_list.c | 14 ++++++++++---- module/zfs/vdev_raidz.c | 5 +++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/cmd/zpool/os/linux/zpool_vdev_os.c b/cmd/zpool/os/linux/zpool_vdev_os.c index 80627b5821..f194d28c55 100644 --- a/cmd/zpool/os/linux/zpool_vdev_os.c +++ b/cmd/zpool/os/linux/zpool_vdev_os.c @@ -438,7 +438,7 @@ static char *zpool_sysfs_gets(char *path) return (NULL); } - buf = calloc(sizeof (*buf), statbuf.st_size + 1); + buf = calloc(statbuf.st_size + 1, sizeof (*buf)); if (buf == NULL) { close(fd); return (NULL); diff --git a/lib/libuutil/uu_list.c b/lib/libuutil/uu_list.c index 0ca6f05205..aa8b129cc2 100644 --- a/lib/libuutil/uu_list.c +++ b/lib/libuutil/uu_list.c @@ -505,14 +505,20 @@ uu_list_walk(uu_list_t *lp, uu_walk_fn_t *func, void *private, uint32_t flags) } if (lp->ul_debug || robust) { - uu_list_walk_t my_walk; + uu_list_walk_t *my_walk; void *e; - list_walk_init(&my_walk, lp, flags); + my_walk = uu_zalloc(sizeof (*my_walk)); + if (my_walk == NULL) + return (-1); + + list_walk_init(my_walk, lp, flags); while (status == UU_WALK_NEXT && - (e = uu_list_walk_next(&my_walk)) != NULL) + (e = uu_list_walk_next(my_walk)) != NULL) status = (*func)(e, private); - list_walk_fini(&my_walk); + list_walk_fini(my_walk); + + uu_free(my_walk); } else { if (!reverse) { for (np = lp->ul_null_node.uln_next; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index b03331ec69..de7d0fa794 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1891,8 +1891,9 @@ vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, static void vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { - int n, i, c, t, tt; - int nmissing_rows; + int i, c, t, tt; + unsigned int n; + unsigned int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int 
parity_map[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; From db499e68f9ef8d4b12ebdab699184e3acf35567c Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:32:49 -0400 Subject: [PATCH 041/113] Overflowing refreservation is bad Someone came to me and pointed out that you could pretty readily cause the refreservation calculation to exceed 2**64, given the 2**17 multiplier in it, and produce refreservations wildly less than the actual volsize in cases where it should have failed. Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #15996 --- lib/libzfs/libzfs_dataset.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 6f8773aed4..231bbbd92d 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -5565,8 +5565,21 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) /* * Scale this size down as a ratio of 128k / tsize. * See theory statement above. + * + * Bitshift is to avoid the case of nblocks * asize < tsize + * producing a size of 0. */ - volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize; + volsize = (nblocks * asize) / (tsize >> SPA_MINBLOCKSHIFT); + /* + * If we would blow UINT64_MAX with this next multiplication, + * don't. 
+ */ + if (volsize > + (UINT64_MAX / (SPA_OLD_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT))) + volsize = UINT64_MAX; + else + volsize *= (SPA_OLD_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + if (volsize > ret) { ret = volsize; } From b28461b7c6511be571ee2f7d71c0d7be12aa4630 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Tue, 30 Apr 2024 01:28:50 +0500 Subject: [PATCH 042/113] Fix arcstats for FreeBSD after zfetch support Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #16141 --- cmd/arcstat.in | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cmd/arcstat.in b/cmd/arcstat.in index 220f343b5b..c4f10a1d6d 100755 --- a/cmd/arcstat.in +++ b/cmd/arcstat.in @@ -200,6 +200,8 @@ if sys.platform.startswith('freebsd'): k = [ctl for ctl in sysctl.filter('kstat.zfs.misc.arcstats') if ctl.type != sysctl.CTLTYPE_NODE] + k += [ctl for ctl in sysctl.filter('kstat.zfs.misc.zfetchstats') + if ctl.type != sysctl.CTLTYPE_NODE] if not k: sys.exit(1) @@ -211,8 +213,12 @@ if sys.platform.startswith('freebsd'): continue name, value = s.name, s.value - # Trims 'kstat.zfs.misc.arcstats' from the name - kstat[name[24:]] = int(value) + + if "arcstats" in name: + # Trims 'kstat.zfs.misc.arcstats' from the name + kstat[name[24:]] = int(value) + else: + kstat["zfetch_" + name[27:]] = int(value) elif sys.platform.startswith('linux'): def kstat_update(): From c3f2f1aa2dccd5528336d90a6dd2f2a5c97b6352 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Mon, 29 Apr 2024 15:35:53 -0600 Subject: [PATCH 043/113] vdev probe to slow disk can stall mmp write checker Simplify vdev probes in the zio_vdev_io_done context to avoid holding the spa config lock for a long duration. Also allow zpool clear if no evidence of another host is using the pool. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Olaf Faaland Reviewed-by: Brian Behlendorf Signed-off-by: Don Brady Closes #15839 --- cmd/zpool/zpool_main.c | 2 +- include/sys/spa.h | 4 +- include/sys/uberblock_impl.h | 16 +-- include/sys/vdev_impl.h | 2 +- man/man8/zpool-clear.8 | 7 +- module/zfs/mmp.c | 5 +- module/zfs/spa.c | 102 ++++++++++++++---- module/zfs/txg.c | 9 ++ module/zfs/vdev.c | 22 ++-- module/zfs/vdev_label.c | 4 +- module/zfs/zfs_ioctl.c | 9 +- module/zfs/zio.c | 6 +- module/zfs/zio_inject.c | 6 +- tests/runfiles/linux.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../functional/mmp/mmp_write_slow_disk.ksh | 97 +++++++++++++++++ 16 files changed, 242 insertions(+), 52 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 636eb2a301..300b383af4 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -9050,7 +9050,7 @@ status_callback(zpool_handle_t *zhp, void *data) printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices" " are connected, then reboot your system and\n\timport the " - "pool.\n")); + "pool or run 'zpool clear' to resume the pool.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: diff --git a/include/sys/spa.h b/include/sys/spa.h index ca15025ba3..001c221fb4 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -770,7 +770,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 -#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_FAULT_VDEV 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 @@ -1123,6 +1123,8 @@ extern uint32_t spa_get_hostid(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern boolean_t spa_livelist_delete_check(spa_t *spa); +extern boolean_t spa_mmp_remote_host_activity(spa_t *spa); + extern 
spa_mode_t spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h index 1736b32cd3..e480a4bac0 100644 --- a/include/sys/uberblock_impl.h +++ b/include/sys/uberblock_impl.h @@ -50,20 +50,20 @@ extern "C" { #define MMP_SEQ_VALID_BIT 0x02 #define MMP_FAIL_INT_VALID_BIT 0x04 -#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \ - ubp->ub_mmp_magic == MMP_MAGIC) -#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_VALID(ubp) ((ubp)->ub_magic == UBERBLOCK_MAGIC && \ + (ubp)->ub_mmp_magic == MMP_MAGIC) +#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_INTERVAL_VALID_BIT)) -#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_SEQ_VALID_BIT)) -#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_FAIL_INT_VALID_BIT)) -#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \ +#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \ >> 8) -#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \ +#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & 0x0000FFFF00000000) \ >> 32) -#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \ +#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & 0xFFFF000000000000) \ >> 48) #define MMP_INTERVAL_SET(write) \ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 95164c4546..57ff31e89e 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -273,7 +273,7 @@ struct vdev { txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ - boolean_t vdev_probe_wanted; /* async probe wanted? 
*/ + boolean_t vdev_fault_wanted; /* async faulted wanted? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ diff --git a/man/man8/zpool-clear.8 b/man/man8/zpool-clear.8 index c61ecae483..3e448be87f 100644 --- a/man/man8/zpool-clear.8 +++ b/man/man8/zpool-clear.8 @@ -50,9 +50,10 @@ If the pool was suspended it will be brought back online provided the devices can be accessed. Pools with .Sy multihost -enabled which have been suspended cannot be resumed. -While the pool was suspended, it may have been imported on -another host, and resuming I/O could result in pool damage. +enabled which have been suspended cannot be resumed when there is evidence +that the pool was imported by another host. +The same checks performed during an import will be applied before the clear +proceeds. .Bl -tag -width Ds .It Fl -power Power on the devices's slot in the storage enclosure and wait for the device diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 66bc0ae60b..7112254275 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -664,12 +664,13 @@ mmp_thread(void *arg) (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " "mmp_last_write %llu mmp_interval %llu " - "mmp_fail_intervals %llu mmp_fail_ns %llu", + "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)mmp->mmp_last_write, (u_longlong_t)mmp_interval, (u_longlong_t)mmp_fail_intervals, - (u_longlong_t)mmp_fail_ns); + (u_longlong_t)mmp_fail_ns, + (u_longlong_t)spa->spa_uberblock.ub_txg); cmn_err(CE_WARN, "MMP writes to pool '%s' have not " "succeeded in over %llu ms; suspending pool. 
" "Hrtime %llu", diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 879147b097..147165ee85 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3594,11 +3594,16 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) } /* - * Perform the import activity check. If the user canceled the import or - * we detected activity then fail. + * Remote host activity check. + * + * error results: + * 0 - no activity detected + * EREMOTEIO - remote activity detected + * EINTR - user canceled the operation */ static int -spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) +spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, + boolean_t importing) { uint64_t txg = ub->ub_txg; uint64_t timestamp = ub->ub_timestamp; @@ -3643,19 +3648,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) import_expire = gethrtime() + import_delay; - spa_import_progress_set_notes(spa, "Checking MMP activity, waiting " - "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); + if (importing) { + spa_import_progress_set_notes(spa, "Checking MMP activity, " + "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); + } - int interations = 0; + int iterations = 0; while ((now = gethrtime()) < import_expire) { - if (interations++ % 30 == 0) { + if (importing && iterations++ % 30 == 0) { spa_import_progress_set_notes(spa, "Checking MMP " "activity, %llu ms remaining", (u_longlong_t)NSEC2MSEC(import_expire - now)); } - (void) spa_import_progress_set_mmp_check(spa_guid(spa), - NSEC2SEC(import_expire - gethrtime())); + if (importing) { + (void) spa_import_progress_set_mmp_check(spa_guid(spa), + NSEC2SEC(import_expire - gethrtime())); + } vdev_uberblock_load(rvd, ub, &mmp_label); @@ -3737,6 +3746,61 @@ out: return (error); } +/* + * Called from zfs_ioc_clear for a pool that was suspended + * after failing mmp write checks. 
+ */ +boolean_t +spa_mmp_remote_host_activity(spa_t *spa) +{ + ASSERT(spa_multihost(spa) && spa_suspended(spa)); + + nvlist_t *best_label; + uberblock_t best_ub; + + /* + * Locate the best uberblock on disk + */ + vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); + if (best_label) { + /* + * confirm that the best hostid matches our hostid + */ + if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && + spa_get_hostid(spa) != + fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { + nvlist_free(best_label); + return (B_TRUE); + } + nvlist_free(best_label); + } else { + return (B_TRUE); + } + + if (!MMP_VALID(&best_ub) || + !MMP_FAIL_INT_VALID(&best_ub) || + MMP_FAIL_INT(&best_ub) == 0) { + return (B_TRUE); + } + + if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || + best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { + zfs_dbgmsg("txg mismatch detected during pool clear " + "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", + (u_longlong_t)spa->spa_uberblock.ub_txg, + (u_longlong_t)best_ub.ub_txg, + (u_longlong_t)spa->spa_uberblock.ub_timestamp, + (u_longlong_t)best_ub.ub_timestamp); + return (B_TRUE); + } + + /* + * Perform an activity check looking for any remote writer + */ + return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, + B_FALSE) != 0); +} + static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { @@ -4063,7 +4127,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } - int error = spa_activity_check(spa, ub, spa->spa_config); + int error = + spa_activity_check(spa, ub, spa->spa_config, B_TRUE); if (error) { nvlist_free(label); return (error); @@ -8771,15 +8836,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd) } static void -spa_async_probe(spa_t *spa, vdev_t *vd) +spa_async_fault_vdev(spa_t *spa, vdev_t *vd) { - if (vd->vdev_probe_wanted) { - vd->vdev_probe_wanted = B_FALSE; - vdev_reopen(vd); /* vdev_open() does the actual probe */ + if 
(vd->vdev_fault_wanted) { + vd->vdev_fault_wanted = B_FALSE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); } for (int c = 0; c < vd->vdev_children; c++) - spa_async_probe(spa, vd->vdev_child[c]); + spa_async_fault_vdev(spa, vd->vdev_child[c]); } static void @@ -8867,11 +8933,11 @@ spa_async_thread(void *arg) } /* - * See if any devices need to be probed. + * See if any devices need to be marked faulted. */ - if (tasks & SPA_ASYNC_PROBE) { + if (tasks & SPA_ASYNC_FAULT_VDEV) { spa_vdev_state_enter(spa, SCL_NONE); - spa_async_probe(spa, spa->spa_root_vdev); + spa_async_fault_vdev(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } diff --git a/module/zfs/txg.c b/module/zfs/txg.c index a67c043446..5ce6be69be 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -550,6 +550,15 @@ txg_sync_thread(void *arg) timer = (delta > timeout ? 0 : timeout - delta); } + /* + * When we're suspended, nothing should be changing and for + * MMP we don't want to bump anything that would make it + * harder to detect if another host is changing it when + * resuming after a MMP suspend. + */ + if (spa_suspended(spa)) + continue; + /* * Wait until the quiesce thread hands off a txg to us, * prompting it to do so if necessary. diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index d97d0a8100..c5551eb6cf 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1664,6 +1664,7 @@ vdev_metaslab_fini(vdev_t *vd) typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; + boolean_t vps_zio_done_probe; int vps_flags; } vdev_probe_stats_t; @@ -1709,6 +1710,17 @@ vdev_probe_done(zio_t *zio) (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); + + /* + * If this probe was initiated from zio pipeline, then + * change the state in a spa_async_request. Probes that + * were initiated from a vdev_open can change the state + * as part of the open call. 
+ */ + if (vps->vps_zio_done_probe) { + vd->vdev_fault_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); + } } mutex_enter(&vd->vdev_probe_lock); @@ -1759,6 +1771,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; + vps->vps_zio_done_probe = (zio != NULL); if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* @@ -1785,15 +1798,6 @@ vdev_probe(vdev_t *vd, zio_t *zio) vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); - - /* - * We can't change the vdev state in this context, so we - * kick off an async task to do it on our behalf. - */ - if (zio != NULL) { - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(spa, SPA_ASYNC_PROBE); - } } if (zio != NULL) diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index c31f48028b..ed592514fd 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -2027,6 +2027,7 @@ retry: /* * If this isn't a resync due to I/O errors, * and nothing changed in this transaction group, + * and multihost protection isn't enabled, * and the vdev configuration hasn't changed, * then there's nothing to do. */ @@ -2034,7 +2035,8 @@ retry: boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, txg, spa->spa_mmp.mmp_delay); - if (!changed && list_is_empty(&spa->spa_config_dirty_list)) + if (!changed && list_is_empty(&spa->spa_config_dirty_list) && + !spa_multihost(spa)) return (0); } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 2ac1e34dcc..908b9efc18 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -5823,10 +5823,13 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * If multihost is enabled, resuming I/O is unsafe as another - * host may have imported the pool. + * host may have imported the pool. Check for remote activity. 
*/ - if (spa_multihost(spa) && spa_suspended(spa)) - return (SET_ERROR(EINVAL)); + if (spa_multihost(spa) && spa_suspended(spa) && + spa_mmp_remote_host_activity(spa)) { + spa_close(spa, FTAG); + return (SET_ERROR(EREMOTEIO)); + } spa_vdev_state_enter(spa, SCL_NONE); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 1ba99f4d46..ce967a7cdc 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2532,8 +2532,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); - cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " - "failure and has been suspended.\n", spa_name(spa)); + if (reason != ZIO_SUSPEND_MMP) { + cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable " + "I/O failure and has been suspended.\n", spa_name(spa)); + } (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 3773e400d7..012a0e3c6c 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -607,9 +607,11 @@ zio_handle_io_delay(zio_t *zio) if (vd->vdev_guid != handler->zi_record.zi_guid) continue; + /* also match on I/O type (e.g., -T read) */ if (handler->zi_record.zi_iotype != ZIO_TYPES && - handler->zi_record.zi_iotype != zio->io_type) - continue; + handler->zi_record.zi_iotype != zio->io_type) { + continue; + } /* * Defensive; should never happen as the array allocation diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index a0b74ef4a8..92ce09ec6f 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -146,7 +146,7 @@ tags = ['functional', 'mmap'] tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', - 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid'] + 'mmp_on_zdb', 
'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk'] tags = ['functional', 'mmp'] [tests/functional/mount:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index a6fe030d41..d625c040b8 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1593,6 +1593,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/mmp/mmp_on_zdb.ksh \ functional/mmp/mmp_reset_interval.ksh \ functional/mmp/mmp_write_distribution.ksh \ + functional/mmp/mmp_write_slow_disk.ksh \ functional/mmp/mmp_write_uberblocks.ksh \ functional/mmp/multihost_history.ksh \ functional/mmp/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh new file mode 100755 index 0000000000..8b118684aa --- /dev/null +++ b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh @@ -0,0 +1,97 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara Inc +# + +# DESCRIPTION: +# Verify that long VDEV probes do not cause MMP checks to suspend pool +# Note: without PR-15839 fix, this test will suspend the pool. +# +# A device that is returning unexpected errors will trigger a vdev_probe. +# When the device additionally has slow response times, the probe can hold +# the spa config lock as a writer for a long period of time such that the +# mmp uberblock updates stall when trying to acquire the spa config lock. +# +# STRATEGY: +# 1. Create a pool with multiple leaf vdevs +# 2. 
Enable multihost and multihost_history +# 3. Delay for MMP writes to occur +# 4. Verify that a long VDEV probe didn't cause MMP check to suspend pool +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/mmp/mmp.cfg +. $STF_SUITE/tests/functional/mmp/mmp.kshlib + +verify_runnable "both" + +function cleanup +{ + log_must zinject -c all + + if [[ $(zpool list -H -o health $MMP_POOL) == "SUSPENDED" ]]; then + log_must zpool clear $MMP_POOL + zpool get state $MMP_POOL $MMP_DIR/file.3 + zpool events | grep ".fs.zfs." | grep -v "history_event" + fi + + poolexists $MMP_POOL && destroy_pool $MMP_POOL + log_must rm -r $MMP_DIR + log_must mmp_clear_hostid +} + +log_assert "A long VDEV probe doesn't cause a MMP check suspend" +log_onexit cleanup + +MMP_HISTORY_URL=/proc/spl/kstat/zfs/$MMP_POOL/multihost + +# Create a multiple drive pool +log_must zpool events -c +log_must mkdir -p $MMP_DIR +log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5} +log_must zpool create -f $MMP_POOL \ + mirror $MMP_DIR/file.{0,1,2} \ + mirror $MMP_DIR/file.{3,4,5} + +# Enable MMP +log_must mmp_set_hostid $HOSTID1 +log_must zpool set multihost=on $MMP_POOL +clear_mmp_history + +# Inject vdev write error along with a delay +log_must zinject -f 33 -e io -L pad2 -T write -d $MMP_DIR/file.3 $MMP_POOL +log_must zinject -f 50 -e io -L uber -T write -d $MMP_DIR/file.3 $MMP_POOL +log_must zinject -D 2000:4 -T write -d $MMP_DIR/file.3 $MMP_POOL + +log_must dd if=/dev/urandom of=/$MMP_POOL/data bs=1M count=5 +sleep 10 +sync_pool $MMP_POOL + +# Confirm mmp writes to the non-slow disks have taken place +for x in {0,1,2,4}; do + write_count=$(grep -c file.${x} $MMP_HISTORY_URL) + [[ $write_count -gt 0 ]] || log_fail "expecting mmp writes" +done + +# Expect that the pool was not suspended +log_must check_state $MMP_POOL "" "ONLINE" +health=$(zpool list -H -o health $MMP_POOL) +log_note "$MMP_POOL health is $health" +[[ "$health" == "SUSPENDED" ]] && log_fail "$MMP_POOL $health unexpected" + 
+log_pass "A long VDEV probe doesn't cause a MMP check suspend" From a6edc0adb293caf4e8bca2948af71b192b26bf58 Mon Sep 17 00:00:00 2001 From: Rob N Date: Tue, 30 Apr 2024 08:57:32 +1000 Subject: [PATCH 044/113] zio: try to execute TYPE_NULL ZIOs on the current task Many TYPE_NULL ZIOs are used to provide a sync point for child ZIOs, and do not do any actual work themselves. However, they are still dispatched to a dedicated, single-thread taskq, which leads to their execution being entirely task switch and dequeue overhead for no actual reason. This commit changes it so that when selecting a parent ZIO to execute, if the parent is TYPE_NULL and has no done function (that is, no additional work), it is executed on the same thread. This reduces task switches and frees up CPU cores for other work. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16134 --- module/zfs/zio.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index ce967a7cdc..0e7993d87e 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -803,9 +803,10 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, /* * If we can tell the caller to execute this parent next, do - * so. We only do this if the parent's zio type matches the - * child's type. Otherwise dispatch the parent zio in its - * own taskq. + * so. We do this if the parent's zio type matches the child's + * type, or if it's a zio_null() with no done callback, and so + * has no actual work to do. Otherwise dispatch the parent zio + * in its own taskq. * * Having the caller execute the parent when possible reduces * locking on the zio taskq's, reduces context switch @@ -825,7 +826,8 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, * of writes for spa_sync(), and the chain of ZIL blocks. 
*/ if (next_to_executep != NULL && *next_to_executep == NULL && - pio->io_type == zio->io_type) { + (pio->io_type == zio->io_type || + (pio->io_type == ZIO_TYPE_NULL && !pio->io_done))) { *next_to_executep = pio; } else { zio_taskq_dispatch(pio, type, B_FALSE); From 7ac00d3c26652892e01956af29d087362ab29410 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 30 Apr 2024 12:35:30 +1000 Subject: [PATCH 045/113] find_system_library: fix var cleanup when library not found The "not found" path is attempting to clear SOMELIB_CFLAGS and SOMELIB_LIBS by resetting them in AC_SUBST(). However, the second arg to AC_SUBST is expanded in autoconf with `m4_ifvaln([$2], [[$1]=$2])`, which is defined as "if the first arg is non-empty". The m4 "empty" construction is [], therefore, the existing AC_SUBST calls never modify the variables at all. The effect of this is that leftovers from the library test can leak out. At least, if a library header is found in the first stage, but the library itself is not, -lsomelib is added to SOMELIB_LIBS and further tests done. If that library is not found, SOMELIB_LIBS will not be cleared. For most of our library tests this hasn't been a problem, as they're either always found properly via pkg-config or set directly, or the calling test immediately aborts configure. For an optional dependency however, an apparent "partial" result where the header is found but no corresponding library causes link errors later. I think a complete fix should probably not be setting SOMELIB_xxx until the final result is known, but for now, adjusting the AC_SUBST calls to explicitly set the empty shell string (which is not "empty" to m4) at least restores the intent.
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16140 --- config/find_system_library.m4 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/find_system_library.m4 b/config/find_system_library.m4 index 310b44112a..8b98bd67d2 100644 --- a/config/find_system_library.m4 +++ b/config/find_system_library.m4 @@ -90,8 +90,8 @@ AC_DEFUN([ZFS_AC_FIND_SYSTEM_LIBRARY], [ AC_DEFINE([HAVE_][$1], [1], [Define if you have [$5]]) $7 ],[dnl ELSE - AC_SUBST([$1]_CFLAGS, []) - AC_SUBST([$1]_LIBS, []) + AC_SUBST([$1]_CFLAGS, [""]) + AC_SUBST([$1]_LIBS, [""]) AC_MSG_WARN([cannot find [$5] via pkg-config or in the standard locations]) $8 ]) From 4429ad9276cea193bb29463a7d6c38367d0d78ce Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 28 Apr 2024 11:03:11 +1000 Subject: [PATCH 046/113] libzpool: set thread names Arrange for the thread/task name to be set when new threads are created. This makes them visible in the process table etc. pthread_setname_np() is generally available in glibc, musl and FreeBSD, so no test is required. 
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16140 --- include/sys/zfs_context.h | 8 ++++---- lib/libzpool/kernel.c | 5 ++++- lib/libzpool/taskq.c | 4 ++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 9ec2f73b36..8f264b50e9 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -228,9 +228,9 @@ typedef pthread_t kthread_t; #define thread_create_named(name, stk, stksize, func, arg, len, \ pp, state, pri) \ - zk_thread_create(func, arg, stksize, state) + zk_thread_create(name, func, arg, stksize, state) #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ - zk_thread_create(func, arg, stksize, state) + zk_thread_create(#func, func, arg, stksize, state) #define thread_exit() pthread_exit(NULL) #define thread_join(t) pthread_join((pthread_t)(t), NULL) @@ -246,8 +246,8 @@ extern struct proc p0; #define PS_NONE -1 -extern kthread_t *zk_thread_create(void (*func)(void *), void *arg, - size_t stksize, int state); +extern kthread_t *zk_thread_create(const char *name, void (*func)(void *), + void *arg, size_t stksize, int state); #define issig(why) (FALSE) #define ISSIG(thr, why) (FALSE) diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index ffad7fc02b..a3930ee07f 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -92,7 +92,8 @@ zk_thread_wrapper(void *arg) } kthread_t * -zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state) +zk_thread_create(const char *name, void (*func)(void *), void *arg, + size_t stksize, int state) { pthread_attr_t attr; pthread_t tid; @@ -140,6 +141,8 @@ zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state) VERIFY0(pthread_create(&tid, &attr, zk_thread_wrapper, ztw)); VERIFY0(pthread_attr_destroy(&attr)); + pthread_setname_np(tid, name); + return ((void *)(uintptr_t)tid); } diff --git 
a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index 99a181ec3c..5fb2283cf0 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -295,8 +295,8 @@ taskq_create(const char *name, int nthreads, pri_t pri, } for (t = 0; t < nthreads; t++) - VERIFY((tq->tq_threadlist[t] = thread_create(NULL, 0, - taskq_thread, tq, 0, &p0, TS_RUN, pri)) != NULL); + VERIFY((tq->tq_threadlist[t] = thread_create_named(tq->tq_name, + NULL, 0, taskq_thread, tq, 0, &p0, TS_RUN, pri)) != NULL); return (tq); } From 394800200e033f3a21dcbbf38a1e71b9d33b3b70 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 21 Apr 2024 21:43:53 +1000 Subject: [PATCH 047/113] libspl/assert: show process/task details in assert output Makes it much easier to see what thing complained. Getting thread id, program name and thread name vary wildly between Linux and FreeBSD, so those are set up in macros. pthread_getname_np() did not appear in musl until very recently, but the same info has always been available via prctl(PR_GET_NAME), so we use that instead. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16140 --- config/user.m4 | 2 +- lib/libspl/assert.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/config/user.m4 b/config/user.m4 index 87df8c7cca..3a69086a9d 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -31,7 +31,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV ZFS_AC_CONFIG_USER_ZFSEXEC - AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy]) + AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy gettid]) AC_SUBST(RM) ]) diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index 9d44740d4e..185ec65cb8 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -22,9 +22,32 @@ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ +/* + * Copyright (c) 2024, Rob Norris + */ #include +#if defined(__linux__) +#include +#include +#ifdef HAVE_GETTID +#define libspl_gettid() gettid() +#else +#include +#define libspl_gettid() ((pid_t)syscall(__NR_gettid)) +#endif +#define libspl_getprogname() (program_invocation_short_name) +#define libspl_getthreadname(buf, len) \ + prctl(PR_GET_NAME, (unsigned long)(buf), 0, 0, 0) +#elif defined(__FreeBSD__) +#include +#define libspl_gettid() pthread_getthreadid_np() +#define libspl_getprogname() getprogname() +#define libspl_getthreadname(buf, len) \ + pthread_getname_np(pthread_self(), buf, len); +#endif + static boolean_t libspl_assert_ok = B_FALSE; void @@ -39,13 +62,22 @@ libspl_assertf(const char *file, const char *func, int line, const char *format, ...) { va_list args; + char tname[64]; + + libspl_getthreadname(tname, sizeof (tname)); + + fprintf(stderr, "ASSERT at %s:%d:%s()\n", file, line, func); va_start(args, format); vfprintf(stderr, format, args); - fprintf(stderr, "\n"); - fprintf(stderr, "ASSERT at %s:%d:%s()", file, line, func); va_end(args); + fprintf(stderr, "\n" + " PID: %-8u COMM: %s\n" + " TID: %-8u NAME: %s\n", + getpid(), libspl_getprogname(), + libspl_gettid(), tname); + #if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__) if (libspl_assert_ok) { return; From dec697ad683ecfdf9833455af0568ce4ddc7c885 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 28 Apr 2024 12:49:58 +1000 Subject: [PATCH 048/113] libspl/assert: add lock around assertion output If multiple threads trip an assertion at the same moment (quite common), they can be printing at the same time, and their output gets messy. This adds a simple lock around the whole thing, to prevent a second task printing assert output before the first has finished. 
Additionally, if libspl_assert_ok is not set, abort() is called without dropping the lock, so that any other asserting tasks will be killed before starting any output, rather than only getting part-way through. This is a tradeoff; it's assumed that multiple threads asserting at the same moment are likely the same fault in different instances of a thread, and so there won't be any more useful information from the other tasks anyway. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16140 --- lib/libspl/assert.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index 185ec65cb8..d402462531 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -27,6 +27,7 @@ */ #include +#include #if defined(__linux__) #include @@ -56,11 +57,15 @@ libspl_set_assert_ok(boolean_t val) libspl_assert_ok = val; } +static pthread_mutex_t assert_lock = PTHREAD_MUTEX_INITIALIZER; + /* printf version of libspl_assert */ void libspl_assertf(const char *file, const char *func, int line, const char *format, ...) { + pthread_mutex_lock(&assert_lock); + va_list args; char tname[64]; @@ -80,6 +85,7 @@ libspl_assertf(const char *file, const char *func, int line, #if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__) if (libspl_assert_ok) { + pthread_mutex_unlock(&assert_lock); return; } #endif From 2152c405ba6ab0bc9fca482e9a0a968eb35699fb Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 27 Apr 2024 21:35:05 +1000 Subject: [PATCH 049/113] libspl/assert: dump backtrace in assert Adds a check for the backtrace() function. If available, uses it to show a stack backtrace in the assertion output. 
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16140 --- config/user-backtrace.m4 | 14 ++++++++++++++ config/user.m4 | 1 + lib/libspl/Makefile.am | 2 ++ lib/libspl/assert.c | 20 ++++++++++++++++++++ 4 files changed, 37 insertions(+) create mode 100644 config/user-backtrace.m4 diff --git a/config/user-backtrace.m4 b/config/user-backtrace.m4 new file mode 100644 index 0000000000..25706767cd --- /dev/null +++ b/config/user-backtrace.m4 @@ -0,0 +1,14 @@ +dnl +dnl backtrace(), for userspace assertions. glibc has this directly in libc. +dnl FreeBSD and (sometimes) musl have it in a separate -lexecinfo. It's assumed +dnl that this will also get the companion function backtrace_symbols(). +dnl +AC_DEFUN([ZFS_AC_CONFIG_USER_BACKTRACE], [ + AX_SAVE_FLAGS + LIBS="" + AC_SEARCH_LIBS([backtrace], [execinfo], [ + AC_DEFINE(HAVE_BACKTRACE, 1, [backtrace() is available]) + AC_SUBST([BACKTRACE_LIBS], ["$LIBS"]) + ]) + AX_RESTORE_FLAGS +]) diff --git a/config/user.m4 b/config/user.m4 index 3a69086a9d..8d11e031ba 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -26,6 +26,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_AIO_H ZFS_AC_CONFIG_USER_CLOCK_GETTIME ZFS_AC_CONFIG_USER_PAM + ZFS_AC_CONFIG_USER_BACKTRACE ZFS_AC_CONFIG_USER_RUNSTATEDIR ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am index 822bef7e7a..9f413b08c1 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -43,3 +43,5 @@ libspl_la_LIBADD = \ libspl_assert.la libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME) + +libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index d402462531..4acf687f4b 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -49,6 +49,24 @@ pthread_getname_np(pthread_self(), buf, len); #endif +#if defined(HAVE_BACKTRACE) +#include + +static inline void 
+libspl_dump_backtrace(void) +{ + void *btptrs[100]; + size_t nptrs = backtrace(btptrs, 100); + char **bt = backtrace_symbols(btptrs, nptrs); + fprintf(stderr, "Call trace:\n"); + for (size_t i = 0; i < nptrs; i++) + fprintf(stderr, " %s\n", bt[i]); + free(bt); +} +#else +#define libspl_dump_backtrace() +#endif + static boolean_t libspl_assert_ok = B_FALSE; void @@ -83,6 +101,8 @@ libspl_assertf(const char *file, const char *func, int line, getpid(), libspl_getprogname(), libspl_gettid(), tname); + libspl_dump_backtrace(); + #if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__) if (libspl_assert_ok) { pthread_mutex_unlock(&assert_lock); From 051460b8b2bb78add2b7ed5255f7656a33be903a Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 30 Apr 2024 10:37:29 +1000 Subject: [PATCH 050/113] libspl/assert: use libunwind for backtrace when available libunwind seems to do a better job of resolving a symbols than backtrace(), and is also useful on platforms that don't have backtrace() (eg musl). If it's available, use it. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16140 --- config/user-libunwind.m4 | 44 ++++++++++++++++++++++++++++++++++++++++ config/user.m4 | 1 + lib/libspl/Makefile.am | 4 ++-- lib/libspl/assert.c | 33 +++++++++++++++++++++++++++++- 4 files changed, 79 insertions(+), 3 deletions(-) create mode 100644 config/user-libunwind.m4 diff --git a/config/user-libunwind.m4 b/config/user-libunwind.m4 new file mode 100644 index 0000000000..99ba3dcf45 --- /dev/null +++ b/config/user-libunwind.m4 @@ -0,0 +1,44 @@ +dnl +dnl Checks for libunwind, which usually does a better job than backtrace() when +dnl resolving symbols in the stack backtrace. Newer versions have support for +dnl getting info about the object file the function came from, so we look for +dnl that too and use it if found. 
+dnl +AC_DEFUN([ZFS_AC_CONFIG_USER_LIBUNWIND], [ + AC_ARG_WITH([libunwind], + AS_HELP_STRING([--with-libunwind], + [use libunwind for backtraces in userspace assertions]), + [], + [with_libunwind=auto]) + + AS_IF([test "x$with_libunwind" != "xno"], [ + ZFS_AC_FIND_SYSTEM_LIBRARY(LIBUNWIND, [libunwind], [libunwind.h], [], [unwind], [], [ + dnl unw_get_elf_filename() is sometimes a macro, other + dnl times a proper symbol, so we can't just do a link + dnl check; we need to include the header properly. + AX_SAVE_FLAGS + CFLAGS="$CFLAGS $LIBUNWIND_CFLAGS" + LIBS="$LIBS $LIBUNWIND_LIBS" + AC_MSG_CHECKING([for unw_get_elf_filename in libunwind]) + AC_LINK_IFELSE([ + AC_LANG_PROGRAM([ + #define UNW_LOCAL_ONLY + #include + ], [ + unw_get_elf_filename(0, 0, 0, 0); + ]) + ], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_LIBUNWIND_ELF, 1, + [libunwind has unw_get_elf_filename]) + ], [ + AC_MSG_RESULT([no]) + ]) + AX_RESTORE_FLAGS + ], [ + AS_IF([test "x$with_libunwind" = "xyes"], [ + AC_MSG_FAILURE([--with-libunwind was given, but libunwind is not available, try installing libunwind-devel]) + ]) + ]) + ]) +]) diff --git a/config/user.m4 b/config/user.m4 index 8d11e031ba..badd920d2b 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -27,6 +27,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_CLOCK_GETTIME ZFS_AC_CONFIG_USER_PAM ZFS_AC_CONFIG_USER_BACKTRACE + ZFS_AC_CONFIG_USER_LIBUNWIND ZFS_AC_CONFIG_USER_RUNSTATEDIR ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am index 9f413b08c1..eb2377305a 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -1,6 +1,6 @@ include $(srcdir)/%D%/include/Makefile.am -libspl_assert_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS) +libspl_assert_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS) $(LIBUNWIND_CFLAGS) libspl_la_CFLAGS = $(libspl_assert_la_CFLAGS) noinst_LTLIBRARIES += libspl_assert.la libspl.la @@ -44,4 +44,4 @@ libspl_la_LIBADD = \ 
libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME) -libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) +libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) $(LIBUNWIND_LIBS) diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index 4acf687f4b..e6e3008f0a 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -49,7 +49,38 @@ pthread_getname_np(pthread_self(), buf, len); #endif -#if defined(HAVE_BACKTRACE) +#if defined(HAVE_LIBUNWIND) +#define UNW_LOCAL_ONLY +#include + +static inline void +libspl_dump_backtrace(void) +{ + unw_context_t uc; + unw_cursor_t cp; + unw_word_t ip, off; + char funcname[128]; +#ifdef HAVE_LIBUNWIND_ELF + char objname[128]; + unw_word_t objoff; +#endif + + fprintf(stderr, "Call trace:\n"); + unw_getcontext(&uc); + unw_init_local(&cp, &uc); + while (unw_step(&cp) > 0) { + unw_get_reg(&cp, UNW_REG_IP, &ip); + unw_get_proc_name(&cp, funcname, sizeof (funcname), &off); +#ifdef HAVE_LIBUNWIND_ELF + unw_get_elf_filename(&cp, objname, sizeof (objname), &objoff); + fprintf(stderr, " [0x%08lx] %s+0x%2lx (in %s +0x%2lx)\n", + ip, funcname, off, objname, objoff); +#else + fprintf(stderr, " [0x%08lx] %s+0x%2lx\n", ip, funcname, off); +#endif + } +} +#elif defined(HAVE_BACKTRACE) #include static inline void From 8fd3a5d02f3f6bad9e8e65b6aded694eae222bf2 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 1 May 2024 13:59:32 -0400 Subject: [PATCH 051/113] Slightly improve dnode hash As I understand just for being less predictable dnode hash includes 8 bits of objset pointer, starting at 6. But since objset_t is more than 1KB in size, its allocations are likely aligned to 2KB, that means 11 lower bits provide no entropy. Just take the 8 bits starting from 11. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #16131 --- module/zfs/dmu_objset.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 2ba26f68e3..5ea99f7428 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -400,10 +400,10 @@ dnode_hash(const objset_t *os, uint64_t obj) ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); /* - * The low 6 bits of the pointer don't have much entropy, because - * the objset_t is larger than 2^6 bytes long. + * The lower 11 bits of the pointer don't have much entropy, because + * the objset_t is more than 1KB long and so likely aligned to 2KB. */ - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 11)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF]; From 645b83307918085ab2f0e12618809e348635b34f Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 1 May 2024 14:07:20 -0400 Subject: [PATCH 052/113] Improve write issue taskqs utilization - Reduce number of allocators on small system down to one per 4 CPU cores, keeping maximum at 4 on 16+ core systems. Small systems should not have the lock contention multiple allocators supposed to solve, while having several metaslabs open and modified each TXG is not free. - Reduce number of write issue taskqs down to one per 16 CPU cores and an integer fraction of number of allocators. On mid- sized systems, where multiple allocators already make sense, too many write issue taskqs may reduce write speed on single-file workloads, since single file is handled by only one taskq to reduce fragmentation. On large systems, that can actually benefit from many taskq's better IOPS, the bottleneck is less important, since in worst case there will be at least 16 cores to handle it. 
- Distribute dnodes between allocators (and taskqs) in a round- robin fashion instead of relying on sync taskqs to be balanced. The last is not guaranteed and may depend on scheduling. - Remove io_wr_iss_tq from struct zio. io_allocator is enough. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16130 --- include/sys/spa.h | 2 + include/sys/spa_impl.h | 9 ++++- include/sys/zio.h | 3 -- man/man4/zfs.4 | 25 ++++++++----- module/zfs/dmu_objset.c | 2 + module/zfs/spa.c | 81 ++++++++++++++++++++++++++--------------- module/zfs/spa_misc.c | 22 +++++++++-- module/zfs/zio.c | 1 - 8 files changed, 98 insertions(+), 47 deletions(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index 001c221fb4..3073c4d1b9 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -829,6 +829,8 @@ extern uint_t zfs_sync_pass_deferred_free; /* spa sync taskqueues */ taskq_t *spa_sync_tq_create(spa_t *spa, const char *name); void spa_sync_tq_destroy(spa_t *spa); +uint_t spa_acq_allocator(spa_t *spa); +void spa_rel_allocator(spa_t *spa, uint_t allocator); void spa_select_allocator(zio_t *zio); /* spa namespace global mutex */ diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index d7da085ab3..a40914ec5f 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -63,6 +63,12 @@ typedef struct spa_alloc { avl_tree_t spaa_tree; } ____cacheline_aligned spa_alloc_t; +typedef struct spa_allocs_use { + kmutex_t sau_lock; + uint_t sau_rotor; + boolean_t sau_inuse[]; +} spa_allocs_use_t; + typedef struct spa_error_entry { zbookmark_phys_t se_bookmark; char *se_name; @@ -192,7 +198,7 @@ typedef struct spa_taskqs { /* one for each thread in the spa sync taskq */ typedef struct spa_syncthread_info { kthread_t *sti_thread; - taskq_t *sti_wr_iss_tq; /* assigned wr_iss taskq */ + uint_t sti_allocator; } spa_syncthread_info_t; typedef enum spa_all_vdev_zap_action { @@ -270,6 +276,7 @@ struct spa { * allocation performance in write-heavy
workloads. */ spa_alloc_t *spa_allocs; + spa_allocs_use_t *spa_allocs_use; int spa_alloc_count; int spa_active_allocator; /* selectable allocator */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 4037b42998..77c70b9b48 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -528,9 +528,6 @@ struct zio { /* Taskq dispatching state */ taskq_ent_t io_tqent; - - /* write issue taskq selection, based upon sync thread */ - taskq_t *io_wr_iss_tq; }; enum blk_verify_flag { diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index ef0385d42b..5edd80659e 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -525,10 +525,17 @@ most ZPL operations (e.g. write, create) will return . .It Sy spa_num_allocators Ns = Ns Sy 4 Pq int Determines the number of block alloctators to use per spa instance. -Capped by the number of actual CPUs in the system. +Capped by the number of actual CPUs in the system via +.Sy spa_cpus_per_allocator . .Pp Note that setting this value too high could result in performance degredation and/or excess fragmentation. +Set value only applies to pools imported/created after that. +. +.It Sy spa_cpus_per_allocator Ns = Ns Sy 4 Pq int +Determines the minimum number of CPUs in a system for block alloctator +per spa instance. +Set value only applies to pools imported/created after that. . .It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint Limits the number of on-disk error log entries that will be converted to the @@ -2339,21 +2346,19 @@ Set value only applies to pools imported/created after that. . .It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint Number of worker threads per taskq. -Lower values improve I/O ordering and CPU utilization, -while higher reduces lock contention. +Higher values improve I/O ordering and CPU utilization, +while lower reduce lock contention. +Set value only applies to pools imported/created after that. .Pp If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. 
Set value only applies to pools imported/created after that. . -.It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 0 Pq uint -Determines the number of CPUs to run write issue taskqs. -.Pp -When 0 (the default), the value to use is computed internally -as the number of actual CPUs in the system divided by the -.Sy spa_num_allocators -value. +.It Sy zio_taskq_write_tpq Ns = Ns Sy 16 Pq uint +Determines the minumum number of threads per write issue taskq. +Higher values improve CPU utilization on high throughput, +while lower reduce taskq locks contention on high IOPS. Set value only applies to pools imported/created after that. . .It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 5ea99f7428..f1818ae155 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1664,12 +1664,14 @@ sync_dnodes_task(void *arg) sync_objset_arg_t *soa = sda->sda_soa; objset_t *os = soa->soa_os; + uint_t allocator = spa_acq_allocator(os->os_spa); multilist_sublist_t *ms = multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx); dmu_objset_sync_dnodes(ms, soa->soa_tx); multilist_sublist_unlock(ms); + spa_rel_allocator(os->os_spa, allocator); kmem_free(sda, sizeof (*sda)); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 147165ee85..ec2b674fb7 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -208,7 +208,7 @@ static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ #endif -static uint_t zio_taskq_wr_iss_ncpus = 0; +static uint_t zio_taskq_write_tpq = 16; /* * Report any spa_load_verify errors found, but do not fail spa_load. @@ -1067,17 +1067,16 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) case ZTI_MODE_SYNC: /* - * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus', - * not to exceed the number of spa allocators. 
+ * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs, + * not to exceed the number of spa allocators, and align to it. */ - if (zio_taskq_wr_iss_ncpus == 0) { - count = MAX(boot_ncpus / spa->spa_alloc_count, 1); - } else { - count = MAX(1, - boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus)); - } + cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); + count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq)); count = MAX(count, (zio_taskq_batch_pct + 99) / 100); count = MIN(count, spa->spa_alloc_count); + while (spa->spa_alloc_count % count != 0 && + spa->spa_alloc_count < count * 2) + count--; /* * zio_taskq_batch_pct is unbounded and may exceed 100%, but no @@ -1495,15 +1494,11 @@ spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q, ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); - if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && - (zio != NULL) && (zio->io_wr_iss_tq != NULL)) { - /* dispatch to assigned write issue taskq */ - tq = zio->io_wr_iss_tq; - return (tq); - } - if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; + } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && + (zio != NULL) && ZIO_HAS_ALLOCATOR(zio)) { + tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } @@ -10233,16 +10228,10 @@ spa_sync_tq_create(spa_t *spa, const char *name) VERIFY(spa->spa_sync_tq != NULL); VERIFY(kthreads != NULL); - spa_taskqs_t *tqs = - &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE]; - spa_syncthread_info_t *ti = spa->spa_syncthreads; - for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) { + for (int i = 0; i < nthreads; i++, ti++) { ti->sti_thread = kthreads[i]; - if (w == tqs->stqs_count) { - w = 0; - } - ti->sti_wr_iss_tq = tqs->stqs_taskq[w]; + ti->sti_allocator = i; } kmem_free(kthreads, sizeof (*kthreads) * nthreads); @@ -10261,6 +10250,42 @@ spa_sync_tq_destroy(spa_t *spa) spa->spa_sync_tq = NULL; } +uint_t 
+spa_acq_allocator(spa_t *spa) +{ + int i; + + if (spa->spa_alloc_count == 1) + return (0); + + mutex_enter(&spa->spa_allocs_use->sau_lock); + uint_t r = spa->spa_allocs_use->sau_rotor; + do { + if (++r == spa->spa_alloc_count) + r = 0; + } while (spa->spa_allocs_use->sau_inuse[r]); + spa->spa_allocs_use->sau_inuse[r] = B_TRUE; + spa->spa_allocs_use->sau_rotor = r; + mutex_exit(&spa->spa_allocs_use->sau_lock); + + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (i = 0; i < spa->spa_alloc_count; i++, ti++) { + if (ti->sti_thread == curthread) { + ti->sti_allocator = r; + break; + } + } + ASSERT3S(i, <, spa->spa_alloc_count); + return (r); +} + +void +spa_rel_allocator(spa_t *spa, uint_t allocator) +{ + if (spa->spa_alloc_count > 1) + spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE; +} + void spa_select_allocator(zio_t *zio) { @@ -10288,8 +10313,7 @@ spa_select_allocator(zio_t *zio) spa_syncthread_info_t *ti = spa->spa_syncthreads; for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { if (ti->sti_thread == curthread) { - zio->io_allocator = i; - zio->io_wr_iss_tq = ti->sti_wr_iss_tq; + zio->io_allocator = ti->sti_allocator; return; } } @@ -10306,7 +10330,6 @@ spa_select_allocator(zio_t *zio) bm->zb_blkid >> 20); zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; - zio->io_wr_iss_tq = NULL; } /* @@ -10919,5 +10942,5 @@ ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, #endif /* END CSTYLED */ -ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW, - "Number of CPUs to run write issue taskqs"); +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW, + "Number of CPUs per write issue taskq"); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 5fb7847b5d..e6d4a9bdb2 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -394,6 +394,7 @@ static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; * Number of allocators to use, per spa instance */ static int spa_num_allocators = 4; +static int 
spa_cpus_per_allocator = 4; /* * Spa active allocator. @@ -747,8 +748,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) if (altroot) spa->spa_root = spa_strdup(altroot); - /* Do not allow more allocators than CPUs. */ - spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus); + /* Do not allow more allocators than fraction of CPUs. */ + spa->spa_alloc_count = MAX(MIN(spa_num_allocators, + boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1); spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * sizeof (spa_alloc_t), KM_SLEEP); @@ -758,6 +760,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); } + if (spa->spa_alloc_count > 1) { + spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t, + sau_inuse[spa->spa_alloc_count]), KM_SLEEP); + mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT, + NULL); + } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); @@ -853,6 +861,11 @@ spa_remove(spa_t *spa) } kmem_free(spa->spa_allocs, spa->spa_alloc_count * sizeof (spa_alloc_t)); + if (spa->spa_alloc_count > 1) { + mutex_destroy(&spa->spa_allocs_use->sau_lock); + kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t, + sau_inuse[spa->spa_alloc_count])); + } avl_destroy(&spa->spa_metaslabs_by_flushed); avl_destroy(&spa->spa_sm_logs_by_txg); @@ -3097,4 +3110,7 @@ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, param_get_uint, ZMOD_RW, "Reserved free space in pool"); ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW, - "Number of allocators per spa, capped by ncpus"); + "Number of allocators per spa"); + +ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW, + "Minimum number of CPUs per allocators"); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 0e7993d87e..870343bf4f 100644 --- 
a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2925,7 +2925,6 @@ static void zio_gang_inherit_allocator(zio_t *pio, zio_t *cio) { cio->io_allocator = pio->io_allocator; - cio->io_wr_iss_tq = pio->io_wr_iss_tq; } static void From 8f1b7a6fa6762ea4c89198ceb11c521f80b92ddc Mon Sep 17 00:00:00 2001 From: Rob N Date: Fri, 3 May 2024 08:18:35 +1000 Subject: [PATCH 053/113] vdev_disk: disable flushes if device does not support it If the underlying device doesn't have a write-back cache, the kernel will just return a successful response. This doesn't hurt anything, but it's extra work on the IO taskqs that are unnecessary. So, detect this when we open the device for the first time. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16148 --- include/os/linux/kernel/linux/blkdev_compat.h | 27 +++++++++++++++++++ module/os/linux/zfs/vdev_disk.c | 7 +++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index b0f398354e..658f546213 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -94,6 +94,33 @@ blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua) #endif } +/* + * Detect if a device has a write cache. Used to set the intial value for the + * vdev nowritecache flag. + * + * 4.10: QUEUE_FLAG_WC added. Initialised by the driver, but can be changed + * later by the operator. If not set, kernel will return flush requests + * immediately without doing anything. + * 6.6: QUEUE_FLAG_HW_WC added. Initialised by the driver, can't be changed. + * Only controls if the operator is allowed to change _WC. Initial version + * buggy; aliased to QUEUE_FLAG_FUA, so unuseable. + * 6.6.10, 6.7: QUEUE_FLAG_HW_WC fixed. 
+ * + * Older than 4.10 we just assume write cache, and let the normal flush fail + * detection apply. + */ +static inline boolean_t +zfs_bdev_has_write_cache(struct block_device *bdev) +{ +#if defined(QUEUE_FLAG_HW_WC) && QUEUE_FLAG_HW_WC != QUEUE_FLAG_FUA + return (test_bit(QUEUE_FLAG_HW_WC, &bdev_get_queue(bdev)->queue_flags)); +#elif defined(QUEUE_FLAG_WC) + return (test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags)); +#else + return (B_TRUE); +#endif +} + static inline void blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages) { diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 2cea61a629..463c5f7051 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -429,8 +429,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, /* Determine the logical block size */ int logical_block_size = bdev_logical_block_size(bdev); - /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ - v->vdev_nowritecache = B_FALSE; + /* + * If the device has a write cache, clear the nowritecache flag, + * so that we start issuing flush requests again. + */ + v->vdev_nowritecache = !zfs_bdev_has_write_cache(bdev); /* Set when device reports it supports TRIM. */ v->vdev_has_trim = bdev_discard_supported(bdev); From 04bae5ec95f7273105237159a882d5b72ec2b998 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 3 May 2024 12:53:34 -0400 Subject: [PATCH 054/113] Disable high priority ZIO threads on FreeBSD and Linux High priority threads are handling ZIL writes. While there is no ZIL compression, there is encryption, checksumming and RAIDZ math. We've found that on large systems 1 taskq with 5 threads can be a bottleneck for throughput, IOPS or both. Instead of just bumping number of threads with a risk of overloading CPUs and increasing latency, switch to using TQ_FRONT mechanism to increase sync write requests priority within standard write threads. 
Do not do it on Illumos, since its TQ_FRONT implementation is inherently unfair. FreeBSD and Linux don't have this problem, so we can do it there. Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored-By: iXsystems, Inc. Closes #16146 --- man/man4/zfs.4 | 2 +- module/zfs/spa.c | 11 ++++++++--- module/zfs/zio.c | 12 +++++++----- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 5edd80659e..6895a2a6d7 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2367,7 +2367,7 @@ This is an advanced debugging parameter. Don't change this unless you understand what it does. Set values only apply to pools imported/created after that. . -.It Sy zio_taskq_write Ns = Ns Sy sync fixed,1,5 scale fixed,1,5 Pq charp +.It Sy zio_taskq_write Ns = Ns Sy sync null scale null Pq charp Set the queue and thread configuration for the IO write queues. This is an advanced debugging parameter. Don't change this unless you understand what it does. diff --git a/module/zfs/spa.c b/module/zfs/spa.c index ec2b674fb7..560fd67087 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -170,14 +170,19 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * that scales with the number of CPUs. * * The different taskq priorities are to handle the different contexts (issue - * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that - * need to be handled with minimum delay. + * and interrupt) and then to reserve threads for high priority I/Os that + * need to be handled with minimum delay. Illumos taskq has unfair TQ_FRONT + * implementation, so separate high priority threads are used there. 
*/ static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ +#ifdef illumos { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ +#else + { ZTI_SYNC, ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* WRITE */ +#endif { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FLUSH */ @@ -1217,7 +1222,7 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) * * Example (the defaults for READ and WRITE) * zio_taskq_read='fixed,1,8 null scale null' - * zio_taskq_write='sync fixed,1,5 scale fixed,1,5' + * zio_taskq_write='sync null scale null' * * Each sets the entire row at a time. * diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 870343bf4f..65a0afaaa2 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2041,12 +2041,14 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) /* * If this is a high priority I/O, then use the high priority taskq if - * available. + * available or cut the line otherwise. */ - if ((zio->io_priority == ZIO_PRIORITY_NOW || - zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) && - spa->spa_zio_taskq[t][q + 1].stqs_count != 0) - q++; + if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) { + if (spa->spa_zio_taskq[t][q + 1].stqs_count != 0) + q++; + else + flags |= TQ_FRONT; + } ASSERT3U(q, <, ZIO_TASKQ_TYPES); From 2dff7527d4a40310f589045f5ab3a07b02963516 Mon Sep 17 00:00:00 2001 From: Daniel Perry Date: Thu, 9 May 2024 10:30:28 -0400 Subject: [PATCH 055/113] Replace usage of schedule_timeout with schedule_timeout_interruptible (#16150) This commit replaces current usages of schedule_timeout() with schedule_timeout_interruptible() in code paths that expect the running task to sleep for a short period of time. 
When schedule_timeout() is called without previously calling set_current_state(), the running task never sleeps because the task state remains in TASK_RUNNING. By calling schedule_timeout_interruptible() to set the task state to TASK_INTERRUPTIBLE before calling schedule_timeout() we achieve the intended/desired behavior of putting the task to sleep for the specified timeout. Reviewed-by: Brian Behlendorf Signed-off-by: Daniel Perry Closes #16150 --- module/os/linux/spl/spl-taskq.c | 2 +- module/os/linux/zfs/vdev_disk.c | 2 +- module/os/linux/zfs/zvol_os.c | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index c384b7b378..e7b812c3b5 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -158,7 +158,7 @@ retry: * throttling the task dispatch rate. */ spin_unlock_irqrestore(&tq->tq_lock, *irqflags); - schedule_timeout(HZ / 100); + schedule_timeout_interruptible(HZ / 100); spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class); if (count < 100) { diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 463c5f7051..7284b922b3 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -397,7 +397,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, if (v->vdev_removed) break; - schedule_timeout(MSEC_TO_TICK(10)); + schedule_timeout_interruptible(MSEC_TO_TICK(10)); } else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) { timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); continue; diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 2a036dc513..3012423e9f 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -798,7 +798,8 @@ retry: if ((gethrtime() - start) > timeout) return (SET_ERROR(-ERESTARTSYS)); - schedule_timeout(MSEC_TO_TICK(10)); + schedule_timeout_interruptible( + MSEC_TO_TICK(10)); goto retry; #endif } 
else { From a0f3c8aaf1e8c1196282e91cca603f877d7a618b Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 9 May 2024 19:31:57 +0500 Subject: [PATCH 056/113] zdb: add missing cleanup for early return Reviewed-by: Brian Behlendorf Reviewed-by: Don Brady Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #16152 --- cmd/zdb/zdb.c | 78 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 25 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 449b6bf2cc..ce80c0aa59 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -120,6 +120,9 @@ static int flagbits[256]; static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */ static int leaked_objects = 0; static range_tree_t *mos_refd_objs; +static spa_t *spa; +static objset_t *os; +static boolean_t kernel_init_done; static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, boolean_t); @@ -131,6 +134,7 @@ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, static void zdb_print_blkptr(const blkptr_t *bp, int flags); +static void zdb_exit(int reason); typedef struct sublivelist_verify_block_refcnt { /* block pointer entry in livelist being verified */ @@ -818,7 +822,7 @@ usage(void) (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); - exit(1); + zdb_exit(1); } static void @@ -849,7 +853,7 @@ fatal(const char *fmt, ...) 
dump_debug_buffer(); - exit(1); + zdb_exit(1); } static void @@ -2276,7 +2280,7 @@ snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, buf = malloc(SPA_MAXBLOCKSIZE); if (buf == NULL) { (void) fprintf(stderr, "out of memory\n"); - exit(1); + zdb_exit(1); } decode_embedded_bp_compressed(bp, buf); memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); @@ -3231,6 +3235,23 @@ fuid_table_destroy(void) } } +static void +zdb_exit(int reason) +{ + if (os != NULL) { + close_objset(os, FTAG); + } else if (spa != NULL) { + spa_close(spa, FTAG); + } + + fuid_table_destroy(); + + if (kernel_init_done) + kernel_fini(); + + exit(reason); +} + /* * print uid or gid information. * For normal POSIX id just the id is printed in decimal format. @@ -4161,32 +4182,32 @@ dump_cachefile(const char *cachefile) if ((fd = open64(cachefile, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", cachefile, strerror(errno)); - exit(1); + zdb_exit(1); } if (fstat64(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", cachefile, strerror(errno)); - exit(1); + zdb_exit(1); } if ((buf = malloc(statbuf.st_size)) == NULL) { (void) fprintf(stderr, "failed to allocate %llu bytes\n", (u_longlong_t)statbuf.st_size); - exit(1); + zdb_exit(1); } if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { (void) fprintf(stderr, "failed to read %llu bytes\n", (u_longlong_t)statbuf.st_size); - exit(1); + zdb_exit(1); } (void) close(fd); if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { (void) fprintf(stderr, "failed to unpack nvlist\n"); - exit(1); + zdb_exit(1); } free(buf); @@ -5102,14 +5123,14 @@ dump_label(const char *dev) if ((fd = open64(path, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", path, strerror(errno)); - exit(1); + zdb_exit(1); } if (fstat64_blk(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", path, strerror(errno)); (void) close(fd); - exit(1); + zdb_exit(1); } if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0) @@ -8221,7 +8242,7 @@ 
dump_zpool(spa_t *spa) if (rc != 0) { dump_debug_buffer(); - exit(rc); + zdb_exit(rc); } } @@ -8825,18 +8846,18 @@ zdb_embedded_block(char *thing) words + 12, words + 13, words + 14, words + 15); if (err != 16) { (void) fprintf(stderr, "invalid input format\n"); - exit(1); + zdb_exit(1); } ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); buf = malloc(SPA_MAXBLOCKSIZE); if (buf == NULL) { (void) fprintf(stderr, "out of memory\n"); - exit(1); + zdb_exit(1); } err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); if (err != 0) { (void) fprintf(stderr, "decode failed: %u\n", err); - exit(1); + zdb_exit(1); } zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); free(buf); @@ -8863,8 +8884,6 @@ int main(int argc, char **argv) { int c; - spa_t *spa = NULL; - objset_t *os = NULL; int dump_all = 1; int verbose = 0; int error = 0; @@ -9093,6 +9112,7 @@ main(int argc, char **argv) spa_mode_readable_spacemaps = B_TRUE; kernel_init(SPA_MODE_READ); + kernel_init_done = B_TRUE; if (dump_all) verbose = MAX(verbose, 1); @@ -9116,19 +9136,23 @@ main(int argc, char **argv) if (argc != 1) usage(); zdb_embedded_block(argv[0]); - return (0); + error = 0; + goto fini; } if (argc < 1) { if (!dump_opt['e'] && dump_opt['C']) { dump_cachefile(spa_config_path); - return (0); + error = 0; + goto fini; } usage(); } - if (dump_opt['l']) - return (dump_label(argv[0])); + if (dump_opt['l']) { + error = dump_label(argv[0]); + goto fini; + } if (dump_opt['X'] || dump_opt['F']) rewind = ZPOOL_DO_REWIND | @@ -9183,7 +9207,8 @@ main(int argc, char **argv) } else if (objset_str && !zdb_numeric(objset_str + 1) && dump_opt['N']) { printf("Supply a numeric objset ID with -N\n"); - exit(1); + error = 1; + goto fini; } } else { target_pool = target; @@ -9240,7 +9265,8 @@ main(int argc, char **argv) if (argc != 2) usage(); dump_opt['v'] = verbose + 3; - return (dump_path(argv[0], argv[1], NULL)); + error = dump_path(argv[0], argv[1], NULL); + goto fini; } if (dump_opt['r']) { @@ -9328,7 +9354,7 @@ main(int 
argc, char **argv) fatal("can't dump '%s': %s", target, strerror(error)); } - return (error); + goto fini; } else { target_pool = strdup(target); if (strpbrk(target, "/@") != NULL) @@ -9458,9 +9484,10 @@ retry_lookup: free(checkpoint_target); } +fini: if (os != NULL) { close_objset(os, FTAG); - } else { + } else if (spa != NULL) { spa_close(spa, FTAG); } @@ -9468,7 +9495,8 @@ retry_lookup: dump_debug_buffer(); - kernel_fini(); + if (kernel_init_done) + kernel_fini(); return (error); } From af5dbed3193eb91e1302e1b976606b64fb9c557b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 9 May 2024 10:32:59 -0400 Subject: [PATCH 057/113] Fix scn_queue races on very old pools Code for pools before version 11 uses dmu_objset_find_dp() to scan for children datasets/clones. It calls enqueue_clones_cb() and enqueue_cb() callbacks in parallel from multiple taskq threads. It ends up bad for scan_ds_queue_insert(), corrupting scn_queue AVL-tree. Fix it by introducing a mutex to protect those two scan_ds_queue_insert() calls. All other calls are done from the sync thread and so serialized. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #16162 --- include/sys/dsl_scan.h | 1 + module/zfs/dsl_scan.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 2e3452e5eb..f32f59a2be 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -173,6 +173,7 @@ typedef struct dsl_scan { dsl_scan_phys_t scn_phys; /* on disk representation of scan */ dsl_scan_phys_t scn_phys_cached; avl_tree_t scn_queue; /* queue of datasets to scan */ + kmutex_t scn_queue_lock; /* serializes scn_queue inserts */ uint64_t scn_queues_pending; /* outstanding data to issue */ /* members needed for syncing error scrub status to disk */ dsl_errorscrub_phys_t errorscrub_phys; diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 55e89b89f0..085cfd3c56 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -491,6 +491,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), offsetof(scan_ds_t, sds_node)); + mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, sizeof (scan_prefetch_issue_ctx_t), offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); @@ -646,6 +647,7 @@ dsl_scan_fini(dsl_pool_t *dp) scan_ds_queue_clear(scn); avl_destroy(&scn->scn_queue); + mutex_destroy(&scn->scn_queue_lock); scan_ds_prefetch_queue_clear(scn); avl_destroy(&scn->scn_prefetch_queue); @@ -2723,8 +2725,10 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) return (err); ds = prev; } + mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); + mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } @@ -2915,8 +2919,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) ds = prev; } + mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); + 
mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } From 3400127a75fda737bc59ae52f1f8ecedd6201117 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 9 May 2024 10:39:57 -0400 Subject: [PATCH 058/113] Fix ZIL clone records for legacy holes Previous code overengineered cloned range calculation by using BP_GET_LSIZE(). The problem is that legacy holes don't have the logical size, so result will be wrong. But we also don't need to look on every block size, since they all must be identical. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16165 --- module/zfs/zfs_log.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 433a653e55..fa4e7093ca 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -895,7 +895,7 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, itx_t *itx; lr_clone_range_t *lr; uint64_t partlen, max_log_data; - size_t i, partnbps; + size_t partnbps; if (zil_replaying(zilog, tx) || zp->z_unlinked) return; @@ -904,10 +904,8 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, while (nbps > 0) { partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); - partlen = 0; - for (i = 0; i < partnbps; i++) { - partlen += BP_GET_LSIZE(&bps[i]); - } + partlen = partnbps * blksz; + ASSERT3U(partlen, <, len + blksz); partlen = MIN(partlen, len); itx = zil_itx_create(txtype, From 414acbd37e0a1121e93310e88956e30554ad1dae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Matu=C5=A1ka?= Date: Thu, 9 May 2024 16:42:51 +0200 Subject: [PATCH 059/113] Unbreak FreeBSD cross-build on MacOS broken in 051460b8b MacOS used FreeBSD-compatible getprogname() and pthread_getname_np(). But pthread_getthreadid_np() does not exist on MacOS. This implements libspl_gettid() using pthread_threadid_np() to get the thread id of the current thread. 
Tested with FreeBSD GitHub actions freebsd-src/.github/workflows/cross-bootstrap-tools.yml Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Martin Matuska Closes #16167 --- lib/libspl/assert.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index e6e3008f0a..5b12c14acd 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -41,9 +41,11 @@ #define libspl_getprogname() (program_invocation_short_name) #define libspl_getthreadname(buf, len) \ prctl(PR_GET_NAME, (unsigned long)(buf), 0, 0, 0) -#elif defined(__FreeBSD__) +#elif defined(__FreeBSD__) || defined(__APPLE__) +#if !defined(__APPLE__) #include #define libspl_gettid() pthread_getthreadid_np() +#endif #define libspl_getprogname() getprogname() #define libspl_getthreadname(buf, len) \ pthread_getname_np(pthread_self(), buf, len); @@ -98,6 +100,19 @@ libspl_dump_backtrace(void) #define libspl_dump_backtrace() #endif +#if defined(__APPLE__) +static inline uint64_t +libspl_gettid(void) +{ + uint64_t tid; + + if (pthread_threadid_np(NULL, &tid) != 0) + tid = 0; + + return (tid); +} +#endif + static boolean_t libspl_assert_ok = B_FALSE; void @@ -128,7 +143,11 @@ libspl_assertf(const char *file, const char *func, int line, fprintf(stderr, "\n" " PID: %-8u COMM: %s\n" +#if defined(__APPLE__) + " TID: %-8" PRIu64 " NAME: %s\n", +#else " TID: %-8u NAME: %s\n", +#endif getpid(), libspl_getprogname(), libspl_gettid(), tname); From 1ede0c716beeee4a720ff5c361121021555d7e3c Mon Sep 17 00:00:00 2001 From: Rob N Date: Fri, 10 May 2024 00:43:48 +1000 Subject: [PATCH 060/113] libspl_assert: always link -lpthread on FreeBSD The pthread_* functions are in -lpthread on FreeBSD. Some of them are implicitly linked through libc, but on FreeBSD 13 at least pthread_getname_np() is not. Just be explicit, since -lpthread is the documented interface anyway. 
Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16168 --- lib/libspl/Makefile.am | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am index eb2377305a..94be416d46 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -45,3 +45,7 @@ libspl_la_LIBADD = \ libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME) libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) $(LIBUNWIND_LIBS) + +if BUILD_FREEBSD +libspl_assert_la_LIBADD += -lpthread +endif From 41ae864b69991f7e13d5171f54a42c721b297233 Mon Sep 17 00:00:00 2001 From: chenqiuhao1997 Date: Fri, 10 May 2024 23:47:21 +0800 Subject: [PATCH 061/113] Replace P2ALIGN with P2ALIGN_TYPED and delete P2ALIGN. In P2ALIGN, the result would be incorrect when align is unsigned integer and x is larger than max value of the type of align. In that case, -(align) would be a positive integer, which means high bits would be zero and finally stay zero after '&' when align is converted to a larger integer type. 
Reviewed-by: Brian Behlendorf Reviewed-by: Youzhong Yang Signed-off-by: Qiuhao Chen Closes #15940 --- cmd/zdb/zdb.c | 2 +- cmd/ztest.c | 15 +++++++++------ include/os/freebsd/spl/sys/ccompile.h | 3 ++- include/os/freebsd/spl/sys/sysmacros.h | 3 ++- include/os/linux/spl/sys/sysmacros.h | 3 ++- lib/libefi/rdwr_efi.c | 4 ++-- lib/libspl/include/os/linux/sys/sysmacros.h | 3 ++- lib/libzfs/os/linux/libzfs_pool_os.c | 3 ++- module/os/freebsd/zfs/vdev_geom.c | 2 +- module/os/linux/zfs/zvol_os.c | 2 +- module/zcommon/zfs_fletcher.c | 8 +++++--- module/zfs/btree.c | 2 +- module/zfs/dmu.c | 5 +++-- module/zfs/dmu_object.c | 2 +- module/zfs/metaslab.c | 4 ++-- module/zfs/vdev.c | 11 ++++++----- module/zfs/vdev_raidz.c | 3 ++- 17 files changed, 44 insertions(+), 31 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index ce80c0aa59..797ae34b6e 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -5143,7 +5143,7 @@ dump_label(const char *dev) sizeof (cksum_record_t), offsetof(cksum_record_t, link)); psize = statbuf.st_size; - psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); + psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t); ashift = SPA_MINBLOCKSHIFT; /* diff --git a/cmd/ztest.c b/cmd/ztest.c index b0fea8b3cf..56eb01618c 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -2529,7 +2529,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, ASSERT3P(zio, !=, NULL); size = doi.doi_data_block_size; if (ISP2(size)) { - offset = P2ALIGN(offset, size); + offset = P2ALIGN_TYPED(offset, size, uint64_t); } else { ASSERT3U(offset, <, size); offset = 0; @@ -3978,7 +3978,8 @@ raidz_scratch_verify(void) raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); offset = RRSS_GET_OFFSET(&spa->spa_uberblock); state = RRSS_GET_STATE(&spa->spa_uberblock); - write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift); + write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift, + uint64_t); logical_size = write_size * raidvd->vdev_children; switch (state) { 
@@ -5016,7 +5017,8 @@ ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) */ mutex_enter(&os->os_obj_lock); object = ztest_random(os->os_obj_next_chunk); - os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); + os->os_obj_next_chunk = P2ALIGN_TYPED(object, dnodes_per_chunk, + uint64_t); mutex_exit(&os->os_obj_lock); } @@ -6638,7 +6640,8 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * the end of the disk (vdev_psize) is aligned to * sizeof (vdev_label_t). */ - uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); + uint64_t psize = P2ALIGN_TYPED(fsize, sizeof (vdev_label_t), + uint64_t); if ((leaf & 1) == 1 && offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) continue; @@ -6962,8 +6965,8 @@ ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) size_t inc = 64 * ztest_random(size / 67); /* sometimes add few bytes to test non-simd */ if (ztest_random(100) < 10) - inc += P2ALIGN(ztest_random(64), - sizeof (uint32_t)); + inc += P2ALIGN_TYPED(ztest_random(64), + sizeof (uint32_t), uint64_t); if (inc > (size - pos)) inc = size - pos; diff --git a/include/os/freebsd/spl/sys/ccompile.h b/include/os/freebsd/spl/sys/ccompile.h index 26cf4db87a..bebeb0db24 100644 --- a/include/os/freebsd/spl/sys/ccompile.h +++ b/include/os/freebsd/spl/sys/ccompile.h @@ -138,7 +138,8 @@ typedef int enum_t; #define readdir64 readdir #define dirent64 dirent #endif -#define P2ALIGN(x, align) ((x) & -(align)) +// Deprecated. Use P2ALIGN_TYPED instead. 
+// #define P2ALIGN(x, align) ((x) & -(align)) #define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) #define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) #define P2PHASE(x, align) ((x) & ((align) - 1)) diff --git a/include/os/freebsd/spl/sys/sysmacros.h b/include/os/freebsd/spl/sys/sysmacros.h index 3e8841ae66..2c9f4438d7 100644 --- a/include/os/freebsd/spl/sys/sysmacros.h +++ b/include/os/freebsd/spl/sys/sysmacros.h @@ -191,7 +191,8 @@ extern unsigned char bcd_to_byte[256]; * eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align) * eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align) */ -#define P2ALIGN(x, align) ((x) & -(align)) +// Deprecated. Use P2ALIGN_TYPED instead. +// #define P2ALIGN(x, align) ((x) & -(align)) /* * return x % (mod) align diff --git a/include/os/linux/spl/sys/sysmacros.h b/include/os/linux/spl/sys/sysmacros.h index 99e3a6fb41..0e83907363 100644 --- a/include/os/linux/spl/sys/sysmacros.h +++ b/include/os/linux/spl/sys/sysmacros.h @@ -159,7 +159,8 @@ makedev(unsigned int major, unsigned int minor) /* * Compatibility macros/typedefs needed for Solaris -> Linux port */ -#define P2ALIGN(x, align) ((x) & -(align)) +// Deprecated. Use P2ALIGN_TYPED instead. +// #define P2ALIGN(x, align) ((x) & -(align)) #define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) #define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) #define P2PHASE(x, align) ((x) & ((align) - 1)) diff --git a/lib/libefi/rdwr_efi.c b/lib/libefi/rdwr_efi.c index 739219e041..63c91059ae 100644 --- a/lib/libefi/rdwr_efi.c +++ b/lib/libefi/rdwr_efi.c @@ -1175,8 +1175,8 @@ efi_use_whole_disk(int fd) * (for performance reasons). The alignment should match the * alignment used by the "zpool_label_disk" function. 
*/ - limit = P2ALIGN(efi_label->efi_last_lba - nblocks - EFI_MIN_RESV_SIZE, - PARTITION_END_ALIGNMENT); + limit = P2ALIGN_TYPED(efi_label->efi_last_lba - nblocks - + EFI_MIN_RESV_SIZE, PARTITION_END_ALIGNMENT, diskaddr_t); if (data_start + data_size != limit || resv_start != limit) sync_needed = B_TRUE; diff --git a/lib/libspl/include/os/linux/sys/sysmacros.h b/lib/libspl/include/os/linux/sys/sysmacros.h index 5765ee25c6..26e1c87a35 100644 --- a/lib/libspl/include/os/linux/sys/sysmacros.h +++ b/lib/libspl/include/os/linux/sys/sysmacros.h @@ -52,7 +52,8 @@ /* * Compatibility macros/typedefs needed for Solaris -> Linux port */ -#define P2ALIGN(x, align) ((x) & -(align)) +// Deprecated. Use P2ALIGN_TYPED instead. +// #define P2ALIGN(x, align) ((x) & -(align)) #define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) #define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) #define P2BOUNDARY(off, len, align) \ diff --git a/lib/libzfs/os/linux/libzfs_pool_os.c b/lib/libzfs/os/linux/libzfs_pool_os.c index 86eef3255b..7b18e31c86 100644 --- a/lib/libzfs/os/linux/libzfs_pool_os.c +++ b/lib/libzfs/os/linux/libzfs_pool_os.c @@ -268,7 +268,8 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name) if (start_block == MAXOFFSET_T) start_block = NEW_START_BLOCK; slice_size -= start_block; - slice_size = P2ALIGN(slice_size, PARTITION_END_ALIGNMENT); + slice_size = P2ALIGN_TYPED(slice_size, PARTITION_END_ALIGNMENT, + uint64_t); vtoc->efi_parts[0].p_start = start_block; vtoc->efi_parts[0].p_size = slice_size; diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index 9d88971919..38c1d8e9e4 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -457,7 +457,7 @@ vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) ZFS_LOG(1, "Reading config from %s...", pp->name); psize = pp->mediasize; - psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); + psize = P2ALIGN_TYPED(psize, sizeof 
(vdev_label_t), uint64_t); size = sizeof (*vdev_lists[0]) + pp->sectorsize - ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1; diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 3012423e9f..3e020e5322 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -384,7 +384,7 @@ zvol_discard(zv_request_t *zvr) */ if (!io_is_secure_erase(bio, rq)) { start = P2ROUNDUP(start, zv->zv_volblocksize); - end = P2ALIGN(end, zv->zv_volblocksize); + end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); size = end - start; } diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index 619ddef024..74b8c2a475 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -471,7 +471,8 @@ fletcher_4_native(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; - const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); + const uint64_t p2size = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE, + uint64_t); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); @@ -519,7 +520,8 @@ fletcher_4_byteswap(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; - const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); + const uint64_t p2size = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE, + uint64_t); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); @@ -878,7 +880,7 @@ abd_fletcher_4_iter(void *data, size_t size, void *private) fletcher_4_ctx_t *ctx = cdp->acd_ctx; fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private; boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE; - uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); + uint64_t asize = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE, uint64_t); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); diff --git a/module/zfs/btree.c b/module/zfs/btree.c index af2b94a850..9c52083603 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -218,7 
+218,7 @@ zfs_btree_create_custom(zfs_btree_t *tree, zfs_btree_find_in_buf : bt_find_in_buf; tree->bt_elem_size = size; tree->bt_leaf_size = lsize; - tree->bt_leaf_cap = P2ALIGN(esize / size, 2); + tree->bt_leaf_cap = P2ALIGN_TYPED(esize / size, 2, size_t); tree->bt_height = -1; tree->bt_bulk = NULL; } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 6ef149aab9..8b440aafba 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -537,7 +537,8 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; + P2ALIGN_TYPED(offset, 1ULL << blkshift, uint64_t)) + >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " @@ -854,7 +855,7 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) } /* set start to the beginning of this L1 indirect */ - *start = P2ALIGN(*start, iblkrange); + *start = P2ALIGN_TYPED(*start, iblkrange, uint64_t); } if (*start < minimum) *start = minimum; diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index d0e39a423b..56986ea434 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -160,7 +160,7 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, * is not suitably aligned. */ os->os_obj_next_chunk = - P2ALIGN(object, dnodes_per_chunk) + + P2ALIGN_TYPED(object, dnodes_per_chunk, uint64_t) + dnodes_per_chunk; (void) atomic_swap_64(cpuobj, object); mutex_exit(&os->os_obj_lock); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 9e762357b7..cb004930d2 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -627,8 +627,8 @@ metaslab_class_expandable_space(metaslab_class_t *mc) * metaslabs. We report the expandable space in terms * of the metaslab size since that's the unit of expansion. 
*/ - space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize, - 1ULL << tvd->vdev_ms_shift); + space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize, + 1ULL << tvd->vdev_ms_shift, uint64_t); } spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (space); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index c5551eb6cf..414bf84f6f 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -348,7 +348,8 @@ vdev_get_min_asize(vdev_t *vd) * to the nearest metaslab. */ if (vd == vd->vdev_top) - return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); + return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift, + uint64_t)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } @@ -2115,8 +2116,8 @@ vdev_open(vdev_t *vd) } } - osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); - max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); + osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t); + max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { @@ -4764,9 +4765,9 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) * can expand. 
*/ if (vd->vdev_aux == NULL && tvd != NULL) { - vs->vs_esize = P2ALIGN( + vs->vs_esize = P2ALIGN_TYPED( vd->vdev_max_asize - vd->vdev_asize, - 1ULL << tvd->vdev_ms_shift); + 1ULL << tvd->vdev_ms_shift, uint64_t); } vs->vs_configured_ashift = vd->vdev_top != NULL diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index de7d0fa794..15c8b8ca60 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -4039,7 +4039,8 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); int ashift = raidvd->vdev_ashift; - uint64_t write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << ashift); + uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, + uint64_t); uint64_t logical_size = write_size * raidvd->vdev_children; uint64_t read_size = P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), From 136c05321140ecefa81a830754c64a7867d033e0 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 10 May 2024 15:35:20 -0400 Subject: [PATCH 062/113] ZAP: Fix leaf references on zap_expand_leaf() errors Depending on kind of error zap_expand_leaf() may return with or without valid leaf reference held. Make sure it returns NULL if due to error it has no leaf to return. Make its callers to check the returned leaf pointer, and release the leaf if it is not NULL. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #12366 Closes #16159 --- module/zfs/zap.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 1b6b16fc66..81dab80daf 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -711,6 +711,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, uint64_t object = zap->zap_object; zap_put_leaf(l); + *lp = l = NULL; zap_unlockdir(zap, tag); err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); @@ -920,21 +921,17 @@ retry: } else if (err == EAGAIN) { err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ - if (err == 0) { + if (err == 0) goto retry; - } else if (err == ENOSPC) { - /* - * If we failed to expand the leaf, then bailout - * as there is no point trying - * zap_put_leaf_maybe_grow_ptrtbl(). - */ - return (err); - } } out: - if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + if (l != NULL) { + if (err == ENOSPC) + zap_put_leaf(l); + else + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + } return (err); } @@ -991,8 +988,12 @@ retry: goto retry; } - if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + if (l != NULL) { + if (err == ENOSPC) + zap_put_leaf(l); + else + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + } return (err); } From abec7dcd30acfb195bca36334cec4fe82b082b1d Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 13 May 2024 17:12:07 -0500 Subject: [PATCH 063/113] Linux: disable lockdep for a couple of locks When running a debug kernel with lockdep enabled there are several locks which report false positives. Set MUTEX_NOLOCKDEP/RW_NOLOCKDEP to disable these warnings. 
Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #16188 --- module/os/linux/spl/spl-procfs-list.c | 2 +- module/zfs/dbuf.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/module/os/linux/spl/spl-procfs-list.c b/module/os/linux/spl/spl-procfs-list.c index 5e073950d6..91840ed2ca 100644 --- a/module/os/linux/spl/spl-procfs-list.c +++ b/module/os/linux/spl/spl-procfs-list.c @@ -234,7 +234,7 @@ procfs_list_install(const char *module, modulestr = kmem_asprintf("%s/%s", module, submodule); else modulestr = kmem_asprintf("%s", module); - mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&procfs_list->pl_lock, NULL, MUTEX_NOLOCKDEP, NULL); list_create(&procfs_list->pl_list, procfs_list_node_off + sizeof (procfs_list_node_t), procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index bb913f5563..806ebcfc57 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -292,8 +292,8 @@ dbuf_cons(void *vdb, void *unused, int kmflag) dmu_buf_impl_t *db = vdb; memset(db, 0, sizeof (dmu_buf_impl_t)); - mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); - rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&db->db_mtx, NULL, MUTEX_NOLOCKDEP, NULL); + rw_init(&db->db_rwlock, NULL, RW_NOLOCKDEP, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); multilist_link_init(&db->db_cache_link); zfs_refcount_create(&db->db_holds); @@ -958,7 +958,7 @@ dbuf_init(void) 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (int i = 0; i < hmsize; i++) - mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + mutex_init(&h->hash_mutexes[i], NULL, MUTEX_NOLOCKDEP, NULL); dbuf_stats_init(h); From 975a13259b87572c39d8467f1f4a31869d0abc84 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Thu, 2 May 2024 19:28:10 +0000 Subject: [PATCH 064/113] Add support for parallel pool exports Changed spa_export_common() such that it no longer holds the spa_namespace_lock for 
the entire duration and instead sets spa_export_thread to indicate an export is in progress on the spa. This allows for an export to a different pool to proceed in parallel while an export is still processing potentially long operations like spa_unload_log_sm_flush_all(). Calls like spa_lookup() and spa_vdev_enter() that rely on the spa_namespace_lock to serialize them against a concurrent export, now wait for any in-progress export thread to complete before proceeding. The 'zpool export -a' sub-command also provides multi-threaded support, using a thread pool to submit the exports in parallel. Sponsored-By: Klara Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Don Brady Closes #16153 --- cmd/zpool/zpool_main.c | 88 +++++++++++- include/sys/spa_impl.h | 1 + module/zfs/arc.c | 4 +- module/zfs/spa.c | 36 ++++- module/zfs/spa_misc.c | 50 +++++-- module/zfs/vdev_initialize.c | 9 +- module/zfs/vdev_rebuild.c | 3 +- module/zfs/vdev_trim.c | 9 +- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 2 + .../zpool_export_parallel_admin.ksh | 72 ++++++++++ .../zpool_export_parallel_pos.ksh | 129 ++++++++++++++++++ 12 files changed, 373 insertions(+), 33 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 300b383af4..400f4bf1a6 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2030,10 +2030,19 @@ zpool_do_destroy(int argc, char **argv) } typedef struct export_cbdata { + tpool_t *tpool; + pthread_mutex_t mnttab_lock; boolean_t force; boolean_t hardforce; + int retval; } export_cbdata_t; + +typedef struct { + char *aea_poolname; + export_cbdata_t *aea_cbdata; +} async_export_args_t; + /* * Export one pool */ @@ -2042,11 +2051,20 @@ 
zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; - if (zpool_disable_datasets(zhp, cb->force) != 0) - return (1); + /* + * zpool_disable_datasets() is not thread-safe for mnttab access. + * So we serialize access here for 'zpool export -a' parallel case. + */ + if (cb->tpool != NULL) + pthread_mutex_lock(&cb->mnttab_lock); - /* The history must be logged as part of the export */ - log_history = B_FALSE; + int retval = zpool_disable_datasets(zhp, cb->force); + + if (cb->tpool != NULL) + pthread_mutex_unlock(&cb->mnttab_lock); + + if (retval) + return (1); if (cb->hardforce) { if (zpool_export_force(zhp, history_str) != 0) @@ -2058,6 +2076,48 @@ zpool_export_one(zpool_handle_t *zhp, void *data) return (0); } +/* + * Asynchronous export request + */ +static void +zpool_export_task(void *arg) +{ + async_export_args_t *aea = arg; + + zpool_handle_t *zhp = zpool_open(g_zfs, aea->aea_poolname); + if (zhp != NULL) { + int ret = zpool_export_one(zhp, aea->aea_cbdata); + if (ret != 0) + aea->aea_cbdata->retval = ret; + zpool_close(zhp); + } else { + aea->aea_cbdata->retval = 1; + } + + free(aea->aea_poolname); + free(aea); +} + +/* + * Process an export request in parallel + */ +static int +zpool_export_one_async(zpool_handle_t *zhp, void *data) +{ + tpool_t *tpool = ((export_cbdata_t *)data)->tpool; + async_export_args_t *aea = safe_malloc(sizeof (async_export_args_t)); + + /* save pool name since zhp will go out of scope */ + aea->aea_poolname = strdup(zpool_get_name(zhp)); + aea->aea_cbdata = data; + + /* ship off actual export to another thread */ + if (tpool_dispatch(tpool, zpool_export_task, (void *)aea) != 0) + return (errno); /* unlikely */ + else + return (0); +} + /* * zpool export [-f] ... 
* @@ -2098,17 +2158,33 @@ zpool_do_export(int argc, char **argv) cb.force = force; cb.hardforce = hardforce; + cb.tpool = NULL; + cb.retval = 0; argc -= optind; argv += optind; + /* The history will be logged as part of the export itself */ + log_history = B_FALSE; + if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } - return (for_each_pool(argc, argv, B_TRUE, NULL, - ZFS_TYPE_POOL, B_FALSE, zpool_export_one, &cb)); + cb.tpool = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN), + 0, NULL); + pthread_mutex_init(&cb.mnttab_lock, NULL); + + /* Asynchronously call zpool_export_one using thread pool */ + ret = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, + B_FALSE, zpool_export_one_async, &cb); + + tpool_wait(cb.tpool); + tpool_destroy(cb.tpool); + (void) pthread_mutex_destroy(&cb.mnttab_lock); + + return (ret | cb.retval); } /* check arguments */ diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index a40914ec5f..2004166781 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -243,6 +243,7 @@ struct spa { dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ boolean_t spa_is_exporting; /* true while exporting pool */ + kthread_t *spa_export_thread; /* valid during pool export */ kthread_t *spa_load_thread; /* loading, no namespace lock */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 51039af9bc..d1d60b8410 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8143,11 +8143,11 @@ l2arc_dev_get_next(void) ASSERT3P(next, !=, NULL); } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all); + next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all) 
+ next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting) next = NULL; l2arc_dev_last = next; diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 560fd67087..1f047dcd2a 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -34,6 +34,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. + * Copyright (c) 2024, Klara Inc. */ /* @@ -1991,7 +1992,8 @@ spa_destroy_aux_threads(spa_t *spa) static void spa_unload(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); spa_import_progress_remove(spa_guid(spa)); @@ -6955,7 +6957,7 @@ static int spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { - int error; + int error = 0; spa_t *spa; hrtime_t export_start = gethrtime(); @@ -6979,8 +6981,8 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, spa->spa_is_exporting = B_TRUE; /* - * Put a hold on the pool, drop the namespace lock, stop async tasks, - * reacquire the namespace lock, and see if we can export. + * Put a hold on the pool, drop the namespace lock, stop async tasks + * and see if we can export. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); @@ -6990,10 +6992,18 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, taskq_wait(spa->spa_zvol_taskq); } mutex_enter(&spa_namespace_lock); + spa->spa_export_thread = curthread; spa_close(spa, FTAG); + mutex_exit(&spa_namespace_lock); + + /* + * At this point we no longer hold the spa_namespace_lock and + * the spa_export_thread indicates that an export is in progress. + */ if (spa->spa_state == POOL_STATE_UNINITIALIZED) goto export_spa; + /* * The pool will be in core if it's openable, in which case we can * modify its state. 
Objsets may be open only because they're dirty, @@ -7089,6 +7099,10 @@ export_spa: if (oldconfig && spa->spa_config) *oldconfig = fnvlist_dup(spa->spa_config); + if (new_state == POOL_STATE_EXPORTED) + zio_handle_export_delay(spa, gethrtime() - export_start); + + mutex_enter(&spa_namespace_lock); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); @@ -7100,17 +7114,25 @@ export_spa: * we make sure to reset the exporting flag. */ spa->spa_is_exporting = B_FALSE; + spa->spa_export_thread = NULL; } - if (new_state == POOL_STATE_EXPORTED) - zio_handle_export_delay(spa, gethrtime() - export_start); - + /* + * Wake up any waiters on spa_namespace_lock + * They need to re-attempt a spa_lookup() + */ + cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (0); fail: + mutex_enter(&spa_namespace_lock); spa->spa_is_exporting = B_FALSE; + spa->spa_export_thread = NULL; spa_async_resume(spa); + + /* Wake up any waiters on spa_namespace_lock */ + cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (error); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index e6d4a9bdb2..6d7667cf3e 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -27,7 +27,7 @@ * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K . All rights reserved. - * Copyright (c) 2023, Klara Inc. + * Copyright (c) 2023, 2024, Klara Inc. */ #include @@ -82,8 +82,8 @@ * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices - * - Held for the duration of create/destroy/export - * - Held at the start and end of import + * - Held for the duration of create/destroy + * - Held at the start and end of import and export * * It does not need to handle recursion. 
A create or destroy may * reference objects (files or zvols) in other pools, but by @@ -636,8 +636,14 @@ retry: if (spa == NULL) return (NULL); - if (spa->spa_load_thread != NULL && - spa->spa_load_thread != curthread) { + /* + * Avoid racing with import/export, which don't hold the namespace + * lock for their entire duration. + */ + if ((spa->spa_load_thread != NULL && + spa->spa_load_thread != curthread) || + (spa->spa_export_thread != NULL && + spa->spa_export_thread != curthread)) { cv_wait(&spa_namespace_cv, &spa_namespace_lock); goto retry; } @@ -950,14 +956,15 @@ spa_open_ref(spa_t *spa, const void *tag) /* * Remove a reference to the given spa_t. Must have at least one reference, or - * have the namespace lock held. + * have the namespace lock held or be part of a pool import/export. */ void spa_close(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock) || - spa->spa_load_thread == curthread); + spa->spa_load_thread == curthread || + spa->spa_export_thread == curthread); (void) zfs_refcount_remove(&spa->spa_refcount, tag); } @@ -977,13 +984,15 @@ spa_async_close(spa_t *spa, const void *tag) /* * Check to see if the spa refcount is zero. Must be called with - * spa_namespace_lock held. We really compare against spa_minref, which is the - * number of references acquired when opening a pool + * spa_namespace_lock held or be the spa export thread. 
We really + * compare against spa_minref, which is the number of references + * acquired when opening a pool */ boolean_t spa_refcount_zero(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); } @@ -1231,6 +1240,21 @@ spa_vdev_enter(spa_t *spa) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + /* + * We have a reference on the spa and a spa export could be + * starting but no longer holding the spa_namespace_lock. So + * check if there is an export and if so wait. It will fail + * fast (EBUSY) since we are still holding a spa reference. + * + * Note that we can be woken by a different spa transitioning + * through an import/export, so we must wait for our condition + * to change before proceeding. + */ + while (spa->spa_export_thread != NULL && + spa->spa_export_thread != curthread) { + cv_wait(&spa_namespace_cv, &spa_namespace_lock); + } + vdev_autotrim_stop_all(spa); return (spa_vdev_config_enter(spa)); @@ -1248,6 +1272,12 @@ spa_vdev_detach_enter(spa_t *spa, uint64_t guid) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + /* See comment in spa_vdev_enter() */ + while (spa->spa_export_thread != NULL && + spa->spa_export_thread != curthread) { + cv_wait(&spa_namespace_cv, &spa_namespace_lock); + } + vdev_autotrim_stop_all(spa); if (guid != 0) { diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index c5e16af166..0a7323f58d 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -682,7 +682,8 @@ vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list) (void) spa; vdev_t *vd; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_initialize_lock); @@ -724,7 +725,8 @@ 
vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state, if (vd_list == NULL) { vdev_initialize_stop_wait_impl(vd); } else { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } } @@ -756,7 +758,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) spa_t *spa = vd->vdev_spa; list_t vd_list; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 00ebd4c9fc..8a8b02cab5 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -1087,7 +1087,8 @@ vdev_rebuild_stop_wait(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); if (vd == spa->spa_root_vdev) { for (uint64_t i = 0; i < vd->vdev_children; i++) diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 9753d5a1ea..9cf10332e8 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -1040,7 +1040,8 @@ vdev_trim_stop_wait(spa_t *spa, list_t *vd_list) (void) spa; vdev_t *vd; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_trim_lock); @@ -1079,7 +1080,8 @@ vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list) if (vd_list == NULL) { vdev_trim_stop_wait_impl(vd); } else { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } } @@ -1115,7 +1117,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) list_t 
vd_list; vdev_t *vd_l2cache; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 5e7fdf359a..ac2c541a91 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -430,7 +430,8 @@ tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', - 'zpool_export_003_neg', 'zpool_export_004_pos'] + 'zpool_export_003_neg', 'zpool_export_004_pos', + 'zpool_export_parallel_pos', 'zpool_export_parallel_admin'] tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index d625c040b8..44eedcf6fa 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1084,6 +1084,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_export/zpool_export_002_pos.ksh \ functional/cli_root/zpool_export/zpool_export_003_neg.ksh \ functional/cli_root/zpool_export/zpool_export_004_pos.ksh \ + functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh \ + functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh \ functional/cli_root/zpool_get/cleanup.ksh \ functional/cli_root/zpool_get/setup.ksh \ functional/cli_root/zpool_get/vdev_get_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh new file mode 100755 index 0000000000..cab8fc2b42 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh @@ -0,0 +1,72 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms 
of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that admin commands cannot race a pool export +# +# STRATEGY: +# 1. Create a pool +# 2. Export the pool with an injected delay in the background +# 3. Execute some admin commands against the pool +# + +verify_runnable "global" + +DEVICE_DIR=$TEST_BASE_DIR/dev_export-test + +function cleanup +{ + zinject -c all + poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 + [[ -d $DEVICE_DIR ]] && log_must rm -rf $DEVICE_DIR +} + +log_assert "admin commands cannot race a pool export" + +log_onexit cleanup + +[[ ! 
-d $DEVICE_DIR ]] && log_must mkdir -p $DEVICE_DIR +log_must truncate -s $MINVDEVSIZE ${DEVICE_DIR}/disk0 ${DEVICE_DIR}/disk1 + +log_must zpool create -f $TESTPOOL1 mirror ${DEVICE_DIR}/disk0 ${DEVICE_DIR}/disk1 + +log_must zinject -P export -s 10 $TESTPOOL1 + +log_must zpool export $TESTPOOL1 & + +zpool set comment=hello $TESTPOOL1 +zpool reguid $TESTPOOL1 & +zpool split $TESTPOOL1 & + +log_pass "admin commands cannot race a pool export" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh new file mode 100755 index 0000000000..037d17d082 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh @@ -0,0 +1,129 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg +. 
$STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# test uses 8 vdevs +MAX_NUM=8 +DEVICE_DIR=$TEST_BASE_DIR/dev_import-test + + +# +# DESCRIPTION: +# Verify that pool exports can occur in parallel +# +# STRATEGY: +# 1. Create 8 pools +# 2. Inject an export delay using zinject +# 3. Export half of the pools synchronously to baseline sequential cost +# 4. Export the other half asynchronously to demonstrate parallel savings +# 6. Import 4 pools +# 7. Test zpool export -a +# + +verify_runnable "global" + +# +# override the minimum sized vdevs +# + +POOLNAME="test_pool" + +function cleanup +{ + zinject -c all + + for i in {0..$(($MAX_NUM - 1))}; do + poolexists $POOLNAME-$i && destroy_pool $POOLNAME-$i + done + + [[ -d $DEVICE_DIR ]] && log_must rm -rf $DEVICE_DIR +} + +log_assert "Pool exports can occur in parallel" + +log_onexit cleanup + +[[ ! -d $DEVICE_DIR ]] && log_must mkdir -p $DEVICE_DIR + +# +# Create some pools with export delay injectors +# +for i in {0..$(($MAX_NUM - 1))}; do + log_must truncate -s $MINVDEVSIZE ${DEVICE_DIR}/disk$i + log_must zpool create $POOLNAME-$i $DEVICE_DIR/disk$i + log_must zinject -P export -s 8 $POOLNAME-$i +done + +# +# Export half of the pools synchronously +# +SECONDS=0 +for i in {0..3}; do + log_must zpool export $POOLNAME-$i +done +sequential_time=$SECONDS +log_note "sequentially exported 4 pools in $sequential_time seconds" + +# +# Export half of the pools in parallel +# +SECONDS=0 +for i in {4..7}; do + log_must zpool export $POOLNAME-$i & +done +wait +parallel_time=$SECONDS +log_note "asyncronously exported 4 pools in $parallel_time seconds" + +log_must test $parallel_time -lt $(($sequential_time / 3)) + +# +# import 4 pools with export delay injectors +# +for i in {4..7}; do + log_must zpool import -d $DEVICE_DIR/disk$i $POOLNAME-$i + log_must zinject -P export -s 8 $POOLNAME-$i +done + +# +# now test zpool export -a +# +SECONDS=0 +log_must zpool export -a +parallel_time=$SECONDS +log_note 
"asyncronously exported 4 pools, using '-a', in $parallel_time seconds" + +log_must test $parallel_time -lt $(($sequential_time / 3)) + +log_pass "Pool exports occur in parallel" From 89acef992bf328e0ffba63950b176a0d9572b792 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Sun, 5 May 2024 14:57:33 +0000 Subject: [PATCH 065/113] Simplified the scope of the namespace lock If we wait until after we check for no spa references to drop the namespace lock, then we know that spa consumers will need to call spa_lookup() and end up waiting on the spa_namespace_cv until we finish. This narrows the external checks to spa_lookup and we no longer need to worry about the spa_vdev_enter case. Sponsored-By: Klara Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Don Brady Closes #16153 --- module/zfs/spa.c | 32 ++++++++++++++++++++------------ module/zfs/spa_misc.c | 21 ++------------------- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 1f047dcd2a..9eb14b4f1d 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -6994,15 +6994,11 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, mutex_enter(&spa_namespace_lock); spa->spa_export_thread = curthread; spa_close(spa, FTAG); - mutex_exit(&spa_namespace_lock); - /* - * At this point we no longer hold the spa_namespace_lock and - * the spa_export_thread indicates that an export is in progress. - */ - - if (spa->spa_state == POOL_STATE_UNINITIALIZED) + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { + mutex_exit(&spa_namespace_lock); goto export_spa; + } /* * The pool will be in core if it's openable, in which case we can @@ -7024,6 +7020,14 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, goto fail; } + mutex_exit(&spa_namespace_lock); + /* + * At this point we no longer hold the spa_namespace_lock and + * there were no references on the spa. 
Future spa_lookups will + * notice the spa->spa_export_thread and wait until we signal + * that we are finshed. + */ + if (spa->spa_sync_on) { vdev_t *rvd = spa->spa_root_vdev; /* @@ -7035,6 +7039,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { error = SET_ERROR(EXDEV); + mutex_enter(&spa_namespace_lock); goto fail; } @@ -7102,6 +7107,9 @@ export_spa: if (new_state == POOL_STATE_EXPORTED) zio_handle_export_delay(spa, gethrtime() - export_start); + /* + * Take the namespace lock for the actual spa_t removal + */ mutex_enter(&spa_namespace_lock); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) @@ -7118,20 +7126,20 @@ export_spa: } /* - * Wake up any waiters on spa_namespace_lock - * They need to re-attempt a spa_lookup() + * Wake up any waiters in spa_lookup() */ cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (0); fail: - mutex_enter(&spa_namespace_lock); spa->spa_is_exporting = B_FALSE; spa->spa_export_thread = NULL; - spa_async_resume(spa); - /* Wake up any waiters on spa_namespace_lock */ + spa_async_resume(spa); + /* + * Wake up any waiters in spa_lookup() + */ cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (error); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 6d7667cf3e..d1d41bbe72 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1240,20 +1240,7 @@ spa_vdev_enter(spa_t *spa) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); - /* - * We have a reference on the spa and a spa export could be - * starting but no longer holding the spa_namespace_lock. So - * check if there is an export and if so wait. It will fail - * fast (EBUSY) since we are still holding a spa reference. 
- * - * Note that we can be woken by a different spa transitioning - * through an import/export, so we must wait for our condition - * to change before proceeding. - */ - while (spa->spa_export_thread != NULL && - spa->spa_export_thread != curthread) { - cv_wait(&spa_namespace_cv, &spa_namespace_lock); - } + ASSERT0(spa->spa_export_thread); vdev_autotrim_stop_all(spa); @@ -1272,11 +1259,7 @@ spa_vdev_detach_enter(spa_t *spa, uint64_t guid) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); - /* See comment in spa_vdev_enter() */ - while (spa->spa_export_thread != NULL && - spa->spa_export_thread != curthread) { - cv_wait(&spa_namespace_cv, &spa_namespace_lock); - } + ASSERT0(spa->spa_export_thread); vdev_autotrim_stop_all(spa); From f625d038d2ae59fa1ae81b76079da464ed6db61a Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Tue, 7 May 2024 13:53:38 -0600 Subject: [PATCH 066/113] tpool_dispatch: fail if it cannot start at least 1 worker. Sponsored by: Axcient Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Alan Somers Closes #16178 --- lib/libtpool/thread_pool.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/lib/libtpool/thread_pool.c b/lib/libtpool/thread_pool.c index 7802f8c175..9bf9cdf5dc 100644 --- a/lib/libtpool/thread_pool.c +++ b/lib/libtpool/thread_pool.c @@ -423,6 +423,26 @@ tpool_dispatch(tpool_t *tpool, void (*func)(void *), void *arg) pthread_mutex_lock(&tpool->tp_mutex); + if (!(tpool->tp_flags & TP_SUSPEND)) { + if (tpool->tp_idle > 0) + (void) pthread_cond_signal(&tpool->tp_workcv); + else if (tpool->tp_current >= tpool->tp_maximum) { + /* At worker limit. Leave task on queue */ + } else { + if (create_worker(tpool) == 0) { + /* Started a new worker thread */ + tpool->tp_current++; + } else if (tpool->tp_current > 0) { + /* Leave task on queue */ + } else { + /* Cannot start a single worker! 
*/ + pthread_mutex_unlock(&tpool->tp_mutex); + free(job); + return (-1); + } + } + } + if (tpool->tp_head == NULL) tpool->tp_head = job; else @@ -430,14 +450,6 @@ tpool_dispatch(tpool_t *tpool, void (*func)(void *), void *arg) tpool->tp_tail = job; tpool->tp_njobs++; - if (!(tpool->tp_flags & TP_SUSPEND)) { - if (tpool->tp_idle > 0) - (void) pthread_cond_signal(&tpool->tp_workcv); - else if (tpool->tp_current < tpool->tp_maximum && - create_worker(tpool) == 0) - tpool->tp_current++; - } - pthread_mutex_unlock(&tpool->tp_mutex); return (0); } From eced2e2f1e56b54753702da52a88fccbe73b3dcb Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Tue, 7 May 2024 14:21:31 -0600 Subject: [PATCH 067/113] libzfs: Fix mounting datasets under thread limit pressure During parallel zpool import, /sbin/zpool will create a separate thread pool for each pool, used to mount that pool's datasets. If the total thread count exceeds the system's limit on threads per process, then tpool_dispatch may fail. If it does, directly execute the mount operation instead.
Sponsored by: Axcient Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Alan Somers Closes #16178 Fixes #16172 --- lib/libzfs/libzfs_mount.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c index ec6ebad2f1..3084e05e4d 100644 --- a/lib/libzfs/libzfs_mount.c +++ b/lib/libzfs/libzfs_mount.c @@ -1098,7 +1098,10 @@ zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles, mnt_param->mnt_func = func; mnt_param->mnt_data = data; - (void) tpool_dispatch(tp, zfs_mount_task, (void*)mnt_param); + if (tpool_dispatch(tp, zfs_mount_task, (void*)mnt_param)) { + /* Could not dispatch to thread pool; execute directly */ + zfs_mount_task((void*)mnt_param); + } } /* From b64afa41d56e98b5817aaf14c7deb0fa7e2142fb Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Wed, 8 May 2024 10:01:22 -0600 Subject: [PATCH 068/113] Better control the thread pool size when mounting datasets Ever since a10d50f999, ZFS has mounted file systems in parallel when importing a pool. It uses a fixed size of 512 for the thread pool. But since c183d164aa1, it has also imported pools in parallel. So the total number of threads at one time is 513 * npools + 1. That can easily exceed the system's limit on the number of threads per process, which will cause one or more pools to be unable to allocate any worker threads, forcing them to fall back to slow serial mounting. To forestall that, manage the threadpool size in /sbin/zpool, not libzfs. Use the same size (512), but divided by the number of pools. This is a backwards-incompatible change to the libzfs abi.
Sponsored by: Axcient Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Alan Somers Closes #16178 --- cmd/zed/agents/zfs_mod.c | 2 +- cmd/zfs/zfs_main.c | 6 ++++-- cmd/zpool/zpool_main.c | 20 +++++++++++++++----- include/libzfs.h | 5 +++-- lib/libzfs/libzfs.abi | 3 ++- lib/libzfs/libzfs_mount.c | 25 +++++++++++++------------ 6 files changed, 38 insertions(+), 23 deletions(-) diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 69163b80bd..d0372608c7 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -702,7 +702,7 @@ zfs_enable_ds(void *arg) { unavailpool_t *pool = (unavailpool_t *)arg; - (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0); + (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0, 512); zpool_close(pool->uap_zhp); free(pool); } diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 0bbdd5b18e..75c0e40b61 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -7192,6 +7192,8 @@ share_mount(int op, int argc, char **argv) int c, ret = 0; char *options = NULL; int flags = 0; + const uint_t mount_nthr = 512; + uint_t nthr; /* check options */ while ((c = getopt(argc, argv, op == OP_MOUNT ? ":aRlvo:Of" : "al")) @@ -7310,9 +7312,9 @@ share_mount(int op, int argc, char **argv) * be serialized so that we can prompt the user for their keys * in a consistent manner. */ + nthr = op == OP_MOUNT && !(flags & MS_CRYPT) ? 
mount_nthr : 1; zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used, - share_mount_one_cb, &share_mount_state, - op == OP_MOUNT && !(flags & MS_CRYPT)); + share_mount_one_cb, &share_mount_state, nthr); zfs_commit_shares(NULL); ret = share_mount_state.sm_status; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 400f4bf1a6..d47e1cda9c 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -80,6 +80,8 @@ libzfs_handle_t *g_zfs; +static int mount_tp_nthr = 512; /* tpool threads for multi-threaded mounting */ + static int zpool_do_create(int, char **); static int zpool_do_destroy(int, char **); @@ -3418,7 +3420,7 @@ zfs_force_import_required(nvlist_t *config) */ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, - nvlist_t *props, int flags) + nvlist_t *props, int flags, uint_t mntthreads) { int ret = 0; int ms_status = 0; @@ -3518,7 +3520,7 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && !(flags & ZFS_IMPORT_ONLY)) { - ms_status = zpool_enable_datasets(zhp, mntopts, 0); + ms_status = zpool_enable_datasets(zhp, mntopts, 0, mntthreads); if (ms_status == EZFS_SHAREFAILED) { (void) fprintf(stderr, gettext("Import was " "successful, but unable to share some datasets\n")); @@ -3537,6 +3539,7 @@ typedef struct import_parameters { const char *ip_mntopts; nvlist_t *ip_props; int ip_flags; + uint_t ip_mntthreads; int *ip_err; } import_parameters_t; @@ -3545,7 +3548,7 @@ do_import_task(void *arg) { import_parameters_t *ip = arg; *ip->ip_err |= do_import(ip->ip_config, NULL, ip->ip_mntopts, - ip->ip_props, ip->ip_flags); + ip->ip_props, ip->ip_flags, ip->ip_mntthreads); free(ip); } @@ -3559,6 +3562,7 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, uint64_t pool_state; boolean_t pool_specified = (import->poolname != NULL || import->guid != 0); + uint_t npools = 0; tpool_t *tp = NULL; @@ -3576,6 +3580,10 @@ 
import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, int err = 0; nvpair_t *elem = NULL; boolean_t first = B_TRUE; + if (!pool_specified && import->do_all) { + while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) + npools++; + } while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, &config) == 0); @@ -3606,6 +3614,7 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, ip->ip_mntopts = mntopts; ip->ip_props = props; ip->ip_flags = flags; + ip->ip_mntthreads = mount_tp_nthr / npools; ip->ip_err = &err; (void) tpool_dispatch(tp, do_import_task, @@ -3673,7 +3682,7 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, err = B_TRUE; } else { err |= do_import(found_config, new_name, - mntopts, props, flags); + mntopts, props, flags, mount_tp_nthr); } } @@ -7217,7 +7226,8 @@ zpool_do_split(int argc, char **argv) } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { - ms_status = zpool_enable_datasets(zhp, mntopts, 0); + ms_status = zpool_enable_datasets(zhp, mntopts, 0, + mount_tp_nthr); if (ms_status == EZFS_SHAREFAILED) { (void) fprintf(stderr, gettext("Split was successful, " "datasets are mounted but sharing of some datasets " diff --git a/include/libzfs.h b/include/libzfs.h index 2823b88458..7836c2325f 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -716,7 +716,7 @@ typedef struct get_all_cb { } get_all_cb_t; _LIBZFS_H void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, - size_t, zfs_iter_f, void *, boolean_t); + size_t, zfs_iter_f, void *, uint_t); _LIBZFS_H void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); /* @@ -1004,7 +1004,8 @@ _LIBZFS_H int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, * Enable and disable datasets within a pool by mounting/unmounting and * sharing/unsharing them. 
*/ -_LIBZFS_H int zpool_enable_datasets(zpool_handle_t *, const char *, int); +_LIBZFS_H int zpool_enable_datasets(zpool_handle_t *, const char *, int, + uint_t); _LIBZFS_H int zpool_disable_datasets(zpool_handle_t *, boolean_t); _LIBZFS_H void zpool_disable_datasets_os(zpool_handle_t *, boolean_t); _LIBZFS_H void zpool_disable_volume_os(const char *); diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 2bbaae6345..c3efb29841 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -5532,13 +5532,14 @@ - + + diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c index 3084e05e4d..42988bf9cb 100644 --- a/lib/libzfs/libzfs_mount.c +++ b/lib/libzfs/libzfs_mount.c @@ -83,8 +83,6 @@ #include #define MAXISALEN 257 /* based on sysinfo(2) man page */ -static int mount_tp_nthr = 512; /* tpool threads for multi-threaded mounting */ - static void zfs_mount_task(void *); static const proto_table_t proto_table[SA_PROTOCOL_COUNT] = { @@ -1205,19 +1203,20 @@ out: * * Callbacks are issued in one of two ways: * - * 1. Sequentially: If the parallel argument is B_FALSE or the ZFS_SERIAL_MOUNT + * 1. Sequentially: If the nthr argument is <= 1 or the ZFS_SERIAL_MOUNT * environment variable is set, then we issue callbacks sequentially. * - * 2. In parallel: If the parallel argument is B_TRUE and the ZFS_SERIAL_MOUNT + * 2. In parallel: If the nthr argument is > 1 and the ZFS_SERIAL_MOUNT * environment variable is not set, then we use a tpool to dispatch threads * to mount filesystems in parallel. This function dispatches tasks to mount * the filesystems at the top-level mountpoints, and these tasks in turn * are responsible for recursively mounting filesystems in their children - * mountpoints. + * mountpoints. The value of the nthr argument will be the number of worker + * threads for the thread pool. 
*/ void zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles, - size_t num_handles, zfs_iter_f func, void *data, boolean_t parallel) + size_t num_handles, zfs_iter_f func, void *data, uint_t nthr) { zoneid_t zoneid = getzoneid(); @@ -1226,7 +1225,7 @@ zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles, * variable that can be used as a convenience to do a/b comparison * of serial vs. parallel mounting. */ - boolean_t serial_mount = !parallel || + boolean_t serial_mount = nthr <= 1 || (getenv("ZFS_SERIAL_MOUNT") != NULL); /* @@ -1246,7 +1245,7 @@ zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles, * Issue the callback function for each dataset using a parallel * algorithm that uses a thread pool to manage threads. */ - tpool_t *tp = tpool_create(1, mount_tp_nthr, 0, NULL); + tpool_t *tp = tpool_create(1, nthr, 0, NULL); /* * There may be multiple "top level" mountpoints outside of the pool's @@ -1273,10 +1272,12 @@ zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles, /* * Mount and share all datasets within the given pool. This assumes that no - * datasets within the pool are currently mounted. + * datasets within the pool are currently mounted. nthr will be number of + * worker threads to use while mounting datasets. 
*/ int -zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) +zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags, + uint_t nthr) { get_all_cb_t cb = { 0 }; mount_state_t ms = { 0 }; @@ -1302,7 +1303,7 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) ms.ms_mntopts = mntopts; ms.ms_mntflags = flags; zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, - zfs_mount_one, &ms, B_TRUE); + zfs_mount_one, &ms, nthr); if (ms.ms_mntstatus != 0) ret = EZFS_MOUNTFAILED; @@ -1313,7 +1314,7 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) */ ms.ms_mntstatus = 0; zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, - zfs_share_one, &ms, B_FALSE); + zfs_share_one, &ms, 1); if (ms.ms_mntstatus != 0) ret = EZFS_SHAREFAILED; else From cc38691534310ba22ddc80fedbc10a7ac55237fd Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 2 May 2024 11:57:23 +1000 Subject: [PATCH 069/113] zfs_ioc_send: use a dedicated taskq thread for send When stack space is tight, the stream is written to its target on a separate taskq thread to make sure there's enough stack space to complete it. This has always used an IO taskq, but that doesn't really make sense for it, and moving it onto a regular taskq lets us get rid of spa_taskq_dispatch_sync(), which is not used anywhere else. Stream writes may block for a long time depending on what the target is, and we have no way of discovering this, so we can't risk using the system taskq, as there may be many tens of sends in progress. Instead, we create a dedicated taskq thread for each send writer to run on, and clean it up when it's done. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16151 --- module/zfs/zfs_ioctl.c | 103 ++++++++++++++++++++++++++++------------- 1 file changed, 70 insertions(+), 33 deletions(-) diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 908b9efc18..b720b4f222 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -38,7 +38,7 @@ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. - * Copyright (c) 2019, 2021, Klara Inc. + * Copyright (c) 2019, 2021, 2024, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright 2024 Oxide Computer Company */ @@ -5514,6 +5514,14 @@ out: return (error); } +/* + * When stack space is limited, we write replication stream data to the target + * on a separate taskq thread, to make sure there's enough stack space. + */ +#ifndef HAVE_LARGE_STACKS +#define USE_SEND_TASKQ 1 +#endif + typedef struct dump_bytes_io { zfs_file_t *dbi_fp; caddr_t dbi_buf; @@ -5534,31 +5542,65 @@ dump_bytes_cb(void *arg) dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL); } +typedef struct dump_bytes_arg { + zfs_file_t *dba_fp; +#ifdef USE_SEND_TASKQ + taskq_t *dba_tq; + taskq_ent_t dba_tqent; +#endif +} dump_bytes_arg_t; + static int dump_bytes(objset_t *os, void *buf, int len, void *arg) { + dump_bytes_arg_t *dba = (dump_bytes_arg_t *)arg; dump_bytes_io_t dbi; - dbi.dbi_fp = arg; + dbi.dbi_fp = dba->dba_fp; dbi.dbi_buf = buf; dbi.dbi_len = len; -#if defined(HAVE_LARGE_STACKS) - dump_bytes_cb(&dbi); +#ifdef USE_SEND_TASKQ + taskq_dispatch_ent(dba->dba_tq, dump_bytes_cb, &dbi, TQ_SLEEP, + &dba->dba_tqent); + taskq_wait(dba->dba_tq); #else - /* - * The vn_rdwr() call is performed in a taskq to ensure that there is - * always enough stack space to write safely to the target filesystem. 
- * The ZIO_TYPE_FREE threads are used because there can be a lot of - * them and they are used in vdev_file.c for a similar purpose. - */ - spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE, - ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); -#endif /* HAVE_LARGE_STACKS */ + dump_bytes_cb(&dbi); +#endif return (dbi.dbi_err); } +static int +dump_bytes_init(dump_bytes_arg_t *dba, int fd, dmu_send_outparams_t *out) +{ + zfs_file_t *fp = zfs_file_get(fd); + if (fp == NULL) + return (SET_ERROR(EBADF)); + + dba->dba_fp = fp; +#ifdef USE_SEND_TASKQ + dba->dba_tq = taskq_create("z_send", 1, defclsyspri, 0, 0, 0); + taskq_init_ent(&dba->dba_tqent); +#endif + + memset(out, 0, sizeof (dmu_send_outparams_t)); + out->dso_outfunc = dump_bytes; + out->dso_arg = dba; + out->dso_dryrun = B_FALSE; + + return (0); +} + +static void +dump_bytes_fini(dump_bytes_arg_t *dba) +{ + zfs_file_put(dba->dba_fp); +#ifdef USE_SEND_TASKQ + taskq_destroy(dba->dba_tq); +#endif +} + /* * inputs: * zc_name name of snapshot to send @@ -5643,21 +5685,18 @@ zfs_ioc_send(zfs_cmd_t *zc) dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { - zfs_file_t *fp; - dmu_send_outparams_t out = {0}; + dump_bytes_arg_t dba; + dmu_send_outparams_t out; + error = dump_bytes_init(&dba, zc->zc_cookie, &out); + if (error) + return (error); - if ((fp = zfs_file_get(zc->zc_cookie)) == NULL) - return (SET_ERROR(EBADF)); - - off = zfs_file_off(fp); - out.dso_outfunc = dump_bytes; - out.dso_arg = fp; - out.dso_dryrun = B_FALSE; + off = zfs_file_off(dba.dba_fp); error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, compressok, rawok, savedok, zc->zc_cookie, &off, &out); - zfs_file_put(fp); + dump_bytes_fini(&dba); } return (error); } @@ -6604,7 +6643,6 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) offset_t off; const char *fromname = NULL; int fd; - zfs_file_t *fp; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; @@ 
-6629,20 +6667,19 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); - if ((fp = zfs_file_get(fd)) == NULL) - return (SET_ERROR(EBADF)); + dump_bytes_arg_t dba; + dmu_send_outparams_t out; + error = dump_bytes_init(&dba, fd, &out); + if (error) + return (error); - off = zfs_file_off(fp); - - dmu_send_outparams_t out = {0}; - out.dso_outfunc = dump_bytes; - out.dso_arg = fp; - out.dso_dryrun = B_FALSE; + off = zfs_file_off(dba.dba_fp); error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactbook, fd, &off, &out); - zfs_file_put(fp); + dump_bytes_fini(&dba); + return (error); } From adda768e3eb931b82e8477eb9287f7ca9c881a98 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 2 May 2024 12:04:24 +1000 Subject: [PATCH 070/113] spa: remove spa_taskq_dispatch_sync() It has no callers anymore. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16151 --- include/sys/spa_impl.h | 2 -- module/zfs/spa.c | 13 ------------- 2 files changed, 15 deletions(-) diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 2004166781..366410acf2 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -481,8 +481,6 @@ extern const char *zfs_deadman_failmode; extern uint_t spa_slop_shift; extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, zio_t *zio); -extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags); extern void spa_load_spares(spa_t *spa); extern void spa_load_l2cache(spa_t *spa); extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 9eb14b4f1d..a1258546c8 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1520,19 +1520,6 @@ spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, taskq_dispatch_ent(tq, func, arg, flags, ent); } -/* - * Same as spa_taskq_dispatch_ent() but block on the task until completion. - */ -void -spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags) -{ - taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL); - taskqid_t id = taskq_dispatch(tq, func, arg, flags); - if (id) - taskq_wait_id(tq, id); -} - static void spa_create_zio_taskqs(spa_t *spa) { From 515c4dd2130a2c986640522d298a224ddd7e6018 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 2 May 2024 12:06:58 +1000 Subject: [PATCH 071/113] spa: flatten spa_taskq_dispatch_ent() It is the only user of spa_taskq_dispatch_select(), so might as well just carry it directly. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16151 --- module/zfs/spa.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index a1258546c8..930c527b63 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1490,8 +1490,9 @@ spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) * Note that a type may have multiple discrete taskqs to avoid lock contention * on the taskq itself. */ -static taskq_t * -spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q, +void +spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, zio_t *zio) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; @@ -1508,15 +1509,7 @@ spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q, } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } - return (tq); -} -void -spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, - zio_t *zio) -{ - taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio); taskq_dispatch_ent(tq, func, arg, flags, ent); } From 0a543db37111c28085043da89e452ea6b131c019 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 2 May 2024 12:13:38 +1000 Subject: [PATCH 072/113] spa_taskq_dispatch_ent: simplify arguments This renames it to spa_taskq_dispatch(), and reduces and simplifies its arguments based on these observations from its two call sites: - arg is always the zio, so it can be typed that way, and we don't need to provide it twice; - ent is always &zio->io_tqent, and zio is always provided, so we can use it directly; - the only flag used is TQ_FRONT, which can just be a bool; - zio != NULL was part of the "use allocator" test, but it never would have got that far, because that arg was only set to NULL in the reexecute path, which is forced to type CLAIM, so the 
condition would fail at t == WRITE anyway. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16151 --- include/sys/spa_impl.h | 4 ++-- module/zfs/spa.c | 18 +++++++++++++----- module/zfs/zio.c | 17 ++++------------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 366410acf2..5605a35b86 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -479,8 +479,8 @@ struct spa { extern char *spa_config_path; extern const char *zfs_deadman_failmode; extern uint_t spa_slop_shift; -extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, zio_t *zio); +extern void spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, zio_t *zio, boolean_t cutinline); extern void spa_load_spares(spa_t *spa); extern void spa_load_l2cache(spa_t *spa); extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 930c527b63..d762f21a37 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1491,9 +1491,8 @@ spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) * on the taskq itself. */ void -spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, - zio_t *zio) +spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, zio_t *zio, boolean_t cutinline) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; @@ -1501,16 +1500,25 @@ spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); + /* + * NB: We are assuming that the zio can only be dispatched + * to a single taskq at a time. 
It would be a grievous error + * to dispatch the zio to another taskq at the same time. + */ + ASSERT(zio); + ASSERT(taskq_empty_ent(&zio->io_tqent)); + if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && - (zio != NULL) && ZIO_HAS_ALLOCATOR(zio)) { + ZIO_HAS_ALLOCATOR(zio)) { tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } - taskq_dispatch_ent(tq, func, arg, flags, ent); + taskq_dispatch_ent(tq, func, zio, cutinline ? TQ_FRONT : 0, + &zio->io_tqent); } static void diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 65a0afaaa2..d68d5ababe 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2023,7 +2023,6 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; - int flags = (cutinline ? TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and @@ -2047,19 +2046,12 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) if (spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; else - flags |= TQ_FRONT; + cutinline = B_TRUE; } ASSERT3U(q, <, ZIO_TASKQ_TYPES); - /* - * NB: We are assuming that the zio can only be dispatched - * to a single taskq at a time. It would be a grievous error - * to dispatch the zio to another taskq at the same time. - */ - ASSERT(taskq_empty_ent(&zio->io_tqent)); - spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags, - &zio->io_tqent, zio); + spa_taskq_dispatch(spa, t, q, zio_execute, zio, cutinline); } static boolean_t @@ -5007,10 +4999,9 @@ zio_done(zio_t *zio) * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. 
*/ - ASSERT(taskq_empty_ent(&zio->io_tqent)); - spa_taskq_dispatch_ent(zio->io_spa, + spa_taskq_dispatch(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, - zio_reexecute, zio, 0, &zio->io_tqent, NULL); + zio_reexecute, zio, B_FALSE); } return (NULL); } From 91c46d4399e42b2b14ae65ae8637061b67adbd82 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 10 May 2024 09:56:48 +1000 Subject: [PATCH 073/113] zdb: bring crash handling over from ztest ztest has a very nice ability to show a backtrace when there's an unexpected crash. zdb is used often enough on corrupted data and can blow up too, so nice output is useful there too. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16181 --- cmd/zdb/zdb.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 797ae34b6e..f3274a65db 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -85,6 +85,9 @@ #include #include #include +#if (__GLIBC__ && !__UCLIBC__) +#include /* for backtrace() */ +#endif #include #include @@ -828,11 +831,41 @@ usage(void) static void dump_debug_buffer(void) { - if (dump_opt['G']) { - (void) printf("\n"); - (void) fflush(stdout); - zfs_dbgmsg_print("zdb"); - } + ssize_t ret __attribute__((unused)); + + if (!dump_opt['G']) + return; + /* + * We use write() instead of printf() so that this function + * is safe to call from a signal handler. 
+ */ + ret = write(STDOUT_FILENO, "\n", 1); + zfs_dbgmsg_print("zdb"); +} + +#define BACKTRACE_SZ 100 + +static void sig_handler(int signo) +{ + struct sigaction action; +#if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */ + int nptrs; + void *buffer[BACKTRACE_SZ]; + + nptrs = backtrace(buffer, BACKTRACE_SZ); + backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); +#endif + dump_debug_buffer(); + + /* + * Restore default action and re-raise signal so SIGSEGV and + * SIGABRT can trigger a core dump. + */ + action.sa_handler = SIG_DFL; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + (void) sigaction(signo, &action, NULL); + raise(signo); } /* @@ -8899,9 +8932,27 @@ main(int argc, char **argv) char *spa_config_path_env, *objset_str; boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE; nvlist_t *cfg = NULL; + struct sigaction action; dprintf_setup(&argc, argv); + /* + * Set up signal handlers, so if we crash due to bad on-disk data we + * can get more info. Unlike ztest, we don't bail out if we can't set + * up signal handlers, because zdb is very useful without them. + */ + action.sa_handler = sig_handler; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + if (sigaction(SIGSEGV, &action, NULL) < 0) { + (void) fprintf(stderr, "zdb: cannot catch SIGSEGV: %s\n", + strerror(errno)); + } + if (sigaction(SIGABRT, &action, NULL) < 0) { + (void) fprintf(stderr, "zdb: cannot catch SIGABRT: %s\n", + strerror(errno)); + } + /* * If there is an environment variable SPA_CONFIG_PATH it overrides * default spa_config_path setting. If -U flag is specified it will From e7b451941b92e2bdbb9c08bb4283c9a39d5571c6 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 10 May 2024 10:19:48 +1000 Subject: [PATCH 074/113] zdb/ztest: use libspl backtrace for crashes We can show much nicer backtraces these days, lets use them. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16181 --- cmd/zdb/zdb.c | 12 +----------- cmd/ztest.c | 12 +----------- lib/libspl/assert.c | 9 ++++++--- lib/libspl/include/assert.h | 2 ++ 4 files changed, 10 insertions(+), 25 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index f3274a65db..908e4e0ab2 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -85,9 +85,6 @@ #include #include #include -#if (__GLIBC__ && !__UCLIBC__) -#include /* for backtrace() */ -#endif #include #include @@ -843,18 +840,11 @@ dump_debug_buffer(void) zfs_dbgmsg_print("zdb"); } -#define BACKTRACE_SZ 100 - static void sig_handler(int signo) { struct sigaction action; -#if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */ - int nptrs; - void *buffer[BACKTRACE_SZ]; - nptrs = backtrace(buffer, BACKTRACE_SZ); - backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); -#endif + libspl_dump_backtrace(); dump_debug_buffer(); /* diff --git a/cmd/ztest.c b/cmd/ztest.c index 56eb01618c..ccfe71c295 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -136,9 +136,6 @@ #include #include #include -#if (__GLIBC__ && !__UCLIBC__) -#include /* for backtrace() */ -#endif static int ztest_fd_data = -1; static int ztest_fd_rand = -1; @@ -621,18 +618,11 @@ dump_debug_buffer(void) zfs_dbgmsg_print("ztest"); } -#define BACKTRACE_SZ 100 - static void sig_handler(int signo) { struct sigaction action; -#if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */ - int nptrs; - void *buffer[BACKTRACE_SZ]; - nptrs = backtrace(buffer, BACKTRACE_SZ); - backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); -#endif + libspl_dump_backtrace(); dump_debug_buffer(); /* diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index 5b12c14acd..79b640d895 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -55,7 +55,7 @@ #define UNW_LOCAL_ONLY #include -static inline void +void libspl_dump_backtrace(void) { unw_context_t uc; @@ -85,7 +85,7 @@ libspl_dump_backtrace(void) #elif 
defined(HAVE_BACKTRACE) #include -static inline void +void libspl_dump_backtrace(void) { void *btptrs[100]; @@ -97,7 +97,10 @@ libspl_dump_backtrace(void) free(bt); } #else -#define libspl_dump_backtrace() +void +libspl_dump_backtrace(void) +{ +} #endif #if defined(__APPLE__) diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h index 155bbab302..126f2db241 100644 --- a/lib/libspl/include/assert.h +++ b/lib/libspl/include/assert.h @@ -60,6 +60,8 @@ libspl_assert(const char *buf, const char *file, const char *func, int line) return (0); } +extern void libspl_dump_backtrace(void); + #ifdef verify #undef verify #endif From 3974ef045ef270e72be6ca1d20baf67bfbecfbe5 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 10 May 2024 11:26:11 +1000 Subject: [PATCH 075/113] libspl: lift backtrace into a separate file If it's going to be used directly by zdb/ztest, then it sort of doesn't make sense to carry it with the assert code. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16181 --- cmd/zdb/zdb.c | 3 +- cmd/ztest.c | 3 +- lib/libnvpair/libnvpair.abi | 111 +++++++++++++++++++++++++++-- lib/libspl/Makefile.am | 3 +- lib/libspl/assert.c | 55 +------------- lib/libspl/backtrace.c | 80 +++++++++++++++++++++ lib/libspl/include/Makefile.am | 1 + lib/libspl/include/assert.h | 2 - lib/libspl/include/sys/backtrace.h | 32 +++++++++ lib/libuutil/libuutil.abi | 85 ++++++++++++++++++++-- lib/libzfs/libzfs.abi | 41 ++++++++++- lib/libzfs_core/libzfs_core.abi | 89 ++++++++++++++++++++--- 12 files changed, 426 insertions(+), 79 deletions(-) create mode 100644 lib/libspl/backtrace.c create mode 100644 lib/libspl/include/sys/backtrace.h diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 908e4e0ab2..01d584844c 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -85,6 +85,7 @@ #include #include #include +#include #include #include @@ -844,7 +845,7 @@ static void sig_handler(int signo) { struct sigaction action; - libspl_dump_backtrace(); + libspl_backtrace(); dump_debug_buffer(); /* diff --git a/cmd/ztest.c b/cmd/ztest.c index ccfe71c295..d6f22d04a6 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -136,6 +136,7 @@ #include #include #include +#include static int ztest_fd_data = -1; static int ztest_fd_rand = -1; @@ -622,7 +623,7 @@ static void sig_handler(int signo) { struct sigaction action; - libspl_dump_backtrace(); + libspl_backtrace(); dump_debug_buffer(); /* diff --git a/lib/libnvpair/libnvpair.abi b/lib/libnvpair/libnvpair.abi index ef92f3e9bd..69009375e8 100644 --- a/lib/libnvpair/libnvpair.abi +++ b/lib/libnvpair/libnvpair.abi @@ -79,6 +79,7 @@ + @@ -1156,6 +1157,11 @@ + + + + + @@ -2041,9 +2047,77 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -2051,11 +2125,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -2536,11 +2642,6 @@ - - - - - diff --git a/lib/libspl/Makefile.am 
b/lib/libspl/Makefile.am index 94be416d46..f8943572bf 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -7,7 +7,8 @@ noinst_LTLIBRARIES += libspl_assert.la libspl.la CPPCHECKTARGETS += libspl_assert.la libspl.la libspl_assert_la_SOURCES = \ - %D%/assert.c + %D%/assert.c \ + %D%/backtrace.c libspl_la_SOURCES = \ %D%/libspl_impl.h \ diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index 79b640d895..ff0d17ba2a 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -28,6 +28,7 @@ #include #include +#include #if defined(__linux__) #include @@ -51,58 +52,6 @@ pthread_getname_np(pthread_self(), buf, len); #endif -#if defined(HAVE_LIBUNWIND) -#define UNW_LOCAL_ONLY -#include - -void -libspl_dump_backtrace(void) -{ - unw_context_t uc; - unw_cursor_t cp; - unw_word_t ip, off; - char funcname[128]; -#ifdef HAVE_LIBUNWIND_ELF - char objname[128]; - unw_word_t objoff; -#endif - - fprintf(stderr, "Call trace:\n"); - unw_getcontext(&uc); - unw_init_local(&cp, &uc); - while (unw_step(&cp) > 0) { - unw_get_reg(&cp, UNW_REG_IP, &ip); - unw_get_proc_name(&cp, funcname, sizeof (funcname), &off); -#ifdef HAVE_LIBUNWIND_ELF - unw_get_elf_filename(&cp, objname, sizeof (objname), &objoff); - fprintf(stderr, " [0x%08lx] %s+0x%2lx (in %s +0x%2lx)\n", - ip, funcname, off, objname, objoff); -#else - fprintf(stderr, " [0x%08lx] %s+0x%2lx\n", ip, funcname, off); -#endif - } -} -#elif defined(HAVE_BACKTRACE) -#include - -void -libspl_dump_backtrace(void) -{ - void *btptrs[100]; - size_t nptrs = backtrace(btptrs, 100); - char **bt = backtrace_symbols(btptrs, nptrs); - fprintf(stderr, "Call trace:\n"); - for (size_t i = 0; i < nptrs; i++) - fprintf(stderr, " %s\n", bt[i]); - free(bt); -} -#else -void -libspl_dump_backtrace(void) -{ -} -#endif - #if defined(__APPLE__) static inline uint64_t libspl_gettid(void) @@ -154,7 +103,7 @@ libspl_assertf(const char *file, const char *func, int line, getpid(), libspl_getprogname(), libspl_gettid(), tname); - 
libspl_dump_backtrace(); + libspl_backtrace(); #if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__) if (libspl_assert_ok) { diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c new file mode 100644 index 0000000000..0e653cd964 --- /dev/null +++ b/lib/libspl/backtrace.c @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2024, Rob Norris + * Copyright (c) 2024, Klara Inc. 
+ */ + +#include +#include + +#if defined(HAVE_LIBUNWIND) +#define UNW_LOCAL_ONLY +#include + +void +libspl_backtrace(void) +{ + unw_context_t uc; + unw_cursor_t cp; + unw_word_t ip, off; + char funcname[128]; +#ifdef HAVE_LIBUNWIND_ELF + char objname[128]; + unw_word_t objoff; +#endif + + fprintf(stderr, "Call trace:\n"); + unw_getcontext(&uc); + unw_init_local(&cp, &uc); + while (unw_step(&cp) > 0) { + unw_get_reg(&cp, UNW_REG_IP, &ip); + unw_get_proc_name(&cp, funcname, sizeof (funcname), &off); +#ifdef HAVE_LIBUNWIND_ELF + unw_get_elf_filename(&cp, objname, sizeof (objname), &objoff); + fprintf(stderr, " [0x%08lx] %s+0x%2lx (in %s +0x%2lx)\n", + ip, funcname, off, objname, objoff); +#else + fprintf(stderr, " [0x%08lx] %s+0x%2lx\n", ip, funcname, off); +#endif + } +} +#elif defined(HAVE_BACKTRACE) +#include + +void +libspl_backtrace(void) +{ + void *btptrs[100]; + size_t nptrs = backtrace(btptrs, 100); + char **bt = backtrace_symbols(btptrs, nptrs); + fprintf(stderr, "Call trace:\n"); + for (size_t i = 0; i < nptrs; i++) + fprintf(stderr, " %s\n", bt[i]); + free(bt); +} +#else +void +libspl_backtrace(void) +{ +} +#endif + diff --git a/lib/libspl/include/Makefile.am b/lib/libspl/include/Makefile.am index 2c1d21edf1..4ad3b854cb 100644 --- a/lib/libspl/include/Makefile.am +++ b/lib/libspl/include/Makefile.am @@ -27,6 +27,7 @@ libspl_sys_HEADERS = \ %D%/sys/acl.h \ %D%/sys/acl_impl.h \ %D%/sys/asm_linkage.h \ + %D%/sys/backtrace.h \ %D%/sys/callb.h \ %D%/sys/cmn_err.h \ %D%/sys/cred.h \ diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h index 126f2db241..155bbab302 100644 --- a/lib/libspl/include/assert.h +++ b/lib/libspl/include/assert.h @@ -60,8 +60,6 @@ libspl_assert(const char *buf, const char *file, const char *func, int line) return (0); } -extern void libspl_dump_backtrace(void); - #ifdef verify #undef verify #endif diff --git a/lib/libspl/include/sys/backtrace.h b/lib/libspl/include/sys/backtrace.h new file mode 100644 index 
0000000000..97ee7740ce --- /dev/null +++ b/lib/libspl/include/sys/backtrace.h @@ -0,0 +1,32 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2024, Rob Norris + * Copyright (c) 2024, Klara Inc. 
+ */ + +#ifndef _LIBSPL_SYS_BACKTRACE_H +#define _LIBSPL_SYS_BACKTRACE_H + +void libspl_backtrace(void); + +#endif diff --git a/lib/libuutil/libuutil.abi b/lib/libuutil/libuutil.abi index e942d24c65..2ed2fb2e41 100644 --- a/lib/libuutil/libuutil.abi +++ b/lib/libuutil/libuutil.abi @@ -149,6 +149,7 @@ + @@ -242,6 +243,22 @@ + + + + + + + + + + + + + + + + @@ -576,6 +593,27 @@ + + + + + + + + + + + + + + + + + + + + + @@ -596,14 +634,11 @@ - + - - - - + @@ -800,9 +835,16 @@ + + + + + + + @@ -912,6 +954,25 @@ + + + + + + + + + + + + + + + + + + + @@ -920,12 +981,23 @@ + + + + + + + + + + + @@ -937,8 +1009,9 @@ - + + diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index c3efb29841..80f4b7439a 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -191,6 +191,7 @@ + @@ -777,6 +778,18 @@ + + + + + + + + + + + + @@ -1092,6 +1105,19 @@ + + + + + + + + + + + + + @@ -6252,6 +6278,11 @@ + + + + + @@ -6363,7 +6394,7 @@ - + @@ -8672,7 +8703,7 @@ - + @@ -8697,6 +8728,12 @@ + + + + + + diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index 5b95c8f779..cf9d6bddc9 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -132,6 +132,7 @@ + @@ -231,10 +232,18 @@ + + + + + + + + @@ -242,6 +251,14 @@ + + + + + + + + @@ -574,6 +591,27 @@ + + + + + + + + + + + + + + + + + + + + + @@ -594,14 +632,11 @@ - + - - - - + @@ -770,6 +805,13 @@ + + + + + + + @@ -873,12 +915,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -895,8 +967,9 @@ - + + @@ -1119,7 +1192,7 @@ - + @@ -1127,7 +1200,7 @@ - + From 1ea8c59441cd215d4f45cbe839cbfe51c6e32068 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 10 May 2024 13:04:14 +1000 Subject: [PATCH 076/113] backtrace: rework for signal safety Mostly, try a lot harder to not allocate anything. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16181 --- cmd/zdb/zdb.c | 2 +- cmd/ztest.c | 2 +- lib/libspl/assert.c | 2 +- lib/libspl/backtrace.c | 91 +++++++++++++++++++++--------- lib/libspl/include/sys/backtrace.h | 2 +- 5 files changed, 68 insertions(+), 31 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 01d584844c..7c2819d3cf 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -845,7 +845,7 @@ static void sig_handler(int signo) { struct sigaction action; - libspl_backtrace(); + libspl_backtrace(STDERR_FILENO); dump_debug_buffer(); /* diff --git a/cmd/ztest.c b/cmd/ztest.c index d6f22d04a6..b4d63b02dd 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -623,7 +623,7 @@ static void sig_handler(int signo) { struct sigaction action; - libspl_backtrace(); + libspl_backtrace(STDERR_FILENO); dump_debug_buffer(); /* diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index ff0d17ba2a..d11361b387 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -103,7 +103,7 @@ libspl_assertf(const char *file, const char *func, int line, getpid(), libspl_getprogname(), libspl_gettid(), tname); - libspl_backtrace(); + libspl_backtrace(STDERR_FILENO); #if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__) if (libspl_assert_ok) { diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c index 0e653cd964..dd8cb025f4 100644 --- a/lib/libspl/backtrace.c +++ b/lib/libspl/backtrace.c @@ -24,57 +24,94 @@ */ #include -#include +#include +#include + +/* + * libspl_backtrace() must be safe to call from inside a signal hander. This + * mostly means it must not allocate, and so we can't use things like printf. 
+ */ #if defined(HAVE_LIBUNWIND) #define UNW_LOCAL_ONLY #include -void -libspl_backtrace(void) +static size_t +libspl_u64_to_hex_str(uint64_t v, size_t digits, char *buf, size_t buflen) { + static const char hexdigits[] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' + }; + + size_t pos = 0; + boolean_t want = (digits == 0); + for (int i = 15; i >= 0; i--) { + const uint64_t d = v >> (i * 4) & 0xf; + if (!want && (d != 0 || digits > i)) + want = B_TRUE; + if (want) { + buf[pos++] = hexdigits[d]; + if (pos == buflen) + break; + } + } + return (pos); +} + +void +libspl_backtrace(int fd) +{ + ssize_t ret __attribute__((unused)); unw_context_t uc; unw_cursor_t cp; - unw_word_t ip, off; - char funcname[128]; -#ifdef HAVE_LIBUNWIND_ELF - char objname[128]; - unw_word_t objoff; -#endif + unw_word_t loc; + char buf[128]; + size_t n; - fprintf(stderr, "Call trace:\n"); + ret = write(fd, "Call trace:\n", 12); unw_getcontext(&uc); unw_init_local(&cp, &uc); while (unw_step(&cp) > 0) { - unw_get_reg(&cp, UNW_REG_IP, &ip); - unw_get_proc_name(&cp, funcname, sizeof (funcname), &off); + unw_get_reg(&cp, UNW_REG_IP, &loc); + ret = write(fd, " [0x", 5); + n = libspl_u64_to_hex_str(loc, 10, buf, sizeof (buf)); + ret = write(fd, buf, n); + ret = write(fd, "] ", 2); + unw_get_proc_name(&cp, buf, sizeof (buf), &loc); + for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} + ret = write(fd, buf, n); + ret = write(fd, "+0x", 3); + n = libspl_u64_to_hex_str(loc, 2, buf, sizeof (buf)); + ret = write(fd, buf, n); #ifdef HAVE_LIBUNWIND_ELF - unw_get_elf_filename(&cp, objname, sizeof (objname), &objoff); - fprintf(stderr, " [0x%08lx] %s+0x%2lx (in %s +0x%2lx)\n", - ip, funcname, off, objname, objoff); -#else - fprintf(stderr, " [0x%08lx] %s+0x%2lx\n", ip, funcname, off); + ret = write(fd, " (in ", 5); + unw_get_elf_filename(&cp, buf, sizeof (buf), &loc); + for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} + ret = write(fd, buf, n); + ret = 
write(fd, " +0x", 4); + n = libspl_u64_to_hex_str(loc, 2, buf, sizeof (buf)); + ret = write(fd, buf, n); + ret = write(fd, ")", 1); #endif + ret = write(fd, "\n", 1); } } #elif defined(HAVE_BACKTRACE) #include void -libspl_backtrace(void) +libspl_backtrace(int fd) { - void *btptrs[100]; - size_t nptrs = backtrace(btptrs, 100); - char **bt = backtrace_symbols(btptrs, nptrs); - fprintf(stderr, "Call trace:\n"); - for (size_t i = 0; i < nptrs; i++) - fprintf(stderr, " %s\n", bt[i]); - free(bt); + ssize_t ret __attribute__((unused)); + void *btptrs[64]; + size_t nptrs = backtrace(btptrs, 64); + ret = write(fd, "Call trace:\n", 12); + backtrace_symbols_fd(btptrs, nptrs, fd); } #else void -libspl_backtrace(void) +libspl_backtrace(int fd __maybe_unused) { } #endif - diff --git a/lib/libspl/include/sys/backtrace.h b/lib/libspl/include/sys/backtrace.h index 97ee7740ce..f9869ffc9e 100644 --- a/lib/libspl/include/sys/backtrace.h +++ b/lib/libspl/include/sys/backtrace.h @@ -27,6 +27,6 @@ #ifndef _LIBSPL_SYS_BACKTRACE_H #define _LIBSPL_SYS_BACKTRACE_H -void libspl_backtrace(void); +void libspl_backtrace(int fd); #endif From fa99d9cd9cbc6aca3245afcfe321b8226985597d Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 10 May 2024 13:54:08 +1000 Subject: [PATCH 077/113] zfs_dbgmsg_print: make FreeBSD and Linux consistent FreeBSD was using fprintf(), which might not be signal-safe. Meanwhile, Linux's locking did not cover the header output. This two quirks are unrelated, but both have the same response: be like the other one. So with this commit, both functions are the same except for the names of their lock and list variables. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16181 --- module/os/freebsd/zfs/zfs_debug.c | 24 ++++++++++++++++++++---- module/os/linux/zfs/zfs_debug.c | 3 ++- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_debug.c b/module/os/freebsd/zfs/zfs_debug.c index 78d50c6fd8..3e832a9104 100644 --- a/module/os/freebsd/zfs/zfs_debug.c +++ b/module/os/freebsd/zfs/zfs_debug.c @@ -234,13 +234,29 @@ __dprintf(boolean_t dprint, const char *file, const char *func, void zfs_dbgmsg_print(const char *tag) { - zfs_dbgmsg_t *zdm; + ssize_t ret __attribute__((unused)); - (void) printf("ZFS_DBGMSG(%s):\n", tag); mutex_enter(&zfs_dbgmsgs_lock); - for (zdm = list_head(&zfs_dbgmsgs); zdm; + + /* + * We use write() in this function instead of printf() + * so it is safe to call from a signal handler. + */ + ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); + ret = write(STDOUT_FILENO, tag, strlen(tag)); + ret = write(STDOUT_FILENO, ") START:\n", 9); + + for (zfs_dbgmsg_t zdm = list_head(&zfs_dbgmsgs); zdm != NULL; zdm = list_next(&zfs_dbgmsgs, zdm)) - (void) printf("%s\n", zdm->zdm_msg); + ret = write(STDOUT_FILENO, zdm->zdm_msg, + strlen(zdm->zdm_msg)); + ret = write(STDOUT_FILENO, "\n", 1); + } + + ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); + ret = write(STDOUT_FILENO, tag, strlen(tag)); + ret = write(STDOUT_FILENO, ") END\n", 6); + mutex_exit(&zfs_dbgmsgs_lock); } #endif /* _KERNEL */ diff --git a/module/os/linux/zfs/zfs_debug.c b/module/os/linux/zfs/zfs_debug.c index f707959c94..bc5c028dca 100644 --- a/module/os/linux/zfs/zfs_debug.c +++ b/module/os/linux/zfs/zfs_debug.c @@ -225,6 +225,8 @@ zfs_dbgmsg_print(const char *tag) { ssize_t ret __attribute__((unused)); + mutex_enter(&zfs_dbgmsgs.pl_lock); + /* * We use write() in this function instead of printf() * so it is safe to call from a signal handler. 
@@ -233,7 +235,6 @@ zfs_dbgmsg_print(const char *tag) ret = write(STDOUT_FILENO, tag, strlen(tag)); ret = write(STDOUT_FILENO, ") START:\n", 9); - mutex_enter(&zfs_dbgmsgs.pl_lock); for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs.pl_list); zdm != NULL; zdm = list_next(&zfs_dbgmsgs.pl_list, zdm)) { ret = write(STDOUT_FILENO, zdm->zdm_msg, From 3c941d18183455138f7c5dcc212177bd3cea8afc Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 10 May 2024 13:58:26 +1000 Subject: [PATCH 078/113] zdb/ztest: send dbgmsg output to stderr And, make the output fd an arg to zfs_dbgmsg_print(). This is a change in behaviour, but keeps it consistent with where crash traces go, and it's easy to argue this is what we want anyway; this is information about the task, not the actual output of the task. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16181 --- cmd/zdb/zdb.c | 4 ++-- cmd/ztest.c | 4 ++-- include/sys/zfs_debug.h | 2 +- module/os/freebsd/zfs/zfs_debug.c | 25 ++++++++++++------------- module/os/linux/zfs/zfs_debug.c | 19 +++++++++---------- 5 files changed, 26 insertions(+), 28 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 7c2819d3cf..704fcf4422 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -837,8 +837,8 @@ dump_debug_buffer(void) * We use write() instead of printf() so that this function * is safe to call from a signal handler. */ - ret = write(STDOUT_FILENO, "\n", 1); - zfs_dbgmsg_print("zdb"); + ret = write(STDERR_FILENO, "\n", 1); + zfs_dbgmsg_print(STDERR_FILENO, "zdb"); } static void sig_handler(int signo) diff --git a/cmd/ztest.c b/cmd/ztest.c index b4d63b02dd..f77a37c215 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -615,8 +615,8 @@ dump_debug_buffer(void) * We use write() instead of printf() so that this function * is safe to call from a signal handler. 
*/ - ret = write(STDOUT_FILENO, "\n", 1); - zfs_dbgmsg_print("ztest"); + ret = write(STDERR_FILENO, "\n", 1); + zfs_dbgmsg_print(STDERR_FILENO, "ztest"); } static void sig_handler(int signo) diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index 8d94557a58..e509c8b7c6 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -104,7 +104,7 @@ extern void zfs_dbgmsg_fini(void); #ifndef _KERNEL extern int dprintf_find_string(const char *string); -extern void zfs_dbgmsg_print(const char *tag); +extern void zfs_dbgmsg_print(int fd, const char *tag); #endif #ifdef __cplusplus diff --git a/module/os/freebsd/zfs/zfs_debug.c b/module/os/freebsd/zfs/zfs_debug.c index 3e832a9104..c4cebe1020 100644 --- a/module/os/freebsd/zfs/zfs_debug.c +++ b/module/os/freebsd/zfs/zfs_debug.c @@ -232,30 +232,29 @@ __dprintf(boolean_t dprint, const char *file, const char *func, #else void -zfs_dbgmsg_print(const char *tag) +zfs_dbgmsg_print(int fd, const char *tag) { ssize_t ret __attribute__((unused)); - mutex_enter(&zfs_dbgmsgs_lock); - /* * We use write() in this function instead of printf() * so it is safe to call from a signal handler. 
*/ - ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); - ret = write(STDOUT_FILENO, tag, strlen(tag)); - ret = write(STDOUT_FILENO, ") START:\n", 9); + ret = write(fd, "ZFS_DBGMSG(", 11); + ret = write(fd, tag, strlen(tag)); + ret = write(fd, ") START:\n", 9); - for (zfs_dbgmsg_t zdm = list_head(&zfs_dbgmsgs); zdm != NULL; + mutex_enter(&zfs_dbgmsgs_lock); + + for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs); zdm != NULL; zdm = list_next(&zfs_dbgmsgs, zdm)) - ret = write(STDOUT_FILENO, zdm->zdm_msg, - strlen(zdm->zdm_msg)); - ret = write(STDOUT_FILENO, "\n", 1); + ret = write(fd, zdm->zdm_msg, strlen(zdm->zdm_msg)); + ret = write(fd, "\n", 1); } - ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); - ret = write(STDOUT_FILENO, tag, strlen(tag)); - ret = write(STDOUT_FILENO, ") END\n", 6); + ret = write(fd, "ZFS_DBGMSG(", 11); + ret = write(fd, tag, strlen(tag)); + ret = write(fd, ") END\n", 6); mutex_exit(&zfs_dbgmsgs_lock); } diff --git a/module/os/linux/zfs/zfs_debug.c b/module/os/linux/zfs/zfs_debug.c index bc5c028dca..9ee40771fc 100644 --- a/module/os/linux/zfs/zfs_debug.c +++ b/module/os/linux/zfs/zfs_debug.c @@ -221,7 +221,7 @@ __dprintf(boolean_t dprint, const char *file, const char *func, #else void -zfs_dbgmsg_print(const char *tag) +zfs_dbgmsg_print(int fd, const char *tag) { ssize_t ret __attribute__((unused)); @@ -231,20 +231,19 @@ zfs_dbgmsg_print(const char *tag) * We use write() in this function instead of printf() * so it is safe to call from a signal handler. 
*/ - ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); - ret = write(STDOUT_FILENO, tag, strlen(tag)); - ret = write(STDOUT_FILENO, ") START:\n", 9); + ret = write(fd, "ZFS_DBGMSG(", 11); + ret = write(fd, tag, strlen(tag)); + ret = write(fd, ") START:\n", 9); for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs.pl_list); zdm != NULL; zdm = list_next(&zfs_dbgmsgs.pl_list, zdm)) { - ret = write(STDOUT_FILENO, zdm->zdm_msg, - strlen(zdm->zdm_msg)); - ret = write(STDOUT_FILENO, "\n", 1); + ret = write(fd, zdm->zdm_msg, strlen(zdm->zdm_msg)); + ret = write(fd, "\n", 1); } - ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); - ret = write(STDOUT_FILENO, tag, strlen(tag)); - ret = write(STDOUT_FILENO, ") END\n", 6); + ret = write(fd, "ZFS_DBGMSG(", 11); + ret = write(fd, tag, strlen(tag)); + ret = write(fd, ") END\n", 6); mutex_exit(&zfs_dbgmsgs.pl_lock); } From e675852bc1d50404fcbe4fa0e1f57b6c318e6349 Mon Sep 17 00:00:00 2001 From: Rob N Date: Thu, 16 May 2024 06:03:41 +1000 Subject: [PATCH 079/113] dbuf: separate refcount calls for dbuf and dbuf_user In 92dc4ad83 I updated the dbuf_cache accounting to track the size of userdata associated with dbufs. This adds the size of the dbuf+userdata together in a single call to zfs_refcount_add_many(), but sometime removes them in separate calls to zfs_refcount_remove_many(), if dbuf and userdata are evicted separately. What I didn't realise is that when refcount tracking is on, zfs_refcount_add_many() and zfs_refcount_remove_many() are expected to be paired, with their second & third args (count & holder) the same on both sides. Splitting the remove part into two calls means the counts don't match up, tripping a panic. This commit fixes that, by always adding and removing the dbuf and userdata counts separately. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reported-by: Mark Johnston Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16191 --- module/zfs/dbuf.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 806ebcfc57..bce41948c4 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -578,7 +578,7 @@ dbuf_evict_user(dmu_buf_impl_t *db) */ uint64_t size = dbu->dbu_size; (void) zfs_refcount_remove_many( - &dbuf_caches[db->db_caching_status].size, size, db); + &dbuf_caches[db->db_caching_status].size, size, dbu); if (db->db_caching_status == DB_DBUF_CACHE) DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); } @@ -784,12 +784,15 @@ dbuf_evict_one(void) if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); - uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db); + uint64_t size = db->db.db_size; + uint64_t usize = dmu_buf_user_size(&db->db); (void) zfs_refcount_remove_many( &dbuf_caches[DB_DBUF_CACHE].size, size, db); + (void) zfs_refcount_remove_many( + &dbuf_caches[DB_DBUF_CACHE].size, usize, db->db_user); DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size + usize); ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); @@ -3794,16 +3797,21 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); - uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db); + uint64_t size = db->db.db_size; + uint64_t usize = dmu_buf_user_size(&db->db); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, size, db); + (void) zfs_refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, usize, + db->db_user); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { 
DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], + size + usize); } db->db_caching_status = DB_NO_CACHE; } @@ -4022,10 +4030,12 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting) db->db_caching_status = dcs; multilist_insert(&dbuf_caches[dcs].cache, db); - uint64_t db_size = db->db.db_size + - dmu_buf_user_size(&db->db); - size = zfs_refcount_add_many( + uint64_t db_size = db->db.db_size; + uint64_t dbu_size = dmu_buf_user_size(&db->db); + (void) zfs_refcount_add_many( &dbuf_caches[dcs].size, db_size, db); + size = zfs_refcount_add_many( + &dbuf_caches[dcs].size, dbu_size, db->db_user); uint8_t db_level = db->db_level; mutex_exit(&db->db_mtx); @@ -4038,7 +4048,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting) DBUF_STAT_MAX(cache_size_bytes_max, size); DBUF_STAT_BUMP(cache_levels[db_level]); DBUF_STAT_INCR(cache_levels_bytes[db_level], - db_size); + db_size + dbu_size); } if (dcs == DB_DBUF_CACHE && !evicting) From a043b60f1eabcdd72f30b692565a2c982b1a1e8a Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Thu, 16 May 2024 18:37:50 -0400 Subject: [PATCH 080/113] Correct level handling in zstream recompress. sscanf returns number of items parsed on success and EOF on failure. 
Reviewed-by: Adam Moss Reviewed-by: Paul Dagnelie Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Rich Ercolani Closes #16198 --- cmd/zstream/zstream_recompress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/zstream/zstream_recompress.c b/cmd/zstream/zstream_recompress.c index 8392ef3de7..f9e01d1aa4 100644 --- a/cmd/zstream/zstream_recompress.c +++ b/cmd/zstream/zstream_recompress.c @@ -77,7 +77,7 @@ zstream_do_recompress(int argc, char *argv[]) while ((c = getopt(argc, argv, "l:")) != -1) { switch (c) { case 'l': - if (sscanf(optarg, "%d", &level) != 0) { + if (sscanf(optarg, "%d", &level) != 1) { fprintf(stderr, "failed to parse level '%s'\n", optarg); From d0d7c0d8f92d1a70f2b92e600f980f254b725668 Mon Sep 17 00:00:00 2001 From: omni Date: Sat, 4 May 2024 08:44:55 +0000 Subject: [PATCH 081/113] config/zfs-build.m4: sort vendors Reviewed-by: Brian Behlendorf Signed-off-by: omni Closes #16164 --- config/zfs-build.m4 | 96 +++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index bb5a85d815..b33b9225bb 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -512,32 +512,33 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ [with_vendor=$withval], [with_vendor=check]) AS_IF([test "x$with_vendor" = "xcheck"],[ - if test -f /etc/toss-release ; then - VENDOR=toss ; - elif test -f /etc/fedora-release ; then - VENDOR=fedora ; - elif test -f /etc/redhat-release ; then - VENDOR=redhat ; - elif test -f /etc/gentoo-release ; then - VENDOR=gentoo ; + if test -f /etc/alpine-release ; then + VENDOR=alpine ; elif test -f /etc/arch-release ; then VENDOR=arch ; + elif test -f /etc/fedora-release ; then + VENDOR=fedora ; + elif test -f /bin/freebsd-version ; then + VENDOR=freebsd ; + elif test -f /etc/gentoo-release ; then + VENDOR=gentoo ; + elif test -f /etc/lunar.release ; then + VENDOR=lunar ; + elif test -f /etc/openEuler-release ; then + 
VENDOR=openeuler ; elif test -f /etc/SuSE-release ; then VENDOR=sles ; elif test -f /etc/slackware-version ; then VENDOR=slackware ; - elif test -f /etc/lunar.release ; then - VENDOR=lunar ; + elif test -f /etc/toss-release ; then + VENDOR=toss ; elif test -f /etc/lsb-release ; then VENDOR=ubuntu ; + # put debian and redhat last as derivatives may have also their file elif test -f /etc/debian_version ; then VENDOR=debian ; - elif test -f /etc/alpine-release ; then - VENDOR=alpine ; - elif test -f /bin/freebsd-version ; then - VENDOR=freebsd ; - elif test -f /etc/openEuler-release ; then - VENDOR=openeuler ; + elif test -f /etc/redhat-release ; then + VENDOR=redhat ; else VENDOR= ; fi], @@ -550,20 +551,15 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default package type]) case "$VENDOR" in - toss) DEFAULT_PACKAGE=rpm ;; - redhat) DEFAULT_PACKAGE=rpm ;; - fedora) DEFAULT_PACKAGE=rpm ;; - gentoo) DEFAULT_PACKAGE=tgz ;; - alpine) DEFAULT_PACKAGE=tgz ;; - arch) DEFAULT_PACKAGE=tgz ;; - sles) DEFAULT_PACKAGE=rpm ;; - slackware) DEFAULT_PACKAGE=tgz ;; - lunar) DEFAULT_PACKAGE=tgz ;; - ubuntu) DEFAULT_PACKAGE=deb ;; - debian) DEFAULT_PACKAGE=deb ;; - freebsd) DEFAULT_PACKAGE=pkg ;; - openeuler) DEFAULT_PACKAGE=rpm ;; - *) DEFAULT_PACKAGE=rpm ;; + alpine|arch|gentoo|lunar|slackware) + DEFAULT_PACKAGE=tgz ;; + debian|ubuntu) + DEFAULT_PACKAGE=deb ;; + freebsd) + DEFAULT_PACKAGE=pkg ;; + *) + # fedora|openeuler|redhat|sles|toss + DEFAULT_PACKAGE=rpm ;; esac AC_MSG_RESULT([$DEFAULT_PACKAGE]) AC_SUBST(DEFAULT_PACKAGE) @@ -578,7 +574,7 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default shell]) case "$VENDOR" in - gentoo|alpine) DEFAULT_INIT_SHELL=/sbin/openrc-run + alpine|gentoo) DEFAULT_INIT_SHELL=/sbin/openrc-run IS_SYSV_RC=false ;; *) DEFAULT_INIT_SHELL=/bin/sh IS_SYSV_RC=true ;; @@ -598,17 +594,19 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default init config directory]) case "$VENDOR" in - alpine) initconfdir=/etc/conf.d ;; - gentoo) 
initconfdir=/etc/conf.d ;; - toss) initconfdir=/etc/sysconfig ;; - redhat) initconfdir=/etc/sysconfig ;; - fedora) initconfdir=/etc/sysconfig ;; - sles) initconfdir=/etc/sysconfig ;; - openeuler) initconfdir=/etc/sysconfig ;; - ubuntu) initconfdir=/etc/default ;; - debian) initconfdir=/etc/default ;; - freebsd) initconfdir=$sysconfdir/rc.conf.d;; - *) initconfdir=/etc/default ;; + alpine|gentoo) + initconfdir=/etc/conf.d + ;; + fedora|openeuler|redhat|sles|toss) + initconfdir=/etc/sysconfig + ;; + freebsd) + initconfdir=$sysconfdir/rc.conf.d + ;; + *) + # debian|ubuntu + initconfdir=/etc/default + ;; esac AC_MSG_RESULT([$initconfdir]) AC_SUBST(initconfdir) @@ -625,11 +623,15 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default bash completion directory]) case "$VENDOR" in - ubuntu) bashcompletiondir=/usr/share/bash-completion/completions ;; - debian) bashcompletiondir=/usr/share/bash-completion/completions ;; - freebsd) bashcompletiondir=$sysconfdir/bash_completion.d;; - gentoo) bashcompletiondir=/usr/share/bash-completion/completions ;; - *) bashcompletiondir=/etc/bash_completion.d ;; + debian|gentoo|ubuntu) + bashcompletiondir=/usr/share/bash-completion/completions + ;; + freebsd) + bashcompletiondir=$sysconfdir/bash_completion.d + ;; + *) + bashcompletiondir=/etc/bash_completion.d + ;; esac AC_MSG_RESULT([$bashcompletiondir]) AC_SUBST(bashcompletiondir) From fec16b93c46d80ae60a4f20632932601030b6fc0 Mon Sep 17 00:00:00 2001 From: omni Date: Sat, 4 May 2024 08:47:13 +0000 Subject: [PATCH 082/113] config/zfs-build.m4: add Alpine Linux bash-completion path Reviewed-by: Brian Behlendorf Signed-off-by: omni Closes #16164 --- config/zfs-build.m4 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index b33b9225bb..368684e1c5 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -623,7 +623,7 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default bash completion directory]) case 
"$VENDOR" in - debian|gentoo|ubuntu) + alpine|debian|gentoo|ubuntu) bashcompletiondir=/usr/share/bash-completion/completions ;; freebsd) From efbef9e6cc1e14cc19a24b76175f7ec86610161a Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 16 May 2024 20:56:55 -0400 Subject: [PATCH 083/113] FreeBSD: Add zfs_link_create() error handling Originally Solaris didn't expect errors there, but they may happen if we fail to add entry into ZAP. Linux fixed it in #7421, but it was never fully ported to FreeBSD. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored-By: iXsystems, Inc. Closes #13215 Closes #16138 --- module/os/freebsd/zfs/zfs_dir.c | 1 + module/os/freebsd/zfs/zfs_vnops_os.c | 54 ++++++++++++++++++++------ module/os/freebsd/zfs/zfs_znode.c | 1 - tests/test-runner/bin/zts-report.py.in | 1 - 4 files changed, 43 insertions(+), 14 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_dir.c b/module/os/freebsd/zfs/zfs_dir.c index 948df8e50d..3cdb94d6cd 100644 --- a/module/os/freebsd/zfs/zfs_dir.c +++ b/module/os/freebsd/zfs/zfs_dir.c @@ -543,6 +543,7 @@ zfs_rmnode(znode_t *zp) dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1); zfs_znode_delete(zp, tx); + zfs_znode_free(zp); dmu_tx_commit(tx); diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index d9a8c8a0d7..b9b332434b 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1175,10 +1175,25 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + error = zfs_link_create(dzp, name, zp, tx, ZNEW); + if (error != 0) { + /* + * Since, we failed to add the directory entry for it, + * delete the newly created dnode. 
+ */ + zfs_znode_delete(zp, tx); + VOP_UNLOCK1(ZTOV(zp)); + zrele(zp); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + getnewvnode_drop_reserve(); + goto out; + } + if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); - (void) zfs_link_create(dzp, name, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); @@ -1526,13 +1541,19 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - /* * Now put new name in parent dir. */ - (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); + error = zfs_link_create(dzp, dirname, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + VOP_UNLOCK1(ZTOV(zp)); + zrele(zp); + goto out; + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); *zpp = zp; @@ -1540,6 +1561,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); +out: zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); @@ -1550,7 +1572,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); - return (0); + return (error); } #if __FreeBSD_version < 1300124 @@ -3586,10 +3608,14 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, /* * Insert the new object into the directory. 
*/ - (void) zfs_link_create(dzp, name, zp, tx, ZNEW); - - zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); - *zpp = zp; + error = zfs_link_create(dzp, name, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + VOP_UNLOCK1(ZTOV(zp)); + zrele(zp); + } else { + zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); + } zfs_acl_ids_free(&acl_ids); @@ -3597,8 +3623,12 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, getnewvnode_drop_reserve(); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0) { + *zpp = zp; + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + } zfs_exit(zfsvfs, FTAG); return (error); diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c index 0d4c94555c..0eea2a8494 100644 --- a/module/os/freebsd/zfs/zfs_znode.c +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -1234,7 +1234,6 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) VERIFY0(dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); - zfs_znode_free(zp); } void diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index ecc50f4871..5ca1309313 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -182,7 +182,6 @@ if sys.platform.startswith('freebsd'): 'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason], 'cp_files/cp_files_002_pos': ['SKIP', na_reason], 'link_count/link_count_001': ['SKIP', na_reason], - 'casenorm/mixed_create_failure': ['FAIL', 13215], 'mmap/mmap_sync_001_pos': ['SKIP', na_reason], 'rsend/send_raw_ashift': ['SKIP', 14961], }) From 08648cf0da381fb667fa413ba95407ae4c3f8a8f Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Fri, 24 May 2024 18:45:09 -0700 Subject: [PATCH 084/113] Allow block cloning to be interrupted by a signal. 
Even though block cloning is much faster than regular copying, it is not instantaneous - the file might be large and the recordsize small. It would be nice to be able to interrupt it with a signal (e.g., SIGINFO on FreeBSD to see the progress). Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #16208 --- module/zfs/zfs_vnops.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index babb07ca25..b222a6f88d 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1412,6 +1412,11 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, outoff += size; len -= size; done += size; + + if (issig(JUSTLOOKING) && issig(FORREAL)) { + error = SET_ERROR(EINTR); + break; + } } vmem_free(bps, sizeof (bps[0]) * maxblocks); From 7572e8ca04adda7af207dd27d643d241351680e7 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Fri, 24 May 2024 18:45:58 -0700 Subject: [PATCH 085/113] Avoid a gcc -Wint-to-pointer-cast warning On 32-bit platforms long long is generally 64 bits. Sufficiently modern versions of gcc (13 in my testing) complain when casting a pointer to an integer of a different width so cast to uintptr_t first to avoid the warning.
Fixes: c183d164aa Parallel pool import Reviewed-by: Brian Behlendorf Reviewed-by: Don Brady Signed-off-by: Brooks Davis Closes #16203 --- module/zfs/spa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index d762f21a37..412f883e9c 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -6827,7 +6827,7 @@ spa_tryimport(nvlist_t *tryconfig) */ char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP); (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s", - TRYIMPORT_NAME, (u_longlong_t)curthread, poolname); + TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname); mutex_enter(&spa_namespace_lock); spa = spa_add(name, tryconfig, NULL); From 708be0f415c83c941d3ed5153fafc5d3766e4cdd Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 25 May 2024 11:54:24 +1000 Subject: [PATCH 086/113] Linux 6.7 compat: detect if kernel defines intptr_t Since Linux 6.7 the kernel has defined intptr_t. Clang has -Wtypedef-redefinition by default, which causes the build to fail because we also have a typedef for intptr_t. Since it's better to use the kernel's if it exists, detect it and skip our own.
Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16201 --- config/kernel-types.m4 | 40 ++++++++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ include/os/linux/spl/sys/types.h | 2 ++ 3 files changed, 44 insertions(+) create mode 100644 config/kernel-types.m4 diff --git a/config/kernel-types.m4 b/config/kernel-types.m4 new file mode 100644 index 0000000000..ed76af2833 --- /dev/null +++ b/config/kernel-types.m4 @@ -0,0 +1,40 @@ +dnl # +dnl # check if kernel provides definitions for given types +dnl # + +dnl _ZFS_AC_KERNEL_SRC_TYPE(type) +AC_DEFUN([_ZFS_AC_KERNEL_SRC_TYPE], [ + ZFS_LINUX_TEST_SRC([type_$1], [ + #include + ],[ + const $1 __attribute__((unused)) x = ($1) 0; + ]) +]) + +dnl _ZFS_AC_KERNEL_TYPE(type) +AC_DEFUN([_ZFS_AC_KERNEL_TYPE], [ + AC_MSG_CHECKING([whether kernel defines $1]) + ZFS_LINUX_TEST_RESULT([type_$1], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_KERNEL_]m4_quote(m4_translit([$1], [a-z], [A-Z])), + 1, [kernel defines $1]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl ZFS_AC_KERNEL_TYPES([types...]) +AC_DEFUN([ZFS_AC_KERNEL_TYPES], [ + AC_DEFUN([ZFS_AC_KERNEL_SRC_TYPES], [ + m4_foreach_w([type], [$1], [ + _ZFS_AC_KERNEL_SRC_TYPE(type) + ]) + ]) + AC_DEFUN([ZFS_AC_KERNEL_TYPES], [ + m4_foreach_w([type], [$1], [ + _ZFS_AC_KERNEL_TYPE(type) + ]) + ]) +]) + +ZFS_AC_KERNEL_TYPES([intptr_t]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 548905ccd0..b51477b6a9 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -37,6 +37,7 @@ dnl # only once the compilation can be done in parallel significantly dnl # speeding up the process. dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ + ZFS_AC_KERNEL_SRC_TYPES ZFS_AC_KERNEL_SRC_OBJTOOL ZFS_AC_KERNEL_SRC_GLOBAL_PAGE_STATE ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE @@ -187,6 +188,7 @@ dnl # dnl # Check results of kernel interface tests. 
dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ + ZFS_AC_KERNEL_TYPES ZFS_AC_KERNEL_ACCESS_OK_TYPE ZFS_AC_KERNEL_GLOBAL_PAGE_STATE ZFS_AC_KERNEL_OBJTOOL diff --git a/include/os/linux/spl/sys/types.h b/include/os/linux/spl/sys/types.h index 20ba457f7e..94ba7b6ad3 100644 --- a/include/os/linux/spl/sys/types.h +++ b/include/os/linux/spl/sys/types.h @@ -38,7 +38,9 @@ typedef unsigned long ulong_t; typedef unsigned long long u_longlong_t; typedef long long longlong_t; +#ifndef HAVE_KERNEL_INTPTR_T typedef long intptr_t; +#endif typedef unsigned long long rlim64_t; typedef struct task_struct kthread_t; From 34906f8bbee337ee5aa9b79c141517bff0a4e0ab Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 25 May 2024 11:55:47 +1000 Subject: [PATCH 087/113] zap: reuse zap_leaf_t on dbuf reuse after shrink If a shrink or truncate had recently freed a portion of the ZAP, the dbuf could still be sitting on the dbuf cache waiting for eviction. If it is then allocated for a new leaf before it can be evicted, the zap_leaf_t is still attached as userdata, tripping the VERIFY. Instead, just check for the userdata, and if we find it, reuse it. Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16157. 
Closes #16204 --- module/zfs/zap.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 81dab80daf..03b76ea1b7 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -425,20 +425,36 @@ zap_leaf_evict_sync(void *dbu) static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { - zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL); - rw_enter(&l->l_rwlock, RW_WRITER); - l->l_blkid = zap_allocate_blocks(zap, 1); - l->l_dbuf = NULL; + uint64_t blkid = zap_allocate_blocks(zap, 1); + dmu_buf_t *db = NULL; VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, - l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, + blkid << FZAP_BLOCK_SHIFT(zap), NULL, &db, DMU_READ_NO_PREFETCH)); - dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); - VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu)); + + /* + * Create the leaf structure and stash it on the dbuf. If zap was + * recent shrunk or truncated, the dbuf might have been sitting in the + * cache waiting to be evicted, and so still have the old leaf attached + * to it. If so, just reuse it. 
+ */ + zap_leaf_t *l = dmu_buf_get_user(db); + if (l == NULL) { + l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + l->l_blkid = blkid; + l->l_dbuf = db; + rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL); + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, + &l->l_dbuf); + dmu_buf_set_user(l->l_dbuf, &l->l_dbu); + } else { + ASSERT3U(l->l_blkid, ==, blkid); + ASSERT3P(l->l_dbuf, ==, db); + } + + rw_enter(&l->l_rwlock, RW_WRITER); dmu_buf_will_dirty(l->l_dbuf, tx); zap_leaf_init(l, zap->zap_normflags != 0); From d0aa9dbccfb06778ca336732ee4e627f50475ad3 Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 25 May 2024 12:00:29 +1000 Subject: [PATCH 088/113] Use memset to zero stack allocations containing unions C99 6.7.8.17 says that when an undesignated initialiser is used, only the first element of a union is initialised. If the first element is not the largest within the union, how the remaining space is initialised is up to the compiler. GCC extends the initialiser to the entire union, while Clang treats the remainder as padding, and so initialises according to whatever automatic/implicit initialisation rules are currently active. When Linux is compiled with CONFIG_INIT_STACK_ALL_PATTERN, -ftrivial-auto-var-init=pattern is added to the kernel CFLAGS. This flag sets the policy for automatic/implicit initialisation of variables on the stack. Taken together, this means that when compiling under CONFIG_INIT_STACK_ALL_PATTERN on Clang, the "zero" initialiser will only zero the first element in a union, and the rest will be filled with a pattern. This is significant for aes_ctx_t, which in aes_encrypt_atomic() and aes_decrypt_atomic() is initialised to zero, but then used as a gcm_ctx_t, which is the fifth element in the union, and thus gets pattern initialisation. Later, it's assumed to be zero, resulting in a hang. As confusing and undiscoverable as it is, by the spec, we are at fault when we initialise a structure containing a union with the zero initializer. 
As such, this commit replaces these uses with an explicit memset(0). Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16135 Closes #16206 --- cmd/zstream/zstream_redup.c | 4 +++- lib/libzfs/libzfs_sendrecv.c | 6 ++++-- module/icp/io/aes.c | 8 ++++++-- tests/zfs-tests/cmd/libzfs_input_check.c | 4 +++- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/cmd/zstream/zstream_redup.c b/cmd/zstream/zstream_redup.c index c56a09cee7..6866639fe4 100644 --- a/cmd/zstream/zstream_redup.c +++ b/cmd/zstream/zstream_redup.c @@ -186,7 +186,7 @@ static void zfs_redup_stream(int infd, int outfd, boolean_t verbose) { int bufsz = SPA_MAXBLOCKSIZE; - dmu_replay_record_t thedrr = { 0 }; + dmu_replay_record_t thedrr; dmu_replay_record_t *drr = &thedrr; redup_table_t rdt; zio_cksum_t stream_cksum; @@ -194,6 +194,8 @@ zfs_redup_stream(int infd, int outfd, boolean_t verbose) uint64_t num_records = 0; uint64_t num_write_byref_records = 0; + memset(&thedrr, 0, sizeof (dmu_replay_record_t)); + #ifdef _ILP32 uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20; #else diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 526f57ea40..0370112c02 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -2170,7 +2170,8 @@ out: static int send_conclusion_record(int fd, zio_cksum_t *zc) { - dmu_replay_record_t drr = { 0 }; + dmu_replay_record_t drr; + memset(&drr, 0, sizeof (dmu_replay_record_t)); drr.drr_type = DRR_END; if (zc != NULL) drr.drr_u.drr_end.drr_checksum = *zc; @@ -2272,7 +2273,8 @@ send_prelim_records(zfs_handle_t *zhp, const char *from, int fd, } if (!dryrun) { - dmu_replay_record_t drr = { 0 }; + dmu_replay_record_t drr; + memset(&drr, 0, sizeof (dmu_replay_record_t)); /* write first begin record */ drr.drr_type = DRR_BEGIN; drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; diff --git a/module/icp/io/aes.c 
b/module/icp/io/aes.c index d6f01304f5..522c436497 100644 --- a/module/icp/io/aes.c +++ b/module/icp/io/aes.c @@ -832,12 +832,14 @@ aes_encrypt_atomic(crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_data_t *plaintext, crypto_data_t *ciphertext, crypto_spi_ctx_template_t template) { - aes_ctx_t aes_ctx = {{{{0}}}}; + aes_ctx_t aes_ctx; off_t saved_offset; size_t saved_length; size_t length_needed; int ret; + memset(&aes_ctx, 0, sizeof (aes_ctx_t)); + ASSERT(ciphertext != NULL); /* @@ -956,12 +958,14 @@ aes_decrypt_atomic(crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_data_t *ciphertext, crypto_data_t *plaintext, crypto_spi_ctx_template_t template) { - aes_ctx_t aes_ctx = {{{{0}}}}; + aes_ctx_t aes_ctx; off_t saved_offset; size_t saved_length; size_t length_needed; int ret; + memset(&aes_ctx, 0, sizeof (aes_ctx_t)); + ASSERT(plaintext != NULL); /* diff --git a/tests/zfs-tests/cmd/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check.c index c661718a29..7d9ce4fada 100644 --- a/tests/zfs-tests/cmd/libzfs_input_check.c +++ b/tests/zfs-tests/cmd/libzfs_input_check.c @@ -521,13 +521,15 @@ test_send_new(const char *snapshot, int fd) static void test_recv_new(const char *dataset, int fd) { - dmu_replay_record_t drr = { 0 }; + dmu_replay_record_t drr; nvlist_t *required = fnvlist_alloc(); nvlist_t *optional = fnvlist_alloc(); nvlist_t *props = fnvlist_alloc(); char snapshot[MAXNAMELEN + 32]; ssize_t count; + memset(&drr, 0, sizeof (dmu_replay_record_t)); + int cleanup_fd = open(ZFS_DEV, O_RDWR); if (cleanup_fd == -1) { (void) fprintf(stderr, "open(%s) failed: %s\n", ZFS_DEV, From 8865dfbcaad44a1056f35be60d3058dd3b1e9145 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Sat, 25 May 2024 04:02:58 +0200 Subject: [PATCH 089/113] Fix assertion in Persistent L2ARC At the end of l2arc_evict() fix an assertion in the case that l2ad_hand + distance == l2ad_end. 
Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #16202 Closes #16207 --- module/zfs/arc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index d1d60b8410..30d30b98a6 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8886,7 +8886,7 @@ out: * assertions may be violated without functional consequences * as the device is about to be removed. */ - ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); + ASSERT3U(dev->l2ad_hand + distance, <=, dev->l2ad_end); if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); } From 02c5aa9b092818785ed8db4e2246a828278138e3 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 24 May 2024 22:11:18 -0400 Subject: [PATCH 090/113] Destroy ARC buffer in case of fill error In case of error dmu_buf_fill_done() returns the buffer back into DB_UNCACHED state. Since during transition from DB_UNCACHED into DB_FILL state dbuf_noread() allocates an ARC buffer, we must free it here, otherwise it will be leaked. Reviewed-by: Brian Behlendorf Reviewed-by: Jorgen Lundman Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15665 Closes #15802 Closes #16216 --- module/zfs/dbuf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index bce41948c4..56fe2c4dbe 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2853,6 +2853,7 @@ dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed) failed = B_FALSE; } else if (failed) { VERIFY(!dbuf_undirty(db, tx)); + arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; dbuf_clear_data(db); DTRACE_SET_STATE(db, "fill failed"); From 800d59d5771806459a23f10f3c9ee8f2d178b9ed Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 29 May 2024 11:53:31 -0400 Subject: [PATCH 091/113] Some improvements to metaslabs eviction - Add old eviction for special and dedup metaslab classes. 
Those vdevs may be potentially big and fragmented with large metaslabs, while their asynchronous write pattern is not really different from normal class. It seems an omission to not evict old metaslabs from them. - If we have metaslab preload enabled, which means we are not too low on memory, do not evict active metaslabs even if they are not used for some time. Eviction of active metaslabs means we won't be able to write anything until we load them, that may take some time, that is straight opposite to metaslab preload goals. For small systems the memory saving should be less important after recent reduction in number of allocators and so open metaslabs. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16214 --- module/zfs/metaslab.c | 7 +++++-- module/zfs/spa.c | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index cb004930d2..7170b5eefc 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -638,6 +638,7 @@ void metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; + hrtime_t now = gethrtime(); for (int i = 0; i < multilist_get_num_sublists(ml); i++) { multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); @@ -661,8 +662,10 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) multilist_sublist_unlock(mls); if (txg > msp->ms_selected_txg + metaslab_unload_delay && - gethrtime() > msp->ms_selected_time + - (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) { + now > msp->ms_selected_time + + MSEC2NSEC(metaslab_unload_delay_ms) && + (msp->ms_allocator == -1 || + !metaslab_preload_enabled)) { metaslab_evict(msp, txg); } else { /* diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 412f883e9c..638572996c 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -10168,6 +10168,9 @@ spa_sync(spa_t *spa, uint64_t txg) 
metaslab_class_evict_old(spa->spa_normal_class, txg); metaslab_class_evict_old(spa->spa_log_class, txg); + /* spa_embedded_log_class has only one metaslab per vdev. */ + metaslab_class_evict_old(spa->spa_special_class, txg); + metaslab_class_evict_old(spa->spa_dedup_class, txg); spa_sync_close_syncing_log_sm(spa); From 6b95031f5642f54bab063da84dd4009df2bc0b5e Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 29 May 2024 10:46:41 -0700 Subject: [PATCH 092/113] zed: Add deadman-slot_off.sh zedlet Optionally turn off disk's enclosure slot if an I/O is hung triggering the deadman. It's possible for outstanding I/O to a misbehaving SCSI disk to neither promptly complete nor return an error. This can occur due to retry and recovery actions taken by the SCSI layer, driver, or disk. When it occurs the pool will be unresponsive even though there may be sufficient redundancy configured to proceed without this single disk. When a hung I/O is detected by the kmods it will be posted as a deadman event. By default an I/O is considered to be hung after 5 minutes. This value can be changed with the zfs_deadman_ziotime_ms module parameter. If ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN is set the disk's enclosure slot will be powered off causing the outstanding I/O to fail. The ZED will then handle this like a normal disk failure. By default ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN is not set. As part of this change `zfs_deadman_events_per_second` is added to control the rate limiting of deadman events independently of delay events. In practice, a single deadman event is sufficient and more aren't particularly useful. Alphabetize the zfs_deadman_* entries in zfs.4.
Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #16226 --- cmd/zed/zed.d/Makefile.am | 2 + cmd/zed/zed.d/deadman-slot_off.sh | 71 +++++++++++++++++++ cmd/zed/zed.d/zed.rc | 7 ++ man/man4/zfs.4 | 21 +++--- module/zfs/vdev.c | 10 ++- tests/zfs-tests/include/tunables.cfg | 1 + .../functional/deadman/deadman_ratelimit.ksh | 8 +-- 7 files changed, 106 insertions(+), 14 deletions(-) create mode 100755 cmd/zed/zed.d/deadman-slot_off.sh diff --git a/cmd/zed/zed.d/Makefile.am b/cmd/zed/zed.d/Makefile.am index 812558cf6d..093a04c463 100644 --- a/cmd/zed/zed.d/Makefile.am +++ b/cmd/zed/zed.d/Makefile.am @@ -9,6 +9,7 @@ dist_zedexec_SCRIPTS = \ %D%/all-debug.sh \ %D%/all-syslog.sh \ %D%/data-notify.sh \ + %D%/deadman-slot_off.sh \ %D%/generic-notify.sh \ %D%/pool_import-led.sh \ %D%/resilver_finish-notify.sh \ @@ -29,6 +30,7 @@ SUBSTFILES += $(nodist_zedexec_SCRIPTS) zedconfdefaults = \ all-syslog.sh \ data-notify.sh \ + deadman-slot_off.sh \ history_event-zfs-list-cacher.sh \ pool_import-led.sh \ resilver_finish-notify.sh \ diff --git a/cmd/zed/zed.d/deadman-slot_off.sh b/cmd/zed/zed.d/deadman-slot_off.sh new file mode 100755 index 0000000000..7b339b3add --- /dev/null +++ b/cmd/zed/zed.d/deadman-slot_off.sh @@ -0,0 +1,71 @@ +#!/bin/sh +# shellcheck disable=SC3014,SC2154,SC2086,SC2034 +# +# Turn off disk's enclosure slot if an I/O is hung triggering the deadman. +# +# It's possible for outstanding I/O to a misbehaving SCSI disk to neither +# promptly complete or return an error. This can occur due to retry and +# recovery actions taken by the SCSI layer, driver, or disk. When it occurs +# the pool will be unresponsive even though there may be sufficient redundancy +# configured to proceeded without this single disk. +# +# When a hung I/O is detected by the kmods it will be posted as a deadman +# event. By default an I/O is considered to be hung after 5 minutes. This +# value can be changed with the zfs_deadman_ziotime_ms module parameter. 
+# If ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN is set the disk's enclosure +# slot will be powered off causing the outstanding I/O to fail. The ZED +# will then handle this like a normal disk failure and FAULT the vdev. +# +# We assume the user will be responsible for turning the slot back on +# after replacing the disk. +# +# Note that this script requires that your enclosure be supported by the +# Linux SCSI Enclosure services (SES) driver. The script will do nothing +# if you have no enclosure, or if your enclosure isn't supported. +# +# Exit codes: +# 0: slot successfully powered off +# 1: enclosure not available +# 2: ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN disabled +# 3: System not configured to wait on deadman +# 4: The enclosure sysfs path passed from ZFS does not exist +# 5: Enclosure slot didn't actually turn off after we told it to + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +if [ ! -d /sys/class/enclosure ] ; then + # No JBOD enclosure or NVMe slots + exit 1 +fi + +if [ "${ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN}" != "1" ] ; then + exit 2 +fi + +if [ "$ZEVENT_POOL_FAILMODE" != "wait" ] ; then + exit 3 +fi + +if [ ! -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] ; then + exit 4 +fi + +# Turn off the slot and wait for sysfs to report that the slot is off. +# It can take ~400ms on some enclosures and multiple retries may be needed. 
+for i in $(seq 1 20) ; do + echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" + + for j in $(seq 1 5) ; do + if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then + break 2 + fi + sleep 0.1 + done +done + +if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" != "off" ] ; then + exit 5 +fi + +zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH" diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc index ec64ecfaa1..af56147a96 100644 --- a/cmd/zed/zed.d/zed.rc +++ b/cmd/zed/zed.d/zed.rc @@ -148,6 +148,13 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event" # supports slot power control via sysfs. #ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT=1 +## +# Power off the drive's slot in the enclosure if there is a hung I/O which +# exceeds the deadman timeout. This can help prevent a single misbehaving +# drive from rendering a redundant pool unavailable. This assumes your drive +# enclosure fully supports slot power control via sysfs. +#ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN=1 + ## # Ntfy topic # This defines which topic will receive the ntfy notification. diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 6895a2a6d7..f1d14b4d01 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -906,6 +906,13 @@ Historically used for controlling what reporting was available under .Pa /proc/spl/kstat/zfs . No effect. . +.It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1 min Pc Pq u64 +Check time in milliseconds. +This defines the frequency at which we check for hung I/O requests +and potentially invoke the +.Sy zfs_deadman_failmode +behavior. +. .It Sy zfs_deadman_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int When a pool sync operation takes longer than .Sy zfs_deadman_synctime_ms , @@ -921,6 +928,10 @@ By default, the deadman is enabled and set to which results in "hung" I/O operations only being logged. The deadman is automatically disabled when a pool gets suspended. . 
+.It Sy zfs_deadman_events_per_second Ns = Ns Sy 1 Ns /s Pq int +Rate limit deadman zevents (which report hung I/O operations) to this many per +second. +. .It Sy zfs_deadman_failmode Ns = Ns Sy wait Pq charp Controls the failure behavior when the deadman detects a "hung" I/O operation. Valid values are: @@ -938,13 +949,6 @@ This can be used to facilitate automatic fail-over to a properly configured fail-over partner. .El . -.It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1 min Pc Pq u64 -Check time in milliseconds. -This defines the frequency at which we check for hung I/O requests -and potentially invoke the -.Sy zfs_deadman_failmode -behavior. -. .It Sy zfs_deadman_synctime_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq u64 Interval in milliseconds after which the deadman is triggered and also the interval after which a pool sync operation is considered to be "hung". @@ -1002,8 +1006,7 @@ will result in objects waiting when there is not actually contention on the same object. . .It Sy zfs_slow_io_events_per_second Ns = Ns Sy 20 Ns /s Pq int -Rate limit delay and deadman zevents (which report slow I/O operations) to this -many per +Rate limit delay zevents (which report slow I/O operations) to this many per second. . .It Sy zfs_unflushed_max_mem_amt Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 414bf84f6f..c74f72159d 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -112,6 +112,11 @@ int zfs_vdev_dtl_sm_blksz = (1 << 12); */ static unsigned int zfs_slow_io_events_per_second = 20; +/* + * Rate limit deadman "hung IO" events to this many per second. + */ +static unsigned int zfs_deadman_events_per_second = 1; + /* * Rate limit checksum events after this many checksum errors per second. 
*/ @@ -666,7 +671,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) */ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); - zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second, + zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); @@ -6476,6 +6481,9 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); +ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW, + "Rate limit hung IO (deadman) events to this many per second"); + /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index a619b846dd..721cf27f48 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -29,6 +29,7 @@ CONDENSE_INDIRECT_OBSOLETE_PCT condense.indirect_obsolete_pct zfs_condense_indir CONDENSE_MIN_MAPPING_BYTES condense.min_mapping_bytes zfs_condense_min_mapping_bytes DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms +DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms diff --git a/tests/zfs-tests/tests/functional/deadman/deadman_ratelimit.ksh b/tests/zfs-tests/tests/functional/deadman/deadman_ratelimit.ksh index 4dd4c5b9a7..d851d03e1a 100755 --- a/tests/zfs-tests/tests/functional/deadman/deadman_ratelimit.ksh +++ 
b/tests/zfs-tests/tests/functional/deadman/deadman_ratelimit.ksh @@ -28,7 +28,7 @@ # Verify spa deadman events are rate limited # # STRATEGY: -# 1. Reduce the zfs_slow_io_events_per_second to 1. +# 1. Reduce the zfs_deadman_events_per_second to 1. # 2. Reduce the zfs_deadman_ziotime_ms to 1ms. # 3. Write data to a pool and read it back. # 4. Verify deadman events have been produced at a reasonable rate. @@ -44,15 +44,15 @@ function cleanup zinject -c all default_cleanup_noexit - set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS + set_tunable64 DEADMAN_EVENTS_PER_SECOND $OLD_DEADMAN_EVENTS set_tunable64 DEADMAN_ZIOTIME_MS $ZIOTIME_DEFAULT } log_assert "Verify spa deadman events are rate limited" log_onexit cleanup -OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND) -log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1 +OLD_DEADMAN_EVENTS=$(get_tunable DEADMAN_EVENTS_PER_SECOND) +log_must set_tunable64 DEADMAN_EVENTS_PER_SECOND 1 log_must set_tunable64 DEADMAN_ZIOTIME_MS 1 # Create a new pool in order to use the updated deadman settings. From 01c8efdd59b540eb6ea21e339d2dbd0283095130 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Wed, 29 May 2024 10:49:11 -0700 Subject: [PATCH 093/113] Simplify issig(). We always call it twice with JUSTLOOKING and then FORREAL. 
Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #16225 --- include/os/freebsd/spl/sys/sig.h | 8 +------- include/os/linux/spl/sys/signal.h | 5 +---- include/sys/zfs_context.h | 3 +-- module/os/linux/spl/spl-thread.c | 16 +++------------- module/zfs/dmu_diff.c | 2 +- module/zfs/dmu_objset.c | 2 +- module/zfs/dmu_recv.c | 2 +- module/zfs/dmu_redact.c | 2 +- module/zfs/dmu_send.c | 2 +- module/zfs/zcp.c | 3 +-- module/zfs/zfs_ioctl.c | 2 +- module/zfs/zfs_vnops.c | 2 +- 12 files changed, 14 insertions(+), 35 deletions(-) diff --git a/include/os/freebsd/spl/sys/sig.h b/include/os/freebsd/spl/sys/sig.h index a4d440d383..17fc65cbe3 100644 --- a/include/os/freebsd/spl/sys/sig.h +++ b/include/os/freebsd/spl/sys/sig.h @@ -39,20 +39,14 @@ #include #include -#define FORREAL 0 -#define JUSTLOOKING 1 - static __inline int -issig(int why) +issig(void) { struct thread *td = curthread; struct proc *p; int sig; - ASSERT(why == FORREAL || why == JUSTLOOKING); if (SIGPENDING(td)) { - if (why == JUSTLOOKING) - return (1); p = td->td_proc; PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); diff --git a/include/os/linux/spl/sys/signal.h b/include/os/linux/spl/sys/signal.h index 6b538c8966..cb4b332616 100644 --- a/include/os/linux/spl/sys/signal.h +++ b/include/os/linux/spl/sys/signal.h @@ -30,9 +30,6 @@ #include #endif -#define FORREAL 0 /* Usual side-effects */ -#define JUSTLOOKING 1 /* Don't stop the process */ - -extern int issig(int why); +extern int issig(void); #endif /* SPL_SIGNAL_H */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 8f264b50e9..e4711ce419 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -249,8 +249,7 @@ extern struct proc p0; extern kthread_t *zk_thread_create(const char *name, void (*func)(void *), void *arg, size_t stksize, int state); -#define issig(why) (FALSE) -#define ISSIG(thr, why) (FALSE) +#define issig() (FALSE) #define KPREEMPT_SYNC (-1) diff --git a/module/os/linux/spl/spl-thread.c 
b/module/os/linux/spl/spl-thread.c index ee3eb4690c..dbb8eefa7e 100644 --- a/module/os/linux/spl/spl-thread.c +++ b/module/os/linux/spl/spl-thread.c @@ -152,26 +152,16 @@ spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...) EXPORT_SYMBOL(spl_kthread_create); /* - * The "why" argument indicates the allowable side-effects of the call: - * - * FORREAL: Extract the next pending signal from p_sig into p_cursig; - * stop the process if a stop has been requested or if a traced signal - * is pending. - * - * JUSTLOOKING: Don't stop the process, just indicate whether or not - * a signal might be pending (FORREAL is needed to tell for sure). + * Extract the next pending signal from p_sig into p_cursig; stop the process + * if a stop has been requested or if a traced signal is pending. */ int -issig(int why) +issig(void) { - ASSERT(why == FORREAL || why == JUSTLOOKING); if (!signal_pending(current)) return (0); - if (why != FORREAL) - return (1); - struct task_struct *task = current; spl_kernel_siginfo_t __info; sigset_t set; diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c index a2b1a27c88..0def0956be 100644 --- a/module/zfs/dmu_diff.c +++ b/module/zfs/dmu_diff.c @@ -116,7 +116,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, dmu_diffarg_t *da = arg; int err = 0; - if (issig(JUSTLOOKING) && issig(FORREAL)) + if (issig()) return (SET_ERROR(EINTR)); if (zb->zb_level == ZB_DNODE_LEVEL || diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index f1818ae155..8f4fefa4f4 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -2437,7 +2437,7 @@ dmu_objset_space_upgrade(objset_t *os) if (err != 0) return (err); - if (issig(JUSTLOOKING) && issig(FORREAL)) + if (issig()) return (SET_ERROR(EINTR)); objerr = dmu_bonus_hold(os, obj, FTAG, &db); diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 680aed4513..0119191d79 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -3389,7 +3389,7 @@ 
dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) * stream, then we free drc->drc_rrd and exit. */ while (rwa->err == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { + if (issig()) { err = SET_ERROR(EINTR); break; } diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index 5ac14edfca..1feba0ba83 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -912,7 +912,7 @@ perform_redaction(objset_t *os, redaction_list_t *rl, object = prev_obj; } while (err == 0 && object <= rec->end_object) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { + if (issig()) { err = EINTR; break; } diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index b6cc2f0a5e..cb2b62fed3 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -2552,7 +2552,7 @@ dmu_send_impl(struct dmu_send_params *dspp) while (err == 0 && !range->eos_marker) { err = do_dump(&dsc, range); range = get_next_range(&srt_arg->q, range); - if (issig(JUSTLOOKING) && issig(FORREAL)) + if (issig()) err = SET_ERROR(EINTR); } diff --git a/module/zfs/zcp.c b/module/zfs/zcp.c index 959404f665..7c279162a9 100644 --- a/module/zfs/zcp.c +++ b/module/zfs/zcp.c @@ -780,8 +780,7 @@ zcp_lua_counthook(lua_State *state, lua_Debug *ar) * Check if we were canceled while waiting for the * txg to sync or from our open context thread */ - if (ri->zri_canceled || - (!ri->zri_sync && issig(JUSTLOOKING) && issig(FORREAL))) { + if (ri->zri_canceled || (!ri->zri_sync && issig())) { ri->zri_canceled = B_TRUE; (void) lua_pushstring(state, "Channel program was canceled."); (void) lua_error(state); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index b720b4f222..7b527eb75e 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -2336,7 +2336,7 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) } while (error == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { + if (issig()) { error = SET_ERROR(EINTR); break; } diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 
b222a6f88d..f3db953eab 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1413,7 +1413,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, len -= size; done += size; - if (issig(JUSTLOOKING) && issig(FORREAL)) { + if (issig()) { error = SET_ERROR(EINTR); break; } From ae22044da998e27497c3ad6724a0c64c89cfd87f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Matu=C5=A1ka?= Date: Wed, 29 May 2024 19:51:01 +0200 Subject: [PATCH 094/113] spl: fix compilation without HAVE_BACKTRACE The __maybe_unused macro is defined in spl/sys/debug.h Reviewed-by: Brian Behlendorf Signed-off-by: Martin Matuska Closes #16229 --- lib/libspl/backtrace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c index dd8cb025f4..d26d742106 100644 --- a/lib/libspl/backtrace.c +++ b/lib/libspl/backtrace.c @@ -110,6 +110,8 @@ libspl_backtrace(int fd) backtrace_symbols_fd(btptrs, nptrs, fd); } #else +#include + void libspl_backtrace(int fd __maybe_unused) { From 5137c132a5e82b2e799ad3ee5d82fb32b500e5a4 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Wed, 29 May 2024 13:34:59 -0700 Subject: [PATCH 095/113] zpool import output is not formated properly. The 'zpool status' output assumes that the longest prefix is six character long plus colon plus space, eg. 'status: ', 'action: ' or 'config: ' (so eight in total). This works well even when we have messages that requires more than one line, as '\t' is exactly eight characters, just like the longest prefix. The 'zpool import' output is a bit different, as it may display the comment pool property, then the longest prefix is 'comment: ', which is nine characters long, not eight. All the prefixes were given an extra space in front, but: - 'status: ' did not get an extra space. - Messages that require more than one line should use nine spaces of indentation, not eight. - The extra space in front looks redundant if there is no comment property set on the given pool. 
Fix it by adding an extra space to all prefixes, but only if the comment property is defined. Also, when we need to continue the message in a new line, use '\t ' for indentation. While here, apply small corrections to a couple messages. Before: pool: tank id: 7412636063178848859 state: ONLINE status: Some supported features are not enabled on the pool. (Note that they may be intentionally disabled if the 'compatibility' property is set.) action: The pool can be imported using its name or numeric identif[...] some features will not be available without an explicit 'zp[...] comment: Example comment. config: bclone ONLINE ada0 ONLINE After: pool: tank id: 10180960571062436759 state: ONLINE status: Some supported features are not enabled on the pool. (Note that they may be intentionally disabled if the 'compatibility' property is set.) action: The pool can be imported using its name or numeric identifi[...] some features will not be available without an explicit 'zp[...] config: tank ONLINE ada3 ONLINE pool: dozer id: 11028319538368222579 state: ONLINE status: Some supported features are not enabled on the pool. (Note that they may be intentionally disabled if the 'compatibility' property is set.) action: The pool can be imported using its name or numeric identif[...] some features will not be available without an explicit 'z[...] comment: Example comment. 
config: dozer ONLINE ada1 ONLINE Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Dawidek Closes #16128 --- cmd/zpool/zpool_main.c | 269 +++++++++++++++++++++-------------------- 1 file changed, 138 insertions(+), 131 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index d47e1cda9c..57170c8ae7 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -3016,6 +3016,7 @@ show_import(nvlist_t *config, boolean_t report_error) const char *health; uint_t vsc; const char *comment; + const char *indent; status_cbdata_t cb = { 0 }; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, @@ -3040,82 +3041,84 @@ show_import(nvlist_t *config, boolean_t report_error) if (reason != ZPOOL_STATUS_OK && !report_error) return (reason); - (void) printf(gettext(" pool: %s\n"), name); - (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); - (void) printf(gettext(" state: %s"), health); + if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) { + indent = " "; + } else { + comment = NULL; + indent = ""; + } + + (void) printf(gettext("%s pool: %s\n"), indent, name); + (void) printf(gettext("%s id: %llu\n"), indent, (u_longlong_t)guid); + (void) printf(gettext("%s state: %s"), indent, health); if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext(" (DESTROYED)")); (void) printf("\n"); + if (reason != ZPOOL_STATUS_OK) { + (void) printf("%s", indent); + printf_color(ANSI_BOLD, gettext("status: ")); + } switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "missing from the system.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: case ZPOOL_STATUS_CORRUPT_LABEL_NR: - printf_color(ANSI_BOLD, gettext("status: ")); - printf_color(ANSI_YELLOW, gettext("One or more devices contains" - " corrupted data.\n")); + 
printf_color(ANSI_YELLOW, gettext("One or more devices " + "contains corrupted data.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: - (void) printf( - gettext(" status: The pool data is corrupted.\n")); + printf_color(ANSI_YELLOW, gettext("The pool data is " + "corrupted.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices " "are offlined.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted.\n")); break; case ZPOOL_STATUS_VERSION_OLDER: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "a legacy on-disk version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "an incompatible version.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Some supported " - "features are not enabled on the pool.\n\t" - "(Note that they may be intentionally disabled " - "if the\n\t'compatibility' property is set.)\n")); + "features are not enabled on the pool.\n" + "\t%s(Note that they may be intentionally disabled if the\n" + "\t%s'compatibility' property is set.)\n"), indent, indent); break; case ZPOOL_STATUS_COMPATIBILITY_ERR: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Error reading or parsing " "the file(s) indicated by the 'compatibility'\n" - "property.\n")); + "\t%sproperty.\n"), indent); break; case ZPOOL_STATUS_INCOMPATIBLE_FEAT: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more features " "are enabled on the pool despite not being\n" - "requested by the 'compatibility' property.\n")); + "\t%srequested by the 'compatibility' property.\n"), + 
indent); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool uses the following " "feature(s) not supported on this system:\n")); color_start(ANSI_YELLOW); @@ -3124,66 +3127,60 @@ show_import(nvlist_t *config, boolean_t report_error) break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool can only be " - "accessed in read-only mode on this system. It\n\tcannot be" - " accessed in read-write mode because it uses the " - "following\n\tfeature(s) not supported on this system:\n")); + "accessed in read-only mode on this system. It\n" + "\t%scannot be accessed in read-write mode because it uses " + "the following\n" + "\t%sfeature(s) not supported on this system:\n"), + indent, indent); color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); color_end(); break; case ZPOOL_STATUS_HOSTID_ACTIVE: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is currently " "imported by another system.\n")); break; case ZPOOL_STATUS_HOSTID_REQUIRED: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool has the " - "multihost property on. It cannot\n\tbe safely imported " - "when the system hostid is not set.\n")); + "multihost property on. 
It cannot\n" + "\t%sbe safely imported when the system hostid is not " + "set.\n"), indent); break; case ZPOOL_STATUS_HOSTID_MISMATCH: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool was last accessed " "by another system.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted.\n")); break; case ZPOOL_STATUS_BAD_LOG: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("An intent log record cannot " "be read.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices were " "being resilvered.\n")); break; case ZPOOL_STATUS_ERRATA: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); break; case ZPOOL_STATUS_NON_NATIVE_ASHIFT: - printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "configured to use a non-native block size.\n" - "\tExpect reduced performance.\n")); + "\t%sExpect reduced performance.\n"), indent); break; default: @@ -3196,114 +3193,121 @@ show_import(nvlist_t *config, boolean_t report_error) /* * Print out an action according to the overall state of the pool. 
*/ + if (vs->vs_state != VDEV_STATE_HEALTHY || + reason != ZPOOL_STATUS_ERRATA || errata != ZPOOL_ERRATA_NONE) { + (void) printf("%s", indent); + (void) printf(gettext("action: ")); + } if (vs->vs_state == VDEV_STATE_HEALTHY) { if (reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED) { - (void) printf(gettext(" action: The pool can be " - "imported using its name or numeric identifier, " - "though\n\tsome features will not be available " - "without an explicit 'zpool upgrade'.\n")); + (void) printf(gettext("The pool can be imported using " + "its name or numeric identifier, though\n" + "\t%ssome features will not be available without " + "an explicit 'zpool upgrade'.\n"), indent); } else if (reason == ZPOOL_STATUS_COMPATIBILITY_ERR) { - (void) printf(gettext(" action: The pool can be " - "imported using its name or numeric\n\tidentifier, " - "though the file(s) indicated by its " - "'compatibility'\n\tproperty cannot be parsed at " - "this time.\n")); + (void) printf(gettext("The pool can be imported using " + "its name or numeric\n" + "\t%sidentifier, though the file(s) indicated by " + "its 'compatibility'\n" + "\t%sproperty cannot be parsed at this time.\n"), + indent, indent); } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { - (void) printf(gettext(" action: The pool can be " - "imported using its name or numeric " - "identifier and\n\tthe '-f' flag.\n")); + (void) printf(gettext("The pool can be imported using " + "its name or numeric identifier and\n" + "\t%sthe '-f' flag.\n"), indent); } else if (reason == ZPOOL_STATUS_ERRATA) { switch (errata) { - case ZPOOL_ERRATA_NONE: - break; - case ZPOOL_ERRATA_ZOL_2094_SCRUB: - (void) printf(gettext(" action: The pool can " - "be imported using its name or numeric " - "identifier,\n\thowever there is a compat" - "ibility issue which should be corrected" - "\n\tby running 'zpool scrub'\n")); + (void) printf(gettext("The pool can be " + "imported using its name or numeric " + "identifier,\n" 
+ "\t%showever there is a compatibility " + "issue which should be corrected\n" + "\t%sby running 'zpool scrub'\n"), + indent, indent); break; case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY: - (void) printf(gettext(" action: The pool can" - "not be imported with this version of ZFS " - "due to\n\tan active asynchronous destroy. " - "Revert to an earlier version\n\tand " - "allow the destroy to complete before " - "updating.\n")); + (void) printf(gettext("The pool cannot be " + "imported with this version of ZFS due to\n" + "\t%san active asynchronous destroy. " + "Revert to an earlier version\n" + "\t%sand allow the destroy to complete " + "before updating.\n"), indent, indent); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: - (void) printf(gettext(" action: Existing " - "encrypted datasets contain an on-disk " - "incompatibility, which\n\tneeds to be " - "corrected. Backup these datasets to new " - "encrypted datasets\n\tand destroy the " - "old ones.\n")); + (void) printf(gettext("Existing encrypted " + "datasets contain an on-disk " + "incompatibility, which\n" + "\t%sneeds to be corrected. Backup these " + "datasets to new encrypted datasets\n" + "\t%sand destroy the old ones.\n"), + indent, indent); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: - (void) printf(gettext(" action: Existing " - "encrypted snapshots and bookmarks contain " - "an on-disk\n\tincompatibility. This may " - "cause on-disk corruption if they are used" - "\n\twith 'zfs recv'. To correct the " - "issue, enable the bookmark_v2 feature.\n\t" - "No additional action is needed if there " - "are no encrypted snapshots or\n\t" - "bookmarks. If preserving the encrypted " - "snapshots and bookmarks is\n\trequired, " - "use a non-raw send to backup and restore " - "them. Alternately,\n\tthey may be removed" - " to resolve the incompatibility.\n")); + (void) printf(gettext("Existing encrypted " + "snapshots and bookmarks contain an " + "on-disk\n" + "\t%sincompatibility. 
This may cause " + "on-disk corruption if they are used\n" + "\t%swith 'zfs recv'. To correct the " + "issue, enable the bookmark_v2 feature.\n" + "\t%sNo additional action is needed if " + "there are no encrypted snapshots or\n" + "\t%sbookmarks. If preserving the " + "encrypted snapshots and bookmarks is\n" + "\t%srequired, use a non-raw send to " + "backup and restore them. Alternately,\n" + "\t%sthey may be removed to resolve the " + "incompatibility.\n"), indent, indent, + indent, indent, indent, indent); break; default: /* * All errata must contain an action message. */ - assert(0); + assert(errata == ZPOOL_ERRATA_NONE); } } else { - (void) printf(gettext(" action: The pool can be " - "imported using its name or numeric " - "identifier.\n")); + (void) printf(gettext("The pool can be imported using " + "its name or numeric identifier.\n")); } } else if (vs->vs_state == VDEV_STATE_DEGRADED) { - (void) printf(gettext(" action: The pool can be imported " - "despite missing or damaged devices. The\n\tfault " - "tolerance of the pool may be compromised if imported.\n")); + (void) printf(gettext("The pool can be imported despite " + "missing or damaged devices. The\n" + "\t%sfault tolerance of the pool may be compromised if " + "imported.\n"), indent); } else { switch (reason) { case ZPOOL_STATUS_VERSION_NEWER: - (void) printf(gettext(" action: The pool cannot be " - "imported. Access the pool on a system running " - "newer\n\tsoftware, or recreate the pool from " - "backup.\n")); + (void) printf(gettext("The pool cannot be imported. " + "Access the pool on a system running newer\n" + "\t%ssoftware, or recreate the pool from " + "backup.\n"), indent); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: - printf_color(ANSI_BOLD, gettext("action: ")); - printf_color(ANSI_YELLOW, gettext("The pool cannot be " - "imported. 
Access the pool on a system that " - "supports\n\tthe required feature(s), or recreate " - "the pool from backup.\n")); + (void) printf(gettext("The pool cannot be imported. " + "Access the pool on a system that supports\n" + "\t%sthe required feature(s), or recreate the pool " + "from backup.\n"), indent); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: - printf_color(ANSI_BOLD, gettext("action: ")); - printf_color(ANSI_YELLOW, gettext("The pool cannot be " - "imported in read-write mode. Import the pool " - "with\n" - "\t\"-o readonly=on\", access the pool on a system " - "that supports the\n\trequired feature(s), or " - "recreate the pool from backup.\n")); + (void) printf(gettext("The pool cannot be imported in " + "read-write mode. Import the pool with\n" + "\t%s'-o readonly=on', access the pool on a system " + "that supports the\n" + "\t%srequired feature(s), or recreate the pool " + "from backup.\n"), indent, indent); break; case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: - (void) printf(gettext(" action: The pool cannot be " - "imported. Attach the missing\n\tdevices and try " - "again.\n")); + (void) printf(gettext("The pool cannot be imported. 
" + "Attach the missing\n" + "\t%sdevices and try again.\n"), indent); break; case ZPOOL_STATUS_HOSTID_ACTIVE: VERIFY0(nvlist_lookup_nvlist(config, @@ -3317,47 +3321,49 @@ show_import(nvlist_t *config, boolean_t report_error) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); - (void) printf(gettext(" action: The pool must be " - "exported from %s (hostid=%"PRIx64")\n\tbefore it " - "can be safely imported.\n"), hostname, hostid); + (void) printf(gettext("The pool must be exported from " + "%s (hostid=%"PRIx64")\n" + "\t%sbefore it can be safely imported.\n"), + hostname, hostid, indent); break; case ZPOOL_STATUS_HOSTID_REQUIRED: - (void) printf(gettext(" action: Set a unique system " - "hostid with the zgenhostid(8) command.\n")); + (void) printf(gettext("Set a unique system hostid with " + "the zgenhostid(8) command.\n")); break; default: - (void) printf(gettext(" action: The pool cannot be " - "imported due to damaged devices or data.\n")); + (void) printf(gettext("The pool cannot be imported due " + "to damaged devices or data.\n")); } } /* Print the comment attached to the pool. 
*/ - if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) + if (comment != NULL) (void) printf(gettext("comment: %s\n"), comment); /* * If the state is "closed" or "can't open", and the aux state * is "corrupt data": */ - if (((vs->vs_state == VDEV_STATE_CLOSED) || - (vs->vs_state == VDEV_STATE_CANT_OPEN)) && - (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) { + if ((vs->vs_state == VDEV_STATE_CLOSED || + vs->vs_state == VDEV_STATE_CANT_OPEN) && + vs->vs_aux == VDEV_AUX_CORRUPT_DATA) { if (pool_state == POOL_STATE_DESTROYED) - (void) printf(gettext("\tThe pool was destroyed, " - "but can be imported using the '-Df' flags.\n")); + (void) printf(gettext("\t%sThe pool was destroyed, " + "but can be imported using the '-Df' flags.\n"), + indent); else if (pool_state != POOL_STATE_EXPORTED) - (void) printf(gettext("\tThe pool may be active on " - "another system, but can be imported using\n\t" - "the '-f' flag.\n")); + (void) printf(gettext("\t%sThe pool may be active on " + "another system, but can be imported using\n" + "\t%sthe '-f' flag.\n"), indent, indent); } if (msgid != NULL) { - (void) printf(gettext( - " see: https://openzfs.github.io/openzfs-docs/msg/%s\n"), - msgid); + (void) printf(gettext("%s see: " + "https://openzfs.github.io/openzfs-docs/msg/%s\n"), + indent, msgid); } - (void) printf(gettext(" config:\n\n")); + (void) printf(gettext("%sconfig:\n\n"), indent); cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name), VDEV_NAME_TYPE_ID); @@ -3371,9 +3377,10 @@ show_import(nvlist_t *config, boolean_t report_error) print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { - (void) printf(gettext("\n\tAdditional devices are known to " - "be part of this pool, though their\n\texact " - "configuration cannot be determined.\n")); + (void) printf(gettext("\n\t%sAdditional devices are known to " + "be part of this pool, though their\n" + "\t%sexact configuration cannot be determined.\n"), + indent, indent); 
} return (0); } From e2357561b9499296bff758afe4868dbc39735675 Mon Sep 17 00:00:00 2001 From: Zhenlei Huang Date: Fri, 31 May 2024 00:58:20 +0800 Subject: [PATCH 096/113] FreeBSD: Add const qualifier to members of struct opensolaris_utsname These members have directly references to the global variables exposed by the kernel. They are not going to be changed by this kernel module. Reviewed-by: Brian Behlendorf Signed-off-by: Zhenlei Huang Closes #16210 --- include/os/freebsd/spl/sys/misc.h | 10 +++++----- module/os/freebsd/spl/spl_misc.c | 7 +++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/os/freebsd/spl/sys/misc.h b/include/os/freebsd/spl/sys/misc.h index 2e4efc6054..894ccd8bf9 100644 --- a/include/os/freebsd/spl/sys/misc.h +++ b/include/os/freebsd/spl/sys/misc.h @@ -45,11 +45,11 @@ #define F_SEEK_HOLE FIOSEEKHOLE struct opensolaris_utsname { - char *sysname; - char *nodename; - char *release; - char version[32]; - char *machine; + const char *sysname; + const char *nodename; + const char *release; + char version[32]; + const char *machine; }; #define task_io_account_read(n) diff --git a/module/os/freebsd/spl/spl_misc.c b/module/os/freebsd/spl/spl_misc.c index a5fc996b65..2d0821417a 100644 --- a/module/os/freebsd/spl/spl_misc.c +++ b/module/os/freebsd/spl/spl_misc.c @@ -37,6 +37,9 @@ #include static struct opensolaris_utsname hw_utsname = { + .sysname = ostype, + .nodename = prison0.pr_hostname, + .release = osrelease, .machine = MACHINE }; @@ -49,10 +52,6 @@ utsname(void) static void opensolaris_utsname_init(void *arg) { - - hw_utsname.sysname = ostype; - hw_utsname.nodename = prison0.pr_hostname; - hw_utsname.release = osrelease; snprintf(hw_utsname.version, sizeof (hw_utsname.version), "%d", osreldate); } From a301dc364c336bee58950f88e040f825df840b60 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 31 May 2024 15:11:00 -0700 Subject: [PATCH 097/113] ZTS: Fix redacted_send failures on FreeBSD We're seeing failures for 
redacted_deleted and redacted_mount on FreeBSD 13-15: 09:58:34.74 diff: /dev/fd/3: No such file or directory 09:58:34.74 ERROR: diff /dev/fd/3 /dev/fd/4 exited 2 The test was trying to diff the file listings between two directories to see if they are the same. The workaround is to do a string comparison of the directory listings instead of using `diff`. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #16224 --- .../tests/functional/redacted_send/redacted_deleted.ksh | 2 +- .../tests/functional/redacted_send/redacted_mounts.ksh | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh index 3e2aeb7335..ec11610742 100755 --- a/tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh @@ -96,7 +96,7 @@ log_must zfs destroy -R $clone2 log_must eval "zfs send -i $sendfs#book2 --redact book3 $sendfs@snap2 >$stream" log_must eval "zfs recv $recvfs <$stream" log_must mount_redacted -f $recvfs -log_must diff <(ls $send_mnt) <(ls $recv_mnt) +log_must [ "$(ls $send_mnt)" == "$(ls $recv_mnt)" ] log_must zfs destroy -R $recvfs log_must zfs rollback -R $sendfs@snap diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh index 0bc4bf4617..c041469163 100755 --- a/tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh @@ -71,8 +71,7 @@ log_must ismounted $recvfs # deleted. 
contents=$(log_must find $recv_mnt) contents_orig=$(log_must find $send_mnt) -log_must diff <(echo ${contents//$recv_mnt/}) \ - <(echo ${contents_orig//$send_mnt/}) +log_must [ "${contents//$recv_mnt/}" == "${contents_orig//$send_mnt/}" ] log_must zfs redact $sendvol@snap book2 $clonevol@snap log_must eval "zfs send --redact book2 $sendvol@snap >$stream" log_must eval "zfs receive $recvvol <$stream" @@ -103,7 +102,6 @@ log_must mount_redacted -f $recvfs log_must ismounted $recvfs contents=$(log_must find $recv_mnt) contents_orig=$(log_must find $send_mnt) -log_must diff <(echo ${contents//$recv_mnt/}) \ - <(echo ${contents_orig//$send_mnt/}) +log_must [ "${contents//$recv_mnt/}" == "${contents_orig//$send_mnt/}" ] log_pass "Received redacted streams can be mounted." From 4185179190f3dd8e4ee467452a54f1eb2ee360b9 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 18 May 2024 21:05:20 +1000 Subject: [PATCH 098/113] icp: remove unused blowfish_ctx and des_ctx Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16209 --- module/icp/include/modes/modes.h | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/module/icp/include/modes/modes.h b/module/icp/include/modes/modes.h index 23bf46ab51..950c1115f3 100644 --- a/module/icp/include/modes/modes.h +++ b/module/icp/include/modes/modes.h @@ -269,34 +269,6 @@ typedef struct aes_ctx { #define ac_processed_data_len acu.acu_ccm.ccm_processed_data_len #define ac_tag_len acu.acu_gcm.gcm_tag_len -typedef struct blowfish_ctx { - union { - ecb_ctx_t bcu_ecb; - cbc_ctx_t bcu_cbc; - } bcu; -} blowfish_ctx_t; - -#define bc_flags bcu.bcu_ecb.ecb_common.cc_flags -#define bc_remainder_len bcu.bcu_ecb.ecb_common.cc_remainder_len -#define bc_keysched bcu.bcu_ecb.ecb_common.cc_keysched -#define bc_keysched_len bcu.bcu_ecb.ecb_common.cc_keysched_len -#define bc_iv bcu.bcu_ecb.ecb_common.cc_iv -#define bc_lastp bcu.bcu_ecb.ecb_common.cc_lastp - -typedef struct des_ctx { - union { - ecb_ctx_t dcu_ecb; - cbc_ctx_t dcu_cbc; - } dcu; -} des_ctx_t; - -#define dc_flags dcu.dcu_ecb.ecb_common.cc_flags -#define dc_remainder_len dcu.dcu_ecb.ecb_common.cc_remainder_len -#define dc_keysched dcu.dcu_ecb.ecb_common.cc_keysched -#define dc_keysched_len dcu.dcu_ecb.ecb_common.cc_keysched_len -#define dc_iv dcu.dcu_ecb.ecb_common.cc_iv -#define dc_lastp dcu.dcu_ecb.ecb_common.cc_lastp - extern int ecb_cipher_contiguous_blocks(ecb_ctx_t *, char *, size_t, crypto_data_t *, size_t, int (*cipher)(const void *, const uint8_t *, uint8_t *)); From 57249bcddc0d743d64777d79344d03d22c1233d4 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 18 May 2024 21:57:36 +1000 Subject: [PATCH 099/113] icp: brutally remove unused AES modes Still retaining the structure, for now. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16209 --- include/sys/crypto/common.h | 17 -- lib/libicp/Makefile.am | 3 - module/Kbuild.in | 3 - module/icp/algs/aes/aes_modes.c | 37 +-- module/icp/algs/modes/cbc.c | 264 -------------------- module/icp/algs/modes/ctr.c | 227 ------------------ module/icp/algs/modes/ecb.c | 127 ---------- module/icp/algs/modes/gcm.c | 79 +----- module/icp/algs/modes/modes.c | 20 +- module/icp/include/aes/aes_impl.h | 5 - module/icp/include/modes/modes.h | 114 +-------- module/icp/io/aes.c | 384 ++---------------------------- 12 files changed, 57 insertions(+), 1223 deletions(-) delete mode 100644 module/icp/algs/modes/cbc.c delete mode 100644 module/icp/algs/modes/ctr.c delete mode 100644 module/icp/algs/modes/ecb.c diff --git a/include/sys/crypto/common.h b/include/sys/crypto/common.h index 261e88ecee..7438056500 100644 --- a/include/sys/crypto/common.h +++ b/include/sys/crypto/common.h @@ -51,12 +51,6 @@ typedef struct crypto_mechanism { size_t cm_param_len; /* mech. parameter len */ } crypto_mechanism_t; -/* CK_AES_CTR_PARAMS provides parameters to the CKM_AES_CTR mechanism */ -typedef struct CK_AES_CTR_PARAMS { - ulong_t ulCounterBits; - uint8_t cb[16]; -} CK_AES_CTR_PARAMS; - /* CK_AES_CCM_PARAMS provides parameters to the CKM_AES_CCM mechanism */ typedef struct CK_AES_CCM_PARAMS { ulong_t ulMACSize; @@ -77,13 +71,6 @@ typedef struct CK_AES_GCM_PARAMS { ulong_t ulTagBits; } CK_AES_GCM_PARAMS; -/* CK_AES_GMAC_PARAMS provides parameters to the CKM_AES_GMAC mechanism */ -typedef struct CK_AES_GMAC_PARAMS { - uchar_t *pIv; - uchar_t *pAAD; - ulong_t ulAADLen; -} CK_AES_GMAC_PARAMS; - /* * The measurement unit bit flag for a mechanism's minimum or maximum key size. * The unit are mechanism dependent. It can be in bits or in bytes. 
@@ -103,12 +90,8 @@ typedef uint32_t crypto_keysize_unit_t; #define SUN_CKM_SHA512_HMAC_GENERAL "CKM_SHA512_HMAC_GENERAL" #define SUN_CKM_SHA512_224 "CKM_SHA512_224" #define SUN_CKM_SHA512_256 "CKM_SHA512_256" -#define SUN_CKM_AES_CBC "CKM_AES_CBC" -#define SUN_CKM_AES_ECB "CKM_AES_ECB" -#define SUN_CKM_AES_CTR "CKM_AES_CTR" #define SUN_CKM_AES_CCM "CKM_AES_CCM" #define SUN_CKM_AES_GCM "CKM_AES_GCM" -#define SUN_CKM_AES_GMAC "CKM_AES_GMAC" /* Data arguments of cryptographic operations */ diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am index 4ba55b2158..f40512bec9 100644 --- a/lib/libicp/Makefile.am +++ b/lib/libicp/Makefile.am @@ -18,13 +18,10 @@ nodist_libicp_la_SOURCES = \ module/icp/algs/blake3/blake3_impl.c \ module/icp/algs/edonr/edonr.c \ module/icp/algs/modes/modes.c \ - module/icp/algs/modes/cbc.c \ module/icp/algs/modes/gcm_generic.c \ module/icp/algs/modes/gcm_pclmulqdq.c \ module/icp/algs/modes/gcm.c \ - module/icp/algs/modes/ctr.c \ module/icp/algs/modes/ccm.c \ - module/icp/algs/modes/ecb.c \ module/icp/algs/sha2/sha2_generic.c \ module/icp/algs/sha2/sha256_impl.c \ module/icp/algs/sha2/sha512_impl.c \ diff --git a/module/Kbuild.in b/module/Kbuild.in index 7e08374fa2..6e2eab2258 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -101,10 +101,7 @@ ICP_OBJS := \ algs/blake3/blake3_generic.o \ algs/blake3/blake3_impl.o \ algs/edonr/edonr.o \ - algs/modes/cbc.o \ algs/modes/ccm.o \ - algs/modes/ctr.o \ - algs/modes/ecb.o \ algs/modes/gcm.o \ algs/modes/gcm_generic.o \ algs/modes/modes.o \ diff --git a/module/icp/algs/aes/aes_modes.c b/module/icp/algs/aes/aes_modes.c index 6a25496d05..631e92f354 100644 --- a/module/icp/algs/aes/aes_modes.c +++ b/module/icp/algs/aes/aes_modes.c @@ -75,25 +75,17 @@ aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length, aes_ctx_t *aes_ctx = ctx; int rv; - if (aes_ctx->ac_flags & CTR_MODE) { - rv = ctr_mode_contiguous_blocks(ctx, data, length, out, - AES_BLOCK_LEN, aes_encrypt_block, 
aes_xor_block); - } else if (aes_ctx->ac_flags & CCM_MODE) { + if (aes_ctx->ac_flags & CCM_MODE) { rv = ccm_mode_encrypt_contiguous_blocks(ctx, data, length, out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); - } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) { + } else if (aes_ctx->ac_flags & GCM_MODE) { rv = gcm_mode_encrypt_contiguous_blocks(ctx, data, length, out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); - } else if (aes_ctx->ac_flags & CBC_MODE) { - rv = cbc_encrypt_contiguous_blocks(ctx, - data, length, out, AES_BLOCK_LEN, aes_encrypt_block, - aes_copy_block, aes_xor_block); - } else { - rv = ecb_cipher_contiguous_blocks(ctx, data, length, out, - AES_BLOCK_LEN, aes_encrypt_block); } + else + __builtin_unreachable(); return (rv); } @@ -108,28 +100,15 @@ aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length, aes_ctx_t *aes_ctx = ctx; int rv; - if (aes_ctx->ac_flags & CTR_MODE) { - rv = ctr_mode_contiguous_blocks(ctx, data, length, out, - AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); - if (rv == CRYPTO_DATA_LEN_RANGE) - rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE; - } else if (aes_ctx->ac_flags & CCM_MODE) { + if (aes_ctx->ac_flags & CCM_MODE) { rv = ccm_mode_decrypt_contiguous_blocks(ctx, data, length, out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); - } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) { + } else if (aes_ctx->ac_flags & GCM_MODE) { rv = gcm_mode_decrypt_contiguous_blocks(ctx, data, length, out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); - } else if (aes_ctx->ac_flags & CBC_MODE) { - rv = cbc_decrypt_contiguous_blocks(ctx, data, length, out, - AES_BLOCK_LEN, aes_decrypt_block, aes_copy_block, - aes_xor_block); - } else { - rv = ecb_cipher_contiguous_blocks(ctx, data, length, out, - AES_BLOCK_LEN, aes_decrypt_block); - if (rv == CRYPTO_DATA_LEN_RANGE) - rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE; - } + } else + __builtin_unreachable(); return (rv); 
} diff --git a/module/icp/algs/modes/cbc.c b/module/icp/algs/modes/cbc.c deleted file mode 100644 index d0219fb24c..0000000000 --- a/module/icp/algs/modes/cbc.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include -#include -#include -#include - -/* - * Algorithm independent CBC functions. 
- */ -int -cbc_encrypt_contiguous_blocks(cbc_ctx_t *ctx, char *data, size_t length, - crypto_data_t *out, size_t block_size, - int (*encrypt)(const void *, const uint8_t *, uint8_t *), - void (*copy_block)(uint8_t *, uint8_t *), - void (*xor_block)(uint8_t *, uint8_t *)) -{ - size_t remainder = length; - size_t need = 0; - uint8_t *datap = (uint8_t *)data; - uint8_t *blockp; - uint8_t *lastp; - void *iov_or_mp; - offset_t offset; - uint8_t *out_data_1; - uint8_t *out_data_2; - size_t out_data_1_len; - - if (length + ctx->cbc_remainder_len < block_size) { - /* accumulate bytes here and return */ - memcpy((uint8_t *)ctx->cbc_remainder + ctx->cbc_remainder_len, - datap, - length); - ctx->cbc_remainder_len += length; - ctx->cbc_copy_to = datap; - return (CRYPTO_SUCCESS); - } - - lastp = (uint8_t *)ctx->cbc_iv; - crypto_init_ptrs(out, &iov_or_mp, &offset); - - do { - /* Unprocessed data from last call. */ - if (ctx->cbc_remainder_len > 0) { - need = block_size - ctx->cbc_remainder_len; - - if (need > remainder) - return (CRYPTO_DATA_LEN_RANGE); - - memcpy(&((uint8_t *)ctx->cbc_remainder) - [ctx->cbc_remainder_len], datap, need); - - blockp = (uint8_t *)ctx->cbc_remainder; - } else { - blockp = datap; - } - - /* - * XOR the previous cipher block or IV with the - * current clear block. - */ - xor_block(blockp, lastp); - encrypt(ctx->cbc_keysched, lastp, lastp); - crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, - &out_data_1_len, &out_data_2, block_size); - - /* copy block to where it belongs */ - if (out_data_1_len == block_size) { - copy_block(lastp, out_data_1); - } else { - memcpy(out_data_1, lastp, out_data_1_len); - if (out_data_2 != NULL) { - memcpy(out_data_2, - lastp + out_data_1_len, - block_size - out_data_1_len); - } - } - /* update offset */ - out->cd_offset += block_size; - - /* Update pointer to next block of data to be processed. 
*/ - if (ctx->cbc_remainder_len != 0) { - datap += need; - ctx->cbc_remainder_len = 0; - } else { - datap += block_size; - } - - remainder = (size_t)&data[length] - (size_t)datap; - - /* Incomplete last block. */ - if (remainder > 0 && remainder < block_size) { - memcpy(ctx->cbc_remainder, datap, remainder); - ctx->cbc_remainder_len = remainder; - ctx->cbc_copy_to = datap; - goto out; - } - ctx->cbc_copy_to = NULL; - - } while (remainder > 0); - -out: - /* - * Save the last encrypted block in the context. - */ - if (ctx->cbc_lastp != NULL) { - copy_block((uint8_t *)ctx->cbc_lastp, (uint8_t *)ctx->cbc_iv); - ctx->cbc_lastp = (uint8_t *)ctx->cbc_iv; - } - - return (CRYPTO_SUCCESS); -} - -#define OTHER(a, ctx) \ - (((a) == (ctx)->cbc_lastblock) ? (ctx)->cbc_iv : (ctx)->cbc_lastblock) - -int -cbc_decrypt_contiguous_blocks(cbc_ctx_t *ctx, char *data, size_t length, - crypto_data_t *out, size_t block_size, - int (*decrypt)(const void *, const uint8_t *, uint8_t *), - void (*copy_block)(uint8_t *, uint8_t *), - void (*xor_block)(uint8_t *, uint8_t *)) -{ - size_t remainder = length; - size_t need = 0; - uint8_t *datap = (uint8_t *)data; - uint8_t *blockp; - uint8_t *lastp; - void *iov_or_mp; - offset_t offset; - uint8_t *out_data_1; - uint8_t *out_data_2; - size_t out_data_1_len; - - if (length + ctx->cbc_remainder_len < block_size) { - /* accumulate bytes here and return */ - memcpy((uint8_t *)ctx->cbc_remainder + ctx->cbc_remainder_len, - datap, - length); - ctx->cbc_remainder_len += length; - ctx->cbc_copy_to = datap; - return (CRYPTO_SUCCESS); - } - - lastp = ctx->cbc_lastp; - crypto_init_ptrs(out, &iov_or_mp, &offset); - - do { - /* Unprocessed data from last call. 
*/ - if (ctx->cbc_remainder_len > 0) { - need = block_size - ctx->cbc_remainder_len; - - if (need > remainder) - return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); - - memcpy(&((uint8_t *)ctx->cbc_remainder) - [ctx->cbc_remainder_len], datap, need); - - blockp = (uint8_t *)ctx->cbc_remainder; - } else { - blockp = datap; - } - - /* LINTED: pointer alignment */ - copy_block(blockp, (uint8_t *)OTHER((uint64_t *)lastp, ctx)); - - decrypt(ctx->cbc_keysched, blockp, - (uint8_t *)ctx->cbc_remainder); - blockp = (uint8_t *)ctx->cbc_remainder; - - /* - * XOR the previous cipher block or IV with the - * currently decrypted block. - */ - xor_block(lastp, blockp); - - /* LINTED: pointer alignment */ - lastp = (uint8_t *)OTHER((uint64_t *)lastp, ctx); - - crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, - &out_data_1_len, &out_data_2, block_size); - - memcpy(out_data_1, blockp, out_data_1_len); - if (out_data_2 != NULL) { - memcpy(out_data_2, blockp + out_data_1_len, - block_size - out_data_1_len); - } - - /* update offset */ - out->cd_offset += block_size; - - /* Update pointer to next block of data to be processed. */ - if (ctx->cbc_remainder_len != 0) { - datap += need; - ctx->cbc_remainder_len = 0; - } else { - datap += block_size; - } - - remainder = (size_t)&data[length] - (size_t)datap; - - /* Incomplete last block. */ - if (remainder > 0 && remainder < block_size) { - memcpy(ctx->cbc_remainder, datap, remainder); - ctx->cbc_remainder_len = remainder; - ctx->cbc_lastp = lastp; - ctx->cbc_copy_to = datap; - return (CRYPTO_SUCCESS); - } - ctx->cbc_copy_to = NULL; - - } while (remainder > 0); - - ctx->cbc_lastp = lastp; - return (CRYPTO_SUCCESS); -} - -int -cbc_init_ctx(cbc_ctx_t *cbc_ctx, char *param, size_t param_len, - size_t block_size, void (*copy_block)(uint8_t *, uint64_t *)) -{ - /* Copy IV into context. 
*/ - ASSERT3P(param, !=, NULL); - ASSERT3U(param_len, ==, block_size); - - copy_block((uchar_t *)param, cbc_ctx->cbc_iv); - - return (CRYPTO_SUCCESS); -} - -void * -cbc_alloc_ctx(int kmflag) -{ - cbc_ctx_t *cbc_ctx; - - if ((cbc_ctx = kmem_zalloc(sizeof (cbc_ctx_t), kmflag)) == NULL) - return (NULL); - - cbc_ctx->cbc_flags = CBC_MODE; - return (cbc_ctx); -} diff --git a/module/icp/algs/modes/ctr.c b/module/icp/algs/modes/ctr.c deleted file mode 100644 index db6b1c71d5..0000000000 --- a/module/icp/algs/modes/ctr.c +++ /dev/null @@ -1,227 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include -#include -#include -#include -#include - -/* - * Encrypt and decrypt multiple blocks of data in counter mode. 
- */ -int -ctr_mode_contiguous_blocks(ctr_ctx_t *ctx, char *data, size_t length, - crypto_data_t *out, size_t block_size, - int (*cipher)(const void *ks, const uint8_t *pt, uint8_t *ct), - void (*xor_block)(uint8_t *, uint8_t *)) -{ - size_t remainder = length; - size_t need = 0; - uint8_t *datap = (uint8_t *)data; - uint8_t *blockp; - uint8_t *lastp; - void *iov_or_mp; - offset_t offset; - uint8_t *out_data_1; - uint8_t *out_data_2; - size_t out_data_1_len; - uint64_t lower_counter, upper_counter; - - if (length + ctx->ctr_remainder_len < block_size) { - /* accumulate bytes here and return */ - memcpy((uint8_t *)ctx->ctr_remainder + ctx->ctr_remainder_len, - datap, - length); - ctx->ctr_remainder_len += length; - ctx->ctr_copy_to = datap; - return (CRYPTO_SUCCESS); - } - - crypto_init_ptrs(out, &iov_or_mp, &offset); - - do { - /* Unprocessed data from last call. */ - if (ctx->ctr_remainder_len > 0) { - need = block_size - ctx->ctr_remainder_len; - - if (need > remainder) - return (CRYPTO_DATA_LEN_RANGE); - - memcpy(&((uint8_t *)ctx->ctr_remainder) - [ctx->ctr_remainder_len], datap, need); - - blockp = (uint8_t *)ctx->ctr_remainder; - } else { - blockp = datap; - } - - /* ctr_cb is the counter block */ - cipher(ctx->ctr_keysched, (uint8_t *)ctx->ctr_cb, - (uint8_t *)ctx->ctr_tmp); - - lastp = (uint8_t *)ctx->ctr_tmp; - - /* - * Increment Counter. - */ - lower_counter = ntohll(ctx->ctr_cb[1] & ctx->ctr_lower_mask); - lower_counter = htonll(lower_counter + 1); - lower_counter &= ctx->ctr_lower_mask; - ctx->ctr_cb[1] = (ctx->ctr_cb[1] & ~(ctx->ctr_lower_mask)) | - lower_counter; - - /* wrap around */ - if (lower_counter == 0) { - upper_counter = - ntohll(ctx->ctr_cb[0] & ctx->ctr_upper_mask); - upper_counter = htonll(upper_counter + 1); - upper_counter &= ctx->ctr_upper_mask; - ctx->ctr_cb[0] = - (ctx->ctr_cb[0] & ~(ctx->ctr_upper_mask)) | - upper_counter; - } - - /* - * XOR encrypted counter block with the current clear block. 
- */ - xor_block(blockp, lastp); - - crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, - &out_data_1_len, &out_data_2, block_size); - - /* copy block to where it belongs */ - memcpy(out_data_1, lastp, out_data_1_len); - if (out_data_2 != NULL) { - memcpy(out_data_2, lastp + out_data_1_len, - block_size - out_data_1_len); - } - /* update offset */ - out->cd_offset += block_size; - - /* Update pointer to next block of data to be processed. */ - if (ctx->ctr_remainder_len != 0) { - datap += need; - ctx->ctr_remainder_len = 0; - } else { - datap += block_size; - } - - remainder = (size_t)&data[length] - (size_t)datap; - - /* Incomplete last block. */ - if (remainder > 0 && remainder < block_size) { - memcpy(ctx->ctr_remainder, datap, remainder); - ctx->ctr_remainder_len = remainder; - ctx->ctr_copy_to = datap; - goto out; - } - ctx->ctr_copy_to = NULL; - - } while (remainder > 0); - -out: - return (CRYPTO_SUCCESS); -} - -int -ctr_mode_final(ctr_ctx_t *ctx, crypto_data_t *out, - int (*encrypt_block)(const void *, const uint8_t *, uint8_t *)) -{ - uint8_t *lastp; - void *iov_or_mp; - offset_t offset; - uint8_t *out_data_1; - uint8_t *out_data_2; - size_t out_data_1_len; - uint8_t *p; - int i; - - if (out->cd_length < ctx->ctr_remainder_len) - return (CRYPTO_DATA_LEN_RANGE); - - encrypt_block(ctx->ctr_keysched, (uint8_t *)ctx->ctr_cb, - (uint8_t *)ctx->ctr_tmp); - - lastp = (uint8_t *)ctx->ctr_tmp; - p = (uint8_t *)ctx->ctr_remainder; - for (i = 0; i < ctx->ctr_remainder_len; i++) { - p[i] ^= lastp[i]; - } - - crypto_init_ptrs(out, &iov_or_mp, &offset); - crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, - &out_data_1_len, &out_data_2, ctx->ctr_remainder_len); - - memcpy(out_data_1, p, out_data_1_len); - if (out_data_2 != NULL) { - memcpy(out_data_2, - (uint8_t *)p + out_data_1_len, - ctx->ctr_remainder_len - out_data_1_len); - } - out->cd_offset += ctx->ctr_remainder_len; - ctx->ctr_remainder_len = 0; - return (CRYPTO_SUCCESS); -} - -int -ctr_init_ctx(ctr_ctx_t 
*ctr_ctx, ulong_t count, uint8_t *cb, - void (*copy_block)(uint8_t *, uint8_t *)) -{ - uint64_t upper_mask = 0; - uint64_t lower_mask = 0; - - if (count == 0 || count > 128) { - return (CRYPTO_MECHANISM_PARAM_INVALID); - } - /* upper 64 bits of the mask */ - if (count >= 64) { - count -= 64; - upper_mask = (count == 64) ? UINT64_MAX : (1ULL << count) - 1; - lower_mask = UINT64_MAX; - } else { - /* now the lower 63 bits */ - lower_mask = (1ULL << count) - 1; - } - ctr_ctx->ctr_lower_mask = htonll(lower_mask); - ctr_ctx->ctr_upper_mask = htonll(upper_mask); - - copy_block(cb, (uchar_t *)ctr_ctx->ctr_cb); - ctr_ctx->ctr_lastp = (uint8_t *)&ctr_ctx->ctr_cb[0]; - ctr_ctx->ctr_flags |= CTR_MODE; - return (CRYPTO_SUCCESS); -} - -void * -ctr_alloc_ctx(int kmflag) -{ - ctr_ctx_t *ctr_ctx; - - if ((ctr_ctx = kmem_zalloc(sizeof (ctr_ctx_t), kmflag)) == NULL) - return (NULL); - - ctr_ctx->ctr_flags = CTR_MODE; - return (ctr_ctx); -} diff --git a/module/icp/algs/modes/ecb.c b/module/icp/algs/modes/ecb.c deleted file mode 100644 index e2d8e71c16..0000000000 --- a/module/icp/algs/modes/ecb.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include -#include -#include -#include - -/* - * Algorithm independent ECB functions. - */ -int -ecb_cipher_contiguous_blocks(ecb_ctx_t *ctx, char *data, size_t length, - crypto_data_t *out, size_t block_size, - int (*cipher)(const void *ks, const uint8_t *pt, uint8_t *ct)) -{ - size_t remainder = length; - size_t need = 0; - uint8_t *datap = (uint8_t *)data; - uint8_t *blockp; - uint8_t *lastp; - void *iov_or_mp; - offset_t offset; - uint8_t *out_data_1; - uint8_t *out_data_2; - size_t out_data_1_len; - - if (length + ctx->ecb_remainder_len < block_size) { - /* accumulate bytes here and return */ - memcpy((uint8_t *)ctx->ecb_remainder + ctx->ecb_remainder_len, - datap, - length); - ctx->ecb_remainder_len += length; - ctx->ecb_copy_to = datap; - return (CRYPTO_SUCCESS); - } - - lastp = (uint8_t *)ctx->ecb_iv; - crypto_init_ptrs(out, &iov_or_mp, &offset); - - do { - /* Unprocessed data from last call. 
*/ - if (ctx->ecb_remainder_len > 0) { - need = block_size - ctx->ecb_remainder_len; - - if (need > remainder) - return (CRYPTO_DATA_LEN_RANGE); - - memcpy(&((uint8_t *)ctx->ecb_remainder) - [ctx->ecb_remainder_len], datap, need); - - blockp = (uint8_t *)ctx->ecb_remainder; - } else { - blockp = datap; - } - - cipher(ctx->ecb_keysched, blockp, lastp); - crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, - &out_data_1_len, &out_data_2, block_size); - - /* copy block to where it belongs */ - memcpy(out_data_1, lastp, out_data_1_len); - if (out_data_2 != NULL) { - memcpy(out_data_2, lastp + out_data_1_len, - block_size - out_data_1_len); - } - /* update offset */ - out->cd_offset += block_size; - - /* Update pointer to next block of data to be processed. */ - if (ctx->ecb_remainder_len != 0) { - datap += need; - ctx->ecb_remainder_len = 0; - } else { - datap += block_size; - } - - remainder = (size_t)&data[length] - (size_t)datap; - - /* Incomplete last block. */ - if (remainder > 0 && remainder < block_size) { - memcpy(ctx->ecb_remainder, datap, remainder); - ctx->ecb_remainder_len = remainder; - ctx->ecb_copy_to = datap; - goto out; - } - ctx->ecb_copy_to = NULL; - - } while (remainder > 0); - -out: - return (CRYPTO_SUCCESS); -} - -void * -ecb_alloc_ctx(int kmflag) -{ - ecb_ctx_t *ecb_ctx; - - if ((ecb_ctx = kmem_zalloc(sizeof (ecb_ctx_t), kmflag)) == NULL) - return (NULL); - - ecb_ctx->ecb_flags = ECB_MODE; - return (ecb_ctx); -} diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c index dd8db6f974..21f4301d58 100644 --- a/module/icp/algs/modes/gcm.c +++ b/module/icp/algs/modes/gcm.c @@ -50,11 +50,6 @@ static uint32_t icp_gcm_impl = IMPL_FASTEST; static uint32_t user_sel_impl = IMPL_FASTEST; -static inline int gcm_init_ctx_impl(boolean_t, gcm_ctx_t *, char *, size_t, - int (*)(const void *, const uint8_t *, uint8_t *), - void (*)(uint8_t *, uint8_t *), - void (*)(uint8_t *, uint8_t *)); - #ifdef CAN_USE_GCM_ASM /* Does the architecture we run 
on support the MOVBE instruction? */ boolean_t gcm_avx_can_use_movbe = B_FALSE; @@ -590,40 +585,11 @@ gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len, return (CRYPTO_SUCCESS); } -/* - * The following function is called at encrypt or decrypt init time - * for AES GCM mode. - */ -int -gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, - int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), - void (*copy_block)(uint8_t *, uint8_t *), - void (*xor_block)(uint8_t *, uint8_t *)) -{ - return (gcm_init_ctx_impl(B_FALSE, gcm_ctx, param, block_size, - encrypt_block, copy_block, xor_block)); -} - -/* - * The following function is called at encrypt or decrypt init time - * for AES GMAC mode. - */ -int -gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, - int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), - void (*copy_block)(uint8_t *, uint8_t *), - void (*xor_block)(uint8_t *, uint8_t *)) -{ - return (gcm_init_ctx_impl(B_TRUE, gcm_ctx, param, block_size, - encrypt_block, copy_block, xor_block)); -} - /* * Init the GCM context struct. Handle the cycle and avx implementations here. - * Initialization of a GMAC context differs slightly from a GCM context. */ -static inline int -gcm_init_ctx_impl(boolean_t gmac_mode, gcm_ctx_t *gcm_ctx, char *param, +int +gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) @@ -635,22 +601,16 @@ gcm_init_ctx_impl(boolean_t gmac_mode, gcm_ctx_t *gcm_ctx, char *param, if (param != NULL) { gcm_param = (CK_AES_GCM_PARAMS *)(void *)param; - if (gmac_mode == B_FALSE) { - /* GCM mode. */ - if ((rv = gcm_validate_args(gcm_param)) != 0) { - return (rv); - } - gcm_ctx->gcm_flags |= GCM_MODE; - - size_t tbits = gcm_param->ulTagBits; - tag_len = CRYPTO_BITS2BYTES(tbits); - iv_len = gcm_param->ulIvLen; - } else { - /* GMAC mode. 
*/ - gcm_ctx->gcm_flags |= GMAC_MODE; - tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS); - iv_len = AES_GMAC_IV_LEN; + /* GCM mode. */ + if ((rv = gcm_validate_args(gcm_param)) != 0) { + return (rv); } + gcm_ctx->gcm_flags |= GCM_MODE; + + size_t tbits = gcm_param->ulTagBits; + tag_len = CRYPTO_BITS2BYTES(tbits); + iv_len = gcm_param->ulIvLen; + gcm_ctx->gcm_tag_len = tag_len; gcm_ctx->gcm_processed_data_len = 0; @@ -684,10 +644,9 @@ gcm_init_ctx_impl(boolean_t gmac_mode, gcm_ctx_t *gcm_ctx, char *param, } /* * If this is a GCM context, use the MOVBE and the BSWAP - * variants alternately. GMAC contexts code paths do not - * use the MOVBE instruction. + * variants alternately. */ - if (gcm_ctx->gcm_use_avx == B_TRUE && gmac_mode == B_FALSE && + if (gcm_ctx->gcm_use_avx == B_TRUE && zfs_movbe_available() == B_TRUE) { (void) atomic_toggle_boolean_nv( (volatile boolean_t *)&gcm_avx_can_use_movbe); @@ -758,18 +717,6 @@ gcm_alloc_ctx(int kmflag) return (gcm_ctx); } -void * -gmac_alloc_ctx(int kmflag) -{ - gcm_ctx_t *gcm_ctx; - - if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL) - return (NULL); - - gcm_ctx->gcm_flags = GMAC_MODE; - return (gcm_ctx); -} - /* GCM implementation that contains the fastest methods */ static gcm_impl_ops_t gcm_fastest_impl = { .name = "fastest" diff --git a/module/icp/algs/modes/modes.c b/module/icp/algs/modes/modes.c index 6f6649b3b5..786a89f10c 100644 --- a/module/icp/algs/modes/modes.c +++ b/module/icp/algs/modes/modes.c @@ -126,20 +126,7 @@ crypto_free_mode_ctx(void *ctx) { common_ctx_t *common_ctx = (common_ctx_t *)ctx; - switch (common_ctx->cc_flags & - (ECB_MODE|CBC_MODE|CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE)) { - case ECB_MODE: - kmem_free(common_ctx, sizeof (ecb_ctx_t)); - break; - - case CBC_MODE: - kmem_free(common_ctx, sizeof (cbc_ctx_t)); - break; - - case CTR_MODE: - kmem_free(common_ctx, sizeof (ctr_ctx_t)); - break; - + switch (common_ctx->cc_flags & (CCM_MODE|GCM_MODE)) { case CCM_MODE: if (((ccm_ctx_t 
*)ctx)->ccm_pt_buf != NULL) vmem_free(((ccm_ctx_t *)ctx)->ccm_pt_buf, @@ -149,9 +136,12 @@ crypto_free_mode_ctx(void *ctx) break; case GCM_MODE: - case GMAC_MODE: gcm_clear_ctx((gcm_ctx_t *)ctx); kmem_free(ctx, sizeof (gcm_ctx_t)); + break; + + default: + __builtin_unreachable(); } } diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h index 66eb4a6c8f..d26ced58ff 100644 --- a/module/icp/include/aes/aes_impl.h +++ b/module/icp/include/aes/aes_impl.h @@ -149,13 +149,8 @@ extern int aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length, #ifdef _AES_IMPL typedef enum aes_mech_type { - AES_ECB_MECH_INFO_TYPE, /* SUN_CKM_AES_ECB */ - AES_CBC_MECH_INFO_TYPE, /* SUN_CKM_AES_CBC */ - AES_CBC_PAD_MECH_INFO_TYPE, /* SUN_CKM_AES_CBC_PAD */ - AES_CTR_MECH_INFO_TYPE, /* SUN_CKM_AES_CTR */ AES_CCM_MECH_INFO_TYPE, /* SUN_CKM_AES_CCM */ AES_GCM_MECH_INFO_TYPE, /* SUN_CKM_AES_GCM */ - AES_GMAC_MECH_INFO_TYPE /* SUN_CKM_AES_GMAC */ } aes_mech_type_t; #endif /* _AES_IMPL */ diff --git a/module/icp/include/modes/modes.h b/module/icp/include/modes/modes.h index 950c1115f3..daa0335b5c 100644 --- a/module/icp/include/modes/modes.h +++ b/module/icp/include/modes/modes.h @@ -45,12 +45,8 @@ extern "C" { extern boolean_t gcm_avx_can_use_movbe; #endif -#define ECB_MODE 0x00000002 -#define CBC_MODE 0x00000004 -#define CTR_MODE 0x00000008 #define CCM_MODE 0x00000010 #define GCM_MODE 0x00000020 -#define GMAC_MODE 0x00000040 /* * cc_keysched: Pointer to key schedule. @@ -76,7 +72,7 @@ extern boolean_t gcm_avx_can_use_movbe; * by the caller, or internally, e.g. an init routine. * If allocated by the latter, then it needs to be freed. 
* - * ECB_MODE, CBC_MODE, CTR_MODE, or CCM_MODE + * CCM_MODE */ struct common_ctx { void *cc_keysched; @@ -91,57 +87,6 @@ struct common_ctx { typedef struct common_ctx common_ctx_t; -typedef struct ecb_ctx { - struct common_ctx ecb_common; - uint64_t ecb_lastblock[2]; -} ecb_ctx_t; - -#define ecb_keysched ecb_common.cc_keysched -#define ecb_keysched_len ecb_common.cc_keysched_len -#define ecb_iv ecb_common.cc_iv -#define ecb_remainder ecb_common.cc_remainder -#define ecb_remainder_len ecb_common.cc_remainder_len -#define ecb_lastp ecb_common.cc_lastp -#define ecb_copy_to ecb_common.cc_copy_to -#define ecb_flags ecb_common.cc_flags - -typedef struct cbc_ctx { - struct common_ctx cbc_common; - uint64_t cbc_lastblock[2]; -} cbc_ctx_t; - -#define cbc_keysched cbc_common.cc_keysched -#define cbc_keysched_len cbc_common.cc_keysched_len -#define cbc_iv cbc_common.cc_iv -#define cbc_remainder cbc_common.cc_remainder -#define cbc_remainder_len cbc_common.cc_remainder_len -#define cbc_lastp cbc_common.cc_lastp -#define cbc_copy_to cbc_common.cc_copy_to -#define cbc_flags cbc_common.cc_flags - -/* - * ctr_lower_mask Bit-mask for lower 8 bytes of counter block. - * ctr_upper_mask Bit-mask for upper 8 bytes of counter block. - */ -typedef struct ctr_ctx { - struct common_ctx ctr_common; - uint64_t ctr_lower_mask; - uint64_t ctr_upper_mask; - uint32_t ctr_tmp[4]; -} ctr_ctx_t; - -/* - * ctr_cb Counter block. - */ -#define ctr_keysched ctr_common.cc_keysched -#define ctr_keysched_len ctr_common.cc_keysched_len -#define ctr_cb ctr_common.cc_iv -#define ctr_remainder ctr_common.cc_remainder -#define ctr_remainder_len ctr_common.cc_remainder_len -#define ctr_lastp ctr_common.cc_lastp -#define ctr_copy_to ctr_common.cc_copy_to -#define ctr_flags ctr_common.cc_flags - /* * * ccm_mac_len: Stores length of the MAC in CCM mode. 
@@ -241,27 +186,21 @@ typedef struct gcm_ctx { #define gcm_copy_to gcm_common.cc_copy_to #define gcm_flags gcm_common.cc_flags -#define AES_GMAC_IV_LEN 12 -#define AES_GMAC_TAG_BITS 128 - void gcm_clear_ctx(gcm_ctx_t *ctx); typedef struct aes_ctx { union { - ecb_ctx_t acu_ecb; - cbc_ctx_t acu_cbc; - ctr_ctx_t acu_ctr; ccm_ctx_t acu_ccm; gcm_ctx_t acu_gcm; } acu; } aes_ctx_t; -#define ac_flags acu.acu_ecb.ecb_common.cc_flags -#define ac_remainder_len acu.acu_ecb.ecb_common.cc_remainder_len -#define ac_keysched acu.acu_ecb.ecb_common.cc_keysched -#define ac_keysched_len acu.acu_ecb.ecb_common.cc_keysched_len -#define ac_iv acu.acu_ecb.ecb_common.cc_iv -#define ac_lastp acu.acu_ecb.ecb_common.cc_lastp +#define ac_flags acu.acu_ccm.ccm_common.cc_flags +#define ac_remainder_len acu.acu_ccm.ccm_common.cc_remainder_len +#define ac_keysched acu.acu_ccm.ccm_common.cc_keysched +#define ac_keysched_len acu.acu_ccm.ccm_common.cc_keysched_len +#define ac_iv acu.acu_ccm.ccm_common.cc_iv +#define ac_lastp acu.acu_ccm.ccm_common.cc_lastp #define ac_pt_buf acu.acu_ccm.ccm_pt_buf #define ac_mac_len acu.acu_ccm.ccm_mac_len #define ac_data_len acu.acu_ccm.ccm_data_len @@ -269,27 +208,6 @@ typedef struct aes_ctx { #define ac_processed_data_len acu.acu_ccm.ccm_processed_data_len #define ac_tag_len acu.acu_gcm.gcm_tag_len -extern int ecb_cipher_contiguous_blocks(ecb_ctx_t *, char *, size_t, - crypto_data_t *, size_t, int (*cipher)(const void *, const uint8_t *, - uint8_t *)); - -extern int cbc_encrypt_contiguous_blocks(cbc_ctx_t *, char *, size_t, - crypto_data_t *, size_t, - int (*encrypt)(const void *, const uint8_t *, uint8_t *), - void (*copy_block)(uint8_t *, uint8_t *), - void (*xor_block)(uint8_t *, uint8_t *)); - -extern int cbc_decrypt_contiguous_blocks(cbc_ctx_t *, char *, size_t, - crypto_data_t *, size_t, - int (*decrypt)(const void *, const uint8_t *, uint8_t *), - void (*copy_block)(uint8_t *, uint8_t *), - void (*xor_block)(uint8_t *, uint8_t *)); - -extern int 
ctr_mode_contiguous_blocks(ctr_ctx_t *, char *, size_t, - crypto_data_t *, size_t, - int (*cipher)(const void *, const uint8_t *, uint8_t *), - void (*xor_block)(uint8_t *, uint8_t *)); - extern int ccm_mode_encrypt_contiguous_blocks(ccm_ctx_t *, char *, size_t, crypto_data_t *, size_t, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), @@ -332,15 +250,6 @@ extern int gcm_decrypt_final(gcm_ctx_t *, crypto_data_t *, size_t, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)); -extern int ctr_mode_final(ctr_ctx_t *, crypto_data_t *, - int (*encrypt_block)(const void *, const uint8_t *, uint8_t *)); - -extern int cbc_init_ctx(cbc_ctx_t *, char *, size_t, size_t, - void (*copy_block)(uint8_t *, uint64_t *)); - -extern int ctr_init_ctx(ctr_ctx_t *, ulong_t, uint8_t *, - void (*copy_block)(uint8_t *, uint8_t *)); - extern int ccm_init_ctx(ccm_ctx_t *, char *, int, boolean_t, size_t, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)); @@ -350,11 +259,6 @@ extern int gcm_init_ctx(gcm_ctx_t *, char *, size_t, void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)); -extern int gmac_init_ctx(gcm_ctx_t *, char *, size_t, - int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), - void (*copy_block)(uint8_t *, uint8_t *), - void (*xor_block)(uint8_t *, uint8_t *)); - extern void calculate_ccm_mac(ccm_ctx_t *, uint8_t *, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *)); @@ -364,12 +268,8 @@ extern void crypto_init_ptrs(crypto_data_t *, void **, offset_t *); extern void crypto_get_ptrs(crypto_data_t *, void **, offset_t *, uint8_t **, size_t *, uint8_t **, size_t); -extern void *ecb_alloc_ctx(int); -extern void *cbc_alloc_ctx(int); -extern void *ctr_alloc_ctx(int); extern void *ccm_alloc_ctx(int); extern void *gcm_alloc_ctx(int); -extern void *gmac_alloc_ctx(int); extern void crypto_free_mode_ctx(void *); #ifdef 
__cplusplus diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c index 522c436497..a4ef171671 100644 --- a/module/icp/io/aes.c +++ b/module/icp/io/aes.c @@ -40,18 +40,6 @@ * Mechanism info structure passed to KCF during registration. */ static const crypto_mech_info_t aes_mech_info_tab[] = { - /* AES_ECB */ - {SUN_CKM_AES_ECB, AES_ECB_MECH_INFO_TYPE, - CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | - CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC}, - /* AES_CBC */ - {SUN_CKM_AES_CBC, AES_CBC_MECH_INFO_TYPE, - CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | - CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC}, - /* AES_CTR */ - {SUN_CKM_AES_CTR, AES_CTR_MECH_INFO_TYPE, - CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | - CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC}, /* AES_CCM */ {SUN_CKM_AES_CCM, AES_CCM_MECH_INFO_TYPE, CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | @@ -60,11 +48,6 @@ static const crypto_mech_info_t aes_mech_info_tab[] = { {SUN_CKM_AES_GCM, AES_GCM_MECH_INFO_TYPE, CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC}, - /* AES_GMAC */ - {SUN_CKM_AES_GMAC, AES_GMAC_MECH_INFO_TYPE, - CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | - CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC | - CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, }; static int aes_encrypt_init(crypto_ctx_t *, crypto_mechanism_t *, @@ -103,20 +86,6 @@ static const crypto_cipher_ops_t aes_cipher_ops = { .decrypt_atomic = aes_decrypt_atomic }; -static int aes_mac_atomic(crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, - crypto_data_t *, crypto_spi_ctx_template_t); -static int aes_mac_verify_atomic(crypto_mechanism_t *, crypto_key_t *, - crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t); - -static const crypto_mac_ops_t aes_mac_ops = { - .mac_init = NULL, - .mac = NULL, - .mac_update = NULL, - .mac_final = NULL, - .mac_atomic = aes_mac_atomic, - .mac_verify_atomic = aes_mac_verify_atomic -}; - static int aes_create_ctx_template(crypto_mechanism_t *, 
crypto_key_t *, crypto_spi_ctx_template_t *, size_t *); static int aes_free_context(crypto_ctx_t *); @@ -129,7 +98,7 @@ static const crypto_ctx_ops_t aes_ctx_ops = { static const crypto_ops_t aes_crypto_ops = { NULL, &aes_cipher_ops, - &aes_mac_ops, + NULL, &aes_ctx_ops, }; @@ -141,7 +110,6 @@ static const crypto_provider_info_t aes_prov_info = { }; static crypto_kcf_provider_handle_t aes_prov_handle = 0; -static crypto_data_t null_crypto_data = { CRYPTO_DATA_RAW }; int aes_mod_init(void) @@ -181,18 +149,6 @@ aes_check_mech_param(crypto_mechanism_t *mechanism, aes_ctx_t **ctx) int rv = CRYPTO_SUCCESS; switch (mechanism->cm_type) { - case AES_ECB_MECH_INFO_TYPE: - param_required = B_FALSE; - alloc_fun = ecb_alloc_ctx; - break; - case AES_CBC_MECH_INFO_TYPE: - param_len = AES_BLOCK_LEN; - alloc_fun = cbc_alloc_ctx; - break; - case AES_CTR_MECH_INFO_TYPE: - param_len = sizeof (CK_AES_CTR_PARAMS); - alloc_fun = ctr_alloc_ctx; - break; case AES_CCM_MECH_INFO_TYPE: param_len = sizeof (CK_AES_CCM_PARAMS); alloc_fun = ccm_alloc_ctx; @@ -201,13 +157,8 @@ aes_check_mech_param(crypto_mechanism_t *mechanism, aes_ctx_t **ctx) param_len = sizeof (CK_AES_GCM_PARAMS); alloc_fun = gcm_alloc_ctx; break; - case AES_GMAC_MECH_INFO_TYPE: - param_len = sizeof (CK_AES_GMAC_PARAMS); - alloc_fun = gmac_alloc_ctx; - break; default: - rv = CRYPTO_MECHANISM_INVALID; - return (rv); + __builtin_unreachable(); } if (param_required && mechanism->cm_param != NULL && mechanism->cm_param_len != param_len) { @@ -282,22 +233,6 @@ aes_common_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, return (CRYPTO_SUCCESS); } -static void -aes_copy_block64(uint8_t *in, uint64_t *out) -{ - if (IS_P2ALIGNED(in, sizeof (uint64_t))) { - /* LINTED: pointer alignment */ - out[0] = *(uint64_t *)&in[0]; - /* LINTED: pointer alignment */ - out[1] = *(uint64_t *)&in[8]; - } else { - uint8_t *iv8 = (uint8_t *)&out[0]; - - AES_COPY_BLOCK(in, iv8); - } -} - - static int aes_encrypt(crypto_ctx_t *ctx, crypto_data_t 
*plaintext, crypto_data_t *ciphertext) @@ -310,35 +245,21 @@ aes_encrypt(crypto_ctx_t *ctx, crypto_data_t *plaintext, ASSERT(ctx->cc_provider_private != NULL); aes_ctx = ctx->cc_provider_private; - /* - * For block ciphers, plaintext must be a multiple of AES block size. - * This test is only valid for ciphers whose blocksize is a power of 2. - */ - if (((aes_ctx->ac_flags & (CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE)) - == 0) && (plaintext->cd_length & (AES_BLOCK_LEN - 1)) != 0) - return (CRYPTO_DATA_LEN_RANGE); - ASSERT(ciphertext != NULL); /* * We need to just return the length needed to store the output. * We should not destroy the context for the following case. */ - switch (aes_ctx->ac_flags & (CCM_MODE|GCM_MODE|GMAC_MODE)) { + switch (aes_ctx->ac_flags & (CCM_MODE|GCM_MODE)) { case CCM_MODE: length_needed = plaintext->cd_length + aes_ctx->ac_mac_len; break; case GCM_MODE: length_needed = plaintext->cd_length + aes_ctx->ac_tag_len; break; - case GMAC_MODE: - if (plaintext->cd_length != 0) - return (CRYPTO_ARGUMENTS_BAD); - - length_needed = aes_ctx->ac_tag_len; - break; default: - length_needed = plaintext->cd_length; + __builtin_unreachable(); } if (ciphertext->cd_length < length_needed) { @@ -382,7 +303,7 @@ aes_encrypt(crypto_ctx_t *ctx, crypto_data_t *plaintext, ciphertext->cd_offset - saved_offset; } ciphertext->cd_offset = saved_offset; - } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) { + } else if (aes_ctx->ac_flags & GCM_MODE) { /* * gcm_encrypt_final() will compute the MAC and append * it to existing ciphertext. So, need to adjust the left over @@ -426,15 +347,6 @@ aes_decrypt(crypto_ctx_t *ctx, crypto_data_t *ciphertext, ASSERT(ctx->cc_provider_private != NULL); aes_ctx = ctx->cc_provider_private; - /* - * For block ciphers, plaintext must be a multiple of AES block size. - * This test is only valid for ciphers whose blocksize is a power of 2. 
- */ - if (((aes_ctx->ac_flags & (CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE)) - == 0) && (ciphertext->cd_length & (AES_BLOCK_LEN - 1)) != 0) { - return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); - } - ASSERT(plaintext != NULL); /* @@ -443,23 +355,16 @@ aes_decrypt(crypto_ctx_t *ctx, crypto_data_t *ciphertext, * * CCM: plaintext is MAC len smaller than cipher text * GCM: plaintext is TAG len smaller than cipher text - * GMAC: plaintext length must be zero */ - switch (aes_ctx->ac_flags & (CCM_MODE|GCM_MODE|GMAC_MODE)) { + switch (aes_ctx->ac_flags & (CCM_MODE|GCM_MODE)) { case CCM_MODE: length_needed = aes_ctx->ac_processed_data_len; break; case GCM_MODE: length_needed = ciphertext->cd_length - aes_ctx->ac_tag_len; break; - case GMAC_MODE: - if (plaintext->cd_length != 0) - return (CRYPTO_ARGUMENTS_BAD); - - length_needed = 0; - break; default: - length_needed = ciphertext->cd_length; + __builtin_unreachable(); } if (plaintext->cd_length < length_needed) { @@ -499,7 +404,7 @@ aes_decrypt(crypto_ctx_t *ctx, crypto_data_t *ciphertext, } plaintext->cd_offset = saved_offset; - } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) { + } else if (aes_ctx->ac_flags & GCM_MODE) { /* order of following 2 lines MUST not be reversed */ plaintext->cd_offset = plaintext->cd_length; plaintext->cd_length = saved_length - plaintext->cd_length; @@ -571,17 +476,6 @@ aes_encrypt_update(crypto_ctx_t *ctx, crypto_data_t *plaintext, ret = CRYPTO_ARGUMENTS_BAD; } - /* - * Since AES counter mode is a stream cipher, we call - * ctr_mode_final() to pick up any remaining bytes. - * It is an internal function that does not destroy - * the context like *normal* final routines. 
- */ - if ((aes_ctx->ac_flags & CTR_MODE) && (aes_ctx->ac_remainder_len > 0)) { - ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, - ciphertext, aes_encrypt_block); - } - if (ret == CRYPTO_SUCCESS) { if (plaintext != ciphertext) ciphertext->cd_length = @@ -600,32 +494,13 @@ aes_decrypt_update(crypto_ctx_t *ctx, crypto_data_t *ciphertext, crypto_data_t *plaintext) { off_t saved_offset; - size_t saved_length, out_len; + size_t saved_length; int ret = CRYPTO_SUCCESS; - aes_ctx_t *aes_ctx; ASSERT(ctx->cc_provider_private != NULL); - aes_ctx = ctx->cc_provider_private; ASSERT(plaintext != NULL); - /* - * Compute number of bytes that will hold the plaintext. - * This is not necessary for CCM, GCM, and GMAC since these - * mechanisms never return plaintext for update operations. - */ - if ((aes_ctx->ac_flags & (CCM_MODE|GCM_MODE|GMAC_MODE)) == 0) { - out_len = aes_ctx->ac_remainder_len; - out_len += ciphertext->cd_length; - out_len &= ~(AES_BLOCK_LEN - 1); - - /* return length needed to store the output */ - if (plaintext->cd_length < out_len) { - plaintext->cd_length = out_len; - return (CRYPTO_BUFFER_TOO_SMALL); - } - } - saved_offset = plaintext->cd_offset; saved_length = plaintext->cd_length; @@ -645,19 +520,6 @@ aes_decrypt_update(crypto_ctx_t *ctx, crypto_data_t *ciphertext, ret = CRYPTO_ARGUMENTS_BAD; } - /* - * Since AES counter mode is a stream cipher, we call - * ctr_mode_final() to pick up any remaining bytes. - * It is an internal function that does not destroy - * the context like *normal* final routines. 
- */ - if ((aes_ctx->ac_flags & CTR_MODE) && (aes_ctx->ac_remainder_len > 0)) { - ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, plaintext, - aes_encrypt_block); - if (ret == CRYPTO_DATA_LEN_RANGE) - ret = CRYPTO_ENCRYPTED_DATA_LEN_RANGE; - } - if (ret == CRYPTO_SUCCESS) { if (ciphertext != plaintext) plaintext->cd_length = @@ -685,20 +547,13 @@ aes_encrypt_final(crypto_ctx_t *ctx, crypto_data_t *data) return (CRYPTO_ARGUMENTS_BAD); } - if (aes_ctx->ac_flags & CTR_MODE) { - if (aes_ctx->ac_remainder_len > 0) { - ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, data, - aes_encrypt_block); - if (ret != CRYPTO_SUCCESS) - return (ret); - } - } else if (aes_ctx->ac_flags & CCM_MODE) { + if (aes_ctx->ac_flags & CCM_MODE) { ret = ccm_encrypt_final((ccm_ctx_t *)aes_ctx, data, AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); if (ret != CRYPTO_SUCCESS) { return (ret); } - } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) { + } else if (aes_ctx->ac_flags & GCM_MODE) { size_t saved_offset = data->cd_offset; ret = gcm_encrypt_final((gcm_ctx_t *)aes_ctx, data, @@ -709,16 +564,6 @@ aes_encrypt_final(crypto_ctx_t *ctx, crypto_data_t *data) } data->cd_length = data->cd_offset - saved_offset; data->cd_offset = saved_offset; - } else { - /* - * There must be no unprocessed plaintext. - * This happens if the length of the last data is - * not a multiple of the AES block length. - */ - if (aes_ctx->ac_remainder_len > 0) { - return (CRYPTO_DATA_LEN_RANGE); - } - data->cd_length = 0; } (void) aes_free_context(ctx); @@ -747,18 +592,8 @@ aes_decrypt_final(crypto_ctx_t *ctx, crypto_data_t *data) * This happens if the length of the last ciphertext is * not a multiple of the AES block length. 
*/ - if (aes_ctx->ac_remainder_len > 0) { - if ((aes_ctx->ac_flags & CTR_MODE) == 0) - return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); - else { - ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, data, - aes_encrypt_block); - if (ret == CRYPTO_DATA_LEN_RANGE) - ret = CRYPTO_ENCRYPTED_DATA_LEN_RANGE; - if (ret != CRYPTO_SUCCESS) - return (ret); - } - } + if (aes_ctx->ac_remainder_len > 0) + return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); if (aes_ctx->ac_flags & CCM_MODE) { /* @@ -788,7 +623,7 @@ aes_decrypt_final(crypto_ctx_t *ctx, crypto_data_t *data) if (ret != CRYPTO_SUCCESS) { return (ret); } - } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) { + } else if (aes_ctx->ac_flags & GCM_MODE) { /* * This is where all the plaintext is returned, make sure * the plaintext buffer is big enough @@ -818,10 +653,6 @@ aes_decrypt_final(crypto_ctx_t *ctx, crypto_data_t *data) } - if ((aes_ctx->ac_flags & (CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE)) == 0) { - data->cd_length = 0; - } - (void) aes_free_context(ctx); return (CRYPTO_SUCCESS); @@ -842,21 +673,6 @@ aes_encrypt_atomic(crypto_mechanism_t *mechanism, ASSERT(ciphertext != NULL); - /* - * CTR, CCM, GCM, and GMAC modes do not require that plaintext - * be a multiple of AES block size. 
- */ - switch (mechanism->cm_type) { - case AES_CTR_MECH_INFO_TYPE: - case AES_CCM_MECH_INFO_TYPE: - case AES_GCM_MECH_INFO_TYPE: - case AES_GMAC_MECH_INFO_TYPE: - break; - default: - if ((plaintext->cd_length & (AES_BLOCK_LEN - 1)) != 0) - return (CRYPTO_DATA_LEN_RANGE); - } - if ((ret = aes_check_mech_param(mechanism, NULL)) != CRYPTO_SUCCESS) return (ret); @@ -869,15 +685,11 @@ aes_encrypt_atomic(crypto_mechanism_t *mechanism, case AES_CCM_MECH_INFO_TYPE: length_needed = plaintext->cd_length + aes_ctx.ac_mac_len; break; - case AES_GMAC_MECH_INFO_TYPE: - if (plaintext->cd_length != 0) - return (CRYPTO_ARGUMENTS_BAD); - zfs_fallthrough; case AES_GCM_MECH_INFO_TYPE: length_needed = plaintext->cd_length + aes_ctx.ac_tag_len; break; default: - length_needed = plaintext->cd_length; + __builtin_unreachable(); } /* return size of buffer needed to store output */ @@ -914,21 +726,13 @@ aes_encrypt_atomic(crypto_mechanism_t *mechanism, if (ret != CRYPTO_SUCCESS) goto out; ASSERT(aes_ctx.ac_remainder_len == 0); - } else if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE || - mechanism->cm_type == AES_GMAC_MECH_INFO_TYPE) { + } else if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE) { ret = gcm_encrypt_final((gcm_ctx_t *)&aes_ctx, ciphertext, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); if (ret != CRYPTO_SUCCESS) goto out; ASSERT(aes_ctx.ac_remainder_len == 0); - } else if (mechanism->cm_type == AES_CTR_MECH_INFO_TYPE) { - if (aes_ctx.ac_remainder_len > 0) { - ret = ctr_mode_final((ctr_ctx_t *)&aes_ctx, - ciphertext, aes_encrypt_block); - if (ret != CRYPTO_SUCCESS) - goto out; - } } else { ASSERT(aes_ctx.ac_remainder_len == 0); } @@ -947,7 +751,7 @@ out: memset(aes_ctx.ac_keysched, 0, aes_ctx.ac_keysched_len); kmem_free(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len); } - if (aes_ctx.ac_flags & (GCM_MODE|GMAC_MODE)) { + if (aes_ctx.ac_flags & GCM_MODE) { gcm_clear_ctx((gcm_ctx_t *)&aes_ctx); } return (ret); @@ -968,21 +772,6 @@ 
aes_decrypt_atomic(crypto_mechanism_t *mechanism, ASSERT(plaintext != NULL); - /* - * CCM, GCM, CTR, and GMAC modes do not require that ciphertext - * be a multiple of AES block size. - */ - switch (mechanism->cm_type) { - case AES_CTR_MECH_INFO_TYPE: - case AES_CCM_MECH_INFO_TYPE: - case AES_GCM_MECH_INFO_TYPE: - case AES_GMAC_MECH_INFO_TYPE: - break; - default: - if ((ciphertext->cd_length & (AES_BLOCK_LEN - 1)) != 0) - return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); - } - if ((ret = aes_check_mech_param(mechanism, NULL)) != CRYPTO_SUCCESS) return (ret); @@ -998,13 +787,8 @@ aes_decrypt_atomic(crypto_mechanism_t *mechanism, case AES_GCM_MECH_INFO_TYPE: length_needed = ciphertext->cd_length - aes_ctx.ac_tag_len; break; - case AES_GMAC_MECH_INFO_TYPE: - if (plaintext->cd_length != 0) - return (CRYPTO_ARGUMENTS_BAD); - length_needed = 0; - break; default: - length_needed = ciphertext->cd_length; + __builtin_unreachable(); } /* return size of buffer needed to store output */ @@ -1050,8 +834,7 @@ aes_decrypt_atomic(crypto_mechanism_t *mechanism, } else { plaintext->cd_length = saved_length; } - } else if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE || - mechanism->cm_type == AES_GMAC_MECH_INFO_TYPE) { + } else if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE) { ret = gcm_decrypt_final((gcm_ctx_t *)&aes_ctx, plaintext, AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); @@ -1063,24 +846,8 @@ aes_decrypt_atomic(crypto_mechanism_t *mechanism, } else { plaintext->cd_length = saved_length; } - } else if (mechanism->cm_type != AES_CTR_MECH_INFO_TYPE) { - ASSERT(aes_ctx.ac_remainder_len == 0); - if (ciphertext != plaintext) - plaintext->cd_length = - plaintext->cd_offset - saved_offset; - } else { - if (aes_ctx.ac_remainder_len > 0) { - ret = ctr_mode_final((ctr_ctx_t *)&aes_ctx, - plaintext, aes_encrypt_block); - if (ret == CRYPTO_DATA_LEN_RANGE) - ret = CRYPTO_ENCRYPTED_DATA_LEN_RANGE; - if (ret != CRYPTO_SUCCESS) - goto out; - } - if (ciphertext != plaintext) - 
plaintext->cd_length = - plaintext->cd_offset - saved_offset; - } + } else + __builtin_unreachable(); } else { plaintext->cd_length = saved_length; } @@ -1096,7 +863,7 @@ out: if (aes_ctx.ac_pt_buf != NULL) { vmem_free(aes_ctx.ac_pt_buf, aes_ctx.ac_data_len); } - } else if (aes_ctx.ac_flags & (GCM_MODE|GMAC_MODE)) { + } else if (aes_ctx.ac_flags & GCM_MODE) { gcm_clear_ctx((gcm_ctx_t *)&aes_ctx); } @@ -1114,12 +881,8 @@ aes_create_ctx_template(crypto_mechanism_t *mechanism, crypto_key_t *key, size_t size; int rv; - if (mechanism->cm_type != AES_ECB_MECH_INFO_TYPE && - mechanism->cm_type != AES_CBC_MECH_INFO_TYPE && - mechanism->cm_type != AES_CTR_MECH_INFO_TYPE && - mechanism->cm_type != AES_CCM_MECH_INFO_TYPE && - mechanism->cm_type != AES_GCM_MECH_INFO_TYPE && - mechanism->cm_type != AES_GMAC_MECH_INFO_TYPE) + if (mechanism->cm_type != AES_CCM_MECH_INFO_TYPE && + mechanism->cm_type != AES_GCM_MECH_INFO_TYPE) return (CRYPTO_MECHANISM_INVALID); if ((keysched = aes_alloc_keysched(&size, KM_SLEEP)) == NULL) { @@ -1193,22 +956,6 @@ aes_common_init_ctx(aes_ctx_t *aes_ctx, crypto_spi_ctx_template_t *template, aes_ctx->ac_keysched = keysched; switch (mechanism->cm_type) { - case AES_CBC_MECH_INFO_TYPE: - rv = cbc_init_ctx((cbc_ctx_t *)aes_ctx, mechanism->cm_param, - mechanism->cm_param_len, AES_BLOCK_LEN, aes_copy_block64); - break; - case AES_CTR_MECH_INFO_TYPE: { - CK_AES_CTR_PARAMS *pp; - - if (mechanism->cm_param == NULL || - mechanism->cm_param_len != sizeof (CK_AES_CTR_PARAMS)) { - return (CRYPTO_MECHANISM_PARAM_INVALID); - } - pp = (CK_AES_CTR_PARAMS *)(void *)mechanism->cm_param; - rv = ctr_init_ctx((ctr_ctx_t *)aes_ctx, pp->ulCounterBits, - pp->cb, aes_copy_block); - break; - } case AES_CCM_MECH_INFO_TYPE: if (mechanism->cm_param == NULL || mechanism->cm_param_len != sizeof (CK_AES_CCM_PARAMS)) { @@ -1227,17 +974,6 @@ aes_common_init_ctx(aes_ctx_t *aes_ctx, crypto_spi_ctx_template_t *template, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); 
break; - case AES_GMAC_MECH_INFO_TYPE: - if (mechanism->cm_param == NULL || - mechanism->cm_param_len != sizeof (CK_AES_GMAC_PARAMS)) { - return (CRYPTO_MECHANISM_PARAM_INVALID); - } - rv = gmac_init_ctx((gcm_ctx_t *)aes_ctx, mechanism->cm_param, - AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, - aes_xor_block); - break; - case AES_ECB_MECH_INFO_TYPE: - aes_ctx->ac_flags |= ECB_MODE; } if (rv != CRYPTO_SUCCESS) { @@ -1249,75 +985,3 @@ aes_common_init_ctx(aes_ctx_t *aes_ctx, crypto_spi_ctx_template_t *template, return (rv); } - -static int -process_gmac_mech(crypto_mechanism_t *mech, crypto_data_t *data, - CK_AES_GCM_PARAMS *gcm_params) -{ - /* LINTED: pointer alignment */ - CK_AES_GMAC_PARAMS *params = (CK_AES_GMAC_PARAMS *)mech->cm_param; - - if (mech->cm_type != AES_GMAC_MECH_INFO_TYPE) - return (CRYPTO_MECHANISM_INVALID); - - if (mech->cm_param_len != sizeof (CK_AES_GMAC_PARAMS)) - return (CRYPTO_MECHANISM_PARAM_INVALID); - - if (params->pIv == NULL) - return (CRYPTO_MECHANISM_PARAM_INVALID); - - gcm_params->pIv = params->pIv; - gcm_params->ulIvLen = AES_GMAC_IV_LEN; - gcm_params->ulTagBits = AES_GMAC_TAG_BITS; - - if (data == NULL) - return (CRYPTO_SUCCESS); - - if (data->cd_format != CRYPTO_DATA_RAW) - return (CRYPTO_ARGUMENTS_BAD); - - gcm_params->pAAD = (uchar_t *)data->cd_raw.iov_base; - gcm_params->ulAADLen = data->cd_length; - return (CRYPTO_SUCCESS); -} - -static int -aes_mac_atomic(crypto_mechanism_t *mechanism, - crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac, - crypto_spi_ctx_template_t template) -{ - CK_AES_GCM_PARAMS gcm_params; - crypto_mechanism_t gcm_mech; - int rv; - - if ((rv = process_gmac_mech(mechanism, data, &gcm_params)) - != CRYPTO_SUCCESS) - return (rv); - - gcm_mech.cm_type = AES_GCM_MECH_INFO_TYPE; - gcm_mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); - gcm_mech.cm_param = (char *)&gcm_params; - - return (aes_encrypt_atomic(&gcm_mech, - key, &null_crypto_data, mac, template)); -} - -static int 
-aes_mac_verify_atomic(crypto_mechanism_t *mechanism, crypto_key_t *key, - crypto_data_t *data, crypto_data_t *mac, crypto_spi_ctx_template_t template) -{ - CK_AES_GCM_PARAMS gcm_params; - crypto_mechanism_t gcm_mech; - int rv; - - if ((rv = process_gmac_mech(mechanism, data, &gcm_params)) - != CRYPTO_SUCCESS) - return (rv); - - gcm_mech.cm_type = AES_GCM_MECH_INFO_TYPE; - gcm_mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); - gcm_mech.cm_param = (char *)&gcm_params; - - return (aes_decrypt_atomic(&gcm_mech, - key, mac, &null_crypto_data, template)); -} From 4ed91dc26e63cd18817f7bd91d1590dd6514394b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 18 May 2024 22:17:36 +1000 Subject: [PATCH 100/113] icp: remove unusued incremental cipher methods Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16209 --- module/icp/core/kcf_mech_tabs.c | 3 +- module/icp/include/sys/crypto/spi.h | 21 -- module/icp/io/aes.c | 493 +--------------------------- 3 files changed, 3 insertions(+), 514 deletions(-) diff --git a/module/icp/core/kcf_mech_tabs.c b/module/icp/core/kcf_mech_tabs.c index 41705e84bc..b6e6937698 100644 --- a/module/icp/core/kcf_mech_tabs.c +++ b/module/icp/core/kcf_mech_tabs.c @@ -222,8 +222,7 @@ kcf_add_mech_provider(short mech_indx, if (fg & CRYPTO_FG_DIGEST || fg & CRYPTO_FG_DIGEST_ATOMIC) class = KCF_DIGEST_CLASS; - else if (fg & CRYPTO_FG_ENCRYPT || fg & CRYPTO_FG_DECRYPT || - fg & CRYPTO_FG_ENCRYPT_ATOMIC || + else if (fg & CRYPTO_FG_ENCRYPT_ATOMIC || fg & CRYPTO_FG_DECRYPT_ATOMIC) class = KCF_CIPHER_CLASS; else if (fg & CRYPTO_FG_MAC || fg & CRYPTO_FG_MAC_ATOMIC) diff --git a/module/icp/include/sys/crypto/spi.h b/module/icp/include/sys/crypto/spi.h index 63dfce7957..9bcb62ac52 100644 --- a/module/icp/include/sys/crypto/spi.h +++ b/module/icp/include/sys/crypto/spi.h @@ -89,27 +89,8 @@ typedef struct crypto_digest_ops { * with the kernel using crypto_register_provider(9F). 
*/ typedef struct crypto_cipher_ops { - int (*encrypt_init)(crypto_ctx_t *, - crypto_mechanism_t *, crypto_key_t *, - crypto_spi_ctx_template_t); - int (*encrypt)(crypto_ctx_t *, - crypto_data_t *, crypto_data_t *); - int (*encrypt_update)(crypto_ctx_t *, - crypto_data_t *, crypto_data_t *); - int (*encrypt_final)(crypto_ctx_t *, - crypto_data_t *); int (*encrypt_atomic)(crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t); - - int (*decrypt_init)(crypto_ctx_t *, - crypto_mechanism_t *, crypto_key_t *, - crypto_spi_ctx_template_t); - int (*decrypt)(crypto_ctx_t *, - crypto_data_t *, crypto_data_t *); - int (*decrypt_update)(crypto_ctx_t *, - crypto_data_t *, crypto_data_t *); - int (*decrypt_final)(crypto_ctx_t *, - crypto_data_t *); int (*decrypt_atomic)(crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t); } __no_const crypto_cipher_ops_t; @@ -172,8 +153,6 @@ typedef struct crypto_ops { typedef uint32_t crypto_func_group_t; -#define CRYPTO_FG_ENCRYPT 0x00000001 /* encrypt_init() */ -#define CRYPTO_FG_DECRYPT 0x00000002 /* decrypt_init() */ #define CRYPTO_FG_DIGEST 0x00000004 /* digest_init() */ #define CRYPTO_FG_MAC 0x00001000 /* mac_init() */ #define CRYPTO_FG_ENCRYPT_ATOMIC 0x00008000 /* encrypt_atomic() */ diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c index a4ef171671..a68a878b6a 100644 --- a/module/icp/io/aes.c +++ b/module/icp/io/aes.c @@ -42,47 +42,23 @@ static const crypto_mech_info_t aes_mech_info_tab[] = { /* AES_CCM */ {SUN_CKM_AES_CCM, AES_CCM_MECH_INFO_TYPE, - CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | - CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC}, + CRYPTO_FG_ENCRYPT_ATOMIC | CRYPTO_FG_DECRYPT_ATOMIC}, /* AES_GCM */ {SUN_CKM_AES_GCM, AES_GCM_MECH_INFO_TYPE, - CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | - CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC}, + CRYPTO_FG_ENCRYPT_ATOMIC | CRYPTO_FG_DECRYPT_ATOMIC}, }; -static int 
aes_encrypt_init(crypto_ctx_t *, crypto_mechanism_t *, - crypto_key_t *, crypto_spi_ctx_template_t); -static int aes_decrypt_init(crypto_ctx_t *, crypto_mechanism_t *, - crypto_key_t *, crypto_spi_ctx_template_t); -static int aes_common_init(crypto_ctx_t *, crypto_mechanism_t *, - crypto_key_t *, crypto_spi_ctx_template_t, boolean_t); static int aes_common_init_ctx(aes_ctx_t *, crypto_spi_ctx_template_t *, crypto_mechanism_t *, crypto_key_t *, int, boolean_t); -static int aes_encrypt_final(crypto_ctx_t *, crypto_data_t *); -static int aes_decrypt_final(crypto_ctx_t *, crypto_data_t *); -static int aes_encrypt(crypto_ctx_t *, crypto_data_t *, crypto_data_t *); -static int aes_encrypt_update(crypto_ctx_t *, crypto_data_t *, - crypto_data_t *); static int aes_encrypt_atomic(crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t); -static int aes_decrypt(crypto_ctx_t *, crypto_data_t *, crypto_data_t *); -static int aes_decrypt_update(crypto_ctx_t *, crypto_data_t *, - crypto_data_t *); static int aes_decrypt_atomic(crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t); static const crypto_cipher_ops_t aes_cipher_ops = { - .encrypt_init = aes_encrypt_init, - .encrypt = aes_encrypt, - .encrypt_update = aes_encrypt_update, - .encrypt_final = aes_encrypt_final, .encrypt_atomic = aes_encrypt_atomic, - .decrypt_init = aes_decrypt_init, - .decrypt = aes_decrypt, - .decrypt_update = aes_decrypt_update, - .decrypt_final = aes_decrypt_final, .decrypt_atomic = aes_decrypt_atomic }; @@ -190,474 +166,9 @@ init_keysched(crypto_key_t *key, void *newbie) return (CRYPTO_SUCCESS); } -static int -aes_encrypt_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, - crypto_key_t *key, crypto_spi_ctx_template_t template) -{ - return (aes_common_init(ctx, mechanism, key, template, B_TRUE)); -} - -static int -aes_decrypt_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, - crypto_key_t *key, 
crypto_spi_ctx_template_t template) -{ - return (aes_common_init(ctx, mechanism, key, template, B_FALSE)); -} - - - /* * KCF software provider encrypt entry points. */ -static int -aes_common_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, - crypto_key_t *key, crypto_spi_ctx_template_t template, - boolean_t is_encrypt_init) -{ - aes_ctx_t *aes_ctx; - int rv; - - if ((rv = aes_check_mech_param(mechanism, &aes_ctx)) - != CRYPTO_SUCCESS) - return (rv); - - rv = aes_common_init_ctx(aes_ctx, template, mechanism, key, KM_SLEEP, - is_encrypt_init); - if (rv != CRYPTO_SUCCESS) { - crypto_free_mode_ctx(aes_ctx); - return (rv); - } - - ctx->cc_provider_private = aes_ctx; - - return (CRYPTO_SUCCESS); -} - -static int -aes_encrypt(crypto_ctx_t *ctx, crypto_data_t *plaintext, - crypto_data_t *ciphertext) -{ - int ret = CRYPTO_FAILED; - - aes_ctx_t *aes_ctx; - size_t saved_length, saved_offset, length_needed; - - ASSERT(ctx->cc_provider_private != NULL); - aes_ctx = ctx->cc_provider_private; - - ASSERT(ciphertext != NULL); - - /* - * We need to just return the length needed to store the output. - * We should not destroy the context for the following case. - */ - switch (aes_ctx->ac_flags & (CCM_MODE|GCM_MODE)) { - case CCM_MODE: - length_needed = plaintext->cd_length + aes_ctx->ac_mac_len; - break; - case GCM_MODE: - length_needed = plaintext->cd_length + aes_ctx->ac_tag_len; - break; - default: - __builtin_unreachable(); - } - - if (ciphertext->cd_length < length_needed) { - ciphertext->cd_length = length_needed; - return (CRYPTO_BUFFER_TOO_SMALL); - } - - saved_length = ciphertext->cd_length; - saved_offset = ciphertext->cd_offset; - - /* - * Do an update on the specified input data. 
- */ - ret = aes_encrypt_update(ctx, plaintext, ciphertext); - if (ret != CRYPTO_SUCCESS) { - return (ret); - } - - /* - * For CCM mode, aes_ccm_encrypt_final() will take care of any - * left-over unprocessed data, and compute the MAC - */ - if (aes_ctx->ac_flags & CCM_MODE) { - /* - * ccm_encrypt_final() will compute the MAC and append - * it to existing ciphertext. So, need to adjust the left over - * length value accordingly - */ - - /* order of following 2 lines MUST not be reversed */ - ciphertext->cd_offset = ciphertext->cd_length; - ciphertext->cd_length = saved_length - ciphertext->cd_length; - ret = ccm_encrypt_final((ccm_ctx_t *)aes_ctx, ciphertext, - AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); - if (ret != CRYPTO_SUCCESS) { - return (ret); - } - - if (plaintext != ciphertext) { - ciphertext->cd_length = - ciphertext->cd_offset - saved_offset; - } - ciphertext->cd_offset = saved_offset; - } else if (aes_ctx->ac_flags & GCM_MODE) { - /* - * gcm_encrypt_final() will compute the MAC and append - * it to existing ciphertext. 
So, need to adjust the left over - * length value accordingly - */ - - /* order of following 2 lines MUST not be reversed */ - ciphertext->cd_offset = ciphertext->cd_length; - ciphertext->cd_length = saved_length - ciphertext->cd_length; - ret = gcm_encrypt_final((gcm_ctx_t *)aes_ctx, ciphertext, - AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, - aes_xor_block); - if (ret != CRYPTO_SUCCESS) { - return (ret); - } - - if (plaintext != ciphertext) { - ciphertext->cd_length = - ciphertext->cd_offset - saved_offset; - } - ciphertext->cd_offset = saved_offset; - } - - ASSERT(aes_ctx->ac_remainder_len == 0); - (void) aes_free_context(ctx); - - return (ret); -} - - -static int -aes_decrypt(crypto_ctx_t *ctx, crypto_data_t *ciphertext, - crypto_data_t *plaintext) -{ - int ret = CRYPTO_FAILED; - - aes_ctx_t *aes_ctx; - off_t saved_offset; - size_t saved_length, length_needed; - - ASSERT(ctx->cc_provider_private != NULL); - aes_ctx = ctx->cc_provider_private; - - ASSERT(plaintext != NULL); - - /* - * Return length needed to store the output. - * Do not destroy context when plaintext buffer is too small. - * - * CCM: plaintext is MAC len smaller than cipher text - * GCM: plaintext is TAG len smaller than cipher text - */ - switch (aes_ctx->ac_flags & (CCM_MODE|GCM_MODE)) { - case CCM_MODE: - length_needed = aes_ctx->ac_processed_data_len; - break; - case GCM_MODE: - length_needed = ciphertext->cd_length - aes_ctx->ac_tag_len; - break; - default: - __builtin_unreachable(); - } - - if (plaintext->cd_length < length_needed) { - plaintext->cd_length = length_needed; - return (CRYPTO_BUFFER_TOO_SMALL); - } - - saved_offset = plaintext->cd_offset; - saved_length = plaintext->cd_length; - - /* - * Do an update on the specified input data. 
- */ - ret = aes_decrypt_update(ctx, ciphertext, plaintext); - if (ret != CRYPTO_SUCCESS) { - goto cleanup; - } - - if (aes_ctx->ac_flags & CCM_MODE) { - ASSERT(aes_ctx->ac_processed_data_len == aes_ctx->ac_data_len); - ASSERT(aes_ctx->ac_processed_mac_len == aes_ctx->ac_mac_len); - - /* order of following 2 lines MUST not be reversed */ - plaintext->cd_offset = plaintext->cd_length; - plaintext->cd_length = saved_length - plaintext->cd_length; - - ret = ccm_decrypt_final((ccm_ctx_t *)aes_ctx, plaintext, - AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, - aes_xor_block); - if (ret == CRYPTO_SUCCESS) { - if (plaintext != ciphertext) { - plaintext->cd_length = - plaintext->cd_offset - saved_offset; - } - } else { - plaintext->cd_length = saved_length; - } - - plaintext->cd_offset = saved_offset; - } else if (aes_ctx->ac_flags & GCM_MODE) { - /* order of following 2 lines MUST not be reversed */ - plaintext->cd_offset = plaintext->cd_length; - plaintext->cd_length = saved_length - plaintext->cd_length; - - ret = gcm_decrypt_final((gcm_ctx_t *)aes_ctx, plaintext, - AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); - if (ret == CRYPTO_SUCCESS) { - if (plaintext != ciphertext) { - plaintext->cd_length = - plaintext->cd_offset - saved_offset; - } - } else { - plaintext->cd_length = saved_length; - } - - plaintext->cd_offset = saved_offset; - } - - ASSERT(aes_ctx->ac_remainder_len == 0); - -cleanup: - (void) aes_free_context(ctx); - - return (ret); -} - - -static int -aes_encrypt_update(crypto_ctx_t *ctx, crypto_data_t *plaintext, - crypto_data_t *ciphertext) -{ - off_t saved_offset; - size_t saved_length, out_len; - int ret = CRYPTO_SUCCESS; - aes_ctx_t *aes_ctx; - - ASSERT(ctx->cc_provider_private != NULL); - aes_ctx = ctx->cc_provider_private; - - ASSERT(ciphertext != NULL); - - /* compute number of bytes that will hold the ciphertext */ - out_len = aes_ctx->ac_remainder_len; - out_len += plaintext->cd_length; - out_len &= ~(AES_BLOCK_LEN - 1); - - /* return length 
needed to store the output */ - if (ciphertext->cd_length < out_len) { - ciphertext->cd_length = out_len; - return (CRYPTO_BUFFER_TOO_SMALL); - } - - saved_offset = ciphertext->cd_offset; - saved_length = ciphertext->cd_length; - - /* - * Do the AES update on the specified input data. - */ - switch (plaintext->cd_format) { - case CRYPTO_DATA_RAW: - ret = crypto_update_iov(ctx->cc_provider_private, - plaintext, ciphertext, aes_encrypt_contiguous_blocks); - break; - case CRYPTO_DATA_UIO: - ret = crypto_update_uio(ctx->cc_provider_private, - plaintext, ciphertext, aes_encrypt_contiguous_blocks); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - if (ret == CRYPTO_SUCCESS) { - if (plaintext != ciphertext) - ciphertext->cd_length = - ciphertext->cd_offset - saved_offset; - } else { - ciphertext->cd_length = saved_length; - } - ciphertext->cd_offset = saved_offset; - - return (ret); -} - - -static int -aes_decrypt_update(crypto_ctx_t *ctx, crypto_data_t *ciphertext, - crypto_data_t *plaintext) -{ - off_t saved_offset; - size_t saved_length; - int ret = CRYPTO_SUCCESS; - - ASSERT(ctx->cc_provider_private != NULL); - - ASSERT(plaintext != NULL); - - saved_offset = plaintext->cd_offset; - saved_length = plaintext->cd_length; - - /* - * Do the AES update on the specified input data. 
- */ - switch (ciphertext->cd_format) { - case CRYPTO_DATA_RAW: - ret = crypto_update_iov(ctx->cc_provider_private, - ciphertext, plaintext, aes_decrypt_contiguous_blocks); - break; - case CRYPTO_DATA_UIO: - ret = crypto_update_uio(ctx->cc_provider_private, - ciphertext, plaintext, aes_decrypt_contiguous_blocks); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - if (ret == CRYPTO_SUCCESS) { - if (ciphertext != plaintext) - plaintext->cd_length = - plaintext->cd_offset - saved_offset; - } else { - plaintext->cd_length = saved_length; - } - plaintext->cd_offset = saved_offset; - - - return (ret); -} - -static int -aes_encrypt_final(crypto_ctx_t *ctx, crypto_data_t *data) -{ - aes_ctx_t *aes_ctx; - int ret; - - ASSERT(ctx->cc_provider_private != NULL); - aes_ctx = ctx->cc_provider_private; - - if (data->cd_format != CRYPTO_DATA_RAW && - data->cd_format != CRYPTO_DATA_UIO) { - return (CRYPTO_ARGUMENTS_BAD); - } - - if (aes_ctx->ac_flags & CCM_MODE) { - ret = ccm_encrypt_final((ccm_ctx_t *)aes_ctx, data, - AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); - if (ret != CRYPTO_SUCCESS) { - return (ret); - } - } else if (aes_ctx->ac_flags & GCM_MODE) { - size_t saved_offset = data->cd_offset; - - ret = gcm_encrypt_final((gcm_ctx_t *)aes_ctx, data, - AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, - aes_xor_block); - if (ret != CRYPTO_SUCCESS) { - return (ret); - } - data->cd_length = data->cd_offset - saved_offset; - data->cd_offset = saved_offset; - } - - (void) aes_free_context(ctx); - - return (CRYPTO_SUCCESS); -} - -static int -aes_decrypt_final(crypto_ctx_t *ctx, crypto_data_t *data) -{ - aes_ctx_t *aes_ctx; - int ret; - off_t saved_offset; - size_t saved_length; - - ASSERT(ctx->cc_provider_private != NULL); - aes_ctx = ctx->cc_provider_private; - - if (data->cd_format != CRYPTO_DATA_RAW && - data->cd_format != CRYPTO_DATA_UIO) { - return (CRYPTO_ARGUMENTS_BAD); - } - - /* - * There must be no unprocessed ciphertext. 
- * This happens if the length of the last ciphertext is - * not a multiple of the AES block length. - */ - if (aes_ctx->ac_remainder_len > 0) - return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); - - if (aes_ctx->ac_flags & CCM_MODE) { - /* - * This is where all the plaintext is returned, make sure - * the plaintext buffer is big enough - */ - size_t pt_len = aes_ctx->ac_data_len; - if (data->cd_length < pt_len) { - data->cd_length = pt_len; - return (CRYPTO_BUFFER_TOO_SMALL); - } - - ASSERT(aes_ctx->ac_processed_data_len == pt_len); - ASSERT(aes_ctx->ac_processed_mac_len == aes_ctx->ac_mac_len); - saved_offset = data->cd_offset; - saved_length = data->cd_length; - ret = ccm_decrypt_final((ccm_ctx_t *)aes_ctx, data, - AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, - aes_xor_block); - if (ret == CRYPTO_SUCCESS) { - data->cd_length = data->cd_offset - saved_offset; - } else { - data->cd_length = saved_length; - } - - data->cd_offset = saved_offset; - if (ret != CRYPTO_SUCCESS) { - return (ret); - } - } else if (aes_ctx->ac_flags & GCM_MODE) { - /* - * This is where all the plaintext is returned, make sure - * the plaintext buffer is big enough - */ - gcm_ctx_t *ctx = (gcm_ctx_t *)aes_ctx; - size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; - - if (data->cd_length < pt_len) { - data->cd_length = pt_len; - return (CRYPTO_BUFFER_TOO_SMALL); - } - - saved_offset = data->cd_offset; - saved_length = data->cd_length; - ret = gcm_decrypt_final((gcm_ctx_t *)aes_ctx, data, - AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); - if (ret == CRYPTO_SUCCESS) { - data->cd_length = data->cd_offset - saved_offset; - } else { - data->cd_length = saved_length; - } - - data->cd_offset = saved_offset; - if (ret != CRYPTO_SUCCESS) { - return (ret); - } - } - - - (void) aes_free_context(ctx); - - return (CRYPTO_SUCCESS); -} - static int aes_encrypt_atomic(crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_data_t *plaintext, crypto_data_t *ciphertext, From 
94f1e56e412909cf76b9acf799f5154a08d50a2f Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 19 May 2024 12:24:35 +1000 Subject: [PATCH 101/113] icp: remove unused KCF_ macros Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16209 --- module/icp/include/sys/crypto/impl.h | 37 ---------------------------- 1 file changed, 37 deletions(-) diff --git a/module/icp/include/sys/crypto/impl.h b/module/icp/include/sys/crypto/impl.h index 4d17221ea9..f6b2e519f0 100644 --- a/module/icp/include/sys/crypto/impl.h +++ b/module/icp/include/sys/crypto/impl.h @@ -187,28 +187,6 @@ typedef struct kcf_mech_entry { avl_node_t me_node; } kcf_mech_entry_t; -/* - * If a component has a reference to a kcf_policy_desc_t, - * it REFHOLD()s. A new policy descriptor which is referenced only - * by the policy table has a reference count of one. - */ -#define KCF_POLICY_REFHOLD(desc) { \ - int newval = atomic_add_32_nv(&(desc)->pd_refcnt, 1); \ - ASSERT(newval != 0); \ -} - -/* - * Releases a reference to a policy descriptor. When the last - * reference is released, the descriptor is freed. - */ -#define KCF_POLICY_REFRELE(desc) { \ - membar_producer(); \ - int newval = atomic_add_32_nv(&(desc)->pd_refcnt, -1); \ - ASSERT(newval != -1); \ - if (newval == 0) \ - kcf_policy_free_desc(desc); \ -} - /* * Global tables. The sizes are from the predefined PKCS#11 v2.20 mechanisms, * with a margin of few extra empty entry points @@ -275,29 +253,14 @@ extern const kcf_mech_entry_tab_t kcf_mech_tabs_tab[]; * of type kcf_prov_desc_t. */ -#define KCF_PROV_DIGEST_OPS(pd) ((pd)->pd_ops_vector->co_digest_ops) #define KCF_PROV_CIPHER_OPS(pd) ((pd)->pd_ops_vector->co_cipher_ops) #define KCF_PROV_MAC_OPS(pd) ((pd)->pd_ops_vector->co_mac_ops) #define KCF_PROV_CTX_OPS(pd) ((pd)->pd_ops_vector->co_ctx_ops) -/* - * Wrappers for crypto_digest_ops(9S) entry points. 
- */ - -#define KCF_PROV_DIGEST_INIT(pd, ctx, mech) ( \ - (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_init) ? \ - KCF_PROV_DIGEST_OPS(pd)->digest_init(ctx, mech) : \ - CRYPTO_NOT_SUPPORTED) - /* * Wrappers for crypto_cipher_ops(9S) entry points. */ -#define KCF_PROV_ENCRYPT_INIT(pd, ctx, mech, key, template) ( \ - (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt_init) ? \ - KCF_PROV_CIPHER_OPS(pd)->encrypt_init(ctx, mech, key, template) : \ - CRYPTO_NOT_SUPPORTED) - #define KCF_PROV_ENCRYPT_ATOMIC(pd, mech, key, plaintext, ciphertext, \ template) ( \ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt_atomic) ? \ From 1291c46ea4baf3f8807cf533edbdbd4999f6759e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 19 May 2024 12:58:56 +1000 Subject: [PATCH 102/113] icp: remove digest entry points For whatever reason, we call digest mechanisms directly, not through the KCF digest provider. So we can remove those entry points entirely. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16209 --- include/sys/skein.h | 9 - module/icp/core/kcf_mech_tabs.c | 7 +- module/icp/include/sys/crypto/impl.h | 7 +- module/icp/include/sys/crypto/spi.h | 19 -- module/icp/io/aes.c | 1 - module/icp/io/sha2_mod.c | 286 +-------------------------- module/icp/io/skein_mod.c | 143 +------------- 7 files changed, 8 insertions(+), 464 deletions(-) diff --git a/include/sys/skein.h b/include/sys/skein.h index 2f649d6b26..3359d48af7 100644 --- a/include/sys/skein.h +++ b/include/sys/skein.h @@ -152,25 +152,16 @@ typedef struct skein_param { /* Module definitions */ #ifdef SKEIN_MODULE_IMPL -#define CKM_SKEIN_256 "CKM_SKEIN_256" -#define CKM_SKEIN_512 "CKM_SKEIN_512" -#define CKM_SKEIN1024 "CKM_SKEIN1024" #define CKM_SKEIN_256_MAC "CKM_SKEIN_256_MAC" #define CKM_SKEIN_512_MAC "CKM_SKEIN_512_MAC" #define CKM_SKEIN1024_MAC "CKM_SKEIN1024_MAC" typedef enum skein_mech_type { - SKEIN_256_MECH_INFO_TYPE, - SKEIN_512_MECH_INFO_TYPE, - SKEIN1024_MECH_INFO_TYPE, SKEIN_256_MAC_MECH_INFO_TYPE, SKEIN_512_MAC_MECH_INFO_TYPE, SKEIN1024_MAC_MECH_INFO_TYPE } skein_mech_type_t; -#define VALID_SKEIN_DIGEST_MECH(__mech) \ - ((int)(__mech) >= SKEIN_256_MECH_INFO_TYPE && \ - (__mech) <= SKEIN1024_MECH_INFO_TYPE) #define VALID_SKEIN_MAC_MECH(__mech) \ ((int)(__mech) >= SKEIN_256_MAC_MECH_INFO_TYPE && \ (__mech) <= SKEIN1024_MAC_MECH_INFO_TYPE) diff --git a/module/icp/core/kcf_mech_tabs.c b/module/icp/core/kcf_mech_tabs.c index b6e6937698..a1e95847d0 100644 --- a/module/icp/core/kcf_mech_tabs.c +++ b/module/icp/core/kcf_mech_tabs.c @@ -41,7 +41,6 @@ * mech_index is the index for that mechanism in the table. * A mechanism belongs to exactly 1 table. * The tables are: - * . digest_mechs_tab[] for the msg digest mechs. * . cipher_mechs_tab[] for encrypt/decrypt and wrap/unwrap mechs. * . mac_mechs_tab[] for MAC mechs. * . sign_mechs_tab[] for sign & verify mechs. 
@@ -75,13 +74,11 @@ /* RFE 4687834 Will deal with the extensibility of these tables later */ -static kcf_mech_entry_t kcf_digest_mechs_tab[KCF_MAXDIGEST]; static kcf_mech_entry_t kcf_cipher_mechs_tab[KCF_MAXCIPHER]; static kcf_mech_entry_t kcf_mac_mechs_tab[KCF_MAXMAC]; const kcf_mech_entry_tab_t kcf_mech_tabs_tab[KCF_LAST_OPSCLASS + 1] = { {0, NULL}, /* No class zero */ - {KCF_MAXDIGEST, kcf_digest_mechs_tab}, {KCF_MAXCIPHER, kcf_cipher_mechs_tab}, {KCF_MAXMAC, kcf_mac_mechs_tab}, }; @@ -220,9 +217,7 @@ kcf_add_mech_provider(short mech_indx, crypto_func_group_t fg = mech_info->cm_func_group_mask; kcf_ops_class_t class; - if (fg & CRYPTO_FG_DIGEST || fg & CRYPTO_FG_DIGEST_ATOMIC) - class = KCF_DIGEST_CLASS; - else if (fg & CRYPTO_FG_ENCRYPT_ATOMIC || + if (fg & CRYPTO_FG_ENCRYPT_ATOMIC || fg & CRYPTO_FG_DECRYPT_ATOMIC) class = KCF_CIPHER_CLASS; else if (fg & CRYPTO_FG_MAC || fg & CRYPTO_FG_MAC_ATOMIC) diff --git a/module/icp/include/sys/crypto/impl.h b/module/icp/include/sys/crypto/impl.h index f6b2e519f0..0f5ef58ac0 100644 --- a/module/icp/include/sys/crypto/impl.h +++ b/module/icp/include/sys/crypto/impl.h @@ -55,7 +55,7 @@ extern "C" { * When impl.h is broken up (bug# 4703218), this will be done. For now, * we hardcode these values. 
*/ -#define KCF_OPS_CLASSSIZE 4 +#define KCF_OPS_CLASSSIZE 3 #define KCF_MAXMECHTAB 32 /* @@ -200,12 +200,11 @@ _Static_assert(KCF_MAXCIPHER == KCF_MAXMECHTAB, "KCF_MAXCIPHER != KCF_MAXMECHTAB"); /* See KCF_MAXMECHTAB comment */ typedef enum { - KCF_DIGEST_CLASS = 1, - KCF_CIPHER_CLASS, + KCF_CIPHER_CLASS = 1, KCF_MAC_CLASS, } kcf_ops_class_t; -#define KCF_FIRST_OPSCLASS KCF_DIGEST_CLASS +#define KCF_FIRST_OPSCLASS KCF_CIPHER_CLASS #define KCF_LAST_OPSCLASS KCF_MAC_CLASS _Static_assert( KCF_OPS_CLASSSIZE == (KCF_LAST_OPSCLASS - KCF_FIRST_OPSCLASS + 2), diff --git a/module/icp/include/sys/crypto/spi.h b/module/icp/include/sys/crypto/spi.h index 9bcb62ac52..e9be7e0c54 100644 --- a/module/icp/include/sys/crypto/spi.h +++ b/module/icp/include/sys/crypto/spi.h @@ -66,22 +66,6 @@ typedef struct crypto_ctx { void *cc_framework_private; /* owned by framework */ } crypto_ctx_t; -/* - * The crypto_digest_ops structure contains pointers to digest - * operations for cryptographic providers. It is passed through - * the crypto_ops(9S) structure when providers register with the - * kernel using crypto_register_provider(9F). - */ -typedef struct crypto_digest_ops { - int (*digest_init)(crypto_ctx_t *, crypto_mechanism_t *); - int (*digest)(crypto_ctx_t *, crypto_data_t *, crypto_data_t *); - int (*digest_update)(crypto_ctx_t *, crypto_data_t *); - int (*digest_key)(crypto_ctx_t *, crypto_key_t *); - int (*digest_final)(crypto_ctx_t *, crypto_data_t *); - int (*digest_atomic)(crypto_mechanism_t *, crypto_data_t *, - crypto_data_t *); -} __no_const crypto_digest_ops_t; - /* * The crypto_cipher_ops structure contains pointers to encryption * and decryption operations for cryptographic providers. It is @@ -137,7 +121,6 @@ typedef struct crypto_ctx_ops { * by calling crypto_register_provider(9F). 
*/ typedef struct crypto_ops { - const crypto_digest_ops_t *co_digest_ops; const crypto_cipher_ops_t *co_cipher_ops; const crypto_mac_ops_t *co_mac_ops; const crypto_ctx_ops_t *co_ctx_ops; @@ -153,12 +136,10 @@ typedef struct crypto_ops { typedef uint32_t crypto_func_group_t; -#define CRYPTO_FG_DIGEST 0x00000004 /* digest_init() */ #define CRYPTO_FG_MAC 0x00001000 /* mac_init() */ #define CRYPTO_FG_ENCRYPT_ATOMIC 0x00008000 /* encrypt_atomic() */ #define CRYPTO_FG_DECRYPT_ATOMIC 0x00010000 /* decrypt_atomic() */ #define CRYPTO_FG_MAC_ATOMIC 0x00020000 /* mac_atomic() */ -#define CRYPTO_FG_DIGEST_ATOMIC 0x00040000 /* digest_atomic() */ /* * Maximum length of the pi_provider_description field of the diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c index a68a878b6a..8ee2d036c1 100644 --- a/module/icp/io/aes.c +++ b/module/icp/io/aes.c @@ -72,7 +72,6 @@ static const crypto_ctx_ops_t aes_ctx_ops = { }; static const crypto_ops_t aes_crypto_ops = { - NULL, &aes_cipher_ops, NULL, &aes_ctx_ops, diff --git a/module/icp/io/sha2_mod.c b/module/icp/io/sha2_mod.c index f068951b07..c8e3b4fccd 100644 --- a/module/icp/io/sha2_mod.c +++ b/module/icp/io/sha2_mod.c @@ -61,8 +61,7 @@ */ static const crypto_mech_info_t sha2_mech_info_tab[] = { /* SHA256 */ - {SUN_CKM_SHA256, SHA256_MECH_INFO_TYPE, - CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC}, + {SUN_CKM_SHA256, SHA256_MECH_INFO_TYPE, 0}, /* SHA256-HMAC */ {SUN_CKM_SHA256_HMAC, SHA256_HMAC_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, @@ -70,8 +69,7 @@ static const crypto_mech_info_t sha2_mech_info_tab[] = { {SUN_CKM_SHA256_HMAC_GENERAL, SHA256_HMAC_GEN_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, /* SHA384 */ - {SUN_CKM_SHA384, SHA384_MECH_INFO_TYPE, - CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC}, + {SUN_CKM_SHA384, SHA384_MECH_INFO_TYPE, 0}, /* SHA384-HMAC */ {SUN_CKM_SHA384_HMAC, SHA384_HMAC_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, @@ -79,8 +77,7 @@ static const crypto_mech_info_t 
sha2_mech_info_tab[] = { {SUN_CKM_SHA384_HMAC_GENERAL, SHA384_HMAC_GEN_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, /* SHA512 */ - {SUN_CKM_SHA512, SHA512_MECH_INFO_TYPE, - CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC}, + {SUN_CKM_SHA512, SHA512_MECH_INFO_TYPE, 0}, /* SHA512-HMAC */ {SUN_CKM_SHA512_HMAC, SHA512_HMAC_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, @@ -89,21 +86,6 @@ static const crypto_mech_info_t sha2_mech_info_tab[] = { CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, }; -static int sha2_digest_init(crypto_ctx_t *, crypto_mechanism_t *); -static int sha2_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *); -static int sha2_digest_update(crypto_ctx_t *, crypto_data_t *); -static int sha2_digest_final(crypto_ctx_t *, crypto_data_t *); -static int sha2_digest_atomic(crypto_mechanism_t *, crypto_data_t *, - crypto_data_t *); - -static const crypto_digest_ops_t sha2_digest_ops = { - .digest_init = sha2_digest_init, - .digest = sha2_digest, - .digest_update = sha2_digest_update, - .digest_final = sha2_digest_final, - .digest_atomic = sha2_digest_atomic -}; - static int sha2_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t); static int sha2_mac_update(crypto_ctx_t *, crypto_data_t *); @@ -132,7 +114,6 @@ static const crypto_ctx_ops_t sha2_ctx_ops = { }; static const crypto_ops_t sha2_crypto_ops = { - &sha2_digest_ops, NULL, &sha2_mac_ops, &sha2_ctx_ops, @@ -184,27 +165,6 @@ sha2_mod_fini(void) return (ret); } -/* - * KCF software provider digest entry points. - */ - -static int -sha2_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism) -{ - - /* - * Allocate and initialize SHA2 context. 
- */ - ctx->cc_provider_private = kmem_alloc(sizeof (sha2_ctx_t), KM_SLEEP); - if (ctx->cc_provider_private == NULL) - return (CRYPTO_HOST_MEMORY); - - PROV_SHA2_CTX(ctx)->sc_mech_type = mechanism->cm_type; - SHA2Init(mechanism->cm_type, &PROV_SHA2_CTX(ctx)->sc_sha2_ctx); - - return (CRYPTO_SUCCESS); -} - /* * Helper SHA2 digest update function for uio data. */ @@ -360,246 +320,6 @@ sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest, return (CRYPTO_SUCCESS); } -static int -sha2_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest) -{ - int ret = CRYPTO_SUCCESS; - uint_t sha_digest_len; - - ASSERT(ctx->cc_provider_private != NULL); - - switch (PROV_SHA2_CTX(ctx)->sc_mech_type) { - case SHA256_MECH_INFO_TYPE: - sha_digest_len = SHA256_DIGEST_LENGTH; - break; - case SHA384_MECH_INFO_TYPE: - sha_digest_len = SHA384_DIGEST_LENGTH; - break; - case SHA512_MECH_INFO_TYPE: - sha_digest_len = SHA512_DIGEST_LENGTH; - break; - default: - return (CRYPTO_MECHANISM_INVALID); - } - - /* - * We need to just return the length needed to store the output. - * We should not destroy the context for the following cases. - */ - if ((digest->cd_length == 0) || - (digest->cd_length < sha_digest_len)) { - digest->cd_length = sha_digest_len; - return (CRYPTO_BUFFER_TOO_SMALL); - } - - /* - * Do the SHA2 update on the specified input data. 
- */ - switch (data->cd_format) { - case CRYPTO_DATA_RAW: - SHA2Update(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, - (uint8_t *)data->cd_raw.iov_base + data->cd_offset, - data->cd_length); - break; - case CRYPTO_DATA_UIO: - ret = sha2_digest_update_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, - data); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - if (ret != CRYPTO_SUCCESS) { - /* the update failed, free context and bail */ - kmem_free(ctx->cc_provider_private, sizeof (sha2_ctx_t)); - ctx->cc_provider_private = NULL; - digest->cd_length = 0; - return (ret); - } - - /* - * Do a SHA2 final, must be done separately since the digest - * type can be different than the input data type. - */ - switch (digest->cd_format) { - case CRYPTO_DATA_RAW: - SHA2Final((unsigned char *)digest->cd_raw.iov_base + - digest->cd_offset, &PROV_SHA2_CTX(ctx)->sc_sha2_ctx); - break; - case CRYPTO_DATA_UIO: - ret = sha2_digest_final_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, - digest, sha_digest_len, NULL); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - /* all done, free context and return */ - - if (ret == CRYPTO_SUCCESS) - digest->cd_length = sha_digest_len; - else - digest->cd_length = 0; - - kmem_free(ctx->cc_provider_private, sizeof (sha2_ctx_t)); - ctx->cc_provider_private = NULL; - return (ret); -} - -static int -sha2_digest_update(crypto_ctx_t *ctx, crypto_data_t *data) -{ - int ret = CRYPTO_SUCCESS; - - ASSERT(ctx->cc_provider_private != NULL); - - /* - * Do the SHA2 update on the specified input data. 
- */ - switch (data->cd_format) { - case CRYPTO_DATA_RAW: - SHA2Update(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, - (uint8_t *)data->cd_raw.iov_base + data->cd_offset, - data->cd_length); - break; - case CRYPTO_DATA_UIO: - ret = sha2_digest_update_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, - data); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - return (ret); -} - -static int -sha2_digest_final(crypto_ctx_t *ctx, crypto_data_t *digest) -{ - int ret = CRYPTO_SUCCESS; - uint_t sha_digest_len; - - ASSERT(ctx->cc_provider_private != NULL); - - switch (PROV_SHA2_CTX(ctx)->sc_mech_type) { - case SHA256_MECH_INFO_TYPE: - sha_digest_len = SHA256_DIGEST_LENGTH; - break; - case SHA384_MECH_INFO_TYPE: - sha_digest_len = SHA384_DIGEST_LENGTH; - break; - case SHA512_MECH_INFO_TYPE: - sha_digest_len = SHA512_DIGEST_LENGTH; - break; - default: - return (CRYPTO_MECHANISM_INVALID); - } - - /* - * We need to just return the length needed to store the output. - * We should not destroy the context for the following cases. - */ - if ((digest->cd_length == 0) || - (digest->cd_length < sha_digest_len)) { - digest->cd_length = sha_digest_len; - return (CRYPTO_BUFFER_TOO_SMALL); - } - - /* - * Do a SHA2 final. 
- */ - switch (digest->cd_format) { - case CRYPTO_DATA_RAW: - SHA2Final((unsigned char *)digest->cd_raw.iov_base + - digest->cd_offset, &PROV_SHA2_CTX(ctx)->sc_sha2_ctx); - break; - case CRYPTO_DATA_UIO: - ret = sha2_digest_final_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, - digest, sha_digest_len, NULL); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - /* all done, free context and return */ - - if (ret == CRYPTO_SUCCESS) - digest->cd_length = sha_digest_len; - else - digest->cd_length = 0; - - kmem_free(ctx->cc_provider_private, sizeof (sha2_ctx_t)); - ctx->cc_provider_private = NULL; - - return (ret); -} - -static int -sha2_digest_atomic(crypto_mechanism_t *mechanism, crypto_data_t *data, - crypto_data_t *digest) -{ - int ret = CRYPTO_SUCCESS; - SHA2_CTX sha2_ctx; - uint32_t sha_digest_len; - - /* - * Do the SHA inits. - */ - - SHA2Init(mechanism->cm_type, &sha2_ctx); - - switch (data->cd_format) { - case CRYPTO_DATA_RAW: - SHA2Update(&sha2_ctx, (uint8_t *)data-> - cd_raw.iov_base + data->cd_offset, data->cd_length); - break; - case CRYPTO_DATA_UIO: - ret = sha2_digest_update_uio(&sha2_ctx, data); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - /* - * Do the SHA updates on the specified input data. - */ - - if (ret != CRYPTO_SUCCESS) { - /* the update failed, bail */ - digest->cd_length = 0; - return (ret); - } - - if (mechanism->cm_type <= SHA256_HMAC_GEN_MECH_INFO_TYPE) - sha_digest_len = SHA256_DIGEST_LENGTH; - else - sha_digest_len = SHA512_DIGEST_LENGTH; - - /* - * Do a SHA2 final, must be done separately since the digest - * type can be different than the input data type. 
- */ - switch (digest->cd_format) { - case CRYPTO_DATA_RAW: - SHA2Final((unsigned char *)digest->cd_raw.iov_base + - digest->cd_offset, &sha2_ctx); - break; - case CRYPTO_DATA_UIO: - ret = sha2_digest_final_uio(&sha2_ctx, digest, - sha_digest_len, NULL); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - if (ret == CRYPTO_SUCCESS) - digest->cd_length = sha_digest_len; - else - digest->cd_length = 0; - - return (ret); -} - /* * KCF software provider mac entry points. * diff --git a/module/icp/io/skein_mod.c b/module/icp/io/skein_mod.c index 221e1debd4..3e969513be 100644 --- a/module/icp/io/skein_mod.c +++ b/module/icp/io/skein_mod.c @@ -31,34 +31,16 @@ #include static const crypto_mech_info_t skein_mech_info_tab[] = { - {CKM_SKEIN_256, SKEIN_256_MECH_INFO_TYPE, - CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC}, {CKM_SKEIN_256_MAC, SKEIN_256_MAC_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, - {CKM_SKEIN_512, SKEIN_512_MECH_INFO_TYPE, - CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC}, {CKM_SKEIN_512_MAC, SKEIN_512_MAC_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, - {CKM_SKEIN1024, SKEIN1024_MECH_INFO_TYPE, - CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC}, {CKM_SKEIN1024_MAC, SKEIN1024_MAC_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, }; -static int skein_digest_init(crypto_ctx_t *, crypto_mechanism_t *); -static int skein_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *); static int skein_update(crypto_ctx_t *, crypto_data_t *); static int skein_final(crypto_ctx_t *, crypto_data_t *); -static int skein_digest_atomic(crypto_mechanism_t *, crypto_data_t *, - crypto_data_t *); - -static const crypto_digest_ops_t skein_digest_ops = { - .digest_init = skein_digest_init, - .digest = skein_digest, - .digest_update = skein_update, - .digest_final = skein_final, - .digest_atomic = skein_digest_atomic -}; static int skein_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t); @@ -84,7 +66,6 @@ static const 
crypto_ctx_ops_t skein_ctx_ops = { }; static const crypto_ops_t skein_crypto_ops = { - &skein_digest_ops, NULL, &skein_mac_ops, &skein_ctx_ops, @@ -115,15 +96,12 @@ typedef struct skein_ctx { do { \ skein_ctx_t *sc = (_skein_ctx); \ switch (sc->sc_mech_type) { \ - case SKEIN_256_MECH_INFO_TYPE: \ case SKEIN_256_MAC_MECH_INFO_TYPE: \ (void) Skein_256_ ## _op(&sc->sc_256, __VA_ARGS__);\ break; \ - case SKEIN_512_MECH_INFO_TYPE: \ case SKEIN_512_MAC_MECH_INFO_TYPE: \ (void) Skein_512_ ## _op(&sc->sc_512, __VA_ARGS__);\ break; \ - case SKEIN1024_MECH_INFO_TYPE: \ case SKEIN1024_MAC_MECH_INFO_TYPE: \ (void) Skein1024_ ## _op(&sc->sc_1024, __VA_ARGS__);\ break; \ @@ -143,19 +121,7 @@ skein_get_digest_bitlen(const crypto_mechanism_t *mechanism, size_t *result) } *result = param->sp_digest_bitlen; } else { - switch (mechanism->cm_type) { - case SKEIN_256_MECH_INFO_TYPE: - *result = 256; - break; - case SKEIN_512_MECH_INFO_TYPE: - *result = 512; - break; - case SKEIN1024_MECH_INFO_TYPE: - *result = 1024; - break; - default: - return (CRYPTO_MECHANISM_INVALID); - } + return (CRYPTO_MECHANISM_INVALID); } return (CRYPTO_SUCCESS); } @@ -320,73 +286,6 @@ skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest) * KCF software provider digest entry points. */ -/* - * Initializes a skein digest context to the configuration in `mechanism'. - * The mechanism cm_type must be one of SKEIN_*_MECH_INFO_TYPE. The cm_param - * field may contain a skein_param_t structure indicating the length of the - * digest the algorithm should produce. Otherwise the default output lengths - * are applied (32 bytes for Skein-256, 64 bytes for Skein-512 and 128 bytes - * for Skein-1024). 
- */ -static int -skein_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism) -{ - int error = CRYPTO_SUCCESS; - - if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type)) - return (CRYPTO_MECHANISM_INVALID); - - SKEIN_CTX_LVALUE(ctx) = kmem_alloc(sizeof (*SKEIN_CTX(ctx)), KM_SLEEP); - if (SKEIN_CTX(ctx) == NULL) - return (CRYPTO_HOST_MEMORY); - - SKEIN_CTX(ctx)->sc_mech_type = mechanism->cm_type; - error = skein_get_digest_bitlen(mechanism, - &SKEIN_CTX(ctx)->sc_digest_bitlen); - if (error != CRYPTO_SUCCESS) - goto errout; - SKEIN_OP(SKEIN_CTX(ctx), Init, SKEIN_CTX(ctx)->sc_digest_bitlen); - - return (CRYPTO_SUCCESS); -errout: - memset(SKEIN_CTX(ctx), 0, sizeof (*SKEIN_CTX(ctx))); - kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); - SKEIN_CTX_LVALUE(ctx) = NULL; - return (error); -} - -/* - * Executes a skein_update and skein_digest on a pre-initialized crypto - * context in a single step. See the documentation to these functions to - * see what to pass here. - */ -static int -skein_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest) -{ - int error = CRYPTO_SUCCESS; - - ASSERT(SKEIN_CTX(ctx) != NULL); - - if (digest->cd_length < - CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen)) { - digest->cd_length = - CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen); - return (CRYPTO_BUFFER_TOO_SMALL); - } - - error = skein_update(ctx, data); - if (error != CRYPTO_SUCCESS) { - memset(SKEIN_CTX(ctx), 0, sizeof (*SKEIN_CTX(ctx))); - kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); - SKEIN_CTX_LVALUE(ctx) = NULL; - digest->cd_length = 0; - return (error); - } - error = skein_final(ctx, digest); - - return (error); -} - /* * Performs a skein Update with the input message in `data' (successive calls * can push more data). This is used both for digest and MAC operation. 
@@ -470,46 +369,6 @@ skein_final(crypto_ctx_t *ctx, crypto_data_t *digest) return (error); } -/* - * Performs a full skein digest computation in a single call, configuring the - * algorithm according to `mechanism', reading the input to be digested from - * `data' and writing the output to `digest'. - * Supported input/output formats are raw, uio and mblk. - */ -static int -skein_digest_atomic(crypto_mechanism_t *mechanism, crypto_data_t *data, - crypto_data_t *digest) -{ - int error; - skein_ctx_t skein_ctx; - crypto_ctx_t ctx; - SKEIN_CTX_LVALUE(&ctx) = &skein_ctx; - - /* Init */ - if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type)) - return (CRYPTO_MECHANISM_INVALID); - skein_ctx.sc_mech_type = mechanism->cm_type; - error = skein_get_digest_bitlen(mechanism, &skein_ctx.sc_digest_bitlen); - if (error != CRYPTO_SUCCESS) - goto out; - SKEIN_OP(&skein_ctx, Init, skein_ctx.sc_digest_bitlen); - - if ((error = skein_update(&ctx, data)) != CRYPTO_SUCCESS) - goto out; - if ((error = skein_final_nofree(&ctx, data)) != CRYPTO_SUCCESS) - goto out; - -out: - if (error == CRYPTO_SUCCESS) - digest->cd_length = - CRYPTO_BITS2BYTES(skein_ctx.sc_digest_bitlen); - else - digest->cd_length = 0; - memset(&skein_ctx, 0, sizeof (skein_ctx)); - - return (error); -} - /* * Helper function that builds a Skein MAC context from the provided * mechanism and key. From 10de12e9ed2fee85adc2b9b4efac64f8655062bc Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 19 May 2024 13:18:42 +1000 Subject: [PATCH 103/113] icp: reorganise SHA2 digest mechanisms sha2_mech_type_t serves double-duty, as the list of MAC providers and also the algo type for direct callers to SHA2Init. Until we disentangle that, reorganise it to make the separation more clear. While we're there, remove the digest mechs we don't use. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16209 --- include/sys/crypto/common.h | 5 -- include/sys/sha2.h | 22 ++------ module/icp/algs/sha2/sha2_generic.c | 44 ++++----------- module/icp/io/sha2_mod.c | 6 -- tests/zfs-tests/cmd/checksum/sha2_test.c | 70 ++++-------------------- 5 files changed, 26 insertions(+), 121 deletions(-) diff --git a/include/sys/crypto/common.h b/include/sys/crypto/common.h index 7438056500..a73804f916 100644 --- a/include/sys/crypto/common.h +++ b/include/sys/crypto/common.h @@ -79,17 +79,12 @@ typedef uint32_t crypto_keysize_unit_t; /* Mechanisms supported out-of-the-box */ -#define SUN_CKM_SHA256 "CKM_SHA256" #define SUN_CKM_SHA256_HMAC "CKM_SHA256_HMAC" #define SUN_CKM_SHA256_HMAC_GENERAL "CKM_SHA256_HMAC_GENERAL" -#define SUN_CKM_SHA384 "CKM_SHA384" #define SUN_CKM_SHA384_HMAC "CKM_SHA384_HMAC" #define SUN_CKM_SHA384_HMAC_GENERAL "CKM_SHA384_HMAC_GENERAL" -#define SUN_CKM_SHA512 "CKM_SHA512" #define SUN_CKM_SHA512_HMAC "CKM_SHA512_HMAC" #define SUN_CKM_SHA512_HMAC_GENERAL "CKM_SHA512_HMAC_GENERAL" -#define SUN_CKM_SHA512_224 "CKM_SHA512_224" -#define SUN_CKM_SHA512_256 "CKM_SHA512_256" #define SUN_CKM_AES_CCM "CKM_AES_CCM" #define SUN_CKM_AES_GCM "CKM_AES_GCM" diff --git a/include/sys/sha2.h b/include/sys/sha2.h index 81dfbbb8ce..2d38885bd9 100644 --- a/include/sys/sha2.h +++ b/include/sys/sha2.h @@ -86,30 +86,18 @@ typedef struct { /* SHA2 algorithm types */ typedef enum sha2_mech_type { - SHA256_MECH_INFO_TYPE, /* SUN_CKM_SHA256 */ SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */ SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */ - SHA384_MECH_INFO_TYPE, /* SUN_CKM_SHA384 */ SHA384_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC */ SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */ - SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */ SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */ SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */ - SHA512_224_MECH_INFO_TYPE, /* 
SUN_CKM_SHA512_224 */ - SHA512_256_MECH_INFO_TYPE /* SUN_CKM_SHA512_256 */ -} sha2_mech_type_t; -#define SHA256 0 -#define SHA256_HMAC 1 -#define SHA256_HMAC_GEN 2 -#define SHA384 3 -#define SHA384_HMAC 4 -#define SHA384_HMAC_GEN 5 -#define SHA512 6 -#define SHA512_HMAC 7 -#define SHA512_HMAC_GEN 8 -#define SHA512_224 9 -#define SHA512_256 10 + /* Not true KCF mech types; used by direct callers to SHA2Init */ + SHA256, + SHA512, + SHA512_256, +} sha2_mech_type_t; /* SHA2 Init function */ extern void SHA2Init(int algotype, SHA2_CTX *ctx); diff --git a/module/icp/algs/sha2/sha2_generic.c b/module/icp/algs/sha2/sha2_generic.c index 60d7ad9a1d..ab361b9d59 100644 --- a/module/icp/algs/sha2/sha2_generic.c +++ b/module/icp/algs/sha2/sha2_generic.c @@ -400,13 +400,13 @@ SHA2Init(int algotype, SHA2_CTX *ctx) sha256_ctx *ctx256 = &ctx->sha256; sha512_ctx *ctx512 = &ctx->sha512; - ASSERT3S(algotype, >=, SHA256_MECH_INFO_TYPE); - ASSERT3S(algotype, <=, SHA512_256_MECH_INFO_TYPE); + ASSERT3S(algotype, >=, SHA256_HMAC_MECH_INFO_TYPE); + ASSERT3S(algotype, <=, SHA512_256); memset(ctx, 0, sizeof (*ctx)); ctx->algotype = algotype; switch (ctx->algotype) { - case SHA256_MECH_INFO_TYPE: + case SHA256: case SHA256_HMAC_MECH_INFO_TYPE: case SHA256_HMAC_GEN_MECH_INFO_TYPE: ctx256->state[0] = 0x6a09e667; @@ -420,7 +420,6 @@ SHA2Init(int algotype, SHA2_CTX *ctx) ctx256->count[0] = 0; ctx256->ops = sha256_get_ops(); break; - case SHA384_MECH_INFO_TYPE: case SHA384_HMAC_MECH_INFO_TYPE: case SHA384_HMAC_GEN_MECH_INFO_TYPE: ctx512->state[0] = 0xcbbb9d5dc1059ed8ULL; @@ -435,7 +434,7 @@ SHA2Init(int algotype, SHA2_CTX *ctx) ctx512->count[1] = 0; ctx512->ops = sha512_get_ops(); break; - case SHA512_MECH_INFO_TYPE: + case SHA512: case SHA512_HMAC_MECH_INFO_TYPE: case SHA512_HMAC_GEN_MECH_INFO_TYPE: ctx512->state[0] = 0x6a09e667f3bcc908ULL; @@ -450,20 +449,7 @@ SHA2Init(int algotype, SHA2_CTX *ctx) ctx512->count[1] = 0; ctx512->ops = sha512_get_ops(); break; - case SHA512_224_MECH_INFO_TYPE: - 
ctx512->state[0] = 0x8c3d37c819544da2ULL; - ctx512->state[1] = 0x73e1996689dcd4d6ULL; - ctx512->state[2] = 0x1dfab7ae32ff9c82ULL; - ctx512->state[3] = 0x679dd514582f9fcfULL; - ctx512->state[4] = 0x0f6d2b697bd44da8ULL; - ctx512->state[5] = 0x77e36f7304c48942ULL; - ctx512->state[6] = 0x3f9d85a86a1d36c8ULL; - ctx512->state[7] = 0x1112e6ad91d692a1ULL; - ctx512->count[0] = 0; - ctx512->count[1] = 0; - ctx512->ops = sha512_get_ops(); - break; - case SHA512_256_MECH_INFO_TYPE: + case SHA512_256: ctx512->state[0] = 0x22312194fc2bf72cULL; ctx512->state[1] = 0x9f555fa3c84c64c2ULL; ctx512->state[2] = 0x2393b86b6f53b151ULL; @@ -490,25 +476,21 @@ SHA2Update(SHA2_CTX *ctx, const void *data, size_t len) ASSERT3P(data, !=, NULL); switch (ctx->algotype) { - case SHA256_MECH_INFO_TYPE: + case SHA256: case SHA256_HMAC_MECH_INFO_TYPE: case SHA256_HMAC_GEN_MECH_INFO_TYPE: sha256_update(&ctx->sha256, data, len); break; - case SHA384_MECH_INFO_TYPE: case SHA384_HMAC_MECH_INFO_TYPE: case SHA384_HMAC_GEN_MECH_INFO_TYPE: sha512_update(&ctx->sha512, data, len); break; - case SHA512_MECH_INFO_TYPE: + case SHA512: case SHA512_HMAC_MECH_INFO_TYPE: case SHA512_HMAC_GEN_MECH_INFO_TYPE: sha512_update(&ctx->sha512, data, len); break; - case SHA512_224_MECH_INFO_TYPE: - sha512_update(&ctx->sha512, data, len); - break; - case SHA512_256_MECH_INFO_TYPE: + case SHA512_256: sha512_update(&ctx->sha512, data, len); break; } @@ -519,25 +501,21 @@ void SHA2Final(void *digest, SHA2_CTX *ctx) { switch (ctx->algotype) { - case SHA256_MECH_INFO_TYPE: + case SHA256: case SHA256_HMAC_MECH_INFO_TYPE: case SHA256_HMAC_GEN_MECH_INFO_TYPE: sha256_final(&ctx->sha256, digest, 256); break; - case SHA384_MECH_INFO_TYPE: case SHA384_HMAC_MECH_INFO_TYPE: case SHA384_HMAC_GEN_MECH_INFO_TYPE: sha512_final(&ctx->sha512, digest, 384); break; - case SHA512_MECH_INFO_TYPE: + case SHA512: case SHA512_HMAC_MECH_INFO_TYPE: case SHA512_HMAC_GEN_MECH_INFO_TYPE: sha512_final(&ctx->sha512, digest, 512); break; - case 
SHA512_224_MECH_INFO_TYPE: - sha512_final(&ctx->sha512, digest, 224); - break; - case SHA512_256_MECH_INFO_TYPE: + case SHA512_256: sha512_final(&ctx->sha512, digest, 256); break; } diff --git a/module/icp/io/sha2_mod.c b/module/icp/io/sha2_mod.c index c8e3b4fccd..d80ea1e677 100644 --- a/module/icp/io/sha2_mod.c +++ b/module/icp/io/sha2_mod.c @@ -60,24 +60,18 @@ * Mechanism info structure passed to KCF during registration. */ static const crypto_mech_info_t sha2_mech_info_tab[] = { - /* SHA256 */ - {SUN_CKM_SHA256, SHA256_MECH_INFO_TYPE, 0}, /* SHA256-HMAC */ {SUN_CKM_SHA256_HMAC, SHA256_HMAC_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, /* SHA256-HMAC GENERAL */ {SUN_CKM_SHA256_HMAC_GENERAL, SHA256_HMAC_GEN_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, - /* SHA384 */ - {SUN_CKM_SHA384, SHA384_MECH_INFO_TYPE, 0}, /* SHA384-HMAC */ {SUN_CKM_SHA384_HMAC, SHA384_HMAC_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, /* SHA384-HMAC GENERAL */ {SUN_CKM_SHA384_HMAC_GENERAL, SHA384_HMAC_GEN_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, - /* SHA512 */ - {SUN_CKM_SHA512, SHA512_MECH_INFO_TYPE, 0}, /* SHA512-HMAC */ {SUN_CKM_SHA512_HMAC, SHA512_HMAC_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, diff --git a/tests/zfs-tests/cmd/checksum/sha2_test.c b/tests/zfs-tests/cmd/checksum/sha2_test.c index efcf812d77..d36b670db8 100644 --- a/tests/zfs-tests/cmd/checksum/sha2_test.c +++ b/tests/zfs-tests/cmd/checksum/sha2_test.c @@ -72,31 +72,6 @@ static const uint8_t sha256_test_digests[][32] = { /* no test vector for test_msg2 */ }; -static const uint8_t sha384_test_digests[][48] = { - { - /* for test_msg0 */ - 0xCB, 0x00, 0x75, 0x3F, 0x45, 0xA3, 0x5E, 0x8B, - 0xB5, 0xA0, 0x3D, 0x69, 0x9A, 0xC6, 0x50, 0x07, - 0x27, 0x2C, 0x32, 0xAB, 0x0E, 0xDE, 0xD1, 0x63, - 0x1A, 0x8B, 0x60, 0x5A, 0x43, 0xFF, 0x5B, 0xED, - 0x80, 0x86, 0x07, 0x2B, 0xA1, 0xE7, 0xCC, 0x23, - 0x58, 0xBA, 0xEC, 0xA1, 0x34, 0xC8, 0x25, 0xA7 - }, - { - /* no test vector for test_msg1 
*/ - 0 - }, - { - /* for test_msg2 */ - 0x09, 0x33, 0x0C, 0x33, 0xF7, 0x11, 0x47, 0xE8, - 0x3D, 0x19, 0x2F, 0xC7, 0x82, 0xCD, 0x1B, 0x47, - 0x53, 0x11, 0x1B, 0x17, 0x3B, 0x3B, 0x05, 0xD2, - 0x2F, 0xA0, 0x80, 0x86, 0xE3, 0xB0, 0xF7, 0x12, - 0xFC, 0xC7, 0xC7, 0x1A, 0x55, 0x7E, 0x2D, 0xB9, - 0x66, 0xC3, 0xE9, 0xFA, 0x91, 0x74, 0x60, 0x39 - } -}; - static const uint8_t sha512_test_digests[][64] = { { /* for test_msg0 */ @@ -126,27 +101,6 @@ static const uint8_t sha512_test_digests[][64] = { } }; -static const uint8_t sha512_224_test_digests[][28] = { - { - /* for test_msg0 */ - 0x46, 0x34, 0x27, 0x0F, 0x70, 0x7B, 0x6A, 0x54, - 0xDA, 0xAE, 0x75, 0x30, 0x46, 0x08, 0x42, 0xE2, - 0x0E, 0x37, 0xED, 0x26, 0x5C, 0xEE, 0xE9, 0xA4, - 0x3E, 0x89, 0x24, 0xAA - }, - { - /* no test vector for test_msg1 */ - 0 - }, - { - /* for test_msg2 */ - 0x23, 0xFE, 0xC5, 0xBB, 0x94, 0xD6, 0x0B, 0x23, - 0x30, 0x81, 0x92, 0x64, 0x0B, 0x0C, 0x45, 0x33, - 0x35, 0xD6, 0x64, 0x73, 0x4F, 0xE4, 0x0E, 0x72, - 0x68, 0x67, 0x4A, 0xF9 - } -}; - static const uint8_t sha512_256_test_digests[][32] = { { /* for test_msg0 */ @@ -191,7 +145,7 @@ main(int argc, char *argv[]) do { \ SHA2_CTX ctx; \ uint8_t digest[diglen / 8]; \ - SHA2Init(SHA ## mode ## _MECH_INFO_TYPE, &ctx); \ + SHA2Init(mode, &ctx); \ SHA2Update(&ctx, _m, strlen(_m)); \ SHA2Final(digest, &ctx); \ (void) printf("SHA%-9sMessage: " #_m \ @@ -215,7 +169,7 @@ main(int argc, char *argv[]) struct timeval start, end; \ memset(block, 0, sizeof (block)); \ (void) gettimeofday(&start, NULL); \ - SHA2Init(SHA ## mode ## _MECH_INFO_TYPE, &ctx); \ + SHA2Init(mode, &ctx); \ for (i = 0; i < 8192; i++) \ SHA2Update(&ctx, block, sizeof (block)); \ SHA2Final(digest, &ctx); \ @@ -231,16 +185,12 @@ main(int argc, char *argv[]) } while (0) (void) printf("Running algorithm correctness tests:\n"); - SHA2_ALGO_TEST(test_msg0, 256, 256, sha256_test_digests[0]); - SHA2_ALGO_TEST(test_msg1, 256, 256, sha256_test_digests[1]); - SHA2_ALGO_TEST(test_msg0, 384, 384, 
sha384_test_digests[0]); - SHA2_ALGO_TEST(test_msg2, 384, 384, sha384_test_digests[2]); - SHA2_ALGO_TEST(test_msg0, 512, 512, sha512_test_digests[0]); - SHA2_ALGO_TEST(test_msg2, 512, 512, sha512_test_digests[2]); - SHA2_ALGO_TEST(test_msg0, 512_224, 224, sha512_224_test_digests[0]); - SHA2_ALGO_TEST(test_msg2, 512_224, 224, sha512_224_test_digests[2]); - SHA2_ALGO_TEST(test_msg0, 512_256, 256, sha512_256_test_digests[0]); - SHA2_ALGO_TEST(test_msg2, 512_256, 256, sha512_256_test_digests[2]); + SHA2_ALGO_TEST(test_msg0, SHA256, 256, sha256_test_digests[0]); + SHA2_ALGO_TEST(test_msg1, SHA256, 256, sha256_test_digests[1]); + SHA2_ALGO_TEST(test_msg0, SHA512, 512, sha512_test_digests[0]); + SHA2_ALGO_TEST(test_msg2, SHA512, 512, sha512_test_digests[2]); + SHA2_ALGO_TEST(test_msg0, SHA512_256, 256, sha512_256_test_digests[0]); + SHA2_ALGO_TEST(test_msg2, SHA512_256, 256, sha512_256_test_digests[2]); if (failed) return (1); @@ -251,13 +201,13 @@ main(int argc, char *argv[]) for (id = 0; id < sha256->getcnt(); id++) { sha256->setid(id); const char *name = sha256->getname(); - SHA2_PERF_TEST(256, 256, name); + SHA2_PERF_TEST(SHA256, 256, name); } for (id = 0; id < sha512->getcnt(); id++) { sha512->setid(id); const char *name = sha512->getname(); - SHA2_PERF_TEST(512, 512, name); + SHA2_PERF_TEST(SHA512, 512, name); } return (0); From f39241aeb333027a4dfb3f716e80d475042a348d Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 19 May 2024 15:00:44 +1000 Subject: [PATCH 104/113] icp: remove unused SHA2 HMAC mechanisms Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16209 --- include/sys/crypto/common.h | 5 - include/sys/sha2.h | 5 - module/icp/algs/sha2/sha2_generic.c | 33 +----- module/icp/io/sha2_mod.c | 158 +++------------------------- 4 files changed, 13 insertions(+), 188 deletions(-) diff --git a/include/sys/crypto/common.h b/include/sys/crypto/common.h index a73804f916..c9ef3b367e 100644 --- a/include/sys/crypto/common.h +++ b/include/sys/crypto/common.h @@ -79,12 +79,7 @@ typedef uint32_t crypto_keysize_unit_t; /* Mechanisms supported out-of-the-box */ -#define SUN_CKM_SHA256_HMAC "CKM_SHA256_HMAC" -#define SUN_CKM_SHA256_HMAC_GENERAL "CKM_SHA256_HMAC_GENERAL" -#define SUN_CKM_SHA384_HMAC "CKM_SHA384_HMAC" -#define SUN_CKM_SHA384_HMAC_GENERAL "CKM_SHA384_HMAC_GENERAL" #define SUN_CKM_SHA512_HMAC "CKM_SHA512_HMAC" -#define SUN_CKM_SHA512_HMAC_GENERAL "CKM_SHA512_HMAC_GENERAL" #define SUN_CKM_AES_CCM "CKM_AES_CCM" #define SUN_CKM_AES_GCM "CKM_AES_GCM" diff --git a/include/sys/sha2.h b/include/sys/sha2.h index 2d38885bd9..b344eb9d5f 100644 --- a/include/sys/sha2.h +++ b/include/sys/sha2.h @@ -86,12 +86,7 @@ typedef struct { /* SHA2 algorithm types */ typedef enum sha2_mech_type { - SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */ - SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */ - SHA384_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC */ - SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */ SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */ - SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */ /* Not true KCF mech types; used by direct callers to SHA2Init */ SHA256, diff --git a/module/icp/algs/sha2/sha2_generic.c b/module/icp/algs/sha2/sha2_generic.c index ab361b9d59..d53f4b6999 100644 --- a/module/icp/algs/sha2/sha2_generic.c +++ b/module/icp/algs/sha2/sha2_generic.c @@ -400,15 +400,13 @@ SHA2Init(int algotype, SHA2_CTX *ctx) sha256_ctx *ctx256 = &ctx->sha256; sha512_ctx *ctx512 = &ctx->sha512; - 
ASSERT3S(algotype, >=, SHA256_HMAC_MECH_INFO_TYPE); + ASSERT3S(algotype, >=, SHA512_HMAC_MECH_INFO_TYPE); ASSERT3S(algotype, <=, SHA512_256); memset(ctx, 0, sizeof (*ctx)); ctx->algotype = algotype; switch (ctx->algotype) { case SHA256: - case SHA256_HMAC_MECH_INFO_TYPE: - case SHA256_HMAC_GEN_MECH_INFO_TYPE: ctx256->state[0] = 0x6a09e667; ctx256->state[1] = 0xbb67ae85; ctx256->state[2] = 0x3c6ef372; @@ -420,23 +418,8 @@ SHA2Init(int algotype, SHA2_CTX *ctx) ctx256->count[0] = 0; ctx256->ops = sha256_get_ops(); break; - case SHA384_HMAC_MECH_INFO_TYPE: - case SHA384_HMAC_GEN_MECH_INFO_TYPE: - ctx512->state[0] = 0xcbbb9d5dc1059ed8ULL; - ctx512->state[1] = 0x629a292a367cd507ULL; - ctx512->state[2] = 0x9159015a3070dd17ULL; - ctx512->state[3] = 0x152fecd8f70e5939ULL; - ctx512->state[4] = 0x67332667ffc00b31ULL; - ctx512->state[5] = 0x8eb44a8768581511ULL; - ctx512->state[6] = 0xdb0c2e0d64f98fa7ULL; - ctx512->state[7] = 0x47b5481dbefa4fa4ULL; - ctx512->count[0] = 0; - ctx512->count[1] = 0; - ctx512->ops = sha512_get_ops(); - break; case SHA512: case SHA512_HMAC_MECH_INFO_TYPE: - case SHA512_HMAC_GEN_MECH_INFO_TYPE: ctx512->state[0] = 0x6a09e667f3bcc908ULL; ctx512->state[1] = 0xbb67ae8584caa73bULL; ctx512->state[2] = 0x3c6ef372fe94f82bULL; @@ -477,17 +460,10 @@ SHA2Update(SHA2_CTX *ctx, const void *data, size_t len) switch (ctx->algotype) { case SHA256: - case SHA256_HMAC_MECH_INFO_TYPE: - case SHA256_HMAC_GEN_MECH_INFO_TYPE: sha256_update(&ctx->sha256, data, len); break; - case SHA384_HMAC_MECH_INFO_TYPE: - case SHA384_HMAC_GEN_MECH_INFO_TYPE: - sha512_update(&ctx->sha512, data, len); - break; case SHA512: case SHA512_HMAC_MECH_INFO_TYPE: - case SHA512_HMAC_GEN_MECH_INFO_TYPE: sha512_update(&ctx->sha512, data, len); break; case SHA512_256: @@ -502,17 +478,10 @@ SHA2Final(void *digest, SHA2_CTX *ctx) { switch (ctx->algotype) { case SHA256: - case SHA256_HMAC_MECH_INFO_TYPE: - case SHA256_HMAC_GEN_MECH_INFO_TYPE: sha256_final(&ctx->sha256, digest, 256); break; - case 
SHA384_HMAC_MECH_INFO_TYPE: - case SHA384_HMAC_GEN_MECH_INFO_TYPE: - sha512_final(&ctx->sha512, digest, 384); - break; case SHA512: case SHA512_HMAC_MECH_INFO_TYPE: - case SHA512_HMAC_GEN_MECH_INFO_TYPE: sha512_final(&ctx->sha512, digest, 512); break; case SHA512_256: diff --git a/module/icp/io/sha2_mod.c b/module/icp/io/sha2_mod.c index d80ea1e677..e515dabc9d 100644 --- a/module/icp/io/sha2_mod.c +++ b/module/icp/io/sha2_mod.c @@ -60,24 +60,9 @@ * Mechanism info structure passed to KCF during registration. */ static const crypto_mech_info_t sha2_mech_info_tab[] = { - /* SHA256-HMAC */ - {SUN_CKM_SHA256_HMAC, SHA256_HMAC_MECH_INFO_TYPE, - CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, - /* SHA256-HMAC GENERAL */ - {SUN_CKM_SHA256_HMAC_GENERAL, SHA256_HMAC_GEN_MECH_INFO_TYPE, - CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, - /* SHA384-HMAC */ - {SUN_CKM_SHA384_HMAC, SHA384_HMAC_MECH_INFO_TYPE, - CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, - /* SHA384-HMAC GENERAL */ - {SUN_CKM_SHA384_HMAC_GENERAL, SHA384_HMAC_GEN_MECH_INFO_TYPE, - CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, /* SHA512-HMAC */ {SUN_CKM_SHA512_HMAC, SHA512_HMAC_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, - /* SHA512-HMAC GENERAL */ - {SUN_CKM_SHA512_HMAC_GENERAL, SHA512_HMAC_GEN_MECH_INFO_TYPE, - CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, }; static int sha2_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *, @@ -251,10 +236,8 @@ sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest, * The computed SHA2 digest will fit in the current * iovec. */ - if (((sha2_ctx->algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) && - (digest_len != SHA256_DIGEST_LENGTH)) || - ((sha2_ctx->algotype > SHA256_HMAC_GEN_MECH_INFO_TYPE) && - (digest_len != SHA512_DIGEST_LENGTH))) { + ASSERT3U(sha2_ctx->algotype, ==, SHA512_HMAC_MECH_INFO_TYPE); + if (digest_len != SHA512_DIGEST_LENGTH) { /* * The caller requested a short digest. 
Digest * into a scratch buffer and return to @@ -349,13 +332,9 @@ sha2_mac_init_ctx(sha2_hmac_ctx_t *ctx, void *keyval, uint_t length_in_bytes) int i, block_size, blocks_per_int64; /* Determine the block size */ - if (ctx->hc_mech_type <= SHA256_HMAC_GEN_MECH_INFO_TYPE) { - block_size = SHA256_HMAC_BLOCK_SIZE; - blocks_per_int64 = SHA256_HMAC_BLOCK_SIZE / sizeof (uint64_t); - } else { - block_size = SHA512_HMAC_BLOCK_SIZE; - blocks_per_int64 = SHA512_HMAC_BLOCK_SIZE / sizeof (uint64_t); - } + ASSERT3U(ctx->hc_mech_type, ==, SHA512_HMAC_MECH_INFO_TYPE); + block_size = SHA512_HMAC_BLOCK_SIZE; + blocks_per_int64 = SHA512_HMAC_BLOCK_SIZE / sizeof (uint64_t); (void) memset(ipad, 0, block_size); (void) memset(opad, 0, block_size); @@ -397,15 +376,7 @@ sha2_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, * mechanism */ switch (mechanism->cm_type) { - case SHA256_HMAC_MECH_INFO_TYPE: - case SHA256_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = SHA256_DIGEST_LENGTH; - sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE; - break; - case SHA384_HMAC_MECH_INFO_TYPE: - case SHA384_HMAC_GEN_MECH_INFO_TYPE: case SHA512_HMAC_MECH_INFO_TYPE: - case SHA512_HMAC_GEN_MECH_INFO_TYPE: sha_digest_len = SHA512_DIGEST_LENGTH; sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE; break; @@ -445,22 +416,6 @@ sha2_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, } } - /* - * Get the mechanism parameters, if applicable. 
- */ - if (mechanism->cm_type % 3 == 2) { - if (mechanism->cm_param == NULL || - mechanism->cm_param_len != sizeof (ulong_t)) { - ret = CRYPTO_MECHANISM_PARAM_INVALID; - } else { - PROV_SHA2_GET_DIGEST_LEN(mechanism, - PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len); - if (PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len > - sha_digest_len) - ret = CRYPTO_MECHANISM_PARAM_INVALID; - } - } - if (ret != CRYPTO_SUCCESS) { memset(ctx->cc_provider_private, 0, sizeof (sha2_hmac_ctx_t)); kmem_free(ctx->cc_provider_private, sizeof (sha2_hmac_ctx_t)); @@ -509,24 +464,9 @@ sha2_mac_final(crypto_ctx_t *ctx, crypto_data_t *mac) /* Set the digest lengths to values appropriate to the mechanism */ switch (PROV_SHA2_HMAC_CTX(ctx)->hc_mech_type) { - case SHA256_HMAC_MECH_INFO_TYPE: - sha_digest_len = digest_len = SHA256_DIGEST_LENGTH; - break; - case SHA384_HMAC_MECH_INFO_TYPE: - sha_digest_len = digest_len = SHA384_DIGEST_LENGTH; - break; case SHA512_HMAC_MECH_INFO_TYPE: sha_digest_len = digest_len = SHA512_DIGEST_LENGTH; break; - case SHA256_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = SHA256_DIGEST_LENGTH; - digest_len = PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len; - break; - case SHA384_HMAC_GEN_MECH_INFO_TYPE: - case SHA512_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = SHA512_DIGEST_LENGTH; - digest_len = PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len; - break; default: return (CRYPTO_ARGUMENTS_BAD); } @@ -626,15 +566,7 @@ sha2_mac_atomic(crypto_mechanism_t *mechanism, * mechanism */ switch (mechanism->cm_type) { - case SHA256_HMAC_MECH_INFO_TYPE: - case SHA256_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = digest_len = SHA256_DIGEST_LENGTH; - sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE; - break; - case SHA384_HMAC_MECH_INFO_TYPE: - case SHA384_HMAC_GEN_MECH_INFO_TYPE: case SHA512_HMAC_MECH_INFO_TYPE: - case SHA512_HMAC_GEN_MECH_INFO_TYPE: sha_digest_len = digest_len = SHA512_DIGEST_LENGTH; sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE; break; @@ -665,20 +597,6 @@ sha2_mac_atomic(crypto_mechanism_t *mechanism, } } 
- /* get the mechanism parameters, if applicable */ - if ((mechanism->cm_type % 3) == 2) { - if (mechanism->cm_param == NULL || - mechanism->cm_param_len != sizeof (ulong_t)) { - ret = CRYPTO_MECHANISM_PARAM_INVALID; - goto bail; - } - PROV_SHA2_GET_DIGEST_LEN(mechanism, digest_len); - if (digest_len > sha_digest_len) { - ret = CRYPTO_MECHANISM_PARAM_INVALID; - goto bail; - } - } - /* do a SHA2 update of the inner context using the specified data */ SHA2_MAC_UPDATE(data, sha2_hmac_ctx, ret); if (ret != CRYPTO_SUCCESS) @@ -693,16 +611,9 @@ sha2_mac_atomic(crypto_mechanism_t *mechanism, /* * Do an SHA2 update on the outer context, feeding the inner * digest as data. - * - * HMAC-SHA384 needs special handling as the outer hash needs only 48 - * bytes of the inner hash value. */ - if (mechanism->cm_type == SHA384_HMAC_MECH_INFO_TYPE || - mechanism->cm_type == SHA384_HMAC_GEN_MECH_INFO_TYPE) - SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, - SHA384_DIGEST_LENGTH); - else - SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, sha_digest_len); + ASSERT3U(mechanism->cm_type, ==, SHA512_HMAC_MECH_INFO_TYPE); + SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, sha_digest_len); /* * Do a SHA2 final on the outer context, storing the computed @@ -758,15 +669,7 @@ sha2_mac_verify_atomic(crypto_mechanism_t *mechanism, * mechanism */ switch (mechanism->cm_type) { - case SHA256_HMAC_MECH_INFO_TYPE: - case SHA256_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = digest_len = SHA256_DIGEST_LENGTH; - sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE; - break; - case SHA384_HMAC_MECH_INFO_TYPE: - case SHA384_HMAC_GEN_MECH_INFO_TYPE: case SHA512_HMAC_MECH_INFO_TYPE: - case SHA512_HMAC_GEN_MECH_INFO_TYPE: sha_digest_len = digest_len = SHA512_DIGEST_LENGTH; sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE; break; @@ -797,20 +700,6 @@ sha2_mac_verify_atomic(crypto_mechanism_t *mechanism, } } - /* get the mechanism parameters, if applicable */ - if (mechanism->cm_type % 3 == 2) { - if (mechanism->cm_param == NULL 
|| - mechanism->cm_param_len != sizeof (ulong_t)) { - ret = CRYPTO_MECHANISM_PARAM_INVALID; - goto bail; - } - PROV_SHA2_GET_DIGEST_LEN(mechanism, digest_len); - if (digest_len > sha_digest_len) { - ret = CRYPTO_MECHANISM_PARAM_INVALID; - goto bail; - } - } - if (mac->cd_length != digest_len) { ret = CRYPTO_INVALID_MAC; goto bail; @@ -828,16 +717,9 @@ sha2_mac_verify_atomic(crypto_mechanism_t *mechanism, /* * Do an SHA2 update on the outer context, feeding the inner * digest as data. - * - * HMAC-SHA384 needs special handling as the outer hash needs only 48 - * bytes of the inner hash value. */ - if (mechanism->cm_type == SHA384_HMAC_MECH_INFO_TYPE || - mechanism->cm_type == SHA384_HMAC_GEN_MECH_INFO_TYPE) - SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, - SHA384_DIGEST_LENGTH); - else - SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, sha_digest_len); + ASSERT3U(mechanism->cm_type, ==, SHA512_HMAC_MECH_INFO_TYPE); + SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, sha_digest_len); /* * Do a SHA2 final on the outer context, storing the computed @@ -929,15 +811,7 @@ sha2_create_ctx_template(crypto_mechanism_t *mechanism, crypto_key_t *key, * mechanism */ switch (mechanism->cm_type) { - case SHA256_HMAC_MECH_INFO_TYPE: - case SHA256_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = SHA256_DIGEST_LENGTH; - sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE; - break; - case SHA384_HMAC_MECH_INFO_TYPE: - case SHA384_HMAC_GEN_MECH_INFO_TYPE: case SHA512_HMAC_MECH_INFO_TYPE: - case SHA512_HMAC_GEN_MECH_INFO_TYPE: sha_digest_len = SHA512_DIGEST_LENGTH; sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE; break; @@ -986,17 +860,9 @@ sha2_free_context(crypto_ctx_t *ctx) if (ctx->cc_provider_private == NULL) return (CRYPTO_SUCCESS); - /* - * We have to free either SHA2 or SHA2-HMAC contexts, which - * have different lengths. - * - * Note: Below is dependent on the mechanism ordering. 
- */ - - if (PROV_SHA2_CTX(ctx)->sc_mech_type % 3 == 0) - ctx_len = sizeof (sha2_ctx_t); - else - ctx_len = sizeof (sha2_hmac_ctx_t); + ASSERT3U(PROV_SHA2_CTX(ctx)->sc_mech_type, ==, + SHA512_HMAC_MECH_INFO_TYPE); + ctx_len = sizeof (sha2_hmac_ctx_t); memset(ctx->cc_provider_private, 0, ctx_len); kmem_free(ctx->cc_provider_private, ctx_len); From ae512620d0c372d83180204f3149d9a5df814931 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 19 May 2024 15:00:00 +1000 Subject: [PATCH 105/113] icp: remove skein module Nothing calls it through the KCF interface, so this is all unused. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16209 --- include/sys/crypto/icp.h | 3 - lib/libicp/Makefile.am | 1 - module/Kbuild.in | 1 - module/icp/illumos-crypto.c | 2 - module/icp/io/skein_mod.c | 515 ------------------------------------ 5 files changed, 522 deletions(-) delete mode 100644 module/icp/io/skein_mod.c diff --git a/include/sys/crypto/icp.h b/include/sys/crypto/icp.h index 8c3f19886f..efe283fa92 100644 --- a/include/sys/crypto/icp.h +++ b/include/sys/crypto/icp.h @@ -32,9 +32,6 @@ int aes_mod_fini(void); int sha2_mod_init(void); int sha2_mod_fini(void); -int skein_mod_init(void); -int skein_mod_fini(void); - int icp_init(void); void icp_fini(void); diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am index f40512bec9..ce24d13a76 100644 --- a/lib/libicp/Makefile.am +++ b/lib/libicp/Makefile.am @@ -31,7 +31,6 @@ nodist_libicp_la_SOURCES = \ module/icp/illumos-crypto.c \ module/icp/io/aes.c \ module/icp/io/sha2_mod.c \ - module/icp/io/skein_mod.c \ module/icp/core/kcf_sched.c \ module/icp/core/kcf_prov_lib.c \ module/icp/core/kcf_callprov.c \ diff --git a/module/Kbuild.in b/module/Kbuild.in index 6e2eab2258..9e44364b75 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -122,7 +122,6 @@ ICP_OBJS := \ illumos-crypto.o \ io/aes.o \ io/sha2_mod.o \ - io/skein_mod.o \ spi/kcf_spi.o 
ICP_OBJS_X86_64 := \ diff --git a/module/icp/illumos-crypto.c b/module/icp/illumos-crypto.c index 13f05c06ed..f5ed3e13fa 100644 --- a/module/icp/illumos-crypto.c +++ b/module/icp/illumos-crypto.c @@ -107,7 +107,6 @@ void icp_fini(void) { - skein_mod_fini(); sha2_mod_fini(); aes_mod_fini(); kcf_sched_destroy(); @@ -134,7 +133,6 @@ icp_init(void) /* initialize algorithms */ aes_mod_init(); sha2_mod_init(); - skein_mod_init(); return (0); } diff --git a/module/icp/io/skein_mod.c b/module/icp/io/skein_mod.c deleted file mode 100644 index 3e969513be..0000000000 --- a/module/icp/io/skein_mod.c +++ /dev/null @@ -1,515 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2013 Saso Kiselkov. All rights reserved. 
- */ - -#include -#include -#include -#include -#define SKEIN_MODULE_IMPL -#include - -static const crypto_mech_info_t skein_mech_info_tab[] = { - {CKM_SKEIN_256_MAC, SKEIN_256_MAC_MECH_INFO_TYPE, - CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, - {CKM_SKEIN_512_MAC, SKEIN_512_MAC_MECH_INFO_TYPE, - CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, - {CKM_SKEIN1024_MAC, SKEIN1024_MAC_MECH_INFO_TYPE, - CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, -}; - -static int skein_update(crypto_ctx_t *, crypto_data_t *); -static int skein_final(crypto_ctx_t *, crypto_data_t *); - -static int skein_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *, - crypto_spi_ctx_template_t); -static int skein_mac_atomic(crypto_mechanism_t *, crypto_key_t *, - crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t); - -static const crypto_mac_ops_t skein_mac_ops = { - .mac_init = skein_mac_init, - .mac = NULL, - .mac_update = skein_update, /* using regular digest update is OK here */ - .mac_final = skein_final, /* using regular digest final is OK here */ - .mac_atomic = skein_mac_atomic, - .mac_verify_atomic = NULL -}; - -static int skein_create_ctx_template(crypto_mechanism_t *, crypto_key_t *, - crypto_spi_ctx_template_t *, size_t *); -static int skein_free_context(crypto_ctx_t *); - -static const crypto_ctx_ops_t skein_ctx_ops = { - .create_ctx_template = skein_create_ctx_template, - .free_context = skein_free_context -}; - -static const crypto_ops_t skein_crypto_ops = { - NULL, - &skein_mac_ops, - &skein_ctx_ops, -}; - -static const crypto_provider_info_t skein_prov_info = { - "Skein Software Provider", - &skein_crypto_ops, - sizeof (skein_mech_info_tab) / sizeof (crypto_mech_info_t), - skein_mech_info_tab -}; - -static crypto_kcf_provider_handle_t skein_prov_handle = 0; - -typedef struct skein_ctx { - skein_mech_type_t sc_mech_type; - size_t sc_digest_bitlen; - /*LINTED(E_ANONYMOUS_UNION_DECL)*/ - union { - Skein_256_Ctxt_t sc_256; - Skein_512_Ctxt_t sc_512; - Skein1024_Ctxt_t sc_1024; - }; -} 
skein_ctx_t; -#define SKEIN_CTX(_ctx_) ((skein_ctx_t *)((_ctx_)->cc_provider_private)) -#define SKEIN_CTX_LVALUE(_ctx_) (_ctx_)->cc_provider_private -#define SKEIN_OP(_skein_ctx, _op, ...) \ - do { \ - skein_ctx_t *sc = (_skein_ctx); \ - switch (sc->sc_mech_type) { \ - case SKEIN_256_MAC_MECH_INFO_TYPE: \ - (void) Skein_256_ ## _op(&sc->sc_256, __VA_ARGS__);\ - break; \ - case SKEIN_512_MAC_MECH_INFO_TYPE: \ - (void) Skein_512_ ## _op(&sc->sc_512, __VA_ARGS__);\ - break; \ - case SKEIN1024_MAC_MECH_INFO_TYPE: \ - (void) Skein1024_ ## _op(&sc->sc_1024, __VA_ARGS__);\ - break; \ - } \ - } while (0) - -static int -skein_get_digest_bitlen(const crypto_mechanism_t *mechanism, size_t *result) -{ - if (mechanism->cm_param != NULL) { - /*LINTED(E_BAD_PTR_CAST_ALIGN)*/ - skein_param_t *param = (skein_param_t *)mechanism->cm_param; - - if (mechanism->cm_param_len != sizeof (*param) || - param->sp_digest_bitlen == 0) { - return (CRYPTO_MECHANISM_PARAM_INVALID); - } - *result = param->sp_digest_bitlen; - } else { - return (CRYPTO_MECHANISM_INVALID); - } - return (CRYPTO_SUCCESS); -} - -int -skein_mod_init(void) -{ - /* - * Try to register with KCF - failure shouldn't unload us, since we - * still may want to continue providing misc/skein functionality. - */ - (void) crypto_register_provider(&skein_prov_info, &skein_prov_handle); - - return (0); -} - -int -skein_mod_fini(void) -{ - int ret = 0; - - if (skein_prov_handle != 0) { - if ((ret = crypto_unregister_provider(skein_prov_handle)) != - CRYPTO_SUCCESS) { - cmn_err(CE_WARN, - "skein _fini: crypto_unregister_provider() " - "failed (0x%x)", ret); - return (EBUSY); - } - skein_prov_handle = 0; - } - - return (0); -} - -/* - * General Skein hashing helper functions. - */ - -/* - * Performs an Update on a context with uio input data. 
- */ -static int -skein_digest_update_uio(skein_ctx_t *ctx, const crypto_data_t *data) -{ - off_t offset = data->cd_offset; - size_t length = data->cd_length; - uint_t vec_idx = 0; - size_t cur_len; - zfs_uio_t *uio = data->cd_uio; - - /* we support only kernel buffer */ - if (zfs_uio_segflg(uio) != UIO_SYSSPACE) - return (CRYPTO_ARGUMENTS_BAD); - - /* - * Jump to the first iovec containing data to be - * digested. - */ - offset = zfs_uio_index_at_offset(uio, offset, &vec_idx); - if (vec_idx == zfs_uio_iovcnt(uio)) { - /* - * The caller specified an offset that is larger than the - * total size of the buffers it provided. - */ - return (CRYPTO_DATA_LEN_RANGE); - } - - /* - * Now do the digesting on the iovecs. - */ - while (vec_idx < zfs_uio_iovcnt(uio) && length > 0) { - cur_len = MIN(zfs_uio_iovlen(uio, vec_idx) - offset, length); - SKEIN_OP(ctx, Update, (uint8_t *)zfs_uio_iovbase(uio, vec_idx) - + offset, cur_len); - length -= cur_len; - vec_idx++; - offset = 0; - } - - if (vec_idx == zfs_uio_iovcnt(uio) && length > 0) { - /* - * The end of the specified iovec's was reached but - * the length requested could not be processed, i.e. - * The caller requested to digest more data than it provided. - */ - return (CRYPTO_DATA_LEN_RANGE); - } - - return (CRYPTO_SUCCESS); -} - -/* - * Performs a Final on a context and writes to a uio digest output. - */ -static int -skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest) -{ - off_t offset = digest->cd_offset; - uint_t vec_idx = 0; - zfs_uio_t *uio = digest->cd_uio; - - /* we support only kernel buffer */ - if (zfs_uio_segflg(uio) != UIO_SYSSPACE) - return (CRYPTO_ARGUMENTS_BAD); - - /* - * Jump to the first iovec containing ptr to the digest to be returned. - */ - offset = zfs_uio_index_at_offset(uio, offset, &vec_idx); - if (vec_idx == zfs_uio_iovcnt(uio)) { - /* - * The caller specified an offset that is larger than the - * total size of the buffers it provided. 
- */ - return (CRYPTO_DATA_LEN_RANGE); - } - if (offset + CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen) <= - zfs_uio_iovlen(uio, vec_idx)) { - /* The computed digest will fit in the current iovec. */ - SKEIN_OP(ctx, Final, - (uchar_t *)zfs_uio_iovbase(uio, vec_idx) + offset); - } else { - uint8_t *digest_tmp; - off_t scratch_offset = 0; - size_t length = CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen); - size_t cur_len; - - digest_tmp = kmem_alloc(CRYPTO_BITS2BYTES( - ctx->sc_digest_bitlen), KM_SLEEP); - if (digest_tmp == NULL) - return (CRYPTO_HOST_MEMORY); - SKEIN_OP(ctx, Final, digest_tmp); - while (vec_idx < zfs_uio_iovcnt(uio) && length > 0) { - cur_len = MIN(zfs_uio_iovlen(uio, vec_idx) - offset, - length); - memcpy(zfs_uio_iovbase(uio, vec_idx) + offset, - digest_tmp + scratch_offset, cur_len); - - length -= cur_len; - vec_idx++; - scratch_offset += cur_len; - offset = 0; - } - kmem_free(digest_tmp, CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen)); - - if (vec_idx == zfs_uio_iovcnt(uio) && length > 0) { - /* - * The end of the specified iovec's was reached but - * the length requested could not be processed, i.e. - * The caller requested to digest more data than it - * provided. - */ - return (CRYPTO_DATA_LEN_RANGE); - } - } - - return (CRYPTO_SUCCESS); -} - -/* - * KCF software provider digest entry points. - */ - -/* - * Performs a skein Update with the input message in `data' (successive calls - * can push more data). This is used both for digest and MAC operation. - * Supported input data formats are raw, uio and mblk. 
- */ -static int -skein_update(crypto_ctx_t *ctx, crypto_data_t *data) -{ - int error = CRYPTO_SUCCESS; - - ASSERT(SKEIN_CTX(ctx) != NULL); - - switch (data->cd_format) { - case CRYPTO_DATA_RAW: - SKEIN_OP(SKEIN_CTX(ctx), Update, - (uint8_t *)data->cd_raw.iov_base + data->cd_offset, - data->cd_length); - break; - case CRYPTO_DATA_UIO: - error = skein_digest_update_uio(SKEIN_CTX(ctx), data); - break; - default: - error = CRYPTO_ARGUMENTS_BAD; - } - - return (error); -} - -/* - * Performs a skein Final, writing the output to `digest'. This is used both - * for digest and MAC operation. - * Supported output digest formats are raw, uio and mblk. - */ -static int -skein_final_nofree(crypto_ctx_t *ctx, crypto_data_t *digest) -{ - int error = CRYPTO_SUCCESS; - - ASSERT(SKEIN_CTX(ctx) != NULL); - - if (digest->cd_length < - CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen)) { - digest->cd_length = - CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen); - return (CRYPTO_BUFFER_TOO_SMALL); - } - - switch (digest->cd_format) { - case CRYPTO_DATA_RAW: - SKEIN_OP(SKEIN_CTX(ctx), Final, - (uint8_t *)digest->cd_raw.iov_base + digest->cd_offset); - break; - case CRYPTO_DATA_UIO: - error = skein_digest_final_uio(SKEIN_CTX(ctx), digest); - break; - default: - error = CRYPTO_ARGUMENTS_BAD; - } - - if (error == CRYPTO_SUCCESS) - digest->cd_length = - CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen); - else - digest->cd_length = 0; - - return (error); -} - -static int -skein_final(crypto_ctx_t *ctx, crypto_data_t *digest) -{ - int error = skein_final_nofree(ctx, digest); - - if (error == CRYPTO_BUFFER_TOO_SMALL) - return (error); - - memset(SKEIN_CTX(ctx), 0, sizeof (*SKEIN_CTX(ctx))); - kmem_free(SKEIN_CTX(ctx), sizeof (*(SKEIN_CTX(ctx)))); - SKEIN_CTX_LVALUE(ctx) = NULL; - - return (error); -} - -/* - * Helper function that builds a Skein MAC context from the provided - * mechanism and key. 
- */ -static int -skein_mac_ctx_build(skein_ctx_t *ctx, crypto_mechanism_t *mechanism, - crypto_key_t *key) -{ - int error; - - if (!VALID_SKEIN_MAC_MECH(mechanism->cm_type)) - return (CRYPTO_MECHANISM_INVALID); - ctx->sc_mech_type = mechanism->cm_type; - error = skein_get_digest_bitlen(mechanism, &ctx->sc_digest_bitlen); - if (error != CRYPTO_SUCCESS) - return (error); - SKEIN_OP(ctx, InitExt, ctx->sc_digest_bitlen, 0, key->ck_data, - CRYPTO_BITS2BYTES(key->ck_length)); - - return (CRYPTO_SUCCESS); -} - -/* - * KCF software provide mac entry points. - */ -/* - * Initializes a skein MAC context. You may pass a ctx_template, in which - * case the template will be reused to make initialization more efficient. - * Otherwise a new context will be constructed. The mechanism cm_type must - * be one of SKEIN_*_MAC_MECH_INFO_TYPE. Same as in skein_digest_init, you - * may pass a skein_param_t in cm_param to configure the length of the - * digest. The key must be in raw format. - */ -static int -skein_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, - crypto_key_t *key, crypto_spi_ctx_template_t ctx_template) -{ - int error; - - SKEIN_CTX_LVALUE(ctx) = kmem_alloc(sizeof (*SKEIN_CTX(ctx)), KM_SLEEP); - if (SKEIN_CTX(ctx) == NULL) - return (CRYPTO_HOST_MEMORY); - - if (ctx_template != NULL) { - memcpy(SKEIN_CTX(ctx), ctx_template, - sizeof (*SKEIN_CTX(ctx))); - } else { - error = skein_mac_ctx_build(SKEIN_CTX(ctx), mechanism, key); - if (error != CRYPTO_SUCCESS) - goto errout; - } - - return (CRYPTO_SUCCESS); -errout: - memset(SKEIN_CTX(ctx), 0, sizeof (*SKEIN_CTX(ctx))); - kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); - return (error); -} - -/* - * The MAC update and final calls are reused from the regular digest code. - */ - -/* - * Same as skein_digest_atomic, performs an atomic Skein MAC operation in - * one step. All the same properties apply to the arguments of this - * function as to those of the partial operations above. 
- */ -static int -skein_mac_atomic(crypto_mechanism_t *mechanism, - crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac, - crypto_spi_ctx_template_t ctx_template) -{ - /* faux crypto context just for skein_digest_{update,final} */ - int error; - crypto_ctx_t ctx; - skein_ctx_t skein_ctx; - SKEIN_CTX_LVALUE(&ctx) = &skein_ctx; - - if (ctx_template != NULL) { - memcpy(&skein_ctx, ctx_template, sizeof (skein_ctx)); - } else { - error = skein_mac_ctx_build(&skein_ctx, mechanism, key); - if (error != CRYPTO_SUCCESS) - goto errout; - } - - if ((error = skein_update(&ctx, data)) != CRYPTO_SUCCESS) - goto errout; - if ((error = skein_final_nofree(&ctx, mac)) != CRYPTO_SUCCESS) - goto errout; - - return (CRYPTO_SUCCESS); -errout: - memset(&skein_ctx, 0, sizeof (skein_ctx)); - return (error); -} - -/* - * KCF software provider context management entry points. - */ - -/* - * Constructs a context template for the Skein MAC algorithm. The same - * properties apply to the arguments of this function as to those of - * skein_mac_init. - */ -static int -skein_create_ctx_template(crypto_mechanism_t *mechanism, crypto_key_t *key, - crypto_spi_ctx_template_t *ctx_template, size_t *ctx_template_size) -{ - int error; - skein_ctx_t *ctx_tmpl; - - ctx_tmpl = kmem_alloc(sizeof (*ctx_tmpl), KM_SLEEP); - if (ctx_tmpl == NULL) - return (CRYPTO_HOST_MEMORY); - error = skein_mac_ctx_build(ctx_tmpl, mechanism, key); - if (error != CRYPTO_SUCCESS) - goto errout; - *ctx_template = ctx_tmpl; - *ctx_template_size = sizeof (*ctx_tmpl); - - return (CRYPTO_SUCCESS); -errout: - memset(ctx_tmpl, 0, sizeof (*ctx_tmpl)); - kmem_free(ctx_tmpl, sizeof (*ctx_tmpl)); - return (error); -} - -/* - * Frees a skein context in a parent crypto context. 
- */ -static int -skein_free_context(crypto_ctx_t *ctx) -{ - if (SKEIN_CTX(ctx) != NULL) { - memset(SKEIN_CTX(ctx), 0, sizeof (*SKEIN_CTX(ctx))); - kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); - SKEIN_CTX_LVALUE(ctx) = NULL; - } - - return (CRYPTO_SUCCESS); -} From 4e714c0be10f53eea2d87e6af67bf46d67e94db2 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 19 May 2024 21:40:59 +1000 Subject: [PATCH 106/113] icp: remove unused headers Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16209 --- module/icp/include/sys/stack.h | 36 ---------------------------------- module/icp/include/sys/trap.h | 36 ---------------------------------- 2 files changed, 72 deletions(-) delete mode 100644 module/icp/include/sys/stack.h delete mode 100644 module/icp/include/sys/trap.h diff --git a/module/icp/include/sys/stack.h b/module/icp/include/sys/stack.h deleted file mode 100644 index 0bace018b5..0000000000 --- a/module/icp/include/sys/stack.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 
- * Use is subject to license terms. - */ - -#ifndef _SYS_STACK_H -#define _SYS_STACK_H - -#if defined(__i386) || defined(__amd64) - -#include /* XX64 x86/sys/stack.h */ - -#endif - -#endif /* _SYS_STACK_H */ diff --git a/module/icp/include/sys/trap.h b/module/icp/include/sys/trap.h deleted file mode 100644 index 2f47d43939..0000000000 --- a/module/icp/include/sys/trap.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_TRAP_H -#define _SYS_TRAP_H - -#if defined(__i386) || defined(__amd64) - -#include /* XX64 x86/sys/trap.h */ - -#endif - -#endif /* _SYS_TRAP_H */ From a72751a34265492efc115bb6a773e961bb9be82d Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 19 May 2024 21:49:19 +1000 Subject: [PATCH 107/113] icp: remove redundant FreeBSD check We don't build illumos-crypto for FreeBSD. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16209 --- module/icp/illumos-crypto.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/module/icp/illumos-crypto.c b/module/icp/illumos-crypto.c index f5ed3e13fa..89736a61bc 100644 --- a/module/icp/illumos-crypto.c +++ b/module/icp/illumos-crypto.c @@ -136,8 +136,3 @@ icp_init(void) return (0); } - -#if defined(_KERNEL) && defined(__FreeBSD__) -module_exit(icp_fini); -module_init(icp_init); -#endif From 23a489a41167890cdd227366a5f950170df8cc9b Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Tue, 4 Jun 2024 04:28:43 +0500 Subject: [PATCH 108/113] zdb: detect cachefile automatically otherwise force import If a pool is created with the cache file located in a non-default path /etc/default/zpool.cache, removed, or the cachefile property is set to none, zdb fails to show the pool unless we specify the cache file or use the -e option. This PR automates this process. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Akash B Signed-off-by: Ameer Hamza Closes #16071 --- cmd/zdb/zdb.c | 66 ++++++++++++++++++++++++++++++--- include/libzfs_core.h | 1 + lib/libzfs_core/libzfs_core.abi | 6 +++ lib/libzfs_core/libzfs_core.c | 6 +++ 4 files changed, 74 insertions(+), 5 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 704fcf4422..2a3d58d77e 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -89,6 +89,7 @@ #include #include +#include #include @@ -8924,6 +8925,9 @@ main(int argc, char **argv) boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE; nvlist_t *cfg = NULL; struct sigaction action; + boolean_t force_import = B_FALSE; + boolean_t config_path_console = B_FALSE; + char pbuf[MAXPATHLEN]; dprintf_setup(&argc, argv); @@ -9094,6 +9098,7 @@ main(int argc, char **argv) } break; case 'U': + config_path_console = B_TRUE; spa_config_path = optarg; if (spa_config_path[0] != '/') { (void) fprintf(stderr, @@ -9153,9 +9158,6 @@ main(int argc, char **argv) */ 
spa_mode_readable_spacemaps = B_TRUE; - kernel_init(SPA_MODE_READ); - kernel_init_done = B_TRUE; - if (dump_all) verbose = MAX(verbose, 1); @@ -9174,6 +9176,53 @@ main(int argc, char **argv) if (argc < 2 && dump_opt['R']) usage(); + target = argv[0]; + + /* + * Automate cachefile + */ + if (!spa_config_path_env && !config_path_console && target && + libzfs_core_init() == 0) { + char *pname = strdup(target); + const char *value; + nvlist_t *pnvl; + nvlist_t *vnvl; + + if (strpbrk(pname, "/@") != NULL) + *strpbrk(pname, "/@") = '\0'; + + if (pname && lzc_get_props(pname, &pnvl) == 0) { + if (nvlist_lookup_nvlist(pnvl, "cachefile", + &vnvl) == 0) { + value = fnvlist_lookup_string(vnvl, + ZPROP_VALUE); + } else { + value = "-"; + } + strlcpy(pbuf, value, sizeof (pbuf)); + if (pbuf[0] != '\0') { + if (pbuf[0] == '/') { + if (access(pbuf, F_OK) == 0) + spa_config_path = pbuf; + else + force_import = B_TRUE; + } else if ((strcmp(pbuf, "-") == 0 && + access(ZPOOL_CACHE, F_OK) != 0) || + strcmp(pbuf, "none") == 0) { + force_import = B_TRUE; + } + } + nvlist_free(vnvl); + } + + free(pname); + nvlist_free(pnvl); + libzfs_core_fini(); + } + + kernel_init(SPA_MODE_READ); + kernel_init_done = B_TRUE; + if (dump_opt['E']) { if (argc != 1) usage(); @@ -9210,7 +9259,6 @@ main(int argc, char **argv) fatal("internal error: %s", strerror(ENOMEM)); error = 0; - target = argv[0]; if (strpbrk(target, "/@") != NULL) { size_t targetlen; @@ -9256,9 +9304,17 @@ main(int argc, char **argv) target_pool = target; } - if (dump_opt['e']) { + if (dump_opt['e'] || force_import) { importargs_t args = { 0 }; + /* + * If path is not provided, search in /dev + */ + if (searchdirs == NULL) { + searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL); + searchdirs[nsearch++] = (char *)ZFS_DEVDIR; + } + args.paths = nsearch; args.path = searchdirs; args.can_be_active = B_TRUE; diff --git a/include/libzfs_core.h b/include/libzfs_core.h index 867c18b9c2..b2fd97372c 100644 --- a/include/libzfs_core.h +++ 
b/include/libzfs_core.h @@ -77,6 +77,7 @@ _LIBZFS_CORE_H int lzc_snaprange_space(const char *, const char *, uint64_t *); _LIBZFS_CORE_H int lzc_hold(nvlist_t *, int, nvlist_t **); _LIBZFS_CORE_H int lzc_release(nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_get_holds(const char *, nvlist_t **); +_LIBZFS_CORE_H int lzc_get_props(const char *, nvlist_t **); enum lzc_send_flags { LZC_SEND_FLAG_EMBED_DATA = 1 << 0, diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index cf9d6bddc9..c20698580e 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -168,6 +168,7 @@ + @@ -2694,6 +2695,11 @@ + + + + + diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index 01d803e21d..070f8c1be6 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -596,6 +596,12 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp) return (lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, NULL, holdsp)); } +int +lzc_get_props(const char *poolname, nvlist_t **props) +{ + return (lzc_ioctl(ZFS_IOC_POOL_GET_PROPS, poolname, NULL, props)); +} + static unsigned int max_pipe_buffer(int infd) { From b558f0a9d65c3bdb8310504184a82e9802551168 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Fri, 7 Jun 2024 05:01:26 +0500 Subject: [PATCH 109/113] zdb: fix FreeBSD build failure This fixes FreeBSD build failure with clang-18 after 23a489a got merged. 
Reviewed-by: Alexander Motin Reviewed-by: Rob Norris Reviewed-by: Brian Behlendorf Signed-off-by: Ameer Hamza Closes #16252 --- cmd/zdb/zdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 2a3d58d77e..3a7ef11612 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -9185,7 +9185,7 @@ main(int argc, char **argv) libzfs_core_init() == 0) { char *pname = strdup(target); const char *value; - nvlist_t *pnvl; + nvlist_t *pnvl = NULL; nvlist_t *vnvl; if (strpbrk(pname, "/@") != NULL) From 4de260efe3375fe62b9d80452b9203c89ab0c045 Mon Sep 17 00:00:00 2001 From: Derek Schrock Date: Thu, 6 Jun 2024 20:37:26 -0400 Subject: [PATCH 110/113] contrib/bash_completion.d: squelch FreeBSD seq when first < last When running seq x -1 z with x less than z, FreeBSD seq prints the error: $ seq 1 -1 2 seq: needs positive increment Hide this error. Alternatively $COMP_CWORD could be checked for < 2. Reviewed-by: Brian Behlendorf Signed-off-by: Derek Schrock Closes #16234 --- contrib/bash_completion.d/zfs.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/bash_completion.d/zfs.in b/contrib/bash_completion.d/zfs.in index c5cfd8e8ef..dbeb10d899 100644 --- a/contrib/bash_completion.d/zfs.in +++ b/contrib/bash_completion.d/zfs.in @@ -155,7 +155,7 @@ __zfs_list_volumes() __zfs_argument_chosen() { local word property - for word in $(seq $((COMP_CWORD-1)) -1 2) + for word in $(seq $((COMP_CWORD-1)) -1 2 2>/dev/null) do local prev="${COMP_WORDS[$word]}" if [[ ${COMP_WORDS[$word-1]} != -[tos] ]] From 20c8bdd85ef4716d5e59d9f6f61347c0e4566750 Mon Sep 17 00:00:00 2001 From: bnovkov <72801811+bnovkov@users.noreply.github.com> Date: Fri, 7 Jun 2024 03:11:00 +0200 Subject: [PATCH 111/113] FreeBSD: Update use of UMA-related symbols in arc_available_memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent UMA changes repurposed the use of UMA_MD_SMALL_ALLOC in a way that breaks
arc_available_memory on -CURRENT. This change ensures that arc_available_memory uses the new symbol while maintaining compatibility with older FreeBSD releases. Reviewed-by: Brian Behlendorf Signed-off-by: Bojan Novković Closes #16230 --- module/os/freebsd/zfs/arc_os.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index 92696c0bf1..e271d3bf98 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -89,17 +89,17 @@ arc_available_memory(void) if (n < lowest) { lowest = n; } -#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) +#if !defined(UMA_MD_SMALL_ALLOC) && !defined(UMA_USE_DMAP) /* - * If we're on an i386 platform, it's possible that we'll exhaust the - * kernel heap space before we ever run out of available physical - * memory. Most checks of the size of the heap_area compare against - * tune.t_minarmem, which is the minimum available real memory that we - * can have in the system. However, this is generally fixed at 25 pages - * which is so low that it's useless. In this comparison, we seek to - * calculate the total heap-size, and reclaim if more than 3/4ths of the - * heap is allocated. (Or, in the calculation, if less than 1/4th is - * free) + * If we're on a platform without a direct map, it's possible that we'll + * exhaust the kernel heap space before we ever run out of available + * physical memory. Most checks of the size of the heap_area compare + * against tune.t_minarmem, which is the minimum available real memory + * that we can have in the system. However, this is generally fixed at + * 25 pages which is so low that it's useless. In this comparison, we + * seek to calculate the total heap-size, and reclaim if more than + * 3/4ths of the heap is allocated. 
(Or, in the calculation, if less + * than 1/4th is free) */ n = uma_avail() - (long)(uma_limit() / 4); if (n < lowest) { From 121a2d335414fe294c948795ee9406bab966588f Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 14 Jun 2024 02:49:50 +0200 Subject: [PATCH 112/113] FreeBSD: unregister mountroot eventhandler on unload Otherwise if zfs is unloaded and reroot is being used it trips over a stale pointer. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Sponsored by: Rubicon Communications, LLC ("Netgate") Signed-off-by: Mateusz Guzik Closes #16242 --- module/os/freebsd/zfs/kmod_core.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/module/os/freebsd/zfs/kmod_core.c b/module/os/freebsd/zfs/kmod_core.c index 2bced9ab64..9bd0c9ed81 100644 --- a/module/os/freebsd/zfs/kmod_core.c +++ b/module/os/freebsd/zfs/kmod_core.c @@ -111,6 +111,7 @@ static int zfs__fini(void); static void zfs_shutdown(void *, int); static eventhandler_tag zfs_shutdown_event_tag; +static eventhandler_tag zfs_mountroot_event_tag; #define ZFS_MIN_KSTACK_PAGES 4 @@ -305,16 +306,25 @@ zfs_modevent(module_t mod, int type, void *unused __unused) switch (type) { case MOD_LOAD: err = zfs__init(); - if (err == 0) + if (err == 0) { zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( shutdown_post_sync, zfs_shutdown, NULL, SHUTDOWN_PRI_FIRST); + zfs_mountroot_event_tag = EVENTHANDLER_REGISTER( + mountroot, spa_boot_init, NULL, + SI_ORDER_ANY); + } return (err); case MOD_UNLOAD: err = zfs__fini(); - if (err == 0 && zfs_shutdown_event_tag != NULL) - EVENTHANDLER_DEREGISTER(shutdown_post_sync, - zfs_shutdown_event_tag); + if (err == 0) { + if (zfs_shutdown_event_tag != NULL) + EVENTHANDLER_DEREGISTER(shutdown_post_sync, + zfs_shutdown_event_tag); + if (zfs_mountroot_event_tag != NULL) + EVENTHANDLER_DEREGISTER(mountroot, + zfs_mountroot_event_tag); + } return (err); case MOD_SHUTDOWN: return (0); @@ -330,9 +340,6 @@ static moduledata_t zfs_mod = { 0 }; -#ifdef 
_KERNEL -EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); -#endif FEATURE(zfs, "OpenZFS support"); From c98295eed2687cee704ef5f8f3218d3d44a6a1d8 Mon Sep 17 00:00:00 2001 From: Martin Wagner Date: Fri, 14 Jun 2024 03:08:49 +0200 Subject: [PATCH 113/113] disable automatic dependency tracking for dkms builds Previously the dkms build left some unwanted files in `/usr/lib/modules` which could cause package managers to not properly clean up old kernels. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Martin Wagner Closes #16221 Closes #16241 --- scripts/dkms.mkconf | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/dkms.mkconf b/scripts/dkms.mkconf index 0bd3834204..046ce9edce 100755 --- a/scripts/dkms.mkconf +++ b/scripts/dkms.mkconf @@ -26,6 +26,7 @@ PACKAGE_VERSION="${pkgver}" PACKAGE_CONFIG="${pkgcfg}" NO_WEAK_MODULES="yes" PRE_BUILD="configure + --disable-dependency-tracking --prefix=/usr --with-config=kernel --with-linux=\$(