From 575872cc37a744928223dae2c24964d06f670e2d Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 9 Apr 2024 19:23:19 -0400 Subject: [PATCH] L2ARC: Relax locking during write Previous code held ARC state sublist lock throughout all L2ARC write process, which included number of allocations and even ZIO issues. Being blocked in any of those places the code could also block ARC eviction, that could cause OOM activation or even dead- lock if system is low on memory or one is too fragmented. Fix it by dropping the lock as soon as we see a block eligible for L2ARC writing and pick it up later using earlier inserted marker. While there, also reduce scope of hash lock, moving ZIO allocation and other operations not requiring header access out of it. All operations requiring header access move under hash lock, since L2_WRITING flag does not prevent header eviction only transition to arc_l2c_only state with L1 header. To be able to manipulate sublist lock and marker as needed add few more multilist functions and modify one. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16040 --- include/sys/multilist.h | 5 +- module/zfs/arc.c | 189 +++++++++++++++++++++------------------- module/zfs/dbuf.c | 2 +- module/zfs/dmu_objset.c | 10 +-- module/zfs/metaslab.c | 8 +- module/zfs/multilist.c | 26 +++++- 6 files changed, 136 insertions(+), 104 deletions(-) diff --git a/include/sys/multilist.h b/include/sys/multilist.h index 26f37c37ab..e7de86f237 100644 --- a/include/sys/multilist.h +++ b/include/sys/multilist.h @@ -82,12 +82,15 @@ int multilist_is_empty(multilist_t *); unsigned int multilist_get_num_sublists(multilist_t *); unsigned int multilist_get_random_index(multilist_t *); -multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); +void multilist_sublist_lock(multilist_sublist_t *); +multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int); multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *); void multilist_sublist_unlock(multilist_sublist_t *); void multilist_sublist_insert_head(multilist_sublist_t *, void *); void multilist_sublist_insert_tail(multilist_sublist_t *, void *); +void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *); +void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *); void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); void multilist_sublist_remove(multilist_sublist_t *, void *); int multilist_sublist_is_empty(multilist_sublist_t *); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 4db6c06148..1953640139 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -3872,7 +3872,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, ASSERT3P(marker, !=, NULL); - mls = multilist_sublist_lock(ml, idx); + mls = multilist_sublist_lock_idx(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { @@ -3984,6 +3984,26 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, return (bytes_evicted); } +static arc_buf_hdr_t * +arc_state_alloc_marker(void) +{ + arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); + + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_evict_state_impl(). + */ + marker->b_spa = 0; + + return (marker); +} + +static void +arc_state_free_marker(arc_buf_hdr_t *marker) +{ + kmem_cache_free(hdr_full_cache, marker); +} + /* * Allocate an array of buffer headers used as placeholders during arc state * eviction. @@ -3994,16 +4014,8 @@ arc_state_alloc_markers(int count) arc_buf_hdr_t **markers; markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP); - for (int i = 0; i < count; i++) { - markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); - - /* - * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_evict_state_impl(). - */ - markers[i]->b_spa = 0; - - } + for (int i = 0; i < count; i++) + markers[i] = arc_state_alloc_marker(); return (markers); } @@ -4011,7 +4023,7 @@ static void arc_state_free_markers(arc_buf_hdr_t **markers, int count) { for (int i = 0; i < count; i++) - kmem_cache_free(hdr_full_cache, markers[i]); + arc_state_free_marker(markers[i]); kmem_free(markers, sizeof (*markers) * count); } @@ -4055,7 +4067,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } @@ -4120,7 +4132,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, } for (int i = 0; i < num_sublists; i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); } @@ -8628,7 +8640,7 @@ l2arc_sublist_lock(int list_num) * sublists being selected. */ idx = multilist_get_random_index(ml); - return (multilist_sublist_lock(ml, idx)); + return (multilist_sublist_lock_idx(ml, idx)); } /* @@ -9040,9 +9052,9 @@ l2arc_blk_fetch_done(zio_t *zio) static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; + arc_buf_hdr_t *hdr, *head, *marker; + uint64_t write_asize, write_psize, headroom; + boolean_t full, from_head = !arc_warm; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); @@ -9051,10 +9063,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; - write_lsize = write_asize = write_psize = 0; + write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); + marker = arc_state_alloc_marker(); /* * Copy buffers for L2ARC writing. @@ -9069,40 +9082,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) continue; } - multilist_sublist_t *mls = l2arc_sublist_lock(pass); uint64_t passed_sz = 0; - - VERIFY3P(mls, !=, NULL); - - /* - * L2ARC fast warmup. - * - * Until the ARC is warm and starts to evict, read from the - * head of the ARC lists rather than the tail. - */ - if (arc_warm == B_FALSE) - hdr = multilist_sublist_head(mls); - else - hdr = multilist_sublist_tail(mls); - headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; - for (; hdr; hdr = hdr_prev) { + /* + * Until the ARC is warm and starts to evict, read from the + * head of the ARC lists rather than the tail. + */ + multilist_sublist_t *mls = l2arc_sublist_lock(pass); + ASSERT3P(mls, !=, NULL); + if (from_head) + hdr = multilist_sublist_head(mls); + else + hdr = multilist_sublist_tail(mls); + + while (hdr != NULL) { kmutex_t *hash_lock; abd_t *to_write = NULL; - if (arc_warm == B_FALSE) - hdr_prev = multilist_sublist_next(mls, hdr); - else - hdr_prev = multilist_sublist_prev(mls, hdr); - hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { - /* - * Skip this buffer rather than waiting. - */ +skip: + /* Skip this buffer rather than waiting. */ + if (from_head) + hdr = multilist_sublist_next(mls, hdr); + else + hdr = multilist_sublist_prev(mls, hdr); continue; } @@ -9117,11 +9124,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); - continue; + goto skip; } ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || @@ -9143,12 +9149,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) } /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. + * We should not sleep with sublist lock held or it + * may block ARC eviction. Insert a marker to save + * the position and drop the lock. */ - arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); + if (from_head) { + multilist_sublist_insert_after(mls, hdr, + marker); + } else { + multilist_sublist_insert_before(mls, hdr, + marker); + } + multilist_sublist_unlock(mls); /* * If this header has b_rabd, we can use this since it @@ -9179,32 +9191,45 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, - ARC_FLAG_L2_WRITING); + ARC_FLAG_L2CACHE); mutex_exit(hash_lock); - continue; + goto next; } l2arc_free_abd_on_write(to_write, asize, type); } + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_hits = 0; + hdr->b_l2hdr.b_arcs_state = + hdr->b_l1hdr.b_state->arcs_state; + mutex_enter(&dev->l2ad_mtx); if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ - mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); - mutex_exit(&dev->l2ad_mtx); + } + list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); + arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | + ARC_FLAG_L2_WRITING); + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); + l2arc_hdr_arcstats_increment(hdr); + + boolean_t commit = l2arc_log_blk_insert(dev, hdr); + mutex_exit(hash_lock); + + if (pio == NULL) { cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; - /* - * Create a list to save allocated abd buffers - * for l2arc_log_blk_commit(). - */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); @@ -9212,54 +9237,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ZIO_FLAG_CANFAIL); } - hdr->b_l2hdr.b_dev = dev; - hdr->b_l2hdr.b_hits = 0; - - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - hdr->b_l2hdr.b_arcs_state = - hdr->b_l1hdr.b_state->arcs_state; - arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); - - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, hdr); - mutex_exit(&dev->l2ad_mtx); - - (void) zfs_refcount_add_many(&dev->l2ad_alloc, - arc_hdr_size(hdr), hdr); - wzio = zio_write_phys(pio, dev->l2ad_vdev, - hdr->b_l2hdr.b_daddr, asize, to_write, + dev->l2ad_hand, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); - write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); + zio_nowait(wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; - l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); - mutex_exit(hash_lock); - - /* - * Append buf info to current log and commit if full. - * arcstat_l2_{size,asize} kstats are updated - * internally. - */ - if (l2arc_log_blk_insert(dev, hdr)) { - /* - * l2ad_hand will be adjusted in - * l2arc_log_blk_commit(). - */ + if (commit) { + /* l2ad_hand will be adjusted inside. */ write_asize += l2arc_log_blk_commit(dev, pio, cb); } - zio_nowait(wzio); +next: + multilist_sublist_lock(mls); + if (from_head) + hdr = multilist_sublist_next(mls, marker); + else + hdr = multilist_sublist_prev(mls, marker); + multilist_sublist_remove(mls, marker); } multilist_sublist_unlock(mls); @@ -9268,9 +9273,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } + arc_state_free_marker(marker); + /* No buffers selected for writing? */ if (pio == NULL) { - ASSERT0(write_lsize); + ASSERT0(write_psize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); @@ -10598,7 +10605,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); - L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); + L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 5d1887175a..72aaf7f198 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -754,7 +754,7 @@ static void dbuf_evict_one(void) { int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache); - multilist_sublist_t *mls = multilist_sublist_lock( + multilist_sublist_t *mls = multilist_sublist_lock_idx( &dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index d134d4958f..f8bd7422a5 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1633,7 +1633,7 @@ sync_dnodes_task(void *arg) sync_dnodes_arg_t *sda = arg; multilist_sublist_t *ms = - multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); + multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx); dmu_objset_sync_dnodes(ms, sda->sda_tx); @@ -1987,8 +1987,8 @@ userquota_updates_task(void *arg) dnode_t *dn; userquota_cache_t cache = { { 0 } }; - multilist_sublist_t *list = - multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_t *list = multilist_sublist_lock_idx( + &os->os_synced_dnodes, uua->uua_sublist_idx); ASSERT(multilist_sublist_head(list) == NULL || dmu_objset_userused_enabled(os)); @@ -2070,8 +2070,8 @@ dnode_rele_task(void *arg) userquota_updates_arg_t *uua = arg; objset_t *os = uua->uua_os; - multilist_sublist_t *list = - multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_t *list = multilist_sublist_lock_idx( + &os->os_synced_dnodes, uua->uua_sublist_idx); dnode_t *dn; while ((dn = multilist_sublist_head(list)) != NULL) { diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 5809a832bc..dbfc00362f 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -641,7 +641,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; for (int i = 0; i < multilist_get_num_sublists(ml); i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL) { @@ -658,7 +658,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) i--; break; } - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); if (txg > @@ -2190,12 +2190,12 @@ metaslab_potentially_evict(metaslab_class_t *mc) unsigned int idx = multilist_get_random_index( &mc->mc_metaslab_txg_list); multilist_sublist_t *mls = - multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx); + multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < inuse * size) { - VERIFY3P(mls, ==, multilist_sublist_lock( + VERIFY3P(mls, ==, multilist_sublist_lock_idx( &mc->mc_metaslab_txg_list, idx)); ASSERT3U(idx, ==, metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c index b1cdf1c5c5..3d3ef86e68 100644 --- a/module/zfs/multilist.c +++ b/module/zfs/multilist.c @@ -277,9 +277,15 @@ multilist_get_random_index(multilist_t *ml) return (random_in_range(ml->ml_num_sublists)); } +void +multilist_sublist_lock(multilist_sublist_t *mls) +{ + mutex_enter(&mls->mls_lock); +} + /* Lock and return the sublist specified at the given index */ multilist_sublist_t * -multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) +multilist_sublist_lock_idx(multilist_t *ml, unsigned int sublist_idx) { multilist_sublist_t *mls; @@ -294,7 +300,7 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) multilist_sublist_t * multilist_sublist_lock_obj(multilist_t *ml, void *obj) { - return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj))); + return (multilist_sublist_lock_idx(ml, ml->ml_index_func(ml, obj))); } void @@ -327,6 +333,22 @@ multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) list_insert_tail(&mls->mls_list, obj); } +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_after(multilist_sublist_t *mls, void *prev, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_after(&mls->mls_list, prev, obj); +} + +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_before(multilist_sublist_t *mls, void *next, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_before(&mls->mls_list, next, obj); +} + /* * Move the object one element forward in the list. *