ZIL: Fix config lock deadlock.

When we have some LWBs closed and their ZIOs ready to be issued, we
cannot afford to sleep on the config lock if somebody else tries to
lock it as writer, or it will cause a deadlock.

To solve it, move spa_config_enter() from zil_lwb_write_issue() to
zil_lwb_write_close() under zl_issuer_lock to enforce lock ordering
with other threads.  Now if we can't immediately take the config lock,
issue all previously closed LWBs so that they can drop their config
locks after completion, and only then allow sleeping on our lock.

Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #15078
Closes #15080
This commit is contained in:
Alexander Motin 2023-07-24 16:41:11 -04:00 committed by GitHub
parent fb344f5aeb
commit 2cb992a99c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 27 additions and 7 deletions

View File

@ -151,6 +151,7 @@ static kmem_cache_t *zil_lwb_cache;
static kmem_cache_t *zil_zcw_cache; static kmem_cache_t *zil_zcw_cache;
static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb);
static itx_t *zil_itx_clone(itx_t *oitx); static itx_t *zil_itx_clone(itx_t *oitx);
static int static int
@ -1768,7 +1769,7 @@ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
* Has to be called under zl_issuer_lock to chain more lwbs. * Has to be called under zl_issuer_lock to chain more lwbs.
*/ */
static lwb_t * static lwb_t *
zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb) zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs)
{ {
lwb_t *nlwb = NULL; lwb_t *nlwb = NULL;
zil_chain_t *zilc; zil_chain_t *zilc;
@ -1870,6 +1871,27 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb)
dmu_tx_commit(tx); dmu_tx_commit(tx);
/*
* We need to acquire the config lock for the lwb to issue it later.
* However, if we already have a queue of closed parent lwbs holding
* the config lock (but not yet issued), we can't block here waiting
* on the lock or we will deadlock. In that case we must first issue
* the parent IOs before waiting on the lock.
*/
if (ilwbs && !list_is_empty(ilwbs)) {
if (!spa_config_tryenter(spa, SCL_STATE, lwb, RW_READER)) {
lwb_t *tlwb;
while ((tlwb = list_remove_head(ilwbs)) != NULL)
zil_lwb_write_issue(zilog, tlwb);
spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
}
} else {
spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
}
if (ilwbs)
list_insert_tail(ilwbs, lwb);
/* /*
* If there was an allocation failure then nlwb will be null which * If there was an allocation failure then nlwb will be null which
* forces a txg_wait_synced(). * forces a txg_wait_synced().
@ -1933,7 +1955,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc, ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc,
BP_GET_LSIZE(&lwb->lwb_blk)); BP_GET_LSIZE(&lwb->lwb_blk));
} }
spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); ASSERT(spa_config_held(zilog->zl_spa, SCL_STATE, RW_READER));
zil_lwb_add_block(lwb, &lwb->lwb_blk); zil_lwb_add_block(lwb, &lwb->lwb_blk);
lwb->lwb_issued_timestamp = gethrtime(); lwb->lwb_issued_timestamp = gethrtime();
zio_nowait(lwb->lwb_root_zio); zio_nowait(lwb->lwb_root_zio);
@ -2037,8 +2059,7 @@ cont:
lwb_sp < zil_max_waste_space(zilog) && lwb_sp < zil_max_waste_space(zilog) &&
(dlen % max_log_data == 0 || (dlen % max_log_data == 0 ||
lwb_sp < reclen + dlen % max_log_data))) { lwb_sp < reclen + dlen % max_log_data))) {
list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, ilwbs);
lwb = zil_lwb_write_close(zilog, lwb);
if (lwb == NULL) if (lwb == NULL)
return (NULL); return (NULL);
zil_lwb_write_open(zilog, lwb); zil_lwb_write_open(zilog, lwb);
@ -2937,8 +2958,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
zfs_commit_timeout_pct / 100; zfs_commit_timeout_pct / 100;
if (sleep < zil_min_commit_timeout || if (sleep < zil_min_commit_timeout ||
lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) { lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) {
list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, ilwbs);
lwb = zil_lwb_write_close(zilog, lwb);
zilog->zl_cur_used = 0; zilog->zl_cur_used = 0;
if (lwb == NULL) { if (lwb == NULL) {
while ((lwb = list_remove_head(ilwbs)) while ((lwb = list_remove_head(ilwbs))
@ -3096,7 +3116,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* since we've reached the commit waiter's timeout and it still * since we've reached the commit waiter's timeout and it still
* hasn't been issued. * hasn't been issued.
*/ */
lwb_t *nlwb = zil_lwb_write_close(zilog, lwb); lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, NULL);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);