zil_fail: don't take the namespace lock

Early on I thought it would be necessary to stop the world from making
progress under us, and taking the namespace lock was my initial idea of
how to do that. The right approach is a bit more nuanced, and as it
turns out, we don't need the namespace lock at all.

To fail the ZIL is effectively to stop it in its tracks and hold onto
all itxs stored within until the operations they represent are
committed to the pool by some other means (ie the regular txg sync).
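
A minimal user-space sketch of that idea, with made-up names (held_itx
and release_committed are not the real itx_t machinery, and the real
code keys itxs by itxg rather than one sorted list): a failed itx is
parked with the txg that will commit it, and its callback fires only
once the pool's synced txg has passed that point.

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical stand-in for an itx held after ZIL failure. */
struct held_itx {
	uint64_t	txg;			/* txg that will commit this operation */
	void		(*callback)(void *);	/* fired once that txg has synced */
	void		*arg;
	struct held_itx	*next;			/* list kept sorted by ascending txg */
};

/*
 * Called after each txg sync: everything at or below the synced txg is
 * now on stable storage via the normal sync path, so release it.
 */
static void
release_committed(struct held_itx **head, uint64_t synced_txg)
{
	while (*head != NULL && (*head)->txg <= synced_txg) {
		struct held_itx *hi = *head;
		*head = hi->next;
		hi->callback(hi->arg);
		free(hi);
	}
}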

It doesn't matter if the pool makes progress while we're doing this. If
the pool does make progress, then zil_clean() will be called to process
any itxs that are now committed. It will take the itxg_lock, process
and destroy the itxs, and release the lock, leaving the itxg empty.

If zil_fail() is running at the same time, then either zil_clean() will
have already cleaned up the itxg and zil_fail() will find it empty, or
zil_fail() will get there first and move its contents onto the fail itxg.
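
To make that argument concrete, here is a small user-space model of the
two paths, with a pthread mutex standing in for itxg_lock and
simplified list types that are not the real itx_t/itxg_t. Whichever
function takes the lock first, each itx is either destroyed by the
clean path or moved onto the fail list by the fail path, never both and
never lost.

#include <pthread.h>
#include <stddef.h>

struct itx_node {
	struct itx_node	*next;
};

/* Hypothetical stand-in for an itxg: a lock and the itxs parked in it. */
struct itxg_model {
	pthread_mutex_t	lock;	/* stands in for itxg_lock */
	struct itx_node	*itxs;
};

/* zil_clean() side: after a txg sync, drop the now-committed itxs. */
void
model_clean(struct itxg_model *itxg)
{
	pthread_mutex_lock(&itxg->lock);
	itxg->itxs = NULL;	/* freeing elided in this sketch */
	pthread_mutex_unlock(&itxg->lock);
}

/* zil_fail() side: move whatever is still here onto the fail list. */
void
model_fail(struct itxg_model *itxg, struct itx_node **fail_list)
{
	pthread_mutex_lock(&itxg->lock);
	while (itxg->itxs != NULL) {
		struct itx_node *in = itxg->itxs;
		itxg->itxs = in->next;
		in->next = *fail_list;
		*fail_list = in;	/* held until its txg finally syncs */
	}
	pthread_mutex_unlock(&itxg->lock);
}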

(cherry picked from commit 83ce694898f5a89bd382dda0ba09bb8a04ac5666)
Author: Rob Norris, 2023-07-27 12:00:33 +10:00; committed by Geoff Amey
parent 3cb140863f
commit 43f45f8df0
1 changed file with 29 additions and 21 deletions

@@ -1059,14 +1059,6 @@ zil_fail(zilog_t *zilog)
 	list_create(&waiters, sizeof (zil_commit_waiter_t),
 	    offsetof(zil_commit_waiter_t, zcw_node));
 
-	/*
-	 * We have to take the namespace lock to prevent the txg being moved
-	 * forward while we're processing the itxgs.
-	 */
-	mutex_enter(&spa_namespace_lock);
-
-	uint64_t last_synced_txg = spa_last_synced_txg(zilog->zl_spa);
-
 	/*
 	 * A ZIL failure occurs when an LWB write or flush fails, or an LWB
 	 * can't be issued. This usually occurs when the pool suspends during
@@ -1101,17 +1093,31 @@ zil_fail(zilog_t *zilog)
 	 * that.
 	 */
 
-	uint64_t highest_txg = last_synced_txg;
-
 	/*
 	 * Prepare the fail itxg. This is not a real itxg, just a convenient
 	 * holder for the itxs that couldn't be written and associated metadata
 	 * until their transaction is committed and we can fire their
 	 * callbacks.
+	 *
+	 * Note that once we take itxg_lock, zil_clean() is blocked until we're
+	 * done.
 	 */
 	itxg_t *fail_itxg = &zilog->zl_fail_itxg;
 	mutex_enter(&fail_itxg->itxg_lock);
 
+	/*
+	 * Starting txg for failure. Any itx seen on or before this txg is
+	 * already committed to the pool.
+	 */
+	uint64_t last_synced_txg = spa_last_synced_txg(zilog->zl_spa);
+
+	/*
+	 * The highest txg we've seen across all itxs. We'll bump this as we
+	 * scan them, and the ZIL can't resume until the pool has passed this
+	 * txg, thus fully committing all itxs.
+	 */
+	uint64_t highest_txg = last_synced_txg;
+
 	ASSERT3U(fail_itxg->itxg_txg, ==, 0);
 	ASSERT3P(fail_itxg->itxg_itxs, ==, NULL);
 
@@ -1302,8 +1308,6 @@ zil_fail(zilog_t *zilog)
 		mutex_exit(&zcw->zcw_lock);
 	}
 
-	mutex_exit(&spa_namespace_lock);
-
 }
 
 /*
@@ -2457,15 +2461,21 @@ zil_itxg_clean_failed(zilog_t *zilog, uint64_t synced_txg)
 {
 	itxg_t *fail_itxg = &zilog->zl_fail_itxg;
 
-	if (fail_itxg->itxg_txg == 0 || fail_itxg->itxg_txg > synced_txg)
+	/*
+	 * If zil_fail() is currently running, we will pause here until its
+	 * done. If its not (ie most of the time), we're the only one checking,
+	 * once per txg sync, so we should always get the lock without fuss.
+	 */
+	mutex_enter(&fail_itxg->itxg_lock);
+	if (fail_itxg->itxg_txg == 0 || fail_itxg->itxg_txg > synced_txg) {
+		mutex_exit(&fail_itxg->itxg_lock);
 		return;
+	}
 
-	ASSERT3U(fail_itxg->itxg_txg, ==, synced_txg);
+	ASSERT3U(fail_itxg->itxg_txg, <=, synced_txg);
 
 	uint64_t next_txg = UINT64_MAX;
 
-	mutex_enter(&fail_itxg->itxg_lock);
-
 	itx_t *itx, *next;
 	list_t *l = &fail_itxg->itxg_itxs->i_sync_list;
 
@@ -2509,15 +2519,13 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
 
 	ASSERT3U(synced_txg, <, ZILTEST_TXG);
 
-	mutex_enter(&itxg->itxg_lock);
-
 	/*
-	 * Clean up the failed itxg if it has anything for this txg. This will
-	 * be fast and lock-free if it doesn't, so its ok to do this directly
-	 * on this thread.
+	 * First clean up the failed itxg if it has anything for this txg or
+	 * any before it.
 	 */
 	zil_itxg_clean_failed(zilog, synced_txg);
 
+	mutex_enter(&itxg->itxg_lock);
 	if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
 		mutex_exit(&itxg->itxg_lock);
 		return;