diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index ea0a26826c..f1f6013de2 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1059,14 +1059,6 @@ zil_fail(zilog_t *zilog)
 	list_create(&waiters, sizeof (zil_commit_waiter_t),
 	    offsetof(zil_commit_waiter_t, zcw_node));
 
-	/*
-	 * We have to take the namespace lock to prevent the txg being moved
-	 * forward while we're processing the itxgs.
-	 */
-	mutex_enter(&spa_namespace_lock);
-
-	uint64_t last_synced_txg = spa_last_synced_txg(zilog->zl_spa);
-
 	/*
 	 * A ZIL failure occurs when an LWB write or flush fails, or an LWB
 	 * can't be issued. This usually occurs when the pool suspends during
@@ -1101,17 +1093,31 @@ zil_fail(zilog_t *zilog)
 	 * that.
 	 */
 
-	uint64_t highest_txg = last_synced_txg;
-
 	/*
 	 * Prepare the fail itxg. This is not a real itxg, just a convenient
 	 * holder for the itxs that couldn't be written and associated metadata
 	 * until their transaction is committed and we can fire their
 	 * callbacks.
+	 *
+	 * Note that once we take itxg_lock, zil_clean() is blocked until we're
+	 * done.
 	 */
 	itxg_t *fail_itxg = &zilog->zl_fail_itxg;
 	mutex_enter(&fail_itxg->itxg_lock);
 
+	/*
+	 * Starting txg for failure. Any itx seen on or before this txg is
+	 * already committed to the pool.
+	 */
+	uint64_t last_synced_txg = spa_last_synced_txg(zilog->zl_spa);
+
+	/*
+	 * The highest txg we've seen across all itxs. We'll bump this as we
+	 * scan them, and the ZIL can't resume until the pool has passed this
+	 * txg, thus fully committing all itxs.
+	 */
+	uint64_t highest_txg = last_synced_txg;
+
 	ASSERT3U(fail_itxg->itxg_txg, ==, 0);
 	ASSERT3P(fail_itxg->itxg_itxs, ==, NULL);
 
@@ -1302,8 +1308,6 @@ zil_fail(zilog_t *zilog)
 
 		mutex_exit(&zcw->zcw_lock);
 	}
-
-	mutex_exit(&spa_namespace_lock);
 }
 
 /*
@@ -2457,15 +2461,21 @@ zil_itxg_clean_failed(zilog_t *zilog, uint64_t synced_txg)
 {
 	itxg_t *fail_itxg = &zilog->zl_fail_itxg;
 
-	if (fail_itxg->itxg_txg == 0 || fail_itxg->itxg_txg > synced_txg)
+	/*
+	 * If zil_fail() is currently running, we will pause here until it's
+	 * done. If not (most of the time), we're the only one checking, once
+	 * per txg sync, so we should always get the lock without fuss.
+	 */
+	mutex_enter(&fail_itxg->itxg_lock);
+	if (fail_itxg->itxg_txg == 0 || fail_itxg->itxg_txg > synced_txg) {
+		mutex_exit(&fail_itxg->itxg_lock);
 		return;
+	}
 
-	ASSERT3U(fail_itxg->itxg_txg, ==, synced_txg);
+	ASSERT3U(fail_itxg->itxg_txg, <=, synced_txg);
 
 	uint64_t next_txg = UINT64_MAX;
 
-	mutex_enter(&fail_itxg->itxg_lock);
-
 	itx_t *itx, *next;
 
 	list_t *l = &fail_itxg->itxg_itxs->i_sync_list;
@@ -2509,15 +2519,13 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
 
 	ASSERT3U(synced_txg, <, ZILTEST_TXG);
 
-	mutex_enter(&itxg->itxg_lock);
-
 	/*
-	 * Clean up the failed itxg if it has anything for this txg. This will
-	 * be fast and lock-free if it doesn't, so its ok to do this directly
-	 * on this thread.
+	 * First clean up the failed itxg if it has anything for this txg or
+	 * any before it.
 	 */
 	zil_itxg_clean_failed(zilog, synced_txg);
 
+	mutex_enter(&itxg->itxg_lock);
 	if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
 		mutex_exit(&itxg->itxg_lock);
 		return;
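
For illustration only, and not part of the patch: a minimal userspace sketch of the locking pattern the new comments describe, in which the failure path holds the fail itxg's lock for its entire scan and the once-per-txg cleanup path blocks on that same lock, in place of the removed spa_namespace_lock hold that previously kept the txg from moving forward. A POSIX mutex stands in for the kernel's mutex_enter()/mutex_exit(), and fail_itxg_t, sketch_fail() and sketch_clean() are invented names (build with cc -pthread).

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

typedef struct {
	pthread_mutex_t	itxg_lock;
	uint64_t	itxg_txg;	/* 0 means nothing left to clean */
} fail_itxg_t;

static fail_itxg_t fail_itxg = {
	.itxg_lock = PTHREAD_MUTEX_INITIALIZER,
	.itxg_txg = 0,
};

/* Failure path: hold itxg_lock for the whole scan, as zil_fail() now does. */
static void *
sketch_fail(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&fail_itxg.itxg_lock);
	printf("fail: scanning itxs; cleanup is blocked until we're done\n");
	usleep(100 * 1000);		/* stand-in for walking the itxgs */
	fail_itxg.itxg_txg = 7;		/* lowest txg still awaiting sync */
	pthread_mutex_unlock(&fail_itxg.itxg_lock);
	return (NULL);
}

/* Cleanup path: called once per "txg sync", like zil_itxg_clean_failed(). */
static void
sketch_clean(uint64_t synced_txg)
{
	/* If the failure path is running, we pause here until it's done. */
	pthread_mutex_lock(&fail_itxg.itxg_lock);
	if (fail_itxg.itxg_txg == 0 || fail_itxg.itxg_txg > synced_txg) {
		pthread_mutex_unlock(&fail_itxg.itxg_lock);
		return;
	}
	printf("clean: txg %llu synced, firing callbacks up to it\n",
	    (unsigned long long)synced_txg);
	fail_itxg.itxg_txg = 0;
	pthread_mutex_unlock(&fail_itxg.itxg_lock);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, sketch_fail, NULL);
	for (uint64_t txg = 5; txg <= 8; txg++) {
		usleep(50 * 1000);	/* one simulated txg sync per pass */
		sketch_clean(txg);
	}
	pthread_join(t, NULL);
	return (0);
}

The design point mirrored here is the one the new comments call out: because zil_fail() takes fail_itxg->itxg_lock up front, zil_clean()'s call into zil_itxg_clean_failed() simply waits for the failure scan to finish, and the spa_namespace_lock hold that previously pinned the synced txg is no longer needed for that ordering.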