diff --git a/include/sys/zil.h b/include/sys/zil.h index cefbccb32f..b5d65ab95c 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -495,8 +495,8 @@ extern void zil_itx_destroy(itx_t *itx); extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid); -extern void zil_commit(zilog_t *zilog, uint64_t oid); -extern void zil_commit_impl(zilog_t *zilog, uint64_t oid); +extern int zil_commit(zilog_t *zilog, uint64_t oid); +extern int zil_commit_impl(zilog_t *zilog, uint64_t oid); extern void zil_remove_async(zilog_t *zilog, uint64_t oid); extern int zil_reset(const char *osname, void *txarg); diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index d2f4018653..6d43035fe9 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -146,6 +146,7 @@ typedef struct itxg { kmutex_t itxg_lock; /* lock for this structure */ uint64_t itxg_txg; /* txg for this chain */ itxs_t *itxg_itxs; /* sync and async itxs */ + boolean_t itxg_failed; /* ZIL failed, don't touch */ } itxg_t; /* for async nodes we build up an AVL tree of lists of async itxs per file */ @@ -198,6 +199,8 @@ struct zilog { uint64_t zl_parse_blk_count; /* number of blocks parsed */ uint64_t zl_parse_lr_count; /* number of log records parsed */ itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */ + itxg_t zl_fail_itxg; /* holding space for failed itxs */ + uint64_t zl_unfail_txg; /* txg to unfail ZIL at */ list_t zl_itx_commit_list; /* itx list to be committed */ uint64_t zl_cur_used; /* current commit log size used */ list_t zl_lwb_list; /* in-flight log write list */ diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index b8fbff0ff0..28df811bc9 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -280,18 +280,29 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr) dp = dmu_objset_pool(zfsvfs->z_os); /* - * If the system is shutting down, then skip any - * filesystems which may exist on a suspended pool. + * If the system is shutting down, then skip any filesystems + * which may exist on a suspended pool. We don't do this if + * failmode=continue because zil_commit() might have a better + * error for us. */ - if (spa_suspended(dp->dp_spa)) { + if (spa_suspended(dp->dp_spa) && + spa_get_failmode(dp->dp_spa) != ZIO_FAILURE_MODE_CONTINUE) { ZFS_EXIT(zfsvfs); return (0); } + /* + * If there's a ZIL, try to flush it. If the pool is in some + * unflushable state, this will get us an appropriate error + * return. + */ + int err = 0; if (zfsvfs->z_log != NULL) - zil_commit(zfsvfs->z_log, 0); + err = zil_commit(zfsvfs->z_log, 0); ZFS_EXIT(zfsvfs); + + return (err); } else { /* * Sync all ZFS filesystems.
This is what happens when you diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 0592e6a303..25923417cf 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1648,7 +1648,8 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); - if (dmu_tx_assign(tx, DMU_TX_ASSIGN_WAIT) != 0) { + if (dmu_tx_assign(tx, + DMU_TX_ASSIGN_WAIT | DMU_TX_ASSIGN_CONTINUE) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 77740e59fb..f00021a298 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -61,6 +61,7 @@ static ulong_t zfs_fsync_sync_cnt = 4; int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) { + int err = 0; zfsvfs_t *zfsvfs = ZTOZSB(zp); if (zfsvfs->z_os->os_sync == ZFS_SYNC_DISABLED) @@ -71,13 +72,13 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); - zil_commit(zfsvfs->z_log, zp->z_id); + err = zil_commit(zfsvfs->z_log, zp->z_id); tsd_set(zfs_fsyncer_key, NULL); ZFS_EXIT(zfsvfs); - return (0); + return (err); } diff --git a/module/zfs/zil.c b/module/zfs/zil.c index efb8c8761f..bf8a6b29bb 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -583,6 +583,8 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg, return (lwb); } +static boolean_t zil_failed(zilog_t *zilog); + static void zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { @@ -593,9 +595,16 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb) ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); - ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); - ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || - lwb->lwb_state == LWB_STATE_FLUSH_DONE); + + /* + * If we can't write, then we could be in zil_close() destroying + * unissued lwbs; in that case, don't assert that they're completed. + */ + if (!(zil_failed(zilog) || spa_exiting_any(zilog->zl_spa))) { + ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); + ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || + lwb->lwb_state == LWB_STATE_FLUSH_DONE); + } /* * Clear the zilog's field to indicate this lwb is no longer @@ -681,7 +690,12 @@ zil_create(zilog_t *zilog) /* * Wait for any previous destroy to complete. */ - txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + error = txg_wait_synced_flags(zilog->zl_dmu_pool, zilog->zl_destroy_txg, + TXG_WAIT_F_NOSUSPEND); + if (error != 0) { + ASSERT3S(error, ==, EAGAIN); + return (NULL); + } ASSERT(zh->zh_claim_txg == 0); ASSERT(zh->zh_replay_seq == 0); @@ -695,9 +709,10 @@ zil_create(zilog_t *zilog) */ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); - error = dmu_tx_assign(tx, DMU_TX_ASSIGN_WAIT); + error = dmu_tx_assign(tx, DMU_TX_ASSIGN_WAIT | + DMU_TX_ASSIGN_NOSUSPEND); if (error != 0) { - ASSERT(dmu_objset_exiting(zilog->zl_os)); + ASSERT3S(error, ==, EAGAIN); dmu_tx_abort(tx); return (NULL); } @@ -730,7 +745,27 @@ zil_create(zilog_t *zilog) */ if (tx != NULL) { dmu_tx_commit(tx); - txg_wait_synced(zilog->zl_dmu_pool, txg); + error = txg_wait_synced_flags(zilog->zl_dmu_pool, txg, + TXG_WAIT_F_NOSUSPEND); + if (error != 0) { + /* + * Pool suspended at the last moment, so we have to + * unwind everything we've just done. 
+ */ + ASSERT3S(error, ==, EAGAIN); + + metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); + + mutex_enter(&zilog->zl_lock); + list_remove(&zilog->zl_lwb_list, lwb); + mutex_exit(&zilog->zl_lock); + + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); + zil_free_lwb(zilog, lwb); + + return (NULL); + } } ASSERT(error != 0 || bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); @@ -758,7 +793,9 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) int error; /* - * Wait for any previous destroy to complete. + * Wait for any previous destroy to complete. It's OK to block here, + * as this is always arrived at via administrative operations, never + * application IO. It's also not obvious how we'd recover. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); @@ -991,6 +1028,282 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) return ((error == ECKSUM || error == ENOENT) ? 0 : error); } +/* + * True if the ZIL has failed. + */ +static boolean_t +zil_failed(zilog_t *zilog) +{ + return ((zilog->zl_unfail_txg > 0) ? B_TRUE : B_FALSE); +} + +static void zil_commit_waiter_skip(zil_commit_waiter_t *zcw); +static itxs_t *zil_alloc_itxs(void); + +/* + * Fail the ZIL. This will collect up all failed itxs, and note that the ZIL + * cannot be unfailed until all their txgs are synced. After this, all + * zil_commit() calls will be serviced by the regular txg sync, and no new itxs + * will be assigned or committed. + */ +static void +zil_fail(zilog_t *zilog) +{ + ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); + ASSERT(MUTEX_HELD(&zilog->zl_lock)); + ASSERT(!zil_failed(zilog)); + + itx_t *itx; + list_t *l; + + list_t waiters; + list_create(&waiters, sizeof (zil_commit_waiter_t), + offsetof(zil_commit_waiter_t, zcw_node)); + + /* + * We have to take the namespace lock to prevent the txg being moved + * forward while we're processing the itxgs. + */ + mutex_enter(&spa_namespace_lock); + + uint64_t last_synced_txg = spa_last_synced_txg(zilog->zl_spa); + + /* + * A ZIL failure occurs when an LWB write or flush fails, or an LWB + * can't be issued. This usually occurs when the pool suspends during + * zil_commit(). + * + * The LWBs and the ZIL proper have outstanding itxs attached to them, + * with callbacks on them to be called when the data is safely on disk. + * If the pool never returns it doesn't matter; that data is never + * going to disk anyway. If the pool does return, and the txg covering + * those itxs is committed, then we need to fire those callbacks to + * make sure the rest of the system knows that its data is properly on + * disk. + * + * Outstanding itxs exist in the live LWB list. The tail LWB is open; + * any others have issued IO. Once we set failure, zil_lwb_write_done() + * and zil_lwb_flush_vdevs_done() will quietly ignore any responses + * from live IO, so these LWBs will not progress any further. There may + * also be itxs on the commit list that have been moved out of + * zl_itxg[] but not yet assigned to an lwb. + * + * We collect up all those outstanding itxs, and put them onto + * fail_itxg. As we take each itx, we check its txg, and record the + * highest one we see. In the future, when that highest txg is + * committed, we know that all outstanding work is completed and the + * ZIL can be unfailed. See zil_sync() for details. + * + * In addition, each LWB has waiters attached: threads that are + * currently waiting on it for zil_commit() to complete. We gather those + * so we can fire them via zil_commit_waiter_skip() after the unfail + * txg has been set. The first thing they'll do when they wake is to + * check the fail state, so we need to be done with our work before + * that. + */
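The waiter handoff described in the comment above is the load-bearing piece of this design, so here is a minimal user-space model of it, using pthreads; waiter_model_t, zil_fail_wake_model() and zil_commit_wait_model() are illustrative stand-ins for zil_commit_waiter_t and the real kernel routines, not part of the patch:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

/* Stand-in for zil_commit_waiter_t: just the fields the wakeup touches. */
typedef struct waiter_model {
        pthread_mutex_t w_lock;         /* zcw_lock */
        pthread_cond_t w_cv;            /* zcw_cv */
        bool w_done;                    /* zcw_done */
        int w_error;                    /* zcw_zio_error */
} waiter_model_t;

/* What zil_fail() does for each gathered waiter. */
static void
zil_fail_wake_model(waiter_model_t *w)
{
        pthread_mutex_lock(&w->w_lock);
        w->w_error = EIO;       /* the error zil_commit_impl() will see */
        w->w_done = true;
        pthread_cond_broadcast(&w->w_cv);
        pthread_mutex_unlock(&w->w_lock);
}

/* What the zil_commit() side does: sleep until done, then read the error. */
static int
zil_commit_wait_model(waiter_model_t *w)
{
        pthread_mutex_lock(&w->w_lock);
        while (!w->w_done)
                pthread_cond_wait(&w->w_cv, &w->w_lock);
        int err = w->w_error;
        pthread_mutex_unlock(&w->w_lock);
        return (err);
}

The broadcast-under-mutex ordering matters: the waiter re-checks the done flag under the same lock, so it can never miss the wakeup.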
+ uint64_t highest_txg = last_synced_txg; + + /* + * Prepare the fail itxg. This is not a real itxg, just a convenient + * holder for the itxs that couldn't be written and associated metadata + * until their transaction is committed and we can fire their + * callbacks. + */ + itxg_t *fail_itxg = &zilog->zl_fail_itxg; + mutex_enter(&fail_itxg->itxg_lock); + + ASSERT3U(fail_itxg->itxg_txg, ==, 0); + ASSERT3P(fail_itxg->itxg_itxs, ==, NULL); + + fail_itxg->itxg_itxs = zil_alloc_itxs(); + + /* + * Unlike the live itxgs, fail_itxg can carry itxs from multiple txgs, + * as zil_commit() will collect all pending changes for a given foid, + * regardless of which txg it's on. To handle this, we set its itxg_txg + * to the lowest txg of all the itxs it holds, as a cheap reference for + * what's left to process. + */ + fail_itxg->itxg_txg = UINT64_MAX; + + /* Process failed LWBs, oldest first */ + for (lwb_t *lwb = list_head(&zilog->zl_lwb_list); lwb; + lwb = list_next(&zilog->zl_lwb_list, lwb)) { + /* + * All transactions for unissued or completed LWBs should be + * aborted by the time we get here. Conversely, transactions + * on in-flight LWBs will still be live, to be aborted in + * zil_lwb_flush_vdevs_done(). + */ + IMPLY(lwb->lwb_state == LWB_STATE_CLOSED, lwb->lwb_tx == NULL); + IMPLY(lwb->lwb_state == LWB_STATE_OPENED, lwb->lwb_tx == NULL); + IMPLY(lwb->lwb_state == LWB_STATE_ISSUED, lwb->lwb_tx != NULL); + IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE, + lwb->lwb_tx != NULL); + IMPLY(lwb->lwb_state == LWB_STATE_FLUSH_DONE, + lwb->lwb_tx == NULL); + + l = &lwb->lwb_itxs; + while ((itx = list_head(l)) != NULL) { + list_remove(l, itx); + if (itx->itx_lr.lrc_txtype == TX_COMMIT) { + /* + * zil_process_commit_list() already put the + * zcw for commit itxs on the waiter list, so + * we don't need to do anything special here. + */ + zil_itx_destroy(itx); + } else if (itx->itx_lr.lrc_txg <= last_synced_txg) { + /* + * txg covering this itx is already on the main + * pool, so we can safely complete it now. + */ + zil_itx_destroy(itx); + } else { + /* + * Put the itx on the fail itxg sync list, + * and bump the unfail txg. + */ + list_insert_tail( + &fail_itxg->itxg_itxs->i_sync_list, itx); + fail_itxg->itxg_txg = MIN(fail_itxg->itxg_txg, + itx->itx_lr.lrc_txg); + + highest_txg = + MAX(highest_txg, itx->itx_lr.lrc_txg); + } + } + + /* Take any waiters */ + list_move_tail(&waiters, &lwb->lwb_waiters); + + ASSERT(list_is_empty(&lwb->lwb_itxs)); + ASSERT(list_is_empty(&lwb->lwb_waiters)); + } + + /* + * Failures in zil_process_commit_list() will leave itxs on the + * commit list; bring those over too. + */ + l = &zilog->zl_itx_commit_list; + while ((itx = list_head(l)) != NULL) { + list_remove(l, itx); + if (itx->itx_lr.lrc_txtype == TX_COMMIT) { + /* Take commit waiters */ + list_insert_tail(&waiters, itx->itx_private); + itx->itx_private = NULL; + zil_itx_destroy(itx); + } else if (itx->itx_lr.lrc_txg <= last_synced_txg) { + /* + * txg covering this itx is already on the main pool, + * so we can safely complete it now. + */ + zil_itx_destroy(itx); + } else { + /* + * Put the itx on the fail itxg sync list, and bump the + * unfail txg. + */ + list_insert_tail( + &fail_itxg->itxg_itxs->i_sync_list, itx); + fail_itxg->itxg_txg = MIN(fail_itxg->itxg_txg, + itx->itx_lr.lrc_txg); + + highest_txg = MAX(highest_txg, itx->itx_lr.lrc_txg); + } + } + ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
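The MIN/MAX bookkeeping in the two loops above is easy to misread, so here is a compact user-space model of the triage applied to each non-commit itx; itx_model_t and triage_state_t are stand-ins, and TX_COMMIT handling is elided:

#include <stdint.h>
#include <stddef.h>

/* Stand-in for an itx: only the txg matters for the bookkeeping. */
typedef struct itx_model {
        uint64_t txg;
        struct itx_model *next;
} itx_model_t;

typedef struct triage_state {
        uint64_t last_synced;   /* spa_last_synced_txg() at failure time */
        uint64_t fail_low;      /* becomes fail_itxg->itxg_txg; starts at UINT64_MAX */
        uint64_t highest;       /* starts at last_synced; unfail txg = highest + 1 */
        itx_model_t *fail_list; /* the fail itxg's sync list */
} triage_state_t;

/*
 * Triage one itx: drop it if its txg already synced, otherwise hold it on
 * the fail list and widen the [fail_low, highest] window.
 */
static void
triage_itx(triage_state_t *t, itx_model_t *itx)
{
        if (itx->txg <= t->last_synced) {
                /* Data already on the main pool; complete it now. */
                return;
        }
        itx->next = t->fail_list;
        t->fail_list = itx;
        if (itx->txg < t->fail_low)
                t->fail_low = itx->txg;         /* lowest txg left to sync */
        if (itx->txg > t->highest)
                t->highest = itx->txg;          /* last txg we must wait for */
}

After every outstanding itx has been triaged, zl_unfail_txg becomes highest + 1: the first syncing txg at which every held itx's txg is known to be on the main pool.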
+ /* + * If this fails, then we didn't take any itxs at all. If that's true, + * how did we end up here? + */ + ASSERT3U(fail_itxg->itxg_txg, <, UINT64_MAX); + + /* Prepare the live itxgs for failure */ + for (int i = 0; i < TXG_SIZE; i++) { + itxg_t *itxg = &zilog->zl_itxg[i]; + + mutex_enter(&itxg->itxg_lock); + ASSERT(!itxg->itxg_failed); + + /* + * Flag itxgs as failed. Most itxg users (e.g. zil_itx_assign()) + * take itxg_lock but not zl_lock, to avoid contention. They + * need a cheap way to test for failure; this is it. + */ + itxg->itxg_failed = B_TRUE; + + if (itxg->itxg_txg == 0) { + /* Previously cleaned itxg, nothing to do. */ + ASSERT3P(itxg->itxg_itxs, ==, NULL); + mutex_exit(&itxg->itxg_lock); + continue; + } + + /* Ensure unfail covers this itxg */ + highest_txg = MAX(highest_txg, itxg->itxg_txg); + + /* + * Scan the sync list for commit itxs, so we can signal their + * waiters. This should be rare, but see zil_itxg_clean() for + * how they can legitimately end up here. + */ + l = &itxg->itxg_itxs->i_sync_list; + while ((itx = list_head(l)) != NULL) { + if (itx->itx_lr.lrc_txtype == TX_COMMIT) { + list_remove(l, itx); + list_insert_tail(&waiters, itx->itx_private); + itx->itx_private = NULL; + zil_itx_destroy(itx); + } + } + + mutex_exit(&itxg->itxg_lock); + } + + /* We've seen every outstanding itx, and have our unfail point */ + zilog->zl_unfail_txg = highest_txg + 1; + + mutex_exit(&fail_itxg->itxg_lock); + + char ds_name[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(dmu_objset_ds(zilog->zl_os), ds_name); + cmn_err(CE_WARN, + "ZIL failed on pool '%s' dataset '%s': " + "last_txg=%llu; unfail_txg=%llu", + spa_name(zilog->zl_spa), ds_name, + last_synced_txg, zilog->zl_unfail_txg); + + /* Now inform all the waiters */ + zil_commit_waiter_t *zcw; + while ((zcw = list_head(&waiters)) != NULL) { + mutex_enter(&zcw->zcw_lock); + + ASSERT(list_link_active(&zcw->zcw_node)); + list_remove(&waiters, zcw); + + zcw->zcw_lwb = NULL; + + /* + * Set a reasonable error for zil_commit_impl() to find when + * zil_commit_waiter() returns. + */ + zcw->zcw_zio_error = EIO; + + ASSERT3B(zcw->zcw_done, ==, B_FALSE); + zcw->zcw_done = B_TRUE; + + cv_broadcast(&zcw->zcw_cv); + + mutex_exit(&zcw->zcw_lock); + } + + mutex_exit(&spa_namespace_lock); +} + /* * When an itx is "skipped", this function is used to properly mark the * waiter as "done", and signal any thread(s) waiting on it. An itx can @@ -1146,6 +1459,19 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + /* + * If the IO failed, then either the ZIL is already failed, or we need + * to call zil_fail() to fail it. To call zil_fail() we have to hold + * both zl_issuer_lock and zl_lock, and zl_issuer_lock must be taken + * before zl_lock, so we have to take it now. + * + * If zl_issuer_lock is already taken it's because new IO is in the + * process of being issued. That's fine, as once we've called + * zil_fail() those IOs will come through here and hit the + * zil_failed() test below anyway. + */ + if (zio->io_error != 0) + mutex_enter(&zilog->zl_issuer_lock); mutex_enter(&zilog->zl_lock);
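A user-space sketch of the conditional lock dance above; pthread mutexes stand in for zl_issuer_lock/zl_lock, and flush_done_model() is illustrative only (it tracks "did I take the outer lock" in a local, where the kernel code uses MUTEX_HELD()):

#include <pthread.h>
#include <stdbool.h>

typedef struct zilog_model {
        pthread_mutex_t issuer_lock;    /* zl_issuer_lock: always taken first */
        pthread_mutex_t lock;           /* zl_lock: always taken second */
        bool failed;                    /* zil_failed() stand-in */
} zilog_model_t;

/* Stand-in for zil_fail(): caller must hold both locks. */
static void
zil_fail_model(zilog_model_t *zl)
{
        zl->failed = true;
}

static void
flush_done_model(zilog_model_t *zl, int io_error)
{
        bool have_issuer = false;

        if (io_error != 0) {
                /*
                 * We may need zil_fail(), which needs both locks, and the
                 * outer one cannot be taken after the inner one.
                 */
                pthread_mutex_lock(&zl->issuer_lock);
                have_issuer = true;
        }
        pthread_mutex_lock(&zl->lock);

        if (zl->failed) {
                /* Already failed: nothing to do, just drop the locks. */
        } else if (io_error != 0) {
                zil_fail_model(zl);
        }

        pthread_mutex_unlock(&zl->lock);
        if (have_issuer)
                pthread_mutex_unlock(&zl->issuer_lock);
}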
/* @@ -1167,6 +1493,50 @@ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); lwb->lwb_state = LWB_STATE_FLUSH_DONE; + if (zil_failed(zilog)) { + /* + * The ZIL failed some time ago; itxs have already been + * emptied and waiters informed, so there's nothing else + * to do here. + */ + mutex_exit(&zilog->zl_lock); + if (MUTEX_HELD(&zilog->zl_issuer_lock)) + mutex_exit(&zilog->zl_issuer_lock); + + /* + * This is the transaction for the next block; we have to + * commit it here, otherwise txg_quiesce() will block if the + * pool returns. + */ + dmu_tx_commit(tx); + + return; + } + + if (zio->io_error != 0) { + /* + * We expect any ZIO errors from child ZIOs to have been + * propagated "up" to this specific LWB's root ZIO, in + * order for this error handling to work correctly. This + * includes ZIO errors from either this LWB's write or + * flush, as well as any errors from other dependent LWBs + * (e.g. a root LWB ZIO that might be a child of this LWB). + */ + ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); + ASSERT(MUTEX_HELD(&zilog->zl_lock)); + + zil_fail(zilog); + + mutex_exit(&zilog->zl_lock); + mutex_exit(&zilog->zl_issuer_lock); + + dmu_tx_commit(tx); + + return; + } + + ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); + if (zilog->zl_last_lwb_opened == lwb) { /* * Remember the highest committed log sequence number @@ -1190,16 +1560,6 @@ ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; - /* - * We expect any ZIO errors from child ZIOs to have been - * propagated "up" to this specific LWB's root ZIO, in - * order for this error handling to work correctly. This - * includes ZIO errors from either this LWB's write or - * flush, as well as any errors from other dependent LWBs - * (e.g. a root LWB ZIO that might be a child of this LWB). - */ - - zcw->zcw_zio_error = zio->io_error; ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; @@ -1278,7 +1638,7 @@ zil_lwb_write_done(zio_t *zio) * we expect any error seen here, to have been propagated to * that function). */ - if (zio->io_error != 0) { + if (zil_failed(zilog) || zio->io_error != 0) { while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zv, sizeof (*zv)); return; } @@ -1524,9 +1884,10 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) * should not be subject to the dirty data based delays. We * use DMU_TX_ASSIGN_NOTHROTTLE to bypass the delay mechanism. */ - if (dmu_tx_assign(tx, - DMU_TX_ASSIGN_WAIT | DMU_TX_ASSIGN_NOTHROTTLE) != 0) { - ASSERT(dmu_objset_exiting(zilog->zl_os)); + error = dmu_tx_assign(tx, DMU_TX_ASSIGN_WAIT | + DMU_TX_ASSIGN_NOTHROTTLE | DMU_TX_ASSIGN_NOSUSPEND); + if (error != 0) { + ASSERT3S(error, ==, EAGAIN); dmu_tx_abort(tx); return (NULL); } @@ -1562,7 +1923,9 @@ zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); BP_ZERO(bp); - error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog); + if (error == 0) + error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, + &slog); if (slog) { ZIL_STAT_BUMP(zil_itx_metaslab_slog_count); ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused); @@ -1743,6 +2106,11 @@ cont: */ if (lrc->lrc_txtype == TX_WRITE) { if (txg > spa_freeze_txg(zilog->zl_spa)) + /* + * We're allowed to block here, as freeze is only used by + * ztest, which should always be a stable and controlled + * environment.
+ */ + txg_wait_synced(zilog->zl_dmu_pool, txg); if (itx->itx_wr_state == WR_COPIED) { ZIL_STAT_BUMP(zil_itx_copied_count); @@ -1791,9 +2159,33 @@ cont: bzero((char *)dbuf + lrwb->lr_length, dpad); if (error == EIO) { - txg_wait_synced(zilog->zl_dmu_pool, txg); - return (lwb); + /* + * The zl_get_data callback couldn't give us + * the data. This is almost certainly because + * the pool suspended in dmu_sync_late_arrival(), + * but we attempt to force the txg out anyway + * just to be sure. If it really is suspended, + * it'll return immediately. + */ + error = txg_wait_synced_flags( + zilog->zl_dmu_pool, txg, + TXG_WAIT_F_NOSUSPEND); + if (error == 0) + /* Surprising, but we'll take it. */ + return (lwb); + + /* + * The pool has suspended, and we're being + * asked to commit an itx whose associated data + * has no block pointer. This can't be + * resolved, so the itx can't be committed to + * this lwb. All we can do is signal this fact + * to the caller and let it clean up. + */ + ASSERT3S(error, ==, EAGAIN); + return (NULL); } + if (error != 0) { ASSERT(error == ENOENT || error == EEXIST || error == EALREADY); @@ -1976,6 +2368,17 @@ zil_remove_async(zilog_t *zilog, uint64_t oid) list_destroy(&clean_list); } +static itxs_t * +zil_alloc_itxs(void) +{ + itxs_t *itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); + list_create(&itxs->i_sync_list, sizeof (itx_t), + offsetof(itx_t, itx_node)); + avl_create(&itxs->i_async_tree, zil_aitx_compare, + sizeof (itx_async_node_t), offsetof(itx_async_node_t, ia_node)); + return (itxs); +} + void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) { @@ -1996,6 +2399,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); + itxs = itxg->itxg_itxs; if (itxg->itxg_txg != txg) { if (itxs != NULL) { @@ -2009,14 +2413,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) clean = itxg->itxg_itxs; } itxg->itxg_txg = txg; - itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), - KM_SLEEP); - - list_create(&itxs->i_sync_list, sizeof (itx_t), - offsetof(itx_t, itx_node)); - avl_create(&itxs->i_async_tree, zil_aitx_compare, - sizeof (itx_async_node_t), - offsetof(itx_async_node_t, ia_node)); + itxs = itxg->itxg_itxs = zil_alloc_itxs(); } if (itx->itx_sync) { list_insert_tail(&itxs->i_sync_list, itx); @@ -2055,6 +2452,48 @@ zil_itxg_clean(clean); } +static void +zil_itxg_clean_failed(zilog_t *zilog, uint64_t synced_txg) +{ + itxg_t *fail_itxg = &zilog->zl_fail_itxg; + + if (fail_itxg->itxg_txg == 0 || fail_itxg->itxg_txg > synced_txg) + return; + + ASSERT3U(fail_itxg->itxg_txg, ==, synced_txg); + + uint64_t next_txg = UINT64_MAX; + + mutex_enter(&fail_itxg->itxg_lock); + + itx_t *itx, *next; + list_t *l = &fail_itxg->itxg_itxs->i_sync_list; + + next = list_head(l); + while (next != NULL) { + itx = next; + next = list_next(l, itx); + ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); + if (itx->itx_lr.lrc_txg <= synced_txg) { + list_remove(l, itx); + zil_itx_destroy(itx); + } else { + next_txg = MIN(next_txg, itx->itx_lr.lrc_txg); + } + } + + if (next_txg < UINT64_MAX) { + fail_itxg->itxg_txg = next_txg; + } else { + ASSERT(list_is_empty(&fail_itxg->itxg_itxs->i_sync_list)); + avl_destroy(&fail_itxg->itxg_itxs->i_async_tree); + kmem_free(fail_itxg->itxg_itxs, sizeof (itxs_t)); + fail_itxg->itxg_itxs = NULL; + fail_itxg->itxg_txg = 0; + } + + mutex_exit(&fail_itxg->itxg_lock); +}
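zil_itxg_clean_failed() above maintains itxg_txg as a low-water mark over a list that can span many txgs. A standalone model of that walk, with stand-in types (singly-linked list instead of the kernel list_t):

#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>

typedef struct fitx {
        uint64_t txg;
        struct fitx *next;
} fitx_t;

typedef struct fail_itxg_model {
        uint64_t low;           /* itxg_txg: lowest unsynced txg, 0 if idle */
        fitx_t *head;           /* i_sync_list */
} fail_itxg_model_t;

/*
 * Destroy (here: free) every held itx whose txg has synced, then advance
 * the low-water mark to the lowest remaining txg, or reset to 0 (idle)
 * when nothing is left.
 */
static void
clean_failed_model(fail_itxg_model_t *f, uint64_t synced_txg)
{
        if (f->low == 0 || f->low > synced_txg)
                return;         /* nothing held for this txg */

        uint64_t next_low = UINT64_MAX;
        fitx_t **pp = &f->head;

        while (*pp != NULL) {
                fitx_t *itx = *pp;
                if (itx->txg <= synced_txg) {
                        *pp = itx->next;        /* unlink and "destroy" */
                        free(itx);
                } else {
                        if (itx->txg < next_low)
                                next_low = itx->txg;
                        pp = &itx->next;
                }
        }

        f->low = (next_low == UINT64_MAX) ? 0 : next_low;
}

Because the mark only moves forward, zil_clean() can test it cheaply every txg and skip the walk entirely on the common, non-failed path.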
/* * If there are any in-memory intent log transactions which have now been * synced then start up a taskq to free them. We should only do this after we @@ -2071,6 +2510,14 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg) ASSERT3U(synced_txg, <, ZILTEST_TXG); mutex_enter(&itxg->itxg_lock); + + /* + * Clean up the failed itxg if it has anything for this txg. This will + * be fast and lock-free if it doesn't, so it's OK to do this directly + * on this thread. + */ + zil_itxg_clean_failed(zilog, synced_txg); + if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { mutex_exit(&itxg->itxg_lock); return; } @@ -2125,6 +2572,15 @@ zil_get_commit_list(zilog_t *zilog) mutex_exit(&itxg->itxg_lock); continue; } + if (itxg->itxg_failed) { + /* + * If we failed, do nothing, as we want the itxs to + * remain here. zil_fail() is under zl_issuer_lock, + * which we hold, so there's no way for us to race it. + */ + mutex_exit(&itxg->itxg_lock); + return; + } /* * If we're adding itx records to the zl_itx_commit_list, @@ -2245,12 +2701,12 @@ zil_prune_commit_list(zilog_t *zilog) IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); } -static void +static int zil_commit_writer_stall(zilog_t *zilog) { /* * When zio_alloc_zil() fails to allocate the next lwb block on - * disk, we must call txg_wait_synced() to ensure all of the + * disk, we must call txg_wait_synced_flags() to ensure all of the * lwbs in the zilog's zl_lwb_list are synced and then freed (in * zil_sync()), such that any subsequent ZIL writer (i.e. a call * to zil_process_commit_list()) will have to call zil_create(), @@ -2266,11 +2722,17 @@ * We must hold the zilog's zl_issuer_lock while we do this, to * ensure no new threads enter zil_process_commit_list() until * all lwb's in the zl_lwb_list have been synced and freed - * (which is achieved via the txg_wait_synced() call). + * (which is achieved via the txg_wait_synced_flags() call). */ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - txg_wait_synced(zilog->zl_dmu_pool, 0); + int error = txg_wait_synced_flags(zilog->zl_dmu_pool, 0, + TXG_WAIT_F_NOSUSPEND); + if (error != 0) { + ASSERT3S(error, ==, EAGAIN); + return (EAGAIN); + } ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); + return (0); } /* @@ -2304,6 +2766,17 @@ zil_process_commit_list(zilog_t *zilog) lwb = list_tail(&zilog->zl_lwb_list); if (lwb == NULL) { lwb = zil_create(zilog); + if (lwb == NULL) { + /* + * Couldn't allocate the first block or write the + * header. Immediately fail the ZIL. The itxs are still + * on the commit list, so there's not much to clean up. + */ + mutex_enter(&zilog->zl_lock); + zil_fail(zilog); + mutex_exit(&zilog->zl_lock); + return; + } } else { ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); @@ -2397,33 +2870,93 @@ if (lwb == NULL) { /* - * This indicates zio_alloc_zil() failed to allocate the - * "next" lwb on-disk. When this happens, we must stall - * the ZIL write pipeline; see the comment within + * This indicates either zio_alloc_zil() failed to allocate the + * "next" lwb on-disk, or the pool suspended while trying to + * assign that block to a tx (or, in rare cases, the pool + * suspended while trying to commit a TX_WRITE). + * + * If the lwb was issued, we can try to recover by stalling the + * ZIL write pipeline; see the comment within + * zil_commit_writer_stall() for more details. + * + * If the lwb was not issued, stalling is not an option. Almost + * certainly it will fail anyway, but if the pool was restored + * and it succeeded, we'd be left with an unissued lwb carrying + * itxs for data that is already on the main pool, and no one + * has responsibility for calling the waiters. + * + * Regardless, if the stall fails or would be unsafe, we have no + * choice but to fail the ZIL entirely. + */
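The decision described in that comment reduces to a small predicate. A hedged sketch of it, with stand-in names (stall_model() abstracts zil_commit_writer_stall(), fail_zil_model() abstracts zil_fail(); both stubs here are trivial so the sketch compiles):

#include <stdbool.h>

enum lwb_state_model { LWB_OPENED_M, LWB_ISSUED_M };

/* Stub: pretend the stall's txg wait succeeded. Real code may get EAGAIN. */
static int
stall_model(void)
{
        return (0);
}

/* Stub for zil_fail(). */
static void
fail_zil_model(void)
{
}

/*
 * nlwb == NULL means the next lwb couldn't be obtained. Stalling (waiting
 * for every outstanding lwb to sync and starting over) is only safe if the
 * tail lwb was actually issued; an un-issued OPENED lwb would survive the
 * stall with nobody responsible for its itxs and waiters.
 */
static void
handle_no_next_lwb(enum lwb_state_model tail_state)
{
        if (tail_state != LWB_OPENED_M && stall_model() == 0) {
                /* Recovered: itxs sync via the txg; waiters get skipped. */
                return;
        }
        /* Suspended pool or un-issued tail: no safe retry, so fail. */
        fail_zil_model();
}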
- zil_commit_writer_stall(zilog); - /* - * Additionally, we have to signal and mark the "nolwb" - * waiters as "done" here, since without an lwb, we - * can't do this via zil_lwb_flush_vdevs_done() like - * normal. - */ - zil_commit_waiter_t *zcw; - while ((zcw = list_head(&nolwb_waiters)) != NULL) { - zil_commit_waiter_skip(zcw); - list_remove(&nolwb_waiters, zcw); - } + lwb = list_tail(&zilog->zl_lwb_list); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED); - /* - * And finally, we have to destroy the itx's that - * couldn't be committed to an lwb; this will also call - * the itx's callback if one exists for the itx. - */ - while ((itx = list_head(&nolwb_itxs)) != NULL) { - list_remove(&nolwb_itxs, itx); - zil_itx_destroy(itx); + if (lwb->lwb_state != LWB_STATE_OPENED && + zil_commit_writer_stall(zilog) == 0) { + /* + * Additionally, we have to signal and mark the "nolwb" + * waiters as "done" here, since without an lwb, we + * can't do this via zil_lwb_flush_vdevs_done() like + * normal. + */ + zil_commit_waiter_t *zcw; + while ((zcw = list_head(&nolwb_waiters)) != NULL) { + list_remove(&nolwb_waiters, zcw); + zil_commit_waiter_skip(zcw); + } + + /* + * And finally, we have to destroy the itx's that + * couldn't be committed to an lwb; this will also call + * the itx's callback if one exists for the itx. + */ + while ((itx = list_head(&nolwb_itxs)) != NULL) { + list_remove(&nolwb_itxs, itx); + zil_itx_destroy(itx); + } + } else { + ASSERT(spa_suspended(spa)); + ASSERT(!zil_failed(zilog)); + + /* + * The pool suspended, so all we can do is fail the ZIL + * and keep the itxs alive in case it ever comes back. + */ + + ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); + mutex_enter(&zilog->zl_lock); + + if (lwb->lwb_state == LWB_STATE_OPENED) { + /* + * The LWB was never issued, so it will never get + * cleaned up in zil_lwb_flush_vdevs_done(). Do + * that now. + */ + ASSERT3P(lwb->lwb_buf, !=, NULL); + ASSERT3P(lwb->lwb_tx, !=, NULL); + ASSERT3P(lwb->lwb_root_zio, ==, NULL); + ASSERT3P(lwb->lwb_write_zio, ==, NULL); + + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + dmu_tx_t *tx = lwb->lwb_tx; + lwb->lwb_buf = NULL; + lwb->lwb_tx = NULL; + dmu_tx_commit(tx); + } + + /* + * Attach the nolwb itxs and waiters to the lead lwb. + * It doesn't matter that they're not going to be + * issued; we just need them in a place that zil_fail() + * can get at them. + */ + list_move_tail(&lwb->lwb_itxs, &nolwb_itxs); + list_move_tail(&lwb->lwb_waiters, &nolwb_waiters); + + zil_fail(zilog); + + mutex_exit(&zilog->zl_lock); } } else { ASSERT(list_is_empty(&nolwb_waiters)); @@ -2532,6 +3065,7 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) out: mutex_exit(&zilog->zl_issuer_lock); + ASSERT(!MUTEX_HELD(&zilog->zl_lock)); } static void @@ -2635,10 +3169,15 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) if (nlwb == NULL) { /* * When zil_lwb_write_issue() returns NULL, this - * indicates zio_alloc_zil() failed to allocate the - * "next" lwb on-disk. When this occurs, the ZIL write - * pipeline must be stalled; see the comment within the - * zil_commit_writer_stall() function for more details. + * indicates either zio_alloc_zil() failed to allocate + * the "next" lwb on-disk, or the pool suspended while + * trying to assign that block to a tx. + * + * If the lwb was issued, we can try to recover by + * stalling the ZIL write pipeline. If it was not, or + * the stall also fails, we must fail the ZIL instead. + * See the comments in zil_process_commit_list() and + * zil_commit_writer_stall() for more details.
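The un-issued-lwb cleanup in the else branch above reappears almost verbatim in zil_commit_waiter_timeout() below. If this were being refactored, it could plausibly be hoisted into a helper along these lines; this is hypothetical and not part of the patch, written in the patch's own kernel idiom (so not standalone code), with field and function names taken from the hunks:

/*
 * Hypothetical consolidation of the duplicated OPENED-lwb cleanup.
 * Caller holds zl_issuer_lock and zl_lock, exactly as at both call sites.
 */
static void
zil_lwb_abort_open(zilog_t *zilog, lwb_t *lwb)
{
        ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);

        /* Never issued, so zil_lwb_flush_vdevs_done() will never run. */
        ASSERT3P(lwb->lwb_buf, !=, NULL);
        ASSERT3P(lwb->lwb_tx, !=, NULL);
        ASSERT3P(lwb->lwb_root_zio, ==, NULL);
        ASSERT3P(lwb->lwb_write_zio, ==, NULL);

        zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
        dmu_tx_t *tx = lwb->lwb_tx;
        lwb->lwb_buf = NULL;
        lwb->lwb_tx = NULL;

        /* Release the next-block tx so txg_quiesce() can't hang on it. */
        dmu_tx_commit(tx);
}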
* * We must drop the commit waiter's lock prior to * calling zil_commit_writer_stall() or else we can wind @@ -2655,14 +3194,50 @@ * because it's blocked trying to acquire the waiter's * lock, which occurs prior to calling dmu_tx_commit() */ + + lwb = list_tail(&zilog->zl_lwb_list); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED); + mutex_exit(&zcw->zcw_lock); - zil_commit_writer_stall(zilog); + + if (lwb->lwb_state == LWB_STATE_OPENED || + zil_commit_writer_stall(zilog) != 0) { + ASSERT(spa_suspended(zilog->zl_spa)); + ASSERT(!zil_failed(zilog)); + + ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); + mutex_enter(&zilog->zl_lock); + + if (lwb->lwb_state == LWB_STATE_OPENED) { + /* + * The LWB was never issued, so it will never get + * cleaned up in zil_lwb_flush_vdevs_done(). Do + * that now. + */ + ASSERT3P(lwb->lwb_buf, !=, NULL); + ASSERT3P(lwb->lwb_tx, !=, NULL); + ASSERT3P(lwb->lwb_root_zio, ==, NULL); + ASSERT3P(lwb->lwb_write_zio, ==, NULL); + + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + dmu_tx_t *tx = lwb->lwb_tx; + lwb->lwb_buf = NULL; + lwb->lwb_tx = NULL; + dmu_tx_commit(tx); + } + + zil_fail(zilog); + + mutex_exit(&zilog->zl_lock); + } + + mutex_enter(&zcw->zcw_lock); } out: mutex_exit(&zilog->zl_issuer_lock); ASSERT(MUTEX_HELD(&zcw->zcw_lock)); + ASSERT(!MUTEX_HELD(&zilog->zl_lock)); } /* @@ -2817,15 +3392,17 @@ zil_free_commit_waiter(zil_commit_waiter_t *zcw) * then later committed to an lwb (or skipped) when * zil_process_commit_list() is called. */ -static void +static int zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) { dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); + int error = 0; - if (dmu_tx_assign(tx, DMU_TX_ASSIGN_WAIT) != 0) { - ASSERT(dmu_objset_exiting(zilog->zl_os)); + error = dmu_tx_assign(tx, DMU_TX_ASSIGN_WAIT | DMU_TX_ASSIGN_NOSUSPEND); + if (error != 0) { + ASSERT3S(error, ==, EAGAIN); dmu_tx_abort(tx); - return; + return (EAGAIN); } itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); @@ -2835,6 +3412,30 @@ zil_itx_assign(zilog, itx, tx); dmu_tx_commit(tx); + + return (0); +} + +/* + * This is a helper for when zil_commit() has to fall back to a full txg sync. + * Callers of zil_commit() are expecting it to block or fail according to the + * failure mode, so this is one of the few places where we honor that. It will + * return EIO on suspend when failmode=continue, so we don't have to repeat + * that conversion at each caller. + */ +static int +zil_commit_fallback_sync(zilog_t *zilog) +{ + int err = 0; + const txg_wait_flag_t flags = + spa_get_failmode(zilog->zl_spa) == ZIO_FAILURE_MODE_CONTINUE ? + TXG_WAIT_F_NOSUSPEND : TXG_WAIT_F_NONE; + + err = txg_wait_synced_flags(zilog->zl_dmu_pool, 0, flags); + if ((flags & TXG_WAIT_F_NOSUSPEND) && err == EAGAIN) + err = SET_ERROR(EIO); + + return (err); } /* @@ -2952,7 +3553,7 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) * but the order in which they complete will be the same order in * which they were created. */
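zil_commit_fallback_sync() above is where the pool's failmode property becomes an errno. A user-space model of that mapping; the enum and txg_wait_model() are stand-ins for the failmode property and txg_wait_synced_flags():

#include <errno.h>
#include <stdbool.h>

enum failmode_model { FM_WAIT, FM_CONTINUE };

/*
 * Stand-in for txg_wait_synced_flags(): returns 0 once the txg syncs, or
 * EAGAIN if nosuspend was requested and the pool is suspended.
 */
static int
txg_wait_model(bool nosuspend, bool pool_suspended)
{
        if (nosuspend && pool_suspended)
                return (EAGAIN);
        return (0);     /* under failmode=wait this may block indefinitely */
}

static int
fallback_sync_model(enum failmode_model fm, bool pool_suspended)
{
        bool nosuspend = (fm == FM_CONTINUE);
        int err = txg_wait_model(nosuspend, pool_suspended);

        /*
         * EAGAIN is an internal "pool suspended" signal; callers of
         * zil_commit() (zfs_fsync(), zfs_sync(), ...) expect EIO.
         */
        if (nosuspend && err == EAGAIN)
                err = EIO;
        return (err);
}

So with failmode=wait an fsync() on a suspended pool simply blocks until the pool returns, while failmode=continue turns the same condition into EIO at the syscall boundary, which is what the zfs_fsync() and zfs_sync() hunks earlier in this patch propagate.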
-void +int zil_commit(zilog_t *zilog, uint64_t foid) { /* @@ -2971,7 +3572,7 @@ zil_commit(zilog_t *zilog, uint64_t foid) ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE); if (zilog->zl_sync == ZFS_SYNC_DISABLED) - return; + return (0); if (!spa_writeable(zilog->zl_spa)) { /* @@ -2985,33 +3586,33 @@ ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); for (int i = 0; i < TXG_SIZE; i++) ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL); - return; + return (SET_ERROR(EROFS)); } /* * If the objset is being forced to exit, there's nothing more to do. */ if (dmu_objset_exiting(zilog->zl_os)) - return; + return (0); /* - * If the ZIL is suspended, we don't want to dirty it by calling - * zil_commit_itx_assign() below, nor can we write out - * lwbs like would be done in zil_commit_write(). Thus, we - * simply rely on txg_wait_synced() to maintain the necessary - * semantics, and avoid calling those functions altogether. + * If the ZIL has failed or is suspended, we don't want to dirty it by + * calling zil_commit_itx_assign() below, nor can we write out lwbs + * as would be done in zil_commit_write(). Thus, we simply rely on a + * full txg sync (via zil_commit_fallback_sync()) to maintain the + * necessary semantics, and avoid calling those functions altogether. */ - if (zilog->zl_suspend > 0) { - txg_wait_synced(zilog->zl_dmu_pool, 0); - return; - } + if (zil_failed(zilog) || zilog->zl_suspend > 0) + return (zil_commit_fallback_sync(zilog)); - zil_commit_impl(zilog, foid); + return (zil_commit_impl(zilog, foid)); } -void +int zil_commit_impl(zilog_t *zilog, uint64_t foid) { + ASSERT0(zil_failed(zilog) || zilog->zl_suspend > 0); + ZIL_STAT_BUMP(zil_commit_count); /* @@ -3041,26 +3642,40 @@ * zil_commit_waiter(). */ zil_commit_waiter_t *zcw = zil_alloc_commit_waiter(); - zil_commit_itx_assign(zilog, zcw); + + if (zil_commit_itx_assign(zilog, zcw) != 0) { + /* Pool suspended while committing TX_COMMIT. */ + zil_free_commit_waiter(zcw); + return (SET_ERROR(EIO)); + } zil_commit_writer(zilog, zcw); zil_commit_waiter(zilog, zcw); - if (zcw->zcw_zio_error != 0) { - /* - * If there was an error writing out the ZIL blocks that - * this thread is waiting on, then we fallback to - * relying on spa_sync() to write out the data this - * thread is waiting on. Obviously this has performance - * implications, but the expectation is for this to be - * an exceptional case, and shouldn't occur often. - */ - DTRACE_PROBE2(zil__commit__io__error, - zilog_t *, zilog, zil_commit_waiter_t *, zcw); - txg_wait_synced(zilog->zl_dmu_pool, 0); + ASSERT(zcw->zcw_done); + + if (zcw->zcw_zio_error == 0) { + zil_free_commit_waiter(zcw); + return (0); } + + ASSERT(zil_failed(zilog)); + + /* + * Ideally, we would just return to zil_commit() here and do the + * call to zil_commit_fallback_sync() from one place, but this dtrace + * probe has the live zcw as an argument, so we have to fire it before + * we free the zcw. + */ + DTRACE_PROBE2(zil__commit__io__error, + zilog_t *, zilog, zil_commit_waiter_t *, zcw); + + int err = zil_commit_fallback_sync(zilog); + zil_free_commit_waiter(zcw); + + return (err); } /* @@ -3092,6 +3707,46 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) *replayed_seq = 0; } + if (zil_failed(zilog)) { + if (zilog->zl_unfail_txg <= txg) { + /* + * Everything in the failed ZIL is now on the main + * pool, so we can return it to service. + */ + + /* + * The live txgs should be either empty or ahead of the + * unfail txg. The fail itxg should be empty.
+ */ + for (int i = 0; i < TXG_SIZE; i++) { + itxg_t *itxg = &zilog->zl_itxg[i]; + mutex_enter(&itxg->itxg_lock); + + ASSERT(zilog->zl_itxg[i].itxg_txg == 0 || + zilog->zl_itxg[i].itxg_txg >= + zilog->zl_unfail_txg); + + itxg->itxg_failed = B_FALSE; + mutex_exit(&itxg->itxg_lock); + } + + zilog->zl_unfail_txg = 0; + + char ds_name[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(dmu_objset_ds(zilog->zl_os), ds_name); + cmn_err(CE_WARN, + "ZIL resumed on pool '%s' dataset '%s': txg=%llu", + spa_name(zilog->zl_spa), ds_name, txg); + } else { + /* Can't unfail yet; re-dirty for next txg. */ + dsl_pool_t *dp = zilog->zl_dmu_pool; + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); + dmu_tx_t *next_tx = dmu_tx_create_assigned(dp, txg + 1); + dsl_dataset_dirty(ds, next_tx); + dmu_tx_commit(next_tx); + } + } + if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; @@ -3194,6 +3849,19 @@ zil_init(void) void zil_fini(void) { + /* + * XXX If the pool crashed and there is still outstanding IO, there + * will still be allocated lwbs on zil_lwb_cache. On Linux this will + * trip a BUG() in the underlying __kmem_cache_shutdown. There's + * really nothing we can do here unless we decouple lwbs from their + * zios, either by walking all lwb zios and clearing their + * io_private members under lock, and then checking for that in the + * done callbacks, or by making their io_private members an index + * into some global lwb lookup. These are both substantial + * undertakings, so for now the guidance is simply to not unload the + * module after pool failure and forced export -- robn, 2023-05-08 + */ + kmem_cache_destroy(zil_zcw_cache); kmem_cache_destroy(zil_lwb_cache); @@ -3241,6 +3909,7 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, MUTEX_DEFAULT, NULL); } + mutex_init(&zilog->zl_fail_itxg.itxg_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zilog->zl_lwb_list, sizeof (lwb_t), offsetof(lwb_t, lwb_node)); @@ -3282,6 +3951,9 @@ zil_free(zilog_t *zilog) mutex_destroy(&zilog->zl_itxg[i].itxg_lock); } + /* XXX clean out the fail itxg -- robn, 2023-04-27 */ + mutex_destroy(&zilog->zl_fail_itxg.itxg_lock); + mutex_destroy(&zilog->zl_issuer_lock); mutex_destroy(&zilog->zl_lock); @@ -3351,29 +4023,51 @@ zil_close(zilog_t *zilog) zilog->zl_get_data = NULL; /* - * We should have only one lwb left on the list; remove it now. + * XXX if we're exiting, we may have itxgs holding itxs that need to be + * removed to allow their callbacks to fire -- robn, 2023-05-08 + */ + + /* + * Clean up outstanding lwbs. Under normal circumstances there should + * be no more than one, but if the ZIL has failed or the pool is + * exporting, there might be additional ones in flight, so we need to + * take care. */ mutex_enter(&zilog->zl_lock); lwb = list_head(&zilog->zl_lwb_list); - if (lwb != NULL) { - ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list)); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); + if (lwb == NULL) { + mutex_exit(&zilog->zl_lock); + return; + } + + if (lwb != list_tail(&zilog->zl_lwb_list)) { + /* + * More than one lwb; this is only expected if the ZIL + * failed or the pool is being force-exported. + */ + ASSERT(zil_failed(zilog) || spa_exiting_any(zilog->zl_spa)); + } + + do { + list_remove(&zilog->zl_lwb_list, lwb); + + if (lwb->lwb_state != LWB_STATE_OPENED && + lwb->lwb_state != LWB_STATE_CLOSED) { + /* + * If it's been issued, we can't clean it up, as its zios + * may yet come back to us. So we just skip it here and + * let zil_lwb_write_done() deal with it. + */ + continue; + } if (lwb->lwb_fastwrite) metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); - list_remove(&zilog->zl_lwb_list, lwb); - if (lwb->lwb_buf != NULL) { + if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - } else { - /* - * Pool is being force exported, while this lwb was - * between zil_lwb_flush_vdevs_done and zil_sync. - */ - ASSERT(spa_exiting(zilog->zl_spa)); - } + + zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); - } + } while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL); + mutex_exit(&zilog->zl_lock); }
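Two things in the zil_sync() hunk above are worth making concrete: the unfail test, and the re-dirty trick that guarantees zil_sync() keeps getting called until the unfail txg arrives. A user-space model, with stand-in names (zil_sync_model_t, redirty_model):

#include <stdint.h>

typedef struct zil_sync_model {
        uint64_t unfail_txg;    /* 0 = not failed (zil_failed() is false) */
} zil_sync_model_t;

/* No-op stand-in for dirtying the dataset in txg + 1. */
static void
redirty_model(uint64_t next_txg)
{
        (void) next_txg;
}

static void
zil_sync_step_model(zil_sync_model_t *z, uint64_t syncing_txg)
{
        if (z->unfail_txg == 0)
                return;                 /* not failed; normal operation */

        if (z->unfail_txg <= syncing_txg) {
                /*
                 * Every itx held at failure time has a txg that has now
                 * synced: the data is on the main pool, so the ZIL can
                 * return to service.
                 */
                z->unfail_txg = 0;
        } else {
                /*
                 * Not yet. zil_sync() is only called for dirty datasets,
                 * so re-dirty ourselves to guarantee a call next txg.
                 */
                redirty_model(syncing_txg + 1);
        }
}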
@@ -3503,6 +4197,9 @@ zil_suspend(const char *osname, void **cookiep) * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we * use txg_wait_synced() to ensure the data from the zilog has * migrated to the main pool before calling zil_destroy(). + * + * It's OK to block here, as this is always arrived at via administrative + * operations, never application IO. */ txg_wait_synced(zilog->zl_dmu_pool, 0);
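From user space, the net effect of this patch (under failmode=continue) is that fsync() on a dataset whose ZIL has failed returns EIO instead of hanging, and starts succeeding again once the pool resumes and zil_sync() unfails the ZIL. A sketch of what an application might observe; this is illustrative only, and whether a later fsync() can succeed for the same data depends on the pool actually returning with the held itxs still queued:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Poll fsync() while a pool is suspended with failmode=continue. Under
 * this patch, EIO means "not durable yet" rather than data loss: the itxs
 * are held on the fail itxg and will sync with their txg if the pool
 * comes back.
 */
static int
fsync_poll(int fd, int tries)
{
        for (int i = 0; i < tries; i++) {
                if (fsync(fd) == 0)
                        return (0);     /* durable on the main pool */
                if (errno != EIO)
                        return (-1);    /* some other failure */
                fprintf(stderr, "fsync: EIO (pool suspended?), retrying\n");
                sleep(1);
        }
        return (-1);
}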