zil: allow the ZIL to fail and restart independently of the pool

zil_commit() has always returned void, and thus, cannot fail. Everything
inside it assumed that if anything ever went wrong, it could fall back
on txg_wait_synced() until the txg covering the operations being flushed
from the ZIL has fully committed. This meant that if the pool failed and
failmode=continue was set, syncing operations like fsync() would still
block.

Unblocking zil_commit() means largely the same approach. The difficulty
is that the ZIL carries the record of uncommitted VFS operations (vs the
changed data), and attached to those, callbacks and cvs that will
release userspace callers once the data is on disk. So if we can't write
the ZIL, we also can't release those records until the data is on disk.

This wasn't a problem before, because the zil_commit() would block. If
we change zil_commit() to return error, we still need to track those
entries until the data they represent hits the disk. We also need to
accept new records; a ZIL failure does not necessarily mean that the
pool itself is unavailable.

This commit reorganises the ZIL to allow zil_commit() to return failure.
If ZIL writes or flushes fail, the ZIL is moved into a "failed" state,
and no further writes are done; all zil_commit() calls are serviced by
the regular txg mechanism. Outstanding records (itx_ts) are held until
the main pool writes their associated txg out. The records are then
released. Once all records are cleared, the ZIL is reset and reopened.

Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
(cherry picked from commit af821006f6602261e690fe6635689cabdeefcadf)
This commit is contained in:
Rob Norris 2023-05-10 15:44:40 +10:00 committed by Geoff Amey
parent cdaf041d39
commit 2724bcb3d6
6 changed files with 833 additions and 120 deletions

View File

@ -495,8 +495,8 @@ extern void zil_itx_destroy(itx_t *itx);
extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid);
extern void zil_commit(zilog_t *zilog, uint64_t oid);
extern void zil_commit_impl(zilog_t *zilog, uint64_t oid);
extern int zil_commit(zilog_t *zilog, uint64_t oid);
extern int zil_commit_impl(zilog_t *zilog, uint64_t oid);
extern void zil_remove_async(zilog_t *zilog, uint64_t oid);
extern int zil_reset(const char *osname, void *txarg);

View File

@ -146,6 +146,7 @@ typedef struct itxg {
kmutex_t itxg_lock; /* lock for this structure */
uint64_t itxg_txg; /* txg for this chain */
itxs_t *itxg_itxs; /* sync and async itxs */
boolean_t itxg_failed; /* ZIL failed, don't touch */
} itxg_t;
/* for async nodes we build up an AVL tree of lists of async itxs per file */
@ -198,6 +199,8 @@ struct zilog {
uint64_t zl_parse_blk_count; /* number of blocks parsed */
uint64_t zl_parse_lr_count; /* number of log records parsed */
itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
itxg_t zl_fail_itxg; /* holding space for failed itxs */
uint64_t zl_unfail_txg; /* txg to unfail ZIL at */
list_t zl_itx_commit_list; /* itx list to be committed */
uint64_t zl_cur_used; /* current commit log size used */
list_t zl_lwb_list; /* in-flight log write list */

View File

@ -280,18 +280,29 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr)
dp = dmu_objset_pool(zfsvfs->z_os);
/*
* If the system is shutting down, then skip any
* filesystems which may exist on a suspended pool.
* If the system is shutting down, then skip any filesystems
* which may exist on a suspended pool. We don't do this if
* failmode=continue because zil_commit might have a better
* error for us.
*/
if (spa_suspended(dp->dp_spa)) {
if (spa_suspended(dp->dp_spa) &&
spa_get_failmode(dp->dp_spa) != ZIO_FAILURE_MODE_CONTINUE) {
ZFS_EXIT(zfsvfs);
return (0);
}
/*
* If there's a ZIL, try to flush it. If the pool is in some
* unflushable state, this will get us an appropriate error
* return.
*/
int err = 0;
if (zfsvfs->z_log != NULL)
zil_commit(zfsvfs->z_log, 0);
err = zil_commit(zfsvfs->z_log, 0);
ZFS_EXIT(zfsvfs);
return (err);
} else {
/*
* Sync all ZFS filesystems. This is what happens when you

View File

@ -1648,7 +1648,8 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
tx = dmu_tx_create(os);
dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
if (dmu_tx_assign(tx, DMU_TX_ASSIGN_WAIT) != 0) {
if (dmu_tx_assign(tx,
DMU_TX_ASSIGN_WAIT | DMU_TX_ASSIGN_CONTINUE) != 0) {
dmu_tx_abort(tx);
/* Make zl_get_data do txg_wait_synced() */
return (SET_ERROR(EIO));

View File

@ -61,6 +61,7 @@ static ulong_t zfs_fsync_sync_cnt = 4;
int
zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
{
int err = 0;
zfsvfs_t *zfsvfs = ZTOZSB(zp);
if (zfsvfs->z_os->os_sync == ZFS_SYNC_DISABLED)
@ -71,13 +72,13 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
zil_commit(zfsvfs->z_log, zp->z_id);
err = zil_commit(zfsvfs->z_log, zp->z_id);
tsd_set(zfs_fsyncer_key, NULL);
ZFS_EXIT(zfsvfs);
return (0);
return (err);
}

File diff suppressed because it is too large Load Diff