zil: allow the ZIL to fail and restart independently of the pool
zil_commit() has always returned void, and thus, cannot fail. Everything inside it assumed that if anything ever went wrong, it could fall back on txg_wait_synced() until the txg covering the operations being flushed from the ZIL has fully committed. This meant that if the pool failed and failmode=continue was set, syncing operations like fsync() would still block. Unblocking zil_commit() means largely the same approach. The difficulty is that the ZIL carries the record of uncommitted VFS operations (vs the changed data), and attached to those, callbacks and cvs that will release userspace callers once the data is on disk. So if we can't write the ZIL, we also can't release those records until the data is on disk. This wasn't a problem before, because the zil_commit() would block. If we change zil_commit() to return error, we still need to track those entries until the data they represent hits the disk. We also need to accept new records; just because the ZIL fails may not necessarily mean the pool itself is unavailable. This commit reorganises the ZIL to allow zil_commit() to return failure. If ZIL writes or flushes fail, the ZIL is moved into a "failed" state, and no further writes are done; all zil_commit() calls are serviced by the regular txg mechanism. Outstanding records (itx_ts) are held until the main pool writes their associated txg out. The records are then released. Once all records are cleared, the ZIL is reset and reopened. Signed-off-by: Rob Norris <rob.norris@klarasystems.com> (cherry picked from commit af821006f6602261e690fe6635689cabdeefcadf)
This commit is contained in:
parent
cdaf041d39
commit
2724bcb3d6
|
@ -495,8 +495,8 @@ extern void zil_itx_destroy(itx_t *itx);
|
|||
extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
|
||||
|
||||
extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid);
|
||||
extern void zil_commit(zilog_t *zilog, uint64_t oid);
|
||||
extern void zil_commit_impl(zilog_t *zilog, uint64_t oid);
|
||||
extern int zil_commit(zilog_t *zilog, uint64_t oid);
|
||||
extern int zil_commit_impl(zilog_t *zilog, uint64_t oid);
|
||||
extern void zil_remove_async(zilog_t *zilog, uint64_t oid);
|
||||
|
||||
extern int zil_reset(const char *osname, void *txarg);
|
||||
|
|
|
@ -146,6 +146,7 @@ typedef struct itxg {
|
|||
kmutex_t itxg_lock; /* lock for this structure */
|
||||
uint64_t itxg_txg; /* txg for this chain */
|
||||
itxs_t *itxg_itxs; /* sync and async itxs */
|
||||
boolean_t itxg_failed; /* ZIL failed, don't touch */
|
||||
} itxg_t;
|
||||
|
||||
/* for async nodes we build up an AVL tree of lists of async itxs per file */
|
||||
|
@ -198,6 +199,8 @@ struct zilog {
|
|||
uint64_t zl_parse_blk_count; /* number of blocks parsed */
|
||||
uint64_t zl_parse_lr_count; /* number of log records parsed */
|
||||
itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
|
||||
itxg_t zl_fail_itxg; /* holding space for failed itxs */
|
||||
uint64_t zl_unfail_txg; /* txg to unfail ZIL at */
|
||||
list_t zl_itx_commit_list; /* itx list to be committed */
|
||||
uint64_t zl_cur_used; /* current commit log size used */
|
||||
list_t zl_lwb_list; /* in-flight log write list */
|
||||
|
|
|
@ -280,18 +280,29 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr)
|
|||
dp = dmu_objset_pool(zfsvfs->z_os);
|
||||
|
||||
/*
|
||||
* If the system is shutting down, then skip any
|
||||
* filesystems which may exist on a suspended pool.
|
||||
* If the system is shutting down, then skip any filesystems
|
||||
* which may exist on a suspended pool. We don't do this if
|
||||
* failmode=continue because zil_commit might have a better
|
||||
* error for us.
|
||||
*/
|
||||
if (spa_suspended(dp->dp_spa)) {
|
||||
if (spa_suspended(dp->dp_spa) &&
|
||||
spa_get_failmode(dp->dp_spa) != ZIO_FAILURE_MODE_CONTINUE) {
|
||||
ZFS_EXIT(zfsvfs);
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* If there's a ZIL, try to flush it. If the pool is in some
|
||||
* unflushable state, this will get us an appropriate error
|
||||
* return.
|
||||
*/
|
||||
int err = 0;
|
||||
if (zfsvfs->z_log != NULL)
|
||||
zil_commit(zfsvfs->z_log, 0);
|
||||
err = zil_commit(zfsvfs->z_log, 0);
|
||||
|
||||
ZFS_EXIT(zfsvfs);
|
||||
|
||||
return (err);
|
||||
} else {
|
||||
/*
|
||||
* Sync all ZFS filesystems. This is what happens when you
|
||||
|
|
|
@ -1648,7 +1648,8 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
|
|||
|
||||
tx = dmu_tx_create(os);
|
||||
dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
|
||||
if (dmu_tx_assign(tx, DMU_TX_ASSIGN_WAIT) != 0) {
|
||||
if (dmu_tx_assign(tx,
|
||||
DMU_TX_ASSIGN_WAIT | DMU_TX_ASSIGN_CONTINUE) != 0) {
|
||||
dmu_tx_abort(tx);
|
||||
/* Make zl_get_data do txg_wait_synced() */
|
||||
return (SET_ERROR(EIO));
|
||||
|
|
|
@ -61,6 +61,7 @@ static ulong_t zfs_fsync_sync_cnt = 4;
|
|||
int
|
||||
zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
|
||||
{
|
||||
int err = 0;
|
||||
zfsvfs_t *zfsvfs = ZTOZSB(zp);
|
||||
|
||||
if (zfsvfs->z_os->os_sync == ZFS_SYNC_DISABLED)
|
||||
|
@ -71,13 +72,13 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
|
|||
|
||||
(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
|
||||
|
||||
zil_commit(zfsvfs->z_log, zp->z_id);
|
||||
err = zil_commit(zfsvfs->z_log, zp->z_id);
|
||||
|
||||
tsd_set(zfs_fsyncer_key, NULL);
|
||||
|
||||
ZFS_EXIT(zfsvfs);
|
||||
|
||||
return (0);
|
||||
return (err);
|
||||
}
|
||||
|
||||
|
||||
|
|
869
module/zfs/zil.c
869
module/zfs/zil.c
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue