zil: allow the ZIL to fail and restart independently of the pool
zil_commit() has always returned void, and thus, cannot fail. Everything inside it assumed that if anything ever went wrong, it could fall back on txg_wait_synced() until the txg covering the operations being flushed from the ZIL has fully committed. This meant that if the pool failed and failmode=continue was set, syncing operations like fsync() would still block. Unblocking zil_commit() means largely the same approach. The difficulty is that the ZIL carries the record of uncommitted VFS operations (vs the changed data), and attached to those, callbacks and cvs that will release userspace callers once the data is on disk. So if we can't write the ZIL, we also can't release those records until the data is on disk. This wasn't a problem before, because the zil_commit() would block. If we change zil_commit() to return error, we still need to track those entries until the data they represent hits the disk. We also need to accept new records; just because the ZIL fails may not necessarily mean the pool itself is unavailable. This commit reorganises the ZIL to allow zil_commit() to return failure. If ZIL writes or flushes fail, the ZIL is moved into a "failed" state, and no further writes are done; all zil_commit() calls are serviced by the regular txg mechanism. Outstanding records (itx_ts) are held until the main pool writes their associated txg out. The records are then released. Once all records are cleared, the ZIL is reset and reopened. Signed-off-by: Rob Norris <rob.norris@klarasystems.com> (cherry picked from commit af821006f6602261e690fe6635689cabdeefcadf)
This commit is contained in:
parent
cdaf041d39
commit
2724bcb3d6
|
@ -495,8 +495,8 @@ extern void zil_itx_destroy(itx_t *itx);
|
|||
extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
|
||||
|
||||
extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid);
|
||||
extern void zil_commit(zilog_t *zilog, uint64_t oid);
|
||||
extern void zil_commit_impl(zilog_t *zilog, uint64_t oid);
|
||||
extern int zil_commit(zilog_t *zilog, uint64_t oid);
|
||||
extern int zil_commit_impl(zilog_t *zilog, uint64_t oid);
|
||||
extern void zil_remove_async(zilog_t *zilog, uint64_t oid);
|
||||
|
||||
extern int zil_reset(const char *osname, void *txarg);
|
||||
|
|
|
@ -146,6 +146,7 @@ typedef struct itxg {
|
|||
kmutex_t itxg_lock; /* lock for this structure */
|
||||
uint64_t itxg_txg; /* txg for this chain */
|
||||
itxs_t *itxg_itxs; /* sync and async itxs */
|
||||
boolean_t itxg_failed; /* ZIL failed, don't touch */
|
||||
} itxg_t;
|
||||
|
||||
/* for async nodes we build up an AVL tree of lists of async itxs per file */
|
||||
|
@ -198,6 +199,8 @@ struct zilog {
|
|||
uint64_t zl_parse_blk_count; /* number of blocks parsed */
|
||||
uint64_t zl_parse_lr_count; /* number of log records parsed */
|
||||
itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
|
||||
itxg_t zl_fail_itxg; /* holding space for failed itxs */
|
||||
uint64_t zl_unfail_txg; /* txg to unfail ZIL at */
|
||||
list_t zl_itx_commit_list; /* itx list to be committed */
|
||||
uint64_t zl_cur_used; /* current commit log size used */
|
||||
list_t zl_lwb_list; /* in-flight log write list */
|
||||
|
|
|
@ -280,18 +280,29 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr)
|
|||
dp = dmu_objset_pool(zfsvfs->z_os);
|
||||
|
||||
/*
|
||||
* If the system is shutting down, then skip any
|
||||
* filesystems which may exist on a suspended pool.
|
||||
* If the system is shutting down, then skip any filesystems
|
||||
* which may exist on a suspended pool. We don't do this if
|
||||
* failmode=continue because zil_commit might have a better
|
||||
* error for us.
|
||||
*/
|
||||
if (spa_suspended(dp->dp_spa)) {
|
||||
if (spa_suspended(dp->dp_spa) &&
|
||||
spa_get_failmode(dp->dp_spa) != ZIO_FAILURE_MODE_CONTINUE) {
|
||||
ZFS_EXIT(zfsvfs);
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* If there's a ZIL, try to flush it. If the pool is in some
|
||||
* unflushable state, this will get us an appropriate error
|
||||
* return.
|
||||
*/
|
||||
int err = 0;
|
||||
if (zfsvfs->z_log != NULL)
|
||||
zil_commit(zfsvfs->z_log, 0);
|
||||
err = zil_commit(zfsvfs->z_log, 0);
|
||||
|
||||
ZFS_EXIT(zfsvfs);
|
||||
|
||||
return (err);
|
||||
} else {
|
||||
/*
|
||||
* Sync all ZFS filesystems. This is what happens when you
|
||||
|
|
|
@ -1648,7 +1648,8 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
|
|||
|
||||
tx = dmu_tx_create(os);
|
||||
dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
|
||||
if (dmu_tx_assign(tx, DMU_TX_ASSIGN_WAIT) != 0) {
|
||||
if (dmu_tx_assign(tx,
|
||||
DMU_TX_ASSIGN_WAIT | DMU_TX_ASSIGN_CONTINUE) != 0) {
|
||||
dmu_tx_abort(tx);
|
||||
/* Make zl_get_data do txg_wait_synced() */
|
||||
return (SET_ERROR(EIO));
|
||||
|
|
|
@ -61,6 +61,7 @@ static ulong_t zfs_fsync_sync_cnt = 4;
|
|||
int
|
||||
zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
|
||||
{
|
||||
int err = 0;
|
||||
zfsvfs_t *zfsvfs = ZTOZSB(zp);
|
||||
|
||||
if (zfsvfs->z_os->os_sync == ZFS_SYNC_DISABLED)
|
||||
|
@ -71,13 +72,13 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
|
|||
|
||||
(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
|
||||
|
||||
zil_commit(zfsvfs->z_log, zp->z_id);
|
||||
err = zil_commit(zfsvfs->z_log, zp->z_id);
|
||||
|
||||
tsd_set(zfs_fsyncer_key, NULL);
|
||||
|
||||
ZFS_EXIT(zfsvfs);
|
||||
|
||||
return (0);
|
||||
return (err);
|
||||
}
|
||||
|
||||
|
||||
|
|
869
module/zfs/zil.c
869
module/zfs/zil.c
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue