Fix deadlock in 'zfs rollback'

Currently, the 'zfs rollback' code can end up deadlocked due to the way the kernel handles unreferenced inodes on a suspended fs. Essentially, the zfs_resume_fs() code path may cause zfs to spawn new threads as it reinstantiates the suspended fs's zil. When a new thread is spawned, the kernel may attempt to free memory for that thread by freeing some unreferenced inodes. If it happens to select inodes that are a a part of the suspended fs a deadlock will occur because freeing inodes requires holding the fs's z_teardown_inactive_lock which is still held from the suspend. This patch corrects this issue by adding an additional reference to all inodes that are still present when a suspend is initiated. This prevents them from being freed by the kernel for any reason. Reviewed-by: Alek Pinchuk <apinchuk@datto.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Tom Caputi <tcaputi@datto.com> Closes #9203
2019-08-27 12:55:51 -04:00 · 2019-08-27 12:55:51 -04:00 · e7a2fa70c3
parent 142f84dd19
commit e7a2fa70c3
3 changed files with 17 additions and 1 deletions
--- a/include/sys/zfs_znode.h
+++ b/include/sys/zfs_znode.h
@ -200,6 +200,7 @@ typedef struct znode {
 	boolean_t	z_is_mapped;	/* are we mmap'ed */
 	boolean_t	z_is_ctldir;	/* are we .zfs entry */
 	boolean_t	z_is_stale;	/* are we stale due to rollback? */
 	boolean_t	z_suspended;	/* extra ref from a suspend? */
 	uint_t		z_blksz;	/* block size in bytes */
 	uint_t		z_seq;		/* modification sequence number */
 	uint64_t	z_mapcnt;	/* number of pages mapped to file */
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@ -1737,7 +1737,12 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
 	 * will fail with EIO since we have z_teardown_lock for writer (only
 	 * relevant for forced unmount).
 	 *
-	 * Release all holds on dbufs.
+	 * Release all holds on dbufs. We also grab an extra reference to all
 	 * the remaining inodes so that the kernel does not attempt to free
 	 * any inodes of a suspended fs. This can cause deadlocks since the
 	 * zfs_resume_fs() process may involve starting threads, which might
 	 * attempt to free unreferenced inodes to free up memory for the new
 	 * thread.
 	 */
 	if (!unmounting) {
 		mutex_enter(&zfsvfs->z_znodes_lock);
@ -1745,6 +1750,9 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
 		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
 			if (zp->z_sa_hdl)
 				zfs_znode_dmu_fini(zp);
 			if (igrab(ZTOI(zp)) != NULL)
 				zp->z_suspended = B_TRUE;
 		}
 		mutex_exit(&zfsvfs->z_znodes_lock);
 	}
@ -2202,6 +2210,12 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
 			remove_inode_hash(ZTOI(zp));
 			zp->z_is_stale = B_TRUE;
 		}
 		/* see comment in zfs_suspend_fs() */
 		if (zp->z_suspended) {
 			zfs_iput_async(ZTOI(zp));
 			zp->z_suspended = B_FALSE;
 		}
 	}
 	mutex_exit(&zfsvfs->z_znodes_lock);
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@ -545,6 +545,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	zp->z_is_mapped = B_FALSE;
 	zp->z_is_ctldir = B_FALSE;
 	zp->z_is_stale = B_FALSE;
 	zp->z_suspended = B_FALSE;
 	zp->z_sa_hdl = NULL;
 	zp->z_mapcnt = 0;
 	zp->z_id = db->db_object;