Fix deadlock in 'zfs rollback'

Currently, the 'zfs rollback' code can end up deadlocked due to
the way the kernel handles unreferenced inodes on a suspended fs.
Essentially, the zfs_resume_fs() code path may cause zfs to spawn
new threads as it reinstantiates the suspended fs's zil. When a
new thread is spawned, the kernel may attempt to free memory for
that thread by freeing some unreferenced inodes. If it happens to
select inodes that are a part of the suspended fs, a deadlock
will occur because freeing inodes requires holding the fs's
z_teardown_inactive_lock which is still held from the suspend.

This patch corrects this issue by adding an additional reference
to all inodes that are still present when a suspend is initiated.
This prevents them from being freed by the kernel for any reason.

Reviewed-by: Alek Pinchuk <apinchuk@datto.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #9203
This commit is contained in:
Tom Caputi 2019-08-27 12:55:51 -04:00 committed by Brian Behlendorf
parent 142f84dd19
commit e7a2fa70c3
3 changed files with 17 additions and 1 deletions

View File

@ -200,6 +200,7 @@ typedef struct znode {
boolean_t z_is_mapped; /* are we mmap'ed */
boolean_t z_is_ctldir; /* are we .zfs entry */
boolean_t z_is_stale; /* are we stale due to rollback? */
boolean_t z_suspended; /* extra ref from a suspend? */
uint_t z_blksz; /* block size in bytes */
uint_t z_seq; /* modification sequence number */
uint64_t z_mapcnt; /* number of pages mapped to file */

View File

@ -1737,7 +1737,12 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
* will fail with EIO since we have z_teardown_lock for writer (only
* relevant for forced unmount).
*
* Release all holds on dbufs. We also grab an extra reference to all
* the remaining inodes so that the kernel does not attempt to free
* any inodes of a suspended fs. This can cause deadlocks since the
* zfs_resume_fs() process may involve starting threads, which might
* attempt to free unreferenced inodes to free up memory for the new
* thread.
*/
if (!unmounting) {
mutex_enter(&zfsvfs->z_znodes_lock);
@ -1745,6 +1750,9 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
zp = list_next(&zfsvfs->z_all_znodes, zp)) {
if (zp->z_sa_hdl)
zfs_znode_dmu_fini(zp);
if (igrab(ZTOI(zp)) != NULL)
zp->z_suspended = B_TRUE;
}
mutex_exit(&zfsvfs->z_znodes_lock);
}
@ -2202,6 +2210,12 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
remove_inode_hash(ZTOI(zp));
zp->z_is_stale = B_TRUE;
}
/* see comment in zfs_suspend_fs() */
if (zp->z_suspended) {
zfs_iput_async(ZTOI(zp));
zp->z_suspended = B_FALSE;
}
}
mutex_exit(&zfsvfs->z_znodes_lock);

View File

@ -545,6 +545,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
zp->z_is_mapped = B_FALSE;
zp->z_is_ctldir = B_FALSE;
zp->z_is_stale = B_FALSE;
zp->z_suspended = B_FALSE;
zp->z_sa_hdl = NULL;
zp->z_mapcnt = 0;
zp->z_id = db->db_object;