From 2921ad6cba54522a33fc95f90237c6e5ead80de7 Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <pcd@delphix.com>
Date: Wed, 27 Jan 2021 21:29:58 -0800
Subject: [PATCH] Fix zrele race in zrele_async that can cause hang

There is a race condition in zfs_zrele_async when we are checking if
we would be the one to evict an inode. This can lead to a txg sync
deadlock.

Instead of calling into iput directly, we attempt to perform the
atomic decrement ourselves, unless that would set the i_count value to
zero. In that case, we dispatch a call to iput to run later, to
prevent a deadlock from occurring.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #11527
Closes #11530
---
 module/os/linux/zfs/zfs_vnops_os.c | 34 ++++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index 15b7f019c4..0587dd7cc7 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -87,15 +87,18 @@
  *	must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
  *	can return EIO from the calling function.
  *
- * (2)	zrele() should always be the last thing except for zil_commit()
- *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
- *	First, if it's the last reference, the vnode/znode
- *	can be freed, so the zp may point to freed memory. Second, the last
- *	reference will call zfs_zinactive(), which may induce a lot of work --
- *	pushing cached pages (which acquires range locks) and syncing out
- *	cached atime changes. Third, zfs_zinactive() may require a new tx,
- *	which could deadlock the system if you were already holding one.
- *	If you must call zrele() within a tx then use zfs_zrele_async().
+ * (2)	zrele() should always be the last thing except for zil_commit() (if
+ *	necessary) and ZFS_EXIT(). This is for 3 reasons: First, if it's the
+ *	last reference, the vnode/znode can be freed, so the zp may point to
+ *	freed memory. Second, the last reference will call zfs_zinactive(),
+ *	which may induce a lot of work -- pushing cached pages (which acquires
+ *	range locks) and syncing out cached atime changes. Third,
+ *	zfs_zinactive() may require a new tx, which could deadlock the system
+ *	if you were already holding one. This deadlock occurs because the tx
+ *	currently being operated on prevents a txg from syncing, which
+ *	prevents the new tx from progressing, resulting in a deadlock. If you
+ *	must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
+ *	is a synonym for zrele().
  *
  * (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
@@ -398,11 +401,18 @@ zfs_zrele_async(znode_t *zp)
 	ASSERT(atomic_read(&ip->i_count) > 0);
 	ASSERT(os != NULL);
 
-	if (atomic_read(&ip->i_count) == 1)
+	/*
+	 * If decrementing the count would put us at 0, we can't do it inline
+	 * here, because that would be synchronous. Instead, dispatch an iput
+	 * to run later.
+	 *
+	 * For more information on the dangers of a synchronous iput, see the
+	 * header comment of this file.
+	 */
+	if (!atomic_add_unless(&ip->i_count, -1, 1)) {
 		VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
 		    (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID);
-	else
-		zrele(zp);
+	}
 }
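
For context on the primitive the fix leans on: atomic_add_unless(v, a, u)
atomically adds a to *v unless *v equals u, and returns nonzero only if the
add happened. That closes the check-then-act window in the old code, where
i_count could change between the atomic_read() and the eventual decrement
inside zrele()/iput(). Below is a minimal standalone sketch of the pattern,
not ZFS code: it emulates the kernel helper with a C11 compare-and-swap loop,
and the names (atomic_add_unless_emulated, dispatch_final_put,
zrele_async_sketch, the file-scope i_count) are hypothetical stand-ins used
only for this illustration.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	/*
	 * Emulate the kernel's atomic_add_unless(): add 'a' to '*v' unless
	 * the current value equals 'u'.  Returns true iff the add happened.
	 */
	static bool
	atomic_add_unless_emulated(atomic_int *v, int a, int u)
	{
		int old = atomic_load(v);

		do {
			if (old == u)
				return (false);
		} while (!atomic_compare_exchange_weak(v, &old, old + a));
		return (true);
	}

	static atomic_int i_count = 2;	/* stand-in for inode i_count */

	static void
	dispatch_final_put(void)
	{
		/* Stand-in for taskq_dispatch(..., iput, ...). */
		printf("last reference: deferring eviction to a worker\n");
	}

	/*
	 * Mirrors the fixed zfs_zrele_async(): decrement inline only when we
	 * are provably not the last reference; otherwise hand the put to a
	 * worker so eviction never runs synchronously in this context.
	 */
	static void
	zrele_async_sketch(void)
	{
		if (!atomic_add_unless_emulated(&i_count, -1, 1))
			dispatch_final_put();
	}

	int
	main(void)
	{
		zrele_async_sketch();	/* 2 -> 1: safe to drop inline */
		zrele_async_sketch();	/* count is 1: defer the final put */
		printf("i_count = %d\n", atomic_load(&i_count));
		return (0);
	}

With the old check, a caller that observed i_count > 1 could still end up
performing the final synchronous iput() if the other reference was dropped
between the atomic_read() and the zrele(). The compare-and-swap makes
"decrement only if it will not reach zero" a single atomic step, so that
window is gone; the loop re-reads the counter on every failed exchange, so a
concurrent release is always observed before the decrement commits.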