From 0e4c830bc19766e860e760e10e0d59250f12cced Mon Sep 17 00:00:00 2001
From: Richard Yao <richard.yao@alumni.stonybrook.edu>
Date: Mon, 12 Sep 2022 12:55:37 -0400
Subject: [PATCH] Cleanup: Use OpenSolaris functions to call scheduler

In our codebase, `cond_resched() and `schedule()` are Linux kernel
functions that have replaced the OpenSolaris `kpreempt()` functions in
the codebase to such an extent that `kpreempt()` in zfs_context.h was
broken. Nobody noticed because we did not actually use it. The header
had defined `kpreempt()` as `yield()`, which works on OpenSolaris and
Illumos where `sched_yield()` is a wrapper for `yield()`, but that does
not work on any other platform.

The FreeBSD platform specific code implemented shims for these, but the
shim for `schedule()` forced us to wait, which is different than merely
rescheduling to another thread as the original Linux code does, while
the shim for `cond_resched()` had the same definition as its kernel
kpreempt() shim.

After studying this, I have concluded that we should reintroduce the
kpreempt() function in platform independent code with the following
definitions:

	- In the Linux kernel:
		kpreempt(unused)	-> cond_resched()

	- In the FreeBSD kernel:
		kpreempt(unused)	-> kern_yield(PRI_USER)

	- In userspace:
		kpreempt(unused)	-> sched_yield()

In userspace, nothing changes from this cleanup. In the kernels, the
function `fm_fini()` will now call `kern_yield(PRI_USER)` on FreeBSD and
`cond_resched()` on Linux.  This is instead of `pause("schedule", 1)` on
FreeBSD and `schedule()` on Linux. This makes our behavior consistent
across platforms.

Note that Linux's SPL continues to use `cond_resched()` and
`schedule()`.  However, those functions have been removed from both the
FreeBSD code and userspace code.

This should have the benefit of making it slightly easier to port the
code to new platforms by making how things should be mapped less
confusing.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Neal Gompa <ngompa@datto.com>
Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Closes #13845
---
 include/os/freebsd/spl/sys/disp.h           | 2 ++
 include/os/freebsd/spl/sys/timer.h          | 2 --
 include/os/freebsd/zfs/sys/zfs_context_os.h | 2 --
 include/os/linux/spl/sys/disp.h             | 4 +++-
 include/sys/zfs_context.h                   | 5 +++--
 module/zfs/arc.c                            | 4 ++--
 module/zfs/dnode.c                          | 6 +++---
 module/zfs/fm.c                             | 2 +-
 module/zfs/spa_log_spacemap.c               | 2 +-
 9 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/include/os/freebsd/spl/sys/disp.h b/include/os/freebsd/spl/sys/disp.h
index 2be1b76e43..d46a7d2c01 100644
--- a/include/os/freebsd/spl/sys/disp.h
+++ b/include/os/freebsd/spl/sys/disp.h
@@ -31,6 +31,8 @@
 
 #include <sys/proc.h>
 
+#define	KPREEMPT_SYNC		(-1)
+
 #define	kpreempt(x)	kern_yield(PRI_USER)
 
 #endif	/* _OPENSOLARIS_SYS_DISP_H_ */
diff --git a/include/os/freebsd/spl/sys/timer.h b/include/os/freebsd/spl/sys/timer.h
index d4694bb7c0..7ff77e9b1b 100644
--- a/include/os/freebsd/spl/sys/timer.h
+++ b/include/os/freebsd/spl/sys/timer.h
@@ -33,6 +33,4 @@
 #define	usleep_range(wakeup, wakeupepsilon)				   \
 	pause_sbt("usleep_range", ustosbt(wakeup), \
 	ustosbt(wakeupepsilon - wakeup), 0)
-
-#define	schedule() pause("schedule", 1)
 #endif
diff --git a/include/os/freebsd/zfs/sys/zfs_context_os.h b/include/os/freebsd/zfs/sys/zfs_context_os.h
index 8671995013..1ce7233041 100644
--- a/include/os/freebsd/zfs/sys/zfs_context_os.h
+++ b/include/os/freebsd/zfs/sys/zfs_context_os.h
@@ -45,8 +45,6 @@
 #define	HAVE_LARGE_STACKS	1
 #endif
 
-#define	cond_resched()		kern_yield(PRI_USER)
-
 #define	taskq_create_sysdc(a, b, d, e, p, dc, f) \
 	    ((void) sizeof (dc), taskq_create(a, b, maxclsyspri, d, e, f))
 
diff --git a/include/os/linux/spl/sys/disp.h b/include/os/linux/spl/sys/disp.h
index e106d3c543..c8be6ffbf1 100644
--- a/include/os/linux/spl/sys/disp.h
+++ b/include/os/linux/spl/sys/disp.h
@@ -26,7 +26,9 @@
 
 #include <linux/preempt.h>
 
-#define	kpreempt(unused)	schedule()
+#define	KPREEMPT_SYNC		(-1)
+
+#define	kpreempt(unused)	cond_resched()
 #define	kpreempt_disable()	preempt_disable()
 #define	kpreempt_enable()	preempt_enable()
 
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index aa4f787896..83ed97fbec 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -219,7 +219,6 @@ typedef pthread_t	kthread_t;
 #define	TS_JOINABLE	0x00000004
 
 #define	curthread	((void *)(uintptr_t)pthread_self())
-#define	kpreempt(x)	yield()
 #define	getcomm()	"unknown"
 
 #define	thread_create_named(name, stk, stksize, func, arg, len, \
@@ -248,9 +247,11 @@ extern kthread_t *zk_thread_create(void (*func)(void *), void *arg,
 #define	issig(why)	(FALSE)
 #define	ISSIG(thr, why)	(FALSE)
 
+#define	KPREEMPT_SYNC		(-1)
+
+#define	kpreempt(x)		sched_yield()
 #define	kpreempt_disable()	((void)0)
 #define	kpreempt_enable()	((void)0)
-#define	cond_resched()		sched_yield()
 
 /*
  * Mutexes
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 980dc60d0c..b9969bff53 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -4165,7 +4165,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 	 * this CPU are able to make progress, make a voluntary preemption
 	 * call here.
 	 */
-	cond_resched();
+	kpreempt(KPREEMPT_SYNC);
 
 	return (bytes_evicted);
 }
@@ -10335,7 +10335,7 @@ l2arc_rebuild(l2arc_dev_t *dev)
 		    !dev->l2ad_first)
 			goto out;
 
-		cond_resched();
+		kpreempt(KPREEMPT_SYNC);
 		for (;;) {
 			mutex_enter(&l2arc_rebuild_thr_lock);
 			if (dev->l2ad_rebuild_cancel) {
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 67fe1e2c9a..ef27dfd40a 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1142,7 +1142,7 @@ dnode_free_interior_slots(dnode_t *dn)
 
 	while (!dnode_slots_tryenter(children, idx, slots)) {
 		DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
-		cond_resched();
+		kpreempt(KPREEMPT_SYNC);
 	}
 
 	dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
@@ -1423,7 +1423,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 			dnode_slots_rele(dnc, idx, slots);
 			while (!dnode_slots_tryenter(dnc, idx, slots)) {
 				DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
-				cond_resched();
+				kpreempt(KPREEMPT_SYNC);
 			}
 
 			/*
@@ -1478,7 +1478,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 		dnode_slots_rele(dnc, idx, slots);
 		while (!dnode_slots_tryenter(dnc, idx, slots)) {
 			DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
-			cond_resched();
+			kpreempt(KPREEMPT_SYNC);
 		}
 
 		if (!dnode_check_slots_free(dnc, idx, slots)) {
diff --git a/module/zfs/fm.c b/module/zfs/fm.c
index e7a7ad5832..bc13b5517c 100644
--- a/module/zfs/fm.c
+++ b/module/zfs/fm.c
@@ -1354,7 +1354,7 @@ fm_fini(void)
 	zevent_flags |= ZEVENT_SHUTDOWN;
 	while (zevent_waiters > 0) {
 		mutex_exit(&zevent_lock);
-		schedule();
+		kpreempt(KPREEMPT_SYNC);
 		mutex_enter(&zevent_lock);
 	}
 	mutex_exit(&zevent_lock);
diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c
index 19e334916b..4ecce8214f 100644
--- a/module/zfs/spa_log_spacemap.c
+++ b/module/zfs/spa_log_spacemap.c
@@ -1176,7 +1176,7 @@ spa_ld_log_sm_data(spa_t *spa)
 		}
 
 		/* Load TXG log spacemap into ms_unflushed_allocs/frees. */
-		cond_resched();
+		kpreempt(KPREEMPT_SYNC);
 		ASSERT0(sls->sls_nblocks);
 		sls->sls_nblocks = space_map_nblocks(sls->sls_sm);
 		spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;