OpenZFS 9284 - arc_reclaim_thread has 2 jobs

Following the fix for 9018 (Replace kmem_cache_reap_now() with kmem_cache_reap_soon), the arc_reclaim_thread() no longer blocks while reaping. However, the code is still confusing and error-prone, because this thread has two responsibilities. We should instead separate this into two threads each with their own responsibility: 1. keep `arc_size` under `arc_c`, by calling `arc_adjust()`, which improves `arc_is_overflowing()` 2. keep enough free memory in the system, by calling `arc_kmem_reap_now()` plus `arc_shrink()`, which improves `arc_available_memory()`. Furthermore, we can use the zthr infrastructure to separate the "should we do something" from "do it" parts of the logic, and normalize the start up / shut down of the threads. Authored by: Brad Lewis <brad.lewis@delphix.com> Reviewed by: Matt Ahrens <mahrens@delphix.com> Reviewed by: Serapheim Dimitropoulos <serapheim@delphix.com> Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com> Reviewed by: Dan Kimmel <dan.kimmel@delphix.com> Reviewed by: Paul Dagnelie <pcd@delphix.com> Reviewed by: Dan McDonald <danmcd@joyent.com> Reviewed by: Tim Kordas <tim.kordas@joyent.com> Reviewed by: Tim Chase <tim@chase2k.com> Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Ported-by: Brad Lewis <brad.lewis@delphix.com> Signed-off-by: Brad Lewis <brad.lewis@delphix.com> OpenZFS-issue: https://www.illumos.org/issues/9284 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/de753e34f9 Closes #8165
2017-03-15 16:41:52 -07:00 · 2017-03-15 16:41:52 -07:00 · 3ec34e5527
parent 00f198de6b
commit 3ec34e5527
7 changed files with 292 additions and 179 deletions
--- a/include/spl/sys/kmem.h
+++ b/include/spl/sys/kmem.h
@ -163,6 +163,7 @@ extern unsigned int spl_kmem_alloc_max;
 #define	kmem_alloc(sz, fl)	spl_kmem_alloc((sz), (fl), __func__, __LINE__)
 #define	kmem_zalloc(sz, fl)	spl_kmem_zalloc((sz), (fl), __func__, __LINE__)
 #define	kmem_free(ptr, sz)	spl_kmem_free((ptr), (sz))
 #define	kmem_cache_reap_active	spl_kmem_cache_reap_active
 extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line);
 extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line);
@ -181,5 +182,6 @@ extern void spl_kmem_free_track(const void *buf, size_t size);
 extern int spl_kmem_init(void);
 extern void spl_kmem_fini(void);
 extern int spl_kmem_cache_reap_active(void);
 #endif	/* _SPL_KMEM_H */
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@ -773,6 +773,7 @@ typedef int fstrans_cookie_t;
 extern fstrans_cookie_t spl_fstrans_mark(void);
 extern void spl_fstrans_unmark(fstrans_cookie_t);
 extern int __spl_pf_fstrans_check(void);
 extern int kmem_cache_reap_active(void);
 #define	____cacheline_aligned
--- a/include/sys/zthr.h
+++ b/include/sys/zthr.h
@ -29,6 +29,7 @@ struct zthr {
 	kmutex_t	zthr_lock;
 	kcondvar_t	zthr_cv;
 	boolean_t	zthr_cancel;
 	hrtime_t	zthr_wait_time;
 	zthr_checkfunc_t	*zthr_checkfunc;
 	zthr_func_t	*zthr_func;
@ -38,6 +39,9 @@ struct zthr {
 /*
 * Create a zthr that has no wakeup timer; zthr_create_timer() is the
 * variant that also wakes on its own after nano_wait.  The checkfunc
 * parameter is spelled as a pointer so that this prototype matches both
 * zthr_create_timer() and the definition in zthr.c.
 */
 extern zthr_t *zthr_create(zthr_checkfunc_t *checkfunc,
    zthr_func_t *func, void *arg);
 extern zthr_t *zthr_create_timer(zthr_checkfunc_t *checkfunc,
    zthr_func_t *func, void *arg, hrtime_t nano_wait);
 extern void zthr_exit(zthr_t *t, int rc);
 extern void zthr_destroy(zthr_t *t);
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@ -1276,6 +1276,12 @@ __spl_pf_fstrans_check(void)
 	return (0);
 }
 /*
 * libzpool stand-in for kmem_cache_reap_active().  It always returns 0,
 * so callers that test it (e.g. before scheduling another reap) never
 * see a reap as being in progress.
 */
 int
 kmem_cache_reap_active(void)
 {
 	return (0);
 }
 void *zvol_tag = "zvol_tag";
 void
--- a/module/spl/spl-kmem-cache.c
+++ b/module/spl/spl-kmem-cache.c
@ -1732,6 +1732,18 @@ out:
 }
 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
 /*
 * Report whether a kmem cache reap is currently in progress.
 *
 * This is stubbed out for code consistency with other platforms and
 * always returns 0.  There is existing logic to prevent concurrent
 * reaping so while this is ugly it should do no harm.
 *
 * The parameter list is spelled (void) so that the definition is a real
 * prototype and matches the declaration in the kmem header.
 */
 int
 spl_kmem_cache_reap_active(void)
 {
 	return (0);
 }
 EXPORT_SYMBOL(spl_kmem_cache_reap_active);
 /*
 * Reap all free slabs from all registered caches.
 */
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@ -20,10 +20,10 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
- * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
 */
 /*
@ -299,7 +299,7 @@
 #endif
 #include <sys/callb.h>
 #include <sys/kstat.h>
-#include <sys/dmu_tx.h>
+#include <sys/zthr.h>
 #include <zfs_fletcher.h>
 #include <sys/arc_impl.h>
 #include <sys/trace_arc.h>
@ -311,10 +311,22 @@
 boolean_t arc_watch = B_FALSE;
 #endif
-static kmutex_t		arc_reclaim_lock;
+/*
-static kcondvar_t	arc_reclaim_thread_cv;
+ * This thread's job is to keep enough free memory in the system, by
-static boolean_t	arc_reclaim_thread_exit;
+ * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
-static kcondvar_t	arc_reclaim_waiters_cv;
+ * arc_available_memory().
 */
 static zthr_t		*arc_reap_zthr;
 /*
 * This thread's job is to keep arc_size under arc_c, by calling
 * arc_adjust(), which improves arc_is_overflowing().
 */
 static zthr_t		*arc_adjust_zthr;
 static kmutex_t		arc_adjust_lock;
 static kcondvar_t	arc_adjust_waiters_cv;
 static boolean_t	arc_adjust_needed = B_FALSE;
 /*
 * The number of headers to evict in arc_evict_state_impl() before
@ -326,20 +338,25 @@ static kcondvar_t	arc_reclaim_waiters_cv;
 int zfs_arc_evict_batch_limit = 10;
 /* number of seconds before growing cache again */
-static int		arc_grow_retry = 5;
+static int arc_grow_retry = 5;
 /*
 * Minimum time between calls to arc_kmem_reap_soon().
 */
 int arc_kmem_cache_reap_retry_ms = 1000;
 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
-int		zfs_arc_overflow_shift = 8;
+int zfs_arc_overflow_shift = 8;
 /* shift of arc_c for calculating both min and max arc_p */
-static int		arc_p_min_shift = 4;
+int arc_p_min_shift = 4;
 /* log2(fraction of arc to reclaim) */
-static int		arc_shrink_shift = 7;
+static int arc_shrink_shift = 7;
 /* percent of pagecache to reclaim arc to */
 #ifdef _KERNEL
-static uint_t		zfs_arc_pc_percent = 0;
+static uint_t zfs_arc_pc_percent = 0;
 #endif
 /*
@ -366,7 +383,10 @@ static int		arc_min_prescient_prefetch_ms;
 */
 int arc_lotsfree_percent = 10;
-static int arc_dead;
+/*
 * hdr_recl() uses this to determine if the arc is up and running.
 */
 static boolean_t arc_initialized;
 /*
 * The arc has filled available memory and has now warmed up.
@ -906,6 +926,7 @@ aggsum_t astat_bonus_size;
 aggsum_t astat_hdr_size;
 aggsum_t astat_l2_hdr_size;
 static hrtime_t arc_growtime;
 static list_t arc_prune_list;
 static kmutex_t arc_prune_mtx;
 static taskq_t *arc_prune_taskq;
@ -1380,8 +1401,8 @@ hdr_recl(void *unused)
 	 * umem calls the reclaim func when we destroy the buf cache,
 	 * which is after we do arc_fini().
 	 */
-	if (!arc_dead)
+	if (arc_initialized)
-		cv_signal(&arc_reclaim_thread_cv);
+		zthr_wakeup(arc_reap_zthr);
 }
 static void
@ -4097,13 +4118,14 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 			 * function should proceed in this case).
 			 *
 			 * If threads are left sleeping, due to not
-			 * using cv_broadcast, they will be woken up
+			 * using cv_broadcast here, they will be woken
-			 * just before arc_reclaim_thread() sleeps.
+			 * up via cv_broadcast in arc_adjust_cb() just
 			 * before arc_adjust_zthr sleeps.
 			 */
-			mutex_enter(&arc_reclaim_lock);
+			mutex_enter(&arc_adjust_lock);
 			if (!arc_is_overflowing())
-				cv_signal(&arc_reclaim_waiters_cv);
+				cv_signal(&arc_adjust_waiters_cv);
-			mutex_exit(&arc_reclaim_lock);
+			mutex_exit(&arc_adjust_lock);
 		} else {
 			ARCSTAT_BUMP(arcstat_mutex_miss);
 		}
@ -4763,8 +4785,8 @@ arc_flush(spa_t *spa, boolean_t retry)
 	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
 }
-void
+static void
-arc_shrink(int64_t to_free)
+arc_reduce_target_size(int64_t to_free)
 {
 	uint64_t asize = aggsum_value(&arc_size);
 	uint64_t c = arc_c;
@ -4782,10 +4804,14 @@ arc_shrink(int64_t to_free)
 		arc_c = arc_c_min;
 	}
-	if (asize > arc_c)
+	if (asize > arc_c) {
-		(void) arc_adjust();
+		/* See comment in arc_adjust_cb_check() on why lock+flag */
 		mutex_enter(&arc_adjust_lock);
 		arc_adjust_needed = B_TRUE;
 		mutex_exit(&arc_adjust_lock);
 		zthr_wakeup(arc_adjust_zthr);
 	}
 }
 /*
 * Return maximum amount of memory that we could possibly use.  Reduced
 * to half of all memory in user space which is primarily used for testing.
@ -4989,7 +5015,7 @@ arc_reclaim_needed(void)
 }
 static void
-arc_kmem_reap_now(void)
+arc_kmem_reap_soon(void)
 {
 	size_t			i;
 	kmem_cache_t		*prev_cache = NULL;
@ -5044,135 +5070,169 @@ arc_kmem_reap_now(void)
 	}
 }
 /*
 * Check callback for arc_adjust_zthr (see arc_init()).  Refreshes the
 * arcstats kstat when one exists, then returns arc_adjust_needed; neither
 * argument is used.  The flag is set by the paths that want eviction to
 * run and is cleared by arc_adjust_cb().
 */
 /* ARGSUSED */
 static boolean_t
 arc_adjust_cb_check(void *arg, zthr_t *zthr)
 {
 	/*
 	 * This is necessary in order to keep the kstat information
 	 * up to date for tools that display kstat data such as the
 	 * mdb ::arc dcmd and the Linux crash utility.  These tools
 	 * typically do not call kstat's update function, but simply
 	 * dump out stats from the most recent update.  Without
 	 * this call, these commands may show stale stats for the
 	 * anon, mru, mru_ghost, mfu, and mfu_ghost lists.  Even
 	 * with this change, the data might be up to 1 second
 	 * out of date (the arc_adjust_zthr has a maximum sleep
 	 * time of 1 second); but that should suffice.  The
 	 * arc_state_t structures can be queried directly if more
 	 * accurate information is needed.
 	 *
 	 * NOTE(review): arc_init() creates arc_adjust_zthr with
 	 * zthr_create(), i.e. without a wakeup timer, so the 1 second
 	 * bound above presumably only holds while the zthr is being
 	 * woken at least that often -- confirm.
 	 */
 	if (arc_ksp != NULL)
 		arc_ksp->ks_update(arc_ksp, KSTAT_READ);
 	/*
 	 * We have to rely on arc_get_data_impl() to tell us when to adjust,
 	 * rather than checking if we are overflowing here, so that we are
 	 * sure to not leave arc_get_data_impl() waiting on
 	 * arc_adjust_waiters_cv.  If we have become "not overflowing" since
 	 * arc_get_data_impl() checked, we need to wake it up.  We could
 	 * broadcast the CV here, but arc_get_data_impl() may have not yet
 	 * gone to sleep.  We would need to use a mutex to ensure that this
 	 * function doesn't broadcast until arc_get_data_impl() has gone to
 	 * sleep (e.g. the arc_adjust_lock).  However, the lock ordering of
 	 * such a lock would necessarily be incorrect with respect to the
 	 * zthr_lock, which is held before this function is called, and is
 	 * held by arc_get_data_impl() when it calls zthr_wakeup().
 	 *
 	 * Hence the flag is read here without taking arc_adjust_lock.
 	 */
 	return (arc_adjust_needed);
 }
 /*
- * Threads can block in arc_get_data_impl() waiting for this thread to evict
+ * Keep arc_size under arc_c by running arc_adjust which evicts data
- * enough data and signal them to proceed. When this happens, the threads in
+ * from the ARC.
 * arc_get_data_impl() are sleeping while holding the hash lock for their
 * particular arc header. Thus, we must be careful to never sleep on a
 * hash lock in this thread. This is to prevent the following deadlock:
 *
 *  - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
 *    waiting for the reclaim thread to signal it.
 *
 *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
 *    fails, and goes to sleep forever.
 *
 * This possible deadlock is avoided by always acquiring a hash lock
 * using mutex_tryenter() from arc_reclaim_thread().
 */
 /* ARGSUSED */
-static void
+static int
-arc_reclaim_thread(void *unused)
+arc_adjust_cb(void *arg, zthr_t *zthr)
 {
-	fstrans_cookie_t	cookie = spl_fstrans_mark();
+	uint64_t evicted = 0;
-	hrtime_t		growtime = 0;
+	fstrans_cookie_t cookie = spl_fstrans_mark();
 	callb_cpr_t		cpr;
-	CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
+	/* Evict from cache */
-
+	evicted = arc_adjust();
 	mutex_enter(&arc_reclaim_lock);
 	while (!arc_reclaim_thread_exit) {
 		uint64_t evicted = 0;
 		uint64_t need_free = arc_need_free;
 		arc_tuning_update();
 	/*
 	 * If evicted is zero, we couldn't evict anything
 	 * via arc_adjust(). This could be due to hash lock
 	 * collisions, but more likely due to the majority of
 	 * arc buffers being unevictable. Therefore, even if
 	 * arc_size is above arc_c, another pass is unlikely to
 	 * be helpful and could potentially cause us to enter an
 	 * infinite loop.  Additionally, zthr_iscancelled() is
 	 * checked here so that if the arc is shutting down, the
 	 * broadcast will wake any remaining arc adjust waiters.
 	 */
 	mutex_enter(&arc_adjust_lock);
 	arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) &&
 	    evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0;
 	if (!arc_adjust_needed) {
 		/*
-		 * This is necessary in order for the mdb ::arc dcmd to
+		 * We're either no longer overflowing, or we
-		 * show up to date information. Since the ::arc command
+		 * can't evict anything more, so we should wake
-		 * does not call the kstat's update function, without
+		 * arc_get_data_impl() sooner.
 		 * this call, the command may show stale stats for the
 		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
 		 * with this change, the data might be up to 1 second
 		 * out of date; but that should suffice. The arc_state_t
 		 * structures can be queried directly if more accurate
 		 * information is needed.
 		 */
-#ifndef __linux__
+		cv_broadcast(&arc_adjust_waiters_cv);
-		if (arc_ksp != NULL)
+		arc_need_free = 0;
-			arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+	}
-#endif
+	mutex_exit(&arc_adjust_lock);
-		mutex_exit(&arc_reclaim_lock);
+	spl_fstrans_unmark(cookie);
 	return (0);
 }
 /* ARGSUSED */
 static boolean_t
 arc_reap_cb_check(void *arg, zthr_t *zthr)
 {
 	int64_t free_memory = arc_available_memory();
 	/*
 	 * If a kmem reap is already active, don't schedule more.  We must
 	 * check for this because kmem_cache_reap_soon() won't actually
 	 * block on the cache being reaped (this is to prevent callers from
 	 * becoming implicitly blocked by a system-wide kmem reap -- which,
 	 * on a system with many, many full magazines, can take minutes).
 	 */
 	if (!kmem_cache_reap_active() && free_memory < 0) {
 		arc_no_grow = B_TRUE;
 		arc_warm = B_TRUE;
 		/*
-		 * We call arc_adjust() before (possibly) calling
+		 * Wait at least zfs_grow_retry (default 5) seconds
-		 * arc_kmem_reap_now(), so that we can wake up
+		 * before considering growing.
 		 * arc_get_data_buf() sooner.
 		 */
-		evicted = arc_adjust();
+		arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
-
+		return (B_TRUE);
-		int64_t free_memory = arc_available_memory();
+	} else if (free_memory < arc_c >> arc_no_grow_shift) {
-		if (free_memory < 0) {
+		arc_no_grow = B_TRUE;
-
+	} else if (gethrtime() >= arc_growtime) {
-			arc_no_grow = B_TRUE;
+		arc_no_grow = B_FALSE;
 			arc_warm = B_TRUE;
 			/*
 			 * Wait at least zfs_grow_retry (default 5) seconds
 			 * before considering growing.
 			 */
 			growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
 			arc_kmem_reap_now();
 			/*
 			 * If we are still low on memory, shrink the ARC
 			 * so that we have arc_shrink_min free space.
 			 */
 			free_memory = arc_available_memory();
 			int64_t to_free =
 			    (arc_c >> arc_shrink_shift) - free_memory;
 			if (to_free > 0) {
 #ifdef _KERNEL
 				to_free = MAX(to_free, need_free);
 #endif
 				arc_shrink(to_free);
 			}
 		} else if (free_memory < arc_c >> arc_no_grow_shift) {
 			arc_no_grow = B_TRUE;
 		} else if (gethrtime() >= growtime) {
 			arc_no_grow = B_FALSE;
 		}
 		mutex_enter(&arc_reclaim_lock);
 		/*
 		 * If evicted is zero, we couldn't evict anything via
 		 * arc_adjust(). This could be due to hash lock
 		 * collisions, but more likely due to the majority of
 		 * arc buffers being unevictable. Therefore, even if
 		 * arc_size is above arc_c, another pass is unlikely to
 		 * be helpful and could potentially cause us to enter an
 		 * infinite loop.
 		 */
 		if (aggsum_compare(&arc_size, arc_c) <= 0|| evicted == 0) {
 			/*
 			 * We're either no longer overflowing, or we
 			 * can't evict anything more, so we should wake
 			 * up any threads before we go to sleep and remove
 			 * the bytes we were working on from arc_need_free
 			 * since nothing more will be done here.
 			 */
 			cv_broadcast(&arc_reclaim_waiters_cv);
 			ARCSTAT_INCR(arcstat_need_free, -need_free);
 			/*
 			 * Block until signaled, or after one second (we
 			 * might need to perform arc_kmem_reap_now()
 			 * even if we aren't being signalled)
 			 */
 			CALLB_CPR_SAFE_BEGIN(&cpr);
 			(void) cv_timedwait_sig_hires(&arc_reclaim_thread_cv,
 			    &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
 			CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
 		}
 	}
-	arc_reclaim_thread_exit = B_FALSE;
+	return (B_FALSE);
-	cv_broadcast(&arc_reclaim_thread_cv);
+}
-	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_lock */
+
 /*
 * Keep enough free memory in the system by reaping the ARC's kmem
 * caches.  To cause more slabs to be reapable, we may reduce the
 * target size of the cache (arc_c), causing the arc_adjust_cb()
 * to free more buffers.
 */
 /* ARGSUSED */
 static int
 arc_reap_cb(void *arg, zthr_t *zthr)
 {
 	int64_t free_memory;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	/*
 	 * Kick off asynchronous kmem_reap()'s of all our caches.
 	 */
 	arc_kmem_reap_soon();
 	/*
 	 * Wait at least arc_kmem_cache_reap_retry_ms between
 	 * arc_kmem_reap_soon() calls. Without this check it is possible to
 	 * end up in a situation where we spend lots of time reaping
 	 * caches, while we're near arc_c_min.  Waiting here also gives the
 	 * subsequent free memory check a chance of finding that the
 	 * asynchronous reap has already freed enough memory, and we don't
 	 * need to call arc_reduce_target_size().
 	 */
 	delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
 	/*
 	 * Reduce the target size as needed to maintain the amount of free
 	 * memory in the system at a fraction of the arc_size (1/128th by
 	 * default).  If oversubscribed (free_memory < 0) then reduce the
 	 * target arc_size by the deficit amount plus the fractional
 	 * amount.  If free memory is positive but less than the fractional
 	 * amount, reduce by what is needed to hit the fractional amount.
 	 */
 	free_memory = arc_available_memory();
 	int64_t to_free =
 	    (arc_c >> arc_shrink_shift) - free_memory;
 	if (to_free > 0) {
 #ifdef _KERNEL
 		to_free = MAX(to_free, arc_need_free);
 #endif
 		arc_reduce_target_size(to_free);
 	}
 	spl_fstrans_unmark(cookie);
-	thread_exit();
+
 	return (0);
 }
 #ifdef _KERNEL
@ -5276,21 +5336,21 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
 		return (SHRINK_STOP);
 	/* Reclaim in progress */
-	if (mutex_tryenter(&arc_reclaim_lock) == 0) {
+	if (mutex_tryenter(&arc_adjust_lock) == 0) {
 		ARCSTAT_INCR(arcstat_need_free, ptob(sc->nr_to_scan));
 		return (0);
 	}
-	mutex_exit(&arc_reclaim_lock);
+	mutex_exit(&arc_adjust_lock);
 	/*
 	 * Evict the requested number of pages by shrinking arc_c the
 	 * requested amount.
 	 */
 	if (pages > 0) {
-		arc_shrink(ptob(sc->nr_to_scan));
+		arc_reduce_target_size(ptob(sc->nr_to_scan));
 		if (current_is_kswapd())
-			arc_kmem_reap_now();
+			arc_kmem_reap_soon();
 #ifdef HAVE_SPLIT_SHRINKER_CALLBACK
 		pages = MAX((int64_t)pages -
 		    (int64_t)btop(arc_evictable_memory()), 0);
@ -5300,7 +5360,7 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
 		/*
 		 * We've shrunk what we can, wake up threads.
 		 */
-		cv_broadcast(&arc_reclaim_waiters_cv);
+		cv_broadcast(&arc_adjust_waiters_cv);
 	} else
 		pages = SHRINK_STOP;
@ -5315,7 +5375,7 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
 		ARCSTAT_BUMP(arcstat_memory_indirect_count);
 	} else {
 		arc_no_grow = B_TRUE;
-		arc_kmem_reap_now();
+		arc_kmem_reap_soon();
 		ARCSTAT_BUMP(arcstat_memory_direct_count);
 	}
@ -5369,8 +5429,11 @@ arc_adapt(int bytes, arc_state_t *state)
 	}
 	ASSERT((int64_t)arc_p >= 0);
 	/*
 	 * Wake reap thread if we do not have any available memory
 	 */
 	if (arc_reclaim_needed()) {
-		cv_signal(&arc_reclaim_thread_cv);
+		zthr_wakeup(arc_reap_zthr);
 		return;
 	}
@ -5478,7 +5541,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 	 * overflowing; thus we don't use a while loop here.
 	 */
 	if (arc_is_overflowing()) {
-		mutex_enter(&arc_reclaim_lock);
+		mutex_enter(&arc_adjust_lock);
 		/*
 		 * Now that we've acquired the lock, we may no longer be
@ -5492,11 +5555,12 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 		 * shouldn't cause any harm.
 		 */
 		if (arc_is_overflowing()) {
-			cv_signal(&arc_reclaim_thread_cv);
+			arc_adjust_needed = B_TRUE;
-			cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
+			zthr_wakeup(arc_adjust_zthr);
 			(void) cv_wait(&arc_adjust_waiters_cv,
 			    &arc_adjust_lock);
 		}
-
+		mutex_exit(&arc_adjust_lock);
 		mutex_exit(&arc_reclaim_lock);
 	}
 	VERIFY3U(hdr->b_type, ==, type);
@ -7687,10 +7751,8 @@ void
 arc_init(void)
 {
 	uint64_t percent, allmem = arc_all_memory();
-
+	mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
 	arc_min_prefetch_ms = 1000;
 	arc_min_prescient_prefetch_ms = 6000;
@ -7750,6 +7812,13 @@ arc_init(void)
 		arc_c = arc_c_min;
 	arc_state_init();
 	/*
 	 * The arc must be "uninitialized", so that hdr_recl() (which is
 	 * registered by buf_init()) will not access arc_reap_zthr before
 	 * it is created.
 	 */
 	ASSERT(!arc_initialized);
 	buf_init();
 	list_create(&arc_prune_list, sizeof (arc_prune_t),
@ -7759,8 +7828,6 @@ arc_init(void)
 	arc_prune_taskq = taskq_create("arc_prune", max_ncpus, defclsyspri,
 	    max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 	arc_reclaim_thread_exit = B_FALSE;
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
@ -7770,10 +7837,12 @@ arc_init(void)
 		kstat_install(arc_ksp);
 	}
-	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
+	arc_adjust_zthr = zthr_create(arc_adjust_cb_check,
-	    TS_RUN, defclsyspri);
+	    arc_adjust_cb, NULL);
 	arc_reap_zthr = zthr_create_timer(arc_reap_cb_check,
 	    arc_reap_cb, NULL, SEC2NSEC(1));
-	arc_dead = B_FALSE;
+	arc_initialized = B_TRUE;
 	arc_warm = B_FALSE;
 	/*
@ -7805,22 +7874,10 @@ arc_fini(void)
 	spl_unregister_shrinker(&arc_shrinker);
 #endif /* _KERNEL */
 	mutex_enter(&arc_reclaim_lock);
 	arc_reclaim_thread_exit = B_TRUE;
 	/*
 	 * The reclaim thread will set arc_reclaim_thread_exit back to
 	 * B_FALSE when it is finished exiting; we're waiting for that.
 	 */
 	while (arc_reclaim_thread_exit) {
 		cv_signal(&arc_reclaim_thread_cv);
 		cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
 	}
 	mutex_exit(&arc_reclaim_lock);
 	/* Use B_TRUE to ensure *all* buffers are evicted */
 	arc_flush(NULL, B_TRUE);
-	arc_dead = B_TRUE;
+	arc_initialized = B_FALSE;
 	if (arc_ksp != NULL) {
 		kstat_delete(arc_ksp);
@ -7841,9 +7898,14 @@ arc_fini(void)
 	list_destroy(&arc_prune_list);
 	mutex_destroy(&arc_prune_mtx);
-	mutex_destroy(&arc_reclaim_lock);
+	(void) zthr_cancel(arc_adjust_zthr);
-	cv_destroy(&arc_reclaim_thread_cv);
+	zthr_destroy(arc_adjust_zthr);
-	cv_destroy(&arc_reclaim_waiters_cv);
+
 	(void) zthr_cancel(arc_reap_zthr);
 	zthr_destroy(arc_reap_zthr);
 	mutex_destroy(&arc_adjust_lock);
 	cv_destroy(&arc_adjust_waiters_cv);
 	/*
 	 * buf_fini() must precede arc_state_fini() because buf_fini() may
--- a/module/zfs/zthr.c
+++ b/module/zfs/zthr.c
@ -47,6 +47,10 @@
 * 3] When the zthr is done, it changes the indicator to stopped, allowing
 *    a new cycle to start.
 *
 * Besides being awakened by other threads, a zthr can be configured
 * during creation to wakeup on its own after a specified interval
 * [see zthr_create_timer()].
 *
 * == ZTHR creation
 *
 * Every zthr needs three inputs to start running:
@ -74,6 +78,9 @@
 *
 * To start a zthr:
 *     zthr_t *zthr_pointer = zthr_create(checkfunc, func, args);
 * or
 *     zthr_t *zthr_pointer = zthr_create_timer(checkfunc, func,
 *         args, max_sleep);
 *
 * After that you should be able to wakeup, cancel, and resume the
 * zthr from another thread using zthr_pointer.
@ -189,7 +196,13 @@ zthr_procedure(void *arg)
 			mutex_enter(&t->zthr_lock);
 		} else {
 			/* go to sleep */
-			cv_wait_sig(&t->zthr_cv, &t->zthr_lock);
+			if (t->zthr_wait_time == 0) {
 				cv_wait_sig(&t->zthr_cv, &t->zthr_lock);
 			} else {
 				(void) cv_timedwait_sig_hires(&t->zthr_cv,
 				    &t->zthr_lock, t->zthr_wait_time,
 				    MSEC2NSEC(1), 0);
 			}
 		}
 	}
 	mutex_exit(&t->zthr_lock);
@ -199,6 +212,18 @@ zthr_procedure(void *arg)
 /*
 * Create a zthr without a wakeup timer.  This is zthr_create_timer()
 * with a max_sleep of 0, in which case zthr_procedure() sleeps in
 * cv_wait_sig() rather than in a timed wait.
 */
 zthr_t *
 zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg)
 {
 	return (zthr_create_timer(checkfunc, func, arg, (hrtime_t)0));
 }
 /*
 * Create a zthr with a specified maximum sleep time.  If the time
 * spent sleeping exceeds max_sleep, a wakeup (do the check and
 * start working if required) will be triggered.
 */
 zthr_t *
 zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func,
    void *arg, hrtime_t max_sleep)
 {
 	zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 	mutex_init(&t->zthr_lock, NULL, MUTEX_DEFAULT, NULL);
@ -208,6 +233,7 @@ zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg)
 	t->zthr_checkfunc = checkfunc;
 	t->zthr_func = func;
 	t->zthr_arg = arg;
 	t->zthr_wait_time = max_sleep;
 	t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
 	    0, &p0, TS_RUN, minclsyspri);