diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index e106499191..d4f16692ff 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -58,6 +58,9 @@
  *     the transaction group number is less than the current, open txg.
  *     If you add a new test, please do this if applicable.
  *
+ * (7) Threads are created with a reduced stack size, for sanity checking.
+ *     Therefore, it's important not to allocate huge buffers on the stack.
+ *
  * When run with no arguments, ztest runs for about five minutes and
  * produces no output if successful.  To get a little bit of information,
  * specify -V.  To get more information, specify -VV, and so on.
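
Guideline (7) is what drives the iobuf change further down in this file: with roughly 8K of stack per thread, a SPA_MAXBLOCKSIZE buffer can no longer live on the stack. A minimal sketch of the resulting pattern, with a hypothetical check_buf() standing in for ztest_dmu_write_parallel(): take the buffer from the heap with umem_alloc(), and funnel every early return through a single out: label so the matching umem_free() always runs.

    static void
    check_buf(size_t size)
    {
    	char *buf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);

    	if (size == 0)			/* early exits must still free */
    		goto out;

    	/* ... fill and verify buf ... */
    out:
    	umem_free(buf, SPA_MAXBLOCKSIZE);
    }
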
@@ -141,7 +144,6 @@ typedef struct ztest_args {
 	objset_t	*za_os;
 	zilog_t		*za_zilog;
 	kthread_t	*za_thread;
-	kt_did_t	za_threadid;
 	uint64_t	za_instance;
 	uint64_t	za_random;
 	uint64_t	za_diroff;
@@ -157,6 +159,7 @@ typedef struct ztest_args {
 	ztest_block_tag_t za_wbt;
 	dmu_object_info_t za_doi;
 	dmu_buf_t	*za_dbuf;
+	boolean_t	za_exited;
 } ztest_args_t;
 
 typedef void ztest_func_t(ztest_args_t *);
@@ -253,6 +256,8 @@ typedef struct ztest_shared {
 	kmutex_t		zs_sync_lock[ZTEST_SYNC_LOCKS];
 	uint64_t		zs_seq[ZTEST_SYNC_LOCKS];
 	ztest_cb_list_t		zs_cb_list;
+	kmutex_t		zs_thr_lock;
+	kcondvar_t		zs_thr_cv;
 } ztest_shared_t;
 
 static char ztest_dev_template[] = "%s/%s.%llua";
@@ -264,6 +269,7 @@ static int ztest_dump_core = 1;
 
 static uint64_t metaslab_sz;
 static boolean_t ztest_exiting;
+static boolean_t resume_thr_exited;
 
 extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
@@ -2571,7 +2577,7 @@ ztest_dmu_write_parallel(ztest_args_t *za)
 	uint64_t off, txg, txg_how;
 	kmutex_t *lp;
 	char osname[MAXNAMELEN];
-	char iobuf[SPA_MAXBLOCKSIZE];
+	char *iobuf;
 	blkptr_t blk;
 	uint64_t blkoff;
 	zbookmark_t zb;
@@ -2741,6 +2747,8 @@ ztest_dmu_write_parallel(ztest_args_t *za)
 	ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
 	ASSERT3U(BP_GET_LSIZE(&blk), ==, bs);
 
+	iobuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
 	/*
 	 * Read the block that dmu_sync() returned to make sure its contents
 	 * match what we wrote.  We do this while still txg_suspend()ed
@@ -2759,10 +2767,10 @@ ztest_dmu_write_parallel(ztest_args_t *za)
 	bcopy(&iobuf[blkoff], rbt, btsize);
 
 	if (rbt->bt_objset == 0)		/* concurrent free */
-		return;
+		goto out;
 
 	if (wbt->bt_objset == 0)		/* all-zero overwrite */
-		return;
+		goto out;
 
 	ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset);
 	ASSERT3U(rbt->bt_object, ==, wbt->bt_object);
@@ -2778,6 +2786,8 @@ ztest_dmu_write_parallel(ztest_args_t *za)
 		ASSERT3U(rbt->bt_seq, ==, wbt->bt_seq);
 	else
 		ASSERT3U(rbt->bt_seq, >, wbt->bt_seq);
+out:
+	umem_free(iobuf, SPA_MAXBLOCKSIZE);
 }
 
 /*
@@ -3819,6 +3829,8 @@ ztest_resume_thread(void *arg)
 		ztest_resume(spa);
 	}
 
+	resume_thr_exited = B_TRUE;
+
 	thread_exit();
 	return (NULL);
 }
@@ -3884,6 +3896,13 @@ ztest_thread(void *arg)
 			break;
 	}
 
+	mutex_enter(&zs->zs_thr_lock);
+	za->za_exited = B_TRUE;
+	mutex_exit(&zs->zs_thr_lock);
+
+	/* Announce that the thread has finished */
+	cv_broadcast(&zs->zs_thr_cv);
+
 	thread_exit();
 	return (NULL);
 }
@@ -3900,13 +3919,14 @@ ztest_run(char *pool)
 	spa_t *spa;
 	char name[100];
 	kthread_t *resume_thread;
-	kt_did_t resume_id;
 
 	ztest_exiting = B_FALSE;
 
 	mutex_init(&zs->zs_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
 	rw_init(&zs->zs_name_lock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&zs->zs_cb_list.zcl_callbacks_lock,NULL,MUTEX_DEFAULT,NULL);
+	mutex_init(&zs->zs_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&zs->zs_thr_cv, NULL, CV_DEFAULT, NULL);
 
 	list_create(&zs->zs_cb_list.zcl_callbacks, sizeof (ztest_cb_data_t),
 	    offsetof(ztest_cb_data_t, zcd_node));
@@ -3979,9 +3999,9 @@ ztest_run(char *pool)
 	/*
 	 * Create a thread to periodically resume suspended I/O.
 	 */
+	resume_thr_exited = B_FALSE;
 	VERIFY3P((resume_thread = thread_create(NULL, 0, ztest_resume_thread,
-	         spa, THR_BOUND, NULL, 0, 0)), !=, NULL);
-	resume_id = resume_thread->t_tid;
+	    spa, TS_RUN, NULL, 0, 0)), !=, NULL);
 
 	/*
 	 * Verify that we can safely inquire about any object,
@@ -4057,13 +4077,18 @@ ztest_run(char *pool)
 			za[d].za_zilog = zil_open(za[d].za_os, NULL);
 		}
 
+		za[t].za_exited = B_FALSE;
+
 		VERIFY3P((za[t].za_thread = thread_create(NULL, 0, ztest_thread,
-		         &za[t], THR_BOUND, NULL, 0, 0)), !=, NULL);
-		za[t].za_threadid = za[t].za_thread->t_tid;
+		    &za[t], TS_RUN, NULL, 0, 0)), !=, NULL);
 	}
 
 	while (--t >= 0) {
-		VERIFY(thread_join(za[t].za_threadid, NULL, NULL) == 0);
+		mutex_enter(&zs->zs_thr_lock);
+		while (!za[t].za_exited)
+			cv_wait(&zs->zs_thr_cv, &zs->zs_thr_lock);
+		mutex_exit(&zs->zs_thr_lock);
+
 		if (t < zopt_datasets) {
 			zil_close(za[t].za_zilog);
 			dmu_objset_close(za[t].za_os);
@@ -4102,7 +4127,11 @@ ztest_run(char *pool)
 
 	/* Kill the resume thread */
 	ztest_exiting = B_TRUE;
-	VERIFY(thread_join(resume_id, NULL, NULL) == 0);
+
+	/* Wait for the resume thread to exit */
+	while (!resume_thr_exited)
+		(void) poll(NULL, 0, 200);
+
 	ztest_resume(spa);
 
 	/*
@@ -4118,6 +4147,8 @@ ztest_run(char *pool)
 
 	list_destroy(&zs->zs_cb_list.zcl_callbacks);
 
+	cv_destroy(&zs->zs_thr_cv);
+	mutex_destroy(&zs->zs_thr_lock);
 	mutex_destroy(&zs->zs_cb_list.zcl_callbacks_lock);
 	rw_destroy(&zs->zs_name_lock);
 	mutex_destroy(&zs->zs_vdev_lock);
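
Because the replacement threads are detached (TS_RUN instead of THR_BOUND here, PTHREAD_CREATE_DETACHED in kernel.c below), there is no pthread_join() left to wait on; ztest_run() instead waits for each worker to raise its za_exited flag under zs_thr_lock. Stripped of ztest specifics, the handshake is the classic condition-variable pattern; a self-contained sketch in plain pthreads, with worker() and wait_for_worker() as hypothetical names:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
    static bool exited = false;

    static void *
    worker(void *arg)
    {
    	/* ... do the real work ... */
    	pthread_mutex_lock(&lock);
    	exited = true;			/* publish the flag under the lock */
    	pthread_mutex_unlock(&lock);
    	pthread_cond_broadcast(&cv);	/* announce, as ztest_thread() does */
    	return (NULL);
    }

    static void
    wait_for_worker(void)
    {
    	pthread_mutex_lock(&lock);
    	while (!exited)			/* tolerate spurious wakeups */
    		pthread_cond_wait(&cv, &lock);
    	pthread_mutex_unlock(&lock);
    }

Looping on the flag rather than testing it once is what lets a single zs_thr_cv be shared by all workers: every broadcast wakes every waiter, and each one re-checks only its own za_exited.
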
diff --git a/lib/libzpool/include/sys/zfs_context.h b/lib/libzpool/include/sys/zfs_context.h
index 37931169ca..8742fea6cc 100644
--- a/lib/libzpool/include/sys/zfs_context.h
+++ b/lib/libzpool/include/sys/zfs_context.h
@@ -151,34 +151,34 @@ extern void vpanic(const char *, __va_list);
 /*
  * Threads
  */
-#define THR_BOUND		0x00000001
 #define TS_RUN			0x00000002
 
-typedef void (*thread_func_t)(void *);
+#define STACK_SIZE 8192 /* Linux x86 and amd64 */
+
+typedef void (*thread_func_t)(void);
+typedef void (*thread_func_arg_t)(void *);
 typedef pthread_t kt_did_t;
 
 typedef struct kthread {
-        list_node_t	t_node;
 	kt_did_t	t_tid;
-	pthread_attr_t	t_attr;
+	thread_func_t	t_func;
+	void *		t_arg;
 } kthread_t;
 
+/* XXX tsd_create()/tsd_destroy() missing */
 #define	tsd_get(key)		pthread_getspecific(key)
 #define	tsd_set(key, val)	pthread_setspecific(key, val)
 #define	curthread		zk_thread_current()
 #define thread_exit		zk_thread_exit
 #define thread_create(stk, stksize, func, arg, len, pp, state, pri)	\
-	zk_thread_create(stk, stksize, (thread_func_t)func, arg,	\
-			 len, NULL, state, pri)
-#define thread_join(tid, dtid, status)					\
-	zk_thread_join(tid, dtid, status)
+	zk_thread_create(stk, stksize, (thread_func_t) func, arg, len,  \
+	    NULL, state, pri)
 
 extern kthread_t *zk_thread_current(void);
 extern void zk_thread_exit(void);
 extern kthread_t *zk_thread_create(caddr_t stk, size_t  stksize,
 	thread_func_t func, void *arg, size_t len,
 	void *pp, int state, pri_t pri);
-extern int zk_thread_join(kt_did_t tid, kthread_t *dtid, void **status);
 
 #define	issig(why)	(FALSE)
 #define	ISSIG(thr, why)	(FALSE)
@@ -310,7 +310,7 @@ extern taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
 extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
 extern void	taskq_destroy(taskq_t *);
 extern void	taskq_wait(taskq_t *);
-extern int	taskq_member(taskq_t *, void *);
+extern int	taskq_member(taskq_t *, kthread_t *);
 extern void	system_taskq_init(void);
 extern void	system_taskq_fini(void);
 
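
This header half of the change drops thread_join() and THR_BOUND outright; curthread still expands to zk_thread_current(), but kernel.c (next file) now backs it with pthread thread-specific data rather than a lock-protected list walk. The TSD mechanics reduce to three calls; a minimal sketch, with tsd_setup(), register_self() and current() as hypothetical names:

    #include <pthread.h>

    static pthread_key_t key;

    /* Once per process (cf. thread_init() in kernel.c). */
    static void
    tsd_setup(void)
    {
    	(void) pthread_key_create(&key, NULL);
    }

    /* Each thread stashes a pointer to its own state... */
    static void
    register_self(void *state)
    {
    	(void) pthread_setspecific(key, state);
    }

    /* ...and every later lookup is O(1), with no global lock taken. */
    static void *
    current(void)
    {
    	return (pthread_getspecific(key));
    }
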
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index 97e1bc9d38..f62f6180ed 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -57,155 +57,141 @@ struct utsname utsname = {
  * =========================================================================
  */
 
-/* NOTE: Tracking each tid on a list and using it for curthread lookups
- *       is slow at best but it provides an easy way to provide a kthread
- *       style API on top of pthreads.  For now we just want ztest to work
- *       to validate correctness.  Performance is not much of an issue
- *       since that is what the in-kernel version is for.  That said
- *       reworking this to track the kthread_t structure as thread
- *       specific data would be probably the best way to speed this up.
- */
-
 pthread_cond_t kthread_cond = PTHREAD_COND_INITIALIZER;
 pthread_mutex_t kthread_lock = PTHREAD_MUTEX_INITIALIZER;
-list_t kthread_list;
-
-static int
-thread_count(void)
-{
-	kthread_t *kt;
-	int count = 0;
-
-	for (kt = list_head(&kthread_list); kt != NULL;
-	     kt = list_next(&kthread_list, kt))
-		count++;
-
-	return count;
-}
+pthread_key_t kthread_key;
+int kthread_nr = 0;
 
 static void
 thread_init(void)
 {
 	kthread_t *kt;
 
-	/* Initialize list for tracking kthreads */
-	list_create(&kthread_list, sizeof (kthread_t),
-		    offsetof(kthread_t, t_node));
+	VERIFY3S(pthread_key_create(&kthread_key, NULL), ==, 0);
 
 	/* Create entry for primary kthread */
 	kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL);
-	list_link_init(&kt->t_node);
-	VERIFY3U(kt->t_tid = pthread_self(), !=, 0);
-        VERIFY3S(pthread_attr_init(&kt->t_attr), ==, 0);
-	VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0);
-	list_insert_head(&kthread_list, kt);
-	VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0);
+	kt->t_tid = pthread_self();
+	kt->t_func = NULL;
+
+	VERIFY3S(pthread_setspecific(kthread_key, kt), ==, 0);
+
+	/* Only the main thread should be running at the moment */
+	ASSERT3S(kthread_nr, ==, 0);
+	kthread_nr = 1;
 }
 
 static void
 thread_fini(void)
 {
-	kthread_t *kt;
-	struct timespec ts = { 0 };
-	int count;
+	kthread_t *kt = curthread;
+
+	ASSERT(pthread_equal(kt->t_tid, pthread_self()));
+	ASSERT3P(kt->t_func, ==, NULL);
+
+	umem_free(kt, sizeof(kthread_t));
 
 	/* Wait for all threads to exit via thread_exit() */
 	VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0);
-	while ((count = thread_count()) > 1) {
-		clock_gettime(CLOCK_REALTIME, &ts);
-		ts.tv_sec += 1;
-		pthread_cond_timedwait(&kthread_cond, &kthread_lock, &ts);
-	}
 
-	ASSERT3S(thread_count(), ==, 1);
-	kt = list_head(&kthread_list);
-	list_remove(&kthread_list, kt);
+	kthread_nr--; /* Main thread is exiting */
+
+	while (kthread_nr > 0)
+		VERIFY3S(pthread_cond_wait(&kthread_cond, &kthread_lock), ==,
+		    0);
+
+	ASSERT3S(kthread_nr, ==, 0);
 	VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0);
 
-	VERIFY(pthread_attr_destroy(&kt->t_attr) == 0);
-	umem_free(kt, sizeof(kthread_t));
-
-	/* Cleanup list for tracking kthreads */
-	list_destroy(&kthread_list);
+	VERIFY3S(pthread_key_delete(kthread_key), ==, 0);
 }
 
 kthread_t *
 zk_thread_current(void)
 {
-	kt_did_t tid = pthread_self();
-	kthread_t *kt;
-	int count = 1;
+	kthread_t *kt = pthread_getspecific(kthread_key);
 
-	/*
-	 * Because a newly created thread may call zk_thread_current()
-	 * before the thread parent has had time to add the thread's tid
-	 * to our lookup list.  We will loop as long as there are tid
-	 * which have not yet been set which must be one of ours.
-	 * Yes it's a hack, at some point we can just use native pthreads.
-	 */
-	while (count > 0) {
-		count = 0;
-		VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0);
-		for (kt = list_head(&kthread_list); kt != NULL;
-		     kt = list_next(&kthread_list, kt)) {
-
-			if (kt->t_tid == tid) {
-				VERIFY3S(pthread_mutex_unlock(
-				         &kthread_lock), ==, 0);
-				return kt;
-			}
-
-			if (kt->t_tid == (kt_did_t)-1)
-				count++;
-		}
-		VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0);
-	}
-
-	/* Unreachable */
-	ASSERT(0);
-	return NULL;
-}
-
-kthread_t *
-zk_thread_create(caddr_t stk, size_t  stksize, thread_func_t func, void *arg,
-	      size_t len, void *pp, int state, pri_t pri)
-{
-	kthread_t *kt;
-
-	kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL);
-	kt->t_tid = (kt_did_t)-1;
-	list_link_init(&kt->t_node);
-	VERIFY(pthread_attr_init(&kt->t_attr) == 0);
-
-	VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0);
-	list_insert_head(&kthread_list, kt);
-	VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0);
-
-	VERIFY3U(pthread_create(&kt->t_tid, &kt->t_attr,
-			      (void *(*)(void *))func, arg), ==, 0);
+	ASSERT3P(kt, !=, NULL);
 
 	return kt;
 }
 
-int
-zk_thread_join(kt_did_t tid, kthread_t *dtid, void **status)
+void *
+zk_thread_helper(void *arg)
 {
-	return pthread_join(tid, status);
+	kthread_t *kt = (kthread_t *) arg;
+
+	VERIFY3S(pthread_setspecific(kthread_key, kt), ==, 0);
+
+	VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0);
+	kthread_nr++;
+	VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0);
+
+	kt->t_tid = pthread_self();
+	((thread_func_arg_t) kt->t_func)(kt->t_arg);
+
+	/* Unreachable, thread must exit with thread_exit() */
+	abort();
+
+	return NULL;
+}
+
+kthread_t *
+zk_thread_create(caddr_t stk, size_t stksize, thread_func_t func, void *arg,
+	      size_t len, void *pp, int state, pri_t pri)
+{
+	kthread_t *kt;
+	pthread_t tid;
+	pthread_attr_t attr;
+	size_t stack;
+
+	/*
+	 * Due to a race when getting/setting the thread ID, currently only
+	 * detached threads are supported.
+	 */
+	ASSERT3S(state & ~TS_RUN, ==, 0);
+
+	kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL);
+	kt->t_func = func;
+	kt->t_arg = arg;
+
+	/*
+	 * The Solaris kernel stack size in x86/x64 is 8K, so we reduce the
+	 * default stack size in userspace, for sanity checking.
+	 *
+	 * PTHREAD_STACK_MIN is the stack required for a NULL procedure in
+	 * userspace.
+	 *
+	 * XXX: Stack size for other architectures is not being taken into
+	 * account.
+	 */
+	stack = PTHREAD_STACK_MIN + MAX(stksize, STACK_SIZE);
+
+	VERIFY3S(pthread_attr_init(&attr), ==, 0);
+	VERIFY3S(pthread_attr_setstacksize(&attr, stack), ==, 0);
+	VERIFY3S(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED),
+	    ==, 0);
+
+	VERIFY3S(pthread_create(&tid, &attr, &zk_thread_helper, kt), ==, 0);
+
+	VERIFY3S(pthread_attr_destroy(&attr), ==, 0);
+
+	return kt;
 }
 
 void
 zk_thread_exit(void)
 {
-	kthread_t *kt;
+	kthread_t *kt = curthread;
 
-	VERIFY3P(kt = curthread, !=, NULL);
-	VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0);
-	list_remove(&kthread_list, kt);
-	VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0);
+	ASSERT(pthread_equal(kt->t_tid, pthread_self()));
 
-	VERIFY(pthread_attr_destroy(&kt->t_attr) == 0);
 	umem_free(kt, sizeof(kthread_t));
 
+	pthread_mutex_lock(&kthread_lock);
+	kthread_nr--;
+	pthread_mutex_unlock(&kthread_lock);
+
 	pthread_cond_broadcast(&kthread_cond);
 	pthread_exit(NULL);
 }
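
zk_thread_create() above combines the two properties the rest of the patch leans on: a deliberately small stack (PTHREAD_STACK_MIN plus the 8K STACK_SIZE floor) and PTHREAD_CREATE_DETACHED, which makes threads self-reaping and rules out pthread_join(). A standalone sketch of just that attribute setup, with spawn_detached() as a hypothetical wrapper:

    #include <pthread.h>
    #include <limits.h>		/* PTHREAD_STACK_MIN */

    #define	SMALL_STACK	8192	/* mirrors STACK_SIZE above */

    static int
    spawn_detached(void *(*fn)(void *), void *arg)
    {
    	pthread_t tid;
    	pthread_attr_t attr;
    	int err;

    	(void) pthread_attr_init(&attr);
    	/* PTHREAD_STACK_MIN covers the thread library's own overhead. */
    	(void) pthread_attr_setstacksize(&attr,
    	    PTHREAD_STACK_MIN + SMALL_STACK);
    	/* Detached: resources are reclaimed at exit, joins impossible. */
    	(void) pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
    	err = pthread_create(&tid, &attr, fn, arg);
    	(void) pthread_attr_destroy(&attr);
    	return (err);
    }
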
diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c
index 42e2dd3f43..1efdf1d6fa 100644
--- a/lib/libzpool/taskq.c
+++ b/lib/libzpool/taskq.c
@@ -43,7 +43,6 @@ struct taskq {
 	kcondvar_t	tq_dispatch_cv;
 	kcondvar_t	tq_wait_cv;
 	kthread_t	**tq_threadlist;
-	kt_did_t	*tq_idlist;
 	int		tq_flags;
 	int		tq_active;
 	int		tq_nthreads;
@@ -135,7 +134,7 @@ taskq_wait(taskq_t *tq)
 	mutex_exit(&tq->tq_lock);
 }
 
-static void *
+static void
 taskq_thread(void *arg)
 {
 	taskq_t *tq = arg;
@@ -165,7 +164,6 @@ taskq_thread(void *arg)
 	cv_broadcast(&tq->tq_wait_cv);
 	mutex_exit(&tq->tq_lock);
 	thread_exit();
-	return (NULL);
 }
 
 /*ARGSUSED*/
@@ -200,10 +198,8 @@ taskq_create(const char *name, int nthreads, pri_t pri,
 	tq->tq_maxalloc = maxalloc;
 	tq->tq_task.task_next = &tq->tq_task;
 	tq->tq_task.task_prev = &tq->tq_task;
-	VERIFY3P((tq->tq_threadlist = kmem_alloc(tq->tq_nthreads *
-	         sizeof(kthread_t *), KM_SLEEP)), !=, NULL);
-	VERIFY3P((tq->tq_idlist = kmem_alloc(tq->tq_nthreads *
-	         sizeof(kt_did_t), KM_SLEEP)), !=, NULL);
+	tq->tq_threadlist = kmem_alloc(tq->tq_nthreads * sizeof(kthread_t *),
+	    KM_SLEEP);
 
 	if (flags & TASKQ_PREPOPULATE) {
 		mutex_enter(&tq->tq_lock);
@@ -214,8 +210,7 @@ taskq_create(const char *name, int nthreads, pri_t pri,
 
 	for (t = 0; t < tq->tq_nthreads; t++) {
 		VERIFY((tq->tq_threadlist[t] = thread_create(NULL, 0,
-		       taskq_thread, tq, THR_BOUND, NULL, 0, 0)) != NULL);
-		tq->tq_idlist[t] = tq->tq_threadlist[t]->t_tid;
+		    taskq_thread, tq, TS_RUN, NULL, 0, 0)) != NULL);
 	}
 
 	return (tq);
@@ -224,7 +219,6 @@ taskq_create(const char *name, int nthreads, pri_t pri,
 void
 taskq_destroy(taskq_t *tq)
 {
-	int t;
 	int nthreads = tq->tq_nthreads;
 
 	taskq_wait(tq);
@@ -245,11 +239,7 @@ taskq_destroy(taskq_t *tq)
 
 	mutex_exit(&tq->tq_lock);
 
-	for (t = 0; t < nthreads; t++)
-		VERIFY3S(thread_join(tq->tq_idlist[t], NULL, NULL), ==, 0);
-
 	kmem_free(tq->tq_threadlist, nthreads * sizeof(kthread_t *));
-	kmem_free(tq->tq_idlist, nthreads * sizeof(kt_did_t));
 
 	rw_destroy(&tq->tq_threadlock);
 	mutex_destroy(&tq->tq_lock);
@@ -260,7 +250,7 @@ taskq_destroy(taskq_t *tq)
 }
 
 int
-taskq_member(taskq_t *tq, void *t)
+taskq_member(taskq_t *tq, kthread_t *t)
 {
 	int i;
 
@@ -268,7 +258,7 @@ taskq_member(taskq_t *tq, void *t)
 		return (1);
 
 	for (i = 0; i < tq->tq_nthreads; i++)
-		if (tq->tq_threadlist[i] == (kthread_t *)t)
+		if (tq->tq_threadlist[i] == t)
 			return (1);
 
 	return (0);
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index fb95361f87..e0bc524a37 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -446,8 +446,6 @@ txg_sync_thread(dsl_pool_t *dp)
 		rw_exit(&tx->tx_suspend);
 		cv_broadcast(&tx->tx_sync_done_cv);
 	}
-
-	thread_exit();
 }
 
 static void
@@ -492,8 +490,6 @@ txg_quiesce_thread(dsl_pool_t *dp)
 		cv_broadcast(&tx->tx_sync_more_cv);
 		cv_broadcast(&tx->tx_quiesce_done_cv);
 	}
-
-	thread_exit();
 }
 
 /*