From 9a3d5378c6fc42b9cb47aa0ce5b2507cbfd0579f Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 9 Mar 2010 12:25:28 -0800 Subject: [PATCH 1/2] Clean up emulation of kernel threads in userspace. Updated to use pthread thread specific data rather than keeping a global list. This also fixes at least one easily reproducible crash in ztest --- cmd/ztest/ztest.c | 53 +++++-- lib/libzpool/include/sys/zfs_context.h | 20 +-- lib/libzpool/kernel.c | 200 ++++++++++++------------- lib/libzpool/taskq.c | 22 +-- module/zfs/txg.c | 4 - 5 files changed, 151 insertions(+), 148 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 0f69b2d00b..b2d3ea7422 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -58,6 +58,9 @@ * the transaction group number is less than the current, open txg. * If you add a new test, please do this if applicable. * + * (7) Threads are created with a reduced stack size, for sanity checking. + * Therefore, it's important not to allocate huge buffers on the stack. + * * When run with no arguments, ztest runs for about five minutes and * produces no output if successful. To get a little bit of information, * specify -V. To get more information, specify -VV, and so on. 
@@ -141,7 +144,6 @@ typedef struct ztest_args { objset_t *za_os; zilog_t *za_zilog; kthread_t *za_thread; - kt_did_t za_threadid; uint64_t za_instance; uint64_t za_random; uint64_t za_diroff; @@ -157,6 +159,7 @@ typedef struct ztest_args { ztest_block_tag_t za_wbt; dmu_object_info_t za_doi; dmu_buf_t *za_dbuf; + boolean_t za_exited; } ztest_args_t; typedef void ztest_func_t(ztest_args_t *); @@ -253,6 +256,8 @@ typedef struct ztest_shared { kmutex_t zs_sync_lock[ZTEST_SYNC_LOCKS]; uint64_t zs_seq[ZTEST_SYNC_LOCKS]; ztest_cb_list_t zs_cb_list; + kmutex_t zs_thr_lock; + kcondvar_t zs_thr_cv; } ztest_shared_t; static char ztest_dev_template[] = "%s/%s.%llua"; @@ -264,6 +269,7 @@ static int ztest_dump_core = 1; static uint64_t metaslab_sz; static boolean_t ztest_exiting; +static boolean_t resume_thr_exited; extern uint64_t metaslab_gang_bang; extern uint64_t metaslab_df_alloc_threshold; @@ -2558,7 +2564,7 @@ ztest_dmu_write_parallel(ztest_args_t *za) uint64_t off, txg, txg_how; kmutex_t *lp; char osname[MAXNAMELEN]; - char iobuf[SPA_MAXBLOCKSIZE]; + char *iobuf; blkptr_t blk = { 0 }; uint64_t blkoff; zbookmark_t zb; @@ -2727,6 +2733,8 @@ ztest_dmu_write_parallel(ztest_args_t *za) ASSERT3U(BP_GET_LEVEL(&blk), ==, 0); ASSERT3U(BP_GET_LSIZE(&blk), ==, bs); + iobuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + /* * Read the block that dmu_sync() returned to make sure its contents * match what we wrote. 
We do this while still txg_suspend()ed @@ -2745,10 +2753,10 @@ ztest_dmu_write_parallel(ztest_args_t *za) bcopy(&iobuf[blkoff], rbt, btsize); if (rbt->bt_objset == 0) /* concurrent free */ - return; + goto out; if (wbt->bt_objset == 0) /* all-zero overwrite */ - return; + goto out; ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset); ASSERT3U(rbt->bt_object, ==, wbt->bt_object); @@ -2764,6 +2772,8 @@ ztest_dmu_write_parallel(ztest_args_t *za) ASSERT3U(rbt->bt_seq, ==, wbt->bt_seq); else ASSERT3U(rbt->bt_seq, >, wbt->bt_seq); +out: + umem_free(iobuf, SPA_MAXBLOCKSIZE); } /* @@ -3805,6 +3815,8 @@ ztest_resume_thread(void *arg) ztest_resume(spa); } + resume_thr_exited = B_TRUE; + thread_exit(); return (NULL); } @@ -3870,6 +3882,13 @@ ztest_thread(void *arg) break; } + mutex_enter(&zs->zs_thr_lock); + za->za_exited = B_TRUE; + mutex_exit(&zs->zs_thr_lock); + + /* Announce that the thread has finished */ + cv_broadcast(&zs->zs_thr_cv); + thread_exit(); return (NULL); } @@ -3886,13 +3905,14 @@ ztest_run(char *pool) spa_t *spa; char name[100]; kthread_t *resume_thread; - kt_did_t resume_id; ztest_exiting = B_FALSE; mutex_init(&zs->zs_vdev_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zs->zs_name_lock, NULL, RW_DEFAULT, NULL); mutex_init(&zs->zs_cb_list.zcl_callbacks_lock,NULL,MUTEX_DEFAULT,NULL); + mutex_init(&zs->zs_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zs->zs_thr_cv, NULL, CV_DEFAULT, NULL); list_create(&zs->zs_cb_list.zcl_callbacks, sizeof (ztest_cb_data_t), offsetof(ztest_cb_data_t, zcd_node)); @@ -3965,9 +3985,9 @@ ztest_run(char *pool) /* * Create a thread to periodically resume suspended I/O. 
*/ + resume_thr_exited = B_FALSE; VERIFY3P((resume_thread = thread_create(NULL, 0, ztest_resume_thread, - spa, THR_BOUND, NULL, 0, 0)), !=, NULL); - resume_id = resume_thread->t_tid; + spa, TS_RUN, NULL, 0, 0)), !=, NULL); /* * Verify that we can safely inquire about about any object, @@ -4043,13 +4063,18 @@ ztest_run(char *pool) za[d].za_zilog = zil_open(za[d].za_os, NULL); } + za[t].za_exited = B_FALSE; + VERIFY3P((za[t].za_thread = thread_create(NULL, 0, ztest_thread, - &za[t], THR_BOUND, NULL, 0, 0)), !=, NULL); - za[t].za_threadid = za[t].za_thread->t_tid; + &za[t], TS_RUN, NULL, 0, 0)), !=, NULL); } while (--t >= 0) { - VERIFY(thread_join(za[t].za_threadid, NULL, NULL) == 0); + mutex_enter(&zs->zs_thr_lock); + while (!za[t].za_exited) + cv_wait(&zs->zs_thr_cv, &zs->zs_thr_lock); + mutex_exit(&zs->zs_thr_lock); + if (t < zopt_datasets) { zil_close(za[t].za_zilog); dmu_objset_close(za[t].za_os); @@ -4088,7 +4113,11 @@ ztest_run(char *pool) /* Kill the resume thread */ ztest_exiting = B_TRUE; - VERIFY(thread_join(resume_id, NULL, NULL) == 0); + + /* Wait for the resume thread to exit */ + while (!resume_thr_exited) + (void) poll(NULL, 0, 200); + ztest_resume(spa); /* @@ -4104,6 +4133,8 @@ ztest_run(char *pool) list_destroy(&zs->zs_cb_list.zcl_callbacks); + cv_destroy(&zs->zs_thr_cv); + mutex_destroy(&zs->zs_thr_lock); mutex_destroy(&zs->zs_cb_list.zcl_callbacks_lock); rw_destroy(&zs->zs_name_lock); mutex_destroy(&zs->zs_vdev_lock); diff --git a/lib/libzpool/include/sys/zfs_context.h b/lib/libzpool/include/sys/zfs_context.h index cad7553cc8..522d860ab6 100644 --- a/lib/libzpool/include/sys/zfs_context.h +++ b/lib/libzpool/include/sys/zfs_context.h @@ -192,34 +192,34 @@ _NOTE(CONSTCOND) } while (0) /* * Threads */ -#define THR_BOUND 0x00000001 #define TS_RUN 0x00000002 -typedef void (*thread_func_t)(void *); +#define STACK_SIZE 8192 /* Linux x86 and amd64 */ + +typedef void (*thread_func_t)(void); +typedef void (*thread_func_arg_t)(void *); typedef pthread_t 
kt_did_t; typedef struct kthread { - list_node_t t_node; kt_did_t t_tid; - pthread_attr_t t_attr; + thread_func_t t_func; + void * t_arg; } kthread_t; +/* XXX tsd_create()/tsd_destroy() missing */ #define tsd_get(key) pthread_getspecific(key) #define tsd_set(key, val) pthread_setspecific(key, val) #define curthread zk_thread_current() #define thread_exit zk_thread_exit #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ - zk_thread_create(stk, stksize, (thread_func_t)func, arg, \ - len, NULL, state, pri) -#define thread_join(tid, dtid, status) \ - zk_thread_join(tid, dtid, status) + zk_thread_create(stk, stksize, (thread_func_t) func, arg, len, \ + NULL, state, pri) extern kthread_t *zk_thread_current(void); extern void zk_thread_exit(void); extern kthread_t *zk_thread_create(caddr_t stk, size_t stksize, thread_func_t func, void *arg, size_t len, void *pp, int state, pri_t pri); -extern int zk_thread_join(kt_did_t tid, kthread_t *dtid, void **status); #define issig(why) (FALSE) #define ISSIG(thr, why) (FALSE) @@ -351,7 +351,7 @@ extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern void taskq_destroy(taskq_t *); extern void taskq_wait(taskq_t *); -extern int taskq_member(taskq_t *, void *); +extern int taskq_member(taskq_t *, kthread_t *); extern void system_taskq_init(void); extern void system_taskq_fini(void); diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 1218f20deb..2e003a1f0b 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -58,155 +58,141 @@ struct utsname utsname = { * ========================================================================= */ -/* NOTE: Tracking each tid on a list and using it for curthread lookups - * is slow at best but it provides an easy way to provide a kthread - * style API on top of pthreads. For now we just want ztest to work - * to validate correctness. 
Performance is not much of an issue - * since that is what the in-kernel version is for. That said - * reworking this to track the kthread_t structure as thread - * specific data would be probably the best way to speed this up. - */ - pthread_cond_t kthread_cond = PTHREAD_COND_INITIALIZER; pthread_mutex_t kthread_lock = PTHREAD_MUTEX_INITIALIZER; -list_t kthread_list; - -static int -thread_count(void) -{ - kthread_t *kt; - int count = 0; - - for (kt = list_head(&kthread_list); kt != NULL; - kt = list_next(&kthread_list, kt)) - count++; - - return count; -} +pthread_key_t kthread_key; +int kthread_nr = 0; static void thread_init(void) { kthread_t *kt; - /* Initialize list for tracking kthreads */ - list_create(&kthread_list, sizeof (kthread_t), - offsetof(kthread_t, t_node)); + VERIFY3S(pthread_key_create(&kthread_key, NULL), ==, 0); /* Create entry for primary kthread */ kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL); - list_link_init(&kt->t_node); - VERIFY3U(kt->t_tid = pthread_self(), !=, 0); - VERIFY3S(pthread_attr_init(&kt->t_attr), ==, 0); - VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0); - list_insert_head(&kthread_list, kt); - VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0); + kt->t_tid = pthread_self(); + kt->t_func = NULL; + + VERIFY3S(pthread_setspecific(kthread_key, kt), ==, 0); + + /* Only the main thread should be running at the moment */ + ASSERT3S(kthread_nr, ==, 0); + kthread_nr = 1; } static void thread_fini(void) { - kthread_t *kt; - struct timespec ts = { 0 }; - int count; + kthread_t *kt = curthread; + + ASSERT(pthread_equal(kt->t_tid, pthread_self())); + ASSERT3P(kt->t_func, ==, NULL); + + umem_free(kt, sizeof(kthread_t)); /* Wait for all threads to exit via thread_exit() */ VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0); - while ((count = thread_count()) > 1) { - clock_gettime(CLOCK_REALTIME, &ts); - ts.tv_sec += 1; - pthread_cond_timedwait(&kthread_cond, &kthread_lock, &ts); - } - ASSERT3S(thread_count(), ==, 1); - kt = 
list_head(&kthread_list); - list_remove(&kthread_list, kt); + kthread_nr--; /* Main thread is exiting */ + + while (kthread_nr > 0) + VERIFY3S(pthread_cond_wait(&kthread_cond, &kthread_lock), ==, + 0); + + ASSERT3S(kthread_nr, ==, 0); VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0); - VERIFY(pthread_attr_destroy(&kt->t_attr) == 0); - umem_free(kt, sizeof(kthread_t)); - - /* Cleanup list for tracking kthreads */ - list_destroy(&kthread_list); + VERIFY3S(pthread_key_delete(kthread_key), ==, 0); } kthread_t * zk_thread_current(void) { - kt_did_t tid = pthread_self(); - kthread_t *kt; - int count = 1; + kthread_t *kt = pthread_getspecific(kthread_key); - /* - * Because a newly created thread may call zk_thread_current() - * before the thread parent has had time to add the thread's tid - * to our lookup list. We will loop as long as there are tid - * which have not yet been set which must be one of ours. - * Yes it's a hack, at some point we can just use native pthreads. - */ - while (count > 0) { - count = 0; - VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0); - for (kt = list_head(&kthread_list); kt != NULL; - kt = list_next(&kthread_list, kt)) { - - if (kt->t_tid == tid) { - VERIFY3S(pthread_mutex_unlock( - &kthread_lock), ==, 0); - return kt; - } - - if (kt->t_tid == (kt_did_t)-1) - count++; - } - VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0); - } - - /* Unreachable */ - ASSERT(0); - return NULL; -} - -kthread_t * -zk_thread_create(caddr_t stk, size_t stksize, thread_func_t func, void *arg, - size_t len, void *pp, int state, pri_t pri) -{ - kthread_t *kt; - - kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL); - kt->t_tid = (kt_did_t)-1; - list_link_init(&kt->t_node); - VERIFY(pthread_attr_init(&kt->t_attr) == 0); - - VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0); - list_insert_head(&kthread_list, kt); - VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0); - - VERIFY3U(pthread_create(&kt->t_tid, &kt->t_attr, - (void *(*)(void *))func, arg), ==, 0); 
+ ASSERT3P(kt, !=, NULL); return kt; } -int -zk_thread_join(kt_did_t tid, kthread_t *dtid, void **status) +void * +zk_thread_helper(void *arg) { - return pthread_join(tid, status); + kthread_t *kt = (kthread_t *) arg; + + VERIFY3S(pthread_setspecific(kthread_key, kt), ==, 0); + + VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0); + kthread_nr++; + VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0); + + kt->t_tid = pthread_self(); + ((thread_func_arg_t) kt->t_func)(kt->t_arg); + + /* Unreachable, thread must exit with thread_exit() */ + abort(); + + return NULL; +} + +kthread_t * +zk_thread_create(caddr_t stk, size_t stksize, thread_func_t func, void *arg, + size_t len, void *pp, int state, pri_t pri) +{ + kthread_t *kt; + pthread_t tid; + pthread_attr_t attr; + size_t stack; + + /* + * Due to a race when getting/setting the thread ID, currently only + * detached threads are supported. + */ + ASSERT3S(state & ~TS_RUN, ==, 0); + + kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL); + kt->t_func = func; + kt->t_arg = arg; + + /* + * The Solaris kernel stack size in x86/x64 is 8K, so we reduce the + * default stack size in userspace, for sanity checking. + * + * PTHREAD_STACK_MIN is the stack required for a NULL procedure in + * userspace. + * + * XXX: Stack size for other architectures is not being taken into + * account. 
+ */ + stack = PTHREAD_STACK_MIN + MAX(stksize, STACK_SIZE); + + VERIFY3S(pthread_attr_init(&attr), ==, 0); + VERIFY3S(pthread_attr_setstacksize(&attr, stack), ==, 0); + VERIFY3S(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED), + ==, 0); + + VERIFY3S(pthread_create(&tid, &attr, &zk_thread_helper, kt), ==, 0); + + VERIFY3S(pthread_attr_destroy(&attr), ==, 0); + + return kt; } void zk_thread_exit(void) { - kthread_t *kt; + kthread_t *kt = curthread; - VERIFY3P(kt = curthread, !=, NULL); - VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0); - list_remove(&kthread_list, kt); - VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0); + ASSERT(pthread_equal(kt->t_tid, pthread_self())); - VERIFY(pthread_attr_destroy(&kt->t_attr) == 0); umem_free(kt, sizeof(kthread_t)); + pthread_mutex_lock(&kthread_lock); + kthread_nr--; + pthread_mutex_unlock(&kthread_lock); + pthread_cond_broadcast(&kthread_cond); pthread_exit(NULL); } diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index 42e2dd3f43..1efdf1d6fa 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -43,7 +43,6 @@ struct taskq { kcondvar_t tq_dispatch_cv; kcondvar_t tq_wait_cv; kthread_t **tq_threadlist; - kt_did_t *tq_idlist; int tq_flags; int tq_active; int tq_nthreads; @@ -135,7 +134,7 @@ taskq_wait(taskq_t *tq) mutex_exit(&tq->tq_lock); } -static void * +static void taskq_thread(void *arg) { taskq_t *tq = arg; @@ -165,7 +164,6 @@ taskq_thread(void *arg) cv_broadcast(&tq->tq_wait_cv); mutex_exit(&tq->tq_lock); thread_exit(); - return (NULL); } /*ARGSUSED*/ @@ -200,10 +198,8 @@ taskq_create(const char *name, int nthreads, pri_t pri, tq->tq_maxalloc = maxalloc; tq->tq_task.task_next = &tq->tq_task; tq->tq_task.task_prev = &tq->tq_task; - VERIFY3P((tq->tq_threadlist = kmem_alloc(tq->tq_nthreads * - sizeof(kthread_t *), KM_SLEEP)), !=, NULL); - VERIFY3P((tq->tq_idlist = kmem_alloc(tq->tq_nthreads * - sizeof(kt_did_t), KM_SLEEP)), !=, NULL); + tq->tq_threadlist = kmem_alloc(tq->tq_nthreads * 
sizeof(kthread_t *), + KM_SLEEP); if (flags & TASKQ_PREPOPULATE) { mutex_enter(&tq->tq_lock); @@ -214,8 +210,7 @@ taskq_create(const char *name, int nthreads, pri_t pri, for (t = 0; t < tq->tq_nthreads; t++) { VERIFY((tq->tq_threadlist[t] = thread_create(NULL, 0, - taskq_thread, tq, THR_BOUND, NULL, 0, 0)) != NULL); - tq->tq_idlist[t] = tq->tq_threadlist[t]->t_tid; + taskq_thread, tq, TS_RUN, NULL, 0, 0)) != NULL); } return (tq); @@ -224,7 +219,6 @@ taskq_create(const char *name, int nthreads, pri_t pri, void taskq_destroy(taskq_t *tq) { - int t; int nthreads = tq->tq_nthreads; taskq_wait(tq); @@ -245,11 +239,7 @@ taskq_destroy(taskq_t *tq) mutex_exit(&tq->tq_lock); - for (t = 0; t < nthreads; t++) - VERIFY3S(thread_join(tq->tq_idlist[t], NULL, NULL), ==, 0); - kmem_free(tq->tq_threadlist, nthreads * sizeof(kthread_t *)); - kmem_free(tq->tq_idlist, nthreads * sizeof(kt_did_t)); rw_destroy(&tq->tq_threadlock); mutex_destroy(&tq->tq_lock); @@ -260,7 +250,7 @@ taskq_destroy(taskq_t *tq) } int -taskq_member(taskq_t *tq, void *t) +taskq_member(taskq_t *tq, kthread_t *t) { int i; @@ -268,7 +258,7 @@ taskq_member(taskq_t *tq, void *t) return (1); for (i = 0; i < tq->tq_nthreads; i++) - if (tq->tq_threadlist[i] == (kthread_t *)t) + if (tq->tq_threadlist[i] == t) return (1); return (0); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 15745b3e98..b5fcc8c4a8 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -446,8 +446,6 @@ txg_sync_thread(dsl_pool_t *dp) rw_exit(&tx->tx_suspend); cv_broadcast(&tx->tx_sync_done_cv); } - - thread_exit(); } static void @@ -492,8 +490,6 @@ txg_quiesce_thread(dsl_pool_t *dp) cv_broadcast(&tx->tx_sync_more_cv); cv_broadcast(&tx->tx_quiesce_done_cv); } - - thread_exit(); } /* From 2b8502427595ecb5f703b5555e7f1c8f1b1bde8b Mon Sep 17 00:00:00 2001 From: "Ricardo M. Correia" Date: Tue, 9 Mar 2010 13:21:20 -0800 Subject: [PATCH 2/2] Use CPU percentages for number of commit cb threads. 
This doesn't change the number of threads in the kernel, but it reduces the number of threads in ztest (important due to 32-bit address space limitations). --- module/zfs/txg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/txg.c b/module/zfs/txg.c index b5fcc8c4a8..dc2ca3da6b 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -348,8 +348,8 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) * Commit callback taskq hasn't been created yet. */ tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", - max_ncpus, minclsyspri, max_ncpus, max_ncpus * 4, - TASKQ_PREPOPULATE); + 100, minclsyspri, max_ncpus, max_ncpus * 4, + TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); } tcb = kmem_alloc(sizeof (tx_cb_t), KM_SLEEP);