From 6fe1e37a5e9399eb24947e7b37ca7f3e276d8466 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 28 Jun 2010 23:45:08 -0700 Subject: [PATCH 1/8] Additional pthread related fixes for ztest There are 3 fixes in thie commit. First, update ztest_run() to store the thread id and not the address of the kthread_t. This will be freed on thread exit and is not safe to use. This is pretty close to how things were done in the original ztest code before I got there. Second, for extra paranoia update thread_exit() to return a special TS_MAGIC value via pthread_exit(). This value is then verified in pthread_join() to ensure the thread exited cleanly. This can be done cleanly because the kthread doesn't provide a return code mechanism we need to worry about. Third, replace the ztest deadman thread with a signal handler. We cannot use the previous approach because the correct behavior for pthreads is to wait for all threads to exit before terminating the process. Since the deadman thread won't call exit by design we end up hanging in kernel_exit(). To avoid this we just setup a SIGALRM signal handle and register a deadman alarm. IMHO this is simpler and cleaner anyway. --- cmd/ztest/ztest.c | 38 +++++++++++--------------- lib/libzpool/include/sys/zfs_context.h | 4 +-- lib/libzpool/kernel.c | 7 +++-- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 69086a34a2..5706055216 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -4904,21 +4904,12 @@ ztest_resume_thread(void *arg) return (NULL); } -static void * -ztest_deadman_thread(void *arg) +#define GRACE 300 + +static void +ztest_deadman_alarm(int sig) { - ztest_shared_t *zs = arg; - int grace = 300; - hrtime_t delta; - - delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace; - - (void) poll(NULL, 0, (int)(1000 * delta)); - - fatal(0, "failed to complete within %d seconds of deadline", grace); - thread_exit(); - - return (NULL); + fatal(0, "failed to complete within %d seconds of deadline", GRACE); } static void @@ -5106,7 +5097,7 @@ ztest_dataset_close(ztest_shared_t *zs, int d) static void ztest_run(ztest_shared_t *zs) { - kthread_t **tid; + kt_did_t *tid; spa_t *spa; kthread_t *resume_thread; uint64_t object; @@ -5158,10 +5149,10 @@ ztest_run(ztest_shared_t *zs) spa, TS_RUN, NULL, 0, 0)), !=, NULL); /* - * Create a deadman thread to abort() if we hang. + * Set a deadman alarm to abort() if we hang. */ - VERIFY3P(thread_create(NULL, 0, ztest_deadman_thread, zs, - TS_RUN, NULL, 0, 0), !=, NULL); + signal(SIGALRM, ztest_deadman_alarm); + alarm((zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + GRACE); /* * Verify that we can safely inquire about about any object, @@ -5187,7 +5178,7 @@ ztest_run(ztest_shared_t *zs) } zs->zs_enospc_count = 0; - tid = umem_zalloc(zopt_threads * sizeof (kthread_t *), UMEM_NOFAIL); + tid = umem_zalloc(zopt_threads * sizeof (kt_did_t), UMEM_NOFAIL); if (zopt_verbose >= 4) (void) printf("starting main threads...\n"); @@ -5196,11 +5187,14 @@ ztest_run(ztest_shared_t *zs) * Kick off all the tests that run in parallel. */ for (int t = 0; t < zopt_threads; t++) { + kthread_t *thread; + if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0) return; - VERIFY3P(tid[t] = thread_create(NULL, 0, ztest_thread, + VERIFY3P(thread = thread_create(NULL, 0, ztest_thread, (void *)(uintptr_t)t, TS_RUN, NULL, 0, 0), !=, NULL); + tid[t] = thread->t_tid; } /* @@ -5208,7 +5202,7 @@ ztest_run(ztest_shared_t *zs) * so we don't close datasets while threads are still using them. */ for (int t = zopt_threads - 1; t >= 0; t--) { - thread_join(tid[t]->t_tid); + thread_join(tid[t]); if (t < zopt_datasets) ztest_dataset_close(zs, t); } @@ -5218,7 +5212,7 @@ ztest_run(ztest_shared_t *zs) zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); - umem_free(tid, zopt_threads * sizeof (kthread_t *)); + umem_free(tid, zopt_threads * sizeof (kt_did_t)); /* Kill the resume thread */ ztest_exiting = B_TRUE; diff --git a/lib/libzpool/include/sys/zfs_context.h b/lib/libzpool/include/sys/zfs_context.h index e62c75a188..b0112e6ec4 100644 --- a/lib/libzpool/include/sys/zfs_context.h +++ b/lib/libzpool/include/sys/zfs_context.h @@ -197,7 +197,8 @@ _NOTE(CONSTCOND) } while (0) /* * Threads */ -#define TS_RUN 0x00000002 +#define TS_MAGIC 0x72f158ab4261e538ull +#define TS_RUN 0x00000002 #ifdef _linux_ #define STACK_SIZE 8192 /* Linux x86 and amd64 */ #else @@ -221,7 +222,6 @@ typedef struct kthread { void * t_arg; } kthread_t; -/* XXX tsd_create()/tsd_destroy() missing */ #define tsd_get(key) pthread_getspecific(key) #define tsd_set(key, val) pthread_setspecific(key, val) #define curthread zk_thread_current() diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index a992bec9ab..2c9d32be82 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -192,13 +192,16 @@ zk_thread_exit(void) pthread_mutex_unlock(&kthread_lock); pthread_cond_broadcast(&kthread_cond); - pthread_exit(NULL); + pthread_exit((void *)TS_MAGIC); } void zk_thread_join(kt_did_t tid) { - pthread_join((pthread_t)tid, NULL); + void *ret; + + pthread_join((pthread_t)tid, &ret); + VERIFY3P(ret, ==, (void *)TS_MAGIC); } /* From 7334572416255825f45a71447d78de710597dc54 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 29 Jun 2010 10:11:32 -0700 Subject: [PATCH 2/8] Reduce stack usage by dsl_dataset_destroy() Move dsl_dataset_t local variable from the stack to the heap. This reduces the stack usage of this function from 2048 bytes to 176 bytes for x84_64 arches. --- .topdeps | 1 + .topmsg | 8 ++++++++ module/zfs/dsl_dataset.c | 22 +++++++++++++--------- 3 files changed, 22 insertions(+), 9 deletions(-) create mode 100644 .topdeps create mode 100644 .topmsg diff --git a/.topdeps b/.topdeps new file mode 100644 index 0000000000..1f7391f92b --- /dev/null +++ b/.topdeps @@ -0,0 +1 @@ +master diff --git a/.topmsg b/.topmsg new file mode 100644 index 0000000000..14624fabbc --- /dev/null +++ b/.topmsg @@ -0,0 +1,8 @@ +From: Brian Behlendorf +Subject: [PATCH] fix stack dsl_dataset_destroy + +Move dsl_dataset_t local variable from the stack to the heap. +This reduces the stack usage of this function from 2048 bytes +to 176 bytes for x84_64 arches. + +Signed-off-by: Brian Behlendorf diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index ddd83576c6..0f7bc0f81a 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -1033,7 +1033,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) dsl_dir_t *dd; uint64_t obj; struct dsl_ds_destroyarg dsda = { 0 }; - dsl_dataset_t dummy_ds = { 0 }; + dsl_dataset_t *dummy_ds; dsda.ds = ds; @@ -1053,8 +1053,9 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) } dd = ds->ds_dir; - dummy_ds.ds_dir = dd; - dummy_ds.ds_object = ds->ds_object; + dummy_ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); + dummy_ds->ds_dir = dd; + dummy_ds->ds_object = ds->ds_object; /* * Check for errors and mark this ds as inconsistent, in @@ -1063,11 +1064,11 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, dsl_dataset_destroy_begin_sync, ds, NULL, 0); if (err) - goto out; + goto out_free; err = dmu_objset_from_ds(ds, &os); if (err) - goto out; + goto out_free; /* * remove the objects in open context, so that we won't @@ -1104,14 +1105,14 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) } if (err != ESRCH) - goto out; + goto out_free; rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); rw_exit(&dd->dd_pool->dp_config_rwlock); if (err) - goto out; + goto out_free; /* * Blow away the dsl_dir + head dataset. @@ -1127,7 +1128,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) err = dsl_dataset_origin_rm_prep(&dsda, tag); if (err) { dsl_dir_close(dd, FTAG); - goto out; + goto out_free; } } @@ -1135,7 +1136,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) dsl_sync_task_create(dstg, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, &dsda, tag, 0); dsl_sync_task_create(dstg, dsl_dir_destroy_check, - dsl_dir_destroy_sync, &dummy_ds, FTAG, 0); + dsl_dir_destroy_sync, dummy_ds, FTAG, 0); err = dsl_sync_task_group_wait(dstg); dsl_sync_task_group_destroy(dstg); @@ -1158,6 +1159,9 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) /* if it is successful, dsl_dir_destroy_sync will close the dd */ if (err) dsl_dir_close(dd, FTAG); + +out_free: + kmem_free(dummy_ds, sizeof (dsl_dataset_t)); out: dsl_dataset_disown(ds, tag); return (err); From eabeea3b7c298c286b151962aa047237f551abcd Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 29 Jun 2010 10:12:00 -0700 Subject: [PATCH 3/8] New TopGit dependency: fix-stack-dsl_dataset_destroy --- .topdeps | 1 + 1 file changed, 1 insertion(+) diff --git a/.topdeps b/.topdeps index 3ed5fa8b2f..4d6117617f 100644 --- a/.topdeps +++ b/.topdeps @@ -19,3 +19,4 @@ fix-stack-dsl_dir_open_spa fix-stack-dsl_deleg_get fix-stack-noinline fix-stack-dmu_objset_snapshot +fix-stack-dsl_dataset_destroy From edbb93c11d6470619d3c3b8f0dee41a092687364 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 29 Jun 2010 10:14:37 -0700 Subject: [PATCH 4/8] Reduce stack usage by vn_open() We should not put a 4k maxpathlen buffer on the stack, instead locate it to the heap. Even in user space we run ztest with 8K --- .topdeps | 1 + .topmsg | 8 ++++++++ lib/libzpool/kernel.c | 23 +++++++++++++++++------ 3 files changed, 26 insertions(+), 6 deletions(-) create mode 100644 .topdeps create mode 100644 .topmsg diff --git a/.topdeps b/.topdeps new file mode 100644 index 0000000000..1f7391f92b --- /dev/null +++ b/.topdeps @@ -0,0 +1 @@ +master diff --git a/.topmsg b/.topmsg new file mode 100644 index 0000000000..9ae8bcda7d --- /dev/null +++ b/.topmsg @@ -0,0 +1,8 @@ +From: Brian Behlendorf +Subject: [PATCH] fix stack vn_open + +We should not put a 4k maxpathlen buffer on the stack, instead +locate it to the heap. Even in user space we run ztest with 8K +stacks to verify correctness + +Signed-off-by: Brian Behlendorf diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 5284c12532..89c8975846 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -327,9 +327,11 @@ vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) int fd; vnode_t *vp; int old_umask; - char realpath[MAXPATHLEN]; + char *realpath; struct stat64 st; + realpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + /* * If we're accessing a real disk from userland, we need to use * the character interface to avoid caching. This is particularly @@ -343,11 +345,16 @@ vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) if (strncmp(path, "/dev/", 5) == 0) { char *dsk; fd = open64(path, O_RDONLY); - if (fd == -1) - return (errno); + if (fd == -1) { + err = errno; + free(realpath); + return (err); + } if (fstat64(fd, &st) == -1) { + err = errno; close(fd); - return (errno); + free(realpath); + return (err); } close(fd); (void) sprintf(realpath, "%s", path); @@ -357,8 +364,11 @@ vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) dsk + 1); } else { (void) sprintf(realpath, "%s", path); - if (!(flags & FCREAT) && stat64(realpath, &st) == -1) - return (errno); + if (!(flags & FCREAT) && stat64(realpath, &st) == -1) { + err = errno; + free(realpath); + return (err); + } } if (flags & FCREAT) @@ -369,6 +379,7 @@ vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR. */ fd = open64(realpath, flags - FREAD, mode); + free(realpath); if (flags & FCREAT) (void) umask(old_umask); From 53324fbd1f0c5127efed0e6229946b660bfb71ee Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 29 Jun 2010 10:15:02 -0700 Subject: [PATCH 5/8] New TopGit dependency: fix-stack-vn_open --- .topdeps | 1 + 1 file changed, 1 insertion(+) diff --git a/.topdeps b/.topdeps index 4d6117617f..9cd7933b53 100644 --- a/.topdeps +++ b/.topdeps @@ -20,3 +20,4 @@ fix-stack-dsl_deleg_get fix-stack-noinline fix-stack-dmu_objset_snapshot fix-stack-dsl_dataset_destroy +fix-stack-vn_open From 8e2de85a6df8f8105ce96f2ac8f1c8b4f732d807 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 29 Jun 2010 10:21:21 -0700 Subject: [PATCH 6/8] Reduce stack usage of traverse_impl() Stack use reduced from 560 bytes to 128 bytes. --- .topdeps | 1 + .topmsg | 6 ++++ module/zfs/dmu_traverse.c | 65 +++++++++++++++++++++++---------------- 3 files changed, 45 insertions(+), 27 deletions(-) create mode 100644 .topdeps create mode 100644 .topmsg diff --git a/.topdeps b/.topdeps new file mode 100644 index 0000000000..1f7391f92b --- /dev/null +++ b/.topdeps @@ -0,0 +1 @@ +master diff --git a/.topmsg b/.topmsg new file mode 100644 index 0000000000..f7bfb40f9c --- /dev/null +++ b/.topmsg @@ -0,0 +1,6 @@ +From: Brian Behlendorf +Subject: [PATCH] fix stack traverse_impl + +Stack use reduced from 560 bytes to 128 bytes. + +Signed-off-by: Brian Behlendorf diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 429c76ae11..85b5f59cd6 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -356,43 +356,54 @@ static int traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { - struct traverse_data td; - struct prefetch_data pd = { 0 }; - zbookmark_t czb; + struct traverse_data *td; + struct prefetch_data *pd; + zbookmark_t *czb; int err; - td.td_spa = spa; - td.td_objset = objset; - td.td_rootbp = rootbp; - td.td_min_txg = txg_start; - td.td_func = func; - td.td_arg = arg; - td.td_pfd = &pd; - td.td_flags = flags; + td = kmem_alloc(sizeof(struct traverse_data), KM_SLEEP); + pd = kmem_alloc(sizeof(struct prefetch_data), KM_SLEEP); + czb = kmem_alloc(sizeof(zbookmark_t), KM_SLEEP); - pd.pd_blks_max = 100; - pd.pd_flags = flags; - mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); - cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); + td->td_spa = spa; + td->td_objset = objset; + td->td_rootbp = rootbp; + td->td_min_txg = txg_start; + td->td_func = func; + td->td_arg = arg; + td->td_pfd = pd; + td->td_flags = flags; + + pd->pd_blks_max = 100; + pd->pd_blks_fetched = 0; + pd->pd_flags = flags; + pd->pd_cancel = B_FALSE; + pd->pd_exited = B_FALSE; + mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL); if (!(flags & TRAVERSE_PREFETCH) || 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, - &td, TQ_NOQUEUE)) - pd.pd_exited = B_TRUE; + td, TQ_NOQUEUE)) + pd->pd_exited = B_TRUE; - SET_BOOKMARK(&czb, objset, + SET_BOOKMARK(czb, objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb); + err = traverse_visitbp(td, NULL, NULL, rootbp, czb); - mutex_enter(&pd.pd_mtx); - pd.pd_cancel = B_TRUE; - cv_broadcast(&pd.pd_cv); - while (!pd.pd_exited) - cv_wait(&pd.pd_cv, &pd.pd_mtx); - mutex_exit(&pd.pd_mtx); + mutex_enter(&pd->pd_mtx); + pd->pd_cancel = B_TRUE; + cv_broadcast(&pd->pd_cv); + while (!pd->pd_exited) + cv_wait(&pd->pd_cv, &pd->pd_mtx); + mutex_exit(&pd->pd_mtx); - mutex_destroy(&pd.pd_mtx); - cv_destroy(&pd.pd_cv); + mutex_destroy(&pd->pd_mtx); + cv_destroy(&pd->pd_cv); + + kmem_free(czb, sizeof(zbookmark_t)); + kmem_free(pd, sizeof(struct prefetch_data)); + kmem_free(td, sizeof(struct traverse_data)); return (err); } From 607b45509098ee157ce7be83cd5c1d744f98aaef Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 29 Jun 2010 10:21:50 -0700 Subject: [PATCH 7/8] New TopGit dependency: fix-stack-traverse_impl --- .topdeps | 1 + 1 file changed, 1 insertion(+) diff --git a/.topdeps b/.topdeps index 9cd7933b53..c6bffb3336 100644 --- a/.topdeps +++ b/.topdeps @@ -21,3 +21,4 @@ fix-stack-noinline fix-stack-dmu_objset_snapshot fix-stack-dsl_dataset_destroy fix-stack-vn_open +fix-stack-traverse_impl From 9cdd80e387d5ee10a367de12a816937812818218 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 29 Jun 2010 10:24:14 -0700 Subject: [PATCH 8/8] Revert traverse_impl() changes These changes are now taken care of by the fix-stack-traverse_impl topic branch which not only solves the uninit problem but also moves these locals off the stack and on to the heap. --- module/zfs/dmu_traverse.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 3350c08bc1..429c76ae11 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -357,7 +357,7 @@ traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { struct traverse_data td; - struct prefetch_data pd; + struct prefetch_data pd = { 0 }; zbookmark_t czb; int err; @@ -371,10 +371,7 @@ traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp, td.td_flags = flags; pd.pd_blks_max = 100; - pd.pd_blks_fetched = 0; pd.pd_flags = flags; - pd.pd_cancel = B_FALSE; - pd.pd_exited = B_FALSE; mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);