diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index f30c9427e9..e73be52f3e 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -125,7 +125,8 @@ COMMON_H = \
 	$(top_srcdir)/include/sys/zio.h \
 	$(top_srcdir)/include/sys/zio_impl.h \
 	$(top_srcdir)/include/sys/zio_priority.h \
-	$(top_srcdir)/include/sys/zrlock.h
+	$(top_srcdir)/include/sys/zrlock.h \
+	$(top_srcdir)/include/sys/zthr.h
 
 KERNEL_H = \
 	$(top_srcdir)/include/sys/zfs_ioctl.h \
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 1741eb9e5c..f49138d0f6 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -45,6 +45,7 @@
 #include
 #include
 #include
+#include <sys/zthr.h>
 #include
 
 #ifdef __cplusplus
@@ -268,7 +269,7 @@ struct spa {
 	spa_condensing_indirect_phys_t	spa_condensing_indirect_phys;
 	spa_condensing_indirect_t	*spa_condensing_indirect;
 
-	kthread_t	*spa_condense_thread;	/* thread doing condense. */
+	zthr_t		*spa_condense_zthr;	/* zthr doing condense. */
 
 	char		*spa_root;		/* alternate root directory */
 	uint64_t	spa_ena;		/* spa-wide ereport ENA */
diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h
index 80ee09456a..fb6c8c0cec 100644
--- a/include/sys/vdev_removal.h
+++ b/include/sys/vdev_removal.h
@@ -76,7 +76,7 @@ extern int spa_remove_init(spa_t *);
 extern void spa_restart_removal(spa_t *);
 extern int spa_condense_init(spa_t *);
 extern void spa_condense_fini(spa_t *);
-extern void spa_condense_indirect_restart(spa_t *);
+extern void spa_start_indirect_condensing_thread(spa_t *);
 extern void spa_vdev_condense_suspend(spa_t *);
 extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
 extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t, uint64_t);
diff --git a/include/sys/zthr.h b/include/sys/zthr.h
new file mode 100644
index 0000000000..6bfb6b6c0d
--- /dev/null
+++ b/include/sys/zthr.h
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZTHR_H
+#define _SYS_ZTHR_H
+
+typedef struct zthr zthr_t;
+typedef int (zthr_func_t)(void *, zthr_t *);
+typedef boolean_t (zthr_checkfunc_t)(void *, zthr_t *);
+
+struct zthr {
+	kthread_t	*zthr_thread;
+	kmutex_t	zthr_lock;
+	kcondvar_t	zthr_cv;
+	boolean_t	zthr_cancel;
+
+	zthr_checkfunc_t	*zthr_checkfunc;
+	zthr_func_t	*zthr_func;
+	void		*zthr_arg;
+	int		zthr_rc;
+};
+
+extern zthr_t *zthr_create(zthr_checkfunc_t checkfunc,
+    zthr_func_t *func, void *arg);
+extern void zthr_exit(zthr_t *t, int rc);
+extern void zthr_destroy(zthr_t *t);
+
+extern void zthr_wakeup(zthr_t *t);
+extern int zthr_cancel(zthr_t *t);
+extern void zthr_resume(zthr_t *t);
+
+extern boolean_t zthr_iscancelled(zthr_t *t);
+extern boolean_t zthr_isrunning(zthr_t *t);
+
+#endif /* _SYS_ZTHR_H */
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index d4fc201fa7..ec80ccf543 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -147,7 +147,8 @@ KERNEL_C = \
 	zio_crypt.c \
 	zio_inject.c \
 	zle.c \
-	zrlock.c
+	zrlock.c \
+	zthr.c
 
 LUA_C = \
 	lapi.c \
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index fefb296545..fe50107731 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -133,6 +133,7 @@ $(MODULE)-objs += zpl_inode.o
 $(MODULE)-objs += zpl_super.o
 $(MODULE)-objs += zpl_xattr.o
 $(MODULE)-objs += zrlock.o
+$(MODULE)-objs += zthr.o
 $(MODULE)-objs += zvol.o
 $(MODULE)-objs += dsl_destroy.o
 $(MODULE)-objs += dsl_userhold.o
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 9a5346b42a..0aa9dcc4b4 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1400,6 +1400,12 @@ spa_unload(spa_t *spa)
 		spa->spa_vdev_removal = NULL;
 	}
 
+	if (spa->spa_condense_zthr != NULL) {
+		ASSERT(!zthr_isrunning(spa->spa_condense_zthr));
+		zthr_destroy(spa->spa_condense_zthr);
+		spa->spa_condense_zthr = NULL;
+	}
+
 	spa_condense_fini(spa);
 
 	bpobj_close(&spa->spa_deferred_bpobj);
@@ -2180,6 +2186,16 @@ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
 	return (SET_ERROR(err));
 }
 
+static void
+spa_spawn_aux_threads(spa_t *spa)
+{
+	ASSERT(spa_writeable(spa));
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	spa_start_indirect_condensing_thread(spa);
+}
+
 /*
  * Fix up config after a partly-completed split. This is done with the
  * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
@@ -3244,18 +3260,6 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 	int need_update = B_FALSE;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 
-	/*
-	 * We must check this before we start the sync thread, because
-	 * we only want to start a condense thread for condense
-	 * operations that were in progress when the pool was
-	 * imported. Once we start syncing, spa_sync() could
-	 * initiate a condense (and start a thread for it). In
-	 * that case it would be wrong to start a second
-	 * condense thread.
-	 */
-	boolean_t condense_in_progress =
-	    (spa->spa_condensing_indirect != NULL);
-
 	ASSERT(state != SPA_LOAD_TRYIMPORT);
 
 	/*
@@ -3336,15 +3340,9 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 		 */
 		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
 
-		/*
-		 * Note: unlike condensing, we don't need an analogous
-		 * "removal_in_progress" dance because no other thread
-		 * can start a removal while we hold the spa_namespace_lock.
-		 */
 		spa_restart_removal(spa);
 
-		if (condense_in_progress)
-			spa_condense_indirect_restart(spa);
+		spa_spawn_aux_threads(spa);
 	}
 
 	return (0);
 }
@@ -4353,6 +4351,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 	if (dp->dp_root_dir->dd_crypto_obj != 0)
 		VERIFY0(spa_keystore_remove_mapping(spa, root_dsobj, FTAG));
 
+	spa_spawn_aux_threads(spa);
+
 	spa_write_cachefile(spa, B_FALSE, B_TRUE);
 
 	/*
@@ -6059,12 +6059,15 @@ spa_async_suspend(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_suspended++;
-	while (spa->spa_async_thread != NULL ||
-	    spa->spa_condense_thread != NULL)
+	while (spa->spa_async_thread != NULL)
 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 	mutex_exit(&spa->spa_async_lock);
 
 	spa_vdev_remove_suspend(spa);
+
+	zthr_t *condense_thread = spa->spa_condense_zthr;
+	if (condense_thread != NULL && zthr_isrunning(condense_thread))
+		VERIFY0(zthr_cancel(condense_thread));
 }
 
 void
@@ -6075,6 +6078,10 @@ spa_async_resume(spa_t *spa)
 	spa->spa_async_suspended--;
 	mutex_exit(&spa->spa_async_lock);
 	spa_restart_removal(spa);
+
+	zthr_t *condense_thread = spa->spa_condense_zthr;
+	if (condense_thread != NULL && !zthr_isrunning(condense_thread))
+		zthr_resume(condense_thread);
 }
 
 static boolean_t
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index b30ddaf266..7f172e2971 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -31,6 +31,8 @@
 #include
 #include
 #include
+#include
+#include
 
 /*
  * An indirect vdev corresponds to a vdev that has been removed. Since
@@ -569,7 +571,7 @@ spa_condense_indirect_commit_entry(spa_t *spa,
 
 static void
 spa_condense_indirect_generate_new_mapping(vdev_t *vd,
-    uint32_t *obsolete_counts, uint64_t start_index)
+    uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint64_t mapi = start_index;
@@ -584,7 +586,15 @@ spa_condense_indirect_generate_new_mapping(vdev_t *vd,
 	    (u_longlong_t)vd->vdev_id,
 	    (u_longlong_t)mapi);
 
-	while (mapi < old_num_entries && !spa_shutting_down(spa)) {
+	while (mapi < old_num_entries) {
+
+		if (zthr_iscancelled(zthr)) {
+			zfs_dbgmsg("pausing condense of vdev %llu "
+			    "at index %llu", (u_longlong_t)vd->vdev_id,
+			    (u_longlong_t)mapi);
+			break;
+		}
+
 		vdev_indirect_mapping_entry_phys_t *entry =
 		    &old_mapping->vim_entries[mapi];
 		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
@@ -605,18 +615,30 @@ spa_condense_indirect_generate_new_mapping(vdev_t *vd,
 		mapi++;
 	}
 
-	if (spa_shutting_down(spa)) {
-		zfs_dbgmsg("pausing condense of vdev %llu at index %llu",
-		    (u_longlong_t)vd->vdev_id,
-		    (u_longlong_t)mapi);
-	}
 }
 
-static void
-spa_condense_indirect_thread(void *arg)
+/* ARGSUSED */
+static boolean_t
+spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
 {
-	vdev_t *vd = arg;
-	spa_t *spa = vd->vdev_spa;
+	spa_t *spa = arg;
+
+	return (spa->spa_condensing_indirect != NULL);
+}
+
+/* ARGSUSED */
+static int
+spa_condense_indirect_thread(void *arg, zthr_t *zthr)
+{
+	spa_t *spa = arg;
+	vdev_t *vd;
+
+	ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+	vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
+	ASSERT3P(vd, !=, NULL);
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
 	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
 	spa_condensing_indirect_phys_t *scip =
 	    &spa->spa_condensing_indirect_phys;
@@ -690,25 +712,24 @@ spa_condense_indirect_thread(void *arg)
 		}
 	}
 
-	spa_condense_indirect_generate_new_mapping(vd, counts, start_index);
+	spa_condense_indirect_generate_new_mapping(vd, counts,
+	    start_index, zthr);
 
 	vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
 
 	/*
-	 * We may have bailed early from generate_new_mapping(), if
-	 * the spa is shutting down. In this case, do not complete
-	 * the condense.
+	 * If the zthr has received a cancellation signal while running
+	 * in generate_new_mapping() or at any point after that, then bail
+	 * early. We don't want to complete the condense if the spa is
+	 * shutting down.
 	 */
-	if (!spa_shutting_down(spa)) {
-		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
-		    spa_condense_indirect_complete_sync, sci, 0,
-		    ZFS_SPACE_CHECK_NONE));
-	}
+	if (zthr_iscancelled(zthr))
+		return (0);
 
-	mutex_enter(&spa->spa_async_lock);
-	spa->spa_condense_thread = NULL;
-	cv_broadcast(&spa->spa_async_cv);
-	mutex_exit(&spa->spa_async_lock);
+	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+	    spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_NONE));
+
+	return (0);
 }
 
 /*
@@ -761,9 +782,7 @@ spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
 	    (u_longlong_t)scip->scip_prev_obsolete_sm_object,
 	    (u_longlong_t)scip->scip_next_mapping_object);
 
-	ASSERT3P(spa->spa_condense_thread, ==, NULL);
-	spa->spa_condense_thread = thread_create(NULL, 0,
-	    spa_condense_indirect_thread, vd, 0, &p0, TS_RUN, minclsyspri);
+	zthr_wakeup(spa->spa_condense_zthr);
 }
 
 /*
@@ -840,24 +859,12 @@ spa_condense_fini(spa_t *spa)
 	}
 }
 
-/*
- * Restart the condense - called when the pool is opened.
- */
 void
-spa_condense_indirect_restart(spa_t *spa)
+spa_start_indirect_condensing_thread(spa_t *spa)
 {
-	vdev_t *vd;
-	ASSERT(spa->spa_condensing_indirect != NULL);
-	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-	vd = vdev_lookup_top(spa,
-	    spa->spa_condensing_indirect_phys.scip_vdev);
-	ASSERT(vd != NULL);
-	spa_config_exit(spa, SCL_VDEV, FTAG);
-
-	ASSERT3P(spa->spa_condense_thread, ==, NULL);
-	spa->spa_condense_thread = thread_create(NULL, 0,
-	    spa_condense_indirect_thread, vd, 0, &p0, TS_RUN,
-	    minclsyspri);
+	ASSERT3P(spa->spa_condense_zthr, ==, NULL);
+	spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
+	    spa_condense_indirect_thread, spa);
 }
 
 /*
@@ -1612,7 +1619,7 @@ vdev_ops_t vdev_indirect_ops = {
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(rs_alloc);
 EXPORT_SYMBOL(spa_condense_fini);
-EXPORT_SYMBOL(spa_condense_indirect_restart);
+EXPORT_SYMBOL(spa_start_indirect_condensing_thread);
 EXPORT_SYMBOL(spa_condense_indirect_start_sync);
 EXPORT_SYMBOL(spa_condense_init);
 EXPORT_SYMBOL(spa_vdev_indirect_mark_obsolete);
diff --git a/module/zfs/zthr.c b/module/zfs/zthr.c
new file mode 100644
index 0000000000..9beb7e128f
--- /dev/null
+++ b/module/zfs/zthr.c
@@ -0,0 +1,319 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * ZTHR Infrastructure
+ * ===================
+ *
+ * ZTHR threads are used for isolated operations that span multiple txgs
+ * within a SPA. They generally exist from SPA creation/loading and until
+ * the SPA is exported/destroyed.
+ * The ideal requirements for an operation
+ * to be modeled with a zthr are the following:
+ *
+ * 1] The operation needs to run over multiple txgs.
+ * 2] There must be a single point of reference in memory or on disk that
+ *    indicates whether the operation should run/is running or is
+ *    stopped.
+ *
+ * If the operation satisfies the above then the following rules guarantee
+ * a certain level of correctness:
+ *
+ * 1] Any thread EXCEPT the zthr can change the work indicator from stopped
+ *    to running but not the opposite.
+ * 2] Only the zthr can change the work indicator from running to stopped
+ *    (e.g. when it is done) but not the opposite.
+ *
+ * This way a normal zthr cycle should go like this:
+ *
+ * 1] An external thread changes the work indicator from stopped to
+ *    running and wakes up the zthr.
+ * 2] The zthr wakes up, checks the indicator and starts working.
+ * 3] When the zthr is done, it changes the indicator to stopped, allowing
+ *    a new cycle to start.
+ *
+ * == ZTHR creation
+ *
+ * Every zthr needs three inputs to start running:
+ *
+ * 1] A user-defined checker function (checkfunc) that decides whether
+ *    the zthr should start working or go to sleep. The function should
+ *    return TRUE when the zthr needs to work or FALSE to let it sleep,
+ *    and should adhere to the following signature:
+ *        boolean_t checkfunc_name(void *args, zthr_t *t);
+ *
+ * 2] A user-defined ZTHR function (func) which the zthr executes when
+ *    it is not sleeping. The function should adhere to the following
+ *    signature type:
+ *        int func_name(void *args, zthr_t *t);
+ *
+ * 3] A void args pointer that will be passed to checkfunc and func
+ *    implicitly by the infrastructure.
+ *
+ * The reason why the above API needs two different functions,
+ * instead of one that both checks and does the work, has to do with
+ * the zthr's internal lock (zthr_lock) and the allowed cancellation
+ * windows. We want to hold the zthr_lock while running checkfunc
+ * but not while running func. This way the zthr can be cancelled
+ * while doing work and not while checking for work.
+ *
+ * To start a zthr:
+ *      zthr_t *zthr_pointer = zthr_create(checkfunc, func, args);
+ *
+ * After that you should be able to wake up, cancel, and resume the
+ * zthr from another thread using zthr_pointer.
+ *
+ * NOTE: ZTHR threads could potentially wake up spuriously and the
+ * user should take this into account when writing a checkfunc.
+ * [see ZTHR state transitions]
+ *
+ * == ZTHR cancellation
+ *
+ * ZTHR threads must be cancelled when their SPA is being exported
+ * or when they need to be paused so they don't interfere with other
+ * operations.
+ *
+ * To cancel a zthr:
+ *      zthr_cancel(zthr_pointer);
+ *
+ * To resume it:
+ *      zthr_resume(zthr_pointer);
+ *
+ * A zthr will implicitly check if it has received a cancellation
+ * signal every time func returns and every time it wakes up [see ZTHR
+ * state transitions below].
+ *
+ * Waiting for the zthr's func to finish its job can take a long time,
+ * which is a problem for operations that need to cancel the SPA's
+ * zthrs quickly (e.g. spa_export). For this scenario, the user can
+ * explicitly make their ZTHR function aware of incoming cancellation
+ * signals using zthr_iscancelled(). A common pattern for that looks
+ * like this:
+ *
+ *      int
+ *      func_name(void *args, zthr_t *t)
+ *      {
+ *          ... ...
+ *          while (!work_done && !zthr_iscancelled(t)) {
+ *              ... ...
+ *          }
+ *          return (0);
+ *      }
+ *
+ * == ZTHR exit
+ *
+ * For the rare cases where the zthr wants to stop running voluntarily
+ * while running its ZTHR function (func), we provide zthr_exit().
+ * When a zthr has voluntarily stopped running, it can be resumed with
+ * zthr_resume(), just like it would if it was cancelled by some other
+ * thread.
+ *
+ * == ZTHR cleanup
+ *
+ * Cancelling a zthr doesn't clean up its metadata (internal locks,
+ * function pointers to func and checkfunc, etc.). This is because
+ * we want to keep them around in case we want to resume the execution
+ * of the zthr later. The same applies to zthrs that exit themselves.
+ *
+ * To completely clean up a zthr, cancel it first to ensure that it
+ * is not running and then use zthr_destroy().
+ *
+ * == ZTHR state transitions
+ *
+ *      zthr creation
+ *        +
+ *        |
+ *        | woke up
+ *        |   +---------------+ sleep
+ *        |   |               ^
+ *        |   |               |
+ *        |   |               | FALSE
+ *        |   |               |
+ *        v   v      FALSE    +
+ *      cancelled? +--------> checkfunc?
+ *        +   ^               +
+ *        |   |               |
+ *        |   |               | TRUE
+ *        |   |               |
+ *        |   | func returned v
+ *        |   +---------------+ func
+ *        |
+ *        | TRUE
+ *        |
+ *        v
+ *      zthr stopped running
+ *
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zthr.h>
+
+void
+zthr_exit(zthr_t *t, int rc)
+{
+	ASSERT3P(t->zthr_thread, ==, curthread);
+	mutex_enter(&t->zthr_lock);
+	t->zthr_thread = NULL;
+	t->zthr_rc = rc;
+	cv_broadcast(&t->zthr_cv);
+	mutex_exit(&t->zthr_lock);
+	thread_exit();
+}
+
+static void
+zthr_procedure(void *arg)
+{
+	zthr_t *t = arg;
+	int rc = 0;
+
+	mutex_enter(&t->zthr_lock);
+	while (!t->zthr_cancel) {
+		if (t->zthr_checkfunc(t->zthr_arg, t)) {
+			mutex_exit(&t->zthr_lock);
+			rc = t->zthr_func(t->zthr_arg, t);
+			mutex_enter(&t->zthr_lock);
+		} else {
+			/* go to sleep */
+			cv_wait(&t->zthr_cv, &t->zthr_lock);
+		}
+	}
+	mutex_exit(&t->zthr_lock);
+
+	zthr_exit(t, rc);
+}
+
+zthr_t *
+zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg)
+{
+	zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP);
+	mutex_init(&t->zthr_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL);
+
+	mutex_enter(&t->zthr_lock);
+	t->zthr_checkfunc = checkfunc;
+	t->zthr_func = func;
+	t->zthr_arg = arg;
+
+	t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
+	    0, &p0, TS_RUN, minclsyspri);
+	mutex_exit(&t->zthr_lock);
+
+	return (t);
+}
+
+void
+zthr_destroy(zthr_t *t)
+{
+	VERIFY3P(t->zthr_thread, ==, NULL);
+	mutex_destroy(&t->zthr_lock);
+	cv_destroy(&t->zthr_cv);
+	kmem_free(t, sizeof (*t));
+}
+
+/*
+ * Note: If the zthr is not sleeping and misses the wakeup
+ * (e.g. it is running its ZTHR function), it will check if
+ * there is work to do before going to sleep using its checker
+ * function [see ZTHR state transition in ZTHR block comment].
+ * Thus, missing the wakeup still yields the expected behavior.
+ */
+void
+zthr_wakeup(zthr_t *t)
+{
+	ASSERT3P(t->zthr_thread, !=, NULL);
+
+	mutex_enter(&t->zthr_lock);
+	cv_broadcast(&t->zthr_cv);
+	mutex_exit(&t->zthr_lock);
+}
+
+/*
+ * Note: If the zthr is not running (e.g. has been cancelled
+ * already), this is a no-op.
+ */ +int +zthr_cancel(zthr_t *t) +{ + int rc = 0; + + mutex_enter(&t->zthr_lock); + + /* broadcast in case the zthr is sleeping */ + cv_broadcast(&t->zthr_cv); + + t->zthr_cancel = B_TRUE; + while (t->zthr_thread != NULL) + cv_wait(&t->zthr_cv, &t->zthr_lock); + t->zthr_cancel = B_FALSE; + rc = t->zthr_rc; + mutex_exit(&t->zthr_lock); + + return (rc); +} + +void +zthr_resume(zthr_t *t) +{ + ASSERT3P(t->zthr_thread, ==, NULL); + + mutex_enter(&t->zthr_lock); + + ASSERT3P(&t->zthr_checkfunc, !=, NULL); + ASSERT3P(&t->zthr_func, !=, NULL); + ASSERT(!t->zthr_cancel); + + t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t, + 0, &p0, TS_RUN, minclsyspri); + + mutex_exit(&t->zthr_lock); +} + +/* + * This function is intended to be used by the zthr itself + * to check if another thread has signal it to stop running. + * + * returns TRUE if we are in the middle of trying to cancel + * this thread. + * + * returns FALSE otherwise. + */ +boolean_t +zthr_iscancelled(zthr_t *t) +{ + boolean_t cancelled; + + ASSERT3P(t->zthr_thread, ==, curthread); + + mutex_enter(&t->zthr_lock); + cancelled = t->zthr_cancel; + mutex_exit(&t->zthr_lock); + + return (cancelled); +} + +boolean_t +zthr_isrunning(zthr_t *t) +{ + boolean_t running; + + mutex_enter(&t->zthr_lock); + running = (t->zthr_thread != NULL); + mutex_exit(&t->zthr_lock); + + return (running); +}