diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 2c0d6ecf5a..5aa9426e19 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -52,7 +52,6 @@ #include #include #undef ZFS_MAXNAMELEN -#undef verify #include const char cmdname[] = "zdb"; diff --git a/cmd/zinject/translate.c b/cmd/zinject/translate.c index d8654594d1..4127339ed1 100644 --- a/cmd/zinject/translate.c +++ b/cmd/zinject/translate.c @@ -25,8 +25,6 @@ #include -#undef verify /* both libzfs.h and zfs_context.h want to define this */ - #include #include diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 19742cf8f1..d95dcf0b11 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -138,7 +138,7 @@ typedef struct ztest_args { spa_t *za_spa; objset_t *za_os; zilog_t *za_zilog; - thread_t za_thread; + pthread_t za_thread; uint64_t za_instance; uint64_t za_random; uint64_t za_diroff; @@ -164,6 +164,7 @@ typedef void ztest_func_t(ztest_args_t *); ztest_func_t ztest_dmu_read_write; ztest_func_t ztest_dmu_write_parallel; ztest_func_t ztest_dmu_object_alloc_free; +ztest_func_t ztest_dmu_commit_callbacks; ztest_func_t ztest_zap; ztest_func_t ztest_zap_parallel; ztest_func_t ztest_traverse; @@ -198,6 +199,7 @@ ztest_info_t ztest_info[] = { { ztest_dmu_read_write, 1, &zopt_always }, { ztest_dmu_write_parallel, 30, &zopt_always }, { ztest_dmu_object_alloc_free, 1, &zopt_always }, + { ztest_dmu_commit_callbacks, 10, &zopt_always }, { ztest_zap, 30, &zopt_always }, { ztest_zap_parallel, 100, &zopt_always }, { ztest_dsl_prop_get_set, 1, &zopt_sometimes }, @@ -217,22 +219,33 @@ ztest_info_t ztest_info[] = { #define ZTEST_SYNC_LOCKS 16 +/* + * The following struct is used to hold a list of uncalled commit callbacks. + * + * The callbacks are ordered by txg number. + */ +typedef struct ztest_cb_list { + pthread_mutex_t zcl_callbacks_lock; + list_t zcl_callbacks; +} ztest_cb_list_t; + /* * Stuff we need to share writably between parent and child. */ typedef struct ztest_shared { - mutex_t zs_vdev_lock; - rwlock_t zs_name_lock; - uint64_t zs_vdev_primaries; - uint64_t zs_vdev_aux; - uint64_t zs_enospc_count; - hrtime_t zs_start_time; - hrtime_t zs_stop_time; - uint64_t zs_alloc; - uint64_t zs_space; - ztest_info_t zs_info[ZTEST_FUNCS]; - mutex_t zs_sync_lock[ZTEST_SYNC_LOCKS]; - uint64_t zs_seq[ZTEST_SYNC_LOCKS]; + pthread_mutex_t zs_vdev_lock; + pthread_rwlock_t zs_name_lock; + uint64_t zs_vdev_primaries; + uint64_t zs_vdev_aux; + uint64_t zs_enospc_count; + hrtime_t zs_start_time; + hrtime_t zs_stop_time; + uint64_t zs_alloc; + uint64_t zs_space; + ztest_info_t zs_info[ZTEST_FUNCS]; + pthread_mutex_t zs_sync_lock[ZTEST_SYNC_LOCKS]; + uint64_t zs_seq[ZTEST_SYNC_LOCKS]; + ztest_cb_list_t zs_cb_list; } ztest_shared_t; static char ztest_dev_template[] = "%s/%s.%llua"; @@ -804,7 +817,7 @@ ztest_spa_create_destroy(ztest_args_t *za) * Attempt to create an existing pool. It shouldn't matter * what's in the nvroot; we should fail with EEXIST. */ - (void) rw_rdlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock); nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); error = spa_create(za->za_pool, nvroot, NULL, NULL, NULL); nvlist_free(nvroot); @@ -820,7 +833,7 @@ ztest_spa_create_destroy(ztest_args_t *za) fatal(0, "spa_destroy() = %d", error); spa_close(spa, FTAG); - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); } static vdev_t * @@ -851,7 +864,7 @@ ztest_vdev_add_remove(ztest_args_t *za) nvlist_t *nvroot; int error; - (void) mutex_lock(&ztest_shared->zs_vdev_lock); + (void) pthread_mutex_lock(&ztest_shared->zs_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -869,7 +882,7 @@ ztest_vdev_add_remove(ztest_args_t *za) error = spa_vdev_add(spa, nvroot); nvlist_free(nvroot); - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + (void) pthread_mutex_unlock(&ztest_shared->zs_vdev_lock); if (error == ENOSPC) ztest_record_enospc("spa_vdev_add"); @@ -898,7 +911,7 @@ ztest_vdev_aux_add_remove(ztest_args_t *za) aux = ZPOOL_CONFIG_L2CACHE; } - (void) mutex_lock(&ztest_shared->zs_vdev_lock); + (void) pthread_mutex_lock(&ztest_shared->zs_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -954,7 +967,7 @@ ztest_vdev_aux_add_remove(ztest_args_t *za) fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); } - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + (void) pthread_mutex_unlock(&ztest_shared->zs_vdev_lock); } /* @@ -980,7 +993,7 @@ ztest_vdev_attach_detach(ztest_args_t *za) int oldvd_is_log; int error, expected_error; - (void) mutex_lock(&ztest_shared->zs_vdev_lock); + (void) pthread_mutex_lock(&ztest_shared->zs_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -1040,7 +1053,7 @@ ztest_vdev_attach_detach(ztest_args_t *za) if (error != 0 && error != ENODEV && error != EBUSY && error != ENOTSUP) fatal(0, "detach (%s) returned %d", oldpath, error); - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + (void) pthread_mutex_unlock(&ztest_shared->zs_vdev_lock); return; } @@ -1133,7 +1146,7 @@ ztest_vdev_attach_detach(ztest_args_t *za) (longlong_t)newsize, replacing, error, expected_error); } - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + (void) pthread_mutex_unlock(&ztest_shared->zs_vdev_lock); } /* @@ -1149,7 +1162,7 @@ ztest_vdev_LUN_growth(ztest_args_t *za) size_t fsize; int fd; - (void) mutex_lock(&ztest_shared->zs_vdev_lock); + (void) pthread_mutex_lock(&ztest_shared->zs_vdev_lock); /* * Pick a random leaf vdev. @@ -1180,7 +1193,7 @@ ztest_vdev_LUN_growth(ztest_args_t *za) (void) close(fd); } - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + (void) pthread_mutex_unlock(&ztest_shared->zs_vdev_lock); } /* ARGSUSED */ @@ -1279,7 +1292,7 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za) uint64_t seq; uint64_t objects; - (void) rw_rdlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock); (void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool, (u_longlong_t)za->za_instance); @@ -1322,7 +1335,7 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za) if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_create"); - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); return; } fatal(0, "dmu_objset_create(%s) = %d", name, error); @@ -1404,7 +1417,7 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za) if (error) fatal(0, "dmu_objset_destroy(%s) = %d", name, error); - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); } /* @@ -1418,7 +1431,7 @@ ztest_dmu_snapshot_create_destroy(ztest_args_t *za) char snapname[100]; char osname[MAXNAMELEN]; - (void) rw_rdlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock); dmu_objset_name(os, osname); (void) snprintf(snapname, 100, "%s@%llu", osname, (u_longlong_t)za->za_instance); @@ -1431,7 +1444,7 @@ ztest_dmu_snapshot_create_destroy(ztest_args_t *za) ztest_record_enospc("dmu_take_snapshot"); else if (error != 0 && error != EEXIST) fatal(0, "dmu_take_snapshot() = %d", error); - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); } /* @@ -1929,7 +1942,7 @@ ztest_dmu_write_parallel(ztest_args_t *za) int bs = ZTEST_DIROBJ_BLOCKSIZE; int do_free = 0; uint64_t off, txg, txg_how; - mutex_t *lp; + pthread_mutex_t *lp; char osname[MAXNAMELEN]; char iobuf[SPA_MAXBLOCKSIZE]; blkptr_t blk; @@ -1980,7 +1993,7 @@ ztest_dmu_write_parallel(ztest_args_t *za) txg = dmu_tx_get_txg(tx); lp = &ztest_shared->zs_sync_lock[b]; - (void) mutex_lock(lp); + (void) pthread_mutex_lock(lp); wbt->bt_objset = dmu_objset_id(os); wbt->bt_object = ZTEST_DIROBJ; @@ -2033,7 +2046,7 @@ ztest_dmu_write_parallel(ztest_args_t *za) dmu_write(os, ZTEST_DIROBJ, off, btsize, wbt, tx); } - (void) mutex_unlock(lp); + (void) pthread_mutex_unlock(lp); if (ztest_random(1000) == 0) (void) poll(NULL, 0, 1); /* open dn_notxholds window */ @@ -2052,13 +2065,13 @@ ztest_dmu_write_parallel(ztest_args_t *za) /* * dmu_sync() the block we just wrote. */ - (void) mutex_lock(lp); + (void) pthread_mutex_lock(lp); blkoff = P2ALIGN_TYPED(off, bs, uint64_t); error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db); za->za_dbuf = db; if (error) { - (void) mutex_unlock(lp); + (void) pthread_mutex_unlock(lp); return; } blkoff = off - blkoff; @@ -2066,7 +2079,7 @@ ztest_dmu_write_parallel(ztest_args_t *za) dmu_buf_rele(db, FTAG); za->za_dbuf = NULL; - (void) mutex_unlock(lp); + (void) pthread_mutex_unlock(lp); if (error) return; @@ -2435,6 +2448,205 @@ ztest_zap_parallel(ztest_args_t *za) dmu_tx_commit(tx); } +/* + * Commit callback data. + */ +typedef struct ztest_cb_data { + list_node_t zcd_node; + ztest_cb_list_t *zcd_zcl; + uint64_t zcd_txg; + int zcd_expected_err; + boolean_t zcd_added; + boolean_t zcd_called; + spa_t *zcd_spa; +} ztest_cb_data_t; + +static void +ztest_commit_callback(void *arg, int error) +{ + ztest_cb_data_t *data = arg; + ztest_cb_list_t *zcl; + uint64_t synced_txg; + + VERIFY(data != NULL); + VERIFY3S(data->zcd_expected_err, ==, error); + VERIFY(!data->zcd_called); + + synced_txg = spa_last_synced_txg(data->zcd_spa); + if (data->zcd_txg > synced_txg) + fatal(0, "commit callback of txg %llu prematurely called, last" + " synced txg = %llu\n", data->zcd_txg, synced_txg); + + zcl = data->zcd_zcl; + data->zcd_called = B_TRUE; + + if (error == ECANCELED) { + ASSERT3U(data->zcd_txg, ==, 0); + ASSERT(!data->zcd_added); + + /* + * The private callback data should be destroyed here, but + * since we are going to check the zcd_called field after + * dmu_tx_abort(), we will destroy it there. + */ + return; + } + + if (!data->zcd_added) + goto out; + + ASSERT3U(data->zcd_txg, !=, 0); + + /* Remove our callback from the list */ + (void) mutex_lock(&zcl->zcl_callbacks_lock); + list_remove(&zcl->zcl_callbacks, data); + (void) mutex_unlock(&zcl->zcl_callbacks_lock); + +out: + umem_free(data, sizeof (ztest_cb_data_t)); +} + +static ztest_cb_data_t * +ztest_create_cb_data(objset_t *os, ztest_cb_list_t *zcl, uint64_t txg) +{ + ztest_cb_data_t *cb_data; + + cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); + + cb_data->zcd_zcl = zcl; + cb_data->zcd_txg = txg; + cb_data->zcd_spa = os->os->os_spa; + + return (cb_data); +} + +/* + * If a number of txgs equal to this threshold have been created after a commit + * callback has been registered but not called, then we assume there is an + * implementation bug. + */ +#define ZTEST_COMMIT_CALLBACK_THRESH 3 + +/* + * Commit callback test. + */ +void +ztest_dmu_commit_callbacks(ztest_args_t *za) +{ + objset_t *os = za->za_os; + dmu_tx_t *tx; + ztest_cb_list_t *zcl = &ztest_shared->zs_cb_list; + ztest_cb_data_t *cb_data[3], *tmp_cb; + uint64_t old_txg, txg; + int i, error; + + tx = dmu_tx_create(os); + + cb_data[0] = ztest_create_cb_data(os, zcl, 0); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); + + dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t)); + + /* Every once in a while, abort the transaction on purpose */ + if (ztest_random(100) == 0) + error = -1; + + if (!error) + error = dmu_tx_assign(tx, TXG_NOWAIT); + + txg = error ? 0 : dmu_tx_get_txg(tx); + + cb_data[1] = ztest_create_cb_data(os, zcl, txg); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); + + if (error) { + /* + * It's not a strict requirement to call the registered + * callbacks from inside dmu_tx_abort(), but that's what + * happens in the current implementation so we will check for + * that. + */ + for (i = 0; i < 2; i++) { + cb_data[i]->zcd_expected_err = ECANCELED; + VERIFY(!cb_data[i]->zcd_called); + } + + dmu_tx_abort(tx); + + for (i = 0; i < 2; i++) { + VERIFY(cb_data[i]->zcd_called); + umem_free(cb_data[i], sizeof (ztest_cb_data_t)); + } + + return; + } + + cb_data[0]->zcd_txg = txg; + cb_data[2] = ztest_create_cb_data(os, zcl, txg); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); + + /* + * Read existing data to make sure there isn't a future leak. + */ + VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t), + &old_txg)); + + if (old_txg > txg) + fatal(0, "future leak: got %llx, open txg is %llx", old_txg, + txg); + + dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t), &txg, tx); + + (void) mutex_lock(&zcl->zcl_callbacks_lock); + + /* + * Since commit callbacks don't have any ordering requirement and since + * it is theoretically possible for a commit callback to be called + * after an arbitrary amount of time has elapsed since its txg has been + * synced, it is difficult to reliably determine whether a commit + * callback hasn't been called due to high load or due to a flawed + * implementation. + * + * In practice, we will assume that if after a certain number of txgs a + * commit callback hasn't been called, then most likely there's an + * implementation bug.. + */ + tmp_cb = list_head(&zcl->zcl_callbacks); + if (tmp_cb != NULL) + VERIFY3U(tmp_cb->zcd_txg, >, + txg - ZTEST_COMMIT_CALLBACK_THRESH); + + /* + * Let's find the place to insert our callbacks. + * + * Even though the list is ordered by txg, it is possible for the + * insertion point to not be the end because our txg may already be + * quiescing at this point and other callbacks in the open txg may + * have sneaked in. + */ + tmp_cb = list_tail(&zcl->zcl_callbacks); + while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) + tmp_cb = list_prev(&zcl->zcl_callbacks, tmp_cb); + + /* Add the 3 callbacks to the list */ + for (i = 0; i < 3; i++) { + if (tmp_cb == NULL) + list_insert_head(&zcl->zcl_callbacks, cb_data[i]); + else + list_insert_after(&zcl->zcl_callbacks, tmp_cb, + cb_data[i]); + + cb_data[i]->zcd_added = B_TRUE; + VERIFY(!cb_data[i]->zcd_called); + + tmp_cb = cb_data[i]; + } + + (void) mutex_unlock(&zcl->zcl_callbacks_lock); + + dmu_tx_commit(tx); +} + void ztest_dsl_prop_get_set(ztest_args_t *za) { @@ -2446,7 +2658,7 @@ ztest_dsl_prop_get_set(ztest_args_t *za) char osname[MAXNAMELEN]; int error; - (void) rw_rdlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock); dmu_objset_name(os, osname); @@ -2485,7 +2697,7 @@ ztest_dsl_prop_get_set(ztest_args_t *za) } } - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); } /* @@ -2649,7 +2861,7 @@ ztest_spa_rename(ztest_args_t *za) int error; spa_t *spa; - (void) rw_wrlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_wrlock(&ztest_shared->zs_name_lock); oldname = za->za_pool; newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL); @@ -2701,7 +2913,7 @@ ztest_spa_rename(ztest_args_t *za) umem_free(newname, strlen(newname) + 1); - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); } @@ -2929,15 +3141,18 @@ ztest_spa_import_export(char *oldname, char *newname) nvlist_free(config); } -static void -ztest_resume(spa_t *spa) +static void * +ztest_resume(void *arg) { + spa_t *spa = arg; + if (spa_suspended(spa)) { spa_vdev_state_enter(spa); vdev_clear(spa, NULL); (void) spa_vdev_state_exit(spa, NULL, 0); zio_resume(spa); } + return (NULL); } static void * @@ -3027,15 +3242,19 @@ ztest_run(char *pool) ztest_args_t *za; spa_t *spa; char name[100]; - thread_t resume_tid; + pthread_t resume_tid; ztest_exiting = B_FALSE; - (void) _mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL); - (void) rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL); + (void) pthread_mutex_init(&zs->zs_vdev_lock, NULL); + (void) pthread_rwlock_init(&zs->zs_name_lock, NULL); + (void) pthread_mutex_init(&zs->zs_cb_list.zcl_callbacks_lock, NULL); + + list_create(&zs->zs_cb_list.zcl_callbacks, sizeof (ztest_cb_data_t), + offsetof(ztest_cb_data_t, zcd_node)); for (t = 0; t < ZTEST_SYNC_LOCKS; t++) - (void) _mutex_init(&zs->zs_sync_lock[t], USYNC_THREAD, NULL); + (void) pthread_mutex_init(&zs->zs_sync_lock[t], NULL); /* * Destroy one disk before we even start. @@ -3102,8 +3321,7 @@ ztest_run(char *pool) /* * Create a thread to periodically resume suspended I/O. */ - VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND, - &resume_tid) == 0); + VERIFY(pthread_create(&resume_tid, NULL, ztest_resume_thread, spa)==0); /* * Verify that we can safely inquire about about any object, @@ -3152,7 +3370,7 @@ ztest_run(char *pool) if (t < zopt_datasets) { int test_future = FALSE; - (void) rw_rdlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock); (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d); error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0, ztest_create_cb, NULL); @@ -3160,7 +3378,7 @@ ztest_run(char *pool) test_future = TRUE; } else if (error == ENOSPC) { zs->zs_enospc_count++; - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); break; } else if (error != 0) { fatal(0, "dmu_objset_create(%s) = %d", @@ -3171,7 +3389,7 @@ ztest_run(char *pool) if (error) fatal(0, "dmu_objset_open('%s') = %d", name, error); - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); if (test_future) ztest_dmu_check_future_leak(&za[t]); zil_replay(za[d].za_os, za[d].za_os, @@ -3179,12 +3397,12 @@ ztest_run(char *pool) za[d].za_zilog = zil_open(za[d].za_os, NULL); } - VERIFY(thr_create(0, 0, ztest_thread, &za[t], THR_BOUND, - &za[t].za_thread) == 0); + VERIFY(pthread_create(&za[t].za_thread, NULL, ztest_thread, + &za[t]) == 0); } while (--t >= 0) { - VERIFY(thr_join(za[t].za_thread, NULL, NULL) == 0); + VERIFY(pthread_join(za[t].za_thread, NULL) == 0); if (t < zopt_datasets) { zil_close(za[t].za_zilog); dmu_objset_close(za[t].za_os); @@ -3203,14 +3421,14 @@ ztest_run(char *pool) * If we had out-of-space errors, destroy a random objset. */ if (zs->zs_enospc_count != 0) { - (void) rw_rdlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock); d = (int)ztest_random(zopt_datasets); (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d); if (zopt_verbose >= 3) (void) printf("Destroying %s to free up space\n", name); (void) dmu_objset_find(name, ztest_destroy_cb, &za[d], DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); } txg_wait_synced(spa_get_dsl(spa), 0); @@ -3219,7 +3437,7 @@ ztest_run(char *pool) /* Kill the resume thread */ ztest_exiting = B_TRUE; - VERIFY(thr_join(resume_tid, NULL, NULL) == 0); + VERIFY(pthread_join(resume_tid, NULL) == 0); ztest_resume(spa); /* @@ -3232,6 +3450,12 @@ ztest_run(char *pool) spa_close(spa, FTAG); kernel_fini(); + + list_destroy(&zs->zs_cb_list.zcl_callbacks); + + (void) pthread_mutex_destroy(&zs->zs_cb_list.zcl_callbacks_lock); + (void) pthread_rwlock_destroy(&zs->zs_name_lock); + (void) pthread_mutex_destroy(&zs->zs_vdev_lock); } void diff --git a/lib/libuutil/uu_misc.c b/lib/libuutil/uu_misc.c index 04d63da403..60f50832d9 100644 --- a/lib/libuutil/uu_misc.c +++ b/lib/libuutil/uu_misc.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #if !defined(TEXT_DOMAIN) @@ -70,11 +69,12 @@ static va_list uu_panic_args; static pthread_t uu_panic_thread; static uint32_t _uu_main_error; +static __thread int _uu_main_thread = 0; void uu_set_error(uint_t code) { - if (thr_main() != 0) { + if (_uu_main_thread) { _uu_main_error = code; return; } @@ -103,7 +103,7 @@ uu_set_error(uint_t code) uint32_t uu_error(void) { - if (thr_main() != 0) + if (_uu_main_thread) return (_uu_main_error); if (uu_error_key_setup < 0) /* can't happen? */ @@ -249,5 +249,6 @@ uu_init(void) __attribute__((constructor)); static void uu_init(void) { + _uu_main_thread = 1; (void) pthread_atfork(uu_lockup, uu_release, uu_release_child); } diff --git a/lib/libzpool/include/sys/zfs_context.h b/lib/libzpool/include/sys/zfs_context.h index 8883bcd6cc..6541ba0853 100644 --- a/lib/libzpool/include/sys/zfs_context.h +++ b/lib/libzpool/include/sys/zfs_context.h @@ -50,8 +50,7 @@ extern "C" { #include #include #include -#include -#include +#include #include #include #include @@ -150,15 +149,18 @@ extern void vpanic(const char *, __va_list); /* * Threads */ -#define curthread ((void *)(uintptr_t)thr_self()) +#define curthread ((void *)(uintptr_t)pthread_self()) +#define tsd_get(key) pthread_getspecific(key) +#define tsd_set(key, val) pthread_setspecific(key, val) typedef struct kthread kthread_t; +typedef void (*thread_func_t)(void *); #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ - zk_thread_create(func, arg) -#define thread_exit() thr_exit(NULL) + zk_thread_create((thread_func_t)func, arg) +#define thread_exit() pthread_exit(NULL) -extern kthread_t *zk_thread_create(void (*func)(), void *arg); +extern kthread_t *zk_thread_create(thread_func_t func, void *arg); #define issig(why) (FALSE) #define ISSIG(thr, why) (FALSE) @@ -166,28 +168,18 @@ extern kthread_t *zk_thread_create(void (*func)(), void *arg); /* * Mutexes */ +#define MTX_MAGIC 0x9522f51362a6e326ull typedef struct kmutex { void *m_owner; - boolean_t initialized; - mutex_t m_lock; + uint64_t m_magic; + pthread_mutex_t m_lock; } kmutex_t; -#define MUTEX_DEFAULT USYNC_THREAD -#undef MUTEX_HELD -#define MUTEX_HELD(m) _mutex_held(&(m)->m_lock) +#define MUTEX_DEFAULT 0 +#define MUTEX_HELD(m) ((m)->m_owner == curthread) -/* - * Argh -- we have to get cheesy here because the kernel and userland - * have different signatures for the same routine. - */ -extern int _mutex_init(mutex_t *mp, int type, void *arg); -extern int _mutex_destroy(mutex_t *mp); - -#define mutex_init(mp, b, c, d) zmutex_init((kmutex_t *)(mp)) -#define mutex_destroy(mp) zmutex_destroy((kmutex_t *)(mp)) - -extern void zmutex_init(kmutex_t *mp); -extern void zmutex_destroy(kmutex_t *mp); +extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie); +extern void mutex_destroy(kmutex_t *mp); extern void mutex_enter(kmutex_t *mp); extern void mutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); @@ -196,23 +188,24 @@ extern void *mutex_owner(kmutex_t *mp); /* * RW locks */ +#define RW_MAGIC 0x4d31fb123648e78aull typedef struct krwlock { - void *rw_owner; - boolean_t initialized; - rwlock_t rw_lock; + void *rw_owner; + void *rw_wr_owner; + uint64_t rw_magic; + pthread_rwlock_t rw_lock; + uint_t rw_readers; } krwlock_t; typedef int krw_t; #define RW_READER 0 #define RW_WRITER 1 -#define RW_DEFAULT USYNC_THREAD +#define RW_DEFAULT 0 -#undef RW_READ_HELD -#define RW_READ_HELD(x) _rw_read_held(&(x)->rw_lock) - -#undef RW_WRITE_HELD -#define RW_WRITE_HELD(x) _rw_write_held(&(x)->rw_lock) +#define RW_READ_HELD(x) ((x)->rw_readers > 0) +#define RW_WRITE_HELD(x) ((x)->rw_wr_owner == curthread) +#define RW_LOCK_HELD(x) (RW_READ_HELD(x) || RW_WRITE_HELD(x)) extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg); extern void rw_destroy(krwlock_t *rwlp); @@ -230,9 +223,13 @@ extern gid_t *crgetgroups(cred_t *cr); /* * Condition variables */ -typedef cond_t kcondvar_t; +#define CV_MAGIC 0xd31ea9a83b1b30c4ull +typedef struct kcondvar { + uint64_t cv_magic; + pthread_cond_t cv; +} kcondvar_t; -#define CV_DEFAULT USYNC_THREAD +#define CV_DEFAULT 0 extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg); extern void cv_destroy(kcondvar_t *cv); @@ -409,7 +406,8 @@ extern void delay(clock_t ticks); #define minclsyspri 60 #define maxclsyspri 99 -#define CPU_SEQID (thr_self() & (max_ncpus - 1)) +/* XXX: not portable */ +#define CPU_SEQID (pthread_self() & (max_ncpus - 1)) #define kcred NULL #define CRED() NULL diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 5d74796904..65faab8451 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -35,6 +35,7 @@ #include #include #include +#include #include /* @@ -56,13 +57,17 @@ struct utsname utsname = { */ /*ARGSUSED*/ kthread_t * -zk_thread_create(void (*func)(), void *arg) +zk_thread_create(thread_func_t func, void *arg) { - thread_t tid; + pthread_t tid; - VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED, - &tid) == 0); + pthread_attr_t attr; + VERIFY(pthread_attr_init(&attr) == 0); + VERIFY(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) == 0); + VERIFY(pthread_create(&tid, &attr, (void *(*)(void *))func, arg) == 0); + + /* XXX: not portable */ return ((void *)(uintptr_t)tid); } @@ -95,30 +100,37 @@ kstat_delete(kstat_t *ksp) * ========================================================================= */ void -zmutex_init(kmutex_t *mp) +mutex_init(kmutex_t *mp, char *name, int type, void *cookie) { + ASSERT(type == MUTEX_DEFAULT); + ASSERT(cookie == NULL); + +#ifdef IM_FEELING_LUCKY + ASSERT(mp->m_magic != MTX_MAGIC); +#endif + mp->m_owner = NULL; - mp->initialized = B_TRUE; - (void) _mutex_init(&mp->m_lock, USYNC_THREAD, NULL); + mp->m_magic = MTX_MAGIC; + VERIFY3S(pthread_mutex_init(&mp->m_lock, NULL), ==, 0); } void -zmutex_destroy(kmutex_t *mp) +mutex_destroy(kmutex_t *mp) { - ASSERT(mp->initialized == B_TRUE); + ASSERT(mp->m_magic == MTX_MAGIC); ASSERT(mp->m_owner == NULL); - (void) _mutex_destroy(&(mp)->m_lock); + VERIFY3S(pthread_mutex_destroy(&(mp)->m_lock), ==, 0); mp->m_owner = (void *)-1UL; - mp->initialized = B_FALSE; + mp->m_magic = 0; } void mutex_enter(kmutex_t *mp) { - ASSERT(mp->initialized == B_TRUE); + ASSERT(mp->m_magic == MTX_MAGIC); ASSERT(mp->m_owner != (void *)-1UL); ASSERT(mp->m_owner != curthread); - VERIFY(mutex_lock(&mp->m_lock) == 0); + VERIFY3S(pthread_mutex_lock(&mp->m_lock), ==, 0); ASSERT(mp->m_owner == NULL); mp->m_owner = curthread; } @@ -126,9 +138,9 @@ mutex_enter(kmutex_t *mp) int mutex_tryenter(kmutex_t *mp) { - ASSERT(mp->initialized == B_TRUE); + ASSERT(mp->m_magic == MTX_MAGIC); ASSERT(mp->m_owner != (void *)-1UL); - if (0 == mutex_trylock(&mp->m_lock)) { + if (0 == pthread_mutex_trylock(&mp->m_lock)) { ASSERT(mp->m_owner == NULL); mp->m_owner = curthread; return (1); @@ -140,16 +152,16 @@ mutex_tryenter(kmutex_t *mp) void mutex_exit(kmutex_t *mp) { - ASSERT(mp->initialized == B_TRUE); + ASSERT(mp->m_magic == MTX_MAGIC); ASSERT(mutex_owner(mp) == curthread); mp->m_owner = NULL; - VERIFY(mutex_unlock(&mp->m_lock) == 0); + VERIFY3S(pthread_mutex_unlock(&mp->m_lock), ==, 0); } void * mutex_owner(kmutex_t *mp) { - ASSERT(mp->initialized == B_TRUE); + ASSERT(mp->m_magic == MTX_MAGIC); return (mp->m_owner); } @@ -162,31 +174,48 @@ mutex_owner(kmutex_t *mp) void rw_init(krwlock_t *rwlp, char *name, int type, void *arg) { - rwlock_init(&rwlp->rw_lock, USYNC_THREAD, NULL); + ASSERT(type == RW_DEFAULT); + ASSERT(arg == NULL); + +#ifdef IM_FEELING_LUCKY + ASSERT(rwlp->rw_magic != RW_MAGIC); +#endif + + VERIFY3S(pthread_rwlock_init(&rwlp->rw_lock, NULL), ==, 0); rwlp->rw_owner = NULL; - rwlp->initialized = B_TRUE; + rwlp->rw_wr_owner = NULL; + rwlp->rw_readers = 0; + rwlp->rw_magic = RW_MAGIC; } void rw_destroy(krwlock_t *rwlp) { - rwlock_destroy(&rwlp->rw_lock); - rwlp->rw_owner = (void *)-1UL; - rwlp->initialized = B_FALSE; + ASSERT(rwlp->rw_magic == RW_MAGIC); + + VERIFY3S(pthread_rwlock_destroy(&rwlp->rw_lock), ==, 0); + rwlp->rw_magic = 0; } void rw_enter(krwlock_t *rwlp, krw_t rw) { - ASSERT(!RW_LOCK_HELD(rwlp)); - ASSERT(rwlp->initialized == B_TRUE); - ASSERT(rwlp->rw_owner != (void *)-1UL); + ASSERT(rwlp->rw_magic == RW_MAGIC); ASSERT(rwlp->rw_owner != curthread); + ASSERT(rwlp->rw_wr_owner != curthread); - if (rw == RW_READER) - VERIFY(rw_rdlock(&rwlp->rw_lock) == 0); - else - VERIFY(rw_wrlock(&rwlp->rw_lock) == 0); + if (rw == RW_READER) { + VERIFY3S(pthread_rwlock_rdlock(&rwlp->rw_lock), ==, 0); + ASSERT(rwlp->rw_wr_owner == NULL); + + atomic_inc_uint(&rwlp->rw_readers); + } else { + VERIFY3S(pthread_rwlock_wrlock(&rwlp->rw_lock), ==, 0); + ASSERT(rwlp->rw_wr_owner == NULL); + ASSERT3U(rwlp->rw_readers, ==, 0); + + rwlp->rw_wr_owner = curthread; + } rwlp->rw_owner = curthread; } @@ -194,11 +223,16 @@ rw_enter(krwlock_t *rwlp, krw_t rw) void rw_exit(krwlock_t *rwlp) { - ASSERT(rwlp->initialized == B_TRUE); - ASSERT(rwlp->rw_owner != (void *)-1UL); + ASSERT(rwlp->rw_magic == RW_MAGIC); + ASSERT(RW_LOCK_HELD(rwlp)); + + if (RW_READ_HELD(rwlp)) + atomic_dec_uint(&rwlp->rw_readers); + else + rwlp->rw_wr_owner = NULL; rwlp->rw_owner = NULL; - VERIFY(rw_unlock(&rwlp->rw_lock) == 0); + VERIFY3S(pthread_rwlock_unlock(&rwlp->rw_lock), ==, 0); } int @@ -206,19 +240,29 @@ rw_tryenter(krwlock_t *rwlp, krw_t rw) { int rv; - ASSERT(rwlp->initialized == B_TRUE); - ASSERT(rwlp->rw_owner != (void *)-1UL); + ASSERT(rwlp->rw_magic == RW_MAGIC); if (rw == RW_READER) - rv = rw_tryrdlock(&rwlp->rw_lock); + rv = pthread_rwlock_tryrdlock(&rwlp->rw_lock); else - rv = rw_trywrlock(&rwlp->rw_lock); + rv = pthread_rwlock_trywrlock(&rwlp->rw_lock); if (rv == 0) { + ASSERT(rwlp->rw_wr_owner == NULL); + + if (rw == RW_READER) + atomic_inc_uint(&rwlp->rw_readers); + else { + ASSERT3U(rwlp->rw_readers, ==, 0); + rwlp->rw_wr_owner = curthread; + } + rwlp->rw_owner = curthread; return (1); } + VERIFY3S(rv, ==, EBUSY); + return (0); } @@ -226,8 +270,7 @@ rw_tryenter(krwlock_t *rwlp, krw_t rw) int rw_tryupgrade(krwlock_t *rwlp) { - ASSERT(rwlp->initialized == B_TRUE); - ASSERT(rwlp->rw_owner != (void *)-1UL); + ASSERT(rwlp->rw_magic == RW_MAGIC); return (0); } @@ -241,22 +284,34 @@ rw_tryupgrade(krwlock_t *rwlp) void cv_init(kcondvar_t *cv, char *name, int type, void *arg) { - VERIFY(cond_init(cv, type, NULL) == 0); + ASSERT(type == CV_DEFAULT); + +#ifdef IM_FEELING_LUCKY + ASSERT(cv->cv_magic != CV_MAGIC); +#endif + + cv->cv_magic = CV_MAGIC; + + VERIFY3S(pthread_cond_init(&cv->cv, NULL), ==, 0); } void cv_destroy(kcondvar_t *cv) { - VERIFY(cond_destroy(cv) == 0); + ASSERT(cv->cv_magic == CV_MAGIC); + VERIFY3S(pthread_cond_destroy(&cv->cv), ==, 0); + cv->cv_magic = 0; } void cv_wait(kcondvar_t *cv, kmutex_t *mp) { + ASSERT(cv->cv_magic == CV_MAGIC); ASSERT(mutex_owner(mp) == curthread); mp->m_owner = NULL; - int ret = cond_wait(cv, &mp->m_lock); - VERIFY(ret == 0 || ret == EINTR); + int ret = pthread_cond_wait(&cv->cv, &mp->m_lock); + if (ret != 0) + VERIFY3S(ret, ==, EINTR); mp->m_owner = curthread; } @@ -264,29 +319,38 @@ clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) { int error; + struct timeval tv; timestruc_t ts; clock_t delta; + ASSERT(cv->cv_magic == CV_MAGIC); + top: delta = abstime - lbolt; if (delta <= 0) return (-1); - ts.tv_sec = delta / hz; - ts.tv_nsec = (delta % hz) * (NANOSEC / hz); + VERIFY(gettimeofday(&tv, NULL) == 0); + + ts.tv_sec = tv.tv_sec + delta / hz; + ts.tv_nsec = tv.tv_usec * 1000 + (delta % hz) * (NANOSEC / hz); + if (ts.tv_nsec >= NANOSEC) { + ts.tv_sec++; + ts.tv_nsec -= NANOSEC; + } ASSERT(mutex_owner(mp) == curthread); mp->m_owner = NULL; - error = cond_reltimedwait(cv, &mp->m_lock, &ts); + error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts); mp->m_owner = curthread; - if (error == ETIME) + if (error == ETIMEDOUT) return (-1); if (error == EINTR) goto top; - ASSERT(error == 0); + VERIFY3S(error, ==, 0); return (1); } @@ -294,13 +358,15 @@ top: void cv_signal(kcondvar_t *cv) { - VERIFY(cond_signal(cv) == 0); + ASSERT(cv->cv_magic == CV_MAGIC); + VERIFY3S(pthread_cond_signal(&cv->cv), ==, 0); } void cv_broadcast(kcondvar_t *cv) { - VERIFY(cond_broadcast(cv) == 0); + ASSERT(cv->cv_magic == CV_MAGIC); + VERIFY3S(pthread_cond_broadcast(&cv->cv), ==, 0); } /* @@ -554,7 +620,7 @@ __dprintf(const char *file, const char *func, int line, const char *fmt, ...) if (dprintf_find_string("pid")) (void) printf("%d ", getpid()); if (dprintf_find_string("tid")) - (void) printf("%u ", thr_self()); + (void) printf("%u ", (uint_t) pthread_self()); if (dprintf_find_string("cpu")) (void) printf("%u ", getcpuid()); if (dprintf_find_string("time")) diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index 93acdcf8e4..d28f6024bb 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -42,7 +42,7 @@ struct taskq { krwlock_t tq_threadlock; kcondvar_t tq_dispatch_cv; kcondvar_t tq_wait_cv; - thread_t *tq_threadlist; + pthread_t *tq_threadlist; int tq_flags; int tq_active; int tq_nthreads; @@ -185,7 +185,7 @@ taskq_create(const char *name, int nthreads, pri_t pri, tq->tq_maxalloc = maxalloc; tq->tq_task.task_next = &tq->tq_task; tq->tq_task.task_prev = &tq->tq_task; - tq->tq_threadlist = kmem_alloc(nthreads * sizeof (thread_t), KM_SLEEP); + tq->tq_threadlist = kmem_alloc(nthreads * sizeof (pthread_t), KM_SLEEP); if (flags & TASKQ_PREPOPULATE) { mutex_enter(&tq->tq_lock); @@ -195,8 +195,8 @@ taskq_create(const char *name, int nthreads, pri_t pri, } for (t = 0; t < nthreads; t++) - (void) thr_create(0, 0, taskq_thread, - tq, THR_BOUND, &tq->tq_threadlist[t]); + VERIFY(pthread_create(&tq->tq_threadlist[t], + NULL, taskq_thread, tq) == 0); return (tq); } @@ -226,9 +226,9 @@ taskq_destroy(taskq_t *tq) mutex_exit(&tq->tq_lock); for (t = 0; t < nthreads; t++) - (void) thr_join(tq->tq_threadlist[t], NULL, NULL); + VERIFY(pthread_join(tq->tq_threadlist[t], NULL) == 0); - kmem_free(tq->tq_threadlist, nthreads * sizeof (thread_t)); + kmem_free(tq->tq_threadlist, nthreads * sizeof (pthread_t)); rw_destroy(&tq->tq_threadlock); mutex_destroy(&tq->tq_lock); @@ -247,7 +247,7 @@ taskq_member(taskq_t *tq, void *t) return (1); for (i = 0; i < tq->tq_nthreads; i++) - if (tq->tq_threadlist[i] == (thread_t)(uintptr_t)t) + if (tq->tq_threadlist[i] == (pthread_t)(uintptr_t)t) return (1); return (0); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 1cb3e8c7b5..a9783a9717 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -48,6 +48,8 @@ dmu_tx_create_dd(dsl_dir_t *dd) tx->tx_pool = dd->dd_pool; list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), offsetof(dmu_tx_hold_t, txh_node)); + list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); #ifdef ZFS_DEBUG refcount_create(&tx->tx_space_written); refcount_create(&tx->tx_space_freed); @@ -1020,8 +1022,13 @@ dmu_tx_commit(dmu_tx_t *tx) if (tx->tx_tempreserve_cookie) dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); + if (!list_is_empty(&tx->tx_callbacks)) + txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); + if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); + + list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); #ifdef ZFS_DEBUG dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", @@ -1050,6 +1057,14 @@ dmu_tx_abort(dmu_tx_t *tx) if (dn != NULL) dnode_rele(dn, tx); } + + /* + * Call any registered callbacks with an error code. + */ + if (!list_is_empty(&tx->tx_callbacks)) + dmu_tx_callback(&tx->tx_callbacks, ECANCELED); + + list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); #ifdef ZFS_DEBUG refcount_destroy_many(&tx->tx_space_written, @@ -1067,6 +1082,34 @@ dmu_tx_get_txg(dmu_tx_t *tx) return (tx->tx_txg); } +void +dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) +{ + dmu_tx_callback_t *dcb; + + dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); + + dcb->dcb_func = func; + dcb->dcb_data = data; + + list_insert_tail(&tx->tx_callbacks, dcb); +} + +/* + * Call all the commit callbacks on a list, with a given error code. + */ +void +dmu_tx_callback(list_t *cb_list, int error) +{ + dmu_tx_callback_t *dcb; + + while (dcb = list_head(cb_list)) { + list_remove(cb_list, dcb); + dcb->dcb_func(dcb->dcb_data, error); + kmem_free(dcb, sizeof (dmu_tx_callback_t)); + } +} + #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(dmu_tx_create); EXPORT_SYMBOL(dmu_tx_hold_write); diff --git a/module/zfs/include/sys/dmu.h b/module/zfs/include/sys/dmu.h index 10a88e13f5..95c74bf12d 100644 --- a/module/zfs/include/sys/dmu.h +++ b/module/zfs/include/sys/dmu.h @@ -431,6 +431,26 @@ int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); +/* + * To register a commit callback, dmu_tx_callback_register() must be called. + * + * dcb_data is a pointer to caller private data that is passed on as a + * callback parameter. The caller is responsible for properly allocating and + * freeing it. + * + * When registering a callback, the transaction must be already created, but + * it cannot be committed or aborted. It can be assigned to a txg or not. + * + * The callback will be called after the transaction has been safely written + * to stable storage and will also be called if the dmu_tx is aborted. + * If there is any error which prevents the transaction from being committed to + * disk, the callback will be called with a value of error != 0. + */ +typedef void dmu_tx_callback_func_t(void *dcb_data, int error); + +void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, + void *dcb_data); + /* * Free up the data blocks for a defined range of a file. If size is * zero, the range from offset to end-of-file is freed. diff --git a/module/zfs/include/sys/dmu_tx.h b/module/zfs/include/sys/dmu_tx.h index 2fc1fee172..7fcab936f0 100644 --- a/module/zfs/include/sys/dmu_tx.h +++ b/module/zfs/include/sys/dmu_tx.h @@ -26,8 +26,6 @@ #ifndef _SYS_DMU_TX_H #define _SYS_DMU_TX_H - - #include #include #include @@ -59,6 +57,7 @@ struct dmu_tx { txg_handle_t tx_txgh; void *tx_tempreserve_cookie; struct dmu_tx_hold *tx_needassign_txh; + list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */ uint8_t tx_anyobj; int tx_err; #ifdef ZFS_DEBUG @@ -98,6 +97,11 @@ typedef struct dmu_tx_hold { #endif } dmu_tx_hold_t; +typedef struct dmu_tx_callback { + list_node_t dcb_node; /* linked to tx_callbacks list */ + dmu_tx_callback_func_t *dcb_func; /* caller function pointer */ + void *dcb_data; /* caller private data */ +} dmu_tx_callback_t; /* * These routines are defined in dmu.h, and are called by the user. @@ -109,6 +113,10 @@ void dmu_tx_abort(dmu_tx_t *tx); uint64_t dmu_tx_get_txg(dmu_tx_t *tx); void dmu_tx_wait(dmu_tx_t *tx); +void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, + void *dcb_data); +void dmu_tx_callback(list_t *cb_list, int error); + /* * These routines are defined in dmu_spa.h, and are called by the SPA. */ diff --git a/module/zfs/include/sys/txg.h b/module/zfs/include/sys/txg.h index ccc2cc5c65..e679898dbc 100644 --- a/module/zfs/include/sys/txg.h +++ b/module/zfs/include/sys/txg.h @@ -26,8 +26,6 @@ #ifndef _SYS_TXG_H #define _SYS_TXG_H - - #include #include @@ -71,6 +69,7 @@ extern void txg_sync_stop(struct dsl_pool *dp); extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); extern void txg_rele_to_quiesce(txg_handle_t *txghp); extern void txg_rele_to_sync(txg_handle_t *txghp); +extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); extern void txg_suspend(struct dsl_pool *dp); extern void txg_resume(struct dsl_pool *dp); diff --git a/module/zfs/include/sys/txg_impl.h b/module/zfs/include/sys/txg_impl.h index 7413c662b3..bc7d7c7e51 100644 --- a/module/zfs/include/sys/txg_impl.h +++ b/module/zfs/include/sys/txg_impl.h @@ -33,10 +33,16 @@ extern "C" { #endif +typedef struct tx_cb { + tx_cpu_t *tcb_tc; + uint64_t tcb_txg; +} tx_cb_t; + struct tx_cpu { kmutex_t tc_lock; kcondvar_t tc_cv[TXG_SIZE]; uint64_t tc_count[TXG_SIZE]; + list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ char tc_pad[16]; }; @@ -64,6 +70,8 @@ typedef struct tx_state { kthread_t *tx_sync_thread; kthread_t *tx_quiesce_thread; + + taskq_t *tx_commit_cb_taskq; /* commit callback taskq */ } tx_state_t; #ifdef __cplusplus diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 53ff7a0fcf..1b490688bb 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -57,6 +58,9 @@ txg_init(dsl_pool_t *dp, uint64_t txg) for (i = 0; i < TXG_SIZE; i++) { cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL); + list_create(&tx->tx_cpu[c].tc_callbacks[i], + sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); } } @@ -96,10 +100,15 @@ txg_fini(dsl_pool_t *dp) int i; mutex_destroy(&tx->tx_cpu[c].tc_lock); - for (i = 0; i < TXG_SIZE; i++) + for (i = 0; i < TXG_SIZE; i++) { cv_destroy(&tx->tx_cpu[c].tc_cv[i]); + list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); + } } + if (tx->tx_commit_cb_taskq != NULL) + taskq_destroy(tx->tx_commit_cb_taskq); + vmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); bzero(tx, sizeof (tx_state_t)); @@ -229,25 +238,55 @@ txg_rele_to_quiesce(txg_handle_t *th) } void -txg_rele_to_sync(txg_handle_t *th) +txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks) { tx_cpu_t *tc = th->th_cpu; int g = th->th_txg & TXG_MASK; + mutex_enter(&tc->tc_lock); + list_move_tail(&tc->tc_callbacks[g], tx_callbacks); + mutex_exit(&tc->tc_lock); +} + +static void +txg_exit(tx_cpu_t *tc, uint64_t txg) +{ + int g = txg & TXG_MASK; + mutex_enter(&tc->tc_lock); ASSERT(tc->tc_count[g] != 0); if (--tc->tc_count[g] == 0) cv_broadcast(&tc->tc_cv[g]); mutex_exit(&tc->tc_lock); +} + +void +txg_rele_to_sync(txg_handle_t *th) +{ + txg_exit(th->th_cpu, th->th_txg); th->th_cpu = NULL; /* defensive */ } +static void +txg_wait_exit(tx_state_t *tx, uint64_t txg) +{ + int g = txg & TXG_MASK; + int c; + + for (c = 0; c < max_ncpus; c++) { + tx_cpu_t *tc = &tx->tx_cpu[c]; + mutex_enter(&tc->tc_lock); + while (tc->tc_count[g] != 0) + cv_wait(&tc->tc_cv[g], &tc->tc_lock); + mutex_exit(&tc->tc_lock); + } +} + static void txg_quiesce(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; - int g = txg & TXG_MASK; int c; /* @@ -269,12 +308,60 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) /* * Quiesce the transaction group by waiting for everyone to txg_exit(). */ + txg_wait_exit(tx, txg); +} + +static void +txg_callback(tx_cb_t *tcb) +{ + tx_cpu_t *tc = tcb->tcb_tc; + int g = tcb->tcb_txg & TXG_MASK; + + dmu_tx_callback(&tc->tc_callbacks[g], 0); + + txg_exit(tc, tcb->tcb_txg); + + kmem_free(tcb, sizeof (tx_cb_t)); +} + +/* + * Dispatch the commit callbacks registered on this txg to worker threads. + */ +static void +txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) +{ + int c; + tx_state_t *tx = &dp->dp_tx; + tx_cb_t *tcb; + for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; - mutex_enter(&tc->tc_lock); - while (tc->tc_count[g] != 0) - cv_wait(&tc->tc_cv[g], &tc->tc_lock); - mutex_exit(&tc->tc_lock); + /* No need to lock tx_cpu_t at this point */ + + int g = txg & TXG_MASK; + + if (list_is_empty(&tc->tc_callbacks[g])) + continue; + + if (tx->tx_commit_cb_taskq == NULL) { + /* + * Commit callback taskq hasn't been created yet. + */ + tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", + max_ncpus, minclsyspri, max_ncpus, max_ncpus * 4, + TASKQ_PREPOPULATE); + } + + tcb = kmem_alloc(sizeof (tx_cb_t), KM_SLEEP); + tcb->tcb_txg = txg; + tcb->tcb_tc = tc; + + /* There shouldn't be any holders on this txg at this point */ + ASSERT3U(tc->tc_count[g], ==, 0); + tc->tc_count[g]++; + + (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) + txg_callback, tcb, TQ_SLEEP); } } @@ -345,6 +432,13 @@ txg_sync_thread(dsl_pool_t *dp) spa_sync(dp->dp_spa, txg); delta = lbolt - start; + /* + * Dispatch commit callbacks to worker threads and wait for + * them to finish. + */ + txg_dispatch_callbacks(dp, txg); + txg_wait_exit(tx, txg); + mutex_enter(&tx->tx_sync_lock); rw_enter(&tx->tx_suspend, RW_WRITER); tx->tx_synced_txg = txg;