dmu_objset_from_ds must be called with dp_config_rwlock held

The normal lock order is that the dp_config_rwlock must be held before
the ds_opening_lock.  For example, dmu_objset_hold() does this.
However, dmu_objset_open_impl() is called with the ds_opening_lock held,
and if the dp_config_rwlock is not already held, it will attempt to
acquire it.  This may lead to deadlock, since the lock order is
reversed.

Looking at all the callers of dmu_objset_open_impl() (which is
principally the callers of dmu_objset_from_ds()), almost all callers
already have the dp_config_rwlock.  However, there are a few places in
the send and receive code paths that do not.  For example:
dsl_crypto_populate_key_nvlist, send_cb, dmu_recv_stream,
receive_write_byref, redact_traverse_thread.

This commit resolves the problem by requiring all callers ot
dmu_objset_from_ds() to hold the dp_config_rwlock.  In most cases, the
code has been restructured such that we call dmu_objset_from_ds()
earlier on in the send and receive processes, when we already have the
dp_config_rwlock, and save the objset_t until we need it in the middle
of the send or receive (similar to what we already do with the
dsl_dataset_t).  Thus we do not need to acquire the dp_config_rwlock in
many new places.

I also cleaned up code in dmu_redact_snap() and send_traverse_thread().

Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Paul Zuchowski <pzuchowski@datto.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #9662
Closes #10115
This commit is contained in:
Matthew Ahrens 2020-03-12 10:55:02 -07:00 committed by GitHub
parent fa130e010c
commit 0fdd6106bb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 101 additions and 115 deletions

View File

@ -189,7 +189,7 @@ void key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag);
int spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, void *tag,
dsl_crypto_key_t **dck_out);
int dsl_crypto_populate_key_nvlist(struct dsl_dataset *ds,
int dsl_crypto_populate_key_nvlist(struct objset *os,
uint64_t from_ivset_guid, nvlist_t **nvl_out);
int dsl_crypto_recv_raw_key_check(struct dsl_dataset *ds,
nvlist_t *nvl, dmu_tx_t *tx);

View File

@ -1804,7 +1804,10 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
objset_t *os;
VERIFY3P(ds->ds_owner, ==, zfsvfs);
VERIFY(dsl_dataset_long_held(ds));
dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
dsl_pool_config_enter(dp, FTAG);
VERIFY0(dmu_objset_from_ds(ds, &os));
dsl_pool_config_exit(dp, FTAG);
err = zfsvfs_init(zfsvfs, os);
if (err != 0)
@ -1895,7 +1898,10 @@ zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
objset_t *os;
VERIFY3P(ds->ds_owner, ==, zfsvfs);
VERIFY(dsl_dataset_long_held(ds));
dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
dsl_pool_config_enter(dp, FTAG);
VERIFY0(dmu_objset_from_ds(ds, &os));
dsl_pool_config_exit(dp, FTAG);
zfsvfs->z_os = os;
/* release the VOPs */

View File

@ -414,6 +414,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
ASSERT(!BP_IS_REDACTED(bp));
/*
* We need the pool config lock to get properties.
*/
ASSERT(ds == NULL || dsl_pool_config_held(ds->ds_dir->dd_pool));
/*
* The $ORIGIN dataset (if it exists) doesn't have an associated
* objset, so there's no reason to open it. The $ORIGIN dataset
@ -503,20 +508,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
* checksum/compression/copies.
*/
if (ds != NULL) {
boolean_t needlock = B_FALSE;
os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0);
/*
* Note: it's valid to open the objset if the dataset is
* long-held, in which case the pool_config lock will not
* be held.
*/
if (!dsl_pool_config_held(dmu_objset_pool(os))) {
needlock = B_TRUE;
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
}
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
primary_cache_changed_cb, os);
@ -579,8 +572,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
smallblk_changed_cb, os);
}
}
if (needlock)
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
if (err != 0) {
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
kmem_free(os, sizeof (objset_t));
@ -650,11 +641,11 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
int err = 0;
/*
* We shouldn't be doing anything with dsl_dataset_t's unless the
* pool_config lock is held, or the dataset is long-held.
* We need the pool_config lock to manipulate the dsl_dataset_t.
* Even if the dataset is long-held, we need the pool_config lock
* to open the objset, as it needs to get properties.
*/
ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
dsl_dataset_long_held(ds));
ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
mutex_enter(&ds->ds_opening_lock);
if (ds->ds_objset == NULL) {

View File

@ -123,7 +123,7 @@ struct receive_writer_arg {
typedef struct guid_map_entry {
uint64_t guid;
boolean_t raw;
dsl_dataset_t *gme_ds;
objset_t *gme_os;
avl_node_t avlnode;
} guid_map_entry_t;
@ -914,6 +914,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
rrw_exit(&newds->ds_bp_rwlock, FTAG);
drba->drba_cookie->drc_ds = newds;
drba->drba_cookie->drc_os = os;
spa_history_log_internal_ds(newds, "receive", tx, " ");
}
@ -1095,6 +1096,7 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
rrw_exit(&ds->ds_bp_rwlock, FTAG);
drba->drba_cookie->drc_ds = ds;
VERIFY0(dmu_objset_from_ds(ds, &drba->drba_cookie->drc_os));
drba->drba_cookie->drc_should_save = B_TRUE;
spa_history_log_internal_ds(ds, "resume receive", tx, " ");
@ -1227,11 +1229,12 @@ free_guid_map_onexit(void *arg)
ds_hold_flags_t dsflags = DS_HOLD_FLAG_DECRYPT;
if (gmep->raw) {
gmep->gme_ds->ds_objset->os_raw_receive = B_FALSE;
gmep->gme_os->os_raw_receive = B_FALSE;
dsflags &= ~DS_HOLD_FLAG_DECRYPT;
}
dsl_dataset_disown(gmep->gme_ds, dsflags, gmep);
dsl_dataset_disown(gmep->gme_os->os_dsl_dataset,
dsflags, gmep);
kmem_free(gmep, sizeof (guid_map_entry_t));
}
avl_destroy(ca);
@ -1797,8 +1800,7 @@ receive_write_byref(struct receive_writer_arg *rwa,
&where)) == NULL) {
return (SET_ERROR(EINVAL));
}
if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
return (SET_ERROR(EINVAL));
ref_os = gmep->gme_os;
} else {
ref_os = rwa->os;
}
@ -2661,10 +2663,6 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, int cleanup_fd,
DMU_SUBSTREAM);
ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
/*
* Open the objset we are modifying.
*/
VERIFY0(dmu_objset_from_ds(drc->drc_ds, &drc->drc_os));
ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
ASSERT0(drc->drc_os->os_encrypted &&
(drc->drc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA));
@ -3013,6 +3011,12 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
origin_head, tx);
/*
* The objset was evicted by dsl_dataset_clone_swap_sync_impl,
* so drc_os is no longer valid.
*/
drc->drc_os = NULL;
dsl_dataset_snapshot_sync_impl(origin_head,
drc->drc_tosnap, tx);
@ -3127,13 +3131,6 @@ add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj,
err = dsl_dataset_own_obj(dp, snapobj, dsflags, gmep, &snapds);
if (err == 0) {
/*
* If this is a deduplicated raw send stream, we need
* to make sure that we can still read raw blocks from
* earlier datasets in the stream, so we set the
* os_raw_receive flag now.
*/
if (raw) {
err = dmu_objset_from_ds(snapds, &os);
if (err != 0) {
dsl_dataset_disown(snapds, dsflags, FTAG);
@ -3141,12 +3138,18 @@ add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj,
kmem_free(gmep, sizeof (*gmep));
return (err);
}
/*
* If this is a deduplicated raw send stream, we need
* to make sure that we can still read raw blocks from
* earlier datasets in the stream, so we set the
* os_raw_receive flag now.
*/
if (raw)
os->os_raw_receive = B_TRUE;
}
gmep->raw = raw;
gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
gmep->gme_ds = snapds;
gmep->gme_os = os;
avl_add(guid_map, gmep);
} else {
kmem_free(gmep, sizeof (*gmep));

View File

@ -80,6 +80,7 @@ struct redact_record {
struct redact_thread_arg {
bqueue_t q;
objset_t *os; /* Objset to traverse */
dsl_dataset_t *ds; /* Dataset to traverse */
struct redact_record *current_record;
int error_code;
@ -355,11 +356,9 @@ redact_traverse_thread(void *arg)
struct redact_thread_arg *rt_arg = arg;
int err;
struct redact_record *data;
objset_t *os;
VERIFY0(dmu_objset_from_ds(rt_arg->ds, &os));
#ifdef _KERNEL
if (os->os_phys->os_type == DMU_OST_ZFS)
rt_arg->deleted_objs = zfs_get_deleteq(os);
if (rt_arg->os->os_phys->os_type == DMU_OST_ZFS)
rt_arg->deleted_objs = zfs_get_deleteq(rt_arg->os);
else
rt_arg->deleted_objs = objlist_create();
#else
@ -1004,9 +1003,8 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
int err = 0;
dsl_pool_t *dp = NULL;
dsl_dataset_t *ds = NULL;
objset_t *os;
int numsnaps = 0;
dsl_dataset_t **redactsnaparr = NULL;
objset_t *os;
struct redact_thread_arg *args = NULL;
redaction_list_t *new_rl = NULL;
@ -1026,39 +1024,45 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
err = EALREADY;
goto out;
}
nvpair_t *pair;
if (fnvlist_num_pairs(redactnvl) > 0 && err == 0) {
redactsnaparr = kmem_zalloc(fnvlist_num_pairs(redactnvl) *
sizeof (dsl_dataset_t *), KM_SLEEP);
}
for (pair = nvlist_next_nvpair(redactnvl, NULL); err == 0 &&
pair != NULL; pair = nvlist_next_nvpair(redactnvl, pair)) {
numsnaps = fnvlist_num_pairs(redactnvl);
if (numsnaps > 0)
args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP);
nvpair_t *pair = NULL;
for (int i = 0; i < numsnaps; i++) {
pair = nvlist_next_nvpair(redactnvl, pair);
const char *name = nvpair_name(pair);
struct redact_thread_arg *rta = &args[i];
err = dsl_dataset_hold_flags(dp, name, DS_HOLD_FLAG_DECRYPT,
FTAG, redactsnaparr + numsnaps);
FTAG, &rta->ds);
if (err != 0)
break;
dsl_dataset_long_hold(redactsnaparr[numsnaps], FTAG);
if (!dsl_dataset_is_before(redactsnaparr[numsnaps], ds, 0)) {
/*
* We want to do the long hold before we can get any other
* errors, because the cleanup code will release the long
* hold if rta->ds is filled in.
*/
dsl_dataset_long_hold(rta->ds, FTAG);
err = dmu_objset_from_ds(rta->ds, &rta->os);
if (err != 0)
break;
if (!dsl_dataset_is_before(rta->ds, ds, 0)) {
err = EINVAL;
numsnaps++;
break;
}
if (dsl_dataset_feature_is_active(redactsnaparr[numsnaps],
if (dsl_dataset_feature_is_active(rta->ds,
SPA_FEATURE_REDACTED_DATASETS)) {
err = EALREADY;
numsnaps++;
break;
}
numsnaps++;
}
VERIFY3P(nvlist_next_nvpair(redactnvl, pair), ==, NULL);
if (err != 0)
goto out;
ASSERT3U(fnvlist_num_pairs(redactnvl), ==, numsnaps);
boolean_t resuming = B_FALSE;
char newredactbook[ZFS_MAX_DATASET_NAME_LEN];
zfs_bookmark_phys_t bookmark;
@ -1091,15 +1095,14 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
goto out;
}
for (int i = 0; i < numsnaps; i++) {
struct redact_thread_arg *rta = &args[i];
if (!redact_snaps_contains(new_rl->rl_phys->rlp_snaps,
new_rl->rl_phys->rlp_num_snaps,
dsl_dataset_phys(redactsnaparr[i])->ds_guid)) {
dsl_dataset_phys(rta->ds)->ds_guid)) {
err = ESRCH;
goto out;
}
}
if (numsnaps > 0)
args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP);
if (new_rl->rl_phys->rlp_last_blkid == UINT64_MAX &&
new_rl->rl_phys->rlp_last_object == UINT64_MAX) {
err = EEXIST;
@ -1112,10 +1115,11 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
if (numsnaps > 0) {
guids = kmem_zalloc(numsnaps * sizeof (uint64_t),
KM_SLEEP);
args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP);
}
for (int i = 0; i < numsnaps; i++)
guids[i] = dsl_dataset_phys(redactsnaparr[i])->ds_guid;
for (int i = 0; i < numsnaps; i++) {
struct redact_thread_arg *rta = &args[i];
guids[i] = dsl_dataset_phys(rta->ds)->ds_guid;
}
dsl_pool_rele(dp, FTAG);
dp = NULL;
@ -1128,18 +1132,18 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
}
for (int i = 0; i < numsnaps; i++) {
args[i].ds = redactsnaparr[i];
(void) bqueue_init(&args[i].q, zfs_redact_queue_ff,
struct redact_thread_arg *rta = &args[i];
(void) bqueue_init(&rta->q, zfs_redact_queue_ff,
zfs_redact_queue_length,
offsetof(struct redact_record, ln));
if (resuming) {
args[i].resume.zb_blkid =
rta->resume.zb_blkid =
new_rl->rl_phys->rlp_last_blkid;
args[i].resume.zb_object =
rta->resume.zb_object =
new_rl->rl_phys->rlp_last_object;
}
args[i].txg = dsl_dataset_phys(ds)->ds_creation_txg;
(void) thread_create(NULL, 0, redact_traverse_thread, &args[i],
rta->txg = dsl_dataset_phys(ds)->ds_creation_txg;
(void) thread_create(NULL, 0, redact_traverse_thread, rta,
0, curproc, TS_RUN, minclsyspri);
}
struct redact_merge_thread_arg rmta = { { {0} } };
@ -1152,23 +1156,25 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
TS_RUN, minclsyspri);
err = perform_redaction(os, new_rl, &rmta);
out:
if (args != NULL) {
kmem_free(args, numsnaps * sizeof (*args));
}
if (new_rl != NULL) {
dsl_redaction_list_long_rele(new_rl, FTAG);
dsl_redaction_list_rele(new_rl, FTAG);
}
for (int i = 0; i < numsnaps; i++) {
dsl_dataset_long_rele(redactsnaparr[i], FTAG);
dsl_dataset_rele_flags(redactsnaparr[i], DS_HOLD_FLAG_DECRYPT,
FTAG);
struct redact_thread_arg *rta = &args[i];
/*
* rta->ds may be NULL if we got an error while filling
* it in.
*/
if (rta->ds != NULL) {
dsl_dataset_long_rele(rta->ds, FTAG);
dsl_dataset_rele_flags(rta->ds,
DS_HOLD_FLAG_DECRYPT, FTAG);
}
}
if (redactsnaparr != NULL) {
kmem_free(redactsnaparr, fnvlist_num_pairs(redactnvl) *
sizeof (dsl_dataset_t *));
}
if (args != NULL)
kmem_free(args, numsnaps * sizeof (*args));
if (dp != NULL)
dsl_pool_rele(dp, FTAG);
if (ds != NULL) {

View File

@ -115,8 +115,7 @@ overflow_multiply(uint64_t a, uint64_t b, uint64_t *c)
struct send_thread_arg {
bqueue_t q;
dsl_dataset_t *ds; /* Dataset to traverse */
redaction_list_t *redaction_list;
objset_t *os; /* Objset to traverse */
uint64_t fromtxg; /* Traverse from this txg */
int flags; /* flags to pass to traverse_dataset */
int error_code;
@ -1092,19 +1091,16 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
zb->zb_object >= sta->resume.zb_object);
ASSERT3P(sta->ds, !=, NULL);
/*
* All bps of an encrypted os should have the encryption bit set.
* If this is not true it indicates tampering and we report an error.
*/
objset_t *os;
VERIFY0(dmu_objset_from_ds(sta->ds, &os));
if (os->os_encrypted &&
if (sta->os->os_encrypted &&
!BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
spa_log_error(spa, zb);
zfs_panic_recover("unencrypted block in encrypted "
"object set %llu", sta->ds->ds_object);
"object set %llu", dmu_objset_id(sta->os));
return (SET_ERROR(EIO));
}
@ -1211,10 +1207,7 @@ redact_list_cb(redact_block_phys_t *rb, void *arg)
/*
* This function kicks off the traverse_dataset. It also handles setting the
* error code of the thread in case something goes wrong, and pushes the End of
* Stream record when the traverse_dataset call has finished. If there is no
* dataset to traverse, then we traverse the redaction list provided and enqueue
* records for that. If neither is provided, the thread immediately pushes an
* End of Stream marker.
* Stream record when the traverse_dataset call has finished.
*/
static void
send_traverse_thread(void *arg)
@ -1224,20 +1217,9 @@ send_traverse_thread(void *arg)
struct send_range *data;
fstrans_cookie_t cookie = spl_fstrans_mark();
if (st_arg->ds != NULL) {
ASSERT3P(st_arg->redaction_list, ==, NULL);
err = traverse_dataset_resume(st_arg->ds,
err = traverse_dataset_resume(st_arg->os->os_dsl_dataset,
st_arg->fromtxg, &st_arg->resume,
st_arg->flags, send_cb, st_arg);
} else if (st_arg->redaction_list != NULL) {
struct redact_list_cb_arg rlcba = {0};
rlcba.cancel = &st_arg->cancel;
rlcba.num_blocks_visited = st_arg->num_blocks_visited;
rlcba.q = &st_arg->q;
rlcba.mark_redact = B_FALSE;
err = dsl_redaction_list_traverse(st_arg->redaction_list,
&st_arg->resume, redact_list_cb, &rlcba);
}
if (err != EINTR)
st_arg->error_code = err;
@ -2029,7 +2011,7 @@ create_begin_record(struct dmu_send_params *dspp, objset_t *os,
}
static void
setup_to_thread(struct send_thread_arg *to_arg, dsl_dataset_t *to_ds,
setup_to_thread(struct send_thread_arg *to_arg, objset_t *to_os,
dmu_sendstatus_t *dssp, uint64_t fromtxg, boolean_t rawok)
{
VERIFY0(bqueue_init(&to_arg->q, zfs_send_no_prefetch_queue_ff,
@ -2037,12 +2019,11 @@ setup_to_thread(struct send_thread_arg *to_arg, dsl_dataset_t *to_ds,
offsetof(struct send_range, ln)));
to_arg->error_code = 0;
to_arg->cancel = B_FALSE;
to_arg->ds = to_ds;
to_arg->os = to_os;
to_arg->fromtxg = fromtxg;
to_arg->flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA;
if (rawok)
to_arg->flags |= TRAVERSE_NO_DECRYPT;
to_arg->redaction_list = NULL;
to_arg->num_blocks_visited = &dssp->dss_blocks;
(void) thread_create(NULL, 0, send_traverse_thread, to_arg, 0,
curproc, TS_RUN, minclsyspri);
@ -2489,7 +2470,7 @@ dmu_send_impl(struct dmu_send_params *dspp)
nvlist_t *keynvl = NULL;
ASSERT(os->os_encrypted);
err = dsl_crypto_populate_key_nvlist(to_ds, ivset_guid,
err = dsl_crypto_populate_key_nvlist(os, ivset_guid,
&keynvl);
if (err != 0) {
fnvlist_free(nvl);
@ -2513,7 +2494,7 @@ dmu_send_impl(struct dmu_send_params *dspp)
goto out;
}
setup_to_thread(to_arg, to_ds, dssp, fromtxg, dspp->rawok);
setup_to_thread(to_arg, os, dssp, fromtxg, dspp->rawok);
setup_from_thread(from_arg, from_rl, dssp);
setup_redact_list_thread(rlt_arg, dspp, redact_rl, dssp);
setup_merge_thread(smt_arg, dspp, from_arg, to_arg, rlt_arg, os);

View File

@ -2381,11 +2381,11 @@ dsl_crypto_recv_raw(const char *poolname, uint64_t dsobj, uint64_t fromobj,
}
int
dsl_crypto_populate_key_nvlist(dsl_dataset_t *ds, uint64_t from_ivset_guid,
dsl_crypto_populate_key_nvlist(objset_t *os, uint64_t from_ivset_guid,
nvlist_t **nvl_out)
{
int ret;
objset_t *os;
dsl_dataset_t *ds = os->os_dsl_dataset;
dnode_t *mdn;
uint64_t rddobj;
nvlist_t *nvl = NULL;
@ -2403,7 +2403,6 @@ dsl_crypto_populate_key_nvlist(dsl_dataset_t *ds, uint64_t from_ivset_guid,
ASSERT(dckobj != 0);
VERIFY0(dmu_objset_from_ds(ds, &os));
mdn = DMU_META_DNODE(os);
nvl = fnvlist_alloc();