Fix dnode allocation race
When performing concurrent object allocations using the new multi-threaded allocator and large dnodes it's possible to allocate overlapping large dnodes. This case should have been handled by detecting an error returned by dnode_hold_impl(). But that logic only checked the returned dnp was not-NULL, and the dnp variable was not reset to NULL when retrying. Resolve this issue by properly checking the return value of dnode_hold_impl(). Additionally, it was possible that dnode_hold_impl() would misreport a dnode as free when it was in fact in use. This could occurs for two reasons: * The per-slot zrl_lock must be held over the entire critical section which includes the alloc/free until the new dnode is assigned to children_dnodes. Additionally, all of the zrl_lock's in the range must be held to protect moving dnodes. * The dn->dn_ot_type cannot be solely relied upon to check the type. When allocating a new dnode its type will be DMU_OT_NONE after dnode_create(). Only latter when dnode_allocate() is called will it transition to the new type. This means there's a window when allocating where it can mistaken for a free dnode. Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov> Reviewed-by: Ned Bass <bass6@llnl.gov> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Olaf Faaland <faaland1@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #6414 Closes #6439
This commit is contained in:
parent
d19a6d5c80
commit
9631681b75
|
@ -61,6 +61,7 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
|
|||
boolean_t restarted = B_FALSE;
|
||||
uint64_t *cpuobj = NULL;
|
||||
int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
|
||||
int error;
|
||||
|
||||
kpreempt_disable();
|
||||
cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
|
||||
|
@ -129,7 +130,6 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
|
|||
uint64_t offset;
|
||||
uint64_t blkfill;
|
||||
int minlvl;
|
||||
int error;
|
||||
if (os->os_rescan_dnodes) {
|
||||
offset = 0;
|
||||
os->os_rescan_dnodes = B_FALSE;
|
||||
|
@ -163,9 +163,9 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
|
|||
* dmu_tx_assign(), but there is currently no mechanism
|
||||
* to do so.
|
||||
*/
|
||||
(void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
|
||||
error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
|
||||
dn_slots, FTAG, &dn);
|
||||
if (dn != NULL) {
|
||||
if (error == 0) {
|
||||
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
|
||||
/*
|
||||
* Another thread could have allocated it; check
|
||||
|
|
|
@ -1242,11 +1242,13 @@ dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
|
|||
void
|
||||
dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
|
||||
{
|
||||
dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
|
||||
tx->tx_objset, object, THT_SPILL, 0, 0);
|
||||
dmu_tx_hold_t *txh;
|
||||
|
||||
(void) refcount_add_many(&txh->txh_space_towrite,
|
||||
SPA_OLD_MAXBLOCKSIZE, FTAG);
|
||||
txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
|
||||
THT_SPILL, 0, 0);
|
||||
if (txh != NULL)
|
||||
(void) refcount_add_many(&txh->txh_space_towrite,
|
||||
SPA_OLD_MAXBLOCKSIZE, FTAG);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -391,7 +391,7 @@ dnode_setdblksz(dnode_t *dn, int size)
|
|||
}
|
||||
|
||||
static dnode_t *
|
||||
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
|
||||
dnode_create(objset_t *os, dnode_phys_t *dnp, int slots, dmu_buf_impl_t *db,
|
||||
uint64_t object, dnode_handle_t *dnh)
|
||||
{
|
||||
dnode_t *dn;
|
||||
|
@ -424,11 +424,15 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
|
|||
dn->dn_compress = dnp->dn_compress;
|
||||
dn->dn_bonustype = dnp->dn_bonustype;
|
||||
dn->dn_bonuslen = dnp->dn_bonuslen;
|
||||
dn->dn_num_slots = dnp->dn_extra_slots + 1;
|
||||
dn->dn_maxblkid = dnp->dn_maxblkid;
|
||||
dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
|
||||
dn->dn_id_flags = 0;
|
||||
|
||||
if (slots && dn->dn_type == DMU_OT_NONE)
|
||||
dn->dn_num_slots = slots;
|
||||
else
|
||||
dn->dn_num_slots = dnp->dn_extra_slots + 1;
|
||||
|
||||
dmu_zfetch_init(&dn->dn_zfetch, dn);
|
||||
|
||||
ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
|
||||
|
@ -1011,7 +1015,7 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
|
|||
{
|
||||
dnode_t *dn;
|
||||
|
||||
dn = dnode_create(os, dnp, NULL, object, dnh);
|
||||
dn = dnode_create(os, dnp, 0, NULL, object, dnh);
|
||||
zrl_init(&dnh->dnh_zrlock);
|
||||
DNODE_VERIFY(dn);
|
||||
}
|
||||
|
@ -1062,36 +1066,26 @@ dnode_buf_evict_async(void *dbu)
|
|||
*
|
||||
* The dnode_phys_t buffer may not be in sync with the in-core dnode
|
||||
* structure, so we try to check the dnode structure first and fall back
|
||||
* to the dnode_phys_t buffer it doesn't exist.
|
||||
* to the dnode_phys_t buffer it doesn't exist. When an in-code dnode
|
||||
* exists we can always trust dn->dn_num_slots to be accurate, even for
|
||||
* a held dnode which has not yet been fully allocated.
|
||||
*/
|
||||
static boolean_t
|
||||
dnode_is_consumed(dmu_buf_impl_t *db, int idx)
|
||||
dnode_is_consumed(dnode_children_t *children, dnode_phys_t *dn_block, int idx)
|
||||
{
|
||||
dnode_handle_t *dnh;
|
||||
dmu_object_type_t ot;
|
||||
dnode_children_t *children_dnodes;
|
||||
dnode_phys_t *dn_block;
|
||||
int skip;
|
||||
int i;
|
||||
|
||||
children_dnodes = dmu_buf_get_user(&db->db);
|
||||
dn_block = (dnode_phys_t *)db->db.db_data;
|
||||
int skip, i;
|
||||
|
||||
for (i = 0; i < idx; i += skip) {
|
||||
dnh = &children_dnodes->dnc_children[i];
|
||||
dnode_handle_t *dnh = &children->dnc_children[i];
|
||||
|
||||
zrl_add(&dnh->dnh_zrlock);
|
||||
if (dnh->dnh_dnode != NULL) {
|
||||
ot = dnh->dnh_dnode->dn_type;
|
||||
skip = dnh->dnh_dnode->dn_num_slots;
|
||||
} else {
|
||||
ot = dn_block[i].dn_type;
|
||||
skip = dn_block[i].dn_extra_slots + 1;
|
||||
if (dn_block[i].dn_type != DMU_OT_NONE)
|
||||
skip = dn_block[i].dn_extra_slots + 1;
|
||||
else
|
||||
skip = 1;
|
||||
}
|
||||
zrl_remove(&dnh->dnh_zrlock);
|
||||
|
||||
if (ot == DMU_OT_NONE)
|
||||
skip = 1;
|
||||
}
|
||||
|
||||
return (i > idx);
|
||||
|
@ -1107,27 +1101,19 @@ dnode_is_consumed(dmu_buf_impl_t *db, int idx)
|
|||
* to the dnode_phys_t buffer it doesn't exist.
|
||||
*/
|
||||
static boolean_t
|
||||
dnode_is_allocated(dmu_buf_impl_t *db, int idx)
|
||||
dnode_is_allocated(dnode_children_t *children, dnode_phys_t *dn_block, int idx)
|
||||
{
|
||||
dnode_handle_t *dnh;
|
||||
dmu_object_type_t ot;
|
||||
dnode_children_t *children_dnodes;
|
||||
dnode_phys_t *dn_block;
|
||||
|
||||
if (dnode_is_consumed(db, idx))
|
||||
if (dnode_is_consumed(children, dn_block, idx))
|
||||
return (B_FALSE);
|
||||
|
||||
children_dnodes = dmu_buf_get_user(&db->db);
|
||||
dn_block = (dnode_phys_t *)db->db.db_data;
|
||||
|
||||
dnh = &children_dnodes->dnc_children[idx];
|
||||
|
||||
zrl_add(&dnh->dnh_zrlock);
|
||||
dnh = &children->dnc_children[idx];
|
||||
if (dnh->dnh_dnode != NULL)
|
||||
ot = dnh->dnh_dnode->dn_type;
|
||||
else
|
||||
ot = dn_block[idx].dn_type;
|
||||
zrl_remove(&dnh->dnh_zrlock);
|
||||
|
||||
return (ot != DMU_OT_NONE);
|
||||
}
|
||||
|
@ -1142,32 +1128,27 @@ dnode_is_allocated(dmu_buf_impl_t *db, int idx)
|
|||
* to the dnode_phys_t buffer it doesn't exist.
|
||||
*/
|
||||
static boolean_t
|
||||
dnode_is_free(dmu_buf_impl_t *db, int idx, int slots)
|
||||
dnode_is_free(dnode_children_t *children, dnode_phys_t *dn_block, int idx,
|
||||
int slots)
|
||||
{
|
||||
dnode_handle_t *dnh;
|
||||
dmu_object_type_t ot;
|
||||
dnode_children_t *children_dnodes;
|
||||
dnode_phys_t *dn_block;
|
||||
int i;
|
||||
|
||||
if (idx + slots > DNODES_PER_BLOCK)
|
||||
return (B_FALSE);
|
||||
|
||||
children_dnodes = dmu_buf_get_user(&db->db);
|
||||
dn_block = (dnode_phys_t *)db->db.db_data;
|
||||
|
||||
if (dnode_is_consumed(db, idx))
|
||||
if (dnode_is_consumed(children, dn_block, idx))
|
||||
return (B_FALSE);
|
||||
|
||||
for (i = idx; i < idx + slots; i++) {
|
||||
dnh = &children_dnodes->dnc_children[i];
|
||||
for (int i = idx; i < idx + slots; i++) {
|
||||
dnode_handle_t *dnh = &children->dnc_children[i];
|
||||
dmu_object_type_t ot;
|
||||
|
||||
if (dnh->dnh_dnode != NULL) {
|
||||
if (dnh->dnh_dnode->dn_num_slots > 1)
|
||||
return (B_FALSE);
|
||||
|
||||
zrl_add(&dnh->dnh_zrlock);
|
||||
if (dnh->dnh_dnode != NULL)
|
||||
ot = dnh->dnh_dnode->dn_type;
|
||||
else
|
||||
} else {
|
||||
ot = dn_block[i].dn_type;
|
||||
zrl_remove(&dnh->dnh_zrlock);
|
||||
}
|
||||
|
||||
if (ot != DMU_OT_NONE)
|
||||
return (B_FALSE);
|
||||
|
@ -1176,6 +1157,24 @@ dnode_is_free(dmu_buf_impl_t *db, int idx, int slots)
|
|||
return (B_TRUE);
|
||||
}
|
||||
|
||||
static void
|
||||
dnode_hold_slots(dnode_children_t *children, int idx, int slots)
|
||||
{
|
||||
for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) {
|
||||
dnode_handle_t *dnh = &children->dnc_children[i];
|
||||
zrl_add(&dnh->dnh_zrlock);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
dnode_rele_slots(dnode_children_t *children, int idx, int slots)
|
||||
{
|
||||
for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) {
|
||||
dnode_handle_t *dnh = &children->dnc_children[i];
|
||||
zrl_remove(&dnh->dnh_zrlock);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* errors:
|
||||
* EINVAL - invalid object number.
|
||||
|
@ -1286,36 +1285,42 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
|
|||
idx = object & (epb - 1);
|
||||
dn_block_begin = (dnode_phys_t *)db->db.db_data;
|
||||
|
||||
if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) {
|
||||
dnode_hold_slots(children_dnodes, idx, slots);
|
||||
|
||||
if ((flag & DNODE_MUST_BE_FREE) &&
|
||||
!dnode_is_free(children_dnodes, dn_block_begin, idx, slots)) {
|
||||
dnode_rele_slots(children_dnodes, idx, slots);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (SET_ERROR(ENOSPC));
|
||||
} else if ((flag & DNODE_MUST_BE_ALLOCATED) &&
|
||||
!dnode_is_allocated(db, idx)) {
|
||||
!dnode_is_allocated(children_dnodes, dn_block_begin, idx)) {
|
||||
dnode_rele_slots(children_dnodes, idx, slots);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (SET_ERROR(ENOENT));
|
||||
}
|
||||
|
||||
dnh = &children_dnodes->dnc_children[idx];
|
||||
zrl_add(&dnh->dnh_zrlock);
|
||||
dn = dnh->dnh_dnode;
|
||||
if (dn == NULL)
|
||||
dn = dnode_create(os, dn_block_begin + idx, db, object, dnh);
|
||||
dn = dnode_create(os, dn_block_begin + idx, slots, db,
|
||||
object, dnh);
|
||||
|
||||
mutex_enter(&dn->dn_mtx);
|
||||
type = dn->dn_type;
|
||||
if (dn->dn_free_txg ||
|
||||
((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) {
|
||||
mutex_exit(&dn->dn_mtx);
|
||||
zrl_remove(&dnh->dnh_zrlock);
|
||||
dnode_rele_slots(children_dnodes, idx, slots);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (SET_ERROR(type == DMU_OT_NONE ? ENOENT : EEXIST));
|
||||
}
|
||||
if (refcount_add(&dn->dn_holds, tag) == 1)
|
||||
dbuf_add_ref(db, dnh);
|
||||
|
||||
mutex_exit(&dn->dn_mtx);
|
||||
|
||||
/* Now we can rely on the hold to prevent the dnode from moving. */
|
||||
zrl_remove(&dnh->dnh_zrlock);
|
||||
dnode_rele_slots(children_dnodes, idx, slots);
|
||||
|
||||
DNODE_VERIFY(dn);
|
||||
ASSERT3P(dn->dn_dbuf, ==, db);
|
||||
|
|
|
@ -675,7 +675,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
|
|||
dmu_buf_t *db;
|
||||
mzap_phys_t *zp;
|
||||
|
||||
VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
|
||||
VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
|
||||
|
||||
#ifdef ZFS_DEBUG
|
||||
{
|
||||
|
|
|
@ -763,7 +763,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
|
|||
}
|
||||
|
||||
zh = zfs_znode_hold_enter(zfsvfs, obj);
|
||||
VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
|
||||
VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
|
||||
|
||||
/*
|
||||
* If this is the root, fix up the half-initialized parent pointer
|
||||
|
|
|
@ -364,7 +364,7 @@ tests = ['async_destroy_001_pos']
|
|||
[tests/functional/features/large_dnode]
|
||||
tests = ['large_dnode_001_pos', 'large_dnode_002_pos', 'large_dnode_003_pos',
|
||||
'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_006_pos',
|
||||
'large_dnode_007_neg']
|
||||
'large_dnode_007_neg', 'large_dnode_008_pos']
|
||||
|
||||
[tests/functional/grow_pool]
|
||||
tests = ['grow_pool_001_pos']
|
||||
|
|
|
@ -8,4 +8,5 @@ dist_pkgdata_SCRIPTS = \
|
|||
large_dnode_004_neg.ksh \
|
||||
large_dnode_005_pos.ksh \
|
||||
large_dnode_006_pos.ksh \
|
||||
large_dnode_007_neg.ksh
|
||||
large_dnode_007_neg.ksh \
|
||||
large_dnode_008_pos.ksh
|
||||
|
|
|
@ -0,0 +1,60 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or http://www.opensolaris.org/os/licensing.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
|
||||
# Use is subject to license terms.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Run many xattrtests on a dataset with large dnodes and xattr=sa to
|
||||
# stress concurrent allocation of large dnodes.
|
||||
#
|
||||
|
||||
TEST_FS=$TESTPOOL/large_dnode
|
||||
|
||||
verify_runnable "both"
|
||||
|
||||
function cleanup
|
||||
{
|
||||
datasetexists $TEST_FS && log_must zfs destroy $TEST_FS
|
||||
}
|
||||
|
||||
log_onexit cleanup
|
||||
log_assert "xattrtest runs concurrently on dataset with large dnodes"
|
||||
|
||||
log_must zfs create $TEST_FS
|
||||
log_must zfs set dnsize=auto $TEST_FS
|
||||
log_must zfs set xattr=sa $TEST_FS
|
||||
|
||||
for ((i=0; i < 100; i++)); do
|
||||
dir="/$TEST_FS/dir.$i"
|
||||
log_must mkdir "$dir"
|
||||
log_must eval "xattrtest -R -r -y -x 1 -f 1024 -k -p $dir &"
|
||||
done
|
||||
|
||||
log_must wait
|
||||
|
||||
log_pass
|
Loading…
Reference in New Issue