Fix race in dnode_check_slots_free()
Currently, dnode_check_slots_free() works by checking dn->dn_type in the dnode to determine whether the dnode is reclaimable. However, there is a small window of time — between dnode_free_sync() in the first call to dsl_dataset_sync() and when the user-accounting code is run — during which the type is set to DMU_OT_NONE but the dnode is not yet evictable, leading to crashes. This patch adds the ability for dnodes to track which txg they were last dirtied in, and adds a check for this before performing the reclaim. This patch also corrects several instances where dn_dirty_link was treated as a list_node_t when it is technically a multilist_node_t. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Tom Caputi <tcaputi@datto.com> Closes #7147 Closes #7388
This commit is contained in:
parent
1b0cd07131
commit
b32f1279d4
|
@ -161,6 +161,7 @@ extern "C" {
|
||||||
* dn_allocated_txg
|
* dn_allocated_txg
|
||||||
* dn_free_txg
|
* dn_free_txg
|
||||||
* dn_assigned_txg
|
* dn_assigned_txg
|
||||||
|
* dn_dirty_txg
|
||||||
* dd_assigned_tx
|
* dd_assigned_tx
|
||||||
* dn_notxholds
|
* dn_notxholds
|
||||||
* dn_dirtyctx
|
* dn_dirtyctx
|
||||||
|
|
|
@ -260,6 +260,7 @@ struct dnode {
|
||||||
uint64_t dn_allocated_txg;
|
uint64_t dn_allocated_txg;
|
||||||
uint64_t dn_free_txg;
|
uint64_t dn_free_txg;
|
||||||
uint64_t dn_assigned_txg;
|
uint64_t dn_assigned_txg;
|
||||||
|
uint64_t dn_dirty_txg; /* txg dnode was last dirtied */
|
||||||
kcondvar_t dn_notxholds;
|
kcondvar_t dn_notxholds;
|
||||||
enum dnode_dirtycontext dn_dirtyctx;
|
enum dnode_dirtycontext dn_dirtyctx;
|
||||||
uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */
|
uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */
|
||||||
|
@ -362,6 +363,9 @@ void dnode_evict_dbufs(dnode_t *dn);
|
||||||
void dnode_evict_bonus(dnode_t *dn);
|
void dnode_evict_bonus(dnode_t *dn);
|
||||||
void dnode_free_interior_slots(dnode_t *dn);
|
void dnode_free_interior_slots(dnode_t *dn);
|
||||||
|
|
||||||
|
#define DNODE_IS_DIRTY(_dn) \
|
||||||
|
((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa))
|
||||||
|
|
||||||
#define DNODE_IS_CACHEABLE(_dn) \
|
#define DNODE_IS_CACHEABLE(_dn) \
|
||||||
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
|
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
|
||||||
(DMU_OT_IS_METADATA((_dn)->dn_type) && \
|
(DMU_OT_IS_METADATA((_dn)->dn_type) && \
|
||||||
|
|
|
@ -1606,6 +1606,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
||||||
FTAG);
|
FTAG);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (tx->tx_txg > dn->dn_dirty_txg)
|
||||||
|
dn->dn_dirty_txg = tx->tx_txg;
|
||||||
mutex_exit(&dn->dn_mtx);
|
mutex_exit(&dn->dn_mtx);
|
||||||
|
|
||||||
if (db->db_blkid == DMU_SPILL_BLKID)
|
if (db->db_blkid == DMU_SPILL_BLKID)
|
||||||
|
|
|
@ -2044,7 +2044,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
|
||||||
* Check if dnode is dirty
|
* Check if dnode is dirty
|
||||||
*/
|
*/
|
||||||
for (i = 0; i < TXG_SIZE; i++) {
|
for (i = 0; i < TXG_SIZE; i++) {
|
||||||
if (list_link_active(&dn->dn_dirty_link[i])) {
|
if (multilist_link_active(&dn->dn_dirty_link[i])) {
|
||||||
clean = B_FALSE;
|
clean = B_FALSE;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1213,10 +1213,23 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
|
||||||
ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
|
ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
|
||||||
multilist_sublist_remove(list, dn);
|
multilist_sublist_remove(list, dn);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we are not doing useraccounting (os_synced_dnodes == NULL)
|
||||||
|
* we are done with this dnode for this txg. Unset dn_dirty_txg
|
||||||
|
* if later txgs aren't dirtying it so that future holders do
|
||||||
|
* not get a stale value. Otherwise, we will do this in
|
||||||
|
* userquota_updates_task() when processing has completely
|
||||||
|
* finished for this txg.
|
||||||
|
*/
|
||||||
multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
|
multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
|
||||||
if (newlist != NULL) {
|
if (newlist != NULL) {
|
||||||
(void) dnode_add_ref(dn, newlist);
|
(void) dnode_add_ref(dn, newlist);
|
||||||
multilist_insert(newlist, dn);
|
multilist_insert(newlist, dn);
|
||||||
|
} else {
|
||||||
|
mutex_enter(&dn->dn_mtx);
|
||||||
|
if (dn->dn_dirty_txg == tx->tx_txg)
|
||||||
|
dn->dn_dirty_txg = 0;
|
||||||
|
mutex_exit(&dn->dn_mtx);
|
||||||
}
|
}
|
||||||
|
|
||||||
dnode_sync(dn, tx);
|
dnode_sync(dn, tx);
|
||||||
|
@ -1621,6 +1634,8 @@ userquota_updates_task(void *arg)
|
||||||
dn->dn_id_flags |= DN_ID_CHKED_BONUS;
|
dn->dn_id_flags |= DN_ID_CHKED_BONUS;
|
||||||
}
|
}
|
||||||
dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
|
dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
|
||||||
|
if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa))
|
||||||
|
dn->dn_dirty_txg = 0;
|
||||||
mutex_exit(&dn->dn_mtx);
|
mutex_exit(&dn->dn_mtx);
|
||||||
|
|
||||||
multilist_sublist_remove(list, dn);
|
multilist_sublist_remove(list, dn);
|
||||||
|
|
|
@ -137,7 +137,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
|
||||||
bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
|
bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
|
||||||
|
|
||||||
for (i = 0; i < TXG_SIZE; i++) {
|
for (i = 0; i < TXG_SIZE; i++) {
|
||||||
list_link_init(&dn->dn_dirty_link[i]);
|
multilist_link_init(&dn->dn_dirty_link[i]);
|
||||||
dn->dn_free_ranges[i] = NULL;
|
dn->dn_free_ranges[i] = NULL;
|
||||||
list_create(&dn->dn_dirty_records[i],
|
list_create(&dn->dn_dirty_records[i],
|
||||||
sizeof (dbuf_dirty_record_t),
|
sizeof (dbuf_dirty_record_t),
|
||||||
|
@ -147,6 +147,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
|
||||||
dn->dn_allocated_txg = 0;
|
dn->dn_allocated_txg = 0;
|
||||||
dn->dn_free_txg = 0;
|
dn->dn_free_txg = 0;
|
||||||
dn->dn_assigned_txg = 0;
|
dn->dn_assigned_txg = 0;
|
||||||
|
dn->dn_dirty_txg = 0;
|
||||||
dn->dn_dirtyctx = 0;
|
dn->dn_dirtyctx = 0;
|
||||||
dn->dn_dirtyctx_firstset = NULL;
|
dn->dn_dirtyctx_firstset = NULL;
|
||||||
dn->dn_bonus = NULL;
|
dn->dn_bonus = NULL;
|
||||||
|
@ -184,7 +185,7 @@ dnode_dest(void *arg, void *unused)
|
||||||
ASSERT(!list_link_active(&dn->dn_link));
|
ASSERT(!list_link_active(&dn->dn_link));
|
||||||
|
|
||||||
for (i = 0; i < TXG_SIZE; i++) {
|
for (i = 0; i < TXG_SIZE; i++) {
|
||||||
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
|
ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
|
||||||
ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
|
ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
|
||||||
list_destroy(&dn->dn_dirty_records[i]);
|
list_destroy(&dn->dn_dirty_records[i]);
|
||||||
ASSERT0(dn->dn_next_nblkptr[i]);
|
ASSERT0(dn->dn_next_nblkptr[i]);
|
||||||
|
@ -199,6 +200,7 @@ dnode_dest(void *arg, void *unused)
|
||||||
ASSERT0(dn->dn_allocated_txg);
|
ASSERT0(dn->dn_allocated_txg);
|
||||||
ASSERT0(dn->dn_free_txg);
|
ASSERT0(dn->dn_free_txg);
|
||||||
ASSERT0(dn->dn_assigned_txg);
|
ASSERT0(dn->dn_assigned_txg);
|
||||||
|
ASSERT0(dn->dn_dirty_txg);
|
||||||
ASSERT0(dn->dn_dirtyctx);
|
ASSERT0(dn->dn_dirtyctx);
|
||||||
ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
|
ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
|
||||||
ASSERT3P(dn->dn_bonus, ==, NULL);
|
ASSERT3P(dn->dn_bonus, ==, NULL);
|
||||||
|
@ -523,6 +525,7 @@ dnode_destroy(dnode_t *dn)
|
||||||
dn->dn_allocated_txg = 0;
|
dn->dn_allocated_txg = 0;
|
||||||
dn->dn_free_txg = 0;
|
dn->dn_free_txg = 0;
|
||||||
dn->dn_assigned_txg = 0;
|
dn->dn_assigned_txg = 0;
|
||||||
|
dn->dn_dirty_txg = 0;
|
||||||
|
|
||||||
dn->dn_dirtyctx = 0;
|
dn->dn_dirtyctx = 0;
|
||||||
if (dn->dn_dirtyctx_firstset != NULL) {
|
if (dn->dn_dirtyctx_firstset != NULL) {
|
||||||
|
@ -592,6 +595,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
|
||||||
ASSERT0(dn->dn_maxblkid);
|
ASSERT0(dn->dn_maxblkid);
|
||||||
ASSERT0(dn->dn_allocated_txg);
|
ASSERT0(dn->dn_allocated_txg);
|
||||||
ASSERT0(dn->dn_assigned_txg);
|
ASSERT0(dn->dn_assigned_txg);
|
||||||
|
ASSERT0(dn->dn_dirty_txg);
|
||||||
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
|
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
|
||||||
ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
|
ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
|
||||||
ASSERT(avl_is_empty(&dn->dn_dbufs));
|
ASSERT(avl_is_empty(&dn->dn_dbufs));
|
||||||
|
@ -604,7 +608,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
|
||||||
ASSERT0(dn->dn_next_bonustype[i]);
|
ASSERT0(dn->dn_next_bonustype[i]);
|
||||||
ASSERT0(dn->dn_rm_spillblk[i]);
|
ASSERT0(dn->dn_rm_spillblk[i]);
|
||||||
ASSERT0(dn->dn_next_blksz[i]);
|
ASSERT0(dn->dn_next_blksz[i]);
|
||||||
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
|
ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
|
||||||
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
|
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
|
||||||
ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
|
ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
|
||||||
}
|
}
|
||||||
|
@ -779,6 +783,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
|
||||||
ndn->dn_allocated_txg = odn->dn_allocated_txg;
|
ndn->dn_allocated_txg = odn->dn_allocated_txg;
|
||||||
ndn->dn_free_txg = odn->dn_free_txg;
|
ndn->dn_free_txg = odn->dn_free_txg;
|
||||||
ndn->dn_assigned_txg = odn->dn_assigned_txg;
|
ndn->dn_assigned_txg = odn->dn_assigned_txg;
|
||||||
|
ndn->dn_dirty_txg = odn->dn_dirty_txg;
|
||||||
ndn->dn_dirtyctx = odn->dn_dirtyctx;
|
ndn->dn_dirtyctx = odn->dn_dirtyctx;
|
||||||
ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
|
ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
|
||||||
ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
|
ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
|
||||||
|
@ -845,6 +850,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
|
||||||
odn->dn_allocated_txg = 0;
|
odn->dn_allocated_txg = 0;
|
||||||
odn->dn_free_txg = 0;
|
odn->dn_free_txg = 0;
|
||||||
odn->dn_assigned_txg = 0;
|
odn->dn_assigned_txg = 0;
|
||||||
|
odn->dn_dirty_txg = 0;
|
||||||
odn->dn_dirtyctx = 0;
|
odn->dn_dirtyctx = 0;
|
||||||
odn->dn_dirtyctx_firstset = NULL;
|
odn->dn_dirtyctx_firstset = NULL;
|
||||||
odn->dn_have_spill = B_FALSE;
|
odn->dn_have_spill = B_FALSE;
|
||||||
|
@ -1069,6 +1075,10 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
|
||||||
{
|
{
|
||||||
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
|
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If all dnode slots are either already free or
|
||||||
|
* evictable return B_TRUE.
|
||||||
|
*/
|
||||||
for (int i = idx; i < idx + slots; i++) {
|
for (int i = idx; i < idx + slots; i++) {
|
||||||
dnode_handle_t *dnh = &children->dnc_children[i];
|
dnode_handle_t *dnh = &children->dnc_children[i];
|
||||||
dnode_t *dn = dnh->dnh_dnode;
|
dnode_t *dn = dnh->dnh_dnode;
|
||||||
|
@ -1077,18 +1087,17 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
|
||||||
continue;
|
continue;
|
||||||
} else if (DN_SLOT_IS_PTR(dn)) {
|
} else if (DN_SLOT_IS_PTR(dn)) {
|
||||||
mutex_enter(&dn->dn_mtx);
|
mutex_enter(&dn->dn_mtx);
|
||||||
dmu_object_type_t type = dn->dn_type;
|
boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
|
||||||
|
!DNODE_IS_DIRTY(dn));
|
||||||
mutex_exit(&dn->dn_mtx);
|
mutex_exit(&dn->dn_mtx);
|
||||||
|
|
||||||
if (type != DMU_OT_NONE)
|
if (!can_free)
|
||||||
return (B_FALSE);
|
return (B_FALSE);
|
||||||
|
else
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
return (B_FALSE);
|
return (B_FALSE);
|
||||||
}
|
}
|
||||||
|
|
||||||
return (B_FALSE);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return (B_TRUE);
|
return (B_TRUE);
|
||||||
|
@ -1594,7 +1603,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
|
||||||
/*
|
/*
|
||||||
* If we are already marked dirty, we're done.
|
* If we are already marked dirty, we're done.
|
||||||
*/
|
*/
|
||||||
if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
|
if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
|
||||||
multilist_sublist_unlock(mls);
|
multilist_sublist_unlock(mls);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue