dnode_next_offset: backtrack if lower level does not match

This changes the basic search algorithm from a single search up and down
the tree to a full depth-first traversal, in order to handle conditions
where the tree matches at a higher level but not at a lower level.

Normally higher level blocks always point to matching blocks, but there
are cases where this does not happen:

1. Racing block pointer updates from dbuf_write_ready.

   Before f664f1ee7f (#8946), both dbuf_write_ready and
   dnode_next_offset held dn_struct_rwlock which protected against
   pointer writes from concurrent syncs.

   This no longer applies, so sync context can, for example, clear or
   fill all L1->L0 BPs before the L2->L1 BP and higher-level BPs are
   updated.

   dnode_free_range in particular can reach this case and skip over L1
   blocks that need to be dirtied. Later, sync will panic in
   free_children when trying to clear a non-dirty indirect block.

   This case was found with ztest.

2. txg > 0, non-hole case. This is #11196.

   Freeing blocks/dnodes breaks the assumption that a match at a higher
   level implies a match at a lower level when filtering txg > 0.

   Whenever some but not all L0 blocks are freed, the parent L1 block is
   rewritten. Its updated L2->L1 BP reflects a newer birth txg.

   Later, when searching by txg, the L1 block matches because its birth
   txg is newer, yet it is possible that none of the remaining L1->L0
   BPs match because none of them were updated after txg.

   The same behavior is possible for the dnode search at L0.

   This is reachable from dsl_destroy_head for synchronous freeing.
   When this happens, open context fails to free objects, leaving sync
   context stuck freeing potentially many objects.

   This is also reachable from traverse_pool for extreme rewind, where
   it is theoretically possible that datasets not dirtied after txg are
   skipped if the MOS has high enough indirection to trigger this case.

In both of these cases, without backtracking the search ends prematurely,
since an ESRCH result implies there are no more matches in the entire
object.
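
To make case 2 concrete, here is a minimal, self-contained toy model (not
ZFS code; the names and the top-down scan order are simplifications of
the real widen-then-narrow search): the parent's birth txg says
"something newer here", but every surviving child entry is older or
freed, so the search must step past that parent slot and continue rather
than give up.

    #include <stdio.h>
    #include <stdint.h>

    /* Toy tree: 4 L1 "blocks", each with 4 L0 birth txgs (0 = freed). */
    #define L1_COUNT  4
    #define L0_PER_L1 4

    static uint64_t l0_birth[L1_COUNT][L0_PER_L1] = {
        { 5, 5, 5, 5 },     /* untouched since txg 5 */
        { 0, 0, 0, 0 },     /* fully freed at txg 12: L1 rewritten, but
                               no surviving L0 is newer than txg 10 */
        { 15, 5, 5, 5 },    /* the match we should find */
        { 5, 5, 5, 5 },
    };
    /* L2->L1 birth txgs: newer whenever the child L1 was rewritten. */
    static uint64_t l1_birth[L1_COUNT] = { 5, 12, 15, 5 };

    /* Find the first L0 written after txg. */
    static int
    find_l0(uint64_t txg, int backtrack, int *l1p, int *l0p)
    {
        for (int i = 0; i < L1_COUNT; i++) {
            if (l1_birth[i] <= txg)
                continue;          /* parent filters this range */
            for (int j = 0; j < L0_PER_L1; j++) {
                if (l0_birth[i][j] > txg) {
                    *l1p = i;
                    *l0p = j;
                    return (0);
                }
            }
            if (!backtrack)
                return (-1);       /* old behavior: ESRCH ends search */
            /* new behavior: keep going at the parent level */
        }
        return (-1);
    }

    int
    main(void)
    {
        int i, j;

        printf("single pass:  %s\n",
            find_l0(10, 0, &i, &j) == 0 ? "found" : "ESRCH (wrong)");
        if (find_l0(10, 1, &i, &j) == 0)
            printf("backtracking: found L1 %d, L0 %d\n", i, j);
        return (0);
    }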

Signed-off-by: Robert Evans <evansr@google.com>

@@ -2498,13 +2498,18 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
  * If we don't find what we are looking for in the block, we return ESRCH.
  * Otherwise, return with *offset pointing to the beginning (if searching
  * forwards) or end (if searching backwards) of the range covered by the
- * block pointer we matched on (or dnode).
+ * block pointer we matched on (or dnode) but never less (or greater) than
+ * the starting offset.
  *
- * The basic search algorithm used below by dnode_next_offset() is to
- * use this function to search up the block tree (widen the search) until
- * we find something (i.e., we don't return ESRCH) and then search back
- * down the tree (narrow the search) until we reach our original search
- * level.
+ * For ESRCH, *offset is set to the first byte offset after (or before) the
+ * searched block unless the block is a hole or the resulting offset would
+ * underflow or overflow (in both cases the starting *offset is unchanged).
+ *
+ * The basic search algorithm used below by dnode_next_offset() uses this
+ * function to perform a block-order tree traversal. We search up the block
+ * tree (widen the search) until we find something (i.e., we don't return
+ * ESRCH) and then search back down the tree (narrow the search) until we
+ * reach our original search level or backtrack up because nothing matches.
  */
 static int
 dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
@@ -2519,6 +2524,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
     int i, inc, error, span;
 
     ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+    ASSERT3U(dn->dn_nlevels, >, 0);
 
     hole = ((flags & DNODE_FIND_HOLE) != 0);
     inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
@@ -2569,24 +2575,29 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
         ASSERT(dn->dn_type == DMU_OT_DNODE);
         ASSERT(!(flags & DNODE_FIND_BACKWARDS));
+        ASSERT3U(P2PHASE(*offset, DNODE_SHIFT), ==, 0);
+        ASSERT(ISP2(blkfill));
 
-        for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
+        for (i = P2PHASE(*offset >> DNODE_SHIFT, blkfill);
             i < blkfill; i += dnp[i].dn_extra_slots + 1) {
             if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
                 break;
+            ASSERT3S(i + dnp[i].dn_extra_slots, <, blkfill);
         }
 
-        if (i == blkfill)
+        if (i >= blkfill)
             error = SET_ERROR(ESRCH);
 
-        *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
+        *offset = P2ALIGN(*offset, DNODE_BLOCK_SIZE) +
             (i << DNODE_SHIFT);
     } else {
         blkptr_t *bp = data;
-        uint64_t start = *offset;
+        uint64_t blkid, limit;
         span = (lvl - 1) * epbs + dn->dn_datablkshift;
         minfill = 0;
         maxfill = blkfill << ((lvl - 1) * epbs);
+        ASSERT3S(span, >, 0);
+        ASSERT3U(maxfill, >, 0);
 
         if (hole)
             maxfill--;
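
The P2PHASE/P2ALIGN rewrites above are behavior-preserving for the
power-of-two sizes involved, which the new ISP2(blkfill) assert pins
down. A quick standalone check, using the same macro definitions as the
ZFS sysmacros.h headers (DNODE_SHIFT is 9 and DNODE_BLOCK_SIZE is 16K
upstream; the values here are only for illustration):

    #include <assert.h>
    #include <stdint.h>

    /* Same definitions as the sysmacros.h versions. */
    #define P2ALIGN(x, align)   ((x) & -(align))
    #define P2PHASE(x, align)   ((x) & ((align) - 1))

    int
    main(void)
    {
        uint64_t off = 0x25678;
        uint64_t blkfill = 32;          /* must be a power of two */

        /* P2PHASE matches the old open-coded mask... */
        assert(P2PHASE(off >> 9, blkfill) == ((off >> 9) & (blkfill - 1)));
        /* ...and P2ALIGN matches the old "& ~(size - 1)" form. */
        assert(P2ALIGN(off, (uint64_t)16384) ==
            (off & ~((uint64_t)16384 - 1)));
        return (0);
    }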
@@ -2595,40 +2606,46 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
         if (span >= 8 * sizeof (*offset)) {
             /* This only happens on the highest indirection level */
-            ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
-            *offset = 0;
-        } else {
-            *offset = *offset >> span;
+            ASSERT3U(lvl, ==, dn->dn_nlevels);
+            goto out;
         }
 
-        for (i = BF64_GET(*offset, 0, epbs);
+        blkid = *offset >> span;
+        limit = 1ULL << (8 * sizeof (*offset) - span);
+        epb = MIN(epb, limit); /* don't overflow *offset */
+        ASSERT3U(P2ALIGN(blkid, 1ULL << epbs) + epb, <=, limit);
+        if (inc < 0 && lvl == dn->dn_nlevels)
+            blkid = MIN(epb - 1, blkid);
+
+        for (i = BF64_GET(blkid, 0, epbs);
             i >= 0 && i < epb; i += inc) {
             if (BP_GET_FILL(&bp[i]) >= minfill &&
                 BP_GET_FILL(&bp[i]) <= maxfill &&
                 (hole || bp[i].blk_birth > txg))
                 break;
-            if (inc > 0 || *offset > 0)
-                *offset += inc;
+            if (inc > 0 || blkid > 0)
+                blkid += inc;
         }
 
-        if (span >= 8 * sizeof (*offset)) {
-            *offset = start;
-        } else {
-            *offset = *offset << span;
-        }
+        ASSERT(i >= 0 || inc < 0);
+        ASSERT(blkid < limit || (inc > 0 && i >= epb));
 
-        if (inc < 0) {
-            /* traversing backwards; position offset at the end */
-            if (span < 8 * sizeof (*offset))
-                *offset = MIN(*offset + (1ULL << span) - 1,
-                    start);
-        } else if (*offset < start) {
-            *offset = start;
+        /* set *offset unless matched same block or under/overflow */
+        if (blkid != (*offset >> span) && blkid < limit &&
+            (i >= 0 || blkid > 0)) {
+            /* position offset at end if traversing backwards */
+            uint64_t endoff = inc < 0 ? 1 : 0;
+            uint64_t result = ((blkid + endoff) << span) - endoff;
+            ASSERT(inc > 0 ? result > *offset : result < *offset);
+            *offset = result;
         }
 
         if (i < 0 || i >= epb)
             error = SET_ERROR(ESRCH);
     }
 
+out:
     if (db != NULL) {
         rw_exit(&db->db_rwlock);
         dbuf_rele(db, FTAG);
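
The endoff trick above computes both direction cases with one formula:
forward searches land on the first byte of the matched range, backward
searches on its last byte. A standalone sketch with assumed geometry
(128K data blocks, 128K indirect blocks, so epbs = 10 and span = 27 at
lvl = 2; none of these values come from the diff itself):

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* Assumed geometry: datablkshift 17, epbs 10, lvl 2. */
        int span = (2 - 1) * 10 + 17;           /* each BP covers 2^27 bytes */
        uint64_t limit = 1ULL << (64 - span);   /* first invalid blkid */
        uint64_t blkid = 5;

        for (int endoff = 0; endoff <= 1; endoff++) {
            /* endoff 0: forward, start of block; 1: backward, last byte */
            uint64_t result = ((blkid + endoff) << span) - endoff;
            printf("endoff %d -> %#llx\n", endoff,
                (unsigned long long)result);
        }
        printf("%#llx valid blkids at this level\n",
            (unsigned long long)limit);
        return (0);
    }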
@@ -2637,6 +2654,32 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
     return (error);
 }
 
+/*
+ * Adjust *offset to the next (or previous) block byte offset at lvl.
+ * Returns FALSE if *offset would overflow or underflow.
+ */
+static boolean_t
+dnode_next_block(dnode_t *dn, boolean_t back, uint64_t *offset, int lvl)
+{
+    int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+    int span = lvl * epbs + dn->dn_datablkshift;
+    uint64_t blkid, limit;
+
+    if (span >= 8 * sizeof (uint64_t))
+        return (B_FALSE);
+
+    blkid = *offset >> span;
+    limit = 1ULL << (8 * sizeof (*offset) - span);
+    if (!back && blkid + 1 < limit)
+        *offset = (blkid + 1) << span;
+    else if (back && blkid > 0)
+        *offset = (blkid << span) - 1;
+    else
+        return (B_FALSE);
+
+    return (B_TRUE);
+}
+
 /*
  * Find the next hole, data, or sparse region at or after *offset.
  * The value 'blkfill' tells us how many items we expect to find
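
A worked instance of the helper's stepping arithmetic, with the same
assumed geometry as the sketch above. From an offset inside the first
L1-covered range, a forward step lands on the first byte of the next L1
range; a backward step from block 0 would underflow, so the helper
reports B_FALSE and the caller stops widening:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* Assumed: indblkshift 17, SPA_BLKPTRSHIFT 7 -> epbs 10. */
        int lvl = 1, span = lvl * 10 + 17;  /* one L1 covers 2^27 bytes */
        uint64_t offset = 0x25000;          /* inside L1 block 0 */
        uint64_t blkid = offset >> span;    /* 0 */
        uint64_t limit = 1ULL << (64 - span);

        if (blkid + 1 < limit)              /* forward: next L1 range */
            printf("forward  -> %#llx\n",
                (unsigned long long)((blkid + 1) << span));
        if (blkid == 0)                     /* backward would underflow */
            printf("backward -> B_FALSE (already at object start)\n");
        return (0);
    }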
@@ -2664,9 +2707,10 @@ int
 dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
     int minlvl, uint64_t blkfill, uint64_t txg)
 {
-    uint64_t initial_offset = *offset;
+    uint64_t matched = *offset;
     int lvl, maxlvl;
     int error = 0;
+    boolean_t back = ((flags & DNODE_FIND_BACKWARDS) != 0);
 
     if (!(flags & DNODE_FIND_HAVELOCK))
         rw_enter(&dn->dn_struct_rwlock, RW_READER);
@@ -2688,16 +2732,36 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
     maxlvl = dn->dn_phys->dn_nlevels;
 
-    for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+    for (lvl = minlvl; lvl <= maxlvl; ) {
         error = dnode_next_offset_level(dn,
             flags, offset, lvl, blkfill, txg);
-        if (error != ESRCH)
+
+        if (error == 0 && lvl > minlvl) {
+            --lvl;
+            matched = *offset;
+        } else if (error == ESRCH && lvl < maxlvl &&
+            dnode_next_block(dn, back, &matched, lvl)) {
+            /*
+             * Continue search at next/prev offset in lvl+1 block.
+             *
+             * Usually we only search upwards at the start of the
+             * search as higher level blocks point at a matching
+             * minlvl block in most cases, but we backtrack if not.
+             *
+             * This can happen for txg > 0 searches if the block
+             * contains only BPs/dnodes freed at that txg. It also
+             * happens if we are still syncing out the tree, and
+             * some BP's at higher levels are not updated yet.
+             *
+             * We must adjust offset to avoid coming back to the
+             * same offset and getting stuck looping forever. This
+             * also deals with the case where offset is already at
+             * the beginning or end of the object.
+             */
+            ++lvl;
+            *offset = matched;
+        } else {
             break;
+        }
     }
 
-    while (error == 0 && --lvl >= minlvl) {
-        error = dnode_next_offset_level(dn,
-            flags, offset, lvl, blkfill, txg);
-    }
-
     /*
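
To illustrate the rewritten driver loop, consider a hypothetical forward
search with minlvl = 1 and maxlvl = 3 that starts inside a stale L1
(newer L2->L1 birth txg, but no matching L0s behind it):

    lvl=1: ESRCH -> dnode_next_block() advances matched past this L1; ++lvl
    lvl=2: match -> --lvl, matched = *offset (descend into the L1 it found)
    lvl=1: match -> error == 0 at minlvl, break

The old code would also have widened to lvl=2, but without advancing the
offset it would descend back into the same stale L1 and return that
ESRCH, prematurely ending the caller's walk.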
@@ -2709,9 +2773,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
         error = 0;
     }
 
-    if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
-        initial_offset < *offset : initial_offset > *offset))
-        error = SET_ERROR(ESRCH);
 out:
     if (!(flags & DNODE_FIND_HAVELOCK))
         rw_exit(&dn->dn_struct_rwlock);
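
For context on why a premature ESRCH is so damaging (and why the
initial_offset check removed above is no longer needed once the level
code guarantees *offset only moves in the search direction), here is a
simplified sketch of a dmu_object_next()-style caller. It is
illustrative only; the real function also handles multi-slot dnodes and
start-offset alignment, and object_next_sketch is a hypothetical name:

    /*
     * Scan the meta-dnode for the next object allocated (or modified
     * after txg, when txg > 0). An ESRCH return is treated as "no more
     * objects in this objset", so a premature ESRCH from a stale
     * interior block silently ends the whole object walk.
     */
    int
    object_next_sketch(objset_t *os, uint64_t *objectp, uint64_t txg)
    {
        uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
        int error = dnode_next_offset(DMU_META_DNODE(os),
            0, &offset, 0, DNODES_PER_BLOCK, txg);

        *objectp = offset >> DNODE_SHIFT;
        return (error);
    }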