This commit is contained in:
Robert Evans 2024-09-10 00:43:42 +08:00 committed by GitHub
commit ae395be1e7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 101 additions and 40 deletions

View File

@ -2528,13 +2528,18 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
* If we don't find what we are looking for in the block, we return ESRCH. * If we don't find what we are looking for in the block, we return ESRCH.
* Otherwise, return with *offset pointing to the beginning (if searching * Otherwise, return with *offset pointing to the beginning (if searching
* forwards) or end (if searching backwards) of the range covered by the * forwards) or end (if searching backwards) of the range covered by the
* block pointer we matched on (or dnode). * block pointer we matched on (or dnode) but never less (or greater) than
* the starting offset.
* *
* The basic search algorithm used below by dnode_next_offset() is to * For ESRCH, *offset is set to the first byte offset after (or before) the
* use this function to search up the block tree (widen the search) until * searched block unless the block is a hole or the resulting offset would
* we find something (i.e., we don't return ESRCH) and then search back * underflow or overflow (in both cases the starting *offset is unchanged).
* down the tree (narrow the search) until we reach our original search *
* level. * The basic search algorithm used below by dnode_next_offset() uses this
* function to perform a block-order tree traversal. We search up the block
* tree (widen the search) until we find something (i.e., we don't return
* ESRCH) and then search back down the tree (narrow the search) until we
* reach our original search level or backtrack up because nothing matches.
*/ */
static int static int
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
@ -2549,6 +2554,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
int i, inc, error, span; int i, inc, error, span;
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT3U(dn->dn_nlevels, >, 0);
hole = ((flags & DNODE_FIND_HOLE) != 0); hole = ((flags & DNODE_FIND_HOLE) != 0);
inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
@ -2599,24 +2605,29 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
ASSERT(dn->dn_type == DMU_OT_DNODE); ASSERT(dn->dn_type == DMU_OT_DNODE);
ASSERT(!(flags & DNODE_FIND_BACKWARDS)); ASSERT(!(flags & DNODE_FIND_BACKWARDS));
ASSERT3U(P2PHASE(*offset, DNODE_SHIFT), ==, 0);
ASSERT(ISP2(blkfill));
for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1); for (i = P2PHASE(*offset >> DNODE_SHIFT, blkfill);
i < blkfill; i += dnp[i].dn_extra_slots + 1) { i < blkfill; i += dnp[i].dn_extra_slots + 1) {
if ((dnp[i].dn_type == DMU_OT_NONE) == hole) if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
break; break;
ASSERT3S(i + dnp[i].dn_extra_slots, <, blkfill);
} }
if (i == blkfill) if (i >= blkfill)
error = SET_ERROR(ESRCH); error = SET_ERROR(ESRCH);
*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) + *offset = P2ALIGN(*offset, DNODE_BLOCK_SIZE) +
(i << DNODE_SHIFT); (i << DNODE_SHIFT);
} else { } else {
blkptr_t *bp = data; blkptr_t *bp = data;
uint64_t start = *offset; uint64_t blkid, limit;
span = (lvl - 1) * epbs + dn->dn_datablkshift; span = (lvl - 1) * epbs + dn->dn_datablkshift;
minfill = 0; minfill = 0;
maxfill = blkfill << ((lvl - 1) * epbs); maxfill = blkfill << ((lvl - 1) * epbs);
ASSERT3S(span, >, 0);
ASSERT3U(maxfill, >, 0);
if (hole) if (hole)
maxfill--; maxfill--;
@ -2625,40 +2636,46 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
if (span >= 8 * sizeof (*offset)) { if (span >= 8 * sizeof (*offset)) {
/* This only happens on the highest indirection level */ /* This only happens on the highest indirection level */
ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1); ASSERT3U(lvl, ==, dn->dn_nlevels);
*offset = 0; goto out;
} else {
*offset = *offset >> span;
} }
for (i = BF64_GET(*offset, 0, epbs); blkid = *offset >> span;
limit = 1ULL << (8 * sizeof (*offset) - span);
epb = MIN(epb, limit); /* don't overflow *offset */
ASSERT3U(P2ALIGN(blkid, 1ULL << epbs) + epb, <=, limit);
if (inc < 0 && lvl == dn->dn_nlevels)
blkid = MIN(epb - 1, blkid);
for (i = BF64_GET(blkid, 0, epbs);
i >= 0 && i < epb; i += inc) { i >= 0 && i < epb; i += inc) {
if (BP_GET_FILL(&bp[i]) >= minfill && if (BP_GET_FILL(&bp[i]) >= minfill &&
BP_GET_FILL(&bp[i]) <= maxfill && BP_GET_FILL(&bp[i]) <= maxfill &&
(hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg)) (hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg))
break; break;
if (inc > 0 || *offset > 0) if (inc > 0 || blkid > 0)
*offset += inc; blkid += inc;
} }
if (span >= 8 * sizeof (*offset)) { ASSERT(i >= 0 || inc < 0);
*offset = start; ASSERT(blkid < limit || (inc > 0 && i >= epb));
} else {
*offset = *offset << span; /* set *offset unless matched same block or under/overflow */
if (blkid != (*offset >> span) && blkid < limit &&
(i >= 0 || blkid > 0)) {
/* position offset at end if traversing backwards */
uint64_t endoff = inc < 0 ? 1 : 0;
uint64_t result = ((blkid + endoff) << span) - endoff;
ASSERT(inc > 0 ? result > *offset : result < *offset);
*offset = result;
} }
if (inc < 0) {
/* traversing backwards; position offset at the end */
if (span < 8 * sizeof (*offset))
*offset = MIN(*offset + (1ULL << span) - 1,
start);
} else if (*offset < start) {
*offset = start;
}
if (i < 0 || i >= epb) if (i < 0 || i >= epb)
error = SET_ERROR(ESRCH); error = SET_ERROR(ESRCH);
} }
out:
if (db != NULL) { if (db != NULL) {
rw_exit(&db->db_rwlock); rw_exit(&db->db_rwlock);
dbuf_rele(db, FTAG); dbuf_rele(db, FTAG);
@ -2667,6 +2684,32 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
return (error); return (error);
} }
/*
 * Step *offset to the first byte of the next block (or, when walking
 * backwards, the last byte of the previous block) at level lvl.
 * Returns B_FALSE — leaving *offset untouched — if the step would
 * underflow or overflow the offset space.
 */
static boolean_t
dnode_next_block(dnode_t *dn, boolean_t back, uint64_t *offset, int lvl)
{
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	int span = lvl * epbs + dn->dn_datablkshift;
	uint64_t blkid, limit;

	/* A span this wide covers the entire offset range: nowhere to go. */
	if (span >= 8 * sizeof (uint64_t))
		return (B_FALSE);

	blkid = *offset >> span;
	limit = 1ULL << (8 * sizeof (*offset) - span);

	if (back) {
		/* Already at block zero: stepping back would underflow. */
		if (blkid == 0)
			return (B_FALSE);
		*offset = (blkid << span) - 1;
	} else {
		/* Last representable block: stepping forward would overflow. */
		if (blkid + 1 >= limit)
			return (B_FALSE);
		*offset = (blkid + 1) << span;
	}
	return (B_TRUE);
}
/* /*
* Find the next hole, data, or sparse region at or after *offset. * Find the next hole, data, or sparse region at or after *offset.
* The value 'blkfill' tells us how many items we expect to find * The value 'blkfill' tells us how many items we expect to find
@ -2694,9 +2737,10 @@ int
dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
int minlvl, uint64_t blkfill, uint64_t txg) int minlvl, uint64_t blkfill, uint64_t txg)
{ {
uint64_t initial_offset = *offset; uint64_t matched = *offset;
int lvl, maxlvl; int lvl, maxlvl;
int error = 0; int error = 0;
boolean_t back = ((flags & DNODE_FIND_BACKWARDS) != 0);
if (!(flags & DNODE_FIND_HAVELOCK)) if (!(flags & DNODE_FIND_HAVELOCK))
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
@ -2718,16 +2762,36 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
maxlvl = dn->dn_phys->dn_nlevels; maxlvl = dn->dn_phys->dn_nlevels;
for (lvl = minlvl; lvl <= maxlvl; lvl++) { for (lvl = minlvl; lvl <= maxlvl; ) {
error = dnode_next_offset_level(dn, error = dnode_next_offset_level(dn,
flags, offset, lvl, blkfill, txg); flags, offset, lvl, blkfill, txg);
if (error != ESRCH) if (error == 0 && lvl > minlvl) {
--lvl;
matched = *offset;
} else if (error == ESRCH && lvl < maxlvl &&
dnode_next_block(dn, back, &matched, lvl)) {
/*
* Continue search at next/prev offset in lvl+1 block.
*
* Usually we only search upwards at the start of the
* search as higher level blocks point at a matching
* minlvl block in most cases, but we backtrack if not.
*
* This can happen for txg > 0 searches if the block
* contains only BPs/dnodes freed at that txg. It also
* happens if we are still syncing out the tree, and
* some BP's at higher levels are not updated yet.
*
* We must adjust offset to avoid coming back to the
* same offset and getting stuck looping forever. This
* also deals with the case where offset is already at
* the beginning or end of the object.
*/
++lvl;
*offset = matched;
} else {
break; break;
} }
while (error == 0 && --lvl >= minlvl) {
error = dnode_next_offset_level(dn,
flags, offset, lvl, blkfill, txg);
} }
/* /*
@ -2739,9 +2803,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
error = 0; error = 0;
} }
if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
initial_offset < *offset : initial_offset > *offset))
error = SET_ERROR(ESRCH);
out: out:
if (!(flags & DNODE_FIND_HAVELOCK)) if (!(flags & DNODE_FIND_HAVELOCK))
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);