Fix read errors race after block cloning
Investigating read errors triggering panic fixed in #16042 I've found that we have a race in a sync process between the moment dirty record for cloned block is removed and the moment dbuf is destroyed. If dmu_buf_hold_array_by_dnode() take a hold on a cloned dbuf before it is synced/destroyed, then dbuf_read_impl() may see it still in DB_NOFILL state, but without the dirty record. Such case is not an error, but equivalent to DB_UNCACHED, since the dbuf block pointer is already updated by dbuf_write_ready(). Unfortunately it is impossible to safely change the dbuf state to DB_UNCACHED there, since there may already be another cloning in progress, that dropped dbuf lock before creating a new dirty record, protected only by the range lock. Reviewed-by: Rob Norris <robn@despairlabs.com> Reviewed-by: Robert Evans <evansr@google.com> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #16052
This commit is contained in:
parent
76d1dde94c
commit
eeca9a91d6
|
@ -1563,7 +1563,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
||||||
zbookmark_phys_t zb;
|
zbookmark_phys_t zb;
|
||||||
uint32_t aflags = ARC_FLAG_NOWAIT;
|
uint32_t aflags = ARC_FLAG_NOWAIT;
|
||||||
int err, zio_flags;
|
int err, zio_flags;
|
||||||
blkptr_t bp, *bpp;
|
blkptr_t bp, *bpp = NULL;
|
||||||
|
|
||||||
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
||||||
ASSERT(MUTEX_HELD(&db->db_mtx));
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
||||||
|
@ -1577,29 +1577,28 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
||||||
goto early_unlock;
|
goto early_unlock;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (db->db_state == DB_UNCACHED) {
|
/*
|
||||||
if (db->db_blkptr == NULL) {
|
* If we have a pending block clone, we don't want to read the
|
||||||
bpp = NULL;
|
* underlying block, but the content of the block being cloned,
|
||||||
} else {
|
* pointed by the dirty record, so we have the most recent data.
|
||||||
bp = *db->db_blkptr;
|
* If there is no dirty record, then we hit a race in a sync
|
||||||
|
* process when the dirty record is already removed, while the
|
||||||
|
* dbuf is not yet destroyed. Such case is equivalent to uncached.
|
||||||
|
*/
|
||||||
|
if (db->db_state == DB_NOFILL) {
|
||||||
|
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
|
||||||
|
if (dr != NULL) {
|
||||||
|
if (!dr->dt.dl.dr_brtwrite) {
|
||||||
|
err = EIO;
|
||||||
|
goto early_unlock;
|
||||||
|
}
|
||||||
|
bp = dr->dt.dl.dr_overridden_by;
|
||||||
bpp = &bp;
|
bpp = &bp;
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
dbuf_dirty_record_t *dr;
|
|
||||||
|
|
||||||
ASSERT3S(db->db_state, ==, DB_NOFILL);
|
if (bpp == NULL && db->db_blkptr != NULL) {
|
||||||
|
bp = *db->db_blkptr;
|
||||||
/*
|
|
||||||
* Block cloning: If we have a pending block clone,
|
|
||||||
* we don't want to read the underlying block, but the content
|
|
||||||
* of the block being cloned, so we have the most recent data.
|
|
||||||
*/
|
|
||||||
dr = list_head(&db->db_dirty_records);
|
|
||||||
if (dr == NULL || !dr->dt.dl.dr_brtwrite) {
|
|
||||||
err = EIO;
|
|
||||||
goto early_unlock;
|
|
||||||
}
|
|
||||||
bp = dr->dt.dl.dr_overridden_by;
|
|
||||||
bpp = &bp;
|
bpp = &bp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue