Verify block pointers before writing them out
If a block pointer is corrupted (but the block containing it checksums correctly, e.g. due to a bug that overwrites random memory), we can often detect it before the block is read, with the `zfs_blkptr_verify()` function, which is used in `arc_read()`, `zio_free()`, etc. However, such corruption is not typically recoverable. To recover from it we would need to detect the memory error before the block pointer is written to disk. This PR verifies BPs that are contained in indirect blocks and dnodes before they are written to disk, in `dbuf_write_ready()`. This way, we'll get a panic before the on-disk data is corrupted. This will help us diagnose what's causing the corruption, and it is much easier to recover from. To minimize performance impact, only checks that can be done without holding the spa_config_lock are performed. Additionally, when corruption is detected, the raw words of the block pointer are logged. (Note that `dprintf_bp()` is a no-op by default, but if enabled it is not safe to use with invalid block pointers.) Reviewed-by: Rich Ercolani <rincebrain@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Paul Zuchowski <pzuchowski@datto.com> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Matthew Ahrens <mahrens@delphix.com> Closes #14817
This commit is contained in:
parent
dd19821149
commit
3095ca91c2
|
@ -8499,8 +8499,8 @@ zdb_read_block(char *thing, spa_t *spa)
|
|||
!(flags & ZDB_FLAG_DECOMPRESS)) {
|
||||
const blkptr_t *b = (const blkptr_t *)(void *)
|
||||
((uintptr_t)buf + (uintptr_t)blkptr_offset);
|
||||
if (zfs_blkptr_verify(spa, b, B_FALSE, BLK_VERIFY_ONLY) ==
|
||||
B_FALSE) {
|
||||
if (zfs_blkptr_verify(spa, b,
|
||||
BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) {
|
||||
abd_return_buf_copy(pabd, buf, lsize);
|
||||
borrowed = B_FALSE;
|
||||
buf = lbuf;
|
||||
|
@ -8508,8 +8508,8 @@ zdb_read_block(char *thing, spa_t *spa)
|
|||
lbuf, lsize, psize, flags);
|
||||
b = (const blkptr_t *)(void *)
|
||||
((uintptr_t)buf + (uintptr_t)blkptr_offset);
|
||||
if (failed || zfs_blkptr_verify(spa, b, B_FALSE,
|
||||
BLK_VERIFY_LOG) == B_FALSE) {
|
||||
if (failed || zfs_blkptr_verify(spa, b,
|
||||
BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) {
|
||||
printf("invalid block pointer at this DVA\n");
|
||||
goto out;
|
||||
}
|
||||
|
|
|
@ -531,6 +531,12 @@ enum blk_verify_flag {
|
|||
BLK_VERIFY_HALT
|
||||
};
|
||||
|
||||
enum blk_config_flag {
|
||||
BLK_CONFIG_HELD, // SCL_VDEV held for writer
|
||||
BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader
|
||||
BLK_CONFIG_SKIP, // skip checks which require SCL_VDEV
|
||||
};
|
||||
|
||||
extern int zio_bookmark_compare(const void *, const void *);
|
||||
|
||||
extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
|
||||
|
@ -646,7 +652,7 @@ extern int zio_resume(spa_t *spa);
|
|||
extern void zio_resume_wait(spa_t *spa);
|
||||
|
||||
extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
|
||||
boolean_t config_held, enum blk_verify_flag blk_verify);
|
||||
enum blk_config_flag blk_config, enum blk_verify_flag blk_verify);
|
||||
|
||||
/*
|
||||
* Initial setup and teardown.
|
||||
|
|
|
@ -5696,8 +5696,8 @@ top:
|
|||
* and treat it as a checksum error. This allows an alternate blkptr
|
||||
* to be tried when one is available (e.g. ditto blocks).
|
||||
*/
|
||||
if (!zfs_blkptr_verify(spa, bp, zio_flags & ZIO_FLAG_CONFIG_WRITER,
|
||||
BLK_VERIFY_LOG)) {
|
||||
if (!zfs_blkptr_verify(spa, bp, (zio_flags & ZIO_FLAG_CONFIG_WRITER) ?
|
||||
BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
|
||||
rc = SET_ERROR(ECKSUM);
|
||||
goto done;
|
||||
}
|
||||
|
|
|
@ -4636,6 +4636,20 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
|
|||
i += DNODE_MIN_SIZE;
|
||||
if (dnp->dn_type != DMU_OT_NONE) {
|
||||
fill++;
|
||||
for (int j = 0; j < dnp->dn_nblkptr;
|
||||
j++) {
|
||||
(void) zfs_blkptr_verify(spa,
|
||||
&dnp->dn_blkptr[j],
|
||||
BLK_CONFIG_SKIP,
|
||||
BLK_VERIFY_HALT);
|
||||
}
|
||||
if (dnp->dn_flags &
|
||||
DNODE_FLAG_SPILL_BLKPTR) {
|
||||
(void) zfs_blkptr_verify(spa,
|
||||
DN_SPILL_BLKPTR(dnp),
|
||||
BLK_CONFIG_SKIP,
|
||||
BLK_VERIFY_HALT);
|
||||
}
|
||||
i += dnp->dn_extra_slots *
|
||||
DNODE_MIN_SIZE;
|
||||
}
|
||||
|
@ -4653,6 +4667,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
|
|||
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
|
||||
if (BP_IS_HOLE(ibp))
|
||||
continue;
|
||||
(void) zfs_blkptr_verify(spa, ibp,
|
||||
BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
|
||||
fill += BP_GET_FILL(ibp);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1970,7 +1970,8 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
|
|||
DMU_USERUSED_OBJECT, tx);
|
||||
}
|
||||
arc_buf_destroy(buf, &buf);
|
||||
} else if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) {
|
||||
} else if (!zfs_blkptr_verify(spa, bp,
|
||||
BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
|
||||
/*
|
||||
* Sanity check the block pointer contents, this is handled
|
||||
* by arc_read() for the cases above.
|
||||
|
|
|
@ -2387,7 +2387,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
|
|||
* When damaged consider it to be a metadata error since we cannot
|
||||
* trust the BP_GET_TYPE and BP_GET_LEVEL values.
|
||||
*/
|
||||
if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) {
|
||||
if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
|
||||
atomic_inc_64(&sle->sle_meta_count);
|
||||
return (0);
|
||||
}
|
||||
|
|
|
@ -935,9 +935,35 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
|
|||
(void) vsnprintf(buf, sizeof (buf), fmt, adx);
|
||||
va_end(adx);
|
||||
|
||||
zfs_dbgmsg("bad blkptr at %px: "
|
||||
"DVA[0]=%#llx/%#llx "
|
||||
"DVA[1]=%#llx/%#llx "
|
||||
"DVA[2]=%#llx/%#llx "
|
||||
"prop=%#llx "
|
||||
"pad=%#llx,%#llx "
|
||||
"phys_birth=%#llx "
|
||||
"birth=%#llx "
|
||||
"fill=%#llx "
|
||||
"cksum=%#llx/%#llx/%#llx/%#llx",
|
||||
bp,
|
||||
(long long)bp->blk_dva[0].dva_word[0],
|
||||
(long long)bp->blk_dva[0].dva_word[1],
|
||||
(long long)bp->blk_dva[1].dva_word[0],
|
||||
(long long)bp->blk_dva[1].dva_word[1],
|
||||
(long long)bp->blk_dva[2].dva_word[0],
|
||||
(long long)bp->blk_dva[2].dva_word[1],
|
||||
(long long)bp->blk_prop,
|
||||
(long long)bp->blk_pad[0],
|
||||
(long long)bp->blk_pad[1],
|
||||
(long long)bp->blk_phys_birth,
|
||||
(long long)bp->blk_birth,
|
||||
(long long)bp->blk_fill,
|
||||
(long long)bp->blk_cksum.zc_word[0],
|
||||
(long long)bp->blk_cksum.zc_word[1],
|
||||
(long long)bp->blk_cksum.zc_word[2],
|
||||
(long long)bp->blk_cksum.zc_word[3]);
|
||||
switch (blk_verify) {
|
||||
case BLK_VERIFY_HALT:
|
||||
dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp);
|
||||
zfs_panic_recover("%s: %s", spa_name(spa), buf);
|
||||
break;
|
||||
case BLK_VERIFY_LOG:
|
||||
|
@ -958,47 +984,54 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
|
|||
* If everything checks out B_TRUE is returned. The blk_verify
|
||||
* argument controls the behavior when an invalid field is detected.
|
||||
*
|
||||
* Modes for zfs_blkptr_verify:
|
||||
* 1) BLK_VERIFY_ONLY (evaluate the block)
|
||||
* 2) BLK_VERIFY_LOG (evaluate the block and log problems)
|
||||
* 3) BLK_VERIFY_HALT (call zfs_panic_recover on error)
|
||||
* Values for blk_verify_flag:
|
||||
* BLK_VERIFY_ONLY: evaluate the block
|
||||
* BLK_VERIFY_LOG: evaluate the block and log problems
|
||||
* BLK_VERIFY_HALT: call zfs_panic_recover on error
|
||||
*
|
||||
* Values for blk_config_flag:
|
||||
* BLK_CONFIG_HELD: caller holds SCL_VDEV for writer
|
||||
* BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be
|
||||
* obtained for reader
|
||||
* BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better
|
||||
* performance
|
||||
*/
|
||||
boolean_t
|
||||
zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
|
||||
enum blk_verify_flag blk_verify)
|
||||
zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
|
||||
enum blk_config_flag blk_config, enum blk_verify_flag blk_verify)
|
||||
{
|
||||
int errors = 0;
|
||||
|
||||
if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
|
||||
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
||||
"blkptr at %p has invalid TYPE %llu",
|
||||
"blkptr at %px has invalid TYPE %llu",
|
||||
bp, (longlong_t)BP_GET_TYPE(bp));
|
||||
}
|
||||
if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS) {
|
||||
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
||||
"blkptr at %p has invalid CHECKSUM %llu",
|
||||
"blkptr at %px has invalid CHECKSUM %llu",
|
||||
bp, (longlong_t)BP_GET_CHECKSUM(bp));
|
||||
}
|
||||
if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS) {
|
||||
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
||||
"blkptr at %p has invalid COMPRESS %llu",
|
||||
"blkptr at %px has invalid COMPRESS %llu",
|
||||
bp, (longlong_t)BP_GET_COMPRESS(bp));
|
||||
}
|
||||
if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
|
||||
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
||||
"blkptr at %p has invalid LSIZE %llu",
|
||||
"blkptr at %px has invalid LSIZE %llu",
|
||||
bp, (longlong_t)BP_GET_LSIZE(bp));
|
||||
}
|
||||
if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
|
||||
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
||||
"blkptr at %p has invalid PSIZE %llu",
|
||||
"blkptr at %px has invalid PSIZE %llu",
|
||||
bp, (longlong_t)BP_GET_PSIZE(bp));
|
||||
}
|
||||
|
||||
if (BP_IS_EMBEDDED(bp)) {
|
||||
if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) {
|
||||
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
||||
"blkptr at %p has invalid ETYPE %llu",
|
||||
"blkptr at %px has invalid ETYPE %llu",
|
||||
bp, (longlong_t)BPE_GET_ETYPE(bp));
|
||||
}
|
||||
}
|
||||
|
@ -1010,10 +1043,19 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
|
|||
if (!spa->spa_trust_config)
|
||||
return (errors == 0);
|
||||
|
||||
if (!config_held)
|
||||
spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
|
||||
else
|
||||
switch (blk_config) {
|
||||
case BLK_CONFIG_HELD:
|
||||
ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER));
|
||||
break;
|
||||
case BLK_CONFIG_NEEDED:
|
||||
spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
|
||||
break;
|
||||
case BLK_CONFIG_SKIP:
|
||||
return (errors == 0);
|
||||
default:
|
||||
panic("invalid blk_config %u", blk_config);
|
||||
}
|
||||
|
||||
/*
|
||||
* Pool-specific checks.
|
||||
*
|
||||
|
@ -1028,20 +1070,20 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
|
|||
|
||||
if (vdevid >= spa->spa_root_vdev->vdev_children) {
|
||||
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
||||
"blkptr at %p DVA %u has invalid VDEV %llu",
|
||||
"blkptr at %px DVA %u has invalid VDEV %llu",
|
||||
bp, i, (longlong_t)vdevid);
|
||||
continue;
|
||||
}
|
||||
vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
|
||||
if (vd == NULL) {
|
||||
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
||||
"blkptr at %p DVA %u has invalid VDEV %llu",
|
||||
"blkptr at %px DVA %u has invalid VDEV %llu",
|
||||
bp, i, (longlong_t)vdevid);
|
||||
continue;
|
||||
}
|
||||
if (vd->vdev_ops == &vdev_hole_ops) {
|
||||
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
||||
"blkptr at %p DVA %u has hole VDEV %llu",
|
||||
"blkptr at %px DVA %u has hole VDEV %llu",
|
||||
bp, i, (longlong_t)vdevid);
|
||||
continue;
|
||||
}
|
||||
|
@ -1059,13 +1101,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
|
|||
asize = vdev_gang_header_asize(vd);
|
||||
if (offset + asize > vd->vdev_asize) {
|
||||
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
||||
"blkptr at %p DVA %u has invalid OFFSET %llu",
|
||||
"blkptr at %px DVA %u has invalid OFFSET %llu",
|
||||
bp, i, (longlong_t)offset);
|
||||
}
|
||||
}
|
||||
if (errors > 0)
|
||||
dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp);
|
||||
if (!config_held)
|
||||
if (blk_config == BLK_CONFIG_NEEDED)
|
||||
spa_config_exit(spa, SCL_VDEV, bp);
|
||||
|
||||
return (errors == 0);
|
||||
|
@ -1203,7 +1243,7 @@ void
|
|||
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
|
||||
{
|
||||
|
||||
(void) zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_HALT);
|
||||
(void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);
|
||||
|
||||
/*
|
||||
* The check for EMBEDDED is a performance optimization. We
|
||||
|
@ -1282,8 +1322,8 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
|||
{
|
||||
zio_t *zio;
|
||||
|
||||
(void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER,
|
||||
BLK_VERIFY_HALT);
|
||||
(void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ?
|
||||
BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);
|
||||
|
||||
if (BP_IS_EMBEDDED(bp))
|
||||
return (zio_null(pio, spa, NULL, NULL, NULL, 0));
|
||||
|
|
Loading…
Reference in New Issue