Refactor dbuf_read() for safer decryption

In dbuf_read_verify_dnode_crypt():
 - We don't need the original dbuf locked there. Instead, take the
lock on the dnode dbuf that is actually manipulated.
 - Block decryption for a dnode dbuf if it is currently being
written.  The ARC hash lock does not protect anonymous buffers, so
arc_untransform() is unsafe when used on buffers that are being
written, which may happen for encrypted dnode buffers, since they
are not copied by dbuf_dirty()/dbuf_hold_copy().  See the sketch
after this list.
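
To illustrate, here is a condensed sketch of the new wait/decrypt
logic (names and calls follow the diff below; early returns and
error handling are omitted):

	/* Wait until the dnode buffer is not the one being written. */
	mutex_enter(&dndb->db_mtx);
	while (arc_is_encrypted(dnbuf)) {
		dbuf_dirty_record_t *dr = dndb->db_data_pending;
		if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
			break;	/* not being synced; safe to decrypt */
		cv_wait(&dndb->db_changed, &dndb->db_mtx);
	}
	err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);
	mutex_exit(&dndb->db_mtx);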

In dbuf_read():
 - If the buffer is in flight, recheck its compression/encryption
status after it is cached, since it may still need
arc_untransform(), as sketched below.
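
A condensed sketch of that recheck (following the diff below; the
actual untransform call is elided):

	if (db->db_state == DB_READ || db->db_state == DB_FILL) {
		/* Wait for the in-flight read or fill to finish. */
		do {
			cv_wait(&db->db_changed, &db->db_mtx);
		} while (db->db_state == DB_READ || db->db_state == DB_FILL);
	}
	if (db->db_state == DB_CACHED) {
		/* Now cached, but it may still need arc_untransform(). */
		if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
		    (arc_is_encrypted(db->db_buf) ||
		    arc_is_unauthenticated(db->db_buf) ||
		    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
			/* ... untransform db->db_buf here ... */
		}
	}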

Tested-by: Rich Ercolani <rincebrain@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16104

@@ -161,13 +161,13 @@ struct {
 } dbuf_sums;
 
 #define DBUF_STAT_INCR(stat, val) \
-	wmsum_add(&dbuf_sums.stat, val);
+	wmsum_add(&dbuf_sums.stat, val)
 #define DBUF_STAT_DECR(stat, val) \
-	DBUF_STAT_INCR(stat, -(val));
+	DBUF_STAT_INCR(stat, -(val))
 #define DBUF_STAT_BUMP(stat) \
-	DBUF_STAT_INCR(stat, 1);
+	DBUF_STAT_INCR(stat, 1)
 #define DBUF_STAT_BUMPDOWN(stat) \
-	DBUF_STAT_INCR(stat, -1);
+	DBUF_STAT_INCR(stat, -1)
 #define DBUF_STAT_MAX(stat, v) { \
 	uint64_t _m; \
 	while ((v) > (_m = dbuf_stats.stat.value.ui64) && \
@@ -177,7 +177,6 @@ struct {
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
-static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
 
 /*
  * Global data structures and functions for the dbuf cache.
  */
@@ -1418,13 +1417,9 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
  * a decrypted block. Otherwise success.
  */
 static int
-dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
+dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)
 {
-	int bonuslen, max_bonuslen, err;
-
-	err = dbuf_read_verify_dnode_crypt(db, flags);
-	if (err)
-		return (err);
+	int bonuslen, max_bonuslen;
 
 	bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
 	max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
@@ -1509,32 +1504,46 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
  * decrypt / authenticate them when we need to read an encrypted bonus buffer.
  */
 static int
-dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
+dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
 {
-	int err = 0;
 	objset_t *os = db->db_objset;
-	arc_buf_t *dnode_abuf;
-	dnode_t *dn;
+	dmu_buf_impl_t *dndb;
+	arc_buf_t *dnbuf;
 	zbookmark_phys_t zb;
-
-	ASSERT(MUTEX_HELD(&db->db_mtx));
+	int err;
 
 	if ((flags & DB_RF_NO_DECRYPT) != 0 ||
-	    !os->os_encrypted || os->os_raw_receive)
+	    !os->os_encrypted || os->os_raw_receive ||
+	    (dndb = dn->dn_dbuf) == NULL)
 		return (0);
 
-	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
-	dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;
+	dnbuf = dndb->db_buf;
+	if (!arc_is_encrypted(dnbuf))
+		return (0);
 
-	if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
-		DB_DNODE_EXIT(db);
-		return (0);
-	}
+	mutex_enter(&dndb->db_mtx);
+
+	/*
+	 * Since dnode buffer is modified by sync process, there can be only
+	 * one copy of it. It means we can not modify (decrypt) it while it
+	 * is being written. I don't see how this may happen now, since
+	 * encrypted dnode writes by receive should be completed before any
+	 * plain-text reads due to txg wait, but better be safe than sorry.
+	 */
+	while (1) {
+		if (!arc_is_encrypted(dnbuf)) {
+			mutex_exit(&dndb->db_mtx);
+			return (0);
+		}
+		dbuf_dirty_record_t *dr = dndb->db_data_pending;
+		if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
+			break;
+		cv_wait(&dndb->db_changed, &dndb->db_mtx);
+	};
 
 	SET_BOOKMARK(&zb, dmu_objset_id(os),
-	    DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
-	err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);
+	    DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);
+	err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);
 
 	/*
 	 * An error code of EACCES tells us that the key is still not
@@ -1547,7 +1556,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
 	    !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
 		err = 0;
 
-	DB_DNODE_EXIT(db);
+	mutex_exit(&dndb->db_mtx);
 
 	return (err);
 }
@@ -1573,7 +1582,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 	    RW_LOCK_HELD(&db->db_parent->db_rwlock));
 
 	if (db->db_blkid == DMU_BONUS_BLKID) {
-		err = dbuf_read_bonus(db, dn, flags);
+		err = dbuf_read_bonus(db, dn);
 		goto early_unlock;
 	}
 
@@ -1635,10 +1644,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 		goto early_unlock;
 	}
 
-	err = dbuf_read_verify_dnode_crypt(db, flags);
-	if (err != 0)
-		goto early_unlock;
-
 	db->db_state = DB_READ;
 	DTRACE_SET_STATE(db, "read issued");
 	mutex_exit(&db->db_mtx);
@@ -1754,19 +1759,23 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 int
 dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 {
-	int err = 0;
-	boolean_t prefetch;
 	dnode_t *dn;
+	boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
+	int err;
 
-	/*
-	 * We don't have to hold the mutex to check db_state because it
-	 * can't be freed while we have a hold on the buffer.
-	 */
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
+	/*
+	 * Ensure that this block's dnode has been decrypted if the caller
+	 * has requested decrypted data.
+	 */
+	err = dbuf_read_verify_dnode_crypt(db, dn, flags);
+	if (err != 0)
+		goto done;
+
 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    (flags & DB_RF_NOPREFETCH) == 0;
 
@@ -1775,13 +1784,38 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 		db->db_partial_read = B_TRUE;
 	else if (!(flags & DB_RF_PARTIAL_MORE))
 		db->db_partial_read = B_FALSE;
-	if (db->db_state == DB_CACHED) {
-		/*
-		 * Ensure that this block's dnode has been decrypted if
-		 * the caller has requested decrypted data.
-		 */
-		err = dbuf_read_verify_dnode_crypt(db, flags);
+	miss = (db->db_state != DB_CACHED);
+
+	if (db->db_state == DB_READ || db->db_state == DB_FILL) {
+		/*
+		 * Another reader came in while the dbuf was in flight between
+		 * UNCACHED and CACHED. Either a writer will finish filling
+		 * the buffer, sending the dbuf to CACHED, or the first reader's
+		 * request will reach the read_done callback and send the dbuf
+		 * to CACHED. Otherwise, a failure occurred and the dbuf will
+		 * be sent to UNCACHED.
+		 */
+		if (flags & DB_RF_NEVERWAIT) {
+			mutex_exit(&db->db_mtx);
+			DB_DNODE_EXIT(db);
+			goto done;
+		}
+		do {
+			ASSERT(db->db_state == DB_READ ||
+			    (flags & DB_RF_HAVESTRUCT) == 0);
+			DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,
+			    zio_t *, pio);
+			cv_wait(&db->db_changed, &db->db_mtx);
+		} while (db->db_state == DB_READ || db->db_state == DB_FILL);
+		if (db->db_state == DB_UNCACHED) {
+			err = SET_ERROR(EIO);
+			mutex_exit(&db->db_mtx);
+			DB_DNODE_EXIT(db);
+			goto done;
+		}
+	}
 
+	if (db->db_state == DB_CACHED) {
 		/*
 		 * If the arc buf is compressed or encrypted and the caller
 		 * requested uncompressed data, we need to untransform it
@@ -1789,8 +1823,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 		 * unauthenticated blocks, which will verify their MAC if
 		 * the key is now available.
 		 */
-		if (err == 0 && db->db_buf != NULL &&
-		    (flags & DB_RF_NO_DECRYPT) == 0 &&
+		if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
 		    (arc_is_encrypted(db->db_buf) ||
 		    arc_is_unauthenticated(db->db_buf) ||
 		    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
@@ -1804,17 +1837,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 			dbuf_set_data(db, db->db_buf);
 		}
 		mutex_exit(&db->db_mtx);
-		if (err == 0 && prefetch) {
-			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-			    B_FALSE, flags & DB_RF_HAVESTRUCT);
-		}
-		DB_DNODE_EXIT(db);
-		DBUF_STAT_BUMP(hash_hits);
-	} else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
-		boolean_t need_wait = B_FALSE;
-
+	} else {
+		ASSERT(db->db_state == DB_UNCACHED ||
+		    db->db_state == DB_NOFILL);
 		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
-
 		if (pio == NULL && (db->db_state == DB_NOFILL ||
 		    (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
 			spa_t *spa = dn->dn_objset->os_spa;
@@ -1822,23 +1848,19 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 			need_wait = B_TRUE;
 		}
 		err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
-		/*
-		 * dbuf_read_impl has dropped db_mtx and our parent's rwlock
-		 * for us
-		 */
-		if (!err && prefetch) {
-			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-			    db->db_state != DB_CACHED,
-			    flags & DB_RF_HAVESTRUCT);
-		}
+		/* dbuf_read_impl drops db_mtx and parent's rwlock. */
+		miss = (db->db_state != DB_CACHED);
+	}
 
+	if (err == 0 && prefetch) {
+		dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
+		    flags & DB_RF_HAVESTRUCT);
+	}
 	DB_DNODE_EXIT(db);
-	DBUF_STAT_BUMP(hash_misses);
 
 	/*
-	 * If we created a zio_root we must execute it to avoid
-	 * leaking it, even if it isn't attached to any work due
-	 * to an error in dbuf_read_impl().
+	 * If we created a zio we must execute it to avoid leaking it, even if
+	 * it isn't attached to any work due to an error in dbuf_read_impl().
 	 */
 	if (need_wait) {
 		if (err == 0)
@@ -1847,40 +1869,12 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 			(void) zio_wait(pio);
 		pio = NULL;
 	}
-	} else {
-		/*
-		 * Another reader came in while the dbuf was in flight
-		 * between UNCACHED and CACHED. Either a writer will finish
-		 * writing the buffer (sending the dbuf to CACHED) or the
-		 * first reader's request will reach the read_done callback
-		 * and send the dbuf to CACHED. Otherwise, a failure
-		 * occurred and the dbuf went to UNCACHED.
-		 */
-		mutex_exit(&db->db_mtx);
-		if (prefetch) {
-			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-			    B_TRUE, flags & DB_RF_HAVESTRUCT);
-		}
-		DB_DNODE_EXIT(db);
-		DBUF_STAT_BUMP(hash_misses);
 
-		/* Skip the wait per the caller's request. */
-		if ((flags & DB_RF_NEVERWAIT) == 0) {
-			mutex_enter(&db->db_mtx);
-			while (db->db_state == DB_READ ||
-			    db->db_state == DB_FILL) {
-				ASSERT(db->db_state == DB_READ ||
-				    (flags & DB_RF_HAVESTRUCT) == 0);
-				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
-				    db, zio_t *, pio);
-				cv_wait(&db->db_changed, &db->db_mtx);
-			}
-			if (db->db_state == DB_UNCACHED)
-				err = SET_ERROR(EIO);
-			mutex_exit(&db->db_mtx);
-		}
-	}
+done:
+	if (miss)
+		DBUF_STAT_BUMP(hash_misses);
+	else
+		DBUF_STAT_BUMP(hash_hits);
 
 	if (pio && err != 0) {
 		zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);