Refactor dbuf_read() for safer decryption
In dbuf_read_verify_dnode_crypt(): - We don't need original dbuf locked there. Instead take a lock on a dnode dbuf, that is actually manipulated. - Block decryption for a dnode dbuf if it is currently being written. ARC hash lock does not protect anonymous buffers, so arc_untransform() is unsafe when used on buffers being written, that may happen in case of encrypted dnode buffers, since they are not copied by dbuf_dirty()/dbuf_hold_copy(). In dbuf_read(): - If the buffer is in flight, recheck its compression/encryption status after it is cached, since it may need arc_untransform(). Tested-by: Rich Ercolani <rincebrain@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #16104
This commit is contained in:
parent
9edf6af4ae
commit
b474dfad0d
|
@ -161,13 +161,13 @@ struct {
|
|||
} dbuf_sums;
|
||||
|
||||
#define DBUF_STAT_INCR(stat, val) \
|
||||
wmsum_add(&dbuf_sums.stat, val);
|
||||
wmsum_add(&dbuf_sums.stat, val)
|
||||
#define DBUF_STAT_DECR(stat, val) \
|
||||
DBUF_STAT_INCR(stat, -(val));
|
||||
DBUF_STAT_INCR(stat, -(val))
|
||||
#define DBUF_STAT_BUMP(stat) \
|
||||
DBUF_STAT_INCR(stat, 1);
|
||||
DBUF_STAT_INCR(stat, 1)
|
||||
#define DBUF_STAT_BUMPDOWN(stat) \
|
||||
DBUF_STAT_INCR(stat, -1);
|
||||
DBUF_STAT_INCR(stat, -1)
|
||||
#define DBUF_STAT_MAX(stat, v) { \
|
||||
uint64_t _m; \
|
||||
while ((v) > (_m = dbuf_stats.stat.value.ui64) && \
|
||||
|
@ -177,7 +177,6 @@ struct {
|
|||
|
||||
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
|
||||
static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
|
||||
static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
|
||||
|
||||
/*
|
||||
* Global data structures and functions for the dbuf cache.
|
||||
|
@ -1403,13 +1402,9 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
|
|||
* a decrypted block. Otherwise success.
|
||||
*/
|
||||
static int
|
||||
dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
|
||||
dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)
|
||||
{
|
||||
int bonuslen, max_bonuslen, err;
|
||||
|
||||
err = dbuf_read_verify_dnode_crypt(db, flags);
|
||||
if (err)
|
||||
return (err);
|
||||
int bonuslen, max_bonuslen;
|
||||
|
||||
bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
|
||||
max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
|
||||
|
@ -1494,32 +1489,46 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
|
|||
* decrypt / authenticate them when we need to read an encrypted bonus buffer.
|
||||
*/
|
||||
static int
|
||||
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
|
||||
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
|
||||
{
|
||||
int err = 0;
|
||||
objset_t *os = db->db_objset;
|
||||
arc_buf_t *dnode_abuf;
|
||||
dnode_t *dn;
|
||||
dmu_buf_impl_t *dndb;
|
||||
arc_buf_t *dnbuf;
|
||||
zbookmark_phys_t zb;
|
||||
|
||||
ASSERT(MUTEX_HELD(&db->db_mtx));
|
||||
int err;
|
||||
|
||||
if ((flags & DB_RF_NO_DECRYPT) != 0 ||
|
||||
!os->os_encrypted || os->os_raw_receive)
|
||||
!os->os_encrypted || os->os_raw_receive ||
|
||||
(dndb = dn->dn_dbuf) == NULL)
|
||||
return (0);
|
||||
|
||||
DB_DNODE_ENTER(db);
|
||||
dn = DB_DNODE(db);
|
||||
dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;
|
||||
|
||||
if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
|
||||
DB_DNODE_EXIT(db);
|
||||
dnbuf = dndb->db_buf;
|
||||
if (!arc_is_encrypted(dnbuf))
|
||||
return (0);
|
||||
}
|
||||
|
||||
mutex_enter(&dndb->db_mtx);
|
||||
|
||||
/*
|
||||
* Since dnode buffer is modified by sync process, there can be only
|
||||
* one copy of it. It means we can not modify (decrypt) it while it
|
||||
* is being written. I don't see how this may happen now, since
|
||||
* encrypted dnode writes by receive should be completed before any
|
||||
* plain-text reads due to txg wait, but better be safe than sorry.
|
||||
*/
|
||||
while (1) {
|
||||
if (!arc_is_encrypted(dnbuf)) {
|
||||
mutex_exit(&dndb->db_mtx);
|
||||
return (0);
|
||||
}
|
||||
dbuf_dirty_record_t *dr = dndb->db_data_pending;
|
||||
if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
|
||||
break;
|
||||
cv_wait(&dndb->db_changed, &dndb->db_mtx);
|
||||
};
|
||||
|
||||
SET_BOOKMARK(&zb, dmu_objset_id(os),
|
||||
DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
|
||||
err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);
|
||||
DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);
|
||||
err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);
|
||||
|
||||
/*
|
||||
* An error code of EACCES tells us that the key is still not
|
||||
|
@ -1532,7 +1541,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
|
|||
!DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
|
||||
err = 0;
|
||||
|
||||
DB_DNODE_EXIT(db);
|
||||
mutex_exit(&dndb->db_mtx);
|
||||
|
||||
return (err);
|
||||
}
|
||||
|
@ -1558,7 +1567,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
|||
RW_LOCK_HELD(&db->db_parent->db_rwlock));
|
||||
|
||||
if (db->db_blkid == DMU_BONUS_BLKID) {
|
||||
err = dbuf_read_bonus(db, dn, flags);
|
||||
err = dbuf_read_bonus(db, dn);
|
||||
goto early_unlock;
|
||||
}
|
||||
|
||||
|
@ -1619,10 +1628,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
|||
goto early_unlock;
|
||||
}
|
||||
|
||||
err = dbuf_read_verify_dnode_crypt(db, flags);
|
||||
if (err != 0)
|
||||
goto early_unlock;
|
||||
|
||||
db->db_state = DB_READ;
|
||||
DTRACE_SET_STATE(db, "read issued");
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
@ -1738,19 +1743,23 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
|
|||
int
|
||||
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
||||
{
|
||||
int err = 0;
|
||||
boolean_t prefetch;
|
||||
dnode_t *dn;
|
||||
boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
|
||||
int err;
|
||||
|
||||
/*
|
||||
* We don't have to hold the mutex to check db_state because it
|
||||
* can't be freed while we have a hold on the buffer.
|
||||
*/
|
||||
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
||||
|
||||
DB_DNODE_ENTER(db);
|
||||
dn = DB_DNODE(db);
|
||||
|
||||
/*
|
||||
* Ensure that this block's dnode has been decrypted if the caller
|
||||
* has requested decrypted data.
|
||||
*/
|
||||
err = dbuf_read_verify_dnode_crypt(db, dn, flags);
|
||||
if (err != 0)
|
||||
goto done;
|
||||
|
||||
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
|
||||
(flags & DB_RF_NOPREFETCH) == 0;
|
||||
|
||||
|
@ -1759,13 +1768,38 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
|||
db->db_partial_read = B_TRUE;
|
||||
else if (!(flags & DB_RF_PARTIAL_MORE))
|
||||
db->db_partial_read = B_FALSE;
|
||||
if (db->db_state == DB_CACHED) {
|
||||
/*
|
||||
* Ensure that this block's dnode has been decrypted if
|
||||
* the caller has requested decrypted data.
|
||||
*/
|
||||
err = dbuf_read_verify_dnode_crypt(db, flags);
|
||||
miss = (db->db_state != DB_CACHED);
|
||||
|
||||
if (db->db_state == DB_READ || db->db_state == DB_FILL) {
|
||||
/*
|
||||
* Another reader came in while the dbuf was in flight between
|
||||
* UNCACHED and CACHED. Either a writer will finish filling
|
||||
* the buffer, sending the dbuf to CACHED, or the first reader's
|
||||
* request will reach the read_done callback and send the dbuf
|
||||
* to CACHED. Otherwise, a failure occurred and the dbuf will
|
||||
* be sent to UNCACHED.
|
||||
*/
|
||||
if (flags & DB_RF_NEVERWAIT) {
|
||||
mutex_exit(&db->db_mtx);
|
||||
DB_DNODE_EXIT(db);
|
||||
goto done;
|
||||
}
|
||||
do {
|
||||
ASSERT(db->db_state == DB_READ ||
|
||||
(flags & DB_RF_HAVESTRUCT) == 0);
|
||||
DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,
|
||||
zio_t *, pio);
|
||||
cv_wait(&db->db_changed, &db->db_mtx);
|
||||
} while (db->db_state == DB_READ || db->db_state == DB_FILL);
|
||||
if (db->db_state == DB_UNCACHED) {
|
||||
err = SET_ERROR(EIO);
|
||||
mutex_exit(&db->db_mtx);
|
||||
DB_DNODE_EXIT(db);
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
|
||||
if (db->db_state == DB_CACHED) {
|
||||
/*
|
||||
* If the arc buf is compressed or encrypted and the caller
|
||||
* requested uncompressed data, we need to untransform it
|
||||
|
@ -1773,8 +1807,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
|||
* unauthenticated blocks, which will verify their MAC if
|
||||
* the key is now available.
|
||||
*/
|
||||
if (err == 0 && db->db_buf != NULL &&
|
||||
(flags & DB_RF_NO_DECRYPT) == 0 &&
|
||||
if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
|
||||
(arc_is_encrypted(db->db_buf) ||
|
||||
arc_is_unauthenticated(db->db_buf) ||
|
||||
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
|
||||
|
@ -1788,17 +1821,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
|||
dbuf_set_data(db, db->db_buf);
|
||||
}
|
||||
mutex_exit(&db->db_mtx);
|
||||
if (err == 0 && prefetch) {
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
|
||||
B_FALSE, flags & DB_RF_HAVESTRUCT);
|
||||
}
|
||||
DB_DNODE_EXIT(db);
|
||||
DBUF_STAT_BUMP(hash_hits);
|
||||
} else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
|
||||
boolean_t need_wait = B_FALSE;
|
||||
|
||||
} else {
|
||||
ASSERT(db->db_state == DB_UNCACHED ||
|
||||
db->db_state == DB_NOFILL);
|
||||
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
|
||||
|
||||
if (pio == NULL && (db->db_state == DB_NOFILL ||
|
||||
(db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
|
||||
spa_t *spa = dn->dn_objset->os_spa;
|
||||
|
@ -1806,65 +1832,33 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
|||
need_wait = B_TRUE;
|
||||
}
|
||||
err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
|
||||
/*
|
||||
* dbuf_read_impl has dropped db_mtx and our parent's rwlock
|
||||
* for us
|
||||
*/
|
||||
if (!err && prefetch) {
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
|
||||
db->db_state != DB_CACHED,
|
||||
flags & DB_RF_HAVESTRUCT);
|
||||
}
|
||||
|
||||
DB_DNODE_EXIT(db);
|
||||
DBUF_STAT_BUMP(hash_misses);
|
||||
|
||||
/*
|
||||
* If we created a zio_root we must execute it to avoid
|
||||
* leaking it, even if it isn't attached to any work due
|
||||
* to an error in dbuf_read_impl().
|
||||
*/
|
||||
if (need_wait) {
|
||||
if (err == 0)
|
||||
err = zio_wait(pio);
|
||||
else
|
||||
(void) zio_wait(pio);
|
||||
pio = NULL;
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* Another reader came in while the dbuf was in flight
|
||||
* between UNCACHED and CACHED. Either a writer will finish
|
||||
* writing the buffer (sending the dbuf to CACHED) or the
|
||||
* first reader's request will reach the read_done callback
|
||||
* and send the dbuf to CACHED. Otherwise, a failure
|
||||
* occurred and the dbuf went to UNCACHED.
|
||||
*/
|
||||
mutex_exit(&db->db_mtx);
|
||||
if (prefetch) {
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
|
||||
B_TRUE, flags & DB_RF_HAVESTRUCT);
|
||||
}
|
||||
DB_DNODE_EXIT(db);
|
||||
DBUF_STAT_BUMP(hash_misses);
|
||||
|
||||
/* Skip the wait per the caller's request. */
|
||||
if ((flags & DB_RF_NEVERWAIT) == 0) {
|
||||
mutex_enter(&db->db_mtx);
|
||||
while (db->db_state == DB_READ ||
|
||||
db->db_state == DB_FILL) {
|
||||
ASSERT(db->db_state == DB_READ ||
|
||||
(flags & DB_RF_HAVESTRUCT) == 0);
|
||||
DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
|
||||
db, zio_t *, pio);
|
||||
cv_wait(&db->db_changed, &db->db_mtx);
|
||||
}
|
||||
if (db->db_state == DB_UNCACHED)
|
||||
err = SET_ERROR(EIO);
|
||||
mutex_exit(&db->db_mtx);
|
||||
}
|
||||
/* dbuf_read_impl drops db_mtx and parent's rwlock. */
|
||||
miss = (db->db_state != DB_CACHED);
|
||||
}
|
||||
|
||||
if (err == 0 && prefetch) {
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
|
||||
flags & DB_RF_HAVESTRUCT);
|
||||
}
|
||||
DB_DNODE_EXIT(db);
|
||||
|
||||
/*
|
||||
* If we created a zio we must execute it to avoid leaking it, even if
|
||||
* it isn't attached to any work due to an error in dbuf_read_impl().
|
||||
*/
|
||||
if (need_wait) {
|
||||
if (err == 0)
|
||||
err = zio_wait(pio);
|
||||
else
|
||||
(void) zio_wait(pio);
|
||||
pio = NULL;
|
||||
}
|
||||
|
||||
done:
|
||||
if (miss)
|
||||
DBUF_STAT_BUMP(hash_misses);
|
||||
else
|
||||
DBUF_STAT_BUMP(hash_hits);
|
||||
if (pio && err != 0) {
|
||||
zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
|
||||
ZIO_FLAG_CANFAIL);
|
||||
|
|
Loading…
Reference in New Issue