Refactor dbuf_read() for safer decryption
In dbuf_read_verify_dnode_crypt():
- We don't need the original dbuf locked there. Instead, take the lock on the dnode dbuf that is actually being manipulated.
- Block decryption of a dnode dbuf while it is being written. The ARC hash lock does not protect anonymous buffers, so arc_untransform() is unsafe on buffers that are currently being written, which can happen for encrypted dnode buffers, since they are not copied by dbuf_dirty()/dbuf_hold_copy().

In dbuf_read():
- If the buffer is in flight, recheck its compression/encryption status after it becomes cached, since it may still need arc_untransform().

Tested-by: Rich Ercolani <rincebrain@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #16104
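For reference, the heart of the dbuf_read_verify_dnode_crypt() change is the wait loop below, condensed from the diff further down (bookmark setup, error handling, and the enclosing function are omitted, so this fragment is illustrative rather than buildable on its own):

    dnbuf = dndb->db_buf;            /* dndb is the dnode's own dbuf */
    if (!arc_is_encrypted(dnbuf))
        return (0);

    mutex_enter(&dndb->db_mtx);
    /*
     * The dnode buffer may be under writeback.  Decrypting it in place
     * with arc_untransform() would race with the writer, so wait until
     * it is no longer the data of the pending dirty record.
     */
    while (1) {
        if (!arc_is_encrypted(dnbuf)) {
            mutex_exit(&dndb->db_mtx);
            return (0);
        }
        dbuf_dirty_record_t *dr = dndb->db_data_pending;
        if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
            break;
        cv_wait(&dndb->db_changed, &dndb->db_mtx);
    }

    err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);
    ...
    mutex_exit(&dndb->db_mtx);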
commit b474dfad0d
parent 9edf6af4ae
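The dbuf_read() change described in the message above shows up in the last hunks of the diff: hit/miss accounting, prefetch, and zio cleanup are consolidated behind a single done: label, and a buffer that was in flight (DB_READ/DB_FILL) is re-examined once it reaches DB_CACHED so that arc_untransform() can still be applied. Below is a condensed view of the new control flow, with locking, DB_RF_NEVERWAIT handling, and tracing stripped out for brevity (identifiers as in the diff; not a standalone function):

    miss = (db->db_state != DB_CACHED);

    if (db->db_state == DB_READ || db->db_state == DB_FILL) {
        /* Wait for the in-flight read or fill to settle. */
        do {
            cv_wait(&db->db_changed, &db->db_mtx);
        } while (db->db_state == DB_READ || db->db_state == DB_FILL);
        if (db->db_state == DB_UNCACHED) {
            err = SET_ERROR(EIO);
            goto done;
        }
    }

    if (db->db_state == DB_CACHED) {
        /* Recheck compression/encryption now that the buffer is cached. */
        if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
            (arc_is_encrypted(db->db_buf) ||
            arc_is_unauthenticated(db->db_buf) ||
            arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
            /* arc_untransform() in place, then dbuf_set_data() */
        }
    } else {
        /* DB_UNCACHED or DB_NOFILL: issue the read via dbuf_read_impl(). */
    }

done:
    if (miss)
        DBUF_STAT_BUMP(hash_misses);
    else
        DBUF_STAT_BUMP(hash_hits);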
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -161,13 +161,13 @@ struct {
 } dbuf_sums;
 
 #define DBUF_STAT_INCR(stat, val) \
-    wmsum_add(&dbuf_sums.stat, val);
+    wmsum_add(&dbuf_sums.stat, val)
 #define DBUF_STAT_DECR(stat, val) \
-    DBUF_STAT_INCR(stat, -(val));
+    DBUF_STAT_INCR(stat, -(val))
 #define DBUF_STAT_BUMP(stat) \
-    DBUF_STAT_INCR(stat, 1);
+    DBUF_STAT_INCR(stat, 1)
 #define DBUF_STAT_BUMPDOWN(stat) \
-    DBUF_STAT_INCR(stat, -1);
+    DBUF_STAT_INCR(stat, -1)
 #define DBUF_STAT_MAX(stat, v) { \
     uint64_t _m; \
     while ((v) > (_m = dbuf_stats.stat.value.ui64) && \
@@ -177,7 +177,6 @@ struct {
 
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
-static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
 
 /*
  * Global data structures and functions for the dbuf cache.
@@ -1403,13 +1402,9 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
  * a decrypted block. Otherwise success.
  */
 static int
-dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
+dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)
 {
-    int bonuslen, max_bonuslen, err;
-
-    err = dbuf_read_verify_dnode_crypt(db, flags);
-    if (err)
-        return (err);
+    int bonuslen, max_bonuslen;
 
     bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
     max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
@@ -1494,32 +1489,46 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
  * decrypt / authenticate them when we need to read an encrypted bonus buffer.
  */
 static int
-dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
+dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
 {
-    int err = 0;
     objset_t *os = db->db_objset;
-    arc_buf_t *dnode_abuf;
-    dnode_t *dn;
+    dmu_buf_impl_t *dndb;
+    arc_buf_t *dnbuf;
     zbookmark_phys_t zb;
-
-    ASSERT(MUTEX_HELD(&db->db_mtx));
+    int err;
 
     if ((flags & DB_RF_NO_DECRYPT) != 0 ||
-        !os->os_encrypted || os->os_raw_receive)
+        !os->os_encrypted || os->os_raw_receive ||
+        (dndb = dn->dn_dbuf) == NULL)
         return (0);
 
-    DB_DNODE_ENTER(db);
-    dn = DB_DNODE(db);
-    dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;
-
-    if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
-        DB_DNODE_EXIT(db);
+    dnbuf = dndb->db_buf;
+    if (!arc_is_encrypted(dnbuf))
         return (0);
-    }
+
+    mutex_enter(&dndb->db_mtx);
+
+    /*
+     * Since dnode buffer is modified by sync process, there can be only
+     * one copy of it. It means we can not modify (decrypt) it while it
+     * is being written. I don't see how this may happen now, since
+     * encrypted dnode writes by receive should be completed before any
+     * plain-text reads due to txg wait, but better be safe than sorry.
+     */
+    while (1) {
+        if (!arc_is_encrypted(dnbuf)) {
+            mutex_exit(&dndb->db_mtx);
+            return (0);
+        }
+        dbuf_dirty_record_t *dr = dndb->db_data_pending;
+        if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
+            break;
+        cv_wait(&dndb->db_changed, &dndb->db_mtx);
+    };
 
     SET_BOOKMARK(&zb, dmu_objset_id(os),
-        DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
-    err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);
+        DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);
+    err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);
 
     /*
      * An error code of EACCES tells us that the key is still not
@@ -1532,7 +1541,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
         !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
         err = 0;
 
-    DB_DNODE_EXIT(db);
+    mutex_exit(&dndb->db_mtx);
 
     return (err);
 }
@@ -1558,7 +1567,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
         RW_LOCK_HELD(&db->db_parent->db_rwlock));
 
     if (db->db_blkid == DMU_BONUS_BLKID) {
-        err = dbuf_read_bonus(db, dn, flags);
+        err = dbuf_read_bonus(db, dn);
         goto early_unlock;
     }
 
@@ -1619,10 +1628,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
         goto early_unlock;
     }
 
-    err = dbuf_read_verify_dnode_crypt(db, flags);
-    if (err != 0)
-        goto early_unlock;
-
     db->db_state = DB_READ;
     DTRACE_SET_STATE(db, "read issued");
     mutex_exit(&db->db_mtx);
@@ -1738,19 +1743,23 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 int
 dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 {
-    int err = 0;
-    boolean_t prefetch;
     dnode_t *dn;
+    boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
+    int err;
 
-    /*
-     * We don't have to hold the mutex to check db_state because it
-     * can't be freed while we have a hold on the buffer.
-     */
     ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 
     DB_DNODE_ENTER(db);
     dn = DB_DNODE(db);
 
+    /*
+     * Ensure that this block's dnode has been decrypted if the caller
+     * has requested decrypted data.
+     */
+    err = dbuf_read_verify_dnode_crypt(db, dn, flags);
+    if (err != 0)
+        goto done;
+
     prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
         (flags & DB_RF_NOPREFETCH) == 0;
 
@@ -1759,13 +1768,38 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
         db->db_partial_read = B_TRUE;
     else if (!(flags & DB_RF_PARTIAL_MORE))
         db->db_partial_read = B_FALSE;
-    if (db->db_state == DB_CACHED) {
-        /*
-         * Ensure that this block's dnode has been decrypted if
-         * the caller has requested decrypted data.
-         */
-        err = dbuf_read_verify_dnode_crypt(db, flags);
+    miss = (db->db_state != DB_CACHED);
 
+    if (db->db_state == DB_READ || db->db_state == DB_FILL) {
+        /*
+         * Another reader came in while the dbuf was in flight between
+         * UNCACHED and CACHED. Either a writer will finish filling
+         * the buffer, sending the dbuf to CACHED, or the first reader's
+         * request will reach the read_done callback and send the dbuf
+         * to CACHED. Otherwise, a failure occurred and the dbuf will
+         * be sent to UNCACHED.
+         */
+        if (flags & DB_RF_NEVERWAIT) {
+            mutex_exit(&db->db_mtx);
+            DB_DNODE_EXIT(db);
+            goto done;
+        }
+        do {
+            ASSERT(db->db_state == DB_READ ||
+                (flags & DB_RF_HAVESTRUCT) == 0);
+            DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,
+                zio_t *, pio);
+            cv_wait(&db->db_changed, &db->db_mtx);
+        } while (db->db_state == DB_READ || db->db_state == DB_FILL);
+        if (db->db_state == DB_UNCACHED) {
+            err = SET_ERROR(EIO);
+            mutex_exit(&db->db_mtx);
+            DB_DNODE_EXIT(db);
+            goto done;
+        }
+    }
+
+    if (db->db_state == DB_CACHED) {
         /*
          * If the arc buf is compressed or encrypted and the caller
          * requested uncompressed data, we need to untransform it
@@ -1773,8 +1807,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
          * unauthenticated blocks, which will verify their MAC if
          * the key is now available.
          */
-        if (err == 0 && db->db_buf != NULL &&
-            (flags & DB_RF_NO_DECRYPT) == 0 &&
+        if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
             (arc_is_encrypted(db->db_buf) ||
             arc_is_unauthenticated(db->db_buf) ||
             arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
@@ -1788,17 +1821,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
             dbuf_set_data(db, db->db_buf);
         }
         mutex_exit(&db->db_mtx);
-        if (err == 0 && prefetch) {
-            dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-                B_FALSE, flags & DB_RF_HAVESTRUCT);
-        }
-        DB_DNODE_EXIT(db);
-        DBUF_STAT_BUMP(hash_hits);
-    } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
-        boolean_t need_wait = B_FALSE;
-
+    } else {
+        ASSERT(db->db_state == DB_UNCACHED ||
+            db->db_state == DB_NOFILL);
         db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
-
         if (pio == NULL && (db->db_state == DB_NOFILL ||
             (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
             spa_t *spa = dn->dn_objset->os_spa;
@@ -1806,65 +1832,33 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
             need_wait = B_TRUE;
         }
         err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
-        /*
-         * dbuf_read_impl has dropped db_mtx and our parent's rwlock
-         * for us
-         */
-        if (!err && prefetch) {
-            dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-                db->db_state != DB_CACHED,
-                flags & DB_RF_HAVESTRUCT);
-        }
-
-        DB_DNODE_EXIT(db);
-        DBUF_STAT_BUMP(hash_misses);
-
-        /*
-         * If we created a zio_root we must execute it to avoid
-         * leaking it, even if it isn't attached to any work due
-         * to an error in dbuf_read_impl().
-         */
-        if (need_wait) {
-            if (err == 0)
-                err = zio_wait(pio);
-            else
-                (void) zio_wait(pio);
-            pio = NULL;
-        }
-    } else {
-        /*
-         * Another reader came in while the dbuf was in flight
-         * between UNCACHED and CACHED. Either a writer will finish
-         * writing the buffer (sending the dbuf to CACHED) or the
-         * first reader's request will reach the read_done callback
-         * and send the dbuf to CACHED. Otherwise, a failure
-         * occurred and the dbuf went to UNCACHED.
-         */
-        mutex_exit(&db->db_mtx);
-        if (prefetch) {
-            dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-                B_TRUE, flags & DB_RF_HAVESTRUCT);
-        }
-        DB_DNODE_EXIT(db);
-        DBUF_STAT_BUMP(hash_misses);
-
-        /* Skip the wait per the caller's request. */
-        if ((flags & DB_RF_NEVERWAIT) == 0) {
-            mutex_enter(&db->db_mtx);
-            while (db->db_state == DB_READ ||
-                db->db_state == DB_FILL) {
-                ASSERT(db->db_state == DB_READ ||
-                    (flags & DB_RF_HAVESTRUCT) == 0);
-                DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
-                    db, zio_t *, pio);
-                cv_wait(&db->db_changed, &db->db_mtx);
-            }
-            if (db->db_state == DB_UNCACHED)
-                err = SET_ERROR(EIO);
-            mutex_exit(&db->db_mtx);
-        }
+        /* dbuf_read_impl drops db_mtx and parent's rwlock. */
+        miss = (db->db_state != DB_CACHED);
     }
 
+    if (err == 0 && prefetch) {
+        dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
+            flags & DB_RF_HAVESTRUCT);
+    }
+    DB_DNODE_EXIT(db);
+
+    /*
+     * If we created a zio we must execute it to avoid leaking it, even if
+     * it isn't attached to any work due to an error in dbuf_read_impl().
+     */
+    if (need_wait) {
+        if (err == 0)
+            err = zio_wait(pio);
+        else
+            (void) zio_wait(pio);
+        pio = NULL;
+    }
+
+done:
+    if (miss)
+        DBUF_STAT_BUMP(hash_misses);
+    else
+        DBUF_STAT_BUMP(hash_hits);
     if (pio && err != 0) {
         zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
             ZIO_FLAG_CANFAIL);