Updating based on PR Feedback(3)

1. Unified the block cloning and Direct I/O code paths further. As part
   of this unification, it is important to outline that Direct I/O
   writes transition the db_state to DB_UNCACHED. This is used so that
   dbuf_unoverride() is called when dbuf_undirty() is called. This is
   needed to cleanup space accounting in a TXG. When a dbuf is redirtied
   through dbuf_redirty(), then dbuf_unoverride() is also called to
   clean up space accounting. This is a bit of a different approach that
   block cloning, which always calls dbuf_undirty().
2. As part of uniying the two, Direct I/O also performs the same check
   in dmu_buf_will_fill() so that on failure the previous contents of
   the dbuf are set correctly.
3. General just code cleanup removing checks that are no longer
   necessary.

Signed-off-by: Brian Atkinson <batkinson@lanl.gov>
This commit is contained in:
Brian Atkinson 2024-07-16 10:55:59 -06:00
parent b1ee363675
commit 6e0ffaf627
4 changed files with 55 additions and 94 deletions

View File

@ -176,6 +176,7 @@ typedef struct dbuf_dirty_record {
uint8_t dr_copies; uint8_t dr_copies;
boolean_t dr_nopwrite; boolean_t dr_nopwrite;
boolean_t dr_brtwrite; boolean_t dr_brtwrite;
boolean_t dr_diowrite;
boolean_t dr_has_raw_params; boolean_t dr_has_raw_params;
/* /*
@ -467,20 +468,6 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg)
return (NULL); return (NULL);
} }
static inline boolean_t
dbuf_dirty_is_direct_write(dmu_buf_impl_t *db, dbuf_dirty_record_t *dr)
{
boolean_t ret = B_FALSE;
ASSERT(MUTEX_HELD(&db->db_mtx));
if (dr != NULL && db->db_level == 0 && !dr->dt.dl.dr_brtwrite &&
dr->dt.dl.dr_override_state == DR_OVERRIDDEN &&
dr->dt.dl.dr_data == NULL) {
ret = B_TRUE;
}
return (ret);
}
#define DBUF_GET_BUFC_TYPE(_db) \ #define DBUF_GET_BUFC_TYPE(_db) \
(dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)

View File

@ -1255,16 +1255,6 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
ASSERT(buf != NULL); ASSERT(buf != NULL);
db->db_buf = buf; db->db_buf = buf;
/*
* If there is a Direct I/O, set its data too. Then its state
* will be the same as if we did a ZIL dmu_sync().
*/
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
if (dbuf_dirty_is_direct_write(db, dr)) {
dr->dt.dl.dr_data = db->db_buf;
}
ASSERT(buf->b_data != NULL); ASSERT(buf->b_data != NULL);
db->db.db_data = buf->b_data; db->db.db_data = buf->b_data;
} }
@ -1843,7 +1833,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
/* /*
* If a block clone or Direct I/O write has occurred we will * If a block clone or Direct I/O write has occurred we will
* get the dirty records overridden BP so we get the most * get the dirty records overridden BP so we get the most
* recent data.. * recent data.
*/ */
err = dmu_buf_get_bp_from_dbuf(db, &bp); err = dmu_buf_get_bp_from_dbuf(db, &bp);
@ -1948,13 +1938,14 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
zio_free(db->db_objset->os_spa, txg, bp); zio_free(db->db_objset->os_spa, txg, bp);
if (dr->dt.dl.dr_brtwrite) { if (dr->dt.dl.dr_brtwrite || dr->dt.dl.dr_diowrite) {
ASSERT0P(dr->dt.dl.dr_data); ASSERT0P(dr->dt.dl.dr_data);
dr->dt.dl.dr_data = db->db_buf; dr->dt.dl.dr_data = db->db_buf;
} }
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE; dr->dt.dl.dr_nopwrite = B_FALSE;
dr->dt.dl.dr_brtwrite = B_FALSE; dr->dt.dl.dr_brtwrite = B_FALSE;
dr->dt.dl.dr_diowrite = B_FALSE;
dr->dt.dl.dr_has_raw_params = B_FALSE; dr->dt.dl.dr_has_raw_params = B_FALSE;
/* /*
@ -2161,26 +2152,11 @@ dbuf_redirty(dbuf_dirty_record_t *dr)
*/ */
dbuf_unoverride(dr); dbuf_unoverride(dr);
if (db->db.db_object != DMU_META_DNODE_OBJECT && if (db->db.db_object != DMU_META_DNODE_OBJECT &&
db->db_state != DB_NOFILL && db->db_buf != NULL) { db->db_state != DB_NOFILL) {
/* /* Already released on initial dirty, so just thaw. */
* Already released on initial dirty,
* so just thaw.
*/
ASSERT(arc_released(db->db_buf)); ASSERT(arc_released(db->db_buf));
arc_buf_thaw(db->db_buf); arc_buf_thaw(db->db_buf);
} }
/*
* If initial dirty was via Direct I/O, may not have a dr_data.
*
* If the dirty record was associated with cloned block then
* the call above to dbuf_unoverride() will have reset
* dr->dt.dl.dr_data and it will not be NULL here.
*/
if (dr->dt.dl.dr_data == NULL) {
ASSERT3B(dbuf_dirty_is_direct_write(db, dr), ==,
B_TRUE);
dr->dt.dl.dr_data = db->db_buf;
}
} }
} }
@ -2564,6 +2540,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{ {
uint64_t txg = tx->tx_txg; uint64_t txg = tx->tx_txg;
boolean_t brtwrite; boolean_t brtwrite;
boolean_t diowrite;
ASSERT(txg != 0); ASSERT(txg != 0);
@ -2589,7 +2566,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(dr->dr_dbuf == db); ASSERT(dr->dr_dbuf == db);
brtwrite = dr->dt.dl.dr_brtwrite; brtwrite = dr->dt.dl.dr_brtwrite;
diowrite = dr->dt.dl.dr_diowrite;
if (brtwrite) { if (brtwrite) {
ASSERT3B(diowrite, ==, B_FALSE);
/* /*
* We are freeing a block that we cloned in the same * We are freeing a block that we cloned in the same
* transaction group. * transaction group.
@ -2630,11 +2609,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (db->db_state != DB_NOFILL && !brtwrite) { if (db->db_state != DB_NOFILL && !brtwrite) {
dbuf_unoverride(dr); dbuf_unoverride(dr);
/* if (dr->dt.dl.dr_data != db->db_buf) {
* In the Direct I/O case, the buffer is still dirty, but it
* may be UNCACHED, so we do not need to destroy an ARC buffer.
*/
if (dr->dt.dl.dr_data && dr->dt.dl.dr_data != db->db_buf) {
ASSERT(db->db_buf != NULL); ASSERT(db->db_buf != NULL);
ASSERT(dr->dt.dl.dr_data != NULL); ASSERT(dr->dt.dl.dr_data != NULL);
arc_buf_destroy(dr->dt.dl.dr_data, db); arc_buf_destroy(dr->dt.dl.dr_data, db);
@ -2647,12 +2622,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
db->db_dirtycnt -= 1; db->db_dirtycnt -= 1;
if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
/* ASSERT(db->db_state == DB_NOFILL || brtwrite || diowrite ||
* In the Direct I/O case our db_buf will be NULL as we are not arc_released(db->db_buf));
* caching in the ARC.
*/
ASSERT(db->db_state == DB_NOFILL || brtwrite ||
db->db_buf == NULL || arc_released(db->db_buf));
dbuf_destroy(db); dbuf_destroy(db);
return (B_TRUE); return (B_TRUE);
} }
@ -2711,8 +2682,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
* Block cloning: Do the dbuf_read() before undirtying the dbuf, as we * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
* want to make sure dbuf_read() will read the pending cloned block and * want to make sure dbuf_read() will read the pending cloned block and
* not the uderlying block that is being replaced. dbuf_undirty() will * not the uderlying block that is being replaced. dbuf_undirty() will
* do dbuf_unoverride(), so we will end up with cloned block content, * do brt_pending_remove() before removing the dirty record.
* without overridden BP.
*/ */
(void) dbuf_read(db, NULL, flags); (void) dbuf_read(db, NULL, flags);
if (undirty) { if (undirty) {
@ -2761,20 +2731,17 @@ dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp)
*bp = db->db_blkptr; *bp = db->db_blkptr;
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
if (dr) { if (dr && db->db_state == DB_NOFILL) {
if (db->db_state == DB_NOFILL) {
/* Block clone */ /* Block clone */
if (!dr->dt.dl.dr_brtwrite) if (!dr->dt.dl.dr_brtwrite)
error = EIO; error = EIO;
else else
*bp = &dr->dt.dl.dr_overridden_by; *bp = &dr->dt.dl.dr_overridden_by;
} else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN && } else if (dr && db->db_state == DB_UNCACHED) {
dr->dt.dl.dr_data == NULL) {
ASSERT(db->db_state == DB_UNCACHED);
/* Direct I/O write */ /* Direct I/O write */
if (dr->dt.dl.dr_diowrite)
*bp = &dr->dt.dl.dr_overridden_by; *bp = &dr->dt.dl.dr_overridden_by;
} }
}
return (error); return (error);
} }
@ -2929,22 +2896,29 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
dmu_tx_private_ok(tx)); dmu_tx_private_ok(tx));
mutex_enter(&db->db_mtx); mutex_enter(&db->db_mtx);
if (db->db_state == DB_NOFILL) { dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
if (db->db_state == DB_NOFILL ||
(db->db_state == DB_UNCACHED && dr && dr->dt.dl.dr_diowrite)) {
/* /*
* Block cloning: We will be completely overwriting a block * If the fill can fail we should have a way to return back to
* cloned in this transaction group, so let's undirty the * the cloned or Direct I/O write data.
* pending clone and mark the block as uncached. This will be
* as if the clone was never done. But if the fill can fail
* we should have a way to return back to the cloned data.
*/ */
if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) { if (canfail && dr) {
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
dmu_buf_will_dirty(db_fake, tx); dmu_buf_will_dirty(db_fake, tx);
return; return;
} }
/*
* Block cloning: We will be completely overwriting a block
* cloned in this transaction group, so let's undirty the
* pending clone and mark the block as uncached. This will be
* as if the clone was never done.
*/
if (dr && dr->dt.dl.dr_brtwrite) {
VERIFY(!dbuf_undirty(db, tx)); VERIFY(!dbuf_undirty(db, tx));
db->db_state = DB_UNCACHED; db->db_state = DB_UNCACHED;
} }
}
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
dbuf_noread(db); dbuf_noread(db);
@ -5085,6 +5059,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
if (dr->dt.dl.dr_data != NULL && if (dr->dt.dl.dr_data != NULL &&
dr->dt.dl.dr_data != db->db_buf) { dr->dt.dl.dr_data != db->db_buf) {
ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE); ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE);
ASSERT3B(dr->dt.dl.dr_diowrite, ==, B_FALSE);
arc_buf_destroy(dr->dt.dl.dr_data, db); arc_buf_destroy(dr->dt.dl.dr_data, db);
} }
} else { } else {
@ -5146,8 +5121,6 @@ dbuf_write_override_done(zio_t *zio)
if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_EQUAL(zio->io_bp, obp)) {
if (!BP_IS_HOLE(obp)) if (!BP_IS_HOLE(obp))
dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
if (dr->dt.dl.dr_data && dr->dt.dl.dr_data != db->db_buf)
arc_release(dr->dt.dl.dr_data, db); arc_release(dr->dt.dl.dr_data, db);
} }
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
@ -5355,14 +5328,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
* (by dmu_sync(), dmu_write_direct(), * (by dmu_sync(), dmu_write_direct(),
* or dmu_buf_write_embedded()). * or dmu_buf_write_embedded()).
*/ */
blkptr_t *bp = &dr->dt.dl.dr_overridden_by; abd_t *contents = (data != NULL) ?
abd_t *contents = NULL; abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
if (data) {
ASSERT(BP_IS_HOLE(bp) ||
arc_buf_lsize(data) == BP_GET_LSIZE(bp));
contents = abd_get_from_buf(data->b_data,
arc_buf_size(data));
}
dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
contents, db->db.db_size, db->db.db_size, &zp, contents, db->db.db_size, db->db.db_size, &zp,
@ -5371,8 +5338,9 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx); mutex_enter(&db->db_mtx);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
zio_write_override(dr->dr_zio, bp, dr->dt.dl.dr_copies, zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite); dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
dr->dt.dl.dr_brtwrite);
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
} else if (data == NULL) { } else if (data == NULL) {
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||

View File

@ -107,8 +107,12 @@ dmu_write_direct_done(zio_t *zio)
ASSERT3U(zio->io_error, ==, EAGAIN); ASSERT3U(zio->io_error, ==, EAGAIN);
/* /*
* In the event of an I/O error the metaslab cleanup is taken * In the event of an I/O error this block has been freed in
* care of in zio_done(). * zio_done() through zio_dva_unallocate(). Calling
* dmu_sync_done() above set dr_override_state to
* DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls
* dbuf_unoverride(), it will skip doing zio_free() to free
* this block as that was already taken care of.
* *
* Since we are undirtying the record in open-context, we must * Since we are undirtying the record in open-context, we must
* have a hold on the db, so it should never be evicted after * have a hold on the db, so it should never be evicted after
@ -154,6 +158,7 @@ dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
dr_head = list_head(&db->db_dirty_records); dr_head = list_head(&db->db_dirty_records);
ASSERT3U(dr_head->dr_txg, ==, txg); ASSERT3U(dr_head->dr_txg, ==, txg);
dr_head->dt.dl.dr_diowrite = B_TRUE;
dr_head->dr_accounted = db->db.db_size; dr_head->dr_accounted = db->db.db_size;
blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);

View File

@ -1154,11 +1154,12 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
if (error == 0) { if (error == 0) {
zgd->zgd_db = dbp; zgd->zgd_db = dbp;
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp; dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp;
boolean_t direct_write = B_FALSE;
mutex_enter(&db->db_mtx); mutex_enter(&db->db_mtx);
dbuf_dirty_record_t *dr = dbuf_dirty_record_t *dr =
dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg); dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg);
boolean_t direct_write = if (dr != NULL && dr->dt.dl.dr_diowrite)
dbuf_dirty_is_direct_write(db, dr); direct_write = B_TRUE;
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
/* /*