Remove fastwrite mechanism.

Fastwrite was introduced many years ago to improve the spread of ZIL
writes across multiple top-level vdevs by tracking the number of
allocated but not yet written blocks on each vdev and choosing the
vdev with the smallest count.  It was supposed to reduce the ZIL's
knowledge of allocation details, but it actually made the ZIL report
its allocations to the allocation code even more actively,
complicating both the ZIL and the metaslab code.
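
For context, a minimal standalone model of the heuristic being
removed (fastwrite_pick() is a hypothetical helper and the types are
simplified stand-ins, not the actual ZFS structures): each top-level
vdev keeps a count of allocated-but-unwritten bytes, and every ZIL
allocation scans the whole rotor for the vdev with the smallest count.

#include <stdint.h>
#include <stdio.h>

typedef struct vdev {
	uint64_t vdev_pending_fastwrite;	/* allocated, unwritten bytes */
} vdev_t;

typedef struct metaslab_group {
	vdev_t *mg_vd;
	struct metaslab_group *mg_next;		/* circular list (the rotor) */
} metaslab_group_t;

/* Walk the whole rotor; pick the group whose vdev has the least backlog. */
static metaslab_group_t *
fastwrite_pick(metaslab_group_t *rotor)
{
	metaslab_group_t *mg = rotor, *fast_mg = rotor;

	do {
		if (fast_mg->mg_vd->vdev_pending_fastwrite <
		    mg->mg_vd->vdev_pending_fastwrite)
			mg = fast_mg;
	} while ((fast_mg = fast_mg->mg_next) != rotor);

	return (mg);
}

int
main(void)
{
	vdev_t vd[3] = { { 300 }, { 100 }, { 200 } };
	metaslab_group_t mg[3] = {
		{ &vd[0], &mg[1] }, { &vd[1], &mg[2] }, { &vd[2], &mg[0] }
	};

	/* Prints 1: vd[1] has the fewest pending fastwrite bytes. */
	printf("picked vdev %d\n", (int)(fastwrite_pick(&mg[0])->mg_vd - vd));
	return (0);
}

Note that the selection is an O(top-level vdevs) scan on every ZIL
block allocation, on top of the counter updates the ZIL had to feed
back to the allocator.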

On top of that, it seems the ZIO_FLAG_FASTWRITE setting in dmu_sync()
was lost many years ago, even though it was one of the mechanism's
declared benefits.  Plus, the introduction of the embedded log
metaslab class solved another problem, the allocation rotor
accounting both normal and log allocations, since in most cases those
now live in different metaslab classes.
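
For illustration, this is roughly what remains: within a metaslab
class, a ZIL allocation now simply forces the rotor to advance to the
next top-level vdev (see the metaslab_alloc_dva() hunk below), while
regular writes advance it only after consuming the group's aliquot.
A compile-only sketch; rotor_advance() is a hypothetical helper with
simplified stand-in types, and the real code uses atomic_add_64_nv()
plus a per-group mg_bias:

#include <stdint.h>

#define	METASLAB_ZIL	0x80	/* flag value as in metaslab.h */

typedef struct metaslab_group {
	struct metaslab_group *mg_next;	/* circular rotor list */
	uint64_t mg_aliquot;		/* bytes to place here per turn */
} metaslab_group_t;

typedef struct metaslab_class_allocator {
	metaslab_group_t *mca_rotor;	/* group to try first */
	uint64_t mca_aliquot;		/* bytes placed this turn so far */
} metaslab_class_allocator_t;

/* Rotor accounting after a successful allocation of asize bytes. */
static void
rotor_advance(metaslab_class_allocator_t *mca, metaslab_group_t *mg,
    uint64_t asize, int flags)
{
	/*
	 * A ZIL allocation always moves the rotor on; other writes
	 * move on only once the group's aliquot is consumed.
	 */
	if ((flags & METASLAB_ZIL) ||
	    (mca->mca_aliquot += asize) >= mg->mg_aliquot) {
		mca->mca_rotor = mg->mg_next;
		mca->mca_aliquot = 0;
	}
}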

After all that, I'd prefer to simplify the already too complicated
ZIL, ZIO and metaslab code if the benefit of the complexity is not
obvious.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15107
commit b22bab2547
parent 5bdfff5cfc
Alexander Motin authored 2023-07-28 16:30:33 -04:00, committed by GitHub
8 changed files with 8 additions and 123 deletions

include/sys/metaslab.h

@@ -80,7 +80,6 @@ uint64_t metaslab_largest_allocatable(metaslab_t *);
 #define	METASLAB_ASYNC_ALLOC		0x8
 #define	METASLAB_DONT_THROTTLE		0x10
 #define	METASLAB_MUST_RESERVE		0x20
-#define	METASLAB_FASTWRITE		0x40
 #define	METASLAB_ZIL			0x80
 
 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
@@ -96,8 +95,6 @@ void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
 int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
 int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
 void metaslab_check_free(spa_t *, const blkptr_t *);
-void metaslab_fastwrite_mark(spa_t *, const blkptr_t *);
-void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *);
 void metaslab_stat_init(void);
 void metaslab_stat_fini(void);

include/sys/vdev_impl.h

@@ -266,7 +266,6 @@ struct vdev {
 	metaslab_group_t *vdev_mg;	/* metaslab group */
 	metaslab_group_t *vdev_log_mg;	/* embedded slog metaslab group */
 	metaslab_t **vdev_ms;		/* metaslab array */
-	uint64_t vdev_pending_fastwrite; /* allocated fastwrites */
 	txg_list_t vdev_ms_list;	/* per-txg dirty metaslab lists */
 	txg_list_t vdev_dtl_list;	/* per-txg dirty DTL lists */
 	txg_node_t vdev_txg_node;	/* per-txg dirty vdev linkage */

include/sys/zil_impl.h

@@ -91,7 +91,6 @@ typedef enum {
 typedef struct lwb {
 	zilog_t		*lwb_zilog;	/* back pointer to log struct */
 	blkptr_t	lwb_blk;	/* on disk address of this log blk */
-	boolean_t	lwb_fastwrite;	/* is blk marked for fastwrite? */
 	boolean_t	lwb_slog;	/* lwb_blk is on SLOG device */
 	boolean_t	lwb_indirect;	/* do not postpone zil_lwb_commit() */
 	int		lwb_nused;	/* # used bytes in buffer */

include/sys/zio.h

@@ -222,7 +222,6 @@ typedef uint64_t zio_flag_t;
 #define	ZIO_FLAG_NOPWRITE	(1ULL << 28)
 #define	ZIO_FLAG_REEXECUTED	(1ULL << 29)
 #define	ZIO_FLAG_DELEGATED	(1ULL << 30)
-#define	ZIO_FLAG_FASTWRITE	(1ULL << 31)
 
 #define	ZIO_FLAG_MUSTSUCCEED	0
 #define	ZIO_FLAG_RAW	(ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)

module/zfs/metaslab.c

@@ -5101,7 +5101,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	zio_alloc_list_t *zal, int allocator)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
-	metaslab_group_t *mg, *fast_mg, *rotor;
+	metaslab_group_t *mg, *rotor;
 	vdev_t *vd;
 	boolean_t try_hard = B_FALSE;
 
@@ -5164,15 +5164,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	} else if (d != 0) {
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
 		mg = vd->vdev_mg->mg_next;
-	} else if (flags & METASLAB_FASTWRITE) {
-		mg = fast_mg = mca->mca_rotor;
-
-		do {
-			if (fast_mg->mg_vd->vdev_pending_fastwrite <
-			    mg->mg_vd->vdev_pending_fastwrite)
-				mg = fast_mg;
-		} while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor);
-
 	} else {
 		ASSERT(mca->mca_rotor != NULL);
 		mg = mca->mca_rotor;
@@ -5297,7 +5288,7 @@ top:
 				mg->mg_bias = 0;
 			}
 
-			if ((flags & METASLAB_FASTWRITE) ||
+			if ((flags & METASLAB_ZIL) ||
 			    atomic_add_64_nv(&mca->mca_aliquot, asize) >=
 			    mg->mg_aliquot + mg->mg_bias) {
 				mca->mca_rotor = mg->mg_next;
@@ -5310,11 +5301,6 @@ top:
 			    ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
 			DVA_SET_ASIZE(&dva[d], asize);
 
-			if (flags & METASLAB_FASTWRITE) {
-				atomic_add_64(&vd->vdev_pending_fastwrite,
-				    psize);
-			}
-
 			return (0);
 		}
 next:
@@ -5950,55 +5936,6 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 	return (error);
 }
 
-void
-metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
-{
-	const dva_t *dva = bp->blk_dva;
-	int ndvas = BP_GET_NDVAS(bp);
-	uint64_t psize = BP_GET_PSIZE(bp);
-	int d;
-	vdev_t *vd;
-
-	ASSERT(!BP_IS_HOLE(bp));
-	ASSERT(!BP_IS_EMBEDDED(bp));
-	ASSERT(psize > 0);
-
-	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-
-	for (d = 0; d < ndvas; d++) {
-		if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
-			continue;
-		atomic_add_64(&vd->vdev_pending_fastwrite, psize);
-	}
-
-	spa_config_exit(spa, SCL_VDEV, FTAG);
-}
-
-void
-metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
-{
-	const dva_t *dva = bp->blk_dva;
-	int ndvas = BP_GET_NDVAS(bp);
-	uint64_t psize = BP_GET_PSIZE(bp);
-	int d;
-	vdev_t *vd;
-
-	ASSERT(!BP_IS_HOLE(bp));
-	ASSERT(!BP_IS_EMBEDDED(bp));
-	ASSERT(psize > 0);
-
-	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-
-	for (d = 0; d < ndvas; d++) {
-		if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
-			continue;
-		ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
-		atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
-	}
-
-	spa_config_exit(spa, SCL_VDEV, FTAG);
-}
-
 static void
 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)

module/zfs/vdev.c

@@ -1192,7 +1192,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 	ASSERT(tvd == tvd->vdev_top);
 
-	tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite;
 	tvd->vdev_ms_array = svd->vdev_ms_array;
 	tvd->vdev_ms_shift = svd->vdev_ms_shift;
 	tvd->vdev_ms_count = svd->vdev_ms_count;
@@ -1655,7 +1654,6 @@ vdev_metaslab_fini(vdev_t *vd)
 		}
 	}
 	ASSERT0(vd->vdev_ms_count);
-	ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
 }
 
 typedef struct vdev_probe_stats {

module/zfs/zil.c

@@ -761,15 +761,13 @@ zil_lwb_vdev_compare(const void *x1, const void *x2)
 }
 
 static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg,
-    boolean_t fastwrite)
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
 {
 	lwb_t *lwb;
 
 	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 	lwb->lwb_zilog = zilog;
 	lwb->lwb_blk = *bp;
-	lwb->lwb_fastwrite = fastwrite;
 	lwb->lwb_slog = slog;
 	lwb->lwb_indirect = B_FALSE;
 	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
@@ -916,7 +914,6 @@ zil_create(zilog_t *zilog)
 	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
 	int error = 0;
-	boolean_t fastwrite = FALSE;
 	boolean_t slog = FALSE;
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
@@ -949,8 +946,6 @@ zil_create(zilog_t *zilog)
 		error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
 		    ZIL_MIN_BLKSZ, &slog);
-		fastwrite = TRUE;
-
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
 	}
@@ -959,7 +954,7 @@ zil_create(zilog_t *zilog)
 	 * Allocate a log write block (lwb) for the first log block.
 	 */
 	if (error == 0)
-		lwb = zil_alloc_lwb(zilog, &blk, slog, txg, fastwrite);
+		lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
@@ -1044,9 +1039,6 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
 		ASSERT(zh->zh_claim_txg == 0);
 		VERIFY(!keep_first);
 		while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) {
-			if (lwb->lwb_fastwrite)
-				metaslab_fastwrite_unmark(zilog->zl_spa,
-				    &lwb->lwb_blk);
 			if (lwb->lwb_buf != NULL)
 				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 			zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
@@ -1551,7 +1543,6 @@ zil_lwb_write_done(zio_t *zio)
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
 	lwb->lwb_state = LWB_STATE_WRITE_DONE;
 	lwb->lwb_write_zio = NULL;
-	lwb->lwb_fastwrite = FALSE;
 	nlwb = list_next(&zilog->zl_lwb_list, lwb);
 	mutex_exit(&zilog->zl_lock);
@@ -1718,20 +1709,12 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
 		    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 		    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
-		/* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */
-		mutex_enter(&zilog->zl_lock);
-		if (!lwb->lwb_fastwrite) {
-			metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
-			lwb->lwb_fastwrite = 1;
-		}
-
 		lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0,
 		    &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
-		    zil_lwb_write_done, lwb, prio,
-		    ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb);
+		    zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL, &zb);
 
+		mutex_enter(&zilog->zl_lock);
 		lwb->lwb_state = LWB_STATE_OPENED;
 
 		zil_lwb_set_zio_dependency(zilog, lwb);
 		zilog->zl_last_lwb_opened = lwb;
 		mutex_exit(&zilog->zl_lock);
@@ -1864,7 +1847,7 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs)
 		/*
 		 * Allocate a new log write block (lwb).
 		 */
-		nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE);
+		nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
 	}
 
 	lwb->lwb_state = LWB_STATE_ISSUED;
@@ -3651,18 +3634,6 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 		BP_ZERO(&zh->zh_log);
 	}
 
-	/*
-	 * Remove fastwrite on any blocks that have been pre-allocated for
-	 * the next commit. This prevents fastwrite counter pollution by
-	 * unused, long-lived LWBs.
-	 */
-	for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) {
-		if (lwb->lwb_fastwrite && !lwb->lwb_write_zio) {
-			metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk);
-			lwb->lwb_fastwrite = 0;
-		}
-	}
-
 	mutex_exit(&zilog->zl_lock);
 }
@@ -3895,9 +3866,6 @@ zil_close(zilog_t *zilog)
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
 
-		if (lwb->lwb_fastwrite)
-			metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk);
-
 		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 		zil_free_lwb(zilog, lwb);
 	}

module/zfs/zio.c

@@ -3024,11 +3024,6 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 	 */
 	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
-	/*
-	 * We didn't allocate this bp, so make sure it doesn't get unmarked.
-	 */
-	pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
-
 	zio_nowait(zio);
 
 	return (pio);
@@ -3616,7 +3611,6 @@ zio_dva_allocate(zio_t *zio)
 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
-	flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
 	if (zio->io_flags & ZIO_FLAG_NODATA)
 		flags |= METASLAB_DONT_THROTTLE;
 	if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
@@ -3776,7 +3770,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 	 * of, so we just hash the objset ID to pick the allocator to get
 	 * some parallelism.
 	 */
-	int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
+	int flags = METASLAB_ZIL;
 	int allocator = (uint_t)cityhash4(0, 0, 0,
 	    os->os_dsl_dataset->ds_object) % spa->spa_alloc_count;
 	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
@@ -4931,12 +4925,6 @@ zio_done(zio_t *zio)
 			zfs_ereport_free_checksum(zcr);
 	}
 
-	if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
-	    !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
-	    !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
-		metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
-	}
-
 	/*
 	 * It is the responsibility of the done callback to ensure that this
 	 * particular zio is no longer discoverable for adoption, and as