This commit is contained in:
Rob Norris 2024-09-10 16:34:06 -04:00 committed by GitHub
commit 08c22f1ba2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 686 additions and 170 deletions

View File

@ -604,7 +604,7 @@ extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
extern int zil_suspend(const char *osname, void **cookiep);
extern void zil_resume(void *cookie);
extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp);
extern void zil_lwb_add_flush(struct lwb *lwb, zio_t *zio);
extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg);
extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);

View File

@ -172,15 +172,6 @@ typedef struct itx_async_node {
avl_node_t ia_node; /* AVL tree linkage */
} itx_async_node_t;
/*
* Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
* we've touched so we know which ones need a write cache flush at the end.
*/
typedef struct zil_vdev_node {
uint64_t zv_vdev; /* vdev to be flushed */
avl_node_t zv_node; /* AVL tree linkage */
} zil_vdev_node_t;
#define ZIL_BURSTS 8
/*

View File

@ -183,16 +183,17 @@ typedef uint64_t zio_flag_t;
#define ZIO_FLAG_SCRUB (1ULL << 4)
#define ZIO_FLAG_SCAN_THREAD (1ULL << 5)
#define ZIO_FLAG_PHYSICAL (1ULL << 6)
#define ZIO_FLAG_VDEV_TRACE (1ULL << 7)
#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)
/*
* Flags inherited by ddt, gang, and vdev children.
*/
#define ZIO_FLAG_CANFAIL (1ULL << 7) /* must be first for INHERIT */
#define ZIO_FLAG_SPECULATIVE (1ULL << 8)
#define ZIO_FLAG_CONFIG_WRITER (1ULL << 9)
#define ZIO_FLAG_DONT_RETRY (1ULL << 10)
#define ZIO_FLAG_CANFAIL (1ULL << 8) /* must be first for INHERIT */
#define ZIO_FLAG_SPECULATIVE (1ULL << 9)
#define ZIO_FLAG_CONFIG_WRITER (1ULL << 10)
#define ZIO_FLAG_DONT_RETRY (1ULL << 11)
#define ZIO_FLAG_NODATA (1ULL << 12)
#define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13)
#define ZIO_FLAG_IO_ALLOCATING (1ULL << 14)
@ -448,6 +449,11 @@ enum zio_qstate {
ZIO_QS_ACTIVE,
};
typedef struct zio_vdev_trace {
uint64_t zvt_guid;
avl_node_t zvt_node;
} zio_vdev_trace_t;
struct zio {
/* Core information about this I/O */
zbookmark_phys_t io_bookmark;
@ -516,6 +522,7 @@ struct zio {
int io_error;
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
avl_tree_t io_vdev_trace_tree;
uint64_t *io_stall;
zio_t *io_gang_leader;
zio_gang_node_t *io_gang_tree;
@ -598,7 +605,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg,
blkptr_t *new_bp, uint64_t size, boolean_t *slog);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern void zio_flush(zio_t *zio, vdev_t *vd, boolean_t propagate);
extern void zio_shrink(zio_t *zio, uint64_t size);
extern int zio_wait(zio_t *zio);
@ -639,6 +646,13 @@ extern void zio_vdev_io_bypass(zio_t *zio);
extern void zio_vdev_io_reissue(zio_t *zio);
extern void zio_vdev_io_redone(zio_t *zio);
extern void zio_vdev_trace_init(avl_tree_t *t);
extern void zio_vdev_trace_fini(avl_tree_t *t);
extern void zio_vdev_trace_copy(avl_tree_t *src, avl_tree_t *dst);
extern void zio_vdev_trace_move(avl_tree_t *src, avl_tree_t *dst);
extern void zio_vdev_trace_flush(zio_t *zio, avl_tree_t *t);
extern void zio_vdev_trace_empty(avl_tree_t *t);
extern void zio_change_priority(zio_t *pio, zio_priority_t priority);
extern void zio_checksum_verified(zio_t *zio);

View File

@ -1014,21 +1014,6 @@ vdev_geom_io_intr(struct bio *bp)
zio->io_error = SET_ERROR(EIO);
switch (zio->io_error) {
case ENOTSUP:
/*
* If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
* that future attempts will never succeed. In this case
* we set a persistent flag so that we don't bother with
* requests in the future.
*/
switch (bp->bio_cmd) {
case BIO_FLUSH:
vd->vdev_nowritecache = B_TRUE;
break;
case BIO_DELETE:
break;
}
break;
case ENXIO:
if (!vd->vdev_remove_wanted) {
/*

View File

@ -1232,9 +1232,6 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
zio->io_error = -error;
#endif
if (zio->io_error && (zio->io_error == EOPNOTSUPP))
zio->io_vd->vdev_nowritecache = B_TRUE;
bio_put(bio);
ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error)

View File

@ -1806,12 +1806,11 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
zgd_t *zgd = dsa->dsa_zgd;
/*
* Record the vdev(s) backing this blkptr so they can be flushed after
* the writes for the lwb have completed.
* Capture the trace records for this zio so the vdevs can be flushed
* after the writes for the lwb have completed.
*/
if (zio->io_error == 0) {
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
}
if (zio->io_error == 0)
zil_lwb_add_flush(zgd->zgd_lwb, zio);
mutex_enter(&db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
@ -1865,10 +1864,10 @@ dmu_sync_late_arrival_done(zio_t *zio)
if (zio->io_error == 0) {
/*
* Record the vdev(s) backing this blkptr so they can be
* Capture the trace records for this zio so the vdevs can be
* flushed after the writes for the lwb have completed.
*/
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
zil_lwb_add_flush(zgd->zgd_lwb, zio);
if (!BP_IS_HOLE(bp)) {
blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
@ -1955,7 +1954,8 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done,
dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
dsa, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, zb));
return (0);
}
@ -2122,7 +2122,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
&zp, dmu_sync_ready, NULL, dmu_sync_done, dsa,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE,
&zb));
return (0);
}

View File

@ -1830,19 +1830,21 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
for (int v = 0; v < svdcount; v++) {
if (vdev_writeable(svd[v])) {
zio_flush(zio, svd[v]);
zio_flush(zio, svd[v], B_FALSE);
}
}
if (spa->spa_aux_sync_uber) {
spa->spa_aux_sync_uber = B_FALSE;
for (int v = 0; v < spa->spa_spares.sav_count; v++) {
if (vdev_writeable(spa->spa_spares.sav_vdevs[v])) {
zio_flush(zio, spa->spa_spares.sav_vdevs[v]);
zio_flush(zio, spa->spa_spares.sav_vdevs[v],
B_FALSE);
}
}
for (int v = 0; v < spa->spa_l2cache.sav_count; v++) {
if (vdev_writeable(spa->spa_l2cache.sav_vdevs[v])) {
zio_flush(zio, spa->spa_l2cache.sav_vdevs[v]);
zio_flush(zio, spa->spa_l2cache.sav_vdevs[v],
B_FALSE);
}
}
}
@ -2007,13 +2009,13 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
zio = zio_root(spa, NULL, NULL, flags);
for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd))
zio_flush(zio, vd);
zio_flush(zio, vd, B_FALSE);
for (int i = 0; i < 2; i++) {
if (!sav[i]->sav_label_sync)
continue;
for (int v = 0; v < sav[i]->sav_count; v++)
zio_flush(zio, sav[i]->sav_vdevs[v]);
zio_flush(zio, sav[i]->sav_vdevs[v], B_FALSE);
if (l == 1)
sav[i]->sav_label_sync = B_FALSE;
}
@ -2091,7 +2093,7 @@ retry:
for (vdev_t *vd =
txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL;
vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
zio_flush(zio, vd);
zio_flush(zio, vd, B_FALSE);
(void) zio_wait(zio);

View File

@ -4172,7 +4172,7 @@ io_error_exit:
goto io_error_exit;
}
pio = zio_root(spa, NULL, NULL, 0);
zio_flush(pio, raidvd);
zio_flush(pio, raidvd, B_FALSE);
zio_wait(pio);
zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
@ -4231,7 +4231,7 @@ overwrite:
goto io_error_exit;
}
pio = zio_root(spa, NULL, NULL, 0);
zio_flush(pio, raidvd);
zio_flush(pio, raidvd, B_FALSE);
zio_wait(pio);
zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
@ -4339,7 +4339,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa)
}
zio_wait(pio);
pio = zio_root(spa, NULL, NULL, 0);
zio_flush(pio, raidvd);
zio_flush(pio, raidvd, B_FALSE);
zio_wait(pio);
zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "

View File

@ -23,6 +23,7 @@
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright (c) 2018 Datto Inc.
* Copyright (c) 2024, Klara, Inc.
*/
/* Portions Copyright 2010 Robert Milkowski */
@ -797,15 +798,6 @@ zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
}
}
static int
zil_lwb_vdev_compare(const void *x1, const void *x2)
{
const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
return (TREE_CMP(v1, v2));
}
/*
* Allocate a new lwb. We may already have a block pointer for it, in which
* case we get size and version from there. Or we may not yet, in which case
@ -1368,14 +1360,8 @@ zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
}
void
zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
zil_lwb_add_flush(struct lwb *lwb, zio_t *zio)
{
avl_tree_t *t = &lwb->lwb_vdev_tree;
avl_index_t where;
zil_vdev_node_t *zv, zvsearch;
int ndvas = BP_GET_NDVAS(bp);
int i;
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
@ -1383,53 +1369,10 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
return;
mutex_enter(&lwb->lwb_vdev_lock);
for (i = 0; i < ndvas; i++) {
zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
if (avl_find(t, &zvsearch, &where) == NULL) {
zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
zv->zv_vdev = zvsearch.zv_vdev;
avl_insert(t, zv, where);
}
}
zio_vdev_trace_copy(&zio->io_vdev_trace_tree, &lwb->lwb_vdev_tree);
mutex_exit(&lwb->lwb_vdev_lock);
}
static void
zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
{
avl_tree_t *src = &lwb->lwb_vdev_tree;
avl_tree_t *dst = &nlwb->lwb_vdev_tree;
void *cookie = NULL;
zil_vdev_node_t *zv;
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
/*
* While 'lwb' is at a point in its lifetime where lwb_vdev_tree does
* not need the protection of lwb_vdev_lock (it will only be modified
* while holding zilog->zl_lock) as its writes and those of its
* children have all completed. The younger 'nlwb' may be waiting on
* future writes to additional vdevs.
*/
mutex_enter(&nlwb->lwb_vdev_lock);
/*
* Tear down the 'lwb' vdev tree, ensuring that entries which do not
* exist in 'nlwb' are moved to it, freeing any would-be duplicates.
*/
while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) {
avl_index_t where;
if (avl_find(dst, zv, &where) == NULL) {
avl_insert(dst, zv, where);
} else {
kmem_free(zv, sizeof (*zv));
}
}
mutex_exit(&nlwb->lwb_vdev_lock);
}
void
zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
{
@ -1495,12 +1438,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
* includes ZIO errors from either this LWB's write or
* flush, as well as any errors from other dependent LWBs
* (e.g. a root LWB ZIO that might be a child of this LWB).
*
* With that said, it's important to note that LWB flush
* errors are not propagated up to the LWB root ZIO.
* This is incorrect behavior, and results in VDEV flush
* errors not being handled correctly here. See the
* comment above the call to "zio_flush" for details.
*/
zcw->zcw_zio_error = zio->io_error;
@ -1578,9 +1515,6 @@ zil_lwb_write_done(zio_t *zio)
lwb_t *lwb = zio->io_private;
spa_t *spa = zio->io_spa;
zilog_t *zilog = lwb->lwb_zilog;
avl_tree_t *t = &lwb->lwb_vdev_tree;
void *cookie = NULL;
zil_vdev_node_t *zv;
lwb_t *nlwb;
ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
@ -1606,13 +1540,9 @@ zil_lwb_write_done(zio_t *zio)
nlwb = NULL;
mutex_exit(&zilog->zl_lock);
if (avl_numnodes(t) == 0)
return;
/*
* If there was an IO error, we're not going to call zio_flush()
* on these vdevs, so we simply empty the tree and free the
* nodes. We avoid calling zio_flush() since there isn't any
* If there was an IO error, there's no point continuing. We
* avoid calling zio_flush() since there isn't any
* good reason for doing so, after the lwb block failed to be
* written out.
*
@ -1622,47 +1552,78 @@ zil_lwb_write_done(zio_t *zio)
* we expect any error seen here, to have been propagated to
* that function).
*/
if (zio->io_error != 0) {
while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
kmem_free(zv, sizeof (*zv));
if (zio->io_error != 0 || zil_nocacheflush) {
/*
* Trace records may have been passed on to us from a
* previously-deferred lwb. Since we're not going to use them,
* we clean them up now.
*/
zio_vdev_trace_empty(&lwb->lwb_vdev_tree);
return;
}
/*
* If this lwb does not have any threads waiting for it to complete, we
* want to defer issuing the flush command to the vdevs written to by
* "this" lwb, and instead rely on the "next" lwb to handle the flush
* command for those vdevs. Thus, we merge the vdev tree of "this" lwb
* with the vdev tree of the "next" lwb in the list, and assume the
* "next" lwb will handle flushing the vdevs (or deferring the flush(s)
* again).
* This zio was issued with ZIO_FLAG_VDEV_TRACE, and so its
* io_vdev_trace_tree has the set of zio_vdev_trace_t records for vdevs
* that were written to by "this" lwb.
*
* This is a useful performance optimization, especially for workloads
* To complete the commit, we need to issue a flush to those vdevs. If
* this lwb does not have any threads waiting for it to complete, we
* want to defer issuing that flush, and instead rely on the "next" lwb
* to handle the flush command for those vdevs.
*
* (This is a useful performance optimization, especially for workloads
* with lots of async write activity and few sync write and/or fsync
* activity, as it has the potential to coalesce multiple flush
* commands to a vdev into one.
* commands to a vdev into one).
*
* However, if this lwb does have threads waiting for it, we must issue
* the flush now.
*
* Regardless of whether or not we issue the flush now or defer it to
* the "next" lwb, our lwb_vdev_tree may also have entries on it that
* were deferred to us.
*
* So, our job here is to assemble a single tree of vdev trace records
* including all those that were written by this zio, plus any that
* were deferred to us, and either issue the flush now, or pass them on
* to the "next" lwb.
*
* Because the zio owns the entries on its trace tree, and it is no
* longer available to us after this function returns, we make our
* own copies instead of just stealing them.
*/
if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) {
zil_lwb_flush_defer(lwb, nlwb);
ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
/*
* While 'lwb' is at a point in its lifetime where
* lwb_vdev_tree does not need the protection of lwb_vdev_lock
* (it will only be modified while holding zilog->zl_lock) as
* its writes and those of its children have all completed.
* The younger 'nlwb' may be waiting on future writes to
* additional vdevs.
*/
mutex_enter(&nlwb->lwb_vdev_lock);
zio_vdev_trace_copy(&zio->io_vdev_trace_tree,
&nlwb->lwb_vdev_tree);
/*
* Also move any trace records from this lwb to the next.
*/
zio_vdev_trace_move(&lwb->lwb_vdev_tree, &nlwb->lwb_vdev_tree);
mutex_exit(&nlwb->lwb_vdev_lock);
return;
}
while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
if (vd != NULL) {
/*
* The "ZIO_FLAG_DONT_PROPAGATE" is currently
* always used within "zio_flush". This means,
* any errors when flushing the vdev(s), will
* (unfortunately) not be handled correctly,
* since these "zio_flush" errors will not be
* propagated up to "zil_lwb_flush_vdevs_done".
*/
zio_flush(lwb->lwb_root_zio, vd);
}
kmem_free(zv, sizeof (*zv));
}
/*
* Fill out the trace tree with records, then issue the flush,
* delivering errors to zil_lwb_flush_vdevs_done().
*/
zio_vdev_trace_copy(&zio->io_vdev_trace_tree, &lwb->lwb_vdev_tree);
zio_vdev_trace_flush(lwb->lwb_root_zio, &lwb->lwb_vdev_tree);
zio_vdev_trace_empty(&lwb->lwb_vdev_tree);
}
/*
@ -1945,8 +1906,7 @@ next_lwb:
lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0,
&lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done,
lwb, prio, ZIO_FLAG_CANFAIL, &zb);
zil_lwb_add_block(lwb, &lwb->lwb_blk);
lwb, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, &zb);
if (lwb->lwb_slim) {
/* For Slim ZIL only write what is used. */
@ -3787,8 +3747,7 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag)
list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
offsetof(zil_commit_waiter_t, zcw_node));
avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
zio_vdev_trace_init(&lwb->lwb_vdev_tree);
mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
return (0);
}
@ -3799,7 +3758,7 @@ zil_lwb_dest(void *vbuf, void *unused)
(void) unused;
lwb_t *lwb = vbuf;
mutex_destroy(&lwb->lwb_vdev_lock);
avl_destroy(&lwb->lwb_vdev_tree);
zio_vdev_trace_fini(&lwb->lwb_vdev_tree);
list_destroy(&lwb->lwb_waiters);
list_destroy(&lwb->lwb_itxs);
}
@ -4387,7 +4346,6 @@ EXPORT_SYMBOL(zil_sync);
EXPORT_SYMBOL(zil_clean);
EXPORT_SYMBOL(zil_suspend);
EXPORT_SYMBOL(zil_resume);
EXPORT_SYMBOL(zil_lwb_add_block);
EXPORT_SYMBOL(zil_bp_tree_add);
EXPORT_SYMBOL(zil_set_sync);
EXPORT_SYMBOL(zil_set_logbias);

View File

@ -76,6 +76,7 @@ static int zio_deadman_log_all = B_FALSE;
*/
static kmem_cache_t *zio_cache;
static kmem_cache_t *zio_link_cache;
static kmem_cache_t *zio_vdev_trace_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
@ -157,6 +158,8 @@ zio_init(void)
sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
zio_link_cache = kmem_cache_create("zio_link_cache",
sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
zio_vdev_trace_cache = kmem_cache_create("zio_vdev_trace_cache",
sizeof (zio_vdev_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
@ -285,6 +288,7 @@ zio_fini(void)
VERIFY3P(zio_data_buf_cache[i], ==, NULL);
}
kmem_cache_destroy(zio_vdev_trace_cache);
kmem_cache_destroy(zio_link_cache);
kmem_cache_destroy(zio_cache);
@ -803,6 +807,59 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
pio->io_reexecute |= zio->io_reexecute;
ASSERT3U(*countp, >, 0);
/*
* If all of the following are true:
*
* - the parent has requested vdev tracing
* - the child has just completed
* - the child was for a real vdev
* - the child is "interesting" for tracing purposes (see below)
*
* then we can stash some information about the vdev on the trace tree.
*
* "Interesting" means a vdev whose response was a direct contributor
* to the success of the overall zio; that is, we only consider zios
* that succeeded, and weren't something that was allowed or expected
* to fail (eg an aggregation padding write).
*
* This is important for the initial use case (knowing which vdevs were
* written to but not flushed), and is arguably correct for all cases
* (a vdev that returned an error, by definition, did not participate
* in completing the zio). It's necessary in practice because an
* error from a leaf does not necessarily mean its parent will error
* out too (eg raidz can sometimes compensate for failed writes). If
* some future case requires more complex filtering we can look at
* stashing more info into zio_vdev_trace_t.
*
*/
if (pio->io_flags & ZIO_FLAG_VDEV_TRACE &&
wait == ZIO_WAIT_DONE && zio->io_vd != NULL &&
((zio->io_flags & (ZIO_FLAG_OPTIONAL | ZIO_FLAG_IO_REPAIR)) == 0)) {
avl_tree_t *t = &pio->io_vdev_trace_tree;
zio_vdev_trace_t *zvt, zvt_search;
avl_index_t where;
if (zio->io_error == 0) {
zvt_search.zvt_guid = zio->io_vd->vdev_guid;
if (avl_find(t, &zvt_search, &where) == NULL) {
zvt = kmem_cache_alloc(
zio_vdev_trace_cache, KM_SLEEP);
zvt->zvt_guid = zio->io_vd->vdev_guid;
avl_insert(t, zvt, where);
}
}
/*
* If the child has itself collected trace records, copy them
* to ours. Note that we can't steal them, as there may be
* multiple parents.
*/
if (zio->io_flags & ZIO_FLAG_VDEV_TRACE) {
zio_vdev_trace_copy(&zio->io_vdev_trace_tree,
&pio->io_vdev_trace_tree);
}
}
(*countp)--;
if (*countp == 0 && pio->io_stall == countp) {
@ -889,6 +946,92 @@ zio_bookmark_compare(const void *x1, const void *x2)
return (0);
}
static int
zio_vdev_trace_compare(const void *x1, const void *x2)
{
const uint64_t v1 = ((zio_vdev_trace_t *)x1)->zvt_guid;
const uint64_t v2 = ((zio_vdev_trace_t *)x2)->zvt_guid;
return (TREE_CMP(v1, v2));
}
/*
 * Set up an empty tree of zio_vdev_trace_t records, keyed by vdev guid
 * (see zio_vdev_trace_compare()).
 */
void
zio_vdev_trace_init(avl_tree_t *t)
{
	avl_create(t, zio_vdev_trace_compare, sizeof (zio_vdev_trace_t),
	    offsetof(zio_vdev_trace_t, zvt_node));
}
/*
 * Destroy a trace tree. The caller must have emptied it first (eg via
 * zio_vdev_trace_empty() or zio_vdev_trace_move()); no records are freed
 * here.
 */
void
zio_vdev_trace_fini(avl_tree_t *t)
{
	ASSERT(avl_is_empty(t));
	avl_destroy(t);
}
/*
* Copy trace records on src and add them to dst, skipping any that are already
* on dst.
*/
void
zio_vdev_trace_copy(avl_tree_t *src, avl_tree_t *dst)
{
	zio_vdev_trace_t *zvt, *nzvt;
	avl_index_t where;

	/* src is left untouched; dst gains a copy of each guid it lacks. */
	for (zvt = avl_first(src); zvt != NULL; zvt = AVL_NEXT(src, zvt)) {
		if (avl_find(dst, zvt, &where) == NULL) {
			/* Not on dst yet; allocate a copy and insert it. */
			nzvt = kmem_cache_alloc(zio_vdev_trace_cache, KM_SLEEP);
			nzvt->zvt_guid = zvt->zvt_guid;
			avl_insert(dst, nzvt, where);
		}
	}
}
/*
 * Move trace records from src to dst. src will be empty upon return.
 * Unlike zio_vdev_trace_copy(), no new records are allocated: each node is
 * transferred to dst, or freed if dst already has a record for that guid.
 */
void
zio_vdev_trace_move(avl_tree_t *src, avl_tree_t *dst)
{
	zio_vdev_trace_t *zvt;
	avl_index_t where;
	void *cookie;

	cookie = NULL;
	while ((zvt = avl_destroy_nodes(src, &cookie)) != NULL) {
		if (avl_find(dst, zvt, &where) == NULL)
			avl_insert(dst, zvt, where);
		else
			kmem_cache_free(zio_vdev_trace_cache, zvt);
	}

	ASSERT(avl_is_empty(src));
}
/*
 * Issue a flush zio (as a child of pio) to every vdev named in the trace
 * tree t. Flush errors are propagated to pio (B_TRUE to zio_flush()).
 * The tree is not consumed; the caller still owns its records.
 */
void
zio_vdev_trace_flush(zio_t *pio, avl_tree_t *t)
{
	spa_t *spa = pio->io_spa;
	zio_vdev_trace_t *zvt;
	vdev_t *vd;

	for (zvt = avl_first(t); zvt != NULL; zvt = AVL_NEXT(t, zvt)) {
		vd = vdev_lookup_by_guid(spa->spa_root_vdev, zvt->zvt_guid);
		/*
		 * Only flush leaf vdevs that still resolve; NULL presumably
		 * means the vdev has since gone away — TODO confirm.
		 */
		if (vd != NULL && vd->vdev_children == 0)
			zio_flush(pio, vd, B_TRUE);
	}
}
/* Discard all records on the trace tree, returning them to the cache. */
void
zio_vdev_trace_empty(avl_tree_t *t)
{
	zio_vdev_trace_t *zvt;
	void *cookie = NULL;

	while ((zvt = avl_destroy_nodes(t, &cookie)) != NULL)
		kmem_cache_free(zio_vdev_trace_cache, zvt);
	ASSERT(avl_is_empty(t));
}
/*
* ==========================================================================
* Create the various types of I/O (read, write, free, etc)
@ -926,6 +1069,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
offsetof(zio_link_t, zl_child_node));
metaslab_trace_init(&zio->io_alloc_list);
zio_vdev_trace_init(&zio->io_vdev_trace_tree);
if (vd != NULL)
zio->io_child_type = ZIO_CHILD_VDEV;
else if (flags & ZIO_FLAG_GANG_CHILD)
@ -991,6 +1136,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
void
zio_destroy(zio_t *zio)
{
zio_vdev_trace_empty(&zio->io_vdev_trace_tree);
zio_vdev_trace_fini(&zio->io_vdev_trace_tree);
metaslab_trace_fini(&zio->io_alloc_list);
list_destroy(&zio->io_parent_list);
list_destroy(&zio->io_child_list);
@ -1640,10 +1787,10 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
* the flushes complete.
*/
void
zio_flush(zio_t *pio, vdev_t *vd)
zio_flush(zio_t *pio, vdev_t *vd, boolean_t propagate)
{
const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY;
const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
(propagate ? 0 : ZIO_FLAG_DONT_PROPAGATE);
if (vd->vdev_nowritecache)
return;
@ -1654,7 +1801,7 @@ zio_flush(zio_t *pio, vdev_t *vd)
NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE));
} else {
for (uint64_t c = 0; c < vd->vdev_children; c++)
zio_flush(pio, vd->vdev_child[c]);
zio_flush(pio, vd->vdev_child[c], propagate);
}
}
@ -4553,11 +4700,14 @@ zio_vdev_io_assess(zio_t *zio)
/*
* If a cache flush returns ENOTSUP or ENOTTY, we know that no future
* attempts will ever succeed. In this case we set a persistent
* boolean flag so that we don't bother with it in the future.
* boolean flag so that we don't bother with it in the future, and
* then we act like the flush succeeded.
*/
if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
zio->io_type == ZIO_TYPE_FLUSH && vd != NULL)
zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) {
vd->vdev_nowritecache = B_TRUE;
zio->io_error = 0;
}
if (zio->io_error)
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

View File

@ -124,6 +124,10 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
'scrub_after_resilver', 'suspend_resume_single', 'zpool_status_-s']
tags = ['functional', 'fault']
[tests/functional/flush:Linux]
tests = ['zil_flush_error', 'zil_flush_fallback']
tags = ['functional', 'flush']
[tests/functional/features/large_dnode:Linux]
tests = ['large_dnode_002_pos', 'large_dnode_006_pos', 'large_dnode_008_pos']
tags = ['functional', 'features', 'large_dnode']

View File

@ -379,6 +379,7 @@ if os.environ.get('CI') == 'true':
'fault/auto_spare_ashift': ['SKIP', ci_reason],
'fault/auto_spare_shared': ['SKIP', ci_reason],
'fault/suspend_resume_single': ['SKIP', ci_reason],
'flush/zil_flush_error': ['SKIP', ci_reason],
'procfs/pool_state': ['SKIP', ci_reason],
})

View File

@ -462,13 +462,16 @@ function unload_scsi_debug
# Get scsi_debug device name.
# Returns basename of scsi_debug device (for example "sdb").
#
function get_debug_device
# $1 (optional): Return the first $1 number of SCSI debug device names.
function get_debug_device #num
{
typeset num=${1:-1}
for i in {1..10} ; do
val=$(lsscsi | awk '/scsi_debug/ {print $6; exit}' | cut -d/ -f3)
val=$(lsscsi | awk '/scsi_debug/ {print $6}' | cut -d/ -f3 | head -n$num)
# lsscsi can take time to settle
if [ "$val" != "-" ] ; then
if [[ ! "$val" =~ "-" ]] ; then
break
fi
sleep 1

View File

@ -1516,6 +1516,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/features/large_dnode/large_dnode_008_pos.ksh \
functional/features/large_dnode/large_dnode_009_pos.ksh \
functional/features/large_dnode/setup.ksh \
functional/flush/cleanup.ksh \
functional/flush/zil_flush_error.ksh \
functional/flush/zil_flush_fallback.ksh \
functional/flush/setup.ksh \
functional/grow/grow_pool_001_pos.ksh \
functional/grow/grow_replicas_001_pos.ksh \
functional/history/cleanup.ksh \

View File

@ -0,0 +1,28 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2024, Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
default_cleanup

View File

@ -0,0 +1,30 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2024, Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
verify_runnable "global"
log_pass

View File

@ -0,0 +1,259 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2024, Klara, Inc.
#
#
# This tests that if the ZIL write sequence fails, it correctly falls back and
# waits until the transaction has fully committed before returning.
#
# When this test was written, the ZIL has a flaw - it assumes that if its
# writes succeed, then the data is definitely on disk and available for replay
# if the pool fails. It issues a flush immediately after the write, but does
# not check its result. If a disk fails after the data has been accepted into
# the disk cache, but before it can be written to permanent storage, then
# fsync() will have returned success even though the data is not stored in the
# ZIL for replay.
#
# If the main pool then fails before the transaction can be written, then data
# is lost, and fsync() returning success was premature.
#
# To prove this, we create a pool with a separate log device. We inject two
# faults:
#
# - ZIL writes appear to succeed, but never make it disk
# - ZIL flushes fail, and return error
#
# We then remove the main pool device, and do a write+fsync. This goes to the
# ZIL, and appears to succeed. When the txg closes, the write will fail, and
# the pool suspends.
#
# Then, we simulate a reboot by copying the content of the pool devices aside.
# We restore the pool devices, bring it back online, and export it - we don't
# need it anymore, but we have to clean up properly. Then we restore the copied
# content and import the pool, in whatever state it was in when it suspended.
#
# Finally, we check the content of the file we wrote to. If it matches what we
# wrote, then the fsync() was correct, and all is well. If it doesn't match,
# then the flaw is present, and the test fails.
#
# We run the test twice: once without the log device injections, one with. The
# first confirms the expected behaviour of the ZIL - when the pool is imported,
# the log is replayed. The second fails as above. When the flaw is corrected,
# both will succeed, and this overall test succeeds.
#
. $STF_SUITE/include/libtest.shlib
TMPDIR=${TMPDIR:-$TEST_BASE_DIR}
BACKUP_MAIN="$TMPDIR/backup_main"
BACKUP_LOG="$TMPDIR/backup_log"
LOOP_LOG="$TMPDIR/loop_log"
DATA_FILE="$TMPDIR/data_file"
verify_runnable "global"
function cleanup
{
zinject -c all
destroy_pool $TESTPOOL
unload_scsi_debug
rm -f $BACKUP_MAIN $BACKUP_LOG $DATA_FILE
}
log_onexit cleanup
log_assert "verify fsync() waits if the ZIL commit fails"
# create 128K of random data, and take its checksum. we do this up front to
# ensure we don't get messed up by any latency from reading /dev/random or
# checksumming the file on the pool
log_must dd if=/dev/random of=$DATA_FILE bs=128K count=1
typeset sum=$(sha256digest $DATA_FILE)
# create a virtual scsi device with two device nodes. these are backed by the
# same memory. we do this because we need to be able to take the device offline
# properly in order to get the pool to suspend; fault injection on a loop
# device can't do it. once offline, we can use the second node to take a copy
# of its state.
load_scsi_debug 100 1 2 1 '512b'
set -A sd $(get_debug_device 2)
# create a loop device for the log.
truncate -s 100M $LOOP_LOG
typeset ld=$(basename $(losetup -f))
log_must losetup /dev/$ld $LOOP_LOG
# this function runs the entire test sequence. the option decides if faults
# are injected on the slog device, mimicking the trigger situation that causes
# the fsync() bug to occur
function test_fsync
{
typeset -i do_fault_log="$1"
log_note "setting up test"
# create the pool. the main data store is on the scsi device, with the
# log on a loopback. we bias the ZIL towards to the log device to try
# to ensure that fsync() never involves the main device
log_must zpool create -f -O logbias=latency $TESTPOOL ${sd[0]} log $ld
# create the file ahead of time. the ZIL head structure is created on
# first use, and does a full txg wait, which we need to avoid
log_must dd if=/dev/zero of=/$TESTPOOL/data_file \
bs=128k count=1 conv=fsync
log_must zpool sync
# arrange for writes to the log device to appear to succeed, and
# flushes to fail. this simulates a loss of the device between it
# accepting the the write into its cache, but before it can be written
# out
if [[ $do_fault_log != 0 ]] ; then
log_note "injecting log device faults"
log_must zinject -d $ld -e noop -T write $TESTPOOL
log_must zinject -d $ld -e io -T flush $TESTPOOL
fi
# take the main device offline. there is no IO in flight, so ZFS won't
# notice immediately
log_note "taking main pool offline"
log_must eval "echo offline > /sys/block/${sd[0]}/device/state"
# write out some data, then call fsync(). there are three possible
# results:
#
# - if the bug is present, fsync() will return success, and dd will
# succeed "immediately"; before the pool suspends
# - if the bug is fixed, fsync() will block, the pool will suspend, and
# dd will return success after the pool returns to service
# - if something else goes wrong, dd will fail; this may happen before
# or after the pool suspends or returns. this shouldn't happen, and
# should abort the test
#
# we have to put dd in the background, otherwise if it blocks we will
# block with it. what we're interested in is whether or not it succeeds
# before the pool is suspended. if it does, then we expect that after
# the suspended pool is reimported, the data will have been written
log_note "running dd in background to write data and call fsync()"
dd if=$DATA_FILE of=/$TESTPOOL/data_file bs=128k count=1 conv=fsync &
fsync_pid=$!
# wait for the pool to suspend. this should happen within ~5s, when the
# txg sync tries to write the change to the main device
log_note "waiting for pool to suspend"
typeset -i tries=10
until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do
if ((tries-- == 0)); then
log_fail "pool didn't suspend"
fi
sleep 1
done
# the pool is suspended. see if dd is still present; if it is, then
# it's blocked in fsync(), and we have no expectation that the write
# made it to disk. if dd has exited, then its return code will tell
# us whether fsync() returned success, or it failed for some other
# reason
typeset -i fsync_success=0
if kill -0 $fsync_pid ; then
log_note "dd is blocked; fsync() has not returned"
else
log_note "dd has finished, ensuring it was successful"
log_must wait $fsync_pid
fsync_success=1
fi
# pool is suspended. if we online the main device right now, it will
# retry writing the transaction, which will succed, and everything will
# continue as its supposed to. that's the opposite of what we want; we
# want to do an import, as if after reboot, to force the pool to try to
# replay the ZIL, so we can compare the final result against what
# fsync() told us
#
# however, right now the pool is wedged. we need to get it back online
# so we can export it, so we can do the import. so we need to copy the
# entire pool state away. for the scsi device, we can do this through
# the second device node. for the loopback, we can copy it directly
log_note "taking copy of suspended pool"
log_must cp /dev/${sd[1]} $BACKUP_MAIN
log_must cp /dev/$ld $BACKUP_LOG
# bring the entire pool back online, by clearing error injections and
# restoring the main device. this will unblock anything still waiting
# on it, and tidy up all the internals so we can reset it
log_note "bringing pool back online"
if [[ $do_fault_log != 0 ]] ; then
log_must zinject -c all
fi
log_must eval "echo running > /sys/block/${sd[0]}/device/state"
log_must zpool clear $TESTPOOL
# now the pool is back online. if dd was blocked, it should now
# complete successfully. make sure that's true
if [[ $fsync_success == 0 ]] ; then
log_note "ensuring blocked dd has now finished"
log_must wait $fsync_pid
fi
log_note "exporting pool"
# pool now clean, export it
log_must zpool export $TESTPOOL
log_note "reverting pool to suspended state"
# restore the pool to the suspended state, mimicking a reboot
log_must cp $BACKUP_MAIN /dev/${sd[0]}
log_must cp $BACKUP_LOG /dev/$ld
# import the crashed pool
log_must zpool import $TESTPOOL
# if fsync() succeeded before the pool suspended, then the ZIL should
# have replayed properly and the data is now available on the pool
#
# note that we don't check the alternative; fsync() blocking does not
# mean that data _didn't_ make it to disk, just the ZFS never claimed
# that it did. in that case we can't know what _should_ be on disk
# right now, so can't check
if [[ $fsync_success == 1 ]] ; then
log_note "fsync() succeeded earlier; checking data was written correctly"
typeset newsum=$(sha256digest /$TESTPOOL/data_file)
log_must test "$sum" = "$newsum"
fi
log_note "test finished, cleaning up"
log_must zpool destroy -f $TESTPOOL
}
log_note "first run: ZIL succeeds, and repairs the pool at import"
test_fsync 0
log_note "second run: ZIL commit fails, and falls back to txg sync"
test_fsync 1
log_pass "fsync() waits if the ZIL commit fails"

View File

@ -0,0 +1,89 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2024, Klara, Inc.
#
#
# This tests that when a pool is degraded, ZIL writes/flushes are still done
# directly, rather than falling back to a full txg flush.
#
. $STF_SUITE/include/libtest.shlib
verify_runnable "global"
function cleanup {
default_cleanup_noexit
}
log_onexit cleanup
log_assert "ZIL writes go direct to the pool even when the pool is degraded"
function zil_commit_count {
kstat zil | grep ^zil_commit_count | awk '{ print $3 }'
}
function zil_commit_error_count {
kstat zil | grep ^zil_commit_error_count | awk '{ print $3 }'
}
DISK1=${DISKS%% *}
log_must default_mirror_setup_noexit $DISKS
# get the current count of commits vs errors
typeset -i c1=$(zil_commit_count)
typeset -i e1=$(zil_commit_error_count)
# force a single ZIL commit
log_must dd if=/dev/zero of=/$TESTPOOL/file bs=128k count=1 conv=fsync
# get the updated count of commits vs errors
typeset -i c2=$(zil_commit_count)
typeset -i e2=$(zil_commit_error_count)
# degrade the pool
log_must zpool offline -f $TESTPOOL $DISK1
log_must wait_for_degraded $TESTPOOL
# force another ZIL commit
log_must dd if=/dev/zero of=/$TESTPOOL/file bs=128k count=1 conv=fsync
# get counts again
typeset -i c3=$(zil_commit_count)
typeset -i e3=$(zil_commit_error_count)
# repair the pool
log_must zpool online $TESTPOOL $DISK1
# when pool is in good health, a ZIL commit should go direct to the
# pool and not fall back to a txg sync
log_must test $(( $c2 - $c1 )) -eq 1
log_must test $(( $e2 - $e1 )) -eq 0
# when pool is degraded but still writeable, ZIL should still go direct
# to the pull and not fall back
log_must test $(( $c3 - $c2 )) -eq 1
log_must test $(( $e3 - $e2 )) -eq 0
log_pass "ZIL writes go direct to the pool even when the pool is degraded"