zio: add vdev tracing machinery

The idea here is that you can add a flag to a zio, and every vdev that
contributed to the successful completion of that zio will be referenced
on the "trace tree". You can poke around in here from your _done handler
to do any per-vdev followup work.

The actual use case is to track the vdevs that were actually written to,
in order to have a list of vdevs that we should flush. That's why it
looks like the ZIL vdev flush tracker - the only difference is that it
will also list interior and leaf vdevs, not just toplevel vdevs.

Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
This commit is contained in:
Rob Norris 2023-08-29 18:24:00 +10:00 committed by Geoff Amey
parent 609550bc5f
commit efeeeec2a4
2 changed files with 113 additions and 17 deletions

View File

@ -212,6 +212,7 @@ typedef uint64_t zio_flag_t;
#define ZIO_FLAG_NODATA (1ULL << 12)
#define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13)
#define ZIO_FLAG_IO_ALLOCATING (1ULL << 14)
#define ZIO_FLAG_VDEV_TRACE (1ULL << 15)
#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
@ -219,29 +220,29 @@ typedef uint64_t zio_flag_t;
/*
* Flags inherited by vdev children.
*/
#define ZIO_FLAG_IO_RETRY (1ULL << 15) /* must be first for INHERIT */
#define ZIO_FLAG_PROBE (1ULL << 16)
#define ZIO_FLAG_TRYHARD (1ULL << 17)
#define ZIO_FLAG_OPTIONAL (1ULL << 18)
#define ZIO_FLAG_IO_RETRY (1ULL << 16) /* must be first for INHERIT */
#define ZIO_FLAG_PROBE (1ULL << 17)
#define ZIO_FLAG_TRYHARD (1ULL << 18)
#define ZIO_FLAG_OPTIONAL (1ULL << 19)
#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
/*
* Flags not inherited by any children.
*/
#define ZIO_FLAG_DONT_QUEUE (1ULL << 19) /* must be first for INHERIT */
#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 20)
#define ZIO_FLAG_IO_BYPASS (1ULL << 21)
#define ZIO_FLAG_IO_REWRITE (1ULL << 22)
#define ZIO_FLAG_RAW_COMPRESS (1ULL << 23)
#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 24)
#define ZIO_FLAG_GANG_CHILD (1ULL << 25)
#define ZIO_FLAG_DDT_CHILD (1ULL << 26)
#define ZIO_FLAG_GODFATHER (1ULL << 27)
#define ZIO_FLAG_NOPWRITE (1ULL << 28)
#define ZIO_FLAG_REEXECUTED (1ULL << 29)
#define ZIO_FLAG_DELEGATED (1ULL << 30)
#define ZIO_FLAG_FASTWRITE (1ULL << 31)
#define ZIO_FLAG_DONT_QUEUE (1ULL << 20) /* must be first for INHERIT */
#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 21)
#define ZIO_FLAG_IO_BYPASS (1ULL << 22)
#define ZIO_FLAG_IO_REWRITE (1ULL << 23)
#define ZIO_FLAG_RAW_COMPRESS (1ULL << 24)
#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 25)
#define ZIO_FLAG_GANG_CHILD (1ULL << 26)
#define ZIO_FLAG_DDT_CHILD (1ULL << 27)
#define ZIO_FLAG_GODFATHER (1ULL << 28)
#define ZIO_FLAG_NOPWRITE (1ULL << 29)
#define ZIO_FLAG_REEXECUTED (1ULL << 30)
#define ZIO_FLAG_DELEGATED (1ULL << 31)
#define ZIO_FLAG_FASTWRITE (1ULL << 32)
#define ZIO_FLAG_MUSTSUCCEED 0
#define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)
@ -441,6 +442,11 @@ typedef struct zio_link {
list_node_t zl_child_node;
} zio_link_t;
/*
 * One record on a zio's vdev trace tree (io_vdev_trace_tree). Each record
 * identifies a single vdev by its guid; records are allocated from
 * zio_vdev_trace_cache and ordered by zio_vdev_trace_compare().
 */
typedef struct zio_vdev_trace {
uint64_t zvt_guid; /* vdev_guid of the traced vdev; the tree's sort key */
avl_node_t zvt_node; /* AVL linkage into io_vdev_trace_tree */
} zio_vdev_trace_t;
struct zio {
/* Core information about this I/O */
zbookmark_phys_t io_bookmark;
@ -511,6 +517,7 @@ struct zio {
uint64_t io_child_count;
uint64_t io_phys_children;
uint64_t io_parent_count;
avl_tree_t io_vdev_trace_tree;
uint64_t *io_stall;
zio_t *io_gang_leader;
zio_gang_node_t *io_gang_tree;

View File

@ -75,6 +75,7 @@ int zio_deadman_log_all = B_FALSE;
*/
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_vdev_trace_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
@ -150,6 +151,8 @@ zio_init(void)
sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
zio_link_cache = kmem_cache_create("zio_link_cache",
sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
zio_vdev_trace_cache = kmem_cache_create("zio_vdev_trace_cache",
sizeof (zio_vdev_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
/*
* For small buffers, we want a cache for each multiple of
@ -294,6 +297,7 @@ zio_fini(void)
VERIFY3P(zio_data_buf_cache[i], ==, NULL);
}
kmem_cache_destroy(zio_vdev_trace_cache);
kmem_cache_destroy(zio_link_cache);
kmem_cache_destroy(zio_cache);
@ -716,6 +720,69 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
pio->io_reexecute |= zio->io_reexecute;
ASSERT3U(*countp, >, 0);
/*
* If all of the following are true:
*
* - the parent has requested vdev tracing
* - the child has just completed
* - the child was for a real vdev
* - the child is "interesting" for tracing purposes (see below)
*
* then we can stash some information about the vdev on the trace tree.
*
* "Interesting" means a vdev whose response was a direct contributor
* to the success of the overall zio; that is, we only consider zios
* that succeeded, and weren't something that was allowed or expected
* to fail (eg an aggregation padding write).
*
* This is important for the initial use case (knowing which vdevs were
* written to but not flushed), and is arguably correct for all cases
* (a vdev that returned an error, by definition, did not participate
* in completing the zio). It's necessary in practice because an
* error from a leaf does not necessarily mean its parent will error
* out too (eg raidz can sometimes compensate for failed writes). If
* some future case requires more complex filtering we can look at
* stashing more info into zio_vdev_trace_t.
*
*/
if (pio->io_flags & ZIO_FLAG_VDEV_TRACE &&
wait == ZIO_WAIT_DONE && zio->io_vd != NULL &&
((zio->io_flags & (ZIO_FLAG_OPTIONAL | ZIO_FLAG_IO_REPAIR)) == 0))
{
avl_tree_t *t = &pio->io_vdev_trace_tree;
/*
 * zvt_search is a stack-allocated lookup key; only zvt_guid is
 * filled in, which is the only field the tree comparator reads.
 */
zio_vdev_trace_t *zvt, zvt_search;
avl_index_t where;
/* Only a child that completed without error records its own vdev. */
if (zio->io_error == 0) {
zvt_search.zvt_guid = zio->io_vd->vdev_guid;
/*
 * Add the guid only if it is not already on the parent's
 * tree. On a miss, avl_find() fills in 'where', which is
 * then handed to avl_insert() as the insertion point.
 */
if (avl_find(t, &zvt_search, &where) == NULL) {
zvt = kmem_cache_alloc(zio_vdev_trace_cache,
KM_SLEEP);
zvt->zvt_guid = zio->io_vd->vdev_guid;
avl_insert(t, zvt, where);
}
}
/*
* If the child has itself collected trace records, copy them
* to ours. Note that we can't steal them, as there may be
* multiple parents.
*/
if (zio->io_flags & ZIO_FLAG_VDEV_TRACE) {
avl_tree_t *ct = &zio->io_vdev_trace_tree;
zio_vdev_trace_t *czvt;
/*
 * Walk the child's tree and duplicate each record not
 * already present on the parent's tree. Note that this copy
 * is not gated on the child's io_error.
 */
for (czvt = avl_first(ct); czvt != NULL;
czvt = AVL_NEXT(ct, czvt)) {
if (avl_find(t, czvt, &where) == NULL) {
zvt = kmem_cache_alloc(
zio_vdev_trace_cache, KM_SLEEP);
zvt->zvt_guid = czvt->zvt_guid;
avl_insert(t, zvt, where);
}
}
}
}
(*countp)--;
if (*countp == 0 && pio->io_stall == countp) {
@ -797,6 +864,15 @@ zio_bookmark_compare(const void *x1, const void *x2)
return (0);
}
static int
zio_vdev_trace_compare(const void *x1, const void *x2)
{
const uint64_t v1 = ((zio_vdev_trace_t *)x1)->zvt_guid;
const uint64_t v2 = ((zio_vdev_trace_t *)x2)->zvt_guid;
return (TREE_CMP(v1, v2));
}
/*
* ==========================================================================
* Create the various types of I/O (read, write, free, etc)
@ -834,6 +910,11 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
offsetof(zio_link_t, zl_child_node));
metaslab_trace_init(&zio->io_alloc_list);
/*
 * The trace tree is only initialised when the caller asked for vdev
 * tracing. Records are zio_vdev_trace_t, linked through zvt_node and
 * ordered by zio_vdev_trace_compare().
 */
if (flags & ZIO_FLAG_VDEV_TRACE)
avl_create(&zio->io_vdev_trace_tree, zio_vdev_trace_compare,
sizeof (zio_vdev_trace_t),
offsetof(zio_vdev_trace_t, zvt_node));
if (vd != NULL)
zio->io_child_type = ZIO_CHILD_VDEV;
else if (flags & ZIO_FLAG_GANG_CHILD)
@ -895,6 +976,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
static void
zio_destroy(zio_t *zio)
{
/*
 * Tear down the vdev trace tree if this zio has the trace flag set:
 * free every record back to zio_vdev_trace_cache, then destroy the
 * tree itself.
 */
if (zio->io_flags & ZIO_FLAG_VDEV_TRACE) {
avl_tree_t *t = &zio->io_vdev_trace_tree;
zio_vdev_trace_t *zvt;
/* Iteration state for avl_destroy_nodes(); must start out NULL. */
void *cookie = NULL;
/* Each call hands back one node to free, until NULL is returned. */
while ((zvt = avl_destroy_nodes(t, &cookie)) != NULL)
kmem_cache_free(zio_vdev_trace_cache, zvt);
avl_destroy(t);
}
metaslab_trace_fini(&zio->io_alloc_list);
list_destroy(&zio->io_parent_list);
list_destroy(&zio->io_child_list);