zio: add vdev tracing machinery
The idea here is that you can add a flag to a zio, and every vdev that contributed to the successful completion of that zio will be referenced on the "trace tree". You can poke around in here from your _done handler to do any per-vdev follow-up work. The actual use case is to track the vdevs that were actually written to, in order to have a list of vdevs that we should flush. That's why it looks like the ZIL vdev flush tracker - the only difference is that it will also list interior and leaf vdevs, not just top-level vdevs. Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
This commit is contained in:
parent
609550bc5f
commit
efeeeec2a4
|
@ -212,6 +212,7 @@ typedef uint64_t zio_flag_t;
|
|||
#define ZIO_FLAG_NODATA (1ULL << 12)
|
||||
#define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13)
|
||||
#define ZIO_FLAG_IO_ALLOCATING (1ULL << 14)
|
||||
#define ZIO_FLAG_VDEV_TRACE (1ULL << 15)
|
||||
|
||||
#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
|
||||
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
|
||||
|
@ -219,29 +220,29 @@ typedef uint64_t zio_flag_t;
|
|||
/*
|
||||
* Flags inherited by vdev children.
|
||||
*/
|
||||
#define ZIO_FLAG_IO_RETRY (1ULL << 15) /* must be first for INHERIT */
|
||||
#define ZIO_FLAG_PROBE (1ULL << 16)
|
||||
#define ZIO_FLAG_TRYHARD (1ULL << 17)
|
||||
#define ZIO_FLAG_OPTIONAL (1ULL << 18)
|
||||
#define ZIO_FLAG_IO_RETRY (1ULL << 16) /* must be first for INHERIT */
|
||||
#define ZIO_FLAG_PROBE (1ULL << 17)
|
||||
#define ZIO_FLAG_TRYHARD (1ULL << 18)
|
||||
#define ZIO_FLAG_OPTIONAL (1ULL << 19)
|
||||
|
||||
#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
|
||||
|
||||
/*
|
||||
* Flags not inherited by any children.
|
||||
*/
|
||||
#define ZIO_FLAG_DONT_QUEUE (1ULL << 19) /* must be first for INHERIT */
|
||||
#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 20)
|
||||
#define ZIO_FLAG_IO_BYPASS (1ULL << 21)
|
||||
#define ZIO_FLAG_IO_REWRITE (1ULL << 22)
|
||||
#define ZIO_FLAG_RAW_COMPRESS (1ULL << 23)
|
||||
#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 24)
|
||||
#define ZIO_FLAG_GANG_CHILD (1ULL << 25)
|
||||
#define ZIO_FLAG_DDT_CHILD (1ULL << 26)
|
||||
#define ZIO_FLAG_GODFATHER (1ULL << 27)
|
||||
#define ZIO_FLAG_NOPWRITE (1ULL << 28)
|
||||
#define ZIO_FLAG_REEXECUTED (1ULL << 29)
|
||||
#define ZIO_FLAG_DELEGATED (1ULL << 30)
|
||||
#define ZIO_FLAG_FASTWRITE (1ULL << 31)
|
||||
#define ZIO_FLAG_DONT_QUEUE (1ULL << 20) /* must be first for INHERIT */
|
||||
#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 21)
|
||||
#define ZIO_FLAG_IO_BYPASS (1ULL << 22)
|
||||
#define ZIO_FLAG_IO_REWRITE (1ULL << 23)
|
||||
#define ZIO_FLAG_RAW_COMPRESS (1ULL << 24)
|
||||
#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 25)
|
||||
#define ZIO_FLAG_GANG_CHILD (1ULL << 26)
|
||||
#define ZIO_FLAG_DDT_CHILD (1ULL << 27)
|
||||
#define ZIO_FLAG_GODFATHER (1ULL << 28)
|
||||
#define ZIO_FLAG_NOPWRITE (1ULL << 29)
|
||||
#define ZIO_FLAG_REEXECUTED (1ULL << 30)
|
||||
#define ZIO_FLAG_DELEGATED (1ULL << 31)
|
||||
#define ZIO_FLAG_FASTWRITE (1ULL << 32)
|
||||
|
||||
#define ZIO_FLAG_MUSTSUCCEED 0
|
||||
#define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)
|
||||
|
@ -441,6 +442,11 @@ typedef struct zio_link {
|
|||
list_node_t zl_child_node;
|
||||
} zio_link_t;
|
||||
|
||||
/*
 * One record on a zio's vdev trace tree (io_vdev_trace_tree).
 *
 * zvt_guid holds the guid of a traced vdev and is the key the tree is
 * sorted on; zvt_node is the AVL linkage used to place the record on
 * that tree. Records are allocated from zio_vdev_trace_cache.
 */
typedef struct zio_vdev_trace {
|
||||
uint64_t zvt_guid;
|
||||
avl_node_t zvt_node;
|
||||
} zio_vdev_trace_t;
|
||||
|
||||
struct zio {
|
||||
/* Core information about this I/O */
|
||||
zbookmark_phys_t io_bookmark;
|
||||
|
@ -511,6 +517,7 @@ struct zio {
|
|||
uint64_t io_child_count;
|
||||
uint64_t io_phys_children;
|
||||
uint64_t io_parent_count;
|
||||
avl_tree_t io_vdev_trace_tree;
|
||||
uint64_t *io_stall;
|
||||
zio_t *io_gang_leader;
|
||||
zio_gang_node_t *io_gang_tree;
|
||||
|
|
|
@ -75,6 +75,7 @@ int zio_deadman_log_all = B_FALSE;
|
|||
*/
|
||||
kmem_cache_t *zio_cache;
|
||||
kmem_cache_t *zio_link_cache;
|
||||
kmem_cache_t *zio_vdev_trace_cache;
|
||||
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
||||
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
||||
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
|
||||
|
@ -150,6 +151,8 @@ zio_init(void)
|
|||
sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
||||
zio_link_cache = kmem_cache_create("zio_link_cache",
|
||||
sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
||||
zio_vdev_trace_cache = kmem_cache_create("zio_vdev_trace_cache",
|
||||
sizeof (zio_vdev_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
||||
|
||||
/*
|
||||
* For small buffers, we want a cache for each multiple of
|
||||
|
@ -294,6 +297,7 @@ zio_fini(void)
|
|||
VERIFY3P(zio_data_buf_cache[i], ==, NULL);
|
||||
}
|
||||
|
||||
kmem_cache_destroy(zio_vdev_trace_cache);
|
||||
kmem_cache_destroy(zio_link_cache);
|
||||
kmem_cache_destroy(zio_cache);
|
||||
|
||||
|
@ -716,6 +720,69 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
|
|||
pio->io_reexecute |= zio->io_reexecute;
|
||||
ASSERT3U(*countp, >, 0);
|
||||
|
||||
/*
|
||||
* If all of the following are true:
|
||||
*
|
||||
* - the parent has requested vdev tracing
|
||||
* - the child has just completed
|
||||
* - the child was for a real vdev
|
||||
* - the child is "interesting" for tracing purposes (see below)
|
||||
*
|
||||
* then we can stash some information about the vdev on the trace tree.
|
||||
*
|
||||
* "Interesting" means a vdev whose response was a direct contributor
|
||||
* to the success of the overall zio; that is, we only consider zios
|
||||
* that succeeded, and weren't something that was allowed or expected
|
||||
* to fail (eg an aggregation padding write).
|
||||
*
|
||||
* This is important for the initial use case (knowing which vdevs were
|
||||
* written to but not flushed), and is arguably correct for all cases
|
||||
* (a vdev that returned an error, by definition, did not participate
|
||||
* in completing the zio). It's necessary in practice because an
|
||||
* error from a leaf does not necessarily mean its parent will error
|
||||
* out too (eg raidz can sometimes compensate for failed writes). If
|
||||
* some future case requires more complex filtering we can look at
|
||||
* stashing more info into zio_vdev_trace_t.
|
||||
*
|
||||
*/
|
||||
if (pio->io_flags & ZIO_FLAG_VDEV_TRACE &&
|
||||
wait == ZIO_WAIT_DONE && zio->io_vd != NULL &&
|
||||
((zio->io_flags & (ZIO_FLAG_OPTIONAL | ZIO_FLAG_IO_REPAIR)) == 0))
|
||||
{
|
||||
avl_tree_t *t = &pio->io_vdev_trace_tree;
|
||||
zio_vdev_trace_t *zvt, zvt_search;
|
||||
avl_index_t where;
|
||||
|
||||
if (zio->io_error == 0) {
|
||||
zvt_search.zvt_guid = zio->io_vd->vdev_guid;
|
||||
if (avl_find(t, &zvt_search, &where) == NULL) {
|
||||
zvt = kmem_cache_alloc(zio_vdev_trace_cache,
|
||||
KM_SLEEP);
|
||||
zvt->zvt_guid = zio->io_vd->vdev_guid;
|
||||
avl_insert(t, zvt, where);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If the child has itself collected trace records, copy them
|
||||
* to ours. Note that we can't steal them, as there may be
|
||||
* multiple parents.
|
||||
*/
|
||||
if (zio->io_flags & ZIO_FLAG_VDEV_TRACE) {
|
||||
avl_tree_t *ct = &zio->io_vdev_trace_tree;
|
||||
zio_vdev_trace_t *czvt;
|
||||
for (czvt = avl_first(ct); czvt != NULL;
|
||||
czvt = AVL_NEXT(ct, czvt)) {
|
||||
if (avl_find(t, czvt, &where) == NULL) {
|
||||
zvt = kmem_cache_alloc(
|
||||
zio_vdev_trace_cache, KM_SLEEP);
|
||||
zvt->zvt_guid = czvt->zvt_guid;
|
||||
avl_insert(t, zvt, where);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(*countp)--;
|
||||
|
||||
if (*countp == 0 && pio->io_stall == countp) {
|
||||
|
@ -797,6 +864,15 @@ zio_bookmark_compare(const void *x1, const void *x2)
|
|||
return (0);
|
||||
}
|
||||
|
||||
/*
 * AVL comparator for a zio's vdev trace tree.
 *
 * Both arguments are treated as zio_vdev_trace_t records and are ordered
 * solely by their zvt_guid field; the result is whatever TREE_CMP yields
 * for the two guids. Because only zvt_guid is read, a stack-allocated
 * search key with just that field filled in is sufficient for lookups.
 */
static int
|
||||
zio_vdev_trace_compare(const void *x1, const void *x2)
|
||||
{
|
||||
const uint64_t v1 = ((zio_vdev_trace_t *)x1)->zvt_guid;
|
||||
const uint64_t v2 = ((zio_vdev_trace_t *)x2)->zvt_guid;
|
||||
|
||||
return (TREE_CMP(v1, v2));
|
||||
}
|
||||
|
||||
/*
|
||||
* ==========================================================================
|
||||
* Create the various types of I/O (read, write, free, etc)
|
||||
|
@ -834,6 +910,11 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
|||
offsetof(zio_link_t, zl_child_node));
|
||||
metaslab_trace_init(&zio->io_alloc_list);
|
||||
|
||||
if (flags & ZIO_FLAG_VDEV_TRACE)
|
||||
avl_create(&zio->io_vdev_trace_tree, zio_vdev_trace_compare,
|
||||
sizeof (zio_vdev_trace_t),
|
||||
offsetof(zio_vdev_trace_t, zvt_node));
|
||||
|
||||
if (vd != NULL)
|
||||
zio->io_child_type = ZIO_CHILD_VDEV;
|
||||
else if (flags & ZIO_FLAG_GANG_CHILD)
|
||||
|
@ -895,6 +976,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
|||
static void
|
||||
zio_destroy(zio_t *zio)
|
||||
{
|
||||
if (zio->io_flags & ZIO_FLAG_VDEV_TRACE) {
|
||||
avl_tree_t *t = &zio->io_vdev_trace_tree;
|
||||
zio_vdev_trace_t *zvt;
|
||||
void *cookie = NULL;
|
||||
while ((zvt = avl_destroy_nodes(t, &cookie)) != NULL)
|
||||
kmem_cache_free(zio_vdev_trace_cache, zvt);
|
||||
avl_destroy(t);
|
||||
}
|
||||
metaslab_trace_fini(&zio->io_alloc_list);
|
||||
list_destroy(&zio->io_parent_list);
|
||||
list_destroy(&zio->io_child_list);
|
||||
|
|
Loading…
Reference in New Issue