zio: add vdev tracing machinery
The idea here is that you can add a flag to a zio, and every vdev that contributed to the successful completion of that zio will be referenced on the "trace tree". You can inspect this tree from your _done handler to do any per-vdev follow-up work. The actual use case is to track the vdevs that were actually written to, in order to have a list of vdevs that we should flush. That's why it looks like the ZIL vdev flush tracker - the only difference is that it will also list interior and leaf vdevs, not just top-level vdevs. Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
This commit is contained in:
parent
609550bc5f
commit
efeeeec2a4
|
@ -212,6 +212,7 @@ typedef uint64_t zio_flag_t;
|
||||||
#define ZIO_FLAG_NODATA (1ULL << 12)
|
#define ZIO_FLAG_NODATA (1ULL << 12)
|
||||||
#define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13)
|
#define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13)
|
||||||
#define ZIO_FLAG_IO_ALLOCATING (1ULL << 14)
|
#define ZIO_FLAG_IO_ALLOCATING (1ULL << 14)
|
||||||
|
#define ZIO_FLAG_VDEV_TRACE (1ULL << 15)
|
||||||
|
|
||||||
#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
|
#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
|
||||||
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
|
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
|
||||||
|
@ -219,29 +220,29 @@ typedef uint64_t zio_flag_t;
|
||||||
/*
|
/*
|
||||||
* Flags inherited by vdev children.
|
* Flags inherited by vdev children.
|
||||||
*/
|
*/
|
||||||
#define ZIO_FLAG_IO_RETRY (1ULL << 15) /* must be first for INHERIT */
|
#define ZIO_FLAG_IO_RETRY (1ULL << 16) /* must be first for INHERIT */
|
||||||
#define ZIO_FLAG_PROBE (1ULL << 16)
|
#define ZIO_FLAG_PROBE (1ULL << 17)
|
||||||
#define ZIO_FLAG_TRYHARD (1ULL << 17)
|
#define ZIO_FLAG_TRYHARD (1ULL << 18)
|
||||||
#define ZIO_FLAG_OPTIONAL (1ULL << 18)
|
#define ZIO_FLAG_OPTIONAL (1ULL << 19)
|
||||||
|
|
||||||
#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
|
#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Flags not inherited by any children.
|
* Flags not inherited by any children.
|
||||||
*/
|
*/
|
||||||
#define ZIO_FLAG_DONT_QUEUE (1ULL << 19) /* must be first for INHERIT */
|
#define ZIO_FLAG_DONT_QUEUE (1ULL << 20) /* must be first for INHERIT */
|
||||||
#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 20)
|
#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 21)
|
||||||
#define ZIO_FLAG_IO_BYPASS (1ULL << 21)
|
#define ZIO_FLAG_IO_BYPASS (1ULL << 22)
|
||||||
#define ZIO_FLAG_IO_REWRITE (1ULL << 22)
|
#define ZIO_FLAG_IO_REWRITE (1ULL << 23)
|
||||||
#define ZIO_FLAG_RAW_COMPRESS (1ULL << 23)
|
#define ZIO_FLAG_RAW_COMPRESS (1ULL << 24)
|
||||||
#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 24)
|
#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 25)
|
||||||
#define ZIO_FLAG_GANG_CHILD (1ULL << 25)
|
#define ZIO_FLAG_GANG_CHILD (1ULL << 26)
|
||||||
#define ZIO_FLAG_DDT_CHILD (1ULL << 26)
|
#define ZIO_FLAG_DDT_CHILD (1ULL << 27)
|
||||||
#define ZIO_FLAG_GODFATHER (1ULL << 27)
|
#define ZIO_FLAG_GODFATHER (1ULL << 28)
|
||||||
#define ZIO_FLAG_NOPWRITE (1ULL << 28)
|
#define ZIO_FLAG_NOPWRITE (1ULL << 29)
|
||||||
#define ZIO_FLAG_REEXECUTED (1ULL << 29)
|
#define ZIO_FLAG_REEXECUTED (1ULL << 30)
|
||||||
#define ZIO_FLAG_DELEGATED (1ULL << 30)
|
#define ZIO_FLAG_DELEGATED (1ULL << 31)
|
||||||
#define ZIO_FLAG_FASTWRITE (1ULL << 31)
|
#define ZIO_FLAG_FASTWRITE (1ULL << 32)
|
||||||
|
|
||||||
#define ZIO_FLAG_MUSTSUCCEED 0
|
#define ZIO_FLAG_MUSTSUCCEED 0
|
||||||
#define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)
|
#define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)
|
||||||
|
@ -441,6 +442,11 @@ typedef struct zio_link {
|
||||||
list_node_t zl_child_node;
|
list_node_t zl_child_node;
|
||||||
} zio_link_t;
|
} zio_link_t;
|
||||||
|
|
||||||
|
/*
 * One record on a zio's vdev trace tree (io_vdev_trace_tree). Each record
 * identifies a single vdev by guid; records are allocated from
 * zio_vdev_trace_cache and ordered by zio_vdev_trace_compare().
 */
typedef struct zio_vdev_trace {
|
||||||
|
/* guid of the traced vdev; this is the key the tree is sorted on */
uint64_t zvt_guid;
|
||||||
|
/* AVL linkage; its offset is handed to avl_create() for the trace tree */
avl_node_t zvt_node;
|
||||||
|
} zio_vdev_trace_t;
|
||||||
|
|
||||||
struct zio {
|
struct zio {
|
||||||
/* Core information about this I/O */
|
/* Core information about this I/O */
|
||||||
zbookmark_phys_t io_bookmark;
|
zbookmark_phys_t io_bookmark;
|
||||||
|
@ -511,6 +517,7 @@ struct zio {
|
||||||
uint64_t io_child_count;
|
uint64_t io_child_count;
|
||||||
uint64_t io_phys_children;
|
uint64_t io_phys_children;
|
||||||
uint64_t io_parent_count;
|
uint64_t io_parent_count;
|
||||||
|
avl_tree_t io_vdev_trace_tree;
|
||||||
uint64_t *io_stall;
|
uint64_t *io_stall;
|
||||||
zio_t *io_gang_leader;
|
zio_t *io_gang_leader;
|
||||||
zio_gang_node_t *io_gang_tree;
|
zio_gang_node_t *io_gang_tree;
|
||||||
|
|
|
@ -75,6 +75,7 @@ int zio_deadman_log_all = B_FALSE;
|
||||||
*/
|
*/
|
||||||
kmem_cache_t *zio_cache;
|
kmem_cache_t *zio_cache;
|
||||||
kmem_cache_t *zio_link_cache;
|
kmem_cache_t *zio_link_cache;
|
||||||
|
kmem_cache_t *zio_vdev_trace_cache;
|
||||||
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
||||||
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
||||||
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
|
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
|
||||||
|
@ -150,6 +151,8 @@ zio_init(void)
|
||||||
sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
||||||
zio_link_cache = kmem_cache_create("zio_link_cache",
|
zio_link_cache = kmem_cache_create("zio_link_cache",
|
||||||
sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
||||||
|
zio_vdev_trace_cache = kmem_cache_create("zio_vdev_trace_cache",
|
||||||
|
sizeof (zio_vdev_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* For small buffers, we want a cache for each multiple of
|
* For small buffers, we want a cache for each multiple of
|
||||||
|
@ -294,6 +297,7 @@ zio_fini(void)
|
||||||
VERIFY3P(zio_data_buf_cache[i], ==, NULL);
|
VERIFY3P(zio_data_buf_cache[i], ==, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kmem_cache_destroy(zio_vdev_trace_cache);
|
||||||
kmem_cache_destroy(zio_link_cache);
|
kmem_cache_destroy(zio_link_cache);
|
||||||
kmem_cache_destroy(zio_cache);
|
kmem_cache_destroy(zio_cache);
|
||||||
|
|
||||||
|
@ -716,6 +720,69 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
|
||||||
pio->io_reexecute |= zio->io_reexecute;
|
pio->io_reexecute |= zio->io_reexecute;
|
||||||
ASSERT3U(*countp, >, 0);
|
ASSERT3U(*countp, >, 0);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If all of the following are true:
|
||||||
|
*
|
||||||
|
* - the parent has requested vdev tracing
|
||||||
|
* - the child has just completed
|
||||||
|
* - the child was for a real vdev
|
||||||
|
* - the child is "interesting" for tracing purposes (see below)
|
||||||
|
*
|
||||||
|
* then we can stash some information about the vdev on the trace tree.
|
||||||
|
*
|
||||||
|
* "Interesting" means a vdev whose response was a direct contributor
|
||||||
|
* to the success of the overall zio; that is, we only consider zios
|
||||||
|
* that succeeded, and weren't something that was allowed or expected
|
||||||
|
* to fail (eg an aggregation padding write).
|
||||||
|
*
|
||||||
|
* This is important for the initial use case (knowing which vdevs were
|
||||||
|
* written to but not flushed), and is arguably correct for all cases
|
||||||
|
* (a vdev that returned an error, by definition, did not participate
|
||||||
|
* in completing the zio). It's necessary in practice because an
|
||||||
|
* error from a leaf does not necessarily mean its parent will error
|
||||||
|
* out too (eg raidz can sometimes compensate for failed writes). If
|
||||||
|
* some future case requires more complex filtering we can look at
|
||||||
|
* stashing more info into zio_vdev_trace_t.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
if (pio->io_flags & ZIO_FLAG_VDEV_TRACE &&
|
||||||
|
wait == ZIO_WAIT_DONE && zio->io_vd != NULL &&
|
||||||
|
((zio->io_flags & (ZIO_FLAG_OPTIONAL | ZIO_FLAG_IO_REPAIR)) == 0))
|
||||||
|
{
|
||||||
|
avl_tree_t *t = &pio->io_vdev_trace_tree;
|
||||||
|
zio_vdev_trace_t *zvt, zvt_search;
|
||||||
|
avl_index_t where;
|
||||||
|
|
||||||
|
if (zio->io_error == 0) {
|
||||||
|
zvt_search.zvt_guid = zio->io_vd->vdev_guid;
|
||||||
|
if (avl_find(t, &zvt_search, &where) == NULL) {
|
||||||
|
zvt = kmem_cache_alloc(zio_vdev_trace_cache,
|
||||||
|
KM_SLEEP);
|
||||||
|
zvt->zvt_guid = zio->io_vd->vdev_guid;
|
||||||
|
avl_insert(t, zvt, where);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the child has itself collected trace records, copy them
|
||||||
|
* to ours. Note that we can't steal them, as there may be
|
||||||
|
* multiple parents.
|
||||||
|
*/
|
||||||
|
if (zio->io_flags & ZIO_FLAG_VDEV_TRACE) {
|
||||||
|
avl_tree_t *ct = &zio->io_vdev_trace_tree;
|
||||||
|
zio_vdev_trace_t *czvt;
|
||||||
|
for (czvt = avl_first(ct); czvt != NULL;
|
||||||
|
czvt = AVL_NEXT(ct, czvt)) {
|
||||||
|
if (avl_find(t, czvt, &where) == NULL) {
|
||||||
|
zvt = kmem_cache_alloc(
|
||||||
|
zio_vdev_trace_cache, KM_SLEEP);
|
||||||
|
zvt->zvt_guid = czvt->zvt_guid;
|
||||||
|
avl_insert(t, zvt, where);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
(*countp)--;
|
(*countp)--;
|
||||||
|
|
||||||
if (*countp == 0 && pio->io_stall == countp) {
|
if (*countp == 0 && pio->io_stall == countp) {
|
||||||
|
@ -797,6 +864,15 @@ zio_bookmark_compare(const void *x1, const void *x2)
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
zio_vdev_trace_compare(const void *x1, const void *x2)
|
||||||
|
{
|
||||||
|
const uint64_t v1 = ((zio_vdev_trace_t *)x1)->zvt_guid;
|
||||||
|
const uint64_t v2 = ((zio_vdev_trace_t *)x2)->zvt_guid;
|
||||||
|
|
||||||
|
return (TREE_CMP(v1, v2));
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* ==========================================================================
|
* ==========================================================================
|
||||||
* Create the various types of I/O (read, write, free, etc)
|
* Create the various types of I/O (read, write, free, etc)
|
||||||
|
@ -834,6 +910,11 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
||||||
offsetof(zio_link_t, zl_child_node));
|
offsetof(zio_link_t, zl_child_node));
|
||||||
metaslab_trace_init(&zio->io_alloc_list);
|
metaslab_trace_init(&zio->io_alloc_list);
|
||||||
|
|
||||||
|
if (flags & ZIO_FLAG_VDEV_TRACE)
|
||||||
|
avl_create(&zio->io_vdev_trace_tree, zio_vdev_trace_compare,
|
||||||
|
sizeof (zio_vdev_trace_t),
|
||||||
|
offsetof(zio_vdev_trace_t, zvt_node));
|
||||||
|
|
||||||
if (vd != NULL)
|
if (vd != NULL)
|
||||||
zio->io_child_type = ZIO_CHILD_VDEV;
|
zio->io_child_type = ZIO_CHILD_VDEV;
|
||||||
else if (flags & ZIO_FLAG_GANG_CHILD)
|
else if (flags & ZIO_FLAG_GANG_CHILD)
|
||||||
|
@ -895,6 +976,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
||||||
static void
|
static void
|
||||||
zio_destroy(zio_t *zio)
|
zio_destroy(zio_t *zio)
|
||||||
{
|
{
|
||||||
|
if (zio->io_flags & ZIO_FLAG_VDEV_TRACE) {
|
||||||
|
avl_tree_t *t = &zio->io_vdev_trace_tree;
|
||||||
|
zio_vdev_trace_t *zvt;
|
||||||
|
void *cookie = NULL;
|
||||||
|
while ((zvt = avl_destroy_nodes(t, &cookie)) != NULL)
|
||||||
|
kmem_cache_free(zio_vdev_trace_cache, zvt);
|
||||||
|
avl_destroy(t);
|
||||||
|
}
|
||||||
metaslab_trace_fini(&zio->io_alloc_list);
|
metaslab_trace_fini(&zio->io_alloc_list);
|
||||||
list_destroy(&zio->io_parent_list);
|
list_destroy(&zio->io_parent_list);
|
||||||
list_destroy(&zio->io_child_list);
|
list_destroy(&zio->io_child_list);
|
||||||
|
|
Loading…
Reference in New Issue