From 193a37cb2430960ce759daf12ce5cc804818aba1 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 29 Feb 2016 10:05:23 -0800 Subject: [PATCH] Add -lhHpw options to "zpool iostat" for avg latency, histograms, & queues Update the zfs module to collect statistics on average latencies, queue sizes, and keep an internal histogram of all IO latencies. Along with this, update "zpool iostat" with some new options to print out the stats: -l: Include average IO latencies stats: total_wait disk_wait syncq_wait asyncq_wait scrub read write read write read write read write wait ----- ----- ----- ----- ----- ----- ----- ----- ----- - 41ms - 2ms - 46ms - 4ms - - 5ms - 1ms - 1us - 4ms - - 5ms - 1ms - 1us - 4ms - - - - - - - - - - - 49ms - 2ms - 47ms - - - - - - - - - - - - - 2ms - 1ms - - - 1ms - ----- ----- ----- ----- ----- ----- ----- ----- ----- 1ms 1ms 1ms 413us 16us 25us - 5ms - 1ms 1ms 1ms 413us 16us 25us - 5ms - 2ms 1ms 2ms 412us 26us 25us - 5ms - - 1ms - 413us - 25us - 5ms - - 1ms - 460us - 29us - 5ms - 196us 1ms 196us 370us 7us 23us - 5ms - ----- ----- ----- ----- ----- ----- ----- ----- ----- -w: Print out latency histograms: sdb total disk sync_queue async_queue latency read write read write read write read write scrub ------- ------ ------ ------ ------ ------ ------ ------ ------ ------ 1ns 0 0 0 0 0 0 0 0 0 ... 
33us 0 0 0 0 0 0 0 0 0 66us 0 0 107 2486 2 788 12 12 0 131us 2 797 359 4499 10 558 184 184 6 262us 22 801 264 1563 10 286 287 287 24 524us 87 575 71 52086 15 1063 136 136 92 1ms 152 1190 5 41292 4 1693 252 252 141 2ms 245 2018 0 50007 0 2322 371 371 220 4ms 189 7455 22 162957 0 3912 6726 6726 199 8ms 108 9461 0 102320 0 5775 2526 2526 86 17ms 23 11287 0 37142 0 8043 1813 1813 19 34ms 0 14725 0 24015 0 11732 3071 3071 0 67ms 0 23597 0 7914 0 18113 5025 5025 0 134ms 0 33798 0 254 0 25755 7326 7326 0 268ms 0 51780 0 12 0 41593 10002 10002 0 537ms 0 77808 0 0 0 64255 13120 13120 0 1s 0 105281 0 0 0 83805 20841 20841 0 2s 0 88248 0 0 0 73772 14006 14006 0 4s 0 47266 0 0 0 29783 17176 17176 0 9s 0 10460 0 0 0 4130 6295 6295 0 17s 0 0 0 0 0 0 0 0 0 34s 0 0 0 0 0 0 0 0 0 69s 0 0 0 0 0 0 0 0 0 137s 0 0 0 0 0 0 0 0 0 ------------------------------------------------------------------------------- -h: Help -H: Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. -q: Include current number of entries in sync & async read/write queues, and scrub queue: syncq_read syncq_write asyncq_read asyncq_write scrubq_read pend activ pend activ pend activ pend activ pend activ ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- 0 0 0 0 78 29 0 0 0 0 0 0 0 0 78 29 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - - - - - - - - - - 0 0 0 0 0 0 0 0 0 0 - - - - - - - - - - 0 0 0 0 0 0 0 0 0 0 ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- 0 0 227 394 0 19 0 0 0 0 0 0 227 394 0 19 0 0 0 0 0 0 108 98 0 19 0 0 0 0 0 0 19 98 0 0 0 0 0 0 0 0 78 98 0 0 0 0 0 0 0 0 19 88 0 0 0 0 0 0 ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- -p: Display numbers in parseable (exact) values. Also, update iostat syntax to allow the user to specify specific vdevs to show statistics for. The three options for choosing pools/vdevs are: Display a list of pools: zpool iostat ... [pool ...] Display a list of vdevs from a specific pool: zpool iostat ... 
[pool vdev ...] Display a list of vdevs from any pools: zpool iostat ... [vdev ...] Lastly, allow zpool command "interval" value to be floating point: zpool iostat -v 0.5 Signed-off-by: Tony Hutter Closes #4433 --- cmd/zpool/Makefile.am | 2 +- cmd/zpool/zpool_iter.c | 66 + cmd/zpool/zpool_main.c | 1334 +++++++++++++++-- cmd/zpool/zpool_util.c | 25 + cmd/zpool/zpool_util.h | 6 + include/libzfs.h | 11 + include/sys/fs/zfs.h | 73 + include/sys/vdev.h | 3 +- include/sys/vdev_impl.h | 1 + include/sys/zfs_context.h | 1 + include/sys/zio.h | 3 +- include/sys/zio_priority.h | 3 +- lib/libspl/include/sys/sysmacros.h | 3 + lib/libzfs/libzfs_pool.c | 2 - lib/libzfs/libzfs_util.c | 94 +- lib/libzpool/kernel.c | 44 + lib/libzpool/util.c | 7 +- man/man8/zpool.8 | 209 ++- module/zfs/spa.c | 2 + module/zfs/vdev.c | 151 +- module/zfs/vdev_disk.c | 9 +- module/zfs/vdev_label.c | 107 +- module/zfs/zio.c | 9 +- tests/runfiles/linux.run | 2 +- .../cli_user/zpool_iostat/Makefile.am | 3 +- .../cli_user/zpool_iostat/setup.ksh | 2 +- .../zpool_iostat/zpool_iostat_002_pos.ksh | 7 +- .../zpool_iostat/zpool_iostat_003_neg.ksh | 5 +- .../zpool_iostat/zpool_iostat_004_pos.ksh | 74 + 29 files changed, 2077 insertions(+), 181 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am index c11951b227..b4ff106e1a 100644 --- a/cmd/zpool/Makefile.am +++ b/cmd/zpool/Makefile.am @@ -19,4 +19,4 @@ zpool_LDADD = \ $(top_builddir)/lib/libzpool/libzpool.la \ $(top_builddir)/lib/libzfs/libzfs.la \ $(top_builddir)/lib/libzfs_core/libzfs_core.la \ - $(LIBBLKID) + -lm $(LIBBLKID) diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c index 952d19172c..a18ccf29df 100644 --- a/cmd/zpool/zpool_iter.c +++ b/cmd/zpool/zpool_iter.c @@ -250,3 +250,69 @@ for_each_pool(int argc, char **argv, boolean_t unavail, return (ret); } + +static int +for_each_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, 
pool_vdev_iter_f func, + void *data) +{ + nvlist_t **child; + uint_t c, children; + int ret = 0; + int i; + char *type; + + const char *list[] = { + ZPOOL_CONFIG_SPARES, + ZPOOL_CONFIG_L2CACHE, + ZPOOL_CONFIG_CHILDREN + }; + + for (i = 0; i < ARRAY_SIZE(list); i++) { + if (nvlist_lookup_nvlist_array(nv, list[i], &child, + &children) == 0) { + for (c = 0; c < children; c++) { + uint64_t ishole = 0; + + (void) nvlist_lookup_uint64(child[c], + ZPOOL_CONFIG_IS_HOLE, &ishole); + + if (ishole) + continue; + + ret |= for_each_vdev_cb(zhp, child[c], func, + data); + } + } + } + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) + return (ret); + + /* Don't run our function on root vdevs */ + if (strcmp(type, VDEV_TYPE_ROOT) != 0) { + ret |= func(zhp, nv, data); + } + + return (ret); +} + +/* + * This is the equivalent of for_each_pool() for vdevs. It iterates thorough + * all vdevs in the pool, ignoring root vdevs and holes, calling func() on + * each one. + * + * @zhp: Zpool handle + * @func: Function to call on each vdev + * @data: Custom data to pass to the function + */ +int +for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data) +{ + nvlist_t *config, *nvroot; + + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + } + return (for_each_vdev_cb(zhp, nvroot, func, data)); +} diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 9c7e2a0c4a..6412a8e935 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -51,6 +51,7 @@ #include #include #include +#include #include @@ -144,6 +145,23 @@ typedef enum { } zpool_help_t; +/* + * Flags for stats to display with "zpool iostats" + */ +enum iostat_type { + IOS_DEFAULT = 0, + IOS_LATENCY = 1, + IOS_QUEUES = 2, + IOS_L_HISTO = 3, + IOS_COUNT, /* always last element */ +}; + +/* iostat_type entries as bitmasks */ +#define IOS_DEFAULT_M (1ULL << IOS_DEFAULT) +#define IOS_LATENCY_M (1ULL << 
IOS_LATENCY) +#define IOS_QUEUES_M (1ULL << IOS_QUEUES) +#define IOS_L_HISTO_M (1ULL << IOS_L_HISTO) + typedef struct zpool_command { const char *name; int (*func)(int, char **); @@ -196,7 +214,7 @@ static zpool_command_t command_table[] = { { "set", zpool_do_set, HELP_SET }, }; -#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) +#define NCOMMAND (ARRAY_SIZE(command_table)) static zpool_command_t *current_command; static char history_str[HIS_MAX_RECORD_LEN]; @@ -237,7 +255,8 @@ get_usage(zpool_help_t idx) { "[-R root] [-F [-n]]\n" "\t [newpool]\n")); case HELP_IOSTAT: - return (gettext("\tiostat [-gLPvy] [-T d|u] [pool] ... " + return (gettext("\tiostat [-T d | u] [-ghHLpPvy] [[-lq]|-w]\n" + "\t [[pool ...]|[pool vdev ...]|[vdev ...]] " "[interval [count]]\n")); case HELP_LABELCLEAR: return (gettext("\tlabelclear [-f] \n")); @@ -2481,61 +2500,690 @@ error: } typedef struct iostat_cbdata { - boolean_t cb_verbose; + uint64_t cb_flags; int cb_name_flags; int cb_namewidth; int cb_iteration; + char **cb_vdev_names; /* Only show these vdevs */ + unsigned int cb_vdev_names_count; + boolean_t cb_verbose; + boolean_t cb_literal; + boolean_t cb_scripted; zpool_list_t *cb_list; } iostat_cbdata_t; +/* iostat labels */ +typedef struct name_and_columns { + const char *name; /* Column name */ + unsigned int columns; /* Center name to this number of columns */ +} name_and_columns_t; + +#define IOSTAT_MAX_LABELS 11 /* Max number of labels on one line */ + +static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = +{ + [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, + {NULL}}, + [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, + {"asyncq_wait", 2}, {"scrub"}}, + [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2}, + {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2}, + {NULL}}, + [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, + {"sync_queue", 2}, {"async_queue", 2}, {NULL}}, +}; + +/* 
Shorthand - if "columns" field not set, default to 1 column */ +static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] = +{ + [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"}, + {"write"}, {NULL}}, + [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, + {"write"}, {"read"}, {"write"}, {"wait"}, {NULL}}, + [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, + {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}}, + [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, + {"write"}, {"read"}, {"write"}, {"scrub"}, {NULL}}, +}; + +/* + * Return the number of labels in a null-terminated name_and_columns_t + * array. + * + */ +static unsigned int +label_array_len(const name_and_columns_t *labels) +{ + int i = 0; + + while (labels[i].name) + i++; + + return (i); +} + +/* + * Return a default column width for default/latency/queue columns. This does + * not include histograms, which have their columns autosized. + */ +static unsigned int +default_column_width(iostat_cbdata_t *cb, enum iostat_type type) +{ + unsigned long column_width = 5; /* Normal niceprint */ + static unsigned long widths[] = { + /* + * Choose some sane default column sizes for printing the + * raw numbers. + */ + [IOS_DEFAULT] = 15, /* 1PB capacity */ + [IOS_LATENCY] = 10, /* 1B ns = 10sec */ + [IOS_QUEUES] = 6, /* 1M queue entries */ + }; + + if (cb->cb_literal) + column_width = widths[type]; + + return (column_width); +} + +/* + * Print the column labels, i.e: + * + * capacity operations bandwidth + * alloc free read write read write ... + * + * If force_column_width is set, use it for the column width. If not set, use + * the default column width. 
+ */ +void +print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, + const name_and_columns_t labels[][IOSTAT_MAX_LABELS]) +{ + int i, idx, s; + unsigned int text_start, rw_column_width, spaces_to_end; + uint64_t flags = cb->cb_flags; + uint64_t f; + unsigned int column_width = force_column_width; + + /* For each bit set in flags */ + for (f = flags; f; f &= ~(1ULL << idx)) { + idx = lowbit64(f) - 1; + if (!force_column_width) + column_width = default_column_width(cb, idx); + /* Print our top labels centered over "read write" label. */ + for (i = 0; i < label_array_len(labels[idx]); i++) { + const char *name = labels[idx][i].name; + /* + * We treat labels[][].columns == 0 as shorthand + * for one column. It makes writing out the label + * tables more concise. + */ + unsigned int columns = MAX(1, labels[idx][i].columns); + unsigned int slen = strlen(name); + + rw_column_width = (column_width * columns) + + (2 * (columns - 1)); + + text_start = (int) ((rw_column_width)/columns - + slen/columns); + + printf(" "); /* Two spaces between columns */ + + /* Space from beginning of column to label */ + for (s = 0; s < text_start; s++) + printf(" "); + + printf("%s", name); + + /* Print space after label to end of column */ + spaces_to_end = rw_column_width - text_start - slen; + for (s = 0; s < spaces_to_end; s++) + printf(" "); + + } + } + printf("\n"); +} + +/* + * Utility function to print out a line of dashes like: + * + * -------------------------------- ----- ----- ----- ----- ----- + * + * ...or a dashed named-row line like: + * + * logs - - - - - + * + * @cb: iostat data + * + * @force_column_width If non-zero, use the value as the column width. + * Otherwise use the default column widths. + * + * @name: Print a dashed named-row line starting + * with @name. Otherwise, print a regular + * dashed line. 
+ */ +static void +print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width, + const char *name) +{ + int i; + unsigned int namewidth; + uint64_t flags = cb->cb_flags; + uint64_t f; + int idx; + const name_and_columns_t *labels; + + if (cb->cb_flags & IOS_L_HISTO_M) + namewidth = MAX(cb->cb_namewidth, strlen("latency")); + else + namewidth = cb->cb_namewidth; + + if (name) { + namewidth = MAX(cb->cb_namewidth, strlen(name)); + printf("%-*s", namewidth, name); + } else { + for (i = 0; i < namewidth; i++) + (void) printf("-"); + } + + /* For each bit in flags */ + for (f = flags; f; f &= ~(1ULL << idx)) { + unsigned int column_width; + idx = lowbit64(f) - 1; + if (force_column_width) + column_width = force_column_width; + else + column_width = default_column_width(cb, idx); + + labels = iostat_bottom_labels[idx]; + for (i = 0; i < label_array_len(labels); i++) { + if (name) + printf(" %*s-", column_width - 1, " "); + else + printf(" %.*s", column_width, + "--------------------"); + } + } + printf("\n"); +} + + +static void +print_iostat_separator_impl(iostat_cbdata_t *cb, + unsigned int force_column_width) +{ + print_iostat_dashes(cb, force_column_width, NULL); +} + static void print_iostat_separator(iostat_cbdata_t *cb) { - int i = 0; + print_iostat_separator_impl(cb, 0); +} - for (i = 0; i < cb->cb_namewidth; i++) - (void) printf("-"); - (void) printf(" ----- ----- ----- ----- ----- -----\n"); +static void +print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, + const char *histo_vdev_name) +{ + unsigned int namewidth; + uint64_t flags = cb->cb_flags; + + if (flags & IOS_L_HISTO_M) + namewidth = MAX(cb->cb_namewidth, strlen("latency")); + else + namewidth = cb->cb_namewidth; + + if (flags & IOS_L_HISTO_M) + printf("%-*s", namewidth, histo_vdev_name); + else + printf("%*s", namewidth, ""); + + print_iostat_labels(cb, force_column_width, iostat_top_labels); + + printf("%-*s", namewidth, flags & IOS_L_HISTO_M ? 
"latency" : + cb->cb_vdev_names_count ? "vdev" : "pool"); + + print_iostat_labels(cb, force_column_width, iostat_bottom_labels); + + print_iostat_separator_impl(cb, force_column_width); } static void print_iostat_header(iostat_cbdata_t *cb) { - (void) printf("%*s capacity operations bandwidth\n", - cb->cb_namewidth, ""); - (void) printf("%-*s alloc free read write read write\n", - cb->cb_namewidth, "pool"); - print_iostat_separator(cb); + print_iostat_header_impl(cb, 0, NULL); } + /* * Display a single statistic. */ static void -print_one_stat(uint64_t value) +print_one_stat(uint64_t value, enum zfs_nicenum_format format, + unsigned int column_size, boolean_t scripted) { char buf[64]; - zfs_nicenum(value, buf, sizeof (buf)); - (void) printf(" %5s", buf); + zfs_nicenum_format(value, buf, sizeof (buf), format); + + if (scripted) + printf("\t%s", buf); + else + printf(" %*s", column_size, buf); +} + +/* + * Calculate the default vdev stats + * + * Subtract oldvs from newvs, apply a scaling factor, and save the resulting + * stats into calcvs. + */ +static void +calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs, + vdev_stat_t *calcvs) +{ + int i; + + memcpy(calcvs, newvs, sizeof (*calcvs)); + for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++) + calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]); + + for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++) + calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]); +} + +/* + * Internal representation of the extended iostats data. + * + * The extended iostat stats are exported in nvlists as either uint64_t arrays + * or single uint64_t's. We make both look like arrays to make them easier + * to process. In order to make single uint64_t's look like arrays, we set + * __data to the stat data, and then set *data = &__data with count = 1. Then, + * we can just use *data and count. 
+ */ +struct stat_array { + uint64_t *data; + uint_t count; /* Number of entries in data[] */ + uint64_t __data; /* Only used when data is a single uint64_t */ +}; + +static uint64_t +stat_histo_max(struct stat_array *nva, unsigned int len) { + uint64_t max = 0; + int i; + for (i = 0; i < len; i++) + max = MAX(max, array64_max(nva[i].data, nva[i].count)); + + return (max); +} + +/* + * Helper function to lookup a uint64_t array or uint64_t value and store its + * data as a stat_array. If the nvpair is a single uint64_t value, then we make + * it look like a one element array to make it easier to process. + */ +static int +nvpair64_to_stat_array(nvlist_t *nvl, const char *name, + struct stat_array *nva) { + nvpair_t *tmp; + int ret; + + verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0); + switch (nvpair_type(tmp)) { + case DATA_TYPE_UINT64_ARRAY: + ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count); + break; + case DATA_TYPE_UINT64: + ret = nvpair_value_uint64(tmp, &nva->__data); + nva->data = &nva->__data; + nva->count = 1; + break; + default: + /* Not a uint64_t */ + ret = EINVAL; + break; + } + + return (ret); +} + +/* + * Given a list of nvlist names, look up the extended stats in newnv and oldnv, + * subtract them, and return the results in a newly allocated stat_array. + * You must free the returned array after you are done with it with + * free_calc_stats(). + * + * Additionally, you can set "oldnv" to NULL if you simply want the newnv + * values. 
+ */ +static struct stat_array * +calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv, + nvlist_t *newnv) +{ + nvlist_t *oldnvx = NULL, *newnvx; + struct stat_array *oldnva, *newnva, *calcnva; + int i, j; + unsigned int alloc_size = (sizeof (struct stat_array)) * len; + + /* Extract our extended stats nvlist from the main list */ + verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX, + &newnvx) == 0); + if (oldnv) { + verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX, + &oldnvx) == 0); + } + + newnva = safe_malloc(alloc_size); + oldnva = safe_malloc(alloc_size); + calcnva = safe_malloc(alloc_size); + + for (j = 0; j < len; j++) { + verify(nvpair64_to_stat_array(newnvx, names[j], + &newnva[j]) == 0); + calcnva[j].count = newnva[j].count; + alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]); + calcnva[j].data = safe_malloc(alloc_size); + memcpy(calcnva[j].data, newnva[j].data, alloc_size); + + if (oldnvx) { + verify(nvpair64_to_stat_array(oldnvx, names[j], + &oldnva[j]) == 0); + for (i = 0; i < oldnva[j].count; i++) + calcnva[j].data[i] -= oldnva[j].data[i]; + } + } + free(newnva); + free(oldnva); + return (calcnva); +} + +static void +free_calc_stats(struct stat_array *nva, unsigned int len) +{ + int i; + for (i = 0; i < len; i++) + free(nva[i].data); + + free(nva); +} + +static void +print_iostat_histo(struct stat_array *nva, unsigned int len, + iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth, + double scale) +{ + int i, j; + char buf[6]; + uint64_t val; + enum zfs_nicenum_format format; + unsigned int buckets; + + if (cb->cb_literal) + format = ZFS_NICENUM_RAW; + else + format = ZFS_NICENUM_1024; + + /* All these histos are the same size, so just use nva[0].count */ + buckets = nva[0].count; + + for (j = 0; j < buckets; j++) { + /* Ending range of this bucket */ + val = (1UL << (j + 1)) - 1; + + /* Print histogram bucket label */ + zfs_nicetime(val, buf, sizeof (buf)); + if 
(cb->cb_scripted) + printf("%llu", (u_longlong_t) val); + else + printf("%-*s", namewidth, buf); + + /* Print the values on the line */ + for (i = 0; i < len; i++) { + print_one_stat(nva[i].data[j] * scale, format, + column_width, cb->cb_scripted); + } + printf("\n"); + } +} + +static void +print_solid_separator(unsigned int length) +{ + while (length--) + printf("-"); + printf("\n"); +} + +static void +print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv, + nvlist_t *newnv, double scale, const char *name) +{ + unsigned int column_width; + unsigned int namewidth; + unsigned int entire_width; + + const char *names[] = { + ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + }; + struct stat_array *nva; + nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv); + + if (cb->cb_literal) { + column_width = MAX(5, + (unsigned int) log10(stat_histo_max(nva, + ARRAY_SIZE(names))) + 1); + } else { + column_width = 5; + } + + namewidth = MAX(cb->cb_namewidth, strlen("latency")); + + /* + * Calculate the entire line width of what we're printing. 
The + * +2 is for the two spaces between columns: + */ + /* read write */ + /* ----- ----- */ + /* |___| <---------- column_width */ + /* */ + /* |__________| <--- entire_width */ + /* */ + entire_width = namewidth + (column_width + 2) * + label_array_len(iostat_bottom_labels[IOS_L_HISTO]); + + if (cb->cb_scripted) + printf("%s\n", name); + else + print_iostat_header_impl(cb, column_width, name); + + print_iostat_histo(nva, ARRAY_SIZE(names), cb, column_width, + namewidth, scale); + + free_calc_stats(nva, ARRAY_SIZE(names)); + if (!cb->cb_scripted) + print_solid_separator(entire_width); +} + +/* + * Calculate the average latency of a power-of-two latency histogram + */ +static uint64_t +single_histo_average(uint64_t *histo, unsigned int buckets) +{ + int i; + uint64_t count = 0, total = 0; + + for (i = 0; i < buckets; i++) { + /* + * Our buckets are power-of-two latency ranges. Use the + * midpoint latency of each bucket to calculate the average. + * For example: + * + * Bucket Midpoint + * 8ns-15ns: 12ns + * 16ns-31ns: 24ns + * ... + */ + if (histo[i] != 0) { + total += histo[i] * (((1UL << i) + ((1UL << i)/2))); + count += histo[i]; + } + } + + /* Prevent divide by zero */ + return (count == 0 ? 
0 : total / count); +} + +static void +print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv, + nvlist_t *newnv, double scale) +{ + int i; + uint64_t val; + const char *names[] = { + ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + }; + + struct stat_array *nva; + + unsigned int column_width = default_column_width(cb, IOS_QUEUES); + enum zfs_nicenum_format format; + + nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv); + + if (cb->cb_literal) + format = ZFS_NICENUM_RAW; + else + format = ZFS_NICENUM_1024; + + for (i = 0; i < ARRAY_SIZE(names); i++) { + val = nva[i].data[0] * scale; + print_one_stat(val, format, column_width, cb->cb_scripted); + } + + free_calc_stats(nva, ARRAY_SIZE(names)); +} + +static void +print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv, + nvlist_t *newnv, double scale) +{ + int i; + uint64_t val; + const char *names[] = { + ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + }; + struct stat_array *nva; + + unsigned int column_width = default_column_width(cb, IOS_LATENCY); + enum zfs_nicenum_format format; + + nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv); + + if (cb->cb_literal) + format = ZFS_NICENUM_RAW; + else + format = ZFS_NICENUM_TIME; + + /* Print our avg latencies on the line */ + for (i = 0; i < ARRAY_SIZE(names); i++) { + /* Compute average latency 
for a latency histo */ + val = single_histo_average(nva[i].data, nva[i].count) * scale; + print_one_stat(val, format, column_width, cb->cb_scripted); + } + free_calc_stats(nva, ARRAY_SIZE(names)); +} + +/* + * Print default statistics (capacity/operations/bandwidth) + */ +static void +print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale) +{ + unsigned int column_width = default_column_width(cb, IOS_DEFAULT); + enum zfs_nicenum_format format; + char na; /* char to print for "not applicable" values */ + + if (cb->cb_literal) { + format = ZFS_NICENUM_RAW; + na = '0'; + } else { + format = ZFS_NICENUM_1024; + na = '-'; + } + + /* only toplevel vdevs have capacity stats */ + if (vs->vs_space == 0) { + if (cb->cb_scripted) + printf("\t%c\t%c", na, na); + else + printf(" %*c %*c", column_width, na, column_width, + na); + } else { + print_one_stat(vs->vs_alloc, format, column_width, + cb->cb_scripted); + print_one_stat(vs->vs_space - vs->vs_alloc, format, + column_width, cb->cb_scripted); + } + + print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale), + format, column_width, cb->cb_scripted); + print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale), + format, column_width, cb->cb_scripted); + print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale), + format, column_width, cb->cb_scripted); + print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale), + format, column_width, cb->cb_scripted); } /* * Print out all the statistics for the given vdev. This can either be the * toplevel configuration, or called recursively. If 'name' is NULL, then this * is a verbose output, and we don't want to display the toplevel pool stats. + * + * Returns the number of stat lines printed. 
*/ -void +unsigned int print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, nvlist_t *newnv, iostat_cbdata_t *cb, int depth) { nvlist_t **oldchild, **newchild; uint_t c, children; - vdev_stat_t *oldvs, *newvs; + vdev_stat_t *oldvs, *newvs, *calcvs; vdev_stat_t zerovs = { 0 }; + char *vname; + int i; + int ret = 0; uint64_t tdelta; double scale; - char *vname; + + calcvs = safe_malloc(sizeof (*calcvs)); if (oldnv != NULL) { verify(nvlist_lookup_uint64_array(oldnv, @@ -2544,54 +3192,92 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, oldvs = &zerovs; } + /* Do we only want to see a specific vdev? */ + for (i = 0; i < cb->cb_vdev_names_count; i++) { + /* Yes we do. Is this the vdev? */ + if (strcmp(name, cb->cb_vdev_names[i]) == 0) { + /* + * This is our vdev. Since it is the only vdev we + * will be displaying, make depth = 0 so that it + * doesn't get indented. + */ + depth = 0; + break; + } + } + + if (cb->cb_vdev_names_count && (i == cb->cb_vdev_names_count)) { + /* Couldn't match the name */ + goto children; + } + + verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&newvs, &c) == 0); - if (strlen(name) + depth > cb->cb_namewidth) - (void) printf("%*s%s", depth, "", name); - else - (void) printf("%*s%s%*s", depth, "", name, - (int)(cb->cb_namewidth - strlen(name) - depth), ""); - - tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; - - if (tdelta == 0) - scale = 1.0; - else - scale = (double)NANOSEC / tdelta; - - /* only toplevel vdevs have capacity stats */ - if (newvs->vs_space == 0) { - (void) printf(" - -"); - } else { - print_one_stat(newvs->vs_alloc); - print_one_stat(newvs->vs_space - newvs->vs_alloc); + /* + * Print the vdev name unless it's is a histogram. Histograms + * display the vdev name in the header itself. 
+ */ + if (!(cb->cb_flags & IOS_L_HISTO_M)) { + if (cb->cb_scripted) { + printf("%s", name); + } else { + if (strlen(name) + depth > cb->cb_namewidth) + (void) printf("%*s%s", depth, "", name); + else + (void) printf("%*s%s%*s", depth, "", name, + (int)(cb->cb_namewidth - strlen(name) - + depth), ""); + } } - print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_READ] - - oldvs->vs_ops[ZIO_TYPE_READ]))); + /* Calculate our scaling factor */ + tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; + if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_L_HISTO_M)) { + /* + * If we specify printing histograms with no time interval, then + * print the histogram numbers over the entire lifetime of the + * vdev. + */ + scale = 1; + } else { + if (tdelta == 0) + scale = 1.0; + else + scale = (double)NANOSEC / tdelta; + } - print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_WRITE] - - oldvs->vs_ops[ZIO_TYPE_WRITE]))); + if (cb->cb_flags & IOS_DEFAULT_M) { + calc_default_iostats(oldvs, newvs, calcvs); + print_iostat_default(calcvs, cb, scale); + } + if (cb->cb_flags & IOS_LATENCY_M) + print_iostat_latency(cb, oldnv, newnv, scale); + if (cb->cb_flags & IOS_QUEUES_M) + print_iostat_queues(cb, oldnv, newnv, scale); + if (cb->cb_flags & IOS_L_HISTO_M) { + printf("\n"); + print_iostat_histos(cb, oldnv, newnv, scale, name); + } - print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_READ] - - oldvs->vs_bytes[ZIO_TYPE_READ]))); + if (!(cb->cb_flags & IOS_L_HISTO_M)) + printf("\n"); - print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_WRITE] - - oldvs->vs_bytes[ZIO_TYPE_WRITE]))); - - (void) printf("\n"); + free(calcvs); + ret++; +children: if (!cb->cb_verbose) - return; + return (ret); if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, &newchild, &children) != 0) - return; + return (ret); if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, &oldchild, &c) != 0) - return; + return (ret); for (c = 0; c < children; c++) { uint64_t 
ishole = B_FALSE, islog = B_FALSE; @@ -2607,7 +3293,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); - print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, + ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } @@ -2617,8 +3303,10 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, */ if (num_logs(newnv) > 0) { - (void) printf("%-*s - - - - - " - "-\n", cb->cb_namewidth, "logs"); + if ((!(cb->cb_flags & IOS_L_HISTO_M)) && !cb->cb_scripted && + !cb->cb_vdev_names) { + print_iostat_dashes(cb, 0, "logs"); + } for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; @@ -2628,7 +3316,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, if (islog) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); - print_vdev_stats(zhp, vname, oldnv ? + ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); @@ -2642,23 +3330,28 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, &newchild, &children) != 0) - return; + return (ret); if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, &oldchild, &c) != 0) - return; + return (ret); if (children > 0) { - (void) printf("%-*s - - - - - " - "-\n", cb->cb_namewidth, "cache"); + if ((!(cb->cb_flags & IOS_L_HISTO_M)) && !cb->cb_scripted && + !cb->cb_vdev_names) { + print_iostat_dashes(cb, 0, "cache"); + } + for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); - print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, - newchild[c], cb, depth + 2); + ret += print_vdev_stats(zhp, vname, oldnv ? 
oldchild[c] + : NULL, newchild[c], cb, depth + 2); free(vname); } } + + return (ret); } static int @@ -2688,6 +3381,7 @@ print_iostat(zpool_handle_t *zhp, void *data) iostat_cbdata_t *cb = data; nvlist_t *oldconfig, *newconfig; nvlist_t *oldnvroot, *newnvroot; + int ret; newconfig = zpool_get_config(zhp, &oldconfig); @@ -2703,15 +3397,13 @@ print_iostat(zpool_handle_t *zhp, void *data) verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, &oldnvroot) == 0); - /* - * Print out the statistics for the pool. - */ - print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0); + ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, + cb, 0); + if ((ret != 0) && !(cb->cb_flags & IOS_L_HISTO_M) && !cb->cb_scripted && + cb->cb_verbose && !cb->cb_vdev_names_count) + print_iostat_separator(cb); - if (cb->cb_verbose) - print_iostat_separator(cb); - - return (0); + return (ret); } static int @@ -2742,13 +3434,14 @@ get_namewidth(zpool_handle_t *zhp, void *data) if ((config = zpool_get_config(zhp, NULL)) != NULL) { verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + unsigned int poolname_len = strlen(zpool_get_name(zhp)); if (!cb->cb_verbose) - cb->cb_namewidth = strlen(zpool_get_name(zhp)); + cb->cb_namewidth = poolname_len; else - cb->cb_namewidth = max_width(zhp, nvroot, 0, - cb->cb_namewidth, cb->cb_name_flags); + cb->cb_namewidth = MAX(poolname_len, + max_width(zhp, nvroot, 0, cb->cb_namewidth, + cb->cb_name_flags)); } - /* * The width must be at least 10, but may be as large as the * column width - 42 so that we can still fit in one line. @@ -2767,20 +3460,21 @@ get_namewidth(zpool_handle_t *zhp, void *data) * Parse the input string, get the 'interval' and 'count' value if there is one. 
*/ static void -get_interval_count(int *argcp, char **argv, unsigned long *iv, +get_interval_count(int *argcp, char **argv, float *iv, unsigned long *cnt) { - unsigned long interval = 0, count = 0; + float interval = 0; + unsigned long count = 0; int argc = *argcp; /* * Determine if the last argument is an integer or a pool name */ - if (argc > 0 && isdigit(argv[argc - 1][0])) { + if (argc > 0 && isnumber(argv[argc - 1])) { char *end; errno = 0; - interval = strtoul(argv[argc - 1], &end, 10); + interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { @@ -2806,12 +3500,12 @@ get_interval_count(int *argcp, char **argv, unsigned long *iv, * If the last argument is also an integer, then we have both a count * and an interval. */ - if (argc > 0 && isdigit(argv[argc - 1][0])) { + if (argc > 0 && isnumber(argv[argc - 1])) { char *end; errno = 0; count = interval; - interval = strtoul(argv[argc - 1], &end, 10); + interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { @@ -2846,12 +3540,299 @@ get_timestamp_arg(char c) } /* - * zpool iostat [-gLPv] [-T d|u] [pool] ... [interval [count]] + * Return stat flags that are supported by all pools by both the module and + * zpool iostat. "*data" should be initialized to all 0xFFs before running. + * It will get ANDed down until only the flags that are supported on all pools + * remain. + */ +static int +get_stat_flags_cb(zpool_handle_t *zhp, void *data) +{ + uint64_t *mask = data; + nvlist_t *config, *nvroot, *nvx; + uint64_t flags = 0; + int i, j; + + /* + * Lookup table for extended iostat flags to nvlist names. + * Basically a list of all the nvpairs a flag requires. 
+ */ + static const char *vsx_type_to_nvlist[IOS_COUNT][10] = { + [IOS_L_HISTO] = { + ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + NULL}, + [IOS_LATENCY] = { + ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + NULL}, + [IOS_QUEUES] = { + ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + NULL} + }; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + /* Default stats are always supported, but for completeness.. */ + if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS)) + flags |= IOS_DEFAULT_M; + + /* Get our extended stats nvlist from the main list */ + if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX, + &nvx) != 0) { + /* + * No extended stats; they're probably running an older + * module. No big deal, we support that too. + */ + goto end; + } + + /* For each extended stat, make sure all its nvpairs are supported */ + for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) { + if (!vsx_type_to_nvlist[j][0]) + continue; + + /* Start off by assuming the flag is supported, then check */ + flags |= (1ULL << j); + for (i = 0; vsx_type_to_nvlist[j][i]; i++) { + if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) { + /* flag isn't supported */ + flags = flags & ~(1ULL << j); + break; + } + } + } +end: + *mask = *mask & flags; + return (0); +} + +/* + * Return a bitmask of stats that are supported on all pools by both the module + * and zpool iostat. 
+ */ +static uint64_t +get_stat_flags(zpool_list_t *list) +{ + uint64_t mask = -1; + + /* + * get_stat_flags_cb() will lop off bits from "mask" until only the + * flags that are supported on all pools remain. + */ + pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask); + return (mask); +} + +/* + * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise. + */ +static int +is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data) +{ + iostat_cbdata_t *cb = cb_data; + char *name; + + name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags); + + if (strcmp(name, cb->cb_vdev_names[0]) == 0) + return (1); /* match */ + + return (0); +} + +/* + * Returns 1 if cb_data->cb_vdev_names[0] is a vdev name, 0 otherwise. + */ +static int +is_vdev(zpool_handle_t *zhp, void *cb_data) +{ + return (for_each_vdev(zhp, is_vdev_cb, cb_data)); +} + +/* + * Check if vdevs are in a pool + * + * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise + * return 0. If pool_name is NULL, then search all pools. + */ +static int +are_vdevs_in_pool(int argc, char **argv, char *pool_name, + iostat_cbdata_t *cb) +{ + char **tmp_name; + int ret = 0; + int i; + int pool_count = 0; + + if ((argc == 0) || !*argv) + return (0); + + if (pool_name) + pool_count = 1; + + /* Temporarily hijack cb_vdev_names for a second... */ + tmp_name = cb->cb_vdev_names; + + /* Go though our list of prospective vdev names */ + for (i = 0; i < argc; i++) { + cb->cb_vdev_names = argv + i; + + /* Is this name a vdev in our pools? */ + ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, + is_vdev, cb); + if (!ret) { + /* No match */ + break; + } + } + + cb->cb_vdev_names = tmp_name; + + return (ret); +} + +static int +is_pool_cb(zpool_handle_t *zhp, void *data) +{ + char *name = data; + if (strcmp(name, zpool_get_name(zhp)) == 0) + return (1); + + return (0); +} + +/* + * Do we have a pool named *name? If so, return 1, otherwise 0. 
+ */ +static int +is_pool(char *name) +{ + return (for_each_pool(0, NULL, B_TRUE, NULL, is_pool_cb, name)); +} + +/* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ +static int +are_all_pools(int argc, char **argv) { + if ((argc == 0) || !*argv) + return (0); + + while (--argc >= 0) + if (!is_pool(argv[argc])) + return (0); + + return (1); +} + +/* + * Helper function to print out vdev/pool names we can't resolve. Used for an + * error message. + */ +static void +error_list_unresolved_vdevs(int argc, char **argv, char *pool_name, + iostat_cbdata_t *cb) +{ + int i; + char *name; + char *str; + for (i = 0; i < argc; i++) { + name = argv[i]; + + if (is_pool(name)) + str = gettext("pool"); + else if (are_vdevs_in_pool(1, &name, pool_name, cb)) + str = gettext("vdev in this pool"); + else if (are_vdevs_in_pool(1, &name, NULL, cb)) + str = gettext("vdev in another pool"); + else + str = gettext("unknown"); + + fprintf(stderr, "\t%s (%s)\n", name, str); + } +} + +/* + * Same as get_interval_count(), but with additional checks to not misinterpret + * guids as interval/count values. Assumes VDEV_NAME_GUID is set in + * cb.cb_name_flags. + */ +static void +get_interval_count_filter_guids(int *argc, char **argv, float *interval, + unsigned long *count, iostat_cbdata_t *cb) +{ + char **tmpargv = argv; + int argc_for_interval = 0; + + /* Is the last arg an interval value? Or a guid? */ + if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, cb)) { + /* + * The last arg is not a guid, so it's probably an + * interval value. + */ + argc_for_interval++; + + if (*argc >= 2 && + !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, cb)) { + /* + * The 2nd to last arg is not a guid, so it's probably + * an interval value. 
+ */ + argc_for_interval++; + } + } + + /* Point to our list of possible intervals */ + tmpargv = &argv[*argc - argc_for_interval]; + + *argc = *argc - argc_for_interval; + get_interval_count(&argc_for_interval, tmpargv, + interval, count); +} + +/* + * Floating point sleep(). Allows you to pass in a floating point value for + * seconds. + */ +static void +fsleep(float sec) { + struct timespec req; + req.tv_sec = floor(sec); + req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; + nanosleep(&req, NULL); +} + + +/* + * zpool iostat [-ghHLpPvy] [[-lq]-w] [-n name] [-T d|u] + * [[ pool ...]|[pool vdev ...]|[vdev ...]] + * [interval [count]] * * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -P Display full path for vdev name. * -v Display statistics for individual vdevs + * -h Display help + * -p Display values in parsable (exact) format. + * -H Scripted mode. Don't display headers, and separate properties + * by a single tab. + * -l Display average latency + * -q Display queue depths + * -w Display histograms * -T Display a timestamp in date(1) or Unix format * * This command can be tricky because we want to be able to deal with pool @@ -2866,17 +3847,26 @@ zpool_do_iostat(int argc, char **argv) int c; int ret; int npools; - unsigned long interval = 0, count = 0; + float interval = 0; + unsigned long count = 0; zpool_list_t *list; boolean_t verbose = B_FALSE; + boolean_t latency = B_FALSE, histo = B_FALSE; + boolean_t queues = B_FALSE, parseable = B_FALSE, scripted = B_FALSE; boolean_t omit_since_boot = B_FALSE; boolean_t guid = B_FALSE; boolean_t follow_links = B_FALSE; boolean_t full_name = B_FALSE; iostat_cbdata_t cb = { 0 }; + /* Used for printing error message */ + const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q', + [IOS_L_HISTO] = 'w'}; + + uint64_t unsupported_flags; + /* check options */ - while ((c = getopt(argc, argv, "gLPT:vy")) != -1) { + while ((c = getopt(argc, argv, "gLPT:vyhplqwH")) != -1) 
{ switch (c) { case 'g': guid = B_TRUE; @@ -2893,9 +3883,27 @@ zpool_do_iostat(int argc, char **argv) case 'v': verbose = B_TRUE; break; + case 'p': + parseable = B_TRUE; + break; + case 'l': + latency = B_TRUE; + break; + case 'q': + queues = B_TRUE; + break; + case 'H': + scripted = B_TRUE; + break; + case 'w': + histo = B_TRUE; + break; case 'y': omit_since_boot = B_TRUE; break; + case 'h': + usage(B_FALSE); + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -2906,7 +3914,70 @@ zpool_do_iostat(int argc, char **argv) argc -= optind; argv += optind; - get_interval_count(&argc, argv, &interval, &count); + cb.cb_literal = parseable; + cb.cb_scripted = scripted; + + if (guid) + cb.cb_name_flags |= VDEV_NAME_GUID; + if (follow_links) + cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; + if (full_name) + cb.cb_name_flags |= VDEV_NAME_PATH; + cb.cb_iteration = 0; + cb.cb_namewidth = 0; + cb.cb_verbose = verbose; + + /* Get our interval and count values (if any) */ + if (guid) { + get_interval_count_filter_guids(&argc, argv, &interval, + &count, &cb); + } else { + get_interval_count(&argc, argv, &interval, &count); + } + + if (argc == 0) { + /* No args, so just print the defaults. 
*/ + } else if (are_all_pools(argc, argv)) { + /* All the args are pool names */ + } else if (are_vdevs_in_pool(argc, argv, NULL, &cb)) { + /* All the args are vdevs */ + cb.cb_vdev_names = argv; + cb.cb_vdev_names_count = argc; + argc = 0; /* No pools to process */ + } else if (are_all_pools(1, argv)) { + /* The first arg is a pool name */ + if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb)) { + /* ...and the rest are vdev names */ + cb.cb_vdev_names = argv + 1; + cb.cb_vdev_names_count = argc - 1; + argc = 1; /* One pool to process */ + } else { + fprintf(stderr, gettext("Expected either a list of ")); + fprintf(stderr, gettext("pools, or list of vdevs in")); + fprintf(stderr, " \"%s\", ", argv[0]); + fprintf(stderr, gettext("but got:\n")); + error_list_unresolved_vdevs(argc - 1, argv + 1, + argv[0], &cb); + fprintf(stderr, "\n"); + usage(B_FALSE); + return (1); + } + } else { + /* + * The args don't make sense. The first arg isn't a pool name, + * nor are all the args vdevs. + */ + fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n")); + fprintf(stderr, "\n"); + return (1); + } + + if (cb.cb_vdev_names_count != 0) { + /* + * If user specified vdevs, it implies verbose. + */ + cb.cb_verbose = B_TRUE; + } /* * Construct the list of all interesting pools. @@ -2926,19 +3997,56 @@ zpool_do_iostat(int argc, char **argv) return (1); } + if (histo && (queues || latency)) { + pool_list_free(list); + (void) fprintf(stderr, + gettext("-w isn't allowed with [-q|-l]\n")); + usage(B_FALSE); + return (1); + } + /* * Enter the main iostat loop. 
*/ cb.cb_list = list; - cb.cb_verbose = verbose; - if (guid) - cb.cb_name_flags |= VDEV_NAME_GUID; - if (follow_links) - cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; - if (full_name) - cb.cb_name_flags |= VDEV_NAME_PATH; - cb.cb_iteration = 0; - cb.cb_namewidth = 0; + + if (histo) { + /* + * Histograms tables look out of place when you try to display + * them with the other stats, so make a rule that you can only + * print histograms by themselves. + */ + cb.cb_flags = IOS_L_HISTO_M; + } else { + cb.cb_flags = IOS_DEFAULT_M; + if (latency) + cb.cb_flags |= IOS_LATENCY_M; + if (queues) + cb.cb_flags |= IOS_QUEUES_M; + } + + /* + * See if the module supports all the stats we want to display. + */ + unsupported_flags = cb.cb_flags & ~get_stat_flags(list); + if (unsupported_flags) { + uint64_t f; + int idx; + fprintf(stderr, + gettext("The loaded zfs module doesn't support:")); + + /* for each bit set in unsupported_flags */ + for (f = unsupported_flags; f; f &= ~(1ULL << idx)) { + idx = lowbit64(f) - 1; + fprintf(stderr, " -%c", flag_to_arg[idx]); + } + + fprintf(stderr, ". Try running a newer module.\n"), + pool_list_free(list); + + return (1); + } + for (;;) { if ((npools = pool_list_count(list)) == 0) @@ -2949,7 +4057,7 @@ zpool_do_iostat(int argc, char **argv) * we skip any printing. */ boolean_t skip = (omit_since_boot && - cb.cb_iteration == 0); + cb.cb_iteration == 0); /* * Refresh all statistics. This is done as an @@ -2958,7 +4066,7 @@ zpool_do_iostat(int argc, char **argv) * properly accounted for. 
*/ (void) pool_list_iter(list, B_FALSE, refresh_iostat, - &cb); + &cb); /* * Iterate over all pools to determine the maximum width @@ -2966,7 +4074,7 @@ zpool_do_iostat(int argc, char **argv) */ cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth, - &cb); + &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); @@ -2974,28 +4082,38 @@ zpool_do_iostat(int argc, char **argv) /* * If it's the first time and we're not skipping it, * or either skip or verbose mode, print the header. + * + * The histogram code explicitly prints its header on + * every vdev, so skip this for histograms. */ - if ((++cb.cb_iteration == 1 && !skip) || - (skip != verbose)) + if (((++cb.cb_iteration == 1 && !skip) || + (skip != verbose)) && + (!(cb.cb_flags & IOS_L_HISTO_M)) && + !cb.cb_scripted) print_iostat_header(&cb); if (skip) { - (void) sleep(interval); + (void) fsleep(interval); continue; } - (void) pool_list_iter(list, B_FALSE, print_iostat, &cb); + pool_list_iter(list, B_FALSE, print_iostat, &cb); /* * If there's more than one pool, and we're not in * verbose mode (which prints a separator for us), * then print a separator. + * + * In addition, if we're printing specific vdevs then + * we also want an ending separator. 
*/ - if (npools > 1 && !verbose) + if (((npools > 1 && !verbose && + !(cb.cb_flags & IOS_L_HISTO_M)) || + (!(cb.cb_flags & IOS_L_HISTO_M) && + cb.cb_vdev_names_count)) && + !cb.cb_scripted) { print_iostat_separator(&cb); - - if (verbose) - (void) printf("\n"); + } } /* @@ -3010,7 +4128,7 @@ zpool_do_iostat(int argc, char **argv) if (count != 0 && --count == 0) break; - (void) sleep(interval); + (void) fsleep(interval); } pool_list_free(list); @@ -3352,7 +4470,8 @@ zpool_do_list(int argc, char **argv) "name,size,allocated,free,expandsize,fragmentation,capacity," "dedupratio,health,altroot"; char *props = default_props; - unsigned long interval = 0, count = 0; + float interval = 0; + unsigned long count = 0; zpool_list_t *list; boolean_t first = B_TRUE; @@ -3427,7 +4546,7 @@ zpool_do_list(int argc, char **argv) break; pool_list_free(list); - (void) sleep(interval); + (void) fsleep(interval); } if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) { @@ -4776,7 +5895,8 @@ zpool_do_status(int argc, char **argv) { int c; int ret; - unsigned long interval = 0, count = 0; + float interval = 0; + unsigned long count = 0; status_cbdata_t cb = { 0 }; /* check options */ @@ -4841,7 +5961,7 @@ zpool_do_status(int argc, char **argv) if (count != 0 && --count == 0) break; - (void) sleep(interval); + (void) fsleep(interval); } return (0); diff --git a/cmd/zpool/zpool_util.c b/cmd/zpool/zpool_util.c index c7a002efb1..df3f9bf834 100644 --- a/cmd/zpool/zpool_util.c +++ b/cmd/zpool/zpool_util.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "zpool_util.h" @@ -84,3 +85,27 @@ num_logs(nvlist_t *nv) } return (nlogs); } + +/* Find the max element in an array of uint64_t values */ +uint64_t +array64_max(uint64_t array[], unsigned int len) { + uint64_t max = 0; + int i; + for (i = 0; i < len; i++) + max = MAX(max, array[i]); + + return (max); +} + +/* + * Return 1 if "str" is a number string, 0 otherwise. Works for integer and + * floating point numbers. 
+ */ +int +isnumber(char *str) { + for (; *str; str++) + if (!(isdigit(*str) || (*str == '.'))) + return (0); + + return (1); +} diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h index 1b4ce518f8..f279fd5dd6 100644 --- a/cmd/zpool/zpool_util.h +++ b/cmd/zpool/zpool_util.h @@ -38,6 +38,8 @@ extern "C" { void *safe_malloc(size_t); void zpool_no_memory(void); uint_t num_logs(nvlist_t *nv); +uint64_t array64_max(uint64_t array[], unsigned int len); +int isnumber(char *str); /* * Virtual device functions @@ -55,6 +57,10 @@ nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **, zpool_iter_f, void *); +/* Vdev list functions */ +typedef int (*pool_vdev_iter_f)(zpool_handle_t *, nvlist_t *, void *); +int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data); + typedef struct zpool_list zpool_list_t; zpool_list_t *pool_list_get(int, char **, zprop_list_t **, int *); diff --git a/include/libzfs.h b/include/libzfs.h index 3faee0addd..654b932843 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -747,10 +747,21 @@ extern int zfs_unshareall(zfs_handle_t *); extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, void *, void *, int, zfs_share_op_t); +enum zfs_nicenum_format { + ZFS_NICENUM_1024 = 0, + ZFS_NICENUM_TIME = 1, + ZFS_NICENUM_RAW = 2 +}; + /* * Utility function to convert a number to a human-readable form. 
*/ extern void zfs_nicenum(uint64_t, char *, size_t); +extern void zfs_nicenum_format(uint64_t num, char *buf, size_t buflen, + enum zfs_nicenum_format type); + + +extern void zfs_nicetime(uint64_t, char *, size_t); extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); /* diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index e2974ad7ac..65dba125c9 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -32,6 +32,7 @@ #define _SYS_FS_ZFS_H #include +#include #ifdef __cplusplus extern "C" { @@ -528,6 +529,37 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ + +/* container nvlist of extended stats */ +#define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex" + +/* Active queue read/write stats */ +#define ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE "vdev_sync_r_active_queue" +#define ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE "vdev_sync_w_active_queue" +#define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue" +#define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue" +#define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue" + +/* Queue sizes */ +#define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue" +#define ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE "vdev_sync_w_pend_queue" +#define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue" +#define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue" +#define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue" + +/* Latency read/write histogram stats */ +#define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo" +#define ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO "vdev_tot_w_lat_histo" +#define ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO "vdev_disk_r_lat_histo" +#define ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO "vdev_disk_w_lat_histo" +#define 
ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO "vdev_sync_r_lat_histo" +#define ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO "vdev_sync_w_lat_histo" +#define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo" +#define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo" +#define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo" + + + #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" @@ -766,8 +798,49 @@ typedef struct vdev_stat { uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ uint64_t vs_fragmentation; /* device fragmentation */ + } vdev_stat_t; +/* + * Extended stats + * + * These are stats which aren't included in the original iostat output. For + * convenience, they are grouped together in vdev_stat_ex, although each stat + * is individually exported as a nvlist. + */ +typedef struct vdev_stat_ex { + /* Number of ZIOs issued to disk and waiting to finish */ + uint64_t vsx_active_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; + + /* Number of ZIOs pending to be issued to disk */ + uint64_t vsx_pend_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; + + /* + * Below are the histograms for various latencies. Buckets are in + * units of nanoseconds. + */ + + /* + * 2^37 nanoseconds = 134s. Timeouts will probably start kicking in + * before this. + */ +#define VDEV_HISTO_BUCKETS 37 + + /* Amount of time in ZIO queue (ns) */ + uint64_t vsx_queue_histo[ZIO_PRIORITY_NUM_QUEUEABLE] + [VDEV_HISTO_BUCKETS]; + + /* Total ZIO latency (ns). Includes queuing and disk access time */ + uint64_t vsx_total_histo[ZIO_TYPES][VDEV_HISTO_BUCKETS]; + + /* Amount of time to read/write the disk (ns) */ + uint64_t vsx_disk_histo[ZIO_TYPES][VDEV_HISTO_BUCKETS]; + + /* "lookup the bucket for a value" macro */ +#define HISTO(a) (a != 0 ? MIN(highbit64(a) - 1, VDEV_HISTO_BUCKETS - 1) : 0) + +} vdev_stat_ex_t; + /* * DDT statistics. 
Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 5abd8c0194..4f54b1707c 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -85,7 +85,7 @@ extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); extern void vdev_deadman(vdev_t *vd); - +extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); extern void vdev_stat_update(zio_t *zio, uint64_t psize); @@ -153,6 +153,7 @@ extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); extern int vdev_label_number(uint64_t psise, uint64_t offset); extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg); extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **); +extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv); typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 4958cad9c4..0d09c81c7f 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -150,6 +150,7 @@ struct vdev { vdev_t **vdev_child; /* array of children */ uint64_t vdev_children; /* number of children */ vdev_stat_t vdev_stat; /* virtual device statistics */ + vdev_stat_ex_t vdev_stat_ex; /* extended statistics */ boolean_t vdev_expanding; /* expand the vdev? */ boolean_t vdev_reopening; /* reopen in progress? 
*/ boolean_t vdev_nonrot; /* true if solid state */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index e68223eb30..693035ee29 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -647,6 +647,7 @@ extern void delay(clock_t ticks); extern uint64_t physmem; extern int highbit64(uint64_t i); +extern int lowbit64(uint64_t i); extern int random_get_bytes(uint8_t *ptr, size_t len); extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); diff --git a/include/sys/zio.h b/include/sys/zio.h index ced7fe87bc..9790b4a900 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -421,7 +421,8 @@ struct zio { uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ hrtime_t io_delta; /* vdev queue service delta */ - uint64_t io_delay; /* vdev disk service delta (ticks) */ + hrtime_t io_delay; /* Device access time (disk or */ + /* file). */ avl_node_t io_queue_node; avl_node_t io_offset_node; diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h index e33b9585b1..3fc3589be0 100644 --- a/include/sys/zio_priority.h +++ b/include/sys/zio_priority.h @@ -29,8 +29,7 @@ typedef enum zio_priority { ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ ZIO_PRIORITY_NUM_QUEUEABLE, - - ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */ + ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */ } zio_priority_t; #ifdef __cplusplus diff --git a/lib/libspl/include/sys/sysmacros.h b/lib/libspl/include/sys/sysmacros.h index 5d10657be5..c2525dd2a4 100644 --- a/lib/libspl/include/sys/sysmacros.h +++ b/lib/libspl/include/sys/sysmacros.h @@ -39,6 +39,9 @@ #ifndef ABS #define ABS(a) ((a) < 0 ? 
-(a) : (a)) #endif +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof (a) / sizeof (a[0])) +#endif #define makedevice(maj, min) makedev(maj, min) #define _sysconf(a) sysconf(a) diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 8cacc01dd1..789df407c4 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3538,7 +3538,6 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, * If it's a raidz device, we need to stick in the parity level. */ if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) { - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &value) == 0); (void) snprintf(buf, sizeof (buf), "%s%llu", path, @@ -3552,7 +3551,6 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, */ if (name_flags & VDEV_NAME_TYPE_ID) { uint64_t id; - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &id) == 0); (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s-%llu", diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 57c2ac8538..926ed4ed8a 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -596,27 +596,49 @@ zfs_strdup(libzfs_handle_t *hdl, const char *str) * Convert a number to an appropriately human-readable output. 
*/ void -zfs_nicenum(uint64_t num, char *buf, size_t buflen) +zfs_nicenum_format(uint64_t num, char *buf, size_t buflen, + enum zfs_nicenum_format format) { uint64_t n = num; int index = 0; - char u; + const char *u; + const char *units[3][7] = { + [ZFS_NICENUM_1024] = {"", "K", "M", "G", "T", "P", "E"}, + [ZFS_NICENUM_TIME] = {"ns", "us", "ms", "s", "?", "?", "?"} + }; - while (n >= 1024 && index < 6) { - n /= 1024; + const int units_len[] = {[ZFS_NICENUM_1024] = 6, + [ZFS_NICENUM_TIME] = 4}; + + const int k_unit[] = { [ZFS_NICENUM_1024] = 1024, + [ZFS_NICENUM_TIME] = 1000}; + + double val; + + if (format == ZFS_NICENUM_RAW) { + snprintf(buf, buflen, "%llu", (u_longlong_t) num); + return; + } + + + while (n >= k_unit[format] && index < units_len[format]) { + n /= k_unit[format]; index++; } - u = " KMGTPE"[index]; + u = units[format][index]; - if (index == 0) { - (void) snprintf(buf, buflen, "%llu", (u_longlong_t) n); - } else if ((num & ((1ULL << 10 * index) - 1)) == 0) { + /* Don't print 0ns times */ + if ((format == ZFS_NICENUM_TIME) && (num == 0)) { + (void) snprintf(buf, buflen, "-"); + } else if ((index == 0) || ((num % + (uint64_t) powl(k_unit[format], index)) == 0)) { /* * If this is an even multiple of the base, always display * without any decimal precision. */ - (void) snprintf(buf, buflen, "%llu%c", (u_longlong_t) n, u); + (void) snprintf(buf, buflen, "%llu%s", (u_longlong_t) n, u); + } else { /* * We want to choose a precision that reflects the best choice @@ -629,13 +651,61 @@ zfs_nicenum(uint64_t num, char *buf, size_t buflen) */ int i; for (i = 2; i >= 0; i--) { - if (snprintf(buf, buflen, "%.*f%c", i, - (double)num / (1ULL << 10 * index), u) <= 5) - break; + val = (double) num / + (uint64_t) powl(k_unit[format], index); + + /* + * Don't print floating point values for time. Note, + * we use floor() instead of round() here, since + * round can result in undesirable results. 
For + * example, if "num" is in the range of + * 999500-999999, it will print out "1000us". This + * doesn't happen if we use floor(). + */ + if (format == ZFS_NICENUM_TIME) { + if (snprintf(buf, buflen, "%d%s", + (unsigned int) floor(val), u) <= 5) + break; + + } else { + if (snprintf(buf, buflen, "%.*f%s", i, + val, u) <= 5) + break; + } } } } +/* + * Convert a number to an appropriately human-readable output. + */ +void +zfs_nicenum(uint64_t num, char *buf, size_t buflen) +{ + zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_1024); +} + +/* + * Convert a time to an appropriately human-readable output. + * @num: Time in nanoseconds + */ +void +zfs_nicetime(uint64_t num, char *buf, size_t buflen) +{ + zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_TIME); +} + +/* + * Print out a raw number with correct column spacing + */ +void +zfs_niceraw(uint64_t num, char *buf, size_t buflen) +{ + zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_RAW); +} + + + void libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr) { diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 49d17ece32..3d85093e2d 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -1071,6 +1071,50 @@ highbit64(uint64_t i) return (h); } +/* + * Find lowest one bit set. + * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. + * This is basically a reimplementation of ffsll(), which is GNU specific. 
+ */ +int +lowbit64(uint64_t i) +{ + register int h = 64; + if (i == 0) + return (0); + + if (i & 0x00000000ffffffffULL) + h -= 32; + else + i >>= 32; + + if (i & 0x0000ffff) + h -= 16; + else + i >>= 16; + + if (i & 0x00ff) + h -= 8; + else + i >>= 8; + + if (i & 0x0f) + h -= 4; + else + i >>= 4; + + if (i & 0x3) + h -= 2; + else + i >>= 2; + + if (i & 0x1) + h -= 1; + + return (h); +} + + static int random_fd = -1, urandom_fd = -1; static int diff --git a/lib/libzpool/util.c b/lib/libzpool/util.c index 231043d75b..7a0748c032 100644 --- a/lib/libzpool/util.c +++ b/lib/libzpool/util.c @@ -67,7 +67,7 @@ static void show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) { vdev_stat_t *vs; - vdev_stat_t v0 = { 0 }; + vdev_stat_t *v0 = { 0 }; uint64_t sec; uint64_t is_log = 0; nvlist_t **child; @@ -76,6 +76,8 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) char rops[6], wops[6], rbytes[6], wbytes[6], rerr[6], werr[6], cerr[6]; char *prefix = ""; + v0 = umem_zalloc(sizeof (*v0), UMEM_NOFAIL); + if (indent == 0 && desc != NULL) { (void) printf(" " " capacity operations bandwidth ---- errors ----\n"); @@ -91,7 +93,7 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) != 0) - vs = &v0; + vs = v0; sec = MAX(1, vs->vs_timestamp / NANOSEC); @@ -114,6 +116,7 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) vs->vs_space ? 6 : 0, vs->vs_space ? avail : "", rops, wops, rbytes, wbytes, rerr, werr, cerr); } + free(v0); if (nvlist_lookup_nvlist_array(nv, ctype, &child, &children) != 0) return; diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index bcbcaa249a..1f14eee987 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -95,7 +95,9 @@ zpool \- configures ZFS storage pools .LP .nf -\fBzpool iostat\fR [\fB-T\fR d | u ] [\fB-gLPvy\fR] [\fIpool\fR] ... 
[\fIinterval\fR[\fIcount\fR]] +\fB\fBzpool iostat\fR [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-ghHLpPvy\fR] [\fB-w\fR|[\fB-lq\fR]] + [[\fIpool\fR ...]|[\fIpool vdev\fR ...]|[\fIvdev\fR ...]] [\fIinterval\fR[\fIcount\fR]]\fR + .fi .LP @@ -1677,11 +1679,22 @@ Scan using the default search path, the libblkid cache will not be consulted. A .ne 2 .mk .na -\fB\fBzpool iostat\fR [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-gLPvy\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]\fR +\fB\fBzpool iostat\fR [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-ghHLpPvy\fR] [\fB-w\fR|[\fB-lq\fR]] [[\fIpool\fR ...]|[\fIpool vdev\fR ...]|[\fIvdev\fR ...]] [\fIinterval\fR[\fIcount\fR]]\fR + .ad .sp .6 .RS 4n -Displays \fBI/O\fR statistics for the given pools. When given an interval, the statistics are printed every \fIinterval\fR seconds until \fBCtrl-C\fR is pressed. If no \fIpools\fR are specified, statistics for every pool in the system is shown. If \fIcount\fR is specified, the command exits after \fIcount\fR reports are printed. +Displays \fBI/O\fR statistics for the given \fIpool\fRs/\fIvdev\fRs. You can +pass in a list of \fIpool\fRs, a \fIpool\fR and list of \fIvdev\fRs in that +\fIpool\fR, or a list of any \fIvdev\fRs from any \fIpool\fR. If no items are +specified, statistics for every pool in the system are shown. When given an +interval, the statistics are printed every \fIinterval\fR seconds until +\fBCtrl-C\fR is pressed. If \fIcount\fR is specified, the command exits after +\fIcount\fR reports are printed. The first report printed is always the +statistics since boot regardless of whether \fIinterval\fR and \fIcount\fR +are passed. However, this behavior can be suppressed with the -y flag. Also +note that the units of 'K', 'M', 'G'... that are printed in the report are in +base 1024. To get the raw values, use the \fB-p\fR flag. .sp .ne 2 .mk @@ -1706,6 +1719,17 @@ Specify \fBu\fR for a printed representation of the internal representation of t Display vdev GUIDs instead of the normal device names.
These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. .RE +.sp +.ne 2 +.mk +.na +\fB\fB-H\fR\fR +.ad +.RS 12n +.rt +Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. +.RE + .sp .ne 2 .mk @@ -1717,6 +1741,17 @@ Display vdev GUIDs instead of the normal device names. These GUIDs can be used i Display real paths for vdevs resolving all symbolic links. This can be used to look up the current block device name regardless of the /dev/disk/ path used to open it. .RE +.sp +.ne 2 +.mk +.na +\fB\fB-p\fR\fR +.ad +.RS 12n +.rt +Display numbers in parseable (exact) values. Time values are in nanoseconds. +.RE + .sp .ne 2 .mk @@ -1749,9 +1784,177 @@ Verbose statistics. Reports usage statistics for individual \fIvdevs\fR within t .rt Omit statistics since boot. Normally the first line of output reports the statistics since boot. This option suppresses that first line of output. .RE +.sp +.ne 2 +.mk +.na +\fB\fB-w\fR\fR +.ad +.RS 12n +.rt +Display latency histograms: + +.sp +.ne 2 +.mk +.na +total_wait: +.ad +.RS 20n +.rt +Total IO time (queuing + disk IO time). +.RE +.ne 2 +.mk +.na +disk_wait: +.ad +.RS 20n +.rt +Disk IO time (time reading/writing the disk). +.RE +.ne 2 +.mk +.na +syncq_wait: +.ad +.RS 20n +.rt +Amount of time IO spent in synchronous priority queues. Does not include +disk time. +.RE +.ne 2 +.mk +.na +asyncq_wait: +.ad +.RS 20n +.rt +Amount of time IO spent in asynchronous priority queues. Does not include +disk time. +.RE +.ne 2 +.mk +.na +scrub: +.ad +.RS 20n +.rt +Amount of time IO spent in scrub queue. Does not include disk time. + .RE +All histogram buckets are power-of-two sized. The time labels are the end +ranges of the buckets, so for example, a 15ns bucket stores latencies from +8-15ns. The last bucket is also a catch-all for latencies higher than the +maximum. 
+.RE +.sp +.ne 2 +.mk +.na +\fB\fB-l\fR\fR +.ad +.RS 12n +.rt +Include average latency statistics: + +.sp +.ne 2 +.mk +.na +total_wait: +.ad +.RS 20n +.rt +Average total IO time (queuing + disk IO time). +.RE +.ne 2 +.mk +.na +disk_wait: +.ad +.RS 20n +.rt +Average disk IO time (time reading/writing the disk). +.RE +.ne 2 +.mk +.na +syncq_wait: +.ad +.RS 20n +.rt +Average amount of time IO spent in synchronous priority queues. Does not +include disk time. +.RE +.ne 2 +.mk +.na +asyncq_wait: +.ad +.RS 20n +.rt +Average amount of time IO spent in asynchronous priority queues. Does not +include disk time. +.RE +.ne 2 +.mk +.na +scrub: +.ad +.RS 20n +.rt +Average queuing time in scrub queue. Does not include disk time. +.RE + +.RE +.sp +.ne 2 +.mk +.na +\fB\fB-q\fR\fR +.ad +.RS 12n +.rt +Include active queue statistics. Each priority queue has both pending ("pend") +and active ("activ") IOs. Pending IOs are waiting to be issued to the disk, and +active IOs have been issued to disk and are waiting for completion. These stats +are broken out by priority queue: +.sp +.ne 2 +.mk +.na +syncq_read/write: +.ad +.RS 20n +.rt +Current number of entries in synchronous priority queues. +.RE +.ne 2 +.mk +.na +asyncq_read/write: +.ad +.RS 20n +.rt +Current number of entries in asynchronous priority queues. +.RE +.ne 2 +.mk +.na +scrubq_read: +.ad +.RS 20n +.rt +Current number of entries in scrub queue. +.RE + +All queue statistics are instantaneous measurements of the number of entries +in the queues. If you specify an interval, the measurements will be sampled +from the end of the interval. 
+.RE .sp .ne 2 .mk diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 9246495ee1..c23fd7a3aa 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3350,6 +3350,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vdev_get_stats(vd, vs); + vdev_config_generate_stats(vd, l2cache[i]); + } } } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 7114c2efca..1373901738 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -2764,49 +2764,123 @@ vdev_accessible(vdev_t *vd, zio_t *zio) return (B_TRUE); } +static void +vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) +{ + int t; + for (t = 0; t < ZIO_TYPES; t++) { + vs->vs_ops[t] += cvs->vs_ops[t]; + vs->vs_bytes[t] += cvs->vs_bytes[t]; + } + + cvs->vs_scan_removing = cvd->vdev_removing; +} + +/* + * Get extended stats + */ +static void +vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) +{ + int t, b; + for (t = 0; t < ZIO_TYPES; t++) { + for (b = 0; b < VDEV_HISTO_BUCKETS; b++) { + vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; + vsx->vsx_total_histo[t][b] += + cvsx->vsx_total_histo[t][b]; + } + } + + for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { + for (b = 0; b < VDEV_HISTO_BUCKETS; b++) { + vsx->vsx_queue_histo[t][b] += + cvsx->vsx_queue_histo[t][b]; + } + vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; + vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; + } +} + /* * Get statistics for the given vdev. 
*/ -void -vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) +static void +vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; int c, t; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - - mutex_enter(&vd->vdev_stat_lock); - bcopy(&vd->vdev_stat, vs, sizeof (*vs)); - vs->vs_timestamp = gethrtime() - vs->vs_timestamp; - vs->vs_state = vd->vdev_state; - vs->vs_rsize = vdev_get_min_asize(vd); - if (vd->vdev_ops->vdev_op_leaf) - vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; - vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; - if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) { - vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; - } - /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ - if (vd == rvd) { - for (c = 0; c < rvd->vdev_children; c++) { - vdev_t *cvd = rvd->vdev_child[c]; - vdev_stat_t *cvs = &cvd->vdev_stat; + if (!vd->vdev_ops->vdev_op_leaf) { + if (vs) { + memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); + memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); + } + if (vsx) + memset(vsx, 0, sizeof (*vsx)); - for (t = 0; t < ZIO_TYPES; t++) { - vs->vs_ops[t] += cvs->vs_ops[t]; - vs->vs_bytes[t] += cvs->vs_bytes[t]; - } - cvs->vs_scan_removing = cvd->vdev_removing; + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + vdev_stat_t *cvs = &cvd->vdev_stat; + vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; + + vdev_get_stats_ex_impl(cvd, cvs, cvsx); + if (vs) + vdev_get_child_stat(cvd, vs, cvs); + if (vsx) + vdev_get_child_stat_ex(cvd, vsx, cvsx); + + } + } else { + /* + * We're a leaf. Just copy our ZIO active queue stats in. The + * other leaf stats are updated in vdev_stat_update(). 
+ */ + if (!vsx) + return; + + memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); + + for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { + vsx->vsx_active_queue[t] = + vd->vdev_queue.vq_class[t].vqc_active; + vsx->vsx_pend_queue[t] = avl_numnodes( + &vd->vdev_queue.vq_class[t].vqc_queued_tree); } } +} + +void +vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) +{ + mutex_enter(&vd->vdev_stat_lock); + if (vs) { + bcopy(&vd->vdev_stat, vs, sizeof (*vs)); + vs->vs_timestamp = gethrtime() - vs->vs_timestamp; + vs->vs_state = vd->vdev_state; + vs->vs_rsize = vdev_get_min_asize(vd); + if (vd->vdev_ops->vdev_op_leaf) + vs->vs_rsize += VDEV_LABEL_START_SIZE + + VDEV_LABEL_END_SIZE; + vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; + if (vd->vdev_aux == NULL && vd == vd->vdev_top && + !vd->vdev_ishole) { + vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; + } + } + + ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_READER) != 0); + vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } +void +vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) +{ + return (vdev_get_stats_ex(vd, vs, NULL)); +} + void vdev_clear_stats(vdev_t *vd) { @@ -2840,6 +2914,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = &vd->vdev_stat; + vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; zio_type_t type = zio->io_type; int flags = zio->io_flags; @@ -2890,8 +2965,24 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vs->vs_self_healed += psize; } - vs->vs_ops[type]++; - vs->vs_bytes[type] += psize; + /* + * The bytes/ops/histograms are recorded at the leaf level and + * aggregated into the higher level vdevs in vdev_get_stats(). 
+ */ + if (vd->vdev_ops->vdev_op_leaf) { + + vs->vs_ops[type]++; + vs->vs_bytes[type] += psize; + + if (zio->io_delta && zio->io_delay) { + vsx->vsx_queue_histo[zio->io_priority] + [HISTO(zio->io_delta - zio->io_delay)]++; + vsx->vsx_disk_histo[type] + [HISTO(zio->io_delay)]++; + vsx->vsx_total_histo[type] + [HISTO(zio->io_delta)]++; + } + } mutex_exit(&vd->vdev_stat_lock); return; diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 9b51ecc1d9..4e362226a8 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -100,9 +100,9 @@ vdev_disk_error(zio_t *zio) { #ifdef ZFS_DEBUG printk("ZFS: zio error=%d type=%d offset=%llu size=%llu " - "flags=%x delay=%llu\n", zio->io_error, zio->io_type, + "flags=%x\n", zio->io_error, zio->io_type, (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, - zio->io_flags, (u_longlong_t)zio->io_delay); + zio->io_flags); #endif } @@ -410,7 +410,6 @@ vdev_disk_dio_put(dio_request_t *dr) vdev_disk_dio_free(dr); if (zio) { - zio->io_delay = jiffies_64 - zio->io_delay; zio->io_error = error; ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) @@ -588,8 +587,6 @@ retry: /* Extra reference to protect dio_request during vdev_submit_bio */ vdev_disk_dio_get(dr); - if (zio) - zio->io_delay = jiffies_64; /* Submit all bio's associated with this dio */ for (i = 0; i < dr->dr_bio_count; i++) @@ -630,7 +627,6 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, rc) int rc = bio->bi_error; #endif - zio->io_delay = jiffies_64 - zio->io_delay; zio->io_error = -rc; if (rc && (rc == -EOPNOTSUPP)) zio->io_vd->vdev_nowritecache = B_TRUE; @@ -660,7 +656,6 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) bio->bi_end_io = vdev_disk_io_flush_completion; bio->bi_private = zio; bio->bi_bdev = bdev; - zio->io_delay = jiffies_64; vdev_submit_bio(VDEV_WRITE_FLUSH_FUA, bio); invalidate_bdev(bdev); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 3dc3d0d9d3..1400aee7b7 100644 --- a/module/zfs/vdev_label.c 
+++ b/module/zfs/vdev_label.c @@ -207,6 +207,107 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE)); } +/* + * Generate the nvlist representing this vdev's stats + */ +void +vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) +{ + nvlist_t *nvx; + vdev_stat_t *vs; + vdev_stat_ex_t *vsx; + + vs = kmem_alloc(sizeof (*vs), KM_SLEEP); + vsx = kmem_alloc(sizeof (*vsx), KM_SLEEP); + + vdev_get_stats_ex(vd, vs, vsx); + fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t *)vs, sizeof (*vs) / sizeof (uint64_t)); + + kmem_free(vs, sizeof (*vs)); + + /* + * Add extended stats into a special extended stats nvlist. This keeps + * all the extended stats nicely grouped together. The extended stats + * nvlist is then added to the main nvlist. + */ + nvx = fnvlist_alloc(); + + /* ZIOs in flight to disk */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]); + + /* ZIOs pending */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_WRITE]); + + fnvlist_add_uint64(nvx, 
ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]); + + /* Histograms */ + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + vsx->vsx_total_histo[ZIO_TYPE_READ], + ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + vsx->vsx_total_histo[ZIO_TYPE_WRITE], + ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + vsx->vsx_disk_histo[ZIO_TYPE_READ], + ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + vsx->vsx_disk_histo[ZIO_TYPE_WRITE], + ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB])); + + /* Add extended stats nvlist to main nvlist */ + fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); + + kmem_free(vsx, sizeof (*vsx)); +} + /* * Generate the nvlist representing this vdev's config. 
*/ @@ -215,7 +316,6 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags) { nvlist_t *nv = NULL; - nv = fnvlist_alloc(); fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type); @@ -306,12 +406,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, } if (getstats) { - vdev_stat_t vs; pool_scan_stat_t ps; - vdev_get_stats(vd, &vs); - fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)); + vdev_config_generate_stats(vd, nv); /* provide either current or previous scan information */ if (spa_scan_get_stats(spa, &ps) == 0) { diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 2d16e632de..523a924d67 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -39,6 +39,7 @@ #include #include #include +#include /* * ========================================================================== @@ -2694,6 +2695,8 @@ zio_vdev_io_start(zio_t *zio) uint64_t align; spa_t *spa = zio->io_spa; + zio->io_delay = 0; + ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); @@ -2799,6 +2802,7 @@ zio_vdev_io_start(zio_t *zio) } } + zio->io_delay = gethrtime(); vd->vdev_ops->vdev_op_io_start(zio); return (ZIO_PIPELINE_STOP); } @@ -2815,6 +2819,9 @@ zio_vdev_io_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + if (zio->io_delay) + zio->io_delay = gethrtime() - zio->io_delay; + if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { vdev_queue_io_done(zio); @@ -3217,7 +3224,7 @@ zio_done(zio_t *zio) * 30 seconds to complete, post an error described the I/O delay. * We ignore these errors if the device is currently unavailable. 
*/ - if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) { + if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) { if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, zio, 0, 0); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 179f82e43d..c9b882987b 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -370,7 +370,7 @@ tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos', [tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', - 'zpool_iostat_003_neg'] + 'zpool_iostat_003_neg', 'zpool_iostat_004_pos'] [tests/functional/cli_user/zpool_list] tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am index 2c292b9999..621dff91f2 100644 --- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am @@ -4,4 +4,5 @@ dist_pkgdata_SCRIPTS = \ cleanup.ksh \ zpool_iostat_001_neg.ksh \ zpool_iostat_002_pos.ksh \ - zpool_iostat_003_neg.ksh + zpool_iostat_003_neg.ksh \ + zpool_iostat_004_pos.ksh diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh index d275e063b1..77eb6bd34f 100755 --- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh @@ -33,4 +33,4 @@ DISK=${DISKS%% *} -default_setup $DISK +default_raidz_setup $DISKS diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh index 37062ca536..ec5599acef 100755 --- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh +++ 
b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh @@ -33,13 +33,13 @@ # # DESCRIPTION: -# Verify that 'zpool iostat [interval [count]' can be executed as non-root. +# Verify that 'zpool iostat [interval [count]]' can be executed as non-root. # # STRATEGY: # 1. Set the interval to 1 and count to 4. # 2. Sleep for 4 seconds. # 3. Verify that the output has 4 records. -# +# 4. Set interval to 0.5 and count to 1 to test floating point intervals. verify_runnable "both" @@ -68,4 +68,7 @@ if [[ $stat_count -ne 4 ]]; then log_fail "zpool iostat [pool_name] [interval] [count] failed" fi +# Test a floating point interval value +log_must $ZPOOL iostat -v 0.5 1 + log_pass "zpool iostat [pool_name ...] [interval] [count] passed" diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh index d73f5d5c8e..ae1e5a1523 100755 --- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh @@ -51,13 +51,14 @@ else fi set -A args "" "-?" 
"-f" "nonexistpool" "$TESTPOOL/$TESTFS" \ - "$testpool 1.23" "$testpool 0" "$testpool -1" "$testpool 1 0" \ - "$testpool 0 0" + "$testpool 0" "$testpool -1" "$testpool 1 0" \ + "$testpool 0 0" "$testpool -wl" "$testpool -wq" log_assert "Executing 'zpool iostat' with bad options fails" typeset -i i=1 while [[ $i -lt ${#args[*]} ]]; do + log_note "doing $ZPOOL iostat ${args[i]}" log_mustnot $ZPOOL iostat ${args[i]} ((i = i + 1)) done diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh new file mode 100755 index 0000000000..70318dbb91 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh @@ -0,0 +1,74 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +# Copyright (C) 2016 Lawrence Livermore National Security, LLC. + +.
$STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Executing 'zpool iostat' command with various combinations of extended +# stats (-vqlw), parseable/script options (-pH), and misc lists of pools +# and vdevs. +# +# STRATEGY: +# 1. Create an array of mixed 'zpool iostat' options. +# 2. Execute each element of the array. +# 3. Verify the command succeeds (zero exit status). +# + +verify_runnable "both" + +typeset testpool +if is_global_zone ; then + testpool=$TESTPOOL +else + testpool=${TESTPOOL%%/*} +fi + +set -A args "" "-v" "-q" "-l" "-lq $TESTPOOL" "-ql ${DISKS[0]} ${DISKS[1]}" \ + "-w $TESTPOOL ${DISKS[0]} ${DISKS[1]}" \ + "-wp $TESTPOOL" \ + "-qlH $TESTPOOL ${DISKS[0]}" \ + "-vpH ${DISKS[0]}" \ + "-wpH ${DISKS[0]}" + +log_assert "Executing 'zpool iostat' with extended stat options succeeds" +log_note "testpool: $TESTPOOL, disks $DISKS" + +typeset -i i=1 +while [[ $i -lt ${#args[*]} ]]; do + log_note "doing $ZPOOL iostat ${args[i]}" + log_must $ZPOOL iostat ${args[i]} + ((i = i + 1)) +done + +log_pass "Executing 'zpool iostat' with extended stat options succeeds"