Add zpool status -s (slow I/Os) and -p (parseable)
This patch adds a new slow I/Os (-s) column to zpool status to show the number of VDEV slow I/Os. This is the number of I/Os that didn't complete in zio_slow_io_ms milliseconds. It also adds a new parsable (-p) flag to display exact values. NAME STATE READ WRITE CKSUM SLOW testpool ONLINE 0 0 0 - mirror-0 ONLINE 0 0 0 - loop0 ONLINE 0 0 0 20 loop1 ONLINE 0 0 0 0 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Closes #7756 Closes #6885
This commit is contained in:
parent
877d925a9e
commit
ad796b8a3b
|
@ -365,8 +365,8 @@ get_usage(zpool_help_t idx)
|
||||||
case HELP_RESILVER:
|
case HELP_RESILVER:
|
||||||
return (gettext("\tresilver <pool> ...\n"));
|
return (gettext("\tresilver <pool> ...\n"));
|
||||||
case HELP_STATUS:
|
case HELP_STATUS:
|
||||||
return (gettext("\tstatus [-c [script1,script2,...]] [-gLPvxD]"
|
return (gettext("\tstatus [-c [script1,script2,...]] "
|
||||||
"[-T d|u] [pool] ... \n"
|
"[-gLpPsvxD] [-T d|u] [pool] ... \n"
|
||||||
"\t [interval [count]]\n"));
|
"\t [interval [count]]\n"));
|
||||||
case HELP_UPGRADE:
|
case HELP_UPGRADE:
|
||||||
return (gettext("\tupgrade\n"
|
return (gettext("\tupgrade\n"
|
||||||
|
@ -1669,10 +1669,12 @@ typedef struct status_cbdata {
|
||||||
int cb_namewidth;
|
int cb_namewidth;
|
||||||
boolean_t cb_allpools;
|
boolean_t cb_allpools;
|
||||||
boolean_t cb_verbose;
|
boolean_t cb_verbose;
|
||||||
|
boolean_t cb_literal;
|
||||||
boolean_t cb_explain;
|
boolean_t cb_explain;
|
||||||
boolean_t cb_first;
|
boolean_t cb_first;
|
||||||
boolean_t cb_dedup_stats;
|
boolean_t cb_dedup_stats;
|
||||||
boolean_t cb_print_status;
|
boolean_t cb_print_status;
|
||||||
|
boolean_t cb_print_slow_ios;
|
||||||
vdev_cmd_data_list_t *vcdl;
|
vdev_cmd_data_list_t *vcdl;
|
||||||
} status_cbdata_t;
|
} status_cbdata_t;
|
||||||
|
|
||||||
|
@ -1788,10 +1790,34 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
|
||||||
name, state);
|
name, state);
|
||||||
|
|
||||||
if (!isspare) {
|
if (!isspare) {
|
||||||
zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
|
if (cb->cb_literal) {
|
||||||
zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
|
printf(" %5llu %5llu %5llu",
|
||||||
zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf));
|
(u_longlong_t)vs->vs_read_errors,
|
||||||
(void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
|
(u_longlong_t)vs->vs_write_errors,
|
||||||
|
(u_longlong_t)vs->vs_checksum_errors);
|
||||||
|
} else {
|
||||||
|
zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
|
||||||
|
zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
|
||||||
|
zfs_nicenum(vs->vs_checksum_errors, cbuf,
|
||||||
|
sizeof (cbuf));
|
||||||
|
printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cb->cb_print_slow_ios) {
|
||||||
|
if (children == 0) {
|
||||||
|
/* Only leafs vdevs have slow IOs */
|
||||||
|
zfs_nicenum(vs->vs_slow_ios, rbuf,
|
||||||
|
sizeof (rbuf));
|
||||||
|
} else {
|
||||||
|
snprintf(rbuf, sizeof (rbuf), "-");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cb->cb_literal)
|
||||||
|
printf(" %5llu", (u_longlong_t)vs->vs_slow_ios);
|
||||||
|
else
|
||||||
|
printf(" %5s", rbuf);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
|
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
|
||||||
|
@ -7175,6 +7201,9 @@ status_callback(zpool_handle_t *zhp, void *data)
|
||||||
cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE",
|
cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE",
|
||||||
"CKSUM");
|
"CKSUM");
|
||||||
|
|
||||||
|
if (cbp->cb_print_slow_ios)
|
||||||
|
(void) printf(" %5s", gettext("SLOW"));
|
||||||
|
|
||||||
if (cbp->vcdl != NULL)
|
if (cbp->vcdl != NULL)
|
||||||
print_cmd_columns(cbp->vcdl, 0);
|
print_cmd_columns(cbp->vcdl, 0);
|
||||||
|
|
||||||
|
@ -7241,13 +7270,15 @@ status_callback(zpool_handle_t *zhp, void *data)
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* zpool status [-c [script1,script2,...]] [-gLPvx] [-T d|u] [pool] ...
|
* zpool status [-c [script1,script2,...]] [-gLpPsvx] [-T d|u] [pool] ...
|
||||||
* [interval [count]]
|
* [interval [count]]
|
||||||
*
|
*
|
||||||
* -c CMD For each vdev, run command CMD
|
* -c CMD For each vdev, run command CMD
|
||||||
* -g Display guid for individual vdev name.
|
* -g Display guid for individual vdev name.
|
||||||
* -L Follow links when resolving vdev path name.
|
* -L Follow links when resolving vdev path name.
|
||||||
|
* -p Display values in parsable (exact) format.
|
||||||
* -P Display full path for vdev name.
|
* -P Display full path for vdev name.
|
||||||
|
* -s Display slow IOs column.
|
||||||
* -v Display complete error logs
|
* -v Display complete error logs
|
||||||
* -x Display only pools with potential problems
|
* -x Display only pools with potential problems
|
||||||
* -D Display dedup status (undocumented)
|
* -D Display dedup status (undocumented)
|
||||||
|
@ -7266,7 +7297,7 @@ zpool_do_status(int argc, char **argv)
|
||||||
char *cmd = NULL;
|
char *cmd = NULL;
|
||||||
|
|
||||||
/* check options */
|
/* check options */
|
||||||
while ((c = getopt(argc, argv, "c:gLPvxDT:")) != -1) {
|
while ((c = getopt(argc, argv, "c:gLpPsvxDT:")) != -1) {
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case 'c':
|
case 'c':
|
||||||
if (cmd != NULL) {
|
if (cmd != NULL) {
|
||||||
|
@ -7298,9 +7329,15 @@ zpool_do_status(int argc, char **argv)
|
||||||
case 'L':
|
case 'L':
|
||||||
cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
|
cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
|
||||||
break;
|
break;
|
||||||
|
case 'p':
|
||||||
|
cb.cb_literal = B_TRUE;
|
||||||
|
break;
|
||||||
case 'P':
|
case 'P':
|
||||||
cb.cb_name_flags |= VDEV_NAME_PATH;
|
cb.cb_name_flags |= VDEV_NAME_PATH;
|
||||||
break;
|
break;
|
||||||
|
case 's':
|
||||||
|
cb.cb_print_slow_ios = B_TRUE;
|
||||||
|
break;
|
||||||
case 'v':
|
case 'v':
|
||||||
cb.cb_verbose = B_TRUE;
|
cb.cb_verbose = B_TRUE;
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -74,6 +74,7 @@ extern "C" {
|
||||||
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS "vdev_read_errors"
|
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS "vdev_read_errors"
|
||||||
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS "vdev_write_errors"
|
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS "vdev_write_errors"
|
||||||
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS "vdev_cksum_errors"
|
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS "vdev_cksum_errors"
|
||||||
|
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays"
|
||||||
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
|
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
|
||||||
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
|
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
|
||||||
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"
|
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"
|
||||||
|
|
|
@ -665,6 +665,9 @@ typedef struct zpool_load_policy {
|
||||||
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo"
|
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo"
|
||||||
#define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo"
|
#define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo"
|
||||||
|
|
||||||
|
/* Number of slow IOs */
|
||||||
|
#define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios"
|
||||||
|
|
||||||
/* vdev enclosure sysfs path */
|
/* vdev enclosure sysfs path */
|
||||||
#define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path"
|
#define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path"
|
||||||
|
|
||||||
|
@ -990,6 +993,7 @@ typedef struct vdev_stat {
|
||||||
uint64_t vs_fragmentation; /* device fragmentation */
|
uint64_t vs_fragmentation; /* device fragmentation */
|
||||||
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
|
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
|
||||||
uint64_t vs_resilver_deferred; /* resilver deferred */
|
uint64_t vs_resilver_deferred; /* resilver deferred */
|
||||||
|
uint64_t vs_slow_ios; /* slow IOs */
|
||||||
} vdev_stat_t;
|
} vdev_stat_t;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -1076,9 +1076,11 @@ extern const char *spa_state_to_name(spa_t *spa);
|
||||||
/* error handling */
|
/* error handling */
|
||||||
struct zbookmark_phys;
|
struct zbookmark_phys;
|
||||||
extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb);
|
extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb);
|
||||||
extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
|
extern int zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
|
||||||
const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
|
const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
|
||||||
uint64_t length);
|
uint64_t length);
|
||||||
|
extern boolean_t zfs_ereport_is_valid(const char *class, spa_t *spa, vdev_t *vd,
|
||||||
|
zio_t *zio);
|
||||||
extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
|
extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
|
||||||
const char *name, nvlist_t *aux);
|
const char *name, nvlist_t *aux);
|
||||||
extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
|
extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
|
||||||
|
|
|
@ -158,11 +158,6 @@ enum zio_encrypt {
|
||||||
(compress) == ZIO_COMPRESS_ON || \
|
(compress) == ZIO_COMPRESS_ON || \
|
||||||
(compress) == ZIO_COMPRESS_OFF)
|
(compress) == ZIO_COMPRESS_OFF)
|
||||||
|
|
||||||
/*
|
|
||||||
* Default Linux timeout for a sd device.
|
|
||||||
*/
|
|
||||||
#define ZIO_DELAY_MAX (30 * MILLISEC)
|
|
||||||
|
|
||||||
#define ZIO_FAILURE_MODE_WAIT 0
|
#define ZIO_FAILURE_MODE_WAIT 0
|
||||||
#define ZIO_FAILURE_MODE_CONTINUE 1
|
#define ZIO_FAILURE_MODE_CONTINUE 1
|
||||||
#define ZIO_FAILURE_MODE_PANIC 2
|
#define ZIO_FAILURE_MODE_PANIC 2
|
||||||
|
@ -664,7 +659,7 @@ extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
|
||||||
extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
|
extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
|
||||||
|
|
||||||
/* If we have the good data in hand, this function can be used */
|
/* If we have the good data in hand, this function can be used */
|
||||||
extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
|
extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
|
||||||
const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
|
const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
|
||||||
uint64_t length, const abd_t *good_data, const abd_t *bad_data,
|
uint64_t length, const abd_t *good_data, const abd_t *bad_data,
|
||||||
struct zio_bad_cksum *info);
|
struct zio_bad_cksum *info);
|
||||||
|
|
|
@ -96,8 +96,9 @@ information regarding "hung" I/O detection and configuration.
|
||||||
.ad
|
.ad
|
||||||
.RS 12n
|
.RS 12n
|
||||||
Issued when a completed I/O exceeds the maximum allowed time specified
|
Issued when a completed I/O exceeds the maximum allowed time specified
|
||||||
by the \fBzio_delay_max\fR module option. This can be an indicator of
|
by the \fBzio_slow_io_ms\fR module option. This can be an indicator of
|
||||||
problems with the underlying storage device.
|
problems with the underlying storage device. The number of delay events is
|
||||||
|
ratelimited by the \fBzfs_slow_io_events_per_second\fR module parameter.
|
||||||
.RE
|
.RE
|
||||||
|
|
||||||
.sp
|
.sp
|
||||||
|
@ -697,10 +698,9 @@ full list of all the I/O stages.
|
||||||
\fBzio_delay\fR
|
\fBzio_delay\fR
|
||||||
.ad
|
.ad
|
||||||
.RS 12n
|
.RS 12n
|
||||||
The time in ticks (HZ) required for the block layer to service the I/O. Unlike
|
The time elapsed (in nanoseconds) waiting for the block layer to complete the
|
||||||
\fBzio_delta\fR this does not include any vdev queuing time and is therefore
|
I/O. Unlike \fBzio_delta\fR this does not include any vdev queuing time and is
|
||||||
solely a measure of the block layer performance. On most modern Linux systems
|
therefore solely a measure of the block layer performance.
|
||||||
HZ is defined as 1000 making a tick equivalent to 1 millisecond.
|
|
||||||
.RE
|
.RE
|
||||||
|
|
||||||
.sp
|
.sp
|
||||||
|
|
|
@ -1086,7 +1086,7 @@ Default value: \fB600,000\fR.
|
||||||
.ad
|
.ad
|
||||||
.RS 12n
|
.RS 12n
|
||||||
Interval in milliseconds after which the deadman is triggered and an
|
Interval in milliseconds after which the deadman is triggered and an
|
||||||
individual IO operation is considered to be "hung". As long as the I/O
|
individual I/O operation is considered to be "hung". As long as the I/O
|
||||||
remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR
|
remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR
|
||||||
milliseconds until the I/O completes.
|
milliseconds until the I/O completes.
|
||||||
.sp
|
.sp
|
||||||
|
@ -1141,10 +1141,10 @@ Default value: \fB500,000\fR.
|
||||||
.sp
|
.sp
|
||||||
.ne 2
|
.ne 2
|
||||||
.na
|
.na
|
||||||
\fBzfs_delays_per_second\fR (int)
|
\fBzfs_low_ios_per_second\fR (int)
|
||||||
.ad
|
.ad
|
||||||
.RS 12n
|
.RS 12n
|
||||||
Rate limit IO delay events to this many per second.
|
Rate limit delay zevents (which report slow I/Os) to this many per second.
|
||||||
.sp
|
.sp
|
||||||
Default value: 20
|
Default value: 20
|
||||||
.RE
|
.RE
|
||||||
|
@ -1655,7 +1655,7 @@ Default value: \fB50\fR.
|
||||||
.ad
|
.ad
|
||||||
.RS 12n
|
.RS 12n
|
||||||
We currently support block sizes from 512 bytes to 16MB. The benefits of
|
We currently support block sizes from 512 bytes to 16MB. The benefits of
|
||||||
larger blocks, and thus larger IO, need to be weighed against the cost of
|
larger blocks, and thus larger I/O, need to be weighed against the cost of
|
||||||
COWing a giant block to modify one byte. Additionally, very large blocks
|
COWing a giant block to modify one byte. Additionally, very large blocks
|
||||||
can have an impact on i/o latency, and also potentially on the memory
|
can have an impact on i/o latency, and also potentially on the memory
|
||||||
allocator. Therefore, we do not allow the recordsize to be set larger than
|
allocator. Therefore, we do not allow the recordsize to be set larger than
|
||||||
|
@ -2552,12 +2552,13 @@ Default value: \fB0\fR.
|
||||||
.sp
|
.sp
|
||||||
.ne 2
|
.ne 2
|
||||||
.na
|
.na
|
||||||
\fBzio_delay_max\fR (int)
|
\fBzio_slow_io_ms\fR (int)
|
||||||
.ad
|
.ad
|
||||||
.RS 12n
|
.RS 12n
|
||||||
A zevent will be logged if a ZIO operation takes more than N milliseconds to
|
When an I/O operation takes more than \fBzio_slow_io_ms\fR milliseconds to
|
||||||
complete. Note that this is only a logging facility, not a timeout on
|
complete is marked as a slow I/O. Each slow I/O causes a delay zevent. Slow
|
||||||
operations.
|
I/O counters can be seen with "zpool status -s".
|
||||||
|
|
||||||
.sp
|
.sp
|
||||||
Default value: \fB30,000\fR.
|
Default value: \fB30,000\fR.
|
||||||
.RE
|
.RE
|
||||||
|
@ -2568,7 +2569,7 @@ Default value: \fB30,000\fR.
|
||||||
\fBzio_dva_throttle_enabled\fR (int)
|
\fBzio_dva_throttle_enabled\fR (int)
|
||||||
.ad
|
.ad
|
||||||
.RS 12n
|
.RS 12n
|
||||||
Throttle block allocations in the ZIO pipeline. This allows for
|
Throttle block allocations in the I/O pipeline. This allows for
|
||||||
dynamic allocation distribution when devices are imbalanced.
|
dynamic allocation distribution when devices are imbalanced.
|
||||||
When enabled, the maximum number of pending allocations per top-level vdev
|
When enabled, the maximum number of pending allocations per top-level vdev
|
||||||
is limited by \fBzfs_vdev_queue_depth_pct\fR.
|
is limited by \fBzfs_vdev_queue_depth_pct\fR.
|
||||||
|
@ -2594,7 +2595,7 @@ Default value: \fB0\fR.
|
||||||
.ad
|
.ad
|
||||||
.RS 12n
|
.RS 12n
|
||||||
Percentage of online CPUs (or CPU cores, etc) which will run a worker thread
|
Percentage of online CPUs (or CPU cores, etc) which will run a worker thread
|
||||||
for IO. These workers are responsible for IO work such as compression and
|
for I/O. These workers are responsible for I/O work such as compression and
|
||||||
checksum calculations. Fractional number of CPUs will be rounded down.
|
checksum calculations. Fractional number of CPUs will be rounded down.
|
||||||
.sp
|
.sp
|
||||||
The default value of 75 was chosen to avoid using all CPUs which can result in
|
The default value of 75 was chosen to avoid using all CPUs which can result in
|
||||||
|
|
|
@ -182,7 +182,7 @@
|
||||||
.Nm
|
.Nm
|
||||||
.Cm status
|
.Cm status
|
||||||
.Oo Fl c Ar SCRIPT Oc
|
.Oo Fl c Ar SCRIPT Oc
|
||||||
.Op Fl gLPvxD
|
.Op Fl DgLpPsvx
|
||||||
.Op Fl T Sy u Ns | Ns Sy d
|
.Op Fl T Sy u Ns | Ns Sy d
|
||||||
.Oo Ar pool Oc Ns ...
|
.Oo Ar pool Oc Ns ...
|
||||||
.Op Ar interval Op Ar count
|
.Op Ar interval Op Ar count
|
||||||
|
@ -2167,7 +2167,7 @@ and automatically import it.
|
||||||
.Nm
|
.Nm
|
||||||
.Cm status
|
.Cm status
|
||||||
.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ...
|
.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ...
|
||||||
.Op Fl gLPvxD
|
.Op Fl DgLpPsvx
|
||||||
.Op Fl T Sy u Ns | Ns Sy d
|
.Op Fl T Sy u Ns | Ns Sy d
|
||||||
.Oo Ar pool Oc Ns ...
|
.Oo Ar pool Oc Ns ...
|
||||||
.Op Ar interval Op Ar count
|
.Op Ar interval Op Ar count
|
||||||
|
@ -2203,6 +2203,8 @@ Display real paths for vdevs resolving all symbolic links. This can
|
||||||
be used to look up the current block device name regardless of the
|
be used to look up the current block device name regardless of the
|
||||||
.Pa /dev/disk/
|
.Pa /dev/disk/
|
||||||
path used to open it.
|
path used to open it.
|
||||||
|
.It Fl p
|
||||||
|
Display numbers in parsable (exact) values.
|
||||||
.It Fl P
|
.It Fl P
|
||||||
Display full paths for vdevs instead of only the last component of
|
Display full paths for vdevs instead of only the last component of
|
||||||
the path. This can be used in conjunction with the
|
the path. This can be used in conjunction with the
|
||||||
|
@ -2214,6 +2216,12 @@ Display a histogram of deduplication statistics, showing the allocated
|
||||||
and referenced
|
and referenced
|
||||||
.Pq logically referenced in the pool
|
.Pq logically referenced in the pool
|
||||||
block counts and sizes by reference count.
|
block counts and sizes by reference count.
|
||||||
|
.It Fl s
|
||||||
|
Display the number of leaf VDEV slow IOs. This is the number of IOs that
|
||||||
|
didn't complete in \fBzio_slow_io_ms\fR milliseconds (default 30 seconds).
|
||||||
|
This does not necessarily mean the IOs failed to complete, just took an
|
||||||
|
unreasonably long amount of time. This may indicate a problem with the
|
||||||
|
underlying storage.
|
||||||
.It Fl T Sy u Ns | Ns Sy d
|
.It Fl T Sy u Ns | Ns Sy d
|
||||||
Display a time stamp.
|
Display a time stamp.
|
||||||
Specify
|
Specify
|
||||||
|
|
|
@ -77,14 +77,14 @@ int vdev_validate_skip = B_FALSE;
|
||||||
int vdev_dtl_sm_blksz = (1 << 12);
|
int vdev_dtl_sm_blksz = (1 << 12);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Rate limit delay events to this many IO delays per second.
|
* Rate limit slow IO (delay) events to this many per second.
|
||||||
*/
|
*/
|
||||||
unsigned int zfs_delays_per_second = 20;
|
unsigned int zfs_slow_io_events_per_second = 20;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Rate limit checksum events after this many checksum errors per second.
|
* Rate limit checksum events after this many checksum errors per second.
|
||||||
*/
|
*/
|
||||||
unsigned int zfs_checksums_per_second = 20;
|
unsigned int zfs_checksum_events_per_second = 20;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Ignore errors during scrub/resilver. Allows to work around resilver
|
* Ignore errors during scrub/resilver. Allows to work around resilver
|
||||||
|
@ -507,8 +507,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
||||||
* and checksum events so that we don't overwhelm ZED with thousands
|
* and checksum events so that we don't overwhelm ZED with thousands
|
||||||
* of events when a disk is acting up.
|
* of events when a disk is acting up.
|
||||||
*/
|
*/
|
||||||
zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_delays_per_second, 1);
|
zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
|
||||||
zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksums_per_second, 1);
|
1);
|
||||||
|
zfs_ratelimit_init(&vd->vdev_checksum_rl,
|
||||||
|
&zfs_checksum_events_per_second, 1);
|
||||||
|
|
||||||
list_link_init(&vd->vdev_config_dirty_node);
|
list_link_init(&vd->vdev_config_dirty_node);
|
||||||
list_link_init(&vd->vdev_state_dirty_node);
|
list_link_init(&vd->vdev_state_dirty_node);
|
||||||
|
@ -3591,6 +3593,7 @@ vdev_clear(spa_t *spa, vdev_t *vd)
|
||||||
vd->vdev_stat.vs_read_errors = 0;
|
vd->vdev_stat.vs_read_errors = 0;
|
||||||
vd->vdev_stat.vs_write_errors = 0;
|
vd->vdev_stat.vs_write_errors = 0;
|
||||||
vd->vdev_stat.vs_checksum_errors = 0;
|
vd->vdev_stat.vs_checksum_errors = 0;
|
||||||
|
vd->vdev_stat.vs_slow_ios = 0;
|
||||||
|
|
||||||
for (int c = 0; c < vd->vdev_children; c++)
|
for (int c = 0; c < vd->vdev_children; c++)
|
||||||
vdev_clear(spa, vd->vdev_child[c]);
|
vdev_clear(spa, vd->vdev_child[c]);
|
||||||
|
@ -4630,12 +4633,12 @@ module_param(vdev_ms_count_limit, int, 0644);
|
||||||
MODULE_PARM_DESC(vdev_ms_count_limit,
|
MODULE_PARM_DESC(vdev_ms_count_limit,
|
||||||
"Practical upper limit of total metaslabs per top-level vdev");
|
"Practical upper limit of total metaslabs per top-level vdev");
|
||||||
|
|
||||||
module_param(zfs_delays_per_second, uint, 0644);
|
module_param(zfs_slow_io_events_per_second, uint, 0644);
|
||||||
MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many "
|
MODULE_PARM_DESC(zfs_slow_io_events_per_second,
|
||||||
"IO delays per second");
|
"Rate limit slow IO (delay) events to this many per second");
|
||||||
|
|
||||||
module_param(zfs_checksums_per_second, uint, 0644);
|
module_param(zfs_checksum_events_per_second, uint, 0644);
|
||||||
MODULE_PARM_DESC(zfs_checksums_per_second, "Rate limit checksum events "
|
MODULE_PARM_DESC(zfs_checksum_events_per_second, "Rate limit checksum events "
|
||||||
"to this many checksum errors per second (do not set below zed"
|
"to this many checksum errors per second (do not set below zed"
|
||||||
"threshold).");
|
"threshold).");
|
||||||
|
|
||||||
|
|
|
@ -347,6 +347,9 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
|
||||||
vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB],
|
vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB],
|
||||||
ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB]));
|
ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB]));
|
||||||
|
|
||||||
|
/* IO delays */
|
||||||
|
fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);
|
||||||
|
|
||||||
/* Add extended stats nvlist to main nvlist */
|
/* Add extended stats nvlist to main nvlist */
|
||||||
fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);
|
fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);
|
||||||
|
|
||||||
|
|
|
@ -140,7 +140,10 @@ zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
|
||||||
return (rc);
|
return (rc);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
/*
|
||||||
|
* Return B_TRUE if the event actually posted, B_FALSE if not.
|
||||||
|
*/
|
||||||
|
static boolean_t
|
||||||
zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
||||||
const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
|
const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
|
||||||
zio_t *zio, uint64_t stateoroffset, uint64_t size)
|
zio_t *zio, uint64_t stateoroffset, uint64_t size)
|
||||||
|
@ -150,78 +153,15 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
||||||
uint64_t ena;
|
uint64_t ena;
|
||||||
char class[64];
|
char class[64];
|
||||||
|
|
||||||
/*
|
if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
|
||||||
* If we are doing a spa_tryimport() or in recovery mode,
|
return (B_FALSE);
|
||||||
* ignore errors.
|
|
||||||
*/
|
|
||||||
if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
|
|
||||||
spa_load_state(spa) == SPA_LOAD_RECOVER)
|
|
||||||
return;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If we are in the middle of opening a pool, and the previous attempt
|
|
||||||
* failed, don't bother logging any new ereports - we're just going to
|
|
||||||
* get the same diagnosis anyway.
|
|
||||||
*/
|
|
||||||
if (spa_load_state(spa) != SPA_LOAD_NONE &&
|
|
||||||
spa->spa_last_open_failed)
|
|
||||||
return;
|
|
||||||
|
|
||||||
if (zio != NULL) {
|
|
||||||
/*
|
|
||||||
* If this is not a read or write zio, ignore the error. This
|
|
||||||
* can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
|
|
||||||
*/
|
|
||||||
if (zio->io_type != ZIO_TYPE_READ &&
|
|
||||||
zio->io_type != ZIO_TYPE_WRITE)
|
|
||||||
return;
|
|
||||||
|
|
||||||
if (vd != NULL) {
|
|
||||||
/*
|
|
||||||
* If the vdev has already been marked as failing due
|
|
||||||
* to a failed probe, then ignore any subsequent I/O
|
|
||||||
* errors, as the DE will automatically fault the vdev
|
|
||||||
* on the first such failure. This also catches cases
|
|
||||||
* where vdev_remove_wanted is set and the device has
|
|
||||||
* not yet been asynchronously placed into the REMOVED
|
|
||||||
* state.
|
|
||||||
*/
|
|
||||||
if (zio->io_vd == vd && !vdev_accessible(vd, zio))
|
|
||||||
return;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Ignore checksum errors for reads from DTL regions of
|
|
||||||
* leaf vdevs.
|
|
||||||
*/
|
|
||||||
if (zio->io_type == ZIO_TYPE_READ &&
|
|
||||||
zio->io_error == ECKSUM &&
|
|
||||||
vd->vdev_ops->vdev_op_leaf &&
|
|
||||||
vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* For probe failure, we want to avoid posting ereports if we've
|
|
||||||
* already removed the device in the meantime.
|
|
||||||
*/
|
|
||||||
if (vd != NULL &&
|
|
||||||
strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
|
|
||||||
(vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
|
|
||||||
return;
|
|
||||||
|
|
||||||
if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
|
|
||||||
(zio != NULL) && (!zio->io_timestamp)) {
|
|
||||||
/* Ignore bogus delay events */
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((ereport = fm_nvlist_create(NULL)) == NULL)
|
if ((ereport = fm_nvlist_create(NULL)) == NULL)
|
||||||
return;
|
return (B_FALSE);
|
||||||
|
|
||||||
if ((detector = fm_nvlist_create(NULL)) == NULL) {
|
if ((detector = fm_nvlist_create(NULL)) == NULL) {
|
||||||
fm_nvlist_destroy(ereport, FM_NVA_FREE);
|
fm_nvlist_destroy(ereport, FM_NVA_FREE);
|
||||||
return;
|
return (B_FALSE);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -332,7 +272,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
||||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
|
FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
|
||||||
DATA_TYPE_UINT64, vs->vs_write_errors,
|
DATA_TYPE_UINT64, vs->vs_write_errors,
|
||||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
|
FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
|
||||||
DATA_TYPE_UINT64, vs->vs_checksum_errors, NULL);
|
DATA_TYPE_UINT64, vs->vs_checksum_errors,
|
||||||
|
FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
|
||||||
|
DATA_TYPE_UINT64, vs->vs_slow_ios,
|
||||||
|
NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pvd != NULL) {
|
if (pvd != NULL) {
|
||||||
|
@ -427,7 +370,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
||||||
/*
|
/*
|
||||||
* Payload for I/Os with corresponding logical information.
|
* Payload for I/Os with corresponding logical information.
|
||||||
*/
|
*/
|
||||||
if (zb != NULL && (zio == NULL || zio->io_logical != NULL))
|
if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
|
||||||
fm_payload_set(ereport,
|
fm_payload_set(ereport,
|
||||||
FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
|
FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
|
||||||
DATA_TYPE_UINT64, zb->zb_objset,
|
DATA_TYPE_UINT64, zb->zb_objset,
|
||||||
|
@ -437,11 +380,13 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
||||||
DATA_TYPE_INT64, zb->zb_level,
|
DATA_TYPE_INT64, zb->zb_level,
|
||||||
FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
|
FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
|
||||||
DATA_TYPE_UINT64, zb->zb_blkid, NULL);
|
DATA_TYPE_UINT64, zb->zb_blkid, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
mutex_exit(&spa->spa_errlist_lock);
|
mutex_exit(&spa->spa_errlist_lock);
|
||||||
|
|
||||||
*ereport_out = ereport;
|
*ereport_out = ereport;
|
||||||
*detector_out = detector;
|
*detector_out = detector;
|
||||||
|
return (B_TRUE);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if it's <= 128 bytes, save the corruption directly */
|
/* if it's <= 128 bytes, save the corruption directly */
|
||||||
|
@ -765,27 +710,111 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void
|
/*
|
||||||
|
* Make sure our event is still valid for the given zio/vdev/pool. For example,
|
||||||
|
* we don't want to keep logging events for a faulted or missing vdev.
|
||||||
|
*/
|
||||||
|
boolean_t
|
||||||
|
zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
|
||||||
|
{
|
||||||
|
#ifdef _KERNEL
|
||||||
|
/*
|
||||||
|
* If we are doing a spa_tryimport() or in recovery mode,
|
||||||
|
* ignore errors.
|
||||||
|
*/
|
||||||
|
if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
|
||||||
|
spa_load_state(spa) == SPA_LOAD_RECOVER)
|
||||||
|
return (B_FALSE);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we are in the middle of opening a pool, and the previous attempt
|
||||||
|
* failed, don't bother logging any new ereports - we're just going to
|
||||||
|
* get the same diagnosis anyway.
|
||||||
|
*/
|
||||||
|
if (spa_load_state(spa) != SPA_LOAD_NONE &&
|
||||||
|
spa->spa_last_open_failed)
|
||||||
|
return (B_FALSE);
|
||||||
|
|
||||||
|
if (zio != NULL) {
|
||||||
|
/*
|
||||||
|
* If this is not a read or write zio, ignore the error. This
|
||||||
|
* can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
|
||||||
|
*/
|
||||||
|
if (zio->io_type != ZIO_TYPE_READ &&
|
||||||
|
zio->io_type != ZIO_TYPE_WRITE)
|
||||||
|
return (B_FALSE);
|
||||||
|
|
||||||
|
if (vd != NULL) {
|
||||||
|
/*
|
||||||
|
* If the vdev has already been marked as failing due
|
||||||
|
* to a failed probe, then ignore any subsequent I/O
|
||||||
|
* errors, as the DE will automatically fault the vdev
|
||||||
|
* on the first such failure. This also catches cases
|
||||||
|
* where vdev_remove_wanted is set and the device has
|
||||||
|
* not yet been asynchronously placed into the REMOVED
|
||||||
|
* state.
|
||||||
|
*/
|
||||||
|
if (zio->io_vd == vd && !vdev_accessible(vd, zio))
|
||||||
|
return (B_FALSE);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Ignore checksum errors for reads from DTL regions of
|
||||||
|
* leaf vdevs.
|
||||||
|
*/
|
||||||
|
if (zio->io_type == ZIO_TYPE_READ &&
|
||||||
|
zio->io_error == ECKSUM &&
|
||||||
|
vd->vdev_ops->vdev_op_leaf &&
|
||||||
|
vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
|
||||||
|
return (B_FALSE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For probe failure, we want to avoid posting ereports if we've
|
||||||
|
* already removed the device in the meantime.
|
||||||
|
*/
|
||||||
|
if (vd != NULL &&
|
||||||
|
strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
|
||||||
|
(vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
|
||||||
|
return (B_FALSE);
|
||||||
|
|
||||||
|
/* Ignore bogus delay events (like from ioctls or unqueued IOs) */
|
||||||
|
if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
|
||||||
|
(zio != NULL) && (!zio->io_timestamp)) {
|
||||||
|
return (B_FALSE);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return (B_TRUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Return 0 if event was posted, EINVAL if there was a problem posting it or
|
||||||
|
* EBUSY if the event was rate limited.
|
||||||
|
*/
|
||||||
|
int
|
||||||
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
|
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
|
||||||
const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
|
const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
|
||||||
uint64_t size)
|
uint64_t size)
|
||||||
{
|
{
|
||||||
|
int rc = 0;
|
||||||
#ifdef _KERNEL
|
#ifdef _KERNEL
|
||||||
nvlist_t *ereport = NULL;
|
nvlist_t *ereport = NULL;
|
||||||
nvlist_t *detector = NULL;
|
nvlist_t *detector = NULL;
|
||||||
|
|
||||||
if (zfs_is_ratelimiting_event(subclass, vd))
|
if (zfs_is_ratelimiting_event(subclass, vd))
|
||||||
return;
|
return (SET_ERROR(EBUSY));
|
||||||
|
|
||||||
zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
|
if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
|
||||||
zb, zio, stateoroffset, size);
|
zb, zio, stateoroffset, size))
|
||||||
|
return (SET_ERROR(EINVAL)); /* couldn't post event */
|
||||||
|
|
||||||
if (ereport == NULL)
|
if (ereport == NULL)
|
||||||
return;
|
return (SET_ERROR(EINVAL));
|
||||||
|
|
||||||
/* Cleanup is handled by the callback function */
|
/* Cleanup is handled by the callback function */
|
||||||
zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
|
rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
|
||||||
#endif
|
#endif
|
||||||
|
return (rc);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -795,7 +824,6 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
|
||||||
{
|
{
|
||||||
zio_cksum_report_t *report;
|
zio_cksum_report_t *report;
|
||||||
|
|
||||||
|
|
||||||
#ifdef _KERNEL
|
#ifdef _KERNEL
|
||||||
if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
|
if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
|
||||||
return;
|
return;
|
||||||
|
@ -874,30 +902,34 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
int
|
||||||
zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
|
zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
|
||||||
struct zio *zio, uint64_t offset, uint64_t length,
|
struct zio *zio, uint64_t offset, uint64_t length,
|
||||||
const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
|
const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
|
||||||
{
|
{
|
||||||
|
int rc = 0;
|
||||||
#ifdef _KERNEL
|
#ifdef _KERNEL
|
||||||
nvlist_t *ereport = NULL;
|
nvlist_t *ereport = NULL;
|
||||||
nvlist_t *detector = NULL;
|
nvlist_t *detector = NULL;
|
||||||
zfs_ecksum_info_t *info;
|
zfs_ecksum_info_t *info;
|
||||||
|
|
||||||
zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
|
if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
|
||||||
spa, vd, zb, zio, offset, length);
|
return (EBUSY);
|
||||||
|
|
||||||
if (ereport == NULL)
|
if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
|
||||||
return;
|
spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
|
||||||
|
return (SET_ERROR(EINVAL));
|
||||||
|
}
|
||||||
|
|
||||||
info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
|
info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
|
||||||
B_FALSE);
|
B_FALSE);
|
||||||
|
|
||||||
if (info != NULL) {
|
if (info != NULL) {
|
||||||
zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
|
rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
|
||||||
kmem_free(info, sizeof (*info));
|
kmem_free(info, sizeof (*info));
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
return (rc);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1043,6 +1075,7 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
|
||||||
|
|
||||||
#if defined(_KERNEL)
|
#if defined(_KERNEL)
|
||||||
EXPORT_SYMBOL(zfs_ereport_post);
|
EXPORT_SYMBOL(zfs_ereport_post);
|
||||||
|
EXPORT_SYMBOL(zfs_ereport_is_valid);
|
||||||
EXPORT_SYMBOL(zfs_ereport_post_checksum);
|
EXPORT_SYMBOL(zfs_ereport_post_checksum);
|
||||||
EXPORT_SYMBOL(zfs_post_remove);
|
EXPORT_SYMBOL(zfs_post_remove);
|
||||||
EXPORT_SYMBOL(zfs_post_autoreplace);
|
EXPORT_SYMBOL(zfs_post_autoreplace);
|
||||||
|
|
|
@ -77,7 +77,8 @@ uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
||||||
uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int zio_delay_max = ZIO_DELAY_MAX;
|
/* Mark IOs as "slow" if they take longer than 30 seconds */
|
||||||
|
int zio_slow_io_ms = (30 * MILLISEC);
|
||||||
|
|
||||||
#define BP_SPANB(indblkshift, level) \
|
#define BP_SPANB(indblkshift, level) \
|
||||||
(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
|
(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
|
||||||
|
@ -4431,10 +4432,28 @@ zio_done(zio_t *zio)
|
||||||
* 30 seconds to complete, post an error described the I/O delay.
|
* 30 seconds to complete, post an error described the I/O delay.
|
||||||
* We ignore these errors if the device is currently unavailable.
|
* We ignore these errors if the device is currently unavailable.
|
||||||
*/
|
*/
|
||||||
if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) {
|
if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
|
||||||
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
|
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
|
||||||
zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
|
/*
|
||||||
zio->io_vd, &zio->io_bookmark, zio, 0, 0);
|
* We want to only increment our slow IO counters if
|
||||||
|
* the IO is valid (i.e. not if the drive is removed).
|
||||||
|
*
|
||||||
|
* zfs_ereport_post() will also do these checks, but
|
||||||
|
* it can also ratelimit and have other failures, so we
|
||||||
|
* need to increment the slow_io counters independent
|
||||||
|
* of it.
|
||||||
|
*/
|
||||||
|
if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
|
||||||
|
zio->io_spa, zio->io_vd, zio)) {
|
||||||
|
mutex_enter(&zio->io_vd->vdev_stat_lock);
|
||||||
|
zio->io_vd->vdev_stat.vs_slow_ios++;
|
||||||
|
mutex_exit(&zio->io_vd->vdev_stat_lock);
|
||||||
|
|
||||||
|
zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
|
||||||
|
zio->io_spa, zio->io_vd, &zio->io_bookmark,
|
||||||
|
zio, 0, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (zio->io_error) {
|
if (zio->io_error) {
|
||||||
|
@ -4823,8 +4842,9 @@ EXPORT_SYMBOL(zio_data_buf_alloc);
|
||||||
EXPORT_SYMBOL(zio_buf_free);
|
EXPORT_SYMBOL(zio_buf_free);
|
||||||
EXPORT_SYMBOL(zio_data_buf_free);
|
EXPORT_SYMBOL(zio_data_buf_free);
|
||||||
|
|
||||||
module_param(zio_delay_max, int, 0644);
|
module_param(zio_slow_io_ms, int, 0644);
|
||||||
MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
|
MODULE_PARM_DESC(zio_slow_io_ms,
|
||||||
|
"Max I/O completion time (milliseconds) before marking it as slow");
|
||||||
|
|
||||||
module_param(zio_requeue_io_start_cut_in_line, int, 0644);
|
module_param(zio_requeue_io_start_cut_in_line, int, 0644);
|
||||||
MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
|
MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
|
||||||
|
|
|
@ -546,7 +546,7 @@ tags = ['functional', 'exec']
|
||||||
tests = ['auto_online_001_pos', 'auto_replace_001_pos', 'auto_spare_001_pos',
|
tests = ['auto_online_001_pos', 'auto_replace_001_pos', 'auto_spare_001_pos',
|
||||||
'auto_spare_002_pos', 'auto_spare_ashift', 'auto_spare_multiple',
|
'auto_spare_002_pos', 'auto_spare_ashift', 'auto_spare_multiple',
|
||||||
'auto_spare_shared', 'scrub_after_resilver', 'decrypt_fault',
|
'auto_spare_shared', 'scrub_after_resilver', 'decrypt_fault',
|
||||||
'decompress_fault']
|
'decompress_fault','zpool_status_-s']
|
||||||
tags = ['functional', 'fault']
|
tags = ['functional', 'fault']
|
||||||
|
|
||||||
[tests/functional/features/async_destroy]
|
[tests/functional/features/async_destroy]
|
||||||
|
|
|
@ -11,7 +11,8 @@ dist_pkgdata_SCRIPTS = \
|
||||||
auto_spare_shared.ksh \
|
auto_spare_shared.ksh \
|
||||||
decrypt_fault.ksh \
|
decrypt_fault.ksh \
|
||||||
decompress_fault.ksh \
|
decompress_fault.ksh \
|
||||||
scrub_after_resilver.ksh
|
scrub_after_resilver.ksh \
|
||||||
|
zpool_status_-s.ksh
|
||||||
|
|
||||||
dist_pkgdata_DATA = \
|
dist_pkgdata_DATA = \
|
||||||
fault.cfg
|
fault.cfg
|
||||||
|
|
|
@ -0,0 +1,77 @@
|
||||||
|
#!/bin/ksh -p
|
||||||
|
#
|
||||||
|
# CDDL HEADER START
|
||||||
|
#
|
||||||
|
# The contents of this file are subject to the terms of the
|
||||||
|
# Common Development and Distribution License (the "License").
|
||||||
|
# You may not use this file except in compliance with the License.
|
||||||
|
#
|
||||||
|
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||||
|
# or http://www.opensolaris.org/os/licensing.
|
||||||
|
# See the License for the specific language governing permissions
|
||||||
|
# and limitations under the License.
|
||||||
|
#
|
||||||
|
# When distributing Covered Code, include this CDDL HEADER in each
|
||||||
|
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||||
|
# If applicable, add the following below this CDDL HEADER, with the
|
||||||
|
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||||
|
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||||
|
#
|
||||||
|
# CDDL HEADER END
|
||||||
|
#
|
||||||
|
|
||||||
|
#
|
||||||
|
# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
|
||||||
|
#
|
||||||
|
|
||||||
|
# DESCRIPTION:
|
||||||
|
# Verify zpool status -s (slow IOs) works
|
||||||
|
#
|
||||||
|
# STRATEGY:
|
||||||
|
# 1. Create a file
|
||||||
|
# 2. Inject slow IOs into the pool
|
||||||
|
# 3. Verify we can see the slow IOs with "zpool status -s".
|
||||||
|
# 4. Verify we can see delay events.
|
||||||
|
#
|
||||||
|
|
||||||
|
. $STF_SUITE/include/libtest.shlib
|
||||||
|
. $STF_SUITE/include/zpool_script.shlib
|
||||||
|
|
||||||
|
DISK=${DISKS%% *}
|
||||||
|
|
||||||
|
verify_runnable "both"
|
||||||
|
|
||||||
|
log_must zpool create $TESTPOOL mirror ${DISKS}
|
||||||
|
|
||||||
|
function cleanup
|
||||||
|
{
|
||||||
|
log_must zinject -c all
|
||||||
|
log_must set_tunable64 zio_slow_io_ms $OLD_SLOW_IO
|
||||||
|
log_must set_tunable64 zfs_slow_io_events_per_second $OLD_SLOW_IO_EVENTS
|
||||||
|
log_must destroy_pool $TESTPOOL
|
||||||
|
}
|
||||||
|
|
||||||
|
log_onexit cleanup
|
||||||
|
|
||||||
|
log_must zpool events -c
|
||||||
|
|
||||||
|
# Mark any IOs greater than 10ms as slow IOs
|
||||||
|
OLD_SLOW_IO=$(get_tunable zio_slow_io_ms)
|
||||||
|
OLD_SLOW_IO_EVENTS=$(get_tunable zfs_slow_io_events_per_second)
|
||||||
|
log_must set_tunable64 zio_slow_io_ms 10
|
||||||
|
log_must set_tunable64 zfs_slow_io_events_per_second 1000
|
||||||
|
|
||||||
|
# Create 20ms IOs
|
||||||
|
log_must zinject -d $DISK -D20:100 $TESTPOOL
|
||||||
|
log_must mkfile 1048576 /$TESTPOOL/testfile
|
||||||
|
log_must zpool sync $TESTPOOL
|
||||||
|
|
||||||
|
log_must zinject -c all
|
||||||
|
SLOW_IOS=$(zpool status -sp | grep "$DISK" | awk '{print $6}')
|
||||||
|
DELAY_EVENTS=$(zpool events | grep delay | wc -l)
|
||||||
|
|
||||||
|
if [ $SLOW_IOS -gt 0 ] && [ $DELAY_EVENTS -gt 0 ] ; then
|
||||||
|
log_pass "Correctly saw $SLOW_IOS slow IOs and $DELAY_EVENTS delay events"
|
||||||
|
else
|
||||||
|
log_fail "Only saw $SLOW_IOS slow IOs and $DELAY_EVENTS delay events"
|
||||||
|
fi
|
|
@ -410,7 +410,7 @@ function get_system_config
|
||||||
zfs_vdev_async_write_max_active \
|
zfs_vdev_async_write_max_active \
|
||||||
zfs_vdev_sync_read_max_active \
|
zfs_vdev_sync_read_max_active \
|
||||||
zfs_vdev_sync_write_max_active \
|
zfs_vdev_sync_write_max_active \
|
||||||
zio_delay_max
|
zio_slow_io_ms
|
||||||
do
|
do
|
||||||
if [ "$tunable" != "zfs_arc_max" ]
|
if [ "$tunable" != "zfs_arc_max" ]
|
||||||
then
|
then
|
||||||
|
|
Loading…
Reference in New Issue