Initial zio delay timing

While there is no right maximum timeout for a disk IO we can start
laying the ground work to measure how long they do take in practice.
This change simply measures the IO time and if it exceeds 30s an
event is posted for 'zpool events'.

This value was carefully selected because for sd devices it implies
that at least one timeout (SD_TIMEOUT) has occured.  Unfortunately,
even with FAILFAST set we may retry and request and not get an
error.  This behavior is strongly dependant on the device driver
and how it is hooked in to the scsi error handling stack.  However
by setting the limit at 30s we can log the event even if no error
was returned.

Slightly longer term we can start recording these delays perhaps
as a simple power-of-two histrogram.  This histogram can then be
reported as part of the 'zpool status' command when given an command
line option.

None of this code changes the internal behavior of ZFS.  Currently
it is simply for reporting excessively long delays.
This commit is contained in:
Brian Behlendorf 2010-10-01 16:54:52 -07:00
parent 2959d94a0a
commit a69052be7f
5 changed files with 34 additions and 3 deletions

View File

@ -35,6 +35,7 @@ extern "C" {
#define FM_EREPORT_ZFS_CHECKSUM "checksum" #define FM_EREPORT_ZFS_CHECKSUM "checksum"
#define FM_EREPORT_ZFS_IO "io" #define FM_EREPORT_ZFS_IO "io"
#define FM_EREPORT_ZFS_DATA "data" #define FM_EREPORT_ZFS_DATA "data"
#define FM_EREPORT_ZFS_DELAY "delay"
#define FM_EREPORT_ZFS_CONFIG_SYNC "config.sync" #define FM_EREPORT_ZFS_CONFIG_SYNC "config.sync"
#define FM_EREPORT_ZFS_POOL "zpool" #define FM_EREPORT_ZFS_POOL "zpool"
#define FM_EREPORT_ZFS_POOL_DESTROY "zpool.destroy" #define FM_EREPORT_ZFS_POOL_DESTROY "zpool.destroy"
@ -81,6 +82,7 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS "zio_flags" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS "zio_flags"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay"
#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" #define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state"
#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected"
#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual"

View File

@ -116,6 +116,11 @@ enum zio_compress {
ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \ ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \
(compress) == ZIO_COMPRESS_OFF) (compress) == ZIO_COMPRESS_OFF)
/*
* Default Linux timeout for a sd device.
*/
#define ZIO_DELAY_MAX (30 * MILLISEC)
#define ZIO_FAILURE_MODE_WAIT 0 #define ZIO_FAILURE_MODE_WAIT 0
#define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_CONTINUE 1
#define ZIO_FAILURE_MODE_PANIC 2 #define ZIO_FAILURE_MODE_PANIC 2
@ -401,6 +406,7 @@ struct zio {
enum zio_flag io_orig_flags; enum zio_flag io_orig_flags;
enum zio_stage io_orig_stage; enum zio_stage io_orig_stage;
enum zio_stage io_orig_pipeline; enum zio_stage io_orig_pipeline;
uint64_t io_delay;
int io_error; int io_error;
int io_child_error[ZIO_CHILD_TYPES]; int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];

View File

@ -95,10 +95,10 @@ static void
vdev_disk_error(zio_t *zio) vdev_disk_error(zio_t *zio)
{ {
#ifdef ZFS_DEBUG #ifdef ZFS_DEBUG
printk("ZFS: zio error=%d type=%d offset=%llu " printk("ZFS: zio error=%d type=%d offset=%llu size=%llu "
"size=%llu flags=%x\n", zio->io_error, zio->io_type, "flags=%x delay=%llu\n", zio->io_error, zio->io_type,
(u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
zio->io_flags); zio->io_flags, (u_longlong_t)zio->io_delay);
#endif #endif
} }
@ -242,6 +242,8 @@ vdev_disk_dio_put(dio_request_t *dr)
vdev_disk_dio_free(dr); vdev_disk_dio_free(dr);
if (zio) { if (zio) {
zio->io_delay = jiffies_to_msecs(
jiffies_64 - zio->io_delay);
zio->io_error = error; zio->io_error = error;
ASSERT3S(zio->io_error, >=, 0); ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error) if (zio->io_error)
@ -403,6 +405,8 @@ retry:
/* Extra reference to protect dio_request during submit_bio */ /* Extra reference to protect dio_request during submit_bio */
vdev_disk_dio_get(dr); vdev_disk_dio_get(dr);
if (zio)
zio->io_delay = jiffies_64;
/* Submit all bio's associated with this dio */ /* Submit all bio's associated with this dio */
for (i = 0; i < dr->dr_bio_count; i++) for (i = 0; i < dr->dr_bio_count; i++)
@ -442,6 +446,7 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, size, rc)
{ {
zio_t *zio = bio->bi_private; zio_t *zio = bio->bi_private;
zio->io_delay = jiffies_to_msecs(jiffies_64 - zio->io_delay);
zio->io_error = -rc; zio->io_error = -rc;
if (rc && (rc == -EOPNOTSUPP)) if (rc && (rc == -EOPNOTSUPP))
zio->io_vd->vdev_nowritecache = B_TRUE; zio->io_vd->vdev_nowritecache = B_TRUE;
@ -472,6 +477,7 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
bio->bi_end_io = vdev_disk_io_flush_completion; bio->bi_end_io = vdev_disk_io_flush_completion;
bio->bi_private = zio; bio->bi_private = zio;
bio->bi_bdev = bdev; bio->bi_bdev = bdev;
zio->io_delay = jiffies_64;
submit_bio(WRITE_BARRIER, bio); submit_bio(WRITE_BARRIER, bio);
return 0; return 0;

View File

@ -290,6 +290,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_INT32, zio->io_error, NULL); DATA_TYPE_INT32, zio->io_error, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
DATA_TYPE_INT32, zio->io_flags, NULL); DATA_TYPE_INT32, zio->io_flags, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
DATA_TYPE_UINT64, zio->io_delay, NULL);
/* /*
* If the 'size' parameter is non-zero, it indicates this is a * If the 'size' parameter is non-zero, it indicates this is a

View File

@ -75,6 +75,7 @@ kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
int zio_bulk_flags = 0; int zio_bulk_flags = 0;
int zio_delay_max = ZIO_DELAY_MAX;
#ifdef _KERNEL #ifdef _KERNEL
extern vmem_t *zio_alloc_arena; extern vmem_t *zio_alloc_arena;
@ -2754,6 +2755,17 @@ zio_done(zio_t *zio)
vdev_stat_update(zio, zio->io_size); vdev_stat_update(zio, zio->io_size);
/*
* If this I/O is attached to a particular vdev is slow, exeeding
* 30 seconds to complete, post an error described the I/O delay.
* We ignore these errors if the device is currently unavailable.
*/
if (zio->io_delay >= zio_delay_max) {
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
zio->io_vd, zio, 0, 0);
}
if (zio->io_error) { if (zio->io_error) {
/* /*
* If this I/O is attached to a particular vdev, * If this I/O is attached to a particular vdev,
@ -2993,4 +3005,7 @@ EXPORT_SYMBOL(zio_type_name);
module_param(zio_bulk_flags, int, 0644); module_param(zio_bulk_flags, int, 0644);
MODULE_PARM_DESC(zio_bulk_flags, "Additional flags to pass to bulk buffers"); MODULE_PARM_DESC(zio_bulk_flags, "Additional flags to pass to bulk buffers");
module_param(zio_delay_max, int, 0644);
MODULE_PARM_DESC(zio_delay_max, "Max zio delay before posting an event (ms)");
#endif #endif