Log I/Os longer than zio_delay_max (30s default)

There have been reports of ZFS deadlocking due to what appears to
be a lost IO.  This patch addes some debugging to determine the
exact state of the IO which neither 1) completed, 2) failed, or
3) timed out after zio_delay_max (30) seconds.

This information will be logged using the ZFS FMA infrastructure
as a 'delay' event and posted to the internal zevent log.  By
default the last 64 events will be kept in the log but the limit
is configurable via the zfs_zevent_len_max module option.

To dump the contents of the log use the 'zpool events -v' command
and look for the resource.fs.zfs.delay event.  It will include
various information about the pool, vdev, and zio which may shed
some light on the issue.

In the context of this change the 120 second kernel blocked thread
watchdog has been disabled for synchronous IOs.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #930
This commit is contained in:
Brian Behlendorf 2012-10-25 13:02:31 -07:00
parent e95853a331
commit 9dcb971983
3 changed files with 28 additions and 10 deletions

View File

@ -83,6 +83,8 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS "zio_flags"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay"
#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state"
#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected"

View File

@ -294,6 +294,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_INT32, zio->io_error, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
DATA_TYPE_INT32, zio->io_flags, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
DATA_TYPE_UINT32, zio->io_stage, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,
DATA_TYPE_UINT32, zio->io_pipeline, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
DATA_TYPE_UINT64, zio->io_delay, NULL);

View File

@ -1305,18 +1305,34 @@ __zio_execute(zio_t *zio)
int
zio_wait(zio_t *zio)
{
uint64_t timeout;
int error;
ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
ASSERT(zio->io_executor == NULL);
zio->io_waiter = curthread;
timeout = ddi_get_lbolt() + (zio_delay_max / MILLISEC * hz);
__zio_execute(zio);
mutex_enter(&zio->io_lock);
while (zio->io_executor != NULL)
cv_wait(&zio->io_cv, &zio->io_lock);
while (zio->io_executor != NULL) {
/*
* Wake up periodically to prevent the kernel from complaining
* about a blocked task. However, check zio_delay_max to see
* if the I/O has exceeded the timeout and post an ereport.
*/
cv_timedwait_interruptible(&zio->io_cv, &zio->io_lock,
ddi_get_lbolt() + hz);
if (timeout && (ddi_get_lbolt() > timeout)) {
zio->io_delay = zio_delay_max;
zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
zio->io_spa, zio->io_vd, zio, 0, 0);
timeout = 0;
}
}
mutex_exit(&zio->io_lock);
error = zio->io_error;
@ -2889,15 +2905,11 @@ zio_done(zio_t *zio)
vdev_stat_update(zio, zio->io_size);
/*
* If this I/O is attached to a particular vdev is slow, exeeding
* 30 seconds to complete, post an error described the I/O delay.
* We ignore these errors if the device is currently unavailable.
* When an I/O completes but was slow post an ereport.
*/
if (zio->io_delay >= zio_delay_max) {
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
zio->io_vd, zio, 0, 0);
}
if (zio->io_delay >= zio_delay_max)
zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
zio->io_vd, zio, 0, 0);
if (zio->io_error) {
/*