Extend deadman logic

The intent of this patch is extend the existing deadman code such that it's flexible enough to be used by both ztest and on production systems. The proposed changes include: * Added a new `zfs_deadman_failmode` module option which is used to dynamically control the behavior of the deadman. It's loosely modeled after, but independant from, the pool failmode property. It can be set to wait, continue, or panic. * wait - Wait for the "hung" I/O (default) * continue - Attempt to recover from a "hung" I/O * panic - Panic the system * Added a new `zfs_deadman_ziotime_ms` module option which is analogous to `zfs_deadman_synctime_ms` except instead of applying to a pool TXG sync it applies to zio_wait(). A default value of 300s is used to define a "hung" zio. * The ztest deadman thread has been re-enabled by default, aligned with the upstream OpenZFS code, and then extended to terminate the process when it takes significantly longer to complete than expected. * The -G option was added to ztest to print the internal debug log when a fatal error is encountered. This same option was previously added to zdb in commit fa603f82. Update zloop.sh to unconditionally pass -G to obtain additional debugging. * The FM_EREPORT_ZFS_DELAY event which was previously posted when the deadman detect a "hung" pool has been replaced by a new dedicated FM_EREPORT_ZFS_DEADMAN event. * The proposed recovery logic attempts to restart a "hung" zio by calling zio_interrupt() on any outstanding leaf zios. We may want to further restrict this to zios in either the ZIO_STAGE_VDEV_IO_START or ZIO_STAGE_VDEV_IO_DONE stages. Calling zio_interrupt() is expected to only be useful for cases when an IO has been submitted to the physical device but for some reasonable the completion callback hasn't been called by the lower layers. This shouldn't be possible but has been observed and may be caused by kernel/driver bugs. * The 'zfs_deadman_synctime_ms' default value was reduced from 1000s to 600s. * Depending on how ztest fails there may be no cache file to move. This should not be considered fatal, collect the logs which are available and carry on. * Add deadman test cases for spa_deadman() and zio_wait(). * Increase default zfs_deadman_checktime_ms to 60s. Reviewed-by: Tim Chase <tim@chase2k.com> Reviewed by: Thomas Caputi <tcaputi@datto.com> Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #6999
2017-12-18 17:06:07 -05:00 · 2017-12-18 17:06:07 -05:00 · 8fb1ede146
parent 1b18c6d791
commit 8fb1ede146
21 changed files with 583 additions and 58 deletions
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@ -197,6 +197,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = {

 extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
+extern unsigned long zfs_deadman_synctime_ms;
 extern int metaslab_preload_limit;
 extern boolean_t zfs_compressed_arc_enabled;
 extern int  zfs_abd_scatter_enabled;
@ -447,6 +448,7 @@ static kmutex_t ztest_vdev_lock;
 static rwlock_t ztest_name_lock;

 static boolean_t ztest_dump_core = B_TRUE;
+static boolean_t ztest_dump_debug_buffer = B_FALSE;
 static boolean_t ztest_exiting;

 /* Global commit callback list */
@ -495,6 +497,16 @@ _umem_logging_init(void)
 	return ("fail,contents"); /* $UMEM_LOGGING setting */
 }

+static void
+dump_debug_buffer(void)
+{
+	if (!ztest_dump_debug_buffer)
+		return;
+
+	(void) printf("\n");
+	zfs_dbgmsg_print("ztest");
+}
+
 #define	BACKTRACE_SZ	100

 static void sig_handler(int signo)
@ -507,6 +519,7 @@ static void sig_handler(int signo)
 	nptrs = backtrace(buffer, BACKTRACE_SZ);
 	backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO);
 #endif
+	dump_debug_buffer();

 	/*
 	 * Restore default action and re-raise signal so SIGSEGV and
@ -544,6 +557,9 @@ fatal(int do_perror, char *message, ...)
 	}
 	(void) fprintf(stderr, "%s\n", buf);
 	fatal_msg = buf;			/* to ease debugging */
+
+	dump_debug_buffer();
+
 	if (ztest_dump_core)
 		abort();
 	exit(3);
@ -641,6 +657,7 @@ usage(boolean_t requested)
 	    "\t[-B alt_ztest (default: <none>)] alternate ztest path\n"
 	    "\t[-o variable=value] ... set global variable to an unsigned\n"
 	    "\t    32-bit integer value\n"
+	    "\t[-G dump zfs_dbgmsg buffer before exiting due to an error\n"
 	    "\t[-h] (print help)\n"
 	    "",
 	    zo->zo_pool,
@ -676,7 +693,7 @@ process_options(int argc, char **argv)
 	bcopy(&ztest_opts_defaults, zo, sizeof (*zo));

 	while ((opt = getopt(argc, argv,
-	    "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:o:")) != EOF) {
+	    "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:o:G")) != EOF) {
 		value = 0;
 		switch (opt) {
 		case 'v':
@ -771,6 +788,9 @@ process_options(int argc, char **argv)
 			if (set_global_var(optarg) != 0)
 				usage(B_FALSE);
 			break;
+		case 'G':
+			ztest_dump_debug_buffer = B_TRUE;
+			break;
 		case 'h':
 			usage(B_TRUE);
 			break;
@ -6224,15 +6244,48 @@ ztest_resume_thread(void *arg)
 	thread_exit();
 }

-#define	GRACE	300
-
-#if 0
 static void
-ztest_deadman_alarm(int sig)
+ztest_deadman_thread(void *arg)
 {
-	fatal(0, "failed to complete within %d seconds of deadline", GRACE);
+	ztest_shared_t *zs = arg;
+	spa_t *spa = ztest_spa;
+	hrtime_t delta, overdue, total = 0;
+
+	for (;;) {
+		delta = zs->zs_thread_stop - zs->zs_thread_start +
+		    MSEC2NSEC(zfs_deadman_synctime_ms);
+
+		(void) poll(NULL, 0, (int)NSEC2MSEC(delta));
+
+		/*
+		 * If the pool is suspended then fail immediately. Otherwise,
+		 * check to see if the pool is making any progress. If
+		 * vdev_deadman() discovers that there hasn't been any recent
+		 * I/Os then it will end up aborting the tests.
+		 */
+		if (spa_suspended(spa) || spa->spa_root_vdev == NULL) {
+			fatal(0, "aborting test after %llu seconds because "
+			    "pool has transitioned to a suspended state.",
+			    zfs_deadman_synctime_ms / 1000);
+		}
+		vdev_deadman(spa->spa_root_vdev, FTAG);
+
+		/*
+		 * If the process doesn't complete within a grace period of
+		 * zfs_deadman_synctime_ms over the expected finish time,
+		 * then it may be hung and is terminated.
+		 */
+		overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms);
+		total += zfs_deadman_synctime_ms / 1000;
+		if (gethrtime() > overdue) {
+			fatal(0, "aborting test after %llu seconds because "
+			    "the process is overdue for termination.", total);
+		}
+
+		(void) printf("ztest has been running for %lld seconds\n",
+		    total);
+	}
 }
-#endif

 static void
 ztest_execute(int test, ztest_info_t *zi, uint64_t id)
@ -6491,13 +6544,13 @@ ztest_run(ztest_shared_t *zs)
 	resume_thread = thread_create(NULL, 0, ztest_resume_thread,
 	    spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);

-#if 0
 	/*
-	 * Set a deadman alarm to abort() if we hang.
+	 * Create a deadman thread and set to panic if we hang.
 	 */
-	signal(SIGALRM, ztest_deadman_alarm);
-	alarm((zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + GRACE);
-#endif
+	(void) thread_create(NULL, 0, ztest_deadman_thread,
+	    zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);
+
+	spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;

 	/*
 	 * Verify that we can safely inquire about about any object,
@ -7047,6 +7100,7 @@ main(int argc, char **argv)
 	(void) setvbuf(stdout, NULL, _IOLBF, 0);

 	dprintf_setup(&argc, argv);
+	zfs_deadman_synctime_ms = 300000;

 	action.sa_handler = sig_handler;
 	sigemptyset(&action.sa_mask);
--- a/configure.ac
+++ b/configure.ac
@ -242,6 +242,7 @@ AC_CONFIG_FILES([
 	tests/zfs-tests/tests/functional/cli_user/zpool_list/Makefile
 	tests/zfs-tests/tests/functional/compression/Makefile
 	tests/zfs-tests/tests/functional/ctime/Makefile
+	tests/zfs-tests/tests/functional/deadman/Makefile
 	tests/zfs-tests/tests/functional/delegate/Makefile
 	tests/zfs-tests/tests/functional/devices/Makefile
 	tests/zfs-tests/tests/functional/events/Makefile
--- a/include/sys/fm/fs/zfs.h
+++ b/include/sys/fm/fs/zfs.h
@ -37,6 +37,7 @@ extern "C" {
 #define	FM_EREPORT_ZFS_IO			"io"
 #define	FM_EREPORT_ZFS_DATA			"data"
 #define	FM_EREPORT_ZFS_DELAY			"delay"
+#define	FM_EREPORT_ZFS_DEADMAN			"deadman"
 #define	FM_EREPORT_ZFS_POOL			"zpool"
 #define	FM_EREPORT_ZFS_DEVICE_UNKNOWN		"vdev.unknown"
 #define	FM_EREPORT_ZFS_DEVICE_OPEN_FAILED	"vdev.open_failed"
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@ -955,12 +955,15 @@ extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
 extern void spa_evicting_os_wait(spa_t *spa);
 extern int spa_max_replication(spa_t *spa);
 extern int spa_prev_software_version(spa_t *spa);
-extern uint8_t spa_get_failmode(spa_t *spa);
+extern uint64_t spa_get_failmode(spa_t *spa);
+extern uint64_t spa_get_deadman_failmode(spa_t *spa);
+extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode);
 extern boolean_t spa_suspended(spa_t *spa);
 extern uint64_t spa_bootfs(spa_t *spa);
 extern uint64_t spa_delegation(spa_t *spa);
 extern objset_t *spa_meta_objset(spa_t *spa);
 extern uint64_t spa_deadman_synctime(spa_t *spa);
+extern uint64_t spa_deadman_ziotime(spa_t *spa);

 /* Miscellaneous support routines */
 extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
@ -1074,6 +1077,10 @@ extern boolean_t spa_debug_enabled(spa_t *spa);
 }

 extern int spa_mode_global;			/* mode, e.g. FREAD | FWRITE */
+extern int zfs_deadman_enabled;
+extern unsigned long zfs_deadman_synctime_ms;
+extern unsigned long zfs_deadman_ziotime_ms;
+extern unsigned long zfs_deadman_checktime_ms;

 #ifdef	__cplusplus
 }
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@ -228,6 +228,7 @@ struct spa {
 	uint64_t	spa_pool_props_object;	/* object for properties */
 	uint64_t	spa_bootfs;		/* default boot filesystem */
 	uint64_t	spa_failmode;		/* failure mode for the pool */
+	uint64_t	spa_deadman_failmode;	/* failure mode for deadman */
 	uint64_t	spa_delegation;		/* delegation on/off */
 	list_t		spa_config_list;	/* previous cache file(s) */
 	/* per-CPU array of root of async I/O: */
@ -270,7 +271,8 @@ struct spa {
 	taskqid_t	spa_deadman_tqid;	/* Task id */
 	uint64_t	spa_deadman_calls;	/* number of deadman calls */
 	hrtime_t	spa_sync_starttime;	/* starting time of spa_sync */
-	uint64_t	spa_deadman_synctime;	/* deadman expiration timer */
+	uint64_t	spa_deadman_synctime;	/* deadman sync expiration */
+	uint64_t	spa_deadman_ziotime;	/* deadman zio expiration */
 	uint64_t	spa_all_vdev_zaps;	/* ZAP of per-vd ZAP obj #s */
 	spa_avz_action_t	spa_avz_action;	/* destroy/rebuild AVZ? */
 	uint64_t	spa_errata;		/* errata issues detected */
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@ -84,7 +84,7 @@ extern void vdev_metaslab_fini(vdev_t *vd);
 extern void vdev_metaslab_set_size(vdev_t *);
 extern void vdev_expand(vdev_t *vd, uint64_t txg);
 extern void vdev_split(vdev_t *vd);
-extern void vdev_deadman(vdev_t *vd);
+extern void vdev_deadman(vdev_t *vd, char *tag);

 extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
 extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@ -317,6 +317,7 @@ extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
 extern void cv_signal(kcondvar_t *cv);
 extern void cv_broadcast(kcondvar_t *cv);

+#define	cv_timedwait_io(cv, mp, at)		cv_timedwait(cv, mp, at)
 #define	cv_timedwait_sig(cv, mp, at)		cv_timedwait(cv, mp, at)
 #define	cv_wait_sig(cv, mp)			cv_wait(cv, mp)
 #define	cv_wait_io(cv, mp)			cv_wait(cv, mp)
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@ -556,6 +556,7 @@ extern void zio_execute(zio_t *zio);
 extern void zio_interrupt(zio_t *zio);
 extern void zio_delay_init(zio_t *zio);
 extern void zio_delay_interrupt(zio_t *zio);
+extern void zio_deadman(zio_t *zio, char *tag);

 extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
 extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
--- a/man/man1/ztest.1
+++ b/man/man1/ztest.1
@ -129,6 +129,10 @@ Total test run time.
 .BI "\-z" " zil_failure_rate" " (default: fail every 2^5 allocs)
 .IP
 Injected failure rate.
+.HP
+.BI "\-G"
+.IP
+Dump zfs_dbgmsg buffer before exiting.
 .SH "EXAMPLES"
 .LP
 To override /tmp as your location for block files, you can use the -f
--- a/man/man5/zfs-events.5
+++ b/man/man5/zfs-events.5
@ -55,7 +55,7 @@ part here.
 \fBchecksum\fR
 .ad
 .RS 12n
-Issued when a checksum error have been detected.
+Issued when a checksum error has been detected.
 .RE

 .sp
@ -76,14 +76,27 @@ Issued when there is an I/O error in a vdev in the pool.
 Issued when there have been data errors in the pool.
 .RE

+.sp
+.ne 2
+.na
+\fBdeadman\fR
+.ad
+.RS 12n
+Issued when an I/O is determined to be "hung", this can be caused by lost
+completion events due to flaky hardware or drivers.  See the
+\fBzfs_deadman_failmode\fR module option description for additional
+information regarding "hung" I/O detection and configuration.
+.RE
+
 .sp
 .ne 2
 .na
 \fBdelay\fR
 .ad
 .RS 12n
-Issued when an I/O was slow to complete as defined by the zio_delay_max module
-option.
+Issued when a completed I/O exceeds the maximum allowed time specified
+by the \fBzio_delay_max\fR module option.  This can be an indicator of
+problems with the underlying storage device.
 .RE

 .sp
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@ -823,14 +823,36 @@ Default value: \fB0\fR.
 .ad
 .RS 12n
 When a pool sync operation takes longer than \fBzfs_deadman_synctime_ms\fR
-milliseconds, a "slow spa_sync" message is logged to the debug log
-(see \fBzfs_dbgmsg_enable\fR).  If \fBzfs_deadman_enabled\fR is set,
-all pending IO operations are also checked and if any haven't completed
-within \fBzfs_deadman_synctime_ms\fR milliseconds, a "SLOW IO" message
-is logged to the debug log and a "delay" system event with the details of
-the hung IO is posted.
+milliseconds, or when an individual I/O takes longer than
+\fBzfs_deadman_ziotime_ms\fR milliseconds, then the operation is considered to
+be "hung".  If \fBzfs_deadman_enabled\fR is set then the deadman behavior is
+invoked as described by the \fBzfs_deadman_failmode\fR module option.
+By default the deadman is enabled and configured to \fBwait\fR which results
+in "hung" I/Os only being logged.  The deadman is automatically disabled
+when a pool gets suspended.
 .sp
-Use \fB1\fR (default) to enable the slow IO check and \fB0\fR to disable.
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_deadman_failmode\fR (charp)
+.ad
+.RS 12n
+Controls the failure behavior when the deadman detects a "hung" I/O.  Valid
+values are \fBwait\fR, \fBcontinue\fR, and \fBpanic\fR.
+.sp
+\fBwait\fR - Wait for a "hung" I/O to complete.  For each "hung" I/O a
+"deadman" event will be posted describing that I/O.
+.sp
+\fBcontinue\fR - Attempt to recover from a "hung" I/O by re-dispatching it
+to the I/O pipeline if possible.
+.sp
+\fBpanic\fR - Panic the system.  This can be used to facilitate an automatic
+fail-over to a properly configured fail-over partner.
+.sp
+Default value: \fBwait\fR.
 .RE

 .sp
@ -839,11 +861,10 @@ Use \fB1\fR (default) to enable the slow IO check and \fB0\fR to disable.
 \fBzfs_deadman_checktime_ms\fR (int)
 .ad
 .RS 12n
-Once a pool sync operation has taken longer than
-\fBzfs_deadman_synctime_ms\fR milliseconds, continue to check for slow
-operations every \fBzfs_deadman_checktime_ms\fR milliseconds.
+Check time in milliseconds. This defines the frequency at which we check
+for hung I/O and potentially invoke the \fBzfs_deadman_failmode\fR behavior.
 .sp
-Default value: \fB5,000\fR.
+Default value: \fB60,000\fR.
 .RE

 .sp
@ -853,12 +874,25 @@ Default value: \fB5,000\fR.
 .ad
 .RS 12n
 Interval in milliseconds after which the deadman is triggered and also
-the interval after which an IO operation is considered to be "hung"
-if \fBzfs_deadman_enabled\fR is set.
-
-See \fBzfs_deadman_enabled\fR.
+the interval after which a pool sync operation is considered to be "hung".
+Once this limit is exceeded the deadman will be invoked every
+\fBzfs_deadman_checktime_ms\fR milliseconds until the pool sync completes.
 .sp
-Default value: \fB1,000,000\fR.
+Default value: \fB600,000\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_deadman_ziotime_ms\fR (ulong)
+.ad
+.RS 12n
+Interval in milliseconds after which the deadman is triggered and an
+individual IO operation is considered to be "hung".  As long as the I/O
+remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR
+milliseconds until the I/O completes.
+.sp
+Default value: \fB300,000\fR.
 .RE

 .sp
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@ -292,24 +292,41 @@ int zfs_free_leak_on_eio = B_FALSE;
 /*
 * Expiration time in milliseconds. This value has two meanings. First it is
 * used to determine when the spa_deadman() logic should fire. By default the
- * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
+ * spa_deadman() will fire if spa_sync() has not completed in 600 seconds.
 * Secondly, the value determines if an I/O is considered "hung". Any I/O that
 * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
- * in a system panic.
+ * in one of three behaviors controlled by zfs_deadman_failmode.
 */
-unsigned long zfs_deadman_synctime_ms = 1000000ULL;
+unsigned long zfs_deadman_synctime_ms = 600000ULL;
+
+/*
+ * This value controls the maximum amount of time zio_wait() will block for an
+ * outstanding IO.  By default this is 300 seconds at which point the "hung"
+ * behavior will be applied as described for zfs_deadman_synctime_ms.
+ */
+unsigned long zfs_deadman_ziotime_ms = 300000ULL;

 /*
 * Check time in milliseconds. This defines the frequency at which we check
 * for hung I/O.
 */
-unsigned long  zfs_deadman_checktime_ms = 5000ULL;
+unsigned long  zfs_deadman_checktime_ms = 60000ULL;

 /*
 * By default the deadman is enabled.
 */
 int zfs_deadman_enabled = 1;

+/*
+ * Controls the behavior of the deadman when it detects a "hung" I/O.
+ * Valid values are zfs_deadman_failmode=<wait|continue|panic>.
+ *
+ * wait     - Wait for the "hung" I/O (default)
+ * continue - Attempt to recover from a "hung" I/O
+ * panic    - Panic the system
+ */
+char *zfs_deadman_failmode = "wait";
+
 /*
 * The worst case is single-sector max-parity RAID-Z blocks, in which
 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
@ -536,7 +553,7 @@ spa_deadman(void *arg)
 	    (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
 	    ++spa->spa_deadman_calls);
 	if (zfs_deadman_enabled)
-		vdev_deadman(spa->spa_root_vdev);
+		vdev_deadman(spa->spa_root_vdev, FTAG);

 	spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
 	    spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
@ -590,6 +607,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	spa->spa_proc_state = SPA_PROC_NONE;

 	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
+	spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
+	spa_set_deadman_failmode(spa, zfs_deadman_failmode);

 	refcount_create(&spa->spa_refcount);
 	spa_config_lock_init(spa);
@ -1681,7 +1700,7 @@ spa_update_dspace(spa_t *spa)
 * Return the failure mode that has been set to this pool. The default
 * behavior will be to block all I/Os when a complete failure occurs.
 */
-uint8_t
+uint64_t
 spa_get_failmode(spa_t *spa)
 {
 	return (spa->spa_failmode);
@ -1770,6 +1789,31 @@ spa_deadman_synctime(spa_t *spa)
 	return (spa->spa_deadman_synctime);
 }

+uint64_t
+spa_deadman_ziotime(spa_t *spa)
+{
+	return (spa->spa_deadman_ziotime);
+}
+
+uint64_t
+spa_get_deadman_failmode(spa_t *spa)
+{
+	return (spa->spa_deadman_failmode);
+}
+
+void
+spa_set_deadman_failmode(spa_t *spa, const char *failmode)
+{
+	if (strcmp(failmode, "wait") == 0)
+		spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT;
+	else if (strcmp(failmode, "continue") == 0)
+		spa->spa_deadman_failmode = ZIO_FAILURE_MODE_CONTINUE;
+	else if (strcmp(failmode, "panic") == 0)
+		spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
+	else
+		spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT;
+}
+
 uint64_t
 dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
 {
@ -2106,6 +2150,33 @@ spa_get_hostid(void)
 }

 #if defined(_KERNEL) && defined(HAVE_SPL)
+
+#include <linux/mod_compat.h>
+
+static int
+param_set_deadman_failmode(const char *val, zfs_kernel_param_t *kp)
+{
+	spa_t *spa = NULL;
+	char *p;
+
+	if (val == NULL)
+		return (SET_ERROR(-EINVAL));
+
+	if ((p = strchr(val, '\n')) != NULL)
+		*p = '\0';
+
+	if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 &&
+	    strcmp(val, "panic"))
+		return (SET_ERROR(-EINVAL));
+
+	mutex_enter(&spa_namespace_lock);
+	while ((spa = spa_next(spa)) != NULL)
+		spa_set_deadman_failmode(spa, val);
+	mutex_exit(&spa_namespace_lock);
+
+	return (param_set_charp(val, kp));
+}
+
 /* Namespace manipulation */
 EXPORT_SYMBOL(spa_lookup);
 EXPORT_SYMBOL(spa_add);
@ -2196,7 +2267,12 @@ MODULE_PARM_DESC(zfs_free_leak_on_eio,
 	"Set to ignore IO errors during free and permanently leak the space");

 module_param(zfs_deadman_synctime_ms, ulong, 0644);
-MODULE_PARM_DESC(zfs_deadman_synctime_ms, "Expiration time in milliseconds");
+MODULE_PARM_DESC(zfs_deadman_synctime_ms,
+	"Pool sync expiration time in milliseconds");
+
+module_param(zfs_deadman_ziotime_ms, ulong, 0644);
+MODULE_PARM_DESC(zfs_deadman_ziotime_ms,
+	"IO expiration time in milliseconds");

 module_param(zfs_deadman_checktime_ms, ulong, 0644);
 MODULE_PARM_DESC(zfs_deadman_checktime_ms,
@ -2205,6 +2281,10 @@ MODULE_PARM_DESC(zfs_deadman_checktime_ms,
 module_param(zfs_deadman_enabled, int, 0644);
 MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer");

+module_param_call(zfs_deadman_failmode, param_set_deadman_failmode,
+    param_get_charp, &zfs_deadman_failmode, 0644);
+MODULE_PARM_DESC(zfs_deadman_failmode, "Failmode for deadman timer");
+
 module_param(spa_asize_inflation, int, 0644);
 MODULE_PARM_DESC(spa_asize_inflation,
 	"SPA size estimate multiplication factor");
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@ -3707,12 +3707,12 @@ vdev_split(vdev_t *vd)
 }

 void
-vdev_deadman(vdev_t *vd)
+vdev_deadman(vdev_t *vd, char *tag)
 {
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];

-		vdev_deadman(cvd);
+		vdev_deadman(cvd, tag);
 	}

 	if (vd->vdev_ops->vdev_op_leaf) {
@ -3724,21 +3724,18 @@ vdev_deadman(vdev_t *vd)
 			zio_t *fio;
 			uint64_t delta;

+			zfs_dbgmsg("slow vdev: %s has %d active IOs",
+			    vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
+
 			/*
 			 * Look at the head of all the pending queues,
 			 * if any I/O has been outstanding for longer than
-			 * the spa_deadman_synctime we log a zevent.
+			 * the spa_deadman_synctime invoke the deadman logic.
 			 */
 			fio = avl_first(&vq->vq_active_tree);
 			delta = gethrtime() - fio->io_timestamp;
-			if (delta > spa_deadman_synctime(spa)) {
-				zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
-				    "delta %lluns, last io %lluns",
-				    fio->io_timestamp, delta,
-				    vq->vq_io_complete_ts);
-				zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
-				    spa, vd, &fio->io_bookmark, fio, 0, 0);
-			}
+			if (delta > spa_deadman_synctime(spa))
+				zio_deadman(fio, tag);
 		}
 		mutex_exit(&vq->vq_lock);
 	}
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@ -1748,6 +1748,80 @@ zio_delay_interrupt(zio_t *zio)
 	zio_interrupt(zio);
 }

+static void
+zio_deadman_impl(zio_t *pio)
+{
+	zio_t *cio, *cio_next;
+	zio_link_t *zl = NULL;
+	vdev_t *vd = pio->io_vd;
+
+	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
+		vdev_queue_t *vq = &vd->vdev_queue;
+		zbookmark_phys_t *zb = &pio->io_bookmark;
+		uint64_t delta = gethrtime() - pio->io_timestamp;
+		uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);
+
+		zfs_dbgmsg("slow zio: zio=%p timestamp=%llu "
+		    "delta=%llu queued=%llu io=%llu "
+		    "path=%s last=%llu "
+		    "type=%d priority=%d flags=0x%x "
+		    "stage=0x%x pipeline=0x%x pipeline-trace=0x%x "
+		    "objset=%llu object=%llu level=%llu blkid=%llu "
+		    "offset=%llu size=%llu error=%d",
+		    pio, pio->io_timestamp,
+		    delta, pio->io_delta, pio->io_delay,
+		    vd->vdev_path, vq->vq_io_complete_ts,
+		    pio->io_type, pio->io_priority, pio->io_flags,
+		    pio->io_state, pio->io_pipeline, pio->io_pipeline_trace,
+		    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+		    pio->io_offset, pio->io_size, pio->io_error);
+		zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
+		    pio->io_spa, vd, zb, pio, 0, 0);
+
+		if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
+		    taskq_empty_ent(&pio->io_tqent)) {
+			zio_interrupt(pio);
+		}
+	}
+
+	mutex_enter(&pio->io_lock);
+	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+		cio_next = zio_walk_children(pio, &zl);
+		zio_deadman_impl(cio);
+	}
+	mutex_exit(&pio->io_lock);
+}
+
+/*
+ * Log the critical information describing this zio and all of its children
+ * using the zfs_dbgmsg() interface then post deadman event for the ZED.
+ */
+void
+zio_deadman(zio_t *pio, char *tag)
+{
+	spa_t *spa = pio->io_spa;
+	char *name = spa_name(spa);
+
+	if (!zfs_deadman_enabled || spa_suspended(spa))
+		return;
+
+	zio_deadman_impl(pio);
+
+	switch (spa_get_deadman_failmode(spa)) {
+	case ZIO_FAILURE_MODE_WAIT:
+		zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name);
+		break;
+
+	case ZIO_FAILURE_MODE_CONTINUE:
+		zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name);
+		break;
+
+	case ZIO_FAILURE_MODE_PANIC:
+		fm_panic("%s determined I/O to pool '%s' is hung.", tag, name);
+		break;
+	}
+}
+
 /*
 * Execute the I/O pipeline until one of the following occurs:
 * (1) the I/O completes; (2) the pipeline stalls waiting for
@ -1877,6 +1951,7 @@ __zio_execute(zio_t *zio)
 int
 zio_wait(zio_t *zio)
 {
+	long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms);
 	int error;

 	ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN);
@ -1889,8 +1964,19 @@ zio_wait(zio_t *zio)
 	__zio_execute(zio);

 	mutex_enter(&zio->io_lock);
-	while (zio->io_executor != NULL)
-		cv_wait_io(&zio->io_cv, &zio->io_lock);
+	while (zio->io_executor != NULL) {
+		error = cv_timedwait_io(&zio->io_cv, &zio->io_lock,
+		    ddi_get_lbolt() + timeout);
+
+		if (zfs_deadman_enabled && error == -1 &&
+		    gethrtime() - zio->io_queued_timestamp >
+		    spa_deadman_ziotime(zio->io_spa)) {
+			mutex_exit(&zio->io_lock);
+			timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms);
+			zio_deadman(zio, FTAG);
+			mutex_enter(&zio->io_lock);
+		}
+	}
 	mutex_exit(&zio->io_lock);

 	error = zio->io_error;
--- a/scripts/zloop.sh
+++ b/scripts/zloop.sh
@ -115,7 +115,10 @@ function store_core
 		or_die mv ztest.ddt "$dest/"
 		or_die mv ztest.out "$dest/"
 		or_die mv "$workdir/ztest*" "$dest/vdev/"
+
+		if [[ -e "$workdir/zpool.cache" ]]; then
 			or_die mv "$workdir/zpool.cache" "$dest/vdev/"
+		fi

 		# check for core
 		if [[ -f "$core" ]]; then
@ -141,8 +144,6 @@ function store_core
 			# Record info in cores logfile
 			echo "*** core @ $coredir/$coreid/$core:" | \
 			    tee -a ztest.cores
-			echo "$corestatus" | tee -a ztest.cores
-			echo "" | tee -a ztest.cores
 		fi
 		echo "continuing..."
 	fi
@ -203,7 +204,7 @@ curtime=$starttime

 # if no timeout was specified, loop forever.
 while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do
-	zopt="-VVVVV"
+	zopt="-G -VVVVV"

 	# start each run with an empty directory
 	workdir="$basedir/$rundir"
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@ -440,6 +440,12 @@ tags = ['functional', 'compression']
 tests = ['ctime_001_pos' ]
 tags = ['functional', 'ctime']

+[tests/functional/deadman]
+tests = ['deadman_sync', 'deadman_zio']
+pre =
+post =
+tags = ['functional', 'deadman']
+
 [tests/functional/delegate]
 tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos',
    'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos',
--- a/tests/zfs-tests/tests/functional/Makefile.am
+++ b/tests/zfs-tests/tests/functional/Makefile.am
@ -12,6 +12,7 @@ SUBDIRS = \
 	cli_user \
 	compression \
 	ctime \
+	deadman \
 	delegate \
 	devices \
 	events \
--- a/tests/zfs-tests/tests/functional/deadman/Makefile.am
+++ b/tests/zfs-tests/tests/functional/deadman/Makefile.am
@ -0,0 +1,5 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/deadman
+dist_pkgdata_SCRIPTS = \
+	deadman.cfg \
+	deadman_sync.ksh \
+	deadman_zio.ksh
--- a/tests/zfs-tests/tests/functional/deadman/deadman.cfg
+++ b/tests/zfs-tests/tests/functional/deadman/deadman.cfg
@ -0,0 +1,32 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+# Use is subject to license terms.
+#
+
+export DISK1=${DISKS%% *}
+
+export SYNCTIME_DEFAULT=600000
+export ZIOTIME_DEFAULT=300000
+export CHECKTIME_DEFAULT=60000
+export FAILMODE_DEFAULT="wait"
--- a/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh
+++ b/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh
@ -0,0 +1,86 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+# Use is subject to license terms.
+#
+
+# DESCRIPTION:
+#	Verify spa deadman detects a hung txg
+#
+# STRATEGY:
+#	1. Reduce the zfs_deadman_synctime_ms to 5s.
+#	2. Reduce the zfs_deadman_checktime_ms to 1s.
+#	3. Inject a 10s zio delay to force long IOs.
+#	4. Write enough data to force a long txg sync time due to the delay.
+#	5. Verify a "deadman" event is posted.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/deadman/deadman.cfg
+
+verify_runnable "both"
+
+function cleanup
+{
+	log_must zinject -c all
+	default_cleanup_noexit
+
+	log_must set_tunable64 zfs_deadman_synctime_ms $SYNCTIME_DEFAULT
+	log_must set_tunable64 zfs_deadman_checktime_ms $CHECKTIME_DEFAULT
+	log_must set_tunable64 zfs_deadman_failmode $FAILMODE_DEFAULT
+}
+
+log_assert "Verify spa deadman detects a hung txg"
+log_onexit cleanup
+
+log_must set_tunable64 zfs_deadman_synctime_ms 5000
+log_must set_tunable64 zfs_deadman_checktime_ms 1000
+log_must set_tunable64 zfs_deadman_failmode "wait"
+
+# Create a new pool in order to use the updated deadman settings.
+default_setup_noexit $DISK1
+log_must zpool events -c
+
+# Force each IO to take 10s by allow them to run concurrently.
+log_must zinject -d $DISK1 -D10000:10 $TESTPOOL
+
+mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+log_must file_write -b 1048576 -c 8 -o create -d 0 -f $mntpnt/file
+sleep 10
+
+log_must zinject -c all
+log_must zpool sync
+
+# Log txg sync times for reference and the zpool event summary.
+log_must cat /proc/spl/kstat/zfs/$TESTPOOL/txgs
+log_must zpool events
+
+# Verify at least 5 deadman events were logged.  The first after 5 seconds,
+# and another each second thereafter until the delay  is clearer.
+events=$(zpool events | grep -c ereport.fs.zfs.deadman)
+if [ "$events" -lt 5 ]; then
+	log_fail "Expect >=5 deadman events, $events found"
+fi
+
+log_pass "Verify spa deadman detected a hung txg and $events deadman events"
--- a/tests/zfs-tests/tests/functional/deadman/deadman_zio.ksh
+++ b/tests/zfs-tests/tests/functional/deadman/deadman_zio.ksh
@ -0,0 +1,113 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+# Use is subject to license terms.
+#
+
+# DESCRIPTION:
+#	Verify zio deadman detects a hung zio
+#
+# STRATEGY:
+#	1. Reduce the zfs_deadman_ziotime_ms to 5s.
+#	2. Reduce the zfs_deadman_checktime_ms to 1s.
+#	3. Inject a 10s zio delay to force long IOs.
+#	4. Read an uncached file in the background.
+#	5. Verify a "deadman" event is posted.
+#	6. Inject a 100ms zio delay which is under the 5s allowed.
+#	7. Read an uncached file in the background.
+#	8. Verify a "deadman" event is not posted.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/deadman/deadman.cfg
+
+verify_runnable "both"
+
+function cleanup
+{
+	log_must zinject -c all
+	default_cleanup_noexit
+
+	log_must set_tunable64 zfs_deadman_ziotime_ms $ZIOTIME_DEFAULT
+	log_must set_tunable64 zfs_deadman_checktime_ms $CHECKTIME_DEFAULT
+	log_must set_tunable64 zfs_deadman_failmode $FAILMODE_DEFAULT
+}
+
+log_assert "Verify zio deadman detects a hung zio"
+log_onexit cleanup
+
+# 1. Reduce the zfs_deadman_ziotime_ms to 5s.
+log_must set_tunable64 zfs_deadman_ziotime_ms 5000
+# 2. Reduce the zfs_deadman_checktime_ms to 1s.
+log_must set_tunable64 zfs_deadman_checktime_ms 1000
+log_must set_tunable64 zfs_deadman_failmode "wait"
+
+# Create a new pool in order to use the updated deadman settings.
+default_setup_noexit $DISK1
+
+# Write a file and export/import the pool to clear the ARC.
+mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+log_must file_write -b 1048576 -c 8 -o create -d 0 -f $mntpnt/file1
+log_must file_write -b 1048576 -c 8 -o create -d 0 -f $mntpnt/file2
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+log_must stat -t /$mntpnt/file1
+log_must stat -t /$mntpnt/file2
+
+# 3. Inject a 10s zio delay to force long IOs and serialize them..
+log_must zpool events -c
+log_must zinject -d $DISK1 -D10000:1 $TESTPOOL
+
+# 4. Read an uncached file in the background, it's expected to hang.
+log_must eval "dd if=/$mntpnt/file1 of=/dev/null bs=1048576 &"
+sleep 10
+log_must zinject -c all
+log_must zpool sync
+wait
+
+# 5. Verify a "deadman" event is posted.  The first appears after 5
+# seconds, and another each second thereafter until the delay is cleared.
+events=$(zpool events | grep -c ereport.fs.zfs.deadman)
+if [ "$events" -lt 1 ]; then
+	log_fail "Expect >=1 deadman events, $events found"
+fi
+
+# 6. Inject a 100ms zio delay which is under the 5s allowed, allow them
+# to run concurrently so they don't get starved in the queue.
+log_must zpool events -c
+log_must zinject -d $DISK1 -D100:10 $TESTPOOL
+
+# 7. Read an uncached file in the background.
+log_must eval "dd if=/$mntpnt/file2 of=/dev/null bs=1048576 &"
+sleep 10
+log_must zinject -c all
+wait
+
+# 8. Verify a "deadman" event is not posted.
+events=$(zpool events | grep -c ereport.fs.zfs.deadman)
+if [ "$events" -ne 0 ]; then
+	log_fail "Expect 0 deadman events, $events found"
+fi
+
+log_pass "Verify zio deadman detected a hung zio and $events deadman events"