Extend deadman logic

The intent of this patch is extend the existing deadman code
such that it's flexible enough to be used by both ztest and
on production systems.  The proposed changes include:

* Added a new `zfs_deadman_failmode` module option which is
  used to dynamically control the behavior of the deadman.  It's
  loosely modeled after, but independant from, the pool failmode
  property.  It can be set to wait, continue, or panic.

    * wait     - Wait for the "hung" I/O (default)
    * continue - Attempt to recover from a "hung" I/O
    * panic    - Panic the system

* Added a new `zfs_deadman_ziotime_ms` module option which is
  analogous to `zfs_deadman_synctime_ms` except instead of
  applying to a pool TXG sync it applies to zio_wait().  A
  default value of 300s is used to define a "hung" zio.

* The ztest deadman thread has been re-enabled by default,
  aligned with the upstream OpenZFS code, and then extended
  to terminate the process when it takes significantly longer
  to complete than expected.

* The -G option was added to ztest to print the internal debug
  log when a fatal error is encountered.  This same option was
  previously added to zdb in commit fa603f82.  Update zloop.sh
  to unconditionally pass -G to obtain additional debugging.

* The FM_EREPORT_ZFS_DELAY event which was previously posted
  when the deadman detect a "hung" pool has been replaced by
  a new dedicated FM_EREPORT_ZFS_DEADMAN event.

* The proposed recovery logic attempts to restart a "hung"
  zio by calling zio_interrupt() on any outstanding leaf zios.
  We may want to further restrict this to zios in either the
  ZIO_STAGE_VDEV_IO_START or ZIO_STAGE_VDEV_IO_DONE stages.
  Calling zio_interrupt() is expected to only be useful for
  cases when an IO has been submitted to the physical device
  but for some reasonable the completion callback hasn't been
  called by the lower layers.  This shouldn't be possible but
  has been observed and may be caused by kernel/driver bugs.

* The 'zfs_deadman_synctime_ms' default value was reduced from
  1000s to 600s.

* Depending on how ztest fails there may be no cache file to
  move.  This should not be considered fatal, collect the logs
  which are available and carry on.

* Add deadman test cases for spa_deadman() and zio_wait().

* Increase default zfs_deadman_checktime_ms to 60s.

Reviewed-by: Tim Chase <tim@chase2k.com>
Reviewed by: Thomas Caputi <tcaputi@datto.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #6999
This commit is contained in:
Brian Behlendorf 2017-12-18 17:06:07 -05:00
parent 1b18c6d791
commit 8fb1ede146
21 changed files with 583 additions and 58 deletions

View File

@ -197,6 +197,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = {
extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
extern unsigned long zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled;
@ -447,6 +448,7 @@ static kmutex_t ztest_vdev_lock;
static rwlock_t ztest_name_lock;
static boolean_t ztest_dump_core = B_TRUE;
static boolean_t ztest_dump_debug_buffer = B_FALSE;
static boolean_t ztest_exiting;
/* Global commit callback list */
@ -495,6 +497,16 @@ _umem_logging_init(void)
return ("fail,contents"); /* $UMEM_LOGGING setting */
}
static void
dump_debug_buffer(void)
{
if (!ztest_dump_debug_buffer)
return;
(void) printf("\n");
zfs_dbgmsg_print("ztest");
}
#define BACKTRACE_SZ 100
static void sig_handler(int signo)
@ -507,6 +519,7 @@ static void sig_handler(int signo)
nptrs = backtrace(buffer, BACKTRACE_SZ);
backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO);
#endif
dump_debug_buffer();
/*
* Restore default action and re-raise signal so SIGSEGV and
@ -544,6 +557,9 @@ fatal(int do_perror, char *message, ...)
}
(void) fprintf(stderr, "%s\n", buf);
fatal_msg = buf; /* to ease debugging */
dump_debug_buffer();
if (ztest_dump_core)
abort();
exit(3);
@ -641,6 +657,7 @@ usage(boolean_t requested)
"\t[-B alt_ztest (default: <none>)] alternate ztest path\n"
"\t[-o variable=value] ... set global variable to an unsigned\n"
"\t 32-bit integer value\n"
"\t[-G dump zfs_dbgmsg buffer before exiting due to an error\n"
"\t[-h] (print help)\n"
"",
zo->zo_pool,
@ -676,7 +693,7 @@ process_options(int argc, char **argv)
bcopy(&ztest_opts_defaults, zo, sizeof (*zo));
while ((opt = getopt(argc, argv,
"v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:o:")) != EOF) {
"v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:o:G")) != EOF) {
value = 0;
switch (opt) {
case 'v':
@ -771,6 +788,9 @@ process_options(int argc, char **argv)
if (set_global_var(optarg) != 0)
usage(B_FALSE);
break;
case 'G':
ztest_dump_debug_buffer = B_TRUE;
break;
case 'h':
usage(B_TRUE);
break;
@ -6224,15 +6244,48 @@ ztest_resume_thread(void *arg)
thread_exit();
}
#define GRACE 300
#if 0
static void
ztest_deadman_alarm(int sig)
ztest_deadman_thread(void *arg)
{
fatal(0, "failed to complete within %d seconds of deadline", GRACE);
ztest_shared_t *zs = arg;
spa_t *spa = ztest_spa;
hrtime_t delta, overdue, total = 0;
for (;;) {
delta = zs->zs_thread_stop - zs->zs_thread_start +
MSEC2NSEC(zfs_deadman_synctime_ms);
(void) poll(NULL, 0, (int)NSEC2MSEC(delta));
/*
* If the pool is suspended then fail immediately. Otherwise,
* check to see if the pool is making any progress. If
* vdev_deadman() discovers that there hasn't been any recent
* I/Os then it will end up aborting the tests.
*/
if (spa_suspended(spa) || spa->spa_root_vdev == NULL) {
fatal(0, "aborting test after %llu seconds because "
"pool has transitioned to a suspended state.",
zfs_deadman_synctime_ms / 1000);
}
vdev_deadman(spa->spa_root_vdev, FTAG);
/*
* If the process doesn't complete within a grace period of
* zfs_deadman_synctime_ms over the expected finish time,
* then it may be hung and is terminated.
*/
overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms);
total += zfs_deadman_synctime_ms / 1000;
if (gethrtime() > overdue) {
fatal(0, "aborting test after %llu seconds because "
"the process is overdue for termination.", total);
}
(void) printf("ztest has been running for %lld seconds\n",
total);
}
}
#endif
static void
ztest_execute(int test, ztest_info_t *zi, uint64_t id)
@ -6491,13 +6544,13 @@ ztest_run(ztest_shared_t *zs)
resume_thread = thread_create(NULL, 0, ztest_resume_thread,
spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);
#if 0
/*
* Set a deadman alarm to abort() if we hang.
* Create a deadman thread and set to panic if we hang.
*/
signal(SIGALRM, ztest_deadman_alarm);
alarm((zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + GRACE);
#endif
(void) thread_create(NULL, 0, ztest_deadman_thread,
zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);
spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
/*
* Verify that we can safely inquire about about any object,
@ -7047,6 +7100,7 @@ main(int argc, char **argv)
(void) setvbuf(stdout, NULL, _IOLBF, 0);
dprintf_setup(&argc, argv);
zfs_deadman_synctime_ms = 300000;
action.sa_handler = sig_handler;
sigemptyset(&action.sa_mask);

View File

@ -242,6 +242,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/cli_user/zpool_list/Makefile
tests/zfs-tests/tests/functional/compression/Makefile
tests/zfs-tests/tests/functional/ctime/Makefile
tests/zfs-tests/tests/functional/deadman/Makefile
tests/zfs-tests/tests/functional/delegate/Makefile
tests/zfs-tests/tests/functional/devices/Makefile
tests/zfs-tests/tests/functional/events/Makefile

View File

@ -37,6 +37,7 @@ extern "C" {
#define FM_EREPORT_ZFS_IO "io"
#define FM_EREPORT_ZFS_DATA "data"
#define FM_EREPORT_ZFS_DELAY "delay"
#define FM_EREPORT_ZFS_DEADMAN "deadman"
#define FM_EREPORT_ZFS_POOL "zpool"
#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown"
#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed"

View File

@ -955,12 +955,15 @@ extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
extern void spa_evicting_os_wait(spa_t *spa);
extern int spa_max_replication(spa_t *spa);
extern int spa_prev_software_version(spa_t *spa);
extern uint8_t spa_get_failmode(spa_t *spa);
extern uint64_t spa_get_failmode(spa_t *spa);
extern uint64_t spa_get_deadman_failmode(spa_t *spa);
extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode);
extern boolean_t spa_suspended(spa_t *spa);
extern uint64_t spa_bootfs(spa_t *spa);
extern uint64_t spa_delegation(spa_t *spa);
extern objset_t *spa_meta_objset(spa_t *spa);
extern uint64_t spa_deadman_synctime(spa_t *spa);
extern uint64_t spa_deadman_ziotime(spa_t *spa);
/* Miscellaneous support routines */
extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
@ -1074,6 +1077,10 @@ extern boolean_t spa_debug_enabled(spa_t *spa);
}
extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */
extern int zfs_deadman_enabled;
extern unsigned long zfs_deadman_synctime_ms;
extern unsigned long zfs_deadman_ziotime_ms;
extern unsigned long zfs_deadman_checktime_ms;
#ifdef __cplusplus
}

View File

@ -228,6 +228,7 @@ struct spa {
uint64_t spa_pool_props_object; /* object for properties */
uint64_t spa_bootfs; /* default boot filesystem */
uint64_t spa_failmode; /* failure mode for the pool */
uint64_t spa_deadman_failmode; /* failure mode for deadman */
uint64_t spa_delegation; /* delegation on/off */
list_t spa_config_list; /* previous cache file(s) */
/* per-CPU array of root of async I/O: */
@ -270,7 +271,8 @@ struct spa {
taskqid_t spa_deadman_tqid; /* Task id */
uint64_t spa_deadman_calls; /* number of deadman calls */
hrtime_t spa_sync_starttime; /* starting time of spa_sync */
uint64_t spa_deadman_synctime; /* deadman expiration timer */
uint64_t spa_deadman_synctime; /* deadman sync expiration */
uint64_t spa_deadman_ziotime; /* deadman zio expiration */
uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */
spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */
uint64_t spa_errata; /* errata issues detected */

View File

@ -84,7 +84,7 @@ extern void vdev_metaslab_fini(vdev_t *vd);
extern void vdev_metaslab_set_size(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd, char *tag);
extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);

View File

@ -317,6 +317,7 @@ extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
extern void cv_signal(kcondvar_t *cv);
extern void cv_broadcast(kcondvar_t *cv);
#define cv_timedwait_io(cv, mp, at) cv_timedwait(cv, mp, at)
#define cv_timedwait_sig(cv, mp, at) cv_timedwait(cv, mp, at)
#define cv_wait_sig(cv, mp) cv_wait(cv, mp)
#define cv_wait_io(cv, mp) cv_wait(cv, mp)

View File

@ -556,6 +556,7 @@ extern void zio_execute(zio_t *zio);
extern void zio_interrupt(zio_t *zio);
extern void zio_delay_init(zio_t *zio);
extern void zio_delay_interrupt(zio_t *zio);
extern void zio_deadman(zio_t *zio, char *tag);
extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);

View File

@ -129,6 +129,10 @@ Total test run time.
.BI "\-z" " zil_failure_rate" " (default: fail every 2^5 allocs)
.IP
Injected failure rate.
.HP
.BI "\-G"
.IP
Dump zfs_dbgmsg buffer before exiting.
.SH "EXAMPLES"
.LP
To override /tmp as your location for block files, you can use the -f

View File

@ -55,7 +55,7 @@ part here.
\fBchecksum\fR
.ad
.RS 12n
Issued when a checksum error have been detected.
Issued when a checksum error has been detected.
.RE
.sp
@ -76,14 +76,27 @@ Issued when there is an I/O error in a vdev in the pool.
Issued when there have been data errors in the pool.
.RE
.sp
.ne 2
.na
\fBdeadman\fR
.ad
.RS 12n
Issued when an I/O is determined to be "hung", this can be caused by lost
completion events due to flaky hardware or drivers. See the
\fBzfs_deadman_failmode\fR module option description for additional
information regarding "hung" I/O detection and configuration.
.RE
.sp
.ne 2
.na
\fBdelay\fR
.ad
.RS 12n
Issued when an I/O was slow to complete as defined by the zio_delay_max module
option.
Issued when a completed I/O exceeds the maximum allowed time specified
by the \fBzio_delay_max\fR module option. This can be an indicator of
problems with the underlying storage device.
.RE
.sp

View File

@ -823,14 +823,36 @@ Default value: \fB0\fR.
.ad
.RS 12n
When a pool sync operation takes longer than \fBzfs_deadman_synctime_ms\fR
milliseconds, a "slow spa_sync" message is logged to the debug log
(see \fBzfs_dbgmsg_enable\fR). If \fBzfs_deadman_enabled\fR is set,
all pending IO operations are also checked and if any haven't completed
within \fBzfs_deadman_synctime_ms\fR milliseconds, a "SLOW IO" message
is logged to the debug log and a "delay" system event with the details of
the hung IO is posted.
milliseconds, or when an individual I/O takes longer than
\fBzfs_deadman_ziotime_ms\fR milliseconds, then the operation is considered to
be "hung". If \fBzfs_deadman_enabled\fR is set then the deadman behavior is
invoked as described by the \fBzfs_deadman_failmode\fR module option.
By default the deadman is enabled and configured to \fBwait\fR which results
in "hung" I/Os only being logged. The deadman is automatically disabled
when a pool gets suspended.
.sp
Use \fB1\fR (default) to enable the slow IO check and \fB0\fR to disable.
Default value: \fB1\fR.
.RE
.sp
.ne 2
.na
\fBzfs_deadman_failmode\fR (charp)
.ad
.RS 12n
Controls the failure behavior when the deadman detects a "hung" I/O. Valid
values are \fBwait\fR, \fBcontinue\fR, and \fBpanic\fR.
.sp
\fBwait\fR - Wait for a "hung" I/O to complete. For each "hung" I/O a
"deadman" event will be posted describing that I/O.
.sp
\fBcontinue\fR - Attempt to recover from a "hung" I/O by re-dispatching it
to the I/O pipeline if possible.
.sp
\fBpanic\fR - Panic the system. This can be used to facilitate an automatic
fail-over to a properly configured fail-over partner.
.sp
Default value: \fBwait\fR.
.RE
.sp
@ -839,11 +861,10 @@ Use \fB1\fR (default) to enable the slow IO check and \fB0\fR to disable.
\fBzfs_deadman_checktime_ms\fR (int)
.ad
.RS 12n
Once a pool sync operation has taken longer than
\fBzfs_deadman_synctime_ms\fR milliseconds, continue to check for slow
operations every \fBzfs_deadman_checktime_ms\fR milliseconds.
Check time in milliseconds. This defines the frequency at which we check
for hung I/O and potentially invoke the \fBzfs_deadman_failmode\fR behavior.
.sp
Default value: \fB5,000\fR.
Default value: \fB60,000\fR.
.RE
.sp
@ -853,12 +874,25 @@ Default value: \fB5,000\fR.
.ad
.RS 12n
Interval in milliseconds after which the deadman is triggered and also
the interval after which an IO operation is considered to be "hung"
if \fBzfs_deadman_enabled\fR is set.
See \fBzfs_deadman_enabled\fR.
the interval after which a pool sync operation is considered to be "hung".
Once this limit is exceeded the deadman will be invoked every
\fBzfs_deadman_checktime_ms\fR milliseconds until the pool sync completes.
.sp
Default value: \fB1,000,000\fR.
Default value: \fB600,000\fR.
.RE
.sp
.ne 2
.na
\fBzfs_deadman_ziotime_ms\fR (ulong)
.ad
.RS 12n
Interval in milliseconds after which the deadman is triggered and an
individual IO operation is considered to be "hung". As long as the I/O
remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR
milliseconds until the I/O completes.
.sp
Default value: \fB300,000\fR.
.RE
.sp

View File

@ -292,24 +292,41 @@ int zfs_free_leak_on_eio = B_FALSE;
/*
* Expiration time in milliseconds. This value has two meanings. First it is
* used to determine when the spa_deadman() logic should fire. By default the
* spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
* spa_deadman() will fire if spa_sync() has not completed in 600 seconds.
* Secondly, the value determines if an I/O is considered "hung". Any I/O that
* has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
* in a system panic.
* in one of three behaviors controlled by zfs_deadman_failmode.
*/
unsigned long zfs_deadman_synctime_ms = 1000000ULL;
unsigned long zfs_deadman_synctime_ms = 600000ULL;
/*
* This value controls the maximum amount of time zio_wait() will block for an
* outstanding IO. By default this is 300 seconds at which point the "hung"
* behavior will be applied as described for zfs_deadman_synctime_ms.
*/
unsigned long zfs_deadman_ziotime_ms = 300000ULL;
/*
* Check time in milliseconds. This defines the frequency at which we check
* for hung I/O.
*/
unsigned long zfs_deadman_checktime_ms = 5000ULL;
unsigned long zfs_deadman_checktime_ms = 60000ULL;
/*
* By default the deadman is enabled.
*/
int zfs_deadman_enabled = 1;
/*
* Controls the behavior of the deadman when it detects a "hung" I/O.
* Valid values are zfs_deadman_failmode=<wait|continue|panic>.
*
* wait - Wait for the "hung" I/O (default)
* continue - Attempt to recover from a "hung" I/O
* panic - Panic the system
*/
char *zfs_deadman_failmode = "wait";
/*
* The worst case is single-sector max-parity RAID-Z blocks, in which
* case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
@ -536,7 +553,7 @@ spa_deadman(void *arg)
(gethrtime() - spa->spa_sync_starttime) / NANOSEC,
++spa->spa_deadman_calls);
if (zfs_deadman_enabled)
vdev_deadman(spa->spa_root_vdev);
vdev_deadman(spa->spa_root_vdev, FTAG);
spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
@ -590,6 +607,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_proc_state = SPA_PROC_NONE;
spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
spa_set_deadman_failmode(spa, zfs_deadman_failmode);
refcount_create(&spa->spa_refcount);
spa_config_lock_init(spa);
@ -1681,7 +1700,7 @@ spa_update_dspace(spa_t *spa)
* Return the failure mode that has been set to this pool. The default
* behavior will be to block all I/Os when a complete failure occurs.
*/
uint8_t
uint64_t
spa_get_failmode(spa_t *spa)
{
return (spa->spa_failmode);
@ -1770,6 +1789,31 @@ spa_deadman_synctime(spa_t *spa)
return (spa->spa_deadman_synctime);
}
uint64_t
spa_deadman_ziotime(spa_t *spa)
{
return (spa->spa_deadman_ziotime);
}
uint64_t
spa_get_deadman_failmode(spa_t *spa)
{
return (spa->spa_deadman_failmode);
}
void
spa_set_deadman_failmode(spa_t *spa, const char *failmode)
{
if (strcmp(failmode, "wait") == 0)
spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT;
else if (strcmp(failmode, "continue") == 0)
spa->spa_deadman_failmode = ZIO_FAILURE_MODE_CONTINUE;
else if (strcmp(failmode, "panic") == 0)
spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
else
spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT;
}
uint64_t
dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
{
@ -2106,6 +2150,33 @@ spa_get_hostid(void)
}
#if defined(_KERNEL) && defined(HAVE_SPL)
#include <linux/mod_compat.h>
static int
param_set_deadman_failmode(const char *val, zfs_kernel_param_t *kp)
{
spa_t *spa = NULL;
char *p;
if (val == NULL)
return (SET_ERROR(-EINVAL));
if ((p = strchr(val, '\n')) != NULL)
*p = '\0';
if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 &&
strcmp(val, "panic"))
return (SET_ERROR(-EINVAL));
mutex_enter(&spa_namespace_lock);
while ((spa = spa_next(spa)) != NULL)
spa_set_deadman_failmode(spa, val);
mutex_exit(&spa_namespace_lock);
return (param_set_charp(val, kp));
}
/* Namespace manipulation */
EXPORT_SYMBOL(spa_lookup);
EXPORT_SYMBOL(spa_add);
@ -2196,7 +2267,12 @@ MODULE_PARM_DESC(zfs_free_leak_on_eio,
"Set to ignore IO errors during free and permanently leak the space");
module_param(zfs_deadman_synctime_ms, ulong, 0644);
MODULE_PARM_DESC(zfs_deadman_synctime_ms, "Expiration time in milliseconds");
MODULE_PARM_DESC(zfs_deadman_synctime_ms,
"Pool sync expiration time in milliseconds");
module_param(zfs_deadman_ziotime_ms, ulong, 0644);
MODULE_PARM_DESC(zfs_deadman_ziotime_ms,
"IO expiration time in milliseconds");
module_param(zfs_deadman_checktime_ms, ulong, 0644);
MODULE_PARM_DESC(zfs_deadman_checktime_ms,
@ -2205,6 +2281,10 @@ MODULE_PARM_DESC(zfs_deadman_checktime_ms,
module_param(zfs_deadman_enabled, int, 0644);
MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer");
module_param_call(zfs_deadman_failmode, param_set_deadman_failmode,
param_get_charp, &zfs_deadman_failmode, 0644);
MODULE_PARM_DESC(zfs_deadman_failmode, "Failmode for deadman timer");
module_param(spa_asize_inflation, int, 0644);
MODULE_PARM_DESC(spa_asize_inflation,
"SPA size estimate multiplication factor");

View File

@ -3707,12 +3707,12 @@ vdev_split(vdev_t *vd)
}
void
vdev_deadman(vdev_t *vd)
vdev_deadman(vdev_t *vd, char *tag)
{
for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
vdev_deadman(cvd);
vdev_deadman(cvd, tag);
}
if (vd->vdev_ops->vdev_op_leaf) {
@ -3724,21 +3724,18 @@ vdev_deadman(vdev_t *vd)
zio_t *fio;
uint64_t delta;
zfs_dbgmsg("slow vdev: %s has %d active IOs",
vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
/*
* Look at the head of all the pending queues,
* if any I/O has been outstanding for longer than
* the spa_deadman_synctime we log a zevent.
* the spa_deadman_synctime invoke the deadman logic.
*/
fio = avl_first(&vq->vq_active_tree);
delta = gethrtime() - fio->io_timestamp;
if (delta > spa_deadman_synctime(spa)) {
zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
"delta %lluns, last io %lluns",
fio->io_timestamp, delta,
vq->vq_io_complete_ts);
zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
spa, vd, &fio->io_bookmark, fio, 0, 0);
}
if (delta > spa_deadman_synctime(spa))
zio_deadman(fio, tag);
}
mutex_exit(&vq->vq_lock);
}

View File

@ -1748,6 +1748,80 @@ zio_delay_interrupt(zio_t *zio)
zio_interrupt(zio);
}
static void
zio_deadman_impl(zio_t *pio)
{
zio_t *cio, *cio_next;
zio_link_t *zl = NULL;
vdev_t *vd = pio->io_vd;
if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
vdev_queue_t *vq = &vd->vdev_queue;
zbookmark_phys_t *zb = &pio->io_bookmark;
uint64_t delta = gethrtime() - pio->io_timestamp;
uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);
zfs_dbgmsg("slow zio: zio=%p timestamp=%llu "
"delta=%llu queued=%llu io=%llu "
"path=%s last=%llu "
"type=%d priority=%d flags=0x%x "
"stage=0x%x pipeline=0x%x pipeline-trace=0x%x "
"objset=%llu object=%llu level=%llu blkid=%llu "
"offset=%llu size=%llu error=%d",
pio, pio->io_timestamp,
delta, pio->io_delta, pio->io_delay,
vd->vdev_path, vq->vq_io_complete_ts,
pio->io_type, pio->io_priority, pio->io_flags,
pio->io_state, pio->io_pipeline, pio->io_pipeline_trace,
zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
pio->io_offset, pio->io_size, pio->io_error);
zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
pio->io_spa, vd, zb, pio, 0, 0);
if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
taskq_empty_ent(&pio->io_tqent)) {
zio_interrupt(pio);
}
}
mutex_enter(&pio->io_lock);
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio, &zl);
zio_deadman_impl(cio);
}
mutex_exit(&pio->io_lock);
}
/*
* Log the critical information describing this zio and all of its children
* using the zfs_dbgmsg() interface then post deadman event for the ZED.
*/
void
zio_deadman(zio_t *pio, char *tag)
{
spa_t *spa = pio->io_spa;
char *name = spa_name(spa);
if (!zfs_deadman_enabled || spa_suspended(spa))
return;
zio_deadman_impl(pio);
switch (spa_get_deadman_failmode(spa)) {
case ZIO_FAILURE_MODE_WAIT:
zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name);
break;
case ZIO_FAILURE_MODE_CONTINUE:
zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name);
break;
case ZIO_FAILURE_MODE_PANIC:
fm_panic("%s determined I/O to pool '%s' is hung.", tag, name);
break;
}
}
/*
* Execute the I/O pipeline until one of the following occurs:
* (1) the I/O completes; (2) the pipeline stalls waiting for
@ -1877,6 +1951,7 @@ __zio_execute(zio_t *zio)
int
zio_wait(zio_t *zio)
{
long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms);
int error;
ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN);
@ -1889,8 +1964,19 @@ zio_wait(zio_t *zio)
__zio_execute(zio);
mutex_enter(&zio->io_lock);
while (zio->io_executor != NULL)
cv_wait_io(&zio->io_cv, &zio->io_lock);
while (zio->io_executor != NULL) {
error = cv_timedwait_io(&zio->io_cv, &zio->io_lock,
ddi_get_lbolt() + timeout);
if (zfs_deadman_enabled && error == -1 &&
gethrtime() - zio->io_queued_timestamp >
spa_deadman_ziotime(zio->io_spa)) {
mutex_exit(&zio->io_lock);
timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms);
zio_deadman(zio, FTAG);
mutex_enter(&zio->io_lock);
}
}
mutex_exit(&zio->io_lock);
error = zio->io_error;

View File

@ -115,7 +115,10 @@ function store_core
or_die mv ztest.ddt "$dest/"
or_die mv ztest.out "$dest/"
or_die mv "$workdir/ztest*" "$dest/vdev/"
if [[ -e "$workdir/zpool.cache" ]]; then
or_die mv "$workdir/zpool.cache" "$dest/vdev/"
fi
# check for core
if [[ -f "$core" ]]; then
@ -141,8 +144,6 @@ function store_core
# Record info in cores logfile
echo "*** core @ $coredir/$coreid/$core:" | \
tee -a ztest.cores
echo "$corestatus" | tee -a ztest.cores
echo "" | tee -a ztest.cores
fi
echo "continuing..."
fi
@ -203,7 +204,7 @@ curtime=$starttime
# if no timeout was specified, loop forever.
while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do
zopt="-VVVVV"
zopt="-G -VVVVV"
# start each run with an empty directory
workdir="$basedir/$rundir"

View File

@ -440,6 +440,12 @@ tags = ['functional', 'compression']
tests = ['ctime_001_pos' ]
tags = ['functional', 'ctime']
[tests/functional/deadman]
tests = ['deadman_sync', 'deadman_zio']
pre =
post =
tags = ['functional', 'deadman']
[tests/functional/delegate]
tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos',
'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos',

View File

@ -12,6 +12,7 @@ SUBDIRS = \
cli_user \
compression \
ctime \
deadman \
delegate \
devices \
events \

View File

@ -0,0 +1,5 @@
pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/deadman
dist_pkgdata_SCRIPTS = \
deadman.cfg \
deadman_sync.ksh \
deadman_zio.ksh

View File

@ -0,0 +1,32 @@
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
# Use is subject to license terms.
#
export DISK1=${DISKS%% *}
export SYNCTIME_DEFAULT=600000
export ZIOTIME_DEFAULT=300000
export CHECKTIME_DEFAULT=60000
export FAILMODE_DEFAULT="wait"

View File

@ -0,0 +1,86 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
# Use is subject to license terms.
#
# DESCRIPTION:
# Verify spa deadman detects a hung txg
#
# STRATEGY:
# 1. Reduce the zfs_deadman_synctime_ms to 5s.
# 2. Reduce the zfs_deadman_checktime_ms to 1s.
# 3. Inject a 10s zio delay to force long IOs.
# 4. Write enough data to force a long txg sync time due to the delay.
# 5. Verify a "deadman" event is posted.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/deadman/deadman.cfg
verify_runnable "both"
function cleanup
{
log_must zinject -c all
default_cleanup_noexit
log_must set_tunable64 zfs_deadman_synctime_ms $SYNCTIME_DEFAULT
log_must set_tunable64 zfs_deadman_checktime_ms $CHECKTIME_DEFAULT
log_must set_tunable64 zfs_deadman_failmode $FAILMODE_DEFAULT
}
log_assert "Verify spa deadman detects a hung txg"
log_onexit cleanup
log_must set_tunable64 zfs_deadman_synctime_ms 5000
log_must set_tunable64 zfs_deadman_checktime_ms 1000
log_must set_tunable64 zfs_deadman_failmode "wait"
# Create a new pool in order to use the updated deadman settings.
default_setup_noexit $DISK1
log_must zpool events -c
# Force each IO to take 10s by allow them to run concurrently.
log_must zinject -d $DISK1 -D10000:10 $TESTPOOL
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
log_must file_write -b 1048576 -c 8 -o create -d 0 -f $mntpnt/file
sleep 10
log_must zinject -c all
log_must zpool sync
# Log txg sync times for reference and the zpool event summary.
log_must cat /proc/spl/kstat/zfs/$TESTPOOL/txgs
log_must zpool events
# Verify at least 5 deadman events were logged. The first after 5 seconds,
# and another each second thereafter until the delay is clearer.
events=$(zpool events | grep -c ereport.fs.zfs.deadman)
if [ "$events" -lt 5 ]; then
log_fail "Expect >=5 deadman events, $events found"
fi
log_pass "Verify spa deadman detected a hung txg and $events deadman events"

View File

@ -0,0 +1,113 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
# Use is subject to license terms.
#
# DESCRIPTION:
# Verify zio deadman detects a hung zio
#
# STRATEGY:
# 1. Reduce the zfs_deadman_ziotime_ms to 5s.
# 2. Reduce the zfs_deadman_checktime_ms to 1s.
# 3. Inject a 10s zio delay to force long IOs.
# 4. Read an uncached file in the background.
# 5. Verify a "deadman" event is posted.
# 6. Inject a 100ms zio delay which is under the 5s allowed.
# 7. Read an uncached file in the background.
# 8. Verify a "deadman" event is not posted.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/deadman/deadman.cfg
verify_runnable "both"
function cleanup
{
log_must zinject -c all
default_cleanup_noexit
log_must set_tunable64 zfs_deadman_ziotime_ms $ZIOTIME_DEFAULT
log_must set_tunable64 zfs_deadman_checktime_ms $CHECKTIME_DEFAULT
log_must set_tunable64 zfs_deadman_failmode $FAILMODE_DEFAULT
}
log_assert "Verify zio deadman detects a hung zio"
log_onexit cleanup
# 1. Reduce the zfs_deadman_ziotime_ms to 5s.
log_must set_tunable64 zfs_deadman_ziotime_ms 5000
# 2. Reduce the zfs_deadman_checktime_ms to 1s.
log_must set_tunable64 zfs_deadman_checktime_ms 1000
log_must set_tunable64 zfs_deadman_failmode "wait"
# Create a new pool in order to use the updated deadman settings.
default_setup_noexit $DISK1
# Write a file and export/import the pool to clear the ARC.
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
log_must file_write -b 1048576 -c 8 -o create -d 0 -f $mntpnt/file1
log_must file_write -b 1048576 -c 8 -o create -d 0 -f $mntpnt/file2
log_must zpool export $TESTPOOL
log_must zpool import $TESTPOOL
log_must stat -t /$mntpnt/file1
log_must stat -t /$mntpnt/file2
# 3. Inject a 10s zio delay to force long IOs and serialize them..
log_must zpool events -c
log_must zinject -d $DISK1 -D10000:1 $TESTPOOL
# 4. Read an uncached file in the background, it's expected to hang.
log_must eval "dd if=/$mntpnt/file1 of=/dev/null bs=1048576 &"
sleep 10
log_must zinject -c all
log_must zpool sync
wait
# 5. Verify a "deadman" event is posted. The first appears after 5
# seconds, and another each second thereafter until the delay is cleared.
events=$(zpool events | grep -c ereport.fs.zfs.deadman)
if [ "$events" -lt 1 ]; then
log_fail "Expect >=1 deadman events, $events found"
fi
# 6. Inject a 100ms zio delay which is under the 5s allowed, allow them
# to run concurrently so they don't get starved in the queue.
log_must zpool events -c
log_must zinject -d $DISK1 -D100:10 $TESTPOOL
# 7. Read an uncached file in the background.
log_must eval "dd if=/$mntpnt/file2 of=/dev/null bs=1048576 &"
sleep 10
log_must zinject -c all
wait
# 8. Verify a "deadman" event is not posted.
events=$(zpool events | grep -c ereport.fs.zfs.deadman)
if [ "$events" -ne 0 ]; then
log_fail "Expect 0 deadman events, $events found"
fi
log_pass "Verify zio deadman detected a hung zio and $events deadman events"