3246 ZFS I/O deadman thread

Reviewed by: Matt Ahrens <matthew.ahrens@delphix.com>
Reviewed by: Eric Schrock <eric.schrock@delphix.com>
Reviewed by: Christopher Siden <chris.siden@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>

NOTES: This patch has been reworked from the original in the
following ways to accomidate Linux ZFS implementation

*) Usage of the cyclic interface was replaced by the delayed taskq
   interface.  This avoids the need to implement new compatibility
   code and allows us to rely on the existing taskq implementation.

*) An extern for zfs_txg_synctime_ms was added to sys/dsl_pool.h
   because declaring externs in source files as was done in the
   original patch is just plain wrong.

*) Instead of panicing the system when the deadman triggers a
   zevent describing the blocked vdev and the first pending I/O
   is posted.  If the panic behavior is desired Linux provides
   other generic methods to panic the system when threads are
   observed to hang.

*) For reference, to delay zios by 30 seconds for testing you can
   use zinject as follows: 'zinject -d <vdev> -D30 <pool>'

References:
  illumos/illumos-gate@283b84606b
  https://www.illumos.org/issues/3246

Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1396
This commit is contained in:
George.Wilson 2013-04-29 15:49:23 -07:00 committed by Brian Behlendorf
parent 57f5a2008e
commit cc92e9d0c3
20 changed files with 275 additions and 50 deletions

View File

@ -20,6 +20,7 @@
*/ */
/* /*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/ */
#include <libzfs.h> #include <libzfs.h>
@ -476,6 +477,20 @@ translate_device(const char *pool, const char *device, err_type_t label_type,
&record->zi_guid) == 0); &record->zi_guid) == 0);
} }
/*
* Device faults can take on three different forms:
* 1). delayed or hanging I/O
* 2). zfs label faults
* 3). generic disk faults
*/
if (record->zi_timer != 0) {
record->zi_cmd = ZINJECT_DELAY_IO;
} else if (label_type != TYPE_INVAL) {
record->zi_cmd = ZINJECT_LABEL_FAULT;
} else {
record->zi_cmd = ZINJECT_DEVICE_FAULT;
}
switch (label_type) { switch (label_type) {
default: default:
break; break;

View File

@ -20,6 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/ */
/* /*
@ -235,8 +236,8 @@ usage(void)
"\t\t'pad1', or 'pad2'.\n" "\t\t'pad1', or 'pad2'.\n"
"\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n" "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
"\n" "\n"
"\tzinject -d device -A <degrade|fault> pool\n" "\tzinject -d device -A <degrade|fault> -D <delay secs> pool\n"
"\t\tPerform a specific action on a particular device\n" "\t\tPerform a specific action on a particular device.\n"
"\n" "\n"
"\tzinject -I [-s <seconds> | -g <txgs>] pool\n" "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
"\t\tCause the pool to stop writing blocks yet not\n" "\t\tCause the pool to stop writing blocks yet not\n"
@ -589,7 +590,7 @@ main(int argc, char **argv)
} }
while ((c = getopt(argc, argv, while ((c = getopt(argc, argv,
":aA:b:d:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
switch (c) { switch (c) {
case 'a': case 'a':
flags |= ZINJECT_FLUSH_ARC; flags |= ZINJECT_FLUSH_ARC;
@ -615,6 +616,16 @@ main(int argc, char **argv)
case 'd': case 'd':
device = optarg; device = optarg;
break; break;
case 'D':
errno = 0;
record.zi_timer = strtoull(optarg, &end, 10);
if (errno != 0 || *end != '\0') {
(void) fprintf(stderr, "invalid i/o delay "
"value: '%s'\n", optarg);
usage();
return (1);
}
break;
case 'e': case 'e':
if (strcasecmp(optarg, "io") == 0) { if (strcasecmp(optarg, "io") == 0) {
error = EIO; error = EIO;
@ -679,6 +690,7 @@ main(int argc, char **argv)
case 'p': case 'p':
(void) strlcpy(record.zi_func, optarg, (void) strlcpy(record.zi_func, optarg,
sizeof (record.zi_func)); sizeof (record.zi_func));
record.zi_cmd = ZINJECT_PANIC;
break; break;
case 'q': case 'q':
quiet = 1; quiet = 1;
@ -762,13 +774,15 @@ main(int argc, char **argv)
return (1); return (1);
} }
if (record.zi_duration != 0)
record.zi_cmd = ZINJECT_IGNORED_WRITES;
if (cancel != NULL) { if (cancel != NULL) {
/* /*
* '-c' is invalid with any other options. * '-c' is invalid with any other options.
*/ */
if (raw != NULL || range != NULL || type != TYPE_INVAL || if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || record.zi_func[0] != '\0' || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
record.zi_duration != 0) {
(void) fprintf(stderr, "cancel (-c) incompatible with " (void) fprintf(stderr, "cancel (-c) incompatible with "
"any other options\n"); "any other options\n");
usage(); usage();
@ -800,8 +814,7 @@ main(int argc, char **argv)
* for doing injection, so handle it separately here. * for doing injection, so handle it separately here.
*/ */
if (raw != NULL || range != NULL || type != TYPE_INVAL || if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || record.zi_func[0] != '\0' || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
record.zi_duration != 0) {
(void) fprintf(stderr, "device (-d) incompatible with " (void) fprintf(stderr, "device (-d) incompatible with "
"data error injection\n"); "data error injection\n");
usage(); usage();
@ -835,7 +848,7 @@ main(int argc, char **argv)
} else if (raw != NULL) { } else if (raw != NULL) {
if (range != NULL || type != TYPE_INVAL || level != 0 || if (range != NULL || type != TYPE_INVAL || level != 0 ||
record.zi_func[0] != '\0' || record.zi_duration != 0) { record.zi_cmd != ZINJECT_UNINITIALIZED) {
(void) fprintf(stderr, "raw (-b) format with " (void) fprintf(stderr, "raw (-b) format with "
"any other options\n"); "any other options\n");
usage(); usage();
@ -858,13 +871,14 @@ main(int argc, char **argv)
return (1); return (1);
} }
record.zi_cmd = ZINJECT_DATA_FAULT;
if (translate_raw(raw, &record) != 0) if (translate_raw(raw, &record) != 0)
return (1); return (1);
if (!error) if (!error)
error = EIO; error = EIO;
} else if (record.zi_func[0] != '\0') { } else if (record.zi_cmd == ZINJECT_PANIC) {
if (raw != NULL || range != NULL || type != TYPE_INVAL || if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || device != NULL || record.zi_duration != 0) { level != 0 || device != NULL) {
(void) fprintf(stderr, "panic (-p) incompatible with " (void) fprintf(stderr, "panic (-p) incompatible with "
"other options\n"); "other options\n");
usage(); usage();
@ -882,7 +896,7 @@ main(int argc, char **argv)
if (argv[1] != NULL) if (argv[1] != NULL)
record.zi_type = atoi(argv[1]); record.zi_type = atoi(argv[1]);
dataset[0] = '\0'; dataset[0] = '\0';
} else if (record.zi_duration != 0) { } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
if (nowrites == 0) { if (nowrites == 0) {
(void) fprintf(stderr, "-s or -g meaningless " (void) fprintf(stderr, "-s or -g meaningless "
"without -I (ignore writes)\n"); "without -I (ignore writes)\n");
@ -936,6 +950,7 @@ main(int argc, char **argv)
return (1); return (1);
} }
record.zi_cmd = ZINJECT_DATA_FAULT;
if (translate_record(type, argv[0], range, level, &record, pool, if (translate_record(type, argv[0], range, level, &record, pool,
dataset) != 0) dataset) != 0)
return (1); return (1);

View File

@ -41,6 +41,8 @@
extern "C" { extern "C" {
#endif #endif
extern int zfs_txg_synctime_ms;
struct objset; struct objset;
struct dsl_dir; struct dsl_dir;
struct dsl_dataset; struct dsl_dataset;

View File

@ -73,6 +73,8 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS "vdev_complete_ts"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS "vdev_delta_ts"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"
@ -88,6 +90,9 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DEADLINE "zio_deadline"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA "zio_delta"
#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" #define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state"
#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected"
#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual"

View File

@ -570,6 +570,7 @@ extern int spa_offline_log(spa_t *spa);
/* Log claim callback */ /* Log claim callback */
extern void spa_claim_notify(zio_t *zio); extern void spa_claim_notify(zio_t *zio);
extern void spa_deadman(void *);
/* Accessor functions */ /* Accessor functions */
extern boolean_t spa_shutting_down(spa_t *spa); extern boolean_t spa_shutting_down(spa_t *spa);
@ -604,6 +605,7 @@ extern boolean_t spa_suspended(spa_t *spa);
extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa);
extern uint64_t spa_delegation(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa);
extern objset_t *spa_meta_objset(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa);
extern uint64_t spa_deadman_synctime(spa_t *spa);
/* Miscellaneous support routines */ /* Miscellaneous support routines */
extern void spa_activate_mos_feature(spa_t *spa, const char *feature); extern void spa_activate_mos_feature(spa_t *spa, const char *feature);

View File

@ -227,6 +227,10 @@ struct spa {
uint64_t spa_feat_for_write_obj; /* required to write to pool */ uint64_t spa_feat_for_write_obj; /* required to write to pool */
uint64_t spa_feat_for_read_obj; /* required to read from pool */ uint64_t spa_feat_for_read_obj; /* required to read from pool */
uint64_t spa_feat_desc_obj; /* Feature descriptions */ uint64_t spa_feat_desc_obj; /* Feature descriptions */
taskqid_t spa_deadman_tqid; /* Task id */
uint64_t spa_deadman_calls; /* number of deadman calls */
uint64_t spa_sync_starttime; /* starting time fo spa_sync */
uint64_t spa_deadman_synctime; /* deadman expiration timer */
/* /*
* spa_refcnt & spa_config_lock must be the last elements * spa_refcnt & spa_config_lock must be the last elements
* because refcount_t changes size based on compilation options. * because refcount_t changes size based on compilation options.

View File

@ -78,6 +78,7 @@ extern void vdev_metaslab_fini(vdev_t *vd);
extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_metaslab_set_size(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd); extern void vdev_split(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);

View File

@ -105,6 +105,8 @@ struct vdev_queue {
avl_tree_t vq_read_tree; avl_tree_t vq_read_tree;
avl_tree_t vq_write_tree; avl_tree_t vq_write_tree;
avl_tree_t vq_pending_tree; avl_tree_t vq_pending_tree;
uint64_t vq_io_complete_ts;
uint64_t vq_io_delta_ts;
list_t vq_io_list; list_t vq_io_list;
kmutex_t vq_lock; kmutex_t vq_lock;
}; };

View File

@ -25,6 +25,7 @@
/* /*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/ */
#ifndef _SYS_ZFS_CONTEXT_H #ifndef _SYS_ZFS_CONTEXT_H
@ -400,6 +401,8 @@ extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
#define taskq_create_sysdc(a, b, d, e, p, dc, f) \ #define taskq_create_sysdc(a, b, d, e, p, dc, f) \
(taskq_create(a, b, maxclsyspri, d, e, f)) (taskq_create(a, b, maxclsyspri, d, e, f))
extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t,
clock_t);
extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
taskq_ent_t *); taskq_ent_t *);
extern int taskq_empty_ent(taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *);
@ -407,6 +410,7 @@ extern void taskq_init_ent(taskq_ent_t *);
extern void taskq_destroy(taskq_t *); extern void taskq_destroy(taskq_t *);
extern void taskq_wait(taskq_t *); extern void taskq_wait(taskq_t *);
extern int taskq_member(taskq_t *, kthread_t *); extern int taskq_member(taskq_t *, kthread_t *);
extern int taskq_cancel_id(taskq_t *, taskqid_t);
extern void system_taskq_init(void); extern void system_taskq_init(void);
extern void system_taskq_fini(void); extern void system_taskq_fini(void);
@ -523,6 +527,11 @@ extern vnode_t *rootdir;
extern void delay(clock_t ticks); extern void delay(clock_t ticks);
#define SEC_TO_TICK(sec) ((sec) * hz)
#define MSEC_TO_TICK(msec) ((msec) / (MILLISEC / hz))
#define USEC_TO_TICK(usec) ((usec) / (MICROSEC / hz))
#define NSEC_TO_TICK(usec) ((usec) / (NANOSEC / hz))
#define gethrestime_sec() time(NULL) #define gethrestime_sec() time(NULL)
#define gethrestime(t) \ #define gethrestime(t) \
do {\ do {\

View File

@ -236,6 +236,8 @@ typedef struct zinject_record {
uint32_t zi_iotype; uint32_t zi_iotype;
int32_t zi_duration; int32_t zi_duration;
uint64_t zi_timer; uint64_t zi_timer;
uint32_t zi_cmd;
uint32_t zi_pad;
} zinject_record_t; } zinject_record_t;
#define ZINJECT_NULL 0x1 #define ZINJECT_NULL 0x1
@ -245,6 +247,16 @@ typedef struct zinject_record {
#define ZEVENT_NONBLOCK 0x1 #define ZEVENT_NONBLOCK 0x1
#define ZEVENT_SIZE 1024 #define ZEVENT_SIZE 1024
typedef enum zinject_type {
ZINJECT_UNINITIALIZED,
ZINJECT_DATA_FAULT,
ZINJECT_DEVICE_FAULT,
ZINJECT_LABEL_FAULT,
ZINJECT_IGNORED_WRITES,
ZINJECT_PANIC,
ZINJECT_DELAY_IO,
} zinject_type_t;
typedef struct zfs_share { typedef struct zfs_share {
uint64_t z_exportdata; uint64_t z_exportdata;
uint64_t z_sharedata; uint64_t z_sharedata;

View File

@ -21,8 +21,6 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
@ -411,7 +409,10 @@ struct zio {
const zio_vsd_ops_t *io_vsd_ops; const zio_vsd_ops_t *io_vsd_ops;
uint64_t io_offset; uint64_t io_offset;
uint64_t io_deadline; uint64_t io_deadline; /* expires at timestamp + deadline */
uint64_t io_timestamp; /* submitted at (ticks) */
uint64_t io_delta; /* vdev queue service delta (ticks) */
uint64_t io_delay; /* vdev disk service delta (ticks) */
avl_node_t io_offset_node; avl_node_t io_offset_node;
avl_node_t io_deadline_node; avl_node_t io_deadline_node;
avl_tree_t *io_vdev_tree; avl_tree_t *io_vdev_tree;
@ -423,7 +424,6 @@ struct zio {
enum zio_flag io_orig_flags; enum zio_flag io_orig_flags;
enum zio_stage io_orig_stage; enum zio_stage io_orig_stage;
enum zio_stage io_orig_pipeline; enum zio_stage io_orig_pipeline;
uint64_t io_delay;
int io_error; int io_error;
int io_child_error[ZIO_CHILD_TYPES]; int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
@ -560,6 +560,7 @@ extern int zio_handle_fault_injection(zio_t *zio, int error);
extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
extern int zio_handle_label_injection(zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error);
extern void zio_handle_ignored_writes(zio_t *zio); extern void zio_handle_ignored_writes(zio_t *zio);
extern uint64_t zio_handle_io_delay(zio_t *zio);
/* /*
* Checksum ereport functions * Checksum ereport functions

View File

@ -147,6 +147,13 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
return (1); return (1);
} }
taskqid_t
taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags,
clock_t expire_time)
{
return (0);
}
int int
taskq_empty_ent(taskq_ent_t *t) taskq_empty_ent(taskq_ent_t *t)
{ {
@ -339,6 +346,12 @@ taskq_member(taskq_t *tq, kthread_t *t)
return (0); return (0);
} }
int
taskq_cancel_id(taskq_t *tq, taskqid_t id)
{
return (ENOENT);
}
void void
system_taskq_init(void) system_taskq_init(void)
{ {

View File

@ -1013,6 +1013,8 @@ spa_deactivate(spa_t *spa)
list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_config_dirty_list);
list_destroy(&spa->spa_state_dirty_list); list_destroy(&spa->spa_state_dirty_list);
taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
for (t = 0; t < ZIO_TYPES; t++) { for (t = 0; t < ZIO_TYPES; t++) {
for (q = 0; q < ZIO_TASKQ_TYPES; q++) { for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
if (spa->spa_zio_taskq[t][q] != NULL) if (spa->spa_zio_taskq[t][q] != NULL)
@ -6017,6 +6019,12 @@ spa_sync(spa_t *spa, uint64_t txg)
tx = dmu_tx_create_assigned(dp, txg); tx = dmu_tx_create_assigned(dp, txg);
spa->spa_sync_starttime = gethrtime();
taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq,
spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
NSEC_TO_TICK(spa->spa_deadman_synctime));
/* /*
* If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
* set spa_deflate if we have no raid-z vdevs. * set spa_deflate if we have no raid-z vdevs.
@ -6145,6 +6153,9 @@ spa_sync(spa_t *spa, uint64_t txg)
} }
dmu_tx_commit(tx); dmu_tx_commit(tx);
taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
spa->spa_deadman_tqid = 0;
/* /*
* Clear the dirty config list. * Clear the dirty config list.
*/ */

View File

@ -236,6 +236,24 @@ static avl_tree_t spa_l2cache_avl;
kmem_cache_t *spa_buffer_pool; kmem_cache_t *spa_buffer_pool;
int spa_mode_global; int spa_mode_global;
/*
* Expiration time in units of zfs_txg_synctime_ms. This value has two
* meanings. First it is used to determine when the spa_deadman logic
* should fire. By default the spa_deadman will fire if spa_sync has
* not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
* Secondly, the value determines if an I/O is considered "hung".
* Any I/O that has not completed in zfs_deadman_synctime is considered
* "hung" resulting in a zevent being posted.
* 1000 zfs_txg_synctime_ms (i.e. 1000 seconds).
*/
unsigned long zfs_deadman_synctime = 1000ULL;
/*
* By default the deadman is enabled.
*/
int zfs_deadman_enabled = 1;
/* /*
* ========================================================================== * ==========================================================================
* SPA config locking * SPA config locking
@ -412,6 +430,27 @@ spa_lookup(const char *name)
return (spa); return (spa);
} }
/*
* Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
* If the zfs_deadman_enabled flag is set then it inspects all vdev queues
* looking for potentially hung I/Os.
*/
void
spa_deadman(void *arg)
{
spa_t *spa = arg;
zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
(gethrtime() - spa->spa_sync_starttime) / NANOSEC,
++spa->spa_deadman_calls);
if (zfs_deadman_enabled)
vdev_deadman(spa->spa_root_vdev);
spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq,
spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
NSEC_TO_TICK(spa->spa_deadman_synctime));
}
/* /*
* Create an uninitialized spa_t with the given name. Requires * Create an uninitialized spa_t with the given name. Requires
* spa_namespace_lock. The caller must ensure that the spa_t doesn't already * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
@ -454,6 +493,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_proc = &p0; spa->spa_proc = &p0;
spa->spa_proc_state = SPA_PROC_NONE; spa->spa_proc_state = SPA_PROC_NONE;
spa->spa_deadman_synctime = zfs_deadman_synctime *
zfs_txg_synctime_ms * MICROSEC;
refcount_create(&spa->spa_refcount); refcount_create(&spa->spa_refcount);
spa_config_lock_init(spa); spa_config_lock_init(spa);
@ -1492,6 +1534,12 @@ spa_prev_software_version(spa_t *spa)
return (spa->spa_prev_software_version); return (spa->spa_prev_software_version);
} }
uint64_t
spa_deadman_synctime(spa_t *spa)
{
return (spa->spa_deadman_synctime);
}
uint64_t uint64_t
dva_get_dsize_sync(spa_t *spa, const dva_t *dva) dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
{ {
@ -1812,4 +1860,10 @@ EXPORT_SYMBOL(spa_writeable);
EXPORT_SYMBOL(spa_mode); EXPORT_SYMBOL(spa_mode);
EXPORT_SYMBOL(spa_namespace_lock); EXPORT_SYMBOL(spa_namespace_lock);
module_param(zfs_deadman_synctime, ulong, 0644);
MODULE_PARM_DESC(zfs_deadman_synctime,"Expire in units of zfs_txg_synctime_ms");
module_param(zfs_deadman_enabled, int, 0644);
MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer");
#endif #endif

View File

@ -3195,6 +3195,46 @@ vdev_split(vdev_t *vd)
vdev_propagate_state(cvd); vdev_propagate_state(cvd);
} }
void
vdev_deadman(vdev_t *vd)
{
int c;
for (c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
vdev_deadman(cvd);
}
if (vd->vdev_ops->vdev_op_leaf) {
vdev_queue_t *vq = &vd->vdev_queue;
mutex_enter(&vq->vq_lock);
if (avl_numnodes(&vq->vq_pending_tree) > 0) {
spa_t *spa = vd->vdev_spa;
zio_t *fio;
uint64_t delta;
/*
* Look at the head of all the pending queues,
* if any I/O has been outstanding for longer than
* the spa_deadman_synctime we log a zevent.
*/
fio = avl_first(&vq->vq_pending_tree);
delta = ddi_get_lbolt64() - fio->io_timestamp;
if (delta > NSEC_TO_TICK(spa_deadman_synctime(spa))) {
zfs_dbgmsg("SLOW IO: zio timestamp %llu, "
"delta %llu, last io %llu",
fio->io_timestamp, delta,
vq->vq_io_complete_ts);
zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
spa, vd, fio, 0, 0);
}
}
mutex_exit(&vq->vq_lock);
}
}
#if defined(_KERNEL) && defined(HAVE_SPL) #if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_fault);
EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_degrade);

View File

@ -407,8 +407,7 @@ vdev_disk_dio_put(dio_request_t *dr)
vdev_disk_dio_free(dr); vdev_disk_dio_free(dr);
if (zio) { if (zio) {
zio->io_delay = jiffies_to_msecs( zio->io_delay = jiffies_64 - zio->io_delay;
jiffies_64 - zio->io_delay);
zio->io_error = error; zio->io_error = error;
ASSERT3S(zio->io_error, >=, 0); ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error) if (zio->io_error)
@ -609,7 +608,7 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, size, rc)
{ {
zio_t *zio = bio->bi_private; zio_t *zio = bio->bi_private;
zio->io_delay = jiffies_to_msecs(jiffies_64 - zio->io_delay); zio->io_delay = jiffies_64 - zio->io_delay;
zio->io_error = -rc; zio->io_error = -rc;
if (rc && (rc == -EOPNOTSUPP)) if (rc && (rc == -EOPNOTSUPP))
zio->io_vd->vdev_nowritecache = B_TRUE; zio->io_vd->vdev_nowritecache = B_TRUE;

View File

@ -23,6 +23,10 @@
* Use is subject to license terms. * Use is subject to license terms.
*/ */
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/vdev_impl.h> #include <sys/vdev_impl.h>
#include <sys/zio.h> #include <sys/zio.h>
@ -319,6 +323,7 @@ again:
vi, size, fio->io_type, ZIO_PRIORITY_AGG, vi, size, fio->io_type, ZIO_PRIORITY_AGG,
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL); vdev_queue_agg_io_done, NULL);
aio->io_timestamp = fio->io_timestamp;
nio = fio; nio = fio;
do { do {
@ -391,7 +396,8 @@ vdev_queue_io(zio_t *zio)
mutex_enter(&vq->vq_lock); mutex_enter(&vq->vq_lock);
zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) + zio->io_timestamp = ddi_get_lbolt64();
zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
zio->io_priority; zio->io_priority;
vdev_queue_io_add(vq, zio); vdev_queue_io_add(vq, zio);
@ -417,10 +423,17 @@ vdev_queue_io_done(zio_t *zio)
vdev_queue_t *vq = &zio->io_vd->vdev_queue; vdev_queue_t *vq = &zio->io_vd->vdev_queue;
int i; int i;
if (zio_injection_enabled)
delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
mutex_enter(&vq->vq_lock); mutex_enter(&vq->vq_lock);
avl_remove(&vq->vq_pending_tree, zio); avl_remove(&vq->vq_pending_tree, zio);
zio->io_delta = ddi_get_lbolt64() - zio->io_timestamp;
vq->vq_io_complete_ts = ddi_get_lbolt64();
vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
for (i = 0; i < zfs_vdev_ramp_rate; i++) { for (i = 0; i < zfs_vdev_ramp_rate; i++) {
zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
if (nio == NULL) if (nio == NULL)

View File

@ -250,6 +250,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
if (vd != NULL) { if (vd != NULL) {
vdev_t *pvd = vd->vdev_parent; vdev_t *pvd = vd->vdev_parent;
vdev_queue_t *vq = &vd->vdev_queue;
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
DATA_TYPE_UINT64, vd->vdev_guid, DATA_TYPE_UINT64, vd->vdev_guid,
@ -272,6 +273,15 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT, FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
DATA_TYPE_UINT64, vd->vdev_ashift, NULL); DATA_TYPE_UINT64, vd->vdev_ashift, NULL);
if (vq != NULL) {
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS,
DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL);
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS,
DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
}
if (pvd != NULL) { if (pvd != NULL) {
fm_payload_set(ereport, fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
@ -304,6 +314,12 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_UINT32, zio->io_pipeline, NULL); DATA_TYPE_UINT32, zio->io_pipeline, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY, fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
DATA_TYPE_UINT64, zio->io_delay, NULL); DATA_TYPE_UINT64, zio->io_delay, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
DATA_TYPE_UINT64, zio->io_timestamp, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DEADLINE,
DATA_TYPE_UINT64, zio->io_deadline, NULL);
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
DATA_TYPE_UINT64, zio->io_delta, NULL);
/* /*
* If the 'size' parameter is non-zero, it indicates this is a * If the 'size' parameter is non-zero, it indicates this is a

View File

@ -609,6 +609,9 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_vsd_ops = NULL; zio->io_vsd_ops = NULL;
zio->io_offset = offset; zio->io_offset = offset;
zio->io_deadline = 0; zio->io_deadline = 0;
zio->io_timestamp = 0;
zio->io_delta = 0;
zio->io_delay = 0;
zio->io_orig_data = zio->io_data = data; zio->io_orig_data = zio->io_data = data;
zio->io_orig_size = zio->io_size = size; zio->io_orig_size = zio->io_size = size;
zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_flags = zio->io_flags = flags;
@ -620,7 +623,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_bp_override = NULL; zio->io_bp_override = NULL;
zio->io_walk_link = NULL; zio->io_walk_link = NULL;
zio->io_transform_stack = NULL; zio->io_transform_stack = NULL;
zio->io_delay = 0;
zio->io_error = 0; zio->io_error = 0;
zio->io_child_count = 0; zio->io_child_count = 0;
zio->io_parent_count = 0; zio->io_parent_count = 0;
@ -2906,11 +2908,11 @@ zio_done(zio_t *zio)
vdev_stat_update(zio, zio->io_size); vdev_stat_update(zio, zio->io_size);
/* /*
* If this I/O is attached to a particular vdev is slow, exeeding * If this I/O is attached to a particular vdev is slow, exceeding
* 30 seconds to complete, post an error described the I/O delay. * 30 seconds to complete, post an error described the I/O delay.
* We ignore these errors if the device is currently unavailable. * We ignore these errors if the device is currently unavailable.
*/ */
if (zio->io_delay >= zio_delay_max) { if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) {
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
zio->io_vd, zio, 0, 0); zio->io_vd, zio, 0, 0);

View File

@ -20,6 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/ */
/* /*
@ -147,14 +148,8 @@ zio_handle_fault_injection(zio_t *zio, int error)
for (handler = list_head(&inject_handlers); handler != NULL; for (handler = list_head(&inject_handlers); handler != NULL;
handler = list_next(&inject_handlers, handler)) { handler = list_next(&inject_handlers, handler)) {
/* Ignore errors not destined for this pool */ if (zio->io_spa != handler->zi_spa ||
if (zio->io_spa != handler->zi_spa) handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
continue;
/* Ignore device errors and panic injection */
if (handler->zi_record.zi_guid != 0 ||
handler->zi_record.zi_func[0] != '\0' ||
handler->zi_record.zi_duration != 0)
continue; continue;
/* If this handler matches, return EIO */ /* If this handler matches, return EIO */
@ -197,10 +192,7 @@ zio_handle_label_injection(zio_t *zio, int error)
uint64_t start = handler->zi_record.zi_start; uint64_t start = handler->zi_record.zi_start;
uint64_t end = handler->zi_record.zi_end; uint64_t end = handler->zi_record.zi_end;
/* Ignore device only faults or panic injection */ if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
if (handler->zi_record.zi_start == 0 ||
handler->zi_record.zi_func[0] != '\0' ||
handler->zi_record.zi_duration != 0)
continue; continue;
/* /*
@ -246,13 +238,7 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
for (handler = list_head(&inject_handlers); handler != NULL; for (handler = list_head(&inject_handlers); handler != NULL;
handler = list_next(&inject_handlers, handler)) { handler = list_next(&inject_handlers, handler)) {
/* if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
* Ignore label specific faults, panic injection
* or fake writes
*/
if (handler->zi_record.zi_start != 0 ||
handler->zi_record.zi_func[0] != '\0' ||
handler->zi_record.zi_duration != 0)
continue; continue;
if (vd->vdev_guid == handler->zi_record.zi_guid) { if (vd->vdev_guid == handler->zi_record.zi_guid) {
@ -316,10 +302,8 @@ zio_handle_ignored_writes(zio_t *zio)
handler = list_next(&inject_handlers, handler)) { handler = list_next(&inject_handlers, handler)) {
/* Ignore errors not destined for this pool */ /* Ignore errors not destined for this pool */
if (zio->io_spa != handler->zi_spa) if (zio->io_spa != handler->zi_spa ||
continue; handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
if (handler->zi_record.zi_duration == 0)
continue; continue;
/* /*
@ -355,11 +339,8 @@ spa_handle_ignored_writes(spa_t *spa)
for (handler = list_head(&inject_handlers); handler != NULL; for (handler = list_head(&inject_handlers); handler != NULL;
handler = list_next(&inject_handlers, handler)) { handler = list_next(&inject_handlers, handler)) {
/* Ignore errors not destined for this pool */ if (spa != handler->zi_spa ||
if (spa != handler->zi_spa) handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
continue;
if (handler->zi_record.zi_duration == 0)
continue; continue;
if (handler->zi_record.zi_duration > 0) { if (handler->zi_record.zi_duration > 0) {
@ -379,6 +360,34 @@ spa_handle_ignored_writes(spa_t *spa)
rw_exit(&inject_lock); rw_exit(&inject_lock);
} }
uint64_t
zio_handle_io_delay(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
inject_handler_t *handler;
uint64_t seconds = 0;
if (zio_injection_enabled == 0)
return (0);
rw_enter(&inject_lock, RW_READER);
for (handler = list_head(&inject_handlers); handler != NULL;
handler = list_next(&inject_handlers, handler)) {
if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
continue;
if (vd->vdev_guid == handler->zi_record.zi_guid) {
seconds = handler->zi_record.zi_timer;
break;
}
}
rw_exit(&inject_lock);
return (seconds);
}
/* /*
* Create a new handler for the given record. We add it to the list, adding * Create a new handler for the given record. We add it to the list, adding
* a reference to the spa_t in the process. We increment zio_injection_enabled, * a reference to the spa_t in the process. We increment zio_injection_enabled,