diff --git a/cmd/zinject/translate.c b/cmd/zinject/translate.c index 160786641e..fc1612738e 100644 --- a/cmd/zinject/translate.c +++ b/cmd/zinject/translate.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -476,6 +477,20 @@ translate_device(const char *pool, const char *device, err_type_t label_type, &record->zi_guid) == 0); } + /* + * Device faults can take on three different forms: + * 1). delayed or hanging I/O + * 2). zfs label faults + * 3). generic disk faults + */ + if (record->zi_timer != 0) { + record->zi_cmd = ZINJECT_DELAY_IO; + } else if (label_type != TYPE_INVAL) { + record->zi_cmd = ZINJECT_LABEL_FAULT; + } else { + record->zi_cmd = ZINJECT_DEVICE_FAULT; + } + switch (label_type) { default: break; diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index d584eadd82..9ae3f9ddc3 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -235,8 +236,8 @@ usage(void) "\t\t'pad1', or 'pad2'.\n" "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n" "\n" - "\tzinject -d device -A pool\n" - "\t\tPerform a specific action on a particular device\n" + "\tzinject -d device -A -D pool\n" + "\t\tPerform a specific action on a particular device.\n" "\n" "\tzinject -I [-s | -g ] pool\n" "\t\tCause the pool to stop writing blocks yet not\n" @@ -589,7 +590,7 @@ main(int argc, char **argv) } while ((c = getopt(argc, argv, - ":aA:b:d:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { + ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { switch (c) { case 'a': flags |= ZINJECT_FLUSH_ARC; @@ -615,6 +616,16 @@ main(int argc, char **argv) case 'd': device = optarg; break; + case 'D': + errno = 0; + record.zi_timer = strtoull(optarg, &end, 10); + if (errno != 0 || *end != '\0') { + (void) fprintf(stderr, "invalid i/o delay " + "value: '%s'\n", optarg); + usage(); + return (1); + } + break; case 'e': if (strcasecmp(optarg, "io") == 0) { error = EIO; @@ -679,6 +690,7 @@ main(int argc, char **argv) case 'p': (void) strlcpy(record.zi_func, optarg, sizeof (record.zi_func)); + record.zi_cmd = ZINJECT_PANIC; break; case 'q': quiet = 1; @@ -762,13 +774,15 @@ main(int argc, char **argv) return (1); } + if (record.zi_duration != 0) + record.zi_cmd = ZINJECT_IGNORED_WRITES; + if (cancel != NULL) { /* * '-c' is invalid with any other options. */ if (raw != NULL || range != NULL || type != TYPE_INVAL || - level != 0 || record.zi_func[0] != '\0' || - record.zi_duration != 0) { + level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) { (void) fprintf(stderr, "cancel (-c) incompatible with " "any other options\n"); usage(); @@ -800,8 +814,7 @@ main(int argc, char **argv) * for doing injection, so handle it separately here. */ if (raw != NULL || range != NULL || type != TYPE_INVAL || - level != 0 || record.zi_func[0] != '\0' || - record.zi_duration != 0) { + level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) { (void) fprintf(stderr, "device (-d) incompatible with " "data error injection\n"); usage(); @@ -835,7 +848,7 @@ main(int argc, char **argv) } else if (raw != NULL) { if (range != NULL || type != TYPE_INVAL || level != 0 || - record.zi_func[0] != '\0' || record.zi_duration != 0) { + record.zi_cmd != ZINJECT_UNINITIALIZED) { (void) fprintf(stderr, "raw (-b) format with " "any other options\n"); usage(); @@ -858,13 +871,14 @@ main(int argc, char **argv) return (1); } + record.zi_cmd = ZINJECT_DATA_FAULT; if (translate_raw(raw, &record) != 0) return (1); if (!error) error = EIO; - } else if (record.zi_func[0] != '\0') { + } else if (record.zi_cmd == ZINJECT_PANIC) { if (raw != NULL || range != NULL || type != TYPE_INVAL || - level != 0 || device != NULL || record.zi_duration != 0) { + level != 0 || device != NULL) { (void) fprintf(stderr, "panic (-p) incompatible with " "other options\n"); usage(); @@ -882,7 +896,7 @@ main(int argc, char **argv) if (argv[1] != NULL) record.zi_type = atoi(argv[1]); dataset[0] = '\0'; - } else if (record.zi_duration != 0) { + } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) { if (nowrites == 0) { (void) fprintf(stderr, "-s or -g meaningless " "without -I (ignore writes)\n"); @@ -936,6 +950,7 @@ main(int argc, char **argv) return (1); } + record.zi_cmd = ZINJECT_DATA_FAULT; if (translate_record(type, argv[0], range, level, &record, pool, dataset) != 0) return (1); diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index ff5df1414a..96a2296bf8 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -41,6 +41,8 @@ extern "C" { #endif +extern int zfs_txg_synctime_ms; + struct objset; struct dsl_dir; struct dsl_dataset; diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index d5c6004c26..741b99e62f 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -73,6 +73,8 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS "vdev_complete_ts" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS "vdev_delta_ts" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" @@ -88,6 +90,9 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DEADLINE "zio_deadline" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA "zio_delta" #define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual" diff --git a/include/sys/spa.h b/include/sys/spa.h index 821172297a..ca9fb24384 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -570,6 +570,7 @@ extern int spa_offline_log(spa_t *spa); /* Log claim callback */ extern void spa_claim_notify(zio_t *zio); +extern void spa_deadman(void *); /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); @@ -604,6 +605,7 @@ extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); +extern uint64_t spa_deadman_synctime(spa_t *spa); /* Miscellaneous support routines */ extern void spa_activate_mos_feature(spa_t *spa, const char *feature); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 65edc97f52..5bd0e0a9c1 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -227,6 +227,10 @@ struct spa { uint64_t spa_feat_for_write_obj; /* required to write to pool */ uint64_t spa_feat_for_read_obj; /* required to read from pool */ uint64_t spa_feat_desc_obj; /* Feature descriptions */ + taskqid_t spa_deadman_tqid; /* Task id */ + uint64_t spa_deadman_calls; /* number of deadman calls */ + uint64_t spa_sync_starttime; /* starting time fo spa_sync */ + uint64_t spa_deadman_synctime; /* deadman expiration timer */ /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 8f297a917a..f49086a477 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -78,6 +78,7 @@ extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); +extern void vdev_deadman(vdev_t *vd); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 964ee24074..711408e6a9 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -105,6 +105,8 @@ struct vdev_queue { avl_tree_t vq_read_tree; avl_tree_t vq_write_tree; avl_tree_t vq_pending_tree; + uint64_t vq_io_complete_ts; + uint64_t vq_io_delta_ts; list_t vq_io_list; kmutex_t vq_lock; }; diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 53080f3823..a23bfdcf82 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -25,6 +25,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_CONTEXT_H @@ -400,6 +401,8 @@ extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ (taskq_create(a, b, maxclsyspri, d, e, f)) extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); +extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t, + clock_t); extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *); @@ -407,6 +410,7 @@ extern void taskq_init_ent(taskq_ent_t *); extern void taskq_destroy(taskq_t *); extern void taskq_wait(taskq_t *); extern int taskq_member(taskq_t *, kthread_t *); +extern int taskq_cancel_id(taskq_t *, taskqid_t); extern void system_taskq_init(void); extern void system_taskq_fini(void); @@ -523,6 +527,11 @@ extern vnode_t *rootdir; extern void delay(clock_t ticks); +#define SEC_TO_TICK(sec) ((sec) * hz) +#define MSEC_TO_TICK(msec) ((msec) / (MILLISEC / hz)) +#define USEC_TO_TICK(usec) ((usec) / (MICROSEC / hz)) +#define NSEC_TO_TICK(usec) ((usec) / (NANOSEC / hz)) + #define gethrestime_sec() time(NULL) #define gethrestime(t) \ do {\ diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index 740d8eda4e..c0cb4705d4 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -236,6 +236,8 @@ typedef struct zinject_record { uint32_t zi_iotype; int32_t zi_duration; uint64_t zi_timer; + uint32_t zi_cmd; + uint32_t zi_pad; } zinject_record_t; #define ZINJECT_NULL 0x1 @@ -245,6 +247,16 @@ typedef struct zinject_record { #define ZEVENT_NONBLOCK 0x1 #define ZEVENT_SIZE 1024 +typedef enum zinject_type { + ZINJECT_UNINITIALIZED, + ZINJECT_DATA_FAULT, + ZINJECT_DEVICE_FAULT, + ZINJECT_LABEL_FAULT, + ZINJECT_IGNORED_WRITES, + ZINJECT_PANIC, + ZINJECT_DELAY_IO, +} zinject_type_t; + typedef struct zfs_share { uint64_t z_exportdata; uint64_t z_sharedata; diff --git a/include/sys/zio.h b/include/sys/zio.h index 64efde0169..03530330c9 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -21,8 +21,6 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - */ -/* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. @@ -411,7 +409,10 @@ struct zio { const zio_vsd_ops_t *io_vsd_ops; uint64_t io_offset; - uint64_t io_deadline; + uint64_t io_deadline; /* expires at timestamp + deadline */ + uint64_t io_timestamp; /* submitted at (ticks) */ + uint64_t io_delta; /* vdev queue service delta (ticks) */ + uint64_t io_delay; /* vdev disk service delta (ticks) */ avl_node_t io_offset_node; avl_node_t io_deadline_node; avl_tree_t *io_vdev_tree; @@ -423,7 +424,6 @@ struct zio { enum zio_flag io_orig_flags; enum zio_stage io_orig_stage; enum zio_stage io_orig_pipeline; - uint64_t io_delay; int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; @@ -560,6 +560,7 @@ extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); +extern uint64_t zio_handle_io_delay(zio_t *zio); /* * Checksum ereport functions diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index 803f7dc374..64e214205e 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -147,6 +147,13 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags) return (1); } +taskqid_t +taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags, + clock_t expire_time) +{ + return (0); +} + int taskq_empty_ent(taskq_ent_t *t) { @@ -339,6 +346,12 @@ taskq_member(taskq_t *tq, kthread_t *t) return (0); } +int +taskq_cancel_id(taskq_t *tq, taskqid_t id) +{ + return (ENOENT); +} + void system_taskq_init(void) { diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 0babf47522..0d3537eaa1 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1013,6 +1013,8 @@ spa_deactivate(spa_t *spa) list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_state_dirty_list); + taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); + for (t = 0; t < ZIO_TYPES; t++) { for (q = 0; q < ZIO_TASKQ_TYPES; q++) { if (spa->spa_zio_taskq[t][q] != NULL) @@ -6017,6 +6019,12 @@ spa_sync(spa_t *spa, uint64_t txg) tx = dmu_tx_create_assigned(dp, txg); + spa->spa_sync_starttime = gethrtime(); + taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); + spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq, + spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + + NSEC_TO_TICK(spa->spa_deadman_synctime)); + /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. @@ -6145,6 +6153,9 @@ spa_sync(spa_t *spa, uint64_t txg) } dmu_tx_commit(tx); + taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); + spa->spa_deadman_tqid = 0; + /* * Clear the dirty config list. */ diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 5ec8e688e7..476f9c02cd 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -236,6 +236,24 @@ static avl_tree_t spa_l2cache_avl; kmem_cache_t *spa_buffer_pool; int spa_mode_global; +/* + * Expiration time in units of zfs_txg_synctime_ms. This value has two + * meanings. First it is used to determine when the spa_deadman logic + * should fire. By default the spa_deadman will fire if spa_sync has + * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds). + * Secondly, the value determines if an I/O is considered "hung". + * Any I/O that has not completed in zfs_deadman_synctime is considered + * "hung" resulting in a zevent being posted. + * 1000 zfs_txg_synctime_ms (i.e. 1000 seconds). + */ +unsigned long zfs_deadman_synctime = 1000ULL; + +/* + * By default the deadman is enabled. + */ +int zfs_deadman_enabled = 1; + + /* * ========================================================================== * SPA config locking @@ -412,6 +430,27 @@ spa_lookup(const char *name) return (spa); } +/* + * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. + * If the zfs_deadman_enabled flag is set then it inspects all vdev queues + * looking for potentially hung I/Os. + */ +void +spa_deadman(void *arg) +{ + spa_t *spa = arg; + + zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", + (gethrtime() - spa->spa_sync_starttime) / NANOSEC, + ++spa->spa_deadman_calls); + if (zfs_deadman_enabled) + vdev_deadman(spa->spa_root_vdev); + + spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq, + spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + + NSEC_TO_TICK(spa->spa_deadman_synctime)); +} + /* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already @@ -454,6 +493,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; + spa->spa_deadman_synctime = zfs_deadman_synctime * + zfs_txg_synctime_ms * MICROSEC; + refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); @@ -1492,6 +1534,12 @@ spa_prev_software_version(spa_t *spa) return (spa->spa_prev_software_version); } +uint64_t +spa_deadman_synctime(spa_t *spa) +{ + return (spa->spa_deadman_synctime); +} + uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { @@ -1812,4 +1860,10 @@ EXPORT_SYMBOL(spa_writeable); EXPORT_SYMBOL(spa_mode); EXPORT_SYMBOL(spa_namespace_lock); + +module_param(zfs_deadman_synctime, ulong, 0644); +MODULE_PARM_DESC(zfs_deadman_synctime,"Expire in units of zfs_txg_synctime_ms"); + +module_param(zfs_deadman_enabled, int, 0644); +MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer"); #endif diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index afff4d423d..15ff30fb3b 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -3195,6 +3195,46 @@ vdev_split(vdev_t *vd) vdev_propagate_state(cvd); } +void +vdev_deadman(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + vdev_deadman(cvd); + } + + if (vd->vdev_ops->vdev_op_leaf) { + vdev_queue_t *vq = &vd->vdev_queue; + + mutex_enter(&vq->vq_lock); + if (avl_numnodes(&vq->vq_pending_tree) > 0) { + spa_t *spa = vd->vdev_spa; + zio_t *fio; + uint64_t delta; + + /* + * Look at the head of all the pending queues, + * if any I/O has been outstanding for longer than + * the spa_deadman_synctime we log a zevent. + */ + fio = avl_first(&vq->vq_pending_tree); + delta = ddi_get_lbolt64() - fio->io_timestamp; + if (delta > NSEC_TO_TICK(spa_deadman_synctime(spa))) { + zfs_dbgmsg("SLOW IO: zio timestamp %llu, " + "delta %llu, last io %llu", + fio->io_timestamp, delta, + vq->vq_io_complete_ts); + zfs_ereport_post(FM_EREPORT_ZFS_DELAY, + spa, vd, fio, 0, 0); + } + } + mutex_exit(&vq->vq_lock); + } +} + #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 08f64a9136..31d1a28bdc 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -407,8 +407,7 @@ vdev_disk_dio_put(dio_request_t *dr) vdev_disk_dio_free(dr); if (zio) { - zio->io_delay = jiffies_to_msecs( - jiffies_64 - zio->io_delay); + zio->io_delay = jiffies_64 - zio->io_delay; zio->io_error = error; ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) @@ -609,7 +608,7 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, size, rc) { zio_t *zio = bio->bi_private; - zio->io_delay = jiffies_to_msecs(jiffies_64 - zio->io_delay); + zio->io_delay = jiffies_64 - zio->io_delay; zio->io_error = -rc; if (rc && (rc == -EOPNOTSUPP)) zio->io_vd->vdev_nowritecache = B_TRUE; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index e2096fac9d..3f2793ba4f 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #include #include #include @@ -319,6 +323,7 @@ again: vi, size, fio->io_type, ZIO_PRIORITY_AGG, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); + aio->io_timestamp = fio->io_timestamp; nio = fio; do { @@ -391,7 +396,8 @@ vdev_queue_io(zio_t *zio) mutex_enter(&vq->vq_lock); - zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) + + zio->io_timestamp = ddi_get_lbolt64(); + zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) + zio->io_priority; vdev_queue_io_add(vq, zio); @@ -417,10 +423,17 @@ vdev_queue_io_done(zio_t *zio) vdev_queue_t *vq = &zio->io_vd->vdev_queue; int i; + if (zio_injection_enabled) + delay(SEC_TO_TICK(zio_handle_io_delay(zio))); + mutex_enter(&vq->vq_lock); avl_remove(&vq->vq_pending_tree, zio); + zio->io_delta = ddi_get_lbolt64() - zio->io_timestamp; + vq->vq_io_complete_ts = ddi_get_lbolt64(); + vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp; + for (i = 0; i < zfs_vdev_ramp_rate; i++) { zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); if (nio == NULL) diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 820291bf4b..af2030ae7f 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -250,6 +250,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, if (vd != NULL) { vdev_t *pvd = vd->vdev_parent; + vdev_queue_t *vq = &vd->vdev_queue; fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, DATA_TYPE_UINT64, vd->vdev_guid, @@ -272,6 +273,15 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT, DATA_TYPE_UINT64, vd->vdev_ashift, NULL); + if (vq != NULL) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS, + DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL); + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS, + DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL); + } + if (pvd != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, @@ -304,6 +314,12 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_UINT32, zio->io_pipeline, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY, DATA_TYPE_UINT64, zio->io_delay, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP, + DATA_TYPE_UINT64, zio->io_timestamp, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DEADLINE, + DATA_TYPE_UINT64, zio->io_deadline, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, + DATA_TYPE_UINT64, zio->io_delta, NULL); /* * If the 'size' parameter is non-zero, it indicates this is a diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 0622553f5f..943f2d677a 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -609,6 +609,9 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_vsd_ops = NULL; zio->io_offset = offset; zio->io_deadline = 0; + zio->io_timestamp = 0; + zio->io_delta = 0; + zio->io_delay = 0; zio->io_orig_data = zio->io_data = data; zio->io_orig_size = zio->io_size = size; zio->io_orig_flags = zio->io_flags = flags; @@ -620,7 +623,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bp_override = NULL; zio->io_walk_link = NULL; zio->io_transform_stack = NULL; - zio->io_delay = 0; zio->io_error = 0; zio->io_child_count = 0; zio->io_parent_count = 0; @@ -2906,11 +2908,11 @@ zio_done(zio_t *zio) vdev_stat_update(zio, zio->io_size); /* - * If this I/O is attached to a particular vdev is slow, exeeding + * If this I/O is attached to a particular vdev is slow, exceeding * 30 seconds to complete, post an error described the I/O delay. * We ignore these errors if the device is currently unavailable. */ - if (zio->io_delay >= zio_delay_max) { + if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) { if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, zio, 0, 0); diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 293f267fc9..eb589c4299 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -147,14 +148,8 @@ zio_handle_fault_injection(zio_t *zio, int error) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* Ignore errors not destined for this pool */ - if (zio->io_spa != handler->zi_spa) - continue; - - /* Ignore device errors and panic injection */ - if (handler->zi_record.zi_guid != 0 || - handler->zi_record.zi_func[0] != '\0' || - handler->zi_record.zi_duration != 0) + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) continue; /* If this handler matches, return EIO */ @@ -197,10 +192,7 @@ zio_handle_label_injection(zio_t *zio, int error) uint64_t start = handler->zi_record.zi_start; uint64_t end = handler->zi_record.zi_end; - /* Ignore device only faults or panic injection */ - if (handler->zi_record.zi_start == 0 || - handler->zi_record.zi_func[0] != '\0' || - handler->zi_record.zi_duration != 0) + if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT) continue; /* @@ -246,13 +238,7 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* - * Ignore label specific faults, panic injection - * or fake writes - */ - if (handler->zi_record.zi_start != 0 || - handler->zi_record.zi_func[0] != '\0' || - handler->zi_record.zi_duration != 0) + if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT) continue; if (vd->vdev_guid == handler->zi_record.zi_guid) { @@ -316,10 +302,8 @@ zio_handle_ignored_writes(zio_t *zio) handler = list_next(&inject_handlers, handler)) { /* Ignore errors not destined for this pool */ - if (zio->io_spa != handler->zi_spa) - continue; - - if (handler->zi_record.zi_duration == 0) + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; /* @@ -355,11 +339,8 @@ spa_handle_ignored_writes(spa_t *spa) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* Ignore errors not destined for this pool */ - if (spa != handler->zi_spa) - continue; - - if (handler->zi_record.zi_duration == 0) + if (spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; if (handler->zi_record.zi_duration > 0) { @@ -379,6 +360,34 @@ spa_handle_ignored_writes(spa_t *spa) rw_exit(&inject_lock); } +uint64_t +zio_handle_io_delay(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + inject_handler_t *handler; + uint64_t seconds = 0; + + if (zio_injection_enabled == 0) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) + continue; + + if (vd->vdev_guid == handler->zi_record.zi_guid) { + seconds = handler->zi_record.zi_timer; + break; + } + + } + rw_exit(&inject_lock); + return (seconds); +} + /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled,