From 26ef0cc7db03be6f7a5c2d06c7ecdb2449bfa9e1 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 23 May 2016 10:41:29 -0700 Subject: [PATCH] OpenZFS 6531 - Provide mechanism to artificially limit disk performance Reviewed by: Paul Dagnelie Reviewed by: Matthew Ahrens Reviewed by: George Wilson Approved by: Dan McDonald Ported by: Tony Hutter Signed-off-by: Brian Behlendorf OpenZFS-issue: https://www.illumos.org/issues/6531 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/97e8130 Porting notes: - Added new IO delay tracepoints, and moved common ZIO tracepoint macros to a new trace_common.h file. - Used zio_delay_taskq() in place of OpenZFS's timeout_generic() function. - Updated zinject man page - Updated zpool_scrub test files --- cmd/zinject/zinject.c | 108 +++++++- include/sys/Makefile.am | 2 + include/sys/trace_arc.h | 81 +----- include/sys/trace_common.h | 112 ++++++++ include/sys/trace_zio.h | 87 ++++++ include/sys/zfs_ioctl.h | 1 + include/sys/zio.h | 5 +- man/man8/zinject.8 | 33 +++ module/zfs/trace.c | 1 + module/zfs/vdev_disk.c | 5 +- module/zfs/vdev_file.c | 6 +- module/zfs/vdev_queue.c | 3 - module/zfs/zio.c | 71 +++++ module/zfs/zio_inject.c | 261 +++++++++++++++++- .../functional/cli_root/zpool_scrub/setup.ksh | 7 +- .../zpool_scrub/zpool_scrub_002_pos.ksh | 7 +- .../zpool_scrub/zpool_scrub_003_pos.ksh | 6 +- .../zpool_scrub/zpool_scrub_004_pos.ksh | 6 +- 18 files changed, 681 insertions(+), 121 deletions(-) create mode 100644 include/sys/trace_common.h create mode 100644 include/sys/trace_zio.h diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index f64e504627..f355ebd454 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ /* @@ -239,6 +239,38 @@ usage(void) "\tzinject -d device -A -D pool\n" "\t\tPerform a specific action on a particular device.\n" "\n" + "\tzinject -d device -D latency:lanes pool\n" + "\n" + "\t\tAdd an artificial delay to IO requests on a particular\n" + "\t\tdevice, such that the requests take a minimum of 'latency'\n" + "\t\tmilliseconds to complete. Each delay has an associated\n" + "\t\tnumber of 'lanes' which defines the number of concurrent\n" + "\t\tIO requests that can be processed.\n" + "\n" + "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n" + "\t\tthe device will only be able to service a single IO request\n" + "\t\tat a time with each request taking 10 ms to complete. So,\n" + "\t\tif only a single request is submitted every 10 ms, the\n" + "\t\taverage latency will be 10 ms; but if more than one request\n" + "\t\tis submitted every 10 ms, the average latency will be more\n" + "\t\tthan 10 ms.\n" + "\n" + "\t\tSimilarly, if a delay of 10 ms is specified to have two\n" + "\t\tlanes (-D 10:2), then the device will be able to service\n" + "\t\ttwo requests at a time, each with a minimum latency of\n" + "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n" + "\t\tthe average latency will be 10 ms; but if more than two\n" + "\t\trequests are submitted every 10 ms, the average latency\n" + "\t\twill be more than 10 ms.\n" + "\n" + "\t\tAlso note, these delays are additive. So two invocations\n" + "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n" + "\t\tof '-D 10:2'. This also means, one can specify multiple\n" + "\t\tlanes with differing target latencies. For example, an\n" + "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n" + "\t\tcreate 3 lanes on the device; one lane with a latency\n" + "\t\tof 10 ms and two lanes with a 25 ms latency.\n" + "\n" "\tzinject -I [-s | -g ] pool\n" "\t\tCause the pool to stop writing blocks yet not\n" "\t\treport errors for a duration. Simulates buggy hardware\n" @@ -353,6 +385,9 @@ print_device_handler(int id, const char *pool, zinject_record_t *record, if (record->zi_guid == 0 || record->zi_func[0] != '\0') return (0); + if (record->zi_cmd == ZINJECT_DELAY_IO) + return (0); + if (*count == 0) { (void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID"); (void) printf("--- --------------- ----------------\n"); @@ -366,6 +401,35 @@ print_device_handler(int id, const char *pool, zinject_record_t *record, return (0); } +static int +print_delay_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + int *count = data; + + if (record->zi_guid == 0 || record->zi_func[0] != '\0') + return (0); + + if (record->zi_cmd != ZINJECT_DELAY_IO) + return (0); + + if (*count == 0) { + (void) printf("%3s %-15s %-15s %-15s %s\n", + "ID", "POOL", "DELAY (ms)", "LANES", "GUID"); + (void) printf("--- --------------- --------------- " + "--------------- ----------------\n"); + } + + *count += 1; + + (void) printf("%3d %-15s %-15llu %-15llu %llx\n", id, pool, + (u_longlong_t)NSEC2MSEC(record->zi_timer), + (u_longlong_t)record->zi_nlanes, + (u_longlong_t)record->zi_guid); + + return (0); +} + static int print_panic_handler(int id, const char *pool, zinject_record_t *record, void *data) @@ -403,6 +467,13 @@ print_all_handlers(void) count = 0; } + (void) iter_handlers(print_delay_handler, &count); + if (count > 0) { + total += count; + (void) printf("\n"); + count = 0; + } + (void) iter_handlers(print_data_handler, &count); if (count > 0) { total += count; @@ -545,6 +616,35 @@ perform_action(const char *pool, zinject_record_t *record, int cmd) return (1); } +static int +parse_delay(char *str, uint64_t *delay, uint64_t *nlanes) +{ + unsigned long scan_delay; + unsigned long scan_nlanes; + + if (sscanf(str, "%lu:%lu", &scan_delay, &scan_nlanes) != 2) + return (1); + + /* + * We explicitly disallow a delay of zero here, because we key + * off this value being non-zero in translate_device(), to + * determine if the fault is a ZINJECT_DELAY_IO fault or not. + */ + if (scan_delay == 0) + return (1); + + /* + * The units for the CLI delay parameter is milliseconds, but + * the data passed to the kernel is interpreted as nanoseconds. + * Thus we scale the milliseconds to nanoseconds here, and this + * nanosecond value is used to pass the delay to the kernel. + */ + *delay = MSEC2NSEC(scan_delay); + *nlanes = scan_nlanes; + + return (0); +} + int main(int argc, char **argv) { @@ -628,8 +728,10 @@ main(int argc, char **argv) break; case 'D': errno = 0; - record.zi_timer = strtoull(optarg, &end, 10); - if (errno != 0 || *end != '\0') { + ret = parse_delay(optarg, &record.zi_timer, + &record.zi_nlanes); + if (ret != 0) { + (void) fprintf(stderr, "invalid i/o delay " "value: '%s'\n", optarg); usage(); diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 98d7ad6fb0..89c3142797 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -53,6 +53,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/trace.h \ $(top_srcdir)/include/sys/trace_acl.h \ $(top_srcdir)/include/sys/trace_arc.h \ + $(top_srcdir)/include/sys/trace_common.h \ $(top_srcdir)/include/sys/trace_dbgmsg.h \ $(top_srcdir)/include/sys/trace_dbuf.h \ $(top_srcdir)/include/sys/trace_dmu.h \ @@ -60,6 +61,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/trace_multilist.h \ $(top_srcdir)/include/sys/trace_txg.h \ $(top_srcdir)/include/sys/trace_zil.h \ + $(top_srcdir)/include/sys/trace_zio.h \ $(top_srcdir)/include/sys/trace_zrlock.h \ $(top_srcdir)/include/sys/txg.h \ $(top_srcdir)/include/sys/txg_impl.h \ diff --git a/include/sys/trace_arc.h b/include/sys/trace_arc.h index 55dbdf19b0..0fca639e14 100644 --- a/include/sys/trace_arc.h +++ b/include/sys/trace_arc.h @@ -34,6 +34,7 @@ #include #include +#include /* For ZIO macros */ /* * Generic support for one argument tracepoints of the form: @@ -115,86 +116,6 @@ DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss); * zio_t *, ...); */ -#define ZIO_TP_STRUCT_ENTRY \ - __field(zio_type_t, zio_type) \ - __field(int, zio_cmd) \ - __field(zio_priority_t, zio_priority) \ - __field(uint64_t, zio_size) \ - __field(uint64_t, zio_orig_size) \ - __field(uint64_t, zio_offset) \ - __field(hrtime_t, zio_timestamp) \ - __field(hrtime_t, zio_delta) \ - __field(uint64_t, zio_delay) \ - __field(enum zio_flag, zio_flags) \ - __field(enum zio_stage, zio_stage) \ - __field(enum zio_stage, zio_pipeline) \ - __field(enum zio_flag, zio_orig_flags) \ - __field(enum zio_stage, zio_orig_stage) \ - __field(enum zio_stage, zio_orig_pipeline) \ - __field(uint8_t, zio_reexecute) \ - __field(uint64_t, zio_txg) \ - __field(int, zio_error) \ - __field(uint64_t, zio_ena) \ - \ - __field(enum zio_checksum, zp_checksum) \ - __field(enum zio_compress, zp_compress) \ - __field(dmu_object_type_t, zp_type) \ - __field(uint8_t, zp_level) \ - __field(uint8_t, zp_copies) \ - __field(boolean_t, zp_dedup) \ - __field(boolean_t, zp_dedup_verify) \ - __field(boolean_t, zp_nopwrite) - -#define ZIO_TP_FAST_ASSIGN \ - __entry->zio_type = zio->io_type; \ - __entry->zio_cmd = zio->io_cmd; \ - __entry->zio_priority = zio->io_priority; \ - __entry->zio_size = zio->io_size; \ - __entry->zio_orig_size = zio->io_orig_size; \ - __entry->zio_offset = zio->io_offset; \ - __entry->zio_timestamp = zio->io_timestamp; \ - __entry->zio_delta = zio->io_delta; \ - __entry->zio_delay = zio->io_delay; \ - __entry->zio_flags = zio->io_flags; \ - __entry->zio_stage = zio->io_stage; \ - __entry->zio_pipeline = zio->io_pipeline; \ - __entry->zio_orig_flags = zio->io_orig_flags; \ - __entry->zio_orig_stage = zio->io_orig_stage; \ - __entry->zio_orig_pipeline = zio->io_orig_pipeline; \ - __entry->zio_reexecute = zio->io_reexecute; \ - __entry->zio_txg = zio->io_txg; \ - __entry->zio_error = zio->io_error; \ - __entry->zio_ena = zio->io_ena; \ - \ - __entry->zp_checksum = zio->io_prop.zp_checksum; \ - __entry->zp_compress = zio->io_prop.zp_compress; \ - __entry->zp_type = zio->io_prop.zp_type; \ - __entry->zp_level = zio->io_prop.zp_level; \ - __entry->zp_copies = zio->io_prop.zp_copies; \ - __entry->zp_dedup = zio->io_prop.zp_dedup; \ - __entry->zp_nopwrite = zio->io_prop.zp_nopwrite; \ - __entry->zp_dedup_verify = zio->io_prop.zp_dedup_verify; - -#define ZIO_TP_PRINTK_FMT \ - "zio { type %u cmd %i prio %u size %llu orig_size %llu " \ - "offset %llu timestamp %llu delta %llu delay %llu " \ - "flags 0x%x stage 0x%x pipeline 0x%x orig_flags 0x%x " \ - "orig_stage 0x%x orig_pipeline 0x%x reexecute %u " \ - "txg %llu error %d ena %llu prop { checksum %u compress %u " \ - "type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }" - -#define ZIO_TP_PRINTK_ARGS \ - __entry->zio_type, __entry->zio_cmd, __entry->zio_priority, \ - __entry->zio_size, __entry->zio_orig_size, __entry->zio_offset, \ - __entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay, \ - __entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline, \ - __entry->zio_orig_flags, __entry->zio_orig_stage, \ - __entry->zio_orig_pipeline, __entry->zio_reexecute, \ - __entry->zio_txg, __entry->zio_error, __entry->zio_ena, \ - __entry->zp_checksum, __entry->zp_compress, __entry->zp_type, \ - __entry->zp_level, __entry->zp_copies, __entry->zp_dedup, \ - __entry->zp_dedup_verify, __entry->zp_nopwrite - DECLARE_EVENT_CLASS(zfs_l2arc_rw_class, TP_PROTO(vdev_t *vd, zio_t *zio), TP_ARGS(vd, zio), diff --git a/include/sys/trace_common.h b/include/sys/trace_common.h new file mode 100644 index 0000000000..6922d1a181 --- /dev/null +++ b/include/sys/trace_common.h @@ -0,0 +1,112 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * This file contains commonly used trace macros. Feel free to add and use + * them in your tracepoint headers. + */ + +#ifndef _SYS_TRACE_COMMON_H +#define _SYS_TRACE_COMMON_H +#include + +/* ZIO macros */ +#define ZIO_TP_STRUCT_ENTRY \ + __field(zio_type_t, zio_type) \ + __field(int, zio_cmd) \ + __field(zio_priority_t, zio_priority) \ + __field(uint64_t, zio_size) \ + __field(uint64_t, zio_orig_size) \ + __field(uint64_t, zio_offset) \ + __field(hrtime_t, zio_timestamp) \ + __field(hrtime_t, zio_delta) \ + __field(uint64_t, zio_delay) \ + __field(enum zio_flag, zio_flags) \ + __field(enum zio_stage, zio_stage) \ + __field(enum zio_stage, zio_pipeline) \ + __field(enum zio_flag, zio_orig_flags) \ + __field(enum zio_stage, zio_orig_stage) \ + __field(enum zio_stage, zio_orig_pipeline) \ + __field(uint8_t, zio_reexecute) \ + __field(uint64_t, zio_txg) \ + __field(int, zio_error) \ + __field(uint64_t, zio_ena) \ + \ + __field(enum zio_checksum, zp_checksum) \ + __field(enum zio_compress, zp_compress) \ + __field(dmu_object_type_t, zp_type) \ + __field(uint8_t, zp_level) \ + __field(uint8_t, zp_copies) \ + __field(boolean_t, zp_dedup) \ + __field(boolean_t, zp_dedup_verify) \ + __field(boolean_t, zp_nopwrite) + +#define ZIO_TP_FAST_ASSIGN \ + __entry->zio_type = zio->io_type; \ + __entry->zio_cmd = zio->io_cmd; \ + __entry->zio_priority = zio->io_priority; \ + __entry->zio_size = zio->io_size; \ + __entry->zio_orig_size = zio->io_orig_size; \ + __entry->zio_offset = zio->io_offset; \ + __entry->zio_timestamp = zio->io_timestamp; \ + __entry->zio_delta = zio->io_delta; \ + __entry->zio_delay = zio->io_delay; \ + __entry->zio_flags = zio->io_flags; \ + __entry->zio_stage = zio->io_stage; \ + __entry->zio_pipeline = zio->io_pipeline; \ + __entry->zio_orig_flags = zio->io_orig_flags; \ + __entry->zio_orig_stage = zio->io_orig_stage; \ + __entry->zio_orig_pipeline = zio->io_orig_pipeline; \ + __entry->zio_reexecute = zio->io_reexecute; \ + __entry->zio_txg = zio->io_txg; \ + __entry->zio_error = zio->io_error; \ + __entry->zio_ena = zio->io_ena; \ + \ + __entry->zp_checksum = zio->io_prop.zp_checksum; \ + __entry->zp_compress = zio->io_prop.zp_compress; \ + __entry->zp_type = zio->io_prop.zp_type; \ + __entry->zp_level = zio->io_prop.zp_level; \ + __entry->zp_copies = zio->io_prop.zp_copies; \ + __entry->zp_dedup = zio->io_prop.zp_dedup; \ + __entry->zp_nopwrite = zio->io_prop.zp_nopwrite; \ + __entry->zp_dedup_verify = zio->io_prop.zp_dedup_verify; + +#define ZIO_TP_PRINTK_FMT \ + "zio { type %u cmd %i prio %u size %llu orig_size %llu " \ + "offset %llu timestamp %llu delta %llu delay %llu " \ + "flags 0x%x stage 0x%x pipeline 0x%x orig_flags 0x%x " \ + "orig_stage 0x%x orig_pipeline 0x%x reexecute %u " \ + "txg %llu error %d ena %llu prop { checksum %u compress %u " \ + "type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }" + +#define ZIO_TP_PRINTK_ARGS \ + __entry->zio_type, __entry->zio_cmd, __entry->zio_priority, \ + __entry->zio_size, __entry->zio_orig_size, __entry->zio_offset, \ + __entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay, \ + __entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline, \ + __entry->zio_orig_flags, __entry->zio_orig_stage, \ + __entry->zio_orig_pipeline, __entry->zio_reexecute, \ + __entry->zio_txg, __entry->zio_error, __entry->zio_ena, \ + __entry->zp_checksum, __entry->zp_compress, __entry->zp_type, \ + __entry->zp_level, __entry->zp_copies, __entry->zp_dedup, \ + __entry->zp_dedup_verify, __entry->zp_nopwrite + +#endif /* _SYS_TRACE_COMMON_H */ diff --git a/include/sys/trace_zio.h b/include/sys/trace_zio.h new file mode 100644 index 0000000000..6db6cefe27 --- /dev/null +++ b/include/sys/trace_zio.h @@ -0,0 +1,87 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include + +#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR zfs_zio + +#if !defined(_TRACE_ZIO_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ZIO_H + +#include +#include +#include /* For ZIO macros */ + +TRACE_EVENT(zfs_zio__delay__miss, + TP_PROTO(zio_t *zio, hrtime_t now), + TP_ARGS(zio, now), + TP_STRUCT__entry( + ZIO_TP_STRUCT_ENTRY + __field(hrtime_t, now) + ), + TP_fast_assign( + ZIO_TP_FAST_ASSIGN + __entry->now = now; + ), + TP_printk("now %llu " ZIO_TP_PRINTK_FMT, __entry->now, + ZIO_TP_PRINTK_ARGS) +); + +TRACE_EVENT(zfs_zio__delay__hit, + TP_PROTO(zio_t *zio, hrtime_t now, hrtime_t diff), + TP_ARGS(zio, now, diff), + TP_STRUCT__entry( + ZIO_TP_STRUCT_ENTRY + __field(hrtime_t, now) + __field(hrtime_t, diff) + ), + TP_fast_assign( + ZIO_TP_FAST_ASSIGN + __entry->now = now; + __entry->diff = diff; + ), + TP_printk("now %llu diff %llu " ZIO_TP_PRINTK_FMT, __entry->now, + __entry->diff, ZIO_TP_PRINTK_ARGS) +); + +TRACE_EVENT(zfs_zio__delay__skip, + TP_PROTO(zio_t *zio), + TP_ARGS(zio), + TP_STRUCT__entry(ZIO_TP_STRUCT_ENTRY), + TP_fast_assign(ZIO_TP_FAST_ASSIGN), + TP_printk(ZIO_TP_PRINTK_FMT, ZIO_TP_PRINTK_ARGS) +); + +#endif /* _TRACE_ZIO_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_zio +#include + +#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index 601a9a70c5..ac70f690eb 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -287,6 +287,7 @@ typedef struct zinject_record { uint32_t zi_iotype; int32_t zi_duration; uint64_t zi_timer; + uint64_t zi_nlanes; uint32_t zi_cmd; uint32_t zi_pad; } zinject_record_t; diff --git a/include/sys/zio.h b/include/sys/zio.h index 9790b4a900..300a5f5ab7 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -420,6 +420,7 @@ struct zio { uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ + hrtime_t io_target_timestamp; hrtime_t io_delta; /* vdev queue service delta */ hrtime_t io_delay; /* Device access time (disk or */ /* file). */ @@ -510,6 +511,8 @@ extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); extern void zio_execute(zio_t *zio); extern void zio_interrupt(zio_t *zio); +extern void zio_delay_init(zio_t *zio); +extern void zio_delay_interrupt(zio_t *zio); extern zio_t *zio_walk_parents(zio_t *cio); extern zio_t *zio_walk_children(zio_t *pio); @@ -572,7 +575,7 @@ extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); -extern uint64_t zio_handle_io_delay(zio_t *zio); +extern hrtime_t zio_handle_io_delay(zio_t *zio); /* * Checksum ereport functions diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index c21d66ac49..90df4fb91b 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -42,6 +42,39 @@ Cancel injection records. .TP .B "zinject \-d \fIvdev\fB \-A \fIpool\fB Force a vdev into the DEGRADED or FAULTED state. +.TP +.B "zinject -d \fIvdev\fB -D latency:lanes \fIpool\fB + +Add an artificial delay to IO requests on a particular +device, such that the requests take a minimum of 'latency' +milliseconds to complete. Each delay has an associated +number of 'lanes' which defines the number of concurrent +IO requests that can be processed. + +For example, with a single lane delay of 10 ms (-D 10:1), +the device will only be able to service a single IO request +at a time with each request taking 10 ms to complete. So, +if only a single request is submitted every 10 ms, the +average latency will be 10 ms; but if more than one request +is submitted every 10 ms, the average latency will be more +than 10 ms. + +Similarly, if a delay of 10 ms is specified to have two +lanes (-D 10:2), then the device will be able to service +two requests at a time, each with a minimum latency of +10 ms. So, if two requests are submitted every 10 ms, then +the average latency will be 10 ms; but if more than two +requests are submitted every 10 ms, the average latency +will be more than 10 ms. + +Also note, these delays are additive. So two invocations +of '-D 10:1', is roughly equivalent to a single invocation +of '-D 10:2'. This also means, one can specify multiple +lanes with differing target latencies. For example, an +invocation of '-D 10:1' followed by '-D 25:2' will +create 3 lanes on the device; one lane with a latency +of 10 ms and two lanes with a 25 ms latency. + .TP .B "zinject \-d \fIvdev\fB [\-e \fIdevice_error\fB] [\-L \fIlabel_error\fB] [\-T \fIfailure\fB] [\-F] \fIpool\fB" Force a vdev error. diff --git a/module/zfs/trace.c b/module/zfs/trace.c index 0c9990e854..e4ebf31b3f 100644 --- a/module/zfs/trace.c +++ b/module/zfs/trace.c @@ -47,4 +47,5 @@ #include #include #include +#include #include diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 4e362226a8..1ef54d3899 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -23,7 +23,7 @@ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include @@ -414,7 +414,7 @@ vdev_disk_dio_put(dio_request_t *dr) ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); - zio_interrupt(zio); + zio_delay_interrupt(zio); } } @@ -726,6 +726,7 @@ vdev_disk_io_start(zio_t *zio) return; } + zio->io_target_timestamp = zio_handle_io_delay(zio); error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data, zio->io_size, zio->io_offset, flags, 0); if (error) { diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index a29ea7bf95..bca4175a6f 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ #include @@ -159,7 +159,7 @@ vdev_file_io_strategy(void *arg) if (resid != 0 && zio->io_error == 0) zio->io_error = SET_ERROR(ENOSPC); - zio_interrupt(zio); + zio_delay_interrupt(zio); } static void @@ -217,6 +217,8 @@ vdev_file_io_start(zio_t *zio) return; } + zio->io_target_timestamp = zio_handle_io_delay(zio); + VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, zio, TQ_SLEEP), !=, 0); } diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index af8af67de7..d67912d965 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -753,9 +753,6 @@ vdev_queue_io_done(zio_t *zio) vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; - if (zio_injection_enabled) - delay(SEC_TO_TICK(zio_handle_io_delay(zio))); - mutex_enter(&vq->vq_lock); vdev_queue_pending_remove(vq, zio); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 523a924d67..4063703adf 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -40,6 +40,7 @@ #include #include #include +#include /* * ========================================================================== @@ -1390,6 +1391,76 @@ zio_interrupt(zio_t *zio) zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } +void +zio_delay_interrupt(zio_t *zio) +{ + /* + * The timeout_generic() function isn't defined in userspace, so + * rather than trying to implement the function, the zio delay + * functionality has been disabled for userspace builds. + */ + +#ifdef _KERNEL + /* + * If io_target_timestamp is zero, then no delay has been registered + * for this IO, thus jump to the end of this function and "skip" the + * delay; issuing it directly to the zio layer. + */ + if (zio->io_target_timestamp != 0) { + hrtime_t now = gethrtime(); + + if (now >= zio->io_target_timestamp) { + /* + * This IO has already taken longer than the target + * delay to complete, so we don't want to delay it + * any longer; we "miss" the delay and issue it + * directly to the zio layer. This is likely due to + * the target latency being set to a value less than + * the underlying hardware can satisfy (e.g. delay + * set to 1ms, but the disks take 10ms to complete an + * IO request). + */ + + DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, + hrtime_t, now); + + zio_interrupt(zio); + } else { + taskqid_t tid; + hrtime_t diff = zio->io_target_timestamp - now; + clock_t expire_at_tick = ddi_get_lbolt() + + NSEC_TO_TICK(diff); + + DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, + hrtime_t, now, hrtime_t, diff); + + if (NSEC_TO_TICK(diff) == 0) { + /* Our delay is less than a jiffy - just spin */ + zfs_sleep_until(zio->io_target_timestamp); + } else { + /* + * Use taskq_dispatch_delay() in the place of + * OpenZFS's timeout_generic(). + */ + tid = taskq_dispatch_delay(system_taskq, + (task_func_t *) zio_interrupt, + zio, TQ_NOSLEEP, expire_at_tick); + if (!tid) { + /* + * Couldn't allocate a task. Just + * finish the zio without a delay. + */ + zio_interrupt(zio); + } + } + } + return; + } +#endif + DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); + zio_interrupt(zio); +} + /* * Execute the I/O pipeline until one of the following occurs: * (1) the I/O completes; (2) the pipeline stalls waiting for diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 1458be4772..61b7d25e65 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ /* @@ -49,15 +49,53 @@ uint32_t zio_injection_enabled = 0; +/* + * Data describing each zinject handler registered on the system, and + * contains the list node linking the handler in the global zinject + * handler list. + */ typedef struct inject_handler { int zi_id; spa_t *zi_spa; zinject_record_t zi_record; + uint64_t *zi_lanes; + int zi_next_lane; list_node_t zi_link; } inject_handler_t; +/* + * List of all zinject handlers registered on the system, protected by + * the inject_lock defined below. + */ static list_t inject_handlers; + +/* + * This protects insertion into, and traversal of, the inject handler + * list defined above; as well as the inject_delay_count. Any time a + * handler is inserted or removed from the list, this lock should be + * taken as a RW_WRITER; and any time traversal is done over the list + * (without modification to it) this lock should be taken as a RW_READER. + */ static krwlock_t inject_lock; + +/* + * This holds the number of zinject delay handlers that have been + * registered on the system. It is protected by the inject_lock defined + * above. Thus modifications to this count must be a RW_WRITER of the + * inject_lock, and reads of this count must be (at least) a RW_READER + * of the lock. + */ +static int inject_delay_count = 0; + +/* + * This lock is used only in zio_handle_io_delay(), refer to the comment + * in that function for more details. + */ +static kmutex_t inject_delay_mtx; + +/* + * Used to assign unique identifying numbers to each new zinject handler. + */ static int inject_next_id = 1; /* @@ -361,21 +399,70 @@ spa_handle_ignored_writes(spa_t *spa) rw_exit(&inject_lock); } -uint64_t +hrtime_t zio_handle_io_delay(zio_t *zio) { vdev_t *vd = zio->io_vd; + inject_handler_t *min_handler = NULL; + hrtime_t min_target = 0; inject_handler_t *handler; - uint64_t seconds = 0; - - if (zio_injection_enabled == 0) - return (0); + hrtime_t idle; + hrtime_t busy; + hrtime_t target; rw_enter(&inject_lock, RW_READER); - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { + /* + * inject_delay_count is a subset of zio_injection_enabled that + * is only incremented for delay handlers. These checks are + * mainly added to remind the reader why we're not explicitly + * checking zio_injection_enabled like the other functions. + */ + IMPLY(inject_delay_count > 0, zio_injection_enabled > 0); + IMPLY(zio_injection_enabled == 0, inject_delay_count == 0); + /* + * If there aren't any inject delay handlers registered, then we + * can short circuit and simply return 0 here. A value of zero + * informs zio_delay_interrupt() that this request should not be + * delayed. This short circuit keeps us from acquiring the + * inject_delay_mutex unnecessarily. + */ + if (inject_delay_count == 0) { + rw_exit(&inject_lock); + return (0); + } + + /* + * Each inject handler has a number of "lanes" associated with + * it. Each lane is able to handle requests independently of one + * another, and at a latency defined by the inject handler + * record's zi_timer field. Thus if a handler in configured with + * a single lane with a 10ms latency, it will delay requests + * such that only a single request is completed every 10ms. So, + * if more than one request is attempted per each 10ms interval, + * the average latency of the requests will be greater than + * 10ms; but if only a single request is submitted each 10ms + * interval the average latency will be 10ms. + * + * We need to acquire this mutex to prevent multiple concurrent + * threads being assigned to the same lane of a given inject + * handler. The mutex allows us to perform the following two + * operations atomically: + * + * 1. determine the minimum handler and minimum target + * value of all the possible handlers + * 2. update that minimum handler's lane array + * + * Without atomicity, two (or more) threads could pick the same + * lane in step (1), and then conflict with each other in step + * (2). This could allow a single lane handler to process + * multiple requests simultaneously, which shouldn't be possible. + */ + mutex_enter(&inject_delay_mtx); + + for (handler = list_head(&inject_handlers); + handler != NULL; handler = list_next(&inject_handlers, handler)) { if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) continue; @@ -384,14 +471,101 @@ zio_handle_io_delay(zio_t *zio) continue; } - if (vd->vdev_guid == handler->zi_record.zi_guid) { - seconds = handler->zi_record.zi_timer; - break; + if (vd->vdev_guid != handler->zi_record.zi_guid) + continue; + + /* + * Defensive; should never happen as the array allocation + * occurs prior to inserting this handler on the list. + */ + ASSERT3P(handler->zi_lanes, !=, NULL); + + /* + * This should never happen, the zinject command should + * prevent a user from setting an IO delay with zero lanes. + */ + ASSERT3U(handler->zi_record.zi_nlanes, !=, 0); + + ASSERT3U(handler->zi_record.zi_nlanes, >, + handler->zi_next_lane); + + /* + * We want to issue this IO to the lane that will become + * idle the soonest, so we compare the soonest this + * specific handler can complete the IO with all other + * handlers, to find the lowest value of all possible + * lanes. We then use this lane to submit the request. + * + * Since each handler has a constant value for its + * delay, we can just use the "next" lane for that + * handler; as it will always be the lane with the + * lowest value for that particular handler (i.e. the + * lane that will become idle the soonest). This saves a + * scan of each handler's lanes array. + * + * There's two cases to consider when determining when + * this specific IO request should complete. If this + * lane is idle, we want to "submit" the request now so + * it will complete after zi_timer milliseconds. Thus, + * we set the target to now + zi_timer. + * + * If the lane is busy, we want this request to complete + * zi_timer milliseconds after the lane becomes idle. + * Since the 'zi_lanes' array holds the time at which + * each lane will become idle, we use that value to + * determine when this request should complete. + */ + idle = handler->zi_record.zi_timer + gethrtime(); + busy = handler->zi_record.zi_timer + + handler->zi_lanes[handler->zi_next_lane]; + target = MAX(idle, busy); + + if (min_handler == NULL) { + min_handler = handler; + min_target = target; + continue; } + ASSERT3P(min_handler, !=, NULL); + ASSERT3U(min_target, !=, 0); + + /* + * We don't yet increment the "next lane" variable since + * we still might find a lower value lane in another + * handler during any remaining iterations. Once we're + * sure we've selected the absolute minimum, we'll claim + * the lane and increment the handler's "next lane" + * field below. + */ + + if (target < min_target) { + min_handler = handler; + min_target = target; + } } + + /* + * 'min_handler' will be NULL if no IO delays are registered for + * this vdev, otherwise it will point to the handler containing + * the lane that will become idle the soonest. + */ + if (min_handler != NULL) { + ASSERT3U(min_target, !=, 0); + min_handler->zi_lanes[min_handler->zi_next_lane] = min_target; + + /* + * If we've used all possible lanes for this handler, + * loop back and start using the first lane again; + * otherwise, just increment the lane index. + */ + min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) % + min_handler->zi_record.zi_nlanes; + } + + mutex_exit(&inject_delay_mtx); rw_exit(&inject_lock); - return (seconds); + + return (min_target); } /* @@ -415,6 +589,24 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) if ((error = spa_reset(name)) != 0) return (error); + if (record->zi_cmd == ZINJECT_DELAY_IO) { + /* + * A value of zero for the number of lanes or for the + * delay time doesn't make sense. + */ + if (record->zi_timer == 0 || record->zi_nlanes == 0) + return (SET_ERROR(EINVAL)); + + /* + * The number of lanes is directly mapped to the size of + * an array used by the handler. Thus, to ensure the + * user doesn't trigger an allocation that's "too large" + * we cap the number of lanes here. + */ + if (record->zi_nlanes >= UINT16_MAX) + return (SET_ERROR(EINVAL)); + } + if (!(flags & ZINJECT_NULL)) { /* * spa_inject_ref() will add an injection reference, which will @@ -426,11 +618,34 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); - rw_enter(&inject_lock, RW_WRITER); - - *id = handler->zi_id = inject_next_id++; handler->zi_spa = spa; handler->zi_record = *record; + + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + handler->zi_lanes = kmem_zalloc( + sizeof (*handler->zi_lanes) * + handler->zi_record.zi_nlanes, KM_SLEEP); + handler->zi_next_lane = 0; + } else { + handler->zi_lanes = NULL; + handler->zi_next_lane = 0; + } + + rw_enter(&inject_lock, RW_WRITER); + + /* + * We can't move this increment into the conditional + * above because we need to hold the RW_WRITER lock of + * inject_lock, and we don't want to hold that while + * allocating the handler's zi_lanes array. + */ + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + ASSERT3S(inject_delay_count, >=, 0); + inject_delay_count++; + ASSERT3S(inject_delay_count, >, 0); + } + + *id = handler->zi_id = inject_next_id++; list_insert_tail(&inject_handlers, handler); atomic_inc_32(&zio_injection_enabled); @@ -508,9 +723,23 @@ zio_clear_fault(int id) return (SET_ERROR(ENOENT)); } + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + ASSERT3S(inject_delay_count, >, 0); + inject_delay_count--; + ASSERT3S(inject_delay_count, >=, 0); + } + list_remove(&inject_handlers, handler); rw_exit(&inject_lock); + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + ASSERT3P(handler->zi_lanes, !=, NULL); + kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) * + handler->zi_record.zi_nlanes); + } else { + ASSERT3P(handler->zi_lanes, ==, NULL); + } + spa_inject_delref(handler->zi_spa); kmem_free(handler, sizeof (inject_handler_t)); atomic_dec_32(&zio_injection_enabled); @@ -522,6 +751,7 @@ void zio_inject_init(void) { rw_init(&inject_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&inject_handlers, sizeof (inject_handler_t), offsetof(inject_handler_t, zi_link)); } @@ -530,6 +760,7 @@ void zio_inject_fini(void) { list_destroy(&inject_handlers); + mutex_destroy(&inject_delay_mtx); rw_destroy(&inject_lock); } diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/setup.ksh index af078e2354..ae6a8e12c2 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/setup.ksh @@ -38,10 +38,7 @@ verify_disk_count "$DISKS" 2 default_mirror_setup_noexit $DISK1 $DISK2 mntpnt=$(get_prop mountpoint $TESTPOOL) -typeset -i i=0 -while ((i < 10)); do - log_must $DD if=/dev/urandom of=$mntpnt/bigfile.$i bs=1024k count=100 - ((i += 1)) -done +# Create 100MB of data +log_must $FILE_WRITE -b 1048576 -c 100 -o create -d 0 -f $mntpnt/bigfile log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh index cf16444465..ce07fd5a2b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh @@ -38,16 +38,15 @@ # 3. Verify zpool scrub -s succeed when the system is scrubbing. # # NOTES: -# A 1 second delay is added to 10% of zio's in order to ensure that -# the scrub does not complete before it has a chance to be cancelled. +# A 10ms delay is added to the ZIOs in order to ensure that the +# scrub does not complete before it has a chance to be cancelled. # This can occur when testing with small pools or very fast hardware. # verify_runnable "global" log_assert "Verify scrub -s works correctly." - -log_must $ZINJECT -d $DISK1 -f10 -D1 $TESTPOOL +log_must $ZINJECT -d $DISK1 -D10:1 $TESTPOOL log_must $ZPOOL scrub $TESTPOOL log_must $ZPOOL scrub -s $TESTPOOL log_must is_pool_scrub_stopped $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh index 54e27d50db..b1b6df1af6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh @@ -40,8 +40,8 @@ # 4. Check the percent again, verify a new scrub started. # # NOTES: -# A 1 second delay is added to 10% of zio's in order to ensure that -# the scrub does not complete before it has a chance to be restarted. +# A 10ms delay is added to the ZIOs in order to ensure that the +# scrub does not complete before it has a chance to be restarted. # This can occur when testing with small pools or very fast hardware. # @@ -61,7 +61,7 @@ function get_scrub_percent log_assert "scrub command terminates the existing scrub process and starts" \ "a new scrub." -log_must $ZINJECT -d $DISK1 -f10 -D1 $TESTPOOL +log_must $ZINJECT -d $DISK1 -D10:1 $TESTPOOL log_must $ZPOOL scrub $TESTPOOL typeset -i PERCENT=30 percent=0 while ((percent < PERCENT)) ; do diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh index c689d7995a..ca07769ec2 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh @@ -42,16 +42,16 @@ # 3. Verify scrub failed until the resilver completed # # NOTES: -# A 1 second delay is added to 10% of zio's in order to ensure that -# the resilver does not complete before the scrub can be issue. This +# A 10ms delay is added to 10% of zio's in order to ensure that the +# resilver does not complete before the scrub can be issued. This # can occur when testing with small pools or very fast hardware. verify_runnable "global" log_assert "Resilver prevent scrub from starting until the resilver completes" -log_must $ZINJECT -d $DISK1 -f10 -D1 $TESTPOOL log_must $ZPOOL detach $TESTPOOL $DISK2 +log_must $ZINJECT -d $DISK1 -D10:1 $TESTPOOL log_must $ZPOOL attach $TESTPOOL $DISK1 $DISK2 log_must is_pool_resilvering $TESTPOOL log_mustnot $ZPOOL scrub $TESTPOOL