OpenZFS 6531 - Provide mechanism to artificially limit disk performance
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/6531
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/97e8130

Porting notes:
- Added new IO delay tracepoints, and moved common ZIO tracepoint macros
  to a new trace_common.h file.
- Used zio_delay_taskq() in place of OpenZFS's timeout_generic() function.
- Updated zinject man page
- Updated zpool_scrub test files
parent 7e945072d1
commit 26ef0cc7db
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */

 /*
@@ -239,6 +239,38 @@ usage(void)
     "\tzinject -d device -A <degrade|fault> -D <delay secs> pool\n"
     "\t\tPerform a specific action on a particular device.\n"
     "\n"
+    "\tzinject -d device -D latency:lanes pool\n"
+    "\n"
+    "\t\tAdd an artificial delay to IO requests on a particular\n"
+    "\t\tdevice, such that the requests take a minimum of 'latency'\n"
+    "\t\tmilliseconds to complete. Each delay has an associated\n"
+    "\t\tnumber of 'lanes' which defines the number of concurrent\n"
+    "\t\tIO requests that can be processed.\n"
+    "\n"
+    "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n"
+    "\t\tthe device will only be able to service a single IO request\n"
+    "\t\tat a time with each request taking 10 ms to complete. So,\n"
+    "\t\tif only a single request is submitted every 10 ms, the\n"
+    "\t\taverage latency will be 10 ms; but if more than one request\n"
+    "\t\tis submitted every 10 ms, the average latency will be more\n"
+    "\t\tthan 10 ms.\n"
+    "\n"
+    "\t\tSimilarly, if a delay of 10 ms is specified to have two\n"
+    "\t\tlanes (-D 10:2), then the device will be able to service\n"
+    "\t\ttwo requests at a time, each with a minimum latency of\n"
+    "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n"
+    "\t\tthe average latency will be 10 ms; but if more than two\n"
+    "\t\trequests are submitted every 10 ms, the average latency\n"
+    "\t\twill be more than 10 ms.\n"
+    "\n"
+    "\t\tAlso note, these delays are additive. So two invocations\n"
+    "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n"
+    "\t\tof '-D 10:2'. This also means, one can specify multiple\n"
+    "\t\tlanes with differing target latencies. For example, an\n"
+    "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n"
+    "\t\tcreate 3 lanes on the device; one lane with a latency\n"
+    "\t\tof 10 ms and two lanes with a 25 ms latency.\n"
+    "\n"
     "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
     "\t\tCause the pool to stop writing blocks yet not\n"
     "\t\treport errors for a duration. Simulates buggy hardware\n"
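For orientation, here is a brief shell sketch of how the option described in the usage text above might be exercised; the device name sdb and the pool name tank are placeholders, not values taken from this change:

# Hypothetical example: 'sdb' and 'tank' are placeholder names.
# Register a delay handler: IOs to sdb take at least 20 ms, with at
# most two requests (lanes) in flight concurrently.
zinject -d sdb -D 20:2 tank

# Cancel all injection records when finished.
zinject -c all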
@@ -353,6 +385,9 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
     if (record->zi_guid == 0 || record->zi_func[0] != '\0')
         return (0);

+    if (record->zi_cmd == ZINJECT_DELAY_IO)
+        return (0);
+
     if (*count == 0) {
         (void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID");
         (void) printf("--- --------------- ----------------\n");
@@ -366,6 +401,35 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
     return (0);
 }

+static int
+print_delay_handler(int id, const char *pool, zinject_record_t *record,
+    void *data)
+{
+    int *count = data;
+
+    if (record->zi_guid == 0 || record->zi_func[0] != '\0')
+        return (0);
+
+    if (record->zi_cmd != ZINJECT_DELAY_IO)
+        return (0);
+
+    if (*count == 0) {
+        (void) printf("%3s %-15s %-15s %-15s %s\n",
+            "ID", "POOL", "DELAY (ms)", "LANES", "GUID");
+        (void) printf("--- --------------- --------------- "
+            "--------------- ----------------\n");
+    }
+
+    *count += 1;
+
+    (void) printf("%3d %-15s %-15llu %-15llu %llx\n", id, pool,
+        (u_longlong_t)NSEC2MSEC(record->zi_timer),
+        (u_longlong_t)record->zi_nlanes,
+        (u_longlong_t)record->zi_guid);
+
+    return (0);
+}
+
 static int
 print_panic_handler(int id, const char *pool, zinject_record_t *record,
     void *data)
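Assuming zinject is run with no arguments to list the registered handlers (its usual listing mode), the new print_delay_handler() above would produce output along these lines; the values shown are purely illustrative:

# Illustrative only; the ID, GUID and values depend on the system.
zinject
#  ID POOL            DELAY (ms)      LANES           GUID
# --- --------------- --------------- --------------- ----------------
#   1 tank            20              2               7a3b5c9d2e4f6081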
@@ -403,6 +467,13 @@ print_all_handlers(void)
         count = 0;
     }

+    (void) iter_handlers(print_delay_handler, &count);
+    if (count > 0) {
+        total += count;
+        (void) printf("\n");
+        count = 0;
+    }
+
     (void) iter_handlers(print_data_handler, &count);
     if (count > 0) {
         total += count;
@@ -545,6 +616,35 @@ perform_action(const char *pool, zinject_record_t *record, int cmd)
     return (1);
 }

+static int
+parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)
+{
+    unsigned long scan_delay;
+    unsigned long scan_nlanes;
+
+    if (sscanf(str, "%lu:%lu", &scan_delay, &scan_nlanes) != 2)
+        return (1);
+
+    /*
+     * We explicitly disallow a delay of zero here, because we key
+     * off this value being non-zero in translate_device(), to
+     * determine if the fault is a ZINJECT_DELAY_IO fault or not.
+     */
+    if (scan_delay == 0)
+        return (1);
+
+    /*
+     * The units for the CLI delay parameter is milliseconds, but
+     * the data passed to the kernel is interpreted as nanoseconds.
+     * Thus we scale the milliseconds to nanoseconds here, and this
+     * nanosecond value is used to pass the delay to the kernel.
+     */
+    *delay = MSEC2NSEC(scan_delay);
+    *nlanes = scan_nlanes;
+
+    return (0);
+}
+
 int
 main(int argc, char **argv)
 {
@@ -628,8 +728,10 @@ main(int argc, char **argv)
             break;
         case 'D':
             errno = 0;
-            record.zi_timer = strtoull(optarg, &end, 10);
-            if (errno != 0 || *end != '\0') {
+            ret = parse_delay(optarg, &record.zi_timer,
+                &record.zi_nlanes);
+            if (ret != 0) {
                 (void) fprintf(stderr, "invalid i/o delay "
                     "value: '%s'\n", optarg);
                 usage();
@@ -53,6 +53,7 @@ COMMON_H = \
 	$(top_srcdir)/include/sys/trace.h \
 	$(top_srcdir)/include/sys/trace_acl.h \
 	$(top_srcdir)/include/sys/trace_arc.h \
+	$(top_srcdir)/include/sys/trace_common.h \
 	$(top_srcdir)/include/sys/trace_dbgmsg.h \
 	$(top_srcdir)/include/sys/trace_dbuf.h \
 	$(top_srcdir)/include/sys/trace_dmu.h \
@@ -60,6 +61,7 @@ COMMON_H = \
 	$(top_srcdir)/include/sys/trace_multilist.h \
 	$(top_srcdir)/include/sys/trace_txg.h \
 	$(top_srcdir)/include/sys/trace_zil.h \
+	$(top_srcdir)/include/sys/trace_zio.h \
 	$(top_srcdir)/include/sys/trace_zrlock.h \
 	$(top_srcdir)/include/sys/txg.h \
 	$(top_srcdir)/include/sys/txg_impl.h \
@@ -34,6 +34,7 @@

 #include <linux/tracepoint.h>
 #include <sys/types.h>
+#include <sys/trace_common.h> /* For ZIO macros */

 /*
  * Generic support for one argument tracepoints of the form:
@ -115,86 +116,6 @@ DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss);
|
||||||
* zio_t *, ...);
|
* zio_t *, ...);
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define ZIO_TP_STRUCT_ENTRY \
|
|
||||||
__field(zio_type_t, zio_type) \
|
|
||||||
__field(int, zio_cmd) \
|
|
||||||
__field(zio_priority_t, zio_priority) \
|
|
||||||
__field(uint64_t, zio_size) \
|
|
||||||
__field(uint64_t, zio_orig_size) \
|
|
||||||
__field(uint64_t, zio_offset) \
|
|
||||||
__field(hrtime_t, zio_timestamp) \
|
|
||||||
__field(hrtime_t, zio_delta) \
|
|
||||||
__field(uint64_t, zio_delay) \
|
|
||||||
__field(enum zio_flag, zio_flags) \
|
|
||||||
__field(enum zio_stage, zio_stage) \
|
|
||||||
__field(enum zio_stage, zio_pipeline) \
|
|
||||||
__field(enum zio_flag, zio_orig_flags) \
|
|
||||||
__field(enum zio_stage, zio_orig_stage) \
|
|
||||||
__field(enum zio_stage, zio_orig_pipeline) \
|
|
||||||
__field(uint8_t, zio_reexecute) \
|
|
||||||
__field(uint64_t, zio_txg) \
|
|
||||||
__field(int, zio_error) \
|
|
||||||
__field(uint64_t, zio_ena) \
|
|
||||||
\
|
|
||||||
__field(enum zio_checksum, zp_checksum) \
|
|
||||||
__field(enum zio_compress, zp_compress) \
|
|
||||||
__field(dmu_object_type_t, zp_type) \
|
|
||||||
__field(uint8_t, zp_level) \
|
|
||||||
__field(uint8_t, zp_copies) \
|
|
||||||
__field(boolean_t, zp_dedup) \
|
|
||||||
__field(boolean_t, zp_dedup_verify) \
|
|
||||||
__field(boolean_t, zp_nopwrite)
|
|
||||||
|
|
||||||
#define ZIO_TP_FAST_ASSIGN \
|
|
||||||
__entry->zio_type = zio->io_type; \
|
|
||||||
__entry->zio_cmd = zio->io_cmd; \
|
|
||||||
__entry->zio_priority = zio->io_priority; \
|
|
||||||
__entry->zio_size = zio->io_size; \
|
|
||||||
__entry->zio_orig_size = zio->io_orig_size; \
|
|
||||||
__entry->zio_offset = zio->io_offset; \
|
|
||||||
__entry->zio_timestamp = zio->io_timestamp; \
|
|
||||||
__entry->zio_delta = zio->io_delta; \
|
|
||||||
__entry->zio_delay = zio->io_delay; \
|
|
||||||
__entry->zio_flags = zio->io_flags; \
|
|
||||||
__entry->zio_stage = zio->io_stage; \
|
|
||||||
__entry->zio_pipeline = zio->io_pipeline; \
|
|
||||||
__entry->zio_orig_flags = zio->io_orig_flags; \
|
|
||||||
__entry->zio_orig_stage = zio->io_orig_stage; \
|
|
||||||
__entry->zio_orig_pipeline = zio->io_orig_pipeline; \
|
|
||||||
__entry->zio_reexecute = zio->io_reexecute; \
|
|
||||||
__entry->zio_txg = zio->io_txg; \
|
|
||||||
__entry->zio_error = zio->io_error; \
|
|
||||||
__entry->zio_ena = zio->io_ena; \
|
|
||||||
\
|
|
||||||
__entry->zp_checksum = zio->io_prop.zp_checksum; \
|
|
||||||
__entry->zp_compress = zio->io_prop.zp_compress; \
|
|
||||||
__entry->zp_type = zio->io_prop.zp_type; \
|
|
||||||
__entry->zp_level = zio->io_prop.zp_level; \
|
|
||||||
__entry->zp_copies = zio->io_prop.zp_copies; \
|
|
||||||
__entry->zp_dedup = zio->io_prop.zp_dedup; \
|
|
||||||
__entry->zp_nopwrite = zio->io_prop.zp_nopwrite; \
|
|
||||||
__entry->zp_dedup_verify = zio->io_prop.zp_dedup_verify;
|
|
||||||
|
|
||||||
#define ZIO_TP_PRINTK_FMT \
|
|
||||||
"zio { type %u cmd %i prio %u size %llu orig_size %llu " \
|
|
||||||
"offset %llu timestamp %llu delta %llu delay %llu " \
|
|
||||||
"flags 0x%x stage 0x%x pipeline 0x%x orig_flags 0x%x " \
|
|
||||||
"orig_stage 0x%x orig_pipeline 0x%x reexecute %u " \
|
|
||||||
"txg %llu error %d ena %llu prop { checksum %u compress %u " \
|
|
||||||
"type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }"
|
|
||||||
|
|
||||||
#define ZIO_TP_PRINTK_ARGS \
|
|
||||||
__entry->zio_type, __entry->zio_cmd, __entry->zio_priority, \
|
|
||||||
__entry->zio_size, __entry->zio_orig_size, __entry->zio_offset, \
|
|
||||||
__entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay, \
|
|
||||||
__entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline, \
|
|
||||||
__entry->zio_orig_flags, __entry->zio_orig_stage, \
|
|
||||||
__entry->zio_orig_pipeline, __entry->zio_reexecute, \
|
|
||||||
__entry->zio_txg, __entry->zio_error, __entry->zio_ena, \
|
|
||||||
__entry->zp_checksum, __entry->zp_compress, __entry->zp_type, \
|
|
||||||
__entry->zp_level, __entry->zp_copies, __entry->zp_dedup, \
|
|
||||||
__entry->zp_dedup_verify, __entry->zp_nopwrite
|
|
||||||
|
|
||||||
DECLARE_EVENT_CLASS(zfs_l2arc_rw_class,
|
DECLARE_EVENT_CLASS(zfs_l2arc_rw_class,
|
||||||
TP_PROTO(vdev_t *vd, zio_t *zio),
|
TP_PROTO(vdev_t *vd, zio_t *zio),
|
||||||
TP_ARGS(vd, zio),
|
TP_ARGS(vd, zio),
|
||||||
|
|
|
@ -0,0 +1,112 @@
|
||||||
|
/*
|
||||||
|
* CDDL HEADER START
|
||||||
|
*
|
||||||
|
* The contents of this file are subject to the terms of the
|
||||||
|
* Common Development and Distribution License (the "License").
|
||||||
|
* You may not use this file except in compliance with the License.
|
||||||
|
*
|
||||||
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||||
|
* or http://www.opensolaris.org/os/licensing.
|
||||||
|
* See the License for the specific language governing permissions
|
||||||
|
* and limitations under the License.
|
||||||
|
*
|
||||||
|
* When distributing Covered Code, include this CDDL HEADER in each
|
||||||
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||||
|
* If applicable, add the following below this CDDL HEADER, with the
|
||||||
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||||
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||||
|
*
|
||||||
|
* CDDL HEADER END
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file contains commonly used trace macros. Feel free to add and use
|
||||||
|
* them in your tracepoint headers.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _SYS_TRACE_COMMON_H
|
||||||
|
#define _SYS_TRACE_COMMON_H
|
||||||
|
#include <linux/tracepoint.h>
|
||||||
|
|
||||||
|
/* ZIO macros */
|
||||||
|
#define ZIO_TP_STRUCT_ENTRY \
|
||||||
|
__field(zio_type_t, zio_type) \
|
||||||
|
__field(int, zio_cmd) \
|
||||||
|
__field(zio_priority_t, zio_priority) \
|
||||||
|
__field(uint64_t, zio_size) \
|
||||||
|
__field(uint64_t, zio_orig_size) \
|
||||||
|
__field(uint64_t, zio_offset) \
|
||||||
|
__field(hrtime_t, zio_timestamp) \
|
||||||
|
__field(hrtime_t, zio_delta) \
|
||||||
|
__field(uint64_t, zio_delay) \
|
||||||
|
__field(enum zio_flag, zio_flags) \
|
||||||
|
__field(enum zio_stage, zio_stage) \
|
||||||
|
__field(enum zio_stage, zio_pipeline) \
|
||||||
|
__field(enum zio_flag, zio_orig_flags) \
|
||||||
|
__field(enum zio_stage, zio_orig_stage) \
|
||||||
|
__field(enum zio_stage, zio_orig_pipeline) \
|
||||||
|
__field(uint8_t, zio_reexecute) \
|
||||||
|
__field(uint64_t, zio_txg) \
|
||||||
|
__field(int, zio_error) \
|
||||||
|
__field(uint64_t, zio_ena) \
|
||||||
|
\
|
||||||
|
__field(enum zio_checksum, zp_checksum) \
|
||||||
|
__field(enum zio_compress, zp_compress) \
|
||||||
|
__field(dmu_object_type_t, zp_type) \
|
||||||
|
__field(uint8_t, zp_level) \
|
||||||
|
__field(uint8_t, zp_copies) \
|
||||||
|
__field(boolean_t, zp_dedup) \
|
||||||
|
__field(boolean_t, zp_dedup_verify) \
|
||||||
|
__field(boolean_t, zp_nopwrite)
|
||||||
|
|
||||||
|
#define ZIO_TP_FAST_ASSIGN \
|
||||||
|
__entry->zio_type = zio->io_type; \
|
||||||
|
__entry->zio_cmd = zio->io_cmd; \
|
||||||
|
__entry->zio_priority = zio->io_priority; \
|
||||||
|
__entry->zio_size = zio->io_size; \
|
||||||
|
__entry->zio_orig_size = zio->io_orig_size; \
|
||||||
|
__entry->zio_offset = zio->io_offset; \
|
||||||
|
__entry->zio_timestamp = zio->io_timestamp; \
|
||||||
|
__entry->zio_delta = zio->io_delta; \
|
||||||
|
__entry->zio_delay = zio->io_delay; \
|
||||||
|
__entry->zio_flags = zio->io_flags; \
|
||||||
|
__entry->zio_stage = zio->io_stage; \
|
||||||
|
__entry->zio_pipeline = zio->io_pipeline; \
|
||||||
|
__entry->zio_orig_flags = zio->io_orig_flags; \
|
||||||
|
__entry->zio_orig_stage = zio->io_orig_stage; \
|
||||||
|
__entry->zio_orig_pipeline = zio->io_orig_pipeline; \
|
||||||
|
__entry->zio_reexecute = zio->io_reexecute; \
|
||||||
|
__entry->zio_txg = zio->io_txg; \
|
||||||
|
__entry->zio_error = zio->io_error; \
|
||||||
|
__entry->zio_ena = zio->io_ena; \
|
||||||
|
\
|
||||||
|
__entry->zp_checksum = zio->io_prop.zp_checksum; \
|
||||||
|
__entry->zp_compress = zio->io_prop.zp_compress; \
|
||||||
|
__entry->zp_type = zio->io_prop.zp_type; \
|
||||||
|
__entry->zp_level = zio->io_prop.zp_level; \
|
||||||
|
__entry->zp_copies = zio->io_prop.zp_copies; \
|
||||||
|
__entry->zp_dedup = zio->io_prop.zp_dedup; \
|
||||||
|
__entry->zp_nopwrite = zio->io_prop.zp_nopwrite; \
|
||||||
|
__entry->zp_dedup_verify = zio->io_prop.zp_dedup_verify;
|
||||||
|
|
||||||
|
#define ZIO_TP_PRINTK_FMT \
|
||||||
|
"zio { type %u cmd %i prio %u size %llu orig_size %llu " \
|
||||||
|
"offset %llu timestamp %llu delta %llu delay %llu " \
|
||||||
|
"flags 0x%x stage 0x%x pipeline 0x%x orig_flags 0x%x " \
|
||||||
|
"orig_stage 0x%x orig_pipeline 0x%x reexecute %u " \
|
||||||
|
"txg %llu error %d ena %llu prop { checksum %u compress %u " \
|
||||||
|
"type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }"
|
||||||
|
|
||||||
|
#define ZIO_TP_PRINTK_ARGS \
|
||||||
|
__entry->zio_type, __entry->zio_cmd, __entry->zio_priority, \
|
||||||
|
__entry->zio_size, __entry->zio_orig_size, __entry->zio_offset, \
|
||||||
|
__entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay, \
|
||||||
|
__entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline, \
|
||||||
|
__entry->zio_orig_flags, __entry->zio_orig_stage, \
|
||||||
|
__entry->zio_orig_pipeline, __entry->zio_reexecute, \
|
||||||
|
__entry->zio_txg, __entry->zio_error, __entry->zio_ena, \
|
||||||
|
__entry->zp_checksum, __entry->zp_compress, __entry->zp_type, \
|
||||||
|
__entry->zp_level, __entry->zp_copies, __entry->zp_dedup, \
|
||||||
|
__entry->zp_dedup_verify, __entry->zp_nopwrite
|
||||||
|
|
||||||
|
#endif /* _SYS_TRACE_COMMON_H */
|
|
@ -0,0 +1,87 @@
|
||||||
|
/*
|
||||||
|
* CDDL HEADER START
|
||||||
|
*
|
||||||
|
* The contents of this file are subject to the terms of the
|
||||||
|
* Common Development and Distribution License (the "License").
|
||||||
|
* You may not use this file except in compliance with the License.
|
||||||
|
*
|
||||||
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||||
|
* or http://www.opensolaris.org/os/licensing.
|
||||||
|
* See the License for the specific language governing permissions
|
||||||
|
* and limitations under the License.
|
||||||
|
*
|
||||||
|
* When distributing Covered Code, include this CDDL HEADER in each
|
||||||
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||||
|
* If applicable, add the following below this CDDL HEADER, with the
|
||||||
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||||
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||||
|
*
|
||||||
|
* CDDL HEADER END
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <sys/list.h>
|
||||||
|
|
||||||
|
#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS)
|
||||||
|
|
||||||
|
#undef TRACE_SYSTEM
|
||||||
|
#define TRACE_SYSTEM zfs
|
||||||
|
|
||||||
|
#undef TRACE_SYSTEM_VAR
|
||||||
|
#define TRACE_SYSTEM_VAR zfs_zio
|
||||||
|
|
||||||
|
#if !defined(_TRACE_ZIO_H) || defined(TRACE_HEADER_MULTI_READ)
|
||||||
|
#define _TRACE_ZIO_H
|
||||||
|
|
||||||
|
#include <linux/tracepoint.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/trace_common.h> /* For ZIO macros */
|
||||||
|
|
||||||
|
TRACE_EVENT(zfs_zio__delay__miss,
|
||||||
|
TP_PROTO(zio_t *zio, hrtime_t now),
|
||||||
|
TP_ARGS(zio, now),
|
||||||
|
TP_STRUCT__entry(
|
||||||
|
ZIO_TP_STRUCT_ENTRY
|
||||||
|
__field(hrtime_t, now)
|
||||||
|
),
|
||||||
|
TP_fast_assign(
|
||||||
|
ZIO_TP_FAST_ASSIGN
|
||||||
|
__entry->now = now;
|
||||||
|
),
|
||||||
|
TP_printk("now %llu " ZIO_TP_PRINTK_FMT, __entry->now,
|
||||||
|
ZIO_TP_PRINTK_ARGS)
|
||||||
|
);
|
||||||
|
|
||||||
|
TRACE_EVENT(zfs_zio__delay__hit,
|
||||||
|
TP_PROTO(zio_t *zio, hrtime_t now, hrtime_t diff),
|
||||||
|
TP_ARGS(zio, now, diff),
|
||||||
|
TP_STRUCT__entry(
|
||||||
|
ZIO_TP_STRUCT_ENTRY
|
||||||
|
__field(hrtime_t, now)
|
||||||
|
__field(hrtime_t, diff)
|
||||||
|
),
|
||||||
|
TP_fast_assign(
|
||||||
|
ZIO_TP_FAST_ASSIGN
|
||||||
|
__entry->now = now;
|
||||||
|
__entry->diff = diff;
|
||||||
|
),
|
||||||
|
TP_printk("now %llu diff %llu " ZIO_TP_PRINTK_FMT, __entry->now,
|
||||||
|
__entry->diff, ZIO_TP_PRINTK_ARGS)
|
||||||
|
);
|
||||||
|
|
||||||
|
TRACE_EVENT(zfs_zio__delay__skip,
|
||||||
|
TP_PROTO(zio_t *zio),
|
||||||
|
TP_ARGS(zio),
|
||||||
|
TP_STRUCT__entry(ZIO_TP_STRUCT_ENTRY),
|
||||||
|
TP_fast_assign(ZIO_TP_FAST_ASSIGN),
|
||||||
|
TP_printk(ZIO_TP_PRINTK_FMT, ZIO_TP_PRINTK_ARGS)
|
||||||
|
);
|
||||||
|
|
||||||
|
#endif /* _TRACE_ZIO_H */
|
||||||
|
|
||||||
|
#undef TRACE_INCLUDE_PATH
|
||||||
|
#undef TRACE_INCLUDE_FILE
|
||||||
|
#define TRACE_INCLUDE_PATH sys
|
||||||
|
#define TRACE_INCLUDE_FILE trace_zio
|
||||||
|
#include <trace/define_trace.h>
|
||||||
|
|
||||||
|
#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */
|
|
@@ -287,6 +287,7 @@ typedef struct zinject_record {
     uint32_t zi_iotype;
     int32_t zi_duration;
     uint64_t zi_timer;
+    uint64_t zi_nlanes;
     uint32_t zi_cmd;
     uint32_t zi_pad;
 } zinject_record_t;
@@ -420,6 +420,7 @@ struct zio {

     uint64_t        io_offset;
     hrtime_t        io_timestamp;   /* submitted at */
+    hrtime_t        io_target_timestamp;
     hrtime_t        io_delta;       /* vdev queue service delta */
     hrtime_t        io_delay;       /* Device access time (disk or */
                                     /* file). */
@@ -510,6 +511,8 @@ extern int zio_wait(zio_t *zio);
 extern void zio_nowait(zio_t *zio);
 extern void zio_execute(zio_t *zio);
 extern void zio_interrupt(zio_t *zio);
+extern void zio_delay_init(zio_t *zio);
+extern void zio_delay_interrupt(zio_t *zio);

 extern zio_t *zio_walk_parents(zio_t *cio);
 extern zio_t *zio_walk_children(zio_t *pio);
@@ -572,7 +575,7 @@ extern int zio_handle_fault_injection(zio_t *zio, int error);
 extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
 extern int zio_handle_label_injection(zio_t *zio, int error);
 extern void zio_handle_ignored_writes(zio_t *zio);
-extern uint64_t zio_handle_io_delay(zio_t *zio);
+extern hrtime_t zio_handle_io_delay(zio_t *zio);

 /*
  * Checksum ereport functions
@@ -42,6 +42,39 @@ Cancel injection records.
 .TP
 .B "zinject \-d \fIvdev\fB \-A <degrade|fault> \fIpool\fB
 Force a vdev into the DEGRADED or FAULTED state.
+.TP
+.B "zinject -d \fIvdev\fB -D latency:lanes \fIpool\fB
+
+Add an artificial delay to IO requests on a particular
+device, such that the requests take a minimum of 'latency'
+milliseconds to complete. Each delay has an associated
+number of 'lanes' which defines the number of concurrent
+IO requests that can be processed.
+
+For example, with a single lane delay of 10 ms (-D 10:1),
+the device will only be able to service a single IO request
+at a time with each request taking 10 ms to complete. So,
+if only a single request is submitted every 10 ms, the
+average latency will be 10 ms; but if more than one request
+is submitted every 10 ms, the average latency will be more
+than 10 ms.
+
+Similarly, if a delay of 10 ms is specified to have two
+lanes (-D 10:2), then the device will be able to service
+two requests at a time, each with a minimum latency of
+10 ms. So, if two requests are submitted every 10 ms, then
+the average latency will be 10 ms; but if more than two
+requests are submitted every 10 ms, the average latency
+will be more than 10 ms.
+
+Also note, these delays are additive. So two invocations
+of '-D 10:1', is roughly equivalent to a single invocation
+of '-D 10:2'. This also means, one can specify multiple
+lanes with differing target latencies. For example, an
+invocation of '-D 10:1' followed by '-D 25:2' will
+create 3 lanes on the device; one lane with a latency
+of 10 ms and two lanes with a 25 ms latency.
+
 .TP
 .B "zinject \-d \fIvdev\fB [\-e \fIdevice_error\fB] [\-L \fIlabel_error\fB] [\-T \fIfailure\fB] [\-F] \fIpool\fB"
 Force a vdev error.
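Following the additive behaviour described in the man page text above, a hypothetical shell sequence that builds the three-lane example (device and pool names are placeholders) would be:

# Hypothetical example; 'sdb' and 'tank' are placeholder names.
zinject -d sdb -D 10:1 tank   # one lane with a 10 ms latency
zinject -d sdb -D 25:2 tank   # two additional lanes at 25 ms each
# The device now has 3 lanes: one at 10 ms and two at 25 ms.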
@@ -47,4 +47,5 @@
 #include <sys/trace_multilist.h>
 #include <sys/trace_txg.h>
 #include <sys/trace_zil.h>
+#include <sys/trace_zio.h>
 #include <sys/trace_zrlock.h>
@@ -23,7 +23,7 @@
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
  * LLNL-CODE-403049.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */

 #include <sys/zfs_context.h>
@@ -414,7 +414,7 @@ vdev_disk_dio_put(dio_request_t *dr)
         ASSERT3S(zio->io_error, >=, 0);
         if (zio->io_error)
             vdev_disk_error(zio);
-        zio_interrupt(zio);
+        zio_delay_interrupt(zio);
     }
 }

@@ -726,6 +726,7 @@ vdev_disk_io_start(zio_t *zio)
         return;
     }

+    zio->io_target_timestamp = zio_handle_io_delay(zio);
     error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
         zio->io_size, zio->io_offset, flags, 0);
     if (error) {
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */

 #include <sys/zfs_context.h>
@@ -159,7 +159,7 @@ vdev_file_io_strategy(void *arg)
     if (resid != 0 && zio->io_error == 0)
         zio->io_error = SET_ERROR(ENOSPC);

-    zio_interrupt(zio);
+    zio_delay_interrupt(zio);
 }

 static void
@@ -217,6 +217,8 @@ vdev_file_io_start(zio_t *zio)
         return;
     }

+    zio->io_target_timestamp = zio_handle_io_delay(zio);
+
     VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, zio,
         TQ_SLEEP), !=, 0);
 }
@@ -753,9 +753,6 @@ vdev_queue_io_done(zio_t *zio)
     vdev_queue_t *vq = &zio->io_vd->vdev_queue;
     zio_t *nio;

-    if (zio_injection_enabled)
-        delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
-
     mutex_enter(&vq->vq_lock);

     vdev_queue_pending_remove(vq, zio);
@@ -40,6 +40,7 @@
 #include <sys/blkptr.h>
 #include <sys/zfeature.h>
 #include <sys/time.h>
+#include <sys/trace_zio.h>

 /*
  * ==========================================================================
@@ -1390,6 +1391,76 @@ zio_interrupt(zio_t *zio)
     zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
 }

+void
+zio_delay_interrupt(zio_t *zio)
+{
+    /*
+     * The timeout_generic() function isn't defined in userspace, so
+     * rather than trying to implement the function, the zio delay
+     * functionality has been disabled for userspace builds.
+     */
+
+#ifdef _KERNEL
+    /*
+     * If io_target_timestamp is zero, then no delay has been registered
+     * for this IO, thus jump to the end of this function and "skip" the
+     * delay; issuing it directly to the zio layer.
+     */
+    if (zio->io_target_timestamp != 0) {
+        hrtime_t now = gethrtime();
+
+        if (now >= zio->io_target_timestamp) {
+            /*
+             * This IO has already taken longer than the target
+             * delay to complete, so we don't want to delay it
+             * any longer; we "miss" the delay and issue it
+             * directly to the zio layer. This is likely due to
+             * the target latency being set to a value less than
+             * the underlying hardware can satisfy (e.g. delay
+             * set to 1ms, but the disks take 10ms to complete an
+             * IO request).
+             */
+
+            DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
+                hrtime_t, now);
+
+            zio_interrupt(zio);
+        } else {
+            taskqid_t tid;
+            hrtime_t diff = zio->io_target_timestamp - now;
+            clock_t expire_at_tick = ddi_get_lbolt() +
+                NSEC_TO_TICK(diff);
+
+            DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
+                hrtime_t, now, hrtime_t, diff);
+
+            if (NSEC_TO_TICK(diff) == 0) {
+                /* Our delay is less than a jiffy - just spin */
+                zfs_sleep_until(zio->io_target_timestamp);
+            } else {
+                /*
+                 * Use taskq_dispatch_delay() in the place of
+                 * OpenZFS's timeout_generic().
+                 */
+                tid = taskq_dispatch_delay(system_taskq,
+                    (task_func_t *) zio_interrupt,
+                    zio, TQ_NOSLEEP, expire_at_tick);
+                if (!tid) {
+                    /*
+                     * Couldn't allocate a task. Just
+                     * finish the zio without a delay.
+                     */
+                    zio_interrupt(zio);
+                }
+            }
+        }
+        return;
+    }
+#endif
+    DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
+    zio_interrupt(zio);
+}
+
 /*
  * Execute the I/O pipeline until one of the following occurs:
  * (1) the I/O completes; (2) the pipeline stalls waiting for
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */

 /*
@@ -49,15 +49,53 @@

 uint32_t zio_injection_enabled = 0;

+/*
+ * Data describing each zinject handler registered on the system, and
+ * contains the list node linking the handler in the global zinject
+ * handler list.
+ */
 typedef struct inject_handler {
     int zi_id;
     spa_t *zi_spa;
     zinject_record_t zi_record;
+    uint64_t *zi_lanes;
+    int zi_next_lane;
     list_node_t zi_link;
 } inject_handler_t;

+/*
+ * List of all zinject handlers registered on the system, protected by
+ * the inject_lock defined below.
+ */
 static list_t inject_handlers;

+/*
+ * This protects insertion into, and traversal of, the inject handler
+ * list defined above; as well as the inject_delay_count. Any time a
+ * handler is inserted or removed from the list, this lock should be
+ * taken as a RW_WRITER; and any time traversal is done over the list
+ * (without modification to it) this lock should be taken as a RW_READER.
+ */
 static krwlock_t inject_lock;

+/*
+ * This holds the number of zinject delay handlers that have been
+ * registered on the system. It is protected by the inject_lock defined
+ * above. Thus modifications to this count must be a RW_WRITER of the
+ * inject_lock, and reads of this count must be (at least) a RW_READER
+ * of the lock.
+ */
+static int inject_delay_count = 0;
+
+/*
+ * This lock is used only in zio_handle_io_delay(), refer to the comment
+ * in that function for more details.
+ */
+static kmutex_t inject_delay_mtx;
+
+/*
+ * Used to assign unique identifying numbers to each new zinject handler.
+ */
 static int inject_next_id = 1;

 /*
@@ -361,21 +399,70 @@ spa_handle_ignored_writes(spa_t *spa)
     rw_exit(&inject_lock);
 }

-uint64_t
+hrtime_t
 zio_handle_io_delay(zio_t *zio)
 {
     vdev_t *vd = zio->io_vd;
+    inject_handler_t *min_handler = NULL;
+    hrtime_t min_target = 0;
     inject_handler_t *handler;
-    uint64_t seconds = 0;
-
-    if (zio_injection_enabled == 0)
-        return (0);
+    hrtime_t idle;
+    hrtime_t busy;
+    hrtime_t target;

     rw_enter(&inject_lock, RW_READER);

-    for (handler = list_head(&inject_handlers); handler != NULL;
-        handler = list_next(&inject_handlers, handler)) {
+    /*
+     * inject_delay_count is a subset of zio_injection_enabled that
+     * is only incremented for delay handlers. These checks are
+     * mainly added to remind the reader why we're not explicitly
+     * checking zio_injection_enabled like the other functions.
+     */
+    IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
+    IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);
+
+    /*
+     * If there aren't any inject delay handlers registered, then we
+     * can short circuit and simply return 0 here. A value of zero
+     * informs zio_delay_interrupt() that this request should not be
+     * delayed. This short circuit keeps us from acquiring the
+     * inject_delay_mutex unnecessarily.
+     */
+    if (inject_delay_count == 0) {
+        rw_exit(&inject_lock);
+        return (0);
+    }
+
+    /*
+     * Each inject handler has a number of "lanes" associated with
+     * it. Each lane is able to handle requests independently of one
+     * another, and at a latency defined by the inject handler
+     * record's zi_timer field. Thus if a handler in configured with
+     * a single lane with a 10ms latency, it will delay requests
+     * such that only a single request is completed every 10ms. So,
+     * if more than one request is attempted per each 10ms interval,
+     * the average latency of the requests will be greater than
+     * 10ms; but if only a single request is submitted each 10ms
+     * interval the average latency will be 10ms.
+     *
+     * We need to acquire this mutex to prevent multiple concurrent
+     * threads being assigned to the same lane of a given inject
+     * handler. The mutex allows us to perform the following two
+     * operations atomically:
+     *
+     *	1. determine the minimum handler and minimum target
+     *	   value of all the possible handlers
+     *	2. update that minimum handler's lane array
+     *
+     * Without atomicity, two (or more) threads could pick the same
+     * lane in step (1), and then conflict with each other in step
+     * (2). This could allow a single lane handler to process
+     * multiple requests simultaneously, which shouldn't be possible.
+     */
+    mutex_enter(&inject_delay_mtx);
+
+    for (handler = list_head(&inject_handlers);
+        handler != NULL; handler = list_next(&inject_handlers, handler)) {
         if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
             continue;
@@ -384,14 +471,101 @@ zio_handle_io_delay(zio_t *zio)
             continue;
         }

-        if (vd->vdev_guid == handler->zi_record.zi_guid) {
-            seconds = handler->zi_record.zi_timer;
-            break;
+        if (vd->vdev_guid != handler->zi_record.zi_guid)
+            continue;
+
+        /*
+         * Defensive; should never happen as the array allocation
+         * occurs prior to inserting this handler on the list.
+         */
+        ASSERT3P(handler->zi_lanes, !=, NULL);
+
+        /*
+         * This should never happen, the zinject command should
+         * prevent a user from setting an IO delay with zero lanes.
+         */
+        ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
+
+        ASSERT3U(handler->zi_record.zi_nlanes, >,
+            handler->zi_next_lane);
+
+        /*
+         * We want to issue this IO to the lane that will become
+         * idle the soonest, so we compare the soonest this
+         * specific handler can complete the IO with all other
+         * handlers, to find the lowest value of all possible
+         * lanes. We then use this lane to submit the request.
+         *
+         * Since each handler has a constant value for its
+         * delay, we can just use the "next" lane for that
+         * handler; as it will always be the lane with the
+         * lowest value for that particular handler (i.e. the
+         * lane that will become idle the soonest). This saves a
+         * scan of each handler's lanes array.
+         *
+         * There's two cases to consider when determining when
+         * this specific IO request should complete. If this
+         * lane is idle, we want to "submit" the request now so
+         * it will complete after zi_timer milliseconds. Thus,
+         * we set the target to now + zi_timer.
+         *
+         * If the lane is busy, we want this request to complete
+         * zi_timer milliseconds after the lane becomes idle.
+         * Since the 'zi_lanes' array holds the time at which
+         * each lane will become idle, we use that value to
+         * determine when this request should complete.
+         */
+        idle = handler->zi_record.zi_timer + gethrtime();
+        busy = handler->zi_record.zi_timer +
+            handler->zi_lanes[handler->zi_next_lane];
+        target = MAX(idle, busy);
+
+        if (min_handler == NULL) {
+            min_handler = handler;
+            min_target = target;
+            continue;
         }
+
+        ASSERT3P(min_handler, !=, NULL);
+        ASSERT3U(min_target, !=, 0);
+
+        /*
+         * We don't yet increment the "next lane" variable since
+         * we still might find a lower value lane in another
+         * handler during any remaining iterations. Once we're
+         * sure we've selected the absolute minimum, we'll claim
+         * the lane and increment the handler's "next lane"
+         * field below.
+         */
+
+        if (target < min_target) {
+            min_handler = handler;
+            min_target = target;
+        }
     }
+
+    /*
+     * 'min_handler' will be NULL if no IO delays are registered for
+     * this vdev, otherwise it will point to the handler containing
+     * the lane that will become idle the soonest.
+     */
+    if (min_handler != NULL) {
+        ASSERT3U(min_target, !=, 0);
+        min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;
+
+        /*
+         * If we've used all possible lanes for this handler,
+         * loop back and start using the first lane again;
+         * otherwise, just increment the lane index.
+         */
+        min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
+            min_handler->zi_record.zi_nlanes;
+    }
+
+    mutex_exit(&inject_delay_mtx);
     rw_exit(&inject_lock);
-    return (seconds);
+
+    return (min_target);
 }

 /*
@@ -415,6 +589,24 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
         if ((error = spa_reset(name)) != 0)
             return (error);

+    if (record->zi_cmd == ZINJECT_DELAY_IO) {
+        /*
+         * A value of zero for the number of lanes or for the
+         * delay time doesn't make sense.
+         */
+        if (record->zi_timer == 0 || record->zi_nlanes == 0)
+            return (SET_ERROR(EINVAL));
+
+        /*
+         * The number of lanes is directly mapped to the size of
+         * an array used by the handler. Thus, to ensure the
+         * user doesn't trigger an allocation that's "too large"
+         * we cap the number of lanes here.
+         */
+        if (record->zi_nlanes >= UINT16_MAX)
+            return (SET_ERROR(EINVAL));
+    }
+
     if (!(flags & ZINJECT_NULL)) {
         /*
          * spa_inject_ref() will add an injection reference, which will
@@ -426,11 +618,34 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)

         handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);

-        rw_enter(&inject_lock, RW_WRITER);
-
-        *id = handler->zi_id = inject_next_id++;
         handler->zi_spa = spa;
         handler->zi_record = *record;
+
+        if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+            handler->zi_lanes = kmem_zalloc(
+                sizeof (*handler->zi_lanes) *
+                handler->zi_record.zi_nlanes, KM_SLEEP);
+            handler->zi_next_lane = 0;
+        } else {
+            handler->zi_lanes = NULL;
+            handler->zi_next_lane = 0;
+        }
+
+        rw_enter(&inject_lock, RW_WRITER);
+
+        /*
+         * We can't move this increment into the conditional
+         * above because we need to hold the RW_WRITER lock of
+         * inject_lock, and we don't want to hold that while
+         * allocating the handler's zi_lanes array.
+         */
+        if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+            ASSERT3S(inject_delay_count, >=, 0);
+            inject_delay_count++;
+            ASSERT3S(inject_delay_count, >, 0);
+        }
+
+        *id = handler->zi_id = inject_next_id++;
         list_insert_tail(&inject_handlers, handler);
         atomic_inc_32(&zio_injection_enabled);
@@ -508,9 +723,23 @@ zio_clear_fault(int id)
         return (SET_ERROR(ENOENT));
     }

+    if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+        ASSERT3S(inject_delay_count, >, 0);
+        inject_delay_count--;
+        ASSERT3S(inject_delay_count, >=, 0);
+    }
+
     list_remove(&inject_handlers, handler);
     rw_exit(&inject_lock);

+    if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+        ASSERT3P(handler->zi_lanes, !=, NULL);
+        kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
+            handler->zi_record.zi_nlanes);
+    } else {
+        ASSERT3P(handler->zi_lanes, ==, NULL);
+    }
+
     spa_inject_delref(handler->zi_spa);
     kmem_free(handler, sizeof (inject_handler_t));
     atomic_dec_32(&zio_injection_enabled);
@@ -522,6 +751,7 @@ void
 zio_inject_init(void)
 {
     rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
+    mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
     list_create(&inject_handlers, sizeof (inject_handler_t),
         offsetof(inject_handler_t, zi_link));
 }
@@ -530,6 +760,7 @@ void
 zio_inject_fini(void)
 {
     list_destroy(&inject_handlers);
+    mutex_destroy(&inject_delay_mtx);
     rw_destroy(&inject_lock);
 }
@@ -38,10 +38,7 @@ verify_disk_count "$DISKS" 2
 default_mirror_setup_noexit $DISK1 $DISK2

 mntpnt=$(get_prop mountpoint $TESTPOOL)
-typeset -i i=0
-while ((i < 10)); do
-	log_must $DD if=/dev/urandom of=$mntpnt/bigfile.$i bs=1024k count=100
-	((i += 1))
-done
+
+# Create 100MB of data
+log_must $FILE_WRITE -b 1048576 -c 100 -o create -d 0 -f $mntpnt/bigfile
 log_pass
@@ -38,16 +38,15 @@
 # 3. Verify zpool scrub -s succeed when the system is scrubbing.
 #
 # NOTES:
-#	A 1 second delay is added to 10% of zio's in order to ensure that
-#	the scrub does not complete before it has a chance to be cancelled.
+#	A 10ms delay is added to the ZIOs in order to ensure that the
+#	scrub does not complete before it has a chance to be cancelled.
 #	This can occur when testing with small pools or very fast hardware.
 #

 verify_runnable "global"

 log_assert "Verify scrub -s works correctly."

-log_must $ZINJECT -d $DISK1 -f10 -D1 $TESTPOOL
+log_must $ZINJECT -d $DISK1 -D10:1 $TESTPOOL
 log_must $ZPOOL scrub $TESTPOOL
 log_must $ZPOOL scrub -s $TESTPOOL
 log_must is_pool_scrub_stopped $TESTPOOL
@@ -40,8 +40,8 @@
 # 4. Check the percent again, verify a new scrub started.
 #
 # NOTES:
-#	A 1 second delay is added to 10% of zio's in order to ensure that
-#	the scrub does not complete before it has a chance to be restarted.
+#	A 10ms delay is added to the ZIOs in order to ensure that the
+#	scrub does not complete before it has a chance to be restarted.
 #	This can occur when testing with small pools or very fast hardware.
 #

@@ -61,7 +61,7 @@ function get_scrub_percent
 log_assert "scrub command terminates the existing scrub process and starts" \
     "a new scrub."

-log_must $ZINJECT -d $DISK1 -f10 -D1 $TESTPOOL
+log_must $ZINJECT -d $DISK1 -D10:1 $TESTPOOL
 log_must $ZPOOL scrub $TESTPOOL
 typeset -i PERCENT=30 percent=0
 while ((percent < PERCENT)) ; do
@@ -42,16 +42,16 @@
 # 3. Verify scrub failed until the resilver completed
 #
 # NOTES:
-#	A 1 second delay is added to 10% of zio's in order to ensure that
-#	the resilver does not complete before the scrub can be issue. This
+#	A 10ms delay is added to 10% of zio's in order to ensure that the
+#	resilver does not complete before the scrub can be issued. This
 #	can occur when testing with small pools or very fast hardware.

 verify_runnable "global"

 log_assert "Resilver prevent scrub from starting until the resilver completes"

-log_must $ZINJECT -d $DISK1 -f10 -D1 $TESTPOOL
 log_must $ZPOOL detach $TESTPOOL $DISK2
+log_must $ZINJECT -d $DISK1 -D10:1 $TESTPOOL
 log_must $ZPOOL attach $TESTPOOL $DISK1 $DISK2
 log_must is_pool_resilvering $TESTPOOL
 log_mustnot $ZPOOL scrub $TESTPOOL