diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index 54740104aa..393e50a28e 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -291,8 +291,8 @@ usage(void) "\t\tspecified by the remaining tuple. Each number is in\n" "\t\thexadecimal, and only one block can be specified.\n" "\n" - "\tzinject [-q] <-t type> [-e errno] [-l level] [-r range]\n" - "\t [-a] [-m] [-u] [-f freq] \n" + "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n" + "\t\t[-r range] [-a] [-m] [-u] [-f freq] \n" "\n" "\t\tInject an error into the object specified by the '-t' option\n" "\t\tand the object descriptor. The 'object' parameter is\n" @@ -300,7 +300,10 @@ usage(void) "\n" "\t\t-q\tQuiet mode. Only print out the handler number added.\n" "\t\t-e\tInject a specific error. Must be one of 'io',\n" - "\t\t\t'checksum', 'decompress', or decrypt. Default is 'io'.\n" + "\t\t\t'checksum', 'decompress', or 'decrypt'. Default is 'io'.\n" + "\t\t-C\tInject the given error only into specific DVAs. The\n" + "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n" + "\t\t\tseparated by commas (ex. '0,2').\n" "\t\t-l\tInject error at a particular block level. Default is " "0.\n" "\t\t-m\tAutomatically remount underlying filesystem.\n" @@ -361,17 +364,20 @@ print_data_handler(int id, const char *pool, zinject_record_t *record, return (0); if (*count == 0) { - (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-15s\n", - "ID", "POOL", "OBJSET", "OBJECT", "TYPE", "LVL", "RANGE"); + (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s " + "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE", + "LVL", "DVAs", "RANGE"); (void) printf("--- --------------- ------ " - "------ -------- --- ---------------\n"); + "------ -------- --- ---- ---------------\n"); } *count += 1; - (void) printf("%3d %-15s %-6llu %-6llu %-8s %3d ", id, pool, - (u_longlong_t)record->zi_objset, (u_longlong_t)record->zi_object, - type_to_name(record->zi_type), record->zi_level); + (void) printf("%3d %-15s %-6llu %-6llu %-8s %-3d 0x%02x ", + id, pool, (u_longlong_t)record->zi_objset, + (u_longlong_t)record->zi_object, type_to_name(record->zi_type), + record->zi_level, record->zi_dvas); + if (record->zi_start == 0 && record->zi_end == -1ULL) @@ -602,6 +608,7 @@ register_handler(const char *pool, int flags, zinject_record_t *record, (void) printf(" range: [%llu, %llu)\n", (u_longlong_t)record->zi_start, (u_longlong_t)record->zi_end); + (void) printf(" dvas: 0x%x\n", record->zi_dvas); } } @@ -674,6 +681,59 @@ parse_frequency(const char *str, uint32_t *percent) return (0); } +/* + * This function converts a string specifier for DVAs into a bit mask. + * The dva's provided by the user should be 0 indexed and separated by + * a comma. 
For example: + * "1" -> 0b0010 (0x2) + * "0,1" -> 0b0011 (0x3) + * "0,1,2" -> 0b0111 (0x7) + */ +static int +parse_dvas(const char *str, uint32_t *dvas_out) +{ + const char *c = str; + uint32_t mask = 0; + boolean_t need_delim = B_FALSE; + + /* max string length is 5 ("0,1,2") */ + if (strlen(str) > 5 || strlen(str) == 0) + return (EINVAL); + + while (*c != '\0') { + switch (*c) { + case '0': + case '1': + case '2': + /* check for pipe between DVAs */ + if (need_delim) + return (EINVAL); + + /* check if this DVA has been set already */ + if (mask & (1 << ((*c) - '0'))) + return (EINVAL); + + mask |= (1 << ((*c) - '0')); + need_delim = B_TRUE; + break; + case ',': + need_delim = B_FALSE; + break; + default: + /* check for invalid character */ + return (EINVAL); + } + c++; + } + + /* check for dangling delimiter */ + if (!need_delim) + return (EINVAL); + + *dvas_out = mask; + return (0); +} + int main(int argc, char **argv) { @@ -700,6 +760,7 @@ main(int argc, char **argv) int dur_secs = 0; int ret; int flags = 0; + uint32_t dvas = 0; if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s", libzfs_error_init(errno)); @@ -730,7 +791,7 @@ main(int argc, char **argv) } while ((c = getopt(argc, argv, - ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { + ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { switch (c) { case 'a': flags |= ZINJECT_FLUSH_ARC; @@ -754,6 +815,17 @@ main(int argc, char **argv) case 'c': cancel = optarg; break; + case 'C': + ret = parse_dvas(optarg, &dvas); + if (ret != 0) { + (void) fprintf(stderr, "invalid DVA list '%s': " + "DVAs should be 0 indexed and separated by " + "commas.\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; case 'd': device = optarg; break; @@ -937,7 +1009,7 @@ main(int argc, char **argv) */ if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || - record.zi_freq > 0) { + record.zi_freq > 0 || dvas != 0) { (void) fprintf(stderr, "cancel (-c) incompatible with " "any other options\n"); usage(); @@ -972,7 +1044,8 @@ main(int argc, char **argv) * for doing injection, so handle it separately here. 
*/ if (raw != NULL || range != NULL || type != TYPE_INVAL || - level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) { + level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || + dvas != 0) { (void) fprintf(stderr, "device (-d) incompatible with " "data error injection\n"); usage(); @@ -1020,7 +1093,7 @@ main(int argc, char **argv) } else if (raw != NULL) { if (range != NULL || type != TYPE_INVAL || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || - record.zi_freq > 0) { + record.zi_freq > 0 || dvas != 0) { (void) fprintf(stderr, "raw (-b) format with " "any other options\n"); usage(); @@ -1055,7 +1128,8 @@ main(int argc, char **argv) error = EIO; } else if (record.zi_cmd == ZINJECT_PANIC) { if (raw != NULL || range != NULL || type != TYPE_INVAL || - level != 0 || device != NULL || record.zi_freq > 0) { + level != 0 || device != NULL || record.zi_freq > 0 || + dvas != 0) { (void) fprintf(stderr, "panic (-p) incompatible with " "other options\n"); usage(); @@ -1076,6 +1150,15 @@ main(int argc, char **argv) record.zi_type = atoi(argv[1]); dataset[0] = '\0'; } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) { + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || record.zi_freq > 0 || dvas != 0) { + (void) fprintf(stderr, "hardware failure (-I) " + "incompatible with other options\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + if (nowrites == 0) { (void) fprintf(stderr, "-s or -g meaningless " "without -I (ignore writes)\n"); @@ -1136,6 +1219,18 @@ main(int argc, char **argv) return (1); } + if (dvas != 0) { + if (error == EACCES || error == EINVAL) { + (void) fprintf(stderr, "the '-c' option may " + "not be used with logical data errors " + "'decrypt' and 'decompress'\n"); + libzfs_fini(g_zfs); + return (1); + } + + record.zi_dvas = dvas; + } + if (error == EACCES) { if (type != TYPE_DATA) { (void) fprintf(stderr, "decryption errors " diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index f8c65f581d..bb5b48c917 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -367,7 +367,7 @@ typedef struct zinject_record { uint64_t zi_timer; uint64_t zi_nlanes; uint32_t zi_cmd; - uint32_t zi_pad; + uint32_t zi_dvas; } zinject_record_t; #define ZINJECT_NULL 0x1 @@ -386,6 +386,8 @@ typedef struct zinject_record { #define ZI_PERCENTAGE_MIN 4294UL #define ZI_PERCENTAGE_MAX UINT32_MAX +#define ZI_NO_DVA (-1) + typedef enum zinject_type { ZINJECT_UNINITIALIZED, ZINJECT_DATA_FAULT, diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index e97db839b8..f02e78ca20 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -85,13 +85,13 @@ Simulate a hardware failure that fails to honor a cache flush. .B "zinject \-p \fIfunction\fB \fIpool\fB Panic inside the specified function. .TP -.B "zinject \-t data [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amq] \fIpath\fB" +.B "zinject \-t data [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amq] \fIpath\fB" Force an error into the contents of a file. .TP -.B "zinject \-t dnode [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-amq] \fIpath\fB" +.B "zinject \-t dnode [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-amq] \fIpath\fB" Force an error into the metadnode for a file or directory. 
.TP -.B "zinject \-t \fImos_type\fB [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amqu] \fIpool\fB" +.B "zinject \-t \fImos_type\fB [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amqu] \fIpool\fB" Force an error into the MOS of a pool. .SH OPTIONS .TP @@ -102,6 +102,14 @@ Flush the ARC before injection. Force an error into the pool at this bookmark tuple. Each number is in hexadecimal, and only one block can be specified. .TP +.BI "\-C" " dvas" +Inject the given error only into specific DVAs. The mask should be +specified as a list of 0-indexed DVAs separated by commas (ex. '0,2'). This +option is not applicable to logical data errors such as +.BR "decompress" +and +.BR "decrypt" . +.TP .BI "\-d" " vdev" A vdev specified by path or GUID. .TP diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 3180ce65ab..eee122aa6d 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -236,24 +236,43 @@ typedef enum { */ typedef struct scan_io { /* fields from blkptr_t */ - uint64_t sio_offset; uint64_t sio_blk_prop; uint64_t sio_phys_birth; uint64_t sio_birth; zio_cksum_t sio_cksum; - uint32_t sio_asize; + uint32_t sio_nr_dvas; /* fields from zio_t */ - int sio_flags; + uint32_t sio_flags; zbookmark_phys_t sio_zb; /* members for queue sorting */ union { - avl_node_t sio_addr_node; /* link into issueing queue */ + avl_node_t sio_addr_node; /* link into issuing queue */ list_node_t sio_list_node; /* link for issuing to disk */ } sio_nodes; + + /* + * There may be up to SPA_DVAS_PER_BP DVAs here from the bp, + * depending on how many were in the original bp. Only the + * first DVA is really used for sorting and issuing purposes. + * The other DVAs (if provided) simply exist so that the zio + * layer can find additional copies to repair from in the + * event of an error. This array must go at the end of the + * struct to allow this for the variable number of elements. 
+ */ + dva_t sio_dva[0]; } scan_io_t; +#define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x) +#define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x) +#define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0]) +#define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0]) +#define SIO_GET_END_OFFSET(sio) \ + (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio)) +#define SIO_GET_MUSED(sio) \ + (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t))) + struct dsl_scan_io_queue { dsl_scan_t *q_scn; /* associated dsl_scan_t */ vdev_t *q_vd; /* top-level vdev that this queue represents */ @@ -262,6 +281,7 @@ struct dsl_scan_io_queue { range_tree_t *q_exts_by_addr; avl_tree_t q_exts_by_size; avl_tree_t q_sios_by_addr; + uint64_t q_sio_memused; /* members for zio rate limiting */ uint64_t q_maxinflight_bytes; @@ -300,7 +320,27 @@ static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); static void scan_io_queues_destroy(dsl_scan_t *scn); -static kmem_cache_t *sio_cache; +static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP]; + +/* sio->sio_nr_dvas must be set so we know which cache to free from */ +static void +sio_free(scan_io_t *sio) +{ + ASSERT3U(sio->sio_nr_dvas, >, 0); + ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); + + kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio); +} + +/* It is up to the caller to set sio->sio_nr_dvas for freeing */ +static scan_io_t * +sio_alloc(unsigned short nr_dvas) +{ + ASSERT3U(nr_dvas, >, 0); + ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP); + + return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP)); +} void scan_init(void) @@ -315,14 +355,22 @@ scan_init(void) */ fill_weight = zfs_scan_fill_weight; - sio_cache = kmem_cache_create("sio_cache", - sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + for (int i = 0; i < SPA_DVAS_PER_BP; i++) { + char name[36]; + + (void) sprintf(name, "sio_cache_%d", i); + sio_cache[i] = kmem_cache_create(name, + (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))), + 0, NULL, NULL, NULL, NULL, NULL, 0); + } } void scan_fini(void) { - kmem_cache_destroy(sio_cache); + for (int i = 0; i < SPA_DVAS_PER_BP; i++) { + kmem_cache_destroy(sio_cache[i]); + } } static inline boolean_t @@ -339,29 +387,39 @@ dsl_scan_resilvering(dsl_pool_t *dp) } static inline void -sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id) +sio2bp(const scan_io_t *sio, blkptr_t *bp) { bzero(bp, sizeof (*bp)); - DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize); - DVA_SET_VDEV(&bp->blk_dva[0], vdev_id); - DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset); bp->blk_prop = sio->sio_blk_prop; bp->blk_phys_birth = sio->sio_phys_birth; bp->blk_birth = sio->sio_birth; bp->blk_fill = 1; /* we always only work with data pointers */ bp->blk_cksum = sio->sio_cksum; + + ASSERT3U(sio->sio_nr_dvas, >, 0); + ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); + + bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t)); } static inline void bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) { - /* we discard the vdev id, since we can deduce it from the queue */ - sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]); - sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]); sio->sio_blk_prop = bp->blk_prop; sio->sio_phys_birth = bp->blk_phys_birth; sio->sio_birth = bp->blk_birth; sio->sio_cksum = bp->blk_cksum; + sio->sio_nr_dvas = BP_GET_NDVAS(bp); + + /* + * Copy the DVAs to the sio. 
We need all copies of the block so + * that the self healing code can use the alternate copies if the + * first is corrupted. We want the DVA at index dva_i to be first + * in the sio since this is the primary one that we want to issue. + */ + for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) { + sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas]; + } } int @@ -1176,11 +1234,9 @@ dsl_scan_should_clear(dsl_scan_t *scn) mutex_enter(&tvd->vdev_scan_io_queue_lock); queue = tvd->vdev_scan_io_queue; if (queue != NULL) { - /* #extents in exts_by_size = # in exts_by_addr */ + /* # extents in exts_by_size = # in exts_by_addr */ mused += avl_numnodes(&queue->q_exts_by_size) * - sizeof (range_seg_t) + - avl_numnodes(&queue->q_sios_by_addr) * - sizeof (scan_io_t); + sizeof (range_seg_t) + queue->q_sio_memused; } mutex_exit(&tvd->vdev_scan_io_queue_lock); } @@ -2693,13 +2749,13 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) break; } - sio2bp(sio, &bp, queue->q_vd->vdev_id); - bytes_issued += sio->sio_asize; + sio2bp(sio, &bp); + bytes_issued += SIO_GET_ASIZE(sio); scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, &sio->sio_zb, queue); (void) list_remove_head(io_list); scan_io_queues_update_zio_stats(queue, &bp); - kmem_cache_free(sio_cache, sio); + sio_free(sio); } atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); @@ -2717,7 +2773,7 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) static boolean_t scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) { - scan_io_t srch_sio, *sio, *next_sio; + scan_io_t *srch_sio, *sio, *next_sio; avl_index_t idx; uint_t num_sios = 0; int64_t bytes_issued = 0; @@ -2725,24 +2781,30 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) ASSERT(rs != NULL); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - srch_sio.sio_offset = rs->rs_start; + srch_sio = sio_alloc(1); + srch_sio->sio_nr_dvas = 1; + SIO_SET_OFFSET(srch_sio, rs->rs_start); /* * The exact start of the extent might not contain any matching zios, * so if that's the case, examine the next one in the tree. */ - sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx); + sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); + sio_free(srch_sio); + if (sio == NULL) sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); - while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) { - ASSERT3U(sio->sio_offset, >=, rs->rs_start); - ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end); + while (sio != NULL && + SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) { + ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start); + ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end); next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); avl_remove(&queue->q_sios_by_addr, sio); + queue->q_sio_memused -= SIO_GET_MUSED(sio); - bytes_issued += sio->sio_asize; + bytes_issued += SIO_GET_ASIZE(sio); num_sios++; list_insert_tail(list, sio); sio = next_sio; @@ -2754,11 +2816,11 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) * in the segment we update it to reflect the work we were able to * complete. Otherwise, we remove it from the range tree entirely. 
*/ - if (sio != NULL && sio->sio_offset < rs->rs_end) { + if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) { range_tree_adjust_fill(queue->q_exts_by_addr, rs, -bytes_issued); range_tree_resize_segment(queue->q_exts_by_addr, rs, - sio->sio_offset, rs->rs_end - sio->sio_offset); + SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio)); return (B_TRUE); } else { @@ -2862,9 +2924,9 @@ scan_io_queues_run_one(void *arg) first_sio = list_head(&sio_list); last_sio = list_tail(&sio_list); - seg_end = last_sio->sio_offset + last_sio->sio_asize; + seg_end = SIO_GET_END_OFFSET(last_sio); if (seg_start == 0) - seg_start = first_sio->sio_offset; + seg_start = SIO_GET_OFFSET(first_sio); /* * Issuing sios can take a long time so drop the @@ -3567,10 +3629,23 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) { int i; - /* update the spa's stats on how many bytes we have issued */ - for (i = 0; i < BP_GET_NDVAS(bp); i++) { + /* + * Update the spa's stats on how many bytes we have issued. + * Sequential scrubs create a zio for each DVA of the bp. Each + * of these will include all DVAs for repair purposes, but the + * zio code will only try the first one unless there is an issue. + * Therefore, we should only count the first DVA for these IOs. + */ + if (scn->scn_is_sorted) { atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued, - DVA_GET_ASIZE(&bp->blk_dva[i])); + DVA_GET_ASIZE(&bp->blk_dva[0])); + } else { + spa_t *spa = scn->scn_dp->dp_spa; + + for (i = 0; i < BP_GET_NDVAS(bp); i++) { + atomic_add_64(&spa->spa_scan_pass_issued, + DVA_GET_ASIZE(&bp->blk_dva[i])); + } } /* @@ -3625,7 +3700,7 @@ static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) { avl_index_t idx; - int64_t asize = sio->sio_asize; + int64_t asize = SIO_GET_ASIZE(sio); dsl_scan_t *scn = queue->q_scn; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); @@ -3633,11 +3708,12 @@ scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { /* block is already scheduled for reading */ atomic_add_64(&scn->scn_bytes_pending, -asize); - kmem_cache_free(sio_cache, sio); + sio_free(sio); return; } avl_insert(&queue->q_sios_by_addr, sio, idx); - range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize); + queue->q_sio_memused += SIO_GET_MUSED(sio); + range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize); } /* @@ -3651,7 +3727,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, int zio_flags, const zbookmark_phys_t *zb) { dsl_scan_t *scn = queue->q_scn; - scan_io_t *sio = kmem_cache_alloc(sio_cache, KM_SLEEP); + scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp)); ASSERT0(BP_IS_GANG(bp)); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); @@ -3665,7 +3741,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, * get an integer underflow in case the worker processes the * zio before we get to incrementing this counter. */ - atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize); + atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio)); scan_io_queue_insert_impl(queue, sio); } @@ -3905,11 +3981,7 @@ sio_addr_compare(const void *x, const void *y) { const scan_io_t *a = x, *b = y; - if (a->sio_offset < b->sio_offset) - return (-1); - if (a->sio_offset == b->sio_offset) - return (0); - return (1); + return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); } /* IO queues are created on demand when they are needed. 
*/ @@ -3921,6 +3993,7 @@ scan_io_queue_create(vdev_t *vd) q->q_scn = scn; q->q_vd = vd; + q->q_sio_memused = 0; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap); @@ -3948,11 +4021,13 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != NULL) { ASSERT(range_tree_contains(queue->q_exts_by_addr, - sio->sio_offset, sio->sio_asize)); - bytes_dequeued += sio->sio_asize; - kmem_cache_free(sio_cache, sio); + SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio))); + bytes_dequeued += SIO_GET_ASIZE(sio); + queue->q_sio_memused -= SIO_GET_MUSED(sio); + sio_free(sio); } + ASSERT0(queue->q_sio_memused); atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued); range_tree_vacate(queue->q_exts_by_addr, NULL, queue); range_tree_destroy(queue->q_exts_by_addr); @@ -4007,7 +4082,7 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) vdev_t *vdev; kmutex_t *q_lock; dsl_scan_io_queue_t *queue; - scan_io_t srch, *sio; + scan_io_t *srch_sio, *sio; avl_index_t idx; uint64_t start, size; @@ -4022,9 +4097,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) return; } - bp2sio(bp, &srch, dva_i); - start = srch.sio_offset; - size = srch.sio_asize; + srch_sio = sio_alloc(BP_GET_NDVAS(bp)); + bp2sio(bp, srch_sio, dva_i); + start = SIO_GET_OFFSET(srch_sio); + size = SIO_GET_ASIZE(srch_sio); /* * We can find the zio in two states: @@ -4044,15 +4120,18 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) * be done with issuing the zio's it gathered and will * signal us. */ - sio = avl_find(&queue->q_sios_by_addr, &srch, &idx); + sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); + sio_free(srch_sio); + if (sio != NULL) { - int64_t asize = sio->sio_asize; + int64_t asize = SIO_GET_ASIZE(sio); blkptr_t tmpbp; /* Got it while it was cold in the queue */ - ASSERT3U(start, ==, sio->sio_offset); + ASSERT3U(start, ==, SIO_GET_OFFSET(sio)); ASSERT3U(size, ==, asize); avl_remove(&queue->q_sios_by_addr, sio); + queue->q_sio_memused -= SIO_GET_MUSED(sio); ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); range_tree_remove_fill(queue->q_exts_by_addr, start, size); @@ -4065,10 +4144,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) atomic_add_64(&scn->scn_bytes_pending, -asize); /* count the block as though we issued it */ - sio2bp(sio, &tmpbp, dva_i); + sio2bp(sio, &tmpbp); count_block(scn, dp->dp_blkstats, &tmpbp); - kmem_cache_free(sio_cache, sio); + sio_free(sio); } mutex_exit(q_lock); } diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index a92d956cda..a095e09779 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -254,10 +254,35 @@ vdev_mirror_map_init(zio_t *zio) if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dva_t dva_copy[SPA_DVAS_PER_BP]; c = BP_GET_NDVAS(zio->io_bp); + /* + * The sequential scrub code sorts and issues all DVAs + * of a bp separately. Each of these IOs includes all + * original DVA copies so that repairs can be performed + * in the event of an error, but we only actually want + * to check the first DVA since the others will be + * checked by their respective sorted IOs. Only if we + * hit an error will we try all DVAs upon retrying. 
+ * + * Note: This check is safe even if the user switches + * from a legacy scrub to a sequential one in the middle + * of processing, since scn_is_sorted isn't updated until + * all outstanding IOs from the previous scrub pass + * complete. + */ + if ((zio->io_flags & ZIO_FLAG_SCRUB) && + !(zio->io_flags & ZIO_FLAG_IO_RETRY) && + dsl_scan_scrubbing(spa->spa_dsl_pool) && + scn->scn_is_sorted) { + c = 1; + } else { + c = BP_GET_NDVAS(zio->io_bp); + } + /* * If we do not trust the pool config, some DVAs might be * invalid or point to vdevs that do not exist. We skip them. diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 7a7401ecda..78896d3dc3 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -124,7 +124,7 @@ freq_triggered(uint32_t frequency) * Returns true if the given record matches the I/O in progress. */ static boolean_t -zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, +zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva, zinject_record_t *record, int error) { /* @@ -148,8 +148,10 @@ zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, zb->zb_level == record->zi_level && zb->zb_blkid >= record->zi_start && zb->zb_blkid <= record->zi_end && - error == record->zi_error) + (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) && + error == record->zi_error) { return (freq_triggered(record->zi_freq)); + } return (B_FALSE); } @@ -199,7 +201,8 @@ zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT) continue; - if (zio_match_handler(zb, type, &handler->zi_record, error)) { + if (zio_match_handler(zb, type, ZI_NO_DVA, + &handler->zi_record, error)) { ret = error; break; } @@ -209,6 +212,37 @@ zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, return (ret); } +/* + * If this is a physical I/O for a vdev child determine which DVA it is + * for. We iterate backwards through the DVAs matching on the offset so + * that we end up with ZI_NO_DVA (-1) if we don't find a match. + */ +static int +zio_match_dva(zio_t *zio) +{ + int i = ZI_NO_DVA; + + if (zio->io_bp != NULL && zio->io_vd != NULL && + zio->io_child_type == ZIO_CHILD_VDEV) { + for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) { + dva_t *dva = &zio->io_bp->blk_dva[i]; + uint64_t off = DVA_GET_OFFSET(dva); + vdev_t *vd = vdev_lookup_top(zio->io_spa, + DVA_GET_VDEV(dva)); + + /* Compensate for vdev label added to leaves */ + if (zio->io_vd->vdev_ops->vdev_op_leaf) + off += VDEV_LABEL_START_SIZE; + + if (zio->io_vd == vd && zio->io_offset == off) + break; + } + } + + return (i); +} + + /* * Determine if the I/O in question should return failure. Returns the errno * to be returned to the caller. @@ -235,15 +269,14 @@ zio_handle_fault_injection(zio_t *zio, int error) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - if (zio->io_spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) continue; - /* If this handler matches, return EIO */ + /* If this handler matches, return the specified error */ if (zio_match_handler(&zio->io_logical->io_bookmark, zio->io_bp ? 
BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, - &handler->zi_record, error)) { + zio_match_dva(zio), &handler->zi_record, error)) { ret = error; break; } diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 8a3b4d4eea..93f0c03aac 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -444,7 +444,7 @@ tags = ['functional', 'cli_root', 'zpool_resilver'] tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', - 'zpool_scrub_offline_device'] + 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies'] tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/Makefile.am index 69f4b5d3ba..e2dfd9d64c 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/Makefile.am @@ -9,7 +9,8 @@ dist_pkgdata_SCRIPTS = \ zpool_scrub_005_pos.ksh \ zpool_scrub_encrypted_unloaded.ksh \ zpool_scrub_offline_device.ksh \ - zpool_scrub_print_repairing.ksh + zpool_scrub_print_repairing.ksh \ + zpool_scrub_multiple_copies.ksh dist_pkgdata_DATA = \ zpool_scrub.cfg diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh new file mode 100755 index 0000000000..2dd33c99c5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh @@ -0,0 +1,77 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 Datto, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Scrubs and self-healing should be able to repair data from additional +# copies that may be stored. +# +# +# STRATEGY: +# 1. Create a dataset with copies=3 +# 2. Write a file to the dataset +# 3. zinject errors into the first and second DVAs of that file +# 4. Scrub and verify the scrub repaired all errors +# 7. Read the file normally to check that self healing also works +# 8. Remove the zinject handler +# 9. 
Scrub again and confirm 0 bytes were scrubbed +# + +verify_runnable "global" + +function cleanup +{ + destroy_dataset $TESTPOOL/$TESTFS2 + log_must zinject -c all +} +log_onexit cleanup + +log_assert "Scrubs and self healing must work with additional copies" + +log_must zfs create -o copies=3 $TESTPOOL/$TESTFS2 +typeset mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS2) +log_must mkfile 10m $mntpnt/file +log_must zpool sync $TESTPOOL + +log_must zinject -a -t data -C 0,1 -e io $mntpnt/file + +log_must zpool scrub $TESTPOOL +log_must wait_scrubbed $TESTPOOL + +log_must check_pool_status $TESTPOOL "scan" "with 0 errors" +log_must check_pool_status $TESTPOOL "errors" "No known data errors" + +log_must dd if=$mntpnt/file of=/dev/null bs=1M iflag=fullblock +log_must check_pool_status $TESTPOOL "errors" "No known data errors" + +log_must zinject -c all + +log_must zpool scrub $TESTPOOL +log_must wait_scrubbed $TESTPOOL + +zpool status + +log_must check_pool_status $TESTPOOL "errors" "No known data errors" +log_must check_pool_status $TESTPOOL "scan" "with 0 errors" +log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + +log_pass "Scrubs and self healing work with additional copies" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh index fdf315dea5..7a07e64334 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh @@ -49,7 +49,7 @@ verify_runnable "global" function cleanup { - poolexists $TESTPOOL && destroy_pool $TESTPOOL + poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2 log_must rm -f $DISK1 $DISK2 $DISK3 $DISK4 } @@ -103,31 +103,31 @@ log_must truncate -s $DEVSIZE $DISK1 log_must truncate -s $DEVSIZE $DISK2 log_must truncate -s $DEVSIZE $DISK3 log_must truncate -s $DEVSIZE $DISK4 -poolexists $TESTPOOL && destroy_pool $TESTPOOL -log_must zpool create -O mountpoint=$TESTDIR $TESTPOOL \ +poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2 +log_must zpool create -O mountpoint=$TESTDIR $TESTPOOL2 \ raidz2 $DISK1 $DISK2 $DISK3 $DISK4 # 2. Offline the first device -zpool_do_sync 'offline' $TESTPOOL $DISK1 +zpool_do_sync 'offline' $TESTPOOL2 $DISK1 # 3. Write to the pool log_must mkfile $FILESIZE "$TESTDIR/data.bin" # 4. Scrub the pool -zpool_scrub_sync $TESTPOOL +zpool_scrub_sync $TESTPOOL2 # 5. Online the first device and offline the second device -zpool_do_sync 'online' $TESTPOOL $DISK1 -zpool_do_sync 'offline' $TESTPOOL $DISK2 -log_must wait_for_resilver_end $TESTPOOL $RESILVER_TIMEOUT +zpool_do_sync 'online' $TESTPOOL2 $DISK1 +zpool_do_sync 'offline' $TESTPOOL2 $DISK2 +log_must wait_for_resilver_end $TESTPOOL2 $RESILVER_TIMEOUT # 6. Scrub the pool again -zpool_scrub_sync $TESTPOOL +zpool_scrub_sync $TESTPOOL2 # 7. Verify data integrity -cksum=$(zpool status $TESTPOOL | awk 'L{print $NF;L=0} /CKSUM$/{L=1}') +cksum=$(zpool status $TESTPOOL2 | awk 'L{print $NF;L=0} /CKSUM$/{L=1}') if [[ $cksum != 0 ]]; then - log_fail "Unexpected CKSUM errors found on $TESTPOOL ($cksum)" + log_fail "Unexpected CKSUM errors found on $TESTPOOL2 ($cksum)" fi log_pass "Scrubbing a pool with offline devices correctly preserves DTLs"
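
As a reference for the mask semantics introduced above, here is a minimal, self-contained C sketch of how a '-C' style DVA list such as "0,2" is reduced to a bit mask and how a handler-style check then consumes that mask. This is illustrative userland code only, not the zinject/zio_inject sources in this patch: parse_dva_mask, dva_matches, and NO_DVA are hypothetical names standing in for the roles of parse_dvas, the zi_dvas test in zio_match_handler, and ZI_NO_DVA.

/*
 * Standalone sketch: turn a comma-separated list of 0-indexed DVAs
 * into a bit mask, then decide whether a given DVA index is covered.
 * Names and exact edge-case behavior are illustrative, not ZFS code.
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdint.h>

#define	NO_DVA	(-1)	/* stand-in for "DVA could not be determined" */

/* Parse "0,2" -> 0x5; returns 0 on success, EINVAL on bad input. */
static int
parse_dva_mask(const char *str, uint32_t *mask_out)
{
	uint32_t mask = 0;
	int expect_digit = 1;

	/* longest valid input is "0,1,2" */
	if (strlen(str) == 0 || strlen(str) > 5)
		return (EINVAL);

	for (const char *c = str; *c != '\0'; c++) {
		if (*c >= '0' && *c <= '2') {
			/* reject adjacent digits and repeated DVAs */
			if (!expect_digit || (mask & (1U << (*c - '0'))))
				return (EINVAL);
			mask |= 1U << (*c - '0');
			expect_digit = 0;
		} else if (*c == ',') {
			/* reject leading or doubled commas */
			if (expect_digit)
				return (EINVAL);
			expect_digit = 1;
		} else {
			return (EINVAL);
		}
	}
	/* reject a dangling trailing comma */
	if (expect_digit)
		return (EINVAL);

	*mask_out = mask;
	return (0);
}

/* A mask of 0 means "no -C given", which matches every DVA. */
static int
dva_matches(uint32_t mask, int dva)
{
	return (mask == 0 || (dva != NO_DVA && (mask & (1U << dva))));
}

int
main(void)
{
	uint32_t mask;

	if (parse_dva_mask("0,2", &mask) == 0) {
		printf("mask = 0x%x\n", mask);	/* prints 0x5 */
		for (int dva = 0; dva < 3; dva++)
			printf("DVA %d injected: %s\n", dva,
			    dva_matches(mask, dva) ? "yes" : "no");
	}
	return (0);
}

Run as written, this prints mask = 0x5 and reports DVAs 0 and 2 as injected, matching the '0,2' example given in the usage text and the zinject.8 description of -C.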
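
Along the same lines, here is a minimal sketch of the allocation pattern behind the reworked scan_io_t: a trailing flexible array member sized by the DVA count, so the footprint charged to q_sio_memused is the base struct size plus one dva_t per copy, mirroring what SIO_GET_MUSED computes. sio_sketch_t, sio_sketch_alloc, and SIO_MUSED are illustrative stand-ins; the only detail carried over from the patch is the 16-byte dva_t.

/*
 * Standalone sketch: a variable-length record with a flexible array
 * member, and the size accounting that goes with it.  Not ZFS code.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef struct { uint64_t dva_word[2]; } dva_t;	/* 16 bytes, as on disk */

typedef struct sio_sketch {
	uint32_t sio_nr_dvas;
	uint32_t sio_flags;
	dva_t sio_dva[];	/* flexible array, like sio_dva[0] above */
} sio_sketch_t;

/* base size plus one dva_t per copy actually stored */
#define	SIO_MUSED(sio) \
	(sizeof (sio_sketch_t) + (sio)->sio_nr_dvas * sizeof (dva_t))

static sio_sketch_t *
sio_sketch_alloc(unsigned nr_dvas)
{
	sio_sketch_t *sio = malloc(sizeof (*sio) + nr_dvas * sizeof (dva_t));

	if (sio != NULL)
		sio->sio_nr_dvas = nr_dvas;
	return (sio);
}

int
main(void)
{
	for (unsigned n = 1; n <= 3; n++) {
		sio_sketch_t *sio = sio_sketch_alloc(n);

		if (sio == NULL)
			return (1);
		/* 1-, 2- and 3-DVA records get progressively larger */
		printf("%u DVA(s): %zu bytes\n", n, SIO_MUSED(sio));
		free(sio);
	}
	return (0);
}

Sizing each record to its actual DVA count is presumably why the patch creates one kmem cache per possible count rather than always reserving room for SPA_DVAS_PER_BP copies: single-copy blocks do not pay for two unused dva_t slots.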