From 3d6da72d183dc655a7dc8fd59f57748fc5c1806c Mon Sep 17 00:00:00 2001 From: Isaac Huang Date: Fri, 12 May 2017 18:28:03 -0600 Subject: [PATCH] Skip spurious resilver IO on raidz vdev On a raidz vdev, a block that does not span all child vdevs, excluding its skip sectors if any, may not be affected by a child vdev outage or failure. In such cases, the block does not need to be resilvered. However, current resilver algorithm simply resilvers all blocks on a degraded raidz vdev. Such spurious IO is not only wasteful, but also adds the risk of overwriting good data. This patch eliminates such spurious IOs. Reviewed-by: Gvozden Neskovic Reviewed-by: Brian Behlendorf Reviewed by: Matthew Ahrens Signed-off-by: Isaac Huang Closes #5316 --- include/sys/vdev.h | 1 + include/sys/vdev_impl.h | 2 ++ module/zfs/dsl_scan.c | 72 ++++++++++++++++++++++++++------------- module/zfs/vdev.c | 15 ++++++++ module/zfs/vdev_disk.c | 1 + module/zfs/vdev_file.c | 2 ++ module/zfs/vdev_mirror.c | 3 ++ module/zfs/vdev_missing.c | 2 ++ module/zfs/vdev_raidz.c | 59 ++++++++++++++++++++++++++------ module/zfs/vdev_root.c | 1 + 10 files changed, 125 insertions(+), 33 deletions(-) diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 4f54b1707c..63b4904c5e 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -65,6 +65,7 @@ extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, uint64_t txg, uint64_t size); extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); +extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done); extern boolean_t vdev_dtl_required(vdev_t *vd); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index c9e9bede97..835d2dbbf2 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -68,6 +68,7 @@ typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); +typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); @@ -78,6 +79,7 @@ typedef const struct vdev_ops { vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; vdev_state_change_func_t *vdev_op_state_change; + vdev_need_resilver_func_t *vdev_op_need_resilver; vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; char vdev_op_type[16]; diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f5ef2268d2..5b52681d8d 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -1836,12 +1836,51 @@ dsl_scan_scrub_done(zio_t *zio) mutex_exit(&spa->spa_scrub_lock); } +static boolean_t +dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + vdev_t *vd; + + if (DVA_GET_GANG(dva)) { + /* + * Gang members may be spread across multiple + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. + * XXX -- it would be better to change our + * allocation policy to ensure that all + * gang members reside on the same vdev. + */ + return (B_TRUE); + } + + vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + + /* + * Check if the txg falls within the range which must be + * resilvered. DVAs outside this range can always be skipped. + */ + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + + /* + * Check if the top-level vdev must resilver this offset. + * When the offset does not intersect with a dirty leaf DTL + * then it may be possible to skip the resilver IO. The psize + * is provided instead of asize to simplify the check for RAIDZ. + */ + if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) + return (B_FALSE); + + return (B_TRUE); +} + static int dsl_scan_scrub_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_phys_t *zb) { dsl_scan_t *scn = dp->dp_scan; - size_t size = BP_GET_PSIZE(bp); + size_t psize = BP_GET_PSIZE(bp); spa_t *spa = dp->dp_spa; uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); boolean_t needs_io = B_FALSE; @@ -1875,33 +1914,19 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, zio_flags |= ZIO_FLAG_SPECULATIVE; for (d = 0; d < BP_GET_NDVAS(bp); d++) { - vdev_t *vd = vdev_lookup_top(spa, - DVA_GET_VDEV(&bp->blk_dva[d])); + const dva_t *dva = &bp->blk_dva[d]; /* * Keep track of how much data we've examined so that * zpool(1M) status can make useful progress reports. */ - scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); - spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); + scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); + spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); /* if it's a resilver, this may not be in the target range */ - if (!needs_io) { - if (DVA_GET_GANG(&bp->blk_dva[d])) { - /* - * Gang members may be spread across multiple - * vdevs, so the best estimate we have is the - * scrub range, which has already been checked. - * XXX -- it would be better to change our - * allocation policy to ensure that all - * gang members reside on the same vdev. - */ - needs_io = B_TRUE; - } else { - needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, - phys_birth, 1); - } - } + if (!needs_io) + needs_io = dsl_scan_need_resilver(spa, dva, psize, + phys_birth); } if (needs_io && !zfs_no_scrub_io) { @@ -1922,8 +1947,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, delay(scan_delay); zio_nowait(zio_read(NULL, spa, bp, - abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done, - NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb)); + abd_alloc_for_io(psize, B_FALSE), + psize, dsl_scan_scrub_done, NULL, + ZIO_PRIORITY_SCRUB, zio_flags, zb)); } /* do not relocate this block */ diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index a71e678bbe..f44d338ef8 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1819,6 +1819,21 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) return (empty); } +/* + * Returns B_TRUE if vdev determines offset needs to be resilvered. + */ +boolean_t +vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +{ + ASSERT(vd != vd->vdev_spa->spa_root_vdev); + + if (vd->vdev_ops->vdev_op_need_resilver == NULL || + vd->vdev_ops->vdev_op_leaf) + return (B_TRUE); + + return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize)); +} + /* * Returns the lowest txg in the DTL range. */ diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 33b7f5d155..3fdf5642b1 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -796,6 +796,7 @@ vdev_ops_t vdev_disk_ops = { vdev_disk_io_start, vdev_disk_io_done, NULL, + NULL, vdev_disk_hold, vdev_disk_rele, VDEV_TYPE_DISK, /* name of this vdev type */ diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index c5e64520d3..13c32e0836 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -250,6 +250,7 @@ vdev_ops_t vdev_file_ops = { vdev_file_io_start, vdev_file_io_done, NULL, + NULL, vdev_file_hold, vdev_file_rele, VDEV_TYPE_FILE, /* name of this vdev type */ @@ -283,6 +284,7 @@ vdev_ops_t vdev_disk_ops = { vdev_file_io_start, vdev_file_io_done, NULL, + NULL, vdev_file_hold, vdev_file_rele, VDEV_TYPE_DISK, /* name of this vdev type */ diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 256431e6b3..15d1f204ff 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -615,6 +615,7 @@ vdev_ops_t vdev_mirror_ops = { vdev_mirror_state_change, NULL, NULL, + NULL, VDEV_TYPE_MIRROR, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -628,6 +629,7 @@ vdev_ops_t vdev_replacing_ops = { vdev_mirror_state_change, NULL, NULL, + NULL, VDEV_TYPE_REPLACING, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -641,6 +643,7 @@ vdev_ops_t vdev_spare_ops = { vdev_mirror_state_change, NULL, NULL, + NULL, VDEV_TYPE_SPARE, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c index 2287573342..d7d017fb8f 100644 --- a/module/zfs/vdev_missing.c +++ b/module/zfs/vdev_missing.c @@ -88,6 +88,7 @@ vdev_ops_t vdev_missing_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_MISSING, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -101,6 +102,7 @@ vdev_ops_t vdev_hole_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_HOLE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index c073f1374f..ba850b4f83 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -330,18 +330,18 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = { * is this functions only caller, as small as possible on the stack. */ noinline raidz_map_t * -vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, +vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t nparity) { raidz_map_t *rm; /* The starting RAIDZ (parent) vdev sector of the block. */ - uint64_t b = zio->io_offset >> unit_shift; + uint64_t b = zio->io_offset >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ - uint64_t s = zio->io_size >> unit_shift; + uint64_t s = zio->io_size >> ashift; /* The first column for this stripe. */ uint64_t f = b % dcols; /* The starting byte offset on each child vdev. */ - uint64_t o = (b / dcols) << unit_shift; + uint64_t o = (b / dcols) << ashift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; uint64_t off = 0; @@ -400,7 +400,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, coff = o; if (col >= dcols) { col -= dcols; - coff += 1ULL << unit_shift; + coff += 1ULL << ashift; } rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; @@ -413,17 +413,17 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, if (c >= acols) rm->rm_col[c].rc_size = 0; else if (c < bc) - rm->rm_col[c].rc_size = (q + 1) << unit_shift; + rm->rm_col[c].rc_size = (q + 1) << ashift; else - rm->rm_col[c].rc_size = q << unit_shift; + rm->rm_col[c].rc_size = q << ashift; asize += rm->rm_col[c].rc_size; } - ASSERT3U(asize, ==, tot << unit_shift); - rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); + ASSERT3U(asize, ==, tot << ashift); + rm->rm_asize = roundup(asize, (nparity + 1) << ashift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; - ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift); ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) @@ -2299,6 +2299,44 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } +/* + * Determine if any portion of the provided block resides on a child vdev + * with a dirty DTL and therefore needs to be resilvered. The function + * assumes that at least one DTL is dirty which imples that full stripe + * width blocks must be resilvered. + */ +static boolean_t +vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +{ + uint64_t dcols = vd->vdev_children; + uint64_t nparity = vd->vdev_nparity; + uint64_t ashift = vd->vdev_top->vdev_ashift; + /* The starting RAIDZ (parent) vdev sector of the block. */ + uint64_t b = offset >> ashift; + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = ((psize - 1) >> ashift) + 1; + /* The first column for this stripe. */ + uint64_t f = b % dcols; + + if (s + nparity >= dcols) + return (B_TRUE); + + for (uint64_t c = 0; c < s + nparity; c++) { + uint64_t devidx = (f + c) % dcols; + vdev_t *cvd = vd->vdev_child[devidx]; + + /* + * dsl_scan_need_resilver() already checked vd with + * vdev_dtl_contains(). So here just check cvd with + * vdev_dtl_empty(), cheaper and a good approximation. + */ + if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) + return (B_TRUE); + } + + return (B_FALSE); +} + vdev_ops_t vdev_raidz_ops = { vdev_raidz_open, vdev_raidz_close, @@ -2306,6 +2344,7 @@ vdev_ops_t vdev_raidz_ops = { vdev_raidz_io_start, vdev_raidz_io_done, vdev_raidz_state_change, + vdev_raidz_need_resilver, NULL, NULL, VDEV_TYPE_RAIDZ, /* name of this vdev type */ diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index 90250b0fb9..6b456dd2bd 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -120,6 +120,7 @@ vdev_ops_t vdev_root_ops = { vdev_root_state_change, NULL, NULL, + NULL, VDEV_TYPE_ROOT, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ };