Skip spurious resilver IO on raidz vdev

On a raidz vdev, a block that does not span all child vdevs, excluding
its skip sectors if any, may not be affected by a child vdev outage or
failure. In such cases, the block does not need to be resilvered.
However, current resilver algorithm simply resilvers all blocks on a
degraded raidz vdev. Such spurious IO is not only wasteful, but also
adds the risk of overwriting good data.

This patch eliminates such spurious IOs.

Reviewed-by: Gvozden Neskovic <neskovic@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Isaac Huang <he.huang@intel.com>
Closes #5316
This commit is contained in:
Isaac Huang 2017-05-12 18:28:03 -06:00 committed by Brian Behlendorf
parent 8c54ddd33a
commit 3d6da72d18
10 changed files with 125 additions and 33 deletions

View File

@ -65,6 +65,7 @@ extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
uint64_t txg, uint64_t size); uint64_t txg, uint64_t size);
extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size);
extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
int scrub_done); int scrub_done);
extern boolean_t vdev_dtl_required(vdev_t *vd); extern boolean_t vdev_dtl_required(vdev_t *vd);

View File

@ -68,6 +68,7 @@ typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_start_func_t(zio_t *zio);
typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio);
typedef void vdev_state_change_func_t(vdev_t *vd, int, int); typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t);
typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_hold_func_t(vdev_t *vd);
typedef void vdev_rele_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd);
@ -78,6 +79,7 @@ typedef const struct vdev_ops {
vdev_io_start_func_t *vdev_op_io_start; vdev_io_start_func_t *vdev_op_io_start;
vdev_io_done_func_t *vdev_op_io_done; vdev_io_done_func_t *vdev_op_io_done;
vdev_state_change_func_t *vdev_op_state_change; vdev_state_change_func_t *vdev_op_state_change;
vdev_need_resilver_func_t *vdev_op_need_resilver;
vdev_hold_func_t *vdev_op_hold; vdev_hold_func_t *vdev_op_hold;
vdev_rele_func_t *vdev_op_rele; vdev_rele_func_t *vdev_op_rele;
char vdev_op_type[16]; char vdev_op_type[16];

View File

@ -1836,12 +1836,51 @@ dsl_scan_scrub_done(zio_t *zio)
mutex_exit(&spa->spa_scrub_lock); mutex_exit(&spa->spa_scrub_lock);
} }
static boolean_t
dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
uint64_t phys_birth)
{
vdev_t *vd;
if (DVA_GET_GANG(dva)) {
/*
* Gang members may be spread across multiple
* vdevs, so the best estimate we have is the
* scrub range, which has already been checked.
* XXX -- it would be better to change our
* allocation policy to ensure that all
* gang members reside on the same vdev.
*/
return (B_TRUE);
}
vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
/*
* Check if the txg falls within the range which must be
* resilvered. DVAs outside this range can always be skipped.
*/
if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
return (B_FALSE);
/*
* Check if the top-level vdev must resilver this offset.
* When the offset does not intersect with a dirty leaf DTL
* then it may be possible to skip the resilver IO. The psize
* is provided instead of asize to simplify the check for RAIDZ.
*/
if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
return (B_FALSE);
return (B_TRUE);
}
static int static int
dsl_scan_scrub_cb(dsl_pool_t *dp, dsl_scan_scrub_cb(dsl_pool_t *dp,
const blkptr_t *bp, const zbookmark_phys_t *zb) const blkptr_t *bp, const zbookmark_phys_t *zb)
{ {
dsl_scan_t *scn = dp->dp_scan; dsl_scan_t *scn = dp->dp_scan;
size_t size = BP_GET_PSIZE(bp); size_t psize = BP_GET_PSIZE(bp);
spa_t *spa = dp->dp_spa; spa_t *spa = dp->dp_spa;
uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
boolean_t needs_io = B_FALSE; boolean_t needs_io = B_FALSE;
@ -1875,33 +1914,19 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
zio_flags |= ZIO_FLAG_SPECULATIVE; zio_flags |= ZIO_FLAG_SPECULATIVE;
for (d = 0; d < BP_GET_NDVAS(bp); d++) { for (d = 0; d < BP_GET_NDVAS(bp); d++) {
vdev_t *vd = vdev_lookup_top(spa, const dva_t *dva = &bp->blk_dva[d];
DVA_GET_VDEV(&bp->blk_dva[d]));
/* /*
* Keep track of how much data we've examined so that * Keep track of how much data we've examined so that
* zpool(1M) status can make useful progress reports. * zpool(1M) status can make useful progress reports.
*/ */
scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva);
spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);
/* if it's a resilver, this may not be in the target range */ /* if it's a resilver, this may not be in the target range */
if (!needs_io) { if (!needs_io)
if (DVA_GET_GANG(&bp->blk_dva[d])) { needs_io = dsl_scan_need_resilver(spa, dva, psize,
/* phys_birth);
* Gang members may be spread across multiple
* vdevs, so the best estimate we have is the
* scrub range, which has already been checked.
* XXX -- it would be better to change our
* allocation policy to ensure that all
* gang members reside on the same vdev.
*/
needs_io = B_TRUE;
} else {
needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
phys_birth, 1);
}
}
} }
if (needs_io && !zfs_no_scrub_io) { if (needs_io && !zfs_no_scrub_io) {
@ -1922,8 +1947,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
delay(scan_delay); delay(scan_delay);
zio_nowait(zio_read(NULL, spa, bp, zio_nowait(zio_read(NULL, spa, bp,
abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done, abd_alloc_for_io(psize, B_FALSE),
NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb)); psize, dsl_scan_scrub_done, NULL,
ZIO_PRIORITY_SCRUB, zio_flags, zb));
} }
/* do not relocate this block */ /* do not relocate this block */

View File

@ -1819,6 +1819,21 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
return (empty); return (empty);
} }
/*
* Returns B_TRUE if vdev determines offset needs to be resilvered.
*/
boolean_t
vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
{
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
vd->vdev_ops->vdev_op_leaf)
return (B_TRUE);
return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
}
/* /*
* Returns the lowest txg in the DTL range. * Returns the lowest txg in the DTL range.
*/ */

View File

@ -796,6 +796,7 @@ vdev_ops_t vdev_disk_ops = {
vdev_disk_io_start, vdev_disk_io_start,
vdev_disk_io_done, vdev_disk_io_done,
NULL, NULL,
NULL,
vdev_disk_hold, vdev_disk_hold,
vdev_disk_rele, vdev_disk_rele,
VDEV_TYPE_DISK, /* name of this vdev type */ VDEV_TYPE_DISK, /* name of this vdev type */

View File

@ -250,6 +250,7 @@ vdev_ops_t vdev_file_ops = {
vdev_file_io_start, vdev_file_io_start,
vdev_file_io_done, vdev_file_io_done,
NULL, NULL,
NULL,
vdev_file_hold, vdev_file_hold,
vdev_file_rele, vdev_file_rele,
VDEV_TYPE_FILE, /* name of this vdev type */ VDEV_TYPE_FILE, /* name of this vdev type */
@ -283,6 +284,7 @@ vdev_ops_t vdev_disk_ops = {
vdev_file_io_start, vdev_file_io_start,
vdev_file_io_done, vdev_file_io_done,
NULL, NULL,
NULL,
vdev_file_hold, vdev_file_hold,
vdev_file_rele, vdev_file_rele,
VDEV_TYPE_DISK, /* name of this vdev type */ VDEV_TYPE_DISK, /* name of this vdev type */

View File

@ -615,6 +615,7 @@ vdev_ops_t vdev_mirror_ops = {
vdev_mirror_state_change, vdev_mirror_state_change,
NULL, NULL,
NULL, NULL,
NULL,
VDEV_TYPE_MIRROR, /* name of this vdev type */ VDEV_TYPE_MIRROR, /* name of this vdev type */
B_FALSE /* not a leaf vdev */ B_FALSE /* not a leaf vdev */
}; };
@ -628,6 +629,7 @@ vdev_ops_t vdev_replacing_ops = {
vdev_mirror_state_change, vdev_mirror_state_change,
NULL, NULL,
NULL, NULL,
NULL,
VDEV_TYPE_REPLACING, /* name of this vdev type */ VDEV_TYPE_REPLACING, /* name of this vdev type */
B_FALSE /* not a leaf vdev */ B_FALSE /* not a leaf vdev */
}; };
@ -641,6 +643,7 @@ vdev_ops_t vdev_spare_ops = {
vdev_mirror_state_change, vdev_mirror_state_change,
NULL, NULL,
NULL, NULL,
NULL,
VDEV_TYPE_SPARE, /* name of this vdev type */ VDEV_TYPE_SPARE, /* name of this vdev type */
B_FALSE /* not a leaf vdev */ B_FALSE /* not a leaf vdev */
}; };

View File

@ -88,6 +88,7 @@ vdev_ops_t vdev_missing_ops = {
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL,
VDEV_TYPE_MISSING, /* name of this vdev type */ VDEV_TYPE_MISSING, /* name of this vdev type */
B_TRUE /* leaf vdev */ B_TRUE /* leaf vdev */
}; };
@ -101,6 +102,7 @@ vdev_ops_t vdev_hole_ops = {
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL,
VDEV_TYPE_HOLE, /* name of this vdev type */ VDEV_TYPE_HOLE, /* name of this vdev type */
B_TRUE /* leaf vdev */ B_TRUE /* leaf vdev */
}; };

View File

@ -330,18 +330,18 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
* is this functions only caller, as small as possible on the stack. * is this functions only caller, as small as possible on the stack.
*/ */
noinline raidz_map_t * noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
uint64_t nparity) uint64_t nparity)
{ {
raidz_map_t *rm; raidz_map_t *rm;
/* The starting RAIDZ (parent) vdev sector of the block. */ /* The starting RAIDZ (parent) vdev sector of the block. */
uint64_t b = zio->io_offset >> unit_shift; uint64_t b = zio->io_offset >> ashift;
/* The zio's size in units of the vdev's minimum sector size. */ /* The zio's size in units of the vdev's minimum sector size. */
uint64_t s = zio->io_size >> unit_shift; uint64_t s = zio->io_size >> ashift;
/* The first column for this stripe. */ /* The first column for this stripe. */
uint64_t f = b % dcols; uint64_t f = b % dcols;
/* The starting byte offset on each child vdev. */ /* The starting byte offset on each child vdev. */
uint64_t o = (b / dcols) << unit_shift; uint64_t o = (b / dcols) << ashift;
uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
uint64_t off = 0; uint64_t off = 0;
@ -400,7 +400,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
coff = o; coff = o;
if (col >= dcols) { if (col >= dcols) {
col -= dcols; col -= dcols;
coff += 1ULL << unit_shift; coff += 1ULL << ashift;
} }
rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_devidx = col;
rm->rm_col[c].rc_offset = coff; rm->rm_col[c].rc_offset = coff;
@ -413,17 +413,17 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
if (c >= acols) if (c >= acols)
rm->rm_col[c].rc_size = 0; rm->rm_col[c].rc_size = 0;
else if (c < bc) else if (c < bc)
rm->rm_col[c].rc_size = (q + 1) << unit_shift; rm->rm_col[c].rc_size = (q + 1) << ashift;
else else
rm->rm_col[c].rc_size = q << unit_shift; rm->rm_col[c].rc_size = q << ashift;
asize += rm->rm_col[c].rc_size; asize += rm->rm_col[c].rc_size;
} }
ASSERT3U(asize, ==, tot << unit_shift); ASSERT3U(asize, ==, tot << ashift);
rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); rm->rm_asize = roundup(asize, (nparity + 1) << ashift);
rm->rm_nskip = roundup(tot, nparity + 1) - tot; rm->rm_nskip = roundup(tot, nparity + 1) - tot;
ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift);
ASSERT3U(rm->rm_nskip, <=, nparity); ASSERT3U(rm->rm_nskip, <=, nparity);
for (c = 0; c < rm->rm_firstdatacol; c++) for (c = 0; c < rm->rm_firstdatacol; c++)
@ -2299,6 +2299,44 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
} }
/*
* Determine if any portion of the provided block resides on a child vdev
* with a dirty DTL and therefore needs to be resilvered. The function
* assumes that at least one DTL is dirty which imples that full stripe
* width blocks must be resilvered.
*/
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
{
uint64_t dcols = vd->vdev_children;
uint64_t nparity = vd->vdev_nparity;
uint64_t ashift = vd->vdev_top->vdev_ashift;
/* The starting RAIDZ (parent) vdev sector of the block. */
uint64_t b = offset >> ashift;
/* The zio's size in units of the vdev's minimum sector size. */
uint64_t s = ((psize - 1) >> ashift) + 1;
/* The first column for this stripe. */
uint64_t f = b % dcols;
if (s + nparity >= dcols)
return (B_TRUE);
for (uint64_t c = 0; c < s + nparity; c++) {
uint64_t devidx = (f + c) % dcols;
vdev_t *cvd = vd->vdev_child[devidx];
/*
* dsl_scan_need_resilver() already checked vd with
* vdev_dtl_contains(). So here just check cvd with
* vdev_dtl_empty(), cheaper and a good approximation.
*/
if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
return (B_TRUE);
}
return (B_FALSE);
}
vdev_ops_t vdev_raidz_ops = { vdev_ops_t vdev_raidz_ops = {
vdev_raidz_open, vdev_raidz_open,
vdev_raidz_close, vdev_raidz_close,
@ -2306,6 +2344,7 @@ vdev_ops_t vdev_raidz_ops = {
vdev_raidz_io_start, vdev_raidz_io_start,
vdev_raidz_io_done, vdev_raidz_io_done,
vdev_raidz_state_change, vdev_raidz_state_change,
vdev_raidz_need_resilver,
NULL, NULL,
NULL, NULL,
VDEV_TYPE_RAIDZ, /* name of this vdev type */ VDEV_TYPE_RAIDZ, /* name of this vdev type */

View File

@ -120,6 +120,7 @@ vdev_ops_t vdev_root_ops = {
vdev_root_state_change, vdev_root_state_change,
NULL, NULL,
NULL, NULL,
NULL,
VDEV_TYPE_ROOT, /* name of this vdev type */ VDEV_TYPE_ROOT, /* name of this vdev type */
B_FALSE /* not a leaf vdev */ B_FALSE /* not a leaf vdev */
}; };