Reimplement vdev_random_leaf and rename it

Rename it as mmp_random_leaf() since it is defined in mmp.c.

The earlier implementation could end up spinning forever if a pool had a
vdev marked writeable, none of whose children were writeable.  It also
did not guarantee that if a writeable leaf vdev existed, it would be
found.

Reimplement to recursively walk the device tree to select the leaf.  It
searches the entire tree, so that a return value of (NULL) indicates
there were no usable leaves in the pool; all were either not writeable
or had pending mmp writes.

It still chooses the starting child randomly at each level of the tree,
so if the pool's devices are healthy, the mmp writes go to random leaves
with an even distribution.  This was verified by testing with
zfs_multihost_history enabled.

Reviewed-by: Thomas Caputi <tcaputi@datto.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #6631 
Closes #6665
This commit is contained in:
Olaf Faaland 2017-09-22 14:29:26 -07:00 committed by Brian Behlendorf
parent 5df5d06a8d
commit d410c6d9fd
1 changed file with 26 additions and 36 deletions

View File

@ -198,52 +198,42 @@ mmp_thread_stop(spa_t *spa)
} }
/* /*
* Randomly choose a leaf vdev, to write an MMP block to. It must be * Choose a leaf vdev to write an MMP block to. It must not have an
* writable. It must not have an outstanding mmp write (if so then * outstanding mmp write (if so then there is a problem, and a new write will
* there is a problem, and a new write will also block). * also block). If there is no usable leaf in this subtree return NULL,
* otherwise return a pointer to the leaf.
* *
* We try 10 times to pick a random leaf without an outstanding write. * When walking the subtree, a random child is chosen as the starting point so
* If 90% of the leaves have pending writes, this gives us a >65% * that when the tree is healthy, the leaf chosen will be random with even
* chance of finding one we can write to. There will be at least * distribution. If there are unhealthy vdevs in the tree, the distribution
* (zfs_multihost_fail_intervals) tries before the inability to write an MMP * will be really poor only if a large proportion of the vdevs are unhealthy,
* block causes serious problems. * in which case there are other more pressing problems.
*/ */
static vdev_t * static vdev_t *
vdev_random_leaf(spa_t *spa) mmp_random_leaf(vdev_t *vd)
{ {
vdev_t *vd, *child; int child_idx;
int pending_writes = 10;
ASSERT(spa); if (!vdev_writeable(vd))
ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
/*
* Since we hold SCL_STATE, neither pool nor vdev state can
* change. Therefore, if the root is not dead, there is a
* child that is not dead, and so on down to a leaf.
*/
if (!vdev_writeable(spa->spa_root_vdev))
return (NULL); return (NULL);
vd = spa->spa_root_vdev; if (vd->vdev_ops->vdev_op_leaf)
while (!vd->vdev_ops->vdev_op_leaf) { return (vd->vdev_mmp_pending == 0 ? vd : NULL);
child = vd->vdev_child[spa_get_random(vd->vdev_children)];
if (!vdev_writeable(child)) child_idx = spa_get_random(vd->vdev_children);
continue; for (int offset = vd->vdev_children; offset > 0; offset--) {
vdev_t *leaf;
vdev_t *child = vd->vdev_child[(child_idx + offset) %
vd->vdev_children];
leaf = mmp_random_leaf(child);
if (leaf)
return (leaf);
}
if (child->vdev_ops->vdev_op_leaf && child->vdev_mmp_pending) {
if (pending_writes-- > 0)
continue;
else
return (NULL); return (NULL);
} }
vd = child;
}
return (vd);
}
static void static void
mmp_write_done(zio_t *zio) mmp_write_done(zio_t *zio)
{ {
@ -324,8 +314,8 @@ mmp_write_uberblock(spa_t *spa)
uint64_t offset; uint64_t offset;
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
vd = vdev_random_leaf(spa); vd = mmp_random_leaf(spa->spa_root_vdev);
if (vd == NULL || !vdev_writeable(vd)) { if (vd == NULL) {
spa_config_exit(spa, SCL_STATE, FTAG); spa_config_exit(spa, SCL_STATE, FTAG);
return; return;
} }