Update mmp_delay on sync or skipped, failed write

When an MMP write is skipped or fails, and the time since
mts->mmp_last_write is already greater than mts->mmp_delay, increase
mts->mmp_delay.  The original code only updated mts->mmp_delay when a
write succeeded, so the write(s) that follow delays or failed write(s)
report a ub_mmp_delay which is too low.

Update mmp_last_write and mmp_delay if a txg sync was successful.  At
least one uberblock was written, thus extending the time we can be sure
the pool will not be imported by another host.

Do not allow mmp_delay to go below (MSEC2NSEC(zfs_multihost_interval) /
vdev_count_leaves()) so that a period of frequent successful MMP writes,
e.g. due to frequent txg syncs, does not result in an import activity
check so short it is not reliable based on mmp thread writes alone.

Remove unnecessary local variable, start.  We do not use the start time
of the loop iteration.

Add a debug message in spa_activity_check() to allow verification of the
import_delay value and to prove the activity check occurred.

Alter the tests that import pools and attempt to detect an activity
check.  Calculate the expected duration of spa_activity_check() based on
module parameters at the time the import is performed, rather than a
fixed time set in mmp.cfg.  The fixed time may be wrong.  Also, use the
default zfs_multihost_interval value so the activity check is longer and
easier to recognize.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #7330
This commit is contained in:
Olaf Faaland 2018-04-04 16:38:44 -07:00 committed by Tony Hutter
parent f5ecab3aef
commit 5ac017fc04
6 changed files with 93 additions and 47 deletions

View File

@ -280,6 +280,59 @@ mmp_random_leaf(vdev_t *in_vd, vdev_t **out_vd)
return (error_mask); return (error_mask);
} }
/*
 * MMP writes are issued on a fixed schedule, but may complete at variable,
 * much longer, intervals.  The mmp_delay captures long periods between
 * successful writes for any reason, including disk latency, scheduling delays,
 * etc.
 *
 * The mmp_delay is usually calculated as a decaying average, but if the latest
 * delay is higher we do not average it, so that we do not hide sudden spikes
 * which the importing host must wait for.
 *
 * If writes are occurring frequently, such as due to a high rate of txg syncs,
 * the mmp_delay could become very small.  Since those short delays depend on
 * activity we cannot count on, we never allow mmp_delay to get lower than the
 * rate expected if only mmp_thread writes occur.
 *
 * If an mmp write was skipped or fails, and we have already waited longer than
 * mmp_delay, we need to update it so the next write reflects the longer delay.
 *
 * Do not set mmp_delay if the multihost property is not on, so as not to
 * trigger an activity check on import.
 *
 * spa:             pool whose spa_mmp state is updated.
 * write_completed: B_TRUE when a write succeeded, in which case
 *                  mmp_last_write is also advanced; B_FALSE for a skipped or
 *                  failed write, in which case mmp_delay may only grow.
 *
 * The caller must hold mmp_io_lock.
 */
static void
mmp_delay_update(spa_t *spa, boolean_t write_completed)
{
	mmp_thread_t *mts = &spa->spa_mmp;
	hrtime_t delay = gethrtime() - mts->mmp_last_write;

	ASSERT(MUTEX_HELD(&mts->mmp_io_lock));

	if (spa_multihost(spa) == B_FALSE) {
		mts->mmp_delay = 0;
		return;
	}

	/* A spike is recorded immediately, never averaged away. */
	if (delay > mts->mmp_delay)
		mts->mmp_delay = delay;

	if (write_completed == B_FALSE)
		return;

	mts->mmp_last_write = gethrtime();

	/*
	 * strictly less than, in case delay was changed above.
	 */
	if (delay < mts->mmp_delay) {
		/*
		 * Guard the divisor the same way mmp_thread() does, so a
		 * leaf count of zero cannot cause a division by zero.
		 */
		hrtime_t min_delay = MSEC2NSEC(zfs_multihost_interval) /
		    MAX(vdev_count_leaves(spa), 1);
		mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
		    min_delay);
	}
}
static void static void
mmp_write_done(zio_t *zio) mmp_write_done(zio_t *zio)
{ {
@ -291,38 +344,8 @@ mmp_write_done(zio_t *zio)
uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id; uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending; hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;
if (zio->io_error) mmp_delay_update(spa, (zio->io_error == 0));
goto unlock;
/*
* Mmp writes are queued on a fixed schedule, but under many
* circumstances, such as a busy device or faulty hardware,
* the writes will complete at variable, much longer,
* intervals. In these cases, another node checking for
* activity must wait longer to account for these delays.
*
* The mmp_delay is calculated as a decaying average of the interval
* between completed mmp writes. This is used to predict how long
* the import must wait to detect activity in the pool, before
* concluding it is not in use.
*
* Do not set mmp_delay if the multihost property is not on,
* so as not to trigger an activity check on import.
*/
if (spa_multihost(spa)) {
hrtime_t delay = gethrtime() - mts->mmp_last_write;
if (delay > mts->mmp_delay)
mts->mmp_delay = delay;
else
mts->mmp_delay = (delay + mts->mmp_delay * 127) /
128;
} else {
mts->mmp_delay = 0;
}
mts->mmp_last_write = gethrtime();
unlock:
vd->vdev_mmp_pending = 0; vd->vdev_mmp_pending = 0;
vd->vdev_mmp_kstat_id = 0; vd->vdev_mmp_kstat_id = 0;
@ -348,6 +371,7 @@ mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
mutex_enter(&mmp->mmp_io_lock); mutex_enter(&mmp->mmp_io_lock);
mmp->mmp_ub = *ub; mmp->mmp_ub = *ub;
mmp->mmp_ub.ub_timestamp = gethrestime_sec(); mmp->mmp_ub.ub_timestamp = gethrestime_sec();
mmp_delay_update(spa, B_TRUE);
mutex_exit(&mmp->mmp_io_lock); mutex_exit(&mmp->mmp_io_lock);
} }
@ -386,6 +410,7 @@ mmp_write_uberblock(spa_t *spa)
*/ */
if (error) { if (error) {
mmp_delay_update(spa, B_FALSE);
if (mmp->mmp_skip_error == error) { if (mmp->mmp_skip_error == error) {
spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1); spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
} else { } else {
@ -462,15 +487,14 @@ mmp_thread(spa_t *spa)
MAX(zfs_multihost_interval, MMP_MIN_INTERVAL)); MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
boolean_t suspended = spa_suspended(spa); boolean_t suspended = spa_suspended(spa);
boolean_t multihost = spa_multihost(spa); boolean_t multihost = spa_multihost(spa);
hrtime_t start, next_time; hrtime_t next_time;
start = gethrtime(); if (multihost)
if (multihost) { next_time = gethrtime() + mmp_interval /
next_time = start + mmp_interval /
MAX(vdev_count_leaves(spa), 1); MAX(vdev_count_leaves(spa), 1);
} else { else
next_time = start + MSEC2NSEC(MMP_DEFAULT_INTERVAL); next_time = gethrtime() +
} MSEC2NSEC(MMP_DEFAULT_INTERVAL);
/* /*
* MMP off => on, or suspended => !suspended: * MMP off => on, or suspended => !suspended:
@ -514,11 +538,11 @@ mmp_thread(spa_t *spa)
* mmp_interval * mmp_fail_intervals nanoseconds. * mmp_interval * mmp_fail_intervals nanoseconds.
*/ */
if (!suspended && mmp_fail_intervals && multihost && if (!suspended && mmp_fail_intervals && multihost &&
(start - mmp->mmp_last_write) > max_fail_ns) { (gethrtime() - mmp->mmp_last_write) > max_fail_ns) {
cmn_err(CE_WARN, "MMP writes to pool '%s' have not " cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
"succeeded in over %llus; suspending pool", "succeeded in over %llus; suspending pool",
spa_name(spa), spa_name(spa),
NSEC2SEC(start - mmp->mmp_last_write)); NSEC2SEC(gethrtime() - mmp->mmp_last_write));
zio_suspend(spa, NULL, ZIO_SUSPEND_MMP); zio_suspend(spa, NULL, ZIO_SUSPEND_MMP);
} }

View File

@ -2471,6 +2471,10 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
import_delay = MAX(import_delay, import_intervals * import_delay = MAX(import_delay, import_intervals *
MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL))); MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL)));
zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu import_intervals=%u "
"leaves=%u", import_delay, ub->ub_mmp_delay, import_intervals,
vdev_count_leaves(spa));
/* Add a small random factor in case of simultaneous imports (0-25%) */ /* Add a small random factor in case of simultaneous imports (0-25%) */
import_expire = gethrtime() + import_delay + import_expire = gethrtime() + import_delay +
(import_delay * spa_get_random(250) / 1000); (import_delay * spa_get_random(250) / 1000);

View File

@ -1501,7 +1501,6 @@ retry:
if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0)
goto retry; goto retry;
if (spa_multihost(spa)) if (spa_multihost(spa))
mmp_update_uberblock(spa, ub); mmp_update_uberblock(spa, ub);

View File

@ -38,5 +38,3 @@ export MMP_HISTORY_OFF=0
export MMP_INTERVAL_HOUR=$((60*60*1000)) export MMP_INTERVAL_HOUR=$((60*60*1000))
export MMP_INTERVAL_DEFAULT=1000 export MMP_INTERVAL_DEFAULT=1000
export MMP_INTERVAL_MIN=100 export MMP_INTERVAL_MIN=100
export ZPOOL_IMPORT_DURATION=9

View File

@ -163,17 +163,32 @@ function mmp_pool_set_hostid # pool hostid
return 0 return 0
} }
# Print the duration, in seconds, of the activity check performed during
# import, computed from the current zfs_multihost_interval (milliseconds) and
# zfs_multihost_import_intervals tunables.  Time spent finding devices and
# assembling the preliminary pool configuration passed into the kernel is
# not included.
function seconds_mmp_waits_for_activity
{
	typeset intervals=$(get_tunable zfs_multihost_import_intervals)
	typeset interval_ms=$(get_tunable zfs_multihost_interval)

	# Integer arithmetic: milliseconds * intervals, truncated to seconds.
	echo $((interval_ms*intervals/1000))
}
function import_no_activity_check # pool opts function import_no_activity_check # pool opts
{ {
typeset pool=$1 typeset pool=$1
typeset opts=$2 typeset opts=$2
typeset max_duration=$(seconds_mmp_waits_for_activity)
SECONDS=0 SECONDS=0
zpool import $opts $pool zpool import $opts $pool
typeset rc=$? typeset rc=$?
if [[ $SECONDS -gt $ZPOOL_IMPORT_DURATION ]]; then if [[ $SECONDS -gt $max_duration ]]; then
log_fail "unexpected activity check (${SECONDS}s)" log_fail "unexpected activity check (${SECONDS}s gt \
$max_duration)"
fi fi
return $rc return $rc
@ -184,12 +199,15 @@ function import_activity_check # pool opts
typeset pool=$1 typeset pool=$1
typeset opts=$2 typeset opts=$2
typeset min_duration=$(seconds_mmp_waits_for_activity)
SECONDS=0 SECONDS=0
zpool import $opts $pool zpool import $opts $pool
typeset rc=$? typeset rc=$?
if [[ $SECONDS -le $ZPOOL_IMPORT_DURATION ]]; then if [[ $SECONDS -le $min_duration ]]; then
log_fail "expected activity check (${SECONDS}s)" log_fail "expected activity check (${SECONDS}s le \
$min_duration)"
fi fi
return $rc return $rc

View File

@ -103,6 +103,9 @@ MMP_IMPORTED_MSG="pool was previously in use from another system."
log_must try_pool_import $MMP_POOL "-d $MMP_DIR" "$MMP_IMPORTED_MSG" log_must try_pool_import $MMP_POOL "-d $MMP_DIR" "$MMP_IMPORTED_MSG"
# 7. Verify 'zpool import -f $MMP_POOL' can now import the pool. # 7. Verify 'zpool import -f $MMP_POOL' can now import the pool.
# Default interval results in minimum activity test 10s which
# makes detection of the activity test reliable.
log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_DEFAULT
log_must import_activity_check $MMP_POOL "-f -d $MMP_DIR" log_must import_activity_check $MMP_POOL "-f -d $MMP_DIR"
# 8 Verify pool may be exported/imported without -f argument. # 8 Verify pool may be exported/imported without -f argument.