From 533ea0415bd788530ff894bc0edbc3cea20fbe4c Mon Sep 17 00:00:00 2001 From: Olaf Faaland Date: Wed, 4 Apr 2018 16:38:44 -0700 Subject: [PATCH] Update mmp_delay on sync or skipped, failed write When an MMP write is skipped, or fails, and time since mts->mmp_last_write is already greater than mts->mmp_delay, increase mts->mmp_delay. The original code only updated mts->mmp_delay when a write succeeded, but this results in the write(s) after delays and failed write(s) reporting an ub_mmp_delay which is too low. Update mmp_last_write and mmp_delay if a txg sync was successful. At least one uberblock was written, thus extending the time we can be sure the pool will not be imported by another host. Do not allow mmp_delay to go below (MSEC2NSEC(zfs_multihost_interval) / vdev_count_leaves()) so that a period of frequent successful MMP writes, e.g. due to frequent txg syncs, does not result in an import activity check so short it is not reliable based on mmp thread writes alone. Remove unnecessary local variable, start. We do not use the start time of the loop iteration. Add a debug message in spa_activity_check() to allow verification of the import_delay value and to prove the activity check occurred. Alter the tests that import pools and attempt to detect an activity check. Calculate the expected duration of spa_activity_check() based on module parameters at the time the import is performed, rather than a fixed time set in mmp.cfg. The fixed time may be wrong. Also, use the default zfs_multihost_interval value so the activity check is longer and easier to recognize. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Giuseppe Di Natale Signed-off-by: Olaf Faaland Closes #7330 --- module/zfs/mmp.c | 104 +++++++++++------- module/zfs/spa.c | 4 + module/zfs/vdev_label.c | 1 - tests/zfs-tests/tests/functional/mmp/mmp.cfg | 2 - .../zfs-tests/tests/functional/mmp/mmp.kshlib | 26 ++++- .../functional/mmp/mmp_active_import.ksh | 3 + 6 files changed, 93 insertions(+), 47 deletions(-) diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 14379d8046..e50e35665e 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -280,6 +280,59 @@ mmp_random_leaf(vdev_t *in_vd, vdev_t **out_vd) return (error_mask); } +/* + * MMP writes are issued on a fixed schedule, but may complete at variable, + * much longer, intervals. The mmp_delay captures long periods between + * successful writes for any reason, including disk latency, scheduling delays, + * etc. + * + * The mmp_delay is usually calculated as a decaying average, but if the latest + * delay is higher we do not average it, so that we do not hide sudden spikes + * which the importing host must wait for. + * + * If writes are occurring frequently, such as due to a high rate of txg syncs, + * the mmp_delay could become very small. Since those short delays depend on + * activity we cannot count on, we never allow mmp_delay to get lower than rate + * expected if only mmp_thread writes occur. + * + * If an mmp write was skipped or fails, and we have already waited longer than + * mmp_delay, we need to update it so the next write reflects the longer delay. + * + * Do not set mmp_delay if the multihost property is not on, so as not to + * trigger an activity check on import. + */ +static void +mmp_delay_update(spa_t *spa, boolean_t write_completed) +{ + mmp_thread_t *mts = &spa->spa_mmp; + hrtime_t delay = gethrtime() - mts->mmp_last_write; + + ASSERT(MUTEX_HELD(&mts->mmp_io_lock)); + + if (spa_multihost(spa) == B_FALSE) { + mts->mmp_delay = 0; + return; + } + + if (delay > mts->mmp_delay) + mts->mmp_delay = delay; + + if (write_completed == B_FALSE) + return; + + mts->mmp_last_write = gethrtime(); + + /* + * strictly less than, in case delay was changed above. + */ + if (delay < mts->mmp_delay) { + hrtime_t min_delay = MSEC2NSEC(zfs_multihost_interval) / + vdev_count_leaves(spa); + mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128), + min_delay); + } +} + static void mmp_write_done(zio_t *zio) { @@ -291,38 +344,8 @@ mmp_write_done(zio_t *zio) uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id; hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending; - if (zio->io_error) - goto unlock; + mmp_delay_update(spa, (zio->io_error == 0)); - /* - * Mmp writes are queued on a fixed schedule, but under many - * circumstances, such as a busy device or faulty hardware, - * the writes will complete at variable, much longer, - * intervals. In these cases, another node checking for - * activity must wait longer to account for these delays. - * - * The mmp_delay is calculated as a decaying average of the interval - * between completed mmp writes. This is used to predict how long - * the import must wait to detect activity in the pool, before - * concluding it is not in use. - * - * Do not set mmp_delay if the multihost property is not on, - * so as not to trigger an activity check on import. - */ - if (spa_multihost(spa)) { - hrtime_t delay = gethrtime() - mts->mmp_last_write; - - if (delay > mts->mmp_delay) - mts->mmp_delay = delay; - else - mts->mmp_delay = (delay + mts->mmp_delay * 127) / - 128; - } else { - mts->mmp_delay = 0; - } - mts->mmp_last_write = gethrtime(); - -unlock: vd->vdev_mmp_pending = 0; vd->vdev_mmp_kstat_id = 0; @@ -348,6 +371,7 @@ mmp_update_uberblock(spa_t *spa, uberblock_t *ub) mutex_enter(&mmp->mmp_io_lock); mmp->mmp_ub = *ub; mmp->mmp_ub.ub_timestamp = gethrestime_sec(); + mmp_delay_update(spa, B_TRUE); mutex_exit(&mmp->mmp_io_lock); } @@ -386,6 +410,7 @@ mmp_write_uberblock(spa_t *spa) */ if (error) { + mmp_delay_update(spa, B_FALSE); if (mmp->mmp_skip_error == error) { spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1); } else { @@ -463,15 +488,14 @@ mmp_thread(void *arg) MAX(zfs_multihost_interval, MMP_MIN_INTERVAL)); boolean_t suspended = spa_suspended(spa); boolean_t multihost = spa_multihost(spa); - hrtime_t start, next_time; + hrtime_t next_time; - start = gethrtime(); - if (multihost) { - next_time = start + mmp_interval / + if (multihost) + next_time = gethrtime() + mmp_interval / MAX(vdev_count_leaves(spa), 1); - } else { - next_time = start + MSEC2NSEC(MMP_DEFAULT_INTERVAL); - } + else + next_time = gethrtime() + + MSEC2NSEC(MMP_DEFAULT_INTERVAL); /* * MMP off => on, or suspended => !suspended: @@ -515,11 +539,11 @@ mmp_thread(void *arg) * mmp_interval * mmp_fail_intervals nanoseconds. */ if (!suspended && mmp_fail_intervals && multihost && - (start - mmp->mmp_last_write) > max_fail_ns) { + (gethrtime() - mmp->mmp_last_write) > max_fail_ns) { cmn_err(CE_WARN, "MMP writes to pool '%s' have not " "succeeded in over %llus; suspending pool", spa_name(spa), - NSEC2SEC(start - mmp->mmp_last_write)); + NSEC2SEC(gethrtime() - mmp->mmp_last_write)); zio_suspend(spa, NULL, ZIO_SUSPEND_MMP); } diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 4b6196cc36..53b5aabf02 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -2462,6 +2462,10 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) import_delay = MAX(import_delay, import_intervals * MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL))); + zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu import_intervals=%u " + "leaves=%u", import_delay, ub->ub_mmp_delay, import_intervals, + vdev_count_leaves(spa)); + /* Add a small random factor in case of simultaneous imports (0-25%) */ import_expire = gethrtime() + import_delay + (import_delay * spa_get_random(250) / 1000); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 26fc936459..4fee4bc7a7 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1495,7 +1495,6 @@ retry: if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) goto retry; - if (spa_multihost(spa)) mmp_update_uberblock(spa, ub); diff --git a/tests/zfs-tests/tests/functional/mmp/mmp.cfg b/tests/zfs-tests/tests/functional/mmp/mmp.cfg index 36f9954435..52680c275a 100644 --- a/tests/zfs-tests/tests/functional/mmp/mmp.cfg +++ b/tests/zfs-tests/tests/functional/mmp/mmp.cfg @@ -38,5 +38,3 @@ export MMP_HISTORY_OFF=0 export MMP_INTERVAL_HOUR=$((60*60*1000)) export MMP_INTERVAL_DEFAULT=1000 export MMP_INTERVAL_MIN=100 - -export ZPOOL_IMPORT_DURATION=9 diff --git a/tests/zfs-tests/tests/functional/mmp/mmp.kshlib b/tests/zfs-tests/tests/functional/mmp/mmp.kshlib index 571affe89a..e74f04a5b6 100644 --- a/tests/zfs-tests/tests/functional/mmp/mmp.kshlib +++ b/tests/zfs-tests/tests/functional/mmp/mmp.kshlib @@ -163,17 +163,32 @@ function mmp_pool_set_hostid # pool hostid return 0 } +# Return the number of seconds the activity check portion of the import process +# will take. Does not include the time to find devices and assemble the +# preliminary pool configuration passed into the kernel. +function seconds_mmp_waits_for_activity +{ + typeset import_intervals=$(get_tunable zfs_multihost_import_intervals) + typeset interval=$(get_tunable zfs_multihost_interval) + typeset seconds=$((interval*import_intervals/1000)) + + echo $seconds +} + function import_no_activity_check # pool opts { typeset pool=$1 typeset opts=$2 + typeset max_duration=$(seconds_mmp_waits_for_activity) + SECONDS=0 zpool import $opts $pool typeset rc=$? - if [[ $SECONDS -gt $ZPOOL_IMPORT_DURATION ]]; then - log_fail "unexpected activity check (${SECONDS}s)" + if [[ $SECONDS -gt $max_duration ]]; then + log_fail "unexpected activity check (${SECONDS}s gt \ +$max_duration)" fi return $rc @@ -184,12 +199,15 @@ function import_activity_check # pool opts typeset pool=$1 typeset opts=$2 + typeset min_duration=$(seconds_mmp_waits_for_activity) + SECONDS=0 zpool import $opts $pool typeset rc=$? - if [[ $SECONDS -le $ZPOOL_IMPORT_DURATION ]]; then - log_fail "expected activity check (${SECONDS}s)" + if [[ $SECONDS -le $min_duration ]]; then + log_fail "expected activity check (${SECONDS}s le \ +$min_duration)" fi return $rc diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_active_import.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_active_import.ksh index 035264fe0f..e39c5ab309 100755 --- a/tests/zfs-tests/tests/functional/mmp/mmp_active_import.ksh +++ b/tests/zfs-tests/tests/functional/mmp/mmp_active_import.ksh @@ -103,6 +103,9 @@ MMP_IMPORTED_MSG="pool was previously in use from another system." log_must try_pool_import $MMP_POOL "-d $MMP_DIR" "$MMP_IMPORTED_MSG" # 7. Verify 'zpool import -f $MMP_POOL' can now import the pool. +# Default interval results in minimum activity test 10s which +# makes detection of the activity test reliable. +log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_DEFAULT log_must import_activity_check $MMP_POOL "-f -d $MMP_DIR" # 8 Verify pool may be exported/imported without -f argument.