diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c
index fc8346dc3a..1ae5f31fd6 100644
--- a/module/zfs/mmp.c
+++ b/module/zfs/mmp.c
@@ -280,6 +280,59 @@ mmp_random_leaf(vdev_t *in_vd, vdev_t **out_vd)
 	return (error_mask);
 }
 
+/*
+ * MMP writes are issued on a fixed schedule, but may complete at variable,
+ * much longer, intervals. The mmp_delay captures long periods between
+ * successful writes for any reason, including disk latency, scheduling delays,
+ * etc.
+ *
+ * The mmp_delay is usually calculated as a decaying average, but if the latest
+ * delay is higher we do not average it, so that we do not hide sudden spikes
+ * which the importing host must wait for.
+ *
+ * If writes are occurring frequently, such as due to a high rate of txg syncs,
+ * the mmp_delay could become very small. Since those short delays depend on
+ * activity we cannot count on, we never let mmp_delay drop below the rate that
+ * mmp_thread alone would achieve (clamped interval / at least one leaf).
+ *
+ * If an mmp write was skipped or fails, and we have already waited longer than
+ * mmp_delay, we need to update it so the next write reflects the longer delay.
+ *
+ * Do not set mmp_delay if the multihost property is not on, so as not to
+ * trigger an activity check on import.
+ */
+static void
+mmp_delay_update(spa_t *spa, boolean_t write_completed)
+{
+	mmp_thread_t *mts = &spa->spa_mmp;
+	hrtime_t delay = gethrtime() - mts->mmp_last_write;
+
+	ASSERT(MUTEX_HELD(&mts->mmp_io_lock));
+
+	if (spa_multihost(spa) == B_FALSE) {
+		mts->mmp_delay = 0;
+		return;
+	}
+
+	if (delay > mts->mmp_delay)
+		mts->mmp_delay = delay;
+
+	if (write_completed == B_FALSE)
+		return;
+
+	mts->mmp_last_write = gethrtime();
+
+	/*
+	 * strictly less than, in case delay was changed above.
+	 */
+	if (delay < mts->mmp_delay) {
+		hrtime_t min_delay = MSEC2NSEC(MAX(zfs_multihost_interval,
+		    MMP_MIN_INTERVAL)) / MAX(vdev_count_leaves(spa), 1);
+		mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
+		    min_delay);
+	}
+}
+
 static void
 mmp_write_done(zio_t *zio)
 {
@@ -291,38 +344,8 @@ mmp_write_done(zio_t *zio)
 	uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
 	hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;
 
-	if (zio->io_error)
-		goto unlock;
+	mmp_delay_update(spa, (zio->io_error == 0));
 
-	/*
-	 * Mmp writes are queued on a fixed schedule, but under many
-	 * circumstances, such as a busy device or faulty hardware,
-	 * the writes will complete at variable, much longer,
-	 * intervals. In these cases, another node checking for
-	 * activity must wait longer to account for these delays.
-	 *
-	 * The mmp_delay is calculated as a decaying average of the interval
-	 * between completed mmp writes. This is used to predict how long
-	 * the import must wait to detect activity in the pool, before
-	 * concluding it is not in use.
-	 *
-	 * Do not set mmp_delay if the multihost property is not on,
-	 * so as not to trigger an activity check on import.
-	 */
-	if (spa_multihost(spa)) {
-		hrtime_t delay = gethrtime() - mts->mmp_last_write;
-
-		if (delay > mts->mmp_delay)
-			mts->mmp_delay = delay;
-		else
-			mts->mmp_delay = (delay + mts->mmp_delay * 127) /
-			    128;
-	} else {
-		mts->mmp_delay = 0;
-	}
-	mts->mmp_last_write = gethrtime();
-
-unlock:
 	vd->vdev_mmp_pending = 0;
 	vd->vdev_mmp_kstat_id = 0;
 
@@ -348,6 +371,7 @@ mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
 	mutex_enter(&mmp->mmp_io_lock);
 	mmp->mmp_ub = *ub;
 	mmp->mmp_ub.ub_timestamp = gethrestime_sec();
+	mmp_delay_update(spa, B_TRUE);
 	mutex_exit(&mmp->mmp_io_lock);
 }
 
@@ -386,6 +410,7 @@ mmp_write_uberblock(spa_t *spa)
 	 */
 
 	if (error) {
+		mmp_delay_update(spa, B_FALSE);
 		if (mmp->mmp_skip_error == error) {
 			spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
 		} else {
@@ -462,15 +487,14 @@ mmp_thread(spa_t *spa)
 		    MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
 		boolean_t suspended = spa_suspended(spa);
 		boolean_t multihost = spa_multihost(spa);
-		hrtime_t start, next_time;
+		hrtime_t next_time;
 
-		start = gethrtime();
-		if (multihost) {
-			next_time = start + mmp_interval /
+		if (multihost)
+			next_time = gethrtime() + mmp_interval /
 			    MAX(vdev_count_leaves(spa), 1);
-		} else {
-			next_time = start + MSEC2NSEC(MMP_DEFAULT_INTERVAL);
-		}
+		else
+			next_time = gethrtime() +
+			    MSEC2NSEC(MMP_DEFAULT_INTERVAL);
 
 		/*
 		 * MMP off => on, or suspended => !suspended:
@@ -514,11 +538,11 @@ mmp_thread(spa_t *spa)
 		 * mmp_interval * mmp_fail_intervals nanoseconds.
 		 */
 		if (!suspended && mmp_fail_intervals && multihost &&
-		    (start - mmp->mmp_last_write) > max_fail_ns) {
+		    (gethrtime() - mmp->mmp_last_write) > max_fail_ns) {
 			cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
 			    "succeeded in over %llus; suspending pool",
 			    spa_name(spa),
-			    NSEC2SEC(start - mmp->mmp_last_write));
+			    NSEC2SEC(gethrtime() - mmp->mmp_last_write));
 			zio_suspend(spa, NULL, ZIO_SUSPEND_MMP);
 		}
 
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 1add7ad246..02dda927d2 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -2471,6 +2471,10 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
 	import_delay = MAX(import_delay, import_intervals *
 	    MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL)));
 
+	zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu import_intervals=%u "
+	    "leaves=%u", import_delay, ub->ub_mmp_delay, import_intervals,
+	    vdev_count_leaves(spa));
+
 	/* Add a small random factor in case of simultaneous imports (0-25%) */
 	import_expire = gethrtime() + import_delay +
 	    (import_delay * spa_get_random(250) / 1000);
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 27e79ae60e..a1bfe5a1d8 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -1501,7 +1501,6 @@ retry:
 	if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0)
 		goto retry;
 
-
 	if (spa_multihost(spa))
 		mmp_update_uberblock(spa, ub);
 
diff --git a/tests/zfs-tests/tests/functional/mmp/mmp.cfg b/tests/zfs-tests/tests/functional/mmp/mmp.cfg
index 36f9954435..52680c275a 100644
--- a/tests/zfs-tests/tests/functional/mmp/mmp.cfg
+++ b/tests/zfs-tests/tests/functional/mmp/mmp.cfg
@@ -38,5 +38,3 @@ export MMP_HISTORY_OFF=0
 export MMP_INTERVAL_HOUR=$((60*60*1000))
 export MMP_INTERVAL_DEFAULT=1000
 export MMP_INTERVAL_MIN=100
-
-export ZPOOL_IMPORT_DURATION=9
diff --git a/tests/zfs-tests/tests/functional/mmp/mmp.kshlib b/tests/zfs-tests/tests/functional/mmp/mmp.kshlib
index 571affe89a..e74f04a5b6 100644
--- a/tests/zfs-tests/tests/functional/mmp/mmp.kshlib
+++ b/tests/zfs-tests/tests/functional/mmp/mmp.kshlib
@@ -163,17 +163,32 @@ function mmp_pool_set_hostid # pool hostid
 	return 0
 }
 
+# Return the number of seconds the activity check portion of the import process
+# will take. Does not include the time to find devices and assemble the
+# preliminary pool configuration passed into the kernel.
+function seconds_mmp_waits_for_activity
+{
+	typeset import_intervals=$(get_tunable zfs_multihost_import_intervals)
+	typeset interval=$(get_tunable zfs_multihost_interval)
+	typeset seconds=$((interval*import_intervals/1000))
+
+	echo $seconds
+}
+
 function import_no_activity_check # pool opts
 {
 	typeset pool=$1
 	typeset opts=$2
 
+	typeset max_duration=$(seconds_mmp_waits_for_activity)
+
 	SECONDS=0
 	zpool import $opts $pool
 	typeset rc=$?
 
-	if [[ $SECONDS -gt $ZPOOL_IMPORT_DURATION ]]; then
-		log_fail "unexpected activity check (${SECONDS}s)"
+	if [[ $SECONDS -gt $max_duration ]]; then
+		log_fail "unexpected activity check (${SECONDS}s gt \
+$max_duration)"
 	fi
 
 	return $rc
@@ -184,12 +199,15 @@ function import_activity_check # pool opts
 	typeset pool=$1
 	typeset opts=$2
 
+	typeset min_duration=$(seconds_mmp_waits_for_activity)
+
 	SECONDS=0
 	zpool import $opts $pool
 	typeset rc=$?
 
-	if [[ $SECONDS -le $ZPOOL_IMPORT_DURATION ]]; then
-		log_fail "expected activity check (${SECONDS}s)"
+	if [[ $SECONDS -le $min_duration ]]; then
+		log_fail "expected activity check (${SECONDS}s le \
+$min_duration)"
 	fi
 
 	return $rc
diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_active_import.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_active_import.ksh
index 035264fe0f..e39c5ab309 100755
--- a/tests/zfs-tests/tests/functional/mmp/mmp_active_import.ksh
+++ b/tests/zfs-tests/tests/functional/mmp/mmp_active_import.ksh
@@ -103,6 +103,9 @@ MMP_IMPORTED_MSG="pool was previously in use from another system."
 log_must try_pool_import $MMP_POOL "-d $MMP_DIR" "$MMP_IMPORTED_MSG"
 
 # 7. Verify 'zpool import -f $MMP_POOL' can now import the pool.
+# Default interval results in minimum activity test 10s which
+# makes detection of the activity test reliable.
+log_must set_tunable64 zfs_multihost_interval $MMP_INTERVAL_DEFAULT
 log_must import_activity_check $MMP_POOL "-f -d $MMP_DIR"
 
 # 8 Verify pool may be exported/imported without -f argument.