vdev probe to slow disk can stall mmp write checker
Simplify vdev probes in the zio_vdev_io_done context to avoid holding the spa config lock for a long duration. Also allow zpool clear if no evidence of another host is using the pool. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Olaf Faaland <faaland1@llnl.gov> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Don Brady <don.brady@klarasystems.com> Closes #15839
This commit is contained in:
parent
b28461b7c6
commit
c3f2f1aa2d
|
@ -9050,7 +9050,7 @@ status_callback(zpool_handle_t *zhp, void *data)
|
||||||
printf_color(ANSI_BOLD, gettext("action: "));
|
printf_color(ANSI_BOLD, gettext("action: "));
|
||||||
printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices"
|
printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices"
|
||||||
" are connected, then reboot your system and\n\timport the "
|
" are connected, then reboot your system and\n\timport the "
|
||||||
"pool.\n"));
|
"pool or run 'zpool clear' to resume the pool.\n"));
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ZPOOL_STATUS_IO_FAILURE_WAIT:
|
case ZPOOL_STATUS_IO_FAILURE_WAIT:
|
||||||
|
|
|
@ -770,7 +770,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
|
||||||
|
|
||||||
#define SPA_ASYNC_CONFIG_UPDATE 0x01
|
#define SPA_ASYNC_CONFIG_UPDATE 0x01
|
||||||
#define SPA_ASYNC_REMOVE 0x02
|
#define SPA_ASYNC_REMOVE 0x02
|
||||||
#define SPA_ASYNC_PROBE 0x04
|
#define SPA_ASYNC_FAULT_VDEV 0x04
|
||||||
#define SPA_ASYNC_RESILVER_DONE 0x08
|
#define SPA_ASYNC_RESILVER_DONE 0x08
|
||||||
#define SPA_ASYNC_RESILVER 0x10
|
#define SPA_ASYNC_RESILVER 0x10
|
||||||
#define SPA_ASYNC_AUTOEXPAND 0x20
|
#define SPA_ASYNC_AUTOEXPAND 0x20
|
||||||
|
@ -1123,6 +1123,8 @@ extern uint32_t spa_get_hostid(spa_t *spa);
|
||||||
extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
|
extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
|
||||||
extern boolean_t spa_livelist_delete_check(spa_t *spa);
|
extern boolean_t spa_livelist_delete_check(spa_t *spa);
|
||||||
|
|
||||||
|
extern boolean_t spa_mmp_remote_host_activity(spa_t *spa);
|
||||||
|
|
||||||
extern spa_mode_t spa_mode(spa_t *spa);
|
extern spa_mode_t spa_mode(spa_t *spa);
|
||||||
extern uint64_t zfs_strtonum(const char *str, char **nptr);
|
extern uint64_t zfs_strtonum(const char *str, char **nptr);
|
||||||
|
|
||||||
|
|
|
@ -50,20 +50,20 @@ extern "C" {
|
||||||
#define MMP_SEQ_VALID_BIT 0x02
|
#define MMP_SEQ_VALID_BIT 0x02
|
||||||
#define MMP_FAIL_INT_VALID_BIT 0x04
|
#define MMP_FAIL_INT_VALID_BIT 0x04
|
||||||
|
|
||||||
#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \
|
#define MMP_VALID(ubp) ((ubp)->ub_magic == UBERBLOCK_MAGIC && \
|
||||||
ubp->ub_mmp_magic == MMP_MAGIC)
|
(ubp)->ub_mmp_magic == MMP_MAGIC)
|
||||||
#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
|
#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
|
||||||
MMP_INTERVAL_VALID_BIT))
|
MMP_INTERVAL_VALID_BIT))
|
||||||
#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
|
#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
|
||||||
MMP_SEQ_VALID_BIT))
|
MMP_SEQ_VALID_BIT))
|
||||||
#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
|
#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
|
||||||
MMP_FAIL_INT_VALID_BIT))
|
MMP_FAIL_INT_VALID_BIT))
|
||||||
|
|
||||||
#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
|
#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \
|
||||||
>> 8)
|
>> 8)
|
||||||
#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
|
#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & 0x0000FFFF00000000) \
|
||||||
>> 32)
|
>> 32)
|
||||||
#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \
|
#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & 0xFFFF000000000000) \
|
||||||
>> 48)
|
>> 48)
|
||||||
|
|
||||||
#define MMP_INTERVAL_SET(write) \
|
#define MMP_INTERVAL_SET(write) \
|
||||||
|
|
|
@ -273,7 +273,7 @@ struct vdev {
|
||||||
txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
|
txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
|
||||||
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
|
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
|
||||||
boolean_t vdev_remove_wanted; /* async remove wanted? */
|
boolean_t vdev_remove_wanted; /* async remove wanted? */
|
||||||
boolean_t vdev_probe_wanted; /* async probe wanted? */
|
boolean_t vdev_fault_wanted; /* async faulted wanted? */
|
||||||
list_node_t vdev_config_dirty_node; /* config dirty list */
|
list_node_t vdev_config_dirty_node; /* config dirty list */
|
||||||
list_node_t vdev_state_dirty_node; /* state dirty list */
|
list_node_t vdev_state_dirty_node; /* state dirty list */
|
||||||
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
|
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
|
||||||
|
|
|
@ -50,9 +50,10 @@ If the pool was suspended it will be brought back online provided the
|
||||||
devices can be accessed.
|
devices can be accessed.
|
||||||
Pools with
|
Pools with
|
||||||
.Sy multihost
|
.Sy multihost
|
||||||
enabled which have been suspended cannot be resumed.
|
enabled which have been suspended cannot be resumed when there is evidence
|
||||||
While the pool was suspended, it may have been imported on
|
that the pool was imported by another host.
|
||||||
another host, and resuming I/O could result in pool damage.
|
The same checks performed during an import will be applied before the clear
|
||||||
|
proceeds.
|
||||||
.Bl -tag -width Ds
|
.Bl -tag -width Ds
|
||||||
.It Fl -power
|
.It Fl -power
|
||||||
Power on the devices's slot in the storage enclosure and wait for the device
|
Power on the devices's slot in the storage enclosure and wait for the device
|
||||||
|
|
|
@ -664,12 +664,13 @@ mmp_thread(void *arg)
|
||||||
(gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
|
(gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
|
||||||
zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
|
zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
|
||||||
"mmp_last_write %llu mmp_interval %llu "
|
"mmp_last_write %llu mmp_interval %llu "
|
||||||
"mmp_fail_intervals %llu mmp_fail_ns %llu",
|
"mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu",
|
||||||
spa_name(spa), (u_longlong_t)gethrtime(),
|
spa_name(spa), (u_longlong_t)gethrtime(),
|
||||||
(u_longlong_t)mmp->mmp_last_write,
|
(u_longlong_t)mmp->mmp_last_write,
|
||||||
(u_longlong_t)mmp_interval,
|
(u_longlong_t)mmp_interval,
|
||||||
(u_longlong_t)mmp_fail_intervals,
|
(u_longlong_t)mmp_fail_intervals,
|
||||||
(u_longlong_t)mmp_fail_ns);
|
(u_longlong_t)mmp_fail_ns,
|
||||||
|
(u_longlong_t)spa->spa_uberblock.ub_txg);
|
||||||
cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
|
cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
|
||||||
"succeeded in over %llu ms; suspending pool. "
|
"succeeded in over %llu ms; suspending pool. "
|
||||||
"Hrtime %llu",
|
"Hrtime %llu",
|
||||||
|
|
102
module/zfs/spa.c
102
module/zfs/spa.c
|
@ -3594,11 +3594,16 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Perform the import activity check. If the user canceled the import or
|
* Remote host activity check.
|
||||||
* we detected activity then fail.
|
*
|
||||||
|
* error results:
|
||||||
|
* 0 - no activity detected
|
||||||
|
* EREMOTEIO - remote activity detected
|
||||||
|
* EINTR - user canceled the operation
|
||||||
*/
|
*/
|
||||||
static int
|
static int
|
||||||
spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
|
spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
|
||||||
|
boolean_t importing)
|
||||||
{
|
{
|
||||||
uint64_t txg = ub->ub_txg;
|
uint64_t txg = ub->ub_txg;
|
||||||
uint64_t timestamp = ub->ub_timestamp;
|
uint64_t timestamp = ub->ub_timestamp;
|
||||||
|
@ -3643,19 +3648,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
|
||||||
|
|
||||||
import_expire = gethrtime() + import_delay;
|
import_expire = gethrtime() + import_delay;
|
||||||
|
|
||||||
spa_import_progress_set_notes(spa, "Checking MMP activity, waiting "
|
if (importing) {
|
||||||
"%llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
|
spa_import_progress_set_notes(spa, "Checking MMP activity, "
|
||||||
|
"waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
|
||||||
|
}
|
||||||
|
|
||||||
int interations = 0;
|
int iterations = 0;
|
||||||
while ((now = gethrtime()) < import_expire) {
|
while ((now = gethrtime()) < import_expire) {
|
||||||
if (interations++ % 30 == 0) {
|
if (importing && iterations++ % 30 == 0) {
|
||||||
spa_import_progress_set_notes(spa, "Checking MMP "
|
spa_import_progress_set_notes(spa, "Checking MMP "
|
||||||
"activity, %llu ms remaining",
|
"activity, %llu ms remaining",
|
||||||
(u_longlong_t)NSEC2MSEC(import_expire - now));
|
(u_longlong_t)NSEC2MSEC(import_expire - now));
|
||||||
}
|
}
|
||||||
|
|
||||||
(void) spa_import_progress_set_mmp_check(spa_guid(spa),
|
if (importing) {
|
||||||
NSEC2SEC(import_expire - gethrtime()));
|
(void) spa_import_progress_set_mmp_check(spa_guid(spa),
|
||||||
|
NSEC2SEC(import_expire - gethrtime()));
|
||||||
|
}
|
||||||
|
|
||||||
vdev_uberblock_load(rvd, ub, &mmp_label);
|
vdev_uberblock_load(rvd, ub, &mmp_label);
|
||||||
|
|
||||||
|
@ -3737,6 +3746,61 @@ out:
|
||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Called from zfs_ioc_clear for a pool that was suspended
|
||||||
|
* after failing mmp write checks.
|
||||||
|
*/
|
||||||
|
boolean_t
|
||||||
|
spa_mmp_remote_host_activity(spa_t *spa)
|
||||||
|
{
|
||||||
|
ASSERT(spa_multihost(spa) && spa_suspended(spa));
|
||||||
|
|
||||||
|
nvlist_t *best_label;
|
||||||
|
uberblock_t best_ub;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Locate the best uberblock on disk
|
||||||
|
*/
|
||||||
|
vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label);
|
||||||
|
if (best_label) {
|
||||||
|
/*
|
||||||
|
* confirm that the best hostid matches our hostid
|
||||||
|
*/
|
||||||
|
if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) &&
|
||||||
|
spa_get_hostid(spa) !=
|
||||||
|
fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) {
|
||||||
|
nvlist_free(best_label);
|
||||||
|
return (B_TRUE);
|
||||||
|
}
|
||||||
|
nvlist_free(best_label);
|
||||||
|
} else {
|
||||||
|
return (B_TRUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!MMP_VALID(&best_ub) ||
|
||||||
|
!MMP_FAIL_INT_VALID(&best_ub) ||
|
||||||
|
MMP_FAIL_INT(&best_ub) == 0) {
|
||||||
|
return (B_TRUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (best_ub.ub_txg != spa->spa_uberblock.ub_txg ||
|
||||||
|
best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) {
|
||||||
|
zfs_dbgmsg("txg mismatch detected during pool clear "
|
||||||
|
"txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu",
|
||||||
|
(u_longlong_t)spa->spa_uberblock.ub_txg,
|
||||||
|
(u_longlong_t)best_ub.ub_txg,
|
||||||
|
(u_longlong_t)spa->spa_uberblock.ub_timestamp,
|
||||||
|
(u_longlong_t)best_ub.ub_timestamp);
|
||||||
|
return (B_TRUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Perform an activity check looking for any remote writer
|
||||||
|
*/
|
||||||
|
return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config,
|
||||||
|
B_FALSE) != 0);
|
||||||
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
spa_verify_host(spa_t *spa, nvlist_t *mos_config)
|
spa_verify_host(spa_t *spa, nvlist_t *mos_config)
|
||||||
{
|
{
|
||||||
|
@ -4063,7 +4127,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
|
||||||
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
|
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
|
||||||
}
|
}
|
||||||
|
|
||||||
int error = spa_activity_check(spa, ub, spa->spa_config);
|
int error =
|
||||||
|
spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
|
||||||
if (error) {
|
if (error) {
|
||||||
nvlist_free(label);
|
nvlist_free(label);
|
||||||
return (error);
|
return (error);
|
||||||
|
@ -8771,15 +8836,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
spa_async_probe(spa_t *spa, vdev_t *vd)
|
spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
|
||||||
{
|
{
|
||||||
if (vd->vdev_probe_wanted) {
|
if (vd->vdev_fault_wanted) {
|
||||||
vd->vdev_probe_wanted = B_FALSE;
|
vd->vdev_fault_wanted = B_FALSE;
|
||||||
vdev_reopen(vd); /* vdev_open() does the actual probe */
|
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
|
||||||
|
VDEV_AUX_ERR_EXCEEDED);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int c = 0; c < vd->vdev_children; c++)
|
for (int c = 0; c < vd->vdev_children; c++)
|
||||||
spa_async_probe(spa, vd->vdev_child[c]);
|
spa_async_fault_vdev(spa, vd->vdev_child[c]);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
@ -8867,11 +8933,11 @@ spa_async_thread(void *arg)
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* See if any devices need to be probed.
|
* See if any devices need to be marked faulted.
|
||||||
*/
|
*/
|
||||||
if (tasks & SPA_ASYNC_PROBE) {
|
if (tasks & SPA_ASYNC_FAULT_VDEV) {
|
||||||
spa_vdev_state_enter(spa, SCL_NONE);
|
spa_vdev_state_enter(spa, SCL_NONE);
|
||||||
spa_async_probe(spa, spa->spa_root_vdev);
|
spa_async_fault_vdev(spa, spa->spa_root_vdev);
|
||||||
(void) spa_vdev_state_exit(spa, NULL, 0);
|
(void) spa_vdev_state_exit(spa, NULL, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -550,6 +550,15 @@ txg_sync_thread(void *arg)
|
||||||
timer = (delta > timeout ? 0 : timeout - delta);
|
timer = (delta > timeout ? 0 : timeout - delta);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* When we're suspended, nothing should be changing and for
|
||||||
|
* MMP we don't want to bump anything that would make it
|
||||||
|
* harder to detect if another host is changing it when
|
||||||
|
* resuming after a MMP suspend.
|
||||||
|
*/
|
||||||
|
if (spa_suspended(spa))
|
||||||
|
continue;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Wait until the quiesce thread hands off a txg to us,
|
* Wait until the quiesce thread hands off a txg to us,
|
||||||
* prompting it to do so if necessary.
|
* prompting it to do so if necessary.
|
||||||
|
|
|
@ -1664,6 +1664,7 @@ vdev_metaslab_fini(vdev_t *vd)
|
||||||
typedef struct vdev_probe_stats {
|
typedef struct vdev_probe_stats {
|
||||||
boolean_t vps_readable;
|
boolean_t vps_readable;
|
||||||
boolean_t vps_writeable;
|
boolean_t vps_writeable;
|
||||||
|
boolean_t vps_zio_done_probe;
|
||||||
int vps_flags;
|
int vps_flags;
|
||||||
} vdev_probe_stats_t;
|
} vdev_probe_stats_t;
|
||||||
|
|
||||||
|
@ -1709,6 +1710,17 @@ vdev_probe_done(zio_t *zio)
|
||||||
(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
|
(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
|
||||||
spa, vd, NULL, NULL, 0);
|
spa, vd, NULL, NULL, 0);
|
||||||
zio->io_error = SET_ERROR(ENXIO);
|
zio->io_error = SET_ERROR(ENXIO);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If this probe was initiated from zio pipeline, then
|
||||||
|
* change the state in a spa_async_request. Probes that
|
||||||
|
* were initiated from a vdev_open can change the state
|
||||||
|
* as part of the open call.
|
||||||
|
*/
|
||||||
|
if (vps->vps_zio_done_probe) {
|
||||||
|
vd->vdev_fault_wanted = B_TRUE;
|
||||||
|
spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
mutex_enter(&vd->vdev_probe_lock);
|
mutex_enter(&vd->vdev_probe_lock);
|
||||||
|
@ -1759,6 +1771,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)
|
||||||
|
|
||||||
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
|
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
|
||||||
ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
|
ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
|
||||||
|
vps->vps_zio_done_probe = (zio != NULL);
|
||||||
|
|
||||||
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
|
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
|
||||||
/*
|
/*
|
||||||
|
@ -1785,15 +1798,6 @@ vdev_probe(vdev_t *vd, zio_t *zio)
|
||||||
vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
|
vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
|
||||||
vdev_probe_done, vps,
|
vdev_probe_done, vps,
|
||||||
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
|
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
|
||||||
|
|
||||||
/*
|
|
||||||
* We can't change the vdev state in this context, so we
|
|
||||||
* kick off an async task to do it on our behalf.
|
|
||||||
*/
|
|
||||||
if (zio != NULL) {
|
|
||||||
vd->vdev_probe_wanted = B_TRUE;
|
|
||||||
spa_async_request(spa, SPA_ASYNC_PROBE);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (zio != NULL)
|
if (zio != NULL)
|
||||||
|
|
|
@ -2027,6 +2027,7 @@ retry:
|
||||||
/*
|
/*
|
||||||
* If this isn't a resync due to I/O errors,
|
* If this isn't a resync due to I/O errors,
|
||||||
* and nothing changed in this transaction group,
|
* and nothing changed in this transaction group,
|
||||||
|
* and multihost protection isn't enabled,
|
||||||
* and the vdev configuration hasn't changed,
|
* and the vdev configuration hasn't changed,
|
||||||
* then there's nothing to do.
|
* then there's nothing to do.
|
||||||
*/
|
*/
|
||||||
|
@ -2034,7 +2035,8 @@ retry:
|
||||||
boolean_t changed = uberblock_update(ub, spa->spa_root_vdev,
|
boolean_t changed = uberblock_update(ub, spa->spa_root_vdev,
|
||||||
txg, spa->spa_mmp.mmp_delay);
|
txg, spa->spa_mmp.mmp_delay);
|
||||||
|
|
||||||
if (!changed && list_is_empty(&spa->spa_config_dirty_list))
|
if (!changed && list_is_empty(&spa->spa_config_dirty_list) &&
|
||||||
|
!spa_multihost(spa))
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -5823,10 +5823,13 @@ zfs_ioc_clear(zfs_cmd_t *zc)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If multihost is enabled, resuming I/O is unsafe as another
|
* If multihost is enabled, resuming I/O is unsafe as another
|
||||||
* host may have imported the pool.
|
* host may have imported the pool. Check for remote activity.
|
||||||
*/
|
*/
|
||||||
if (spa_multihost(spa) && spa_suspended(spa))
|
if (spa_multihost(spa) && spa_suspended(spa) &&
|
||||||
return (SET_ERROR(EINVAL));
|
spa_mmp_remote_host_activity(spa)) {
|
||||||
|
spa_close(spa, FTAG);
|
||||||
|
return (SET_ERROR(EREMOTEIO));
|
||||||
|
}
|
||||||
|
|
||||||
spa_vdev_state_enter(spa, SCL_NONE);
|
spa_vdev_state_enter(spa, SCL_NONE);
|
||||||
|
|
||||||
|
|
|
@ -2532,8 +2532,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
|
||||||
"failure and the failure mode property for this pool "
|
"failure and the failure mode property for this pool "
|
||||||
"is set to panic.", spa_name(spa));
|
"is set to panic.", spa_name(spa));
|
||||||
|
|
||||||
cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
|
if (reason != ZIO_SUSPEND_MMP) {
|
||||||
"failure and has been suspended.\n", spa_name(spa));
|
cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable "
|
||||||
|
"I/O failure and has been suspended.\n", spa_name(spa));
|
||||||
|
}
|
||||||
|
|
||||||
(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
|
(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
|
||||||
NULL, NULL, 0);
|
NULL, NULL, 0);
|
||||||
|
|
|
@ -607,9 +607,11 @@ zio_handle_io_delay(zio_t *zio)
|
||||||
if (vd->vdev_guid != handler->zi_record.zi_guid)
|
if (vd->vdev_guid != handler->zi_record.zi_guid)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
/* also match on I/O type (e.g., -T read) */
|
||||||
if (handler->zi_record.zi_iotype != ZIO_TYPES &&
|
if (handler->zi_record.zi_iotype != ZIO_TYPES &&
|
||||||
handler->zi_record.zi_iotype != zio->io_type)
|
handler->zi_record.zi_iotype != zio->io_type) {
|
||||||
continue;
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Defensive; should never happen as the array allocation
|
* Defensive; should never happen as the array allocation
|
||||||
|
|
|
@ -146,7 +146,7 @@ tags = ['functional', 'mmap']
|
||||||
tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval',
|
tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval',
|
||||||
'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import',
|
'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import',
|
||||||
'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history',
|
'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history',
|
||||||
'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid']
|
'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk']
|
||||||
tags = ['functional', 'mmp']
|
tags = ['functional', 'mmp']
|
||||||
|
|
||||||
[tests/functional/mount:Linux]
|
[tests/functional/mount:Linux]
|
||||||
|
|
|
@ -1593,6 +1593,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
||||||
functional/mmp/mmp_on_zdb.ksh \
|
functional/mmp/mmp_on_zdb.ksh \
|
||||||
functional/mmp/mmp_reset_interval.ksh \
|
functional/mmp/mmp_reset_interval.ksh \
|
||||||
functional/mmp/mmp_write_distribution.ksh \
|
functional/mmp/mmp_write_distribution.ksh \
|
||||||
|
functional/mmp/mmp_write_slow_disk.ksh \
|
||||||
functional/mmp/mmp_write_uberblocks.ksh \
|
functional/mmp/mmp_write_uberblocks.ksh \
|
||||||
functional/mmp/multihost_history.ksh \
|
functional/mmp/multihost_history.ksh \
|
||||||
functional/mmp/setup.ksh \
|
functional/mmp/setup.ksh \
|
||||||
|
|
|
@ -0,0 +1,97 @@
|
||||||
|
#!/bin/ksh -p
|
||||||
|
#
|
||||||
|
# CDDL HEADER START
|
||||||
|
#
|
||||||
|
# This file and its contents are supplied under the terms of the
|
||||||
|
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||||
|
# You may only use this file in accordance with the terms of version
|
||||||
|
# 1.0 of the CDDL.
|
||||||
|
#
|
||||||
|
# A full copy of the text of the CDDL should have accompanied this
|
||||||
|
# source. A copy of the CDDL is also available via the Internet at
|
||||||
|
# http://www.illumos.org/license/CDDL.
|
||||||
|
#
|
||||||
|
# CDDL HEADER END
|
||||||
|
#
|
||||||
|
|
||||||
|
#
|
||||||
|
# Copyright (c) 2024, Klara Inc
|
||||||
|
#
|
||||||
|
|
||||||
|
# DESCRIPTION:
|
||||||
|
# Verify that long VDEV probes do not cause MMP checks to suspend pool
|
||||||
|
# Note: without PR-15839 fix, this test will suspend the pool.
|
||||||
|
#
|
||||||
|
# A device that is returning unexpected errors will trigger a vdev_probe.
|
||||||
|
# When the device additionally has slow response times, the probe can hold
|
||||||
|
# the spa config lock as a writer for a long period of time such that the
|
||||||
|
# mmp uberblock updates stall when trying to acquire the spa config lock.
|
||||||
|
#
|
||||||
|
# STRATEGY:
|
||||||
|
# 1. Create a pool with multiple leaf vdevs
|
||||||
|
# 2. Enable multihost and multihost_history
|
||||||
|
# 3. Delay for MMP writes to occur
|
||||||
|
# 4. Verify that a long VDEV probe didn't cause MMP check to suspend pool
|
||||||
|
#
|
||||||
|
|
||||||
|
. $STF_SUITE/include/libtest.shlib
|
||||||
|
. $STF_SUITE/tests/functional/mmp/mmp.cfg
|
||||||
|
. $STF_SUITE/tests/functional/mmp/mmp.kshlib
|
||||||
|
|
||||||
|
verify_runnable "both"
|
||||||
|
|
||||||
|
function cleanup
|
||||||
|
{
|
||||||
|
log_must zinject -c all
|
||||||
|
|
||||||
|
if [[ $(zpool list -H -o health $MMP_POOL) == "SUSPENDED" ]]; then
|
||||||
|
log_must zpool clear $MMP_POOL
|
||||||
|
zpool get state $MMP_POOL $MMP_DIR/file.3
|
||||||
|
zpool events | grep ".fs.zfs." | grep -v "history_event"
|
||||||
|
fi
|
||||||
|
|
||||||
|
poolexists $MMP_POOL && destroy_pool $MMP_POOL
|
||||||
|
log_must rm -r $MMP_DIR
|
||||||
|
log_must mmp_clear_hostid
|
||||||
|
}
|
||||||
|
|
||||||
|
log_assert "A long VDEV probe doesn't cause a MMP check suspend"
|
||||||
|
log_onexit cleanup
|
||||||
|
|
||||||
|
MMP_HISTORY_URL=/proc/spl/kstat/zfs/$MMP_POOL/multihost
|
||||||
|
|
||||||
|
# Create a multiple drive pool
|
||||||
|
log_must zpool events -c
|
||||||
|
log_must mkdir -p $MMP_DIR
|
||||||
|
log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5}
|
||||||
|
log_must zpool create -f $MMP_POOL \
|
||||||
|
mirror $MMP_DIR/file.{0,1,2} \
|
||||||
|
mirror $MMP_DIR/file.{3,4,5}
|
||||||
|
|
||||||
|
# Enable MMP
|
||||||
|
log_must mmp_set_hostid $HOSTID1
|
||||||
|
log_must zpool set multihost=on $MMP_POOL
|
||||||
|
clear_mmp_history
|
||||||
|
|
||||||
|
# Inject vdev write error along with a delay
|
||||||
|
log_must zinject -f 33 -e io -L pad2 -T write -d $MMP_DIR/file.3 $MMP_POOL
|
||||||
|
log_must zinject -f 50 -e io -L uber -T write -d $MMP_DIR/file.3 $MMP_POOL
|
||||||
|
log_must zinject -D 2000:4 -T write -d $MMP_DIR/file.3 $MMP_POOL
|
||||||
|
|
||||||
|
log_must dd if=/dev/urandom of=/$MMP_POOL/data bs=1M count=5
|
||||||
|
sleep 10
|
||||||
|
sync_pool $MMP_POOL
|
||||||
|
|
||||||
|
# Confirm mmp writes to the non-slow disks have taken place
|
||||||
|
for x in {0,1,2,4}; do
|
||||||
|
write_count=$(grep -c file.${x} $MMP_HISTORY_URL)
|
||||||
|
[[ $write_count -gt 0 ]] || log_fail "expecting mmp writes"
|
||||||
|
done
|
||||||
|
|
||||||
|
# Expect that the pool was not suspended
|
||||||
|
log_must check_state $MMP_POOL "" "ONLINE"
|
||||||
|
health=$(zpool list -H -o health $MMP_POOL)
|
||||||
|
log_note "$MMP_POOL health is $health"
|
||||||
|
[[ "$health" == "SUSPENDED" ]] && log_fail "$MMP_POOL $health unexpected"
|
||||||
|
|
||||||
|
log_pass "A long VDEV probe doesn't cause a MMP check suspend"
|
Loading…
Reference in New Issue