Merge 2331d19dab into 1713aa7b4d
commit 68a96ce1a5
@@ -598,7 +598,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg,
    blkptr_t *new_bp, uint64_t size, boolean_t *slog);
-extern void zio_flush(zio_t *zio, vdev_t *vd);
+extern void zio_flush(zio_t *zio, vdev_t *vd, boolean_t propagate);
extern void zio_shrink(zio_t *zio, uint64_t size);

extern int zio_wait(zio_t *zio);
@@ -1014,21 +1014,6 @@ vdev_geom_io_intr(struct bio *bp)
            zio->io_error = SET_ERROR(EIO);

    switch (zio->io_error) {
-   case ENOTSUP:
-       /*
-        * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
-        * that future attempts will never succeed. In this case
-        * we set a persistent flag so that we don't bother with
-        * requests in the future.
-        */
-       switch (bp->bio_cmd) {
-       case BIO_FLUSH:
-           vd->vdev_nowritecache = B_TRUE;
-           break;
-       case BIO_DELETE:
-           break;
-       }
-       break;
    case ENXIO:
        if (!vd->vdev_remove_wanted) {
            /*
@@ -1232,9 +1232,6 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
    zio->io_error = -error;
#endif

-   if (zio->io_error && (zio->io_error == EOPNOTSUPP))
-       zio->io_vd->vdev_nowritecache = B_TRUE;
-
    bio_put(bio);
    ASSERT3S(zio->io_error, >=, 0);
    if (zio->io_error)
@@ -1830,19 +1830,21 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)

    for (int v = 0; v < svdcount; v++) {
        if (vdev_writeable(svd[v])) {
-           zio_flush(zio, svd[v]);
+           zio_flush(zio, svd[v], B_FALSE);
        }
    }
    if (spa->spa_aux_sync_uber) {
        spa->spa_aux_sync_uber = B_FALSE;
        for (int v = 0; v < spa->spa_spares.sav_count; v++) {
            if (vdev_writeable(spa->spa_spares.sav_vdevs[v])) {
-               zio_flush(zio, spa->spa_spares.sav_vdevs[v]);
+               zio_flush(zio, spa->spa_spares.sav_vdevs[v],
+                   B_FALSE);
            }
        }
        for (int v = 0; v < spa->spa_l2cache.sav_count; v++) {
            if (vdev_writeable(spa->spa_l2cache.sav_vdevs[v])) {
-               zio_flush(zio, spa->spa_l2cache.sav_vdevs[v]);
+               zio_flush(zio, spa->spa_l2cache.sav_vdevs[v],
+                   B_FALSE);
            }
        }
    }
@@ -2007,13 +2009,13 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
    zio = zio_root(spa, NULL, NULL, flags);

    for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd))
-       zio_flush(zio, vd);
+       zio_flush(zio, vd, B_FALSE);

    for (int i = 0; i < 2; i++) {
        if (!sav[i]->sav_label_sync)
            continue;
        for (int v = 0; v < sav[i]->sav_count; v++)
-           zio_flush(zio, sav[i]->sav_vdevs[v]);
+           zio_flush(zio, sav[i]->sav_vdevs[v], B_FALSE);
        if (l == 1)
            sav[i]->sav_label_sync = B_FALSE;
    }
@@ -2091,7 +2093,7 @@ retry:
    for (vdev_t *vd =
        txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL;
        vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
-       zio_flush(zio, vd);
+       zio_flush(zio, vd, B_FALSE);

    (void) zio_wait(zio);
@@ -4172,7 +4172,7 @@ io_error_exit:
        goto io_error_exit;
    }
    pio = zio_root(spa, NULL, NULL, 0);
-   zio_flush(pio, raidvd);
+   zio_flush(pio, raidvd, B_FALSE);
    zio_wait(pio);

    zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
@@ -4231,7 +4231,7 @@ overwrite:
        goto io_error_exit;
    }
    pio = zio_root(spa, NULL, NULL, 0);
-   zio_flush(pio, raidvd);
+   zio_flush(pio, raidvd, B_FALSE);
    zio_wait(pio);

    zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
@@ -4339,7 +4339,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa)
    }
    zio_wait(pio);
    pio = zio_root(spa, NULL, NULL, 0);
-   zio_flush(pio, raidvd);
+   zio_flush(pio, raidvd, B_FALSE);
    zio_wait(pio);

    zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
@@ -23,6 +23,7 @@
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright (c) 2018 Datto Inc.
+ * Copyright (c) 2024, Klara, Inc.
 */

/* Portions Copyright 2010 Robert Milkowski */
@@ -1495,12 +1496,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
         * includes ZIO errors from either this LWB's write or
         * flush, as well as any errors from other dependent LWBs
         * (e.g. a root LWB ZIO that might be a child of this LWB).
-        *
-        * With that said, it's important to note that LWB flush
-        * errors are not propagated up to the LWB root ZIO.
-        * This is incorrect behavior, and results in VDEV flush
-        * errors not being handled correctly here. See the
-        * comment above the call to "zio_flush" for details.
         */
        zcw->zcw_zio_error = zio->io_error;
@@ -1650,17 +1645,8 @@ zil_lwb_write_done(zio_t *zio)

    while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
        vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
-       if (vd != NULL) {
-           /*
-            * The "ZIO_FLAG_DONT_PROPAGATE" is currently
-            * always used within "zio_flush". This means,
-            * any errors when flushing the vdev(s), will
-            * (unfortunately) not be handled correctly,
-            * since these "zio_flush" errors will not be
-            * propagated up to "zil_lwb_flush_vdevs_done".
-            */
-           zio_flush(lwb->lwb_root_zio, vd);
-       }
+       if (vd != NULL)
+           zio_flush(lwb->lwb_root_zio, vd, B_TRUE);
        kmem_free(zv, sizeof (*zv));
    }
}
@@ -1640,10 +1640,10 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
 * the flushes complete.
 */
void
-zio_flush(zio_t *pio, vdev_t *vd)
+zio_flush(zio_t *pio, vdev_t *vd, boolean_t propagate)
{
-   const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
-       ZIO_FLAG_DONT_RETRY;
+   const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
+       (propagate ? 0 : ZIO_FLAG_DONT_PROPAGATE);

    if (vd->vdev_nowritecache)
        return;
@@ -1654,7 +1654,7 @@ zio_flush(zio_t *pio, vdev_t *vd)
        NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE));
    } else {
        for (uint64_t c = 0; c < vd->vdev_children; c++)
-           zio_flush(pio, vd->vdev_child[c]);
+           zio_flush(pio, vd->vdev_child[c], propagate);
    }
}
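As a reading aid, a minimal caller sketch (not from the commit) of how the new
parameter is used, assuming a parent zio "pio" and a vdev "vd" already exist:

    /*
     * Sketch only. Passing B_TRUE drops ZIO_FLAG_DONT_PROPAGATE, so a flush
     * failure becomes visible in the parent zio's io_error; B_FALSE keeps
     * the old best-effort behaviour.
     */
    zio_flush(pio, vd, B_TRUE);     /* ZIL commit path: caller must see the error */
    int error = zio_wait(pio);      /* nonzero if the propagated flush failed */

This is the split the diff itself makes: the ZIL passes B_TRUE, while label,
uberblock, and raidz reflow flushes pass B_FALSE and stay fire-and-forget.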
@@ -4553,11 +4553,14 @@ zio_vdev_io_assess(zio_t *zio)
    /*
     * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
     * attempts will ever succeed. In this case we set a persistent
-    * boolean flag so that we don't bother with it in the future.
+    * boolean flag so that we don't bother with it in the future, and
+    * then we act like the flush succeeded.
     */
    if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
-       zio->io_type == ZIO_TYPE_FLUSH && vd != NULL)
+       zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) {
        vd->vdev_nowritecache = B_TRUE;
+       zio->io_error = 0;
+   }

    if (zio->io_error)
        zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
@@ -124,6 +124,10 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
    'scrub_after_resilver', 'suspend_resume_single', 'zpool_status_-s']
tags = ['functional', 'fault']

+[tests/functional/flush:Linux]
+tests = ['zil_flush_error']
+tags = ['functional', 'flush']
+
[tests/functional/features/large_dnode:Linux]
tests = ['large_dnode_002_pos', 'large_dnode_006_pos', 'large_dnode_008_pos']
tags = ['functional', 'features', 'large_dnode']
@@ -379,6 +379,7 @@ if os.environ.get('CI') == 'true':
    'fault/auto_spare_ashift': ['SKIP', ci_reason],
    'fault/auto_spare_shared': ['SKIP', ci_reason],
    'fault/suspend_resume_single': ['SKIP', ci_reason],
+   'flush/zil_flush_error': ['SKIP', ci_reason],
    'procfs/pool_state': ['SKIP', ci_reason],
})
@@ -462,13 +462,16 @@ function unload_scsi_debug
# Get scsi_debug device name.
# Returns basename of scsi_debug device (for example "sdb").
#
-function get_debug_device
+# $1 (optional): Return the first $1 number of SCSI debug device names.
+function get_debug_device #num
{
+   typeset num=${1:-1}
+
    for i in {1..10} ; do
-       val=$(lsscsi | awk '/scsi_debug/ {print $6; exit}' | cut -d/ -f3)
+       val=$(lsscsi | awk '/scsi_debug/ {print $6}' | cut -d/ -f3 | head -n$num)

        # lsscsi can take time to settle
-       if [ "$val" != "-" ] ; then
+       if [[ ! "$val" =~ "-" ]] ; then
            break
        fi
        sleep 1
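A usage sketch (not part of the change), mirroring the call made later in
zil_flush_error.ksh and assuming blkdev.shlib has already been sourced:

# Fetch the first two scsi_debug device names into an array.
set -A sd $(get_debug_device 2)
log_note "scsi_debug nodes: ${sd[0]} ${sd[1]}"

# With no argument the behaviour is unchanged: a single device name is returned.
typeset dev=$(get_debug_device)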
@@ -1516,6 +1516,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
    functional/features/large_dnode/large_dnode_008_pos.ksh \
    functional/features/large_dnode/large_dnode_009_pos.ksh \
    functional/features/large_dnode/setup.ksh \
+   functional/flush/cleanup.ksh \
+   functional/flush/zil_flush_error.ksh \
+   functional/flush/setup.ksh \
    functional/grow/grow_pool_001_pos.ksh \
    functional/grow/grow_replicas_001_pos.ksh \
    functional/history/cleanup.ksh \
@@ -0,0 +1,28 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END

#
# Copyright (c) 2024, Klara, Inc.
#

. $STF_SUITE/include/libtest.shlib

default_cleanup
@@ -0,0 +1,30 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END

#
# Copyright (c) 2024, Klara, Inc.
#

. $STF_SUITE/include/libtest.shlib

verify_runnable "global"

log_pass
@@ -0,0 +1,259 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2024, Klara, Inc.
#

#
# This tests that if the ZIL write sequence fails, it correctly falls back and
# waits until the transaction has fully committed before returning.
#
# When this test was written, the ZIL had a flaw - it assumes that if its
# writes succeed, then the data is definitely on disk and available for replay
# if the pool fails. It issues a flush immediately after the write, but does
# not check its result. If a disk fails after the data has been accepted into
# the disk cache, but before it can be written to permanent storage, then
# fsync() will have returned success even though the data is not stored in the
# ZIL for replay.
#
# If the main pool then fails before the transaction can be written, then data
# is lost, and fsync() returning success was premature.
#
# To prove this, we create a pool with a separate log device. We inject two
# faults:
#
# - ZIL writes appear to succeed, but never make it to disk
# - ZIL flushes fail, and return error
#
# We then remove the main pool device, and do a write+fsync. This goes to the
# ZIL, and appears to succeed. When the txg closes, the write will fail, and
# the pool suspends.
#
# Then, we simulate a reboot by copying the content of the pool devices aside.
# We restore the pool devices, bring it back online, and export it - we don't
# need it anymore, but we have to clean up properly. Then we restore the copied
# content and import the pool, in whatever state it was in when it suspended.
#
# Finally, we check the content of the file we wrote to. If it matches what we
# wrote, then the fsync() was correct, and all is well. If it doesn't match,
# then the flaw is present, and the test fails.
#
# We run the test twice: once without the log device injections, once with. The
# first confirms the expected behaviour of the ZIL - when the pool is imported,
# the log is replayed. The second fails as above. When the flaw is corrected,
# both will succeed, and this overall test succeeds.
#
. $STF_SUITE/include/libtest.shlib

TMPDIR=${TMPDIR:-$TEST_BASE_DIR}

BACKUP_MAIN="$TMPDIR/backup_main"
BACKUP_LOG="$TMPDIR/backup_log"

LOOP_LOG="$TMPDIR/loop_log"

DATA_FILE="$TMPDIR/data_file"

verify_runnable "global"

function cleanup
{
    zinject -c all
    destroy_pool $TESTPOOL
    unload_scsi_debug
    rm -f $BACKUP_MAIN $BACKUP_LOG $DATA_FILE
}

log_onexit cleanup

log_assert "verify fsync() waits if the ZIL commit fails"

# create 128K of random data, and take its checksum. we do this up front to
# ensure we don't get messed up by any latency from reading /dev/random or
# checksumming the file on the pool
log_must dd if=/dev/random of=$DATA_FILE bs=128K count=1
typeset sum=$(sha256digest $DATA_FILE)

# create a virtual scsi device with two device nodes. these are backed by the
# same memory. we do this because we need to be able to take the device offline
# properly in order to get the pool to suspend; fault injection on a loop
# device can't do it. once offline, we can use the second node to take a copy
# of its state.
load_scsi_debug 100 1 2 1 '512b'
set -A sd $(get_debug_device 2)

# create a loop device for the log.
truncate -s 100M $LOOP_LOG
typeset ld=$(basename $(losetup -f))
log_must losetup /dev/$ld $LOOP_LOG

# this function runs the entire test sequence. the option decides if faults
# are injected on the slog device, mimicking the trigger situation that causes
# the fsync() bug to occur
function test_fsync
{
    typeset -i do_fault_log="$1"

    log_note "setting up test"

    # create the pool. the main data store is on the scsi device, with the
    # log on a loopback. we bias the ZIL towards the log device to try
    # to ensure that fsync() never involves the main device
    log_must zpool create -f -O logbias=latency $TESTPOOL ${sd[0]} log $ld

    # create the file ahead of time. the ZIL head structure is created on
    # first use, and does a full txg wait, which we need to avoid
    log_must dd if=/dev/zero of=/$TESTPOOL/data_file \
        bs=128k count=1 conv=fsync
    log_must zpool sync

    # arrange for writes to the log device to appear to succeed, and
    # flushes to fail. this simulates a loss of the device between it
    # accepting the write into its cache, but before it can be written
    # out
    if [[ $do_fault_log != 0 ]] ; then
        log_note "injecting log device faults"
        log_must zinject -d $ld -e noop -T write $TESTPOOL
        log_must zinject -d $ld -e io -T flush $TESTPOOL
    fi

    # take the main device offline. there is no IO in flight, so ZFS won't
    # notice immediately
    log_note "taking main pool offline"
    log_must eval "echo offline > /sys/block/${sd[0]}/device/state"

    # write out some data, then call fsync(). there are three possible
    # results:
    #
    # - if the bug is present, fsync() will return success, and dd will
    #   succeed "immediately"; before the pool suspends
    # - if the bug is fixed, fsync() will block, the pool will suspend, and
    #   dd will return success after the pool returns to service
    # - if something else goes wrong, dd will fail; this may happen before
    #   or after the pool suspends or returns. this shouldn't happen, and
    #   should abort the test
    #
    # we have to put dd in the background, otherwise if it blocks we will
    # block with it. what we're interested in is whether or not it succeeds
    # before the pool is suspended. if it does, then we expect that after
    # the suspended pool is reimported, the data will have been written
    log_note "running dd in background to write data and call fsync()"
    dd if=$DATA_FILE of=/$TESTPOOL/data_file bs=128k count=1 conv=fsync &
    fsync_pid=$!

    # wait for the pool to suspend. this should happen within ~5s, when the
    # txg sync tries to write the change to the main device
    log_note "waiting for pool to suspend"
    typeset -i tries=10
    until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do
        if ((tries-- == 0)); then
            log_fail "pool didn't suspend"
        fi
        sleep 1
    done

    # the pool is suspended. see if dd is still present; if it is, then
    # it's blocked in fsync(), and we have no expectation that the write
    # made it to disk. if dd has exited, then its return code will tell
    # us whether fsync() returned success, or it failed for some other
    # reason
    typeset -i fsync_success=0
    if kill -0 $fsync_pid ; then
        log_note "dd is blocked; fsync() has not returned"
    else
        log_note "dd has finished, ensuring it was successful"
        log_must wait $fsync_pid
        fsync_success=1
    fi

    # pool is suspended. if we online the main device right now, it will
    # retry writing the transaction, which will succeed, and everything
    # will continue as it's supposed to. that's the opposite of what we
    # want; we want to do an import, as if after reboot, to force the pool
    # to try to replay the ZIL, so we can compare the final result against
    # what fsync() told us
    #
    # however, right now the pool is wedged. we need to get it back online
    # so we can export it, so we can do the import. so we need to copy the
    # entire pool state away. for the scsi device, we can do this through
    # the second device node. for the loopback, we can copy it directly
    log_note "taking copy of suspended pool"
    log_must cp /dev/${sd[1]} $BACKUP_MAIN
    log_must cp /dev/$ld $BACKUP_LOG

    # bring the entire pool back online, by clearing error injections and
    # restoring the main device. this will unblock anything still waiting
    # on it, and tidy up all the internals so we can reset it
    log_note "bringing pool back online"
    if [[ $do_fault_log != 0 ]] ; then
        log_must zinject -c all
    fi
    log_must eval "echo running > /sys/block/${sd[0]}/device/state"
    log_must zpool clear $TESTPOOL

    # now the pool is back online. if dd was blocked, it should now
    # complete successfully. make sure that's true
    if [[ $fsync_success == 0 ]] ; then
        log_note "ensuring blocked dd has now finished"
        log_must wait $fsync_pid
    fi

    log_note "exporting pool"

    # pool now clean, export it
    log_must zpool export $TESTPOOL

    log_note "reverting pool to suspended state"

    # restore the pool to the suspended state, mimicking a reboot
    log_must cp $BACKUP_MAIN /dev/${sd[0]}
    log_must cp $BACKUP_LOG /dev/$ld

    # import the crashed pool
    log_must zpool import $TESTPOOL

    # if fsync() succeeded before the pool suspended, then the ZIL should
    # have replayed properly and the data is now available on the pool
    #
    # note that we don't check the alternative; fsync() blocking does not
    # mean that data _didn't_ make it to disk, just that ZFS never claimed
    # that it did. in that case we can't know what _should_ be on disk
    # right now, so can't check
    if [[ $fsync_success == 1 ]] ; then
        log_note "fsync() succeeded earlier; checking data was written correctly"
        typeset newsum=$(sha256digest /$TESTPOOL/data_file)
        log_must test "$sum" = "$newsum"
    fi

    log_note "test finished, cleaning up"
    log_must zpool destroy -f $TESTPOOL
}

log_note "first run: ZIL succeeds, and repairs the pool at import"
test_fsync 0

log_note "second run: ZIL commit fails, and falls back to txg sync"
test_fsync 1

log_pass "fsync() waits if the ZIL commit fails"