zvol: call zil_replaying() during replay

zil_replaying(zil, tx) has the side-effect of informing the ZIL that an
entry has been replayed in the (still open) tx.  The ZIL uses that
information to record the replay progress in the ZIL header when that
tx's txg syncs.

ZPL log entries are not idempotent and are logically dependent on one
another, so calling zil_replaying() is necessary for correctness.

For ZVOLs the question of correctness is more nuanced: ZVOL logs only
TX_WRITE and TX_TRUNCATE, both of which are idempotent. Logical
dependencies between two records exist only if the write or discard
request had sync semantics or if the ranges affected by the records
overlap.

Thus, at first glance, it would be correct to restart replay from
the beginning if we crash before replay completes. But this does not
address the following scenario:
Assume one log record per LWB.
The chain on disk is

    HDR -> 1:W(1, "A") -> 2:W(1, "B") -> 3:W(2, "X") -> 4:W(3, "Z")

where N:W(O, C) represents log entry number N, which is a TX_WRITE of C
to offset O.
We replay 1, 2 and 3 in one txg, sync that txg, then crash.
Bit flips corrupt 2, 3, and 4.
We come up again and restart replay from the beginning because
we did not call zil_replaying() during replay.
We replay 1 again, then interpret 2's invalid checksum as the end
of the ZIL chain and call replay done.
The replayed zvol content is "AX".

If we had called zil_replaying() the HDR would have pointed to 3
and our resumed replay would not have replayed anything because
3 was corrupted, resulting in zvol content "BX".

If 3 logically depends on 2 then the replay corrupted the ZVOL_OBJ's
contents.

This patch adds the zil_replaying() calls to the replay functions.
Since the callbacks in the replay function need the zilog_t* pointer
so that they can call zil_replaying(), we open the ZIL while
replaying in zvol_create_minor(). We also verify that replay has
been done when opening the ZIL on demand for the first modifying
bio.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Christian Schwarz <me@cschwarz.com>
Closes #11667
This commit is contained in:
Christian Schwarz 2021-03-07 18:49:58 +01:00 committed by Tony Hutter
parent 0ccffb2634
commit abb485a34a
3 changed files with 30 additions and 3 deletions

View File

@ -1157,6 +1157,9 @@ zvol_ensure_zilog(zvol_state_t *zv)
zv->zv_zilog = zil_open(zv->zv_objset, zv->zv_zilog = zil_open(zv->zv_objset,
zvol_get_data); zvol_get_data);
zv->zv_flags |= ZVOL_WRITTEN_TO; zv->zv_flags |= ZVOL_WRITTEN_TO;
/* replay / destroy done in zvol_create_minor_impl() */
VERIFY0((zv->zv_zilog->zl_header->zh_flags &
ZIL_REPLAY_NEEDED));
} }
rw_downgrade(&zv->zv_suspend_lock); rw_downgrade(&zv->zv_suspend_lock);
} }
@ -1381,12 +1384,16 @@ zvol_create_minor_impl(const char *name)
zv->zv_volsize = volsize; zv->zv_volsize = volsize;
zv->zv_objset = os; zv->zv_objset = os;
ASSERT3P(zv->zv_zilog, ==, NULL);
zv->zv_zilog = zil_open(os, zvol_get_data);
if (spa_writeable(dmu_objset_spa(os))) { if (spa_writeable(dmu_objset_spa(os))) {
if (zil_replay_disable) if (zil_replay_disable)
zil_destroy(dmu_objset_zil(os), B_FALSE); zil_destroy(zv->zv_zilog, B_FALSE);
else else
zil_replay(os, zv, zvol_replay_vector); zil_replay(os, zv, zvol_replay_vector);
} }
zil_close(zv->zv_zilog);
zv->zv_zilog = NULL;
ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

View File

@ -357,6 +357,9 @@ zvol_request(struct request_queue *q, struct bio *bio)
zv->zv_zilog = zil_open(zv->zv_objset, zv->zv_zilog = zil_open(zv->zv_objset,
zvol_get_data); zvol_get_data);
zv->zv_flags |= ZVOL_WRITTEN_TO; zv->zv_flags |= ZVOL_WRITTEN_TO;
/* replay / destroy done in zvol_create_minor */
VERIFY0((zv->zv_zilog->zl_header->zh_flags &
ZIL_REPLAY_NEEDED));
} }
rw_downgrade(&zv->zv_suspend_lock); rw_downgrade(&zv->zv_suspend_lock);
} }
@ -947,12 +950,16 @@ zvol_os_create_minor(const char *name)
blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif #endif
ASSERT3P(zv->zv_zilog, ==, NULL);
zv->zv_zilog = zil_open(os, zvol_get_data);
if (spa_writeable(dmu_objset_spa(os))) { if (spa_writeable(dmu_objset_spa(os))) {
if (zil_replay_disable) if (zil_replay_disable)
zil_destroy(dmu_objset_zil(os), B_FALSE); zil_destroy(zv->zv_zilog, B_FALSE);
else else
zil_replay(os, zv, zvol_replay_vector); zil_replay(os, zv, zvol_replay_vector);
} }
zil_close(zv->zv_zilog);
zv->zv_zilog = NULL;
ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

View File

@ -473,7 +473,19 @@ zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
offset = lr->lr_offset; offset = lr->lr_offset;
length = lr->lr_length; length = lr->lr_length;
return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length)); dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
dmu_tx_mark_netfree(tx);
int error = dmu_tx_assign(tx, TXG_WAIT);
if (error != 0) {
dmu_tx_abort(tx);
} else {
zil_replaying(zv->zv_zilog, tx);
dmu_tx_commit(tx);
error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset,
length);
}
return (error);
} }
/* /*
@ -513,6 +525,7 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
dmu_tx_abort(tx); dmu_tx_abort(tx);
} else { } else {
dmu_write(os, ZVOL_OBJ, offset, length, data, tx); dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
zil_replaying(zv->zv_zilog, tx);
dmu_tx_commit(tx); dmu_tx_commit(tx);
} }