ZIL: Second attempt to reduce scope of zl_issuer_lock.

The previous patch #14841 appeared to have significant flaw, causing
deadlocks if zl_get_data callback got blocked waiting for TXG sync.  I
already handled some of such cases in the original patch, but issue
 #14982 shown cases that were impossible to solve in that design.

This patch fixes the problem by postponing log blocks allocation till
the very end, just before the zios issue, leaving nothing blocking after
that point to cause deadlocks.  Before that point though any sleeps are
now allowed, not causing sync thread blockage.  This require slightly
more complicated lwb state machine to allocate blocks and issue zios
in proper order.  But with removal of special early issue workarounds
the new code is much cleaner now, and should even be more efficient.

Since this patch uses null zios between write, I've found that null
zios do not wait for logical children ready status in zio_ready(),
that makes parent write to proceed prematurely, producing incorrect
log blocks.  Added ZIO_CHILD_LOGICAL_BIT to zio_wait_for_children()
fixes it.

Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15122
This commit is contained in:
Alexander Motin 2023-08-24 20:08:49 -04:00 committed by GitHub
parent 11fbcacf37
commit eda3fcd56f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 362 additions and 388 deletions

View File

@ -2412,7 +2412,6 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
int error; int error;
ASSERT3P(lwb, !=, NULL); ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0); ASSERT3U(size, !=, 0);
ztest_object_lock(zd, object, RL_READER); ztest_object_lock(zd, object, RL_READER);
@ -2446,6 +2445,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
DMU_READ_NO_PREFETCH); DMU_READ_NO_PREFETCH);
ASSERT0(error); ASSERT0(error);
} else { } else {
ASSERT3P(zio, !=, NULL);
size = doi.doi_data_block_size; size = doi.doi_data_block_size;
if (ISP2(size)) { if (ISP2(size)) {
offset = P2ALIGN(offset, size); offset = P2ALIGN(offset, size);

View File

@ -38,14 +38,22 @@ extern "C" {
/* /*
* Possible states for a given lwb structure. * Possible states for a given lwb structure.
* *
* An lwb will start out in the "closed" state, and then transition to * An lwb will start out in the "new" state, and transition to the "opened"
* the "opened" state via a call to zil_lwb_write_open(). When * state via a call to zil_lwb_write_open() on first itx assignment. When
* transitioning from "closed" to "opened" the zilog's "zl_issuer_lock" * transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be
* must be held. * held.
* *
* After the lwb is "opened", it can transition into the "issued" state * After the lwb is "opened", it can be assigned number of itxs and transition
* via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must * into the "closed" state via zil_lwb_write_close() when full or on timeout.
* be held when making this transition. * When transitioning from "opened" to "closed" the zilog's "zl_issuer_lock"
* must be held. New lwb allocation also takes "zl_lock" to protect the list.
*
* After the lwb is "closed", it can transition into the "ready" state via
* zil_lwb_write_issue(). "zl_lock" must be held when making this transition.
* Since it is done by the same thread, "zl_issuer_lock" is not needed.
*
* When lwb in "ready" state receives its block pointer, it can transition to
* "issued". "zl_lock" must be held when making this transition.
* *
* After the lwb's write zio completes, it transitions into the "write * After the lwb's write zio completes, it transitions into the "write
* done" state via zil_lwb_write_done(); and then into the "flush done" * done" state via zil_lwb_write_done(); and then into the "flush done"
@ -62,17 +70,20 @@ extern "C" {
* *
* Additionally, correctness when reading an lwb's state is often * Additionally, correctness when reading an lwb's state is often
* achieved by exploiting the fact that these state transitions occur in * achieved by exploiting the fact that these state transitions occur in
* this specific order; i.e. "closed" to "opened" to "issued" to "done". * this specific order; i.e. "new" to "opened" to "closed" to "ready" to
* "issued" to "write_done" and finally "flush_done".
* *
* Thus, if an lwb is in the "closed" or "opened" state, holding the * Thus, if an lwb is in the "new" or "opened" state, holding the
* "zl_issuer_lock" will prevent a concurrent thread from transitioning * "zl_issuer_lock" will prevent a concurrent thread from transitioning
* that lwb to the "issued" state. Likewise, if an lwb is already in the * that lwb to the "closed" state. Likewise, if an lwb is already in the
* "issued" state, holding the "zl_lock" will prevent a concurrent * "ready" state, holding the "zl_lock" will prevent a concurrent thread
* thread from transitioning that lwb to the "write done" state. * from transitioning that lwb to the "issued" state.
*/ */
typedef enum { typedef enum {
LWB_STATE_CLOSED, LWB_STATE_NEW,
LWB_STATE_OPENED, LWB_STATE_OPENED,
LWB_STATE_CLOSED,
LWB_STATE_READY,
LWB_STATE_ISSUED, LWB_STATE_ISSUED,
LWB_STATE_WRITE_DONE, LWB_STATE_WRITE_DONE,
LWB_STATE_FLUSH_DONE, LWB_STATE_FLUSH_DONE,
@ -91,17 +102,21 @@ typedef enum {
typedef struct lwb { typedef struct lwb {
zilog_t *lwb_zilog; /* back pointer to log struct */ zilog_t *lwb_zilog; /* back pointer to log struct */
blkptr_t lwb_blk; /* on disk address of this log blk */ blkptr_t lwb_blk; /* on disk address of this log blk */
boolean_t lwb_slim; /* log block has slim format */
boolean_t lwb_slog; /* lwb_blk is on SLOG device */ boolean_t lwb_slog; /* lwb_blk is on SLOG device */
boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */ int lwb_error; /* log block allocation error */
int lwb_nmax; /* max bytes in the buffer */
int lwb_nused; /* # used bytes in buffer */ int lwb_nused; /* # used bytes in buffer */
int lwb_nfilled; /* # filled bytes in buffer */ int lwb_nfilled; /* # filled bytes in buffer */
int lwb_sz; /* size of block and buffer */ int lwb_sz; /* size of block and buffer */
lwb_state_t lwb_state; /* the state of this lwb */ lwb_state_t lwb_state; /* the state of this lwb */
char *lwb_buf; /* log write buffer */ char *lwb_buf; /* log write buffer */
zio_t *lwb_child_zio; /* parent zio for children */
zio_t *lwb_write_zio; /* zio for the lwb buffer */ zio_t *lwb_write_zio; /* zio for the lwb buffer */
zio_t *lwb_root_zio; /* root zio for lwb write and flushes */ zio_t *lwb_root_zio; /* root zio for lwb write and flushes */
hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */ hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */
uint64_t lwb_issued_txg; /* the txg when the write is issued */ uint64_t lwb_issued_txg; /* the txg when the write is issued */
uint64_t lwb_alloc_txg; /* the txg when lwb_blk is allocated */
uint64_t lwb_max_txg; /* highest txg in this lwb */ uint64_t lwb_max_txg; /* highest txg in this lwb */
list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */ list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */

View File

@ -839,7 +839,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
uint64_t zp_gen; uint64_t zp_gen;
ASSERT3P(lwb, !=, NULL); ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0); ASSERT3U(size, !=, 0);
/* /*
@ -889,6 +888,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
} }
ASSERT(error == 0 || error == ENOENT); ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */ } else { /* indirect write */
ASSERT3P(zio, !=, NULL);
/* /*
* Have to lock the whole block to ensure when it's * Have to lock the whole block to ensure when it's
* written out and its checksum is being calculated * written out and its checksum is being calculated

File diff suppressed because it is too large Load Diff

View File

@ -4466,8 +4466,8 @@ zio_ready(zio_t *zio)
zio_t *pio, *pio_next; zio_t *pio, *pio_next;
zio_link_t *zl = NULL; zio_link_t *zl = NULL;
if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
ZIO_WAIT_READY)) { ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) {
return (NULL); return (NULL);
} }

View File

@ -698,7 +698,6 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
int error; int error;
ASSERT3P(lwb, !=, NULL); ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0); ASSERT3U(size, !=, 0);
zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
@ -717,6 +716,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
DMU_READ_NO_PREFETCH); DMU_READ_NO_PREFETCH);
} else { /* indirect write */ } else { /* indirect write */
ASSERT3P(zio, !=, NULL);
/* /*
* Have to lock the whole block to ensure when it's written out * Have to lock the whole block to ensure when it's written out
* and its checksum is being calculated that no one can change * and its checksum is being calculated that no one can change