zil: refactor zil_commit_waiter()
The previous change added a check to fall back to waiting forever if the ZIL failed. This check was inverted; it actually caused it to always enter a timed wait when it failed. When combined with the fact that the last lwb issued likely would have failed quickly and so had a very small latency, this caused effectively an infinite loop. I initially fixed the check, but on further study I decided that this loop doesn't need to exist. The way the whole logic falls out of the original code in 2.1.5 is that if the lwb is OPENED, wait then issue it, and if not (or post issue), wait forever. The loop will never see more than two iterations, one for each half of the OPENED check, and it will stop as soon as the waiter is signaled (zcw_done true), so it can be far more simply expressed as a linear sequence: if (!issued) { wait a few if (done) return issue IO } if (!done) wait forever This still holds when the ZIL fails, because zil_commit_waiter_timeout() will check for failure under zl_issuer_lock, which zil_fail() will wait for, and in turn, zil_fail() will wait on zcw_lock and then signal the waiter before it releases zl_issuer_lock. Taken together, that means that zil_commit_waiter_timeout() will do all it can under the circumstances, and waiting forever the waiter to complete is all we can past that point. (cherry picked from commit c57c2ddd6f803f429da1e2b53abab277d781a5a3)
This commit is contained in:
parent
d6d343bb87
commit
5b98ced63c
158
module/zfs/zil.c
158
module/zfs/zil.c
|
@ -3302,108 +3302,84 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
|
|||
|
||||
mutex_enter(&zcw->zcw_lock);
|
||||
|
||||
/*
|
||||
* The timeout is scaled based on the lwb latency to avoid
|
||||
* significantly impacting the latency of each individual itx.
|
||||
* For more details, see the comment at the bottom of the
|
||||
* zil_process_commit_list() function.
|
||||
*/
|
||||
int pct = MAX(zfs_commit_timeout_pct, 1);
|
||||
hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
|
||||
hrtime_t wakeup = gethrtime() + sleep;
|
||||
boolean_t timedout = B_FALSE;
|
||||
|
||||
while (!zcw->zcw_done) {
|
||||
ASSERT(MUTEX_HELD(&zcw->zcw_lock));
|
||||
|
||||
lwb_t *lwb = zcw->zcw_lwb;
|
||||
|
||||
if (zcw->zcw_done) {
|
||||
/*
|
||||
* Usually, the waiter will have a non-NULL lwb field here,
|
||||
* but it's possible for it to be NULL as a result of
|
||||
* zil_commit() racing with spa_sync().
|
||||
*
|
||||
* When zil_clean() is called, it's possible for the itxg
|
||||
* list (which may be cleaned via a taskq) to contain
|
||||
* commit itxs. When this occurs, the commit waiters linked
|
||||
* off of these commit itxs will not be committed to an
|
||||
* lwb. Additionally, these commit waiters will not be
|
||||
* marked done until zil_commit_waiter_skip() is called via
|
||||
* zil_itxg_clean().
|
||||
*
|
||||
* Thus, it's possible for this commit waiter (i.e. the
|
||||
* "zcw" variable) to be found in this "in between" state;
|
||||
* where it's "zcw_lwb" field is NULL, and it hasn't yet
|
||||
* been skipped, so it's "zcw_done" field is still B_FALSE.
|
||||
*
|
||||
* If the ZIL failed before we got the issuer lock in
|
||||
* zil_commit_waiter_timeout(), then the lwb will remain open
|
||||
* and unissued. There's no point doing a timeout wait after
|
||||
* that, as all its ops will be commited before the ZIL is
|
||||
* reopened. So we just go to sleep until the waiter is fired
|
||||
* by zil_clean().
|
||||
* The IO may have been issued and completed, or the ZIL may
|
||||
* have failed (signaling all outstanding waiters) since we
|
||||
* dropped zl_issuer_lock at the end of zil_commit_writer().
|
||||
* If so, there's no need to wait; our work here is done.
|
||||
*/
|
||||
IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED);
|
||||
mutex_exit(&zcw->zcw_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
if ((lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) ||
|
||||
zil_failed(zilog)) {
|
||||
ASSERT3B(timedout, ==, B_FALSE);
|
||||
/*
|
||||
* Usually, the waiter will have a non-NULL lwb field here, but it's
|
||||
* possible for it to be NULL as a result of zil_commit() racing with
|
||||
* spa_sync().
|
||||
*
|
||||
* When zil_clean() is called, it's possible for the itxg list (which
|
||||
* may be cleaned via a taskq) to contain commit itxs. When this
|
||||
* occurs, the commit waiters linked off of these commit itxs will not
|
||||
* be committed to an lwb. Additionally, these commit waiters will not
|
||||
* be marked done until zil_commit_waiter_skip() is called via
|
||||
* zil_itxg_clean().
|
||||
*
|
||||
* Thus, it's possible for this commit waiter (i.e. the "zcw" variable)
|
||||
* to be found in this "in between" state; where it's "zcw_lwb" field
|
||||
* is NULL, and it hasn't yet been skipped, so it's "zcw_done" field is
|
||||
* still B_FALSE.
|
||||
*/
|
||||
lwb_t *lwb = zcw->zcw_lwb;
|
||||
IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED);
|
||||
|
||||
if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
|
||||
/*
|
||||
* If the lwb hasn't been issued yet, then we need to wait with
|
||||
* a timeout, in case this function needs to issue the lwb
|
||||
* after the timeout is reached; responsibility (2) from the
|
||||
* comment above this function.
|
||||
*
|
||||
* The timeout is scaled based on the lwb latency to avoid
|
||||
* significantly impacting the latency of each individual itx.
|
||||
* For more details, see the comment at the bottom of the
|
||||
* zil_process_commit_list() function.
|
||||
*/
|
||||
int pct = MAX(zfs_commit_timeout_pct, 1);
|
||||
hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
|
||||
hrtime_t wakeup = gethrtime() + sleep;
|
||||
|
||||
int rc = cv_timedwait_hires(&zcw->zcw_cv,
|
||||
&zcw->zcw_lock, wakeup, USEC2NSEC(1),
|
||||
CALLOUT_FLAG_ABSOLUTE);
|
||||
|
||||
if (!zcw->zcw_done) {
|
||||
ASSERT3U(rc, ==, -1);
|
||||
/*
|
||||
* If the lwb hasn't been issued yet, then we
|
||||
* need to wait with a timeout, in case this
|
||||
* function needs to issue the lwb after the
|
||||
* timeout is reached; responsibility (2) from
|
||||
* the comment above this function.
|
||||
* The wait timed out and the lwb hasn't been issued,
|
||||
* so we have to do that now.
|
||||
*/
|
||||
int rc = cv_timedwait_hires(&zcw->zcw_cv,
|
||||
&zcw->zcw_lock, wakeup, USEC2NSEC(1),
|
||||
CALLOUT_FLAG_ABSOLUTE);
|
||||
|
||||
if (rc != -1 || zcw->zcw_done)
|
||||
continue;
|
||||
|
||||
timedout = B_TRUE;
|
||||
zil_commit_waiter_timeout(zilog, zcw);
|
||||
|
||||
if (!zcw->zcw_done) {
|
||||
/*
|
||||
* If the commit waiter has already been
|
||||
* marked "done", it's possible for the
|
||||
* waiter's lwb structure to have already
|
||||
* been freed. Thus, we can only reliably
|
||||
* make these assertions if the waiter
|
||||
* isn't done.
|
||||
*/
|
||||
ASSERT3P(lwb, ==, zcw->zcw_lwb);
|
||||
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* If the lwb isn't open, then it must have already
|
||||
* been issued. In that case, there's no need to
|
||||
* use a timeout when waiting for the lwb to
|
||||
* complete.
|
||||
*
|
||||
* Additionally, if the lwb is NULL, the waiter
|
||||
* will soon be signaled and marked done via
|
||||
* zil_clean() and zil_itxg_clean(), so no timeout
|
||||
* is required.
|
||||
*
|
||||
* If it is open, then the ZIL failed before we could
|
||||
* issue it. See comment at top of loop.
|
||||
*/
|
||||
|
||||
IMPLY(lwb != NULL,
|
||||
lwb->lwb_state == LWB_STATE_ISSUED ||
|
||||
lwb->lwb_state == LWB_STATE_WRITE_DONE ||
|
||||
lwb->lwb_state == LWB_STATE_FLUSH_DONE ||
|
||||
(lwb->lwb_state == LWB_STATE_OPENED &&
|
||||
zil_failed(zilog)));
|
||||
cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
|
||||
ASSERT(MUTEX_HELD(&zcw->zcw_lock));
|
||||
}
|
||||
}
|
||||
|
||||
if (!zcw->zcw_done) {
|
||||
ASSERT(MUTEX_HELD(&zcw->zcw_lock));
|
||||
|
||||
/*
|
||||
* If we're here, then the lwb has been issued or the ZIL has
|
||||
* failed or is failing. In all cases, there's nothing else for
|
||||
* us to do except wait to be signaled that the data is on disk,
|
||||
* either when the lwb is flushed, the txg of the original op
|
||||
* completes, or the ZIL fails.
|
||||
*/
|
||||
cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
|
||||
}
|
||||
|
||||
ASSERT(zcw->zcw_done);
|
||||
|
||||
mutex_exit(&zcw->zcw_lock);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue