zil: add stats for commit failure/fallback (#16315)
There's no good way to tell when a ZIL commit fails and falls back to a transaction sync, other than perhaps a throughput drop. This adds counters so we can see when it happens and why.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
This commit is contained in:
parent
2fc646160f
commit
7ddc1f737f
|
@ -43,6 +43,9 @@ cols = {
|
||||||
"obj": [12, -1, "objset"],
|
"obj": [12, -1, "objset"],
|
||||||
"cc": [5, 1000, "zil_commit_count"],
|
"cc": [5, 1000, "zil_commit_count"],
|
||||||
"cwc": [5, 1000, "zil_commit_writer_count"],
|
"cwc": [5, 1000, "zil_commit_writer_count"],
|
||||||
|
"cec": [5, 1000, "zil_commit_error_count"],
|
||||||
|
"csc": [5, 1000, "zil_commit_stall_count"],
|
||||||
|
"cSc": [5, 1000, "zil_commit_suspend_count"],
|
||||||
"ic": [5, 1000, "zil_itx_count"],
|
"ic": [5, 1000, "zil_itx_count"],
|
||||||
"iic": [5, 1000, "zil_itx_indirect_count"],
|
"iic": [5, 1000, "zil_itx_indirect_count"],
|
||||||
"iib": [5, 1024, "zil_itx_indirect_bytes"],
|
"iib": [5, 1024, "zil_itx_indirect_bytes"],
|
||||||
|
|
|
@ -467,6 +467,21 @@ typedef struct zil_stats {
|
||||||
*/
|
*/
|
||||||
kstat_named_t zil_commit_writer_count;
|
kstat_named_t zil_commit_writer_count;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Number of times a ZIL commit failed and the ZIL was forced to fall
|
||||||
|
* back to txg_wait_synced(). The separate counts are for different
|
||||||
|
* reasons:
|
||||||
|
* - error: ZIL IO (write/flush) returned an error
|
||||||
|
* (see zil_commit_impl())
|
||||||
|
* - stall: LWB block allocation failed, ZIL chain abandoned
|
||||||
|
* (see zil_commit_writer_stall())
|
||||||
|
* - suspend: ZIL suspended
|
||||||
|
* (see zil_commit(), zil_get_commit_list())
|
||||||
|
*/
|
||||||
|
kstat_named_t zil_commit_error_count;
|
||||||
|
kstat_named_t zil_commit_stall_count;
|
||||||
|
kstat_named_t zil_commit_suspend_count;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Number of transactions (reads, writes, renames, etc.)
|
* Number of transactions (reads, writes, renames, etc.)
|
||||||
* that have been committed.
|
* that have been committed.
|
||||||
|
@ -510,6 +525,9 @@ typedef struct zil_stats {
|
||||||
typedef struct zil_sums {
|
typedef struct zil_sums {
|
||||||
wmsum_t zil_commit_count;
|
wmsum_t zil_commit_count;
|
||||||
wmsum_t zil_commit_writer_count;
|
wmsum_t zil_commit_writer_count;
|
||||||
|
wmsum_t zil_commit_error_count;
|
||||||
|
wmsum_t zil_commit_stall_count;
|
||||||
|
wmsum_t zil_commit_suspend_count;
|
||||||
wmsum_t zil_itx_count;
|
wmsum_t zil_itx_count;
|
||||||
wmsum_t zil_itx_indirect_count;
|
wmsum_t zil_itx_indirect_count;
|
||||||
wmsum_t zil_itx_indirect_bytes;
|
wmsum_t zil_itx_indirect_bytes;
|
||||||
|
|
|
@ -99,6 +99,9 @@ static uint_t zfs_commit_timeout_pct = 10;
|
||||||
static zil_kstat_values_t zil_stats = {
|
static zil_kstat_values_t zil_stats = {
|
||||||
{ "zil_commit_count", KSTAT_DATA_UINT64 },
|
{ "zil_commit_count", KSTAT_DATA_UINT64 },
|
||||||
{ "zil_commit_writer_count", KSTAT_DATA_UINT64 },
|
{ "zil_commit_writer_count", KSTAT_DATA_UINT64 },
|
||||||
|
{ "zil_commit_error_count", KSTAT_DATA_UINT64 },
|
||||||
|
{ "zil_commit_stall_count", KSTAT_DATA_UINT64 },
|
||||||
|
{ "zil_commit_suspend_count", KSTAT_DATA_UINT64 },
|
||||||
{ "zil_itx_count", KSTAT_DATA_UINT64 },
|
{ "zil_itx_count", KSTAT_DATA_UINT64 },
|
||||||
{ "zil_itx_indirect_count", KSTAT_DATA_UINT64 },
|
{ "zil_itx_indirect_count", KSTAT_DATA_UINT64 },
|
||||||
{ "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 },
|
{ "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 },
|
||||||
|
@ -360,6 +363,9 @@ zil_sums_init(zil_sums_t *zs)
|
||||||
{
|
{
|
||||||
wmsum_init(&zs->zil_commit_count, 0);
|
wmsum_init(&zs->zil_commit_count, 0);
|
||||||
wmsum_init(&zs->zil_commit_writer_count, 0);
|
wmsum_init(&zs->zil_commit_writer_count, 0);
|
||||||
|
wmsum_init(&zs->zil_commit_error_count, 0);
|
||||||
|
wmsum_init(&zs->zil_commit_stall_count, 0);
|
||||||
|
wmsum_init(&zs->zil_commit_suspend_count, 0);
|
||||||
wmsum_init(&zs->zil_itx_count, 0);
|
wmsum_init(&zs->zil_itx_count, 0);
|
||||||
wmsum_init(&zs->zil_itx_indirect_count, 0);
|
wmsum_init(&zs->zil_itx_indirect_count, 0);
|
||||||
wmsum_init(&zs->zil_itx_indirect_bytes, 0);
|
wmsum_init(&zs->zil_itx_indirect_bytes, 0);
|
||||||
|
@ -382,6 +388,9 @@ zil_sums_fini(zil_sums_t *zs)
|
||||||
{
|
{
|
||||||
wmsum_fini(&zs->zil_commit_count);
|
wmsum_fini(&zs->zil_commit_count);
|
||||||
wmsum_fini(&zs->zil_commit_writer_count);
|
wmsum_fini(&zs->zil_commit_writer_count);
|
||||||
|
wmsum_fini(&zs->zil_commit_error_count);
|
||||||
|
wmsum_fini(&zs->zil_commit_stall_count);
|
||||||
|
wmsum_fini(&zs->zil_commit_suspend_count);
|
||||||
wmsum_fini(&zs->zil_itx_count);
|
wmsum_fini(&zs->zil_itx_count);
|
||||||
wmsum_fini(&zs->zil_itx_indirect_count);
|
wmsum_fini(&zs->zil_itx_indirect_count);
|
||||||
wmsum_fini(&zs->zil_itx_indirect_bytes);
|
wmsum_fini(&zs->zil_itx_indirect_bytes);
|
||||||
|
@ -406,6 +415,12 @@ zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums)
|
||||||
wmsum_value(&zil_sums->zil_commit_count);
|
wmsum_value(&zil_sums->zil_commit_count);
|
||||||
zs->zil_commit_writer_count.value.ui64 =
|
zs->zil_commit_writer_count.value.ui64 =
|
||||||
wmsum_value(&zil_sums->zil_commit_writer_count);
|
wmsum_value(&zil_sums->zil_commit_writer_count);
|
||||||
|
zs->zil_commit_error_count.value.ui64 =
|
||||||
|
wmsum_value(&zil_sums->zil_commit_error_count);
|
||||||
|
zs->zil_commit_stall_count.value.ui64 =
|
||||||
|
wmsum_value(&zil_sums->zil_commit_stall_count);
|
||||||
|
zs->zil_commit_suspend_count.value.ui64 =
|
||||||
|
wmsum_value(&zil_sums->zil_commit_suspend_count);
|
||||||
zs->zil_itx_count.value.ui64 =
|
zs->zil_itx_count.value.ui64 =
|
||||||
wmsum_value(&zil_sums->zil_itx_count);
|
wmsum_value(&zil_sums->zil_itx_count);
|
||||||
zs->zil_itx_indirect_count.value.ui64 =
|
zs->zil_itx_indirect_count.value.ui64 =
|
||||||
|
@ -2823,6 +2838,7 @@ zil_commit_writer_stall(zilog_t *zilog)
|
||||||
* (which is achieved via the txg_wait_synced() call).
|
* (which is achieved via the txg_wait_synced() call).
|
||||||
*/
|
*/
|
||||||
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
|
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
|
||||||
|
ZIL_STAT_BUMP(zilog, zil_commit_stall_count);
|
||||||
txg_wait_synced(zilog->zl_dmu_pool, 0);
|
txg_wait_synced(zilog->zl_dmu_pool, 0);
|
||||||
ASSERT(list_is_empty(&zilog->zl_lwb_list));
|
ASSERT(list_is_empty(&zilog->zl_lwb_list));
|
||||||
}
|
}
|
||||||
|
@ -3592,6 +3608,7 @@ zil_commit(zilog_t *zilog, uint64_t foid)
|
||||||
* semantics, and avoid calling those functions altogether.
|
* semantics, and avoid calling those functions altogether.
|
||||||
*/
|
*/
|
||||||
if (zilog->zl_suspend > 0) {
|
if (zilog->zl_suspend > 0) {
|
||||||
|
ZIL_STAT_BUMP(zilog, zil_commit_suspend_count);
|
||||||
txg_wait_synced(zilog->zl_dmu_pool, 0);
|
txg_wait_synced(zilog->zl_dmu_pool, 0);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -3645,10 +3662,12 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid)
|
||||||
* implications, but the expectation is for this to be
|
* implications, but the expectation is for this to be
|
||||||
* an exceptional case, and shouldn't occur often.
|
* an exceptional case, and shouldn't occur often.
|
||||||
*/
|
*/
|
||||||
|
ZIL_STAT_BUMP(zilog, zil_commit_error_count);
|
||||||
DTRACE_PROBE2(zil__commit__io__error,
|
DTRACE_PROBE2(zil__commit__io__error,
|
||||||
zilog_t *, zilog, zil_commit_waiter_t *, zcw);
|
zilog_t *, zilog, zil_commit_waiter_t *, zcw);
|
||||||
txg_wait_synced(zilog->zl_dmu_pool, 0);
|
txg_wait_synced(zilog->zl_dmu_pool, 0);
|
||||||
} else if (wtxg != 0) {
|
} else if (wtxg != 0) {
|
||||||
|
ZIL_STAT_BUMP(zilog, zil_commit_suspend_count);
|
||||||
txg_wait_synced(zilog->zl_dmu_pool, wtxg);
|
txg_wait_synced(zilog->zl_dmu_pool, wtxg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue