Refactor Log Size Limit
The original Log Size Limit implementation blocked all writes once the limit was reached, until the TXG was committed and the log was freed. This caused huge delays followed by speed spikes in application writes. This implementation instead throttles writes smoothly, using exactly the same mechanism as is used for dirty data. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: jxdking <lostking2008@hotmail.com> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored-By: iXsystems, Inc. Issue #12284 Closes #13476
This commit is contained in:
parent
91e02156dd
commit
33223cbc3c
|
@ -124,8 +124,8 @@ typedef struct dmu_tx_stats {
|
||||||
kstat_named_t dmu_tx_dirty_throttle;
|
kstat_named_t dmu_tx_dirty_throttle;
|
||||||
kstat_named_t dmu_tx_dirty_delay;
|
kstat_named_t dmu_tx_dirty_delay;
|
||||||
kstat_named_t dmu_tx_dirty_over_max;
|
kstat_named_t dmu_tx_dirty_over_max;
|
||||||
kstat_named_t dmu_tx_wrlog_over_max;
|
|
||||||
kstat_named_t dmu_tx_dirty_frees_delay;
|
kstat_named_t dmu_tx_dirty_frees_delay;
|
||||||
|
kstat_named_t dmu_tx_wrlog_delay;
|
||||||
kstat_named_t dmu_tx_quota;
|
kstat_named_t dmu_tx_quota;
|
||||||
} dmu_tx_stats_t;
|
} dmu_tx_stats_t;
|
||||||
|
|
||||||
|
|
|
@ -164,7 +164,7 @@ uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp,
|
||||||
zfs_space_check_t slop_policy);
|
zfs_space_check_t slop_policy);
|
||||||
uint64_t dsl_pool_deferred_space(dsl_pool_t *dp);
|
uint64_t dsl_pool_deferred_space(dsl_pool_t *dp);
|
||||||
void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg);
|
void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg);
|
||||||
boolean_t dsl_pool_wrlog_over_max(dsl_pool_t *dp);
|
boolean_t dsl_pool_need_wrlog_delay(dsl_pool_t *dp);
|
||||||
void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
|
void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
|
||||||
void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
|
void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
|
||||||
void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
|
void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
|
||||||
|
|
|
@ -1101,9 +1101,9 @@ This should be less than
|
||||||
.
|
.
|
||||||
.It Sy zfs_wrlog_data_max Ns = Pq int
|
.It Sy zfs_wrlog_data_max Ns = Pq int
|
||||||
The upper limit of write-transaction zil log data size in bytes.
|
The upper limit of write-transaction zil log data size in bytes.
|
||||||
Once it is reached, write operation is blocked, until log data is cleared out
|
Write operations are throttled when approaching the limit until log data is
|
||||||
after transaction group sync. Because of some overhead, it should be set
|
cleared out after transaction group sync.
|
||||||
at least 2 times the size of
|
Because of some overhead, it should be set at least 2 times the size of
|
||||||
.Sy zfs_dirty_data_max
|
.Sy zfs_dirty_data_max
|
||||||
.No to prevent harming normal write throughput.
|
.No to prevent harming normal write throughput.
|
||||||
It also should be smaller than the size of the slog device if slog is present.
|
It also should be smaller than the size of the slog device if slog is present.
|
||||||
|
|
|
@ -53,8 +53,8 @@ dmu_tx_stats_t dmu_tx_stats = {
|
||||||
{ "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 },
|
{ "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 },
|
||||||
{ "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 },
|
{ "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 },
|
||||||
{ "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 },
|
{ "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 },
|
||||||
{ "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 },
|
|
||||||
{ "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 },
|
{ "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 },
|
||||||
|
{ "dmu_tx_wrlog_delay", KSTAT_DATA_UINT64 },
|
||||||
{ "dmu_tx_quota", KSTAT_DATA_UINT64 },
|
{ "dmu_tx_quota", KSTAT_DATA_UINT64 },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -781,34 +781,49 @@ static void
|
||||||
dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
|
dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
|
||||||
{
|
{
|
||||||
dsl_pool_t *dp = tx->tx_pool;
|
dsl_pool_t *dp = tx->tx_pool;
|
||||||
uint64_t delay_min_bytes =
|
uint64_t delay_min_bytes, wrlog;
|
||||||
|
hrtime_t wakeup, tx_time = 0, now;
|
||||||
|
|
||||||
|
/* Calculate minimum transaction time for the dirty data amount. */
|
||||||
|
delay_min_bytes =
|
||||||
zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
|
zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
|
||||||
hrtime_t wakeup, min_tx_time, now;
|
if (dirty > delay_min_bytes) {
|
||||||
|
|
||||||
if (dirty <= delay_min_bytes)
|
|
||||||
return;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The caller has already waited until we are under the max.
|
* The caller has already waited until we are under the max.
|
||||||
* We make them pass us the amount of dirty data so we don't
|
* We make them pass us the amount of dirty data so we don't
|
||||||
* have to handle the case of it being >= the max, which could
|
* have to handle the case of it being >= the max, which
|
||||||
* cause a divide-by-zero if it's == the max.
|
* could cause a divide-by-zero if it's == the max.
|
||||||
*/
|
*/
|
||||||
ASSERT3U(dirty, <, zfs_dirty_data_max);
|
ASSERT3U(dirty, <, zfs_dirty_data_max);
|
||||||
|
|
||||||
|
tx_time = zfs_delay_scale * (dirty - delay_min_bytes) /
|
||||||
|
(zfs_dirty_data_max - dirty);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Calculate minimum transaction time for the TX_WRITE log size. */
|
||||||
|
wrlog = aggsum_upper_bound(&dp->dp_wrlog_total);
|
||||||
|
delay_min_bytes =
|
||||||
|
zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;
|
||||||
|
if (wrlog >= zfs_wrlog_data_max) {
|
||||||
|
tx_time = zfs_delay_max_ns;
|
||||||
|
} else if (wrlog > delay_min_bytes) {
|
||||||
|
tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) /
|
||||||
|
(zfs_wrlog_data_max - wrlog), tx_time);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tx_time == 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
tx_time = MIN(tx_time, zfs_delay_max_ns);
|
||||||
now = gethrtime();
|
now = gethrtime();
|
||||||
min_tx_time = zfs_delay_scale *
|
if (now > tx->tx_start + tx_time)
|
||||||
(dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
|
|
||||||
min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
|
|
||||||
if (now > tx->tx_start + min_tx_time)
|
|
||||||
return;
|
return;
|
||||||
|
|
||||||
DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
|
DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
|
||||||
uint64_t, min_tx_time);
|
uint64_t, tx_time);
|
||||||
|
|
||||||
mutex_enter(&dp->dp_lock);
|
mutex_enter(&dp->dp_lock);
|
||||||
wakeup = MAX(tx->tx_start + min_tx_time,
|
wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time);
|
||||||
dp->dp_last_wakeup + min_tx_time);
|
|
||||||
dp->dp_last_wakeup = wakeup;
|
dp->dp_last_wakeup = wakeup;
|
||||||
mutex_exit(&dp->dp_lock);
|
mutex_exit(&dp->dp_lock);
|
||||||
|
|
||||||
|
@ -886,8 +901,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!tx->tx_dirty_delayed &&
|
if (!tx->tx_dirty_delayed &&
|
||||||
dsl_pool_wrlog_over_max(tx->tx_pool)) {
|
dsl_pool_need_wrlog_delay(tx->tx_pool)) {
|
||||||
DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max);
|
tx->tx_wait_dirty = B_TRUE;
|
||||||
|
DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay);
|
||||||
return (SET_ERROR(ERESTART));
|
return (SET_ERROR(ERESTART));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -105,9 +105,8 @@ int zfs_dirty_data_max_percent = 10;
|
||||||
int zfs_dirty_data_max_max_percent = 25;
|
int zfs_dirty_data_max_max_percent = 25;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* zfs_wrlog_data_max, the upper limit of TX_WRITE log data.
|
* The upper limit of TX_WRITE log data. Write operations are throttled
|
||||||
* Once it is reached, write operation is blocked,
|
* when approaching the limit until log data is cleared out after txg sync.
|
||||||
* until log data is cleared out after txg sync.
|
|
||||||
* It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY.
|
* It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY.
|
||||||
*/
|
*/
|
||||||
unsigned long zfs_wrlog_data_max = 0;
|
unsigned long zfs_wrlog_data_max = 0;
|
||||||
|
@ -621,15 +620,18 @@ dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg)
|
||||||
|
|
||||||
/* Choose a value slightly bigger than min dirty sync bytes */
|
/* Choose a value slightly bigger than min dirty sync bytes */
|
||||||
uint64_t sync_min =
|
uint64_t sync_min =
|
||||||
zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100;
|
zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200;
|
||||||
if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0)
|
if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0)
|
||||||
txg_kick(dp, txg);
|
txg_kick(dp, txg);
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean_t
|
boolean_t
|
||||||
dsl_pool_wrlog_over_max(dsl_pool_t *dp)
|
dsl_pool_need_wrlog_delay(dsl_pool_t *dp)
|
||||||
{
|
{
|
||||||
return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0);
|
uint64_t delay_min_bytes =
|
||||||
|
zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;
|
||||||
|
|
||||||
|
return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
@ -639,6 +641,9 @@ dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg)
|
||||||
delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
|
delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
|
||||||
aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta);
|
aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta);
|
||||||
aggsum_add(&dp->dp_wrlog_total, delta);
|
aggsum_add(&dp->dp_wrlog_total, delta);
|
||||||
|
/* Compact per-CPU sums after the big change. */
|
||||||
|
(void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
|
||||||
|
(void) aggsum_value(&dp->dp_wrlog_total);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef ZFS_DEBUG
|
#ifdef ZFS_DEBUG
|
||||||
|
|
Loading…
Reference in New Issue