Add Module Parameter Regarding Log Size Limit

zfs_wrlog_data_max
The upper limit of TX_WRITE log data. Once it is reached,
write operations are blocked until the log data is cleared out
after txg sync. It only counts TX_WRITE logs with WR_COPIED
or WR_NEED_COPY.

Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: jxdking <lostking2008@hotmail.com>
Closes #12284
This commit is contained in:
Kevin Jin 2021-07-20 11:40:24 -04:00 committed by Tony Hutter
parent 999830a021
commit d05f3039f7
8 changed files with 106 additions and 2 deletions

View File

@ -124,6 +124,7 @@ typedef struct dmu_tx_stats {
kstat_named_t dmu_tx_dirty_throttle; kstat_named_t dmu_tx_dirty_throttle;
kstat_named_t dmu_tx_dirty_delay; kstat_named_t dmu_tx_dirty_delay;
kstat_named_t dmu_tx_dirty_over_max; kstat_named_t dmu_tx_dirty_over_max;
kstat_named_t dmu_tx_wrlog_over_max;
kstat_named_t dmu_tx_dirty_frees_delay; kstat_named_t dmu_tx_dirty_frees_delay;
kstat_named_t dmu_tx_quota; kstat_named_t dmu_tx_quota;
} dmu_tx_stats_t; } dmu_tx_stats_t;

View File

@ -40,6 +40,7 @@
#include <sys/rrwlock.h> #include <sys/rrwlock.h>
#include <sys/dsl_synctask.h> #include <sys/dsl_synctask.h>
#include <sys/mmp.h> #include <sys/mmp.h>
#include <sys/aggsum.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -58,6 +59,7 @@ struct dsl_deadlist;
extern unsigned long zfs_dirty_data_max; extern unsigned long zfs_dirty_data_max;
extern unsigned long zfs_dirty_data_max_max; extern unsigned long zfs_dirty_data_max_max;
extern unsigned long zfs_wrlog_data_max;
extern int zfs_dirty_data_sync_percent; extern int zfs_dirty_data_sync_percent;
extern int zfs_dirty_data_max_percent; extern int zfs_dirty_data_max_percent;
extern int zfs_dirty_data_max_max_percent; extern int zfs_dirty_data_max_max_percent;
@ -118,6 +120,9 @@ typedef struct dsl_pool {
uint64_t dp_mos_compressed_delta; uint64_t dp_mos_compressed_delta;
uint64_t dp_mos_uncompressed_delta; uint64_t dp_mos_uncompressed_delta;
aggsum_t dp_wrlog_pertxg[TXG_SIZE];
aggsum_t dp_wrlog_total;
/* /*
* Time of most recently scheduled (furthest in the future) * Time of most recently scheduled (furthest in the future)
* wakeup for delayed transactions. * wakeup for delayed transactions.
@ -158,6 +163,8 @@ uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy);
uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp,
zfs_space_check_t slop_policy); zfs_space_check_t slop_policy);
uint64_t dsl_pool_deferred_space(dsl_pool_t *dp); uint64_t dsl_pool_deferred_space(dsl_pool_t *dp);
void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg);
boolean_t dsl_pool_wrlog_over_max(dsl_pool_t *dp);
void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);

View File

@ -1096,6 +1096,18 @@ Start syncing out a transaction group if there's at least this much dirty data
This should be less than This should be less than
.Sy zfs_vdev_async_write_active_min_dirty_percent . .Sy zfs_vdev_async_write_active_min_dirty_percent .
. .
.It Sy zfs_wrlog_data_max Ns = Pq int
The upper limit of write-transaction ZIL log data size in bytes.
Once it is reached, write operations are blocked until the log data is cleared
out after the transaction group sync.
Because of some overhead, it should be set to at least 2 times the size of
.Sy zfs_dirty_data_max
.No to prevent harming normal write throughput.
It should also be smaller than the size of the slog device if a slog is present.
.Pp
Defaults to
.Sy zfs_dirty_data_max*2
.
.It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint .It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint
Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be
preallocated for a file in order to guarantee that later writes will not preallocated for a file in order to guarantee that later writes will not

View File

@ -8062,6 +8062,18 @@ arc_init(void)
zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max = MIN(zfs_dirty_data_max,
zfs_dirty_data_max_max); zfs_dirty_data_max_max);
} }
if (zfs_wrlog_data_max == 0) {
/*
* dp_wrlog_total is reduced for each txg at the end of
* spa_sync(). However, dp_dirty_total is reduced every time
* a block is written out. Thus under normal operation,
* dp_wrlog_total could grow 2 times as big as
* zfs_dirty_data_max.
*/
zfs_wrlog_data_max = zfs_dirty_data_max * 2;
}
} }
void void

View File

@ -53,6 +53,7 @@ dmu_tx_stats_t dmu_tx_stats = {
{ "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 },
{ "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 },
{ "dmu_tx_quota", KSTAT_DATA_UINT64 }, { "dmu_tx_quota", KSTAT_DATA_UINT64 },
}; };
@ -884,6 +885,12 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
return (SET_ERROR(ERESTART)); return (SET_ERROR(ERESTART));
} }
if (!tx->tx_dirty_delayed &&
dsl_pool_wrlog_over_max(tx->tx_pool)) {
DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max);
return (SET_ERROR(ERESTART));
}
if (!tx->tx_dirty_delayed && if (!tx->tx_dirty_delayed &&
dsl_pool_need_dirty_delay(tx->tx_pool)) { dsl_pool_need_dirty_delay(tx->tx_pool)) {
tx->tx_wait_dirty = B_TRUE; tx->tx_wait_dirty = B_TRUE;

View File

@ -104,6 +104,14 @@ unsigned long zfs_dirty_data_max_max = 0;
int zfs_dirty_data_max_percent = 10; int zfs_dirty_data_max_percent = 10;
int zfs_dirty_data_max_max_percent = 25; int zfs_dirty_data_max_max_percent = 25;
/*
 * zfs_wrlog_data_max is the upper limit, in bytes, of TX_WRITE log data.
 * Once it is reached, write operations are blocked until the log data is
 * cleared out after txg sync.
 * Only TX_WRITE logs with WR_COPIED or WR_NEED_COPY are counted.
 * If left at 0, arc_init() sets it to zfs_dirty_data_max * 2.
 */
unsigned long zfs_wrlog_data_max = 0;
/* /*
* If there's at least this much dirty data (as a percentage of * If there's at least this much dirty data (as a percentage of
* zfs_dirty_data_max), push out a txg. This should be less than * zfs_dirty_data_max), push out a txg. This should be less than
@ -220,6 +228,11 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
aggsum_init(&dp->dp_wrlog_total, 0);
for (int i = 0; i < TXG_SIZE; i++) {
aggsum_init(&dp->dp_wrlog_pertxg[i], 0);
}
dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri, dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri,
boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
TASKQ_THREADS_CPU_PCT); TASKQ_THREADS_CPU_PCT);
@ -416,6 +429,14 @@ dsl_pool_close(dsl_pool_t *dp)
rrw_destroy(&dp->dp_config_rwlock); rrw_destroy(&dp->dp_config_rwlock);
mutex_destroy(&dp->dp_lock); mutex_destroy(&dp->dp_lock);
cv_destroy(&dp->dp_spaceavail_cv); cv_destroy(&dp->dp_spaceavail_cv);
ASSERT0(aggsum_value(&dp->dp_wrlog_total));
aggsum_fini(&dp->dp_wrlog_total);
for (int i = 0; i < TXG_SIZE; i++) {
ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i]));
aggsum_fini(&dp->dp_wrlog_pertxg[i]);
}
taskq_destroy(dp->dp_unlinked_drain_taskq); taskq_destroy(dp->dp_unlinked_drain_taskq);
taskq_destroy(dp->dp_zrele_taskq); taskq_destroy(dp->dp_zrele_taskq);
if (dp->dp_blkstats != NULL) if (dp->dp_blkstats != NULL)
@ -590,6 +611,36 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
cv_signal(&dp->dp_spaceavail_cv); cv_signal(&dp->dp_spaceavail_cv);
} }
/*
 * Account for "size" bytes of TX_WRITE log data against transaction group
 * "txg".  The amount is added to both that txg's counter and the pool-wide
 * total.  If the txg's counter then compares greater than
 * (zfs_dirty_data_sync_percent + 10) percent of zfs_dirty_data_max,
 * txg_kick() is called for that txg.
 */
void
dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg)
{
	const uint64_t slot = txg & TXG_MASK;

	ASSERT3S(size, >=, 0);

	aggsum_add(&dp->dp_wrlog_pertxg[slot], size);
	aggsum_add(&dp->dp_wrlog_total, size);

	/* Choose a value slightly bigger than min dirty sync bytes */
	uint64_t kick_threshold =
	    zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100;

	/* Nothing more to do while this txg's log data is within bounds. */
	if (aggsum_compare(&dp->dp_wrlog_pertxg[slot], kick_threshold) <= 0)
		return;

	txg_kick(dp, txg);
}
/*
 * Report whether the pool-wide TX_WRITE log total is over the limit.
 * Returns nonzero when aggsum_compare() ranks dp_wrlog_total above
 * zfs_wrlog_data_max, and zero otherwise.
 */
boolean_t
dsl_pool_wrlog_over_max(dsl_pool_t *dp)
{
	int cmp = aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max);

	return (cmp > 0);
}
/*
 * Drop the TX_WRITE log accounting for "txg": read the current value of
 * that txg's counter, then subtract it from both the per-txg counter and
 * the pool-wide total.
 */
static void
dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg)
{
	const uint64_t slot = txg & TXG_MASK;
	int64_t txg_bytes = (int64_t)aggsum_value(&dp->dp_wrlog_pertxg[slot]);

	aggsum_add(&dp->dp_wrlog_pertxg[slot], -txg_bytes);
	aggsum_add(&dp->dp_wrlog_total, -txg_bytes);
}
#ifdef ZFS_DEBUG #ifdef ZFS_DEBUG
static boolean_t static boolean_t
dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
@ -814,6 +865,9 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
dmu_buf_rele(ds->ds_dbuf, zilog); dmu_buf_rele(ds->ds_dbuf, zilog);
} }
dsl_pool_wrlog_clear(dp, txg);
ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
} }
@ -1409,6 +1463,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW,
"Determines the dirty space limit"); "Determines the dirty space limit");
ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW,
"The size limit of write-transaction zil log data");
/* zfs_dirty_data_max_max only applied at module load in arc_init(). */ /* zfs_dirty_data_max_max only applied at module load in arc_init(). */
ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD,
"zfs_dirty_data_max upper bound in bytes"); "zfs_dirty_data_max upper bound in bytes");

View File

@ -538,6 +538,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
itx_wr_state_t write_state; itx_wr_state_t write_state;
uintptr_t fsync_cnt; uintptr_t fsync_cnt;
uint64_t gen = 0; uint64_t gen = 0;
ssize_t size = resid;
if (zil_replaying(zilog, tx) || zp->z_unlinked || if (zil_replaying(zilog, tx) || zp->z_unlinked ||
zfs_xattr_owner_unlinked(zp)) { zfs_xattr_owner_unlinked(zp)) {
@ -623,6 +624,10 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
off += len; off += len;
resid -= len; resid -= len;
} }
if (write_state == WR_COPIED || write_state == WR_NEED_COPY) {
dsl_pool_wrlog_count(zilog->zl_dmu_pool, size, tx->tx_txg);
}
} }
/* /*

View File

@ -84,10 +84,8 @@
#include <sys/zfs_rlock.h> #include <sys/zfs_rlock.h>
#include <sys/spa_impl.h> #include <sys/spa_impl.h>
#include <sys/zvol.h> #include <sys/zvol.h>
#include <sys/zvol_impl.h> #include <sys/zvol_impl.h>
unsigned int zvol_inhibit_dev = 0; unsigned int zvol_inhibit_dev = 0;
unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
@ -577,6 +575,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
uint32_t blocksize = zv->zv_volblocksize; uint32_t blocksize = zv->zv_volblocksize;
zilog_t *zilog = zv->zv_zilog; zilog_t *zilog = zv->zv_zilog;
itx_wr_state_t write_state; itx_wr_state_t write_state;
uint64_t sz = size;
if (zil_replaying(zilog, tx)) if (zil_replaying(zilog, tx))
return; return;
@ -628,6 +627,10 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
offset += len; offset += len;
size -= len; size -= len;
} }
if (write_state == WR_COPIED || write_state == WR_NEED_COPY) {
dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg);
}
} }
/* /*