OpenZFS 9617 - too-frequent TXG sync causes excessive write inflation

Porting notes:
* Renamed zfs_dirty_data_sync_pct to zfs_dirty_data_sync_percent and
  changed the type to be consistent with the other dirty module params.
* Updated zfs-module-parameters.5 accordingly.

Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Andrew Stormont <andyjstormont@gmail.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Approved by: Robert Mustacchi <rm@joyent.com>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>

OpenZFS-issue: https://illumos.org/issues/9617
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/7928f4ba
Closes #7976
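
The substance of the change, as the diff below shows: the fixed threshold (the old zfs_dirty_data_sync default of 64 * 1024 * 1024 bytes) becomes a percentage of zfs_dirty_data_max, defaulting to 20%. A minimal stand-alone sketch of the new arithmetic, with hypothetical stand-in values for the two module parameters:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the real module parameters. */
static uint64_t zfs_dirty_data_max = UINT64_C(4) << 30;  /* 4 GiB */
static int zfs_dirty_data_sync_percent = 20;             /* new default */

int
main(void)
{
        /* Same arithmetic as both call sites in the diff below. */
        uint64_t dirty_min_bytes =
            zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;

        /* ~819 MiB here, instead of a fixed 64 MiB regardless of scale. */
        printf("sync txg once dirty data exceeds %" PRIu64 " bytes\n",
            dirty_min_bytes);
        return (0);
}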

--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h

@@ -57,7 +57,7 @@ struct dsl_crypto_params;
 
 extern unsigned long zfs_dirty_data_max;
 extern unsigned long zfs_dirty_data_max_max;
-extern unsigned long zfs_dirty_data_sync;
+extern int zfs_dirty_data_sync_percent;
 extern int zfs_dirty_data_max_percent;
 extern int zfs_dirty_data_max_max_percent;
 extern int zfs_delay_min_dirty_percent;

--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5

@@ -1225,12 +1225,14 @@ Default value: \fB10\fR%, subject to \fBzfs_dirty_data_max_max\fR.
 .sp
 .ne 2
 .na
-\fBzfs_dirty_data_sync\fR (int)
+\fBzfs_dirty_data_sync_percent\fR (int)
 .ad
 .RS 12n
-Start syncing out a transaction group if there is at least this much dirty data.
+Start syncing out a transaction group if there's at least this much dirty data
+as a percentage of \fBzfs_dirty_data_max\fR. This should be less than
+\fBzfs_vdev_async_write_active_min_dirty_percent\fR.
 .sp
-Default value: \fB67,108,864\fR.
+Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR.
 .RE
 .sp
 .ne 2
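
The ordering the man page asks for can be checked mechanically. A toy sketch, assuming the documented defaults (20 for the new parameter; 30 for zfs_vdev_async_write_active_min_dirty_percent, its default elsewhere in the same man page):

#include <assert.h>
#include <stdio.h>

int
main(void)
{
        int zfs_dirty_data_sync_percent = 20;                   /* default */
        int zfs_vdev_async_write_active_min_dirty_percent = 30; /* default */

        /*
         * Syncing should begin before the vdev queue starts ramping
         * up async write concurrency, hence "less than".
         */
        assert(zfs_dirty_data_sync_percent <
            zfs_vdev_async_write_active_min_dirty_percent);
        printf("documented defaults satisfy the constraint\n");
        return (0);
}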

--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c

@@ -106,9 +106,11 @@ int zfs_dirty_data_max_percent = 10;
 int zfs_dirty_data_max_max_percent = 25;
 
 /*
- * If there is at least this much dirty data, push out a txg.
+ * If there's at least this much dirty data (as a percentage of
+ * zfs_dirty_data_max), push out a txg. This should be less than
+ * zfs_vdev_async_write_active_min_dirty_percent.
  */
-unsigned long zfs_dirty_data_sync = 64 * 1024 * 1024;
+int zfs_dirty_data_sync_percent = 20;
 
 /*
  * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
@@ -879,10 +881,12 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 {
        uint64_t delay_min_bytes =
            zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+       uint64_t dirty_min_bytes =
+           zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
        boolean_t rv;
 
        mutex_enter(&dp->dp_lock);
-       if (dp->dp_dirty_total > zfs_dirty_data_sync)
+       if (dp->dp_dirty_total > dirty_min_bytes)
                txg_kick(dp);
        rv = (dp->dp_dirty_total > delay_min_bytes);
        mutex_exit(&dp->dp_lock);
@@ -1345,7 +1349,7 @@ module_param(zfs_dirty_data_max_max, ulong, 0444);
 MODULE_PARM_DESC(zfs_dirty_data_max_max,
        "zfs_dirty_data_max upper bound in bytes");
 
-module_param(zfs_dirty_data_sync, ulong, 0644);
+module_param(zfs_dirty_data_sync_percent, int, 0644);
 MODULE_PARM_DESC(zfs_dirty_data_sync, "sync txg when this much dirty data");
 
 module_param(zfs_delay_scale, ulong, 0644);
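
As a reading aid, here is the post-change logic of dsl_pool_need_dirty_delay() reduced to a stand-alone sketch; the pool state and locking are replaced by a plain argument, and the tunable values are hypothetical stand-ins at their documented defaults. Note that both thresholds are recomputed on every call, so runtime changes to the tunables take effect immediately here:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t zfs_dirty_data_max = UINT64_C(4) << 30; /* stand-in: 4 GiB */
static int zfs_delay_min_dirty_percent = 60;            /* default */
static int zfs_dirty_data_sync_percent = 20;            /* new default */

/* dirty_total stands in for dp->dp_dirty_total read under dp_lock. */
static bool
need_dirty_delay(uint64_t dirty_total, bool *kick_txg)
{
        uint64_t delay_min_bytes =
            zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
        uint64_t dirty_min_bytes =
            zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;

        /* Past the sync threshold: kick off a txg sync. */
        *kick_txg = (dirty_total > dirty_min_bytes);
        /* Past the delay threshold: the write throttle engages. */
        return (dirty_total > delay_min_bytes);
}

int
main(void)
{
        bool kick;
        /* 1 GiB dirty: above the 20% sync threshold, below 60% delay. */
        bool delay = need_dirty_delay(UINT64_C(1) << 30, &kick);
        printf("kick txg: %d, delay writers: %d\n", kick, delay);
        return (0);
}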

--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c

@@ -517,7 +517,8 @@ txg_sync_thread(void *arg)
        clock_t timeout = zfs_txg_timeout * hz;
        clock_t timer;
        uint64_t txg;
-       txg_stat_t *ts;
+       uint64_t dirty_min_bytes =
+           zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
 
        /*
         * We sync when we're scanning, there's someone waiting
@@ -529,7 +530,7 @@ txg_sync_thread(void *arg)
                    !tx->tx_exiting && timer > 0 &&
                    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
                    !txg_has_quiesced_to_sync(dp) &&
-                   dp->dp_dirty_total < zfs_dirty_data_sync) {
+                   dp->dp_dirty_total < dirty_min_bytes) {
                        dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
                            tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
                        txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
@@ -561,7 +562,7 @@ txg_sync_thread(void *arg)
                tx->tx_quiesced_txg = 0;
                tx->tx_syncing_txg = txg;
                DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
-               ts = spa_txg_history_init_io(spa, txg, dp);
+               txg_stat_t *ts = spa_txg_history_init_io(spa, txg, dp);
                cv_broadcast(&tx->tx_quiesce_more_cv);
 
                dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",