This commit is contained in:
Allan Jude 2024-09-12 11:59:14 +00:00 committed by GitHub
commit d1474915ce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 187 additions and 15 deletions

View File

@ -16,7 +16,7 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
.Dd June 27, 2024
.Dd August 28, 2024
.Dt ZFS 4
.Os
.
@ -73,6 +73,17 @@ When set to
.Sy 0
the array is dynamically sized based on total system memory.
.
.It Sy dbuf_evict_parallel Ns = Ns Sy 0 Pq uint
When set to 1, ZFS will use up to
.Sy dbuf_evict_threads
threads to evict dbuf data in parallel, improving the responsiveness
of ZFS to memory pressure.
.
.It Sy dbuf_evict_threads Ns = Ns Sy 0 Pq uint
Sets the maximum number of dbuf eviction threads to be used.
When set to 0, ZFS uses one-eighth of the available CPUs,
with a minimum of 2 and a maximum of 16.
.
.It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq uint
dnode slots allocated in a single operation as a power of 2.
The default value minimizes lock contention for the bulk operation performed.

View File

@ -183,6 +183,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
*/
static kmem_cache_t *dbuf_kmem_cache;
static taskq_t *dbu_evict_taskq;
static taskq_t *dbuf_evict_taskq;
static kthread_t *dbuf_cache_evict_thread;
static kmutex_t dbuf_evict_lock;
@ -237,6 +238,20 @@ static uint_t dbuf_metadata_cache_shift = 6;
/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
static uint_t dbuf_mutex_cache_shift = 0;
/*
* Number of dbuf_evict threads
*/
static uint_t dbuf_evict_threads = 0;
/*
* The minimum number of bytes we can evict at once is a block size.
* So, SPA_MAXBLOCKSIZE is a reasonable minimal value per an eviction task.
* We use this value to compute a scaling factor for the eviction tasks.
*/
#define DBUF_MIN_EVICT_PERTASK_SHIFT (SPA_MAXBLOCKSHIFT)
static uint_t dbuf_evict_parallel = 0;
static unsigned long dbuf_cache_target_bytes(void);
static unsigned long dbuf_metadata_cache_target_bytes(void);
@ -762,26 +777,47 @@ dbuf_cache_above_lowater(void)
}
/*
* Evict the oldest eligible dbuf from the dbuf cache.
* Evict the oldest eligible dbufs from the dbuf cache.
* Use the multilist sublist (mls) with the provided index #idx.
*/
static void
dbuf_evict_one(void)
dbuf_evict_many(uint64_t bytes, unsigned int idx)
{
int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
int64_t evicted = 0;
dmu_buf_impl_t *marker = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
marker->db_objset = NULL;
ASSERT3U(idx, <, multilist_get_num_sublists(
&dbuf_caches[DB_DBUF_CACHE].cache));
multilist_sublist_t *mls = multilist_sublist_lock_idx(
&dbuf_caches[DB_DBUF_CACHE].cache, idx);
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
dmu_buf_impl_t *db = multilist_sublist_tail(mls);
while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
db = multilist_sublist_prev(mls, db);
}
multilist_sublist_insert_after(mls, db, marker);
DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
multilist_sublist_t *, mls);
while (db != NULL && evicted < bytes) {
int skip = 0;
while (db != NULL && (db->db_objset == NULL ||
mutex_tryenter(&db->db_mtx) == 0)) {
db = multilist_sublist_prev(mls, db);
if (skip == 0)
skip = 1;
}
if (db == NULL)
break;
if (skip) {
multilist_sublist_remove(mls, marker);
multilist_sublist_insert_before(mls, db, marker);
}
DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
multilist_sublist_t *, mls);
if (db != NULL) {
multilist_sublist_remove(mls, db);
multilist_sublist_unlock(mls);
uint64_t size = db->db.db_size;
@ -797,9 +833,121 @@ dbuf_evict_one(void)
db->db_caching_status = DB_NO_CACHE;
dbuf_destroy(db);
DBUF_STAT_BUMP(cache_total_evicts);
} else {
multilist_sublist_unlock(mls);
evicted += size + usize;
mls = multilist_sublist_lock_idx(
&dbuf_caches[DB_DBUF_CACHE].cache, idx);
db = multilist_sublist_prev(mls, marker);
}
multilist_sublist_remove(mls, marker);
multilist_sublist_unlock(mls);
kmem_cache_free(dbuf_kmem_cache, marker);
}
typedef struct evict_arg {
taskq_ent_t tqe;
unsigned idx;
uint64_t bytes;
} evict_arg_t;
static void
dbuf_evict_task(void *arg)
{
evict_arg_t *eva = arg;
dbuf_evict_many(eva->bytes, eva->idx);
}
static void
dbuf_evict(void)
{
int64_t bytes = (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) -
dbuf_cache_lowater_bytes());
if (bytes <= 0)
return;
unsigned idx = multilist_get_random_index(
&dbuf_caches[DB_DBUF_CACHE].cache);
if (!dbuf_evict_parallel)
return (dbuf_evict_many(bytes, idx));
/*
* Go to the parallel eviction.
*/
unsigned int num_sublists = multilist_get_num_sublists(
&dbuf_caches[DB_DBUF_CACHE].cache);
evict_arg_t *evarg = kmem_zalloc(sizeof (*evarg) * num_sublists,
KM_SLEEP);
/*
* How we scale
*
* Example 1, # of chunks less than # of tasks.
* We have:
* - 4 tasks
* - 3 chunks
* - 3 full col
* - 0 low cols.
*
* The first low col index is 3.
* The tasks #0-#2 evict 1 chunk each.
*
* 0 | 1 | 2 | 3 |
* +===+===+===+===+
* | x | x | x | |
* +---+---+---+---+
*
* Example 2, # of chunks more than # of tasks.
* We have:
* - 4 tasks
* - 9 chunks
* - 1 full col
* - 3 low cols
*
* The first low col index is 1.
* The task #0 evicts 3 chunks, the others evict 2 chunks each.
*
* 0 | 1 | 2 | 3 |
* +===+===+===+===+
* | x | x | x | x |
* +---+---+---+---+
* | x | x | x | x |
* +---+---+---+---+
* | x | | | |
* +---+---+---+---+
*/
/*
* Compute number of tasks to run (n), first low col index (k),
* normal and low bytes per task.
*/
uint64_t nchunks = ((bytes - 1) >> DBUF_MIN_EVICT_PERTASK_SHIFT) + 1;
unsigned n = nchunks < num_sublists ? nchunks : num_sublists;
uint64_t fullrows = nchunks / n;
unsigned lastrowcols = nchunks % n;
unsigned k = (lastrowcols ? lastrowcols : n);
uint64_t bytes_pertask_low = fullrows << DBUF_MIN_EVICT_PERTASK_SHIFT;
uint64_t bytes_pertask = bytes_pertask_low + (lastrowcols ?
(1 << DBUF_MIN_EVICT_PERTASK_SHIFT) : 0);
for (unsigned i = 0; i < n; i++) {
uint64_t evict = i < k ? bytes_pertask : bytes_pertask_low;
evarg[i].idx = idx;
evarg[i].bytes = evict;
taskq_dispatch_ent(dbuf_evict_taskq, dbuf_evict_task,
&evarg[i], 0, &evarg[i].tqe);
/* wrap idx */
if (++idx >= num_sublists)
idx = 0;
}
taskq_wait(dbuf_evict_taskq);
kmem_free(evarg, sizeof (*evarg) * num_sublists);
}
/*
@ -833,7 +981,7 @@ dbuf_evict_thread(void *unused)
* minimize lock contention.
*/
while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
dbuf_evict_one();
dbuf_evict();
}
mutex_enter(&dbuf_evict_lock);
@ -860,7 +1008,7 @@ dbuf_evict_notify(uint64_t size)
*/
if (size > dbuf_cache_target_bytes()) {
if (size > dbuf_cache_hiwater_bytes())
dbuf_evict_one();
dbuf_evict();
cv_signal(&dbuf_evict_cv);
}
}
@ -965,11 +1113,16 @@ dbuf_init(void)
dbuf_stats_init(h);
if (dbuf_evict_threads == 0)
dbuf_evict_threads = MAX(2, MIN(16, max_ncpus >> 3));
/*
* All entries are queued via taskq_dispatch_ent(), so min/maxalloc
* configuration is not required.
*/
dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
dbuf_evict_taskq = taskq_create("dbuf_evict",
MIN(dbuf_evict_threads, max_ncpus), defclsyspri,
MIN(dbuf_evict_threads, max_ncpus), max_ncpus, TASKQ_PREPOPULATE);
for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
multilist_create(&dbuf_caches[dcs].cache,
@ -1035,6 +1188,8 @@ dbuf_fini(void)
kmem_cache_destroy(dbuf_kmem_cache);
taskq_destroy(dbu_evict_taskq);
taskq_wait(dbuf_evict_taskq);
taskq_destroy(dbuf_evict_taskq);
mutex_enter(&dbuf_evict_lock);
dbuf_evict_thread_exit = B_TRUE;
@ -3963,7 +4118,7 @@ dmu_buf_rele(dmu_buf_t *db, const void *tag)
* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
* ^ |
* | |
* +-----dbuf_destroy()<--dbuf_evict_one()<--------+
* +-----dbuf_destroy()<--dbuf_evict()<------------+
*
*/
void
@ -5282,3 +5437,9 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
"Set size of dbuf cache mutex array as log2 shift.");
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, evict_parallel, UINT, ZMOD_RW,
"Evict from the dbuf cache in parallel using a taskq");
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, evict_threads, UINT, ZMOD_RW,
"Maximum number of dbuf_evict threads");