From 505df8d133788c28a2d6fcdf504dd1ffa55fa43f Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 19 Sep 2022 12:17:11 -0700 Subject: [PATCH] Dynamically size dbuf hash mutex array Incorrectly sizing the array of hash locks used to protect the dbuf hash table can lead to contention and reduce performance. We could unconditionally allocate a larger array for the locks but it's wasteful, particularly for a low-memory system. Instead, dynamically allocate the array of locks and scale it based on total system memory. Additionally, add a new `dbuf_mutex_cache_shift` module option which can be used to override the hash lock array size. This is disabled by default (dbuf_mutex_cache_shift=0) and can only be set at module load time. The minimum target array size is set to 8192; this matches the current constant value. Note that the count of the dbuf hash table and count of the mutex array were added to the /proc/spl/kstat/zfs/dbufstats kstat. Finally, this change removes the _KERNEL conditional checks. These were not required since for the user space build there is no difference between the kmem and vmem interfaces. 
Reviewed-by: Ryan Moeller Reviewed-by: Richard Yao Reviewed-by: Tony Hutter Reviewed-by: Alexander Motin Signed-off-by: Brian Behlendorf Closes #13928 --- include/sys/dbuf.h | 9 ++--- man/man4/zfs.4 | 6 ++++ module/zfs/dbuf.c | 83 +++++++++++++++++++++++++++++----------------- 3 files changed, 64 insertions(+), 34 deletions(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 7886f728bb..06489ea84b 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -321,13 +321,14 @@ typedef struct dmu_buf_impl { uint8_t db_dirtycnt; } dmu_buf_impl_t; -/* Note: the dbuf hash table is exposed only for the mdb module */ -#define DBUF_MUTEXES 2048 -#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) +#define DBUF_HASH_MUTEX(h, idx) \ + (&(h)->hash_mutexes[(idx) & ((h)->hash_mutex_mask)]) + typedef struct dbuf_hash_table { uint64_t hash_table_mask; + uint64_t hash_mutex_mask; dmu_buf_impl_t **hash_table; - kmutex_t hash_mutexes[DBUF_MUTEXES] ____cacheline_aligned; + kmutex_t *hash_mutexes; } dbuf_hash_table_t; typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 90a8ca788c..805c037e3d 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -66,6 +66,12 @@ Set the size of the dbuf metadata cache .Pq Sy dbuf_metadata_cache_max_bytes to a log2 fraction of the target ARC size. . +.It Sy dbuf_mutex_cache_shift Ns = Ns Sy 0 Pq uint +Set the size of the mutex array for the dbuf cache. +When set to +.Sy 0 +the array is dynamically sized based on total system memory. +. .It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq int dnode slots allocated in a single operation as a power of 2. The default value minimizes lock contention for the bulk operation performed. 
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 9e36afd310..85ba4e3221 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -99,6 +99,11 @@ typedef struct dbuf_stats { * already created and in the dbuf hash table. */ kstat_named_t hash_insert_race; + /* + * Number of entries in the hash table dbuf and mutex arrays. + */ + kstat_named_t hash_table_count; + kstat_named_t hash_mutex_count; /* * Statistics about the size of the metadata dbuf cache. */ @@ -131,6 +136,8 @@ dbuf_stats_t dbuf_stats = { { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "hash_insert_race", KSTAT_DATA_UINT64 }, + { "hash_table_count", KSTAT_DATA_UINT64 }, + { "hash_mutex_count", KSTAT_DATA_UINT64 }, { "metadata_cache_count", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 }, @@ -227,6 +234,9 @@ static unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX; static int dbuf_cache_shift = 5; static int dbuf_metadata_cache_shift = 6; +/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */ +static uint32_t dbuf_mutex_cache_shift = 0; + static unsigned long dbuf_cache_target_bytes(void); static unsigned long dbuf_metadata_cache_target_bytes(void); @@ -838,6 +848,7 @@ static int dbuf_kstat_update(kstat_t *ksp, int rw) { dbuf_stats_t *ds = ksp->ks_data; + dbuf_hash_table_t *h = &dbuf_hash_table; if (rw == KSTAT_WRITE) return (SET_ERROR(EACCES)); @@ -867,6 +878,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw) wmsum_value(&dbuf_sums.hash_chains); ds->hash_insert_race.value.ui64 = wmsum_value(&dbuf_sums.hash_insert_race); + ds->hash_table_count.value.ui64 = h->hash_table_mask + 1; + ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1; ds->metadata_cache_count.value.ui64 = wmsum_value(&dbuf_sums.metadata_cache_count); ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count( @@ -879,9 +892,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw) void dbuf_init(void) { - 
uint64_t hsize = 1ULL << 16; + uint64_t hmsize, hsize = 1ULL << 16; dbuf_hash_table_t *h = &dbuf_hash_table; - int i; /* * The hash table is big enough to fill one eighth of physical memory @@ -892,29 +904,42 @@ dbuf_init(void) while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8) hsize <<= 1; -retry: - h->hash_table_mask = hsize - 1; -#if defined(_KERNEL) + h->hash_table = NULL; + while (h->hash_table == NULL) { + h->hash_table_mask = hsize - 1; + + h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP); + if (h->hash_table == NULL) + hsize >>= 1; + + ASSERT3U(hsize, >=, 1ULL << 10); + } + /* - * Large allocations which do not require contiguous pages - * should be using vmem_alloc() in the linux kernel + * The hash table buckets are protected by an array of mutexes where + * each mutex is responsible for protecting 128 buckets. A minimum + * array size of 8192 is targeted to avoid contention. */ - h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP); -#else - h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); -#endif - if (h->hash_table == NULL) { - /* XXX - we should really return an error instead of assert */ - ASSERT(hsize > (1ULL << 10)); - hsize >>= 1; - goto retry; + if (dbuf_mutex_cache_shift == 0) + hmsize = MAX(hsize >> 7, 1ULL << 13); + else + hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24); + + h->hash_mutexes = NULL; + while (h->hash_mutexes == NULL) { + h->hash_mutex_mask = hmsize - 1; + + h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t), + KM_SLEEP); + if (h->hash_mutexes == NULL) + hmsize >>= 1; } dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); - for (i = 0; i < DBUF_MUTEXES; i++) + for (int i = 0; i < hmsize; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); dbuf_stats_init(h); @@ -941,7 +966,7 @@ retry: wmsum_init(&dbuf_sums.cache_count, 0); wmsum_init(&dbuf_sums.cache_total_evicts, 0); - for (i = 0; i < 
DN_MAX_LEVELS; i++) { + for (int i = 0; i < DN_MAX_LEVELS; i++) { wmsum_init(&dbuf_sums.cache_levels[i], 0); wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0); } @@ -957,7 +982,7 @@ retry: KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dbuf_ksp != NULL) { - for (i = 0; i < DN_MAX_LEVELS; i++) { + for (int i = 0; i < DN_MAX_LEVELS; i++) { snprintf(dbuf_stats.cache_levels[i].name, KSTAT_STRLEN, "cache_level_%d", i); dbuf_stats.cache_levels[i].data_type = @@ -977,21 +1002,16 @@ void dbuf_fini(void) { dbuf_hash_table_t *h = &dbuf_hash_table; - int i; dbuf_stats_destroy(); - for (i = 0; i < DBUF_MUTEXES; i++) + for (int i = 0; i < (h->hash_mutex_mask + 1); i++) mutex_destroy(&h->hash_mutexes[i]); -#if defined(_KERNEL) - /* - * Large allocations which do not require contiguous pages - * should be using vmem_free() in the linux kernel - */ + vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); -#else - kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); -#endif + vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) * + sizeof (kmutex_t)); + kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); @@ -1018,7 +1038,7 @@ dbuf_fini(void) wmsum_fini(&dbuf_sums.cache_count); wmsum_fini(&dbuf_sums.cache_total_evicts); - for (i = 0; i < DN_MAX_LEVELS; i++) { + for (int i = 0; i < DN_MAX_LEVELS; i++) { wmsum_fini(&dbuf_sums.cache_levels[i]); wmsum_fini(&dbuf_sums.cache_levels_bytes[i]); } @@ -5117,3 +5137,6 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW, "Set size of dbuf metadata cache to log2 fraction of arc size."); + +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD, + "Set size of dbuf cache mutex array as log2 shift.");