Dynamically size dbuf hash mutex array

Incorrectly sizing the array of hash locks used to protect the
dbuf hash table can lead to contention and reduce performance.
We could unconditionally allocate a larger array for the locks
but it's wasteful, particularly for a low-memory system.
Instead, dynamically allocate the array of locks and scale
it based on total system memory.

Additionally, add a new `dbuf_mutex_cache_shift` module option
which can be used to override the hash lock array size.  This is
disabled by default (dbuf_mutex_cache_shift=0) and can only be
set at module load time.  The minimum target array size is set
to 8192; this matches the current constant value.

Note that the number of entries in the dbuf hash table and in the
mutex array (hash_table_count and hash_mutex_count) were added to
the /proc/spl/kstat/zfs/dbufstats kstat.

Finally, this change removes the _KERNEL conditional checks.
These were not required since for the user space build there
is no difference between the kmem and vmem interfaces.

Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #13928
This commit is contained in:
Brian Behlendorf 2022-09-19 12:17:11 -07:00
parent 223b04d23d
commit 505df8d133
3 changed files with 64 additions and 34 deletions

View File

@ -321,13 +321,14 @@ typedef struct dmu_buf_impl {
uint8_t db_dirtycnt; uint8_t db_dirtycnt;
} dmu_buf_impl_t; } dmu_buf_impl_t;
/* Note: the dbuf hash table is exposed only for the mdb module */ #define DBUF_HASH_MUTEX(h, idx) \
#define DBUF_MUTEXES 2048 (&(h)->hash_mutexes[(idx) & ((h)->hash_mutex_mask)])
#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
typedef struct dbuf_hash_table { typedef struct dbuf_hash_table {
uint64_t hash_table_mask; uint64_t hash_table_mask;
uint64_t hash_mutex_mask;
dmu_buf_impl_t **hash_table; dmu_buf_impl_t **hash_table;
kmutex_t hash_mutexes[DBUF_MUTEXES] ____cacheline_aligned; kmutex_t *hash_mutexes;
} dbuf_hash_table_t; } dbuf_hash_table_t;
typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t); typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t);

View File

@ -66,6 +66,12 @@ Set the size of the dbuf metadata cache
.Pq Sy dbuf_metadata_cache_max_bytes .Pq Sy dbuf_metadata_cache_max_bytes
to a log2 fraction of the target ARC size. to a log2 fraction of the target ARC size.
. .
.It Sy dbuf_mutex_cache_shift Ns = Ns Sy 0 Pq uint
Set the size of the mutex array for the dbuf cache as a log2 shift.
When set to
.Sy 0
the array is dynamically sized based on total system memory.
.
.It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq int .It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq int
dnode slots allocated in a single operation as a power of 2. dnode slots allocated in a single operation as a power of 2.
The default value minimizes lock contention for the bulk operation performed. The default value minimizes lock contention for the bulk operation performed.

View File

@ -99,6 +99,11 @@ typedef struct dbuf_stats {
* already created and in the dbuf hash table. * already created and in the dbuf hash table.
*/ */
kstat_named_t hash_insert_race; kstat_named_t hash_insert_race;
/*
* Number of entries in the hash table dbuf and mutex arrays.
*/
kstat_named_t hash_table_count;
kstat_named_t hash_mutex_count;
/* /*
* Statistics about the size of the metadata dbuf cache. * Statistics about the size of the metadata dbuf cache.
*/ */
@ -131,6 +136,8 @@ dbuf_stats_t dbuf_stats = {
{ "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 },
{ "hash_chain_max", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 },
{ "hash_insert_race", KSTAT_DATA_UINT64 }, { "hash_insert_race", KSTAT_DATA_UINT64 },
{ "hash_table_count", KSTAT_DATA_UINT64 },
{ "hash_mutex_count", KSTAT_DATA_UINT64 },
{ "metadata_cache_count", KSTAT_DATA_UINT64 }, { "metadata_cache_count", KSTAT_DATA_UINT64 },
{ "metadata_cache_size_bytes", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },
{ "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },
@ -227,6 +234,9 @@ static unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX;
static int dbuf_cache_shift = 5; static int dbuf_cache_shift = 5;
static int dbuf_metadata_cache_shift = 6; static int dbuf_metadata_cache_shift = 6;
/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
static uint32_t dbuf_mutex_cache_shift = 0;
static unsigned long dbuf_cache_target_bytes(void); static unsigned long dbuf_cache_target_bytes(void);
static unsigned long dbuf_metadata_cache_target_bytes(void); static unsigned long dbuf_metadata_cache_target_bytes(void);
@ -838,6 +848,7 @@ static int
dbuf_kstat_update(kstat_t *ksp, int rw) dbuf_kstat_update(kstat_t *ksp, int rw)
{ {
dbuf_stats_t *ds = ksp->ks_data; dbuf_stats_t *ds = ksp->ks_data;
dbuf_hash_table_t *h = &dbuf_hash_table;
if (rw == KSTAT_WRITE) if (rw == KSTAT_WRITE)
return (SET_ERROR(EACCES)); return (SET_ERROR(EACCES));
@ -867,6 +878,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw)
wmsum_value(&dbuf_sums.hash_chains); wmsum_value(&dbuf_sums.hash_chains);
ds->hash_insert_race.value.ui64 = ds->hash_insert_race.value.ui64 =
wmsum_value(&dbuf_sums.hash_insert_race); wmsum_value(&dbuf_sums.hash_insert_race);
ds->hash_table_count.value.ui64 = h->hash_table_mask + 1;
ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1;
ds->metadata_cache_count.value.ui64 = ds->metadata_cache_count.value.ui64 =
wmsum_value(&dbuf_sums.metadata_cache_count); wmsum_value(&dbuf_sums.metadata_cache_count);
ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count( ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
@ -879,9 +892,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw)
void void
dbuf_init(void) dbuf_init(void)
{ {
uint64_t hsize = 1ULL << 16; uint64_t hmsize, hsize = 1ULL << 16;
dbuf_hash_table_t *h = &dbuf_hash_table; dbuf_hash_table_t *h = &dbuf_hash_table;
int i;
/* /*
* The hash table is big enough to fill one eighth of physical memory * The hash table is big enough to fill one eighth of physical memory
@ -892,29 +904,42 @@ dbuf_init(void)
while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8) while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
hsize <<= 1; hsize <<= 1;
retry: h->hash_table = NULL;
while (h->hash_table == NULL) {
h->hash_table_mask = hsize - 1; h->hash_table_mask = hsize - 1;
#if defined(_KERNEL)
/*
* Large allocations which do not require contiguous pages
* should be using vmem_alloc() in the linux kernel
*/
h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP); h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
#else if (h->hash_table == NULL)
h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
#endif
if (h->hash_table == NULL) {
/* XXX - we should really return an error instead of assert */
ASSERT(hsize > (1ULL << 10));
hsize >>= 1; hsize >>= 1;
goto retry;
ASSERT3U(hsize, >=, 1ULL << 10);
}
/*
* The hash table buckets are protected by an array of mutexes where
* each mutex is responsible for protecting 128 buckets. A minimum
* array size of 8192 is targeted to avoid contention.
*/
if (dbuf_mutex_cache_shift == 0)
hmsize = MAX(hsize >> 7, 1ULL << 13);
else
hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);
h->hash_mutexes = NULL;
while (h->hash_mutexes == NULL) {
h->hash_mutex_mask = hmsize - 1;
h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t),
KM_SLEEP);
if (h->hash_mutexes == NULL)
hmsize >>= 1;
} }
dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
sizeof (dmu_buf_impl_t), sizeof (dmu_buf_impl_t),
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
for (i = 0; i < DBUF_MUTEXES; i++) for (int i = 0; i < hmsize; i++)
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
dbuf_stats_init(h); dbuf_stats_init(h);
@ -941,7 +966,7 @@ retry:
wmsum_init(&dbuf_sums.cache_count, 0); wmsum_init(&dbuf_sums.cache_count, 0);
wmsum_init(&dbuf_sums.cache_total_evicts, 0); wmsum_init(&dbuf_sums.cache_total_evicts, 0);
for (i = 0; i < DN_MAX_LEVELS; i++) { for (int i = 0; i < DN_MAX_LEVELS; i++) {
wmsum_init(&dbuf_sums.cache_levels[i], 0); wmsum_init(&dbuf_sums.cache_levels[i], 0);
wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0); wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);
} }
@ -957,7 +982,7 @@ retry:
KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t), KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL); KSTAT_FLAG_VIRTUAL);
if (dbuf_ksp != NULL) { if (dbuf_ksp != NULL) {
for (i = 0; i < DN_MAX_LEVELS; i++) { for (int i = 0; i < DN_MAX_LEVELS; i++) {
snprintf(dbuf_stats.cache_levels[i].name, snprintf(dbuf_stats.cache_levels[i].name,
KSTAT_STRLEN, "cache_level_%d", i); KSTAT_STRLEN, "cache_level_%d", i);
dbuf_stats.cache_levels[i].data_type = dbuf_stats.cache_levels[i].data_type =
@ -977,21 +1002,16 @@ void
dbuf_fini(void) dbuf_fini(void)
{ {
dbuf_hash_table_t *h = &dbuf_hash_table; dbuf_hash_table_t *h = &dbuf_hash_table;
int i;
dbuf_stats_destroy(); dbuf_stats_destroy();
for (i = 0; i < DBUF_MUTEXES; i++) for (int i = 0; i < (h->hash_mutex_mask + 1); i++)
mutex_destroy(&h->hash_mutexes[i]); mutex_destroy(&h->hash_mutexes[i]);
#if defined(_KERNEL)
/*
* Large allocations which do not require contiguous pages
* should be using vmem_free() in the linux kernel
*/
vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
#else vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) *
kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); sizeof (kmutex_t));
#endif
kmem_cache_destroy(dbuf_kmem_cache); kmem_cache_destroy(dbuf_kmem_cache);
taskq_destroy(dbu_evict_taskq); taskq_destroy(dbu_evict_taskq);
@ -1018,7 +1038,7 @@ dbuf_fini(void)
wmsum_fini(&dbuf_sums.cache_count); wmsum_fini(&dbuf_sums.cache_count);
wmsum_fini(&dbuf_sums.cache_total_evicts); wmsum_fini(&dbuf_sums.cache_total_evicts);
for (i = 0; i < DN_MAX_LEVELS; i++) { for (int i = 0; i < DN_MAX_LEVELS; i++) {
wmsum_fini(&dbuf_sums.cache_levels[i]); wmsum_fini(&dbuf_sums.cache_levels[i]);
wmsum_fini(&dbuf_sums.cache_levels_bytes[i]); wmsum_fini(&dbuf_sums.cache_levels_bytes[i]);
} }
@ -5117,3 +5137,6 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW,
"Set size of dbuf metadata cache to log2 fraction of arc size."); "Set size of dbuf metadata cache to log2 fraction of arc size.");
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
"Set size of dbuf cache mutex array as log2 shift.");