Dynamically size dbuf hash mutex array
Incorrectly sizing the array of hash locks used to protect the dbuf hash table can lead to contention and reduce performance. We could unconditionally allocate a larger array for the locks but it's wasteful, particularly for a low-memory system. Instead, dynamically allocate the array of locks and scale it based on total system memory. Additionally, add a new `dbuf_mutex_cache_shift` module option which can be used to override the hash lock array size. This is disabled by default (dbuf_mutex_cache_shift=0) and can only be set at module load time. The minimum target array size is set to 8192; this matches the current constant value. Note that the count of the dbuf hash table and count of the mutex array were added to the /proc/spl/kstat/zfs/dbufstats kstat. Finally, this change removes the _KERNEL conditional checks. These were not required since for the user space build there is no difference between the kmem and vmem interfaces. Reviewed-by: Ryan Moeller <ryan@iXsystems.com> Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #13928
This commit is contained in:
parent
223b04d23d
commit
505df8d133
|
@ -321,13 +321,14 @@ typedef struct dmu_buf_impl {
|
|||
uint8_t db_dirtycnt;
|
||||
} dmu_buf_impl_t;
|
||||
|
||||
/* Note: the dbuf hash table is exposed only for the mdb module */
|
||||
#define DBUF_MUTEXES 2048
|
||||
#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
|
||||
#define DBUF_HASH_MUTEX(h, idx) \
|
||||
(&(h)->hash_mutexes[(idx) & ((h)->hash_mutex_mask)])
|
||||
|
||||
typedef struct dbuf_hash_table {
|
||||
uint64_t hash_table_mask;
|
||||
uint64_t hash_mutex_mask;
|
||||
dmu_buf_impl_t **hash_table;
|
||||
kmutex_t hash_mutexes[DBUF_MUTEXES] ____cacheline_aligned;
|
||||
kmutex_t *hash_mutexes;
|
||||
} dbuf_hash_table_t;
|
||||
|
||||
typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t);
|
||||
|
|
|
@ -66,6 +66,12 @@ Set the size of the dbuf metadata cache
|
|||
.Pq Sy dbuf_metadata_cache_max_bytes
|
||||
to a log2 fraction of the target ARC size.
|
||||
.
|
||||
.It Sy dbuf_mutex_cache_shift Ns = Ns Sy 0 Pq uint
|
||||
Set the size of the mutex array for the dbuf cache.
|
||||
When set to
|
||||
.Sy 0
|
||||
the array is dynamically sized based on total system memory.
|
||||
.
|
||||
.It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq int
|
||||
dnode slots allocated in a single operation as a power of 2.
|
||||
The default value minimizes lock contention for the bulk operation performed.
|
||||
|
|
|
@ -99,6 +99,11 @@ typedef struct dbuf_stats {
|
|||
* already created and in the dbuf hash table.
|
||||
*/
|
||||
kstat_named_t hash_insert_race;
|
||||
/*
|
||||
* Number of entries in the hash table dbuf and mutex arrays.
|
||||
*/
|
||||
kstat_named_t hash_table_count;
|
||||
kstat_named_t hash_mutex_count;
|
||||
/*
|
||||
* Statistics about the size of the metadata dbuf cache.
|
||||
*/
|
||||
|
@ -131,6 +136,8 @@ dbuf_stats_t dbuf_stats = {
|
|||
{ "hash_chains", KSTAT_DATA_UINT64 },
|
||||
{ "hash_chain_max", KSTAT_DATA_UINT64 },
|
||||
{ "hash_insert_race", KSTAT_DATA_UINT64 },
|
||||
{ "hash_table_count", KSTAT_DATA_UINT64 },
|
||||
{ "hash_mutex_count", KSTAT_DATA_UINT64 },
|
||||
{ "metadata_cache_count", KSTAT_DATA_UINT64 },
|
||||
{ "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },
|
||||
{ "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },
|
||||
|
@ -227,6 +234,9 @@ static unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX;
|
|||
static int dbuf_cache_shift = 5;
|
||||
static int dbuf_metadata_cache_shift = 6;
|
||||
|
||||
/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
|
||||
static uint32_t dbuf_mutex_cache_shift = 0;
|
||||
|
||||
static unsigned long dbuf_cache_target_bytes(void);
|
||||
static unsigned long dbuf_metadata_cache_target_bytes(void);
|
||||
|
||||
|
@ -838,6 +848,7 @@ static int
|
|||
dbuf_kstat_update(kstat_t *ksp, int rw)
|
||||
{
|
||||
dbuf_stats_t *ds = ksp->ks_data;
|
||||
dbuf_hash_table_t *h = &dbuf_hash_table;
|
||||
|
||||
if (rw == KSTAT_WRITE)
|
||||
return (SET_ERROR(EACCES));
|
||||
|
@ -867,6 +878,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw)
|
|||
wmsum_value(&dbuf_sums.hash_chains);
|
||||
ds->hash_insert_race.value.ui64 =
|
||||
wmsum_value(&dbuf_sums.hash_insert_race);
|
||||
ds->hash_table_count.value.ui64 = h->hash_table_mask + 1;
|
||||
ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1;
|
||||
ds->metadata_cache_count.value.ui64 =
|
||||
wmsum_value(&dbuf_sums.metadata_cache_count);
|
||||
ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
|
||||
|
@ -879,9 +892,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw)
|
|||
void
|
||||
dbuf_init(void)
|
||||
{
|
||||
uint64_t hsize = 1ULL << 16;
|
||||
uint64_t hmsize, hsize = 1ULL << 16;
|
||||
dbuf_hash_table_t *h = &dbuf_hash_table;
|
||||
int i;
|
||||
|
||||
/*
|
||||
* The hash table is big enough to fill one eighth of physical memory
|
||||
|
@ -892,29 +904,42 @@ dbuf_init(void)
|
|||
while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
|
||||
hsize <<= 1;
|
||||
|
||||
retry:
|
||||
h->hash_table = NULL;
|
||||
while (h->hash_table == NULL) {
|
||||
h->hash_table_mask = hsize - 1;
|
||||
#if defined(_KERNEL)
|
||||
/*
|
||||
* Large allocations which do not require contiguous pages
|
||||
* should be using vmem_alloc() in the linux kernel
|
||||
*/
|
||||
|
||||
h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
|
||||
#else
|
||||
h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
|
||||
#endif
|
||||
if (h->hash_table == NULL) {
|
||||
/* XXX - we should really return an error instead of assert */
|
||||
ASSERT(hsize > (1ULL << 10));
|
||||
if (h->hash_table == NULL)
|
||||
hsize >>= 1;
|
||||
goto retry;
|
||||
|
||||
ASSERT3U(hsize, >=, 1ULL << 10);
|
||||
}
|
||||
|
||||
/*
|
||||
* The hash table buckets are protected by an array of mutexes where
|
||||
* each mutex is responsible for protecting 128 buckets. A minimum
|
||||
* array size of 8192 is targeted to avoid contention.
|
||||
*/
|
||||
if (dbuf_mutex_cache_shift == 0)
|
||||
hmsize = MAX(hsize >> 7, 1ULL << 13);
|
||||
else
|
||||
hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);
|
||||
|
||||
h->hash_mutexes = NULL;
|
||||
while (h->hash_mutexes == NULL) {
|
||||
h->hash_mutex_mask = hmsize - 1;
|
||||
|
||||
h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t),
|
||||
KM_SLEEP);
|
||||
if (h->hash_mutexes == NULL)
|
||||
hmsize >>= 1;
|
||||
}
|
||||
|
||||
dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
|
||||
sizeof (dmu_buf_impl_t),
|
||||
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
|
||||
|
||||
for (i = 0; i < DBUF_MUTEXES; i++)
|
||||
for (int i = 0; i < hmsize; i++)
|
||||
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
|
||||
|
||||
dbuf_stats_init(h);
|
||||
|
@ -941,7 +966,7 @@ retry:
|
|||
|
||||
wmsum_init(&dbuf_sums.cache_count, 0);
|
||||
wmsum_init(&dbuf_sums.cache_total_evicts, 0);
|
||||
for (i = 0; i < DN_MAX_LEVELS; i++) {
|
||||
for (int i = 0; i < DN_MAX_LEVELS; i++) {
|
||||
wmsum_init(&dbuf_sums.cache_levels[i], 0);
|
||||
wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);
|
||||
}
|
||||
|
@ -957,7 +982,7 @@ retry:
|
|||
KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
|
||||
KSTAT_FLAG_VIRTUAL);
|
||||
if (dbuf_ksp != NULL) {
|
||||
for (i = 0; i < DN_MAX_LEVELS; i++) {
|
||||
for (int i = 0; i < DN_MAX_LEVELS; i++) {
|
||||
snprintf(dbuf_stats.cache_levels[i].name,
|
||||
KSTAT_STRLEN, "cache_level_%d", i);
|
||||
dbuf_stats.cache_levels[i].data_type =
|
||||
|
@ -977,21 +1002,16 @@ void
|
|||
dbuf_fini(void)
|
||||
{
|
||||
dbuf_hash_table_t *h = &dbuf_hash_table;
|
||||
int i;
|
||||
|
||||
dbuf_stats_destroy();
|
||||
|
||||
for (i = 0; i < DBUF_MUTEXES; i++)
|
||||
for (int i = 0; i < (h->hash_mutex_mask + 1); i++)
|
||||
mutex_destroy(&h->hash_mutexes[i]);
|
||||
#if defined(_KERNEL)
|
||||
/*
|
||||
* Large allocations which do not require contiguous pages
|
||||
* should be using vmem_free() in the linux kernel
|
||||
*/
|
||||
|
||||
vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
|
||||
#else
|
||||
kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
|
||||
#endif
|
||||
vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) *
|
||||
sizeof (kmutex_t));
|
||||
|
||||
kmem_cache_destroy(dbuf_kmem_cache);
|
||||
taskq_destroy(dbu_evict_taskq);
|
||||
|
||||
|
@ -1018,7 +1038,7 @@ dbuf_fini(void)
|
|||
|
||||
wmsum_fini(&dbuf_sums.cache_count);
|
||||
wmsum_fini(&dbuf_sums.cache_total_evicts);
|
||||
for (i = 0; i < DN_MAX_LEVELS; i++) {
|
||||
for (int i = 0; i < DN_MAX_LEVELS; i++) {
|
||||
wmsum_fini(&dbuf_sums.cache_levels[i]);
|
||||
wmsum_fini(&dbuf_sums.cache_levels_bytes[i]);
|
||||
}
|
||||
|
@ -5117,3 +5137,6 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW,
|
|||
|
||||
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW,
|
||||
"Set size of dbuf metadata cache to log2 fraction of arc size.");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
|
||||
"Set size of dbuf cache mutex array as log2 shift.");
|
||||
|
|
Loading…
Reference in New Issue